#!/usr/bin/python2
# -*- coding: utf-8 -*-
import sqlite3 as lite
import os
import math
try:
from process.error_handling import KillByUser
except ImportError:
from trifusion.process.error_handling import KillByUser
"""my @steps = ( # Common
['updateMinimumEvalueExponent'],
['bestQueryTaxonScore'],
['qtscore_ix'],
# Ortholog
['bestHit'],
['best_hit_ix'],
['ortholog', ["drop table BestHit"]],
['orthologTaxon'],
['orthologAvg'],
['orthologAvgIndex'],
['orthologsNormalization', ["drop table OrthologAvgScore", "drop table OrthologTaxon", "drop table OrthologTemp"]],
# InParalog
['bestInterTaxonScore', ["drop table BestQueryTaxonScore"]],
['bis_uids_ix'],
['uniqueSimSeqsQueryId'],
['ust_qids_ix'],
['betterHit', ["drop table BestInterTaxonScore", "drop table UniqSimSeqsQueryId"]],
['better_hit_ix'],
['inParalog', ["drop table BetterHit"]],
['inParalogTaxonAvg'],
['orthologUniqueId'],
['orthologUniqueIdIndex'],
['inplgOrthTaxonAvg', ["drop table OrthologUniqueId"]],
['inParalogAvg',["drop table InParalogTaxonAvg", "drop table InplgOrthTaxonAvg"]],
['inParalogAvgIndex'],
['inParalogsNormalization', ["drop table InParalogAvgScore", "drop table InParalogTemp"]],
# CoOrtholog
['inParalog2Way'],
['in2a_ix'],
['in2b_ix'],
['ortholog2Way'],
['ortholog2WayIndex'],
['inplgOrthoInplg'],
['inParalogOrtholog'],
['coOrthologCandidate', ["drop table Ortholog2Way", "drop table InParalog2Way", "drop table InplgOrthoInplg", "drop table InParalogOrtholog"]],
['coOrthologNotOrtholog', ["drop table CoOrthologCandidate"]],
['coOrthologNotOrthologIndex'],
['coOrtholog', ["drop table CoOrthNotOrtholog"]],
['coOrthologTaxon'],
['coOrthologAvg'],
['coOrthologAvgIndex'],
['coOrthologsNormalization', ["drop table CoOrthologAvgScore", "drop table CoOrthologTaxon", "drop table CoOrthologTemp"]],
['cleanall', ["truncate table InParalog", "truncate table Ortholog", "truncate table CoOrtholog"]],
);
"""
################################################################################
############################### Auxiliar #################################
################################################################################
[docs]def log(value):
return math.log10(value)
[docs]def orthologTaxonSub (cur, co):
#assuming in perl a var is true if not ""
coCaps = "" if co == "" else "Co"
t1 = coCaps + 'OrthologTaxon'
t2 = coCaps + 'OrthologTemp'
cur.execute("create table %s as\
select case\
when taxon_id_a < taxon_id_b\
then taxon_id_a\
else taxon_id_b\
end as smaller_tax_id,\
case\
when taxon_id_a < taxon_id_b\
then taxon_id_b\
else taxon_id_a\
end as bigger_tax_id,\
unnormalized_score\
from %s " % (t1, t2))
#from (?) ", (coCaps + 'OrthologTaxon', coCaps + 'OrthologTemp'))
[docs]def normalizeOrthologsSub (cur, co, table):
#assuming in perl a var is true if not ""
coCaps = "" if co == "" else "Co"
co = "o" if co == "" else "coO"
t1 = coCaps + 'OrthologAvgScore'
t2 = coCaps + 'OrthologTaxon'
cur.execute("create table %s as\
select smaller_tax_id, bigger_tax_id, avg(unnormalized_score) avg_score\
from %s\
group by smaller_tax_id, bigger_tax_id" % (t1, t2))
################################################################
t1 = co+'orthoAvg_ix'
t2 = coCaps+'OrthologAvgScore'
cur.execute("create unique index %s on %s(smaller_tax_id,bigger_tax_id,avg_score)" % (t1, t2))
################################################################
t1 = coCaps + 'OrthologTemp'
t2 = coCaps + 'OrthologAvgScore'
cur.execute("insert into %s (sequence_id_a, sequence_id_b, taxon_id_a, taxon_id_b, unnormalized_score, normalized_score)\
select ot.sequence_id_a, ot.sequence_id_b, ot.taxon_id_a, ot.taxon_id_b, ot.unnormalized_score, ot.unnormalized_score/a.avg_score\
from %s ot, %s a\
where min(ot.taxon_id_a, ot.taxon_id_b) = a.smaller_tax_id\
and max(ot.taxon_id_a, ot.taxon_id_b) = a.bigger_tax_id" % (table, t1, t2))
################################################################################
############################### Common tables #################################
################################################################################
[docs]def commonTempTables (cur):
cur.execute("select min(evalue_exp)\
from SimilarSequences\
where evalue_mant != 0")
tup = cur.fetchone()
minEvalueExp = tup[0] - 1
cur.execute("update SimilarSequences\
set evalue_exp = ?\
where evalue_exp = 0 and evalue_mant = 0", (minEvalueExp,))
##########################################################################
cur.execute("create table BestQueryTaxonScore as\
select im.query_id as query_id, im.subject_taxon_id as subject_taxon_id, low_exp.evalue_exp as evalue_exp, min(im.evalue_mant) as evalue_mant\
from InterTaxonMatch im,\
(select query_id, subject_taxon_id, min(evalue_exp) as evalue_exp\
from InterTaxonMatch\
group by query_id, subject_taxon_id) low_exp\
where im.query_id = low_exp.query_id\
and im.subject_taxon_id = low_exp.subject_taxon_id\
and im.evalue_exp = low_exp.evalue_exp\
group by im.query_id, im.subject_taxon_id, low_exp.evalue_exp")
################################################################################
cur.execute("create unique index qtscore_ix on BestQueryTaxonScore(query_id, subject_taxon_id, evalue_exp, evalue_mant)")
################################################################################
############################### Orthologs #####################################
################################################################################
[docs]def orthologs(cur):
cur.execute("create table BestHit as\
select s.query_id, s.subject_id,\
s.query_taxon_id, s.subject_taxon_id,\
s.evalue_exp, s.evalue_mant\
from SimilarSequences s, BestQueryTaxonScore cutoff\
where s.query_id = cutoff.query_id\
and s.subject_taxon_id = cutoff.subject_taxon_id\
and s.query_taxon_id != s.subject_taxon_id\
and s.evalue_exp <= -5\
and s.percent_match >= 50\
and (s.evalue_mant < 0.01\
or s.evalue_exp = cutoff.evalue_exp\
and s.evalue_mant = cutoff.evalue_mant)")
cur.execute("create unique index best_hit_ix on BestHit(query_id,subject_id)")
######################################################################
cur.execute("create table OrthologTemp as\
select bh1.query_id as sequence_id_a, bh1.subject_id as sequence_id_b,\
bh1.query_taxon_id as taxon_id_a, bh1.subject_taxon_id as taxon_id_b,\
case\
when bh1.evalue_mant < 0.01 or bh2.evalue_mant < 0.01\
then (bh1.evalue_exp + bh2.evalue_exp) / -2\
else\
(log(bh1.evalue_mant * bh2.evalue_mant)\
+ bh1.evalue_exp + bh2.evalue_exp) / -2\
end as unnormalized_score\
from BestHit bh1, BestHit bh2\
where bh1.query_id < bh1.subject_id\
and bh1.query_id = bh2.subject_id\
and bh1.subject_id = bh2.query_id")
######################################################################
orthologTaxonSub(cur, '')
######################################################################
normalizeOrthologsSub(cur, '', "Ortholog")
################################################################################
############################### InParalogs ####################################
################################################################################
[docs]def inparalogs (cur):
cur.execute("create table BestInterTaxonScore as\
select im.query_id as query_id, low_exp.evalue_exp as evalue_exp, min(im.evalue_mant) as evalue_mant\
from BestQueryTaxonScore im,\
(select query_id, min(evalue_exp) as evalue_exp\
from BestQueryTaxonScore\
group by query_id) low_exp\
where im.query_id = low_exp.query_id\
and im.evalue_exp = low_exp.evalue_exp\
group by im.query_id, low_exp.evalue_exp")
###########################################################################
cur.execute("create unique index bis_uids_ix on BestInterTaxonScore(query_id)")
###########################################################################
cur.execute("create table UniqSimSeqsQueryId as\
select distinct s.query_id from SimilarSequences s")
###########################################################################
cur.execute("create unique index ust_qids_ix on UniqSimSeqsQueryId (query_id)")
###########################################################################
cur.execute("CREATE TABLE BetterHit as\
select s.query_id, s.subject_id,\
s.query_taxon_id as taxon_id,\
s.evalue_exp, s.evalue_mant\
from SimilarSequences s, BestInterTaxonScore bis\
where s.query_id != s.subject_id \
and s.query_taxon_id = s.subject_taxon_id\
and s.query_id = bis.query_id\
and s.evalue_exp <= -5\
and s.percent_match >= 50\
and (s.evalue_mant < 0.001\
or s.evalue_exp < bis.evalue_exp\
or (s.evalue_exp = bis.evalue_exp and s.evalue_mant <= bis.evalue_mant))\
union\
select s.query_id, s.subject_id, s.query_taxon_id as taxon_id, s.evalue_exp, s.evalue_mant\
from SimilarSequences s\
where s.query_taxon_id = s.subject_taxon_id \
and s.evalue_exp <= -5\
and s.percent_match >= 50\
and s.query_id in \
(SELECT distinct ust.query_id\
from UniqSimSeqsQueryId ust\
LEFT OUTER JOIN BestInterTaxonScore bis ON bis.query_id = ust.query_id\
WHERE bis.query_id IS NULL)")
###########################################################################
cur.execute("create index better_hit_ix on BetterHit (query_id,subject_id)")
###########################################################################
cur.execute("create table InParalogTemp as\
select bh1.query_id as sequence_id_a, bh1.subject_id as sequence_id_b,\
bh1.taxon_id,\
case\
when bh1.evalue_mant < 0.01 or bh2.evalue_mant < 0.01\
then (bh1.evalue_exp + bh2.evalue_exp) / -2\
else\
(log(bh1.evalue_mant * bh2.evalue_mant)\
+ bh1.evalue_exp + bh2.evalue_exp) / -2\
end as unnormalized_score\
from BetterHit bh1, BetterHit bh2\
where bh1.query_id < bh1.subject_id\
and bh1.query_id = bh2.subject_id\
and bh1.subject_id = bh2.query_id")
################################################################
cur.execute("create table InParalogTaxonAvg as\
select avg(i.unnormalized_score) average, i.taxon_id as taxon_id\
from InParalogTemp i\
group by i.taxon_id")
################################################################
cur.execute("create table OrthologUniqueId as\
select distinct(sequence_id) from (\
select sequence_id_a as sequence_id from Ortholog\
union\
select sequence_id_b as sequence_id from Ortholog) i")
################################################################
cur.execute("create unique index ortho_uniq_id_ix on OrthologUniqueId (sequence_id)")
################################################################
cur.execute(" create table InplgOrthTaxonAvg as\
select avg(i.unnormalized_score) average, i.taxon_id as taxon_id\
from InParalogTemp i\
where i.sequence_id_a in\
(select sequence_id from OrthologUniqueId)\
or i.sequence_id_b in\
(select sequence_id from OrthologUniqueId)\
group by i.taxon_id")
################################################################
cur.execute("create table InParalogAvgScore as\
select case\
when orth_i.average is NULL\
then all_i.average\
else orth_i.average\
end as avg_score,\
all_i.taxon_id\
from InParalogTaxonAvg all_i LEFT OUTER JOIN InplgOrthTaxonAvg orth_i\
ON all_i.taxon_id = orth_i.taxon_id")
################################################################
cur.execute("create unique index inparalog_avg_ix on InParalogAvgScore(taxon_id,avg_score)")
################################################################
cur.execute(" insert into InParalog(sequence_id_a, sequence_id_b, taxon_id, unnormalized_score, normalized_score)\
select it.sequence_id_a, it.sequence_id_b, it.taxon_id, it.unnormalized_score, it.unnormalized_score/a.avg_score\
from InParalogTemp it, InParalogAvgScore a\
where it.taxon_id = a.taxon_id")
################################################################################
############################### CoOrthologs ###################################
################################################################################
[docs]def coorthologs (cur):
cur.execute("create table InParalog2Way as\
select sequence_id_a, sequence_id_b from InParalog\
union\
select sequence_id_b as sequence_id_a, sequence_id_a as sequence_id_b from InParalog")
######################################################################
cur.execute("create unique index in2a_ix on InParalog2Way(sequence_id_a, sequence_id_b)")
######################################################################
cur.execute("create unique index in2b_ix on InParalog2Way(sequence_id_b, sequence_id_a)")
######################################################################
cur.execute("create table Ortholog2Way as\
select sequence_id_a, sequence_id_b from Ortholog\
union\
select sequence_id_b as sequence_id_a, sequence_id_a as sequence_id_b from Ortholog")
######################################################################
cur.execute("create unique index ortholog2way_ix on Ortholog2Way(sequence_id_a, sequence_id_b)")
######################################################################
cur.execute("create table InplgOrthoInplg as\
select ip1.sequence_id_a, ip2.sequence_id_b\
from Ortholog2Way o, InParalog2Way ip2, InParalog2Way ip1\
where ip1.sequence_id_b = o.sequence_id_a\
and o.sequence_id_b = ip2.sequence_id_a")
##################################################################
cur.execute("create table InParalogOrtholog as\
select ip.sequence_id_a, o.sequence_id_b\
from InParalog2Way ip, Ortholog2Way o\
where ip.sequence_id_b = o.sequence_id_a")
##################################################################
cur.execute("create table CoOrthologCandidate as\
select distinct\
min(sequence_id_a, sequence_id_b) as sequence_id_a,\
max(sequence_id_a, sequence_id_b) as sequence_id_b\
from (select sequence_id_a, sequence_id_b from InplgOrthoInplg\
union\
select sequence_id_a, sequence_id_b from InParalogOrtholog) t")
######################################################################
cur.execute("create table CoOrthNotOrtholog as\
SELECT cc.sequence_id_a, cc.sequence_id_b\
FROM CoOrthologCandidate cc\
LEFT OUTER JOIN Ortholog o\
ON cc.sequence_id_a = o.sequence_id_a\
AND cc.sequence_id_b = o.sequence_id_b\
WHERE o.sequence_id_a IS NULL")
#####################################################################
cur.execute("create index cno_ix on CoOrthNotOrtholog(sequence_id_a,sequence_id_b)")
######################################################################
cur.execute("create table CoOrthologTemp as\
select candidate.sequence_id_a, candidate.sequence_id_b,\
ab.query_taxon_id as taxon_id_a, ab.subject_taxon_id as taxon_id_b,\
case\
when ab.evalue_mant < 0.00001 or ba.evalue_mant < 0.00001\
then (ab.evalue_exp + ba.evalue_exp) / -2\
else\
(log(ab.evalue_mant * ba.evalue_mant)\
+ ab.evalue_exp + ba.evalue_exp) / -2\
end as unnormalized_score\
from SimilarSequences ab, SimilarSequences ba, CoOrthNotOrtholog candidate\
where ab.query_id = candidate.sequence_id_a\
and ab.subject_id = candidate.sequence_id_b\
and ab.evalue_exp <= -5\
and ab.percent_match >= 50\
and ba.query_id = candidate.sequence_id_b\
and ba.subject_id = candidate.sequence_id_a\
and ba.evalue_exp <= -5\
and ba.percent_match >= 50")
######################################################################
orthologTaxonSub(cur, 'co')
######################################################################
normalizeOrthologsSub(cur, "Co", "CoOrtholog")
[docs]def execute(db_dir, nm=None):
con = lite.connect(os.path.join(db_dir, "orthoDB.db"))
with con:
if nm:
if nm.stop:
raise KillByUser("")
nm.total = 4
nm.counter = 0
nm.msg = None
con.create_function("log", 1, log)
cur = con.cursor()
for func in [commonTempTables, orthologs, inparalogs, coorthologs]:
if nm:
if nm.stop:
raise KillByUser("")
nm.counter += 1
func(cur)
if __name__ == '__main__':
execute(".")
__author__ = "Fernando Alves and Diogo N. Silva"