#!/usr/bin/python2
# -*- coding: utf-8 -*-
import os
import re
try:
from process.error_handling import KillByUser
except ImportError:
from trifusion.process.error_handling import KillByUser
def orthomcl_filter_fasta(input_dir, min_length, max_stop_percent, db, dest,
                          nm=None):
    """Filter the FASTA files of a directory by length and stop content.

    Every non-hidden entry of `input_dir` is read as a FASTA file. Each
    record is written, header first and with all non-letter characters
    removed from its sequence, either to the "good" file
    (``<dest>/backstage_files/<db>``) or to the "bad" file
    (``<dest>/backstage_files/poorProteins.txt``).

    A record is rejected when its sequence length (counted before the
    non-letter characters are removed, line terminators excluded) is
    smaller than `min_length`, or when the percentage of non-letter
    characters in it is greater than `max_stop_percent`.

    Parameters
    ----------
    input_dir : str
        Directory containing the FASTA files to filter.
    min_length : int
        Minimum sequence length for a record to be kept.
    max_stop_percent : int or float
        Maximum percentage (0-100) of non-letter characters allowed.
    db : str
        File name of the "good" output file.
    dest : str
        Output directory. Its ``backstage_files`` sub-directory must
        already exist, since the output files are opened there directly.
    nm : object, optional
        Progress/cancellation object. When provided, its ``total``,
        ``counter`` and ``msg`` attributes are updated and its ``stop``
        attribute is polled.

    Raises
    ------
    KillByUser
        When `nm` is provided and ``nm.stop`` evaluates to True.
    """

    # Compiled once; it is applied to every sequence line of every file.
    non_letter = re.compile('[^A-Za-z]')

    def check_stop():
        # Abort as soon as the caller flags a cancellation.
        if nm and nm.stop:
            raise KillByUser("")

    def handle_seq(record, length, stop_cnt):
        # float() forces true division under Python 2; a zero-length
        # record has no stops and is judged on `min_length` alone.
        if length:
            stop_percent = float(stop_cnt) / length * 100
        else:
            stop_percent = 0.0

        if length < min_length or stop_percent > max_stop_percent:
            bad.write(record + "\n")
        else:
            good.write(record + "\n")

    backstage_dir = os.path.join(dest, "backstage_files")

    # Context managers guarantee the handles are released even when
    # KillByUser (or an I/O error) interrupts the run.
    with open(os.path.join(backstage_dir, db), "w") as good, \
            open(os.path.join(backstage_dir, "poorProteins.txt"),
                 "w") as bad:

        filenames = [os.path.join(input_dir, x)
                     for x in os.listdir(input_dir)]

        # Setup progression information
        if nm:
            check_stop()
            nm.total = len(filenames)
            nm.counter = 0

        for filename in filenames:

            base_name = os.path.basename(filename)

            if nm:
                check_stop()
                nm.counter += 1
                nm.msg = "Filtering file {}".format(base_name)

            # Hidden files are recognised by their base name; the joined
            # path starts with `input_dir`, not with the file name.
            if base_name.startswith('.'):
                continue

            with open(filename, 'r') as input_file:
                record_parts = []
                current_len = 0
                current_stop_cnt = 0

                # process lines of one file
                for line in input_file:
                    check_stop()

                    if line.startswith('>'):
                        # A new header closes the previous record
                        if record_parts:
                            handle_seq("".join(record_parts),
                                       current_len, current_stop_cnt)
                            record_parts = []
                            current_len = 0
                            current_stop_cnt = 0
                        # Keep the header on its own line
                        line = line.rstrip("\r\n") + "\n"
                    else:
                        # Line terminators are neither residues nor stops
                        line = line.rstrip("\r\n")
                        line_len = len(line)
                        current_len += line_len
                        line = non_letter.sub('', line)
                        current_stop_cnt += line_len - len(line)

                    record_parts.append(line)

                # Flush the last record; empty files contribute nothing
                if record_parts:
                    handle_seq("".join(record_parts),
                               current_len, current_stop_cnt)
# Module metadata: author credit exposed as the `__author__` attribute.
__author__ = "Fernando Alves"