Source code for trifusion.TriSeq

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
#  Copyright 2012 Unknown <diogo@arch>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.

import warnings

# Suppress import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    import os
    import sys
    import shutil
    import time
    import argparse
    from glob import glob

    try:
        from process.base import print_col, RED, GREEN, YELLOW, CleanUp
        from process import sequence as seqset
        from process import data
        from process.error_handling import *
        from base.sanity import triseq_arg_check, mfilters, post_aln_checks, \
            check_infile_list
        from progressbar import ProgressBar, Timer, Bar, Percentage, \
            SimpleProgress
    except ImportError:
        from trifusion.process.base import print_col, RED, GREEN, YELLOW,\
            CleanUp
        from trifusion.process import sequence as seqset
        from trifusion.process import data
        from trifusion.process.error_handling import *
        from trifusion.base.sanity import triseq_arg_check, mfilters, \
            post_aln_checks, check_infile_list
        from trifusion.progressbar import ProgressBar, Timer, Bar,\
            Percentage, SimpleProgress


[docs]def gen_wgt(msg): bar_wdg = [ "( ", SimpleProgress(), " ) ", Bar(), Percentage(), " [", Timer(), "] ", ] return bar_wdg
@CleanUp def main_parser(arg, alignment_list): """ Function with the main operations of TriSeq """ print_col("Executing TriSeq module at %s %s" % ( time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN, quiet=arg.quiet) # Create temp directory tmp_dir = ".trifusion-temp" if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) # Set path to temporary sqlite database sql_db = os.path.join(tmp_dir, "trifusion.db") # If database already exists, erase it. Make sure we start fresh. if os.path.exists(sql_db): os.remove(sql_db) # Defining main variables conversion = arg.conversion output_format = arg.output_format outfile = arg.outfile interleave = arg.interleave model_phy = arg.model_phy # outgroup_taxa = arg.outgroup_taxa # Defining output file name if conversion is None and arg.outfile is not None: outfile = "".join(arg.outfile) elif conversion is None and arg.outfile is not None: outfile = "".join(arg.outfile) elif arg.consensus and arg.consensus_single and not arg.outfile: outfile = "consensus" elif conversion is not None and arg.outfile is None: outfile = "".join(alignment_list).split(".")[0] # The input file at this stage is not necessary # If just converting the partition file format do this and exit if arg.partition_file is not None: # Initializing Partitions instance and reading partitions file partition = data.Partitions() partition.read_from_file(arg.partition_file) if partition.partition_format == "nexus": partition.write_to_file("raxml", outfile, model_phy) else: partition.write_to_file("nexus", outfile) return 0 # Support wildcars as arguments for windows fl = [] if sys.platform in ["win32", "cygwin"]: for p in alignment_list: fl += glob(p) alignment_list = fl # Check input files for directories alignment_list, dirs, lost = check_infile_list(alignment_list) if dirs: print_col("Ignoring input files pointing to a directory: {}".format( " ".join(dirs)), YELLOW) if lost: print_col("Ignoring input files that do not exist: {}".format( " ".join(lost)), YELLOW) if not alignment_list: print_col("No valid input files have been provided. Terminating...", RED) # Input alignments are mandatory from now on if not arg.quiet: pbar = ProgressBar(max_value=len(alignment_list), widgets=gen_wgt("")) else: pbar = None print_col("Parsing %s alignments" % len(alignment_list), GREEN, quiet=arg.quiet) alignments = seqset.AlignmentList(alignment_list, sql_db=sql_db, pbar=pbar) post_aln_checks(arg, alignments) # ################################ Utilities ############################## # Return a file with taxa list and exit if arg.get_taxa is True: print_col("Writing taxa to new file", GREEN, quiet=arg.quiet) alignments.write_taxa_to_file() return 0 # Remove taxa if arg.remove: print_col("Removing taxa", GREEN, quiet=arg.quiet) alignments.remove_taxa(arg.remove) # Grep taxa if arg.grep: print_col("Grepping taxa", GREEN, quiet=arg.quiet) alignments.remove_taxa(arg.grep, mode="inverse") # Select alignments if arg.select: print_col("Selecting alignments", GREEN, quiet=arg.quiet) if not os.path.exists("Taxa_selection"): os.makedirs("Taxa_selection") # Check if any of the provided taxa is absent from the alignments absent_taxa = [x for x in arg.select if x not in alignments.taxa_names] if absent_taxa: print_col("The following taxa were not found in any alignment and" " will be ignored: {}".format(" ".join(absent_taxa)), YELLOW, quiet=arg.quiet) selected_alignments = alignments.select_by_taxa(arg.select, mode="relaxed") for aln in selected_alignments: alignment_file = aln.path shutil.copy(alignment_file, "Taxa_selection") return # ############################# Main operations ########################### # Reverse concatenation if arg.reverse is not None: print_col("Reverse concatenating", GREEN, quiet=arg.quiet) if len(alignment_list) > 1: raise ArgumentError("Only one input file allowed for reverse " "concatenation") aln = alignments.alignments.values()[0] # Initializing and reading partition file partition = data.Partitions() partition.read_from_file(arg.reverse) # Updating alignment partitions aln.set_partitions(partition) alignments = aln.reverse_concatenate(pbar=pbar) # Filtering # Filter by minimum taxa if arg.min_taxa: print_col("Filtering by minimum taxa", GREEN, quiet=arg.quiet) alignments.filter_min_taxa(arg.min_taxa, pbar=pbar) # Filter by alignments that contain taxa if arg.contain_filter: print_col("Filtering alignment(s) including a taxa group", GREEN, quiet=arg.quiet) alignments.filter_by_taxa(arg.contain_filter, "Contain", pbar=pbar) # Filter by alignments that exclude taxa if arg.exclude_filter: print_col("Filtering alignments excluding a taxa group", GREEN, quiet=arg.quiet) alignments.filter_by_taxa(arg.exclude_filter, "Exclude", pbar=pbar) # Filter by codon position if arg.codon_filter: print_col("Filtering by codon positions", GREEN, quiet=arg.quiet) if alignments.sequence_code[0] == "DNA": codon_settings = [True if str(x) in arg.codon_filter else False for x in range(1, 4)] alignments.filter_codon_positions(codon_settings, pbar=pbar) # Filter by missing data if arg.m_filter: print_col("Filtering by missing data", GREEN, quiet=arg.quiet) alignments.filter_missing_data(arg.m_filter[0], arg.m_filter[1], pbar=pbar, use_main_table=True) # Filtering by variable sites if arg.var_filter: print_col("Filtering by variable sites", GREEN, quiet=arg.quiet) alignments.filter_segregating_sites(arg.var_filter[0], arg.var_filter[1], pbar=pbar) # Filtering by informative sites if arg.inf_filter: print_col("Filtering by variable sites", GREEN, quiet=arg.quiet) alignments.filter_informative_sites(arg.inf_filter[0], arg.inf_filter[1], pbar=pbar) # Concatenation if not arg.conversion and not arg.reverse and not arg.consensus: print_col("Concatenating", GREEN, quiet=arg.quiet) alignments = alignments.concatenate(pbar=pbar) # Concatenate zorro files if arg.zorro: zorro = data.Zorro(alignment_list, arg.zorro) zorro.write_to_file(outfile) # Collapsing if arg.collapse: print_col("Collapsing", GREEN, quiet=arg.quiet) alignments.collapse(use_main_table=True, pbar=pbar, haplotypes_file=outfile) # Gcoder if arg.gcoder: print_col("Coding gaps", GREEN, quiet=arg.quiet) if output_format == ["nexus"]: alignments.code_gaps(use_main_table=True, pbar=pbar) # Consensus if arg.consensus: consensus_type = arg.consensus[0] print_col("Creating consensus sequences", GREEN, quiet=arg.quiet) if arg.consensus_single: if isinstance(alignments, seqset.AlignmentList): alignments = alignments.consensus( consensus_type, single_file=arg.consensus_single, pbar=pbar, use_main_table=True) else: alignments.consensus(consensus_type=consensus_type, pbar=pbar, use_main_table=True) else: alignments.consensus(consensus_type=consensus_type, pbar=pbar, use_main_table=True) # Write output print_col("Writing output", GREEN, quiet=arg.quiet) if isinstance(alignments, seqset.Alignment): alignments.write_to_file(output_format, outfile, interleave=interleave, partition_file=True, use_charset=True, ima2_params=arg.ima2_params, pbar=pbar) elif isinstance(alignments, seqset.AlignmentList): alignments.write_to_file(output_format, interleave=interleave, ima2_params=arg.ima2_params, pbar=pbar)
[docs]def get_args(arg_list=None, unittest=False): # The inclusion of the argument definition in main, makes it possible to # import this file as a module and not triggering argparse. The # alternative of using a if __name__ == "__main__" statement does not # work well with the entry_points parameter of setup.py, since they call # the main function but do nothing inside said statement. parser = argparse.ArgumentParser(description="Command line interface for " "TriFusion Process module") # Main execution main_exec = parser.add_argument_group("Main execution") main_exec.add_argument("-in", dest="infile", nargs="+", help="Provide the " "input file name. If multiple files are provided" ", please separated the names with spaces") main_exec.add_argument("-of", dest="output_format", nargs="+", default=["nexus"], choices=["nexus", "phylip", "fasta", "mcmctree", "ima2", "stockholm", "gphocs"], help="Format of the output file(s). You may select " "multiple output formats simultaneously (default is" " '%(default)s')") main_exec.add_argument("-o", dest="outfile", help="Name of the output file") # Alternative modes alternative = parser.add_argument_group("Alternative execution modes") alternative.add_argument("-c", dest="conversion", action="store_const", const=True, help="Used for conversion of the " "input files passed as arguments with the -in " "option. This flag precludes the usage of the -o " "option, as the output file name is automatically " "generated based on the input file name") alternative.add_argument("-r", dest="reverse", help="Reverse a concatenated" " file into its original single locus alignments." " A partition file similar to the one read by " "RAxML must be provided") alternative.add_argument("-z", "--zorro-suffix", dest="zorro", type=str, help="Use this option if you wish to concatenate " "auxiliary Zorro files associated with each " "alignment. Provide the sufix for the concatenated" " zorro file") alternative.add_argument("-p", "--partition-file", dest="partition_file", type=str, help="Using this option and providing " "the partition file will convert it between a " "RAxML or Nexus format") alternative.add_argument("-s", dest="select", nargs="*", help="Selects " "alignments containing the provided taxa " "(separate multiple taxa with whitespace)") alternative.add_argument("--collapse", dest="collapse", action="store_const", const=True, default=False, help="Use this flag if " "you would like to collapse the input alignment(s)" " into unique haplotypes") alternative.add_argument("--code-gaps", dest="gcoder", action="store_const", const=True, default=False, help="Use this flag to " "code the gaps of the alignment into a binary " "state matrix that is appended to the end of " "the alignment") alternative.add_argument("--consensus", dest="consensus", nargs=1, choices=["First sequence", "IUPAC", "Soft mask", "Remove"], help="Creates a consensus of the final alignments" " specifying how variation is handled") alternative.add_argument("--consensus-single-file", dest="consensus_single", action="store_const", const=True, default=False, help="Merges " "consensus sequences in a single file") filter_g = parser.add_argument_group("Filter options") filter_g.add_argument("--missing-filter", dest="m_filter", nargs=2, help="Use this option if you wish to filter the" " alignment's missing data. Along with this " "option provide the threshold percentages for " "gap and missing data, respectively (e.g. -filter " "50 75 - filters alignments columns with more " "than 50%% of gap+missing data and columns with " "more than 75%% of true missing data)", type=mfilters) filter_g.add_argument("--min-taxa", dest="min_taxa", type=mfilters, help="Set the minimum percentage " "of taxa that needs to be present in an alignment") filter_g.add_argument("--contain-taxa", dest="contain_filter", nargs="*", help="Only " "processes alignments that contain the specified " "taxa") filter_g.add_argument("--exclude-taxa", dest="exclude_filter", help="Only " "process alignments that do NOT contain the " "specified taxa") filter_g.add_argument("--codon-filter", dest="codon_filter", nargs="*", choices=["1", "2", "3"], help="Include only the " "codon positions specified by this option (DNA only)") filter_g.add_argument("--variable-filter", dest="var_filter", nargs=2, help="Provide minimum and maximum values of " "variable sites for each alignment. Filters " "alignments with a number of variable sites outside " "the specified range", type=int) filter_g.add_argument("--informative-filter", dest="inf_filter", nargs=2, help="Provide minimum and maximum values of " "informative sites for each alignment. Filters " "alignments with a number of informative sites " "outside the specified range", type=int) # Formatting options formatting = parser.add_argument_group("Formatting options") formatting.add_argument("--model", dest="model_phy", default="LG", choices=["DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG"], help="This option only applies for the " "concatenation of protein data into phylip " "format. Specify the model for all partitions " "defined in the partition file (default is '%(" "default)s')") formatting.add_argument("--interleave", dest="interleave", action="store_const", const="interleave", default=False, help="Specify this option to " "write output files in interleave format (currently" " only supported for nexus files") formatting.add_argument("--ima2-params", dest="ima2_params", nargs=4, help="Provide 4 additional arguments needed to " "write the output in a format compliant with " "IMa2. The order of the required arguments " "(separated by whitespace is as follows: " "[(1) File name of population mapping]" "[(2) Population tree]" "[(3) Mutational model]" "[(4) Inheritance Scalar]. " "Additional notes: (1) The population mapping " "file is a simple .csv file containing two " "columns separated by a semi-colon, " "in which the first column contains the taxon " "name and the second column contains the " "corresponding population name; (2) The order of " "the population names in the population tree " "must be the same as the order in the file with " "the population mapping") # Data manipulation manipulation = parser.add_argument_group("Data manipulation") manipulation.add_argument("-rm", dest="remove", nargs="*", help="Removes " "the specified taxa from the final alignment. " "Unwanted taxa my be provided in a csv file " "containing 1 column with a species name in " "each line or they may be specified in the " "command line and separated by whitespace") manipulation.add_argument("-grep", dest="grep", nargs="*", help="The " "inverse of the -rm command. It removes all " "taxa from the alignment except for the ones " "specified with this option. Taxa names may be " "specified in a csv file containing 1 column " "with a species name in each line or in the " "command line separated by whitespace") # manipulation.add_argument("-outgroup", dest="outgroup_taxa", nargs="*", # help="Provide taxon names/number for the " # "outgroup (This option is only supported for " # "NEXUS output format files)") utilities = parser.add_argument_group("Utilities") utilities.add_argument("--get-taxa", dest="get_taxa", action="store_const", const=True, default=False, help="Writes all taxa names into a" " .csv file") miscellaneous = parser.add_argument_group("Miscellaneous") miscellaneous.add_argument("-quiet", dest="quiet", action="store_const", const=True, default=False, help="Removes all " "terminal output") args = parser.parse_args(arg_list) # Print help when no arguments are provided if len(sys.argv) == 1 and not unittest: parser.print_help() sys.exit(1) return args
[docs]def main(): arguments = get_args() triseq_arg_check(arguments) main_parser(arguments, arguments.infile)
if __name__ == "__main__": main() __author__ = "Diogo N. Silva"