Source code for trifusion.TriSeq

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
#  Copyright 2012 Unknown <diogo@arch>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.

import warnings

# Suppress import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    import os
    import sys
    import shutil
    import time
    import argparse
    from glob import glob

    try:
        from process.base import print_col, RED, GREEN, YELLOW, CleanUp
        from process import sequence as seqset
        from process import data
        from process.error_handling import *
        from base.sanity import triseq_arg_check, mfilters, post_aln_checks, \
            check_infile_list
        from progressbar import ProgressBar, Timer, Bar, Percentage, \
            SimpleProgress
    except ImportError:
        from trifusion.process.base import print_col, RED, GREEN, YELLOW,\
            CleanUp
        from trifusion.process import sequence as seqset
        from trifusion.process import data
        from trifusion.process.error_handling import *
        from trifusion.base.sanity import triseq_arg_check, mfilters, \
            post_aln_checks, check_infile_list
        from trifusion.progressbar import ProgressBar, Timer, Bar,\
            Percentage, SimpleProgress


[docs]def gen_wgt(msg):

    bar_wdg = [
        "( ", SimpleProgress(), " ) ",
        Bar(),
        Percentage(),
        " [", Timer(), "] ",
    ]

    return bar_wdg

@CleanUp
def main_parser(arg, alignment_list):
    """ Function with the main operations of TriSeq """

    print_col("Executing TriSeq module at %s %s" % (
        time.strftime("%d/%m/%Y"), time.strftime("%I:%M:%S")), GREEN,
              quiet=arg.quiet)

    # Create temp directory
    tmp_dir = ".trifusion-temp"
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Set path to temporary sqlite database
    sql_db = os.path.join(tmp_dir, "trifusion.db")

    # If database already exists, erase it. Make sure we start fresh.
    if os.path.exists(sql_db):
        os.remove(sql_db)

    # Defining main variables
    conversion = arg.conversion
    output_format = arg.output_format
    outfile = arg.outfile
    interleave = arg.interleave
    model_phy = arg.model_phy
    # outgroup_taxa = arg.outgroup_taxa

    # Defining output file name
    if conversion is None and arg.outfile is not None:
        outfile = "".join(arg.outfile)
    elif conversion is None and arg.outfile is not None:
        outfile = "".join(arg.outfile)
    elif arg.consensus and arg.consensus_single and not arg.outfile:
        outfile = "consensus"
    elif conversion is not None and arg.outfile is None:
        outfile = "".join(alignment_list).split(".")[0]

    # The input file at this stage is not necessary
    # If just converting the partition file format do this and exit
    if arg.partition_file is not None:
        # Initializing Partitions instance and reading partitions file
        partition = data.Partitions()
        partition.read_from_file(arg.partition_file)
        if partition.partition_format == "nexus":
            partition.write_to_file("raxml", outfile, model_phy)
        else:
            partition.write_to_file("nexus", outfile)
        return 0

    # Support wildcars as arguments for windows
    fl = []
    if sys.platform in ["win32", "cygwin"]:
        for p in alignment_list:
            fl += glob(p)
        alignment_list = fl

    # Check input files for directories
    alignment_list, dirs, lost = check_infile_list(alignment_list)

    if dirs:
        print_col("Ignoring input files pointing to a directory: {}".format(
            " ".join(dirs)), YELLOW)
    if lost:
        print_col("Ignoring input files that do not exist: {}".format(
            " ".join(lost)), YELLOW)
    if not alignment_list:
        print_col("No valid input files have been provided. Terminating...",
                  RED)

    # Input alignments are mandatory from now on
    if not arg.quiet:
        pbar = ProgressBar(max_value=len(alignment_list), widgets=gen_wgt(""))
    else:
        pbar = None

    print_col("Parsing %s alignments" % len(alignment_list), GREEN,
              quiet=arg.quiet)
    alignments = seqset.AlignmentList(alignment_list, sql_db=sql_db,
                                      pbar=pbar)

    post_aln_checks(arg, alignments)

    # ################################ Utilities ##############################
    # Return a file with taxa list and exit
    if arg.get_taxa is True:
        print_col("Writing taxa to new file", GREEN, quiet=arg.quiet)
        alignments.write_taxa_to_file()
        return 0

    # Remove taxa
    if arg.remove:
        print_col("Removing taxa", GREEN, quiet=arg.quiet)
        alignments.remove_taxa(arg.remove)

    # Grep taxa
    if arg.grep:
        print_col("Grepping taxa", GREEN, quiet=arg.quiet)
        alignments.remove_taxa(arg.grep, mode="inverse")

    # Select alignments
    if arg.select:
        print_col("Selecting alignments", GREEN, quiet=arg.quiet)
        if not os.path.exists("Taxa_selection"):
            os.makedirs("Taxa_selection")

        # Check if any of the provided taxa is absent from the alignments
        absent_taxa = [x for x in arg.select if x not in alignments.taxa_names]
        if absent_taxa:
            print_col("The following taxa were not found in any alignment and"
                      " will be ignored: {}".format(" ".join(absent_taxa)),
                      YELLOW, quiet=arg.quiet)

        selected_alignments = alignments.select_by_taxa(arg.select,
                                                        mode="relaxed")
        for aln in selected_alignments:
            alignment_file = aln.path
            shutil.copy(alignment_file, "Taxa_selection")

        return

    # ############################# Main operations ###########################
    # Reverse concatenation
    if arg.reverse is not None:
        print_col("Reverse concatenating", GREEN, quiet=arg.quiet)
        if len(alignment_list) > 1:
            raise ArgumentError("Only one input file allowed for reverse "
                                "concatenation")
        aln = alignments.alignments.values()[0]
        # Initializing and reading partition file
        partition = data.Partitions()
        partition.read_from_file(arg.reverse)
        # Updating alignment partitions
        aln.set_partitions(partition)
        alignments = aln.reverse_concatenate(pbar=pbar)

    # Filtering
    # Filter by minimum taxa
    if arg.min_taxa:
        print_col("Filtering by minimum taxa", GREEN, quiet=arg.quiet)
        alignments.filter_min_taxa(arg.min_taxa, pbar=pbar)

    # Filter by alignments that contain taxa
    if arg.contain_filter:
        print_col("Filtering alignment(s) including a taxa group", GREEN,
                  quiet=arg.quiet)
        alignments.filter_by_taxa(arg.contain_filter, "Contain", pbar=pbar)

    # Filter by alignments that exclude taxa
    if arg.exclude_filter:
        print_col("Filtering alignments excluding a taxa group", GREEN,
                  quiet=arg.quiet)
        alignments.filter_by_taxa(arg.exclude_filter, "Exclude", pbar=pbar)

    # Filter by codon position
    if arg.codon_filter:
        print_col("Filtering by codon positions", GREEN, quiet=arg.quiet)
        if alignments.sequence_code[0] == "DNA":
            codon_settings = [True if str(x) in arg.codon_filter else False
                              for x in range(1, 4)]
            alignments.filter_codon_positions(codon_settings, pbar=pbar)

    # Filter by missing data
    if arg.m_filter:
        print_col("Filtering by missing data", GREEN, quiet=arg.quiet)
        alignments.filter_missing_data(arg.m_filter[0], arg.m_filter[1],
                                       pbar=pbar, use_main_table=True)

    # Filtering by variable sites
    if arg.var_filter:
        print_col("Filtering by variable sites", GREEN, quiet=arg.quiet)
        alignments.filter_segregating_sites(arg.var_filter[0],
                                            arg.var_filter[1],
                                            pbar=pbar)

    # Filtering by informative sites
    if arg.inf_filter:
        print_col("Filtering by variable sites", GREEN, quiet=arg.quiet)
        alignments.filter_informative_sites(arg.inf_filter[0],
                                            arg.inf_filter[1],
                                            pbar=pbar)

    # Concatenation
    if not arg.conversion and not arg.reverse and not arg.consensus:
        print_col("Concatenating", GREEN, quiet=arg.quiet)
        alignments = alignments.concatenate(pbar=pbar)

        # Concatenate zorro files
        if arg.zorro:
            zorro = data.Zorro(alignment_list, arg.zorro)
            zorro.write_to_file(outfile)

    # Collapsing
    if arg.collapse:
        print_col("Collapsing", GREEN, quiet=arg.quiet)
        alignments.collapse(use_main_table=True, pbar=pbar,
                            haplotypes_file=outfile)

    # Gcoder
    if arg.gcoder:
        print_col("Coding gaps", GREEN, quiet=arg.quiet)
        if output_format == ["nexus"]:
            alignments.code_gaps(use_main_table=True, pbar=pbar)

    # Consensus
    if arg.consensus:
        consensus_type = arg.consensus[0]
        print_col("Creating consensus sequences", GREEN, quiet=arg.quiet)
        if arg.consensus_single:
            if isinstance(alignments, seqset.AlignmentList):
                alignments = alignments.consensus(
                    consensus_type, single_file=arg.consensus_single,
                    pbar=pbar, use_main_table=True)
            else:
                alignments.consensus(consensus_type=consensus_type, pbar=pbar,
                                     use_main_table=True)
        else:
            alignments.consensus(consensus_type=consensus_type, pbar=pbar,
                                 use_main_table=True)

    # Write output
    print_col("Writing output", GREEN, quiet=arg.quiet)
    if isinstance(alignments, seqset.Alignment):
        alignments.write_to_file(output_format, outfile,
                                 interleave=interleave,
                                 partition_file=True,
                                 use_charset=True,
                                 ima2_params=arg.ima2_params,
                                 pbar=pbar)
    elif isinstance(alignments, seqset.AlignmentList):
        alignments.write_to_file(output_format,
                                 interleave=interleave,
                                 ima2_params=arg.ima2_params,
                                 pbar=pbar)


[docs]def get_args(arg_list=None, unittest=False):

    # The inclusion of the argument definition in main, makes it possible to
    # import this file as a module and not triggering argparse. The
    # alternative of using a if __name__ == "__main__" statement does not
    # work well with the entry_points parameter of setup.py, since they call
    # the main function but do nothing inside said statement.
    parser = argparse.ArgumentParser(description="Command line interface for "
                                                 "TriFusion Process module")

    # Main execution
    main_exec = parser.add_argument_group("Main execution")
    main_exec.add_argument("-in", dest="infile", nargs="+", help="Provide the "
                           "input file name. If multiple files are provided"
                           ", please separated the names with spaces")
    main_exec.add_argument("-of", dest="output_format", nargs="+",
                           default=["nexus"],
                           choices=["nexus", "phylip", "fasta", "mcmctree",
                                    "ima2", "stockholm", "gphocs"],
                           help="Format of the output file(s). You may select "
                           "multiple output formats simultaneously (default is"
                           " '%(default)s')")
    main_exec.add_argument("-o", dest="outfile", help="Name of the output file")

    # Alternative modes
    alternative = parser.add_argument_group("Alternative execution modes")
    alternative.add_argument("-c", dest="conversion", action="store_const",
                             const=True, help="Used for conversion of the "
                             "input files passed as arguments with the -in "
                             "option. This flag precludes the usage of the -o "
                             "option, as the output file name is automatically "
                             "generated based on the input file name")
    alternative.add_argument("-r", dest="reverse", help="Reverse a concatenated"
                             " file into its original single locus alignments."
                             " A partition file similar to the one read by "
                             "RAxML must be provided")
    alternative.add_argument("-z", "--zorro-suffix", dest="zorro", type=str,
                             help="Use this option if you wish to concatenate "
                             "auxiliary Zorro files associated with each "
                             "alignment. Provide the sufix for the concatenated"
                             " zorro file")
    alternative.add_argument("-p", "--partition-file", dest="partition_file",
                             type=str, help="Using this option and providing "
                             "the partition file will convert it between a "
                             "RAxML or Nexus format")
    alternative.add_argument("-s", dest="select", nargs="*", help="Selects "
                             "alignments containing the provided taxa "
                             "(separate multiple taxa with whitespace)")
    alternative.add_argument("--collapse", dest="collapse",
                             action="store_const",
                             const=True, default=False, help="Use this flag if "
                             "you would like to collapse the input alignment(s)"
                             " into unique haplotypes")
    alternative.add_argument("--code-gaps", dest="gcoder", action="store_const",
                             const=True, default=False, help="Use this flag to "
                             "code the gaps of the alignment into a binary "
                             "state  matrix that is appended to the end of "
                             "the alignment")
    alternative.add_argument("--consensus", dest="consensus", nargs=1,
                             choices=["First sequence", "IUPAC", "Soft mask",
                                      "Remove"],
                             help="Creates a consensus of the final alignments"
                             " specifying how variation is handled")
    alternative.add_argument("--consensus-single-file",
                             dest="consensus_single", action="store_const",
                             const=True, default=False, help="Merges "
                             "consensus sequences in a single file")

    filter_g = parser.add_argument_group("Filter options")
    filter_g.add_argument("--missing-filter", dest="m_filter", nargs=2,
                          help="Use this  option if you wish to filter the"
                          " alignment's missing data. Along with this "
                          "option provide the threshold percentages for "
                          "gap and missing data, respectively (e.g. -filter "
                          "50 75 - filters alignments columns with more "
                          "than 50%% of gap+missing data and columns with "
                          "more than 75%% of true missing data)",
                          type=mfilters)
    filter_g.add_argument("--min-taxa", dest="min_taxa",
                          type=mfilters, help="Set the minimum percentage "
                          "of taxa that needs to be present in an alignment")
    filter_g.add_argument("--contain-taxa", dest="contain_filter",
                          nargs="*", help="Only "
                          "processes alignments that contain the specified "
                          "taxa")
    filter_g.add_argument("--exclude-taxa", dest="exclude_filter", help="Only "
                          "process alignments that do NOT contain the "
                          "specified taxa")
    filter_g.add_argument("--codon-filter", dest="codon_filter", nargs="*",
                          choices=["1", "2", "3"], help="Include only the "
                          "codon positions specified by this option (DNA only)")
    filter_g.add_argument("--variable-filter", dest="var_filter", nargs=2,
                          help="Provide minimum and maximum values of "
                          "variable sites for each alignment. Filters "
                          "alignments with a number of variable sites outside "
                          "the specified range", type=int)
    filter_g.add_argument("--informative-filter", dest="inf_filter", nargs=2,
                          help="Provide minimum and maximum values of "
                          "informative sites for each alignment. Filters "
                          "alignments with a number of informative sites "
                          "outside the specified range", type=int)

    # Formatting options
    formatting = parser.add_argument_group("Formatting options")
    formatting.add_argument("--model", dest="model_phy", default="LG",
                            choices=["DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG",
                                     "RTREV", "CPREV", "VT", "BLOSUM62",
                                     "MTMAM", "LG"],
                            help="This option only applies for the "
                            "concatenation  of protein data into phylip "
                            "format. Specify the model for all partitions "
                            "defined in the partition file  (default is '%("
                            "default)s')")
    formatting.add_argument("--interleave", dest="interleave",
                            action="store_const", const="interleave",
                            default=False, help="Specify this  option to "
                            "write output files in interleave format (currently"
                            " only supported for nexus files")
    formatting.add_argument("--ima2-params", dest="ima2_params", nargs=4,
                            help="Provide 4 additional arguments needed to "
                            "write the output in a format compliant with "
                            "IMa2. The order of the required arguments "
                            "(separated by whitespace is as follows: "
                            "[(1) File name of population mapping]"
                            "[(2) Population tree]"
                            "[(3) Mutational model]"
                            "[(4) Inheritance Scalar]. "
                            "Additional notes: (1) The  population mapping "
                            "file is a simple .csv file  containing two "
                            "columns separated by a semi-colon, "
                            "in which the first column contains the taxon "
                            "name and the second column contains the "
                            "corresponding population name; (2) The order of "
                            "the population names in the population tree "
                            "must be the same as the order in the file with "
                            "the population mapping")

    # Data manipulation
    manipulation = parser.add_argument_group("Data manipulation")
    manipulation.add_argument("-rm", dest="remove", nargs="*", help="Removes "
                              "the  specified taxa from the final alignment. "
                              "Unwanted taxa my be provided in a csv file "
                              "containing 1 column with a species name in "
                              "each line or they may be specified in the "
                              "command line and separated by whitespace")
    manipulation.add_argument("-grep", dest="grep", nargs="*", help="The "
                              "inverse  of the -rm command. It removes all "
                              "taxa from the alignment except for the ones "
                              "specified with this option. Taxa names may be "
                              "specified in a csv file containing 1 column "
                              "with a species name in each line or in the "
                              "command line separated by whitespace")
    # manipulation.add_argument("-outgroup", dest="outgroup_taxa", nargs="*",
    #                           help="Provide taxon names/number for the "
    #                           "outgroup  (This option is only supported for "
    #                           "NEXUS output format files)")

    utilities = parser.add_argument_group("Utilities")
    utilities.add_argument("--get-taxa", dest="get_taxa",
                           action="store_const", const=True,
                           default=False, help="Writes all taxa names into a"
                           " .csv file")

    miscellaneous = parser.add_argument_group("Miscellaneous")
    miscellaneous.add_argument("-quiet", dest="quiet", action="store_const",
                               const=True, default=False, help="Removes all "
                               "terminal output")

    args = parser.parse_args(arg_list)

    # Print help when no arguments are provided
    if len(sys.argv) == 1 and not unittest:
        parser.print_help()
        sys.exit(1)

    return args


[docs]def main():
    arguments = get_args()
    triseq_arg_check(arguments)
    main_parser(arguments, arguments.infile)


if __name__ == "__main__":

    main()


__author__ = "Diogo N. Silva"