Source code for trifusion.ortho.OrthomclToolbox

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
#  Copyright 2012 Unknown <diogo@arch>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.

try:
    from process.sequence import Alignment
    from base.plotter import bar_plot, multi_bar_plot
    from process.error_handling import KillByUser
except ImportError:
    from trifusion.process.sequence import Alignment
    from trifusion.base.plotter import bar_plot, multi_bar_plot
    from trifusion.process.error_handling import KillByUser

from collections import OrderedDict, Counter
import pickle
import os
import sqlite3
from os.path import join
import random
import string
import copy


[docs]class Cluster(object):
    """ Object for clusters of the OrthoMCL groups file. It is useful to set a
     number of attributes that will make subsequent filtration and
     processing much easier """

    def __init__(self, line_string):
        """
        To initialize a Cluster object, only a string compliant with the
        format of a cluster in an OrthoMCL groups file has to be provided.
        This line should contain the name of the group, a colon, and the
        sequences belonging to that group separated by whitespace
        :param line_string: String of a cluster
        """

        # Initializing attributes for parse_string
        self.name = None
        self.sequences = None
        self.species_frequency = {}

        # Initializing attributes for apply filter
        # If the value is different than None, this will inform downstream
        # objects of whether this cluster is compliant with the specified
        # gene_threshold
        self.gene_compliant = None
        # If the value is different than None, this will inform downstream
        # objects of whether this cluster is compliant with the specified
        # species_threshold
        self.species_compliant = None

        self.parse_string(line_string)

[docs]    def parse_string(self, cluster_string):
        """
        Parses the string and sets the group name and sequence list attributes
        """

        fields = cluster_string.split(":")
        # Setting the name and sequence list of the clusters
        self.name = fields[0].strip()
        self.sequences = fields[1].strip().split()

        # Setting the gene frequency for each species in the cluster
        self.species_frequency = Counter([field.split("|")[0] for field in
                                          self.sequences])

[docs]    def remove_taxa(self, taxa_list):
        """
        Removes the taxa contained in taxa_list from self.sequences and
        self.species_frequency
        :param taxa_list: list, each element should be a taxon name
        """

        self.sequences = [x for x in self.sequences if x.split("|")[0]
                          not in taxa_list]

        self.species_frequency = dict((taxon, val) for taxon, val in
                                      self.species_frequency.items()
                                      if taxon not in taxa_list)

[docs]    def apply_filter(self, gene_threshold, species_threshold):
        """
        This method will update two Cluster attributes, self.gene_flag and
        self.species_flag, which will inform downstream objects if this
        cluster respects the gene and species threshold
        :param gene_threshold: Integer for the maximum number of gene copies
        per species
        :param species_threshold: Integer for the minimum number of species
        present
        """

        # Check whether cluster is compliant with species_threshold
        if len(self.species_frequency) >= species_threshold and \
                species_threshold:
            self.species_compliant = True
        else:
            self.species_compliant = False

        # Check whether cluster is compliant with gene_threshold
        if max(self.species_frequency.values()) <= gene_threshold and \
                gene_threshold:
            self.gene_compliant = True
        else:
            self.gene_compliant = False


[docs]class OrthoGroupException(Exception):
    pass


[docs]class GroupLight(object):
    """
    Analogous to Group object but with several changes to reduce memory usage
    """

    def __init__(self, groups_file, gene_threshold=None,
                 species_threshold=None, ns=None):

        self.gene_threshold = gene_threshold if gene_threshold else None
        self.species_threshold = species_threshold if species_threshold \
            else None

        # Attribute containing the list of included species
        self.species_list = []
        # Attribute that will contain taxa to be excluded from analyses
        self.excluded_taxa = []
        self.species_frequency = []

        # Attributes that will store the number (int) of cluster after gene and
        # species filter
        self.all_clusters = 0
        self.num_gene_compliant = 0
        self.num_species_compliant = 0
        self.all_compliant = 0

        # Attribute containing the total number of sequences
        self.total_seqs = 0
        # Attribute containing the maximum number of extra copies found in the
        # clusters
        self.max_extra_copy = 0

        # Attribute with name of the group file, which will be an ID
        self.name = os.path.abspath(groups_file)
        self.table = groups_file.split(os.sep)[-1].split(".")[0]

        # Initialize atribute containing the groups filtered using the gene and
        # species threshold. This attribute can be updated at any time using
        # the update_filtered_group method
        self.filtered_groups = []

        self._parse_groups(ns)

        if type(self.species_threshold) is float:
            self._get_sp_proportion()

[docs]    def groups(self):
        """
        Generator for group file. This replaces the self.groups attribute of
        the original Group Object. Instead of loading the whole file into
        memory, a generator is created to iterate over its contents. It may
        run a bit slower but its a lot more memory efficient.
        :return:
        """

        file_handle = open(self.name)
        for line in file_handle:
            if line.strip() != "":
                yield line.strip()

[docs]    def iter_species_frequency(self):
        """
        In order to prevent permanent changes to the species_frequency
        attribute due to the filtering of taxa, this iterable should be used
        instead of the said variable. This creates a temporary deepcopy of
        species_frequency which will be iterated over and eventually modified.
        """

        # Since the items of species_frequency are mutable, this ensures
        # that even those objects are correctly cloned
        sp_freq = copy.deepcopy(self.species_frequency)

        for cl in sp_freq:
            yield cl

    def _remove_tx(self, line):
        """
        Given a group line, remove all references to the excluded taxa
        :param line: raw group file line
        """

        new_line = "{}:".format(line.split(":")[0])

        tx_str = "\t".join([x for x in line.split(":")[1].split() if
                           x.split("|")[0] not in self.excluded_taxa])

        return new_line + tx_str

    def _apply_filter(self, cl):
        """
        Sets or updates the basic group statistics, such as the number of
        orthologs compliant with the gene copy and minimum taxa filters.

        :param cl: dictionary. Contains the number of occurrences for each
         taxon present in the ortholog cluster
         (e.g. {"taxonA": 2, "taxonB": 1).
        """

        # First, remove excluded taxa from the cl object since this will
        # impact all other filters
        for tx in self.excluded_taxa:
            cl.pop(tx, None)

        if cl:

            self.all_clusters += 1

            extra_copies = max(cl.values())
            if extra_copies > self.max_extra_copy:
                self.max_extra_copy = extra_copies

            if extra_copies <= self.gene_threshold and self.gene_threshold and\
                len(cl) >= self.species_threshold and  \
                    self.species_threshold:
                self.num_gene_compliant += 1
                self.num_species_compliant += 1
                self.all_compliant += 1

            elif (extra_copies <= self.gene_threshold and
                      self.gene_threshold) or self.gene_threshold == 0:
                self.num_gene_compliant += 1

            elif len(cl) >= self.species_threshold and \
                    self.species_threshold:
                self.num_species_compliant += 1

    def _get_compliance(self, cl):
        """
        Determines whether an ortholog cluster is compliant with the specified
        ortholog filters.
        :param ccl: dictionary. Contains the number of occurrences for each
         taxon present in the ortholog cluster
         (e.g. {"taxonA": 2, "taxonB": 1).

        :return: tuple. The first element refers to the gene copy filter
        while the second refers to the minimum taxa filter. Values of 1
        indicate that the ortholg cluster is compliant.
        """

        for tx in self.excluded_taxa:
            cl.pop(tx, None)

        if cl:

            cp = max(cl.values())

            if not self.gene_threshold and not self.species_threshold:
                return 1, 1

            if cp <= self.gene_threshold and self.gene_threshold and\
                len(cl) >= self.species_threshold and  \
                    self.species_threshold:
                return 1, 1

            elif (cp <= self.gene_threshold and self.gene_threshold) or \
                    not self.gene_threshold:
                return 1, 0

            elif (len(cl) >= self.species_threshold and
                    self.species_threshold) or not self.species_threshold:
                return 0, 1

            else:
                return 0, 0

    def _reset_counter(self):

        self.all_clusters = 0
        self.num_gene_compliant = 0
        self.num_species_compliant = 0
        self.all_compliant = 0

    def _parse_groups(self, ns=None):

        for cl in self.groups():

            if ns:
                if ns.stop:
                    raise KillByUser("")

            # Retrieve the field containing the ortholog sequences
            sequence_field = cl.split(":")[1]

            # Update species frequency list
            sp_freq = Counter((x.split("|")[0] for x in
                              sequence_field.split()))

            self.species_frequency.append(sp_freq)

            # Update number of sequences
            self.total_seqs += len(sequence_field)

            # Update max number of extra copies
            extra_copies = max(sp_freq.values())
            if extra_copies > self.max_extra_copy:
                self.max_extra_copy = max(sp_freq.values())

            self.species_list.extend([x for x in sp_freq if x not in
                                      self.species_list])

            # Apply filters, if any
            # gene filter
            if self.species_threshold and self.gene_threshold:
                self._apply_filter(sp_freq)

[docs]    def exclude_taxa(self, taxa_list, update_stats=False):
        """
        Updates the excluded_taxa attribute and updates group statistics if
        update_stats is True. This does not change the Group object data
        permanently, only sets an attribute that will be taken into account
        when plotting and exporting data.
        :param taxa_list: list. List of taxa that should be excluded from
        downstream operations
        :param update_stats: boolean. If True, it will update the group
        statistics
        """

        # IF the taxa_list is the same as the excluded_taxa attribute,
        # there is nothing to do
        if sorted(taxa_list) == sorted(self.excluded_taxa):
            return

        self.species_list = [x for x in self.species_list + self.excluded_taxa
                             if x not in taxa_list]

        self.excluded_taxa = taxa_list

        if update_stats:
            self._reset_counter()
            for cl in self.iter_species_frequency():
                self._apply_filter(cl)

[docs]    def basic_group_statistics(self, update_stats=True):

        if update_stats:
            self._reset_counter()
            for cl in self.iter_species_frequency():
                self._apply_filter(cl)

        return len(self.species_frequency), self.total_seqs, \
            self.num_gene_compliant, self.num_species_compliant, \
            self.all_compliant

    def _get_sp_proportion(self):
        """
        When the species filter is a float value between 0 and 1, convert
        this proportion into absolute values (rounded up), since filters were
        already designed for absolutes.
        """

        self.species_threshold = int(self.species_threshold *
                                     len(self.species_list))

[docs]    def update_filters(self, gn_filter, sp_filter, update_stats=False):
        """
        Updates the group filter attributes and group summary stats if
        update_stats is True. This method does not change the
        data of the Group object, only sets attributes that will be taken into
        account when plotting or exporting data
        :param gn_filter: integer. Maximum number of gene copies allowed in an
        ortholog cluster
        :param sp_filter: integer/float. Minimum number/proportion of taxa
        representation
        :param update_stats: boolean. If True it will update the group summary
        statistics
        """

        # If the provided filters are the same as the current group attributes
        # there is nothing to do
        if (gn_filter, sp_filter) == (self.gene_threshold,
                                      self.species_threshold):
            return

        self.gene_threshold = gn_filter
        self.species_threshold = sp_filter

        if type(self.species_threshold) is float:
            self._get_sp_proportion()

        if update_stats:
            self._reset_counter()
            for cl in self.iter_species_frequency():
                self._apply_filter(cl)

[docs]    def retrieve_sequences(self, sqldb, protein_db, dest="./",
                             shared_namespace=None, outfile=None):
        """
        :param sqldb: srting. Path to sqlite database file
        :param protein_db: string. Path to protein database file
        :param dest: string. Directory where sequences will be exported
        :param shared_namespace: Namespace object to communicate with
        TriFusion's main process
        :param outfile: If set, all sequeces will be instead saved in a
        single output file. This is used for the nucleotide sequence export
        :return:
        """

        if not os.path.exists(dest) and not outfile:
            os.makedirs(dest)

        if shared_namespace:
            shared_namespace.act = shared_namespace.msg = "Creating database"
            # Stores sequences that could not be retrieved
            shared_namespace.missed = shared_namespace.counter = 0
            shared_namespace.progress = 0
            # Get number of lines of protein database
            p = 0
            with open(protein_db) as fh:
                for p, _ in enumerate(fh):
                    pass
            shared_namespace.max_pb = shared_namespace.total = p + 1

        # Connect to database
        conn = sqlite3.connect(sqldb, timeout=100000)
        c = conn.cursor()
        table_name = "".join([x for x in protein_db if x.isalnum()]).encode(
            "utf8")

        # Create table if it does not exist
        if not c.execute("SELECT name FROM sqlite_master WHERE type='table' "
                         "AND name='{}'".format(table_name)).fetchall():

            c.execute("CREATE TABLE {} (seq_id text PRIMARY KEY, seq text)".
                      format(table_name))

            # Populate database
            with open(protein_db) as ph:
                seq = ""
                for line in ph:

                    # Kill switch
                    if shared_namespace:
                        if shared_namespace.stop:
                            conn.close()
                            raise KillByUser("")
                        shared_namespace.progress += 1
                        shared_namespace.counter += 1

                    if line.startswith(">"):
                        seq_id = line.strip()[1:]
                        if seq != "":
                            c.execute("INSERT INTO {} VALUES (?, ?)".
                                      format(table_name), (seq_id, seq))
                        seq = ""
                    else:
                        seq += line.strip()

            conn.commit()

        if shared_namespace:
            shared_namespace.act = shared_namespace.msg = "Fetching sequences"
            shared_namespace.good = shared_namespace.counter = 0
            shared_namespace.progress = 0
            shared_namespace.max_pb = shared_namespace.total = \
                self.all_compliant

        # Set single output file, if option is set
        if outfile:
            output_handle = open(join(dest, outfile), "w")

        # Fetching sequences
        for line, cl in zip(self.groups(), self.iter_species_frequency()):

            # Kill switch
            if shared_namespace:
                if shared_namespace.stop:
                    conn.close()
                    raise KillByUser("")

            # Filter sequences
            if self._get_compliance(cl) == (1, 1):

                if shared_namespace:
                    shared_namespace.good += 1
                    shared_namespace.progress += 1
                    shared_namespace.counter += 1

                # Retrieve sequences from current cluster
                if self.excluded_taxa:
                    line = self._remove_tx(line)
                fields = line.split(":")

                # Open file
                if not outfile:
                    cl_name = fields[0]
                    output_handle = open(os.path.join(dest, cl_name) + ".fas",
                                         "w")

                seqs = fields[-1].split()
                for i in seqs:
                    # Query database
                    c.execute("SELECT * FROM {} WHERE seq_id = ?".
                              format(table_name), (i,))
                    vals = c.fetchone()
                    # Handles cases where the sequence could not be retrieved
                    # If outfile is set, output_handle will be a single file
                    # for all groups. If not, it will represent an individual
                    # group file
                    try:
                        output_handle.write(">{}\n{}\n".format(vals[0],
                                                               vals[1]))
                    except TypeError:
                        pass

                if not outfile:
                    output_handle.close()

        if outfile:
            output_handle.close()

        conn.close()

[docs]    def export_filtered_group(self, output_file_name="filtered_groups",
                                 dest="./", shared_namespace=None):

        if shared_namespace:
            shared_namespace.act = "Exporting filtered orthologs"
            shared_namespace.missed = 0
            shared_namespace.good = 0

        output_handle = open(os.path.join(dest, output_file_name), "w")

        for p, (line, cl) in enumerate(zip(self.groups(),
                                           self.iter_species_frequency())):

            if shared_namespace:
                if shared_namespace.stop:
                    raise KillByUser("")

            if shared_namespace:
                shared_namespace.progress = p

            if self._get_compliance(cl) == (1, 1):
                if shared_namespace:
                    shared_namespace.good += 1
                if self.excluded_taxa:
                    l = self._remove_tx(line)
                else:
                    l = line
                output_handle.write("{}\n".format(l))

        output_handle.close()

[docs]    def bar_species_distribution(self, filt=False):

        if filt:
            data = Counter((len(cl) for cl in self.iter_species_frequency() if
                           self._get_compliance(cl) == (1, 1)))
        else:
            data = Counter((len(cl) for cl in self.species_frequency))

        x_labels = [x for x in list(data)]
        data = list(data.values())

        # When data is empty, return an exception
        if not data:
            return {"data": None}

        # Sort lists
        x_labels = [list(x) for x in zip(*sorted(zip(x_labels, data)))][0]

        # Convert label to strings
        x_labels = [str(x) for x in x_labels]

        title = "Taxa frequency distribution"
        ax_names = ["Number of taxa", "Ortholog frequency"]

        return {"data": [data],
                "title": title,
                "ax_names": ax_names,
                "labels": x_labels,
                "table_header": ["Number of species",
                                      "Ortholog frequency"]}

[docs]    def bar_genecopy_distribution(self, filt=False):
        """
        Creates a bar plot with the distribution of gene copies across
        clusters
        :param filt: Boolean, whether or not to use the filtered groups.
        """

        if filt:
            data = Counter((max(cl.values()) for cl in
                            self.iter_species_frequency() if
                            self._get_compliance(cl) == (1, 1)))
        else:
            data = Counter((max(cl.values()) for cl in self.species_frequency
                           if cl))

        x_labels = [x for x in list(data)]
        data = list(data.values())

        # When data is empty, return an exception
        if not data:
            return {"data": None}

        x_labels, data = (list(x) for x in zip(*sorted(zip(x_labels, data))))

        # Convert label to strings
        x_labels = [str(x) for x in x_labels]

        title = "Gene copy distribution"
        ax_names = ["Number of gene copies", "Ortholog frequency"]

        return {"data": [data],
                "labels": x_labels,
                "title": title,
                "ax_names": ax_names,
                "table_header": ["Number of gene copies",
                                 "Ortholog frequency"]}

[docs]    def bar_species_coverage(self, filt=False):
        """
        Creates a stacked bar plot with the proportion of
        :return:
        """

        data = Counter(dict((x, 0) for x in self.species_list))

        self._reset_counter()

        for cl in self.iter_species_frequency():

            self._apply_filter(cl)
            if filt:
                data += Counter(dict((x, 1) for x, y in cl.items() if y > 0 and
                           self._get_compliance(cl) == (1, 1)))
            else:
                data += Counter(dict((x, 1) for x, y in cl.items() if y > 0))

        data = data.most_common()

        # When data is empty, return an exception
        if not data:
            return {"data": None}

        x_labels = [str(x[0]) for x in data]
        data = [[x[1] for x in data], [self.all_clusters - x[1] if not
                                      filt else self.all_compliant - x[1]
                                      for x in data]]

        lgd_list = ["Available data", "Missing data"]
        ax_names = [None, "Ortholog frequency"]

        return {"data": data,
                "labels": x_labels,
                "lgd_list": lgd_list,
                "ax_names": ax_names}

[docs]    def bar_genecopy_per_species(self, filt=False):

        data = Counter(dict((x, 0) for x in self.species_list))

        self._reset_counter()

        for cl in self.iter_species_frequency():

            self._apply_filter(cl)
            if filt:
                data += Counter(dict((x, y) for x, y in cl.items() if y > 1 and
                           self._get_compliance(cl) == (1, 1)))
            else:
                data += Counter(dict((x, y) for x, y in cl.items() if y > 1))

        data = data.most_common()

        # When data is empty, return an exception
        if not data:
            return {"data": None}

        x_labels = [str(x[0]) for x in data]
        data = [[x[1] for x in data]]
        ax_names = [None, "Gene copies"]

        return {"data": data,
                "labels": x_labels,
                "ax_names": ax_names}


[docs]class Group(object):
    """ This represents the main object of the orthomcl toolbox module. It is
     initialized with a file name of a orthomcl groups file and provides
     several methods that act on that group file. To process multiple Group
     objects, see MultiGroups object """

    def __init__(self, groups_file, gene_threshold=None,
                 species_threshold=None, project_prefix="MyGroups"):

        # Initializing thresholds. These may be set from the start, or using
        #  some method that uses them as arguments
        self.gene_threshold = gene_threshold
        self.species_threshold = species_threshold

        # Attribute containing the list of included species
        self.species_list = []
        # Attribute that will contain taxa to be excluded from analyses
        self.excluded_taxa = []

        # Attributes that will store the number (int) of cluster after gene and
        # species filter
        self.all_compliant = 0
        self.num_gene_compliant = 0
        self.num_species_compliant = 0

        # Attribute containing the total number of sequences
        self.total_seqs = 0
        # Attribute containing the maximum number of extra copies found in the
        # clusters
        self.max_extra_copy = 0

        # Attribute with name of the group file, which will be an ID
        self.group_name = groups_file
        # Initialize the project prefix for possible output files
        self.prefix = project_prefix
        # Initialize attribute containing the original groups
        self.groups = []
        # Initialize atribute containing the groups filtered using the gene and
        # species threshold. This attribute can be updated at any time using
        # the update_filtered_group method
        self.filtered_groups = []
        self.name = None
        # Parse groups file and populate groups attribute
        self.__parse_groups(groups_file)

    def __parse_groups(self, groups_file):
        """
        Parses the ortholog clusters in the groups file and populates the
         self.groups list with Cluster objects for each line in the groups file.
        :param groups_file: File name for the orthomcl groups file
        :return: populates the groups attribute
        """

        self.name = groups_file
        self.species_list = []
        groups_file_handle = open(groups_file)

        for line in groups_file_handle:
            cluster_object = Cluster(line)

            # Add cluster to general group list
            self.groups.append(cluster_object)

            # Update total sequence counter
            self.total_seqs += len(cluster_object.sequences)

            # Update maximum number of extra copies, if needed
            if max(cluster_object.species_frequency.values()) > \
                    self.max_extra_copy:
                self.max_extra_copy = \
                    max(cluster_object.species_frequency.values())

            # Update species_list attribute
            self.species_list = list(set(self.species_list).union(
                set(cluster_object.species_frequency.keys())))

            # If thresholds have been specified, update self.filtered_groups
            # attribute
            if self.species_threshold and self.gene_threshold:
                cluster_object.apply_filter(self.gene_threshold,
                                            self.species_threshold)
                if cluster_object.species_compliant and \
                        cluster_object.gene_compliant:
                    # Add cluster to the filtered group list
                    self.filtered_groups.append(cluster_object)
                    self.all_compliant += 1

                # Update num_species_compliant attribute
                if cluster_object.species_compliant:
                    self.num_species_compliant += 1
                # Update num_gene_compliant attribute
                if cluster_object.gene_compliant:
                    self.num_gene_compliant += 1

[docs]    def exclude_taxa(self, taxa_list):
        """
        Adds a taxon_name to the excluded_taxa list and updates the
        filtered_groups list
        """

        self.excluded_taxa.extend(taxa_list)

        # Storage variable for new filtered groups
        filtered_groups = []

        # Reset max_extra_copy attribute
        self.max_extra_copy = 0

        for cl in self.groups:
            cl.remove_taxa(taxa_list)
            if cl.iter_sequences and cl.species_frequency:
                filtered_groups.append(cl)

                # Update maximum number of extra copies, if needed
                if max(cl.species_frequency.values()) > self.max_extra_copy:
                    self.max_extra_copy = max(cl.species_frequency.values())

        # Update species_list
        self.species_list = sorted(list(set(self.species_list) -
                                        set(taxa_list)))

        self.filtered_groups = self.groups = filtered_groups

[docs]    def get_filters(self):
        """
        Returns a tuple with the thresholds for max gene copies and min species
        """

        return self.gene_threshold, self.species_threshold

[docs]    def basic_group_statistics(self):
        """
        This method creates a basic table in list format containing basic
        information of the groups file (total number of clusters, total number
        of sequences, number of clusters below the gene threshold, number of
        clusters below the species threshold and number of clusters below the
        gene AND species threshold)
        :return: List containing number of

        [total clusters,
         total sequences,
         clusters above gene threshold,
         clusters above species threshold,
         clusters above gene and species threshold]
        """

        # Total number of clusters
        total_cluster_num = len(self.groups)

        # Total number of sequenes
        total_sequence_num = self.total_seqs

        # Gene compliant clusters
        clusters_gene_threshold = self.num_gene_compliant

        # Species compliant clusters
        clusters_species_threshold = self.num_species_compliant

        clusters_all_threshold = len(self.filtered_groups)

        statistics = [total_cluster_num, total_sequence_num,
                      clusters_gene_threshold, clusters_species_threshold,
                      clusters_all_threshold]

        return statistics

[docs]    def paralog_per_species_statistic(self, output_file_name=
                                      "Paralog_per_species.csv", filt=True):
        """
        This method creates a CSV table with information on the number of
        paralog clusters per species
        :param output_file_name: string. Name of the output csv file
        :param filt: Boolean. Whether to use the filtered groups (True) or
        total groups (False)
        """

        # Setting which clusters to use
        if filt:
            groups = self.filtered_groups
        else:
            groups = self.groups

        paralog_count = dict((species, 0) for species in self.species_list)

        for cluster in groups:
            for species in paralog_count:
                if cluster.species_frequency[species] > 1:
                    paralog_count[species] += 1

        # Writing table
        output_handle = open(output_file_name, "w")
        output_handle.write("Species; Clusters with paralogs\n")

        for species, val in paralog_count.items():
            output_handle.write("%s; %s\n" % (species, val))

        output_handle.close()

[docs]    def export_filtered_group(self, output_file_name="filtered_groups",
                              dest="./", get_stats=False,
                              shared_namespace=None):
        """
        Export the filtered groups into a new file.
        :param output_file_name: string, name of the filtered groups file
        :param dest: string, path to directory where the filtered groups file
        will be created
        :param get_stats: Boolean, whether to return the basic count stats or
        not
        :param shared_namespace: Namespace object, for communicating with
        main process.
        """

        if self.filtered_groups:

            if shared_namespace:
                shared_namespace.act = "Exporting filtered orthologs"

            output_handle = open(os.path.join(dest, output_file_name), "w")

            if get_stats:
                all_orthologs = len(self.groups)
                sp_compliant = 0
                gene_compliant = 0
                final_orthologs = 0

            for cluster in self.filtered_groups:

                if shared_namespace:
                    shared_namespace.progress = \
                        self.filtered_groups.index(cluster)

                if cluster.species_compliant and cluster.gene_compliant:
                    output_handle.write("%s: %s\n" % (
                                    cluster.name, " ".join(cluster.iter_sequences)))
                    if get_stats:
                        final_orthologs += 1
                if get_stats:
                    if cluster.species_compliant:
                        sp_compliant += 1
                    if cluster.gene_compliant:
                        gene_compliant += 1

            output_handle.close()

            if get_stats:
                return all_orthologs, sp_compliant, gene_compliant,\
                           final_orthologs

        else:
            raise OrthoGroupException("The groups object must be filtered "
                                      "before using the export_filtered_group"
                                      "method")

[docs]    def update_filters(self, gn_filter, sp_filter):
        """
        Sets new values for the self.species_threshold and self.gene_threshold
        and updates the filtered_group
        :param gn_filter: int. Maximum value for gene copies in cluster
        :param sp_filter:  int. Minimum value for species in cluster
        """

        self.species_threshold = int(sp_filter)
        self.gene_threshold = int(gn_filter)

        self.update_filtered_group()

[docs]    def update_filtered_group(self):
        """
        This method creates a new filtered group variable, like
        export_filtered_group, but instead of writing into a new file, it
        replaces the self.filtered_groups variable
        """

        self.filtered_groups = []

        # Reset gene and species compliant counters
        self.num_gene_compliant = 0
        self.num_species_compliant = 0

        for cluster in self.groups:
            cluster.apply_filter(self.gene_threshold, self.species_threshold)
            if cluster.species_compliant and cluster.gene_compliant:
                self.filtered_groups.append(cluster)

            # Update num_species_compliant attribute
            if cluster.species_compliant:
                self.num_species_compliant += 1
            # Update num_gene_compliant attribute
            if cluster.gene_compliant:
                self.num_gene_compliant += 1

[docs]    def retrieve_sequences(self, database, dest="./", mode="fasta",
                             filt=True, shared_namespace=None):
        """
        When provided with a database in Fasta format, this will use the
        Alignment object to retrieve sequences
        :param database: String. Fasta file
        :param dest: directory where files will be save
        :param mode: string, whether to retrieve sequences to a file ('fasta'),
        or a dictionary ('dict')
        :param filt: Boolean. Whether to use the filtered groups (True) or
        total groups (False)
        :param shared_namespace: Namespace object. This argument is meant for
        when fast are retrieved in a background process, where there is a need
        to update the main process of the changes in this method
        :param dest: string. Path to directory where the retrieved sequences
        will be created.
        """

        if mode == "dict":
            seq_storage = {}

        if filt:
            groups = self.filtered_groups
        else:
            groups = self.groups

        if not os.path.exists("Orthologs"):
            os.makedirs("Orthologs")

        # Update method progress
        if shared_namespace:
            shared_namespace.act = "Creating database"
            shared_namespace.progress = 0

        print("Creating db")
        # Check what type of database was provided
        #TODO: Add exception handling if file is not parsed with Aligment
        if isinstance(database, str):
            try:
                db_aln = pickle.load(open(database, "rb"))
            except (EnvironmentError, pickle.UnpicklingError):
                db_aln = Alignment(database)
                db_aln = db_aln.alignment
        elif isinstance(database, dict):
            db_aln = database
        else:
            raise OrthoGroupException("The input database is neither a string"
                                      "nor a dictionary object")

        print("Retrieving seqs")
        # Update method progress
        if shared_namespace:
            shared_namespace.act = "Retrieving sequences"
        for cluster in groups:

            if shared_namespace:
                shared_namespace.progress += 1

            if mode == "dict":
                seq_storage[cluster.name] = []

            output_handle = open(join(dest, cluster.name + ".fas"), "w")
            for sequence_id in cluster.iter_sequences:
                seq = db_aln[sequence_id]
                if mode == "fasta":
                    output_handle.write(">%s\n%s\n" % (sequence_id, seq))
                elif mode == "dict":
                    seq_storage[cluster.name].append([sequence_id.split("|")[0],
                                                      seq])
            output_handle.close()

        if mode == "dict":
            return seq_storage

[docs]    def bar_species_distribution(self, dest="./", filt=False, ns=None,
                                 output_file_name="Species_distribution"):
        """
        Creates a bar plot with the distribution of species numbers across
        clusters
        :param dest: string, destination directory
        :param filt: Boolean, whether or not to use the filtered groups.
        :param output_file_name: string, name of the output file
        """

        data = []

        # Determine which groups to use
        if filt:
            groups = self.filtered_groups
        else:
            groups = self.groups

        for i in groups:
            if ns:
                if ns.stop:
                    raise KillByUser("")
            data.append(len([x for x, y in i.species_frequency.items()
                             if y > 0]))

        # Transform data into histogram-like
        transform_data = Counter(data)
        x_labels = [x for x in list(transform_data)]
        y_vals = list(transform_data.values())

        # Sort lists
        x_labels, y_vals = (list(x) for x in zip(*sorted(zip(x_labels,
                                                             y_vals))))
        # Convert label to strings
        x_labels = [str(x) for x in x_labels]

        if ns:
            if ns.stop:
                raise KillByUser("")

        # Create plot
        b_plt, lgd, _ = bar_plot([y_vals], x_labels,
                                 title="Taxa frequency distribution",
                                 ax_names=["Number of taxa", "Ortholog frequency"])
        b_plt.savefig(os.path.join(dest, output_file_name), bbox_inches="tight",
                      dpi=400)

        # Create table
        table_list = [["Number of species", "Ortholog frequency"]]
        for x, y in zip(x_labels, y_vals):
            table_list.append([x, y])

        return b_plt, lgd, table_list

[docs]    def bar_genecopy_distribution(self, dest="./", filt=False,
                                output_file_name="Gene_copy_distribution.png"):
        """
        Creates a bar plot with the distribution of gene copies across
        clusters
        :param dest: string, destination directory
        :param filt: Boolean, whether or not to use the filtered groups.
        :param output_file_name: string, name of the output file
        """

        data = []

        # Determin which groups to use
        if filt:
            groups = self.filtered_groups
        else:
            groups = self.groups

        for cl in groups:
            # Get max number of copies
            max_copies = max(cl.species_frequency.values())

            data.append(max_copies)

        # Transform data into histogram-like
        transform_data = Counter(data)
        x_labels = [x for x in list(transform_data)]
        y_vals = list(transform_data.values())

        # Sort lists
        x_labels, y_vals = (list(x) for x in zip(*sorted(zip(x_labels,
                                                             y_vals))))
        # Convert label to strings
        x_labels = [str(x) for x in x_labels]

        # Create plot
        b_plt, lgd, _ = bar_plot([y_vals], x_labels,
                    title="Gene copy distribution",
                    ax_names=["Number of gene copies", "Ortholog frequency"],
                    reverse_x=False)
        b_plt.savefig(os.path.join(dest, output_file_name), bbox_inches="tight",
                      figsize=(8 * len(x_labels) / 4, 6), dpi=200)

        # Create table
        table_list = [["Number of gene copies", "Ortholog frequency"]]
        for x, y in zip(x_labels, y_vals):
            table_list.append([x, y])

        return b_plt, lgd, table_list

[docs]    def bar_species_coverage(self, dest="./", filt=False, ns=None,
                             output_file_name="Species_coverage"):
        """
        Creates a stacked bar plot with the proportion of
        :return:
        """

        # Determine which groups to use
        if filt:
            groups = self.filtered_groups
        else:
            groups = self.groups

        data = Counter(dict((x, 0) for x in self.species_list))

        for cl in groups:
            if ns:
                if ns.stop:
                    raise KillByUser("")
            data += Counter(dict((x, 1) for x, y in cl.species_frequency.items()
                            if y > 0))

        xlabels = [str(x) for x in list(data.keys())]
        data = [list(data.values()), [len(groups) - x for x in
                                      data.values()]]

        lgd_list = ["Available data", "Missing data"]

        if ns:
            if ns.stop:
                raise KillByUser("")

        b_plt, lgd, _ = bar_plot(data, xlabels, lgd_list=lgd_list,
                              ax_names=[None, "Ortholog frequency"])
        b_plt.savefig(os.path.join(dest, output_file_name), bbox_inches="tight",
                      dpi=200)

        return b_plt, lgd, ""


[docs]class MultiGroups(object):
    """ Creates an object composed of multiple Group objects """

    def __init__(self, groups_files=None, gene_threshold=None,
                 species_threshold=None, project_prefix="MyGroups"):
        """
        :param groups_files: A list containing the file names of the multiple
        group files
        :return: Populates the self.multiple_groups attribute
        """

        # If a MultiGroups is initialized with duplicate Group objects, these
        # will be stored in a list. If all Group objects are unique, the list
        # will remain empty
        self.duplicate_groups = []

        # Initializing thresholds. These may be set from the start, or using
        # some method that uses them as arguments
        self.gene_threshold = gene_threshold
        self.species_threshold = species_threshold

        self.prefix = project_prefix

        self.multiple_groups = {}
        self.filters = {}

        if groups_files:
            for group_file in groups_files:

                # If group_file is already a Group object, just add it
                if not isinstance(group_file, Group):
                    # Check for duplicate group files
                    group_object = Group(group_file, self.gene_threshold,
                                         self.species_threshold)
                else:
                    group_object = group_file

                if group_object.name in self.multiple_groups:
                    self.duplicate_groups.append(group_object.name)
                else:
                    self.multiple_groups[group_object.name] = group_object
                    self.filters[group_object.name] = (1,
                                                len(group_object.species_list))

    def __iter__(self):

        return iter(self.multiple_groups)

[docs]    def iter_gnames(self):

        return (x.name for x in self.multiple_groups)

[docs]    def get_gnames(self):

        return [x.name for x in self.multiple_groups]

[docs]    def add_group(self, group_obj):
        """
        Adds a group object
        :param group_obj: Group object
        """

        # Check for duplicate groups
        if group_obj.name in self.multiple_groups:
            self.duplicate_groups.append(group_obj.name)
        else:
            self.multiple_groups[group_obj.name] = group_obj

[docs]    def remove_group(self, group_id):
        """
        Removes a group object according to its name
        :param group_id: string, name matching a Group object name attribute
        """

        if group_id in self.multiple_groups:
            del self.multiple_groups[group_id]

[docs]    def get_group(self, group_id):
        """
        Returns a group object based on its name. If the name does not match
        any group object, returns None
        :param group_id: string. Name of group object
        """

        try:
            return self.multiple_groups[group_id]
        except KeyError:
            return

[docs]    def add_multigroups(self, multigroup_obj):
        """
        Merges a MultiGroup object
        :param multigroup_obj: MultiGroup object
        """

        for group_obj in multigroup_obj:
            self.add_group(group_obj)

[docs]    def update_filters(self, gn_filter, sp_filter, group_names=None,
                       default=False):
        """
        This will not change the Group object themselves, only the filter
        mapping. The filter is only applied when the Group object is retrieved
        to reduce computations
        :param gn_filter: int, filter for max gene copies
        :param sp_filter: int, filter for min species
        :param group_names: list, with names of group objects
        """

        if group_names:
            for group_name in group_names:
                # Get group object
                group_obj = self.multiple_groups[group_name]
                # Define filters
                gn_filter = gn_filter if not default else 1
                sp_filter = sp_filter if not default else \
                    len(group_obj.species_list)
                # Update Group object with new filters
                group_obj.update_filters(gn_filter, sp_filter)
                # Update filter map
                self.filters[group_name] = (gn_filter, sp_filter)

            for group_name, group_obj in self.multiple_groups.items():
                # Define filters
                gn_filter = gn_filter if not default else 1
                sp_filter = sp_filter if not default else \
                    len(group_obj.species_list)
                # Update Group object with new filters
                group_obj.update_filters(gn_filter, sp_filter)
                # Update filter map
                self.filters[group_name] = (gn_filter, sp_filter)

[docs]    def basic_multigroup_statistics(self, output_file_name=
                                    "multigroup_base_statistics.csv"):
        """
        :param output_file_name:
        :return:
        """

        # Creates the storage for the statistics of the several files
        statistics_storage = OrderedDict()

        for group in self.multiple_groups:
            group_statistics = group.basic_group_statistics()
            statistics_storage[group.name] = group_statistics

        output_handle = open(self.prefix + "." + output_file_name, "w")
        output_handle.write("Group file; Total clusters; Total sequences; "
                            "Clusters below gene threshold; Clusters above "
                            "species threshold; Clusters below gene and above"
                            " species thresholds\n")

        for group, vals in statistics_storage.items():
            output_handle.write("%s; %s\n" % (group, ";".join([str(x) for x
                                                               in vals])))

        output_handle.close()

[docs]    def bar_orthologs(self, output_file_name="Final_orthologs",
                             dest="./", stats="total"):
        """
        Creates a bar plot with the final ortholog values for each group file
        :param output_file_name: string. Name of output file
        :param dest: string. output directory
        :param stats: string. The statistics that should be used to generate
        the bar plot. Options are:
            ..: "1": Total orthologs
            ..: "2": Species compliant orthologs
            ..: "3": Gene compliant orthologs
            ..: "4": Final orthologs
            ..: "all": All of the above
            Multiple combinations can be provided, for instance: "123" will
            display bars for total, species compliant and gene compliant stats
        """

        # Stores the x-axis labels
        x_labels = []
        # Stores final ortholog values for all 4 possible data sets
        vals = [[], [], [], []]
        lgd = ["Total orthologs", "After species filter", "After gene filter",
               "Final orthologs"]

        # Get final ortholog values
        for g_obj in self.multiple_groups:

            x_labels.append(g_obj.name.split(os.sep)[-1])
            # Populate total orthologs
            if "1" in stats or stats == "all":
                vals[0].append(len(g_obj.groups))
            # Populate species compliant orthologs
            if "2" in stats or stats == "all":
                vals[1].append(g_obj.num_species_compliant)
            # Populate gene compliant orthologs
            if "3" in stats or stats == "all":
                vals[2].append(g_obj.num_gene_compliant)
            # Populate final orthologs
            if "4" in stats or stats == "all":
                vals[3].append(len(g_obj.filtered_groups))

        # Filter valid data sets
        lgd_list = [x for x in lgd if vals[lgd.index(x)]]
        vals = [l for l in vals if l]

        # Create plot
        b_plt, lgd = multi_bar_plot(vals, x_labels, lgd_list=lgd_list)
        b_plt.savefig(os.path.join(dest, output_file_name),
                      bbox_extra_artists=(lgd,), bbox_inches="tight")

        # Create table list object
        table_list = []
        # Create header
        table_list.append([""] + x_labels)
        # Create content
        for i in range(len(vals)):
            table_list += [x for x in [[lgd_list[i]] + vals[i]]]

        return b_plt, lgd, table_list

[docs]    def group_overlap(self):
        """
        This will find the overlap of orthologs between two group files.
        THIS METHOD IS TEMPORARY AND EXPERIMENTAL
        """

        def parse_groups(group_obj):
            """
            Returns a list with the sorted ortholog clusters
            """

            storage = []

            for cluster in group_obj.groups:
                storage.append(set(cluster.iter_sequences))

            return storage

        if len(self.multiple_groups) != 2:
            raise SystemExit("This method can only be used with two group "
                             "files")

        group1 = self.multiple_groups[0]
        group2 = self.multiple_groups[1]

        group1_list = parse_groups(group1)
        group2_list = parse_groups(group2)

        counter = 0
        for i in group1_list:
            if i in group2_list:
                counter += 1

        print(counter)


[docs]class MultiGroupsLight(object):
    """
    Creates an object composed of multiple Group objects like MultiGroups.
    However, instead of storing the groups in memory, these are shelved in
    the disk
    """

    # The report calls available
    calls = ['bar_genecopy_distribution',
             'bar_species_distribution',
             'bar_species_coverage',
             'bar_genecopy_per_species']

    def __init__(self, db_path, groups=None, gene_threshold=None,
                 species_threshold=None, project_prefix="MyGroups",
                 ns=None):
        """
        :param groups: A list containing the file names of the multiple
        group files
        :return: Populates the self.multiple_groups attribute
        """

        self.db_path = db_path

        # If a MultiGroups is initialized with duplicate Group objects, their
        # names will be stored in a list. If all Group objects are unique, the
        # list will remain empty
        self.duplicate_groups = []

        self.groups = {}

        self.groups_stats = {}

        # Attribute that will store the paths of badly formated group files
        self.bad_groups = []

        # Initializing thresholds. These may be set from the start, or using
        # some method that uses them as arguments
        self.gene_threshold = gene_threshold
        self.species_threshold = species_threshold

        # Initializing mapping of group filters to their names. Should be
        # something like {"groupA": (1, 10)}
        self.filters = {}

        self.taxa_list = {}
        self.excluded_taxa = {}

        # This attribute will contain a dictionary with the maximum extra copies
        # for each group object
        self.max_extra_copy = {}
        # This attribute will contain a list with the number of species for
        # each group object, excluding replicates. If a MultiGroupLight object
        # contains Group objects with different taxa numbers, this attribute
        # can be used to issue a warning
        self.species_number = []

        self.prefix = project_prefix

        if ns:
            ns.files = len(groups)

        if groups:
            for group_file in groups:
                # If group_file is already a Group object, just add it
                if not isinstance(group_file, GroupLight):
                    try:
                        if ns:
                            if ns.stop:
                                raise KillByUser("")
                            ns.counter += 1
                        group_object = GroupLight(group_file,
                                                  self.gene_threshold,
                                                  self.species_threshold,
                                                  ns=ns)
                    except:
                        self.bad_groups.append(group_file)
                        continue
                else:
                    group_object = group_file

                # Check for duplicate group files
                if group_object.name in self.groups:
                    self.duplicate_groups.append(group_file.name)
                else:
                    self.add_group(group_object)

    def __iter__(self):
        for k, val in self.groups.items():
            yield k, pickle.load(open(val, "rb"))

[docs]    def clear_groups(self):
        """
        Clears the current MultiGroupsLight object
        """

        for f in self.groups.values():
            os.remove(f)

        self.duplicate_groups = []
        self.groups = {}
        self.groups_stats = {}
        self.filters = {}
        self.max_extra_copy = {}
        self.species_number = []
        self.gene_threshold = self.species_threshold = 0

[docs]    def add_group(self, group_obj):
        """
        Adds a group object
        :param group_obj: Group object
        """

        # Check for duplicate groups
        if group_obj.name not in self.groups:
            gpath = os.path.join(self.db_path,
                    "".join(random.choice(string.ascii_uppercase) for _ in
                            range(15)))
            pickle.dump(group_obj, open(gpath, "wb"))
            self.groups[group_obj.name] = gpath
            self.filters[group_obj.name] = (1, len(group_obj.species_list), [])
            self.max_extra_copy[group_obj.name] = group_obj.max_extra_copy
            if len(group_obj.species_list) not in self.species_number:
                self.species_number.append(len(group_obj.species_list))
        else:
            self.duplicate_groups.append(group_obj.name)

[docs]    def remove_group(self, group_id):
        """
        Removes a group object according to its name
        :param group_id: string, name matching a Group object name attribute
        """

        if group_id in self.groups:
            os.remove(self.groups[group_id])
            del self.groups[group_id]

[docs]    def get_group(self, group_id):
        """
        Returns a group object based on its name. If the name does not match
        any group object, returns None
        :param group_id: string. Name of group object
        """

        try:
            return pickle.load(open(self.groups[group_id], "rb"))
        except KeyError:
            return

[docs]    def add_multigroups(self, multigroup_obj):
        """
        Merges a MultiGroup object
        :param multigroup_obj: MultiGroup object
        """

        for _, group_obj in multigroup_obj:
            self.add_group(group_obj)

[docs]    def update_filters(self, gn_filter, sp_filter, excluded_taxa,
                       group_names=None, default=False):
        """
        This will not change the Group object themselves, only the filter
        mapping. The filter is only applied when the Group object is retrieved
        to reduce computations

        :param gn_filter: int, filter for max gene copies
        :param sp_filter: int, filter for min species
        :param group_names: list, with names of group objects
        """

        # There are no groups to update
        if group_names == []:
            return

        if group_names:
            glist = group_names
        else:
            glist = self.groups

        for group_name in glist:
            # Get group object
            group_obj = pickle.load(open(self.groups[group_name], "rb"))

            # Define excluded taxa
            group_obj.exclude_taxa(excluded_taxa, True)

            # Define filters
            gn_filter = gn_filter if not default else 1
            sp_filter = sp_filter if not default else \
                len(group_obj.species_list)

            # Correct maximum filter values after excluding taxa
            gn_filter = gn_filter if gn_filter <= group_obj.max_extra_copy \
                else group_obj.max_extra_copy
            sp_filter = sp_filter if sp_filter <= len(group_obj.species_list) \
                else len(group_obj.species_list)

            # Update Group object with new filters
            group_obj.update_filters(gn_filter, sp_filter)
            # Update group stats

            self.get_multigroup_statistics(group_obj)
            pickle.dump(group_obj, open(self.groups[group_name], "wb"))
            # Update filter map

            self.filters[group_name] = (gn_filter, group_obj.species_threshold)
            self.taxa_list[group_name] = group_obj.species_list
            self.excluded_taxa[group_name] = group_obj.excluded_taxa

[docs]    def get_multigroup_statistics(self, group_obj):
        """
        :return:
        """

        stats = group_obj.basic_group_statistics()

        self.groups_stats[group_obj.name] = {"stats": stats,
                                        "species": group_obj.species_list,
                                        "max_copies": group_obj.max_extra_copy}

[docs]    def bar_orthologs(self, group_names=None, output_file_name="Final_orthologs",
                             dest="./", stats="all"):
        """
        Creates a bar plot with the final ortholog values for each group file
        :param group_names: list. If None, all groups in self.group_stats will
        be used to generate the plot. Else, only the groups with the names in
        the list will be plotted.
        :param output_file_name: string. Name of output file
        :param dest: string. output directory
        :param stats: string. The statistics that should be used to generate
        the bar plot. Options are:
            ..: "1": Total orthologs
            ..: "2": Species compliant orthologs
            ..: "3": Gene compliant orthologs
            ..: "4": Final orthologs
            ..: "all": All of the above
            Multiple combinations can be provided, for instance: "123" will
            display bars for total, species compliant and gene compliant stats
        """

        # Stores the x-axis labels
        x_labels = []
        # Stores final ortholog values for all 4 possible data sets
        vals = [[], [], [], []]
        lgd = ["Total orthologs", "After species filter", "After gene filter",
               "Final orthologs"]

        # Determine which groups will be plotted
        if group_names:
            groups_lst = group_names
        else:
            groups_lst = self.groups_stats.keys()

        for gname in groups_lst:

            gstats = self.groups_stats[gname]

            x_labels.append(gname.split(os.sep)[-1])
            # Populate total orthologs
            if "1" in stats or stats == "all":
                vals[0].append(gstats["stats"][0])
            # Populate species compliant orthologs
            if "2" in stats or stats == "all":
                vals[1].append(gstats["stats"][3])
            # Populate gene compliant orthologs
            if "3" in stats or stats == "all":
                vals[2].append(gstats["stats"][2])
            # Populate final orthologs
            if "4" in stats or stats == "all":
                vals[3].append(gstats["stats"][4])

        # Filter valid data sets
        lgd_list = [x for x in lgd if vals[lgd.index(x)]]
        vals = [l for l in vals if l]

        # Create plot
        b_plt, lgd = multi_bar_plot(vals, x_labels, lgd_list=lgd_list)
        b_plt.savefig(os.path.join(dest, output_file_name),
                      bbox_extra_artists=(lgd,), bbox_inches="tight", dpi=200)

        # Create table list object
        table_list = []
        # Create header
        table_list.append([""] + x_labels)
        # Create content
        for i in range(len(vals)):
            table_list += [x for x in [[lgd_list[i]] + vals[i]]]

        return b_plt, lgd, table_list


__author__ = "Diogo N. Silva"