AnnotateClusters.py

#!/usr/local/bin/python
#Created on 10/2/15

__author__ = 'Juan A. Ugalde'


def annotate_cluster(protein_list, feature_type, annotation):

    from collections import defaultdict

    th = None  # Top hits
    tc = None  # Total conflicts
    uc = None  # Unresolved conflicts

    summary_annotation = defaultdict(int)

    for protein_id in protein_list:
        #First check that the protein is in the annotation
        if not protein_id in annotation:
            continue
        else:
            #Check that it has COG annotation
            if feature_type in annotation[protein_id].keys():
                protein_info = annotation[protein_id][feature_type]
                summary_annotation[protein_info] += 1

    if len(summary_annotation) == 0:
        pass

    elif len(summary_annotation) == 1:
        th = summary_annotation.keys()[0]

    else:
        tc = summary_annotation

        #Get the total number of values
        total_annotations = sum(summary_annotation.itervalues())

        for hit in summary_annotation:
            if summary_annotation[hit] / float(total_annotations) >= float(0.5):
                th = hit

        #If no decision was made
        if th is None:
            uc = summary_annotation

    return th, tc, uc

if __name__ == '__main__':
    import os
    import argparse
    from collections import defaultdict
    from tools.data_input import AnnotationData, GenomeData
    from tools.AnnotationInfo import cog_definitions

    program_description = "This script annotates a cluster generated by the Summarize OrthoMCLResults script"

    parser = argparse.ArgumentParser(description=program_description)

    #Arguments
    parser.add_argument("-l", "--genome_list_index", type=str,
                        help="File with the genome list. Format GenomeID, FullName, ShortName", required=True)
    parser.add_argument("-a", "--annotation_folder", type=str,
                        help="Folder with the annotation files from JGI", required=True)
    parser.add_argument("-c", "--cluster_file", type=str,
                        help="Cluster file", required=True)
    parser.add_argument("-o", "--output_directory", type=str,
                        help="Output folder", required=True)

    args = parser.parse_args()

    #Create the output directory
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)

    #####Read the genome list
    genome_id_dictionary, genome_count = GenomeData.read_genome_list(args.genome_list_index)

    ###Read the annotation information
    protein_annotation, function_definitions = \
        AnnotationData.parse_annotation_folder(genome_id_dictionary.keys(), args.annotation_folder)

    ##Read the cluster information
    total_clusters = AnnotationData.get_cluster_information(args.cluster_file)

    ##Print log file
    logfile = open(args.output_directory + "/logfile.txt", 'w')

    ##Total number of clusters
    logfile.write("Total number of analyzed clusters: %d" % len(total_clusters) + "\n")

    features_to_annotate = ["COG", "KO", "PFAM", "Product"]

    #Get the COG definitions
    cog_one_letter, desc_cog_letter, desc_cog_number = cog_definitions()

    #Create the output files
    for feature in features_to_annotate:

        output_top_hit = open(args.output_directory + "/" + feature + "_clusters.txt", 'w')
        output_conflicts = open(args.output_directory + "/" + feature + "_conflicts.txt", 'w')

        for cluster in total_clusters:

            protein_id_list = [id_tag.split("|")[1] for id_tag in total_clusters[cluster].split(",")]

            function_description = None

            #Get the annotations
            top_hit, total_conflicts, unresolved_conflicts = \
                annotate_cluster(protein_id_list, feature, protein_annotation)

            #Print the output file
            if total_conflicts is None and unresolved_conflicts is None and top_hit is not None:

                if feature == "Product":
                    output_top_hit.write(cluster + "\t" + top_hit + "\n")

                else:
                    if top_hit in function_definitions:
                        function_description = function_definitions[top_hit]

                    if feature == "COG":
                        letter = cog_one_letter[top_hit]
                        try:
                            definition = desc_cog_letter[letter[0]]
                        except IndexError:
                            definition = None
                            print cluster, letter, top_hit

                        output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description)
                                             + "\t" + letter[0] + "\t" + definition + "\n")
                    else:
                        output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description) + "\n")

            elif total_conflicts is not None and top_hit is not None and unresolved_conflicts is None:
                if feature == "Product":
                    output_top_hit.write(cluster + "\t" + top_hit + "\t" +
                                         " ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")
                else:
                    if top_hit in function_definitions:
                        function_description = function_definitions[top_hit]

                    if feature == "COG":
                        letter = cog_one_letter[top_hit]
                        definition = desc_cog_letter[letter[0]]
                        output_top_hit.write(cluster + "\t" + top_hit + "\t" + function_description +
                                             "\t" + letter + "\t" + definition + "\t" +
                                             " ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")

                    else:
                        output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description) + "\t" +
                                             " ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")

            elif top_hit is None and total_conflicts is not None and unresolved_conflicts is not None:

                if feature == "Product":
                    output_conflicts.write(cluster + "\t" + " ; ".join(["%s=%s" % (k, v) for k, v in
                                                                        unresolved_conflicts.items()]) + "\n")

                else:
                    output_conflicts.write(cluster + "\t" + " ; ".join(["%s, %s=%s" % (k, function_definitions[k], v)
                                                                        if k in function_definitions else "%s, None=%s" % (k, v) for k, v in
                                                                        unresolved_conflicts.items()]) + "\n")

        output_top_hit.close()
        output_conflicts.close()