dup_annotation

albamgarces · Dec 14, 2020 · 303d663 · 303d663
1 parent cf2a9d8
commit 303d663
Show file tree

Hide file tree

Showing 12 changed files with 148 additions and 51 deletions.
diff --git a/ejercicios/__pycache__/blast_caller.cpython-38.pyc b/ejercicios/__pycache__/blast_caller.cpython-38.pyc
diff --git a/ejercicios/__pycache__/input_parser.cpython-38.pyc b/ejercicios/__pycache__/input_parser.cpython-38.pyc
diff --git a/ejercicios/__pycache__/output.cpython-38.pyc b/ejercicios/__pycache__/output.cpython-38.pyc
diff --git a/ejercicios/__pycache__/protein_gbf.cpython-38.pyc b/ejercicios/__pycache__/protein_gbf.cpython-38.pyc
diff --git a/ejercicios/__pycache__/protein_gff.cpython-38.pyc b/ejercicios/__pycache__/protein_gff.cpython-38.pyc
diff --git a/ejercicios/blast_caller.py b/ejercicios/blast_caller.py
@@ -70,7 +70,7 @@ def create_blast_results(arg_dict):
         makeblastdb_exe = arg_dict["blast_folder"] + "/makeblastdb"
         blastp_exe = arg_dict["blast_folder"] + "/blastp"
         ##-> blast_folder = /usr/bin
-        #name_file = "name"
+
         if (arg_dict["db_name"]):
             db_path_name = os.path.abspath(arg_dict["db_name"])+ "/" + arg_dict["db_name"]
             output_file = os.path.abspath(arg_dict["db_name"]) + "/BLAST_raw_results.txt"
@@ -135,8 +135,6 @@ def system_call(cmd, returned=False, message=True):
 parser.add_argument("-d", "--db_name", metavar="", help="New database name")
 parser.add_argument("-f", "--fasta_file", metavar="", help="Protein sequences FASTA file")
 parser.add_argument("-b", "--blast_folder", metavar="", help="BLAST binary folder")
-# parser.add_argument("-ev", "--evalue", type=float, metavar="", help="BLAST e-value: number of expected hits of similar quality (score) that could be found just by chance.")
-# parser.add_argument("-b", "--bitscore", type=int, metavar="", help="BLAST bit-score: requires size of a sequence database in which the current match could be found just by chance.")
 parser.add_argument("-o", "--out_folder", metavar= "", help="Results folder")
 parser.add_argument("--debug", action="store_true", default=False)   
 

diff --git a/ejercicios/dup_annot.py b/ejercicios/dup_annot.py
@@ -0,0 +1,135 @@
+'''
+Created on 13 dic. 2020
+
+@author: alba
+'''
+
+import os
+import argparse
+from argparse import ArgumentParser
+import input_parser
+
+#import blast_caller
+import dup_searcher
+
+#############
+### DONE! ###
+#############
+# 1. provide a GFF+FASTA/GBF file
+#
+# 2. get sequence protein FASTA file ############
+#                                    ############## input_parser --> protein_gbf or protein_gff
+#    get annotation csv file from GFF/GBF file ##
+#
+# 3. convert protein FASTA file into a BLAST database ##
+# 4. get duplicated proteins BLAST .txt file ############# dup_searcher --> blast_caller
+# 5. get filtered results into a .csv file #############
+###############
+#### TO DO ####
+###############
+# 6. get results annotation info from annotation csv file ######
+#                                          ######################## dup_annot -> dup_searcher -> input_parser  
+# 7. classify results by duplicated number #####################
+###################################################################################
+
+# def get_seq:
+#     # get seq proteins FASTA file
+#     # get annot csv file
+#     input_parser.input_parser(arg_dict)
+#     
+# def get_dup:
+
+
+def get_dupannot(arg_dict):
+    """Run the duplicate search and validate the annotation table argument.
+
+    If an annotation file is given, ``input_parser.input_parser`` is called
+    and ``fasta_file`` / ``annot_table`` in ``arg_dict`` are pointed at
+    ``proteins.fa`` / ``df.csv`` in the output folder. Afterwards
+    ``dup_searcher.filter_data`` is called and ``annot_table`` is checked.
+
+    :param arg_dict: parsed command-line options (``vars`` of the argparse
+        namespace); modified in place when ``annot_file`` is set.
+    :returns: None. Problems with the annotation table are only printed.
+    """
+    # -o is optional: os.path.abspath(None) raises TypeError, so only resolve
+    # the folder when one was actually given.
+    out_folder = arg_dict["out_folder"]
+    output_path = os.path.abspath(out_folder) if out_folder else None
+
+    if arg_dict["annot_file"]:
+        # get seq proteins FASTA file and annot csv file
+        input_parser.input_parser(arg_dict)
+
+        # NOTE(review): without -o this falls back to the current directory;
+        # confirm input_parser writes its outputs there in that case.
+        parsed_dir = output_path or os.getcwd()
+        arg_dict["fasta_file"] = "%s/proteins.fa" % parsed_dir
+        arg_dict["annot_table"] = "%s/df.csv" % parsed_dir
+
+    # get filtered results file
+    dup_searcher.filter_data(arg_dict)
+
+    # Presumably where filter_data left its output; not consumed yet in this
+    # function (kept for the pending annotation steps).
+    if arg_dict["db_name"]:
+        sort_csv = "%s/filtered_results.csv" % os.path.abspath(arg_dict["db_name"])
+    elif output_path:
+        sort_csv = "%s/filtered_results.csv" % output_path
+    else:
+        sort_csv = "filtered_results.csv"
+
+    # The annotation table is only checked when a FASTA or BLAST text input exists.
+    if not (arg_dict["fasta_file"] or arg_dict["text_file"]):
+        return
+
+    if arg_dict["annot_table"] is None:
+        print("#####")
+        print("Please provide an annotation table")
+        print("#####")
+        return
+
+    extension = os.path.splitext(os.path.abspath(arg_dict["annot_table"]))[1]
+    if extension != ".csv":
+        print("#####")
+        print("Please provide a .csv file")
+        print("#####")
+
+
+parser = ArgumentParser(prog='dupAnnotation',
+                        formatter_class=argparse.RawDescriptionHelpFormatter,
+                        description="Get an annotation file with duplicated protein on genome")
+parser.add_argument("-a", "--annot_file", metavar="", help="Annotation file: genbank or GFF")
+parser.add_argument("-r", "--ref_file", metavar="", help="Genome references FASTA file")
+# BLAST database options
+parser.add_argument("-d", "--db_name", metavar="", help="New database name")
+parser.add_argument("-f", "--fasta_file", metavar="", help="Protein sequences FASTA file")
+parser.add_argument("-b", "--blast_folder", metavar="", help="BLAST binary folder")
+# BLAST raw results input and filtering thresholds
+parser.add_argument("-t", "--text_file", metavar="", help="Blast raw results text file")
+parser.add_argument("-e", "--evalue", type=float, metavar="", default=1e-05, help="BLAST e-value: number of expected hits of similar quality (score) that could be found just by chance.")
+parser.add_argument("-bs", "--bitscore", type=float, metavar="", default=50, help="BLAST bit-score: requires size of a sequence database in which the current match could be found just by chance.")
+parser.add_argument("-p", "--percentage", type=float, metavar="", default=80, help="Percentage of alignment in query")
+# annotation table
+parser.add_argument("-c", "--annot_table", metavar="", help="Genome annotation .csv file")
+# output folder and debugging
+parser.add_argument("-o", "--out_folder", metavar="", help="Results folder")
+parser.add_argument("--debug", action="store_true", default=False)
+# NOTE: arguments are parsed at module level, so importing this file parses sys.argv too.
+arg = parser.parse_args()
+arg_dict = vars(arg)
+# At least one input (annotation file, protein FASTA or BLAST text file) is required.
+if arg.annot_file is None and arg.fasta_file is None and arg.text_file is None:
+    print("######")
+    parser.print_help()  # print_help() writes the help itself and returns None
+    print("######")
+    exit()
+if arg.debug:
+    print("##DEBUG: ##")
+    print("arguments dictionary: ")
+    print(arg)
+
+if __name__ == '__main__':
+    get_dupannot(arg_dict)
+
+
diff --git a/ejercicios/dup_searcher.py b/ejercicios/dup_searcher.py
@@ -73,14 +73,15 @@ def filter_data(arg_dict):
                 exit()
         else:
             print("#####")
-            print("Please provide either a a BLAST results .txt file OR an annotation FASTA file")
+            print("Please provide either a BLAST results .txt file OR an annotation FASTA file")
             print("#####")
 
     elif arg_dict["fasta_file"]:
         blast_caller.create_blast_results(arg_dict)
         file_name_abs_path = os.path.abspath(arg_dict["fasta_file"])
         name_file, extension = os.path.splitext(file_name_abs_path)
         basename= os.path.basename(name_file)
+
         if (arg_dict["db_name"]): 
             output_path= os.path.abspath(arg_dict["db_name"])
             raw_blast = pd.read_csv(output_path + "/BLAST_raw_results.txt",
@@ -106,12 +107,12 @@ def filter_data(arg_dict):
 
     #when Blast_file_results is done, evalue>10 is gone
     filter_evalue = raw_blast["evalue"] <= arg_dict["evalue"]
-    filter_bitscore = raw_blast["bitscore"] >= arg_dict["evalue"]
+    filter_bitscore = raw_blast["bitscore"] >= arg_dict["bitscore"]
     filter_alnpct = raw_blast["aln_pct"] >= arg_dict["percentage"]
     filter_id = raw_blast["qseqid"] != raw_blast["sseqid"]
     filtered_results = raw_blast[filter_evalue & filter_bitscore & filter_alnpct & filter_id]
 
-    #sort by pident (desc), evalue(asc), bitscore(desc)
+    #sort by aln_pct (desc), evalue(asc), bitscore(desc)
     by_alnpct = filtered_results.sort_values(["aln_pct", "evalue", "bitscore"],
                                        ascending=[False, True, False])
 
@@ -135,7 +136,7 @@ def filter_data(arg_dict):
 parser.add_argument("-t", "--text_file", metavar="", help="Blast raw results text file")
 parser.add_argument("-e", "--evalue", type=float, metavar="", default= 1e-05, help="BLAST e-value: number of expected hits of similar quality (score) that could be found just by chance.")
 parser.add_argument("-bs", "--bitscore", type=float, metavar="", default=50, help="BLAST bit-score: requires size of a sequence database in which the current match could be found just by chance.")
-parser.add_argument("-p", "--percentage", type=float, metavar="", default=80, help="Percentage of alignement in query")
+parser.add_argument("-p", "--percentage", type=float, metavar="", default=80, help="Percentage of alignment in query")
 parser.add_argument("-o", "--out_folder", metavar= "", help="Results folder")
 parser.add_argument("-d", "--db_name", metavar="", help="New database name")
 parser.add_argument("-f", "--fasta_file", metavar="", help="Protein sequences FASTA file")
@@ -161,7 +162,7 @@ def filter_data(arg_dict):
 
 if arg.percentage is None:
     print("#####")
-    print("Note alignement in query percentage = 80% is set by default")
+    print("Note alignment in query percentage = 80% is set by default")
     print("#####")
     print(parser.print_help())
 

diff --git a/ejercicios/input_parser.py b/ejercicios/input_parser.py
@@ -30,7 +30,7 @@
 #############
 
 #####       
-def extension(arg_dict):
+def input_parser(arg_dict):
 
     compt = {}
     compt["fasta"] = [".fa", ".faa", ".mpfa", ".fna", ".fsa", ".fas", ".fasta"]
@@ -101,7 +101,7 @@ def extension(arg_dict):
 parser = ArgumentParser(prog='inputParser',
                         formatter_class=argparse.RawDescriptionHelpFormatter,
                         description="Get proteins sequences from annotation file")
-parser.add_argument("-a", "--annot_file", metavar="", help="Annotation file")
+parser.add_argument("-a", "--annot_file", metavar="", help="Annotation file: genbank or GFF")
 parser.add_argument("-r", "--ref_file", metavar="", help="Genome references FASTA file")
 #parser.add_argument("-p", "--prot_file", metavar="", help="Protein sequence file")
 parser.add_argument("-o", "--out_folder", metavar= "", help="Results folder")
@@ -128,5 +128,5 @@ def extension(arg_dict):
 
 ## Cuando no se importe como un modulo este script, se ejecutara esto
 if __name__ == '__main__':
-    extension(arg_dict)
+    input_parser(arg_dict)
 
diff --git a/ejercicios/protein_gbf.py b/ejercicios/protein_gbf.py
@@ -37,7 +37,7 @@ def main (gbf_file, output_folder, debug=False):
 #####     
 def gbf_parser_caller(gbf_file, output_path, debug):
 
-    out_file = "%s/gbf_proteins.fa" % output_path
+    out_file = "%s/proteins.fa" % output_path
 
     with open(out_file, "w") as output_handle:
         SeqIO.write(gbf_parser(gbf_file, output_path, debug=debug), output_handle, "fasta")

diff --git a/ejercicios/protein_gff.py b/ejercicios/protein_gff.py
@@ -37,7 +37,7 @@ def main (gff_file, ref_file, output_folder, debug=False):
 #####
 def gff_parser_caller(gff_file, ref_file, output_path, debug):
 
-    out_file = "%s/gff_proteins.fa" % output_path
+    out_file = "%s/proteins.fa" % output_path
     with open (ref_file) as in_handle:
         ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))
 
@@ -133,7 +133,7 @@ def protein_recs(gff_file, ref_recs, output_path, debug=False):
 
                     yield(SeqRecord(protein_seq, feature.qualifiers["ID"][0], "", ""))
 
-    csv_file = "%s/gff_df.csv" % output_path            
+    csv_file = "%s/df.csv" % output_path            
     annot_df.to_csv(csv_file, header=True)
 
     if (debug):

diff --git a/ejercicios/pruebas_blast.py b/ejercicios/pruebas_blast.py