Skip to content

Commit

Permalink
dup_annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
albamgarces committed Dec 14, 2020
1 parent cf2a9d8 commit 303d663
Show file tree
Hide file tree
Showing 12 changed files with 148 additions and 51 deletions.
Binary file modified ejercicios/__pycache__/blast_caller.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file added ejercicios/__pycache__/output.cpython-38.pyc
Binary file not shown.
Binary file added ejercicios/__pycache__/protein_gbf.cpython-38.pyc
Binary file not shown.
Binary file added ejercicios/__pycache__/protein_gff.cpython-38.pyc
Binary file not shown.
4 changes: 1 addition & 3 deletions ejercicios/blast_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def create_blast_results(arg_dict):
makeblastdb_exe = arg_dict["blast_folder"] + "/makeblastdb"
blastp_exe = arg_dict["blast_folder"] + "/blastp"
##-> blast_folder = /usr/bin
#name_file = "name"

if (arg_dict["db_name"]):
db_path_name = os.path.abspath(arg_dict["db_name"])+ "/" + arg_dict["db_name"]
output_file = os.path.abspath(arg_dict["db_name"]) + "/BLAST_raw_results.txt"
Expand Down Expand Up @@ -135,8 +135,6 @@ def system_call(cmd, returned=False, message=True):
parser.add_argument("-d", "--db_name", metavar="", help="New database name")
parser.add_argument("-f", "--fasta_file", metavar="", help="Protein sequences FASTA file")
parser.add_argument("-b", "--blast_folder", metavar="", help="BLAST binary folder")
# parser.add_argument("-ev", "--evalue", type=float, metavar="", help="BLAST e-value: number of expected hits of similar quality (score) that could be found just by chance.")
# parser.add_argument("-b", "--bitscore", type=int, metavar="", help="BLAST bit-score: requires size of a sequence database in which the current match could be found just by chance.")
parser.add_argument("-o", "--out_folder", metavar= "", help="Results folder")
parser.add_argument("--debug", action="store_true", default=False)

Expand Down
135 changes: 135 additions & 0 deletions ejercicios/dup_annot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
'''
Created on 13 dic. 2020
@author: alba
'''

import os
import argparse
from argparse import ArgumentParser
import input_parser

#import blast_caller
import dup_searcher

#############
### DONE! ###
#############
# 1. provide a GFF+FASTA/GBF file
#
# 2. get sequence protein FASTA file ############
# ############## input_parser --> protein_gbf or protein_gff
# get annotation csv file from GFF/GBF file ##
#
# 3. convert protein FASTA file into a BLAST database ##
# 4. get duplicated proteins BLAST .txt file ############ dup_searcher --> blast_caller
# 5. get filtered results into a .csv file #############
###############
#### TO DO ####
###############
# 6. get results annotation info from annotation csv file ######
# ######################## dup_annot -> dup_searcher -> input_parser
# 7. classify results by duplicated number #####################
###################################################################################

# def get_seq:
# # get seq proteins FASTA file
# # get annot csv file
# input_parser.input_parser(arg_dict)
#
# def get_dup:


def get_dupannot(arg_dict):
    '''
    Drive the duplicated-protein annotation pipeline from parsed options.

    When ``annot_file`` is given, ``input_parser.input_parser`` is called to
    extract the protein FASTA and the annotation table, ``fasta_file`` and
    ``annot_table`` are pointed at those outputs, and
    ``dup_searcher.filter_data`` is called to filter the BLAST results.
    When ``fasta_file`` or ``text_file`` is set, the annotation table is
    validated (it must be provided and must be a ``.csv`` file).

    :param arg_dict: options dictionary, normally ``vars(parser.parse_args())``.
        Keys read here: ``annot_file``, ``out_folder``, ``db_name``,
        ``fasta_file``, ``text_file`` and ``annot_table``; a missing key is
        treated as "not provided".  ``fasta_file`` and ``annot_table`` are
        overwritten in place when ``annot_file`` is given, and the whole
        dictionary is forwarded to the helper modules.
    :returns: None.  Problems with the annotation table are reported on
        stdout rather than raised.
    '''
    # NOTE(review): the nesting below was reconstructed from a copy of this
    # file whose indentation had been lost -- confirm against the repository
    # that everything up to ``sort_csv`` belongs to the ``annot_file`` branch.

    out_folder = arg_dict.get("out_folder")
    # --out_folder is optional and os.path.abspath(None) raises TypeError,
    # so fall back to the current working directory when it is not given.
    # NOTE(review): assumes input_parser also writes to the current directory
    # in that case -- verify.
    output_path = os.path.abspath(out_folder) if out_folder else os.getcwd()

    if arg_dict.get("annot_file"):
        # get seq proteins FASTA file
        # get annot csv file
        input_parser.input_parser(arg_dict)

        # Point the next stage at the files the parsers are expected to have
        # written into the output folder.
        arg_dict["fasta_file"] = "%s/proteins.fa" % output_path
        arg_dict["annot_table"] = "%s/df.csv" % output_path

        # get filtered results file
        dup_searcher.filter_data(arg_dict)

        # Location of the filtered results; a database folder takes
        # precedence over the output folder.  Not consumed yet: steps 6-7
        # of the pipeline (annotating and classifying) are still to do.
        if arg_dict.get("db_name"):
            output_path = os.path.abspath(arg_dict["db_name"])
            sort_csv = "%s/filtered_results.csv" % output_path
        elif out_folder:
            sort_csv = "%s/filtered_results.csv" % output_path
        else:
            sort_csv = "filtered_results.csv"

    if arg_dict.get("fasta_file") or arg_dict.get("text_file"):
        annot_table = arg_dict.get("annot_table")
        if annot_table is None:
            print("#####")
            print("Please provide an annotation table")
            print("#####")
        else:
            extension = os.path.splitext(os.path.abspath(annot_table))[1]
            if extension != ".csv":
                print("#####")
                print("Please provide a .csv file")
                print("#####")














# Command-line interface for the duplicated-protein annotation pipeline.
# NOTE(review): the arguments are parsed at module level, so importing this
# module also parses sys.argv -- consider moving this section under the
# ``__main__`` guard once no importer relies on ``arg`` / ``arg_dict``.
parser = ArgumentParser(prog='dupAnnotation',
                        formatter_class=argparse.RawDescriptionHelpFormatter,
                        description="Get an annotation file with duplicated protein on genome")
# Options forwarded to input_parser
parser.add_argument("-a", "--annot_file", metavar="", help="Annotation file: genbank or GFF")
parser.add_argument("-r", "--ref_file", metavar="", help="Genome references FASTA file")
# Options forwarded to blast_caller
parser.add_argument("-d", "--db_name", metavar="", help="New database name")
parser.add_argument("-f", "--fasta_file", metavar="", help="Protein sequences FASTA file")
parser.add_argument("-b", "--blast_folder", metavar="", help="BLAST binary folder")
# Options forwarded to dup_searcher (filtering thresholds)
parser.add_argument("-t", "--text_file", metavar="", help="Blast raw results text file")
parser.add_argument("-e", "--evalue", type=float, metavar="", default= 1e-05, help="BLAST e-value: number of expected hits of similar quality (score) that could be found just by chance.")
parser.add_argument("-bs", "--bitscore", type=float, metavar="", default=50, help="BLAST bit-score: requires size of a sequence database in which the current match could be found just by chance.")
parser.add_argument("-p", "--percentage", type=float, metavar="", default=80, help="Percentage of alignment in query")
# Annotation table used by this script
parser.add_argument("-c", "--annot_table", metavar="", help="Genome annotation .csv file")
# General options
parser.add_argument("-o", "--out_folder", metavar= "", help="Results folder")
parser.add_argument("--debug", action="store_true", default=False)

arg = parser.parse_args()
arg_dict = vars(arg)

# At least one input (annotation file, protein FASTA or raw BLAST results)
# is required; otherwise show the usage and stop.
if arg.annot_file is None and arg.fasta_file is None and arg.text_file is None:
    print("######")
    # print_help() writes the help text itself and returns None, so it must
    # not be wrapped in print() (that printed a stray "None" line).
    parser.print_help()
    print("######")
    # Exit with status 0 without relying on the site-provided exit() builtin.
    parser.exit()
if arg.debug:
    print("##DEBUG: ##")
    print("arguments dictionary: ")
    print(arg)

if __name__ == '__main__':
    get_dupannot(arg_dict)


11 changes: 6 additions & 5 deletions ejercicios/dup_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,15 @@ def filter_data(arg_dict):
exit()
else:
print("#####")
print("Please provide either a a BLAST results .txt file OR an annotation FASTA file")
print("Please provide either a BLAST results .txt file OR an annotation FASTA file")
print("#####")

elif arg_dict["fasta_file"]:
blast_caller.create_blast_results(arg_dict)
file_name_abs_path = os.path.abspath(arg_dict["fasta_file"])
name_file, extension = os.path.splitext(file_name_abs_path)
basename= os.path.basename(name_file)

if (arg_dict["db_name"]):
output_path= os.path.abspath(arg_dict["db_name"])
raw_blast = pd.read_csv(output_path + "/BLAST_raw_results.txt",
Expand All @@ -106,12 +107,12 @@ def filter_data(arg_dict):

#when Blast_file_results is done, evalue>10 is gone
filter_evalue = raw_blast["evalue"] <= arg_dict["evalue"]
filter_bitscore = raw_blast["bitscore"] >= arg_dict["evalue"]
filter_bitscore = raw_blast["bitscore"] >= arg_dict["bitscore"]
filter_alnpct = raw_blast["aln_pct"] >= arg_dict["percentage"]
filter_id = raw_blast["qseqid"] != raw_blast["sseqid"]
filtered_results = raw_blast[filter_evalue & filter_bitscore & filter_alnpct & filter_id]

#sort by pident (desc), evalue(asc), bitscore(desc)
#sort by aln_pct (desc), evalue(asc), bitscore(desc)
by_alnpct = filtered_results.sort_values(["aln_pct", "evalue", "bitscore"],
ascending=[False, True, False])

Expand All @@ -135,7 +136,7 @@ def filter_data(arg_dict):
parser.add_argument("-t", "--text_file", metavar="", help="Blast raw results text file")
parser.add_argument("-e", "--evalue", type=float, metavar="", default= 1e-05, help="BLAST e-value: number of expected hits of similar quality (score) that could be found just by chance.")
parser.add_argument("-bs", "--bitscore", type=float, metavar="", default=50, help="BLAST bit-score: requires size of a sequence database in which the current match could be found just by chance.")
parser.add_argument("-p", "--percentage", type=float, metavar="", default=80, help="Percentage of alignement in query")
parser.add_argument("-p", "--percentage", type=float, metavar="", default=80, help="Percentage of alignment in query")
parser.add_argument("-o", "--out_folder", metavar= "", help="Results folder")
parser.add_argument("-d", "--db_name", metavar="", help="New database name")
parser.add_argument("-f", "--fasta_file", metavar="", help="Protein sequences FASTA file")
Expand All @@ -161,7 +162,7 @@ def filter_data(arg_dict):

if arg.percentage is None:
print("#####")
print("Note alignement in query percentage = 80% is set by default")
print("Note alignment in query percentage = 80% is set by default")
print("#####")
print(parser.print_help())

Expand Down
6 changes: 3 additions & 3 deletions ejercicios/input_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#############

#####
def extension(arg_dict):
def input_parser(arg_dict):

compt = {}
compt["fasta"] = [".fa", ".faa", ".mpfa", ".fna", ".fsa", ".fas", ".fasta"]
Expand Down Expand Up @@ -101,7 +101,7 @@ def extension(arg_dict):
parser = ArgumentParser(prog='inputParser',
formatter_class=argparse.RawDescriptionHelpFormatter,
description="Get proteins sequences from annotation file")
parser.add_argument("-a", "--annot_file", metavar="", help="Annotation file")
parser.add_argument("-a", "--annot_file", metavar="", help="Annotation file: genbank or GFF")
parser.add_argument("-r", "--ref_file", metavar="", help="Genome references FASTA file")
#parser.add_argument("-p", "--prot_file", metavar="", help="Protein sequence file")
parser.add_argument("-o", "--out_folder", metavar= "", help="Results folder")
Expand All @@ -128,5 +128,5 @@ def extension(arg_dict):

## This runs only when the script is executed directly, not when it is imported as a module
if __name__ == '__main__':
extension(arg_dict)
input_parser(arg_dict)

2 changes: 1 addition & 1 deletion ejercicios/protein_gbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def main (gbf_file, output_folder, debug=False):
#####
def gbf_parser_caller(gbf_file, output_path, debug):

out_file = "%s/gbf_proteins.fa" % output_path
out_file = "%s/proteins.fa" % output_path

with open(out_file, "w") as output_handle:
SeqIO.write(gbf_parser(gbf_file, output_path, debug=debug), output_handle, "fasta")
Expand Down
4 changes: 2 additions & 2 deletions ejercicios/protein_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def main (gff_file, ref_file, output_folder, debug=False):
#####
def gff_parser_caller(gff_file, ref_file, output_path, debug):

out_file = "%s/gff_proteins.fa" % output_path
out_file = "%s/proteins.fa" % output_path
with open (ref_file) as in_handle:
ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))

Expand Down Expand Up @@ -133,7 +133,7 @@ def protein_recs(gff_file, ref_recs, output_path, debug=False):

yield(SeqRecord(protein_seq, feature.qualifiers["ID"][0], "", ""))

csv_file = "%s/gff_df.csv" % output_path
csv_file = "%s/df.csv" % output_path
annot_df.to_csv(csv_file, header=True)

if (debug):
Expand Down
37 changes: 0 additions & 37 deletions ejercicios/pruebas_blast.py

This file was deleted.

0 comments on commit 303d663

Please sign in to comment.