From 30913801b888e131fdb2b447f4f8143c2923d5b8 Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Mon, 14 Jun 2021 12:46:15 +0200 Subject: [PATCH] fix pseudogene retrieval --- code/BacDup/scripts/gbf_parser.py | 81 +++++++++++++++++++++++-------- code/BacDup/scripts/gff_parser.py | 16 ++++-- 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/code/BacDup/scripts/gbf_parser.py b/code/BacDup/scripts/gbf_parser.py index e25e9b0..fe1cb40 100644 --- a/code/BacDup/scripts/gbf_parser.py +++ b/code/BacDup/scripts/gbf_parser.py @@ -19,6 +19,7 @@ from Bio import SeqIO, Seq from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import FeatureLocation, ExactPosition from Bio.SeqIO.FastaIO import SimpleFastaParser from HCGB.functions.aesthetics_functions import debug_message @@ -47,11 +48,9 @@ def gbf_parser_caller(annot_file, output_path, debug): ################################################################################ def gbf_parser(gbf_file, list_out_files, debug=False): - #create an empty dataframe. - + ## create dataframe. ## get common column names - columns = columns_annot_table() - + columns = columns_annot_table() annot_df = pd.DataFrame(data=None, columns=columns) genome_length = pd.DataFrame(data=None, columns=["length"]) @@ -70,45 +69,85 @@ def gbf_parser(gbf_file, list_out_files, debug=False): #sort by CDS type. Duplicate genes analysis needs coding regions to proteins. if feature.type=="CDS": + genome_seq = rec.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end] if int(feature.strand) > 0: strand = "pos" else: strand = "neg" - + genome_seq = genome_seq.reverse_complement() + #we create an ID for each entry protID = feature.type + "_" + rec.id + "_" + str(feature.location.nofuzzy_start) + "_" + str(feature.location.nofuzzy_end) + "_" + strand - annot_df.loc[protID, ["rec_id", "start", "end", "strand"]] = [ID, feature.location.nofuzzy_start, feature.location.nofuzzy_end, strand] + annot_df.loc[protID, ["rec_id", "start", + "end", "strand"]] = [ID, feature.location.nofuzzy_start, + feature.location.nofuzzy_end, strand] qualif = feature.qualifiers - + pseudo=False + ## Debug messages if (debug): debug_message('protID: ' + protID, 'yellow') + debug_message('qualif: ', 'yellow') print (qualif) - + + debug_message('feature: ', 'yellow') + print(feature) + + debug_message('genome_seq: ', 'yellow') + print(genome_seq) + + pseudo_seq = "" + ## fill datafarme for keys, values in qualif.items(): - #fill the dataframe info if keys not in columns: continue + + ## Save keys into dataframe annot_df.loc[protID,[keys]] = [values[0]] + #################################### + ## Pseudogenes: + #################################### if keys=="pseudo": - pseudo = "True" - annot_df.loc[protID,["pseudo"]] = [pseudo] - - #we create a FASTA file with protein sequences - + pseudo = True + + ## set pseudo True/False + annot_df.loc[protID,["pseudo"]] = ["True"] + table_code = feature.qualifiers["transl_table"][0] + pseudo_seq = genome_seq.translate(table=table_code, to_stop=False) + ## Debug messages if (debug): - debug_message('feature: ', 'yellow') - print(feature) + print("***************************************") + debug_message('Pseudogene: ', 'yellow') + print("***************************************") + debug_message('feature.location.nofuzzy_start: ', 'yellow') + print(feature.location.nofuzzy_start) + debug_message('feature.location.nofuzzy_end: ', 'yellow') + print(feature.location.nofuzzy_end) + debug_message('Translation table code: ', 'yellow') + print(table_code) + debug_message('genome_seq: ', 'yellow') + print(genome_seq) + debug_message('pseudo_seq: ', 'yellow') + print(pseudo_seq) + - if keys=="translation": - #pseudogenes have no translation item - gene_seq = Seq.Seq(feature.qualifiers["translation"][0]) - else: - pass + ## create a sequence fasta entry + if (pseudo): + # Pseudogenes have no translation item + # set translated CDS even including * + if len(pseudo_seq)!=0: + gene_seq = pseudo_seq + else: + ## sometimes it might fail + gene_seq=Seq.Seq('***') + else: + ## CDS provided by genbank + gene_seq = Seq.Seq(feature.qualifiers["translation"][0]) + yield(SeqRecord(gene_seq, protID,"","")) ## print to file diff --git a/code/BacDup/scripts/gff_parser.py b/code/BacDup/scripts/gff_parser.py index de3f29b..c3592d7 100644 --- a/code/BacDup/scripts/gff_parser.py +++ b/code/BacDup/scripts/gff_parser.py @@ -48,7 +48,8 @@ def gff_parser_caller(gff_file, ref_file, output_path, debug): ## parse with open(prot_file, "w") as out_handle: - SeqIO.write(protein_recs(gff_file, ref_recs, list_out_files, debug=debug), out_handle, "fasta") + SeqIO.write(protein_recs(gff_file, ref_recs, + list_out_files, debug=debug), out_handle, "fasta") ## return information return (list_out_files) @@ -121,8 +122,17 @@ def protein_recs(gff_file, ref_recs, list_out_files, debug=False): if feature.strand == -1: gene_seq = gene_seq.reverse_complement() - #delete STOP symbols - protein_seq = gene_seq.translate(to_stop=True) + # translate genome sequence + table_code = feature.qualifiers["transl_table"][0] + protein_seq = gene_seq.translate(table=table_code, to_stop=False) + + # delete STOP symbols + # we set gene_seq.translate to include all stop codons to include + # stop codons in pseudogenes. then, we removed last symbol * for + # each sequence + + if protein_seq.endswith("*"): + protein_seq = protein_seq[:-1] yield(SeqRecord(protein_seq, protID, "", ""))