Skip to content

Commit

Permalink
fix pseudogene retrieval
Browse files Browse the repository at this point in the history
  • Loading branch information
JFsanchezherrero committed Jun 14, 2021
1 parent 08a59e2 commit 3091380
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 24 deletions.
81 changes: 60 additions & 21 deletions code/BacDup/scripts/gbf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from Bio import SeqIO, Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, ExactPosition
from Bio.SeqIO.FastaIO import SimpleFastaParser

from HCGB.functions.aesthetics_functions import debug_message
Expand Down Expand Up @@ -47,11 +48,9 @@ def gbf_parser_caller(annot_file, output_path, debug):
################################################################################
def gbf_parser(gbf_file, list_out_files, debug=False):

#create an empty dataframe.

## create dataframe.
## get common column names
columns = columns_annot_table()

columns = columns_annot_table()
annot_df = pd.DataFrame(data=None, columns=columns)
genome_length = pd.DataFrame(data=None, columns=["length"])

Expand All @@ -70,45 +69,85 @@ def gbf_parser(gbf_file, list_out_files, debug=False):

#sort by CDS type. Duplicate genes analysis needs coding regions to proteins.
if feature.type=="CDS":
genome_seq = rec.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end]
if int(feature.strand) > 0:
strand = "pos"
else:
strand = "neg"

genome_seq = genome_seq.reverse_complement()

#we create an ID for each entry
protID = feature.type + "_" + rec.id + "_" + str(feature.location.nofuzzy_start) + "_" + str(feature.location.nofuzzy_end) + "_" + strand
annot_df.loc[protID, ["rec_id", "start", "end", "strand"]] = [ID, feature.location.nofuzzy_start, feature.location.nofuzzy_end, strand]
annot_df.loc[protID, ["rec_id", "start",
"end", "strand"]] = [ID, feature.location.nofuzzy_start,
feature.location.nofuzzy_end, strand]
qualif = feature.qualifiers

pseudo=False

## Debug messages
if (debug):
debug_message('protID: ' + protID, 'yellow')

debug_message('qualif: ', 'yellow')
print (qualif)


debug_message('feature: ', 'yellow')
print(feature)

debug_message('genome_seq: ', 'yellow')
print(genome_seq)

pseudo_seq = ""
## fill datafarme
for keys, values in qualif.items():
#fill the dataframe info
if keys not in columns:
continue

## Save keys into dataframe
annot_df.loc[protID,[keys]] = [values[0]]

####################################
## Pseudogenes:
####################################
if keys=="pseudo":
pseudo = "True"
annot_df.loc[protID,["pseudo"]] = [pseudo]

#we create a FASTA file with protein sequences

pseudo = True

## set pseudo True/False
annot_df.loc[protID,["pseudo"]] = ["True"]
table_code = feature.qualifiers["transl_table"][0]
pseudo_seq = genome_seq.translate(table=table_code, to_stop=False)

## Debug messages
if (debug):
debug_message('feature: ', 'yellow')
print(feature)
print("***************************************")
debug_message('Pseudogene: ', 'yellow')
print("***************************************")
debug_message('feature.location.nofuzzy_start: ', 'yellow')
print(feature.location.nofuzzy_start)
debug_message('feature.location.nofuzzy_end: ', 'yellow')
print(feature.location.nofuzzy_end)
debug_message('Translation table code: ', 'yellow')
print(table_code)
debug_message('genome_seq: ', 'yellow')
print(genome_seq)
debug_message('pseudo_seq: ', 'yellow')
print(pseudo_seq)


if keys=="translation":
#pseudogenes have no translation item
gene_seq = Seq.Seq(feature.qualifiers["translation"][0])
else:
pass
## create a sequence fasta entry
if (pseudo):
# Pseudogenes have no translation item
# set translated CDS even including *
if len(pseudo_seq)!=0:
gene_seq = pseudo_seq
else:
## sometimes it might fail
gene_seq=Seq.Seq('***')

else:
## CDS provided by genbank
gene_seq = Seq.Seq(feature.qualifiers["translation"][0])

yield(SeqRecord(gene_seq, protID,"",""))

## print to file
Expand Down
16 changes: 13 additions & 3 deletions code/BacDup/scripts/gff_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def gff_parser_caller(gff_file, ref_file, output_path, debug):

## parse
with open(prot_file, "w") as out_handle:
SeqIO.write(protein_recs(gff_file, ref_recs, list_out_files, debug=debug), out_handle, "fasta")
SeqIO.write(protein_recs(gff_file, ref_recs,
list_out_files, debug=debug), out_handle, "fasta")

## return information
return (list_out_files)
Expand Down Expand Up @@ -121,8 +122,17 @@ def protein_recs(gff_file, ref_recs, list_out_files, debug=False):
if feature.strand == -1:
gene_seq = gene_seq.reverse_complement()

#delete STOP symbols
protein_seq = gene_seq.translate(to_stop=True)
# translate genome sequence
table_code = feature.qualifiers["transl_table"][0]
protein_seq = gene_seq.translate(table=table_code, to_stop=False)

# delete STOP symbols
# we set gene_seq.translate to include all stop codons to include
# stop codons in pseudogenes. then, we removed last symbol * for
# each sequence

if protein_seq.endswith("*"):
protein_seq = protein_seq[:-1]

yield(SeqRecord(protein_seq, protID, "", ""))

Expand Down

0 comments on commit 3091380

Please sign in to comment.