Skip to content

Commit

Permalink
Update lifton/extract_sequence.py to handle GTF file
Browse files Browse the repository at this point in the history
  • Loading branch information
Kuanhao-Chao committed Oct 10, 2024
1 parent ee2a686 commit 62c8df7
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 15 deletions.
63 changes: 50 additions & 13 deletions lifton/extract_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,69 @@ def extract_features(ref_db, features, ref_fai):
return ref_trans, ref_proteins


# def __inner_extract_feature(ref_db, feature, ref_fai, ref_trans, ref_proteins):
# # If exon is the first level children
# children_exons = list(ref_db.db_connection.children(feature, featuretype='exon', level=1))
# children_CDSs = list(ref_db.db_connection.children(feature, featuretype=('start_codon', 'CDS', 'stop_codon'), level=1))
# if len(children_exons) > 0 or len(children_CDSs) > 0:
# # print(f"Parent: {feature.id}; exon: {len(children_exons)}; CDS: {len(children_CDSs)}")
# if len(children_exons) > 0:
# trans_seq = get_dna_sequence(feature, ref_fai, children_exons)
# trans_seq = trans_seq.upper()
# ref_trans[feature.id] = trans_seq
# if len(children_CDSs) > 0:
# protein_seq = get_protein_sequence(feature, ref_fai, children_CDSs)
# protein_seq = protein_seq.upper()
# ref_proteins[feature.id] = protein_seq
# else:
# for child in ref_db.db_connection.children(feature, level=1, order_by='start'):
# __inner_extract_feature(ref_db, child, ref_fai, ref_trans, ref_proteins)


def __inner_extract_feature(ref_db, feature, ref_fai, ref_trans, ref_proteins):
# If exon is the first level children
children_exons = list(ref_db.db_connection.children(feature, featuretype='exon', level=1))
children_CDSs = list(ref_db.db_connection.children(feature, featuretype=('start_codon', 'CDS', 'stop_codon'), level=1))
if len(children_exons) > 0 or len(children_CDSs) > 0:
# print(f"Parent: {feature.id}; exon: {len(children_exons)}; CDS: {len(children_CDSs)}")
if len(children_exons) > 0:
trans_seq = get_dna_sequence(feature, ref_fai, children_exons)
if feature.featuretype == 'gene':
# For GTF files, transcripts are children of genes
transcripts = list(ref_db.db_connection.children(feature, featuretype='transcript', order_by='start'))
for transcript in transcripts:
__inner_extract_feature(ref_db, transcript, ref_fai, ref_trans, ref_proteins)
elif feature.featuretype == 'transcript':
# Get exons and CDSs from transcripts
exons = list(ref_db.db_connection.children(feature, featuretype='exon', order_by='start'))
CDSs = list(ref_db.db_connection.children(feature, featuretype='CDS', order_by='start'))
if exons:
trans_seq = get_dna_sequence(feature, ref_fai, exons)
trans_seq = trans_seq.upper()
ref_trans[feature.id] = trans_seq
if len(children_CDSs) > 0:
protein_seq = get_protein_sequence(feature, ref_fai, children_CDSs)
if CDSs:
protein_seq = get_protein_sequence(feature, ref_fai, CDSs)
protein_seq = protein_seq.upper()
ref_proteins[feature.id] = protein_seq
else:
for child in ref_db.db_connection.children(feature, level=1, order_by='start'):
__inner_extract_feature(ref_db, child, ref_fai, ref_trans, ref_proteins)
# For GFF files, exons and CDSs might be direct children of the feature
exons = list(ref_db.db_connection.children(feature, featuretype='exon', level=1, order_by='start'))
CDSs = list(ref_db.db_connection.children(feature, featuretype='CDS', level=1, order_by='start'))
if exons or CDSs:
if exons:
trans_seq = get_dna_sequence(feature, ref_fai, exons)
trans_seq = trans_seq.upper()
ref_trans[feature.id] = trans_seq
if CDSs:
protein_seq = get_protein_sequence(feature, ref_fai, CDSs)
protein_seq = protein_seq.upper()
ref_proteins[feature.id] = protein_seq
else:
# Recursively check for children in case of nested features
for child in ref_db.db_connection.children(feature, order_by='start'):
__inner_extract_feature(ref_db, child, ref_fai, ref_trans, ref_proteins)


def merge_children_intervals(children):
if len(children) == 0:
if not children:
return []
intervals = [[child.start, child.end] for child in children]
intervals.sort(key=lambda interval: interval[0])
merged = [intervals[0]]
for current in intervals:
for current in intervals[1:]:
previous = merged[-1]
if current[0] <= previous[1]:
previous[1] = max(previous[1], current[1])
Expand Down
4 changes: 2 additions & 2 deletions lifton/run_miniprot.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from lifton import align_bk, logger, lifton_class, lifton_utils
from lifton import align, logger, lifton_class, lifton_utils
import subprocess, os, sys, copy
from intervaltree import Interval, IntervalTree

Expand Down Expand Up @@ -95,7 +95,7 @@ def lifton_miniprot_with_ref_protein(m_feature, m_feature_db, ref_db, ref_gene_i
# Update LiftOn status
lifton_status = lifton_class.Lifton_Status()
m_entry = m_feature_db[mtrans_id]
m_lifton_aln = align_bk.lifton_parasail_align(Lifton_trans, m_entry, tgt_fai, ref_proteins, ref_trans_id)
m_lifton_aln = align.lifton_parasail_align(Lifton_trans, m_entry, tgt_fai, ref_proteins, ref_trans_id)
lifton_status.annotation = "miniprot"
lifton_status.lifton_aa = m_lifton_aln.identity
return lifton_gene, Lifton_trans, Lifton_trans.entry.id, lifton_status
Expand Down

0 comments on commit 62c8df7

Please sign in to comment.