Update lifton/extract_sequence.py to handle GTF file

Kuanhao-Chao · Oct 10, 2024 · 62c8df7 · 62c8df7
1 parent ee2a686
commit 62c8df7
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 15 deletions.
diff --git a/lifton/extract_sequence.py b/lifton/extract_sequence.py
@@ -9,32 +9,69 @@ def extract_features(ref_db, features, ref_fai):
     return ref_trans, ref_proteins
 
 
+# def __inner_extract_feature(ref_db, feature, ref_fai, ref_trans, ref_proteins):
+#     # If exon is the first level children
+#     children_exons = list(ref_db.db_connection.children(feature, featuretype='exon', level=1))
+#     children_CDSs = list(ref_db.db_connection.children(feature, featuretype=('start_codon', 'CDS', 'stop_codon'), level=1))
+#     if len(children_exons) > 0 or len(children_CDSs) > 0:
+#         # print(f"Parent: {feature.id};  exon: {len(children_exons)}; CDS: {len(children_CDSs)}")
+#         if len(children_exons) > 0:
+#             trans_seq = get_dna_sequence(feature, ref_fai, children_exons)
+#             trans_seq = trans_seq.upper()
+#             ref_trans[feature.id] = trans_seq
+#         if len(children_CDSs) > 0:
+#             protein_seq = get_protein_sequence(feature, ref_fai, children_CDSs)
+#             protein_seq = protein_seq.upper()
+#             ref_proteins[feature.id] = protein_seq
+#     else:
+#         for child in ref_db.db_connection.children(feature, level=1, order_by='start'):
+#             __inner_extract_feature(ref_db, child, ref_fai, ref_trans, ref_proteins)
+
+
 def __inner_extract_feature(ref_db, feature, ref_fai, ref_trans, ref_proteins):
-    # If exon is the first level children
-    children_exons = list(ref_db.db_connection.children(feature, featuretype='exon', level=1))
-    children_CDSs = list(ref_db.db_connection.children(feature, featuretype=('start_codon', 'CDS', 'stop_codon'), level=1))
-    if len(children_exons) > 0 or len(children_CDSs) > 0:
-        # print(f"Parent: {feature.id};  exon: {len(children_exons)}; CDS: {len(children_CDSs)}")
-        if len(children_exons) > 0:
-            trans_seq = get_dna_sequence(feature, ref_fai, children_exons)
+    if feature.featuretype == 'gene':
+        # For GTF files, transcripts are children of genes
+        transcripts = list(ref_db.db_connection.children(feature, featuretype='transcript', order_by='start'))
+        for transcript in transcripts:
+            __inner_extract_feature(ref_db, transcript, ref_fai, ref_trans, ref_proteins)
+    elif feature.featuretype == 'transcript':
+        # Get exons and CDSs from transcripts
+        exons = list(ref_db.db_connection.children(feature, featuretype='exon', order_by='start'))
+        CDSs = list(ref_db.db_connection.children(feature, featuretype='CDS', order_by='start'))
+        if exons:
+            trans_seq = get_dna_sequence(feature, ref_fai, exons)
             trans_seq = trans_seq.upper()
             ref_trans[feature.id] = trans_seq
-        if len(children_CDSs) > 0:
-            protein_seq = get_protein_sequence(feature, ref_fai, children_CDSs)
+        if CDSs:
+            protein_seq = get_protein_sequence(feature, ref_fai, CDSs)
             protein_seq = protein_seq.upper()
             ref_proteins[feature.id] = protein_seq
     else:
-        for child in ref_db.db_connection.children(feature, level=1, order_by='start'):
-            __inner_extract_feature(ref_db, child, ref_fai, ref_trans, ref_proteins)
+        # For GFF files, exons and CDSs might be direct children of the feature
+        exons = list(ref_db.db_connection.children(feature, featuretype='exon', level=1, order_by='start'))
+        CDSs = list(ref_db.db_connection.children(feature, featuretype='CDS', level=1, order_by='start'))
+        if exons or CDSs:
+            if exons:
+                trans_seq = get_dna_sequence(feature, ref_fai, exons)
+                trans_seq = trans_seq.upper()
+                ref_trans[feature.id] = trans_seq
+            if CDSs:
+                protein_seq = get_protein_sequence(feature, ref_fai, CDSs)
+                protein_seq = protein_seq.upper()
+                ref_proteins[feature.id] = protein_seq
+        else:
+            # Recursively check for children in case of nested features
+            for child in ref_db.db_connection.children(feature, order_by='start'):
+                __inner_extract_feature(ref_db, child, ref_fai, ref_trans, ref_proteins)
 
 
 def merge_children_intervals(children):
-    if len(children) == 0:
+    if not children:
         return []
     intervals = [[child.start, child.end] for child in children]
     intervals.sort(key=lambda interval: interval[0])
     merged = [intervals[0]]
-    for current in intervals:
+    for current in intervals[1:]:
         previous = merged[-1]
         if current[0] <= previous[1]:
             previous[1] = max(previous[1], current[1])

diff --git a/lifton/run_miniprot.py b/lifton/run_miniprot.py
@@ -1,4 +1,4 @@
-from lifton import align_bk, logger, lifton_class, lifton_utils
+from lifton import align, logger, lifton_class, lifton_utils
 import subprocess, os, sys, copy
 from intervaltree import Interval, IntervalTree
 
@@ -95,7 +95,7 @@ def lifton_miniprot_with_ref_protein(m_feature, m_feature_db, ref_db, ref_gene_i
     # Update LiftOn status
     lifton_status = lifton_class.Lifton_Status()                
     m_entry = m_feature_db[mtrans_id]
-    m_lifton_aln = align_bk.lifton_parasail_align(Lifton_trans, m_entry, tgt_fai, ref_proteins, ref_trans_id)
+    m_lifton_aln = align.lifton_parasail_align(Lifton_trans, m_entry, tgt_fai, ref_proteins, ref_trans_id)
     lifton_status.annotation =  "miniprot"
     lifton_status.lifton_aa = m_lifton_aln.identity
     return lifton_gene, Lifton_trans, Lifton_trans.entry.id, lifton_status