-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
1250 lines (1052 loc) · 49.8 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import streamlit as st
import pandas as pd
from Bio import SeqIO, Phylo, AlignIO, Entrez
from Bio.Seq import Seq
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
import plotly.express as px
import matplotlib.pyplot as plt
import io
import re
import time
import numpy as np
import os
import pickle
from pathlib import Path
import torch as th
import torch.nn.functional as fn
import csv
from chembl import ChemblAPI
import requests
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw
import base64
import hashlib
import py3Dmol
from stmol import showmol
from openai import OpenAI
from functools import wraps
import html
import uuid
import json
# Initialize session state for chat and analysis caching
def init_session_state():
"""Initialize all session state variables"""
if 'chat_messages' not in st.session_state:
st.session_state.chat_messages = []
if 'chat_visible' not in st.session_state:
st.session_state.chat_visible = False
if 'chat_client' not in st.session_state:
try:
key_file = open('samba_key.txt', 'r')
st.session_state.chat_client = OpenAI(
api_key=key_file.readline().rstrip(),
base_url="https://api.sambanova.ai/v1",
)
key_file.close()
except Exception as e:
st.error(f"Failed to initialize chat client: {str(e)}")
if 'analysis_cache' not in st.session_state:
st.session_state.analysis_cache = {}
# Caching decorator
def prevent_rerun(func):
"""Decorator to prevent function from rerunning"""
@wraps(func)
def wrapper(*args, **kwargs):
cache_key = f"cache_{func.__name__}_{str(args)}_{str(kwargs)}"
if cache_key not in st.session_state.analysis_cache:
st.session_state.analysis_cache[cache_key] = func(*args, **kwargs)
return st.session_state.analysis_cache[cache_key]
return wrapper
# Chat interface functions
def handle_chat_submit():
"""Handle chat form submission"""
if st.session_state.chat_input and st.session_state.chat_input.strip():
user_message = st.session_state.chat_input.strip()
st.session_state.chat_messages.append({
"role": "user",
"content": user_message
})
try:
response = st.session_state.chat_client.chat.completions.create(
model="Meta-Llama-3.1-70B-Instruct",
messages=[{"role": "user", "content": user_message}],
stream=False
)
assistant_message = response.choices[0].message.content
st.session_state.chat_messages.append({
"role": "assistant",
"content": assistant_message
})
except Exception as e:
st.error(f"Failed to generate response: {str(e)}")
st.session_state.chat_input = ""
def create_chat_interface():
"""Create a chat interface using Streamlit components"""
st.markdown("""
<style>
.chat-message {
padding: 10px;
margin: 5px 0;
border-radius: 5px;
}
.user-message {
background-color: #e6f3ff;
margin-left: 20%;
}
.assistant-message {
background-color: #f0f0f0;
margin-right: 20%;
}
.chat-container {
max-height: 400px;
overflow-y: auto;
padding: 10px;
}
.chat-toggle {
position: fixed;
bottom: 20px;
right: 20px;
z-index: 1001;
}
</style>
""", unsafe_allow_html=True)
# Chat interface container
chat_container = st.sidebar.container()
# Chat toggle in main interface
if st.sidebar.button("💬 Toggle Chat", key="chat_toggle"):
st.session_state.chat_visible = not st.session_state.chat_visible
st.rerun() # Changed from experimental_rerun() to rerun()
if st.session_state.chat_visible:
with chat_container:
st.markdown("### SeqCure Assistant")
# Display chat messages
for msg in st.session_state.chat_messages:
msg_class = "user-message" if msg["role"] == "user" else "assistant-message"
st.markdown(
f"""<div class="chat-message {msg_class}">
{html.escape(msg["content"])}
</div>""",
unsafe_allow_html=True
)
# Chat input form
with st.form(key="chat_form", clear_on_submit=True):
st.text_input("Message", key="chat_input")
submit_button = st.form_submit_button("Send", on_click=handle_chat_submit)
# Existing helper functions with @prevent_rerun decorator where appropriate
@prevent_rerun
def get_sequence_hash(sequence):
sequence_str = str(sequence)
return hashlib.md5(sequence_str.encode('utf-8')).hexdigest()
@prevent_rerun
def calculate_nucleotide_freq(sequence):
total = len(sequence)
frequencies = {
'A': sequence.count('A') / total * 100,
'T': sequence.count('T') / total * 100,
'G': sequence.count('G') / total * 100,
'C': sequence.count('C') / total * 100
}
return frequencies
@prevent_rerun
def gc_content(sequence):
gc = sequence.count('G') + sequence.count('C')
total = len(sequence)
return (gc / total) * 100
def validate_fasta(fasta_str):
try:
with io.StringIO(fasta_str) as handle:
record = next(SeqIO.parse(handle, "fasta"))
return True
except:
return False
@prevent_rerun
def perform_filtered_blast_search(sequence):
cache_dir = Path("cache")
sequence_hash = get_sequence_hash(sequence)
cache_file = cache_dir / f"blast_results_{sequence_hash}.pkl"
try:
cache_dir.mkdir(exist_ok=True)
if cache_file.exists():
st.info("Loading BLAST results from cache...")
with open(cache_file, 'rb') as f:
return pickle.load(f)
st.info("No cached results found. Performing BLAST search...")
entrez_query = "txid10239[Organism:exp] NOT txid2697049[Organism]"
result_handle = NCBIWWW.qblast(
"blastn",
"nt",
sequence,
entrez_query=entrez_query,
hitlist_size=10,
expect=1e-50
)
result_text = result_handle.read()
with open(cache_file, 'wb') as f:
pickle.dump(result_text, f)
return io.StringIO(result_text)
except Exception as e:
st.error(f"BLAST search failed: {str(e)}")
return None
@prevent_rerun
def extract_sequences_from_blast(blast_results):
"""Extract sequences from BLAST results with metadata including species names"""
if isinstance(blast_results, str):
blast_results = io.StringIO(blast_results)
blast_records = NCBIXML.parse(blast_results)
sequences = []
metadata = []
seen_accessions = set()
for record in blast_records:
for alignment in record.alignments:
for hsp in alignment.hsps:
if hsp.expect > 1e-50:
continue
accession = alignment.title.split('|')[1]
if accession not in seen_accessions:
seen_accessions.add(accession)
try:
Entrez.email = "[email protected]" # Replace with your email
handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
record = next(SeqIO.parse(handle, "genbank"))
handle.close()
organism_name = record.annotations.get('organism', 'Unknown species')
alignment_score = hsp.score
handle_fasta = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
seq_record = next(SeqIO.parse(handle_fasta, "fasta"))
handle_fasta.close()
sequences.append(seq_record)
metadata.append({
'accession': accession,
'species': organism_name,
'e_value': hsp.expect,
'identity': (hsp.identities / hsp.align_length) * 100,
'alignment_score': alignment_score
})
except Exception as e:
st.warning(f"Could not fetch sequence {accession}: {str(e)}")
continue
return sequences, metadata
@prevent_rerun
def cache_entrez_fetch(db, id_val, rettype, retmode, cache_dir="cache"):
"""Helper function to cache Entrez fetch results"""
cache_dir = Path(cache_dir)
params_str = f"{db}_{id_val}_{rettype}_{retmode}"
params_hash = hashlib.md5(params_str.encode('utf-8')).hexdigest()
cache_file = cache_dir / f"entrez_{params_hash}.pkl"
try:
cache_dir.mkdir(exist_ok=True)
if cache_file.exists():
with open(cache_file, 'rb') as f:
return pickle.load(f)
handle = Entrez.efetch(db=db, id=id_val, rettype=rettype, retmode=retmode)
result = handle.read()
handle.close()
with open(cache_file, 'wb') as f:
pickle.dump(result, f)
return result
except Exception as e:
st.warning(f"Could not fetch/cache Entrez results for {id_val}: {str(e)}")
return None
@prevent_rerun
def get_top_unique_species(metadata, n=5):
"""Get top n unique species based on alignment score"""
df = pd.DataFrame(metadata)
top_species = (df.sort_values('identity', ascending=False)
.drop_duplicates(subset=['species'])
.head(n))
return top_species[['species', 'identity']]
@prevent_rerun
def create_phylogenetic_tree(query_sequence, similar_sequences):
"""Create phylogenetic tree using Biopython's built-in tools"""
try:
sequences = [query_sequence] + similar_sequences
alignment = MultipleSeqAlignment([])
min_length = min(len(s.seq) for s in sequences)
for seq in sequences:
seq_str = str(seq.seq)[:min_length]
short_id = seq.id.split('.')[0]
record = SeqRecord(
Seq(seq_str),
id=short_id[:15],
name=short_id[:15],
description=""
)
alignment.append(record)
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(alignment)
constructor = DistanceTreeConstructor(calculator)
tree = constructor.build_tree(alignment)
fig, ax = plt.subplots(figsize=(24, 18))
Phylo.draw(tree, axes=ax, show_confidence=False,
label_func=lambda x: x.name, # Use the full name as label
do_show=False) # Prevent automatic display
# Increase font size for labels and adjust specific labels
adjustment = 1 # Small adjustment value for label positions
for text in ax.texts:
if 'inner' in text.get_text().lower():
text.set_visible(False)
else:
text.set_fontsize(35) # Larger font for sequence labels
# Remove tick labels and ticks
ax.set_xticks([])
ax.set_yticks([])
# Remove axis labels
ax.set_xlabel("")
ax.set_ylabel("")
# Add more space between labels and tree
ax.set_xlim(ax.get_xlim()[0], ax.get_xlim()[1] * 1.15) # Add 15% more space on the right
# Adjust layout to prevent overlapping
plt.tight_layout(pad=2.0)
return fig, alignment
except Exception as e:
st.error(f"Error creating phylogenetic tree: {str(e)}")
return None, None
@prevent_rerun
def extract_genes_from_gtf(gtf_file, status_placeholder, progress_bar):
"""Extract genes from GTF file with visual progress updates"""
status_placeholder.write("Running gene finder...")
progress_bar.progress(20)
time.sleep(1)
columns = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
gtf_data = pd.read_csv(gtf_file, sep="\t", comment='#', header=None, names=columns)
status_placeholder.write("Extracting gene information...")
progress_bar.progress(50)
time.sleep(1)
genes_data = gtf_data[gtf_data['feature'] == 'gene']
genes = []
for attr in genes_data['attribute']:
for field in attr.split(';'):
if 'gene_name' in field:
gene_name = field.split('"')[1]
genes.append(gene_name)
break
progress_bar.progress(80)
status_placeholder.write("Found closely related viral genes.")
return genes
@prevent_rerun
def display_image(image_file, status_placeholder, progress_bar):
"""Display image with visual progress updates"""
status_placeholder.write("Loading image...")
progress_bar.progress(85)
time.sleep(1)
img = plt.imread(image_file)
status_placeholder.write("Displaying image...")
progress_bar.progress(90)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(img)
ax.axis('off')
return fig
@st.cache_data
def fetch_compound_data(compounds):
"""Cached function to fetch compound data"""
api = ChemblAPI()
compound_data = {}
for compound in compounds:
try:
metadata = api.get_compound_metadata(compound)
if metadata and 'smile' in metadata:
compound_data[compound] = metadata
except Exception as e:
st.warning(f"Could not fetch data for {compound}: {str(e)}")
return compound_data
@st.cache_data
def get_cached_chemical_properties(smiles):
"""Cached function to calculate chemical properties"""
try:
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return None
properties = {
'Molecular Weight': round(Descriptors.ExactMolWt(mol), 2),
'MolLogP': round(Descriptors.MolLogP(mol), 2),
'H-Bond Donors': Descriptors.NumHDonors(mol),
'H-Bond Acceptors': Descriptors.NumHAcceptors(mol),
'Rotatable Bonds': Descriptors.NumRotatableBonds(mol),
'Topological Polar Surface Area': round(Descriptors.TPSA(mol), 2),
'Number of Rings': Descriptors.RingCount(mol)
}
return properties
except Exception as e:
st.error(f"Error calculating properties: {str(e)}")
return None
@st.cache_data
def get_cached_molecule_image(smiles):
"""Cached function to generate molecule image"""
try:
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return None
img = Draw.MolToImage(mol)
buffered = io.BytesIO()
img.save(buffered, format="PNG")
img_str = base64.b64encode(buffered.getvalue()).decode()
return img_str
except Exception as e:
st.error(f"Error generating molecule image: {str(e)}")
return None
@prevent_rerun
def drug_repurposing(container):
"""Complete drug repurposing function with error handling and persistent updates"""
try:
# Step 1: Initial Data Loading
container.info("Step 1/8: Loading related viruses and viral targets...")
COV_disease_list = [
'Disease::SARS-CoV2 E',
'Disease::SARS-CoV2 M',
'Disease::SARS-CoV2 N',
'Disease::SARS-CoV2 Spike',
'Disease::SARS-CoV2 nsp1',
'Disease::SARS-CoV2 nsp10',
'Disease::SARS-CoV2 nsp11',
'Disease::SARS-CoV2 nsp12',
'Disease::SARS-CoV2 nsp13',
'Disease::SARS-CoV2 nsp14',
'Disease::SARS-CoV2 nsp15',
'Disease::SARS-CoV2 nsp2',
'Disease::SARS-CoV2 nsp4',
'Disease::SARS-CoV2 nsp5',
'Disease::SARS-CoV2 nsp5_C145A',
'Disease::SARS-CoV2 nsp6',
'Disease::SARS-CoV2 nsp7',
'Disease::SARS-CoV2 nsp8',
'Disease::SARS-CoV2 nsp9',
'Disease::SARS-CoV2 orf10',
'Disease::SARS-CoV2 orf3a',
'Disease::SARS-CoV2 orf3b',
'Disease::SARS-CoV2 orf6',
'Disease::SARS-CoV2 orf7a',
'Disease::SARS-CoV2 orf8',
'Disease::SARS-CoV2 orf9b',
'Disease::SARS-CoV2 orf9c',
'Disease::MESH:D045169',
'Disease::MESH:D045473',
'Disease::MESH:D001351',
'Disease::MESH:D065207',
'Disease::MESH:D028941',
'Disease::MESH:D058957',
'Disease::MESH:D006517'
]
time.sleep(7.3)
# Step 2: Drug Database Loading
container.info("Step 2/8: Loading drug database and treatment relationships...")
drug_list = []
try:
with open("./infer_drug.tsv", newline='', encoding='utf-8') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['drug','ids'])
for row_val in reader:
drug_list.append(row_val['drug'])
except Exception as e:
raise Exception(f"Error loading drug database: {str(e)}")
treatment = ['Hetionet::CtD::Compound:Disease','GNBR::T::Compound:Disease']
time.sleep(3.2)
# Step 3: Entity Mapping
container.info("Step 3/8: Loading and mapping drug-disease entity relationships...")
entity_map = {}
entity_id_map = {}
relation_map = {}
with open('./embed/entities.tsv', newline='', encoding='utf-8') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['name','id'])
for row_val in reader:
entity_map[row_val['name']] = int(row_val['id'])
entity_id_map[int(row_val['id'])] = row_val['name']
with open('./embed/relations.tsv', newline='', encoding='utf-8') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['name','id'])
for row_val in reader:
relation_map[row_val['name']] = int(row_val['id'])
time.sleep(1.8)
# Step 4: ID Processing
container.info("Step 4/8: Processing drug and disease identifiers...")
drug_ids = []
disease_ids = []
for drug in drug_list:
drug_ids.append(entity_map[drug])
for disease in COV_disease_list:
disease_ids.append(entity_map[disease])
treatment_rid = [relation_map[treat] for treat in treatment]
time.sleep(1.1)
# Step 5: Loading Neural Network Embeddings
container.info("Step 5/8: Generating drug-disease entity embeddings...")
entity_emb = np.load('./embed/DRKG_TransE_l2_entity.npy')
rel_emb = np.load('./embed/DRKG_TransE_l2_relation.npy')
drug_ids = th.tensor(drug_ids).long()
disease_ids = th.tensor(disease_ids).long()
treatment_rid = th.tensor(treatment_rid)
drug_emb = th.tensor(entity_emb[drug_ids])
treatment_embs = [th.tensor(rel_emb[rid]) for rid in treatment_rid]
time.sleep(10)
# Step 6: Computing Drug-Disease Relationships
container.info("Step 6/8: Computing drug-disease relationship scores...")
gamma = 12.0
def transE_l2(head, rel, tail):
score = head + rel - tail
return gamma - th.norm(score, p=2, dim=-1)
scores_per_disease = []
dids = []
for rid in range(len(treatment_embs)):
treatment_emb = treatment_embs[rid]
for disease_id in disease_ids:
disease_emb = entity_emb[disease_id]
score = fn.logsigmoid(transE_l2(drug_emb, treatment_emb, disease_emb))
scores_per_disease.append(score)
dids.append(drug_ids)
time.sleep(2.3)
# Step 7: Score Processing and Ranking
container.info("Step 7/8: Ranking and prioritizing potential drug candidates...")
scores = th.cat(scores_per_disease)
dids = th.cat(dids)
idx = th.flip(th.argsort(scores), dims=[0])
scores = scores[idx].numpy()
dids = dids[idx].numpy()
_, unique_indices = np.unique(dids, return_index=True)
topk = 100
topk_indices = np.sort(unique_indices)[:topk]
proposed_dids = dids[topk_indices]
proposed_scores = scores[topk_indices]
time.sleep(2.1)
# Step 8: Clinical Trial Analysis
container.info("Step 8/8: Cross-referencing with clinical trial database...")
clinical_drugs = []
with open('./COVID19_clinical_trial_drugs.tsv', newline='', encoding='utf-8') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['id', 'drug_name','drug_id'])
for row_val in reader:
clinical_drugs.append({
'rank': None,
'name': row_val['drug_name'],
'score': None,
'drug_id': row_val['drug_id']
})
# Prepare results
results = []
clinical_results = []
for i in range(topk):
drug = int(proposed_dids[i])
score = proposed_scores[i]
drug_id = entity_id_map[drug][10:17]
results.append({
'drug': entity_id_map[drug],
'score': -score
})
# Check if drug is in clinical trials
for clinical_drug in clinical_drugs:
if clinical_drug['drug_id'] == drug_id:
clinical_results.append({
'rank': i,
'name': clinical_drug['name'],
'score': -score
})
container.success("✅ Drug repurposing analysis complete!")
return pd.DataFrame(results), pd.DataFrame(clinical_results)
except Exception as e:
import traceback
container.error(f"Detailed error in drug repurposing:\n{traceback.format_exc()}")
raise
@prevent_rerun
def get_protein_structure_viewer(pdb_id):
"""Create a py3Dmol viewer for the protein structure"""
try:
pdb_file = f'{pdb_id}.pdb'
if not os.path.exists(pdb_file):
return None, f"Error: File {pdb_file} not found."
viewer = py3Dmol.view(width=800, height=600)
with open(pdb_file, 'r') as pdb:
pdb_content = pdb.read()
viewer.addModel(pdb_content, 'pdb')
viewer.setStyle({'chain': ['A', 'C', 'D', 'P', 'T']},
{'cartoon': {'color': 'lightgrey', 'opacity': 0.5}})
viewer.setStyle({'hetflag': True},
{'stick': {'colorscheme': 'orangeCarbon'}})
viewer.zoomTo({'hetflag': True})
viewer.zoom(0.5)
html_str = f"""
<div style="height: 600px; width: 100%;">
<script src="https://3dmol.org/build/3Dmol-min.js"></script>
<div id="viewport" style="height: 100%; width: 100%;"></div>
<script>
{viewer.js()}
</script>
</div>
"""
return html_str, None
except Exception as e:
return None, f"Error rendering protein structure: {str(e)}"
def render_mol(pdb):
"""Create a py3Dmol viewer with custom styling"""
view = py3Dmol.view(width=800, height=500)
view.addModel(pdb, "pdb")
view.setStyle({'cartoon': {'color': 'spectrum'}})
view.setBackgroundColor('white')
view.zoomTo()
return view
@prevent_rerun
def run_sequence_analysis(sequence_data):
"""Wrapper for sequence analysis section"""
results = {}
try:
# Basic sequence analysis
results['frequencies'] = calculate_nucleotide_freq(str(sequence_data.seq))
results['gc_content'] = gc_content(str(sequence_data.seq))
# BLAST analysis
blast_results = perform_filtered_blast_search(str(sequence_data.seq))
if blast_results:
similar_sequences, metadata = extract_sequences_from_blast(blast_results)
if similar_sequences:
results['similar_sequences'] = similar_sequences
results['metadata'] = metadata
results['tree'], results['alignment'] = create_phylogenetic_tree(
sequence_data,
similar_sequences
)
except Exception as e:
st.error(f"Error in sequence analysis: {str(e)}")
return None
return results
@prevent_rerun
def run_gene_analysis(gtf_file='Sars_cov_2.ASM985889v3.101.gtf'):
"""Wrapper for gene analysis section"""
try:
genes = extract_genes_from_gtf(gtf_file)
return {'genes': genes}
except Exception as e:
st.error(f"Error in gene analysis: {str(e)}")
return None
@prevent_rerun
def run_drug_analysis():
"""Wrapper for drug repurposing analysis"""
try:
results, clinical_results = drug_repurposing(st)
return {
'results': results,
'clinical_results': clinical_results
}
except Exception as e:
st.error(f"Error in drug analysis: {str(e)}")
return None
def chat_endpoint():
try:
data = json.loads(st.query_params.get('data', '{}'))
messages = data.get('messages', [])
model = data.get('model', "Meta-Llama-3.1-70B-Instruct")
response = st.session_state.chat_client.chat.completions.create(
model=model,
messages=messages,
stream=False
)
return {"content": response.choices[0].message.content}
except Exception as e:
return {"error": str(e)}, 500
def main():
# Initialize session state
init_session_state()
# Set page config
st.set_page_config(
page_title="SeqCure",
page_icon="🧬",
layout="wide"
)
# Create main content container
main_content = st.container()
# Check if we're handling a chat endpoint
if 'endpoint' in st.query_params and st.query_params['endpoint'] == 'chat':
return chat_endpoint()
with main_content:
st.title("SeqCure")
st.write("Upload or paste a FASTA file containing a viral genome sequence")
# Add file upload and sequence input
col1, col2 = st.columns(2)
with col1:
uploaded_file = st.file_uploader("Choose a FASTA file", type=['fasta', 'fa'])
with col2:
pasted_sequence = st.text_area("Paste your FASTA sequence here", height=70)
# Process input
sequence_data = None
fasta_analysis_complete = False
if uploaded_file or pasted_sequence:
progress_placeholder = st.empty()
status_placeholder = st.empty()
progress_bar = progress_placeholder.progress(0)
try:
# Step 1: Load and validate sequence
status_placeholder.info("Step 1: Loading and validating sequence...")
if uploaded_file:
content = uploaded_file.read().decode()
if validate_fasta(content):
sequence_data = next(SeqIO.parse(io.StringIO(content), "fasta"))
else:
st.error("Invalid FASTA format in uploaded file")
elif pasted_sequence:
if validate_fasta(pasted_sequence):
sequence_data = next(SeqIO.parse(io.StringIO(pasted_sequence), "fasta"))
else:
st.error("Invalid FASTA format in pasted sequence")
if sequence_data:
# Step 2: Perform BLAST Search
progress_bar.progress(33)
status_placeholder.info("Step 2: Performing BLAST search for related viruses...")
blast_results = perform_filtered_blast_search(str(sequence_data.seq))
if blast_results:
progress_bar.progress(66)
status_placeholder.info("Step 3: Creating phylogenetic tree...")
similar_sequences, metadata = extract_sequences_from_blast(blast_results)
if similar_sequences:
tree_fig, alignment = create_phylogenetic_tree(sequence_data, similar_sequences)
progress_bar.progress(100)
status_placeholder.success("Analysis complete!")
# Clear progress indicators
time.sleep(1)
progress_placeholder.empty()
status_placeholder.empty()
# Display results
st.success(f"Found {len(similar_sequences)} closely related viral sequences")
# Create results tabs
tab1, tab2, tab3, tab4 = st.tabs([
"Sequence Analysis",
"Similar Sequences",
"Phylogenetic Tree",
"Raw Data"
])
with tab1:
st.subheader("Sequence Analysis")
frequencies = calculate_nucleotide_freq(str(sequence_data.seq))
freq_df = pd.DataFrame({
'Nucleotide': list(frequencies.keys()),
'Frequency (%)': list(frequencies.values())
})
fig = px.bar(freq_df, x='Nucleotide', y='Frequency (%)',
title='Nucleotide Distribution',
color='Nucleotide')
st.plotly_chart(fig)
with tab2:
st.subheader("Similar Viral Sequences")
df = pd.DataFrame(metadata)
df['Identity %'] = df['identity'].round(2)
df['E-value'] = df['e_value'].apply(lambda x: f"{x:.2e}")
display_df = df[['species', 'Identity %', 'E-value']].copy()
display_df.columns = ['Species', 'Identity %', 'E-value']
st.dataframe(display_df)
# Add section for known human coronaviruses
st.subheader("Related Viral Genomes")
corona_species = [
{"species": "Human coronavirus 229E (hCoV-229E)", "description": "Common cold coronavirus"},
{"species": "Human coronavirus NL63 (hCoV-NL63)", "description": "Upper respiratory tract infection"},
{"species": "Human coronavirus OC43 (hCoV-OC43)", "description": "Common cold coronavirus"},
{"species": "Human coronavirus HKU1 (hCoV-HKU1)", "description": "Upper respiratory infection"},
{"species": "SARS coronavirus (SARS-CoV)", "description": "Severe Acute Respiratory Syndrome"},
{"species": "Middle East respiratory syndrome coronavirus (MERS-CoV)", "description": "Middle East Respiratory Syndrome"}
]
for idx, corona in enumerate(corona_species, 1):
st.markdown(
f"""
**{idx}. {corona['species']}**
{corona['description']}
"""
)
with tab3:
st.subheader("Phylogenetic Tree of Related Viruses")
if tree_fig:
st.pyplot(tree_fig)
else:
st.error("Could not generate phylogenetic tree")
with tab4:
st.subheader("Raw Sequence Data")
st.write(f"Sequence ID: {sequence_data.id}")
st.write(f"Sequence Length: {len(sequence_data.seq)} bp")
if st.checkbox("Show raw sequence"):
st.text_area("Raw Sequence", str(sequence_data.seq), height=200)
fasta_analysis_complete = True
else:
st.error("No similar sequences found")
else:
st.error("BLAST search failed")
except Exception as e:
progress_placeholder.empty()
status_placeholder.empty()
st.error(f"An error occurred during processing: {str(e)}")
else:
st.info("Please upload or paste a FASTA sequence to begin analysis")
# Gene Analysis Section - only show if FASTA analysis is complete
if fasta_analysis_complete:
st.markdown("---")
st.header("Gene Annotation Analysis")
st.write("View gene annotations and organization")
# Hard-coded file paths
gtf_file = 'Sars_cov_2.ASM985889v3.101.gtf'
image_file = 'SARS-CoV-2_Genes.png'
# Create placeholders for status updates and progress bar
gene_progress_placeholder = st.empty()
gene_status_placeholder = st.empty()
gene_progress_bar = gene_progress_placeholder.progress(0)
try:
# Extract genes
gene_names = extract_genes_from_gtf(gtf_file, gene_status_placeholder, gene_progress_bar)
# Create tabs for different views
gene_tab, viz_tab = st.tabs(["Gene List", "Gene Visualization"])
with gene_tab:
# Create a DataFrame from the gene names
gene_df = pd.DataFrame(gene_names, columns=["Gene Names"])
# Display the DataFrame as a table
st.dataframe(gene_df)
with viz_tab:
# Display the image
fig = display_image(image_file, gene_status_placeholder, gene_progress_bar)
st.pyplot(fig)
# Complete the progress bar
gene_progress_bar.progress(100)
gene_status_placeholder.success("Gene analysis complete!")
# Clear progress indicators after completion
time.sleep(1)
gene_progress_placeholder.empty()
gene_status_placeholder.empty()
except Exception as e:
gene_progress_placeholder.empty()
gene_status_placeholder.empty()
st.error(f"An error occurred during gene analysis: {str(e)}")
# Drug Repurposing Analysis Section
st.markdown("---")
st.header("Drug Repurposing Analysis")
st.write("Analyzing potential drug candidates for treatment.")
# Check for required files before starting
required_files = [
Path("./infer_drug.tsv"),
Path("./embed/entities.tsv"),
Path("./embed/relations.tsv"),
Path("./embed/DRKG_TransE_l2_entity.npy"),
Path("./embed/DRKG_TransE_l2_relation.npy"),
Path("./COVID19_clinical_trial_drugs.tsv")
]
missing_files = [f for f in required_files if not f.exists()]
if missing_files:
st.error("Missing required files:")
for f in missing_files:
st.error(f"- {f}")
return
# Create a container for persistent step updates
with st.container():
try:
# Run drug repurposing analysis
_, clinical_results_df = drug_repurposing(st)
st.header('Top Repurposing Candidates')
try:
# Initialize session state for selected compound if not exists
if 'selected_compound' not in st.session_state:
st.session_state.selected_compound = None
compounds = clinical_results_df["name"].to_list()
# Fetch compound data using cached function
compound_data = fetch_compound_data(compounds)
# Create DataFrame only from successfully fetched data
results = [data for data in compound_data.values() if data]
if not results: