Skip to content

Commit c3b9cd3

Browse files
author
Mikhail Lebedev
committed
NEW: dynamic generation of SMILES for PTMs
1 parent 5fde17e commit c3b9cd3

File tree

6 files changed

+652
-0
lines changed

6 files changed

+652
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# SMILES templates for the 20 standard amino acids, keyed by one-letter code.
# [Xe] atoms are the N-terminal placeholders and [Rn] is the C-terminal
# placeholder (N_TERM_PLACEHOLDER / C_TERM_PLACEHOLDER in
# alphabase/smiles/smiles.py); they are substituted when a residue is built.
# Most entries carry two [Xe] on the backbone nitrogen; proline (P) carries
# only one because its nitrogen is part of the ring.
aa_smiles:
2+
A: N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn]
3+
R: N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn]
4+
N: N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn]
5+
D: N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn]
6+
C: N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn]
7+
Q: N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn]
8+
E: N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn]
9+
G: N([Xe])([Xe])CC(=O)[Rn]
10+
H: N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn]
11+
I: N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn]
12+
L: N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn]
13+
K: N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn]
14+
M: N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn]
15+
F: N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn]
16+
P: N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn]
17+
S: N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn]
18+
T: N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn]
19+
W: N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)[Rn]
20+
Y: N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn]
21+
V: N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn]
22+
23+
# N-terminal modification fragments, keyed by modification name.
# modify_amino_acid() in alphabase/smiles/smiles.py substitutes the fragment
# for an [Xe] placeholder of the residue. Any name containing "Dimethyl" is
# substituted for a second [Xe] as well; for every other name the remaining
# [Xe] becomes a hydrogen.
# Presumably the fragment is bonded through its first atom (RDKit's default
# replacement connection point) — TODO confirm.
# NOTE(review): Carbamidomethyl is written here as C(=O)NC, while
# Carbamidomethyl@C in ptm_dict attaches CC(=O)N to the sulfur; the two encode
# different connectivity — confirm which one is intended.
n_term_modifications:
24+
mTRAQ: C(=O)CN1CCN(CC1)C
25+
mTRAQ:d4: C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C
26+
mTRAQ:d8: C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H])
27+
Acetyl: C(=O)C
28+
Propionyl: C(=O)CC
29+
Biotin: C(=O)CCCCC1SCC2NC(=O)NC21
30+
Carbamidomethyl: C(=O)NC
31+
Carbamyl: C(=O)N
32+
Propionamide: C(=O)CC(N)=O
33+
Pyridylacetyl: C(=O)Cc1ccccn1
34+
Methyl: C
35+
Dimethyl: C
36+
Dimethyl:2H(6)13C(2): '[13C]([2H])([2H])([2H])'
37+
Dimethyl:2H(4): C([2H])([2H])([1H])
38+
Dimethyl:2H(4)13C(2): '[13C]([2H])([2H])([1H])'
39+
40+
# C-terminal modification fragments, keyed by modification name.
# modify_amino_acid() in alphabase/smiles/smiles.py substitutes the fragment
# for the [Rn] placeholder; when no C-terminal modification is requested the
# placeholder is replaced by a plain "O" atom instead.
c_term_modifications:
41+
Methyl: OC
42+
43+
# Complete SMILES of modified residues. The keys appear to follow the pattern
# "<modification>@<residue>", optionally followed by "^<position>".
# Each value keeps the [Xe] (N-terminal) and [Rn] (C-terminal) placeholders so
# it can be passed to the same functions as the plain amino-acid templates.
# Entries whose backbone nitrogen sits in a ring (pyro-Glu, proline analogues)
# carry a single [Xe].
# NOTE(review): Succinyl@K places the succinyl group on the nitrogen that
# bears [Xe] and leaves the side-chain amine free, unlike the other @K entries
# (e.g. Acetyl@K), which modify the side-chain nitrogen — confirm.
ptm_dict:
44+
Carbamidomethyl@C: C(C(C(=O)[Rn])N([Xe])([Xe]))SCC(=O)N
45+
Oxidation@M: O=C([Rn])C(N([Xe])([Xe]))CCS(=O)C
46+
GlyGly@K: NCC(=O)NCC(=O)NCCCC[C@H](N([Xe])([Xe]))C([Rn])=O
47+
Deamidated@N: C([C@@H](C(=O)[Rn])N([Xe])([Xe]))C(=O)O
48+
Propionyl@K: CCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])
49+
Deamidated@Q: C(CC(=O)O)[C@@H](C(=O)[Rn])N([Xe])([Xe])
50+
Gln->pyro-Glu@Q^Any N-term: O=C([Rn])[C@H]1N([Xe])C(=O)CC1
51+
Glu->pyro-Glu@E^Any N-term: O=C([Rn])[C@H]1N([Xe])C(=O)CC1
52+
Phospho@S: O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe])
53+
Nitro@Y: O=[N+]([O-])c1cc(ccc1O)C[C@@H](C(=O)[Rn])N([Xe])([Xe])
54+
Acetyl@K: CC(=O)NCCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]
55+
Dimethyl@K: CN(C)CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]
56+
mTRAQ@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)CN1CCN(C)CC1'
57+
mTRAQ:d4@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1CCN(C)[13CH2][13CH2]1'
58+
mTRAQ:d8@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1[13CH2][13CH2][15N]([13CH3])[13CH2][13CH2]1'
59+
Pyridylethyl@C: C1=CN=CC=C1CCSCC(C(=O)[Rn])N([Xe])([Xe])
60+
Butyryl@K: CCCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])
61+
Phospho@T: CC(C(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O
62+
Methylthio@C: CSSC[C@H](N([Xe])([Xe]))C([Rn])=O
63+
Carbamidomethyl@M: C[S+](CCC(N([Xe])([Xe]))C([Rn])=O)CC(N)=O
64+
Succinyl@K: C(CCN)CC(C(=O)[Rn])N([Xe])C(=O)CCC(=O)O
65+
Crotonyl@K: CC=CC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])
66+
Phospho@Y: C1=CC(=CC=C1CC(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O
67+
Malonyl@K: N([Xe])([Xe])[C@@H](CCCC(NC(=O)CC(=O)O)=O)C(=O)[Rn]
68+
Met->Hse@M: N([Xe])([Xe])[C@H](C(=O)[Rn])CCO
69+
Pro->(2S,4R)-4-fluoroproline@P: F[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]
70+
Pro->(2S, 4S)-4fluoroproline@P: F[C@H]1C[C@H](N([Xe])C1)C(=O)[Rn]
71+
Pro->(2S)-1,3-thiazolidine-2-carboxylic acid@P: S1[C@H](N([Xe])CC1)C(=O)[Rn]
72+
Pro->(4R)-1,3-Thiazolidine-4-carboxylic acid@P: S1CN([Xe])[C@@H](C1)C(=O)[Rn]
73+
Pro->(2S,4R)-4-hydroxyproline@P: O[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]
74+
Pro->(DL)-pipecolic acid@P: C1CCN([Xe])C(C1)C(=O)[Rn]
75+
Pro->3,4-Dehydro-L-proline@P: C1C=CC(N1([Xe]))C(=O)[Rn]
76+
Pro->(1S,3S,5S)-2-Azabicyclo[3.1.0]hexane-3-carboxylic acid@P: '[C@H]12N([Xe])[C@@H](C[C@@H]2C1)C(=O)[Rn]'
77+
Pro->(1R,3S,5R)-2-Azabicyclo[3.1.0]hexane-3-carboxylic acid@P: '[C@@H]12N([Xe])[C@@H](C[C@H]2C1)C(=O)[Rn]'
78+
Pro->(2S,3aS,7aS)-Octahydro-1H-indole-2-carboxylic acid@P: N1([Xe])[C@@H](C[C@@H]2CCCC[C@H]12)C(=O)[Rn]
79+
Pro->(DL)-5-trifluoromethylproline@P: FC(C1CCC(N1([Xe]))C(=O)[Rn])(F)F

alphabase/smiles/smiles.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
from typing import Optional
2+
import os
3+
from rdkit import Chem
4+
from rdkit.Chem.rdmolops import ReplaceSubstructs, SanitizeMol
5+
6+
from alphabase.constants._const import CONST_FILE_FOLDER
7+
from alphabase.yaml_utils import load_yaml
8+
9+
# Load every SMILES table from smiles.yaml in CONST_FILE_FOLDER. This runs once,
# when the module is imported.
SMILES_ALL: dict = load_yaml(os.path.join(CONST_FILE_FOLDER, "smiles.yaml"))
10+
11+
# Placeholder atoms that mark where the N-terminal and C-terminal groups of a
# residue are substituted by modify_amino_acid().
N_TERM_PLACEHOLDER = "[Xe]"
12+
C_TERM_PLACEHOLDER = "[Rn]"
13+
14+
# The four top-level sections of smiles.yaml, exposed as module-level dicts.
# Note that the functions below take a parameter that is also called
# `aa_smiles`, which shadows this dict inside their bodies.
aa_smiles = SMILES_ALL["aa_smiles"]
15+
n_term_modifications = SMILES_ALL["n_term_modifications"]
16+
c_term_modifications = SMILES_ALL["c_term_modifications"]
17+
ptm_dict = SMILES_ALL["ptm_dict"]
18+
19+
# RDKit molecule parsed from each amino-acid SMILES, keyed like `aa_smiles`.
aa_mols = {aa: Chem.MolFromSmiles(smile) for aa, smile in aa_smiles.items()}
20+
21+
22+
def validate_correct_args(
    aa_smiles: str, n_term_mod: Optional[str] = None, c_term_mod: Optional[str] = None
) -> None:
    """
    Check the arguments of an amino-acid modification request.

    The checks run in this order: both placeholder atoms must occur in the
    SMILES text, each requested terminal modification must be a known name,
    and finally the SMILES must be parseable by RDKit.

    Args:
        aa_smiles (str): SMILES of an amino acid with or without PTMs. It must contain the placeholder atoms [Xe] (N-term) and [Rn] (C-term).
        n_term_mod (Optional[str]): Name of the N-terminal modification; must be a key of the n_term_modifications dict when given.
        c_term_mod (Optional[str]): Name of the C-terminal modification; must be a key of the c_term_modifications dict when given.

    Raises:
        ValueError: If a placeholder is missing, a modification name is unknown, or the SMILES cannot be parsed.
    """
    has_both_placeholders = (
        N_TERM_PLACEHOLDER in aa_smiles and C_TERM_PLACEHOLDER in aa_smiles
    )
    if not has_both_placeholders:
        raise ValueError(
            f"The amino acid {aa_smiles} must contain the N-terminal and C-terminal placeholders in order to be modified."
        )

    # None means "no modification requested" and is always acceptable.
    n_term_known = n_term_mod is None or n_term_mod in n_term_modifications
    if not n_term_known:
        raise ValueError(f"Unrecognized N-terminal modification: {n_term_mod}")

    c_term_known = c_term_mod is None or c_term_mod in c_term_modifications
    if not c_term_known:
        raise ValueError(f"Unrecognized C-terminal modification: {c_term_mod}")

    # MolFromSmiles signals an unparseable SMILES by returning None.
    if Chem.MolFromSmiles(aa_smiles) is None:
        raise ValueError(f"Invalid amino acid SMILES: {aa_smiles}")
47+
48+
49+
def modify_amino_acid(
    aa_smiles: str, n_term_mod: Optional[str] = None, c_term_mod: Optional[str] = None
) -> str:
    """
    Modify an amino acid with N-terminal and C-terminal modifications.

    The [Xe] and [Rn] placeholder atoms of the input are substituted with the
    requested modification fragments, or with the unmodified termini when no
    modification is requested.

    Args:
        aa_smiles (str): SMILES of an amino acid with or without PTMs, has to have the placeholder atoms [Xe] for N-term and [Rn] for C-term.
        n_term_mod (Optional[str]): N-terminal modification name. Options are the keys in the n_term_modifications dict. If None, every [Xe] placeholder is replaced by a hydrogen atom.
        c_term_mod (Optional[str]): C-terminal modification name. Options are the keys in the c_term_modifications dict. If None, the [Rn] placeholder is replaced by an oxygen atom (the -OH group).

    Returns:
        str: SMILES string of the modified amino acid, as written by Chem.MolToSmiles after sanitization.

    Raises:
        ValueError: If validate_correct_args rejects the arguments, or if the SMILES stored for a requested modification cannot be parsed.
    """
    validate_correct_args(aa_smiles, n_term_mod, c_term_mod)
    mol = Chem.MolFromSmiles(aa_smiles)

    # Single-atom query/replacement molecules; sanitize=False because a bare
    # placeholder or hydrogen atom is not meant to be a valid molecule itself.
    n_term_placeholder_mol = Chem.MolFromSmiles(N_TERM_PLACEHOLDER, sanitize=False)
    c_term_placeholder_mol = Chem.MolFromSmiles(C_TERM_PLACEHOLDER, sanitize=False)
    hydrogen_mol = Chem.MolFromSmiles("[H]", sanitize=False)

    # Apply N-terminal modification
    if n_term_mod:
        n_mod_smiles = n_term_modifications[n_term_mod]
        n_mod_mol = Chem.MolFromSmiles(n_mod_smiles)
        if n_mod_mol is None:
            # Fail with a readable message instead of passing None to RDKit.
            raise ValueError(
                f"Invalid SMILES for N-terminal modification {n_term_mod}: {n_mod_smiles}"
            )
        # Without replaceAll only one placeholder is substituted per call.
        mol = ReplaceSubstructs(mol, n_term_placeholder_mol, n_mod_mol)[0]

        # "Dimethyl" variants put the fragment on the second placeholder too;
        # every other modification leaves a hydrogen there.
        second_substituent = n_mod_mol if "Dimethyl" in n_term_mod else hydrogen_mol
        mol = ReplaceSubstructs(mol, n_term_placeholder_mol, second_substituent)[0]
    else:
        mol = ReplaceSubstructs(
            mol,
            n_term_placeholder_mol,
            hydrogen_mol,
            replaceAll=True,
        )[0]

    # Apply C-terminal modification
    if c_term_mod:
        c_mod_smiles = c_term_modifications[c_term_mod]
        c_mod_mol = Chem.MolFromSmiles(c_mod_smiles)
        if c_mod_mol is None:
            raise ValueError(
                f"Invalid SMILES for C-terminal modification {c_term_mod}: {c_mod_smiles}"
            )
    else:
        c_mod_mol = Chem.MolFromSmiles("O", sanitize=False)
    mol = ReplaceSubstructs(mol, c_term_placeholder_mol, c_mod_mol)[0]

    SanitizeMol(mol)
    return Chem.MolToSmiles(mol)

0 commit comments

Comments
 (0)