Skip to content

Commit

Permalink
NEW: dynamic generation of SMILES for PTMs
Browse files Browse the repository at this point in the history
  • Loading branch information
Mikhail Lebedev committed Jul 16, 2024
1 parent 5fde17e commit c3b9cd3
Show file tree
Hide file tree
Showing 6 changed files with 652 additions and 0 deletions.
79 changes: 79 additions & 0 deletions alphabase/constants/const_files/smiles.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# SMILES templates for the 20 standard amino acids, keyed by one-letter code.
# [Xe] atoms mark the N-terminal substituent positions on the backbone nitrogen
# and [Rn] marks the C-terminal position on the carbonyl carbon. smiles.py
# replaces them with hydrogens / an -OH group, or with a terminal modification.
aa_smiles:
A: N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn]
R: N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn]
N: N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn]
D: N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn]
C: N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn]
Q: N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn]
E: N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn]
# Glycine has no side chain, so its alpha carbon carries no stereo marker.
G: N([Xe])([Xe])CC(=O)[Rn]
H: N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn]
I: N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn]
L: N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn]
K: N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn]
M: N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn]
F: N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn]
# Proline carries a single [Xe]: its backbone nitrogen is part of the ring.
P: N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn]
S: N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn]
T: N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn]
W: N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)[Rn]
Y: N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn]
V: N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn]

# Fragments that smiles.py substitutes for an [Xe] placeholder when an
# N-terminal modification is requested; keys are the accepted modification names.
# Presumably each fragment is attached through its first atom (RDKit's default
# connection point for ReplaceSubstructs) - confirm.
n_term_modifications:
mTRAQ: C(=O)CN1CCN(CC1)C
mTRAQ:d4: C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C
mTRAQ:d8: C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H])
Acetyl: C(=O)C
Propionyl: C(=O)CC
Biotin: C(=O)CCCCC1SCC2NC(=O)NC21
# NOTE(review): C(=O)NC reads as an N-methylcarbamoyl group, whereas the
# carbamidomethyl group in ptm_dict (Carbamidomethyl@C) is written ...CC(=O)N.
# Confirm which structure is intended here.
Carbamidomethyl: C(=O)NC
Carbamyl: C(=O)N
Propionamide: C(=O)CC(N)=O
Pyridylacetyl: C(=O)Cc1ccccn1
# Methyl and Dimethyl share the same single-carbon fragment. smiles.py applies
# any modification whose name contains "Dimethyl" to both [Xe] positions and
# every other modification to one position only.
Methyl: C
Dimethyl: C
# Isotope-labelled dimethyl variants; quoted because the value starts with "[".
Dimethyl:2H(6)13C(2): '[13C]([2H])([2H])([2H])'
Dimethyl:2H(4): C([2H])([2H])([1H])
Dimethyl:2H(4)13C(2): '[13C]([2H])([2H])([1H])'

# Fragments that smiles.py substitutes for the [Rn] placeholder when a
# C-terminal modification is requested; without one, [Rn] is replaced by "O".
c_term_modifications:
Methyl: OC

# Complete residue SMILES for amino acids that already carry a modification.
# Keys appear to follow the "<modification>@<residue>" naming (optionally with a
# "^<position>" suffix); the values keep the same [Xe] / [Rn] terminal
# placeholders as aa_smiles so they can be passed to modify_amino_acid.
ptm_dict:
Carbamidomethyl@C: C(C(C(=O)[Rn])N([Xe])([Xe]))SCC(=O)N
Oxidation@M: O=C([Rn])C(N([Xe])([Xe]))CCS(=O)C
GlyGly@K: NCC(=O)NCC(=O)NCCCC[C@H](N([Xe])([Xe]))C([Rn])=O
Deamidated@N: C([C@@H](C(=O)[Rn])N([Xe])([Xe]))C(=O)O
Propionyl@K: CCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])
Deamidated@Q: C(CC(=O)O)[C@@H](C(=O)[Rn])N([Xe])([Xe])
# Both pyro-Glu entries share one SMILES; the ring nitrogen keeps a single [Xe].
Gln->pyro-Glu@Q^Any N-term: O=C([Rn])[C@H]1N([Xe])C(=O)CC1
Glu->pyro-Glu@E^Any N-term: O=C([Rn])[C@H]1N([Xe])C(=O)CC1
Phospho@S: O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe])
Nitro@Y: O=[N+]([O-])c1cc(ccc1O)C[C@@H](C(=O)[Rn])N([Xe])([Xe])
Acetyl@K: CC(=O)NCCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]
Dimethyl@K: CN(C)CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]
mTRAQ@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)CN1CCN(C)CC1'
mTRAQ:d4@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1CCN(C)[13CH2][13CH2]1'
mTRAQ:d8@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1[13CH2][13CH2][15N]([13CH3])[13CH2][13CH2]1'
Pyridylethyl@C: C1=CN=CC=C1CCSCC(C(=O)[Rn])N([Xe])([Xe])
Butyryl@K: CCCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])
Phospho@T: CC(C(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O
Methylthio@C: CSSC[C@H](N([Xe])([Xe]))C([Rn])=O
Carbamidomethyl@M: C[S+](CCC(N([Xe])([Xe]))C([Rn])=O)CC(N)=O
# NOTE(review): in this entry the succinyl group sits on the backbone nitrogen
# (which has only one [Xe]) and the side-chain amine is left free, unlike the
# other @K acylations in this table - confirm this is the intended structure.
Succinyl@K: C(CCN)CC(C(=O)[Rn])N([Xe])C(=O)CCC(=O)O
Crotonyl@K: CC=CC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])
Phospho@Y: C1=CC(=CC=C1CC(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O
# NOTE(review): the terminal side-chain carbon carries "=O" here, whereas the
# Acetyl@K / Propionyl@K entries attach the acyl amide to a plain CH2 chain -
# confirm this is the intended structure.
Malonyl@K: N([Xe])([Xe])[C@@H](CCCC(NC(=O)CC(=O)O)=O)C(=O)[Rn]
Met->Hse@M: N([Xe])([Xe])[C@H](C(=O)[Rn])CCO
# Proline analogues: ring nitrogen, hence a single [Xe] placeholder each.
Pro->(2S,4R)-4-fluoroproline@P: F[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]
# NOTE(review): this key is spelled differently from the (2S,4R) entry above
# (space after the comma, no hyphen before "fluoroproline") - confirm it
# matches the modification name used by callers.
Pro->(2S, 4S)-4fluoroproline@P: F[C@H]1C[C@H](N([Xe])C1)C(=O)[Rn]
Pro->(2S)-1,3-thiazolidine-2-carboxylic acid@P: S1[C@H](N([Xe])CC1)C(=O)[Rn]
Pro->(4R)-1,3-Thiazolidine-4-carboxylic acid@P: S1CN([Xe])[C@@H](C1)C(=O)[Rn]
Pro->(2S,4R)-4-hydroxyproline@P: O[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]
Pro->(DL)-pipecolic acid@P: C1CCN([Xe])C(C1)C(=O)[Rn]
Pro->3,4-Dehydro-L-proline@P: C1C=CC(N1([Xe]))C(=O)[Rn]
Pro->(1S,3S,5S)-2-Azabicyclo[3.1.0]hexane-3-carboxylic acid@P: '[C@H]12N([Xe])[C@@H](C[C@@H]2C1)C(=O)[Rn]'
Pro->(1R,3S,5R)-2-Azabicyclo[3.1.0]hexane-3-carboxylic acid@P: '[C@@H]12N([Xe])[C@@H](C[C@H]2C1)C(=O)[Rn]'
Pro->(2S,3aS,7aS)-Octahydro-1H-indole-2-carboxylic acid@P: N1([Xe])[C@@H](C[C@@H]2CCCC[C@H]12)C(=O)[Rn]
Pro->(DL)-5-trifluoromethylproline@P: FC(C1CCC(N1([Xe]))C(=O)[Rn])(F)F
99 changes: 99 additions & 0 deletions alphabase/smiles/smiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from typing import Optional
import os
from rdkit import Chem
from rdkit.Chem.rdmolops import ReplaceSubstructs, SanitizeMol

from alphabase.constants._const import CONST_FILE_FOLDER
from alphabase.yaml_utils import load_yaml

# Parsed contents of smiles.yaml from the package's constants folder.
SMILES_ALL: dict = load_yaml(os.path.join(CONST_FILE_FOLDER, "smiles.yaml"))

# Placeholder atoms used in the SMILES templates: [Xe] marks the N-terminal
# substituent positions, [Rn] marks the C-terminal position.
N_TERM_PLACEHOLDER = "[Xe]"
C_TERM_PLACEHOLDER = "[Rn]"

# Expose each section of the YAML file under its own module-level name.
(
    aa_smiles,
    n_term_modifications,
    c_term_modifications,
    ptm_dict,
) = (
    SMILES_ALL[section]
    for section in (
        "aa_smiles",
        "n_term_modifications",
        "c_term_modifications",
        "ptm_dict",
    )
)

# RDKit molecules for the unmodified amino-acid templates, keyed like aa_smiles.
aa_mols = {
    code: Chem.MolFromSmiles(template) for code, template in aa_smiles.items()
}


def validate_correct_args(
    aa_smiles: str, n_term_mod: Optional[str] = None, c_term_mod: Optional[str] = None
) -> None:
    """
    Check the inputs of ``modify_amino_acid`` and raise on the first problem found.

    Checks run in this order: placeholders present, N-terminal modification
    known, C-terminal modification known, SMILES parseable by RDKit.

    Args:
        aa_smiles (str): SMILES of an amino acid (with or without PTMs). It must
            contain both placeholder atoms: [Xe] for the N-terminus and [Rn] for
            the C-terminus.
        n_term_mod (Optional[str]): Name of an N-terminal modification; must be a
            key of ``n_term_modifications`` unless it is None.
        c_term_mod (Optional[str]): Name of a C-terminal modification; must be a
            key of ``c_term_modifications`` unless it is None.

    Raises:
        ValueError: If a placeholder is missing, a modification name is not
            recognized, or RDKit cannot parse ``aa_smiles``.
    """
    has_both_placeholders = (
        N_TERM_PLACEHOLDER in aa_smiles and C_TERM_PLACEHOLDER in aa_smiles
    )
    if not has_both_placeholders:
        raise ValueError(
            f"The amino acid {aa_smiles} must contain the N-terminal and C-terminal placeholders in order to be modified."
        )

    # None means "no modification requested" and is always acceptable.
    if not (n_term_mod is None or n_term_mod in n_term_modifications):
        raise ValueError(f"Unrecognized N-terminal modification: {n_term_mod}")
    if not (c_term_mod is None or c_term_mod in c_term_modifications):
        raise ValueError(f"Unrecognized C-terminal modification: {c_term_mod}")

    # MolFromSmiles returns None (rather than raising) when parsing fails.
    if Chem.MolFromSmiles(aa_smiles) is None:
        raise ValueError(f"Invalid amino acid SMILES: {aa_smiles}")


def modify_amino_acid(
    aa_smiles: str, n_term_mod: Optional[str] = None, c_term_mod: Optional[str] = None
) -> str:
    """
    Replace the terminal placeholders of an amino acid SMILES and return the result.

    Args:
        aa_smiles (str): SMILES of an amino acid (with or without PTMs) that
            contains the placeholder atoms [Xe] (N-terminus) and [Rn] (C-terminus).
        n_term_mod (Optional[str]): Key of ``n_term_modifications``. When not
            given, every [Xe] is replaced by a hydrogen atom. When given, one
            [Xe] receives the modification; a second [Xe] receives the same
            modification if the name contains "Dimethyl", otherwise a hydrogen.
        c_term_mod (Optional[str]): Key of ``c_term_modifications``. When not
            given, [Rn] is replaced by an oxygen atom (the -OH group).

    Returns:
        str: SMILES string of the resulting molecule, as written by RDKit.

    Raises:
        ValueError: Propagated from ``validate_correct_args`` for invalid inputs.
    """
    validate_correct_args(aa_smiles, n_term_mod, c_term_mod)

    # Query / replacement fragments are parsed unsanitized: they are bare atoms.
    xe_query = Chem.MolFromSmiles(N_TERM_PLACEHOLDER, sanitize=False)
    rn_query = Chem.MolFromSmiles(C_TERM_PLACEHOLDER, sanitize=False)
    hydrogen = Chem.MolFromSmiles("[H]", sanitize=False)

    residue = Chem.MolFromSmiles(aa_smiles)

    # --- N-terminus ---
    if not n_term_mod:
        # No modification: turn every [Xe] into a hydrogen in a single pass.
        residue = ReplaceSubstructs(residue, xe_query, hydrogen, replaceAll=True)[0]
    else:
        n_cap = Chem.MolFromSmiles(n_term_modifications[n_term_mod])
        # replaceAll is not set, so each call substitutes a single placeholder.
        residue = ReplaceSubstructs(residue, xe_query, n_cap)[0]
        # Dimethyl variants occupy both positions; anything else leaves an H.
        second_substituent = n_cap if "Dimethyl" in n_term_mod else hydrogen
        residue = ReplaceSubstructs(residue, xe_query, second_substituent)[0]

    # --- C-terminus ---
    if c_term_mod:
        c_cap = Chem.MolFromSmiles(c_term_modifications[c_term_mod])
    else:
        c_cap = Chem.MolFromSmiles("O", sanitize=False)
    residue = ReplaceSubstructs(residue, rn_query, c_cap)[0]

    SanitizeMol(residue)
    return Chem.MolToSmiles(residue)
Loading

0 comments on commit c3b9cd3

Please sign in to comment.