-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
NEW: dynamic generation of SMILES for PTMs
- Loading branch information
Mikhail Lebedev
committed
Jul 16, 2024
1 parent
5fde17e
commit c3b9cd3
Showing
6 changed files
with
652 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
aa_smiles: | ||
A: N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn] | ||
R: N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn] | ||
N: N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn] | ||
D: N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn] | ||
C: N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn] | ||
Q: N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn] | ||
E: N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn] | ||
G: N([Xe])([Xe])CC(=O)[Rn] | ||
H: N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn] | ||
I: N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn] | ||
L: N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn] | ||
K: N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn] | ||
M: N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn] | ||
F: N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn] | ||
P: N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn] | ||
S: N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn] | ||
T: N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn] | ||
W: N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)[Rn] | ||
Y: N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn] | ||
V: N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn] | ||
|
||
n_term_modifications: | ||
mTRAQ: C(=O)CN1CCN(CC1)C | ||
mTRAQ:d4: C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C | ||
mTRAQ:d8: C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H]) | ||
Acetyl: C(=O)C | ||
Propionyl: C(=O)CC | ||
Biotin: C(=O)CCCCC1SCC2NC(=O)NC21 | ||
Carbamidomethyl: C(=O)NC | ||
Carbamyl: C(=O)N | ||
Propionamide: C(=O)CC(N)=O | ||
Pyridylacetyl: C(=O)Cc1ccccn1 | ||
Methyl: C | ||
Dimethyl: C | ||
Dimethyl:2H(6)13C(2): '[13C]([2H])([2H])([2H])' | ||
Dimethyl:2H(4): C([2H])([2H])([1H]) | ||
Dimethyl:2H(4)13C(2): '[13C]([2H])([2H])([1H])' | ||
|
||
c_term_modifications: | ||
Methyl: OC | ||
|
||
ptm_dict: | ||
Carbamidomethyl@C: C(C(C(=O)[Rn])N([Xe])([Xe]))SCC(=O)N | ||
Oxidation@M: O=C([Rn])C(N([Xe])([Xe]))CCS(=O)C | ||
GlyGly@K: NCC(=O)NCC(=O)NCCCC[C@H](N([Xe])([Xe]))C([Rn])=O | ||
Deamidated@N: C([C@@H](C(=O)[Rn])N([Xe])([Xe]))C(=O)O | ||
Propionyl@K: CCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) | ||
Deamidated@Q: C(CC(=O)O)[C@@H](C(=O)[Rn])N([Xe])([Xe]) | ||
Gln->pyro-Glu@Q^Any N-term: O=C([Rn])[C@H]1N([Xe])C(=O)CC1 | ||
Glu->pyro-Glu@E^Any N-term: O=C([Rn])[C@H]1N([Xe])C(=O)CC1 | ||
Phospho@S: O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe]) | ||
Nitro@Y: O=[N+]([O-])c1cc(ccc1O)C[C@@H](C(=O)[Rn])N([Xe])([Xe]) | ||
Acetyl@K: CC(=O)NCCCC[C@H](N([Xe])([Xe]))C(=O)[Rn] | ||
Dimethyl@K: CN(C)CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn] | ||
mTRAQ@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)CN1CCN(C)CC1' | ||
mTRAQ:d4@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1CCN(C)[13CH2][13CH2]1' | ||
mTRAQ:d8@K: '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1[13CH2][13CH2][15N]([13CH3])[13CH2][13CH2]1' | ||
Pyridylethyl@C: C1=CN=CC=C1CCSCC(C(=O)[Rn])N([Xe])([Xe]) | ||
Butyryl@K: CCCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) | ||
Phospho@T: CC(C(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O | ||
Methylthio@C: CSSC[C@H](N([Xe])([Xe]))C([Rn])=O | ||
Carbamidomethyl@M: C[S+](CCC(N([Xe])([Xe]))C([Rn])=O)CC(N)=O | ||
Succinyl@K: C(CCN)CC(C(=O)[Rn])N([Xe])C(=O)CCC(=O)O | ||
Crotonyl@K: CC=CC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) | ||
Phospho@Y: C1=CC(=CC=C1CC(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O | ||
Malonyl@K: N([Xe])([Xe])[C@@H](CCCC(NC(=O)CC(=O)O)=O)C(=O)[Rn] | ||
Met->Hse@M: N([Xe])([Xe])[C@H](C(=O)[Rn])CCO | ||
Pro->(2S,4R)-4-fluoroproline@P: F[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn] | ||
Pro->(2S, 4S)-4fluoroproline@P: F[C@H]1C[C@H](N([Xe])C1)C(=O)[Rn] | ||
Pro->(2S)-1,3-thiazolidine-2-carboxylic acid@P: S1[C@H](N([Xe])CC1)C(=O)[Rn] | ||
Pro->(4R)-1,3-Thiazolidine-4-carboxylic acid@P: S1CN([Xe])[C@@H](C1)C(=O)[Rn] | ||
Pro->(2S,4R)-4-hydroxyproline@P: O[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn] | ||
Pro->(DL)-pipecolic acid@P: C1CCN([Xe])C(C1)C(=O)[Rn] | ||
Pro->3,4-Dehydro-L-proline@P: C1C=CC(N1([Xe]))C(=O)[Rn] | ||
Pro->(1S,3S,5S)-2-Azabicyclo[3.1.0]hexane-3-carboxylic acid@P: '[C@H]12N([Xe])[C@@H](C[C@@H]2C1)C(=O)[Rn]' | ||
Pro->(1R,3S,5R)-2-Azabicyclo[3.1.0]hexane-3-carboxylic acid@P: '[C@@H]12N([Xe])[C@@H](C[C@H]2C1)C(=O)[Rn]' | ||
Pro->(2S,3aS,7aS)-Octahydro-1H-indole-2-carboxylic acid@P: N1([Xe])[C@@H](C[C@@H]2CCCC[C@H]12)C(=O)[Rn] | ||
Pro->(DL)-5-trifluoromethylproline@P: FC(C1CCC(N1([Xe]))C(=O)[Rn])(F)F |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
from typing import Optional | ||
import os | ||
from rdkit import Chem | ||
from rdkit.Chem.rdmolops import ReplaceSubstructs, SanitizeMol | ||
|
||
from alphabase.constants._const import CONST_FILE_FOLDER | ||
from alphabase.yaml_utils import load_yaml | ||
|
||
# Placeholder atoms used inside the SMILES templates: every "[Xe]" marks an
# open valence on the backbone nitrogen and "[Rn]" marks the open valence on
# the backbone carbonyl carbon. They are swapped for real groups later on.
N_TERM_PLACEHOLDER = "[Xe]"
C_TERM_PLACEHOLDER = "[Rn]"

# All SMILES tables live in a single YAML file in the constants folder.
SMILES_ALL: dict = load_yaml(os.path.join(CONST_FILE_FOLDER, "smiles.yaml"))

# Individual sections of the YAML file, exposed as module-level lookups.
aa_smiles = SMILES_ALL["aa_smiles"]
n_term_modifications = SMILES_ALL["n_term_modifications"]
c_term_modifications = SMILES_ALL["c_term_modifications"]
ptm_dict = SMILES_ALL["ptm_dict"]

# Pre-parsed RDKit molecules for the amino-acid templates, keyed like
# ``aa_smiles``.
aa_mols = {
    residue: Chem.MolFromSmiles(template)
    for residue, template in aa_smiles.items()
}
|
||
|
||
def validate_correct_args(
    aa_smiles: str, n_term_mod: Optional[str] = None, c_term_mod: Optional[str] = None
) -> None:
    """
    Check the inputs of an amino-acid modification request.

    The checks run in this order: both placeholder atoms are present in the
    SMILES text, the requested terminal modifications are known, and finally
    the SMILES can be parsed by RDKit.

    Args:
        aa_smiles (str): SMILES of an amino acid (with or without PTMs). It must
            contain the placeholder atoms [Xe] (N-term) and [Rn] (C-term).
        n_term_mod (Optional[str]): Name of the N-terminal modification; must be
            a key of ``n_term_modifications`` when given.
        c_term_mod (Optional[str]): Name of the C-terminal modification; must be
            a key of ``c_term_modifications`` when given.

    Raises:
        ValueError: If a placeholder is missing, a modification name is not
            recognized, or ``Chem.MolFromSmiles`` returns None for the SMILES.
    """
    has_both_placeholders = (
        N_TERM_PLACEHOLDER in aa_smiles and C_TERM_PLACEHOLDER in aa_smiles
    )
    if not has_both_placeholders:
        raise ValueError(
            f"The amino acid {aa_smiles} must contain the N-terminal and C-terminal placeholders in order to be modified."
        )

    # None means "no modification requested" and is always acceptable.
    unknown_n_term = n_term_mod is not None and n_term_mod not in n_term_modifications
    if unknown_n_term:
        raise ValueError(f"Unrecognized N-terminal modification: {n_term_mod}")

    unknown_c_term = c_term_mod is not None and c_term_mod not in c_term_modifications
    if unknown_c_term:
        raise ValueError(f"Unrecognized C-terminal modification: {c_term_mod}")

    # Parsing is the last check; the molecule itself is not needed here.
    if Chem.MolFromSmiles(aa_smiles) is None:
        raise ValueError(f"Invalid amino acid SMILES: {aa_smiles}")
|
||
|
||
def modify_amino_acid(
    aa_smiles: str, n_term_mod: Optional[str] = None, c_term_mod: Optional[str] = None
) -> str:
    """
    Build the SMILES of an amino acid with its terminal placeholders resolved.

    Args:
        aa_smiles (str): SMILES of an amino acid (with or without PTMs). It must
            contain the placeholder atoms [Xe] (N-term) and [Rn] (C-term).
        n_term_mod (Optional[str]): Key of ``n_term_modifications``. If None,
            every [Xe] placeholder is replaced by a hydrogen atom.
        c_term_mod (Optional[str]): Key of ``c_term_modifications``. If None,
            the [Rn] placeholder is replaced by an oxygen atom (the -OH group).

    Returns:
        str: SMILES string of the modified amino acid.

    Raises:
        ValueError: Propagated from ``validate_correct_args`` when the inputs
            are not acceptable.
    """
    validate_correct_args(aa_smiles, n_term_mod, c_term_mod)

    residue = Chem.MolFromSmiles(aa_smiles)
    # The placeholders are not chemically sensible on their own, so they are
    # parsed without sanitization and used purely as substructure queries.
    xe_query = Chem.MolFromSmiles(N_TERM_PLACEHOLDER, sanitize=False)
    rn_query = Chem.MolFromSmiles(C_TERM_PLACEHOLDER, sanitize=False)

    # --- N-terminus ---
    if not n_term_mod:
        # Unmodified: all placeholders become hydrogens in one pass.
        hydrogen = Chem.MolFromSmiles("[H]", sanitize=False)
        residue = ReplaceSubstructs(residue, xe_query, hydrogen, replaceAll=True)[0]
    else:
        n_cap = Chem.MolFromSmiles(n_term_modifications[n_term_mod])
        # The first placeholder always receives the modification group.
        residue = ReplaceSubstructs(residue, xe_query, n_cap)[0]
        # The second placeholder receives the group again for the "Dimethyl"
        # family, otherwise a hydrogen.
        if "Dimethyl" in n_term_mod:
            second_group = n_cap
        else:
            second_group = Chem.MolFromSmiles("[H]", sanitize=False)
        residue = ReplaceSubstructs(residue, xe_query, second_group)[0]

    # --- C-terminus ---
    if c_term_mod:
        c_cap = Chem.MolFromSmiles(c_term_modifications[c_term_mod])
    else:
        c_cap = Chem.MolFromSmiles("O", sanitize=False)
    residue = ReplaceSubstructs(residue, rn_query, c_cap)[0]

    SanitizeMol(residue)
    return Chem.MolToSmiles(residue)
Oops, something went wrong.