diff --git a/alphabase/constants/aa.py b/alphabase/constants/aa.py
index c2334e9e..ef2abb6d 100644
--- a/alphabase/constants/aa.py
+++ b/alphabase/constants/aa.py
@@ -11,12 +11,14 @@
parse_formula,
reset_elements,
)
-from alphabase.yaml_utils import load_yaml
# We use all 128 ASCII code to represent amino acids for flexible extensions in the future.
# The amino acid masses are stored in 128-lengh array :py:data:`AA_ASCII_MASS`.
-# If an ASCII code is not in `AA_Formula`, the mass will be set as a large value to disable MS search.
-AA_Formula: dict = load_yaml(os.path.join(CONST_FILE_FOLDER, "amino_acid.yaml"))
+# If an ASCII code is not in `aa_formula`, the mass will be set as a large value to disable MS search.
+aa_formula: pd.DataFrame = pd.read_csv(
+ os.path.join(CONST_FILE_FOLDER, "amino_acid.tsv"), sep="\t", index_col=0
+)
+
#: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
AA_ASCII_MASS: np.ndarray = np.ones(128) * 1e8
@@ -28,20 +30,23 @@
def replace_atoms(atom_replace_dict: typing.Dict):
- for aa, formula in list(AA_Formula.items()):
+ for aa, row in aa_formula.iterrows():
+ formula = row["formula"]
atom_comp = dict(parse_formula(formula))
for atom_from, atom_to in atom_replace_dict.items():
if atom_from in atom_comp:
atom_comp[atom_to] = atom_comp[atom_from]
del atom_comp[atom_from]
- AA_Formula[aa] = "".join([f"{atom}({n})" for atom, n in atom_comp.items()])
+ aa_formula.loc[aa, "formula"] = "".join(
+ [f"{atom}({n})" for atom, n in atom_comp.items()]
+ )
def reset_AA_mass() -> np.ndarray:
"""AA mass in np.array with shape (128,)"""
global AA_ASCII_MASS
- for aa, chem in AA_Formula.items():
- AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(chem)
+ for aa, row in aa_formula.iterrows():
+ AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(row["formula"])
return AA_ASCII_MASS
@@ -51,15 +56,14 @@ def reset_AA_mass() -> np.ndarray:
def reset_AA_df():
global AA_DF
AA_DF = pd.DataFrame()
- AA_DF["aa"] = [chr(aa) for aa in range(len(AA_ASCII_MASS))]
- AA_DF["formula"] = [""] * len(AA_ASCII_MASS)
- aa_idxes = []
- formulas = []
- for aa, formula in AA_Formula.items():
- aa_idxes.append(ord(aa))
- formulas.append(formula)
- AA_DF.loc[aa_idxes, "formula"] = formulas
+ num_rows = len(AA_ASCII_MASS)
+ AA_DF["aa"] = [chr(aa) for aa in range(num_rows)]
+ AA_DF["formula"] = [""] * num_rows
+ AA_DF["smiles"] = [""] * num_rows
AA_DF["mass"] = AA_ASCII_MASS
+ for aa, row in aa_formula.iterrows():
+ AA_DF.loc[ord(aa), "formula"] = row["formula"]
+ AA_DF.loc[ord(aa), "smiles"] = row["smiles"]
return AA_DF
@@ -69,8 +73,8 @@ def reset_AA_df():
def reset_AA_Composition():
global AA_Composition
AA_Composition = {}
- for aa, formula, _mass in AA_DF.values:
- AA_Composition[aa] = dict(parse_formula(formula))
+ for aa, row in aa_formula.iterrows():
+ AA_Composition[aa] = dict(parse_formula(row["formula"]))
return AA_Composition
@@ -85,12 +89,14 @@ def reset_AA_atoms(atom_replace_dict: typing.Dict = {}):
reset_AA_Composition()
-def update_an_AA(aa: str, formula: str):
+def update_an_AA(aa: str, formula: str, smiles: str = ""):
aa_idx = ord(aa)
+ aa_formula.loc[aa, "formula"] = formula
+ aa_formula.loc[aa, "smiles"] = smiles
AA_DF.loc[aa_idx, "formula"] = formula
+ AA_DF.loc[aa_idx, "smiles"] = smiles
AA_ASCII_MASS[aa_idx] = calc_mass_from_formula(formula)
AA_DF.loc[aa_idx, "mass"] = AA_ASCII_MASS[aa_idx]
- AA_Formula[aa] = formula
AA_Composition[aa] = dict(parse_formula(formula))
diff --git a/alphabase/constants/atom.py b/alphabase/constants/atom.py
index 4d8e5f98..8acb0827 100644
--- a/alphabase/constants/atom.py
+++ b/alphabase/constants/atom.py
@@ -1,8 +1,12 @@
import os
+import re
import typing
+from collections import defaultdict
import numba
import numpy as np
+from rdkit import Chem
+from rdkit.Chem import rdMolDescriptors
from alphabase.constants._const import CONST_FILE_FOLDER, common_const_dict
from alphabase.yaml_utils import load_yaml
@@ -200,3 +204,215 @@ def calc_mass_from_formula(formula: str):
mass of the formula
"""
return np.sum([CHEM_MONO_MASS[elem] * n for elem, n in parse_formula(formula)])
+
+
+class ChemicalCompositonFormula:
+    """
+    Element counts of a chemical formula, with ``+`` and ``-`` arithmetic.
+
+    Parameters
+    ----------
+    formula : str, optional
+        Formula such as ``13C(2)H(3)O(-1)``; ``None`` creates an empty formula.
+
+    Notes
+    -----
+    Elements missing from ``CHEM_MONO_MASS`` raise ``ValueError`` on parsing.
+    """
+
+    def __init__(self, formula=None):
+        self.elements = (
+            defaultdict(int) if formula is None else self._parse_formula(formula)
+        )
+
+    @classmethod
+    def from_smiles(cls, smiles: str) -> "ChemicalCompositonFormula":
+        """
+        Create a ChemicalCompositonFormula instance from a SMILES string.
+
+        Parameters
+        ----------
+        smiles : str
+            The SMILES representation of the molecule.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            An instance of the class based on the SMILES string.
+
+        Raises
+        ------
+        ValueError
+            If the SMILES string is invalid and can't be converted to an RDKit molecule.
+        """
+        mol = Chem.MolFromSmiles(smiles)
+        if not mol:
+            raise ValueError(f"Invalid RDKit molecule: {smiles}")
+        formula = rdMolDescriptors.CalcMolFormula(
+            mol, separateIsotopes=True, abbreviateHIsotopes=False
+        )
+        formula = formula.replace("[1H]", "H")  # count explicit 1H as plain H
+        return cls._from_rdkit_formula(formula)
+
+    @classmethod
+    def _from_rdkit_formula(cls, formula: str) -> "ChemicalCompositonFormula":
+        """
+        Create a ChemicalCompositonFormula instance from an RDKit formula.
+
+        Parameters
+        ----------
+        formula : str
+            The chemical formula as generated by RDKit.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            An instance of the class based on the RDKit formula.
+        """
+        instance = cls.__new__(cls)  # skip __init__; elements are set below
+        instance.elements = instance._parse_rdkit_formula(formula)
+        return instance
+
+    def _parse_formula(self, formula) -> dict:
+        """
+        Parse a chemical formula string into a dictionary of elements and their counts.
+
+        Parameters
+        ----------
+        formula : str
+            The chemical formula to parse.
+
+        Returns
+        -------
+        dict
+            A dictionary with elements as keys and their counts as values.
+        """
+        # Expected pattern: (\d+)?: optional isotope number, ([A-Z][a-z]*): element symbol, (?:\(([-]?\d+)\))?: optional count in parentheses
+        # Example: 13C(2)H(3)O(-1) -> [('13', 'C', '2'), ('', 'H', '3'), ('', 'O', '-1')]
+        pattern = r"(\d+)?([A-Z][a-z]*)(?:\(([-]?\d+)\))?"
+        matches = re.findall(pattern, formula)
+        element_counts = defaultdict(int)
+
+        for isotope, element, count in matches:
+            if isotope:
+                element = f"{isotope}{element}"
+            count = int(count) if count else 1
+            element_counts[element] += count
+
+        self._validate_atoms(element_counts)
+        return element_counts
+
+    def _parse_rdkit_formula(self, formula: str) -> dict:
+        """
+        Parse an RDKit-generated formula string into a dictionary of elements and their counts.
+
+        Parameters
+        ----------
+        formula : str
+            The RDKit-generated chemical formula to parse.
+
+        Returns
+        -------
+        dict
+            A dictionary with elements as keys and their counts as values.
+        """
+        # Expected pattern: (\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*)): isotope in square brackets or element symbol, followed by (\d*): optional count
+        # Example: [13C]C2H5OH -> [('[13C]', '13', 'C', '', ''), ('C', '', '', 'C', '2'), ('H', '', '', 'H', '5'), ('O', '', '', 'O', ''), ('H', '', '', 'H', '')]
+        pattern = r"(\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*))(\d*)"
+        matches = re.findall(pattern, formula)
+        element_counts = defaultdict(int)
+
+        for match in matches:
+            count = int(match[4]) if match[4] else 1
+            if match[1]:  # noqa: SIM108
+                # Isotope, see 0th element in the example above
+                element = f"{match[1]}{match[2]}"
+            else:
+                # Regular element, see the rest in the example above
+                element = match[3]
+            element_counts[element] += count
+
+        self._validate_atoms(element_counts)
+        return element_counts
+
+    def _validate_atoms(self, element_counts):
+        """
+        Validate the elements in the formula.
+
+        Parameters
+        ----------
+        element_counts : dict
+            The elements and their counts in the formula.
+
+        Raises
+        ------
+        ValueError
+            If the formula contains an unknown element.
+        """
+        for element in element_counts:
+            if element not in CHEM_MONO_MASS:
+                raise ValueError(f"Unknown element: {element}")
+
+    def __str__(self):
+        """
+        Return a string representation of the chemical formula.
+
+        Returns
+        -------
+        str
+            Elements in sorted order as ``El(n)`` terms; zero counts are omitted.
+        """
+        return "".join(
+            f"{element}({count})"
+            for element, count in sorted(self.elements.items())
+            if count != 0
+        )
+
+    def __repr__(self):
+        """
+        Return a string representation of the ChemicalCompositonFormula instance.
+
+        Returns
+        -------
+        str
+            A string representation of the instance.
+        """
+        return f"ChemicalCompositonFormula('{self}')"
+
+    def __add__(self, other):
+        """
+        Add two ChemicalCompositonFormula instances.
+
+        Parameters
+        ----------
+        other : ChemicalCompositonFormula
+            The other instance to add.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            A new instance with the summed counts; the operands are not modified.
+        """
+        result = ChemicalCompositonFormula()
+        for element, count in (*self.elements.items(), *other.elements.items()):
+            result.elements[element] += count  # items() inserts no operand keys
+        return result
+
+    def __sub__(self, other):
+        """
+        Subtract one ChemicalCompositonFormula instance from another.
+
+        Parameters
+        ----------
+        other : ChemicalCompositonFormula
+            The instance to subtract.
+
+        Returns
+        -------
+        ChemicalCompositonFormula
+            A new instance with the differences; the operands are not modified.
+        """
+        result = self + ChemicalCompositonFormula()  # independent copy of self
+        for element, count in other.elements.items():
+            result.elements[element] -= count
+        return result
diff --git a/alphabase/constants/const_files/amino_acid.tsv b/alphabase/constants/const_files/amino_acid.tsv
new file mode 100644
index 00000000..c4a0a933
--- /dev/null
+++ b/alphabase/constants/const_files/amino_acid.tsv
@@ -0,0 +1,30 @@
+ formula smiles
+A C(3)H(5)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn]
+B C(1000000)
+C C(3)H(5)N(1)O(1)S(1) N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn]
+D C(4)H(5)N(1)O(3)S(0) N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn]
+E C(5)H(7)N(1)O(3)S(0) N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn]
+F C(9)H(9)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn]
+G C(2)H(3)N(1)O(1)S(0) N([Xe])([Xe])CC(=O)[Rn]
+H C(6)H(7)N(3)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn]
+I C(6)H(11)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn]
+J C(6)H(11)N(1)O(1)S(0)
+K C(6)H(12)N(2)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn]
+L C(6)H(11)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn]
+M C(5)H(9)N(1)O(1)S(1) N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn]
+N C(4)H(6)N(2)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn]
+O C(12)H(19)N(3)O(2) C[C@@H]1CC=N[C@H]1C(=O)NCCCC[C@@H](C(=O)[Rn])N([Xe])([Xe])
+P C(5)H(7)N(1)O(1)S(0) N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn]
+Q C(5)H(8)N(2)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn]
+R C(6)H(12)N(4)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn]
+S C(3)H(5)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn]
+T C(4)H(7)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn]
+U C(3)H(5)N(1)O(1)Se(1) N([Xe])([Xe])[C@@]([H])(C[Se][H])C(=O)[Rn]
+V C(5)H(9)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn]
+W C(11)H(10)N(2)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)[Rn]
+X C(1000000)
+Y C(9)H(9)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn]
+Z C(1000000)
+s C(3)H(5)N(1)O(2)S(0)
+t C(4)H(8)N(1)O(5)P(1)
+y C(9)H(9)N(1)O(2)S(0)
diff --git a/alphabase/constants/const_files/amino_acid.yaml b/alphabase/constants/const_files/amino_acid.yaml
deleted file mode 100644
index 5d3ef0b1..00000000
--- a/alphabase/constants/const_files/amino_acid.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-# https://en.wikipedia.org/wiki/Proteinogenic_amino_acid
-# B, X, Z are the placeholders for future
-A: 'C(3)H(5)N(1)O(1)S(0)'
-B: 'C(1000000)'
-C: 'C(3)H(5)N(1)O(1)S(1)'
-D: 'C(4)H(5)N(1)O(3)S(0)'
-E: 'C(5)H(7)N(1)O(3)S(0)'
-F: 'C(9)H(9)N(1)O(1)S(0)'
-G: 'C(2)H(3)N(1)O(1)S(0)'
-H: 'C(6)H(7)N(3)O(1)S(0)'
-I: 'C(6)H(11)N(1)O(1)S(0)'
-# I/L
-J: 'C(6)H(11)N(1)O(1)S(0)'
-K: 'C(6)H(12)N(2)O(1)S(0)'
-L: 'C(6)H(11)N(1)O(1)S(0)'
-M: 'C(5)H(9)N(1)O(1)S(1)'
-N: 'C(4)H(6)N(2)O(2)S(0)'
-# Pyrrolysine
-O: 'C(12)H(19)N(3)O(2)'
-P: 'C(5)H(7)N(1)O(1)S(0)'
-Q: 'C(5)H(8)N(2)O(2)S(0)'
-R: 'C(6)H(12)N(4)O(1)S(0)'
-S: 'C(3)H(5)N(1)O(2)S(0)'
-T: 'C(4)H(7)N(1)O(2)S(0)'
-# Selenocysteine
-U: 'C(3)H(5)N(1)O(1)Se(1)'
-V: 'C(5)H(9)N(1)O(1)S(0)'
-W: 'C(11)H(10)N(2)O(1)S(0)'
-X: 'C(1000000)'
-Y: 'C(9)H(9)N(1)O(2)S(0)'
-Z: 'C(1000000)'
-# Any other ASCII chars could be the placeholders for future usage.
-# For example:
-# phospho site-specific search (only lower case 'sty' can be modified)
-# s is S
-s: 'C(3)H(5)N(1)O(2)S(0)'
-# t is T
-t: 'C(4)H(8)N(1)O(5)P(1)'
-# y is Y
-y: 'C(9)H(9)N(1)O(2)S(0)'
diff --git a/alphabase/constants/const_files/modification.tsv b/alphabase/constants/const_files/modification.tsv
index 6d28216b..705c5886 100644
--- a/alphabase/constants/const_files/modification.tsv
+++ b/alphabase/constants/const_files/modification.tsv
@@ -1,16 +1,16 @@
mod_name unimod_mass unimod_avge_mass composition unimod_modloss modloss_composition classification unimod_id smiles modloss_importance
Acetyl@T 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 0.0
-Acetyl@Protein_N-term 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 0.0
+Acetyl@Protein_N-term 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 C(=O)C 0.0
Acetyl@S 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 0.0
Acetyl@C 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 0.0
-Acetyl@Any_N-term 42.010565 42.0367 H(2)C(2)O(1) 0.0 Multiple 1 0.0
-Acetyl@K 42.010565 42.0367 H(2)C(2)O(1) 0.0 Multiple 1 0.0
+Acetyl@Any_N-term 42.010565 42.0367 H(2)C(2)O(1) 0.0 Multiple 1 C(=O)C 0.0
+Acetyl@K 42.010565 42.0367 H(2)C(2)O(1) 0.0 Multiple 1 CC(=O)NCCCC[C@H](N([Xe])([Xe]))C(=O)[Rn] 0.0
Acetyl@Y 42.010565 42.0367 H(2)C(2)O(1) 0.0 Chemical derivative 1 0.0
Acetyl@H 42.010565 42.0367 H(2)C(2)O(1) 0.0 Chemical derivative 1 0.0
Acetyl@R 42.010565 42.0367 H(2)C(2)O(1) 0.0 Artefact 1 0.0
-Amidated@Any_C-term -0.984016 -0.9848 H(1)N(1)O(-1) 0.0 Artefact 2 0.0
-Amidated@Protein_C-term -0.984016 -0.9848 H(1)N(1)O(-1) 0.0 Post-translational 2 0.0
-Biotin@Any_N-term 226.077598 226.2954 H(14)C(10)N(2)O(2)S(1) 0.0 Chemical derivative 3 0.0
+Amidated@Any_C-term -0.984016 -0.9848 H(1)N(1)O(-1) 0.0 Artefact 2 N 0.0
+Amidated@Protein_C-term -0.984016 -0.9848 H(1)N(1)O(-1) 0.0 Post-translational 2 N 0.0
+Biotin@Any_N-term 226.077598 226.2954 H(14)C(10)N(2)O(2)S(1) 0.0 Chemical derivative 3 C(=O)CCCCC1SCC2NC(=O)NC21 0.0
Biotin@K 226.077598 226.2954 H(14)C(10)N(2)O(2)S(1) 0.0 Post-translational 3 0.0
Carbamidomethyl@Y 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
Carbamidomethyl@T 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
@@ -18,32 +18,32 @@ Carbamidomethyl@S 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
Carbamidomethyl@E 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
Carbamidomethyl@D 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
Carbamidomethyl@H 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
-Carbamidomethyl@Any_N-term 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
+Carbamidomethyl@Any_N-term 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 C(=O)NC 0.0
Carbamidomethyl@K 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0
-Carbamidomethyl@C 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 4 0.0
+Carbamidomethyl@C 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 4 C(C(C(=O)[Rn])N([Xe])([Xe]))SCC(=O)N 0.0
Carbamidomethyl@U 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 4 0.0
-Carbamidomethyl@M 57.021464 57.0513 H(3)C(2)N(1)O(1) 105.024835 H(7)C(3)N(1)O(1)S(1) Chemical derivative 4 0.5
+Carbamidomethyl@M 57.021464 57.0513 H(3)C(2)N(1)O(1) 105.024835 H(7)C(3)N(1)O(1)S(1) Chemical derivative 4 CS(CCC(N([Xe])([Xe]))C([Rn])=O)=CC(N)=O 0.5
Carbamyl@Y 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Chemical derivative 5 0.0
Carbamyl@T 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Chemical derivative 5 0.0
Carbamyl@S 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Chemical derivative 5 0.0
Carbamyl@M 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Artefact 5 0.0
Carbamyl@C 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Artefact 5 0.0
Carbamyl@R 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Artefact 5 0.0
-Carbamyl@Any_N-term 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Multiple 5 0.0
+Carbamyl@Any_N-term 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Multiple 5 C(=O)N 0.0
Carbamyl@K 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Multiple 5 0.0
-Carbamyl@Protein_N-term 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Post-translational 5 0.0
+Carbamyl@Protein_N-term 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Post-translational 5 C(=O)N 0.0
Carboxymethyl@Any_N-term 58.005479 58.0361 H(2)C(2)O(2) 0.0 Artefact 6 0.0
Carboxymethyl@K 58.005479 58.0361 H(2)C(2)O(2) 0.0 Artefact 6 0.0
Carboxymethyl@C 58.005479 58.0361 H(2)C(2)O(2) 0.0 Chemical derivative 6 0.0
Carboxymethyl@W 58.005479 58.0361 H(2)C(2)O(2) 0.0 Chemical derivative 6 0.0
Carboxymethyl@U 58.005479 58.0361 H(2)C(2)O(2) 0.0 Chemical derivative 6 0.0
-Deamidated@Q 0.984016 0.9848 H(-1)N(-1)O(1) 0.0 Artefact 7 0.0
+Deamidated@Q 0.984016 0.9848 H(-1)N(-1)O(1) 0.0 Artefact 7 C(CC(=O)O)[C@@H](C(=O)[Rn])N([Xe])([Xe]) 0.0
Deamidated@R 0.984016 0.9848 H(-1)N(-1)O(1) 43.005814 H(1)C(1)N(1)O(1) Post-translational 7 0.5
-Deamidated@N 0.984016 0.9848 H(-1)N(-1)O(1) 0.0 Artefact 7 0.0
+Deamidated@N 0.984016 0.9848 H(-1)N(-1)O(1) 0.0 Artefact 7 C([C@@H](C(=O)[Rn])N([Xe])([Xe]))C(=O)O 0.0
Deamidated@F^Protein_N-term 0.984016 0.9848 H(-1)N(-1)O(1) 0.0 Post-translational 7 0.0
ICAT-G@C 486.251206 486.6253 H(38)C(22)N(4)O(6)S(1) 0.0 Isotopic label 8 0.0
ICAT-G:2H(8)@C 494.30142 494.6746 H(30)2H(8)C(22)N(4)O(6)S(1) 0.0 Isotopic label 9 0.0
-Met->Hse@M^Any_C-term -29.992806 -30.0922 H(-2)C(-1)O(1)S(-1) 0.0 Chemical derivative 10 0.0
+Met->Hse@M^Any_C-term -29.992806 -30.0922 H(-2)C(-1)O(1)S(-1) 0.0 Chemical derivative 10 N([Xe])([Xe])[C@H](C(=O)[Rn])CCO 0.0
Met->Hsl@M^Any_C-term -48.003371 -48.1075 H(-4)C(-1)S(-1) 0.0 Chemical derivative 11 0.0
ICAT-D:2H(8)@C 450.275205 450.6221 H(26)2H(8)C(20)N(4)O(5)S(1) 0.0 Isotopic label 12 0.0
ICAT-D@C 442.224991 442.5728 H(34)C(20)N(4)O(5)S(1) 0.0 Isotopic label 13 0.0
@@ -55,9 +55,9 @@ Phospho@K 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0
Phospho@H 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0
Phospho@C 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0
Phospho@D 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0
-Phospho@Y 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0
-Phospho@T 79.966331 79.9799 H(1)O(3)P(1) 97.976896 H(3)O(4)P(1) Post-translational 21 10000000.0
-Phospho@S 79.966331 79.9799 H(1)O(3)P(1) 97.976896 H(3)O(4)P(1) Post-translational 21 100000000.0
+Phospho@Y 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 C1=CC(=CC=C1CC(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O 0.0
+Phospho@T 79.966331 79.9799 H(1)O(3)P(1) 97.976896 H(3)O(4)P(1) Post-translational 21 CC(C(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O 10000000.0
+Phospho@S 79.966331 79.9799 H(1)O(3)P(1) 97.976896 H(3)O(4)P(1) Post-translational 21 O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe]) 100000000.0
Methamidophos-S@Y 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0
Methamidophos-S@T 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0
Methamidophos-S@S 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0
@@ -73,27 +73,27 @@ Dehydrated@Q^Protein_C-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Post-translation
Dehydrated@C^Any_N-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Artefact 23 0.0
Propionamide@C 71.037114 71.0779 H(5)C(3)N(1)O(1) 0.0 Artefact 24 0.0
Propionamide@K 71.037114 71.0779 H(5)C(3)N(1)O(1) 0.0 Chemical derivative 24 0.0
-Propionamide@Any_N-term 71.037114 71.0779 H(5)C(3)N(1)O(1) 0.0 Chemical derivative 24 0.0
-Pyridylacetyl@Any_N-term 119.037114 119.1207 H(5)C(7)N(1)O(1) 0.0 Chemical derivative 25 0.0
+Propionamide@Any_N-term 71.037114 71.0779 H(5)C(3)N(1)O(1) 0.0 Chemical derivative 24 CCC(N)=O 0.0
+Pyridylacetyl@Any_N-term 119.037114 119.1207 H(5)C(7)N(1)O(1) 0.0 Chemical derivative 25 C(=O)Cc1ccccn1 0.0
Pyridylacetyl@K 119.037114 119.1207 H(5)C(7)N(1)O(1) 0.0 Chemical derivative 25 0.0
Pyro-carbamidomethyl@C^Any_N-term 39.994915 40.0208 C(2)O(1) 0.0 Artefact 26 0.0
-Glu->pyro-Glu@E^Any_N-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Artefact 27 0.0
-Gln->pyro-Glu@Q^Any_N-term -17.026549 -17.0305 H(-3)N(-1) 0.0 Artefact 28 0.0
+Glu->pyro-Glu@E^Any_N-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Artefact 27 O=C([Rn])[C@H]1N([Xe])C(=O)CC1 0.0
+Gln->pyro-Glu@Q^Any_N-term -17.026549 -17.0305 H(-3)N(-1) 0.0 Artefact 28 O=C([Rn])[C@H]1N([Xe])C(=O)CC1 0.0
SMA@Any_N-term 127.063329 127.1412 H(9)C(6)N(1)O(2) 0.0 Chemical derivative 29 0.0
SMA@K 127.063329 127.1412 H(9)C(6)N(1)O(2) 0.0 Chemical derivative 29 0.0
Cation:Na@D 21.981943 21.9818 H(-1)Na(1) 0.0 Artefact 30 0.0
-Cation:Na@Any_C-term 21.981943 21.9818 H(-1)Na(1) 0.0 Artefact 30 0.0
+Cation:Na@Any_C-term 21.981943 21.9818 H(-1)Na(1) 0.0 Artefact 30 O[Na] 0.0
Cation:Na@E 21.981943 21.9818 H(-1)Na(1) 0.0 Artefact 30 0.0
-Pyridylethyl@C 105.057849 105.1372 H(7)C(7)N(1) 0.0 Chemical derivative 31 0.0
+Pyridylethyl@C 105.057849 105.1372 H(7)C(7)N(1) 0.0 Chemical derivative 31 C1=CN=CC=C1CCSCC(C(=O)[Rn])N([Xe])([Xe]) 0.0
Methyl@E 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
Methyl@D 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
-Methyl@Any_C-term 14.01565 14.0266 H(2)C(1) 0.0 Multiple 34 0.0
-Methyl@Protein_N-term 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
+Methyl@Any_C-term 14.01565 14.0266 H(2)C(1) 0.0 Multiple 34 OC 0.0
+Methyl@Protein_N-term 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 C 0.0
Methyl@L 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
Methyl@I 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
Methyl@R 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
Methyl@Q 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
-Methyl@Any_N-term 14.01565 14.0266 H(2)C(1) 0.0 Chemical derivative 34 0.0
+Methyl@Any_N-term 14.01565 14.0266 H(2)C(1) 0.0 Chemical derivative 34 C 0.0
Methyl@N 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
Methyl@K 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
Methyl@H 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0
@@ -113,23 +113,23 @@ Oxidation@C 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0
Oxidation@H 15.994915 15.9994 O(1) 0.0 Artefact 35 0.0
Oxidation@V 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0
Oxidation@R 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0
-Oxidation@M 15.994915 15.9994 O(1) 63.998285 H(4)C(1)O(1)S(1) Artefact 35 0.5
+Oxidation@M 15.994915 15.9994 O(1) 63.998285 H(4)C(1)O(1)S(1) Artefact 35 O=C([Rn])C(N([Xe])([Xe]))CCS(=O)C 0.5
Oxidation@Y 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0
Oxidation@F 15.994915 15.9994 O(1) 0.0 Artefact 35 0.0
Oxidation@P 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0
Oxidation@N 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0
Oxidation@K 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0
Oxidation@D 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0
-Dimethyl@Protein_N-term 28.0313 28.0532 H(4)C(2) 0.0 Isotopic label 36 0.0
+Dimethyl@Protein_N-term 28.0313 28.0532 H(4)C(2) 0.0 Isotopic label 36 C 0.0
Dimethyl@P^Protein_N-term 28.0313 28.0532 H(4)C(2) 0.0 Post-translational 36 0.0
Dimethyl@N 28.0313 28.0532 H(4)C(2) 0.0 Post-translational 36 0.0
-Dimethyl@Any_N-term 28.0313 28.0532 H(4)C(2) 0.0 Isotopic label 36 0.0
-Dimethyl@K 28.0313 28.0532 H(4)C(2) 0.0 Multiple 36 0.0
+Dimethyl@Any_N-term 28.0313 28.0532 H(4)C(2) 0.0 Isotopic label 36 C 0.0
+Dimethyl@K 28.0313 28.0532 H(4)C(2) 0.0 Multiple 36 CN(C)CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn] 0.0
Dimethyl@R 28.0313 28.0532 H(4)C(2) 0.0 Post-translational 36 0.0
Trimethyl@A^Protein_N-term 42.04695 42.0797 H(6)C(3) 0.0 Post-translational 37 0.0
Trimethyl@R 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 37 0.0
Trimethyl@K 42.04695 42.0797 H(6)C(3) 59.073499 H(9)C(3)N(1) Post-translational 37 0.5
-Methylthio@C 45.987721 46.0916 H(2)C(1)S(1) 0.0 Multiple 39 0.0
+Methylthio@C 45.987721 46.0916 H(2)C(1)S(1) 0.0 Multiple 39 CSSC[C@H](N([Xe])([Xe]))C([Rn])=O 0.0
Methylthio@N 45.987721 46.0916 H(2)C(1)S(1) 0.0 Post-translational 39 0.0
Methylthio@D 45.987721 46.0916 H(2)C(1)S(1) 0.0 Post-translational 39 0.0
Methylthio@K 45.987721 46.0916 H(2)C(1)S(1) 0.0 Artefact 39 0.0
@@ -186,11 +186,11 @@ Acetyl:2H(3)@H 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0
Acetyl:2H(3)@Any_N-term 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0
Acetyl:2H(3)@K 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0
Acetyl:2H(3)@Protein_N-term 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0
-Propionyl@Protein_N-term 56.026215 56.0633 H(4)C(3)O(1) 0.0 Multiple 58 0.0
+Propionyl@Protein_N-term 56.026215 56.0633 H(4)C(3)O(1) 0.0 Multiple 58 C(=O)CC 0.0
Propionyl@T 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 0.0
Propionyl@S 56.026215 56.0633 H(4)C(3)O(1) 0.0 Chemical derivative 58 0.0
-Propionyl@K 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 0.0
-Propionyl@Any_N-term 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 0.0
+Propionyl@K 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 CCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) 0.0
+Propionyl@Any_N-term 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 C(=O)CC 0.0
Propionyl:13C(3)@Any_N-term 59.036279 59.0412 H(4)13C(3)O(1) 0.0 Isotopic label 59 0.0
Propionyl:13C(3)@K 59.036279 59.0412 H(4)13C(3)O(1) 0.0 Isotopic label 59 0.0
GIST-Quat@Any_N-term 127.099714 127.1842 H(13)C(7)N(1)O(1) 59.073499 H(9)C(3)N(1) Isotopic label 60 0.5
@@ -203,7 +203,7 @@ GIST-Quat:2H(9)@Any_N-term 136.156205 136.2397 H(4)2H(9)C(7)N(1)O(1) 68.12999 2H
GIST-Quat:2H(9)@K 136.156205 136.2397 H(4)2H(9)C(7)N(1)O(1) 68.12999 2H(9)C(3)N(1) Isotopic label 63 0.5
Succinyl@Protein_N-term 100.016044 100.0728 H(4)C(4)O(3) 0.0 Post-translational 64 0.0
Succinyl@Any_N-term 100.016044 100.0728 H(4)C(4)O(3) 0.0 Isotopic label 64 0.0
-Succinyl@K 100.016044 100.0728 H(4)C(4)O(3) 0.0 Isotopic label 64 0.0
+Succinyl@K 100.016044 100.0728 H(4)C(4)O(3) 0.0 Isotopic label 64 C(CCN)CC(C(=O)[Rn])N([Xe])C(=O)CCC(=O)O 0.0
Succinyl:2H(4)@Any_N-term 104.041151 104.0974 2H(4)C(4)O(3) 0.0 Isotopic label 65 0.0
Succinyl:2H(4)@K 104.041151 104.0974 2H(4)C(4)O(3) 0.0 Isotopic label 65 0.0
Succinyl:13C(4)@Any_N-term 104.029463 104.0434 H(4)13C(4)O(3) 0.0 Isotopic label 66 0.0
@@ -239,7 +239,7 @@ IMID@K 68.037448 68.0773 H(4)C(3)N(2) 0.0 Isotopic label 94 0.0
IMID:2H(4)@K 72.062555 72.1019 2H(4)C(3)N(2) 0.0 Isotopic label 95 0.0
Lysbiotinhydrazide@K 241.088497 241.31 H(15)C(10)N(3)O(2)S(1) 0.0 Chemical derivative 353 0.0
Propionamide:2H(3)@C 74.055944 74.0964 H(2)2H(3)C(3)N(1)O(1) 0.0 Isotopic label 97 0.0
-Nitro@Y 44.985078 44.9976 H(-1)N(1)O(2) 0.0 Chemical derivative 354 0.0
+Nitro@Y 44.985078 44.9976 H(-1)N(1)O(2) 0.0 Chemical derivative 354 O=[N+]([O-])c1cc(ccc1O)C[C@@H](C(=O)[Rn])N([Xe])([Xe]) 0.0
Nitro@W 44.985078 44.9976 H(-1)N(1)O(2) 0.0 Chemical derivative 354 0.0
Nitro@F 44.985078 44.9976 H(-1)N(1)O(2) 0.0 Artefact 354 0.0
ICAT-C@C 227.126991 227.2603 H(17)C(10)N(3)O(3) 0.0 Isotopic label 105 0.0
@@ -273,7 +273,7 @@ Formyl@Any_N-term 27.994915 28.0101 C(1)O(1) 0.0 Artefact 122 0.0
Formyl@S 27.994915 28.0101 C(1)O(1) 0.0 Artefact 122 0.0
ICAT-H@C 345.097915 345.7754 H(20)C(15)N(1)O(6)Cl(1) 0.0 Isotopic label 123 0.0
ICAT-H:13C(6)@C 351.118044 351.7313 H(20)C(9)13C(6)N(1)O(6)Cl(1) 0.0 Isotopic label 124 0.0
-Cation:K@Any_C-term 37.955882 38.0904 H(-1)K(1) 0.0 Artefact 530 0.0
+Cation:K@Any_C-term 37.955882 38.0904 H(-1)K(1) 0.0 Artefact 530 O[K] 0.0
Cation:K@E 37.955882 38.0904 H(-1)K(1) 0.0 Artefact 530 0.0
Cation:K@D 37.955882 38.0904 H(-1)K(1) 0.0 Artefact 530 0.0
Xlink:DTSSP[88]@Protein_N-term 87.998285 88.1283 H(4)C(3)O(1)S(1) 0.0 Chemical derivative 126 0.0
@@ -351,8 +351,8 @@ NBS:13C(6)@W 159.008578 159.1144 H(3)13C(6)N(1)O(2)S(1) 0.0 Chemical derivative
Methyl:2H(3)13C(1)@K 18.037835 18.0377 H(-1)2H(3)13C(1) 0.0 Isotopic label 329 0.0
Methyl:2H(3)13C(1)@R 18.037835 18.0377 H(-1)2H(3)13C(1) 0.0 Isotopic label 329 0.0
Methyl:2H(3)13C(1)@Any_N-term 18.037835 18.0377 H(-1)2H(3)13C(1) 0.0 Isotopic label 329 0.0
-Dimethyl:2H(6)13C(2)@Protein_N-term 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 0.0
-Dimethyl:2H(6)13C(2)@Any_N-term 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 0.0
+Dimethyl:2H(6)13C(2)@Protein_N-term 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 [13C]([2H])([2H])([2H]) 0.0
+Dimethyl:2H(6)13C(2)@Any_N-term 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 [13C]([2H])([2H])([2H]) 0.0
Dimethyl:2H(6)13C(2)@R 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 0.0
Dimethyl:2H(6)13C(2)@K 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 0.0
NBS@W 152.988449 153.1585 H(3)C(6)N(1)O(2)S(1) 0.0 Chemical derivative 172 0.0
@@ -378,8 +378,8 @@ QAT:2H(3)@C 174.168569 174.2784 H(16)2H(3)C(9)N(2)O(1) 0.0 Isotopic label 196
Label:18O(2)@Any_C-term 4.008491 3.9995 O(-2)18O(2) 0.0 Isotopic label 193 0.0
AccQTag@Any_N-term 170.048013 170.1674 H(6)C(10)N(2)O(1) 0.0 Chemical derivative 194 0.0
AccQTag@K 170.048013 170.1674 H(6)C(10)N(2)O(1) 0.0 Chemical derivative 194 0.0
-Dimethyl:2H(4)@Protein_N-term 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 0.0
-Dimethyl:2H(4)@Any_N-term 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 0.0
+Dimethyl:2H(4)@Protein_N-term 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 C([2H])([2H])([1H]) 0.0
+Dimethyl:2H(4)@Any_N-term 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 C([2H])([2H])([1H]) 0.0
Dimethyl:2H(4)@K 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 0.0
Dimethyl:2H(4)@R 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 0.0
EQAT@C 184.157563 184.2786 H(20)C(10)N(2)O(1) 0.0 Chemical derivative 197 0.0
@@ -475,7 +475,7 @@ Ethanolyl@C 44.026215 44.0526 H(4)C(2)O(1) 0.0 Chemical derivative 278 0.0
Ethanolyl@R 44.026215 44.0526 H(4)C(2)O(1) 0.0 Chemical derivative 278 0.0
Label:13C(6)15N(2)+Dimethyl@K 36.045499 35.9959 H(4)C(-4)13C(6)N(-2)15N(2) 0.0 Isotopic label 987 0.0
HMVK@C 86.036779 86.0892 H(6)C(4)O(2) 0.0 Chemical derivative 371 0.0
-Ethyl@Any_C-term 28.0313 28.0532 H(4)C(2) 0.0 Chemical derivative 280 0.0
+Ethyl@Any_C-term 28.0313 28.0532 H(4)C(2) 0.0 Chemical derivative 280 OCC 0.0
Ethyl@Protein_N-term 28.0313 28.0532 H(4)C(2) 0.0 Chemical derivative 280 0.0
Ethyl@E 28.0313 28.0532 H(4)C(2) 0.0 Artefact 280 0.0
Ethyl@Any_N-term 28.0313 28.0532 H(4)C(2) 0.0 Multiple 280 0.0
@@ -589,7 +589,7 @@ ICPL@Any_N-term 105.021464 105.0941 H(3)C(6)N(1)O(1) 0.0 Isotopic label 365 0.
Deamidated:18O(1)@Q 2.988261 2.9845 H(-1)N(-1)18O(1) 0.0 Isotopic label 366 0.0
Deamidated:18O(1)@N 2.988261 2.9845 H(-1)N(-1)18O(1) 0.0 Isotopic label 366 0.0
Arg->Orn@R -42.021798 -42.04 H(-2)C(-1)N(-2) 0.0 Artefact 372 0.0
-Cation:Cu[I]@Any_C-term 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 0.0
+Cation:Cu[I]@Any_C-term 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 O[Cu] 0.0
Cation:Cu[I]@E 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 0.0
Cation:Cu[I]@D 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 0.0
Cation:Cu[I]@H 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 0.0
@@ -761,10 +761,10 @@ Diethyl@Any_N-term 56.0626 56.1063 H(8)C(4) 0.0 Chemical derivative 518 0.0
Diethyl@K 56.0626 56.1063 H(8)C(4) 0.0 Chemical derivative 518 0.0
LG-Hlactam-K@Protein_N-term 348.193674 348.4333 H(28)C(20)O(5) 0.0 Post-translational 504 0.0
LG-Hlactam-K@K 348.193674 348.4333 H(28)C(20)O(5) 0.0 Post-translational 504 0.0
-Dimethyl:2H(4)13C(2)@Protein_N-term 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 0.0
+Dimethyl:2H(4)13C(2)@Protein_N-term 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 [13C]([2H])([2H])([1H]) 0.0
Dimethyl:2H(4)13C(2)@R 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 0.0
Dimethyl:2H(4)13C(2)@K 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 0.0
-Dimethyl:2H(4)13C(2)@Any_N-term 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 0.0
+Dimethyl:2H(4)13C(2)@Any_N-term 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 [13C]([2H])([2H])([1H]) 0.0
C8-QAT@Any_N-term 227.224915 227.3862 H(29)C(14)N(1)O(1) 0.0 Chemical derivative 513 0.0
C8-QAT@K 227.224915 227.3862 H(29)C(14)N(1)O(1) 0.0 Chemical derivative 513 0.0
Hex(2)@R 324.105647 324.2812 H(20)C(12)O(10) 0.0 Other glycosylation 512 0.0
@@ -1055,7 +1055,7 @@ Biotin:Aha-DADPS@M 922.465403 923.2022 H(70)C(42)N(8)O(11)S(1)Si(1) 0.0 Chemica
NO_SMX_SIMD@C 267.031377 267.2612 H(9)C(10)N(3)O(4)S(1) 0.0 Chemical derivative 746 0.0
Malonyl@C 86.000394 86.0462 H(2)C(3)O(3) 0.0 Chemical derivative 747 0.0
Malonyl@S 86.000394 86.0462 H(2)C(3)O(3) 0.0 Chemical derivative 747 0.0
-Malonyl@K 86.000394 86.0462 H(2)C(3)O(3) 0.0 Post-translational 747 0.0
+Malonyl@K 86.000394 86.0462 H(2)C(3)O(3) 0.0 Post-translational 747 N([Xe])([Xe])[C@@H](CCCC(NC(=O)CC(=O)O))C(=O)[Rn] 0.0
3sulfo@Any_N-term 183.983029 184.1693 H(4)C(7)O(4)S(1) 0.0 Chemical derivative 748 0.0
trifluoro@L 53.971735 53.9714 H(-3)F(3) 0.0 Non-standard residue 750 0.0
TNBS@Any_N-term 210.986535 211.0886 H(1)C(6)N(3)O(6) 0.0 Chemical derivative 751 0.0
@@ -1107,8 +1107,8 @@ cGMP@C 343.031785 343.1895 H(10)C(10)N(5)O(7)P(1) 0.0 Post-translational 849 0
cGMP+RMP-loss@C 150.041585 150.1182 H(4)C(5)N(5)O(1) 0.0 Post-translational 851 0.0
cGMP+RMP-loss@S 150.041585 150.1182 H(4)C(5)N(5)O(1) 0.0 Post-translational 851 0.0
mTRAQ@Y 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
-mTRAQ@Any_N-term 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
-mTRAQ@K 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
+mTRAQ@Any_N-term 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 C(=O)CN1CCN(CC1)C 0.0
+mTRAQ@K 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 [H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)CN1CCN(C)CC1 0.0
mTRAQ@H 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
mTRAQ@S 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
mTRAQ@T 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
@@ -1129,8 +1129,8 @@ mTRAQ:13C(3)15N(1)@S 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isot
mTRAQ:13C(3)15N(1)@T 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
mTRAQ:13C(3)15N(1)@H 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
mTRAQ:13C(3)15N(1)@Y 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
-mTRAQ:13C(3)15N(1)@Any_N-term 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
-mTRAQ:13C(3)15N(1)@K 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
+mTRAQ:13C(3)15N(1)@Any_N-term 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C 0.0
+mTRAQ:13C(3)15N(1)@K 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 [H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1CCN(C)[13CH2][13CH2]1 0.0
Methyl-PEO12-Maleimide@C 710.383719 710.8073 H(58)C(32)N(2)O(15) 0.0 Chemical derivative 891 0.0
MDCC@C 383.148121 383.3978 H(21)C(20)N(3)O(5) 0.0 Chemical derivative 887 0.0
QQQTGG@K 599.266339 599.5942 H(37)C(23)N(9)O(10) 0.0 Other 877 0.0
@@ -1198,7 +1198,7 @@ LG-anhyropyrrole@K 298.19328 298.4192 H(26)C(20)O(2) 0.0 Post-translational 948
3-deoxyglucosone@R 144.042259 144.1253 H(8)C(6)O(4) 0.0 Multiple 949 0.0
Cation:Li@D 6.008178 5.9331 H(-1)Li(1) 0.0 Artefact 950 0.0
Cation:Li@E 6.008178 5.9331 H(-1)Li(1) 0.0 Artefact 950 0.0
-Cation:Li@Any_C-term 6.008178 5.9331 H(-1)Li(1) 0.0 Artefact 950 0.0
+Cation:Li@Any_C-term 6.008178 5.9331 H(-1)Li(1) 0.0 Artefact 950 O[Li] 0.0
Cation:Ca[II]@Any_C-term 37.946941 38.0621 H(-2)Ca(1) 0.0 Artefact 951 0.0
Cation:Ca[II]@E 37.946941 38.0621 H(-2)Ca(1) 0.0 Artefact 951 0.0
Cation:Ca[II]@D 37.946941 38.0621 H(-2)Ca(1) 0.0 Artefact 951 0.0
@@ -1577,7 +1577,7 @@ UgiJoullieProGlyProGly@E 308.148455 308.333 H(20)C(14)N(4)O(4) 0.0 Chemical der
Arg-loss@R^Any_C-term -156.101111 -156.1857 H(-12)C(-6)N(-4)O(-1) 0.0 Other 1287 0.0
Arg@Any_N-term 156.101111 156.1857 H(12)C(6)N(4)O(1) 0.0 Other 1288 0.0
IMEHex(2)NeuAc(1)@K 688.199683 688.6527 H(40)C(25)N(2)O(18)S(1) 0.0 Other glycosylation 1286 0.0
-Butyryl@K 70.041865 70.0898 H(6)C(4)O(1) 0.0 Post-translational 1289 0.0
+Butyryl@K 70.041865 70.0898 H(6)C(4)O(1) 0.0 Post-translational 1289 CCCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) 0.0
Dicarbamidomethyl@K 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0
Dicarbamidomethyl@H 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0
Dicarbamidomethyl@C 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0
@@ -1595,8 +1595,8 @@ Label:13C(4)15N(1)@D 5.010454 4.964 C(-4)13C(4)N(-1)15N(1) 0.0 Isotopic label 1
Label:2H(10)@L 10.062767 10.0616 H(-10)2H(10) 0.0 Isotopic label 1299 0.0
Label:2H(4)13C(1)@R 5.028462 5.0173 H(-4)2H(4)C(-1)13C(1) 0.0 Isotopic label 1300 0.0
Lys@Any_N-term 128.094963 128.1723 H(12)C(6)N(2)O(1) 0.0 Other 1301 0.0
-mTRAQ:13C(6)15N(2)@K 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0
-mTRAQ:13C(6)15N(2)@Any_N-term 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0
+mTRAQ:13C(6)15N(2)@K 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 [H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1[13CH2][13CH2][15N]([13CH3])[13CH2][13CH2]1 0.0
+mTRAQ:13C(6)15N(2)@Any_N-term 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H]) 0.0
mTRAQ:13C(6)15N(2)@Y 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0
mTRAQ:13C(6)15N(2)@H 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0
mTRAQ:13C(6)15N(2)@S 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0
@@ -1611,8 +1611,8 @@ Propyl@D 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 0.0
Propyl@K 42.04695 42.0797 H(6)C(3) 0.0 Isotopic label 1305 0.0
Propyl@Any_N-term 42.04695 42.0797 H(6)C(3) 0.0 Isotopic label 1305 0.0
Propyl@E 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 0.0
-Propyl@Any_C-term 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 0.0
-Propyl@Protein_C-term 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 0.0
+Propyl@Any_C-term 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 OCCC 0.0
+Propyl@Protein_C-term 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 OCCC 0.0
Propyl:2H(6)@Any_N-term 48.084611 48.1167 2H(6)C(3) 0.0 Isotopic label 1306 0.0
Propyl:2H(6)@K 48.084611 48.1167 2H(6)C(3) 0.0 Isotopic label 1306 0.0
Propiophenone@C 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0
@@ -1677,7 +1677,7 @@ HN2_mustard@H 101.084064 101.1469 H(11)C(5)N(1)O(1) 0.0 Post-translational 1388
HN2_mustard@K 101.084064 101.1469 H(11)C(5)N(1)O(1) 0.0 Post-translational 1388 0.0
HN2_mustard@C 101.084064 101.1469 H(11)C(5)N(1)O(1) 0.0 Post-translational 1388 0.0
NEM:2H(5)+H2O@C 148.089627 148.1714 H(4)2H(5)C(6)N(1)O(3) 0.0 Chemical derivative 1358 0.0
-Crotonyl@K 68.026215 68.074 H(4)C(4)O(1) 0.0 Post-translational 1363 0.0
+Crotonyl@K 68.026215 68.074 H(4)C(4)O(1) 0.0 Post-translational 1363 CC=CC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) 0.0
O-Et-N-diMePhospho@S 135.044916 135.1015 H(10)C(4)N(1)O(2)P(1) 0.0 Chemical derivative 1364 0.0
N-dimethylphosphate@S 107.013615 107.0483 H(6)C(2)N(1)O(2)P(1) 0.0 Chemical derivative 1365 0.0
phosphoRibosyl@E 212.00859 212.0945 H(9)C(5)O(7)P(1) 0.0 Post-translational 1356 0.0
@@ -2770,4 +2770,28 @@ NQTGG@K 457.192111 457.4384 H(27)C(17)N(7)O(8) 0.0 Other 2084 0.0
DVFQQQTGG@K 960.43011 960.9865 H(60)C(41)N(12)O(15) 0.0 Other 2085 0.0
iST-NHS_specific_cysteine_modification@C 113.084064 113.1576 H(11)C(6)N(1)O(1) 0.0 Chemical derivative 2086 0.0
Label:13C(2)15N(1)@G 3.003745 2.9787 C(-2)13C(2)N(-1)15N(1) 0.0 Isotopic label 2088 0.0
-GlyGly@K 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Multiple 121 1000000.0
+GlyGly@K 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Multiple 121 NCC(=O)NCC(=O)NCCCC[C@H](N([Xe])([Xe]))C([Rn])=O 1000000.0
+Pro->(2S,4R)-4-fluoroproline@P 0.0 0.0 F(1)H(-1) 0.0 User-added 0 F[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn] 0.0
+Pro->(2S,4S)-4fluoroproline@P 0.0 0.0 F(1)H(-1) 0.0 User-added 0 F[C@H]1C[C@H](N([Xe])C1)C(=O)[Rn] 0.0
+Pro->(2S)-1,3-thiazolidine-2-carboxylic_acid@P 0.0 0.0 C(-1)H(-2)S(1) 0.0 User-added 0 S1[C@H](N([Xe])CC1)C(=O)[Rn] 0.0
+Pro->(4R)-1,3-Thiazolidine-4-carboxylic_acid@P 0.0 0.0 C(-1)H(-2)S(1) 0.0 User-added 0 S1CN([Xe])[C@@H](C1)C(=O)[Rn] 0.0
+Pro->(2S,4R)-4-hydroxyproline@P 0.0 0.0 O(1) 0.0 User-added 0 O[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn] 0.0
+Pro->(DL)-pipecolic_acid@P 0.0 0.0 C(1)H(2) 0.0 User-added 0 C1CCN([Xe])C(C1)C(=O)[Rn] 0.0
+Pro->3,4-Dehydro-L-proline@P 0.0 0.0 H(-2) 0.0 User-added 0 C1C=CC(N1([Xe]))C(=O)[Rn] 0.0
+Pro->(1S,3S,5S)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P 0.0 0.0 C(1) 0.0 User-added 0 [C@H]12N([Xe])[C@@H](C[C@@H]2C1)C(=O)[Rn] 0.0
+Pro->(1R,3S,5R)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P 0.0 0.0 C(1) 0.0 User-added 0 [C@@H]12N([Xe])[C@@H](C[C@H]2C1)C(=O)[Rn] 0.0
+Pro->(2S,3aS,7aS)-Octahydro-1H-indole-2-carboxylic_acid@P 0.0 0.0 C(4)H(6) 0.0 User-added 0 N1([Xe])[C@@H](C[C@@H]2CCCC[C@H]12)C(=O)[Rn] 0.0
+Pro->(DL)-5-trifluoromethylproline@P 0.0 0.0 C(1)F(3)H(-1) 0.0 User-added 0 FC(C1CCC(N1([Xe]))C(=O)[Rn])(F)F 0.0
+mTRAQ@Protein_N-term 0.0 0.0 C(7)H(12)N(2)O(1) 0.0 User-added 0 C(=O)CN1CCN(CC1)C 0.0
+mTRAQ:13C(3)15N(1)@Protein_N-term 0.0 0.0 13C(3)15N(1)C(4)H(12)N(1)O(1) 0.0 User-added 0 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C 0.0
+mTRAQ:13C(6)15N(2)@Protein_N-term 0.0 0.0 13C(6)15N(2)C(1)H(12)O(1) 0.0 User-added 0 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H]) 0.0
+Biotin@Protein_N-term 0.0 0.0 C(10)H(14)N(2)O(2)S(1) 0.0 User-added 0 C(=O)CCCCC1SCC2NC(=O)NC21 0.0
+Carbamidomethyl@Protein_N-term 0.0 0.0 C(2)H(3)N(1)O(1) 0.0 User-added 0 C(=O)NC 0.0
+Propionamide@Protein_N-term 0.0 0.0 C(3)H(5)N(1)O(1) 0.0 User-added 0 CCC(N)=O 0.0
+Pyridylacetyl@Protein_N-term 0.0 0.0 C(7)H(5)N(1)O(1) 0.0 User-added 0 C(=O)Cc1ccccn1 0.0
+Methyl@Protein_C-term 0.0 0.0 C(1)H(2) 0.0 User-added 0 OC 0.0
+Ethyl@Protein_C-term 0.0 0.0 C(2)H(4) 0.0 User-added 0 OCC 0.0
+Cation:Na@Protein_C-term 0.0 0.0 H(-1)Na(1) 0.0 User-added 0 O[Na] 0.0
+Cation:K@Protein_C-term 0.0 0.0 H(-1)K(1) 0.0 User-added 0 O[K] 0.0
+Cation:Cu[I]@Protein_C-term 0.0 0.0 Cu(1)H(-1) 0.0 User-added 0 O[Cu] 0.0
+Cation:Li@Protein_C-term 0.0 0.0 H(-1)Li(1) 0.0 User-added 0 O[Li] 0.0
diff --git a/alphabase/constants/modification.py b/alphabase/constants/modification.py
index 58ee7146..4738628c 100644
--- a/alphabase/constants/modification.py
+++ b/alphabase/constants/modification.py
@@ -362,12 +362,70 @@ def calc_modloss_mass(
return _calc_modloss(mod_losses[::-1])[-3:0:-1]
+def _check_mass_sanity(
+    mod_name: str,
+    composition: str,
+    smiles: str,
+):
+    """
+    Check that `composition` reproduces the mass already stored for `mod_name`.
+
+    Parameters
+    ----------
+    mod_name : str
+        Modification name (e.g. Mod@S)
+
+    composition : str
+        Composition formula (e.g. "H(4)O(2)"), used to calculate the mass of the modification
+
+    smiles : str
+        SMILES string of the modification; the check is skipped when it is empty
+
+    Raises
+    ------
+    ValueError
+        If the calculated mass differs from `MOD_MASS[mod_name]` beyond the tolerance
+    """
+    if not smiles or mod_name not in MOD_MASS:
+        return  # nothing to compare against: no SMILES given or no stored mass
+    composition_mass = calc_mass_from_formula(composition)
+    if not np.allclose(composition_mass, MOD_MASS[mod_name], atol=1e-5):
+        raise ValueError(
+            f"Modification mass of {mod_name} is inconsistent with the composition formula: {composition}, df version {MOD_DF.loc[mod_name, 'composition']},"
+            f" calculated_mass={composition_mass}, mod_mass={MOD_MASS[mod_name]}"
+        )
+
+
def _add_a_new_modification(
- mod_name: str, composition: str, modloss_composition: str = ""
+ mod_name: str,
+ composition: str,
+ modloss_composition: str = "",
+ smiles: str = "",
):
"""
- Add a new modification into :data:`MOD_DF`.
+ Add a new modification into :data:`MOD_DF` or update SMILES if modification already exists.
+
+ Parameters
+ ----------
+ mod_name : str
+ Modification name (e.g. Mod@S)
+
+ composition : str
+ Composition formula (e.g. "H(4)O(2)")
+
+ modloss_composition : str
+ Composition formula of the modification loss (e.g. "H(2)O(1)")
+
+ smiles : str
+ SMILES string of the modification
"""
+ _check_mass_sanity(mod_name, composition, smiles)
+
+ if mod_name in MOD_DF.index:
+ # If the modification already exists, only update the SMILES
+ MOD_DF.loc[mod_name, "smiles"] = smiles
+ return
+
MOD_DF.loc[
mod_name,
[
@@ -376,11 +434,14 @@ def _add_a_new_modification(
"modloss_composition",
"classification",
"unimod_id",
+ "smiles",
],
- ] = [mod_name, composition, modloss_composition, "User-added", 0]
+ ] = [mod_name, composition, modloss_composition, "User-added", 0, smiles]
+ composition_mass = calc_mass_from_formula(composition)
+ modloss_mass = calc_mass_from_formula(modloss_composition)
MOD_DF.loc[mod_name, ["mass", "modloss"]] = (
- calc_mass_from_formula(composition),
- calc_mass_from_formula(modloss_composition),
+ composition_mass,
+ modloss_mass,
)
if MOD_DF.loc[mod_name, "modloss"] > 0:
MOD_DF.loc[mod_name, "modloss_importance"] = 1e10
diff --git a/alphabase/smiles/__init__.py b/alphabase/smiles/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alphabase/smiles/smiles.py b/alphabase/smiles/smiles.py
new file mode 100644
index 00000000..4f264069
--- /dev/null
+++ b/alphabase/smiles/smiles.py
@@ -0,0 +1,285 @@
+from typing import Dict, Optional
+
+import pandas as pd
+from rdkit import Chem
+from rdkit.Chem.rdmolops import ReplaceSubstructs, SanitizeMol
+
+from alphabase.constants.aa import aa_formula
+from alphabase.constants.modification import MOD_DF
+
+
+class AminoAcidModifier:
+    """Modify amino acid SMILES with N-terminal and C-terminal modifications."""
+
+    # Placeholder atoms marking the N-terminal / C-terminal attachment points in a SMILES.
+    N_TERM_PLACEHOLDER = "[Xe]"
+    C_TERM_PLACEHOLDER = "[Rn]"
+
+    def __init__(self):
+        self._aa_smiles = None
+        self._aa_mols = None
+        self._n_term_modifications = None
+        self._c_term_modifications = None
+        self._ptm_dict = None
+
+    @property
+    def aa_smiles(self) -> Dict[str, str]:
+        """
+        Dictionary of amino acid SMILES.
+
+        Returns
+        -------
+        Dict[str, str]
+            A dictionary with amino acid codes as keys and their SMILES as values.
+        """
+        if self._aa_smiles is None:
+            self._aa_smiles = {
+                aa: smiles
+                for aa, smiles in aa_formula["smiles"].items()
+                if not pd.isna(smiles)
+            }
+        return self._aa_smiles
+
+    @property
+    def aa_mols(self) -> Dict[str, Chem.Mol]:
+        """
+        Dictionary of amino acid RDKit molecules.
+
+        Returns
+        -------
+        Dict[str, Chem.Mol]
+            A dictionary with amino acid codes as keys and their RDKit molecules as values.
+        """
+        if self._aa_mols is None:
+            self._aa_mols = {
+                aa: Chem.MolFromSmiles(smiles) for aa, smiles in self.aa_smiles.items()
+            }
+        return self._aa_mols
+
+    @property
+    def n_term_modifications(self) -> Dict[str, str]:
+        """
+        Dictionary of N-terminal modifications.
+
+        Returns
+        -------
+        Dict[str, str]
+            A dictionary with modification names as keys and their SMILES as values.
+        """
+        if self._n_term_modifications is None:
+            self._n_term_modifications = self._get_modifications(
+                "@Protein_N-term", "@Any_N-term"
+            )
+        return self._n_term_modifications
+
+    @property
+    def c_term_modifications(self) -> Dict[str, str]:
+        """
+        Dictionary of C-terminal modifications.
+
+        Returns
+        -------
+        Dict[str, str]
+            A dictionary with modification names as keys and their SMILES as values.
+        """
+        if self._c_term_modifications is None:
+            self._c_term_modifications = self._get_modifications(
+                "@Protein_C-term", "@Any_C-term"
+            )
+        return self._c_term_modifications
+
+    @property
+    def ptm_dict(self) -> Dict[str, str]:
+        """
+        Dictionary of post-translational modifications (PTMs).
+
+        Returns
+        -------
+        Dict[str, str]
+            A dictionary with modification names as keys and their SMILES as values.
+        """
+        if self._ptm_dict is None:
+            self._ptm_dict = {
+                name: smiles
+                for name, smiles in self._get_modifications("@").items()
+                if name not in self.n_term_modifications
+                and name not in self.c_term_modifications
+            }
+        return self._ptm_dict
+
+    def _get_modifications(self, *terms: str) -> Dict[str, str]:
+        """
+        Helper method to get modifications based on specific terms.
+
+        Parameters
+        ----------
+        *terms : str
+            Terms to filter modifications by.
+
+        Returns
+        -------
+        Dict[str, str]
+            A dictionary of modifications with names as keys and SMILES as values.
+        """
+        mod_df_smiles = MOD_DF[MOD_DF["smiles"] != ""]
+        return {
+            name: smiles
+            for name, smiles in zip(mod_df_smiles["mod_name"], mod_df_smiles["smiles"])
+            if any(term in name for term in terms)
+        }
+
+    def validate_correct_args(
+        self,
+        aa_smiles: str,
+        n_term_mod: Optional[str] = None,
+        c_term_mod: Optional[str] = None,
+    ) -> None:
+        """
+        Validate the amino acid, N-terminal modification, and C-terminal modification.
+
+        Parameters
+        ----------
+        aa_smiles : str
+            SMILES of an amino acid with or without PTMs, must have the placeholder atoms [Xe] for N-term and [Rn] for C-term.
+        n_term_mod : Optional[str], optional
+            N-terminal modification name. Options are the keys in the n_term_modifications dict.
+        c_term_mod : Optional[str], optional
+            C-terminal modification name. Options are the keys in the c_term_modifications dict.
+
+        Raises
+        ------
+        ValueError
+            If the amino acid is invalid, or if N-terminal / C-terminal modification is not recognized.
+        """
+        errors = []
+
+        if self.N_TERM_PLACEHOLDER not in aa_smiles:
+            errors.append(
+                f"The amino acid {aa_smiles} must contain the N-terminal placeholder {self.N_TERM_PLACEHOLDER}."
+            )
+        if self.C_TERM_PLACEHOLDER not in aa_smiles:
+            errors.append(
+                f"The amino acid {aa_smiles} must contain the C-terminal placeholder {self.C_TERM_PLACEHOLDER}."
+            )
+
+        if n_term_mod is not None and n_term_mod not in self.n_term_modifications:
+            errors.append(f"Unrecognized N-terminal modification: {n_term_mod}")
+
+        if c_term_mod is not None and c_term_mod not in self.c_term_modifications:
+            errors.append(f"Unrecognized C-terminal modification: {c_term_mod}")
+
+        mol = Chem.MolFromSmiles(aa_smiles)
+        if mol is None:
+            errors.append(f"Invalid amino acid SMILES: {aa_smiles}")
+
+        if errors:
+            raise ValueError("\n".join(errors))
+
+    def modify_amino_acid(
+        self,
+        aa_smiles: str,
+        n_term_mod: Optional[str] = None,
+        c_term_mod: Optional[str] = None,
+    ) -> str:
+        """
+        Modify an amino acid with N-terminal and C-terminal modifications.
+
+        Parameters
+        ----------
+        aa_smiles : str
+            SMILES of an amino acid with or without PTMs, must have the placeholder atoms [Xe] for N-term and [Rn] for C-term.
+        n_term_mod : Optional[str], optional
+            N-terminal modification name. Options are the keys in the n_term_modifications dict.
+            If None, the amino acid is not modified, so using hydrogen atoms.
+        c_term_mod : Optional[str], optional
+            C-terminal modification name. Options are the keys in the c_term_modifications dict.
+            If None, the amino acid is not modified, so using the -OH group.
+
+        Returns
+        -------
+        str
+            SMILES string of the modified amino acid.
+        """
+        self.validate_correct_args(aa_smiles, n_term_mod, c_term_mod)
+        mol = Chem.MolFromSmiles(aa_smiles)
+        n_term_placeholder_mol = Chem.MolFromSmiles(
+            self.N_TERM_PLACEHOLDER, sanitize=False
+        )
+        c_term_placeholder_mol = Chem.MolFromSmiles(
+            self.C_TERM_PLACEHOLDER, sanitize=False
+        )
+
+        mol = self._apply_n_terminal_modification(
+            mol, n_term_placeholder_mol, n_term_mod
+        )
+        mol = self._apply_c_terminal_modification(
+            mol, c_term_placeholder_mol, c_term_mod
+        )
+
+        SanitizeMol(mol)
+        return Chem.MolToSmiles(mol)
+
+    def _apply_n_terminal_modification(
+        self, mol: Chem.Mol, n_term_placeholder_mol: Chem.Mol, n_term_mod: Optional[str]
+    ) -> Chem.Mol:
+        """
+        Apply N-terminal modification to the molecule.
+
+        Parameters
+        ----------
+        mol : Chem.Mol
+            The molecule to modify.
+        n_term_placeholder_mol : Chem.Mol
+            The N-terminal placeholder molecule.
+        n_term_mod : Optional[str]
+            The N-terminal modification to apply.
+
+        Returns
+        -------
+        Chem.Mol
+            The modified molecule.
+        """
+        if n_term_mod:
+            n_mod_smiles = self.n_term_modifications[n_term_mod]
+            n_mod_mol = Chem.MolFromSmiles(n_mod_smiles)
+            mol = ReplaceSubstructs(mol, n_term_placeholder_mol, n_mod_mol)[0]
+
+            if "Dimethyl" in n_term_mod:
+                # Dimethyl: attach the same group once more to the remaining placeholder, if any
+                return ReplaceSubstructs(mol, n_term_placeholder_mol, n_mod_mol)[0]
+
+        # replacing all leftover N-terminal placeholders with hydrogen atoms
+        return ReplaceSubstructs(
+            mol,
+            n_term_placeholder_mol,
+            Chem.MolFromSmiles("[H]", sanitize=False),
+            replaceAll=True,
+        )[0]
+
+    def _apply_c_terminal_modification(
+        self, mol: Chem.Mol, c_term_placeholder_mol: Chem.Mol, c_term_mod: Optional[str]
+    ) -> Chem.Mol:
+        """
+        Apply C-terminal modification to the molecule.
+
+        Parameters
+        ----------
+        mol : Chem.Mol
+            The molecule to modify.
+        c_term_placeholder_mol : Chem.Mol
+            The C-terminal placeholder molecule.
+        c_term_mod : Optional[str]
+            The C-terminal modification to apply.
+
+        Returns
+        -------
+        Chem.Mol
+            The modified molecule.
+        """
+        if c_term_mod:
+            c_mod_smiles = self.c_term_modifications[c_term_mod]
+            c_mod_mol = Chem.MolFromSmiles(c_mod_smiles)
+            return ReplaceSubstructs(mol, c_term_placeholder_mol, c_mod_mol)[0]
+        return ReplaceSubstructs(
+            mol, c_term_placeholder_mol, Chem.MolFromSmiles("O", sanitize=False)
+        )[0]
diff --git a/docs/nbs/adding_smiles.ipynb b/docs/nbs/adding_smiles.ipynb
new file mode 100644
index 00000000..e0899474
--- /dev/null
+++ b/docs/nbs/adding_smiles.ipynb
@@ -0,0 +1,885 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Post-Translational Modification (PTM) SMILES Notebook\n",
+ "\n",
+ "This notebook is designed to update a Unimod table with post-translational modifications (PTMs) by adding SMILES formulas for some of the PTMs. We'll also calculate their molecular weights to validate the correctness of the structures against the ground truth data in the table.\n",
+ "\n",
+ "## Setup and Imports\n",
+ "\n",
+ "First, we import the necessary libraries and modules:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import pandas as pd\n",
+ "from rdkit import Chem\n",
+ "\n",
+ "from alphabase.constants._const import CONST_FILE_FOLDER\n",
+ "from alphabase.constants.atom import ChemicalCompositonFormula\n",
+ "from alphabase.constants.aa import aa_formula\n",
+ "\n",
+ "from alphabase.constants.modification import MOD_DF, add_new_modifications\n",
+ "\n",
+ "from alphabase.smiles.smiles import AminoAcidModifier\n",
+ "\n",
+ "aa_modifier = AminoAcidModifier()\n",
+ "modify_amino_acid = aa_modifier.modify_amino_acid\n",
+ "n_term_modifications_smi = aa_modifier.n_term_modifications\n",
+ "c_term_modifications_smi = aa_modifier.c_term_modifications\n",
+ "ptm_dict_smi = aa_modifier.ptm_dict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We then define dictionaries with SMILES structures for N-terminal modifications, C-terminal modifications, and PTMs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "defaultdict(int, {'O': -1, 'H': 2, 'C': 1})"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(ChemicalCompositonFormula(\"C(1)H(4)\") - ChemicalCompositonFormula(\"H(2)O(1)\")).elements"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ChemicalCompositonFormula('C(1)H(2)O(-1)')"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ChemicalCompositonFormula(\"C(1)H(2)O(-1)\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n_term_modifications = {'mTRAQ@Any_N-term': 'C(=O)CN1CCN(CC1)C',\n",
+ " 'mTRAQ:13C(3)15N(1)@Any_N-term': 'C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C',\n",
+ " 'mTRAQ:13C(6)15N(2)@Any_N-term': 'C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H])',\n",
+ " 'Acetyl@Any_N-term': 'C(=O)C',\n",
+ " 'Propionyl@Any_N-term': 'C(=O)CC',\n",
+ " 'Biotin@Any_N-term': 'C(=O)CCCCC1SCC2NC(=O)NC21',\n",
+ " 'Carbamidomethyl@Any_N-term': 'C(=O)NC',\n",
+ " 'Carbamyl@Any_N-term': 'C(=O)N',\n",
+ " 'Propionamide@Any_N-term': 'CCC(N)=O',\n",
+ " 'Pyridylacetyl@Any_N-term': 'C(=O)Cc1ccccn1',\n",
+ " 'Methyl@Any_N-term': 'C',\n",
+ " 'Dimethyl@Any_N-term': 'C',\n",
+ " 'Dimethyl:2H(6)13C(2)@Any_N-term': '[13C]([2H])([2H])([2H])',\n",
+ " 'Dimethyl:2H(4)@Any_N-term': 'C([2H])([2H])([1H])',\n",
+ " 'Dimethyl:2H(4)13C(2)@Any_N-term': '[13C]([2H])([2H])([1H])'}\n",
+ "\n",
+ "\n",
+ "c_term_modifications = {'Methyl@Any_C-term': 'OC',\n",
+ " 'Ethyl@Any_C-term': 'OCC',\n",
+ " 'Propyl@Any_C-term': 'OCCC',\n",
+ " 'Amidated@Any_C-term': 'N',\n",
+ " 'Cation:Na@Any_C-term': 'O[Na]',\n",
+ " 'Cation:K@Any_C-term': 'O[K]',\n",
+ " 'Cation:Cu[I]@Any_C-term': 'O[Cu]',\n",
+ " 'Cation:Li@Any_C-term': 'O[Li]'}\n",
+ "\n",
+ "\n",
+ "ptm_dict = {'Carbamidomethyl@C': 'C(C(C(=O)[Rn])N([Xe])([Xe]))SCC(=O)N',\n",
+ " 'Oxidation@M': 'O=C([Rn])C(N([Xe])([Xe]))CCS(=O)C',\n",
+ " 'GlyGly@K': 'NCC(=O)NCC(=O)NCCCC[C@H](N([Xe])([Xe]))C([Rn])=O',\n",
+ " 'Deamidated@N': 'C([C@@H](C(=O)[Rn])N([Xe])([Xe]))C(=O)O',\n",
+ " 'Propionyl@K': 'CCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])',\n",
+ " 'Deamidated@Q': 'C(CC(=O)O)[C@@H](C(=O)[Rn])N([Xe])([Xe])',\n",
+ " 'Gln->pyro-Glu@Q^Any_N-term': 'O=C([Rn])[C@H]1N([Xe])C(=O)CC1',\n",
+ " 'Glu->pyro-Glu@E^Any_N-term': 'O=C([Rn])[C@H]1N([Xe])C(=O)CC1',\n",
+ " 'Phospho@S': 'O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe])',\n",
+ " 'Nitro@Y': 'O=[N+]([O-])c1cc(ccc1O)C[C@@H](C(=O)[Rn])N([Xe])([Xe])',\n",
+ " 'Acetyl@K': 'CC(=O)NCCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]',\n",
+ " 'Dimethyl@K': 'CN(C)CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]',\n",
+ " 'mTRAQ@K': '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)CN1CCN(C)CC1',\n",
+ " 'mTRAQ:13C(3)15N(1)@K': '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1CCN(C)[13CH2][13CH2]1',\n",
+ " 'mTRAQ:13C(6)15N(2)@K': '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1[13CH2][13CH2][15N]([13CH3])[13CH2][13CH2]1',\n",
+ " 'Pyridylethyl@C': 'C1=CN=CC=C1CCSCC(C(=O)[Rn])N([Xe])([Xe])',\n",
+ " 'Butyryl@K': 'CCCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])',\n",
+ " 'Phospho@T': 'CC(C(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O',\n",
+ " 'Methylthio@C': 'CSSC[C@H](N([Xe])([Xe]))C([Rn])=O',\n",
+ " 'Carbamidomethyl@M': 'CS(CCC(N([Xe])([Xe]))C([Rn])=O)=CC(N)=O',\n",
+ " 'Succinyl@K': 'C(CCN)CC(C(=O)[Rn])N([Xe])C(=O)CCC(=O)O',\n",
+ " 'Crotonyl@K': 'CC=CC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])',\n",
+ " 'Phospho@Y': 'C1=CC(=CC=C1CC(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O',\n",
+ " 'Malonyl@K': 'N([Xe])([Xe])[C@@H](CCCC(NC(=O)CC(=O)O))C(=O)[Rn]',\n",
+ " 'Met->Hse@M^Any_C-term': 'N([Xe])([Xe])[C@H](C(=O)[Rn])CCO',\n",
+ " 'Pro->(2S,4R)-4-fluoroproline@P': 'F[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]',\n",
+ " 'Pro->(2S,4S)-4fluoroproline@P': 'F[C@H]1C[C@H](N([Xe])C1)C(=O)[Rn]',\n",
+ " 'Pro->(2S)-1,3-thiazolidine-2-carboxylic_acid@P': 'S1[C@H](N([Xe])CC1)C(=O)[Rn]',\n",
+ " 'Pro->(4R)-1,3-Thiazolidine-4-carboxylic_acid@P': 'S1CN([Xe])[C@@H](C1)C(=O)[Rn]',\n",
+ " 'Pro->(2S,4R)-4-hydroxyproline@P': 'O[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]',\n",
+ " 'Pro->(DL)-pipecolic_acid@P': 'C1CCN([Xe])C(C1)C(=O)[Rn]',\n",
+ " 'Pro->3,4-Dehydro-L-proline@P': 'C1C=CC(N1([Xe]))C(=O)[Rn]',\n",
+ " 'Pro->(1S,3S,5S)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P': '[C@H]12N([Xe])[C@@H](C[C@@H]2C1)C(=O)[Rn]',\n",
+ " 'Pro->(1R,3S,5R)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P': '[C@@H]12N([Xe])[C@@H](C[C@H]2C1)C(=O)[Rn]',\n",
+ " 'Pro->(2S,3aS,7aS)-Octahydro-1H-indole-2-carboxylic_acid@P': 'N1([Xe])[C@@H](C[C@@H]2CCCC[C@H]12)C(=O)[Rn]',\n",
+ " 'Pro->(DL)-5-trifluoromethylproline@P': 'FC(C1CCC(N1([Xe]))C(=O)[Rn])(F)F'}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "These dictionaries contain the SMILES representations of various modifications.\n",
+ "\n",
+ "## Updating Modification Dictionaries\n",
+ "\n",
+    "We overwrite the existing modification dictionaries with our new SMILES representations (this is useful when `modification.tsv` has not been updated yet, in which case the `n_term_modifications` of `alphabase.smiles.smiles.AminoAcidModifier` would otherwise be empty):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for i in n_term_modifications:\n",
+ " n_term_modifications_smi[i] = n_term_modifications[i]\n",
+ "\n",
+ "for i in c_term_modifications:\n",
+ " c_term_modifications_smi[i] = c_term_modifications[i]\n",
+ "\n",
+ "for i in ptm_dict:\n",
+ " ptm_dict_smi[i] = ptm_dict[i]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Validation of Amino Acid Formulas\n",
+ "\n",
+ "Next, we validate the amino acid formulas by comparing their chemical compositions:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for aa in aa_formula.index:\n",
+ " aa_row = aa_formula.loc[aa]\n",
+ " if pd.isna(aa_row[\"smiles\"]):\n",
+ " continue\n",
+ " aa_smiles = modify_amino_acid(aa_row[\"smiles\"])\n",
+ " chem_composition = ChemicalCompositonFormula.from_smiles(aa_smiles)\n",
+ " assert str(chem_composition - ChemicalCompositonFormula(aa_row[\"formula\"]) - ChemicalCompositonFormula(\"H(2)O(1)\")) == \"\"\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "This step ensures that the chemical compositions derived from SMILES match the known formulas for each amino acid.\n",
+ "\n",
+ "## Processing PTMs\n",
+ "\n",
+ "We then process the PTMs, calculating their compositions and creating a dictionary of modifications to add:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Carbamidomethyl@C': {'composition': 'C(2)H(3)N(1)O(1)',\n",
+ " 'smiles': 'C(C(C(=O)[Rn])N([Xe])([Xe]))SCC(=O)N'},\n",
+ " 'Oxidation@M': {'composition': 'O(1)',\n",
+ " 'smiles': 'O=C([Rn])C(N([Xe])([Xe]))CCS(=O)C'},\n",
+ " 'GlyGly@K': {'composition': 'C(4)H(6)N(2)O(2)',\n",
+ " 'smiles': 'NCC(=O)NCC(=O)NCCCC[C@H](N([Xe])([Xe]))C([Rn])=O'},\n",
+ " 'Deamidated@N': {'composition': 'H(-1)N(-1)O(1)',\n",
+ " 'smiles': 'C([C@@H](C(=O)[Rn])N([Xe])([Xe]))C(=O)O'},\n",
+ " 'Propionyl@K': {'composition': 'C(3)H(4)O(1)',\n",
+ " 'smiles': 'CCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])'},\n",
+ " 'Deamidated@Q': {'composition': 'H(-1)N(-1)O(1)',\n",
+ " 'smiles': 'C(CC(=O)O)[C@@H](C(=O)[Rn])N([Xe])([Xe])'},\n",
+ " 'Gln->pyro-Glu@Q^Any_N-term': {'composition': 'H(-3)N(-1)',\n",
+ " 'smiles': 'O=C([Rn])[C@H]1N([Xe])C(=O)CC1'},\n",
+ " 'Glu->pyro-Glu@E^Any_N-term': {'composition': 'H(-2)O(-1)',\n",
+ " 'smiles': 'O=C([Rn])[C@H]1N([Xe])C(=O)CC1'},\n",
+ " 'Phospho@S': {'composition': 'H(1)O(3)P(1)',\n",
+ " 'smiles': 'O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe])'},\n",
+ " 'Nitro@Y': {'composition': 'H(-1)N(1)O(2)',\n",
+ " 'smiles': 'O=[N+]([O-])c1cc(ccc1O)C[C@@H](C(=O)[Rn])N([Xe])([Xe])'},\n",
+ " 'Acetyl@K': {'composition': 'C(2)H(2)O(1)',\n",
+ " 'smiles': 'CC(=O)NCCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]'},\n",
+ " 'Dimethyl@K': {'composition': 'C(2)H(4)',\n",
+ " 'smiles': 'CN(C)CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn]'},\n",
+ " 'mTRAQ@K': {'composition': 'C(7)H(12)N(2)O(1)',\n",
+ " 'smiles': '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)CN1CCN(C)CC1'},\n",
+ " 'mTRAQ:13C(3)15N(1)@K': {'composition': '13C(3)15N(1)C(4)H(12)N(1)O(1)',\n",
+ " 'smiles': '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1CCN(C)[13CH2][13CH2]1'},\n",
+ " 'mTRAQ:13C(6)15N(2)@K': {'composition': '13C(6)15N(2)C(1)H(12)O(1)',\n",
+ " 'smiles': '[H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1[13CH2][13CH2][15N]([13CH3])[13CH2][13CH2]1'},\n",
+ " 'Pyridylethyl@C': {'composition': 'C(7)H(7)N(1)',\n",
+ " 'smiles': 'C1=CN=CC=C1CCSCC(C(=O)[Rn])N([Xe])([Xe])'},\n",
+ " 'Butyryl@K': {'composition': 'C(4)H(6)O(1)',\n",
+ " 'smiles': 'CCCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])'},\n",
+ " 'Phospho@T': {'composition': 'H(1)O(3)P(1)',\n",
+ " 'smiles': 'CC(C(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O'},\n",
+ " 'Methylthio@C': {'composition': 'C(1)H(2)S(1)',\n",
+ " 'smiles': 'CSSC[C@H](N([Xe])([Xe]))C([Rn])=O'},\n",
+ " 'Carbamidomethyl@M': {'composition': 'C(2)H(3)N(1)O(1)',\n",
+ " 'smiles': 'CS(CCC(N([Xe])([Xe]))C([Rn])=O)=CC(N)=O'},\n",
+ " 'Succinyl@K': {'composition': 'C(4)H(4)O(3)',\n",
+ " 'smiles': 'C(CCN)CC(C(=O)[Rn])N([Xe])C(=O)CCC(=O)O'},\n",
+ " 'Crotonyl@K': {'composition': 'C(4)H(4)O(1)',\n",
+ " 'smiles': 'CC=CC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe])'},\n",
+ " 'Phospho@Y': {'composition': 'H(1)O(3)P(1)',\n",
+ " 'smiles': 'C1=CC(=CC=C1CC(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O'},\n",
+ " 'Malonyl@K': {'composition': 'C(3)H(2)O(3)',\n",
+ " 'smiles': 'N([Xe])([Xe])[C@@H](CCCC(NC(=O)CC(=O)O))C(=O)[Rn]'},\n",
+ " 'Met->Hse@M^Any_C-term': {'composition': 'C(-1)H(-2)O(1)S(-1)',\n",
+ " 'smiles': 'N([Xe])([Xe])[C@H](C(=O)[Rn])CCO'},\n",
+ " 'Pro->(2S,4R)-4-fluoroproline@P': {'composition': 'F(1)H(-1)',\n",
+ " 'smiles': 'F[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]'},\n",
+ " 'Pro->(2S,4S)-4fluoroproline@P': {'composition': 'F(1)H(-1)',\n",
+ " 'smiles': 'F[C@H]1C[C@H](N([Xe])C1)C(=O)[Rn]'},\n",
+ " 'Pro->(2S)-1,3-thiazolidine-2-carboxylic_acid@P': {'composition': 'C(-1)H(-2)S(1)',\n",
+ " 'smiles': 'S1[C@H](N([Xe])CC1)C(=O)[Rn]'},\n",
+ " 'Pro->(4R)-1,3-Thiazolidine-4-carboxylic_acid@P': {'composition': 'C(-1)H(-2)S(1)',\n",
+ " 'smiles': 'S1CN([Xe])[C@@H](C1)C(=O)[Rn]'},\n",
+ " 'Pro->(2S,4R)-4-hydroxyproline@P': {'composition': 'O(1)',\n",
+ " 'smiles': 'O[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn]'},\n",
+ " 'Pro->(DL)-pipecolic_acid@P': {'composition': 'C(1)H(2)',\n",
+ " 'smiles': 'C1CCN([Xe])C(C1)C(=O)[Rn]'},\n",
+ " 'Pro->3,4-Dehydro-L-proline@P': {'composition': 'H(-2)',\n",
+ " 'smiles': 'C1C=CC(N1([Xe]))C(=O)[Rn]'},\n",
+ " 'Pro->(1S,3S,5S)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P': {'composition': 'C(1)',\n",
+ " 'smiles': '[C@H]12N([Xe])[C@@H](C[C@@H]2C1)C(=O)[Rn]'},\n",
+ " 'Pro->(1R,3S,5R)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P': {'composition': 'C(1)',\n",
+ " 'smiles': '[C@@H]12N([Xe])[C@@H](C[C@H]2C1)C(=O)[Rn]'},\n",
+ " 'Pro->(2S,3aS,7aS)-Octahydro-1H-indole-2-carboxylic_acid@P': {'composition': 'C(4)H(6)',\n",
+ " 'smiles': 'N1([Xe])[C@@H](C[C@@H]2CCCC[C@H]12)C(=O)[Rn]'},\n",
+ " 'Pro->(DL)-5-trifluoromethylproline@P': {'composition': 'C(1)F(3)H(-1)',\n",
+ " 'smiles': 'FC(C1CCC(N1([Xe]))C(=O)[Rn])(F)F'}}"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ptms_to_add = {}\n",
+ "\n",
+ "for ptm in ptm_dict:\n",
+ " smi = modify_amino_acid(ptm_dict[ptm])\n",
+ " ptm_formula = ChemicalCompositonFormula.from_smiles(smi)\n",
+ " original_aa = ptm.split(\"@\")[1].split(\"^\")[0]\n",
+ " if original_aa.startswith(\"Any\"):\n",
+ " original_aa = \"A\"\n",
+ " original_aa_brutto_formula = aa_formula.loc[original_aa, \"formula\"]\n",
+ " ptms_to_add[ptm] = {\"composition\": str(ptm_formula - ChemicalCompositonFormula(original_aa_brutto_formula) - ChemicalCompositonFormula(\"H(2)O(1)\")),\n",
+ " \"smiles\": ptm_dict[ptm]}\n",
+ "ptms_to_add"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We then add these modifications to the modification dataframe with `add_new_modifications`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "add_new_modifications(ptms_to_add)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Processing N-terminal Modifications\n",
+ "\n",
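+ "Similar to PTMs, we process N-terminal modifications. Each modification is attached to alanine (`A`) as a reference amino acid, the formula of alanine plus one water is subtracted to obtain the composition of the modification, and the result is stored for both the `Any_N-term` and `Protein_N-term` sites:\n"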
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'mTRAQ@Any_N-term': {'composition': 'C(7)H(12)N(2)O(1)',\n",
+ " 'smiles': 'C(=O)CN1CCN(CC1)C'},\n",
+ " 'mTRAQ@Protein_N-term': {'composition': 'C(7)H(12)N(2)O(1)',\n",
+ " 'smiles': 'C(=O)CN1CCN(CC1)C'},\n",
+ " 'mTRAQ:13C(3)15N(1)@Any_N-term': {'composition': '13C(3)15N(1)C(4)H(12)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C'},\n",
+ " 'mTRAQ:13C(3)15N(1)@Protein_N-term': {'composition': '13C(3)15N(1)C(4)H(12)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C'},\n",
+ " 'mTRAQ:13C(6)15N(2)@Any_N-term': {'composition': '13C(6)15N(2)C(1)H(12)O(1)',\n",
+ " 'smiles': 'C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H])'},\n",
+ " 'mTRAQ:13C(6)15N(2)@Protein_N-term': {'composition': '13C(6)15N(2)C(1)H(12)O(1)',\n",
+ " 'smiles': 'C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H])'},\n",
+ " 'Acetyl@Any_N-term': {'composition': 'C(2)H(2)O(1)', 'smiles': 'C(=O)C'},\n",
+ " 'Acetyl@Protein_N-term': {'composition': 'C(2)H(2)O(1)', 'smiles': 'C(=O)C'},\n",
+ " 'Propionyl@Any_N-term': {'composition': 'C(3)H(4)O(1)', 'smiles': 'C(=O)CC'},\n",
+ " 'Propionyl@Protein_N-term': {'composition': 'C(3)H(4)O(1)',\n",
+ " 'smiles': 'C(=O)CC'},\n",
+ " 'Biotin@Any_N-term': {'composition': 'C(10)H(14)N(2)O(2)S(1)',\n",
+ " 'smiles': 'C(=O)CCCCC1SCC2NC(=O)NC21'},\n",
+ " 'Biotin@Protein_N-term': {'composition': 'C(10)H(14)N(2)O(2)S(1)',\n",
+ " 'smiles': 'C(=O)CCCCC1SCC2NC(=O)NC21'},\n",
+ " 'Carbamidomethyl@Any_N-term': {'composition': 'C(2)H(3)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)NC'},\n",
+ " 'Carbamidomethyl@Protein_N-term': {'composition': 'C(2)H(3)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)NC'},\n",
+ " 'Carbamyl@Any_N-term': {'composition': 'C(1)H(1)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)N'},\n",
+ " 'Carbamyl@Protein_N-term': {'composition': 'C(1)H(1)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)N'},\n",
+ " 'Propionamide@Any_N-term': {'composition': 'C(3)H(5)N(1)O(1)',\n",
+ " 'smiles': 'CCC(N)=O'},\n",
+ " 'Propionamide@Protein_N-term': {'composition': 'C(3)H(5)N(1)O(1)',\n",
+ " 'smiles': 'CCC(N)=O'},\n",
+ " 'Pyridylacetyl@Any_N-term': {'composition': 'C(7)H(5)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)Cc1ccccn1'},\n",
+ " 'Pyridylacetyl@Protein_N-term': {'composition': 'C(7)H(5)N(1)O(1)',\n",
+ " 'smiles': 'C(=O)Cc1ccccn1'},\n",
+ " 'Methyl@Any_N-term': {'composition': 'C(1)H(2)', 'smiles': 'C'},\n",
+ " 'Methyl@Protein_N-term': {'composition': 'C(1)H(2)', 'smiles': 'C'},\n",
+ " 'Dimethyl@Any_N-term': {'composition': 'C(2)H(4)', 'smiles': 'C'},\n",
+ " 'Dimethyl@Protein_N-term': {'composition': 'C(2)H(4)', 'smiles': 'C'},\n",
+ " 'Dimethyl:2H(6)13C(2)@Any_N-term': {'composition': '13C(2)2H(6)H(-2)',\n",
+ " 'smiles': '[13C]([2H])([2H])([2H])'},\n",
+ " 'Dimethyl:2H(6)13C(2)@Protein_N-term': {'composition': '13C(2)2H(6)H(-2)',\n",
+ " 'smiles': '[13C]([2H])([2H])([2H])'},\n",
+ " 'Dimethyl:2H(4)@Any_N-term': {'composition': '2H(4)C(2)',\n",
+ " 'smiles': 'C([2H])([2H])([1H])'},\n",
+ " 'Dimethyl:2H(4)@Protein_N-term': {'composition': '2H(4)C(2)',\n",
+ " 'smiles': 'C([2H])([2H])([1H])'},\n",
+ " 'Dimethyl:2H(4)13C(2)@Any_N-term': {'composition': '13C(2)2H(4)',\n",
+ " 'smiles': '[13C]([2H])([2H])([1H])'},\n",
+ " 'Dimethyl:2H(4)13C(2)@Protein_N-term': {'composition': '13C(2)2H(4)',\n",
+ " 'smiles': '[13C]([2H])([2H])([1H])'}}"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nterms_to_add = {}\n",
+ "\n",
+ "\n",
+ "for ptm in n_term_modifications:\n",
+ " original_mod = ptm.split(\"@\")[0]\n",
+ " smi = modify_amino_acid(aa_formula.loc[\"A\", \"smiles\"], n_term_mod=ptm)\n",
+ " ptm_formula = ChemicalCompositonFormula.from_smiles(smi)\n",
+ " original_aa_brutto_formula = aa_formula.loc[\"A\", \"formula\"]\n",
+ " suffixes = [\"Any_N-term\", \"Protein_N-term\"]\n",
+ " for suffix in suffixes:\n",
+ " nterms_to_add[original_mod + \"@\" + suffix] = {\"composition\": str(ptm_formula - ChemicalCompositonFormula(original_aa_brutto_formula) - ChemicalCompositonFormula(\"H(2)O(1)\")),\n",
+ " \"smiles\": n_term_modifications[ptm]}\n",
+ "nterms_to_add"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "add_new_modifications(nterms_to_add)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Processing C-terminal Modifications\n",
+ "\n",
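+ "We process C-terminal modifications in the same way, again using alanine (`A`) as the reference amino acid and storing each result for both the `Any_C-term` and `Protein_C-term` sites:\n"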
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Methyl@Any_C-term': {'composition': 'C(1)H(2)', 'smiles': 'OC'},\n",
+ " 'Methyl@Protein_C-term': {'composition': 'C(1)H(2)', 'smiles': 'OC'},\n",
+ " 'Ethyl@Any_C-term': {'composition': 'C(2)H(4)', 'smiles': 'OCC'},\n",
+ " 'Ethyl@Protein_C-term': {'composition': 'C(2)H(4)', 'smiles': 'OCC'},\n",
+ " 'Propyl@Any_C-term': {'composition': 'C(3)H(6)', 'smiles': 'OCCC'},\n",
+ " 'Propyl@Protein_C-term': {'composition': 'C(3)H(6)', 'smiles': 'OCCC'},\n",
+ " 'Amidated@Any_C-term': {'composition': 'H(1)N(1)O(-1)', 'smiles': 'N'},\n",
+ " 'Amidated@Protein_C-term': {'composition': 'H(1)N(1)O(-1)', 'smiles': 'N'},\n",
+ " 'Cation:Na@Any_C-term': {'composition': 'H(-1)Na(1)', 'smiles': 'O[Na]'},\n",
+ " 'Cation:Na@Protein_C-term': {'composition': 'H(-1)Na(1)', 'smiles': 'O[Na]'},\n",
+ " 'Cation:K@Any_C-term': {'composition': 'H(-1)K(1)', 'smiles': 'O[K]'},\n",
+ " 'Cation:K@Protein_C-term': {'composition': 'H(-1)K(1)', 'smiles': 'O[K]'},\n",
+ " 'Cation:Cu[I]@Any_C-term': {'composition': 'Cu(1)H(-1)', 'smiles': 'O[Cu]'},\n",
+ " 'Cation:Cu[I]@Protein_C-term': {'composition': 'Cu(1)H(-1)',\n",
+ " 'smiles': 'O[Cu]'},\n",
+ " 'Cation:Li@Any_C-term': {'composition': 'H(-1)Li(1)', 'smiles': 'O[Li]'},\n",
+ " 'Cation:Li@Protein_C-term': {'composition': 'H(-1)Li(1)', 'smiles': 'O[Li]'}}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cterms_to_add = {}\n",
+ "\n",
+ "\n",
+ "for ptm in c_term_modifications:\n",
+ " original_mod = ptm.split(\"@\")[0]\n",
+ " smi = modify_amino_acid(aa_formula.loc[\"A\", \"smiles\"], c_term_mod=ptm)\n",
+ " ptm_formula = ChemicalCompositonFormula.from_smiles(smi)\n",
+ " original_aa_brutto_formula = aa_formula.loc[\"A\", \"formula\"]\n",
+ " suffixes = [\"Any_C-term\", \"Protein_C-term\"]\n",
+ " for suffix in suffixes:\n",
+ " composition = str(ptm_formula - ChemicalCompositonFormula(original_aa_brutto_formula) - ChemicalCompositonFormula(\"H(2)O(1)\"))\n",
+ " cterms_to_add[original_mod + \"@\" + suffix] = {\"composition\": composition,\n",
+ " \"smiles\": c_term_modifications[ptm]}\n",
+ "cterms_to_add"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "add_new_modifications(cterms_to_add)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Final Steps\n",
+ "\n",
+ "Finally, we examine the updated modification database:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mod_name | \n",
+ " unimod_mass | \n",
+ " unimod_avge_mass | \n",
+ " composition | \n",
+ " unimod_modloss | \n",
+ " modloss_composition | \n",
+ " classification | \n",
+ " unimod_id | \n",
+ " smiles | \n",
+ " modloss_importance | \n",
+ " mass | \n",
+ " modloss_original | \n",
+ " modloss | \n",
+ "
\n",
+ " \n",
+ " mod_name | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Acetyl@T | \n",
+ " Acetyl@T | \n",
+ " 42.010565 | \n",
+ " 42.0367 | \n",
+ " H(2)C(2)O(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " Post-translational | \n",
+ " 1 | \n",
+ " | \n",
+ " 0.0 | \n",
+ " 42.010565 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Acetyl@Protein_N-term | \n",
+ " Acetyl@Protein_N-term | \n",
+ " 42.010565 | \n",
+ " 42.0367 | \n",
+ " H(2)C(2)O(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " Post-translational | \n",
+ " 1 | \n",
+ " C(=O)C | \n",
+ " 0.0 | \n",
+ " 42.010565 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Acetyl@S | \n",
+ " Acetyl@S | \n",
+ " 42.010565 | \n",
+ " 42.0367 | \n",
+ " H(2)C(2)O(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " Post-translational | \n",
+ " 1 | \n",
+ " | \n",
+ " 0.0 | \n",
+ " 42.010565 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Acetyl@C | \n",
+ " Acetyl@C | \n",
+ " 42.010565 | \n",
+ " 42.0367 | \n",
+ " H(2)C(2)O(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " Post-translational | \n",
+ " 1 | \n",
+ " | \n",
+ " 0.0 | \n",
+ " 42.010565 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Acetyl@Any_N-term | \n",
+ " Acetyl@Any_N-term | \n",
+ " 42.010565 | \n",
+ " 42.0367 | \n",
+ " H(2)C(2)O(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " Multiple | \n",
+ " 1 | \n",
+ " C(=O)C | \n",
+ " 0.0 | \n",
+ " 42.010565 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " Ethyl@Protein_C-term | \n",
+ " Ethyl@Protein_C-term | \n",
+ " 0.000000 | \n",
+ " 0.0000 | \n",
+ " C(2)H(4) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " User-added | \n",
+ " 0 | \n",
+ " OCC | \n",
+ " 0.0 | \n",
+ " 28.031300 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Cation:Na@Protein_C-term | \n",
+ " Cation:Na@Protein_C-term | \n",
+ " 0.000000 | \n",
+ " 0.0000 | \n",
+ " H(-1)Na(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " User-added | \n",
+ " 0 | \n",
+ " O[Na] | \n",
+ " 0.0 | \n",
+ " 21.981944 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Cation:K@Protein_C-term | \n",
+ " Cation:K@Protein_C-term | \n",
+ " 0.000000 | \n",
+ " 0.0000 | \n",
+ " H(-1)K(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " User-added | \n",
+ " 0 | \n",
+ " O[K] | \n",
+ " 0.0 | \n",
+ " 37.955881 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Cation:Cu[I]@Protein_C-term | \n",
+ " Cation:Cu[I]@Protein_C-term | \n",
+ " 0.000000 | \n",
+ " 0.0000 | \n",
+ " Cu(1)H(-1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " User-added | \n",
+ " 0 | \n",
+ " O[Cu] | \n",
+ " 0.0 | \n",
+ " 61.921773 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " Cation:Li@Protein_C-term | \n",
+ " Cation:Li@Protein_C-term | \n",
+ " 0.000000 | \n",
+ " 0.0000 | \n",
+ " H(-1)Li(1) | \n",
+ " 0.0 | \n",
+ " | \n",
+ " User-added | \n",
+ " 0 | \n",
+ " O[Li] | \n",
+ " 0.0 | \n",
+ " 6.008178 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2796 rows × 13 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " mod_name unimod_mass \\\n",
+ "mod_name \n",
+ "Acetyl@T Acetyl@T 42.010565 \n",
+ "Acetyl@Protein_N-term Acetyl@Protein_N-term 42.010565 \n",
+ "Acetyl@S Acetyl@S 42.010565 \n",
+ "Acetyl@C Acetyl@C 42.010565 \n",
+ "Acetyl@Any_N-term Acetyl@Any_N-term 42.010565 \n",
+ "... ... ... \n",
+ "Ethyl@Protein_C-term Ethyl@Protein_C-term 0.000000 \n",
+ "Cation:Na@Protein_C-term Cation:Na@Protein_C-term 0.000000 \n",
+ "Cation:K@Protein_C-term Cation:K@Protein_C-term 0.000000 \n",
+ "Cation:Cu[I]@Protein_C-term Cation:Cu[I]@Protein_C-term 0.000000 \n",
+ "Cation:Li@Protein_C-term Cation:Li@Protein_C-term 0.000000 \n",
+ "\n",
+ " unimod_avge_mass composition unimod_modloss \\\n",
+ "mod_name \n",
+ "Acetyl@T 42.0367 H(2)C(2)O(1) 0.0 \n",
+ "Acetyl@Protein_N-term 42.0367 H(2)C(2)O(1) 0.0 \n",
+ "Acetyl@S 42.0367 H(2)C(2)O(1) 0.0 \n",
+ "Acetyl@C 42.0367 H(2)C(2)O(1) 0.0 \n",
+ "Acetyl@Any_N-term 42.0367 H(2)C(2)O(1) 0.0 \n",
+ "... ... ... ... \n",
+ "Ethyl@Protein_C-term 0.0000 C(2)H(4) 0.0 \n",
+ "Cation:Na@Protein_C-term 0.0000 H(-1)Na(1) 0.0 \n",
+ "Cation:K@Protein_C-term 0.0000 H(-1)K(1) 0.0 \n",
+ "Cation:Cu[I]@Protein_C-term 0.0000 Cu(1)H(-1) 0.0 \n",
+ "Cation:Li@Protein_C-term 0.0000 H(-1)Li(1) 0.0 \n",
+ "\n",
+ " modloss_composition classification \\\n",
+ "mod_name \n",
+ "Acetyl@T Post-translational \n",
+ "Acetyl@Protein_N-term Post-translational \n",
+ "Acetyl@S Post-translational \n",
+ "Acetyl@C Post-translational \n",
+ "Acetyl@Any_N-term Multiple \n",
+ "... ... ... \n",
+ "Ethyl@Protein_C-term User-added \n",
+ "Cation:Na@Protein_C-term User-added \n",
+ "Cation:K@Protein_C-term User-added \n",
+ "Cation:Cu[I]@Protein_C-term User-added \n",
+ "Cation:Li@Protein_C-term User-added \n",
+ "\n",
+ " unimod_id smiles modloss_importance mass \\\n",
+ "mod_name \n",
+ "Acetyl@T 1 0.0 42.010565 \n",
+ "Acetyl@Protein_N-term 1 C(=O)C 0.0 42.010565 \n",
+ "Acetyl@S 1 0.0 42.010565 \n",
+ "Acetyl@C 1 0.0 42.010565 \n",
+ "Acetyl@Any_N-term 1 C(=O)C 0.0 42.010565 \n",
+ "... ... ... ... ... \n",
+ "Ethyl@Protein_C-term 0 OCC 0.0 28.031300 \n",
+ "Cation:Na@Protein_C-term 0 O[Na] 0.0 21.981944 \n",
+ "Cation:K@Protein_C-term 0 O[K] 0.0 37.955881 \n",
+ "Cation:Cu[I]@Protein_C-term 0 O[Cu] 0.0 61.921773 \n",
+ "Cation:Li@Protein_C-term 0 O[Li] 0.0 6.008178 \n",
+ "\n",
+ " modloss_original modloss \n",
+ "mod_name \n",
+ "Acetyl@T 0.0 0.0 \n",
+ "Acetyl@Protein_N-term 0.0 0.0 \n",
+ "Acetyl@S 0.0 0.0 \n",
+ "Acetyl@C 0.0 0.0 \n",
+ "Acetyl@Any_N-term 0.0 0.0 \n",
+ "... ... ... \n",
+ "Ethyl@Protein_C-term 0.0 0.0 \n",
+ "Cation:Na@Protein_C-term 0.0 0.0 \n",
+ "Cation:K@Protein_C-term 0.0 0.0 \n",
+ "Cation:Cu[I]@Protein_C-term 0.0 0.0 \n",
+ "Cation:Li@Protein_C-term 0.0 0.0 \n",
+ "\n",
+ "[2796 rows x 13 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "MOD_DF[\"unimod_id\"] = MOD_DF[\"unimod_id\"].astype(int)\n",
+ "MOD_DF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This displays the final, updated database of modifications, including the newly added SMILES representations and calculated compositions.\n",
+ "If needed, we can save the updated database back to `modification.tsv` by uncommenting the following lines:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# orig_df = pd.read_csv(os.path.join(CONST_FILE_FOLDER, \"modification.tsv\"), sep=\"\\t\", index_col=0)\n",
+ "# MOD_DF[[\"mod_name\", *orig_df.columns]].to_csv(os.path.join(CONST_FILE_FOLDER, \"modification.tsv\"), sep=\"\\t\", index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "alphabase",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/nbs/tutorial_smiles.ipynb b/docs/nbs/tutorial_smiles.ipynb
new file mode 100644
index 00000000..c4dea1f8
--- /dev/null
+++ b/docs/nbs/tutorial_smiles.ipynb
@@ -0,0 +1,381 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Working with Amino Acids and Post-Translational Modifications using SMILES Notation\n",
+ "\n",
+ "This notebook shows how to work with amino acids and post-translational modifications (PTMs) at the molecular level, using the SMILES (Simplified Molecular Input Line Entry System) notation. We'll explore how to represent amino acids, add modifications, and visualize the results using RDKit.\n",
+ "\n",
+ "## Introduction to SMILES and RDKit\n",
+ "\n",
+ "SMILES is a line notation for describing the structure of chemical species using short ASCII strings. For example, the SMILES for ethanol is `CCO`.\n",
+ "\n",
+ "RDKit is an open-source cheminformatics software that we'll use to work with and visualize molecular structures.\n",
+ "\n",
+ "Let's start by importing the necessary functions and data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from rdkit import Chem\n",
+ "from rdkit.Chem import Draw\n",
+ "\n",
+ "from alphabase.smiles.smiles import AminoAcidModifier\n",
+ "\n",
+ "\n",
+ "aa_modifier = AminoAcidModifier()\n",
+ "modify_amino_acid = aa_modifier.modify_amino_acid\n",
+ "aa_smiles = aa_modifier.aa_smiles\n",
+ "n_term_modifications = aa_modifier.n_term_modifications\n",
+ "c_term_modifications = aa_modifier.c_term_modifications\n",
+ "ptm_dict = aa_modifier.ptm_dict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Understanding the Data Structure\n",
+ "\n",
+ "Our data is organized into several dictionaries:\n",
+ "\n",
+ "1. `aa_smiles`: Contains SMILES representations of amino acids\n",
+ "2. `n_term_modifications`: Contains N-terminal modifications\n",
+ "3. `c_term_modifications`: Contains C-terminal modifications\n",
+ "4. `ptm_dict`: Contains post-translational modifications\n",
+ "\n",
+ "Let's examine the SMILES representation of an amino acid, such as Lysine (K):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Lysine SMILES with dummy atoms: N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn]\n"
+ ]
+ },
+ {
+ "data": {
+ "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAEsASwDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAriPHupy2WqeGrQ67Jo1leXUyXV1G8aEKsLMo3SKVHzADp3rt653xB4el1nX/Dt5i3e1064mluI5uSwaFkGBgg8kHnFAHL6L46fT4NUa7vLjX9Nh1OGxsL+3iQvcGRRlfk2q+1vlyo5zXW6L4lGq6peaXc6bd6bqFrGkzQXJjbdG5IVlZGYHlSCM8GuP0nw9Lqmj2qaDf2Vz4dg1yDUdPfe4aOJJSZocbeiuG2+xwcYFdfDotzH49vNdLxfZZtNhtFUE796SSMSRjGMOO/rQBvUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQB5/wDB7914MubL/nz1S7gx6YkJ/wDZq9Arz/4YfuLvxrZf88vEdzIB6K4UivQKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKAPP/AAT+4+JPxBs+m26tJwP+ukJP9K9ArxifxHq
ekfGrxNJoGk/25C1pbm/ht5QJI9g2/L2LDdyvU/ga9A8OeP8Aw94nlNtaXZt9QTiSwvF8m4Q9xsPX8M0AdPRRRQAUUUUAFFFFABRRWZ4kvptL8L6vqFsVE9rZTTRlhkblQsMj6iqhBzkoLd6A9DToritc8bQWngR9SstTsH1MW8TiMSqx3sV3DbnPc8V2ta1cPUpRUpq12191v8xJphRXA+FfGmoajBqEGqrEl1tuLiwkRcLNFHI0ZH+8rLz7MDXUeGNQn1bwrpOo3RU3F1aRTSFRgbmUE4H41eIwdXD35+jt+f8Al94lJPY1qKKK5SgooooAKKKKACiuD174gSS6lJ4e8G2i6xrg4lkB/wBGs+26V+mR/dHPGOvFUILP4o+GF+1tqFj4ricB7izdBbSxt3ET4wR9fwAoA9LorkNA+JGha3ef2bcNNpOsDhtP1JPJlz/s54b2wc+1dfQAUUUUAFFFUtW1fT9D06XUNUu4rW0iGWllbA+g9T7Dk0AXa8y+IPxj0rwik9jpirqesIMNGhzFbn/pow7/AOyOfXFBvvE/xMOzTDc+HvCrcNeuu27vV/6Zj/lmp/vdfryK7TRPCmh+HtGOk6dp0EdmwxKrKGMx7lyfvH60AeO+Cv2hkldLPxdbLEScC+tlO0f76fnyPyr3LT9RstVso73T7qG6tpBlJYXDKfxFefeJvgd4Q1/fNa276VdNk77ThCT6oePyxXna/Dj4k/DW9e/8K3o1C1zl4oD/AKwDpvhbqef4ckUAfR1FeWeEvjXpmpXC6X4ntm0LVlOxhOCsTN06nlOc8Nxx1r1JWV1DKwZSMgg5BFAC0UVyfirx7YeHbiPTLWCXVdenH+j6Za8yN7uf4F9z27UAdFqWp2Oj2Et9qN1Fa2sQy8srbVH/ANf2rzw6v4l+JLGLw+Z9C8MNw+qyLtubtfSFT91T/eP9CKtad4E1DxDfxa34/uI72dDvttIh/wCPS0+o/wCWje54+oxXoQAVQqgAAYAHagDzC1+GmreCZ5r3wFq6DzsG4sNVQSJOR38xQGU9fxPUVR1bWvC/iGePS/iP4ck0DVukN3KfkJHeK5X88Hj6169Ve+sLPU7OS0v7WG6tpBh4pkDqfwNAHG2GneKvDFov9m6n/wAJNpv3kjvJALgJ2Cyjh/qfwrZ0nxlpep3H2OUyWGoDhrO8Xy3z7Z4b8Koy+DrvSpGuPCuqSWLE7ms7gmW3c/Q8rn1FZ+o6xY3SLYeO9A+yEnal2AZICf8AZkXlD7fnXM5yi7vT12+/p8z26eHoV4KMFzW6x0mvWLdpf9uu/dnf0VyFpHeWF8vhvRL5ikUX2qW7vszmNGOEjQZHoep4FaWl6lfx65PomqNDLOsAuYLiFCgkj3bTuUk4YHHQ4Oa1VS+6OCpg3FNxknZXts7d7fpe9tdjdooorQ4wrK8TWU+o+FNYsbVN9xc2M8MSkgbnZCAMngcmtWirpzcJqa3WoPU4fXvBsV18P3sLHSLIaqbeJQVjjVt4K7vm/A85rf1LWLzTr6APp8bWMtxFbif7R+8LSEAFY9vKgkZywPBOOOdmsO50K6ufEUOqHU8xQ48m2eAMsfGGKnP3mGRuIJAOB3z1wxHtfdrvRXet93bTT09N9ybW2Oa/4Q7UZfAUVsqrb67ZXN1c2jbgRl5ZDsJB+66Ng89/aksrbx34a0bSvsdtZaja29jBDPpjMI5o3VAG2Scq3Oev4DvXoVFV/aVRpqcVJNt2a79F1S9Hfz1YuRdDldD+IGi6xdf2fO02maqOGsNQTypM/wCznhvw59q6qsrXPDej+JLX7Pq+nw3SD7rMMMn+6w5H4GuRl8O+M/Cb+b4W1VdX05QP+JTqrfOoA6Rzdvo3ArGu8PKPNSun2eq+T/Rr5sav1PQ6K4jSPifo9zerpmuQXHh7V+n2XUhsV/8Ack+6w9DxnsKf4k+IVrpt8NF0K2bXPEMmQllbMCsX+1K/RAPfn6da5SjpNa1zTPD2mSa
jq15Fa2sfV5D1PoB1J9hzXBeZ4o+JvEP2nw54TfrIRtvb5fb/AJ5ofXqfcHi/ovgC4vdSi8QeN7tNW1ZfmgtQP9Es/aND1P8AtH+YzXfUAZmheH9K8NaZHp2kWcdrbJztQcsfVieWPua06KKAMjX/AAvoniiz+y61psF3GPul1w6f7rDlfwNch/wi/jPwf8/hTWf7Y05f+YTq7ZdR6RzdR7A8D3r0aigDiNI+J+kXN6uma5BceHtX6fZdSGxXP+xJ91h6dM9hXbggjIOQaoavoml69ZNZ6tYQXluf4JkDYPqPQ+45rgrj4Z67pa/ZPB/jO+0rS5jtktLhftAhX/pix5X6ZH1oA3fFHj600O8XR9NtpNY8QzD9zp1sclf9qRuiL7n+XNZuleAbzV9Ri17x5dR6lfod1vp0Y/0Oz+in77e5/XANdH4X8H6R4Rs2h02AmaU7ri7mO+adu7Ox5PPbpW9QAYwMCiiigAooooAytb8M6J4kt/I1jTLa8TGAZU+Zfow5H4GuasfBGqeFD/xSWtP9izn+ytUJlg6nhJB88fX0b3BruqKAPM73xP4s8Va/e+FNBsk0SaxWP+1NRnkWUwbxkLCo+8SOQxx3yFOK6rwr4L0jwjbyCxjeW8nO65vrht887dSWb69hxWDoX+j/ABs8WxdPtVhZz/XaClegUAFFFFABRRRQAU2SNJo2jlRXRuCrDIP4U6igE7HO31pf6d4kOs2Nmb2K4t1guIEkVJFKklXXcQD1IIyO1O0uzvrvxFPrl/a/YwLYWtvbs4d9u7czMVJAJOOAT0roKKz9mr3Op4uXJy2V7Wvre3328ttgooorQ5QooooAKKKKACiiigAooooAoavoel6/ZGz1awt7y3P8EyBsH1B6g+45qn4b8I6F4Rs5LXQ9PS1jkbdIdxdnPuzEk/TOBW3RQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFAHBeU9v8AHnzcDyrrw7t6jJdZ/Tr0rva4bW/9H+L/AIWl6fabK7g+u0B67muivRVONOS+1G/4tfoJO9wooornGFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQBx/irT7ybxp4O1C2tpZY7S5nWZ0UkRq8e3J9BxXYV5al1caB478R+IRI7aat/Fa6jHnISMwx7JgP9hic/7LH0rpvAUitpuqDeCW1i+KjPUec3T8xXrYvDSVCEr3UUkv8At68mn6X+aaZnF6s6yiiivJNAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACio5po7aCSeZ1jijUu7scBVAySarSavp0OkjVZL63TTygkF0ZB5ZU4wd3TByPzoAu0UgIZQR0PIrl/FXjvTfDDx2Sxy6jrNxxbaZaDdNIexIH3V9z74zigDory9ttOs5bu8uI7e3iXc8sjBVUe5rg31vX/AB87W/hnzNK0LJWTWJUxLOO4gU8gf7R/QjBXTfBuseJb1NY8ezxyhTuttEgOba37guf+Wj/p16g4r0FEWNFRFCqowFAwAPSuuhXpUY8yjefnsvl1frouzJab9DG03w1bWC6oss0t4upFTOLjB3YiWM5wBnIXJ+prn4vhXosWkW9ml1fx3FrLJJa30M5SaHe2SARwe3UV3VFEMdiINuM2r2/BWX4aBypnAi78ceE223tuvifSwf8Aj4tlEV3GP9pOj/hz6mtzw3458PeKtyaZqCG6TiS0mBjnjI6go3PHqMiuirm/EngTw/4qIl1Cy2XqYMd9bN5U8ZHQhxycehyKzr1/bWbik+ttL/LZfJIaVjpKK42x8KeIYND1LSrvxdd3SuFOnXmzZcW5GT87A/vBnb16jIPWsnwt4i1XxzriRSy/YYNAbZqSW04xd3YLKACpz5IwW56kgc7TWAz0iiu
AHxRtWullWwT+yGvRZC6+2J5u7f5e/wAn72zdxnOcc4rt7S9gvRMYGZvJlaF9yMuHXqBkDI9xxQBYooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigCuWuzqAj8mE2RiJMvmHf5mRhduMYxk5z+FeP6WLR/G9vpzSTf8IOmpSnSwyjyHvxg+XuzzGG8woMYLAgZwK9iu7WO9s5rWUuI5kMb+W5RsEYOGHIPuKz5/DOj3Hh1NAeyQaZGiokKErsC4KkMDkEEA5znPNAHCaxrnirUvEfiK10Y6rGNLZIbSOyhtmjeUxh8zmU7iCWAwuOOck13Oj6VaLINbl0q3tNavoIzeOqguGCjKFvQdOPSoL/wbo+oX7X0gvIbmSNY5pLW9mgM6rwBJsYb8ep5rdRFjjVFztUADJzxQA6iiigAooooAKKKKAMzxBZahqOjT2WmXq2U8+I2ucEtGhPzFf8AaxnHoee1Y6eC4NJ1HR73w80dibKMWk8RBK3Nt12tj+MN8wb1JznNdXRQBwukeCL3RLkWtsNDn0sXTTJJc2Ja5RGcuY9wYAkZIDHpxwck=",
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(\"Lysine SMILES with dummy atoms:\", aa_smiles[\"K\"])\n",
+ "mol = Chem.MolFromSmiles(aa_smiles[\"K\"])\n",
+ "Draw.MolToImage(mol)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this SMILES representation:\n",
+ "- `[Xe]` represents placeholder atoms for the N-terminus\n",
+ "- `[Rn]` represents placeholder atoms for the C-terminus\n",
+ "\n",
+ "These placeholders allow for easy addition of N- and C-terminal modifications."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## N-terminal Modifications\n",
+ "\n",
+ "Let's look at the available N-terminal modifications:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Available N-terminal modifications:\n",
+ "- Acetyl@Protein_N-term\n",
+ "- Acetyl@Any_N-term\n",
+ "- Biotin@Any_N-term\n",
+ "- Carbamidomethyl@Any_N-term\n",
+ "- Carbamyl@Any_N-term\n",
+ "- Carbamyl@Protein_N-term\n",
+ "- Propionamide@Any_N-term\n",
+ "- Pyridylacetyl@Any_N-term\n",
+ "- Methyl@Protein_N-term\n",
+ "- Methyl@Any_N-term\n",
+ "- Dimethyl@Protein_N-term\n",
+ "- Dimethyl@Any_N-term\n",
+ "- Propionyl@Protein_N-term\n",
+ "- Propionyl@Any_N-term\n",
+ "- Dimethyl:2H(6)13C(2)@Protein_N-term\n",
+ "- Dimethyl:2H(6)13C(2)@Any_N-term\n",
+ "- Dimethyl:2H(4)@Protein_N-term\n",
+ "- Dimethyl:2H(4)@Any_N-term\n",
+ "- Dimethyl:2H(4)13C(2)@Protein_N-term\n",
+ "- Dimethyl:2H(4)13C(2)@Any_N-term\n",
+ "- mTRAQ@Any_N-term\n",
+ "- mTRAQ:13C(3)15N(1)@Any_N-term\n",
+ "- mTRAQ:13C(6)15N(2)@Any_N-term\n",
+ "- mTRAQ@Protein_N-term\n",
+ "- mTRAQ:13C(3)15N(1)@Protein_N-term\n",
+ "- mTRAQ:13C(6)15N(2)@Protein_N-term\n",
+ "- Biotin@Protein_N-term\n",
+ "- Carbamidomethyl@Protein_N-term\n",
+ "- Propionamide@Protein_N-term\n",
+ "- Pyridylacetyl@Protein_N-term\n",
+ "\n",
+ "Biotin SMILES: C(=O)CCCCC1SCC2NC(=O)NC21\n"
+ ]
+ },
+ {
+ "data": {
+ "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAEsASwDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigCtf31tpljLeXcgjgiGWbGfYAAdSTxis638SQS3cFtc2V/YtcHbA11CFWRsZ2ggnBwDwcGo/F1vNNo8UsMTzfZbuC5eJBlnRHBYAdzjnHtWdq+rWPiJ9MsdJnF1OL6GdzGCfIRG3MzH+E8YwecmsZzadv6Z6WGw0KlNSabu3d/y2Ss/+H32R19FFFbHmhRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAct4xuJ/tXh6xt5pInudTjLmNipMaglhx26VLY6jd3fj/VbMTN9isrSFTFgY8x8tn1ziq2p/6Z8TdEt+osrOe6I/3/AN2KPBn+k6n4m1Lr52pNCp9ViAUfzrmu3U+f5L/M9pwjDCXa15PxlP8A+RR1tFFFdJ4oUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFAHnek3xl+KvjXUpZGNrpFjb2yjd8vKGV8D14q/8JvtMnw40y8vG3XN4ZbmQ7QM7pGx0/wBnFecQ+LdKtvBfxIuG1K3XVNSvLsw25kHmGE4iQgfif517P4Y0/wDsnwppGn4wbayhiI91QA/
rSstyueVuW+n+Rq0UUUyQooooAKKKKACiiqerXjafo19eooZ7e3klVT3KqTj9KTdlcqEXOSit2XKK4uXSHh8JNrS6nff2slp9rNyblyrMF37THnZs7Yx0rrLC5N5p1rdMu0zRJIV9MgHH61MZ3dmjevh1TjzRldXa2tqv0/qyLFFFFWcwUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUVg6zrzaV4g0Syd7eO1vjOJpJjgrsQMuDkAZPrmsfX/G01hc6ummtY3MVlo5vkbJf96HK7WKt0wBxwfeuqlgq1Xl5Vur/jb8yXJI7aiuNXxjdSaPpsot4U1BtTi06/t3yRE5OG289+GUnPDDrVNtP8beLmP9o3I8M6UTj7NZuJLuRf8Aak6J+H4itFgZK7qyUUu7/JLV/LTzDm7Gv4h8eaL4el+yM8t9qbcR6fYp5s7H02jp+OK5/wDs7x342GdUuv8AhFdHf/lzsnD3ki+jydE/DnsRXW+H/CeieGISml2KRO3+snb5pZP95zyfp0rarnr+x5rUb27vr8lt6XfqNX6nnWrfBfwleeFm0iwsY7K5T54b4DfKJPVmPLKe69PTHFaHg7xhdXl7L4Z8TRJZ+JrRcso4jvI+0sR7g9x259wO1rnPFvhC08UW0Em82uqWbebY30fDwSfh1Xpkd6yik2k3YZ0dFcj4T8V3F7dy+H/EES2niK0XMiD7lynaWM9we47fy66ta9CdCfJP/gNd13TEmmroKKKKxGFFFFABTZI0mieORQyOCrKehB6inUUBscrd+Ho7LR5Le8127XQoU+e3ZUyIh/AZMbivbHXHGadqviOTRrrTb0LDJ4cuEEb3EQ5hY/cc9tmMD/IFdM6JLG0cihkYFWVhkEHqDXCJBH4U1BtB1FPO8M6mxS2aTkW7t1iY9lPUH/65rnmuT4dP6/I9fCzWJuqvvNXdtFdPdq321vre9vLXvFYMoZSCpGQR0NLXD6PqLeENag8K6tcZtbliNIuZDy4/54k/3h2/AdwK7itoS5ldqx52IpKlUcYyuuj8v0fddAoooqjEKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigDnde0E6v4j0G5ltoLiytDcGdZgGHzIAvynrzWH4h8FzT3OtnSLG1ghvNENnGsYWMNNvY8gexHNd9RXZSx1aly8v2Vb8b/mS4pnA+IdBN5420ibS7u3F0ksFxqVkZAGeKNvkmA9Rll9wR6V31cNqv8AoPxl0C56DUdNuLPPr5ZEn9a7mrxkpOnRTd1y3X3tfhay8gjuwooorgKCiiigDnfFnhODxLaxSRzNZ6raN5llfR8PC/8AVT3FV9G8UuNYg8M60EXXFtVllkhH7iRjk7VJ53YBOMdjjpXVVwd/oM2s+KPEoiLW91HDYz2N0VOEnTziDnuOcEejH1r0cNONWnKlWfuxV0+zbS+7W7RDVndG5D4us5r6G0WCcPNqM2ngkDAeJGYt16EKcd60G1eFfEUWimOTz5LR7sPxtCq6oR65y4rzHS7O/wBYOlNeW17p003iK8lnEJKPDmF84bsM8Z711Njor6Z8S7d1udQuom0eYGW7lMgVvOi4BPTuce1b4jB0KcnG+qi3bzTfX5CUmztaKKK8c0CiiigAqnqul2us6ZPp97HvgmXaw7j0I9CDzVyik0mrMqMpQkpRdmjyWG3fWo7v4beNJidThHn6Rqija06LnZInpInQgdRn3J67wJq2t3VjdaV4jtJY9V0uQQS3WwiK7XGVkRuhJHUdj6ZwJvG3hKPxXpKLDMbTVbN/P0+9XhoJR05/unABH+Arn7TWvitc2scLeFNHtLhVCvcXV+GR2A5YLHkgE9s0yW7no1Fef/2L8T9QH+leLNH0vPUafpxmx9DKaQ/DS/vf+Qx488S3Q7pbTrbI31VQaAO8uLmC1jMlxPHDGP4pHCj8zXP3HxC8H215FaSeJdM86VtiqlwrAH/aIyF
/HFZFv8HPBEUvnXGlyX03/PS8upJCfqC2P0rTu/hx4Pu9In03/hHtOhhlQqXgt0SRT/eDgZBHrQB1NFef+Ctcv9H1Z/AviWYyajbJv069bgX1uOh/31AwR7d8En0CgAooooAKKKKACiiigAooooAKKKKACiiigAooooA4X4hstjqvg/ViQog1mO3ZvRZgVP8AIV3VcH8Y4Hf4aahdRDM1jLDdR+xSRc/oTXcW86XNtFcRnMcqB1PsRkVrOtKcIwf2b2+bv+okrO5JRRRWQwooooAq6lqFtpOm3OoXcmy3to2lkb0AGfzrmPh5DqNxpV14g1SSUXOtTfakgZyVghxiNQOg+XBz34z0qh4yk/4SnxRp3giB824232rbT0gU/JGf95vxHBrv1VUUKoAUDAAHAFdrlClheVO8p7+SWy+b1fkl3J3l6C0UUVxFBRRRQAUUUUAFFFFABRRRQAUUUUAFFFFAHM+NfCUfivSUWGY2mq2b+fp96v3oJR0/4CcAEf4CofA3i2TxFZXFjqcItPEGmt5OoWh7N2dfVG6g/wD1iesrhfHPhu/W9t/GHhlANf09cPCOBfQdWib1P90+v4YAO6orH8MeJLDxZoNvq2nOTFKMPG33onH3kYdiD/j0NbFABRRSMyopZmCqOSScAUALRWDf+NvC2l5+2+IdMhYfwG6Qt/3yDmk8OeNvDni2S4j0PVIrx7fHmqFZCAe+GAyPccUAb9FFFABRRRQAUUUUAFFFFAHNfEN7SP4deITeyLHAbCVdx/vFSF/HcRj3rkPDfxIn0fwvpVprfhLxLE0FnFG13FYmSJ9qAbsg5GcZxirniv8A4rTx/p3g+P59M03bqOr46Mf+WUJ+p+Yj0+lekUAcTZfFzwNev5f9uxW0o4aO7jeEqfcsAP1rp7DXNI1XH9napZXmf+fe4ST+RNS3umWGopsvrG2uk/uzxK4/UVzGofCnwNqWTN4bs4z62wMGP++CKAOxqhrWr2ugaJeatevttrSJpXPc47D3JwB7muNHwotrPnRvFPibTMdI4r8vH+KsDn865jxf4f8AENjeeG9O8QeKpda8P6hrEFvLBJaJE27JKqzLyykg8cdqAOv+GWkXUek3XiTVkxrGvy/a5wesUX/LKMeyr/PHau5oAAGBwKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooA831vw14n8OeKpde8C29pPFqX/ACEtOuZNkZk7TLyMHrnH5HPE/wBj+K1//rtV8NaUp6fZbeSd1+u/jNeg0UAef/8ACAeIr3/kL/EXW5c9Rp8cdn+W0GlX4OeFJWDamNT1Zxzuv7+Rzn14IFd/RQBzlh4A8Iabg2vhvTEYdHa2V2H4sCaw/GXhK4s7m38WeEbaKHXNPB8y3iUKt/B/FEwHU+h9fwx39FAGP4Y8SWHizQbfVtOcmKUYeNvvROPvIw7EH/Hoa2K8z8RWtx8O/EcvjDS4Xk0K9cDXLKMZ8s9BcoPUZ+Yd/wAcj0a0ure+tIbu1mSa3mQSRyIcqykZBBoAmooooAKKKKACsnxNr9r4Y8OX2s3h/dWsRfbnBduiqPckgfjWtXm+v/8AFbfEmy8Np8+kaFtv9T/uyTn/AFMR/wDQiO4z6UAa3w20C60nw9JqOqjOt6zKb6+YjBVm5VPYKDjHY5rsqKKACiiigArgPjGrReAjqaAl9Lvra9XHXKyAfyY139c54/sP7S+H3iC1AyzWErKPVlUsv6gUAdErK6B1IKsMgjuKWsLwVf8A9p+BtCvSctLYQlz/ALWwBv1zW7QAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFADJYo54nilRZI3Uq6MMhgeCCO4rzPTJZPhd4lj0O6dj4S1SY/2bcOcixnbkwseyk8qf/rmvT6ztd0Ow8SaLdaTqUIltbhNrDuD2YHsQeQaANGivMtL0n4s6dZrpKar4dktbX93Df3KSvcS
xj7pIHy5AwOfTqepujwZ41vudU+Il0inrFp9hHBj6PyaAPQKy77xLoOl5F/rWnWpHUT3SIfyJrkv+FQ6Lc/8AIX1jxDq+fvLfak5B/BccVqWHwv8ABGm4Nv4Z09iOhnj87/0PNAGVrnxm8G6XYXL2mrRX96kbGGC3R3DsBwCwG0DPU5q98L9IFh4Oh1Ca4S61DWGOoXlyjBg8knOAR2UYH1BrqrfTLC0gaC2sbaGJhtaOOJVUj0IArzi5trv4S6jJqFhHLc+C7mTdd2iZZtNcnmSMd4z3HagD1GioLO8ttQs4byznSe2mQPHLGcqynoQanoAKKKKACmTRJPBJDIMpIpVh6gjBp9FAHBfByV/+FcWllKczafcT2kn1WVsfoRXe1578N2W08ReOtHB/1GstdhfRZ1DD/wBBNehUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAU2SNJY2jkRXjcFWVhkMD1BFOooA52w0u28HW4t9NtxHo+9mMKZPkliSSPbJ/CugR1kRXRgysMgjoaUgMCCAQeCD3rMWFtI8+VXzYhS4i7q3oPauJ8+Hm5bwer/u/8D8vTat/U1KKqG7lhhea5tiiqAR5bbyfbGKq51LUOg+wwHueZCP6Vc8XFWUU230tr872t8w5St4n8YaP4StEm1Sd98rbIbeGMySyt/dVR3+uBXLFvH3jfhF/4RDRn/ib95fyr9OkX8x713drpdpasHSIPKDnzZPmbPTOe34Vcram5uN6is/LX/ITt0MDwv4O0jwhbzppsczTXLBrm5uJTJLOwzgsx+p6YHNb9FFaCCiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACo59vkPvjMqkYKAZJH0qSilJXTQGbYRsl24hSeO02fclzw2f4QeQMVpUUVlQpKlDlX9enkNu4UUUVsIKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKAP/Z",
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(\"Available N-terminal modifications:\")\n",
+ "for mod in n_term_modifications.keys():\n",
+ " print(f\"- {mod}\")\n",
+ "\n",
+ "# Let's visualize one of these modifications, e.g., Biotin\n",
+ "print(\"\\nBiotin SMILES:\", n_term_modifications[\"Biotin@Any_N-term\"])\n",
+ "biotin_mol = Chem.MolFromSmiles(n_term_modifications[\"Biotin@Any_N-term\"])\n",
+ "Draw.MolToImage(biotin_mol)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The `modify_amino_acid` Function\n",
+ "\n",
+ "Now, let's explore the `modify_amino_acid` function, which allows us to add modifications to amino acids. This function takes three arguments:\n",
+ "\n",
+ "1. `aa_smiles`: SMILES string of an amino acid\n",
+ "2. `n_term_mod`: N-terminal modification (optional)\n",
+ "3. `c_term_mod`: C-terminal modification (optional)\n",
+ "\n",
+ "Let's see it in action:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Modified Lysine SMILES: [H]N(C(=O)CCCCC1SCC2NC(=O)NC21)[C@@H](CCCCN)C(=O)O\n"
+ ]
+ },
+ {
+ "data": {
+ "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAEsASwDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigDP/ALXg+0+V5c3l+b5Hn7R5fmf3euevGcYzxnNaFcCuuxt4i/s/yJvs/wDbJh8vzV2+YBv3/dzjdztz1744rvqzpz5rnZjMM6HLdWur/wBeYUUUVocYUUUUAFFFMlljhieWV1jjRSzu5wFA5JJ7CjcB9FNEsbStEHUyKAzIDyAc4JHvg/kajhu7a4EZhuIpBIpZNjg7gCASMdQCQD9RT5WBNRRRSAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKZLLHBC800iRxIpZ3c4CgdSSegoA8fg1C1bx2kQl+c+J5o8bT94RjI6V7HXzTYeIbT/hYsN+6yJo7eKbm5XVWUi2YMgUDf0zxn6GvpVWV1DKQykZBByCKiFNQvbqdWJxdTE8vP8AZVhaKKKs5QooooAKoa5ZS6l4f1KwhKiW5tZ
YULHADMhAz+dX6KqEnCSkugHOGGXQNQur0zfaUuhbwRrPPiQsDJkL8vP3gQPqSQBWDJa3fgOC01Z7aS80+2szFcwWvzNC7tF5koBxlMR7iOuST3JHoDKrY3KDjnkUpAIweRXXTxji/eV09/NJW+WnUlxKunalZ6vp8N/YXCXFrMu6ORDwR/Q+3arVecavo+qeAb2XXvCtqbrSJHMmpaMnUDvLAOxHde/5Y1W+K3ghNIh1J/EFqI5l3LCCWmB9DGuWB+ornrKmpv2Tbj0vv/SGr21Oyorzw+P/ABBrfy+E/Bd/NGel7qhFrDj+8AfmcfTBqG68JfETXIGl1HxvDpc6/NDbaTbERK3bc7EMw9jxWQz0miuK8I+NLi91CTw34mt00/xLbLkxg/u7tP8AnrCe4PcdRz6HHa0AFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFQNeWyGQNPGPLkWJ8sPldsbVPudy4+oqeucvPDL3V7d3Xnsry39rcoonkVNsRiLBkB2kny2xkHqPw1oxhJ2m7L/goTv0JdZ8Y6LoukR6lNdCeKYf6OlsPMac5AwgHXkgenNcwPDetfEBkufFyvp2ihxJDosMhDSgdDOw69jt7YHTnNrQrK30XxjYaI9wpmi0bzDBsO3fuiV3RjxgmPOOucnvXd111/ZUIezhG7lrzPtfSy6eb18rEq71ZQm0TS59GOjy6fbNppj8v7L5YEYX0A7Vwy6brfwzXdpC3Gs+F1JZ7Fm33FkvrET95B/d/wDrmvSKK5aFVU53lFSXZ/1o/Mpq5l6R4j0jXNNjv7C/hlgcKcltpUsdoDA8gkgjnuKvx3MEvl+XPG/moZI9rg71GMsPUcjn3HrXD3VhBqXi3UrbSriGO+tJrN7mHYwUw7lcqSBjdldw79em41saH4dutLudKlluWlFtp8ltIjSZVXZoiNgwOP3bdfaumth6MVzKTV9UnvZq6/4fqSmzpaKKK4CwooooAKKKKACsiz8K+H9P1KbUbTRrGG9mcvJOsC7yx6nPb8K16KACiiigDnvF3g+w8XaekVwz217bt5lnfQ8S20nZlPpwMjv9cEc9oPjqfRr5vDnj2W3sNTiTdBqDuI7a+jH8ascBW9V4/pXoVZus+H9I8Q28cGr6db3sUbiRFnQNtb1H+eaAOVu/iz4fNw1poUN/4hvF4MWl2zSKD7ucLj3BNQfafif4h/497PS/C9o38dw/2u5A9Qo+T8DXeWlna2FutvZ20NtAv3Y4Ywij6AcVPQBy9ouqeF9Nt49Q1NtRtovNuNQ1S9whRAMhERecknjsAD1OBXSwyrNDHKoYK6hgHUqQCM8g8g+xp/WqAtTYTTXEBJhlZ57lW3ySO20BQnPAwvQD0xQBfornNe8e+FvDTNHqutWsM68GBW8yXP8AuLlv0rGTxzruuID4Y8I3skTdLvVD9ljHuFPzMPpWtGlKrLljb5tJfexN2N3WPtx8Q2C2d1BBmxutwmQuGO6HGFDrz155x+NX9A8weHdMEpJk+yRbyTkk7BnnvXLr4d8U6hJHe6zfac1/BFMbU2sbKkDsYsLzyykI4J64Y/hd8L+JbVro+F7uKSy1bT4kjEM2B9oRVA8yM/xKcH3rtq0b0OWm1Ll3t6vXu1qv+G3lPXU6uiiivNLCiiigAooooAKKKKACiiigAooooA4P4guui6x4a8VFgkdlefZbpuceTMNpJ+hxj3Nd5XPeOtKh1vwJrdhOyqklo7Bm6KyjcrH2DKD+FcZ4a+LQk8MaYbvwv4pubkWyCWe200yRysAAWVt3IOM1rUrSqRjGX2VZel2/1ElY9UqO4uIrS2luZ3CQxIZHY9FUDJP5Vwf/AAtiy7+FfFo9c6U3H/j1c546+JMWt+FbjRNP0rW7G51R47IXN/ZGGJFdgGyxPdcjHvWQzq/hhC91ol74kuBi4127e756rFnbGv0AGR9a7iq2nWEGl6Za6fbLtgtYUhjHoqg
AfoKs1tXrSr1HUl1/pL5CSsrBRRRWIwooooAKKKKACiiigAooooAKKKKACiiigApk00dvBJPKwWONS7MewAyTT65D4pao2k/DXXJ0J82W3+zRgdS0pEfHv82fwoAxfhFotnP4Y/4Se6sIDqmrXc9607xgyKrSHADHkDAzgetek1m+HtLXRPDemaWoH+iWscJx3KqAT+daVABWB4p8J2fie0j3u1tqFs3mWd9FxJA/qCOo9R/I81v0VdOrOlJTg7NCavuefaB4/fTb2Tw747kttM1m3Tel27hLe9j6CRGOAD6r/wDXAluvizoDXDWmhQah4hvFODHpds0iqfdzhce4Jrzb4qxx+IvGetQGNZZLWOw0ez3DIW4nk80sPfYGH4175p+n2ml2MVlZQRwW8ShVSNQoGBjtUDOF+0fE/wAQ/wCptdK8LWjfxzt9ruQPUAfJ+Bqi83in4Z3Rv9X1S68S+HJzm7naPE9i2fvhQTmP1A6e3f1GkdFkRkdQysMFSMgj0oAgsb611OxhvbG4juLWdQ8csbZVge4NWK8wvtK1L4YXs2seHoJbzwvK5kv9ITlrXPWWD29V/pyu/cfFLwTbabDfP4is2jmUMkcbGSU57GNcsD7ECgDsKK88Pj3xJrny+FPBd7JGel7qzC1ix/eCn5nH0xUF14T+JGrQG6u/G9vYXsZ8yC0061It946B2b5mU9MEH6GgD0qiuP8AB/jVtZuZtD1u1Gm+JbMf6RZsfllH/PSI/wASH9P1rsKACiiigDhfixfTx+Dxo1k2L7XLmPTYMdhIfnP02gg/Wux06xg0vTLXT7ZdsFrCkMY9FUAD9BXCz/8AFSfGyCH71n4ZsTK/p9pn4UH/AIAMj3Feh0AFc18QNCPiPwJq+mopM7wGSDHXzU+dMfioH410tFAGD4K10eJfBmk6vuBe4t1MuP8AnoPlcf8AfQNb1eefDn/iSeIPFfhFvlSyvfttovbyJxuAHspyPqa9DoAKKKKACiiigAooooAKKKKACiiigAooooAKK5E2uu/YrYGR2ItZQUjVkcHcnBYscsV3AHjmpJdCOsvNFFeapp+mjyyEjkaJpGw28fNyF5T6kH1zWXtH0R3fVKa1lUVvS/W3/DfM0td8V6D4agM2sarbWij+F2y5+ijJP4CvIPiB8QR4sk0LS9B0i8lhOoJdpNfJ9mhvfKBYRRluu4kehzgYOa9a0zwb4f0khrbTIGlBz50w8x8+u5skH6VY8Q+HdM8U6PLpeq24mt5OQejRt2ZT2Yev9KuPNb3jmrKmpWpNteat+F3+ZW8KeLdN8XaYbqxLxzxN5d1aTDbLbSd1de3fnvW9XjbaNqfg3WIpJbwHU2Ajt9XkUCO/QZ2w3Ho4GAHJ5xyeAR6f4f1n+29N+0PaTWk8bmKeCVSCjjqAe496mNVSk4nRWwFSlQjX0afbp/X4PR6mrRRVPVriS00i7niVWeOJmAYZHTqR3A61bdlc5IRc5KK6nl2iIuu+KtMmkUSJc6lc6p8wztSP5IsfRg1eu1zOg6Zp9rrk7Woim+z2kcUc8YACq7uzR4XC9QG6Z+b6V01Y0IuMdT0c1rwrVlyKyS/za/BpfIKKKK3PMCsXTfCPh3R7yW807RLC2uZXLtLHAobJ9D2HsMCtqigAooooA5nxh4MtPFdtDKsz2Or2h32Oow8SQP8A1U9xWN4f+IBs7mXQPHD22k63apu8+SQR295H2kjY4H1X/wCuB39ZeteG9F8RLAusaZbXwgffF58Ybae/4HAyOhxzQBy118WtBed7XQLbUfEN2vBj0y2Z1U+7nCge4zUPnfE/xD/qrbSvC1q38Uzfa7kD1AHyfga721tLaxt1t7S3it4E4WOJAir9AOKmoA8nh0jxN8ML+81mJ5PE+mX7iXVMRBbuNwMeYgBwygfw9vbk16XpOrWeuaXb6lYSmS2uEEkbFSpwRkZB5FXazbjTWjlNzp7CGf8AiT+CT6j+tY1pzguaEb9+/wA
u/oNJM0qw/FHirTfCWmi81Ayu0jiKC3gQvLPIeiKo7n8quvc3jo8UMcS3AxgOx9skccii00uOCTz52+0XR5Mrjp/ujtUKvKc1GnG66t6W/wA3/Vwt3ON8KaH4g1PxlL431+KLS3ls/sdvpkR3OIt24GZu7ewHp0xivQaKK6RBRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAVtQ0+01Sxlsr2BZreUYZGH+cH3rBvbKWBri3t4brzFijTTWiLlI8DHJBwMHk7uowOeldPRUSgpHRRxE6Wm67fd/lZ+WgUUUVZzjUjSJdsaKi5zhRgU6iigNwooooAKKKKACiiigAooooAKKKKACiiigCNYUWUy/MXIIyWJwD6enQVJRRSUUtkAUUUUwCiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigD/2Q==",
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Modify Lysine with Biotin N-terminal modification\n",
+ "modified_lys = modify_amino_acid(aa_smiles[\"K\"], n_term_mod=\"Biotin@Any_N-term\")\n",
+ "print(\"Modified Lysine SMILES:\", modified_lys)\n",
+ "mod_lys_mol = Chem.MolFromSmiles(modified_lys)\n",
+ "Draw.MolToImage(mod_lys_mol)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "To obtain the unmodified amino acid, just pass the corresponding SMILES to the function with no additional arguments:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Lysine SMILES: [H]N([H])[C@@H](CCCCN)C(=O)O\n"
+ ]
+ },
+ {
+ "data": {
+ "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAEsASwDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAK4/4iw3F1pGmWlteTWctxqlvEJ4WwyZJ59+cHFdhWB4vFhDpEWpancSQWumXUV6zRpuJKNwMehJrqwUuXEQa3v669NCZbHKnX7rU9X0OO4LW+oW0OoW9/AjEATJGnIHdTkMp9GqhL4nvP+FNiP+ytaEv9kKPt21dmdg+fdv3Y75xmtHR7zwj458aHV9Gvbj7fb2bxTp5DIjow2hiSPvDOPp9K6l/C8D+CP+EXNxJ5H2IWfnYG7aF25x0zXr1a1GhKEKkGmnFtNNW1k36rVW8iEm7tM2bYk2sJJySi/wAqlrm9TNzBrFhb2GpXDXReLdaKEMaQBv3jvxkZXIBz97GO9dJXh1KfKlK+5omFFFFZDCiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACue8d2ovfAeuwnvZSsPqqlh/KuhrP12Brnw/qVugy0trKgHqShFbYefJWhPs0/xE9UeQ/s9W6i
LXrgj5swIP/Hz/hXt1eS/s/wxnwdqN4mczXxTkdlRcf8AoRr1qvQzyvCvj6lSDunaz9EkRSVoJMozaLpVxei9n0yzluwQRO8Cs4I6HcRnjtV6iivLcpSsm9jQKKKKkAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKjnnitbeW4ncRwxIXd26KoGST+FSVk+Kf+RQ1r/rwn/8ARbUpOybNKMFOpGD6tIvTX1rb2gu5p0S3O3EhPHzEBfzJH51BqOt6ZpLIt/ewwPJyiM3zMPUDrj3rj9bXXR4MhN1LpxtM2uRFG4kx5seOS2PTtW1ou3/hMfEfnY+1bofLz18nyxjHtu3Z96x9q27Jf1r/AJHf9SpxpupKV0r7eTit7afFrozftbu3vrZLm0njngcZWSNgyn8RU1croNza2uu+IAs0UVlJfRxw5YKrTmIeYF9Tkcj1zXVVpCXMrnJiaPsp8q20f3pP71ezCiiirOcKKKKACgjIwaK4/wATfETS9Bu/7Ls4ptY11+I9NsRvkz/tkZCD1J5xzg0AZnwW046d8PUDDBmvLiTH0kKf+yV6HXlfhXxrL4Tt4dE8a6LL4faWaSS3umcSWzmSRpNpkXIQjdjk9BzivUo5EmiWWJ1eNwGVlOQwPQg0AOooooAKKKKACiis7Wtd0vw7p73+r30NnbL/ABytjJ9AOrH2GTQBo0V5knxdkWYX1z4R1i38MMSF1hoyeOPnaILkJz97P4Z4HoOmarYa1YR32m3cN3ayDKyxMGB9vY+3WgC5RRRQAUUUUAFFFBIAyTgCgAorgNZ+K2m2momw0LTb/wAR3MJzdjTIzIluo6ksAQT7D8wa6Tw34t0XxZZG50i9SbbxLC3yyxH0dDyP5elAG3RRRQAUUUUAFFFFABRRRQAVHcQRXVtLbzoJIZUKOjdGUjBB/CpKKBptO6K81ja3FmLSaBHtxtxGRx8pBX8iB+VQahommaq6PfWcU0iDCuRhgPTI5x7VfopOKe6LjVqQd4ya+ZlXfhrRr3SV0ubT4PsafcjVduw+qkcg+9YH2bxJ4S5s3k13SF/5YSH/AEqEf7Lfxj26+ldpRUSpp6rRm9LG1IJxn70Xunr8+6fmjI0TxNpOvqRY3SmdBmS2f5ZY+cfMh5HIIz0rXrlPE3w/0jxHcrqKNNputRcxanYt5cyn/ax94ex7dxXLyeP9Y+H+o2uk+PVju7a5D/ZdYsk5dUxuMsQ5BAYZK/gDyatXtqc03FybgrL7/wDI9TrC8S+MND8JWqzaverE7/6q3T55ZT6Kg5PPfp71xNv4+1v4h3lzp/gOKOysrcqtzrF8uSm7OPLi7k4OC34gcV03hr4e6P4eum1KUzaprUnMupXzeZKT/s54QfTtxk0yTC2eN/H/APrDN4S8Pv8Awrzf3C+56RA/mPcV1/hvwlonhOzNto9ikG7mWU/NJKfVnPJ/l6Vt0UAV72xtNSs5LO+torm2lGHimQMrD3Brz+TwVr3g2R7vwHfCSyyWk0G/ctCfXynPKH2Jx6ntXpFFAHIeG/iHpeu3p0q8im0fXU4k02+GxyfVD0cemOe+K6+sTxJ4S0TxZZi31eySYpzFMvyyxH1RxyP5etchv8a/D7/WCfxZ4eT+JR/p9svuOkoH5/QUAelVHcXENpbyXFxNHDDGu55JGCqo9STwBXnsnxg0jU0itvCVlea9q0y5W0ihaIRe8rsAFAPfn+tEHgHVfFE6X3xA1IXaAh49GsmKWkR7bu8hHv8AqKAHXPxC1DxHcyad4A00agytsl1a6BSzhPfB6yEeg+vIq3o3w2tY9QTWfE97J4h1ocia6A8mA+kUX3VH/wCviu0trW3sraO2tYI4II12pFEoVVHoAOBUtACFQylWAIIwQe9cFqfw4Nlfyax4J1D+wdTbmSBV3Wlz7PH0H1Xp6ZrvqKAOB0z4jmyv49H8baedB1NztjnY7rS5PqknQfRunrmu9BDAEEE
HkEd6qanpVhrVhJY6naQ3drIMNFKoYH39j71wR8LeJ/ArGbwZdnU9IXltCv5TlB6QSnp9Dx9TQB6TRXA2/wAYPCgsZpNTuJ9Kvrc7Z9Ou4WE6v6BQPm+o9s4qn9p8bePjizSbwnoD/wDLeVc31wv+yv8AyyB9evcZoA6DxN8QNG8NzrYEy6hrEvEOm2K+ZM57ZA+6Pc/hmsAeGPFfjo+b4wvDpOjtyuiafL88g9JpR1+g4+hrqvDPgzQ/CUDJpdmBPJzNdSnfNMe5ZzyeecdPat+gCjpOj6boWnpY6VZQ2lqnSOJcDPqfU+55rnvEvw80vXbwarZyzaRrqcx6lYnY+f8AbA4ceueccZrr6KAPPLbxfr/hJxaeO7NJLXcFi1uwQtE3/XVAMofpxzwOM13lneW2oWsd1Z3EVxbyDKSxOGVh7EVLJGksbRyIrowwysMgj0IrhrzwHdaNdSal4Ivxplwx3S6fLlrOc+6/wH3H4YrspQw9WChJ8k+71i/XqvXVem5Luju6K4IeOtWsNd0yy13R4rGGe3DXbLL5ht3aVo0YkcbCQv03jmrmoeMLuz1W/tVt4DHbarY2IZs5KTqhZjz1G44qnl1dStZbX3TW9t15tfLUOdHY0Viaprj2Wu6FYQiKRNQnlikYnJULEzgjB9VArbrknTlBRb6q6+9r9B3CiiioGFFFFABRRRQAUUUUAFefeLFV/i98P0YBlKalkEZBBgWvQa4XxLbTS/FrwPOigxQx3+87gMZiAHHU/hSbtuOMXJ2irkHgVQvxA8fhQABeWwAHbEVeg1wvgm2lh8ceOppFwk19CUOQcgR4/Cu6pp32CUXF2krBRRRQIKKKKACiiigCvbWFnZyTSWtpBA87b5mijCmRvViOp5PJqxRRQAUUUUAFFFFABRRRQBn3OhaRealBqN1pdlPfQY8q5kgVpI8HIwxGRg1oUUUAFFFFABRRRQAUUUUAcxf6AdT8YXT3lqJdLudH+ySMSMFjKSVx1zg5zXIQ+DdeeC+tNUtlvkfWbBvMdlIuLWIIhdgT12L8wPU5616tRXoUcyrUlaNun4O/6akOCZxt14OsbLxT4dv9F0SytUt7iY3UtvCkZCGF1GcYJG4j1rcs9Ynm1c6fd6e1rI0LTxHzVfKKwU7sfdPzDjJHXnitasXSNFutO1C6vLnUReSXP32aAKwAPyqCDwqgkBQB1J6kk5uv7aH753cVZb33b9OvXy9R2tsbVFFFcZQUUUUAFFFFABRRRQAVyWtf8lH8Le0V3/6AK62uY1a1uJfH/h65SCVoIYbkSSqhKoSoAyegzWdX4V6r8ztwDSqSb/ln/wCkMh8Kf8jP4sP/AE+R/wDoFdbXMeGbW4g1/wATSzW8saTXitEzoQHGzqpPUfSunopfD9/5hmDTr6do/wDpKCiiitDiCiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigArl7u3TXPGM+nXrSNZWdpHKtuJCqyO7MCzYIzgLgA8cmuorK1HRBeX0V/bXk9lfRoYvOhCnehOdrKwIIzyPSoqK6OrC1FTk23ZtWT7Pv38tO5n6Oh0vxXf6NBJI1j9ljuoo3cv5LFmVlBPODtBx25rpazdK0aPTZLi4e4mury5IM1xNjc2OgAAAAGTgAd60qKaaWosVUjUqXi76LXu0tX/Wr3eoUUUVZzk=",
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAASwAAAEsCAIAAAD2HxkiAAAUOElEQVR4nO3da3BU5R3H8d/mns0CcjEkBDAGU5AIyEUwkZsCUjBtp0wzdKyZMrZmOhUjwwtjncKKVARGSpgWZug4HdJiEXAqDQpIuIgoCCgClTtICAkXTYAICSGX/ffFCZtkk2yyye7+92x/n/FFcvb20J5vzrPPOdlYRAREpCdEewBE/+8YIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBESKWOERMoYIZEyRkikjBG2bPNmLFmCq1ebbHznHWzbBgDnz2PFCty40eTWixexYgVKS/03SAoOjLBl69fj1Vcxd26TjX/+Mz74AACOHsWcObh2rcmtJ05gzhwUF/tvkBQcGGGrevXCe++hoEB7HBTswrQHELgGDkRCAn7/e/z3v4iK0h4NBS8eCd15+21cuYK33tIeBwU1Hgnd6dcPr7yCRYvw7LMYOND11p//HJGRDd/euuXPoVHwYIRtyMnB2rV48UXs2OF6U0YGevdu+PbECaxa5c+hUZBghG2IjMRf/oIf/xibNrne9OyzGDSo4dutWxkhdQQjbNvUqfjFL/DKK3A4tIdCwYgLM+2yfDmuXMH58x48pLAQ69Zh716fjYmCBSNsl759MX++B/fftAnTp+P0abzxBl5+2WfDoqDA6WjLZs5EeXmTLXPmICQEKSkAMGwYli9vsioDYPBgLF+OhAQASEnB9u3o2xelpUhKwooV/ho3mZBFRLTHEMwuXMCTT6KwUHscFMB4JGzVkSMoLER4eMPJwORkPPCAB8/gcCAnB9nZvhgdBQ8eCVs1dSq2b2+yJTUV+/a19+F1dZgzB7du4e9/RwjfelPruHe0qnv3Jt9arUhNbe9jy8rwzDOIiWGB1DZOR1t1332uWx5/vL2PnT8f58+jtLT+Opv//Kd+wYaoOUbYqv79m3xbW4ukpPY+duVKrw+HghanSq0aMKDh6/BwRESgTx+90VDwYoStSkxERET911FR6NkTsbGqA6IgxQhbFRuLLl3qv7ZY0K0bQkNVB0RBihG2qndvWK31X9fVNQRJ5F2MsFVWK6KjAcBmQ0UFbDbtAVGQYoTuxMQAqJ+F8khIPsII3TEiNH6NsPlpQyKvYITuxMQgIgKVlUDTMxZEXsQI3YmJQVQU6uoA4OGHtUdDQYoRuuN8H2izefb7E0TtxwjdCQtzVFcLgJgYiY/XHg0FKV476k5i4sehoRdstug7dypjY1/UHg4FJ0bozhNPRFVVZdfV1Q0bNsxiYYTkE5yOuhMXF2ez2QB04VlC8hlG6E7v3r1jYmLACMmXGKE73bt3j4qKAiMkX2KE7lgsFuNI2KNHD+2xUNBihG0wIhzA62XIZxhhG2JiYiwWy6DGf/mFyKsYYRtsNpvNZktMTNQeCAUtD88TrlqFxx/HiBFNNv7tbxg+HI891rDl4kVs2YKSElitGDUKkyeb93P/rFZrly5d+vDjZchnPGzjxRfx8ceuG19+GZs3N3y7cCGSk2G3Y98+bN6M6dMxciSKijo7UiX3339/RERE
d5cPISXyHm8foNaswfz5eOMNXL2KXbuwfz+OHkVZGX72M9TWevm1/CIxMTE0NNRisWgPhIKWtyN8801MmoRXX22Yf6akYOVKHDmCDz/08mv5RXJysvYQKMh5NcLCQpw7h2nTXLdPm4awMOzc6c3X8peEhIRevXppj4KCmecXcK9ciX//u8mWu3frvyguBtDCL96FhSEhof5WU3E4HO++++7t27dPnTpl9rMUVVVVxtU/FGg8PxI++ih+9asm/4XdK9l449Tin3kSgdneVn355ZdxcXHLli07fvz44MGD586dW11drT2ojjh27NigQYOsVmtUVNSCBQu0h9NxN2/enDlz5owZMwoKCrTH4lXiEUAWLXLdGBUl8+aJiFy4IIC8/bbrHWpqJCxMXnrJs9fSc+nSpeeee85YjAkJCXFeOJqSkrJz507t0XmgsrJy4cKFVufHpwIA0tPTz549qz00j+Xn5xv/EIvFEhYWlpWVVVpaqj0o7/BqhCKSlCRPP+16h61bBZBNm0REbtzwfJD+U1FRsXjxYqO6iIiI7OzskpISEdm0adPD9z5kJj09/dtvv9Ueadvy8/MffPBBY8zDhw/fsmXLjBkzunbtCiA8PDw7O7u8vFx7jO1y8ODBJ554wviHREVFpaamhoaGAoiNjX3nnXfq6uq0B9hZ3o5w9WoBZNWqhlsvXpTkZBkyRGpqpKJC+veX9HS5eLETY/YJh8OxYcOGB+69oU1PTz9//nzjO1RXV+fm5ho7cXR0dE5Ozq1bt7RG696hQ4fGjh1r/EMGDx68detW502lpaXZ2dnGThwfH5+Xl+dwOBSH6l5JSUlWVpYx2p49e+bm5lZXV4vIyZMnn376aeMfOGLEiM8++0x7pJ3i7QgdDpk7VywWGTZMZs2S9HSxWmXgQDl3TkRkxw6JjhZAbDZ56y2pqur0+L3jwIEDaWlpzv9T9+zZ09o9L1++nJmZacxUExISAm0nLi4uzsrKCgkJAdCrV6/c3Nza2trmd/vqq6+c/97HHnts//79/h+qe5WVlc4piXHcvnnzpst98vPzjR+aFoslIyOjqKhIZaid52GES5fKF1+4bly+XFx+FB07Jn/6k2RlyZw5sm6d3L3bcNOlS5KZKYAAMmCAbNjQoWF7zaVLl5xR9enTZ/Xq1S3utS4OHjw4ZswYYyeeOHHi0aNH/TBU94yJtPE5AMZE2v1s0zjy9+vXz3jfm5mZee3aNb+N1g1jYM6LddPT088ZP8FbUlFRYbfbjVXfmJgYu91eFTA/2dvPwwi9ZfduGTq0PsVJk+Sbb/w/hNu3b9vt9ujoaOf08ocffmj/w+vq6vLy8mJjYwEY6wTff/+970brRpsTaTeM/xEiIyMB3HfffYsXL77b+Cem3zV++zdixIhPPvmkPY8qKirKzMw0HpWcnPzhhx/6epzepRShiNTUyOrV0quXABIeLtnZ4q91AofDkZeXFx8f75zJXLhwoWNPdePGjZycnIiICAA9evRobfrnO40n0iNHjnQzkXbjzJkzzzzzjPEkAwcO3LZtm9fH2abi4mLnlCQ+Pr6dU5LGdu7c+cgjjxj/ismTJ584ccJHQ/U6vQgNZWWSnS2hoQJIXJysXi0+Xuzav3//4/f+9vyoUaP27t3b+ec8derU1KlTneuQXnnONjWfSHdynbCgoEBlBdjTibQb1dXVy5YtMxbPIiMjly7dffu2dwfrE9oRGg4flrFj62eno0bJvn2+eBFj0uJcU+n8Xuui8SmB9PT0iz5bATbmkMYboQ5MpN0wVoCN5RA/rAAbE+n+/ft7t3xjBbhbt36xsXV9+khengTS2lkLAiNCEXE45J//lD59BJCQkKrZs724TnDr1i3nXmu1Wn23bxlresYPdavVarfb79y548XnN96IxsXFdX4i7UZJSYnzp1Xfvn3z8vK8/hIi8sUXX6Smpjon0p9++ql3n//QoeujR9f/YB83Tr7+2rtP700BE6GhokLsdomK
Wj9+vM1ms9vtnVwnMPba3r17O/fawsJCbw22NcZE0di9HnrooQ1eWgHevXv3o48+6jyv4OuTYwcOHHCuAD/55JPHjh3z1jM3npJ4ZSLdGodD8vIkLs74wS6ZmRIYC8CuAixCERGpO3Nmxk9/6jzXXFBQ0LHn2blz57Bhw4znGT169D7fzHJbs2vXriFDhjjXCY4fP97hp7p48aKzauPQ5J+Tk15fAW48kTamJN6aSLt9UbHbJTJSAOneXXJzpabG16/pmUCM0FBQUDB48OAOLLuLyJkzZzIyMozH9uvXT+uUek1NzerVq43fhOrYlWJ+m0i7cf369ezs7LCwsM6sADefSPthStLYmTMyfXr97HTQINFYAG5V4EYoTa8UM9bN2twFjXMGxokv4+ytd9+VdUBZWVnjK8XaOftqPpH23UpPe5w8ebLDK8C7du1qPJH+/PPPfTdO9/LzJSmpPsX0dPHBG+qOCOgIDZcvX3ZeiuXmSjHjsGPMnYxLQK5cueL/0bbm8OHDztPQo0aNcn+l2I4dO4YOHWrcecyYMX6eSLuRn59vXMvSzp8LZ8+edU5J/DmRduPuXcnNlS5dBJDoaLHbpbJSRCQ7WyZPFpeLc6ZNk3XrRETWr5fx413nsR98IOPHS+fPgpggQsOhQ4ec5/cmTJjgcqVYQUGB8w3YxIkTvw7ItbDGK/IWiyUzM/Pq1asu9zl9+nQgTKTdqKysdF5p5OZKMWMi3XhKUmns7IHh0iX55S/FYhFAHnxQjhyRtDQBXH8FKDKy/lrpt98WQFxWCf/6VwGk2TWtHjNNhHLvShfnDC0zM/O777776KOPnnrqKWOv7d+/v4/W072otSvFrl+/HmgTaTdcVoA3b97svCnQJtJu7Nkjw4ZJbKzcvClpaTJ8uISGynvvNdyBEbbs2rVruMfYZQ3Jycm61z165PTp09OnTzdGHhUVNXr06PDwcAChoaEvvPBC8yNkYGq8Ajxy5MgtW7YsXrzYednNmDFjAvD3M1zU1opxfVtammRmym9/K/HxDV0xwpZVVVUZ6zQTJkwwzjUZvwowZcoU7aF5rKCgoPEnmg4dOjQwJ9JuVFdXL1++vFu3bmgkKSnp/fff1x6aZ4wIr12Tbt1k9uz6jS4Rnjol5883/Ldgwf9rhJWVlQCio6NFZPv27YcOHdq2bRuAqVOnag+tI8rLy9PT0ydMmPDaa69pj6Xjzp07l5KSYrFYQkJCpkyZEsgT6dYYEYpIbq6EhIhxCHeJsMX/Oh+h+f5ctsPhAGAslk6ZMgXA1q1bnVtMp2vXrpsbf365OQ0YMOCbb74pLi6Ojo7u2bOn9nA6ZfZs5OVh9mwcPOh6U1ERwsMbvl2zBn/4gxde0XwRigiAxh+J3XwLqejbt6/2ELwgNBQrV2LsWKxd63pT796IiGj41lt/OdZ8R4/GR8LWthB1Rmoqnn8ef/wjHA5/vJz5dlxGSH6wZAnu3EFNTXvvX1mJN99ERgbmzUN5uWevZb4dl9NR8oMePbBokQf3z8nBDz9gwQKUlyM727PXMt97Qh4JyRc2bmz4KHnDb36DSZPQowcAZGUhI6PJG0IAv/41fvITdO0KALm5CA2tv+fMmZ69NCMkAoDmfwY2JARJSfVfd+nSwjKMzQabrf5ro0AAhw/j3u/PtVcwRMjpKAWI06fx+uvYts2zR5nv6NE8OR4JKRB89RUyMvCPf+BHP/LsgebbcTkdpQC0Zg2mTMHvfoeqKuzY4dljg2E6amzhdJQUXb+OrCwUFaGoCAAmT/bgseaLsLVTFDwSkqK5czv+WPPtuJyOUpAx35HwfpFLqal1ja4SnhgZeWLcOMTHK46KqMPMF6HV4bDu399wBgfoV1mJvXsxYIDiqIg6zIRTOOOi2saTz+ZbiMzDhDsuI6TgYsIdVwQAGp+QaL6FyDxMGCGPhBRcTLjjMkIKLibccTkdpeBiwgh5JKTgYsIdlxFScDHhjsvpKAUXE0bIIyEFFxPu
uIyQgosJd1xORym4mDBCHgkpuJhwx2WEFFxMuOM2T47TUTIzi/HZEESkxXy/1AsAtbXYuBEff4yrV9GjB9LSMGtWw+ewEpmKCaej5eUYPx6zZuH6dQwZgpgYzJuHlBScOaM9MqKOMOF09PnnsXEj9uzBiBH1W0pKMG4cunXD4cN8Z0imY7YjYVkZ1q7FSy81FAggIQGLFuHIEezZozcyog4yW4QHD6KmBmlprtvHjQOAzz/3/4iIOslsEX7/PQD06+e6PT4e4eG4ds3/IyLqJLNFGB4OABUVrtvv3kVtLSIj/T8iok4yW4TGx40WFrpuLyyESOMPIyUyC7NFOGIE4uKwcaPr9vXrERqKadM0xkTUKWaLMDwcdjs2bcLChaiuBgAR/OtfWLIEWVlITFQeHpHnTHieEMDSpXj9dYSHIykJJSUoK0NWFlascP2b4kRmYM4IAZSVYc8elJbCZsPYsejfX3tARB1k2giJgoXZ3hMSBR1GSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIpY4REyhghkTJGSKSMERIp+x+k2xdjRUF+GwAAAABJRU5ErkJggg==",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "non_modified_lys = modify_amino_acid(aa_smiles[\"K\"])\n",
+ "print(\"Lysine SMILES:\", non_modified_lys)\n",
+ "Draw.MolToImage(Chem.MolFromSmiles(non_modified_lys))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Comparing the two structures above, we can see that the Biotin modification has been added to the N-terminus of Lysine.\n",
+ "\n",
+ "## Working with Post-Translational Modifications (PTMs)\n",
+ "\n",
+ "The `ptm_dict` contains various post-translational modifications. Let's examine one:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Phosphorylated Serine SMILES: O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe])\n"
+ ]
+ },
+ {
+ "data": {
+ "image/jpeg": "",
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(\"Phosphorylated Serine SMILES:\", ptm_dict[\"Phospho@S\"])\n",
+ "phos_ser_mol = Chem.MolFromSmiles(ptm_dict[\"Phospho@S\"])\n",
+ "Draw.MolToImage(phos_ser_mol)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can use the `modify_amino_acid` function with these PTMs as well:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Modified Phosphorylated Serine SMILES: [H]N(C(C)=O)[C@@H](COP(=O)(O)O)C(=O)O\n"
+ ]
+ },
+ {
+ "data": {
+ "image/jpeg": "",
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Modify phosphorylated Serine with Acetyl N-terminal modification\n",
+ "mod_phos_ser = modify_amino_acid(ptm_dict[\"Phospho@S\"], n_term_mod=\"Acetyl@Any_N-term\")\n",
+ "print(\"Modified Phosphorylated Serine SMILES:\", mod_phos_ser)\n",
+ "mod_phos_ser_mol = Chem.MolFromSmiles(mod_phos_ser)\n",
+ "Draw.MolToImage(mod_phos_ser_mol)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Conclusion\n",
+ "\n",
+ "In this tutorial, we've explored how to work with amino acids and post-translational modifications using SMILES notation and the `modify_amino_acid` function. We've seen how to:\n",
+ "\n",
+ "1. Represent amino acids using SMILES\n",
+ "2. Add N-terminal modifications\n",
+ "3. Work with post-translational modifications\n",
+ "4. Visualize molecular structures using RDKit\n",
+ "\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "alphabase",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/nbs_tests/constants/aa.ipynb b/nbs_tests/constants/aa.ipynb
index 9c352d60..02b112cd 100644
--- a/nbs_tests/constants/aa.ipynb
+++ b/nbs_tests/constants/aa.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -64,6 +64,7 @@
" | \n",
" aa | \n",
" formula | \n",
+ " smiles | \n",
" mass | \n",
" \n",
" \n",
@@ -72,156 +73,182 @@
" 65 | \n",
" A | \n",
" C(3)H(5)N(1)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn] | \n",
" 7.103711e+01 | \n",
" \n",
" \n",
" 66 | \n",
" B | \n",
" C(1000000) | \n",
+ " NaN | \n",
" 1.200000e+07 | \n",
"
\n",
" \n",
" 67 | \n",
" C | \n",
" C(3)H(5)N(1)O(1)S(1) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn] | \n",
" 1.030092e+02 | \n",
"
\n",
" \n",
" 68 | \n",
" D | \n",
" C(4)H(5)N(1)O(3)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn] | \n",
" 1.150269e+02 | \n",
"
\n",
" \n",
" 69 | \n",
" E | \n",
" C(5)H(7)N(1)O(3)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn] | \n",
" 1.290426e+02 | \n",
"
\n",
" \n",
" 70 | \n",
" F | \n",
" C(9)H(9)N(1)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn] | \n",
" 1.470684e+02 | \n",
"
\n",
" \n",
" 71 | \n",
" G | \n",
" C(2)H(3)N(1)O(1)S(0) | \n",
+ " N([Xe])([Xe])CC(=O)[Rn] | \n",
" 5.702146e+01 | \n",
"
\n",
" \n",
" 72 | \n",
" H | \n",
" C(6)H(7)N(3)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn] | \n",
" 1.370589e+02 | \n",
"
\n",
" \n",
" 73 | \n",
" I | \n",
" C(6)H(11)N(1)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn] | \n",
" 1.130841e+02 | \n",
"
\n",
" \n",
" 74 | \n",
" J | \n",
" C(6)H(11)N(1)O(1)S(0) | \n",
+ " NaN | \n",
" 1.130841e+02 | \n",
"
\n",
" \n",
" 75 | \n",
" K | \n",
" C(6)H(12)N(2)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn] | \n",
" 1.280950e+02 | \n",
"
\n",
" \n",
" 76 | \n",
" L | \n",
" C(6)H(11)N(1)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn] | \n",
" 1.130841e+02 | \n",
"
\n",
" \n",
" 77 | \n",
" M | \n",
" C(5)H(9)N(1)O(1)S(1) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn] | \n",
" 1.310405e+02 | \n",
"
\n",
" \n",
" 78 | \n",
" N | \n",
" C(4)H(6)N(2)O(2)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn] | \n",
" 1.140429e+02 | \n",
"
\n",
" \n",
" 79 | \n",
" O | \n",
" C(12)H(19)N(3)O(2) | \n",
+ " C[C@@H]1CC=N[C@H]1C(=O)NCCCC[C@@H](C(=O)[Rn])N... | \n",
" 2.371477e+02 | \n",
"
\n",
" \n",
" 80 | \n",
" P | \n",
" C(5)H(7)N(1)O(1)S(0) | \n",
+ " N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn] | \n",
" 9.705276e+01 | \n",
"
\n",
" \n",
" 81 | \n",
" Q | \n",
" C(5)H(8)N(2)O(2)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn] | \n",
" 1.280586e+02 | \n",
"
\n",
" \n",
" 82 | \n",
" R | \n",
" C(6)H(12)N(4)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn] | \n",
" 1.561011e+02 | \n",
"
\n",
" \n",
" 83 | \n",
" S | \n",
" C(3)H(5)N(1)O(2)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn] | \n",
" 8.703203e+01 | \n",
"
\n",
" \n",
" 84 | \n",
" T | \n",
" C(4)H(7)N(1)O(2)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn] | \n",
" 1.010477e+02 | \n",
"
\n",
" \n",
" 85 | \n",
" U | \n",
" C(3)H(5)N(1)O(1)Se(1) | \n",
+ " N([Xe])([Xe])[C@@]([H])(C[Se][H])C(=O)[Rn] | \n",
" 1.509536e+02 | \n",
"
\n",
" \n",
" 86 | \n",
" V | \n",
" C(5)H(9)N(1)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn] | \n",
" 9.906841e+01 | \n",
"
\n",
" \n",
" 87 | \n",
" W | \n",
" C(11)H(10)N(2)O(1)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C... | \n",
" 1.860793e+02 | \n",
"
\n",
" \n",
" 88 | \n",
" X | \n",
" C(1000000) | \n",
+ " NaN | \n",
" 1.200000e+07 | \n",
"
\n",
" \n",
" 89 | \n",
" Y | \n",
" C(9)H(9)N(1)O(2)S(0) | \n",
+ " N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn] | \n",
" 1.630633e+02 | \n",
"
\n",
" \n",
" 90 | \n",
" Z | \n",
" C(1000000) | \n",
+ " NaN | \n",
" 1.200000e+07 | \n",
"
\n",
" \n",
@@ -229,36 +256,64 @@
""
],
"text/plain": [
- " aa formula mass\n",
- "65 A C(3)H(5)N(1)O(1)S(0) 7.103711e+01\n",
- "66 B C(1000000) 1.200000e+07\n",
- "67 C C(3)H(5)N(1)O(1)S(1) 1.030092e+02\n",
- "68 D C(4)H(5)N(1)O(3)S(0) 1.150269e+02\n",
- "69 E C(5)H(7)N(1)O(3)S(0) 1.290426e+02\n",
- "70 F C(9)H(9)N(1)O(1)S(0) 1.470684e+02\n",
- "71 G C(2)H(3)N(1)O(1)S(0) 5.702146e+01\n",
- "72 H C(6)H(7)N(3)O(1)S(0) 1.370589e+02\n",
- "73 I C(6)H(11)N(1)O(1)S(0) 1.130841e+02\n",
- "74 J C(6)H(11)N(1)O(1)S(0) 1.130841e+02\n",
- "75 K C(6)H(12)N(2)O(1)S(0) 1.280950e+02\n",
- "76 L C(6)H(11)N(1)O(1)S(0) 1.130841e+02\n",
- "77 M C(5)H(9)N(1)O(1)S(1) 1.310405e+02\n",
- "78 N C(4)H(6)N(2)O(2)S(0) 1.140429e+02\n",
- "79 O C(12)H(19)N(3)O(2) 2.371477e+02\n",
- "80 P C(5)H(7)N(1)O(1)S(0) 9.705276e+01\n",
- "81 Q C(5)H(8)N(2)O(2)S(0) 1.280586e+02\n",
- "82 R C(6)H(12)N(4)O(1)S(0) 1.561011e+02\n",
- "83 S C(3)H(5)N(1)O(2)S(0) 8.703203e+01\n",
- "84 T C(4)H(7)N(1)O(2)S(0) 1.010477e+02\n",
- "85 U C(3)H(5)N(1)O(1)Se(1) 1.509536e+02\n",
- "86 V C(5)H(9)N(1)O(1)S(0) 9.906841e+01\n",
- "87 W C(11)H(10)N(2)O(1)S(0) 1.860793e+02\n",
- "88 X C(1000000) 1.200000e+07\n",
- "89 Y C(9)H(9)N(1)O(2)S(0) 1.630633e+02\n",
- "90 Z C(1000000) 1.200000e+07"
+ " aa formula \\\n",
+ "65 A C(3)H(5)N(1)O(1)S(0) \n",
+ "66 B C(1000000) \n",
+ "67 C C(3)H(5)N(1)O(1)S(1) \n",
+ "68 D C(4)H(5)N(1)O(3)S(0) \n",
+ "69 E C(5)H(7)N(1)O(3)S(0) \n",
+ "70 F C(9)H(9)N(1)O(1)S(0) \n",
+ "71 G C(2)H(3)N(1)O(1)S(0) \n",
+ "72 H C(6)H(7)N(3)O(1)S(0) \n",
+ "73 I C(6)H(11)N(1)O(1)S(0) \n",
+ "74 J C(6)H(11)N(1)O(1)S(0) \n",
+ "75 K C(6)H(12)N(2)O(1)S(0) \n",
+ "76 L C(6)H(11)N(1)O(1)S(0) \n",
+ "77 M C(5)H(9)N(1)O(1)S(1) \n",
+ "78 N C(4)H(6)N(2)O(2)S(0) \n",
+ "79 O C(12)H(19)N(3)O(2) \n",
+ "80 P C(5)H(7)N(1)O(1)S(0) \n",
+ "81 Q C(5)H(8)N(2)O(2)S(0) \n",
+ "82 R C(6)H(12)N(4)O(1)S(0) \n",
+ "83 S C(3)H(5)N(1)O(2)S(0) \n",
+ "84 T C(4)H(7)N(1)O(2)S(0) \n",
+ "85 U C(3)H(5)N(1)O(1)Se(1) \n",
+ "86 V C(5)H(9)N(1)O(1)S(0) \n",
+ "87 W C(11)H(10)N(2)O(1)S(0) \n",
+ "88 X C(1000000) \n",
+ "89 Y C(9)H(9)N(1)O(2)S(0) \n",
+ "90 Z C(1000000) \n",
+ "\n",
+ " smiles mass \n",
+ "65 N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn] 7.103711e+01 \n",
+ "66 NaN 1.200000e+07 \n",
+ "67 N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn] 1.030092e+02 \n",
+ "68 N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn] 1.150269e+02 \n",
+ "69 N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn] 1.290426e+02 \n",
+ "70 N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn] 1.470684e+02 \n",
+ "71 N([Xe])([Xe])CC(=O)[Rn] 5.702146e+01 \n",
+ "72 N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn] 1.370589e+02 \n",
+ "73 N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn] 1.130841e+02 \n",
+ "74 NaN 1.130841e+02 \n",
+ "75 N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn] 1.280950e+02 \n",
+ "76 N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn] 1.130841e+02 \n",
+ "77 N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn] 1.310405e+02 \n",
+ "78 N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn] 1.140429e+02 \n",
+ "79 C[C@@H]1CC=N[C@H]1C(=O)NCCCC[C@@H](C(=O)[Rn])N... 2.371477e+02 \n",
+ "80 N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn] 9.705276e+01 \n",
+ "81 N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn] 1.280586e+02 \n",
+ "82 N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn] 1.561011e+02 \n",
+ "83 N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn] 8.703203e+01 \n",
+ "84 N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn] 1.010477e+02 \n",
+ "85 N([Xe])([Xe])[C@@]([H])(C[Se][H])C(=O)[Rn] 1.509536e+02 \n",
+ "86 N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn] 9.906841e+01 \n",
+ "87 N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C... 1.860793e+02 \n",
+ "88 NaN 1.200000e+07 \n",
+ "89 N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn] 1.630633e+02 \n",
+ "90 NaN 1.200000e+07 "
]
},
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -293,7 +348,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -314,7 +369,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -339,7 +394,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -359,7 +414,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -382,7 +437,7 @@
" 453.26996726, 396.24850354, 259.18959168, 146.1055277 ]])}"
]
},
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -404,7 +459,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -424,7 +479,7 @@
" 1.28094963e+02]])"
]
},
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -436,7 +491,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -447,16 +502,16 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"replace_atoms({'N':'15N'})\n",
- "assert '15N' in AA_Formula['A']\n",
- "assert '15N' in AA_Formula['K']\n",
+ "assert '15N' in aa_formula.loc['A'][\"formula\"]\n",
+ "assert '15N' in aa_formula.loc['K'][\"formula\"]\n",
"replace_atoms({\"15N\":'N'})\n",
- "assert '15N' not in AA_Formula['A']\n",
- "assert '15N' not in AA_Formula['K']"
+ "assert '15N' not in aa_formula.loc['A'][\"formula\"]\n",
+ "assert '15N' not in aa_formula.loc['K'][\"formula\"]"
]
},
{
@@ -472,6 +527,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
}
},
"nbformat": 4,
diff --git a/nbs_tests/constants/modification.ipynb b/nbs_tests/constants/modification.ipynb
index 355c919a..09e41e0e 100644
--- a/nbs_tests/constants/modification.ipynb
+++ b/nbs_tests/constants/modification.ipynb
@@ -1481,11 +1481,12 @@
"outputs": [],
"source": [
"#| hide\n",
+ "prev_value = (modification.MOD_DF.classification=='User-added').sum()\n",
"add_new_modifications([\n",
" (\"Hello@S\",\"H(2)\"),\n",
" (\"World@S\",\"O(10)\",\"O(3)\")\n",
"])\n",
- "assert (modification.MOD_DF.classification=='User-added').sum()==2\n",
+ "assert (modification.MOD_DF.classification=='User-added').sum() - prev_value == 2\n",
"assert 'Hello@S' in modification.MOD_DF.mod_name\n",
"assert 'World@S' in modification.MOD_DF.mod_name\n",
"assert modification.MOD_DF.loc['World@S','modloss'] > 0\n",
@@ -1504,7 +1505,7 @@
" \"Hi@S\":{'composition':\"H(2)\"},\n",
" \"AlphaX@S\":{'composition':\"O(10)\",'modloss_composition':\"O(3)\"}\n",
"})\n",
- "assert (modification.MOD_DF.classification=='User-added').sum()==4\n",
+ "assert (modification.MOD_DF.classification=='User-added').sum() - prev_value == 4\n",
"assert 'Hi@S' in modification.MOD_DF.mod_name\n",
"assert 'Hi@S' in modification.MOD_DF.index\n",
"assert 'AlphaX@S' in modification.MOD_DF.mod_name\n",
diff --git a/requirements.txt b/requirements.txt
index 8299761f..84de8990 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ dask_expr
pyahocorasick
pyteomics
lxml
+rdkit==2024.3.3
diff --git a/tests/run_tests.sh b/tests/run_tests.sh
index f39c67c6..8a159d9b 100755
--- a/tests/run_tests.sh
+++ b/tests/run_tests.sh
@@ -13,3 +13,4 @@ TUTORIAL_NBS=$(find ../docs/tutorials -name "*.ipynb")
ALL_NBS=$(echo $DOCS_NBS$'\n'$TEST_NBS$'\n'$TUTORIAL_NBS)
python -m pytest --nbmake $(echo $ALL_NBS)
+python -m pytest
diff --git a/tests/test_smiles.py b/tests/test_smiles.py
new file mode 100644
index 00000000..5d611ca1
--- /dev/null
+++ b/tests/test_smiles.py
@@ -0,0 +1,198 @@
+from collections import defaultdict
+
+import pytest
+from rdkit import Chem
+
+from alphabase.constants.atom import ChemicalCompositonFormula
+from alphabase.smiles.smiles import AminoAcidModifier
+
+# Shared test setup: a single AminoAcidModifier is created at import time and
+# the attributes the tests below rely on are bound to module-level names.
+aa_modifier = AminoAcidModifier()
+# Called below as modify_amino_acid(smiles, n_term_mod=..., c_term_mod=...).
+modify_amino_acid = aa_modifier.modify_amino_acid
+# Indexed below by one-letter residue code ("A", "K"); .items() is parametrized over.
+aa_smiles = aa_modifier.aa_smiles
+# Their .keys() drive the parametrized N-/C-terminal modification tests.
+n_term_modifications = aa_modifier.n_term_modifications
+c_term_modifications = aa_modifier.c_term_modifications
+# Indexed below by modification name, e.g. "Acetyl@K" and "Phospho@S"-style keys.
+ptm_dict = aa_modifier.ptm_dict
+
+
+@pytest.fixture
+def alanine_smiles():
+ return aa_smiles["A"]
+
+
+@pytest.fixture
+def lysine_smiles():
+ return aa_smiles["K"]
+
+
+def test_modify_amino_acid_no_modification(alanine_smiles):
+ result = modify_amino_acid(alanine_smiles)
+ expected = "N[C@@H](C)C(=O)O"
+ assert Chem.MolToSmiles(Chem.MolFromSmiles(result)) == Chem.MolToSmiles(
+ Chem.MolFromSmiles(expected)
+ )
+
+
+def test_modify_amino_acid_n_term_modification(alanine_smiles):
+ result = modify_amino_acid(alanine_smiles, n_term_mod="Acetyl@Any_N-term")
+ expected = "CC(=O)N[C@@H](C)C(=O)O"
+ assert Chem.MolToSmiles(Chem.MolFromSmiles(result)) == Chem.MolToSmiles(
+ Chem.MolFromSmiles(expected)
+ )
+
+
+def test_modify_amino_acid_c_term_modification(alanine_smiles):
+ result = modify_amino_acid(alanine_smiles, c_term_mod="Methyl@Any_C-term")
+ expected = "N[C@@H](C)C(=O)OC"
+ assert Chem.MolToSmiles(Chem.MolFromSmiles(result)) == Chem.MolToSmiles(
+ Chem.MolFromSmiles(expected)
+ )
+
+
+def test_modify_amino_acid_both_modifications(alanine_smiles):
+ result = modify_amino_acid(
+ alanine_smiles, n_term_mod="Acetyl@Any_N-term", c_term_mod="Methyl@Any_C-term"
+ )
+ expected = "CC(=O)N[C@@H](C)C(=O)OC"
+ assert Chem.MolToSmiles(Chem.MolFromSmiles(result)) == Chem.MolToSmiles(
+ Chem.MolFromSmiles(expected)
+ )
+
+
+def test_modify_amino_acid_with_ptm(lysine_smiles):
+ lysine_with_ptm = ptm_dict["Acetyl@K"]
+ result = modify_amino_acid(lysine_with_ptm)
+ expected = "CC(=O)NCCCC[C@H](N)C(=O)O"
+ assert Chem.MolToSmiles(Chem.MolFromSmiles(result)) == Chem.MolToSmiles(
+ Chem.MolFromSmiles(expected)
+ )
+
+
+def test_modify_amino_acid_with_ptm_and_n_term_mod(lysine_smiles):
+ lysine_with_ptm = ptm_dict["Acetyl@K"]
+ result = modify_amino_acid(lysine_with_ptm, n_term_mod="mTRAQ@Any_N-term")
+ expected = "CC(=O)NCCCC[C@H](NC(=O)CN1CCN(C)CC1)C(=O)O"
+ assert Chem.MolToSmiles(Chem.MolFromSmiles(result)) == Chem.MolToSmiles(
+ Chem.MolFromSmiles(expected)
+ )
+
+
+@pytest.mark.parametrize("n_term_mod", n_term_modifications.keys())
+def test_all_n_term_modifications(alanine_smiles, n_term_mod):
+ result = modify_amino_acid(alanine_smiles, n_term_mod=n_term_mod)
+ assert Chem.MolFromSmiles(result) is not None
+
+
+@pytest.mark.parametrize("c_term_mod", c_term_modifications.keys())
+def test_all_c_term_modifications(alanine_smiles, c_term_mod):
+ result = modify_amino_acid(alanine_smiles, c_term_mod=c_term_mod)
+ assert Chem.MolFromSmiles(result) is not None
+
+
+def test_invalid_n_term_modification(alanine_smiles):
+ with pytest.raises(ValueError, match="Unrecognized N-terminal modification"):
+ modify_amino_acid(alanine_smiles, n_term_mod="InvalidMod")
+
+
+def test_invalid_c_term_modification(alanine_smiles):
+ with pytest.raises(ValueError, match="Unrecognized C-terminal modification"):
+ modify_amino_acid(alanine_smiles, c_term_mod="InvalidMod")
+
+
+def test_invalid_amino_acid_smiles():
+ with pytest.raises(ValueError):
+ modify_amino_acid("InvalidSMILES")
+
+
+def test_dimethyl_n_term_modification(alanine_smiles):
+ result = modify_amino_acid(alanine_smiles, n_term_mod="Dimethyl@Any_N-term")
+ expected = "CN(C)[C@@H](C)C(=O)O"
+ assert Chem.MolToSmiles(Chem.MolFromSmiles(result)) == Chem.MolToSmiles(
+ Chem.MolFromSmiles(expected)
+ )
+
+
+@pytest.mark.parametrize("aa, smiles", aa_smiles.items())
+def test_all_amino_acids(aa, smiles):
+ result = modify_amino_acid(smiles)
+ assert Chem.MolFromSmiles(result) is not None
+
+
+@pytest.mark.parametrize("ptm, smiles", ptm_dict.items())
+def test_all_ptms(ptm, smiles):
+ result = modify_amino_acid(smiles)
+ assert Chem.MolFromSmiles(result) is not None
+
+
+@pytest.fixture
+def water_formula():
+ return ChemicalCompositonFormula("H(2)O(1)")
+
+
+@pytest.fixture
+def methane_formula():
+ return ChemicalCompositonFormula("C(1)H(4)")
+
+
+def test_init():
+ formula = ChemicalCompositonFormula("C(6)H(12)O(6)")
+ expected = defaultdict(int, {"C": 6, "H": 12, "O": 6})
+ assert formula.elements == expected
+
+
+def test_init_with_isotopes():
+ formula = ChemicalCompositonFormula("13C(1)C(5)H(12)O(6)")
+ expected = defaultdict(int, {"13C": 1, "C": 5, "H": 12, "O": 6})
+ assert formula.elements == expected
+
+
+def test_from_smiles():
+ formula = ChemicalCompositonFormula.from_smiles("CCO")
+ expected = defaultdict(int, {"C": 2, "H": 6, "O": 1})
+ assert formula.elements == expected
+
+
+def test_str_representation(water_formula):
+ assert str(water_formula) == "H(2)O(1)"
+
+
+def test_repr_representation(water_formula):
+ assert repr(water_formula) == "ChemicalCompositonFormula('H(2)O(1)')"
+
+
+def test_addition(water_formula, methane_formula):
+ result = water_formula + methane_formula
+ expected = ChemicalCompositonFormula("C(1)H(6)O(1)")
+ assert result.elements == expected.elements
+
+
+def test_subtraction(water_formula, methane_formula):
+ result = methane_formula - water_formula
+ expected = ChemicalCompositonFormula("C(1)H(2)O(-1)")
+ assert result.elements == expected.elements
+
+
+def test_parse_formula_with_parentheses():
+ formula = ChemicalCompositonFormula("C(6)H(12)O(6)")
+ expected = defaultdict(int, {"C": 6, "H": 12, "O": 6})
+ assert formula.elements == expected
+
+
+def test_parse_rdkit_formula():
+ formula = ChemicalCompositonFormula._from_rdkit_formula("[13C]CH3OH")
+ expected = defaultdict(int, {"13C": 1, "C": 1, "H": 4, "O": 1})
+ assert formula.elements == expected
+
+
+def test_zero_count_elements():
+ formula = ChemicalCompositonFormula("C(1)H(0)O(2)")
+ assert str(formula) == "C(1)O(2)"
+
+
+def test_error_handling_invalid_rdkit_mol():
+ with pytest.raises(ValueError):
+ ChemicalCompositonFormula.from_smiles("InvalidSMILES")
+
+
+def test_error_unknown_atom():
+ with pytest.raises(ValueError):
+ ChemicalCompositonFormula("C(6)H(12)Atom(6)")