Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dynamic generation of SMILES for PTMs #199

Merged
merged 8 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 24 additions & 18 deletions alphabase/constants/aa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
parse_formula,
reset_elements,
)
from alphabase.yaml_utils import load_yaml

# We use all 128 ASCII code to represent amino acids for flexible extensions in the future.
# The amino acid masses are stored in 128-lengh array :py:data:`AA_ASCII_MASS`.
# If an ASCII code is not in `AA_Formula`, the mass will be set as a large value to disable MS search.
AA_Formula: dict = load_yaml(os.path.join(CONST_FILE_FOLDER, "amino_acid.yaml"))
AA_Formula: pd.DataFrame = pd.read_csv(
boopthesnoot marked this conversation as resolved.
Show resolved Hide resolved
os.path.join(CONST_FILE_FOLDER, "amino_acid.tsv"), sep="\t", index_col=0
)

#: AA mass array with ASCII code, mass of 'A' is AA_ASCII_MASS[ord('A')]
AA_ASCII_MASS: np.ndarray = np.ones(128) * 1e8

Expand All @@ -28,20 +30,23 @@


def replace_atoms(atom_replace_dict: typing.Dict):
for aa, formula in list(AA_Formula.items()):
for aa, row in AA_Formula.iterrows():
formula = row["formula"]
atom_comp = dict(parse_formula(formula))
for atom_from, atom_to in atom_replace_dict.items():
if atom_from in atom_comp:
atom_comp[atom_to] = atom_comp[atom_from]
del atom_comp[atom_from]
AA_Formula[aa] = "".join([f"{atom}({n})" for atom, n in atom_comp.items()])
AA_Formula.loc[aa, "formula"] = "".join(
[f"{atom}({n})" for atom, n in atom_comp.items()]
)


def reset_AA_mass() -> np.ndarray:
"""AA mass in np.array with shape (128,)"""
global AA_ASCII_MASS
for aa, chem in AA_Formula.items():
AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(chem)
for aa, row in AA_Formula.iterrows():
AA_ASCII_MASS[ord(aa)] = calc_mass_from_formula(row["formula"])
return AA_ASCII_MASS


Expand All @@ -51,15 +56,14 @@ def reset_AA_mass() -> np.ndarray:
def reset_AA_df():
global AA_DF
AA_DF = pd.DataFrame()
AA_DF["aa"] = [chr(aa) for aa in range(len(AA_ASCII_MASS))]
AA_DF["formula"] = [""] * len(AA_ASCII_MASS)
aa_idxes = []
formulas = []
for aa, formula in AA_Formula.items():
aa_idxes.append(ord(aa))
formulas.append(formula)
AA_DF.loc[aa_idxes, "formula"] = formulas
num_rows = len(AA_ASCII_MASS)
AA_DF["aa"] = [chr(aa) for aa in range(num_rows)]
AA_DF["formula"] = [""] * num_rows
AA_DF["smiles"] = [""] * num_rows
AA_DF["mass"] = AA_ASCII_MASS
for aa, row in AA_Formula.iterrows():
AA_DF.loc[ord(aa), "formula"] = row["formula"]
AA_DF.loc[ord(aa), "smiles"] = row["smiles"]
return AA_DF


Expand All @@ -69,8 +73,8 @@ def reset_AA_df():
def reset_AA_Composition():
global AA_Composition
AA_Composition = {}
for aa, formula, _mass in AA_DF.values:
AA_Composition[aa] = dict(parse_formula(formula))
for aa, row in AA_Formula.iterrows():
AA_Composition[aa] = dict(parse_formula(row["formula"]))
return AA_Composition


Expand All @@ -85,12 +89,14 @@ def reset_AA_atoms(atom_replace_dict: typing.Dict = {}):
reset_AA_Composition()


def update_an_AA(aa: str, formula: str):
def update_an_AA(aa: str, formula: str, smiles: str = ""):
aa_idx = ord(aa)
AA_Formula.loc[aa, "formula"] = formula
AA_Formula.loc[aa, "smiles"] = smiles
AA_DF.loc[aa_idx, "formula"] = formula
AA_DF.loc[aa_idx, "smiles"] = smiles
AA_ASCII_MASS[aa_idx] = calc_mass_from_formula(formula)
AA_DF.loc[aa_idx, "mass"] = AA_ASCII_MASS[aa_idx]
AA_Formula[aa] = formula
AA_Composition[aa] = dict(parse_formula(formula))


Expand Down
196 changes: 196 additions & 0 deletions alphabase/constants/atom.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import os
import re
import typing
from collections import defaultdict

import numba
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

from alphabase.constants._const import CONST_FILE_FOLDER, common_const_dict
from alphabase.yaml_utils import load_yaml
Expand Down Expand Up @@ -200,3 +204,195 @@ def calc_mass_from_formula(formula: str):
mass of the formula
"""
return np.sum([CHEM_MONO_MASS[elem] * n for elem, n in parse_formula(formula)])


class ChemicalCompositonFormula:
boopthesnoot marked this conversation as resolved.
Show resolved Hide resolved
"""
Initialize the ChemicalCompositonFormula with a given formula.

Parameters
----------
formula : str
The chemical formula as a string.

Returns
-------
None
"""

def __init__(self, formula=None):
self.elements = (
defaultdict(int) if formula is None else self._parse_formula(formula)
boopthesnoot marked this conversation as resolved.
Show resolved Hide resolved
)

@classmethod
def from_smiles(cls, smiles: str) -> "ChemicalCompositonFormula":
"""
Create a ChemicalCompositonFormula instance from a SMILES string.

Parameters
----------
smiles : str
The SMILES representation of the molecule.

Returns
-------
ChemicalCompositonFormula
An instance of the class based on the SMILES string.

Raises
------
ValueError
If the SMILES string is invalid and can't be converted to an RDKit molecule.
"""
mol = Chem.MolFromSmiles(smiles)
if not mol:
raise ValueError(f"Invalid RDKit molecule: {smiles}")
formula = rdMolDescriptors.CalcMolFormula(
mol, separateIsotopes=True, abbreviateHIsotopes=False
)
formula = formula.replace("[1H]", "H")
boopthesnoot marked this conversation as resolved.
Show resolved Hide resolved
return cls._from_rdkit_formula(formula)

@classmethod
def _from_rdkit_formula(cls, formula: str) -> "ChemicalCompositonFormula":
"""
Create a ChemicalCompositonFormula instance from an RDKit formula.

Parameters
----------
formula : str
The chemical formula as generated by RDKit.

Returns
-------
ChemicalCompositonFormula
An instance of the class based on the RDKit formula.
"""
instance = cls.__new__(cls)
instance.elements = instance._parse_rdkit_formula(formula)
return instance

def _parse_formula(self, formula) -> dict:
"""
Parse a chemical formula string into a dictionary of elements and their counts.

Parameters
----------
formula : str
The chemical formula to parse.

Returns
-------
dict
A dictionary with elements as keys and their counts as values.
"""
# Expected pattern: (\d+)?: optional isotope number, ([A-Z][a-z]*): element symbol, (?:\(([-]?\d+)\))?: optional count in parentheses
# Example: 13C(2)H(3)O(-1) -> [('13', 'C', '2'), ('', 'H', '3'), ('', 'O', '-1')]
pattern = r"(\d+)?([A-Z][a-z]*)(?:\(([-]?\d+)\))?"
boopthesnoot marked this conversation as resolved.
Show resolved Hide resolved
matches = re.findall(pattern, formula)
element_counts = defaultdict(int)

for isotope, element, count in matches:
if isotope:
element = f"{isotope}{element}"
count = int(count) if count else 1
element_counts[element] += count

return element_counts

def _parse_rdkit_formula(self, formula: str) -> dict:
"""
Parse an RDKit-generated formula string into a dictionary of elements and their counts.

Parameters
----------
formula : str
The RDKit-generated chemical formula to parse.

Returns
-------
dict
A dictionary with elements as keys and their counts as values.
"""
# Expected pattern: (\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*)): isotope in square brackets or element symbol, followed by (\d*): optional count
# Example: [13C]C2H5OH -> [('[13C]', '13', 'C', '', ''), ('C', '', '', 'C', '2'), ('H', '', '', 'H', '5'), ('O', '', '', 'O', ''), ('H', '', '', 'H', '')]
pattern = r"(\[(\d+)([A-Z][a-z]*)\]|([A-Z][a-z]*))(\d*)"
boopthesnoot marked this conversation as resolved.
Show resolved Hide resolved
matches = re.findall(pattern, formula)
element_counts = defaultdict(int)

for match in matches:
count = int(match[4]) if match[4] else 1
if match[1]: # noqa: SIM108
# Isotope, see 0th element in the example above
element = f"{match[1]}{match[2]}"
else:
# Regular element, see the rest in the example above
element = match[3]
element_counts[element] += count

return element_counts

def __str__(self):
"""
Return a string representation of the chemical formula.

Returns
-------
str
The chemical formula as a string.
"""
return "".join(
f"{element}({count})"
for element, count in sorted(self.elements.items())
if count != 0
)

def __repr__(self):
"""
Return a string representation of the ChemicalCompositonFormula instance.

Returns
-------
str
A string representation of the instance.
"""
return f"ChemicalCompositonFormula('{self.__str__()}')"

def __add__(self, other):
"""
Add two ChemicalCompositonFormula instances.

Parameters
----------
other : ChemicalCompositonFormula
The other instance to add.

Returns
-------
ChemicalCompositonFormula
A new instance representing the sum of the two formulas.
"""
result = ChemicalCompositonFormula()
for element in set(self.elements.keys()) | set(other.elements.keys()):
result.elements[element] = self.elements[element] + other.elements[element]
return result

def __sub__(self, other):
"""
Subtract one ChemicalCompositonFormula instance from another.

Parameters
----------
other : ChemicalCompositonFormula
The instance to subtract.

Returns
-------
ChemicalCompositonFormula
A new instance representing the difference of the two formulas.
"""
result = ChemicalCompositonFormula()
for element in set(self.elements.keys()) | set(other.elements.keys()):
result.elements[element] = self.elements[element] - other.elements[element]
return result
30 changes: 30 additions & 0 deletions alphabase/constants/const_files/amino_acid.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
formula smiles
A C(3)H(5)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(C)C(=O)[Rn]
B C(1000000)
C C(3)H(5)N(1)O(1)S(1) N([Xe])([Xe])[C@@]([H])(CS)C(=O)[Rn]
D C(4)H(5)N(1)O(3)S(0) N([Xe])([Xe])[C@@]([H])(CC(=O)O)C(=O)[Rn]
E C(5)H(7)N(1)O(3)S(0) N([Xe])([Xe])[C@@]([H])(CCC(=O)O)C(=O)[Rn]
F C(9)H(9)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(Cc1ccccc1)C(=O)[Rn]
G C(2)H(3)N(1)O(1)S(0) N([Xe])([Xe])CC(=O)[Rn]
H C(6)H(7)N(3)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC1=CN=C-N1)C(=O)[Rn]
I C(6)H(11)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])([C@]([H])(CC)C)C(=O)[Rn]
J C(6)H(11)N(1)O(1)S(0)
K C(6)H(12)N(2)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CCCCN)C(=O)[Rn]
L C(6)H(11)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC(C)C)C(=O)[Rn]
M C(5)H(9)N(1)O(1)S(1) N([Xe])([Xe])[C@@]([H])(CCSC)C(=O)[Rn]
N C(4)H(6)N(2)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CC(=O)N)C(=O)[Rn]
O C(12)H(19)N(3)O(2) C[C@@H]1CC=N[C@H]1C(=O)NCCCC[C@@H](C(=O)[Rn])N([Xe])([Xe])
P C(5)H(7)N(1)O(1)S(0) N1([Xe])[C@@]([H])(CCC1)C(=O)[Rn]
Q C(5)H(8)N(2)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CCC(=O)N)C(=O)[Rn]
R C(6)H(12)N(4)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CCCNC(=N)N)C(=O)[Rn]
S C(3)H(5)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])(CO)C(=O)[Rn]
T C(4)H(7)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])([C@]([H])(O)C)C(=O)[Rn]
U C(3)H(5)N(1)O(1)Se(1) N([Xe])([Xe])[C@@]([H])(C[Se][H])C(=O)[Rn]
V C(5)H(9)N(1)O(1)S(0) N([Xe])([Xe])[C@@]([H])(C(C)C)C(=O)[Rn]
W C(11)H(10)N(2)O(1)S(0) N([Xe])([Xe])[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)[Rn]
X C(1000000)
Y C(9)H(9)N(1)O(2)S(0) N([Xe])([Xe])[C@@]([H])(Cc1ccc(O)cc1)C(=O)[Rn]
Z C(1000000)
s C(3)H(5)N(1)O(2)S(0)
t C(4)H(8)N(1)O(5)P(1)
y C(9)H(9)N(1)O(2)S(0)
40 changes: 0 additions & 40 deletions alphabase/constants/const_files/amino_acid.yaml

This file was deleted.

Loading
Loading