Skip to content

Commit c1be34a

Browse files
committed
latest results
1 parent 2d43dcd commit c1be34a

File tree

404 files changed

+33549
-370
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

404 files changed

+33549
-370
lines changed

Eval-LLM.ipynb

Lines changed: 2003 additions & 318 deletions
Large diffs are not rendered by default.
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""
2+
Classifies: CHEBI:138141 17alpha-hydroxy-C21-steroid
3+
"""
4+
from rdkit import Chem
5+
from rdkit.Chem import AllChem
6+
from rdkit.Chem import Descriptors
7+
from rdkit.Chem import rdMolDescriptors
8+
9+
def is_17alpha_hydroxy_C21_steroid(smiles: str):
    """
    Determines if a molecule is a 17alpha-hydroxy-C21-steroid.

    A 17alpha-hydroxy-C21-steroid is defined as:
    Any C21-steroid carrying a hydroxy substituent at the 17alpha-position.
    Note that individual examples may have ring substituents at other positions
    and/or contain double bonds, aromatic A-rings, expanded/contracted rings etc.,
    so the formula and mass may vary from that given for the generic structure.

    The check is purely structural:
      1. the molecule must contain the fused 6-6-6-5 all-carbon steroid nucleus
         (bond orders are ignored, so unsaturated and aromatic rings match);
      2. C17 must carry a side chain of at least two carbons (C20-C21), which is
         what is used here as the marker of a C21 (pregnane-type) skeleton;
      3. C17 must also carry a free hydroxy group.

    Limitations: the alpha/beta orientation of the hydroxy group is NOT
    verified, and ring-expanded/contracted analogues are not recognised.

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is a 17alpha-hydroxy-C21-steroid, False otherwise
        str: Reason for classification
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Gonane skeleton written so that the query atoms appear in this order:
    # C1 C2 C3 C10 C5 C4 C6 C7 C8 C9 C11 C12 C13 C17 C16 C15 C14.
    # "~" matches any bond type and "[#6]" any carbon (aliphatic or aromatic).
    nucleus = Chem.MolFromSmarts(
        "[#6]1~[#6]~[#6]~[#6]2~[#6](~[#6]~1)~[#6]~[#6]~[#6]1~[#6]~2"
        "~[#6]~[#6]~[#6]2~[#6]~[#6]~[#6]~[#6]~1~2"
    )
    c17_slot = 13  # position of C17 within each match tuple (see order above)

    matches = mol.GetSubstructMatches(nucleus)
    if not matches:
        return False, "No steroid nucleus (fused 6-6-6-5 carbon rings) found"

    has_side_chain = False
    for match in matches:
        core = set(match)
        c17 = mol.GetAtomWithIdx(match[c17_slot])
        c17_idx = c17.GetIdx()
        # Only atoms hanging off C17 that are not part of the ring system.
        substituents = [
            nbr for nbr in c17.GetNeighbors() if nbr.GetIdx() not in core
        ]

        # C20 is a carbon on C17; C21 is any further carbon bonded to C20.
        two_carbon_chain = any(
            nbr.GetAtomicNum() == 6
            and any(
                far.GetAtomicNum() == 6 and far.GetIdx() != c17_idx
                for far in nbr.GetNeighbors()
            )
            for nbr in substituents
        )
        if not two_carbon_chain:
            continue
        has_side_chain = True

        # A free hydroxy: oxygen with one heavy-atom neighbour and one hydrogen.
        hydroxy = any(
            nbr.GetAtomicNum() == 8
            and nbr.GetDegree() == 1
            and nbr.GetTotalNumHs() == 1
            for nbr in substituents
        )
        if hydroxy:
            return True, "17alpha-hydroxy-C21-steroid"

    if not has_side_chain:
        return False, "No two-carbon side chain at position 17 (not a C21-steroid)"
    return False, "No hydroxy group at position 17alpha"
57+
58+
59+
__metadata__ = { 'chemical_class': { 'id': 'CHEBI:138141',
60+
'name': '17alpha-hydroxy-C21-steroid',
61+
'definition': 'Any C21-steroid carrying a hydroxy '
62+
'substituent at the 17alpha-position. '
63+
'Note that individual examples may '
64+
'have ring substituents at other '
65+
'positions and/or contain double '
66+
'bonds, aromatic A-rings, '
67+
'expanded/contracted rings etc., so '
68+
'the formula and mass may vary from '
69+
'that given for the generic structure.',
70+
'parents': ['CHEBI:35342', 'CHEBI:61313']},
71+
'config': { 'llm_model_name': 'claude-3-sonnet',
72+
'f1_threshold': 0.8,
73+
'max_attempts': 5,
74+
'max_negative_to_test': None,
75+
'max_positive_in_prompt': 50,
76+
'max_negative_in_prompt': 20,
77+
'max_instances_in_prompt': 100,
78+
'test_proportion': 0.1},
79+
'message': None,
80+
'attempt': 0,
81+
'success': True,
82+
'best': True,
83+
'error': '',
84+
'stdout': None,
85+
'num_true_positives': 0,
86+
'num_false_positives': 0,
87+
'num_true_negatives': 183925,
88+
'num_false_negatives': 1,
89+
'num_negatives': None,
90+
'precision': 0.0,
91+
'recall': 0.0,
92+
'f1': 0.0,
93+
'accuracy': 0.9999945630307842}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
"""
2+
Classifies: CHEBI:32167 1,2-diacyl-3-(alpha-D-6-sulfoquinovosyl)-sn-glycerol
3+
"""
4+
from rdkit import Chem
5+
from rdkit.Chem import Descriptors
6+
from rdkit.Chem import rdMolDescriptors
7+
8+
def is_1_2_diacyl_3__alpha_D_6_sulfoquinovosyl__sn_glycerol(smiles: str):
    """
    Determines if a molecule is a 1,2-diacyl-3-(alpha-D-6-sulfoquinovosyl)-sn-glycerol.

    A glycosylglycerol derivative in which the glycosyl moiety is alpha-D-6-sulfoquinovosyl attached at O-3,
    with O-1 and O-2 both acylated.

    The check is a sequence of increasingly specific substructure queries:
      1. a 6-deoxy-6-sulfo-hexopyranose ring (sulfonic acid or sulfonate)
         bearing a glycosidic oxygen on its anomeric carbon;
      2. that glycosidic oxygen sitting on a terminal carbon of a glycerol
         (C-C-C with one oxygen on every carbon);
      3. the two remaining glycerol oxygens each esterified by a carboxylic
         acyl group.

    Limitations: stereochemistry is NOT verified, so neither the alpha-D
    configuration of the sugar nor the sn- configuration of the glycerol is
    checked (without stereo, O-1 and O-3 of glycerol are indistinguishable).

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if the molecule is a 1,2-diacyl-3-(alpha-D-6-sulfoquinovosyl)-sn-glycerol, False otherwise
        str: Reason for classification
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Pyranose ring starting at the anomeric carbon: C1, ring O, C5 (carrying
    # CH2-SO3H / CH2-SO3-), then C4, C3 and C2 each with a free hydroxy.
    sugar = (
        "[CX4]1[OX2][CX4]([CH2X4][SX4](=O)(=O)[OX2H1,OX1-])"
        "[CX4]([OX2H1])[CX4]([OX2H1])[CX4]1[OX2H1]"
    )

    glycoside = Chem.MolFromSmarts("[OX2]" + sugar)
    if not mol.HasSubstructMatch(glycoside):
        return False, "Molecule does not contain a sulfoquinovosyl moiety"

    # Glycerol with any oxygen on C1 and C2 and the sugar on the C3 oxygen.
    glycosylglycerol = Chem.MolFromSmarts(
        "[CH2X4]([OX2])[CH1X4]([OX2])[CH2X4][OX2]" + sugar
    )
    if not mol.HasSubstructMatch(glycosylglycerol):
        return False, "Sulfoquinovosyl moiety is not attached to the glycerol backbone at O-3"

    # Same skeleton, now demanding a carboxylic ester on both O-1 and O-2.
    diacyl = Chem.MolFromSmarts(
        "[CH2X4]([OX2][CX3](=[OX1])[#6])[CH1X4]([OX2][CX3](=[OX1])[#6])"
        "[CH2X4][OX2]" + sugar
    )
    if not mol.HasSubstructMatch(diacyl):
        return False, "O-1 and O-2 are not both acylated"

    return True, "Molecule is a 1,2-diacyl-3-(alpha-D-6-sulfoquinovosyl)-sn-glycerol"
51+
52+
53+
__metadata__ = { 'chemical_class': { 'id': 'CHEBI:32167',
54+
'name': '1,2-diacyl-3-(alpha-D-6-sulfoquinovosyl)-sn-glycerol',
55+
'definition': 'A glycosylglycerol derivative in '
56+
'which the glycosyl moiety is '
57+
'alpha-D-6-sulfoquinovosyl attached at '
58+
'O-3, with O-1 and O-2 both acylated.',
59+
'parents': ['CHEBI:63427']},
60+
'config': { 'llm_model_name': 'claude-3-sonnet',
61+
'f1_threshold': 0.0,
62+
'max_attempts': 5,
63+
'max_negative_to_test': None,
64+
'max_positive_in_prompt': 50,
65+
'max_negative_in_prompt': 20,
66+
'max_instances_in_prompt': 100,
67+
'test_proportion': 0.1},
68+
'message': None,
69+
'attempt': 0,
70+
'success': True,
71+
'best': True,
72+
'error': '',
73+
'stdout': None,
74+
'num_true_positives': 0,
75+
'num_false_positives': 13,
76+
'num_true_negatives': 183921,
77+
'num_false_negatives': 1,
78+
'num_negatives': None,
79+
'precision': 0.0,
80+
'recall': 0.0,
81+
'f1': 0,
82+
'accuracy': 0.9999238861554354}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""
2+
Classifies: CHEBI:16110 1,2-diacyl-sn-glycero-3-phosphocholine(1+)
3+
"""
4+
from rdkit import Chem
5+
from rdkit.Chem import AllChem
6+
from rdkit.Chem import Descriptors
7+
from rdkit.Chem import rdMolDescriptors
8+
9+
def is_1_2_diacyl_sn_glycero_3_phosphocholine_1__(smiles: str):
    """
    Determines if a molecule is a 1,2-diacyl-sn-glycero-3-phosphocholine(1+).

    The check is a sequence of substructure queries:
      1. a phosphorus atom is present;
      2. a choline fragment (O-CH2-CH2-N+(CH3)3) is present;
      3. a glycerol (C-C-C with one oxygen on every carbon) carries a
         phosphocholine group on a terminal oxygen;
      4. the two remaining glycerol oxygens are both esterified by a
         carboxylic acyl group.

    Everything is done with SMARTS matching, which yields atom indices.
    Atom-environment lookups are deliberately avoided: they yield bond
    indices, which must not be passed to GetAtomWithIdx.

    Limitations: stereochemistry (the sn- configuration) is NOT verified.
    NOTE(review): the non-ester phosphate oxygen is accepted both protonated
    (net 1+ species) and deprotonated (zwitterion); tighten the query if only
    the protonated form should be accepted.

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is a 1,2-diacyl-sn-glycero-3-phosphocholine(1+), False otherwise
        str: Reason for classification
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    if not any(atom.GetAtomicNum() == 15 for atom in mol.GetAtoms()):
        return False, "No phosphorus atom found"

    choline = "[OX2][CH2X4][CH2X4][NX4+]([CH3])([CH3])[CH3]"
    if not mol.HasSubstructMatch(Chem.MolFromSmarts(choline)):
        return False, "No choline group found"

    # Query atom order: C1, O-1, C2, O-2, C3, O-3, P, ... so that in every
    # match tuple slot 1 is the O-1 atom and slot 3 is the O-2 atom.
    backbone = Chem.MolFromSmarts(
        "[CH2X4]([OX2])[CH1X4]([OX2])[CH2X4]"
        "[OX2][PX4](=[OX1])([OX2H1,OX1-])" + choline
    )
    backbone_matches = mol.GetSubstructMatches(backbone)
    if not backbone_matches:
        return False, "No glycerol backbone found"

    # Oxygens that form the alkoxy side of a carboxylic ester.
    ester = Chem.MolFromSmarts("[OX2]([#6])[CX3](=[OX1])[#6]")
    ester_oxygens = {match[0] for match in mol.GetSubstructMatches(ester)}

    # Best number of acylated positions (O-1, O-2) over all backbone matches.
    acyl_chains = max(
        sum(1 for slot in (1, 3) if match[slot] in ester_oxygens)
        for match in backbone_matches
    )
    if acyl_chains != 2:
        return False, f"Found {acyl_chains} acyl chains instead of 2"

    return True, "This molecule is a 1,2-diacyl-sn-glycero-3-phosphocholine(1+)"
64+
65+
66+
__metadata__ = { 'chemical_class': { 'id': 'CHEBI:16110',
67+
'name': '1,2-diacyl-sn-glycero-3-phosphocholine(1+)',
68+
'definition': 'A phosphatidylcholine that is a '
69+
'glycerol phosphatide '
70+
'(phosphoglyceride, '
71+
'glycerophospholipid) in which the '
72+
'hydroxy group of choline is '
73+
'esterified with the phosphate group '
74+
'of phosphatidic acid.',
75+
'parents': ['CHEBI:49183']},
76+
'config': { 'llm_model_name': 'claude-3-sonnet',
77+
'f1_threshold': 0.8,
78+
'max_attempts': 5,
79+
'max_negative_to_test': None,
80+
'max_positive_in_prompt': 50,
81+
'max_negative_in_prompt': 20,
82+
'max_instances_in_prompt': 100,
83+
'test_proportion': 0.1},
84+
'message': None,
85+
'attempt': 0,
86+
'success': False,
87+
'best': True,
88+
'error': 'Range Error\n'
89+
'\tidx\n'
90+
'\tViolation occurred on line 209 in file '
91+
'Code/GraphMol/ROMol.cpp\n'
92+
'\tFailed Expression: 22 < 22\n'
93+
'\tRDKIT: 2024.03.6\n'
94+
'\tBOOST: 1_85\n',
95+
'stdout': '',
96+
'num_true_positives': 0,
97+
'num_false_positives': 0,
98+
'num_true_negatives': 0,
99+
'num_false_negatives': 0,
100+
'num_negatives': None,
101+
'precision': 0.0,
102+
'recall': 0.0,
103+
'f1': 0.0,
104+
'accuracy': None}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
Classifies: CHEBI:29089 1,2-diacyl-sn-glycerol 3-phosphate
3+
"""
4+
from rdkit import Chem
5+
from rdkit.Chem import AllChem
6+
from rdkit.Chem import Descriptors
7+
from rdkit.Chem import rdMolDescriptors
8+
9+
def is_1_2_diacyl_sn_glycerol_3_phosphate(smiles: str):
    """
    Determines if a molecule is a 1,2-diacyl-sn-glycerol 3-phosphate.

    The check is a sequence of substructure queries:
      1. a phosphate monoester group, C-O-P(=O)(O)(O), is present;
      2. that phosphate sits on a terminal oxygen of a glycerol
         (C-C-C with one oxygen on every carbon);
      3. the two remaining glycerol oxygens are both esterified by a
         carboxylic acyl group.

    Limitations: stereochemistry (the sn- configuration) is NOT verified.
    NOTE(review): the two free phosphate oxygens are accepted both protonated
    and deprotonated; tighten the query if anionic forms must be excluded.

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is a 1,2-diacyl-sn-glycerol 3-phosphate, False otherwise
        str: Reason for classification
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Monoester: one ester oxygen, one P=O and two free (OH or O-) oxygens.
    phosphate = "[OX2][PX4](=[OX1])([OX2H1,OX1-])[OX2H1,OX1-]"
    if not mol.HasSubstructMatch(Chem.MolFromSmarts("[CX4]" + phosphate)):
        return False, "No phosphate group found"

    # Query atom order: C1, O-1, C2, O-2, C3, O-3, P, ... so that in every
    # match tuple slot 1 is the O-1 atom and slot 3 is the O-2 atom.
    backbone = Chem.MolFromSmarts(
        "[CH2X4]([OX2])[CH1X4]([OX2])[CH2X4]" + phosphate
    )
    backbone_matches = mol.GetSubstructMatches(backbone)
    if not backbone_matches:
        return False, "Incorrect glycerol backbone structure"

    # Oxygens that form the alkoxy side of a carboxylic ester.
    ester = Chem.MolFromSmarts("[OX2]([#6])[CX3](=[OX1])[#6]")
    ester_oxygens = {match[0] for match in mol.GetSubstructMatches(ester)}

    # Accept as soon as one backbone match has both O-1 and O-2 acylated.
    if not any(
        match[1] in ester_oxygens and match[3] in ester_oxygens
        for match in backbone_matches
    ):
        return False, "Incorrect number of acyl groups at positions 1 and 2"

    return True, "Molecule is a 1,2-diacyl-sn-glycerol 3-phosphate"
62+
63+
64+
__metadata__ = { 'chemical_class': { 'id': 'CHEBI:29089',
65+
'name': '1,2-diacyl-sn-glycerol 3-phosphate',
66+
'definition': 'An sn-glycerol 3-phosphate compound '
67+
'having unspecified O-acyl groups at '
68+
'the 1- and 2-positions.',
69+
'parents': ['CHEBI:16337', 'CHEBI:26706']},
70+
'config': { 'llm_model_name': 'claude-3-sonnet',
71+
'f1_threshold': 0.0,
72+
'max_attempts': 5,
73+
'max_negative_to_test': None,
74+
'max_positive_in_prompt': 50,
75+
'max_negative_in_prompt': 20,
76+
'max_instances_in_prompt': 100,
77+
'test_proportion': 0.1},
78+
'message': None,
79+
'attempt': 0,
80+
'success': True,
81+
'best': True,
82+
'error': '',
83+
'stdout': None,
84+
'num_true_positives': 0,
85+
'num_false_positives': 39,
86+
'num_true_negatives': 183445,
87+
'num_false_negatives': 45,
88+
'num_negatives': None,
89+
'precision': 0.0,
90+
'recall': 0.0,
91+
'f1': 0.0,
92+
'accuracy': 0.9995423066654316}

0 commit comments

Comments
 (0)