Skip to content

Commit b4ba7f0

Browse files
authored
Merge pull request #9 from cmungall/program-learning-claude
Program learning claude
2 parents df25d76 + 031c205 commit b4ba7f0

File tree

343 files changed

+20870
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

343 files changed

+20870
-0
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
"""
2+
Classifies: CHEBI:84948 11,12-saturated fatty acyl-CoA(4-)
3+
"""
4+
"""
5+
Classifies: CHEBI:84948 11,12-saturated fatty acyl-CoA(4-)
6+
"""
7+
from rdkit import Chem
8+
from rdkit.Chem import AllChem
9+
from rdkit.Chem import rdMolDescriptors
10+
11+
def is_11_12_saturated_fatty_acyl_CoA_4__(smiles: str):
    """
    Determines if a molecule is a 11,12-saturated fatty acyl-CoA(4-) based on its SMILES string.
    A 11,12-saturated fatty acyl-CoA(4-) is a fatty acyl-CoA(4-) where the 11-12 bond of the
    fatty acyl group is saturated.

    Acyl positions are counted from the thioester carbonyl carbon (C1). The test
    is expressed as carbon-path SMARTS anchored on that carbonyl, so the bond that
    is inspected really is the C11-C12 bond of the acyl group.

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is a 11,12-saturated fatty acyl-CoA(4-), False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Coenzyme A(4-) read from the thiol end: cysteamine, beta-alanine, pantoyl
    # unit (CH(OH) followed by the gem-dimethyl carbon), the singly charged
    # diphosphate bridge, and adenosine carrying the doubly charged 3'-phosphate.
    # No stereo marks: plain HasSubstructMatch ignores chirality anyway.
    coa_pattern = Chem.MolFromSmarts(
        "[SX2]CCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)([O-])OP(=O)([O-])"
        "OCC1OC(n2cnc3c(N)ncnc23)C(O)C1OP(=O)([O-])[O-]"
    )
    if not mol.HasSubstructMatch(coa_pattern):
        return False, "No CoA substructure found"

    # C1 of the acyl group: carbonyl carbon bonded to sulfur.
    thioester = "[CX3](=[OX1])([SX2])"

    # The acyl group needs at least one carbon beyond the carbonyl.
    acyl_pattern = Chem.MolFromSmarts(thioester + "[#6]")
    if not mol.HasSubstructMatch(acyl_pattern):
        return False, "No fatty acyl chain found"

    # C1 is the carbonyl, "[#6]" is C2, and ten further carbons reach C12.
    # NOTE(review): short chains are rejected here; confirm against the ChEBI
    # class members whether acyl groups shorter than C12 should be accepted.
    reaches_c12 = Chem.MolFromSmarts(thioester + "[#6]" + "~[#6]" * 10)
    if not mol.HasSubstructMatch(reaches_c12):
        return False, "Fatty acyl chain is shorter than 12 carbons"

    # Same path, but the final bond (C11-C12) must be a single, non-aromatic
    # bond: "-" in SMARTS excludes double, triple and aromatic bonds.
    # NOTE(review): on branched chains any 12-carbon path from C1 is accepted,
    # so a branch carbon can stand in for C12.
    saturated_11_12 = Chem.MolFromSmarts(thioester + "[#6]" + "~[#6]" * 9 + "-[#6]")
    if mol.HasSubstructMatch(saturated_11_12):
        return True, "Fatty acyl chain has a saturated bond at position 11-12"

    return False, "Fatty acyl chain does not have a saturated bond at position 11-12"
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""
2+
Classifies: CHEBI:47787 11-oxo steroid
3+
"""
4+
"""
5+
Classifies: 11-oxo steroid
6+
Definition: Any oxo steroid that has an oxo substituent at position 11
7+
"""
8+
from rdkit import Chem
9+
from rdkit.Chem import AllChem
10+
11+
def is_11_oxo_steroid(smiles: str):
    """
    Determines if a molecule is an 11-oxo steroid based on its SMILES string.

    The steroid nucleus is matched with a 17-carbon 6-6-6-5 skeleton SMARTS whose
    atoms are written in steroid numbering order (C1 ... C17), so the oxo group
    can be required on C11 specifically.

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        tuple: (bool, str) - (True if molecule is an 11-oxo steroid, reason for classification)
    """
    # Chem.Descriptors is not loaded by "from rdkit import Chem"; import the
    # submodule explicitly so the molecular-weight check cannot raise
    # AttributeError.
    from rdkit.Chem import Descriptors

    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # C1..C9 in a chain with C10 as a branch closing rings A (C1) and B (C5).
    rings_a_b = "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
    # C12, C13, C14 (closes ring C onto C8), C15, C16, C17 (closes ring D onto C13).
    c12_to_c17 = "~[#6]~[#6]~4~[#6]~3~[#6]~[#6]~[#6]~4"

    # Check for basic steroid core (four fused rings, any bond orders)
    steroid_core = Chem.MolFromSmarts(rings_a_b + "~[#6]" + c12_to_c17)
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # Same skeleton, but C11 must carry a doubly bonded terminal oxygen.
    oxo_11_pattern = Chem.MolFromSmarts(rings_a_b + "~[#6](=[OX1])" + c12_to_c17)
    if not mol.HasSubstructMatch(oxo_11_pattern):
        return False, "No ketone group at position 11"

    # Count carbons to verify it's in the typical steroid range
    carbon_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
    if carbon_count < 19 or carbon_count > 30:
        return False, f"Carbon count ({carbon_count}) outside typical steroid range (19-30)"

    # Additional check for reasonable molecular weight
    mol_wt = Descriptors.ExactMolWt(mol)
    if mol_wt < 250 or mol_wt > 500:
        return False, f"Molecular weight ({mol_wt:.1f}) outside typical steroid range (250-500)"

    # A ring-count check is unnecessary: the fused four-ring skeleton matched above.
    return True, "Molecule contains steroid core with ketone group at position 11"
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Classifies: CHEBI:35346 11beta-hydroxy steroid
3+
"""
4+
"""
5+
Classifies: 11beta-hydroxy steroids
6+
"""
7+
from rdkit import Chem
8+
from rdkit.Chem import AllChem
9+
10+
def is_11beta_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is an 11beta-hydroxy steroid based on its SMILES string.

    The steroid nucleus is matched with a 17-carbon 6-6-6-5 skeleton SMARTS written
    in steroid numbering order. The hydroxy group is then required on C11 with a
    stereo-tagged query that is matched with useChirality=True (chirality marks in
    a query are ignored by RDKit unless that flag is set).

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is an 11beta-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # C1..C9 in a chain with C10 as a branch closing rings A (C1) and B (C5).
    rings_a_b = "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
    # C12, C13, C14 (closes ring C onto C8), C15, C16, C17 (closes ring D onto C13).
    c12_to_c17 = "~[#6]~[#6]~4~[#6]~3~[#6]~[#6]~[#6]~4"

    # Check for basic steroid core (4 fused rings, any bond orders)
    steroid_core = Chem.MolFromSmarts(rings_a_b + "~[#6]" + c12_to_c17)
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # C11 with neighbours listed as C9, OH, other substituent, C12.
    # NOTE(review): "@" for this neighbour order was derived from the standard
    # steroid projection (beta face toward the viewer) and agrees with the usual
    # isomeric SMILES of cortisol; confirm against reference compounds.
    hydroxy_11beta = Chem.MolFromSmarts(
        rings_a_b + "~[C@]([OX2H])(*)" + c12_to_c17
    )

    # Explicit hydrogens let "*" match the 11alpha hydrogen, so the query
    # stereocentre always has four mapped neighbours.
    mol_h = Chem.AddHs(mol)
    if not mol_h.HasSubstructMatch(hydroxy_11beta, useChirality=True):
        return False, "No 11-beta hydroxy group found"

    # Count carbons to ensure reasonable size for a steroid
    carbon_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
    if carbon_count < 19 or carbon_count > 30:  # Most steroids have 19-30 carbons
        return False, f"Carbon count ({carbon_count}) outside typical steroid range"

    # Separate "typical substituent" and ring-count checks are unnecessary:
    # the matched query already guarantees a hydroxy group and four fused rings.
    return True, "Contains steroid core with 11-beta hydroxy group and appropriate substituents"
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""
2+
Classifies: CHEBI:17354 16beta-hydroxy steroid
3+
"""
4+
"""
5+
Classifies: CHEBI:17354 16beta-hydroxy steroid
6+
A 16-hydroxy steroid in which the hydroxy group at position 16 has a beta-configuration.
7+
"""
8+
from rdkit import Chem
9+
from rdkit.Chem import AllChem
10+
11+
def is_16beta_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is a 16beta-hydroxy steroid based on its SMILES string.

    The steroid nucleus is matched with a 17-carbon 6-6-6-5 skeleton SMARTS written
    in steroid numbering order. The hydroxy group is then required on C16 with a
    stereo-tagged query that is matched with useChirality=True (chirality marks in
    a query are ignored by RDKit unless that flag is set).

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is a 16beta-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # Chem.Descriptors is not loaded by "from rdkit import Chem"; import the
    # submodule explicitly so the molecular-weight check cannot raise
    # AttributeError.
    from rdkit.Chem import Descriptors

    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # C1..C9 in a chain with C10 as a branch closing rings A (C1) and B (C5).
    rings_a_b = "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
    # C11, C12, C13, C14 (closes ring C onto C8), C15.
    c11_to_c15 = "~[#6]~[#6]~[#6]~4~[#6]~3~[#6]"
    # C17 closes ring D onto C13.
    c17 = "~[#6]~4"

    # Basic steroid core pattern (four fused rings, any bond orders)
    steroid_core = Chem.MolFromSmarts(rings_a_b + c11_to_c15 + "~[#6]" + c17)
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # C16 with neighbours listed as C15, OH, other substituent, C17.
    # NOTE(review): "@@" for this neighbour order was derived from the standard
    # steroid projection (beta face toward the viewer); the opposite tag agrees
    # with the usual isomeric SMILES of estriol (16alpha). Confirm against
    # reference compounds.
    beta_oh_pattern = Chem.MolFromSmarts(
        rings_a_b + c11_to_c15 + "~[C@@]([OX2H])(*)" + c17
    )

    # Explicit hydrogens let "*" match the 16alpha hydrogen, so the query
    # stereocentre always has four mapped neighbours.
    mol_h = Chem.AddHs(mol)
    if not mol_h.HasSubstructMatch(beta_oh_pattern, useChirality=True):
        return False, "No 16-beta-hydroxy group found"

    # Steroids typically have molecular weights between 250-1000
    mol_weight = Descriptors.ExactMolWt(mol)
    if mol_weight < 250 or mol_weight > 1000:
        return False, f"Molecular weight {mol_weight} outside typical steroid range (250-1000)"

    # Carbon-count (>= 17) and oxygen-count (>= 1) checks are unnecessary:
    # the matched query already contains 17 carbons and the hydroxy oxygen.
    return True, "Contains steroid core with 16-beta-hydroxy group"
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Classifies: CHEBI:35342 17alpha-hydroxy steroid
3+
"""
4+
"""
5+
Classifies: 17alpha-hydroxy steroid
6+
"""
7+
from rdkit import Chem
8+
from rdkit.Chem import AllChem
9+
10+
def is_17alpha_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is a 17alpha-hydroxy steroid based on its SMILES string.

    The steroid nucleus is matched with a 17-carbon 6-6-6-5 skeleton SMARTS written
    in steroid numbering order. The hydroxy group is required on C17 itself (not on
    an arbitrary ring carbon), first by connectivity and then by configuration with
    useChirality=True.

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is a 17alpha-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # C1..C9 in a chain with C10 as a branch closing rings A (C1) and B (C5).
    rings_a_b = "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
    # C11, C12, C13, C14 (closes ring C onto C8), C15, C16.
    c11_to_c16 = "~[#6]~[#6]~[#6]~4~[#6]~3~[#6]~[#6]"

    # Check for basic steroid core (4 fused rings, any bond orders)
    steroid_core = Chem.MolFromSmarts(rings_a_b + c11_to_c16 + "~[#6]~4")
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # C17 with neighbours listed as C16, C13 (ring closure), OH, other substituent.
    # NOTE(review): "@" for this neighbour order was derived from the standard
    # steroid projection (beta face toward the viewer) and agrees with the usual
    # isomeric SMILES of cortisol (17alpha-OH); confirm against reference compounds.
    oh_17_alpha_pattern = Chem.MolFromSmarts(
        rings_a_b + c11_to_c16 + "~[C@]~4([OX2H])*"
    )

    # Explicit hydrogens let "*" match a 17beta hydrogen, so the query
    # stereocentre always has four mapped neighbours.
    mol_h = Chem.AddHs(mol)

    # Connectivity only: chirality marks are ignored without useChirality.
    if not mol_h.HasSubstructMatch(oh_17_alpha_pattern):
        return False, "No hydroxyl group at C17 position found"

    # Configuration: C17 must be a defined stereocentre with the OH on the alpha face.
    if not mol_h.HasSubstructMatch(oh_17_alpha_pattern, useChirality=True):
        return False, "Hydroxyl group at C17 is not in alpha configuration"

    # Basic size check - steroids typically have at least 19 carbons
    c_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
    if c_count < 19:
        return False, "Too few carbons for steroid structure"

    # Check for sp3 hybridized carbons typical in steroid core
    sp3_carbons = sum(
        1
        for atom in mol.GetAtoms()
        if atom.GetAtomicNum() == 6
        and atom.GetHybridization() == Chem.HybridizationType.SP3
    )
    if sp3_carbons < 10:
        return False, "Insufficient sp3 carbons for steroid structure"

    # A ring-count check is unnecessary: the fused four-ring skeleton matched above.
    return True, "Contains steroid core with 17-alpha hydroxyl group"
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""
2+
Classifies: CHEBI:35343 17beta-hydroxy steroid
3+
"""
4+
"""
5+
Classifies: 17beta-hydroxy steroid
6+
A 17-hydroxy steroid in which the hydroxy group at position 17 has a beta-configuration.
7+
"""
8+
from rdkit import Chem
9+
from rdkit.Chem import AllChem
10+
11+
def is_17beta_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is a 17beta-hydroxy steroid based on its SMILES string.

    The steroid nucleus is matched with a 17-carbon 6-6-6-5 skeleton SMARTS written
    in steroid numbering order. The hydroxy group is required on C17 itself, first
    by connectivity and then by configuration with useChirality=True. A raw
    CW/CCW chiral tag is not used, because its meaning depends on the order in
    which the input SMILES lists the neighbours.

    Args:
        smiles (str): SMILES string of the molecule

    Returns:
        bool: True if molecule is a 17beta-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # C1..C9 in a chain with C10 as a branch closing rings A (C1) and B (C5).
    rings_a_b = "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
    # C11, C12, C13, C14 (closes ring C onto C8), C15, C16.
    c11_to_c16 = "~[#6]~[#6]~[#6]~4~[#6]~3~[#6]~[#6]"

    # Check for basic steroid core (four fused rings, any bond orders)
    steroid_core = Chem.MolFromSmarts(rings_a_b + c11_to_c16 + "~[#6]~4")
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # C17 with neighbours listed as C16, C13 (ring closure), OH, other substituent.
    # NOTE(review): "@@" for this neighbour order was derived from the standard
    # steroid projection (beta face toward the viewer) and agrees with the usual
    # isomeric SMILES of testosterone (17beta-OH); confirm against reference
    # compounds.
    oh_17_beta = Chem.MolFromSmarts(
        rings_a_b + c11_to_c16 + "~[C@@]~4([OX2H])*"
    )

    # Explicit hydrogens let "*" match the 17alpha hydrogen, so the query
    # stereocentre always has four mapped neighbours.
    mol_h = Chem.AddHs(mol)

    # Connectivity only: chirality marks are ignored without useChirality.
    if not mol_h.HasSubstructMatch(oh_17_beta):
        return False, "No hydroxyl group with correct connectivity found"

    # Configuration: C17 must be a defined stereocentre with the OH on the beta face.
    if not mol_h.HasSubstructMatch(oh_17_beta, useChirality=True):
        return False, "No 17-beta hydroxyl group found"

    # Additional validation: molecule should have reasonable size for a steroid.
    # Counted on the hydrogen-suppressed molecule, i.e. heavy atoms only.
    num_atoms = mol.GetNumAtoms()
    if num_atoms < 20 or num_atoms > 100:
        return False, "Molecule size not consistent with steroid structure"

    # A ring-count check is unnecessary: the fused four-ring skeleton matched above.
    return True, "Contains steroid core with 17-beta hydroxyl group"

0 commit comments

Comments
 (0)