Skip to content

Commit b5498c5

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 1c34140 commit b5498c5

File tree

6 files changed

+159
-151
lines changed

6 files changed

+159
-151
lines changed

data/tabular/ld50_catmos/meta.yaml

Lines changed: 135 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -1,145 +1,144 @@
1-
---
21
name: ld50_catmos
32
description: |-
4-
Acute toxicity LD50 measures
5-
the most conservative dose that can lead to lethal adverse effects.
6-
The higher the dose, the more lethal of a drug.
7-
We aggregated the data from multiple SMILES by computing the mean.
3+
Acute toxicity LD50 measures
4+
the most conservative dose that can lead to lethal adverse effects.
5+
The higher the dose, the more lethal the drug.
6+
We aggregated the data from multiple SMILES by computing the mean.
87
targets:
9-
- id: CATMoS_LD50_mgkg
10-
description: Acute Toxicity LD50.
11-
units: mg/kg
12-
type: continuous
13-
names:
14-
- noun: acute oral toxicity rat LD50
15-
- noun: acute oral toxicity (LD50 in rats)
16-
uris:
17-
- http://www.bioassayontology.org/bao#BAO_0002117
18-
significant_digits: 1
19-
- id: log10_LD50
20-
description: Acute Toxicity LD50.
21-
units: log10(mg/kg)
22-
type: continuous
23-
names:
24-
- noun: log10 acute oral toxicity rat LD50
25-
- noun: log10 acute oral toxicity (LD50 in rats)
26-
- noun: log10 LD50 in rats (oral exposure)
27-
- noun: log10 rat LD50 (oral exposure)
28-
significant_digits: 2
29-
- id: num_ghose_violations
30-
description: Ghose filter violations
31-
type: ordinal
32-
significant_digits: 0
33-
names:
34-
- noun: Ghose filter violations
35-
- noun: violations of the Ghose filter
36-
- id: num_lead_likeness_violations
37-
description: Lead likeness filter violations
38-
type: ordinal
39-
significant_digits: 0
40-
names:
41-
- noun: lead likeness filter violations
42-
- noun: violations of the lead likeness filter
43-
- id: num_lipinski_violations
44-
description: Lipinski filter violations
45-
type: ordinal
46-
significant_digits: 0
47-
names:
48-
- noun: Lipinski rule violations
49-
- noun: violations of the Lipinski rules
50-
- id: molecular_mass
51-
description: Molecular mass
52-
type: continuous
53-
units: g/mol
54-
names:
55-
- noun: molecular mass
56-
- noun: molecular weight
57-
- id: num_carbon_atoms
58-
description: Number of carbon atoms
59-
type: ordinal
60-
significant_digits: 0
61-
names:
62-
- noun: carbon atoms
63-
- id: num_oxygen_atoms
64-
description: Number of oxygen atoms
65-
type: ordinal
66-
significant_digits: 0
67-
names:
68-
- noun: oxygen atoms
8+
- id: CATMoS_LD50_mgkg
9+
description: Acute Toxicity LD50.
10+
units: mg/kg
11+
type: continuous
12+
names:
13+
- noun: acute oral toxicity rat LD50
14+
- noun: acute oral toxicity (LD50 in rats)
15+
uris:
16+
- http://www.bioassayontology.org/bao#BAO_0002117
17+
significant_digits: 1
18+
- id: log10_LD50
19+
description: Acute Toxicity LD50.
20+
units: log10(mg/kg)
21+
type: continuous
22+
names:
23+
- noun: log10 acute oral toxicity rat LD50
24+
- noun: log10 acute oral toxicity (LD50 in rats)
25+
- noun: log10 LD50 in rats (oral exposure)
26+
- noun: log10 rat LD50 (oral exposure)
27+
significant_digits: 2
28+
- id: num_ghose_violations
29+
description: Ghose filter violations
30+
type: ordinal
31+
significant_digits: 0
32+
names:
33+
- noun: Ghose filter violations
34+
- noun: violations of the Ghose filter
35+
- id: num_lead_likeness_violations
36+
description: Lead likeness filter violations
37+
type: ordinal
38+
significant_digits: 0
39+
names:
40+
- noun: lead likeness filter violations
41+
- noun: violations of the lead likeness filter
42+
- id: num_lipinski_violations
43+
description: Lipinski filter violations
44+
type: ordinal
45+
significant_digits: 0
46+
names:
47+
- noun: Lipinski rule violations
48+
- noun: violations of the Lipinski rules
49+
- id: molecular_mass
50+
description: Molecular mass
51+
type: continuous
52+
units: g/mol
53+
names:
54+
- noun: molecular mass
55+
- noun: molecular weight
56+
- id: num_carbon_atoms
57+
description: Number of carbon atoms
58+
type: ordinal
59+
significant_digits: 0
60+
names:
61+
- noun: carbon atoms
62+
- id: num_oxygen_atoms
63+
description: Number of oxygen atoms
64+
type: ordinal
65+
significant_digits: 0
66+
names:
67+
- noun: oxygen atoms
6968
identifiers:
70-
- id: SMILES
71-
type: SMILES
72-
description: SMILES
69+
- id: SMILES
70+
type: SMILES
71+
description: SMILES
7372
license: CC BY 4.0
7473
links:
75-
- url: https://ehp.niehs.nih.gov/doi/full/10.1289/EHP8495#supplementary-materials
76-
description: corresponding publication
74+
- url: https://ehp.niehs.nih.gov/doi/full/10.1289/EHP8495#supplementary-materials
75+
description: corresponding publication
7776
num_points: 9032
7877
bibtex:
79-
- |-
80-
@article{Mansouri_2021, title={CATMoS: Collaborative Acute Toxicity Modeling Suite},
81-
volume={129},
82-
ISSN={1552-9924},
83-
url={http://dx.doi.org/10.1289/EHP8495},
84-
DOI={10.1289/ehp8495},
85-
number={4},
86-
journal={Environmental Health Perspectives},
87-
publisher={Environmental Health Perspectives},
88-
author={Mansouri, Kamel and Karmaus, Agnes L. and Fitzpatrick, Jeremy
89-
and Patlewicz, Grace and Pradeep, Prachi and Alberga, Domenico and
90-
Alepee, Nathalie and Allen, Timothy E.H. and Allen, Dave and Alves, Vinicius M.
91-
and Andrade, Carolina H. and Auernhammer, Tyler R. and Ballabio, Davide and
92-
Bell, Shannon and Benfenati, Emilio and Bhattacharya, Sudin and
93-
Bastos, Joyce V. and Boyd, Stephen and Brown, J.B. and Capuzzi, Stephen J. and
94-
Chushak, Yaroslav and Ciallella, Heather and Clark, Alex M. and
95-
Consonni, Viviana and Daga, Pankaj R. and Ekins, Sean and Farag, Sherif and
96-
Fedorov, Maxim and Fourches, Denis and Gadaleta, Domenico and Gao, Feng and
97-
Gearhart, Jeffery M. and Goh, Garett and Goodman, Jonathan M. and
98-
Grisoni, Francesca and Grulke, Christopher M. and Hartung, Thomas and
99-
Hirn, Matthew and Karpov, Pavel and Korotcov, Alexandru and
100-
Lavado, Giovanna J. and Lawless, Michael and Li, Xinhao and
101-
Luechtefeld, Thomas and Lunghini, Filippo and Mangiatordi, Giuseppe F. and
102-
Marcou, Gilles and Marsh, Dan and Martin, Todd and Mauri, Andrea and
103-
Muratov, Eugene N. and Myatt, Glenn J. and Nguyen, Dac-Trung and
104-
Nicolotti, Orazio and Note, Reine and Pande, Paritosh and
105-
Parks, Amanda K. and Peryea, Tyler and Polash, Ahsan H. and
106-
Rallo, Robert and Roncaglioni, Alessandra and Rowlands, Craig and
107-
Ruiz, Patricia and Russo, Daniel P. and Sayed, Ahmed and Sayre, Risa and
108-
Sheils, Timothy and Siegel, Charles and Silva, Arthur C. and Simeonov, Anton and
109-
Sosnin, Sergey and Southall, Noel and Strickland, Judy and Tang, Yun and
110-
Teppen, Brian and Tetko, Igor V. and Thomas, Dennis and Tkachenko, Valery and
111-
Todeschini, Roberto and Toma, Cosimo and Tripodi, Ignacio and
112-
Trisciuzzi, Daniela and Tropsha, Alexander and Varnek, Alexandre and
113-
Vukovic, Kristijan and Wang, Zhongyu and Wang, Liguo and
114-
Waters, Katrina M. and Wedlake, Andrew J. and Wijeyesakere, Sanjeeva J. and
115-
Wilson, Dan and Xiao, Zijun and Yang, Hongbin and Zahoranszky-Kohalmi, Gergely and
116-
Zakharov, Alexey V. and Zhang, Fagen F. and Zhang, Zhen and Zhao, Tongan and
117-
Zhu, Hao and Zorn, Kimberley M. and Casey, Warren and Kleinstreuer, Nicole C.},
118-
year={2021}, month=apr }
78+
- |-
79+
@article{Mansouri_2021, title={CATMoS: Collaborative Acute Toxicity Modeling Suite},
80+
volume={129},
81+
ISSN={1552-9924},
82+
url={http://dx.doi.org/10.1289/EHP8495},
83+
DOI={10.1289/ehp8495},
84+
number={4},
85+
journal={Environmental Health Perspectives},
86+
publisher={Environmental Health Perspectives},
87+
author={Mansouri, Kamel and Karmaus, Agnes L. and Fitzpatrick, Jeremy
88+
and Patlewicz, Grace and Pradeep, Prachi and Alberga, Domenico and
89+
Alepee, Nathalie and Allen, Timothy E.H. and Allen, Dave and Alves, Vinicius M.
90+
and Andrade, Carolina H. and Auernhammer, Tyler R. and Ballabio, Davide and
91+
Bell, Shannon and Benfenati, Emilio and Bhattacharya, Sudin and
92+
Bastos, Joyce V. and Boyd, Stephen and Brown, J.B. and Capuzzi, Stephen J. and
93+
Chushak, Yaroslav and Ciallella, Heather and Clark, Alex M. and
94+
Consonni, Viviana and Daga, Pankaj R. and Ekins, Sean and Farag, Sherif and
95+
Fedorov, Maxim and Fourches, Denis and Gadaleta, Domenico and Gao, Feng and
96+
Gearhart, Jeffery M. and Goh, Garett and Goodman, Jonathan M. and
97+
Grisoni, Francesca and Grulke, Christopher M. and Hartung, Thomas and
98+
Hirn, Matthew and Karpov, Pavel and Korotcov, Alexandru and
99+
Lavado, Giovanna J. and Lawless, Michael and Li, Xinhao and
100+
Luechtefeld, Thomas and Lunghini, Filippo and Mangiatordi, Giuseppe F. and
101+
Marcou, Gilles and Marsh, Dan and Martin, Todd and Mauri, Andrea and
102+
Muratov, Eugene N. and Myatt, Glenn J. and Nguyen, Dac-Trung and
103+
Nicolotti, Orazio and Note, Reine and Pande, Paritosh and
104+
Parks, Amanda K. and Peryea, Tyler and Polash, Ahsan H. and
105+
Rallo, Robert and Roncaglioni, Alessandra and Rowlands, Craig and
106+
Ruiz, Patricia and Russo, Daniel P. and Sayed, Ahmed and Sayre, Risa and
107+
Sheils, Timothy and Siegel, Charles and Silva, Arthur C. and Simeonov, Anton and
108+
Sosnin, Sergey and Southall, Noel and Strickland, Judy and Tang, Yun and
109+
Teppen, Brian and Tetko, Igor V. and Thomas, Dennis and Tkachenko, Valery and
110+
Todeschini, Roberto and Toma, Cosimo and Tripodi, Ignacio and
111+
Trisciuzzi, Daniela and Tropsha, Alexander and Varnek, Alexandre and
112+
Vukovic, Kristijan and Wang, Zhongyu and Wang, Liguo and
113+
Waters, Katrina M. and Wedlake, Andrew J. and Wijeyesakere, Sanjeeva J. and
114+
Wilson, Dan and Xiao, Zijun and Yang, Hongbin and Zahoranszky-Kohalmi, Gergely and
115+
Zakharov, Alexey V. and Zhang, Fagen F. and Zhang, Zhen and Zhao, Tongan and
116+
Zhu, Hao and Zorn, Kimberley M. and Casey, Warren and Kleinstreuer, Nicole C.},
117+
year={2021}, month=apr }
119118
templates:
120-
- The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} an {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}.
121-
- The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}.
122-
- |
123-
Task: Determine the acute oral toxicity and molecular properties of a {#molecule|chemical|compound!} given the {SMILES__description}.
124-
Input: {SMILES#}
125-
Desired Output: {CATMoS_LD50_mgkg__names__noun}, {log10_LD50__names__noun}, {num_ghose_violations__names__noun}, {num_lead_likeness_violations__names__noun}, {num_lipinski_violations__names__noun}, {molecular_mass__names__noun}, {num_carbon_atoms__names__noun}, {num_oxygen_atoms__names__noun}
126-
Output: {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {log10_LD50#} {log10_LD50__units}, {num_ghose_violations#}, {num_lead_likeness_violations#}, {num_lipinski_violations#}, {molecular_mass#} {molecular_mass__units}, {num_carbon_atoms#}, {num_oxygen_atoms#}
127-
- |
128-
Context: You are {#an assistant|researcher|scientist!} in a pharmaceutical company. Your {#boss|superior|department head!} has asked you to {#design|create|synthesize!} a new drug.
129-
User: The {#drug|compound|chemical!} should have a {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {molecular_mass#} {molecular_mass__names__noun} {molecular_mass__units}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}.
130-
Assistant: {#Happy to help!|Sure!|Of course!} The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties.
131-
- |
132-
User: I need a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}.
133-
Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}?
134-
User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}.
135-
Assistant: The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties.
136-
- |
137-
User: I need a {#drug|compound|chemical!} with a {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}.
138-
Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}?
139-
User: The {#drug|compound|chemical!} should have a {num_carbon_atoms#} {num_carbon_atoms__names__noun}, {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}, and a {molecular_mass__names__noun} of {molecular_mass#} {molecular_mass__units}. Could you please only provide me with the {SMILES__description} and return no other information?
140-
Assistant: {SMILES#}
141-
- |
142-
User: I am looking for a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}.
143-
Assistant: {#That's interesting!|Interesting!|I see!} Can you provide me with more {#constraints|details|information!}?
144-
User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. Please return only the {SMILES__description} wrapped as follows [ANSWER]<SMILES>[/ANSWER].
145-
Assistant: [ANSWER]{SMILES#}[/ANSWER]
119+
- The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} an {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}.
120+
- The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}.
121+
- |
122+
Task: Determine the acute oral toxicity and molecular properties of a {#molecule|chemical|compound!} given the {SMILES__description}.
123+
Input: {SMILES#}
124+
Desired Output: {CATMoS_LD50_mgkg__names__noun}, {log10_LD50__names__noun}, {num_ghose_violations__names__noun}, {num_lead_likeness_violations__names__noun}, {num_lipinski_violations__names__noun}, {molecular_mass__names__noun}, {num_carbon_atoms__names__noun}, {num_oxygen_atoms__names__noun}
125+
Output: {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {log10_LD50#} {log10_LD50__units}, {num_ghose_violations#}, {num_lead_likeness_violations#}, {num_lipinski_violations#}, {molecular_mass#} {molecular_mass__units}, {num_carbon_atoms#}, {num_oxygen_atoms#}
126+
- |
127+
Context: You are {#an assistant|researcher|scientist!} in a pharmaceutical company. Your {#boss|superior|department head!} has asked you to {#design|create|synthesize!} a new drug.
128+
User: The {#drug|compound|chemical!} should have an {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, a {molecular_mass__names__noun} of {molecular_mass#} {molecular_mass__units}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}.
129+
Assistant: {#Happy to help!|Sure!|Of course!} The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties.
130+
- |
131+
User: I need a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}.
132+
Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}?
133+
User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}.
134+
Assistant: The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties.
135+
- |
136+
User: I need a {#drug|compound|chemical!} with an {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}.
137+
Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}?
138+
User: The {#drug|compound|chemical!} should have {num_carbon_atoms#} {num_carbon_atoms__names__noun}, {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}, and a {molecular_mass__names__noun} of {molecular_mass#} {molecular_mass__units}. Could you please only provide me with the {SMILES__description} and return no other information?
139+
Assistant: {SMILES#}
140+
- |
141+
User: I am looking for a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}.
142+
Assistant: {#That's interesting!|Interesting!|I see!} Can you provide me with more {#constraints|details|information!}?
143+
User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. Please return only the {SMILES__description} wrapped as follows [ANSWER]<SMILES>[/ANSWER].
144+
Assistant: [ANSWER]{SMILES#}[/ANSWER]

data/tabular/mona/example_processing_and_templates.ipynb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
"from tqdm import tqdm\n",
2121
"\n",
2222
"# import datasets\n",
23-
"import rdkit\n",
2423
"import rdkit.Chem as Chem\n",
2524
"import rdkit.RDLogger as RDLogger"
2625
]
@@ -1444,7 +1443,7 @@
14441443
" k = md[\"name\"]\n",
14451444
" v = md.get(\"value\", np.nan)\n",
14461445
" df_row[\"md_\" + transform_key(k)] = v\n",
1447-
" if not (v is np.nan):\n",
1446+
" if v is not np.nan:\n",
14481447
" md_keys.append(k)\n",
14491448
" md_key_counter.update(md_keys)\n",
14501449
" compounds = entry.get(\"compound\", [])\n",

data/tabular/ocp/transform.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ def uniCode2Latex(text: str) -> str:
2121
text = text.replace(chr(code), f"$_{code-8320}$")
2222

2323
text = text.replace("\u0305", "$^-$")
24-
text = text.replace("\u207A", "$^+$")
25-
text = text.replace("\u207B", "$^-$")
24+
text = text.replace("\u207a", "$^+$")
25+
text = text.replace("\u207b", "$^-$")
2626
text = text.replace("\u2074", "$^4$")
2727
text = text.replace("\u2070", "$^0$")
2828
text = text.replace("\u2078", "$^1$")

data/tabular/orbnet_denali/develop_transform.ipynb

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,7 @@
2525
"metadata": {},
2626
"outputs": [],
2727
"source": [
28-
"from pathlib import Path\n",
2928
"from rdkit import Chem\n",
30-
"import matplotlib.pyplot as plt\n",
31-
"import numpy as np\n",
32-
"import os\n",
3329
"import pandas as pd\n",
3430
"from glob import glob"
3531
]
@@ -474,7 +470,6 @@
474470
"metadata": {},
475471
"outputs": [],
476472
"source": [
477-
"from rdkit.Chem import rdDetermineBonds\n",
478473
"from chemnlp.utils import xyz_to_mol"
479474
]
480475
},

0 commit comments

Comments
 (0)