Skip to content

Commit

Permalink
Merge pull request OpenChemistry#82 from OpenChemistry/upload-molecul…
Browse files Browse the repository at this point in the history
…es-from-qcarchive

Added script to upload molecules from qcarchive
  • Loading branch information
psavery authored Feb 4, 2020
2 parents d4be4b2 + 61da203 commit c32b0a3
Show file tree
Hide file tree
Showing 4 changed files with 315 additions and 0 deletions.
86 changes: 86 additions & 0 deletions ingest/qcarchive/convert_to_cjson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python3

"""
This script converts qcarchive json to cjson
It includes automatic bond detection, since qcarchive
json does not store bonds.
"""

import json

import click

from avogadro.core import Molecule
from avogadro.io import FileFormatManager
from openbabel import OBMol, OBConversion


# Copied from mongochemserver
def avo_convert_str(str_data, in_format, out_format):
mol = Molecule()
conv = FileFormatManager()
conv.read_string(mol, str_data, in_format)

return conv.write_string(mol, out_format)


# Copied from mongochemserver
def cjson_to_ob_molecule(cjson):
cjson_str = json.dumps(cjson)
sdf_str = avo_convert_str(cjson_str, 'cjson', 'sdf')
conv = OBConversion()
conv.SetInFormat('sdf')
conv.SetOutFormat('sdf')
mol = OBMol()
conv.ReadString(mol, sdf_str)
return mol


# Copied from mongochemserver
def autodetect_bonds(cjson):
mol = cjson_to_ob_molecule(cjson)
mol.ConnectTheDots()
mol.PerceiveBondOrders()
conv = OBConversion()
conv.SetInFormat('sdf')
conv.SetOutFormat('sdf')
sdf_str = conv.WriteString(mol)
cjson_str = avo_convert_str(sdf_str, 'sdf', 'cjson')
return json.loads(cjson_str)


def convert_to_cjson(qcjson):
cjson = {}
cjson['chemicalJson'] = 1

# The qcjson geometry is in atomic units. Convert to angstrom.
for i in range(len(qcjson['geometry'])):
qcjson['geometry'][i] *= 0.529177249

cjson['atoms'] = {
'coords': {
'3d': qcjson['geometry']
},
'elements': {
'number': qcjson['atomic_numbers']
}
}

# Auto-detect bonds, since qcjson does not store them
cjson = autodetect_bonds(cjson)

return cjson


@click.command()
@click.argument('input_file', type=click.File('r'))
@click.argument('output_file', type=click.File('w'))
def main(input_file, output_file):
qcjson = json.load(input_file)
cjson = convert_to_cjson(qcjson)
json.dump(cjson, output_file)


if __name__ == '__main__':
main()
4 changes: 4 additions & 0 deletions ingest/qcarchive/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
qcfractal
girder_client
avogadro
openbabel
161 changes: 161 additions & 0 deletions ingest/qcarchive/search_qcarchive_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import qcfractal.interface as ptl"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client = ptl.FractalClient()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Shows all of the collections\n",
"client.list_collections()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pick a collection type and a collection name\n",
"collection_type = 'OptimizationDataset'\n",
"collection_name = 'JGI Metabolite Set 1'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the dataset\n",
"ds = client.get_collection(collection_type, collection_name)\n",
"print(ds)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get all of the records in the dataset\n",
"records = ds.status()\n",
"print(records)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the number of records\n",
"num_records = ds.status().shape[0]\n",
"print(num_records)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Look at the specifications\n",
"ds.list_specifications()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pick one\n",
"specification = 'default'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pick a record to look at\n",
"index = 0\n",
"name = records.iloc[index].name\n",
"print(name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get the record\n",
"record = ds.get_record(name, specification)\n",
"print(record)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visualize the final molecule (result of optimization)\n",
"mol = record.get_final_molecule()\n",
"mol"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Look at the json\n",
"mol.json()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
64 changes: 64 additions & 0 deletions ingest/qcarchive/upload_qcarchive_molecules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python3

import json

import girder_client

import qcfractal.interface as ptl

from convert_to_cjson import convert_to_cjson

# These can be listed using the client
# We might be able to come up with a method to generate these automatically
collection_type = 'OptimizationDataset'
collection_name = 'JGI Metabolite Set 1'
specification = 'default'

# Need to use a girder api key
girder_api_key = ''

gc = girder_client.GirderClient()
gc.authenticate(apiKey=girder_api_key)

client = ptl.FractalClient()

print('Downloading records from:', collection_type, collection_name)
ds = client.get_collection(collection_type, collection_name)
records = ds.status()
num_records = records.shape[0]

print('Number of records found:', num_records)
for i in range(num_records):

name = records.iloc[i].name

print('Downloading data for record:', name)

try:
record = ds.get_record(name, specification)

if 'INCOMPLETE' in record.status:
print(name, 'is incomplete. Skipping')
continue

final_molecule = record.get_final_molecule()
qcjson = json.loads(final_molecule.json())
id = qcjson['id']

cjson = convert_to_cjson(qcjson)

# The id can be used to find the molecule in qcarchive like so:
# client.query_molecules(id=id)
provenance = 'qcarchive molecule: ' + id
params = {
'cjson': json.dumps(cjson),
'provenance': provenance
}

print('Uploading to girder...')
gc.post('/molecules', json=params)

except Exception as exc:
print('Failed to get records for:', name)
print('Exception was:', exc)
continue

0 comments on commit c32b0a3

Please sign in to comment.