Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add chemical clustering code #4

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Previous commit
Next commit
write main mcss clustering loop
  • Loading branch information
apayne97 committed Apr 27, 2024
commit f522d16fd83156b2d7a82b97a683eda766a4e111
204 changes: 199 additions & 5 deletions examples/chemical_series_clustering.ipynb
Original file line number Diff line number Diff line change
@@ -66,7 +66,7 @@
"outputs": [],
"source": [
"from rdkit import Chem\n",
"mols = Chem.SDMolSupplier(mypath)"
"mols = Chem.SDMolSupplier(str(mypath))"
]
},
{
@@ -93,7 +93,7 @@
"metadata": {},
"outputs": [],
"source": [
"# define the grid to show the scafffolds\n",
"# define the grid to show the scaffolds\n",
"grid = mols2grid.display(mols)"
]
},
@@ -106,12 +106,206 @@
"grid"
]
},
{
"cell_type": "markdown",
"source": [
"# MCSS-based Clustering"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from harbor.clustering.hierarchical import ClusterResults, ClusterCenter, HeirarchicalClustering\n",
"from openeye import oechem"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"mol: Chem.Mol = mols[0]\n",
"mol.GetPropsAsDict()"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"oemols = []\n",
"mol_ids = []\n",
"for rdkit_mol in mols[:20]:\n",
" smiles = Chem.MolToSmiles(rdkit_mol)\n",
" properties = rdkit_mol.GetPropsAsDict()\n",
" mol_ids.append(properties[\"Compound_ID\"])\n",
" mol = oechem.OEMol()\n",
" oechem.OESmilesToMol(mol, smiles)\n",
" oemols.append(mol)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from harbor.clustering import hierarchical as h\n",
"from importlib import reload\n",
"reload(h)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"clusterer = h.HeirarchicalClustering(molecules=oemols, mol_ids=mol_ids)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"clusters = clusterer.cluster(max_iterations=10)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"len(clusters)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"def get_descendents(cluster):\n",
" descendents = []\n",
" for child in cluster.children:\n",
" if isinstance(child, str):\n",
" descendents.append(cluster)\n",
" else:\n",
" descendents.extend(get_descendents(child))\n",
" return descendents"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from harbor.plotting import ligands as l\n",
"reload(l)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"ids_found = []\n",
"for cluster_id, cluster in clusters.items():\n",
" print(f\"Cluster {cluster_id}\")\n",
" descendents = get_descendents(cluster)\n",
" print(f\"Children: {len(descendents)}\")\n",
" l.plot_ligands_with_mcs(filename=f\"cluster_{cluster_id}.png\", mols=[desc.repr for desc in descendents], mcs_mol=cluster.repr)\n",
" ids_found.extend([desc.children[0] for desc in descendents])"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"set(ids_found)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"set(mol_ids) - set(ids_found)"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"def get_row_col(i, max_cols, zero_indexed=True):\n",
" row = i // max_cols + (0 if zero_indexed else 1)\n",
" col = i % max_cols + (0 if zero_indexed else 1)\n",
" return row, col"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"for i in range(6):\n",
" print(get_row_col(i, 4, zero_indexed=False))"
],
"metadata": {
"collapsed": false
},
"execution_count": null
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
Loading