From c8fa14d0eadce413a6cf966202a932db98914d39 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 10:05:29 -0700 Subject: [PATCH 01/63] minor example update --- docs/source/upgrade_database.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/upgrade_database.rst b/docs/source/upgrade_database.rst index e413f0ab..952f4495 100644 --- a/docs/source/upgrade_database.rst +++ b/docs/source/upgrade_database.rst @@ -7,7 +7,7 @@ If you have previously written a database with Ringtail v<2.0.0, it will need to .. code-block:: bash - $ rt_db_to_v200.py -d + $ rt_db_to_v200.py -d old_database_1.db (required) old_database_2+.db (optional) Multiple databases may be specified at once. The update may take a few minutes per database. @@ -19,7 +19,7 @@ If you have previously written a database with Ringtail v1.0.0, it will need to .. code-block:: bash - $ rt_db_v100_to_v110.py -d + $ rt_db_v100_to_v110.py -d 100_database_1.db (required) 100_database_2+.db (optional) Multiple databases may be specified at once. The update may take a few minutes per database. From 4d1e69e20c37061dfe656a5299b3451c9743d960 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 10:52:22 -0700 Subject: [PATCH 02/63] updated installation instructions --- docs/source/installation.rst | 75 +++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 875e86bd..cb2188b8 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -2,33 +2,61 @@ Installing ringtail ################### +There are three different alternatives to installing Ringtail: through :ref:`conda-forge ` which will install all dependencies, through the Python package manager :ref:`PyPi ` where some packages need to be installed separately, and directly from :ref:`source code ` for advanced users looking to make their own code changes. 
It is necessary to use an environment manager like conda or mamba to organize your Ringtail :ref:`environment ` as some of the dependencies can only be installed in a managed environment. The installation instructions use conda as an example, but you are free to use any python environment manager. Ringtail 2.0 requires Python 3.9 or higher (tested to 3.12). -Installation (from PyPI) +.. _pypi: +Installation from PyPI ************************* -Please note that Ringtail requires Python 3.9 or 3.10. +To install Ringtail from PyPi, create then activate your :ref:`ringtail environment `, then simply use pip in your terminal: .. code-block:: bash $ pip install ringtail -If using conda, ``pip`` installs the package in the active environment. +A few dependencies may be needed, including: -Also note that if using MacOS, you may need to install Multiprocess separately: +* meeko (another Forli lab tool) +* rdkit +* multiprocess (only needed on MacOS) +* scipy +* pandas +* chemicalite (only available through conda-forge) .. code-block:: bash - $ pip install multiprocess + $ pip install + $ conda install -c conda-forge chemicalite -Installation from source code +Upgrading to a newer Ringtail version +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If you have a previous version of Ringtail installed you can update the package by using the tag ``-U``, or by specifying the version + +.. code-block:: bash + + $ pip install -U ringtail + + $ pip install ringtail==2.0 + +Make sure to :ref:`upgrade any databases ` made with Ringtail v1 if you intend to use them with Ringtail v2.0. + + +.. _condaforge: +Installation from conda-forge ****************************** +To install from conda-forge create a ringtail environment if needed, and run the following in the active environment: .. 
code-block:: bash - $ conda create -n ringtail python=3.10 - $ conda activate ringtail + $ conda install -c conda-forge ringtail -After activating the environment, navigate to the desired directory for installing Ringtail and do the following: +The conda-forge installation will handle all dependencies, so no other installations are necessary. + +.. _sourcecode: +Installation from source code +****************************** +To install Ringtail from source code you will need the same dependencies as for the :ref:`PyPi installation `. +After activating the environment, navigate to the main Ringtail directory and run: .. code-block:: bash @@ -36,7 +64,7 @@ After activating the environment, navigate to the desired directory for installi $ cd Ringtail $ pip install . -This will automatically fetch the required modules and install them into the current conda environment. +This will automatically fetch the required modules and install them into the current environment. If you wish to make the code for Ringtail **editable** without having to re-run ``pip install .``, instead use @@ -45,17 +73,36 @@ If you wish to make the code for Ringtail **editable** without having to re-run $ pip install --editable . Test installation -******************* -If you would like to test your installation of Ringtail, a set of automated tests are included with the source code. To begin, you must install pytest in the Ringtail conda environment: +------------------ +If you would like to test your installation of Ringtail, or after you make changes to the code, a set of automated tests are included with the source code. To begin, you must install pytest in the Ringtail environment: .. code-block:: bash - $ pip install -U pytest + $ pip install pytest -Next, navigate to the ``test`` subdirectory within the cloned Ringtail directory and run pytest by simply calling +Next, navigate to the ``test`` subdirectory within the cloned Ringtail directory and run pytest by calling .. 
code-block:: bash $ pytest The compounds used for the testing dataset were taken from the `NCI Diversity Set V `_. The receptor used was `PDB: 4J8M `_. + +.. _envsetup: +Setting up your environment +************************** +To set up your environment use for example `conda `_ or `micromamba `_, and ensure the python version is 3.9, 3.10, 3.11, or 3.12 (Ringtail 2.0.0 has not been tested for other versions). + +.. code-block:: bash + + $ conda create -n ringtail python=3.10 + $ conda activate ringtail + +You can install packages from PyPi as well as other channels like ``conda-forge`` in your environment. To use PyPi/pip, you may have to first install it in your environment (especially for lightweight environment managers like micromamba). + +.. code-block:: bash + + $ conda install + + $ conda install -c conda-forge + From ace5b46901daca1f8442012a5e68dc48985b66c8 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:03:11 -0700 Subject: [PATCH 03/63] added mock import for matplotlib so all modules and methods show --- docs/source/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 6fac6554..1c0567f8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -36,3 +36,5 @@ html_theme = "sphinx_rtd_theme" html_static_path = ["_static"] + +autodoc_mock_imports = ["matplotlib"] From 8f272df38865fefe3cf1a552ba876eae7d2549a5 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:19:37 -0700 Subject: [PATCH 04/63] removed default debug level for cli --- ringtail/cli/rt_process_vs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ringtail/cli/rt_process_vs.py b/ringtail/cli/rt_process_vs.py index 16d20926..059c0b6f 100644 --- a/ringtail/cli/rt_process_vs.py +++ b/ringtail/cli/rt_process_vs.py @@ -10,13 +10,14 @@ from ringtail import logutils import traceback + def main(): time0 = time.perf_counter() try: # set up the logger logger = logutils.LOGGER - 
logger.add_filehandler(log_file="ringtail", level="DEBUG") + logger.add_filehandler(log_file="ringtail") # parse command line options and config file (if given) cmdinput = CLOptionParser() rtcore: RingtailCore = cmdinput.rtcore @@ -110,10 +111,11 @@ def main(): print(cmdinput.parser.epilog) return + if __name__ == "__main__": """Script that sets up a command line option parser (cloptionparser) and processes all arguments into dictionaries and options that are then used with the ringtail core api. This script will allow either a write or a read session at the time. Available database operations are described in the readme.md document of this codebase. """ - sys.exit(main()) \ No newline at end of file + sys.exit(main()) From 4f0331063ed348b6cf9d9c6bcddc5e5d87919078 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:20:08 -0700 Subject: [PATCH 05/63] removed default writing of log file --- ringtail/cli/rt_process_vs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ringtail/cli/rt_process_vs.py b/ringtail/cli/rt_process_vs.py index 059c0b6f..d589e28f 100644 --- a/ringtail/cli/rt_process_vs.py +++ b/ringtail/cli/rt_process_vs.py @@ -17,7 +17,6 @@ def main(): try: # set up the logger logger = logutils.LOGGER - logger.add_filehandler(log_file="ringtail") # parse command line options and config file (if given) cmdinput = CLOptionParser() rtcore: RingtailCore = cmdinput.rtcore From 678a94f61a42a1355d95452eac3510e88fa3a55f Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:21:19 -0700 Subject: [PATCH 06/63] updated log file change for debug mode --- docs/source/changes.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/changes.rst b/docs/source/changes.rst index 0f6e6c73..d2cfce83 100644 --- a/docs/source/changes.rst +++ b/docs/source/changes.rst @@ -3,7 +3,7 @@ Changes in Ringtail ###################### -Changes in 2.0.0: fully developed API +Changes in 2.0: fully developed API 
*************************************** Changes in keywords used for the command line tool @@ -24,16 +24,16 @@ Enhancements to the codebase ============================== * Fully developed API can use python for scripting exclusively (see :ref:`API ` page for full description) * Can add docking results directly without using file system (for vina only as output comes as a string). -* The Ringtail log is now written to a logging file in addition to STDOUT +* The Ringtail log is now written to a logging file in addition to STDOUT if log level is set to "DEBUG". Changes to code behavior ========================= * Interaction tables: one new table has been added (`Interactions`) which references the interaction id from `Interaction_indices`, while the table `Interaction_bitvectors` has been discontinued. -* A new method to update an existing database 1.1.0 (or 1.0.0) to 2.0.0 is included. However, if the existing database was created with the duplicate handling option, there is a chance of inconsistent behavior of anything involving interactions as the Pose_ID was not used as an explicit foreign key in db v1.0.0 and v1.1.0 (see Bug fixes below). +* A new method to update an existing database 1.1.0 (or 1.0.0) to 2.0 is included. However, if the existing database was created with the duplicate handling option, there is a chance of inconsistent behavior of anything involving interactions as the Pose_ID was not used as an explicit foreign key in db v1.0.0 and v1.1.0 (see Bug fixes below). Bug fixes =========== -* The option `duplicate_handling` could previously only be applied during database creation and produced inconsistent table behavior. Option can now be applied at any time results are added to a database, and will create internally consistent tables. **Please note: if you have created tables in the past and invoking the keyword `duplicate_handling` you may have errors in the "Interaction_bitvectors" table (<2.0.0). 
These errors cannot be recovered, and we recommend you re-make the database with Ringtail 2.0.0.** +* The option `duplicate_handling` could previously only be applied during database creation and produced inconsistent table behavior. Option can now be applied at any time results are added to a database, and will create internally consistent tables. **Please note: if you have created tables in the past and invoking the keyword `duplicate_handling` you may have errors in the "Interaction_bitvectors" table (<2.0). These errors cannot be recovered, and we recommend you re-make the database with Ringtail 2.0.** * Writing SDFs from filtering bookmarks: will check that bookmark exists and has data before writing, and will now produce SDFs for any bookmarks existing bookmarks. If the bookmark results from a filtering where `max_miss` < 0 it will note if the non-union bookmark is used, and if the base name for such bookmarks is provided it will default to the `basename_union` bookmark for writing the SDFs. * Output from filtering using `max_miss` and `output_all_poses=False`(default) now producing expected behavior of outputting only one pose per ligand. Filtering for interactions `max_miss` allows any given pose for a ligand to miss `max_miss` interactions and still be considered to pass the filter. Previously, in the resulting `union` bookmark and `output_log` text file some ligands would present with more than one pose, although the option to `output_all_poses` was `False` (and thus the expectation would be one pose outputted per ligand). This would give the wrong count for how many ligands passed a filter, as some were counted more than once. 
From 77074e55c71701dd634fd692d747b972aabfa5dc Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:28:38 -0700 Subject: [PATCH 07/63] updated remaining md to rst and some instruction changes --- docs/source/database_traversing.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/database_traversing.rst b/docs/source/database_traversing.rst index 2c0afa54..cd6fff29 100644 --- a/docs/source/database_traversing.rst +++ b/docs/source/database_traversing.rst @@ -2,20 +2,20 @@ Exploring the database in the command line ############################################ -View the data contained within the database using a terminal, we recommend using the [VisiData tool](https://www.visidata.org/). In addition to command line visualization, this tool has a number of other feature, including ploting. Upon opening the database with `vd`, the terminal should look like this: +To view the data contained within the database using a terminal, we recommend using the `VisiData tool <https://www.visidata.org/>`_. In addition to command line visualization, this tool has a number of other features, including plotting. Upon opening the database with ``vd``, the terminal should look like this: -![Screenshot from 2022-05-18 14-57-22](https://user-images.githubusercontent.com/41704502/169162632-3a71d338-faa1-4109-8f04-40a96ee6d24e.png) +.. image:: https://user-images.githubusercontent.com/41704502/169162632-3a71d338-faa1-4109-8f04-40a96ee6d24e.png -In this example (made with DLGs), the database contains ~3 poses for 9999 discrete ligands. Each of the rows here is a separate table or view within the database. From this screen, you can easily perform the sanity checks outline below. One should note that the number of column displayed on the first screen is 1 greater than the actual number of columns in a table (the number is correct for views). 
To more fully explore a given table, one may use the arrow keys or mouse to navigate to it, then press `Enter/Return` to access that table/view. The user may then scroll horizontally with the arrow keys, or press `q` to return up a level. +In this example (made with DLGs), the database contains ~3 poses for 9999 discrete ligands. Each of the rows here is a separate table or view within the database. From this screen, you can easily perform the sanity checks outlined below. One should note that the number of columns displayed on the first screen is 1 greater than the actual number of columns in a table (the number is correct for views). To more fully explore a given table, one may use the arrow keys or mouse to navigate to it, then press ``Enter/Return`` to access that table/view. The user may then scroll horizontally with the arrow keys, or press ``q`` to return up a level. -Using `vd` is particularly helpful to examine possible interactions of interest, stored within the `Interaction_indices` table. +Using ``vd`` is particularly helpful to examine possible interactions of interest, stored within the ``Interactions`` table. -To exit, return to the screen shown in the image above by pressing `q`, then press `q` to exit. +To exit, return to the screen shown in the image above by pressing ``q``, then press ``q`` to exit. Data integrity sanity checks ***************************** There are a few quick checks the user can make to ensure that the data has been properly written from the input files to the database. Discrepancies may indicate an error occurred while writing the database or the input file format did not match that which Ringtail expected. -- The number of rows in the `Ligands` table should match the number of input ligand files -- The number of rows in the `Results` table should be ~`max_poses`\* `number of files` and should be less than or equal to that number. 
For DLGs not every ligand may have up to `max_poses`, which is why the number of rows is typically smaller than `max_poses`\* `number of DLGs`. -- No ligand should have more than `max_poses` rows in the `Results` table. -- If storing all poses, the number of rows in the Results table should match the `number of ligands` * `number of output poses`. +- The number of rows in the ``Ligands`` table should match the number of input ligand files +- The number of rows in the ``Results`` table should be ~ ``max_poses`` * ``number of files`` and should be less than or equal to that number. For DLGs not every ligand may have up to ``max_poses``, which is why the number of rows is typically smaller than ``max_poses`` * ``number of DLGs``. +- No ligand should have more than ``max_poses`` rows in the ``Results`` table. +- If storing all poses, the number of rows in the Results table should match the ``number of ligands`` * ``number of output poses``. From 78de16716ee351dc637ddc6cbf5c764bef805e26 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:29:25 -0700 Subject: [PATCH 08/63] updated installation instructions --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index cb2188b8..f11c89bd 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -91,7 +91,7 @@ The compounds used for the testing dataset were taken from the `NCI Diversity Se .. _envsetup: Setting up your environment ************************** -To set up your environment use for example `conda `_ or `micromamba `_, and ensure the python version is 3.9, 3.10, 3.11, or 3.12 (Ringtail 2.0.0 has not been tested for other versions). +To set up your environment use for example `conda `_ or `micromamba `_, and ensure the python version is 3.9, 3.10, 3.11, or 3.12 (Ringtail 2.0 has not been tested for other versions). .. 
code-block:: bash From 0f4e124977e26c9f88845fe4a38a083a2da0e04a Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:29:54 -0700 Subject: [PATCH 09/63] updated how the scripts are referred to (removing .py and paths) --- docs/source/cmdline.rst | 62 ++++++++++++++++---------------- docs/source/compare.rst | 28 +++++++-------- docs/source/get_started.rst | 40 ++++++++++----------- docs/source/upgrade_database.rst | 6 ++-- 4 files changed, 68 insertions(+), 68 deletions(-) diff --git a/docs/source/cmdline.rst b/docs/source/cmdline.rst index 7a736f59..dfeab2d4 100644 --- a/docs/source/cmdline.rst +++ b/docs/source/cmdline.rst @@ -1,14 +1,14 @@ .. _cmdline: -Ringtail command line interface -############################### +Command line interface +####################### The Ringtail command line interface is the easiest method to use to for exploring virtual screening results in a database. If this is your first time learning about Ringtail, take a look at the page :ref:`Get started `. The current page uses the knowledge already presented on the 'Get started' page as we continue exploring the wealth of options that Ringtail offers. -The script for writing a database and filtering is ``rt_process_vs.py``. This is intended to be used for a set of DLGs/Vina PDBQTs pertaining to a single target and binding site. This may include multiple ligand libraries as long as the target and binding site is the same. Be cautious when adding results from multiple screening runs, since some target information is checked and some is not. One receptor PDBQT may also be saved to the database. +The script for writing a database and filtering is ``rt_process_vs``. This is intended to be used for a set of DLGs/Vina PDBQTs pertaining to a single target and binding site. This may include multiple ligand libraries as long as the target and binding site is the same. Be cautious when adding results from multiple screening runs, since some target information is checked and some is not. 
One receptor PDBQT may also be saved to the database. -The rt_process_vs.py script has two modes: ``write`` and ``read``. The desired mode must be specified in the command line before any other options are given (except ``-c [CONFIG]`` which is given first). The ``write`` mode is used to create a database for a virtual screening from ADGPU DLGs or Vina PDBQTs. After this initial run, a database is created and may be read directly by rt_process_vs.py in ``read`` mode for subsequent filtering and export operations. +The ``rt_process_vs`` script has two modes: ``write`` and ``read``. The desired mode must be specified in the command line before any other options are given (except ``-c [CONFIG]`` which is given first). The ``write`` mode is used to create a database for a virtual screening from ADGPU DLGs or Vina PDBQTs. After this initial run, a database is created and may be read directly by ``rt_process_vs`` in ``read`` mode for subsequent filtering and export operations. Please note that Ringtail does not automatically have permission to perform changes outside of the working directory, so be advised that any folders or documents that Ringtail outputs will be saved in the current working directory. @@ -19,7 +19,7 @@ Navigate to the data repository and chose one of several paths of adding results .. code-block:: bash - $ cd test/test_data/ + $ cd test/test_data/adgpu Input file_sources =================== @@ -32,7 +32,7 @@ For each of these options you can specify one or more arguments, and we can crea .. code-block:: bash - $ rt_process_vs.py write --file lig1.dlg lig2.dlg --file_path path1/ path2 --file_list filelist1.txt filelist2.txt --output_db example.db + $ rt_process_vs write --file lig1.dlg lig2.dlg --file_path path1/ path2 --file_list filelist1.txt filelist2.txt --output_db example.db Example file list: @@ -48,14 +48,14 @@ To include the details of a receptor in the database, it is necessary to provide .. 
code-block:: bash - $ python ../scripts/rt_process_vs.py write --file_list filelist1.txt --receptor_file test_data/4j8m.pdbqt.gz --save_receptor + $ rt_process_vs write --file_list filelist1.txt --receptor_file test_data/4j8m.pdbqt.gz --save_receptor It is possible to add docking results *or* a receptor file to a database that already exists. For this it is necessary to use the keyword ``--append_results``. You can also specify what to do if you are adding duplicate results for a ligand, by invoking the ``--duplicate_handling`` keyword with the value ``IGNORE`` (will not add the newest duplicate) or ``REPLACE`` (will overwrite the newest duplicate). Please note that the ``--duplicate_handling`` option makes database writing significantly slower. .. code-block:: bash - $ python ../scripts/rt_process_vs.py write --input_db output.db --file_path test_data/group2 --append_results --duplicate_handling REPLACE + $ rt_process_vs write --input_db output.db --file_path test_data/group2 --append_results --duplicate_handling REPLACE By default (for DLGs), Ringtail will store the best-scored (lowest energy) binding pose from the first 3 pose clusters in the DLG. For Vina, Ringtail will store the 3 best poses. Additional settings for writing to the database include how to handle the number of poses docked (``--max_poses``, or ``--store_all_poses`` which will overwrite the former). @@ -68,10 +68,10 @@ It is further possible to overwrite a database by use of the argument ``--overwr .. 
code-block:: bash #AD-GPU - $ python ../scripts/rt_process_vs.py write --input_db output.db --file_path test_data/group1 --max_poses 2 --interaction_tolerance 0.8 + $ rt_process_vs write --input_db output.db --file_path test_data/group1 --max_poses 2 --interaction_tolerance 0.8 #vina - $ python ../scripts/rt_process_vs.py write --input_db output.db --file_path test_data/vina --overwrite --receptor_file receptor.pdbqt --save_receptor --add_interactions --interaction_cutoffs 3.7,4.0 + $ rt_process_vs write --input_db output.db --file_path test_data/vina --overwrite --receptor_file receptor.pdbqt --save_receptor --add_interactions --interaction_cutoffs 3.7,4.0 Printing a database summary *************************** @@ -79,7 +79,7 @@ During both ``write`` and ``read`` it is possible to add the tag ``-su`` or ``-- .. code-block:: bash - $ rt_process_vs.py read --input_db output.db -su + $ rt_process_vs read --input_db output.db -su Total Stored Poses: 645 Total Unique Interactions: 183 @@ -105,14 +105,14 @@ Scoring filters .. code-block:: bash - $ python ../scripts/rt_process_vs.py read --input_db output.db --score_percentile 0.1 --log_file output_log_01percent.txt + $ rt_process_vs read --input_db output.db --score_percentile 0.1 --log_file output_log_01percent.txt The information written to the log can be specified with ``--outfields``. The full list of available output fields may be seen by using the ``--help`` option with ``read`` mode. By default, only the information for the top-scoring binding pose will be written to the log. If desired, each individual passing pose can be written by using the ``--output_all_poses`` flag. The passing results may also be ordered in the log file using the ``--order_results`` option. .. 
code-block:: bash - $ python ../scripts/rt_process_vs.py read --input_db output.db --eworst -6 --outfields Ligand_Name,e,rank,receptor --order_results ref_rmsd --bookmark_name eworst6 + $ rt_process_vs read --input_db output.db --eworst -6 --outfields Ligand_Name,e,rank,receptor --order_results ref_rmsd --bookmark_name eworst6 When filtering, the passing results are also saved as a view in the database. This view is named ``passing_results`` by default. The user can specify a name for the view using the ``--bookmark_name`` option. No filtering is performed if no filters are given (see full list of filters :ref:`here `). Filtering may take from seconds to minutes, depending on the size of the database, roughly scaling as O(n) for n database Results rows (i.e. stored poses). Data for poses in a view may be accessed later using the ``--data_from_bookmark`` option. @@ -127,7 +127,7 @@ The ``--max_miss`` option allows the user to filter by given interactions exclud .. code-block:: bash - $ python ../scripts/rt_process_vs.py read --input_db output.db --eworst -6 --hb_interactions A:VAL:279: A:LYS:162: --vdw_interactions A:VAL:279: A:LYS:162: --max_miss 1 --react_any + $ rt_process_vs read --input_db output.db --eworst -6 --hb_interactions A:VAL:279: A:LYS:162: --vdw_interactions A:VAL:279: A:LYS:162: --max_miss 1 --react_any Ligand filters ================= @@ -135,7 +135,7 @@ The ``--smarts_idxyz`` option may be used to filter for a specific ligand substr .. code-block:: bash - $ python ../scripts/rt_process_vs.py read --input_db output.db --eworst -6 --hb_interactions A:VAL:279: A:LYS:162: --vdw_interactions A:VAL:279: A:LYS:162: --max_miss 1 + $ rt_process_vs read --input_db output.db --eworst -6 --hb_interactions A:VAL:279: A:LYS:162: --vdw_interactions A:VAL:279: A:LYS:162: --max_miss 1 Clustering @@ -144,13 +144,13 @@ In addition to the filtering options outlined in the table below, ligands passin .. 
code-block:: bash - $ python ../scripts/rt_process_vs.py read --input_db output.db --filter_bookmark eworst6 --mfpt_cluster + $ rt_process_vs read --input_db output.db --filter_bookmark eworst6 --mfpt_cluster While not quite a filtering option, the user can provide a ligand name from a previously-run clustering and re-output other ligands that were clustered with that query ligand with ``--find_similar_ligands``. The user is prompted at runtime to choose a specific clustering group from which to re-output ligands. Filtering/clustering will be performed from the same command-line call prior to this similarity search, but all subsequent output tasks will be performed on the group of similar ligands obtained with this option unless otherwise specified. Outputs ********* -The primary outputs from ``rt_process_vs.py`` are the database itself (``write`` mode) and the filtering log file (``read`` mode). There are several other output options as well, intended to allow the user to further explore the data from a virtual screening. +The primary outputs from ``rt_process_vs`` are the database itself (``write`` mode) and the filtering log file (``read`` mode). There are several other output options as well, intended to allow the user to further explore the data from a virtual screening. The ``--plot`` flag generates a scatterplot of ligand efficiency vs docking score for the top-scoring pose from each ligand. Ligands passing the given filters or in the bookmark given with ``--bookmark_name`` will be highlighted in red. The plot also includes histograms of the ligand efficiencies and binding energies. The plot is saved as ``scatter.png``. @@ -170,18 +170,18 @@ Export results from a previous filtering as a CSV .. 
code-block:: bash - $ rt_process_vs.py write --file_path Files/ - $ rt_process_vs.py read --input_db output.db --score_percentile 0.1 --bookmark_name filter1 - $ rt_process_vs.py read --input_db output.db --export_bookmark_csv filter1 + $ rt_process_vs write --file_path Files/ + $ rt_process_vs read --input_db output.db --score_percentile 0.1 --bookmark_name filter1 + $ rt_process_vs read --input_db output.db --export_bookmark_csv filter1 Create scatterplot highlighting ligands passing filters ======================================================= .. code-block:: bash - $ rt_process_vs.py write --file_path Files/ - $ rt_process_vs.py read --input_db output.db --score_percentile 0.1 --bookmark_name filter1 - $ rt_process_vs.py read --input_db output.db --bookmark_name filter1 --plot + $ rt_process_vs write --file_path Files/ + $ rt_process_vs read --input_db output.db --score_percentile 0.1 --bookmark_name filter1 + $ rt_process_vs read --input_db output.db --bookmark_name filter1 --plot `all_ligands_scatter.png` @@ -193,8 +193,8 @@ It is possible to populate the argument list using a config file, which needs to .. code-block:: bash - $ rt_process_vs.py -c config_w.json write - $ rt_process_vs.py -c config_r.json read + $ rt_process_vs -c config_w.json write + $ rt_process_vs -c config_r.json read .. code-block:: python @@ -213,26 +213,26 @@ The Ringtail API can provide a config file template by running the following scr .. code-block:: bash - $ rt_generate_config_file.py + $ rt_generate_config_file Logging ******** -Ringtail comes with a global logger object that will write to a new text file for each time ``rt_process_vs.py`` is called. Any log messages will also be displayed in stdout. and the default logger level is "WARNING". It is possible to change the logger level by adding ``--debug`` for lowest level of logging (will make the process take longer) or ``--verbose`` for some additional, but not very deep, logging. 
+Ringtail comes with a global logger object that will write to a new text file each time ``rt_process_vs`` is called. Any log messages will also be displayed in stdout, and the default logger level is "WARNING". It is possible to change the logger level by adding ``--debug`` for lowest level of logging (will make the process take longer) or ``--verbose`` for some additional, but not very deep, logging. .. code-block:: bash - $ python ../scripts/rt_process_vs.py write --verbose --file_list filelist1.txt + $ rt_process_vs write --verbose --file_list filelist1.txt Access help message ******************** .. code-block:: bash - $ rt_process_vs.py --help + $ rt_process_vs --help - $ rt_process_vs.py write --help + $ rt_process_vs write --help - $ rt_process_vs.py read --help + $ rt_process_vs read --help Available command line arguments ********************************** diff --git a/docs/source/compare.rst b/docs/source/compare.rst index 95a8bf84..07bd4638 100644 --- a/docs/source/compare.rst +++ b/docs/source/compare.rst @@ -4,27 +4,27 @@ Compare docking results from different virtual screenings ########################################################## -The script ``rt_compare.py`` is designed to be used with databases already made and filtered. It is used to combine information across multiple virtual screenings to allow or exclude the selection of ligands passing filters across multiple targets/models. This can be useful for filtering out promiscuous ligands, a technique commonly used in exerimental high-throughput screening. It may also be used if selection of ligands binding multiple protein structures/conformations/homologs are desired. +The script ``rt_compare`` is designed to be used with databases already made and filtered. It is used to combine information across multiple virtual screenings to allow or exclude the selection of ligands passing filters across multiple targets/models. 
This can be useful for filtering out promiscuous ligands, a technique commonly used in experimental high-throughput screening. It may also be used if selection of ligands binding multiple protein structures/conformations/homologs is desired. -Programmatically, the ``rt_compare.py`` script is used to select ligands which are shared between the given filter bookmark(s) of some virtual screenings (``--wanted``) or exclusive to some screenings and not others (``--unwanted``). The script uses a subset of commands similar to ``rt_process_vs.py``. +Programmatically, the ``rt_compare`` script is used to select ligands which are shared between the given filter bookmark(s) of some virtual screenings (``--wanted``) or exclusive to some screenings and not others (``--unwanted``). The script uses a subset of commands similar to ``rt_process_vs``. The basic process of preparing to use this script and the concept behind it is thus: Let us assume that kinase1 is our target of interest. It has related proteins kinase1a and kinase1b. protein2 is an unrelated protein. 1. Create a database for each virtual screening on each target (kinase1.db, kinase1a.db, kinase1b.db, protein2.db) 2. Filter each database separately to get a set of virtual hits for each target. Each set of filters may be different as desired (e.g. change interaction filters for analogous residues). The bookmark within each database may be given as a single string (same bookmark name in every database) or multiple bookmark names (one per database) with the ``--bookmark_name`` option. If specifying multiple names, the order should match the order that the databases were provided in, beginning with wanted, then unwanted databases. The default name is ``passing_results``. -3. Use ``rt_compare.py`` to find ligands that pass the filters for kinase1 but not kinase1a or kinase1b. This will create a log file of the same format as that output from ``rt_process_vs.py``. +3.
Use ``rt_compare`` to find ligands that pass the filters for kinase1 but not kinase1a or kinase1b. This will create a log file of the same format as that output from ``rt_process_vs``. .. code-block:: bash - $ rt_compare.py --wanted kinase1.db --unwanted kinase1a.db kinase1b.db + $ rt_compare --wanted kinase1.db --unwanted kinase1a.db kinase1b.db 4. Other usage examples and output options given below. For example, one can also select for potential dual-target ligands with .. code-block:: bash - $ rt_compare.py --wanted kinase1.db protein2.db --unwanted kinase1a.db kinase1b.db + $ rt_compare --wanted kinase1.db protein2.db --unwanted kinase1a.db kinase1b.db Usage examples @@ -35,48 +35,48 @@ Select ligands found in "passing_results" bookmarks of vs1 but not vs2 or vs3 .. code-block:: bash - $ rt_compare.py --wanted vs1.db --unwanted vs2.db vs3.db + $ rt_compare --wanted vs1.db --unwanted vs2.db vs3.db Select ligands found in "passing_results" bookmarks of vs1 and vs2 but not vs3 or vs4 ====================================================================================== .. code-block:: bash - $ rt_compare.py --wanted vs1.db vs2.db --unwanted vs3.db vs4.db + $ rt_compare --wanted vs1.db vs2.db --unwanted vs3.db vs4.db Select ligands found in "passing_results" bookmarks of every vs except vs4 ============================================================================ .. code-block:: bash - $ rt_compare.py --wanted vs1.db vs2.db vs3.db --unwanted vs4.db + $ rt_compare --wanted vs1.db vs2.db vs3.db --unwanted vs4.db Select ligands found in "filter1" bookmarks of vs1 but not vs2 ============================================================== .. 
code-block:: bash - $ rt_compare.py --wanted vs1.db --unwanted vs2.db --bookmark_name filter1 + $ rt_compare --wanted vs1.db --unwanted vs2.db --bookmark_name filter1 Save bookmark of ligands found in "filter1" bookmarks of vs1 and vs2 but not vs3 or vs4 as "selective_bookmark" in vs1.db ========================================================================================================================== .. code-block:: bash - $ rt_compare.py --wanted vs1.db vs2.db --unwanted vs3.db vs4.db --save_bookmark selective_bookmark + $ rt_compare --wanted vs1.db vs2.db --unwanted vs3.db vs4.db --save_bookmark selective_bookmark Export bookmark set of ligands found in "filter1" bookmarks of vs1 and vs2 but not vs3 or vs4 as CSV ===================================================================================================== .. code-block:: bash - $ rt_compare.py --wanted vs1.db vs2.db --unwanted vs3.db vs4.db --export_csv + $ rt_compare --wanted vs1.db vs2.db --unwanted vs3.db vs4.db --export_csv -Access help message for rt_compare.py -************************************* +Access help message for rt_compare +********************************** .. code-block:: bash - $ rt_compare.py --help + $ rt_compare --help Supported arguments for the comparison script diff --git a/docs/source/get_started.rst b/docs/source/get_started.rst index becb1204..a7a2f76a 100644 --- a/docs/source/get_started.rst +++ b/docs/source/get_started.rst @@ -1,17 +1,17 @@ .. _get_started: -Getting started with Ringtail using the command line interface -############################################################### +Getting started +############### -The Ringtail command line interface is orchestrated through the script ``rt_process_vs.py``. +The Ringtail command line interface is orchestrated through the script ``rt_process_vs``. 
Create and populate a database ********************************* -Navigate to the directory containing the data, in our case test_data: +Navigate to the directory containing the data, in our case test_data from Autodock-GPU (make sure your Ringtail environment is active): .. code-block:: bash - $ cd test/test_data/ + $ cd test/test_data/adgpu To write to the database we need to specify a few things: - that we are using ``write`` mode @@ -23,13 +23,13 @@ Let us add all docking files within the path test_data (specified by ``.`` meani .. code-block:: bash - $ rt_process_vs.py write --file_path . --recursive + $ rt_process_vs write --file_path . --recursive We can print a summary of the contents of the database by using the optional tag ``-su`` or ``--summary`` and specifying the database database from which to ``read``: .. code-block:: bash - $ rt_process_vs.py read --input_db output.db -su + $ rt_process_vs read --input_db output.db -su Total Stored Poses: 645 Total Unique Interactions: 183 @@ -51,7 +51,7 @@ Let us start filtering with a basic docking score cutoff of -6 kcal/mol: .. code-block:: bash - $ rt_process_vs.py read --input_db output.db --eworst -6 + $ rt_process_vs read --input_db output.db --eworst -6 This produces an output log ``output_log.txt`` with the names of ligands passing the filter, as well as their binding energies. Each round of filtering is also stored in the database as a SQLite view, which we refer to as a "bookmark" (default value is ``passing_results``). @@ -60,38 +60,38 @@ For example, start out with filtering out the compounds that are within the 5th .. code-block:: bash - $ rt_process_vs.py read --input_db output.db --score_percentile 5 --log ep5_log.txt --bookmark_name ep5 + $ rt_process_vs read --input_db output.db --score_percentile 5 --log ep5_log.txt --bookmark_name ep5 Let's then further refine the set of molecules by applying an interaction filter for van der Waals interactions with V279 on the receptor: .. 
code-block:: bash - $ rt_process_vs.py read --input_db output.db --filter_bookmark ep5 --vdw_interactions A:VAL:279: --log ep5_vdwV279_log.txt --bookmark_name ep5_vdwV279 + $ rt_process_vs read --input_db output.db --filter_bookmark ep5 --vdw_interactions A:VAL:279: --log ep5_vdwV279_log.txt --bookmark_name ep5_vdwV279 The filtered molecules can then be exported as an e.g., SDF file which can be used for visual inspection in molecular graphics programs. At the same time, if pymol is installed, we can kick off a pymol session of the ligands .. code-block:: bash - $ rt_process_vs.py read --input_db output.db --bookmark_name ep5_vdwV279 --export_sdf_path ep5_vdwV279_sdfs --pymol + $ rt_process_vs read --input_db output.db --bookmark_name ep5_vdwV279 --export_sdf_path ep5_vdwV279_sdfs --pymol -Access help message for rt_process_vs.py -***************************************** +Access help message for rt_process_vs +************************************** .. code-block:: bash - $ rt_process_vs.py --help + $ rt_process_vs --help -Access help message for rt_process_vs.py write mode -*************************************************** +Access help message for rt_process_vs write mode +************************************************ .. code-block:: bash - $ rt_process_vs.py write --help + $ rt_process_vs write --help -Access help message for rt_process_vs.py read mode -************************************************** +Access help message for rt_process_vs read mode +*********************************************** .. code-block:: bash - $ rt_process_vs.py read --help + $ rt_process_vs read --help diff --git a/docs/source/upgrade_database.rst b/docs/source/upgrade_database.rst index 952f4495..f4eb774d 100644 --- a/docs/source/upgrade_database.rst +++ b/docs/source/upgrade_database.rst @@ -1,9 +1,9 @@ .. 
_upgrade_database: -Updating database written with v1.0.0/v1.1.0 to work with v2.0.0 -################################################################# +Updating database written with v1.0.0/v1.1.0 to work with v2.0 +############################################################### -If you have previously written a database with Ringtail v<2.0.0, it will need to be updated to be compatible with the newest v2.0.0 Ringtail package. We have included a script ``rt_db_to_v200.py`` to perform this updated. Please note that all existing bookmarks will be removed during the update. The usage is as follows: +If you have previously written a database with Ringtail v<2.0, it will need to be updated to be compatible with the newest v2.0 Ringtail package. We have included a script ``rt_db_to_v200.py`` to perform this update. Please note that all existing bookmarks will be removed during the update. The usage is as follows: .. code-block:: bash From 689ada993cc1ba5f497a6babb4252c35ed84aaee Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 11:30:29 -0700 Subject: [PATCH 10/63] updated how the scripts are referred to (removing .py and paths) --- docs/source/faq.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/faq.rst b/docs/source/faq.rst index e56f4d9d..eb1d6fa0 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -5,7 +5,7 @@ Frequently asked questions Potential pitfalls ********************** -Using the command line tool: any PDBQT files specified through any of the input options in ADGPU mode will be read by `rt_process_vs.py` as receptor files, even if the files actually represent ligands. Therefore, ligand PDBQT files should not be present in any directories given with `--file_path`. +Using the command line tool: any PDBQT files specified through any of the input options in ADGPU mode will be read by `rt_process_vs` as receptor files, even if the files actually represent ligands.
Therefore, ligand PDBQT files should not be present in any directories given with `--file_path`. When writing from Vina PDBQTs, ensure there are no other PDBQTs (input or receptor) in directories specified with `file_path` UNLESS the receptor PDBQT is specified with the `receptor_file` option in the same command line/method call. From e72b519918003716fd2c4ecbcffd69bfb9bf83db Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 12:34:45 -0700 Subject: [PATCH 11/63] added instructions on how to finalize db write manually --- docs/source/api.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/source/api.rst b/docs/source/api.rst index e7cf27ae..8408b8d7 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -106,6 +106,18 @@ By default (for DLGs), Ringtail will store the best-scored (lowest energy) bindi rtc.add_results_from_files( file_path = "path2" max_poses = 5) +Iteratively appending to a database +------------------------------------ +When results are added to the database, there is a final step where some tables are indexed, and some database properties saved. If you are adding data iteratively through e.g., a for-loop and adding some number at files at once, it is time-consuming (and not necessary) to do this every iteration. Instead, you can invoke the keyword ``finalize=False``, and run the finalization method separately at the end: + +.. 
code-block:: python + + for folder in enumerate("path_with_many_folders"): + rtc.add_results_from_files( file_path = folder, + finalize = False) + + rtc.finalize_write() + Filtering ********** From 3d789bb7e033ddcbf92f0eeab9fdf5cd5a275a93 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 13:11:56 -0700 Subject: [PATCH 12/63] updated readme --- README.md | 219 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 121 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 4ff36b17..70fb7317 100644 --- a/README.md +++ b/README.md @@ -46,22 +46,22 @@ at [Scripps Research](https://www.scripps.edu/). ##### Enhancements to the codebase - Fully developed API can use python for scripting exclusively - Can add docking results directly without using file system (for vina only as output comes as a string). -- The Ringtail log is now written to a logging file in addition to STDOUT +- The Ringtail log is now written to a logging file in addition to STDOUT if logging in DEBUG mode ##### Changes to code behavior - Interaction tables: one new table has been added (`Interactions`) which references the interaction id from `Interaction_indices`, while the table `Interaction_bitvectors` has been discontinued. -- A new method to update an existing database 1.1.0 (or 1.0.0) to 2.0.0 is included. However, if the existing database was created with the duplicate handling option, there is a chance of inconsistent behavior of anything involving interactions as the Pose_ID was not used as an explicit foreign key in db v1.0.0 and v1.1.0 (see Bug fixes below). +- A new method to update an existing database 1.1.0 (or 1.0.0) to 2.0 is included. However, if the existing database was created with the duplicate handling option, there is a chance of inconsistent behavior of anything involving interactions as the Pose_ID was not used as an explicit foreign key in db v1.0.0 and v1.1.0 (see Bug fixes below). 
##### Bug fixes -- The option `duplicate_handling` could previously only be applied during database creation and produced inconsistent table behavior. Option can now be applied at any time results are added to a database, and will create internally consistent tables. **Please note: if you have created tables in the past and invoking the keyword `duplicate_handling` you may have errors in the "Interaction_bitvectors" table. These errors cannot be recovered, and we recommend you re-make the database with Ringtail 2.0.0.** +- The option `duplicate_handling` could previously only be applied during database creation and produced inconsistent table behavior. Option can now be applied at any time results are added to a database, and will create internally consistent tables. **Please note: if you have created tables in the past and invoking the keyword `duplicate_handling` you may have errors in the "Interaction_bitvectors" table. These errors cannot be recovered, and we recommend you re-make the database with Ringtail 2.0.** - Writing SDFs from filtering bookmarks: will check that bookmark exists and has data before writing, and will now produce SDFs for any bookmarks existing bookmarks. If the bookmark results from a filtering where `max_miss` < 0 it will note if the non-union bookmark is used, and if the base name for such bookmarks is provided it will default to the `basename_union` bookmark for writing the SDFs. - Output from filtering using `max_miss` and `output_all_poses=False`(default) now producing expected behavior of outputting only one pose per ligand. Filtering for interactions `max_miss` allows any given pose for a ligand to miss `max_miss` interactions and still be considered to pass the filter. Previously, in the resulting `union` bookmark and `output_log` text file some ligands would present with more than one pose, although the option to `output_all_poses` was `False` (and thus the expectation would be one pose outputted per ligand). 
This would give the wrong count for how many ligands passed a filter, as some were counted more than once. -#### Updating database to work with v2.0.0 -If you have previously written a database with Ringtail < v2.0.0, it will need to be updated to be compatible with filtering with v2.0.0. We have included a new script `rt_db_to_v200.py` to perform this updated. Please note that all existing bookmarks will be removed during the update. The usage is as follows: +#### Updating database to work with v2.0 +If you have previously written a database with Ringtail < v2.0, it will need to be updated to be compatible with filtering with v2.0. We have included a new script `rt_db_to_v200` to perform this update. Please note that all existing bookmarks will be removed during the update. The usage is as follows: ``` -$ rt_db_to_v200.py -d +$ rt_db_to_v200 -d ``` Multiple databases may be specified at once. The update may take a few minutes per database. @@ -82,10 +82,10 @@ Code base and database schema version update ![rt_v11_timings](https://github.com/forlilab/Ringtail/assets/41704502/eac373fc-1324-45df-b845-6697dc9d1465) #### Updating database written with v1.0.0 to work with v1.1.0 -If you have previously written a database with Ringtail v1.0.0, it will need to be updated to be compatible with filtering with v1.1.0. We have included a new script `rt_db_v100_to_v110.py` to perform this updated. Please note that all existing bookmarks will be removed during the update. The usage is as follows: +If you have previously written a database with Ringtail v1.0.0, it will need to be updated to be compatible with filtering with v1.1.0. We have included a new script `rt_db_v100_to_v110` to perform this update. Please note that all existing bookmarks will be removed during the update. The usage is as follows: ``` -$ rt_db_v100_to_v110.py -d +$ rt_db_v100_to_v110 -d ``` Multiple databases may be specified at once. The update may take a few minutes per database.
@@ -105,49 +105,74 @@ Multiple databases may be specified at once. The update may take a few minutes p - [Definitions](https://github.com/forlilab/Ringtail#definitions) - [Getting Started Tutorial](https://github.com/forlilab/Ringtail#getting-started) - [Scripts](https://github.com/forlilab/Ringtail#scripts) -- [rt_process_vs.py Documentation](https://github.com/forlilab/Ringtail#rt_process_vspy-documentation) -- [rt_compare.py Documentation](https://github.com/forlilab/Ringtail#rt_comparepy-documentation) +- [rt_process_vs Documentation](https://github.com/forlilab/Ringtail#rt_process_vspy-documentation) +- [rt_compare Documentation](https://github.com/forlilab/Ringtail#rt_comparepy-documentation) - [Python tutorials](https://github.com/forlilab/Ringtail#brief-python-tutorials) -### Installation (from PyPI) -Please note that Ringtail requires Python 3.9 or 3.10. +### Installation +#### Create a Ringtail environment +It is necessary to create a Ringtail python environment for managing the external dependencies, conda will be used in the following examples but other environment managers such as the lightweight micromamba will also work. Please note that Ringtail requires Python 3.9 or higher. ```bash -$ pip install ringtail +$ conda create -n ringtail python=3.10 +$ conda activate ringtail ``` -If using conda, `pip` installs the package in the active environment. -Also note that if using MacOS, you may need to install Multiprocess separately: +#### From PyPi +Make sure your Ringtail environment is active, then install via pip + ```bash -$ pip install multiprocess +$ pip install ringtail ``` -### Installation (from source code) +You may need to install one or more of the listed dependencies, please note that multiprocess is only necessary for MacOS.
+ +```bash +$ pip install ``` -$ conda create -n ringtail python=3.10 -$ conda activate ringtail + +Chemicalite is required and only available on conda-forge: + +```bash +$ conda install -c conda-forge chemicalite ``` -After this, navigate to the desired directory for installing Ringtail and do the following: + +#### From conda-forge +Ringtail 2.0 is now available on conda-forge, and installation from conda-forge will handle all of the dependencies. + +```bash +$ conda install -c conda-forge ringtail ``` + +#### From source code +If wishing to install from source code, navigate to the desired directory for installing Ringtail and do the following: + +```bash $ git clone git@github.com:forlilab/Ringtail.git $ cd Ringtail $ pip install . ``` + This will automatically fetch the required modules and install them into the current conda environment. If you wish to make the code for Ringtail editable without having to re-run `pip install .`, instead use -``` + +```bash $ pip install --editable . ``` -### Test installation -If you would like to test your installation of Ringtail, a set of automated tests are included with the source code. To begin, you must install pytest in the Ringtail conda environment: -``` -$ pip install -U pytest -``` -Next, navigate to the `test` subdirectory within the cloned Ringtail directory and run pytest by simply calling + +From source code installation you can test your installation using the automated tests in the `Ringtail/test` directory. To begin, you must install pytest in the Ringtail environment: + +```bash +$ pip install pytest ``` + +Next, navigate to the `test` subdirectory and run pytest by calling + +```bash $ pytest ``` + The compounds used for the testing dataset were taken from the [NCI Diversity Set V](https://wiki.nci.nih.gov/display/NCIDTPdata/Compound+Sets). The receptor used was [PDB: 4J8M](https://www.rcsb.org/structure/4J8M). 
## Definitions @@ -161,25 +186,29 @@ The compounds used for the testing dataset were taken from the [NCI Diversity Se > Drat, I'm not a cat! Even though this eye-catching omnivore sports a few vaguely feline characteristics such as pointy ears, a sleek body, and a fluffy tail, the ringtail is really a member of the raccoon family. https://animals.sandiegozoo.org/animals/ringtail ## Getting started with the command line interface -The Ringtail command line interface is orchestrated through the script `rt_process_vs.py`. +The Ringtail command line interface is orchestrated through the script `rt_process_vs`. #### Create and populate a database -Navigate to the directory containing the data, in our case test_data: -``` -$ cd test/test_data/ +Navigate to the directory containing the data, in our case test_data/adgpu: + +```bash +$ cd test/test_data/adgpu/ ``` To write to the database we need to specify a few things: -- that we are using `write` mode +- that we are operating in `write` mode - source of docking results files. Docking results can be added either by providing one or more single files, a .txt file containing files, or by providing a directory containing docking results files. - optional database name: ringtail will default to creating a database of name `output.db` - optional docking mode: ringtail will default to assuming the files were produced by Autodock-GPU, if they are from vina specify `--mode vina` Let us add all docking files within the path test_data (specified by `.` meaning current directory), whose folders we can traverse recursively by specifying `--recursive` + +```bash +$ rt_process_vs write --file_path .
--recursive -``` + We can print a summary of the contents of the database by using the optional tag `-su` or `--summary` and specifying the database database from which to `read`: -``` -$ rt_process_vs.py read --input_db output.db -su + +```bash +$ rt_process_vs read --input_db output.db -su Total Stored Poses: 645 Total Unique Interactions: 183 @@ -194,39 +223,42 @@ max_leff: -0.13 kcal/mol 1%_leff: -0.58 kcal/mol 10%_leff: -0.47 kcal/mol ``` + #### Filtering and visualizing the data in the database Let us start filtering with a basic docking score cutoff of -6 kcal/mol: + +```bash +$ rt_process_vs read --input_db output.db --eworst -6 ``` -$ rt_process_vs.py read --input_db output.db --eworst -6 -``` + This produces an output log `output_log.txt` with the names of ligands passing the filter, as well as their binding energies. Each round of filtering is also stored in the database as a SQLite view, which we refer to as a "bookmark" (default value is `passing_results`). We can also save a round of filtering with a specific bookmark name, and perform more filtering on this bookmark. 
For example, start out with filtering out the compounds that are within the 5th percentile in terms of docking score and save the bookmark as `ep5`: + +```bash +$ rt_process_vs read --input_db output.db --score_percentile 5 --log_file ep5_log.txt --bookmark_name ep5 ``` -$ rt_process_vs.py read --input_db output.db --score_percentile 5 --log_file ep5_log.txt --bookmark_name ep5 -``` + Let's then further refine the set of molecules by applying an interaction filter for van der Waals interactions with V279 on the receptor: +```bash +$ rt_process_vs read --input_db output.db --filter_bookmark ep5 --vdw_interactions A:VAL:279: --log_file ep5_vdwV279_log.txt --bookmark_name ep5_vdwV279 ``` -$ rt_process_vs.py read --input_db output.db --filter_bookmark ep5 --vdw_interactions A:VAL:279: --log_file ep5_vdwV279_log.txt --bookmark_name ep5_vdwV279 -``` + The filtered molecules can then be exported as an e.g., SDF file which can be used for visual inspection in molecular graphics programs. At the same time, if pymol is installed, we can kick off a pymol session of the ligands +```bash +$ rt_process_vs read --input_db output.db --bookmark_name ep5_vdwV279 --export_sdf_path ep5_vdwV279_sdfs --pymol ``` -$ rt_process_vs.py read --input_db output.db --bookmark_name ep5_vdwV279 --export_sdf_path ep5_vdwV279_sdfs --pymol -``` -#### Access help message for rt_process_vs.py -``` -$ rt_process_vs.py --help -``` -#### Access help message for rt_process_vs.py write mode -``` -$ rt_process_vs.py write --help -``` -#### Access help message for rt_process_vs.py read mode -``` -$ rt_process_vs.py read --help + +#### Access help message for rt_process_vs +```bash +$ rt_process_vs --help + +$ rt_process_vs write --help + +$ rt_process_vs read --help ``` #### Ringtail arguments @@ -294,43 +326,43 @@ $ rt_process_vs.py read --help --- ### Scripts -The Ringtail package includes two command line oriented scripts: `rt_process_vs.py` and `rt_compare.py`. 
Both may be run with options specified in the command line and/or using options specified in a JSON-formatted file given with `--config`. Command line options override any conflicting options in the config file. +The Ringtail package includes two command line oriented scripts: `rt_process_vs` and `rt_compare`. Both may be run with options specified in the command line and/or using options specified in a JSON-formatted file given with `--config`. Command line options override any conflicting options in the config file. -[rt_process_vs.py](https://github.com/forlilab/Ringtail#rt_process_vspy-documentation) serves as the primary script for the package and is used to both write docking files to a SQLite database and to perform filtering and export tasks on the database. It is designed to handle docking output files associated with a single virtual screening in a single database. +[rt_process_vs](https://github.com/forlilab/Ringtail#rt_process_vspy-documentation) serves as the primary script for the package and is used to both write docking files to a SQLite database and to perform filtering and export tasks on the database. It is designed to handle docking output files associated with a single virtual screening in a single database. -[rt_compare.py](https://github.com/forlilab/Ringtail#rt_comparepy-documentation) is used to combine information across multiple virtual screenings (in separate databases) to allow or exclude the selection of ligands passing filters across multiple targets/models. This can be useful for filtering out promiscuous ligands, a technique commonly used in exerimental high-throughput screening. It may also be used if selection of ligands binding multiple protein structures/conformations/homologs are desired. 
+[rt_compare](https://github.com/forlilab/Ringtail#rt_comparepy-documentation) is used to combine information across multiple virtual screenings (in separate databases) to allow or exclude the selection of ligands passing filters across multiple targets/models. This can be useful for filtering out promiscuous ligands, a technique commonly used in exerimental high-throughput screening. It may also be used if selection of ligands binding multiple protein structures/conformations/homologs are desired. -[rt_generate_config_file.py](https://github.com/forlilab/Ringtail#rt_generate_config_filepy-documentation) can be ran to create a config file template +[rt_generate_config_file](https://github.com/forlilab/Ringtail#rt_generate_config_filepy-documentation) can be ran to create a config file template -[rt_db_to_v200.py](https://github.com/forlilab/Ringtail#Updating-database-to-work-with-v200) is used to update older databases to the latest version. +[rt_db_to_v200](https://github.com/forlilab/Ringtail#Updating-database-to-work-with-v200) is used to update older databases to the latest version. -[rt_db_v100_to_v110.py](https://github.com/forlilab/Ringtail#Updating-database-written-with-v100-to-work-with-v110) is used to update db v1.0.0 to 1.1.0. +[rt_db_v100_to_v110](https://github.com/forlilab/Ringtail#Updating-database-written-with-v100-to-work-with-v110) is used to update db v1.0.0 to 1.1.0. -#### rt_compare.py Documentation -The `rt_compare.py` script is designed to be used with databases already made and filtered. The script is used to select ligands which are shared between the given filter bookmark(s) of some virtual screenings (wanted) or exclusive to some screenings and not others (unwanted). The script uses a subset of commands similar to `rt_process_vs.py`. +#### rt_compare Documentation +The `rt_compare` script is designed to be used with databases already made and filtered. 
The script is used to select ligands which are shared between the given filter bookmark(s) of some virtual screenings (wanted) or exclusive to some screenings and not others (unwanted). The script uses a subset of commands similar to `rt_process_vs`. An example of use: select ligands found in "filter_bookmark" bookmarks of database1 but not database2 (they must both contain a bookmark named "filter1"): -``` -rt_compare.py --wanted database1.db --unwanted database2.db --bookmark_name filter_bookmark + +```bash +rt_compare --wanted database1.db --unwanted database2.db --bookmark_name filter_bookmark ``` For more detailed description of usage, please see [the readthedocs.org site for ringtail](https://ringtail.readthedocs.io/en/latest/compare.html). -#### rt_generate_config_file.py Documentation - - ## Advanced usage: scripting with Ringtail API Ringtail has been re-designed to allow for direct use of its API for e.g., scripting purposes. This circumvents the use of the command line tools, and allows for more advanced usage. The available operations and keywords are the same as for the command line interface, but the methods can now be accessed at a more granular level if desired. For docking engines that provides direct string output such as Vina, it is also possible to save the docking results output directly to the database as a string and thus circumventing use of the computer file system (some link to vina scripting, probably in readthedocs). #### Instantiating the Ringtail object A ringtail core is created by instantiating a `RingtailCore` object with a database. Currently, a database can only be added upon instantiation. 
-``` + +```bash rtc = RingtailCore("output.db") ``` Default logging level is "WARNING", and a different logger level can be set at the time of object instantiation, or later by the log level change API: -``` + +```bash rtc = RingtailCore(db_file="output.db", logging_level="DEBUG) # or rtc.logger.set_level("INFO") @@ -341,47 +373,33 @@ To add results to the database, use the `add_results_from_files` method that tak as well as a receptor path and database properties and how to handle the resutls (how many poses to save, how to deal with interactions if having vina results), and whether or not to print a summary after writing the results to the database. -``` +```python rtc.add_results_from_files( file_path = "test_data/", recursive = True, save_receptor = False, max_poses = 3) ``` -Both files (`filesources_dict`) and processing options (`optionsdict`) can be provided as dictionaries as well or instead of the the individual options. Any provided individual options will overwrite the options provided through dictionaries. The use and prioritization of dictionaries and method attributes is true for most of the available API methods. - -``` -file_sources = { - "file_path": "test_data/", - "recursive": True, -} - -writeoptions = { - "store_all_poses": True, - "max_proc": 4 -} - -rtc.add_results_from_files( filesources_dict = file_sources, - optionsdict = writeoptions,) -``` If at any point you wish to print a summary of the database, the method can be called directly: -``` + +```python rtc.produce_summary() ``` The default docking mode is "dlg", and can be changed to "vina" by accessing the ringtail core property `docking_mode`. 
-``` + +```python rtc_vina = RingtailCore("output_vina.db") rtc_vina.docking_mode = "vina" ``` + Since vina does not automatically write docking results to the file system, these can be added to the database by associating them with a ligand name in a dictionary and using this dictionary as the source of results when adding to the database: -``` -vina_docking_result1 = "long string of results" -vina_docking_result2 = "different string of results" + +```python vina_results = { - "ligand1": vina_docking_result1, - "ligand2": vina_docking_result2 + "ligand1": vina_docking_ligand1_result, + "ligand2": vina_docking_ligand2_result } rtc_vina.add_results_from_vina_string(results_strings = vina_results, @@ -391,26 +409,31 @@ rtc_vina.add_results_from_vina_string(results_strings = vina_results, #### Filtering and visualizing the data in the database To filter, simply access the API method `filter` and provide desired filter values. Names of bookmark and output log for containing filtered results can be specified in the method. 
-``` + +```python rtc.filter(eworst=-6, bookmark_name = "e6", log_file = "filtered_results.txt") ``` + Just like with the command line tool, you can choose to filter over a bookmark that has already been created: -``` + +```python rtc.filter(vdw_interactions=[('A:VAL:279:', True), ('A:LYS:162:', True)], bookmark_name = "e6vdw279162", filter_bookmark = "e6", log_file = "filtered_results_2.txt") ``` + To export filtered molecules in a specific bookmark to SDF files use the following method, where the `sdf_path` directory will be created if it does not already exist: -``` +```python rtc.write_molecule_sdfs(sdf_path = "sdf_files", bookmark_name = "e6vdw279162") ``` One or more of the filtered ligands can be visualized in PyMol: -``` + +```python rtc.pymol(bookmark_name = "e6vdw279162") ``` From 08f491916dc0d34d35c78de511f92d062a32928a Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 13:16:17 -0700 Subject: [PATCH 13/63] corrected minor sentence structure error --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 70fb7317..461c7d45 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ If you wish to make the code for Ringtail editable without having to re-run `pip $ pip install --editable . ``` -From source code installation you can test your installation using the automated tests in the `Ringtail/test` directory. To begin, you must install pytest in the Ringtail environment: +You can test the source code installation using the automated tests in the `Ringtail/test` directory. 
To begin, you must install pytest in the Ringtail environment: ```bash $ pip install pytest From 11c22e7f78039829bd3f8f0d29ec479c094a99e8 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 13:20:11 -0700 Subject: [PATCH 14/63] fix spelling error in test dir --- MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index f5329370..91ad2553 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ include README.md include LICENSE include docs/* -include tests/* \ No newline at end of file +include test/* \ No newline at end of file From 293fb6496248f5708e80fb9285218211dbb72a4d Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 14:51:03 -0700 Subject: [PATCH 15/63] update what python versions ringtail will work with --- README.md | 6 +++--- docs/source/installation.rst | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 461c7d45..99d78e07 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ $ rt_db_v100_to_v110 -d 3.9, tested up to 3.12) +- python (> 3.9, tested up to 3.11) - RDKit - SciPy - Matplotlib @@ -111,10 +111,10 @@ Multiple databases may be specified at once. The update may take a few minutes p ### Installation #### Create a Ringtail environment -It is necessary to create a Ringtail python environment for managing the external dependencies, conda will be used in the following examples but other environment managers such as the lightweight micromamba will also work. Please note that Ringtail requires Python 3.9 or higher. +It is necessary to create a Ringtail python environment for managing the external dependencies, conda will be used in the following examples but other environment managers such as the lightweight micromamba will also work. Please note that Ringtail requires Python 3.9, 3.10, or 3.11. 
```bash -$ conda create -n Ringtail python=3.10 +$ conda create -n ringtail python=3.11 $ conda activate ringtail ``` diff --git a/docs/source/installation.rst b/docs/source/installation.rst index f11c89bd..d79f0da7 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -2,7 +2,7 @@ Installing ringtail ################### -There are three different alternatives to installing Ringtail: through :ref:`conda-forge ` which will install all dependencies, through the Python package manager :ref:`PyPi ` where some packages need to be installed separately, and directly from :ref:`source code ` for advanced users looking to make their own code changes. It is necessary to use an environment manager like conda or mamba to organize your Ringtail :ref:`environment ` as some of the dependencies can only be installed in a managed environment. The installation instructions uses conda as an example, but you are free to use any python environment manager. Ringtail 2.0 requires Python 3.9 or higher (tested to 3.12). +There are three different alternatives to installing Ringtail: through :ref:`conda-forge ` which will install all dependencies, through the Python package manager :ref:`PyPi ` where some packages need to be installed separately, and directly from :ref:`source code ` for advanced users looking to make their own code changes. It is necessary to use an environment manager like conda or mamba to organize your Ringtail :ref:`environment ` as some of the dependencies can only be installed in a managed environment. The installation instructions use conda as an example, but you are free to use any python environment manager. Ringtail 2.0 requires Python 3.9, 3.10, or 3.11. .. _pypi: Installation from PyPI @@ -91,11 +91,11 @@ The compounds used for the testing dataset were taken from the `NCI Diversity Se .. 
_envsetup: Setting up your environment ************************** -To set up your environment use for example `conda `_ or `micromamba `_, and ensure the python version is 3.9, 3.10, 3.11, or 3.12 (Ringtail 2.0 has not been tested for other versions). +To set up your environment use for example `conda `_ or `micromamba `_, and ensure the python version is 3.9, 3.10, or 3.11. .. code-block:: bash - $ conda create -n ringtail python=3.10 + $ conda create -n ringtail python=3.11 $ conda activate ringtail You can install packages from PyPi as well as other channels like ``conda-forge`` in your environment. To use PyPi/pip, you may have to first install it in your environment (especially for lightweight environment managers like micromamba). From 12561a8288488b33bbcd9630c5eebe3d7864256f Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 16:48:28 -0700 Subject: [PATCH 16/63] no longer count failed files in the final processed file talley --- ringtail/mpmanager.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ringtail/mpmanager.py b/ringtail/mpmanager.py index 3403a63b..a966b111 100644 --- a/ringtail/mpmanager.py +++ b/ringtail/mpmanager.py @@ -145,9 +145,7 @@ def process_results(self): w.join() - self.logger.info( - "Wrote {0} docking results to the database".format(self.num_files) - ) + self.logger.info(f"Wrote {self.num_files} docking results to the database") def _process_data_sources(self): """Adds each docking result item to the queue, including files and data provided as string/dict. 
@@ -243,6 +241,7 @@ def _check_for_worker_exceptions(self): f.write( str(datetime.now()) + f"\tRingtail failed to parse {filename}\n" ) + self.num_files -= 1 self.logger.debug(tb) def _kill_all_workers(self, error, filename, tb): From d5fdfb29fdb306a0c0139d242541ff8db2caa2bc Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 16 Sep 2024 18:22:56 -0700 Subject: [PATCH 17/63] bug fix for adding interaction while writing vina results --- ringtail/interactions.py | 2 +- ringtail/logutils.py | 2 ++ test/test_units.py | 3 +-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ringtail/interactions.py b/ringtail/interactions.py index 71be5ee7..09552623 100644 --- a/ringtail/interactions.py +++ b/ringtail/interactions.py @@ -23,7 +23,7 @@ def __init__(self, rec_string, interaction_cutoff_radii): try: self.pdb = PDBQTReceptor(rec_string) except OSError as e: - with tempfile.NamedTemporaryFile(dir="/dev/shm", mode="wt") as f: + with tempfile.NamedTemporaryFile(mode="wt") as f: f.write(rec_string) self.pdb = PDBQTReceptor(f.name) self.interaction_cutoff_radii = interaction_cutoff_radii diff --git a/ringtail/logutils.py b/ringtail/logutils.py index 212cdd51..2e7ede45 100644 --- a/ringtail/logutils.py +++ b/ringtail/logutils.py @@ -110,6 +110,8 @@ def set_level(self, log_level: str): return elif log_level != self.level(): self.logger.setLevel(log_level) + if self.logger.level == "DEBUG": + self.add_filehandler() if self._log_fp is not None: self._log_fp.setLevel(log_level) if self.log_console is not None: diff --git a/test/test_units.py b/test/test_units.py index 4aecd3b7..09127c34 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -483,8 +483,7 @@ def test_vina_string_add(self, countrows): def test_add_interactions(self, countrows): vina_path = "test_data/vina" - rtc = RingtailCore("output.db") - rtc.logger.set_level("DEBUG") + rtc = RingtailCore("output.db", logging_level="DEBUG") rtc.docking_mode = "vina" rtc.add_results_from_files( file_path=vina_path, 
From d261acd02e0113a28fdee8cb740c70dd255ad8d1 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Tue, 17 Sep 2024 13:20:31 -0700 Subject: [PATCH 18/63] added readthedocs link --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 99d78e07..25cdb433 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,8 @@ Ringtail is developed by the [Forli lab](https://forlilab.org/) at the [Center for Computational Structural Biology (CCSB)](https://ccsb.scripps.edu) at [Scripps Research](https://www.scripps.edu/). +In-depth documentation can be found on [ReadTheDocs](https://ringtail.readthedocs.io/en/latest/). + ### New in version 2.0 ##### Changes in keywords used for the command line tool From fd5b83d81f54b7266a065d3fd9833ef3d835b7bc Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 20 Sep 2024 10:52:43 -0700 Subject: [PATCH 19/63] writes bitvector table and use it for interaction filtering, no other functionality added back --- ringtail/storagemanager.py | 185 ++++++++++++++++++++++++++++++++----- 1 file changed, 161 insertions(+), 24 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 55f86e50..fce42ad5 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -140,8 +140,6 @@ def close_storage(self, attached_db=None, vacuum=False): """ if attached_db is not None: self._detach_db(attached_db) - # drop indices created when filtering - self._remove_indices() # close any open cursors self._close_open_cursors() # vacuum database @@ -196,14 +194,26 @@ def insert_interactions(self, Pose_IDs: list, interactions_list, duplicates): # for each pose id, list interaction_rows = [] + interaction_bv_rows = [] for index, Pose_ID in enumerate(Pose_IDs): # add interaction if unique, returns index of interaction + # insert_interaction_index_row will add a column in interaction_bitvectors if necessary pose_interactions = [ ((Pose_ID,) + self._insert_interaction_index_row(interaction_tuple)) for 
interaction_tuple in interactions_list[index] ] # adds each pose_interaction row to list interaction_rows.extend(pose_interactions) + # create list of indices + pose_bitvector_precursors = [ + interaction[1] for interaction in pose_interactions + ] + # prepend pose id + pose_bitvector_precursors.insert(0, Pose_ID) + interaction_bv_rows.extend([pose_bitvector_precursors]) + # has the form [pose_id, int_ind1, int_ind2, etc] + # then add new row for pose in bitvector table + self._insert_interaction_bitvector_rows(interaction_bv_rows, duplicates) self._insert_interaction_rows(interaction_rows, duplicates) # endregion @@ -252,6 +262,7 @@ def filter_results(self, all_filters: dict, suppress_output=False) -> iter: self.logger.debug("Running filtering query...") time0 = time.perf_counter() + print(" filter results str", filter_results_str) filtered_results = self._run_query(filter_results_str).fetchall() self.logger.debug( f"Time to run query: {time.perf_counter() - time0:.2f} seconds" @@ -508,6 +519,7 @@ def _create_tables(self): self._create_ligands_table() self._create_receptors_table() self._create_interaction_index_table() + self._create_interaction_bitvector_table() self._create_interaction_table() self._create_bookmark_table() self._create_db_properties_table() @@ -1287,6 +1299,120 @@ def _create_interaction_index_table(self): f"Error while creating interaction index table: {e}" ) from e + def _create_interaction_bitvector_table(self): + """Create table of Pose_IDs and their interaction bitvector fingerprint decomposed into columns (one per interaction). + + Columns are: + Pose_ID INTEGER FOREIGN KEY from RESULTS(Pose_ID), + int_0 (number corresponds to interaction_id in Interaction_indices table) + int_1 + ... 
+ int_n + + Raises: + DatabaseTableCreationError + """ + + interaction_bv_table = f"""CREATE TABLE Interaction_bitvectors ( + interaction_bv_id INTEGER PRIMARY KEY AUTOINCREMENT, + Pose_ID INTEGER, + FOREIGN KEY (Pose_ID) REFERENCES RESULTS(Pose_ID));""" + + try: + cur = self.conn.cursor() + cur.execute(interaction_bv_table) + cur.close() + self.logger.debug("Interaction bitvector table has been created") + except sqlite3.OperationalError as e: + raise DatabaseTableCreationError( + f"Error while creating interaction bitvector table: {e}." + ) from e + + def _insert_interaction_bitvector_rows( + self, pose_id_interaction_indices: list, duplicates + ): + """One row is one Pose_id, inserts a 1 in any column where the column name represents an interaction_index that pose_id has. + Pose_ID that is 1-to-1 with Results table. + + Args: + pose_id_interaction_indices (list): list of pose_id, followed by all interaction indices + duplicates (list(int)): list of pose_ids from results table deemed duplicates, can also contain Nones, will be treated according to self.duplicate_handling + + Raises: + DatabaseInsertionError + """ + # I need a for loop unpacking the list of tuples + # each item in the list is a pose id and its corresponding interaction indices + # so here is a first problem, do I do one insert statement for each pose id? + # I can do an executemany but then I have to make interaction tuples for all the represented indices + # I could do a: find longest interaction tuple, for each pose id make a list of that length of zeros + # then + # remove pose id + + # tuple of as many 1s as a pose id has interactions, used in the executemany statement, minus 1 since + + # make a for loop to prepare the insert statements, and do not use executemany to begin with + # this will force me to clean up these duplicate handling methods I think + sql_insert_full = """INSERT INTO Interaction_bitvectors (Pose_ID""" + unnamed_params = "(?" 
+ try: + cur = self.conn.cursor() + if not self.duplicate_handling: # add all results + # for each pose id + for pose in pose_id_interaction_indices: + sql_insert_full = """INSERT INTO Interaction_bitvectors (Pose_ID""" + unnamed_params = "(?" + # make list of all interaction indices, remove pose id + interaction_indices: list = pose[1:] + for interaction in sorted(interaction_indices): + # add name of column for given interaction_index + sql_insert_full += f""",int_{interaction}""" + unnamed_params += ",?" + # remove the last comma + unnamed_params += ")" + sql_insert_full += ") VALUES " + unnamed_params + # create list of 1s for each interaction + interaction_bits = [1 for _ in range(len(interaction_indices))] + # add pose id to start of list + interaction_bits.insert(0, pose[0]) + # convert list to tuple for sql insert + interaction_bit_tuple = tuple(interaction_bits) + cur.execute(sql_insert_full, interaction_bit_tuple) + else: + # first, add any poses that are not duplicates + non_duplicates = [ + interaction_row + for interaction_row in interaction_rows + if interaction_row[0] not in duplicates + ] + # check if there are duplicates or if duplicates list contains only None + duplicates_exist = bool(duplicates.count(None) != len(duplicates)) + cur.executemany(sql_insert, non_duplicates) + + # only look for values to replace if there are duplicate pose ids + if self.duplicate_handling == "REPLACE" and duplicates_exist: + # delete all rows pertaining to duplicated pose_ids + duplicated_pose_ids = [id for id in duplicates if id is not None] + self._delete_interactions(duplicated_pose_ids) + # insert the interaction tuples for the new pose_ids + duplicates_only = [ + interaction_row + for interaction_row in interaction_rows + if interaction_row[0] in duplicates + ] + cur.executemany(sql_insert, duplicates_only) + + elif self.duplicate_handling == "IGNORE": + # ignore and don't add any poses that are duplicates + pass + self.conn.commit() + cur.close() + + except 
sqlite3.OperationalError as e: + raise DatabaseInsertionError( + f"Error while inserting an interaction row: {e}" + ) from e + def _create_interaction_table(self): """Create table a "tall-skinny" table of each pose-interaction. This table enables proper handling of duplicates if specified. @@ -1399,6 +1525,12 @@ def _insert_interaction_index_row(self, interaction_tuple) -> tuple: """ Writes unique interactions and returns the interaction_id of the given interaction + Args: + interaction_tuple (tuple): (rec_chain, rec_resname, rec_resid, rec_atom, rec_atomid) + + Returns: + tuple: if interaction index (int_index,) + Raises: DatabaseInsertionError """ @@ -1422,8 +1554,13 @@ def _insert_interaction_index_row(self, interaction_tuple) -> tuple: if not interaction_index: # get table length and use that as index interaction_index = (self._get_length_of_table("Interaction_indices"),) + # create and insert new interaction id input_tuple = interaction_index + interaction_tuple cur.execute(sql_insert, input_tuple) + # create new column in interaction_bitvector table + cur.execute( + f"""ALTER TABLE Interaction_bitvectors ADD COLUMN int_{str(interaction_index[0])}""" + ) self.conn.commit() else: interaction_index = interaction_index[0] @@ -1431,9 +1568,7 @@ def _insert_interaction_index_row(self, interaction_tuple) -> tuple: return interaction_index except sqlite3.OperationalError as e: raise DatabaseInsertionError( - "Error inserting unique interaction tuples in index table: {0}".format( - e - ) + f"Error inserting unique interaction tuples in index table: {e}" ) from e def _delete_interactions(self, Pose_IDs): @@ -1555,21 +1690,6 @@ def _create_indices(self): except sqlite3.OperationalError as e: raise StorageError("Error occurred while indexing") from e - def _remove_indices(self): - """Removes idx_filter_cols and idx_ligname - - Raises: - StorageError - """ - try: - cur = self.conn.cursor() - cur.execute("DROP INDEX IF EXISTS idx_filter_cols") - cur.execute("DROP INDEX IF 
EXISTS idx_ligname") - cur.close() - self.logger.info("Existing indicies pertaining to filtering were dropped.") - except sqlite3.OperationalError as e: - raise StorageError("Error while dropping indices") from e - def _delete_table(self, table_name: str): """ Method to delete a table @@ -2509,10 +2629,9 @@ def _generate_result_filtering_query(self, filters_dict): ) # remove bool include/exclude flag interaction_indices = self._run_query(interact_index_str) for i in interaction_indices: + # create a list of interaction indices interaction_filter_indices.append(i[0]) - # so it creates indexes for specified interactions to keep using in filtering - # catch if interaction not found in database if interaction_filter_indices == []: if interaction == ["R", "", "", "", "", True]: @@ -2537,10 +2656,13 @@ def _generate_result_filtering_query(self, filters_dict): ) # find pose ids for ligands with desired interactions # this query can be joining interaction_indices on the list of indices, and all the columns matching in interactions table (index that table) + # i think this is where things get slow + # this method has changed from _generate_interaction_filtering_query to _generate_interaction_bv_filtering_query for testing purposes + interaction_queries.append( "Pose_ID {include_str} ({interaction_str})".format( include_str=include_str, - interaction_str=self._generate_interaction_filtering_query( + interaction_str=self._generate_interaction_bv_filtering_query( interaction_filter_indices ), ) @@ -2927,6 +3049,21 @@ def _generate_interaction_filtering_query(self, interaction_index_list): [f"""interaction_id={index}""" for index in interaction_index_list] ) + def _generate_interaction_bv_filtering_query(self, interaction_index_list): + """takes list of interaction indices and searches for ligand ids + which have those interactions + + Args: + interaction_index_list (list): List of interaction indices + + Returns: + String: SQLite-formatted query + """ + return ( + "SELECT 
Pose_id FROM (SELECT * FROM Interaction_bitvectors WHERE Pose_ID IN subq) WHERE " + + " OR ".join([f"int_{index} = 1" for index in interaction_index_list]) + ) + def _generate_ligand_filtering_query(self, ligand_filters): """write string to select from ligand table @@ -3173,7 +3310,7 @@ def update_database_version(self, new_version, consent=False): ) cur.execute("ALTER TABLE Bookmarks ADD COLUMN filters") cur.execute( - "CREATE INDEX allind ON Results(LigName, docking_score, leff, deltas, reference_rmsd, energies_inter, energies_vdw, energies_electro, energies_intra, nr_interactions, run_number, pose_rank, num_hb)" + "CREATE INDEX IF NOT EXISTS allind ON Results(LigName, docking_score, leff, deltas, reference_rmsd, energies_inter, energies_vdw, energies_electro, energies_intra, nr_interactions, run_number, pose_rank, num_hb)" ) cur.execute( "CREATE INDEX IF NOT EXISTS intind ON Interaction_indices(interaction_type, rec_chain, rec_resname, rec_resid, rec_atom, rec_atomid)" From 7bf8d45abfdd7a2597156e0614b2d6afc539af69 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 20 Sep 2024 17:08:21 -0700 Subject: [PATCH 20/63] updated rdkit method to use bitvector table --- ringtail/ringtailcore.py | 16 +++++++++++----- ringtail/storagemanager.py | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index d9600a5b..9ed9f73a 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -153,18 +153,24 @@ def _add_poses( flexres_pose, ) in poses: # fetch info about pose interactions and format into string with format -::::, joined by commas - interactions = self.storageman.fetch_pose_interactions(Pose_ID) + pose_bitvector = self.storageman.fetch_interaction_bitvector(Pose_ID) # if that pose id has interactions - if interactions is not None: + if pose_bitvector is not None: # make a list of all of them + interaction_indices = [] interactions_list = [] - # for each 
interaction row, make into a string according to format above - for interaction_info in interactions: + # for each interaction bit, make into a string according to format above + for idx, bit in enumerate(pose_bitvector): + if bit == 1: + interaction_indices.append(idx) + for int_idx in interaction_indices: + interaction_info = self.storageman.fetch_interaction_info_by_index( + int_idx + ) interaction = ( interaction_info[0] + "-" + ":".join(interaction_info[1:]) ) interactions_list.append(interaction) - interactions_str = ", ".join(interactions_list) properties["Interactions"].append(interactions_str) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index fce42ad5..cc2ed328 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -262,7 +262,6 @@ def filter_results(self, all_filters: dict, suppress_output=False) -> iter: self.logger.debug("Running filtering query...") time0 = time.perf_counter() - print(" filter results str", filter_results_str) filtered_results = self._run_query(filter_results_str).fetchall() self.logger.debug( f"Time to run query: {time.perf_counter() - time0:.2f} seconds" @@ -2106,6 +2105,40 @@ def fetch_single_pose_properties(self, pose_ID: int): query = f"SELECT Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates FROM Results WHERE Pose_ID={pose_ID}" return self._run_query(query) + def fetch_interaction_info_by_index(self, interaction_idx): + """Returns tuple containing interaction info for given interaction_idx + + Args: + interaction_idx (int): interaction index to fetch info for + + Returns: + tuple: tuple of info for requested interaction + """ + query = "SELECT * FROM Interaction_indices WHERE interaction_id = {0}".format( + interaction_idx + ) + return self._run_query(query).fetchone()[1:] # cut off interaction index + + def fetch_interaction_bitvector(self, pose_id): + """Returns tuple containing interaction bitvector line for given pose_id + + Args: + pose_id (int): pose id 
to fetch interaction bitvector for + + Returns: + tuple: tuple representing interaction bitvector + None: if no interactions in database + """ + # catch if database does not have interactions + table_names = [table[0] for table in self._fetch_existing_table_names()] + if "Interaction_bitvectors" not in table_names: + return None + + query = "SELECT * FROM Interaction_bitvectors WHERE Pose_ID = {0}".format( + pose_id + ) + return self._run_query(query).fetchone()[1:] # cut off pose id + def fetch_pose_interactions(self, Pose_ID): """ Fetch all interactions parameters belonging to a Pose_ID @@ -2662,7 +2695,7 @@ def _generate_result_filtering_query(self, filters_dict): interaction_queries.append( "Pose_ID {include_str} ({interaction_str})".format( include_str=include_str, - interaction_str=self._generate_interaction_bv_filtering_query( + interaction_str=self._generate_interaction_filtering_query( interaction_filter_indices ), ) From 5fc22c1f60d2bc2ce31857d513e139993bc78903 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 20 Sep 2024 17:25:18 -0700 Subject: [PATCH 21/63] updated rdkit method to use bitvector table --- ringtail/storagemanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index cc2ed328..48498abd 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2695,7 +2695,7 @@ def _generate_result_filtering_query(self, filters_dict): interaction_queries.append( "Pose_ID {include_str} ({interaction_str})".format( include_str=include_str, - interaction_str=self._generate_interaction_filtering_query( + interaction_str=self._generate_interaction_bv_filtering_query( interaction_filter_indices ), ) From 438029fe2b7395819b454a6eee5f9446fb1d4b53 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Sun, 22 Sep 2024 18:35:54 -0700 Subject: [PATCH 22/63] added two more indices and renamed the existing ones --- ringtail/storagemanager.py | 35 ++++++++++++++++++++++++++++++++--- 1 
file changed, 32 insertions(+), 3 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 48498abd..41a328f9 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -1667,6 +1667,7 @@ def _insert_cluster_data( self.conn.commit() def _create_indices(self): + # TODO refactor """Create index containing possible filter and order by columns Raises: @@ -1675,11 +1676,16 @@ def _create_indices(self): try: cur = self.conn.cursor() self.logger.debug("Creating columns index...") + cur.execute( - "CREATE INDEX IF NOT EXISTS allind ON Results(LigName, docking_score, leff, deltas, reference_rmsd, energies_inter, energies_vdw, energies_electro, energies_intra, nr_interactions, run_number, pose_rank, num_hb)" + "CREATE INDEX IF NOT EXISTS ak_results ON Results(LigName, docking_score, leff, deltas, reference_rmsd, energies_inter, energies_vdw, energies_electro, energies_intra, nr_interactions, run_number, pose_rank, num_hb)" ) + cur.execute("CREATE INDEX IF NOT EXISTS ak_poseid ON Results(Pose_id)") cur.execute( - "CREATE INDEX IF NOT EXISTS intind ON Interaction_indices(interaction_type, rec_chain, rec_resname, rec_resid, rec_atom, rec_atomid)" + "CREATE INDEX IF NOT EXISTS ak_intind ON Interaction_indices(interaction_type, rec_chain, rec_resname, rec_resid, rec_atom, rec_atomid)" + ) + cur.execute( + "CREATE INDEX IF NOT EXISTS ak_interactions ON Interactions(Pose_id, interaction_id)" ) self.conn.commit() cur.close() @@ -2011,6 +2017,7 @@ def fetch_receptor_objects(self): return cursor.fetchall() def fetch_data_for_passing_results(self) -> iter: + # TODO refactor """Will return SQLite cursor with requested data for outfields for poses that passed filter in self.bookmark_name Returns: @@ -2058,6 +2065,7 @@ def fetch_flexres_info(self): raise DatabaseQueryError("Error retrieving flexible residue info") from e def fetch_passing_ligand_output_info(self): + # TODO refactor """fetch information required by vsmanager for writing out 
molecules Returns: @@ -2068,6 +2076,7 @@ def fetch_passing_ligand_output_info(self): return self._run_query(query) def fetch_single_ligand_output_info(self, ligname): + # TODO refactor """get output information for given ligand Args: @@ -2093,6 +2102,7 @@ def fetch_single_ligand_output_info(self, ligname): ) from e def fetch_single_pose_properties(self, pose_ID: int): + # TODO refactor """fetch coordinates for pose given by pose_ID Args: @@ -2106,6 +2116,7 @@ def fetch_single_pose_properties(self, pose_ID: int): return self._run_query(query) def fetch_interaction_info_by_index(self, interaction_idx): + # TODO refactor """Returns tuple containing interaction info for given interaction_idx Args: @@ -2120,6 +2131,7 @@ def fetch_interaction_info_by_index(self, interaction_idx): return self._run_query(query).fetchone()[1:] # cut off interaction index def fetch_interaction_bitvector(self, pose_id): + # TODO refactor """Returns tuple containing interaction bitvector line for given pose_id Args: @@ -2140,6 +2152,7 @@ def fetch_interaction_bitvector(self, pose_id): return self._run_query(query).fetchone()[1:] # cut off pose id def fetch_pose_interactions(self, Pose_ID): + # TODO refactor """ Fetch all interactions parameters belonging to a Pose_ID @@ -2197,6 +2210,7 @@ def _fetch_all_plot_data(self): ) def _fetch_passing_plot_data(self, bookmark_name: str | None = None): + # TODO refactor """Fetches cursor for best energies and leffs for ligands passing filtering @@ -2217,6 +2231,7 @@ def _fetch_passing_plot_data(self, bookmark_name: str | None = None): ) def _fetch_ligand_cluster_columns(self): + # TODO refactor """fetching columns from Ligand_clusters table Raises: @@ -2274,6 +2289,7 @@ def to_dataframe(self, requested_data: str, table=True) -> pd.DataFrame: return pd.read_sql_query(requested_data, self.conn) def _get_length_of_table(self, table_name: str): + # TODO refactor """ Finds the rowcount/length of a table based on the rowid @@ -2292,6 +2308,7 @@ def 
_get_length_of_table(self, table_name: str): # region Methods dealing with filtered results def _get_number_passing_ligands(self, bookmark_name: str | None = None): + # TODO refactor """Returns count of the number of ligands that passed filtering criteria @@ -2322,6 +2339,7 @@ def _get_number_passing_ligands(self, bookmark_name: str | None = None): ) from e def get_maxmiss_union(self, total_combinations: int): + # TODO refactor """Get results that are in union considering max miss Args: @@ -2358,6 +2376,7 @@ def get_maxmiss_union(self, total_combinations: int): def fetch_summary_data( self, columns=["docking_score", "leff"], percentiles=[1, 10] ) -> dict: + # TODO refactor """Collect summary data for database: Num Ligands Num stored poses @@ -2411,6 +2430,7 @@ def fetch_summary_data( raise StorageError("Error while fetching summary data!") from e def fetch_clustered_similars(self, ligname: str): + # TODO refactor """Given ligname, returns poseids for similar poses/ligands from previous clustering. User prompted at runtime to choose cluster. 
Args: @@ -2471,6 +2491,7 @@ def fetch_clustered_similars(self, ligname: str): return self._run_query(sql_query), self.bookmark_name, cluster_col_choice def fetch_passing_pose_properties(self, ligname): + # TODO refactor """fetch coordinates for poses passing filter for given ligand Args: @@ -2486,6 +2507,7 @@ def fetch_passing_pose_properties(self, ligname): return self._run_query(query) def fetch_nonpassing_pose_properties(self, ligname): + # TODO refactor """fetch coordinates for poses of ligname which did not pass the filter Args: @@ -2501,6 +2523,7 @@ def fetch_nonpassing_pose_properties(self, ligname): return self._run_query(query) def _calc_percentile_cutoff(self, percentile: float, column="docking_score"): + # TODO refactor """Make query for percentile by calculating energy or leff cutoff Args: @@ -2535,6 +2558,7 @@ def _calc_percentile_cutoff(self, percentile: float, column="docking_score"): # region Methods that generate SQLite query strings def _generate_outfield_string(self): + # TODO refactor """string describing outfields to be written Returns: @@ -2555,6 +2579,7 @@ def _generate_outfield_string(self): return ", ".join([self.field_to_column_name[field] for field in outfields_list]) def _generate_result_filtering_query(self, filters_dict): + # TODO refactor """takes lists of filters, writes sql filtering string Args: @@ -2691,7 +2716,6 @@ def _generate_result_filtering_query(self, filters_dict): # this query can be joining interaction_indices on the list of indices, and all the columns matching in interactions table (index that table) # i think this is where things get slow # this method has changed from _generate_interaction_filtering_query to _generate_interaction_bv_filtering_query for testing purposes - interaction_queries.append( "Pose_ID {include_str} ({interaction_str})".format( include_str=include_str, @@ -3030,6 +3054,7 @@ def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: return poseid_bv def 
_generate_interaction_index_filtering_query(self, interaction_list): + # TODO refactor # TODO I think this method can be combined with the next method """takes list of interaction info for a given ligand, looks up corresponding interaction index @@ -3065,6 +3090,7 @@ def _generate_interaction_index_filtering_query(self, interaction_list): return sql_string def _generate_interaction_filtering_query(self, interaction_index_list): + # TODO refactor """takes list of interaction indices and searches for ligand ids which have those interactions @@ -3083,6 +3109,7 @@ def _generate_interaction_filtering_query(self, interaction_index_list): ) def _generate_interaction_bv_filtering_query(self, interaction_index_list): + # TODO take out, remove bv table """takes list of interaction indices and searches for ligand ids which have those interactions @@ -3098,6 +3125,7 @@ def _generate_interaction_bv_filtering_query(self, interaction_index_list): ) def _generate_ligand_filtering_query(self, ligand_filters): + # TODO refactor """write string to select from ligand table Args: @@ -3147,6 +3175,7 @@ def _generate_ligand_filtering_query(self, ligand_filters): def _generate_selective_insert_query( self, bookmark1_name, bookmark2_name, select_str, new_db_name, temp_table ): + # TODO refactor """Generates string to select ligands found/not found in the given bookmark in both current db and new_db Args: From 10cac122936e4c5cc84ef455d1c74d2b088a8a22 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Sun, 22 Sep 2024 19:11:03 -0700 Subject: [PATCH 23/63] annotated what methods need refactoring --- ringtail/ringtailcore.py | 1 + ringtail/storagemanager.py | 173 +++++++++++++++---------------------- 2 files changed, 73 insertions(+), 101 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index 9ed9f73a..c5cc553e 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -164,6 +164,7 @@ def _add_poses( if bit == 1: interaction_indices.append(idx) for int_idx 
in interaction_indices: + # TODO refactor here if I refactor the method in storageman interaction_info = self.storageman.fetch_interaction_info_by_index( int_idx ) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 41a328f9..37cec29f 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -1667,7 +1667,6 @@ def _insert_cluster_data( self.conn.commit() def _create_indices(self): - # TODO refactor """Create index containing possible filter and order by columns Raises: @@ -2017,7 +2016,6 @@ def fetch_receptor_objects(self): return cursor.fetchall() def fetch_data_for_passing_results(self) -> iter: - # TODO refactor """Will return SQLite cursor with requested data for outfields for poses that passed filter in self.bookmark_name Returns: @@ -2043,9 +2041,7 @@ def fetch_data_for_passing_results(self) -> iter: query = ( "SELECT " + outfield_string - + " FROM Results WHERE Pose_ID IN (SELECT Pose_ID FROM {0})".format( - self.bookmark_name - ) + + f" FROM Results WHERE Pose_ID IN (SELECT Pose_ID FROM {self.bookmark_name})" ) return self._run_query(query) @@ -2065,7 +2061,6 @@ def fetch_flexres_info(self): raise DatabaseQueryError("Error retrieving flexible residue info") from e def fetch_passing_ligand_output_info(self): - # TODO refactor """fetch information required by vsmanager for writing out molecules Returns: @@ -2076,7 +2071,6 @@ def fetch_passing_ligand_output_info(self): return self._run_query(query) def fetch_single_ligand_output_info(self, ligname): - # TODO refactor """get output information for given ligand Args: @@ -2102,7 +2096,6 @@ def fetch_single_ligand_output_info(self, ligname): ) from e def fetch_single_pose_properties(self, pose_ID: int): - # TODO refactor """fetch coordinates for pose given by pose_ID Args: @@ -2116,7 +2109,7 @@ def fetch_single_pose_properties(self, pose_ID: int): return self._run_query(query) def fetch_interaction_info_by_index(self, interaction_idx): - # TODO refactor + # TODO refactor-> 
make it work for one or more indices """Returns tuple containing interaction info for given interaction_idx Args: @@ -2131,7 +2124,7 @@ def fetch_interaction_info_by_index(self, interaction_idx): return self._run_query(query).fetchone()[1:] # cut off interaction index def fetch_interaction_bitvector(self, pose_id): - # TODO refactor + # TODO remove """Returns tuple containing interaction bitvector line for given pose_id Args: @@ -2152,7 +2145,6 @@ def fetch_interaction_bitvector(self, pose_id): return self._run_query(query).fetchone()[1:] # cut off pose id def fetch_pose_interactions(self, Pose_ID): - # TODO refactor """ Fetch all interactions parameters belonging to a Pose_ID @@ -2210,7 +2202,6 @@ def _fetch_all_plot_data(self): ) def _fetch_passing_plot_data(self, bookmark_name: str | None = None): - # TODO refactor """Fetches cursor for best energies and leffs for ligands passing filtering @@ -2225,13 +2216,10 @@ def _fetch_passing_plot_data(self, bookmark_name: str | None = None): bookmark_name = self.bookmark_name return self._run_query( - "SELECT docking_score, leff, Pose_ID, LigName FROM Results WHERE LigName IN (SELECT DISTINCT LigName FROM {bookmark}) GROUP BY LigName".format( - bookmark=bookmark_name - ) + f"SELECT docking_score, leff, Pose_ID, LigName FROM Results WHERE LigName IN (SELECT DISTINCT LigName FROM {bookmark_name}) GROUP BY LigName" ) def _fetch_ligand_cluster_columns(self): - # TODO refactor """fetching columns from Ligand_clusters table Raises: @@ -2289,7 +2277,7 @@ def to_dataframe(self, requested_data: str, table=True) -> pd.DataFrame: return pd.read_sql_query(requested_data, self.conn) def _get_length_of_table(self, table_name: str): - # TODO refactor + # TODO check if index on table, and use that row if possible """ Finds the rowcount/length of a table based on the rowid @@ -2303,12 +2291,66 @@ def _get_length_of_table(self, table_name: str): return self._run_query(query).fetchone()[0] + def fetch_summary_data( + self, 
columns=["docking_score", "leff"], percentiles=[1, 10] + ) -> dict: + """Collect summary data for database: + Num Ligands + Num stored poses + Num unique interactions + + min, max, percentiles for columns in columns + + Args: + columns (list (str)): columns to be displayed and used in summary + percentiles (list(int)): percentiles to consider + + Returns: + dict: of data summary + """ + try: + summary_data = {} + cur = self.conn.cursor() + summary_data["num_ligands"] = cur.execute( + "SELECT COUNT(LigName) FROM Ligands" + ).fetchone()[0] + if summary_data["num_ligands"] == 0: + raise StorageError("There is no ligand data in the database. ") + summary_data["num_poses"] = cur.execute( + "SELECT COUNT(Pose_id) FROM Results" + ).fetchone()[0] + summary_data["num_unique_interactions"] = cur.execute( + "SELECT COUNT(interaction_id) FROM Interaction_indices" + ).fetchone()[0] + summary_data["num_interacting_residues"] = cur.execute( + "SELECT COUNT(*) FROM (SELECT interaction_id FROM Interaction_indices GROUP BY interaction_type,rec_resid,rec_chain)" + ).fetchone()[0] + + allowed_columns = self._fetch_results_column_names() + for col in columns: + if col not in allowed_columns: + raise StorageError( + f"Requested summary column {col} not found in Results table! 
Available columns: {allowed_columns}" + ) + summary_data[f"min_{col}"] = cur.execute( + f"SELECT MIN({col}) FROM Results" + ).fetchone()[0] + summary_data[f"max_{col}"] = cur.execute( + f"SELECT MAX({col}) FROM Results" + ).fetchone()[0] + for p in percentiles: + summary_data[f"{p}%_{col}"] = self._calc_percentile_cutoff(p, col) + + return summary_data + + except sqlite3.OperationalError as e: + raise StorageError("Error while fetching summary data!") from e + # endregion # region Methods dealing with filtered results def _get_number_passing_ligands(self, bookmark_name: str | None = None): - # TODO refactor """Returns count of the number of ligands that passed filtering criteria @@ -2325,11 +2367,7 @@ def _get_number_passing_ligands(self, bookmark_name: str | None = None): bookmark_name = self.current_bookmark_name try: cur = self.conn.cursor() - cur.execute( - "SELECT COUNT(DISTINCT LigName) FROM {results_view}".format( - results_view=bookmark_name - ) - ) + cur.execute(f"SELECT COUNT(DISTINCT LigName) FROM {bookmark_name}") n_ligands = int(cur.fetchone()[0]) cur.close() return n_ligands @@ -2339,7 +2377,8 @@ def _get_number_passing_ligands(self, bookmark_name: str | None = None): ) from e def get_maxmiss_union(self, total_combinations: int): - # TODO refactor + # TODO probably remove as union can happen automatically. 
Then if enumerating_interaction_combinations, + # just create the other bookmarks separately for each interaction combination through the method in the core """Get results that are in union considering max miss Args: @@ -2373,64 +2412,7 @@ def get_maxmiss_union(self, total_combinations: int): self.logger.debug("Running union query...") return self._run_query(union_select_query) - def fetch_summary_data( - self, columns=["docking_score", "leff"], percentiles=[1, 10] - ) -> dict: - # TODO refactor - """Collect summary data for database: - Num Ligands - Num stored poses - Num unique interactions - - min, max, percentiles for columns in columns - - Args: - columns (list (str)): columns to be displayed and used in summary - percentiles (list(int)): percentiles to consider - - Returns: - dict: of data summary - """ - try: - summary_data = {} - cur = self.conn.cursor() - summary_data["num_ligands"] = cur.execute( - "SELECT COUNT(*) FROM Ligands" - ).fetchone()[0] - if summary_data["num_ligands"] == 0: - raise StorageError("There is no ligand data in the database. ") - summary_data["num_poses"] = cur.execute( - "SELECT COUNT(*) FROM Results" - ).fetchone()[0] - summary_data["num_unique_interactions"] = cur.execute( - "SELECT COUNT(*) FROM Interaction_indices" - ).fetchone()[0] - summary_data["num_interacting_residues"] = cur.execute( - "SELECT COUNT(*) FROM (SELECT interaction_id FROM Interaction_indices GROUP BY interaction_type,rec_resid,rec_chain)" - ).fetchone()[0] - - allowed_columns = self._fetch_results_column_names() - for col in columns: - if col not in allowed_columns: - raise StorageError( - f"Requested summary column {col} not found in Results table! 
Available columns: {allowed_columns}" - ) - summary_data[f"min_{col}"] = cur.execute( - f"SELECT MIN({col}) FROM Results" - ).fetchone()[0] - summary_data[f"max_{col}"] = cur.execute( - f"SELECT MAX({col}) FROM Results" - ).fetchone()[0] - for p in percentiles: - summary_data[f"{p}%_{col}"] = self._calc_percentile_cutoff(p, col) - - return summary_data - - except sqlite3.OperationalError as e: - raise StorageError("Error while fetching summary data!") from e - def fetch_clustered_similars(self, ligname: str): - # TODO refactor """Given ligname, returns poseids for similar poses/ligands from previous clustering. User prompted at runtime to choose cluster. Args: @@ -2473,7 +2455,7 @@ def fetch_clustered_similars(self, ligname: str): raise ValueError( f"Given cluster number {cluster_choice} cannot be converted to int. Please be sure you are specifying integer." ) - + # TODO might be able to refactor these queries query_ligand_cluster = cur.execute( f"SELECT {cluster_col_choice} FROM Ligand_clusters WHERE pose_id IN (SELECT Pose_ID FROM Results WHERE LigName LIKE '{ligname}')" ).fetchone() @@ -2491,7 +2473,6 @@ def fetch_clustered_similars(self, ligname: str): return self._run_query(sql_query), self.bookmark_name, cluster_col_choice def fetch_passing_pose_properties(self, ligname): - # TODO refactor """fetch coordinates for poses passing filter for given ligand Args: @@ -2501,13 +2482,10 @@ def fetch_passing_pose_properties(self, ligname): iter: SQLite cursor that contains Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates, flexible_residues """ - query = "SELECT Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates FROM Results WHERE Pose_ID IN (SELECT Pose_ID FROM passing_temp WHERE LigName LIKE '{ligand}')".format( - ligand=ligname - ) + query = f"SELECT Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates FROM Results WHERE Pose_ID IN (SELECT Pose_ID FROM passing_temp WHERE LigName LIKE 
'{ligname}')" return self._run_query(query) def fetch_nonpassing_pose_properties(self, ligname): - # TODO refactor """fetch coordinates for poses of ligname which did not pass the filter Args: @@ -2517,13 +2495,10 @@ def fetch_nonpassing_pose_properties(self, ligname): iter: SQLite cursor that contains Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates, flexible_residues """ - query = "SELECT Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates FROM Results WHERE LigName LIKE '{ligand}' AND Pose_ID NOT IN (SELECT Pose_ID FROM passing_temp)".format( - ligand=ligname, - ) + query = f"SELECT Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates FROM Results WHERE LigName LIKE '{ligname}' AND Pose_ID NOT IN (SELECT Pose_ID FROM passing_temp)" return self._run_query(query) def _calc_percentile_cutoff(self, percentile: float, column="docking_score"): - # TODO refactor """Make query for percentile by calculating energy or leff cutoff Args: @@ -2558,7 +2533,7 @@ def _calc_percentile_cutoff(self, percentile: float, column="docking_score"): # region Methods that generate SQLite query strings def _generate_outfield_string(self): - # TODO refactor + # TODO this will probably need refactoring """string describing outfields to be written Returns: @@ -2579,7 +2554,7 @@ def _generate_outfield_string(self): return ", ".join([self.field_to_column_name[field] for field in outfields_list]) def _generate_result_filtering_query(self, filters_dict): - # TODO refactor + # TODO THE biggest one """takes lists of filters, writes sql filtering string Args: @@ -3054,7 +3029,6 @@ def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: return poseid_bv def _generate_interaction_index_filtering_query(self, interaction_list): - # TODO refactor # TODO I think this method can be combined with the next method """takes list of interaction info for a given ligand, looks up corresponding interaction index @@ -3090,7 +3064,7 @@ 
def _generate_interaction_index_filtering_query(self, interaction_list): return sql_string def _generate_interaction_filtering_query(self, interaction_index_list): - # TODO refactor + # TODO refactor -> THIS IS ONE OF THE MAJOR ONES """takes list of interaction indices and searches for ligand ids which have those interactions @@ -3109,7 +3083,7 @@ def _generate_interaction_filtering_query(self, interaction_index_list): ) def _generate_interaction_bv_filtering_query(self, interaction_index_list): - # TODO take out, remove bv table + # TODO remove """takes list of interaction indices and searches for ligand ids which have those interactions @@ -3125,7 +3099,7 @@ def _generate_interaction_bv_filtering_query(self, interaction_index_list): ) def _generate_ligand_filtering_query(self, ligand_filters): - # TODO refactor + # TODO this one is important and might be tricky, use sqlitestudio """write string to select from ligand table Args: @@ -3175,7 +3149,6 @@ def _generate_ligand_filtering_query(self, ligand_filters): def _generate_selective_insert_query( self, bookmark1_name, bookmark2_name, select_str, new_db_name, temp_table ): - # TODO refactor """Generates string to select ligands found/not found in the given bookmark in both current db and new_db Args: @@ -3188,9 +3161,7 @@ def _generate_selective_insert_query( Returns: str: sqlite formatted query string """ - return "INSERT INTO {0} SELECT Pose_ID, LigName FROM {1} WHERE LigName {2} (SELECT LigName FROM {3}.{4})".format( - temp_table, bookmark1_name, select_str, new_db_name, bookmark2_name - ) + return f"INSERT INTO {temp_table} SELECT Pose_ID, LigName FROM {bookmark1_name} WHERE LigName {select_str} (SELECT LigName FROM {new_db_name}.{bookmark2_name})" # endregion From 846edc1bf734eda76c4e86b76c2976d00558fce4 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 23 Sep 2024 09:52:10 -0700 Subject: [PATCH 24/63] removed variables for indexing columns on the fly, as it is not used anymore --- ringtail/storagemanager.py 
| 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 37cec29f..340e276c 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -508,8 +508,7 @@ def __init__( } self.view_suffix = None self.temptable_suffix = 0 - self.filtering_window = "Results" - self.index_columns = [] + self.filtering_window = "Results" # TODO is this necessary? self.open_cursors = [] # region Methods for inserting into/removing from the database @@ -2605,7 +2604,6 @@ def _generate_result_filtering_query(self, filters_dict): continue # if filter has to do with docking energies if filter_key in energy_filter_col_name: - self.index_columns.append(energy_filter_col_name[filter_key]) if filter_key == "score_percentile" or filter_key == "le_percentile": # convert from percent to decimal cutoff = self._calc_percentile_cutoff( @@ -2625,7 +2623,6 @@ def _generate_result_filtering_query(self, filters_dict): # NOTE here if implementing other interaction count filters if k != "hb_count": continue - self.index_columns.append("num_hb") if v > 0: queries.append("num_hb > {value}".format(value=v)) else: From cec03f61372813f350cb9fc664a5d374062d5f94 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 23 Sep 2024 09:55:45 -0700 Subject: [PATCH 25/63] made filtering_window local variable to only method using it --- ringtail/storagemanager.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 340e276c..22910315 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -508,7 +508,6 @@ def __init__( } self.view_suffix = None self.temptable_suffix = 0 - self.filtering_window = "Results" # TODO is this necessary? 
self.open_cursors = [] # region Methods for inserting into/removing from the database @@ -2562,7 +2561,7 @@ def _generate_result_filtering_query(self, filters_dict): Returns: str: SQLite-formatted string for filtering query """ - + filtering_window = "Results" energy_filter_col_name = { "eworst": "docking_score", "ebest": "docking_score", @@ -2592,11 +2591,11 @@ def _generate_result_filtering_query(self, filters_dict): "Cannot use 'score_percentile' or 'le_percentile' with 'filter_bookmark'." ) # filtering window can be specified bookmark, or whole database (or other reduced versions of db) - self.filtering_window = self.filter_bookmark + filtering_window = self.filter_bookmark # write energy filters and compile list of interactions to search for queries = [] - interaction_filters = [] + interaction_filters = [] # TODO only define when needed for filter_key, filter_value in filters_dict.items(): # filter dict contains all possible filters, are None of not specified by user @@ -2810,30 +2809,30 @@ def _generate_result_filtering_query(self, filters_dict): ) sql_string = output_str = ( """SELECT {out_columns} FROM {window} WHERE """.format( - out_columns=outfield_string, window=self.filtering_window + out_columns=outfield_string, window=filtering_window ) ) if interaction_queries == [] and queries != []: joined_queries = " AND ".join(queries) sql_string = sql_string + joined_queries unclustered_query = ( - f"SELECT Pose_id FROM {self.filtering_window} WHERE " + joined_queries + f"SELECT Pose_id FROM {filtering_window} WHERE " + joined_queries ) elif queries == [] and interaction_queries == [] and clustering: # allows for clustering without filtering - unclustered_query = f"SELECT Pose_id FROM {self.filtering_window}" + unclustered_query = f"SELECT Pose_id FROM {filtering_window}" self.logger.info("Preparing to cluster results") self.logger.warning( "If clustering is not performed on a pre-filtered bookmark, thhe clustering process will be very slow." 
) else: - with_stmt = f"WITH subq as (SELECT Pose_id FROM {self.filtering_window}) " + with_stmt = f"WITH subq as (SELECT Pose_id FROM {filtering_window}) " if queries != []: with_stmt = with_stmt[:-2] + f" WHERE {' AND '.join(queries)}) " joined_interact_queries = " AND ".join(interaction_queries) sql_string = with_stmt + sql_string + joined_interact_queries unclustered_query = ( - f"SELECT Pose_id FROM {self.filtering_window} WHERE " + f"SELECT Pose_id FROM {filtering_window} WHERE " + joined_interact_queries ) @@ -2997,9 +2996,9 @@ def mp_wrapper(input_tpl): return sql_string, sql_string.replace( """SELECT {out_columns} FROM {window}""".format( - out_columns=outfield_string, window=self.filtering_window + out_columns=outfield_string, window=filtering_window ), - f"SELECT * FROM {self.filtering_window}", + f"SELECT * FROM {filtering_window}", ) # sql_query, view_query def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: From 68707647259a2489e357089789b4b4c7ebb50081 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 23 Sep 2024 10:01:51 -0700 Subject: [PATCH 26/63] writes logger warning if an unrecognized interaction count filter is used --- ringtail/storagemanager.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 22910315..1faa55b6 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2597,8 +2597,10 @@ def _generate_result_filtering_query(self, filters_dict): queries = [] interaction_filters = [] # TODO only define when needed + # analyze and organize filters + # TODO make this a separate method for filter_key, filter_value in filters_dict.items(): - # filter dict contains all possible filters, are None of not specified by user + # filter dict contains all possible filters, are None if not specified by user if filter_value is None: continue # if filter has to do with docking energies @@ -2619,8 +2621,10 @@ def 
_generate_result_filtering_query(self, filters_dict): # write hb count filter(s) if filter_key == "hb_count": for k, v in filter_value: - # NOTE here if implementing other interaction count filters if k != "hb_count": + self.logger.warning( + f"An unrecognized interaction count filter was found: {k}, which will not be included in the filtering." + ) continue if v > 0: queries.append("num_hb > {value}".format(value=v)) From 54688d50ba424f86c6cdd77fae28d510dab46bf5 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 23 Sep 2024 10:21:40 -0700 Subject: [PATCH 27/63] made separate method for formatting filters for db query use --- ringtail/storagemanager.py | 84 ++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 1faa55b6..0b0d6d68 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2551,17 +2551,20 @@ def _generate_outfield_string(self): ) return ", ".join([self.field_to_column_name[field] for field in outfields_list]) - def _generate_result_filtering_query(self, filters_dict): - # TODO THE biggest one - """takes lists of filters, writes sql filtering string + def _format_filters_for_query(self, filters_dict: dict): + """ + Method that reformats the filters to the specified database columns, handles less than/more than filters, etc Args: - filters_dict (dict): dict of filters. 
Keys names and value formats must match those found in the Filters class + filters_dict (dict): all Ringtail filters, okay to contain None Returns: - str: SQLite-formatted string for filtering query + list: list of numerical filters formatted to be inserted in a query + list: list of interaction filters formatted to be inserted in a query """ - filtering_window = "Results" + # write energy filters and compile list of interactions to search for + queries = [] + interaction_filters = [] energy_filter_col_name = { "eworst": "docking_score", "ebest": "docking_score", @@ -2570,35 +2573,6 @@ def _generate_result_filtering_query(self, filters_dict): "score_percentile": "docking_score", "le_percentile": "leff", } - outfield_string = self._generate_outfield_string() - - # if filtering over a bookmark (i.e., already filtered results) as opposed to a whole database - if self.filter_bookmark is not None: - if self.filter_bookmark == self.bookmark_name: - # cannot write data from bookmark_a to bookmark_a - self.logger.error( - f"Specified 'filter_bookmark' and 'bookmark_name' are the same: {self.bookmark_name}" - ) - raise OptionError( - "'filter_bookmark' and 'bookmark_name' cannot be the same! Please rename 'bookmark_name'" - ) - # cannot use percentile for an already reduced dataset - if ( - filters_dict["score_percentile"] is not None - or filters_dict["le_percentile"] is not None - ): - raise OptionError( - "Cannot use 'score_percentile' or 'le_percentile' with 'filter_bookmark'." 
- ) - # filtering window can be specified bookmark, or whole database (or other reduced versions of db) - filtering_window = self.filter_bookmark - - # write energy filters and compile list of interactions to search for - queries = [] - interaction_filters = [] # TODO only define when needed - - # analyze and organize filters - # TODO make this a separate method for filter_key, filter_value in filters_dict.items(): # filter dict contains all possible filters, are None if not specified by user if filter_value is None: @@ -2629,11 +2603,13 @@ def _generate_result_filtering_query(self, filters_dict): if v > 0: queries.append("num_hb > {value}".format(value=v)) else: + # if value is negative, it means less than specified number of hydrogen bonds queries.append("num_hb <= {value}".format(value=-1 * v)) # reformat interaction filters as list if filter_key in Filters.get_filter_keys("interaction"): for interact in filter_value: + # interact has format ["chain:res:resno:resatom", bool(include or exclude interaction)] interaction_string = filter_key + ":" + interact[0] # add bool flag for included (T) or excluded (F) interaction interaction_filters.append( @@ -2646,6 +2622,44 @@ def _generate_result_filtering_query(self, filters_dict): interaction_filters.append( ["reactive_interactions", "", "", "", "", True] ) + return queries, interaction_filters + + def _generate_result_filtering_query(self, filters_dict): + # TODO THE biggest one + """takes lists of filters, writes sql filtering string + + Args: + filters_dict (dict): dict of filters. 
Keys names and value formats must match those found in the Filters class + + Returns: + str: SQLite-formatted string for filtering query + """ + filtering_window = "Results" + + outfield_string = self._generate_outfield_string() + + # if filtering over a bookmark (i.e., already filtered results) as opposed to a whole database + if self.filter_bookmark is not None: + if self.filter_bookmark == self.bookmark_name: + # cannot write data from bookmark_a to bookmark_a + self.logger.error( + f"Specified 'filter_bookmark' and 'bookmark_name' are the same: {self.bookmark_name}" + ) + raise OptionError( + "'filter_bookmark' and 'bookmark_name' cannot be the same! Please rename 'bookmark_name'" + ) + # cannot use percentile for an already reduced dataset + if ( + filters_dict["score_percentile"] is not None + or filters_dict["le_percentile"] is not None + ): + raise OptionError( + "Cannot use 'score_percentile' or 'le_percentile' with 'filter_bookmark'." + ) + # filtering window can be specified bookmark, or whole database (or other reduced versions of db) + filtering_window = self.filter_bookmark + + queries, interaction_filters = self._format_filters_for_query(filters_dict) # for each interaction filter, get the index from the interactions_indices table interaction_name_to_letter = { From 857a976b09d6f5ea81c6fb47038d62b52f84645e Mon Sep 17 00:00:00 2001 From: maylinnp Date: Tue, 24 Sep 2024 10:03:03 -0700 Subject: [PATCH 28/63] started major rewrite of the filtering query for interactions and energy filters only --- ringtail/storagemanager.py | 522 ++++++++++++++++++++++--------------- 1 file changed, 306 insertions(+), 216 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 0b0d6d68..2854bbd2 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2527,11 +2527,7 @@ def _calc_percentile_cutoff(self, percentile: float, column="docking_score"): except sqlite3.OperationalError as e: raise StorageError("Error while 
generating percentile query") from e - # endregion - - # region Methods that generate SQLite query strings def _generate_outfield_string(self): - # TODO this will probably need refactoring """string describing outfields to be written Returns: @@ -2549,9 +2545,9 @@ def _generate_outfield_string(self): out_f=outfield ) ) - return ", ".join([self.field_to_column_name[field] for field in outfields_list]) + return [self.field_to_column_name[field] for field in outfields_list] - def _format_filters_for_query(self, filters_dict: dict): + def _process_filters_for_query(self, filters_dict: dict): """ Method that reformats the filters to the specified database columns, handles less than/more than filters, etc @@ -2563,7 +2559,7 @@ def _format_filters_for_query(self, filters_dict: dict): list: list of interaction filters formatted to be inserted in a query """ # write energy filters and compile list of interactions to search for - queries = [] + numerical_filters = [] interaction_filters = [] energy_filter_col_name = { "eworst": "docking_score", @@ -2584,9 +2580,11 @@ def _format_filters_for_query(self, filters_dict: dict): cutoff = self._calc_percentile_cutoff( filter_value, energy_filter_col_name[filter_key] ) - queries.append(f"{energy_filter_col_name[filter_key]} < {cutoff}") + numerical_filters.append( + f"{energy_filter_col_name[filter_key]} < {cutoff}" + ) else: - queries.append( + numerical_filters.append( self.energy_filter_sqlite_call_dict[filter_key].format( value=filter_value ) @@ -2601,28 +2599,41 @@ def _format_filters_for_query(self, filters_dict: dict): ) continue if v > 0: - queries.append("num_hb > {value}".format(value=v)) + numerical_filters.append("num_hb > {value}".format(value=v)) else: # if value is negative, it means less than specified number of hydrogen bonds - queries.append("num_hb <= {value}".format(value=-1 * v)) - + numerical_filters.append( + "num_hb <= {value}".format(value=-1 * v) + ) + interaction_name_to_letter = { + "vdw_interactions": 
"V", + "hb_interactions": "H", + "reactive_interactions": "R", + } # reformat interaction filters as list if filter_key in Filters.get_filter_keys("interaction"): for interact in filter_value: # interact has format ["chain:res:resno:resatom", bool(include or exclude interaction)] - interaction_string = filter_key + ":" + interact[0] + interaction_string = ( + interaction_name_to_letter[filter_key] + ":" + interact[0] + ) # add bool flag for included (T) or excluded (F) interaction interaction_filters.append( interaction_string.split(":") + [interact[1]] ) - # add react_any flag as interaction filter - # check if react_any is true + # add react_any flag as interaction filter if not None if filter_key == "react_any" and filter_value: - interaction_filters.append( - ["reactive_interactions", "", "", "", "", True] - ) - return queries, interaction_filters + interaction_filters.append(["R", "", "", "", "", True]) + + # make dict of just the ligand filters + # TODO inefficient but same as it was before + ligand_filters_dict = { + k: v + for k, v in filters_dict.items() + if k in Filters.get_filter_keys("ligand") + } + return numerical_filters, interaction_filters, ligand_filters_dict def _generate_result_filtering_query(self, filters_dict): # TODO THE biggest one @@ -2636,7 +2647,7 @@ def _generate_result_filtering_query(self, filters_dict): """ filtering_window = "Results" - outfield_string = self._generate_outfield_string() + outfield_columns = self._generate_outfield_string() # if filtering over a bookmark (i.e., already filtered results) as opposed to a whole database if self.filter_bookmark is not None: @@ -2659,207 +2670,214 @@ def _generate_result_filtering_query(self, filters_dict): # filtering window can be specified bookmark, or whole database (or other reduced versions of db) filtering_window = self.filter_bookmark - queries, interaction_filters = self._format_filters_for_query(filters_dict) - - # for each interaction filter, get the index from the 
interactions_indices table - interaction_name_to_letter = { - "vdw_interactions": "V", - "hb_interactions": "H", - "reactive_interactions": "R", - } - interaction_queries = [] - for interaction in interaction_filters: - interaction = [interaction_name_to_letter[interaction[0]]] + interaction[1:] - interaction_filter_indices = [] - interact_index_str = self._generate_interaction_index_filtering_query( - interaction[:-1] - ) # remove bool include/exclude flag - interaction_indices = self._run_query(interact_index_str) - for i in interaction_indices: - # create a list of interaction indices - interaction_filter_indices.append(i[0]) - - # catch if interaction not found in database - if interaction_filter_indices == []: - if interaction == ["R", "", "", "", "", True]: - self.logger.warning( - "Given 'react_any' filter, no reactive interactions found. Excluded from filtering." - ) - else: - raise OptionError( - "Interaction {i} not found in the database, please check for spelling errors or remove from filter.".format( - i=":".join(interaction[:4]) - ) - ) - continue - # determine include/exclude string - if interaction[-1] is True: - include_str = "IN" - elif interaction[-1] is False: - include_str = "NOT IN" - else: - raise RuntimeError( - "Unrecognized flag in interaction. Please contact Forli Lab with traceback and context." 
- ) - # find pose ids for ligands with desired interactions - # this query can be joining interaction_indices on the list of indices, and all the columns matching in interactions table (index that table) - # i think this is where things get slow - # this method has changed from _generate_interaction_filtering_query to _generate_interaction_bv_filtering_query for testing purposes - interaction_queries.append( - "Pose_ID {include_str} ({interaction_str})".format( - include_str=include_str, - interaction_str=self._generate_interaction_bv_filtering_query( - interaction_filter_indices - ), - ) - ) - - # make dict of filters related to ligands - ligand_filters_dict = { - k: v - for k, v in filters_dict.items() - if k in Filters.get_filter_keys("ligand") - } - # if ligand_substruct or ligand_name have values in filters - if filters_dict["ligand_substruct"] != [] or filters_dict["ligand_name"] != []: - ligand_query_str = self._generate_ligand_filtering_query( - ligand_filters_dict - ) - queries.append( - "LigName IN ({ligand_str})".format(ligand_str=ligand_query_str) - ) - # if ligand_substruct_pos has the correct number of arguments provided - if len(ligand_filters_dict["ligand_substruct_pos"]): - nr_args_per_group = 6 - nr_smarts = int( - len(ligand_filters_dict["ligand_substruct_pos"]) / nr_args_per_group - ) - # create temporary table with molecules that pass all smiles - tmp_lig_filters = { - "ligand_operator": ligand_filters_dict["ligand_operator"] - } - if "ligand_max_atoms" in ligand_filters_dict: - tmp_lig_filters["ligand_max_atoms"] = ligand_filters_dict[ - "ligand_max_atoms" - ] - tmp_lig_filters["ligand_substruct"] = [ - ligand_filters_dict["ligand_substruct_pos"][i * nr_args_per_group] - for i in range(nr_smarts) - ] - cmd = self._generate_ligand_filtering_query(tmp_lig_filters) - cmd = cmd.replace( - "SELECT LigName FROM Ligands", - "SELECT " - "Results.Pose_ID, " - "Ligands.LigName, " - "Ligands.ligand_smile, " - "Ligands.atom_index_map, " - 
"Results.ligand_coordinates " - "FROM Ligands INNER JOIN Results ON Results.LigName = Ligands.LigName", - ) - cmd = "CREATE TEMP TABLE passed_smarts AS " + cmd - cur = self.conn.cursor() - cur.execute("DROP TABLE IF EXISTS passed_smarts") - cur.execute(cmd) - smarts_loc_filters = [] - for i in range(nr_smarts): - smarts = ligand_filters_dict["ligand_substruct_pos"][ - i * nr_args_per_group + 0 - ] - index = int( - ligand_filters_dict["ligand_substruct_pos"][ - i * nr_args_per_group + 1 - ] + queries, interaction_filters, ligand_filters_dict = ( + self._process_filters_for_query(filters_dict) + ) + num_of_interactions = len(interaction_filters) + interaction_queries = False + exclude_interactions = [] + include_interactions = [] + if interaction_filters != []: + int_not_found = [] + # figure out if each interaction is in database, make a list of list of indices for each interaction + for interaction in interaction_filters: + # get all interaction indices matching the interaction filter (returns more than one index if filter has a "wildcard") + interaction_index_tuples = ( + self._generate_interaction_index_filtering_query(interaction[:-1]) ) - sqdist = ( - float( - ligand_filters_dict["ligand_substruct_pos"][ - i * nr_args_per_group + 2 - ] + # make list of indices from iterable cursor tuples (should create empty list of no results) + interaction_indices = [i[0] for i in interaction_index_tuples] + # catch if interaction not found in database + if interaction_indices == []: + if interaction == ["R", "", "", "", "", True]: + self.logger.warning( + "Given 'react_any' filter, no reactive interactions found. Excluded from filtering." 
+ ) + else: + # create string representation of ecah interaction not found + int_not_found.append(":".join(interaction[:4])) + continue # ends this iteration of the for loop + + # create a list of lists for interactions to either include or exclude + if interaction[-1] is True: + include_interactions.append(interaction_indices) + elif interaction[-1] is False: + exclude_interactions.append(interaction_indices) + else: + raise RuntimeError( + "Unrecognized flag in interaction. Please contact Forli Lab with traceback and context." ) - ** 2 + # if one or more interactions not found, raise error + if int_not_found: + raise OptionError( + f"The following interactions do not exist in the database: {int_not_found} not found in the database. Please check for spelling errors or remove from filter." ) - x = float( - ligand_filters_dict["ligand_substruct_pos"][ - i * nr_args_per_group + 3 - ] + # set to True if any interaction filters + interaction_queries = bool(exclude_interactions or include_interactions) + + # check if ligand filters have values + ligand_filters = bool( + ligand_filters_dict["ligand_substruct"] + or ligand_filters_dict["ligand_name"] + or ligand_filters_dict["ligand_substruct_pos"] + or ligand_filters_dict["ligand_max_atoms"] + ) + if ligand_filters: + if ( + ligand_filters_dict["ligand_substruct"] != [] + or ligand_filters_dict["ligand_name"] != [] + ): + # TODO definitely need to clean this up + ligand_query_str = self._generate_ligand_filtering_query( + ligand_filters_dict ) - y = float( - ligand_filters_dict["ligand_substruct_pos"][ - i * nr_args_per_group + 4 - ] + queries.append( + "LigName IN ({ligand_str})".format(ligand_str=ligand_query_str) ) - z = float( - ligand_filters_dict["ligand_substruct_pos"][ - i * nr_args_per_group + 5 - ] + # if ligand_substruct_pos has the correct number of arguments provided + if len(ligand_filters_dict["ligand_substruct_pos"]): + ligand_substruct_pos_filters = ( + 
self._ligand_substructure_position_filter(ligand_filters_dict) ) - # save filter for bookmark - smarts_loc_filters.append((smarts, index, x, y, z)) - poses = self._run_query("SELECT * FROM passed_smarts") - pose_id_list = [] - smartsmol = Chem.MolFromSmarts(smarts) - for pose_id, ligname, smiles, idxmap, coords in poses: - mol = Chem.MolFromSmiles(smiles) - idxmap = [int(value) - 1 for value in json.loads(idxmap)] - idxmap = { - idxmap[j * 2]: idxmap[j * 2 + 1] - for j in range(int(len(idxmap) / 2)) - } - for hit in mol.GetSubstructMatches(smartsmol): - xyz = [ - float(value) - for value in json.loads(coords)[idxmap[hit[index]]] - ] - d2 = (xyz[0] - x) ** 2 + (xyz[1] - y) ** 2 + (xyz[2] - z) ** 2 - if d2 <= sqdist: - pose_id_list.append(str(pose_id)) - break # add pose only once - queries.append("Pose_ID IN ({0})".format(",".join(pose_id_list))) - cur.close() + queries.append(ligand_substruct_pos_filters) - # format query string + # check if clustering clustering = bool(self.mfpt_cluster or self.interaction_cluster) - # raise error if query string is empty - if queries == [] and interaction_queries == [] and not clustering: + + # raise error if no filters are present + if not queries and not interaction_queries and not clustering: raise DatabaseQueryError( "Query strings are empty. Please check filter options and ensure requested interactions are present." ) + # starts to prepare the overarching filtering query + # TODO sql_string = output_str = ( - """SELECT {out_columns} FROM {window} WHERE """.format( - out_columns=outfield_string, window=filtering_window - ) + f"""SELECT {", ".join("R." + column for column in outfield_columns)} FROM {filtering_window} R""" ) - if interaction_queries == [] and queries != []: - joined_queries = " AND ".join(queries) - sql_string = sql_string + joined_queries + start_of_query = f"""SELECT DISTINCT {", ".join("R." 
+ column for column in outfield_columns)} FROM {filtering_window} R """ + # if just num/ligand queries, build query + if queries and not interaction_queries: + joined_queries = " AND ".join( + queries + ) # TODO this might be the one I need to join in after the interaction queries + new_query = start_of_query + " WHERE " + joined_queries + sql_string += "WHERE " + joined_queries unclustered_query = ( f"SELECT Pose_id FROM {filtering_window} WHERE " + joined_queries ) - elif queries == [] and interaction_queries == [] and clustering: + # if clustering only + elif clustering and not queries and not interaction_queries: # allows for clustering without filtering unclustered_query = f"SELECT Pose_id FROM {filtering_window}" self.logger.info("Preparing to cluster results") - self.logger.warning( - "If clustering is not performed on a pre-filtered bookmark, thhe clustering process will be very slow." - ) + # if filtering window is Results the clustering happens on the entire database + if filtering_window == "Results": + self.logger.warning( + "If clustering is not performed on a pre-filtered bookmark, the clustering process will be very slow." + ) + # includes interactions else: - with_stmt = f"WITH subq as (SELECT Pose_id FROM {filtering_window}) " - if queries != []: - with_stmt = with_stmt[:-2] + f" WHERE {' AND '.join(queries)}) " - joined_interact_queries = " AND ".join(interaction_queries) - sql_string = with_stmt + sql_string + joined_interact_queries - unclustered_query = ( - f"SELECT Pose_id FROM {filtering_window} WHERE " - + joined_interact_queries + joined_queries = " AND ".join("R." 
+ query for query in queries) + # TODO building my new query for testing + # add the join and select part + new_query = ( + start_of_query + "JOIN (SELECT Pose_id FROM (SELECT Pose_ID, CASE" ) - + # then iteratively build the WHEN for each OR interaction and iterate through the nonsensical index + index_for_wildcard_interactions = -10000 + or_part_of_query = "" + # if interaction id part of or statement, give nonsensical index for counting purposes + or_included_interactions = [ + indices for indices in include_interactions if len(indices) > 1 + ] + # # perform for included interactions + for indices in or_included_interactions: + index_for_wildcard_interactions += 1 + or_part_of_query += ( + " WHEN interaction_id IN " + + str(tuple(indices)) + + " THEN " + + str(index_for_wildcard_interactions) + ) + # perform for excluded interactions + or_excluded_interactions = [ + indices for indices in exclude_interactions if len(indices) > 1 + ] + for indices in or_excluded_interactions: + index_for_wildcard_interactions += 1 + or_part_of_query += ( + " WHEN interaction_id NOT IN " + + str(tuple(indices)) + + " THEN " + + str(index_for_wildcard_interactions) + ) + # finalize the CASE statement by counting other interactions normally + or_part_of_query += ( + " ELSE interaction_id END AS filtered_interactions FROM Interactions " + ) + # then prepare indices that are to be included, both from OR and AND statement + and_included_interactions = [] + for index in [ + indices for indices in include_interactions if len(indices) <= 1 + ]: + # ensure no duplicates + if index[0] not in and_included_interactions: + and_included_interactions.append(index[0]) + print(" AND included:", index[0]) + + # add the included OR indices + for index_list in or_included_interactions: + for index in index_list: + if index not in and_included_interactions: + and_included_interactions.append(index) + print(" OR included:", index) + + and_excluded_interactions = [] + for index in [ + indices for 
indices in exclude_interactions if len(indices) <= 1 + ]: + # ensure no duplicates + if index[0] not in and_excluded_interactions: + and_excluded_interactions.append(index[0]) + print(" AND excluded:", index[0]) + + # add the excluded OR indices + for index_list in or_excluded_interactions: + for index in index_list: + if index not in and_excluded_interactions: + and_excluded_interactions.append(index) + print(" OR excluded:", index) + + include_all = ( + "WHERE interaction_id IN (" + + ",".join([str(x) for x in and_included_interactions]) + + ")" + ) + print(" include_all", include_all) + # TODO something flashy to get whether or not both statements are included + exclude_all = ( + " AND interaction_id NOT IN (" + + ",".join([str(x) for x in and_excluded_interactions]) + + ")" + ) + print(" exclude_all", exclude_all) + num_of_interactions -= 1 + new_query += ( + or_part_of_query + + include_all + + exclude_all + + f") GROUP BY Pose_id HAVING COUNT (DISTINCT filtered_interactions) = {num_of_interactions}) I " + ) + new_query += "ON R.pose_id = I.pose_id WHERE " + joined_queries + print(new_query) + cur = self._run_query(new_query) + print(" This should be 19 hits:", (cur.fetchall())) # adding if we only want to keep one pose per ligand (will keep first entry) if not self.output_all_poses: sql_string += " GROUP BY LigName" # add how to order results - if self.order_results is not None: + if self.order_results: try: sql_string += ( " ORDER BY " + self.field_to_column_name[self.order_results] @@ -3019,7 +3037,96 @@ def mp_wrapper(input_tpl): f"SELECT * FROM {filtering_window}", ) # sql_query, view_query + def _ligand_substructure_position_filter(self, ligand_filters_dict: dict): + queries = [] + nr_args_per_group = 6 + nr_smarts = int( + len(ligand_filters_dict["ligand_substruct_pos"]) / nr_args_per_group + ) + # create temporary table with molecules that pass all smiles + tmp_lig_filters = {"ligand_operator": ligand_filters_dict["ligand_operator"]} + if 
"ligand_max_atoms" in ligand_filters_dict: + tmp_lig_filters["ligand_max_atoms"] = ligand_filters_dict[ + "ligand_max_atoms" + ] + tmp_lig_filters["ligand_substruct"] = [ + ligand_filters_dict["ligand_substruct_pos"][i * nr_args_per_group] + for i in range(nr_smarts) + ] + cmd = self._generate_ligand_filtering_query(tmp_lig_filters) + cmd = cmd.replace( + "SELECT LigName FROM Ligands", + "SELECT " + "Results.Pose_ID, " + "Ligands.LigName, " + "Ligands.ligand_smile, " + "Ligands.atom_index_map, " + "Results.ligand_coordinates " + "FROM Ligands INNER JOIN Results ON Results.LigName = Ligands.LigName", + ) + cmd = "CREATE TEMP TABLE passed_smarts AS " + cmd + cur = self.conn.cursor() + cur.execute("DROP TABLE IF EXISTS passed_smarts") + cur.execute(cmd) + smarts_loc_filters = [] + for i in range(nr_smarts): + smarts = ligand_filters_dict["ligand_substruct_pos"][ + i * nr_args_per_group + 0 + ] + index = int( + ligand_filters_dict["ligand_substruct_pos"][i * nr_args_per_group + 1] + ) + sqdist = ( + float( + ligand_filters_dict["ligand_substruct_pos"][ + i * nr_args_per_group + 2 + ] + ) + ** 2 + ) + x = float( + ligand_filters_dict["ligand_substruct_pos"][i * nr_args_per_group + 3] + ) + y = float( + ligand_filters_dict["ligand_substruct_pos"][i * nr_args_per_group + 4] + ) + z = float( + ligand_filters_dict["ligand_substruct_pos"][i * nr_args_per_group + 5] + ) + # save filter for bookmark + smarts_loc_filters.append((smarts, index, x, y, z)) + poses = self._run_query("SELECT * FROM passed_smarts") + pose_id_list = [] + smartsmol = Chem.MolFromSmarts(smarts) + for pose_id, ligname, smiles, idxmap, coords in poses: + mol = Chem.MolFromSmiles(smiles) + idxmap = [int(value) - 1 for value in json.loads(idxmap)] + idxmap = { + idxmap[j * 2]: idxmap[j * 2 + 1] + for j in range(int(len(idxmap) / 2)) + } + for hit in mol.GetSubstructMatches(smartsmol): + xyz = [ + float(value) for value in json.loads(coords)[idxmap[hit[index]]] + ] + d2 = (xyz[0] - x) ** 2 + (xyz[1] - y) ** 
2 + (xyz[2] - z) ** 2 + if d2 <= sqdist: + pose_id_list.append(str(pose_id)) + break # add pose only once + queries.append("Pose_ID IN ({0})".format(",".join(pose_id_list))) + cur.close() + return queries + def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: + """ + Method to generate bitvector strings from pose_ids + + Args: + pose_ids (str): query formatted list of pose_ids (as tuple) + + Returns: + dict: of "pose_id":"bitvector" + """ # create a list of 0 items the length of interaction_indices table ii_length = self._get_length_of_table("Interaction_indices") # for each pose id, get a list of interaction_indices from joining the two tables i and ii @@ -3042,8 +3149,7 @@ def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: # return dict of pose id as string and bitvector return poseid_bv - def _generate_interaction_index_filtering_query(self, interaction_list): - # TODO I think this method can be combined with the next method + def _generate_interaction_index_filtering_query(self, interaction_list) -> iter: """takes list of interaction info for a given ligand, looks up corresponding interaction index @@ -3053,7 +3159,7 @@ def _generate_interaction_index_filtering_query(self, interaction_list): , ] Returns: - str: SQLite-formated query on Interaction_indices table + iter: sqlite cursor with the interaction index/indices """ interaction_info = [ "interaction_type", @@ -3075,7 +3181,7 @@ def _generate_interaction_index_filtering_query(self, interaction_list): ] ) - return sql_string + return self._run_query(sql_string).fetchall() def _generate_interaction_filtering_query(self, interaction_index_list): # TODO refactor -> THIS IS ONE OF THE MAJOR ONES @@ -3088,7 +3194,7 @@ def _generate_interaction_filtering_query(self, interaction_index_list): Returns: str: SQLite-formatted query """ - + # this creates one blob for each interaction, where I rather need one list for all fully specified, and one list for each or statement return """SELECT 
Pose_id FROM (SELECT Pose_ID, interaction_id FROM Interactions WHERE Pose_ID IN subq) @@ -3096,22 +3202,6 @@ def _generate_interaction_filtering_query(self, interaction_index_list): [f"""interaction_id={index}""" for index in interaction_index_list] ) - def _generate_interaction_bv_filtering_query(self, interaction_index_list): - # TODO remove - """takes list of interaction indices and searches for ligand ids - which have those interactions - - Args: - interaction_index_list (list): List of interaction indices - - Returns: - String: SQLite-formatted query - """ - return ( - "SELECT Pose_id FROM (SELECT * FROM Interaction_bitvectors WHERE Pose_ID IN subq) WHERE " - + " OR ".join([f"int_{index} = 1" for index in interaction_index_list]) - ) - def _generate_ligand_filtering_query(self, ligand_filters): # TODO this one is important and might be tricky, use sqlitestudio """write string to select from ligand table From 42561b08145798d664c5b41259effe2cd44ef88e Mon Sep 17 00:00:00 2001 From: maylinnp Date: Tue, 24 Sep 2024 13:35:23 -0700 Subject: [PATCH 29/63] only require ligand operator if specifying ligand substruct --- ringtail/ringtailoptions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ringtail/ringtailoptions.py b/ringtail/ringtailoptions.py index fe5aabde..e4bec778 100644 --- a/ringtail/ringtailoptions.py +++ b/ringtail/ringtailoptions.py @@ -583,7 +583,7 @@ class Filters(RTOptions): "description": "Maximum number of heavy atoms a ligand may have.", }, "ligand_operator": { - "default": "OR", + "default": None, "type": str, "description": "Logical join operator for multiple SMARTS.", }, @@ -622,7 +622,9 @@ def checks(self): f"Given 'score_percentile' {self.le_percentile} not allowed. Should be within percentile range of 0-100." 
) - if self.ligand_operator not in ["OR", "AND"]: + if self.ligand_operator not in ["OR", "AND"] and ( + self.ligand_substruct or self.ligand_substruct_pos + ): raise OptionError( f"Given 'ligand_operator' {self.ligand_operator} not allowed. Must be 'OR' or 'AND'." ) From ce9d67fca57cf148f256ef1a11eb65fc746321b5 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 12:01:51 -0700 Subject: [PATCH 30/63] rewrote unclustered query --- ringtail/ringtailcore.py | 2 +- ringtail/ringtailoptions.py | 6 +- ringtail/storagemanager.py | 464 ++++++++++++++++++++---------------- ringtail/util.py | 5 + 4 files changed, 268 insertions(+), 209 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index c5cc553e..b135aa04 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -919,10 +919,10 @@ def set_filters( "react_any": react_any, "max_miss": max_miss, "ligand_name": ligand_name, + "ligand_operator": ligand_operator, "ligand_substruct": ligand_substruct, "ligand_substruct_pos": ligand_substruct_pos, "ligand_max_atoms": ligand_max_atoms, - "ligand_operator": ligand_operator, } # Create a filter object diff --git a/ringtail/ringtailoptions.py b/ringtail/ringtailoptions.py index e4bec778..6dcb8111 100644 --- a/ringtail/ringtailoptions.py +++ b/ringtail/ringtailoptions.py @@ -563,17 +563,17 @@ class Filters(RTOptions): "description": "Will compute all possible combinations of interaction filters excluding up to 'max_miss' number of interactions from given set. Default will only return union of poses interaction filter combinations. Use with 'enumerate_interaction_combs' for enumeration of poses passing each individual combination of interaction filters.", }, "ligand_name": { - "default": [], + "default": None, "type": list, "description": "Specify ligand name(s). 
Will combine name filters with 'OR'.", }, "ligand_substruct": { - "default": [], + "default": None, "type": list, "description": "SMARTS pattern(s) for substructure matching.", }, "ligand_substruct_pos": { - "default": [], + "default": None, "type": list, "description": "SMARTS pattern(s) for substructure matching, e.g., [''[Oh]C' 0 1.2 -5.5 10.0 15.5'] -> ['smart_string index_of_positioned_atom cutoff_distance x y z'].", }, diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 2854bbd2..dc2bff60 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -19,6 +19,7 @@ import time from importlib.metadata import version from .ringtailoptions import Filters +from .util import numlist2str from .exceptions import ( StorageError, DatabaseInsertionError, @@ -2528,10 +2529,10 @@ def _calc_percentile_cutoff(self, percentile: float, column="docking_score"): raise StorageError("Error while generating percentile query") from e def _generate_outfield_string(self): - """string describing outfields to be written + """list describing outfields to be written Returns: - str: query string + list: column names for which the data is to be displayed Raises: OptionError @@ -2548,6 +2549,7 @@ def _generate_outfield_string(self): return [self.field_to_column_name[field] for field in outfields_list] def _process_filters_for_query(self, filters_dict: dict): + # NOTE this method can maybe be a main class method once we get more database types """ Method that reformats the filters to the specified database columns, handles less than/more than filters, etc @@ -2561,6 +2563,7 @@ def _process_filters_for_query(self, filters_dict: dict): # write energy filters and compile list of interactions to search for numerical_filters = [] interaction_filters = [] + ligand_filters = {} energy_filter_col_name = { "eworst": "docking_score", "ebest": "docking_score", @@ -2625,15 +2628,20 @@ def _process_filters_for_query(self, filters_dict: dict): # add react_any flag as 
interaction filter if not None if filter_key == "react_any" and filter_value: interaction_filters.append(["R", "", "", "", "", True]) + # if filter has to do with ligands and SMARTS + if filter_key in Filters.get_filter_keys("ligand"): + ligand_filters[filter_key] = filter_value - # make dict of just the ligand filters - # TODO inefficient but same as it was before - ligand_filters_dict = { - k: v - for k, v in filters_dict.items() - if k in Filters.get_filter_keys("ligand") - } - return numerical_filters, interaction_filters, ligand_filters_dict + # put all processed filter in a dict + processed_filters = {} + if len(numerical_filters) > 0: + processed_filters["num_filters"] = numerical_filters + if len(interaction_filters) > 0: + processed_filters["int_filters"] = interaction_filters + if len(ligand_filters) > 0: + processed_filters["lig_filters"] = ligand_filters + + return processed_filters def _generate_result_filtering_query(self, filters_dict): # TODO THE biggest one @@ -2648,6 +2656,9 @@ def _generate_result_filtering_query(self, filters_dict): filtering_window = "Results" outfield_columns = self._generate_outfield_string() + num_query = "" + int_query = "" + lig_query = "" # if filtering over a bookmark (i.e., already filtered results) as opposed to a whole database if self.filter_bookmark is not None: @@ -2670,103 +2681,86 @@ def _generate_result_filtering_query(self, filters_dict): # filtering window can be specified bookmark, or whole database (or other reduced versions of db) filtering_window = self.filter_bookmark - queries, interaction_filters, ligand_filters_dict = ( - self._process_filters_for_query(filters_dict) - ) - num_of_interactions = len(interaction_filters) - interaction_queries = False - exclude_interactions = [] - include_interactions = [] - if interaction_filters != []: - int_not_found = [] - # figure out if each interaction is in database, make a list of list of indices for each interaction - for interaction in interaction_filters: - # 
get all interaction indices matching the interaction filter (returns more than one index if filter has a "wildcard") - interaction_index_tuples = ( - self._generate_interaction_index_filtering_query(interaction[:-1]) + # process filter values to lists and dicts that are easily incorporated in sql queries + processed_filters = self._process_filters_for_query(filters_dict) + + # check if clustering + clustering = bool(self.mfpt_cluster or self.interaction_cluster) + # raise error if no filters are present and no clusterings + if not processed_filters and not clustering: + raise DatabaseQueryError( + "Query strings are empty. Please check filter options and ensure requested interactions are present." + ) + + # check what filters are present, and prepare them as partial queries + if "num_filters" in processed_filters: + num_query = " AND ".join( + ["R." + filter for filter in processed_filters["num_filters"]] + ) + + # check for interactions and prepare for query + if "int_filters" in processed_filters: + # if interaction filters are present and valid, two lists of included and excluded interactions are returned + # each item in the lists to be joined by "AND", and each item within the list item (if >1) to be joined by "OR" + interaction_queries = [] + include_interactions, exclude_interactions = ( + self._prepare_interaction_indices_for_filtering( + interaction_list=processed_filters["int_filters"] ) - # make list of indices from iterable cursor tuples (should create empty list of no results) - interaction_indices = [i[0] for i in interaction_index_tuples] - # catch if interaction not found in database - if interaction_indices == []: - if interaction == ["R", "", "", "", "", True]: - self.logger.warning( - "Given 'react_any' filter, no reactive interactions found. Excluded from filtering." 
- ) - else: - # create string representation of ecah interaction not found - int_not_found.append(":".join(interaction[:4])) - continue # ends this iteration of the for loop - - # create a list of lists for interactions to either include or exclude - if interaction[-1] is True: - include_interactions.append(interaction_indices) - elif interaction[-1] is False: - exclude_interactions.append(interaction_indices) - else: - raise RuntimeError( - "Unrecognized flag in interaction. Please contact Forli Lab with traceback and context." - ) - # if one or more interactions not found, raise error - if int_not_found: - raise OptionError( - f"The following interactions do not exist in the database: {int_not_found} not found in the database. Please check for spelling errors or remove from filter." + ) + # ensure there are interactions in the list after processing + if bool(exclude_interactions or include_interactions): + # prepare partial queries for the different interaction combinations + int_query = self._prepare_interaction_filtering_query( + include_interactions, exclude_interactions ) - # set to True if any interaction filters - interaction_queries = bool(exclude_interactions or include_interactions) - - # check if ligand filters have values - ligand_filters = bool( - ligand_filters_dict["ligand_substruct"] - or ligand_filters_dict["ligand_name"] - or ligand_filters_dict["ligand_substruct_pos"] - or ligand_filters_dict["ligand_max_atoms"] - ) - if ligand_filters: + + # check if ligand filters and prepare for query + # returns ligand_queries with partial queries for the vairous ones, to be joined by AND or OR I belive + if "lig_filters" in processed_filters: + lig_filters = processed_filters["lig_filters"] + ligand_queries = [] + # if straight forward ligand filters, generate partial queries if ( - ligand_filters_dict["ligand_substruct"] != [] - or ligand_filters_dict["ligand_name"] != [] + lig_filters["ligand_substruct"] + or lig_filters["ligand_name"] + or 
lig_filters["ligand_max_atoms"] ): - # TODO definitely need to clean this up - ligand_query_str = self._generate_ligand_filtering_query( - ligand_filters_dict - ) - queries.append( - "LigName IN ({ligand_str})".format(ligand_str=ligand_query_str) + ligand_queries.append( + self._generate_ligand_filtering_query(lig_filters) ) - # if ligand_substruct_pos has the correct number of arguments provided - if len(ligand_filters_dict["ligand_substruct_pos"]): - ligand_substruct_pos_filters = ( - self._ligand_substructure_position_filter(ligand_filters_dict) + # if complex ligand filter, generate partial query + if lig_filters["ligand_substruct_pos"]: + ligand_queries.append( + self._ligand_substructure_position_filter(lig_filters) ) - queries.append(ligand_substruct_pos_filters) + # join all ligand queries that are not empty + lig_query = " AND ".join( + [lig_filter for lig_filter in ligand_queries if lig_filter] + ) - # check if clustering - clustering = bool(self.mfpt_cluster or self.interaction_cluster) + # choose columns to be selected from filtering_window + # TODO when to use "DISTINCT" + query_select_string = f"""SELECT {", ".join("R." 
+ column for column in outfield_columns)} FROM {filtering_window} R """ + # TODO how to change the start of this for the bookmark query + # start stringing together queries + unclustered_query = query_select_string + if int_query: + # add with a join statement + unclustered_query += "JOIN " + int_query + " ON R.Pose_ID = I.Pose_ID " + if lig_query: + # add with a join statement + unclustered_query += "JOIN (" + lig_query + ") ON R.LigName = L.LigName " + if num_query: + unclustered_query += "WHERE " + num_query + + print(" unclustered query: ", unclustered_query) + cur = self.conn.cursor() + cur.execute(unclustered_query, (2,)) + print(" unknown number of hits:", (cur.fetchall())) - # raise error if no filters are present - if not queries and not interaction_queries and not clustering: - raise DatabaseQueryError( - "Query strings are empty. Please check filter options and ensure requested interactions are present." - ) - # starts to prepare the overarching filtering query - # TODO - sql_string = output_str = ( - f"""SELECT {", ".join("R." + column for column in outfield_columns)} FROM {filtering_window} R""" - ) - start_of_query = f"""SELECT DISTINCT {", ".join("R." 
+ column for column in outfield_columns)} FROM {filtering_window} R """ - # if just num/ligand queries, build query - if queries and not interaction_queries: - joined_queries = " AND ".join( - queries - ) # TODO this might be the one I need to join in after the interaction queries - new_query = start_of_query + " WHERE " + joined_queries - sql_string += "WHERE " + joined_queries - unclustered_query = ( - f"SELECT Pose_id FROM {filtering_window} WHERE " + joined_queries - ) # if clustering only - elif clustering and not queries and not interaction_queries: + if clustering and not queries and not interaction_queries: # allows for clustering without filtering unclustered_query = f"SELECT Pose_id FROM {filtering_window}" self.logger.info("Preparing to cluster results") @@ -2775,100 +2769,7 @@ def _generate_result_filtering_query(self, filters_dict): self.logger.warning( "If clustering is not performed on a pre-filtered bookmark, the clustering process will be very slow." ) - # includes interactions - else: - joined_queries = " AND ".join("R." 
+ query for query in queries) - # TODO building my new query for testing - # add the join and select part - new_query = ( - start_of_query + "JOIN (SELECT Pose_id FROM (SELECT Pose_ID, CASE" - ) - # then iteratively build the WHEN for each OR interaction and iterate through the nonsensical index - index_for_wildcard_interactions = -10000 - or_part_of_query = "" - # if interaction id part of or statement, give nonsensical index for counting purposes - or_included_interactions = [ - indices for indices in include_interactions if len(indices) > 1 - ] - # # perform for included interactions - for indices in or_included_interactions: - index_for_wildcard_interactions += 1 - or_part_of_query += ( - " WHEN interaction_id IN " - + str(tuple(indices)) - + " THEN " - + str(index_for_wildcard_interactions) - ) - # perform for excluded interactions - or_excluded_interactions = [ - indices for indices in exclude_interactions if len(indices) > 1 - ] - for indices in or_excluded_interactions: - index_for_wildcard_interactions += 1 - or_part_of_query += ( - " WHEN interaction_id NOT IN " - + str(tuple(indices)) - + " THEN " - + str(index_for_wildcard_interactions) - ) - # finalize the CASE statement by counting other interactions normally - or_part_of_query += ( - " ELSE interaction_id END AS filtered_interactions FROM Interactions " - ) - # then prepare indices that are to be included, both from OR and AND statement - and_included_interactions = [] - for index in [ - indices for indices in include_interactions if len(indices) <= 1 - ]: - # ensure no duplicates - if index[0] not in and_included_interactions: - and_included_interactions.append(index[0]) - print(" AND included:", index[0]) - # add the included OR indices - for index_list in or_included_interactions: - for index in index_list: - if index not in and_included_interactions: - and_included_interactions.append(index) - print(" OR included:", index) - - and_excluded_interactions = [] - for index in [ - indices for indices 
in exclude_interactions if len(indices) <= 1 - ]: - # ensure no duplicates - if index[0] not in and_excluded_interactions: - and_excluded_interactions.append(index[0]) - print(" AND excluded:", index[0]) - - # add the excluded OR indices - for index_list in or_excluded_interactions: - for index in index_list: - if index not in and_excluded_interactions: - and_excluded_interactions.append(index) - print(" OR excluded:", index) - - include_all = ( - "WHERE interaction_id IN (" - + ",".join([str(x) for x in and_included_interactions]) - + ")" - ) - print(" include_all", include_all) - # TODO something flashy to get whether or not both statements are included - exclude_all = ( - " AND interaction_id NOT IN (" - + ",".join([str(x) for x in and_excluded_interactions]) - + ")" - ) - print(" exclude_all", exclude_all) - num_of_interactions -= 1 - new_query += ( - or_part_of_query - + include_all - + exclude_all - + f") GROUP BY Pose_id HAVING COUNT (DISTINCT filtered_interactions) = {num_of_interactions}) I " - ) - new_query += "ON R.pose_id = I.pose_id WHERE " + joined_queries print(new_query) cur = self._run_query(new_query) print(" This should be 19 hits:", (cur.fetchall())) @@ -3037,6 +2938,107 @@ def mp_wrapper(input_tpl): f"SELECT * FROM {filtering_window}", ) # sql_query, view_query + def _prepare_interaction_filtering_query( + self, include_interactions: list, exclude_interactions: list + ) -> str: + """ + _summary_ + + Args: + include_interactions (list): _description_ + exclude_interactions (list): _description_ + + Returns: + str: _description_ + """ + # nonsensical number to count an interaction if it satisfies an incomplete ("wildcard") interaction + nonsense_counter = -10000 + + def _prepare_indices_for_query(interactions: list): + """ + Method to organize a list of indices into those to be included in an OR and an AND statement + + Args: + interactions (list): list of indices organized by how they were queried from the db + + Returns: + list, list: indices 
organized in lists appropriate for the query + """ + and_interactions = [] + or_interactions = [] + # add the included OR indices + for index_list in interactions: + # one list per interaction, mode indices if interaction had a wildcard + for index in index_list: + # if index not already represented + if index not in and_interactions: + # add to list + and_interactions.append(index) + # if index list has more than one element, they should also be combined in an "OR" statement + if len(index_list) > 1: + # adds a string element to the list + or_interactions.append(index_list) + return and_interactions, or_interactions + + # prepare lists of indices ready to be cast to tuples and strings + if include_interactions: + and_include_interactions, or_include_interactions = ( + _prepare_indices_for_query(include_interactions) + ) + if exclude_interactions: + and_exclude_interactions, or_exclude_interactions = ( + _prepare_indices_for_query(exclude_interactions) + ) + + # building the query + # 1. select pose id, call CASE, in paranthesis because grouping with different query + query = "(SELECT Pose_ID FROM (SELECT Pose_ID " + if or_include_interactions or or_exclude_interactions: + # add the case statements + query += ", CASE " + # 2. list all OR statements + # TODO catch if no OR statements + for interactions in or_include_interactions: + # iterate the nonsense counter + nonsense_counter += 1 + query += ( + "WHEN interaction_id IN (" + + numlist2str(interactions, ",") + + f") THEN {nonsense_counter} " + ) + for interactions in or_exclude_interactions: + # iterate the nonsense counter + nonsense_counter += 1 + query += ( + "WHEN interaction_id NOT IN (" + + numlist2str(interactions, ",") + + f") THEN {nonsense_counter} " + ) + query += "ELSE interaction_id END " + # 3. 
proceed with all interactions + query += "AS filtered_interactions FROM Interactions WHERE " + if and_include_interactions: + query += ( + "interaction_id IN (" + + numlist2str(and_include_interactions, ",") + + ") " + ) + # if both include and exclude are there, need "AND" + if and_exclude_interactions: + query += "AND " + if and_exclude_interactions: + query += ( + "interaction_id NOT IN (" + + numlist2str(and_exclude_interactions, ",") + + ") " + ) + # 4. add grouping and wildcard for total interactions minus max_miss, essentially + query += ( + ") GROUP BY Pose_ID HAVING COUNT(DISTINCT filtered_interactions) = (?)) I " + ) + + return query + def _ligand_substructure_position_filter(self, ligand_filters_dict: dict): queries = [] nr_args_per_group = 6 @@ -3055,14 +3057,14 @@ def _ligand_substructure_position_filter(self, ligand_filters_dict: dict): ] cmd = self._generate_ligand_filtering_query(tmp_lig_filters) cmd = cmd.replace( - "SELECT LigName FROM Ligands", + "SELECT L.LigName FROM Ligands L", "SELECT " - "Results.Pose_ID, " - "Ligands.LigName, " - "Ligands.ligand_smile, " - "Ligands.atom_index_map, " - "Results.ligand_coordinates " - "FROM Ligands INNER JOIN Results ON Results.LigName = Ligands.LigName", + "R.Pose_ID, " + "L.LigName, " + "L.ligand_smile, " + "L.atom_index_map, " + "R.ligand_coordinates " + "FROM Ligands L INNER JOIN Results R ON R.LigName = L.LigName", ) cmd = "CREATE TEMP TABLE passed_smarts AS " + cmd cur = self.conn.cursor() @@ -3113,7 +3115,8 @@ def _ligand_substructure_position_filter(self, ligand_filters_dict: dict): if d2 <= sqdist: pose_id_list.append(str(pose_id)) break # add pose only once - queries.append("Pose_ID IN ({0})".format(",".join(pose_id_list))) + if len(pose_id_list) > 0: + queries.append("Pose_ID IN ({0})".format(",".join(pose_id_list))) cur.close() return queries @@ -3149,7 +3152,59 @@ def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: # return dict of pose id as string and bitvector return poseid_bv - 
def _generate_interaction_index_filtering_query(self, interaction_list) -> iter: + def _prepare_interaction_indices_for_filtering(self, interaction_list): + """ + _summary_ + + Args: + interaction_list (): _description_ + + Raises: + OptionError + + Returns: + list: two lists of indices for interactions to exclude and to include + """ + # initialize variables + exclude_interactions = [] + include_interactions = [] + interaction_not_found = [] + + # figure out if each interaction is in database, make a list of list of indices for each interaction + for interaction in interaction_list: + # get all interaction indices matching the interaction filter (returns more than one index if filter has a "wildcard") + interaction_index_tuples = self._get_interaction_indices(interaction[:-1]) + # make list of indices from iterable cursor tuples (should create empty list if no results) + interaction_indices = [i[0] for i in interaction_index_tuples] + # catch if interaction not found in database + if interaction_indices == []: + if interaction == ["R", "", "", "", "", True]: + self.logger.warning( + "Given 'react_any' filter, no reactive interactions found. Excluded from filtering." + ) + else: + # create string representation of ecah interaction not found + interaction_not_found.append(":".join(interaction[:4])) + continue # ends this iteration of the for loop + + # create a list of lists for interactions to either include or exclude + if interaction[-1] is True: + include_interactions.append(interaction_indices) + elif interaction[-1] is False: + exclude_interactions.append(interaction_indices) + else: + raise OptionError( + "Unrecognized flag in interaction. Please contact Forli Lab with traceback and context." + ) + # if one or more interactions not found, raise error + if interaction_not_found: + raise OptionError( + f"The following interactions do not exist in the database: {interaction_not_found} not found in the database. 
Please check for spelling errors or remove from filter." + ) + else: + return include_interactions, exclude_interactions + + def _get_interaction_indices(self, interaction_list) -> iter: """takes list of interaction info for a given ligand, looks up corresponding interaction index @@ -3194,7 +3249,6 @@ def _generate_interaction_filtering_query(self, interaction_index_list): Returns: str: SQLite-formatted query """ - # this creates one blob for each interaction, where I rather need one list for all fully specified, and one list for each or statement return """SELECT Pose_id FROM (SELECT Pose_ID, interaction_id FROM Interactions WHERE Pose_ID IN subq) @@ -3202,8 +3256,8 @@ def _generate_interaction_filtering_query(self, interaction_index_list): [f"""interaction_id={index}""" for index in interaction_index_list] ) - def _generate_ligand_filtering_query(self, ligand_filters): - # TODO this one is important and might be tricky, use sqlitestudio + def _generate_ligand_filtering_query(self, ligand_filters: dict) -> str: + # TODO want to clean this one up """write string to select from ligand table Args: @@ -3213,7 +3267,7 @@ def _generate_ligand_filtering_query(self, ligand_filters): str: SQLite-formatted query, Dict: dictionary of filters and values """ - sql_ligand_string = "SELECT LigName FROM Ligands WHERE" + sql_ligand_string = "SELECT L.LigName FROM Ligands L WHERE" logical_operator = ligand_filters["ligand_operator"] if logical_operator is None: logical_operator = "AND" @@ -3223,7 +3277,7 @@ def _generate_ligand_filtering_query(self, ligand_filters): for name in fils: if name == "": continue - name_sql_str = " LigName LIKE '%{value}%' OR".format(value=name) + name_sql_str = " L.LigName LIKE '%{value}%' OR".format(value=name) sql_ligand_string += name_sql_str if kw == "ligand_max_atoms" and ligand_filters[kw] is not None: maxatom_sql_str = " mol_num_atms(ligand_rdmol) <= {} {}".format( diff --git a/ringtail/util.py b/ringtail/util.py index 112f3013..6adfec7c 100644 
--- a/ringtail/util.py +++ b/ringtail/util.py @@ -80,3 +80,8 @@ def caller_info(skip=2): del parentframe return package, module, klass, caller, line + + +def numlist2str(list: list, separator: str) -> str: + + return separator.join([str(x) for x in list]) From 1102a37b0bf1b5ed5b76bea851326a2a4a68d3b7 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 13:56:30 -0700 Subject: [PATCH 31/63] rewritten entire filtering query constructor including for clustering --- ringtail/storagemanager.py | 274 ++++++++++++++++++------------------- 1 file changed, 132 insertions(+), 142 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index dc2bff60..c8c6450e 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2686,111 +2686,132 @@ def _generate_result_filtering_query(self, filters_dict): # check if clustering clustering = bool(self.mfpt_cluster or self.interaction_cluster) + # if clustering without filtering + if clustering: + # allows for clustering without filtering + self.logger.info("Preparing to cluster results") + unclustered_query = f"SELECT R.Pose_id FROM {filtering_window} R " + if not processed_filters and filtering_window == "Results": + self.logger.warning( + "If clustering is not performed on a pre-filtered bookmark, the clustering process will be very slow." + ) + else: + # start with empty string, will prepend SELECT statement later + unclustered_query = "" # raise error if no filters are present and no clusterings if not processed_filters and not clustering: raise DatabaseQueryError( "Query strings are empty. Please check filter options and ensure requested interactions are present." ) - - # check what filters are present, and prepare them as partial queries - if "num_filters" in processed_filters: - num_query = " AND ".join( - ["R." 
+ filter for filter in processed_filters["num_filters"]] - ) - - # check for interactions and prepare for query - if "int_filters" in processed_filters: - # if interaction filters are present and valid, two lists of included and excluded interactions are returned - # each item in the lists to be joined by "AND", and each item within the list item (if >1) to be joined by "OR" - interaction_queries = [] - include_interactions, exclude_interactions = ( - self._prepare_interaction_indices_for_filtering( - interaction_list=processed_filters["int_filters"] + # create query string from filters if present + if processed_filters: + # start stringing together queries + # check what filters are present, and prepare them as partial queries + if "num_filters" in processed_filters: + num_query = " AND ".join( + ["R." + filter for filter in processed_filters["num_filters"]] ) - ) - # ensure there are interactions in the list after processing - if bool(exclude_interactions or include_interactions): - # prepare partial queries for the different interaction combinations - int_query = self._prepare_interaction_filtering_query( - include_interactions, exclude_interactions + + # check for interactions and prepare for query + if "int_filters" in processed_filters: + # if interaction filters are present and valid, two lists of included and excluded interactions are returned + # each item in the lists to be joined by "AND", and each item within the list item (if >1) to be joined by "OR" + include_interactions, exclude_interactions = ( + self._prepare_interaction_indices_for_filtering( + interaction_list=processed_filters["int_filters"] + ) ) + # ensure there are interactions in the list after processing + if bool(exclude_interactions or include_interactions): + # prepare partial queries for the different interaction combinations + int_query = self._prepare_interaction_filtering_query( + include_interactions, exclude_interactions + ) - # check if ligand filters and prepare for query - # 
returns ligand_queries with partial queries for the vairous ones, to be joined by AND or OR I belive - if "lig_filters" in processed_filters: - lig_filters = processed_filters["lig_filters"] - ligand_queries = [] - # if straight forward ligand filters, generate partial queries - if ( - lig_filters["ligand_substruct"] - or lig_filters["ligand_name"] - or lig_filters["ligand_max_atoms"] - ): - ligand_queries.append( - self._generate_ligand_filtering_query(lig_filters) + # check if ligand filters and prepare for query + if "lig_filters" in processed_filters: + lig_filters = processed_filters["lig_filters"] + ligand_queries = [] + # if straight forward ligand filters, generate partial queries + if ( + lig_filters["ligand_substruct"] + or lig_filters["ligand_name"] + or lig_filters["ligand_max_atoms"] + ): + ligand_queries.append( + self._generate_ligand_filtering_query(lig_filters) + ) + # if complex ligand filter, generate partial query + if lig_filters["ligand_substruct_pos"]: + ligand_queries.append( + self._ligand_substructure_position_filter(lig_filters) + ) + # join all ligand queries that are not empty + lig_query = " AND ".join( + [lig_filter for lig_filter in ligand_queries if lig_filter] ) - # if complex ligand filter, generate partial query - if lig_filters["ligand_substruct_pos"]: - ligand_queries.append( - self._ligand_substructure_position_filter(lig_filters) + if int_query: + # add with a join statement + unclustered_query += "JOIN " + int_query + " ON R.Pose_ID = I.Pose_ID " + if lig_query: + # add with a join statement + unclustered_query += ( + "JOIN (" + lig_query + ") ON R.LigName = L.LigName " ) - # join all ligand queries that are not empty - lig_query = " AND ".join( - [lig_filter for lig_filter in ligand_queries if lig_filter] - ) - + if num_query: + unclustered_query += "WHERE " + num_query + # if clustering is requested, do that before saving view or filtering results for output + if clustering: + # add appropriate select + try: + query = 
self._prepare_cluster_query(unclustered_query) + query = "WHERE " + query + except OptionError as e: + raise e + else: + # if not clustering, rename query + query = unclustered_query # choose columns to be selected from filtering_window # TODO when to use "DISTINCT" query_select_string = f"""SELECT {", ".join("R." + column for column in outfield_columns)} FROM {filtering_window} R """ - # TODO how to change the start of this for the bookmark query - # start stringing together queries - unclustered_query = query_select_string - if int_query: - # add with a join statement - unclustered_query += "JOIN " + int_query + " ON R.Pose_ID = I.Pose_ID " - if lig_query: - # add with a join statement - unclustered_query += "JOIN (" + lig_query + ") ON R.LigName = L.LigName " - if num_query: - unclustered_query += "WHERE " + num_query - - print(" unclustered query: ", unclustered_query) - cur = self.conn.cursor() - cur.execute(unclustered_query, (2,)) - print(" unknown number of hits:", (cur.fetchall())) - - # if clustering only - if clustering and not queries and not interaction_queries: - # allows for clustering without filtering - unclustered_query = f"SELECT Pose_id FROM {filtering_window}" - self.logger.info("Preparing to cluster results") - # if filtering window is Results the clustering happens on the entire database - if filtering_window == "Results": - self.logger.warning( - "If clustering is not performed on a pre-filtered bookmark, the clustering process will be very slow." 
- ) - - print(new_query) - cur = self._run_query(new_query) - print(" This should be 19 hits:", (cur.fetchall())) # adding if we only want to keep one pose per ligand (will keep first entry) if not self.output_all_poses: - sql_string += " GROUP BY LigName" - + query += " GROUP BY LigName " # add how to order results if self.order_results: - try: - sql_string += ( - " ORDER BY " + self.field_to_column_name[self.order_results] - ) - except KeyError: - raise RuntimeError( - "Please ensure you are only requesting one option for --order_results and have written it correctly" - ) from None + query += "ORDER BY " + self.field_to_column_name[self.order_results] - # if clustering is requested, do that before saving view or filtering results for output - # Define clustering setup - def clusterFps( + output_query = query_select_string + query + view_query = f"SELECT * FROM {filtering_window} R " + query + print(" final query: ", query) + + return output_query, view_query + + def _prepare_cluster_query(self, unclustered_query: str) -> str | None: + """ + These methods will take (filtered, hopefully) data, then run the cluster query and cluster the filtered data. + This will output pose_ids that are representative of the clusters, and these pose_ids will be returned so that + they can be added to the unclustered query in the main filtering method. + They will only return a simple string since the filters were already applied, so the returning query is now the only query! + + Args: + unclustered_query (str): _description_ + + Returns: + str | None: _description_ + + Yields: + Iterator[str | None]: _description_ + """ + self.logger.warning( + "WARNING: Clustering can be memory-constrained. Using overly-permissive filters with clustering may cause issues." 
+ ) + if self.interaction_cluster and self.mfpt_cluster: + self.logger.warning( + "N.B.: If using both interaction and morgan fingerprint clustering, the morgan fingerprint clustering will be performed on the results staus post interaction fingerprint clustering." + ) + + def _clusterFps( fps, cutoff ): # https://macinchem.org/2023/03/05/options-for-clustering-large-datasets-of-molecules/ """ @@ -2820,12 +2841,10 @@ def mp_wrapper(input_tpl): cs = Butina.ClusterData(dists, nfps, cutoff, isDistData=True) return cs - if self.interaction_cluster is not None: + cluster_query_string = None + + if self.interaction_cluster: cluster_query = f"SELECT Pose_ID, leff FROM Results WHERE Pose_ID IN ({unclustered_query})" - # if interaction filters are present - if interaction_queries != []: - # include them in the clustering query - cluster_query = with_stmt + cluster_query # resulting data # new poseid_leffs = self._run_query(cluster_query).fetchall() @@ -2842,7 +2861,7 @@ def mp_wrapper(input_tpl): for poseid_leff in poseid_leffs ] # index 2 is the bitvector string element - bclusters = clusterFps( + bclusters = _clusterFps( [ DataStructs.CreateFromBitString(poseid_leff_bv[2]) for poseid_leff_bv in poseid_leff_bvs @@ -2854,6 +2873,7 @@ def mp_wrapper(input_tpl): ) # select ligand from each cluster with best ligand efficiency + # interaction clusters representative pose ids int_rep_poseids = [] for cluster in bclusters: @@ -2861,8 +2881,6 @@ def mp_wrapper(input_tpl): c_leffs = np.array( [poseid_leff_bvs[cluster_element][1] for cluster_element in cluster] ) - # beware magic numbers - # element 0 ([0]) in each leff_poseid_ifps row is the pose_id best_lig_c = poseid_leff_bvs[cluster[np.argmin(c_leffs)]][0] int_rep_poseids.append(str(best_lig_c)) @@ -2877,36 +2895,32 @@ def mp_wrapper(input_tpl): # catch if no pose_ids returned if int_rep_poseids == []: - self.logger.warning( + raise OptionError( "No passing results prior to clustering. Clustering not performed." 
) else: + cluster_query_string = "R.Pose_ID = " + " OR R.Pose_ID = ".join( + int_rep_poseids + ) + # if no more clustering if self.mfpt_cluster is None: - sql_string = ( - output_str + "Pose_ID=" + " OR Pose_ID=".join(int_rep_poseids) - ) + return cluster_query_string else: - unclustered_query = f"SELECT Pose_ID FROM Results WHERE {'Pose_ID=' + ' OR Pose_ID='.join(int_rep_poseids)}" + # carry the pose ids returned by this cluster to the MFPT clustering + unclustered_query = ( + f"SELECT R.Pose_ID FROM Results WHERE {cluster_query_string}" + ) - if self.mfpt_cluster is not None: - self.logger.warning( - "WARNING: Ligand morgan fingerprint clustering is memory-constrained. Using overly-permissive filters with clustering may cause issues." - ) - self.logger.warning( - "N.B.: If using both interaction and morgan fingerprint clustering, the morgan fingerprint clustering will be performed on the results staus post interaction fingerprint clustering." - ) - cluster_query = f"SELECT Results.Pose_ID, Results.leff, mol_morgan_bfp(Ligands.ligand_rdmol, 2, 1024) FROM Ligands INNER JOIN Results ON Results.LigName = Ligands.LigName WHERE Results.Pose_ID IN ({unclustered_query})" - if interaction_queries != []: - cluster_query = with_stmt + cluster_query + if self.mfpt_cluster: + cluster_query = f"SELECT R.Pose_ID, R.leff, mol_morgan_bfp(Ligands.ligand_rdmol, 2, 1024) FROM Ligands L INNER JOIN Results R ON R.LigName = L.LigName WHERE R.Pose_ID IN ({unclustered_query})" poseid_leff_mfps = self._run_query(cluster_query).fetchall() - bclusters = clusterFps( + bclusters = _clusterFps( [DataStructs.CreateFromBinaryText(mol[2]) for mol in poseid_leff_mfps], self.mfpt_cluster, ) self.logger.info( f"Number of Morgan fingerprint butina clusters: {len(bclusters)}" ) - # select ligand from each cluster with best ligand efficiency fp_rep_poseids = [] for c in bclusters: @@ -2923,20 +2937,15 @@ def mp_wrapper(input_tpl): # catch if no pose_ids returned if fp_rep_poseids == []: - 
self.logger.warning( + raise OptionError( "No passing results prior to clustering. Clustering not performed." ) else: - sql_string = ( - output_str + "Pose_ID=" + " OR Pose_ID=".join(fp_rep_poseids) + cluster_query_string = "R.Pose_ID = " + " OR R.Pose_ID = ".join( + fp_rep_poseids ) - return sql_string, sql_string.replace( - """SELECT {out_columns} FROM {window}""".format( - out_columns=outfield_string, window=filtering_window - ), - f"SELECT * FROM {filtering_window}", - ) # sql_query, view_query + return cluster_query_string def _prepare_interaction_filtering_query( self, include_interactions: list, exclude_interactions: list @@ -2953,6 +2962,7 @@ def _prepare_interaction_filtering_query( """ # nonsensical number to count an interaction if it satisfies an incomplete ("wildcard") interaction nonsense_counter = -10000 + num_of_interactions = len(include_interactions) + len(exclude_interactions) def _prepare_indices_for_query(interactions: list): """ @@ -3033,9 +3043,7 @@ def _prepare_indices_for_query(interactions: list): + ") " ) # 4. 
add grouping and wildcard for total interactions minus max_miss, essentially - query += ( - ") GROUP BY Pose_ID HAVING COUNT(DISTINCT filtered_interactions) = (?)) I " - ) + query += f") GROUP BY Pose_ID HAVING COUNT(DISTINCT filtered_interactions) = ({num_of_interactions})) I " return query @@ -3238,24 +3246,6 @@ def _get_interaction_indices(self, interaction_list) -> iter: return self._run_query(sql_string).fetchall() - def _generate_interaction_filtering_query(self, interaction_index_list): - # TODO refactor -> THIS IS ONE OF THE MAJOR ONES - """takes list of interaction indices and searches for ligand ids - which have those interactions - - Args: - interaction_index_list (list): List of interaction indices - - Returns: - str: SQLite-formatted query - """ - return """SELECT Pose_id FROM (SELECT Pose_ID, interaction_id - FROM Interactions - WHERE Pose_ID IN subq) - WHERE """ + """ OR """.join( - [f"""interaction_id={index}""" for index in interaction_index_list] - ) - def _generate_ligand_filtering_query(self, ligand_filters: dict) -> str: # TODO want to clean this one up """write string to select from ligand table From d3c88d2f1683f189cdb1ee3b2339215a7c617f9d Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 15:05:39 -0700 Subject: [PATCH 32/63] fix so max_miss works in storageman --- ringtail/storagemanager.py | 47 +++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index c8c6450e..59b52401 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -1811,12 +1811,13 @@ def create_bookmark(self, name, query, temp=False, add_poseID=False, filters={}) temp_flag = "TEMP " else: temp_flag = "" - query = "CREATE {temp_flag}VIEW {name} AS {query}".format( - name=name, query=query, temp_flag=temp_flag + + bookmark_query = f"CREATE {temp_flag}VIEW {name} AS {query}" + self._create_view(name, bookmark_query) + 
self._insert_bookmark_info(name, bookmark_query, filters) + self.logger.debug( + f"Created bookmark from the following query: {bookmark_query}" ) - self._create_view(name=name, query=query) - self._insert_bookmark_info(name, query, filters) - self.logger.debug(f"Created bookmark from the following query: {query}") def _create_view(self, name, query): """takes name and selection query, @@ -2564,6 +2565,7 @@ def _process_filters_for_query(self, filters_dict: dict): numerical_filters = [] interaction_filters = [] ligand_filters = {} + output_options = [] energy_filter_col_name = { "eworst": "docking_score", "ebest": "docking_score", @@ -2624,27 +2626,27 @@ def _process_filters_for_query(self, filters_dict: dict): interaction_filters.append( interaction_string.split(":") + [interact[1]] ) - # add react_any flag as interaction filter if not None if filter_key == "react_any" and filter_value: interaction_filters.append(["R", "", "", "", "", True]) # if filter has to do with ligands and SMARTS if filter_key in Filters.get_filter_keys("ligand"): ligand_filters[filter_key] = filter_value - + if filter_key == "max_miss": + max_miss = filter_value # put all processed filter in a dict processed_filters = {} if len(numerical_filters) > 0: processed_filters["num_filters"] = numerical_filters if len(interaction_filters) > 0: processed_filters["int_filters"] = interaction_filters + processed_filters["max_miss"] = max_miss if len(ligand_filters) > 0: processed_filters["lig_filters"] = ligand_filters return processed_filters def _generate_result_filtering_query(self, filters_dict): - # TODO THE biggest one """takes lists of filters, writes sql filtering string Args: @@ -2725,7 +2727,9 @@ def _generate_result_filtering_query(self, filters_dict): if bool(exclude_interactions or include_interactions): # prepare partial queries for the different interaction combinations int_query = self._prepare_interaction_filtering_query( - include_interactions, exclude_interactions + 
include_interactions, + exclude_interactions, + processed_filters["max_miss"], ) # check if ligand filters and prepare for query @@ -2783,7 +2787,8 @@ def _generate_result_filtering_query(self, filters_dict): output_query = query_select_string + query view_query = f"SELECT * FROM {filtering_window} R " + query - print(" final query: ", query) + print(" final query: ", output_query) + print(" view_query: ", view_query) return output_query, view_query @@ -2948,7 +2953,7 @@ def mp_wrapper(input_tpl): return cluster_query_string def _prepare_interaction_filtering_query( - self, include_interactions: list, exclude_interactions: list + self, include_interactions: list, exclude_interactions: list, max_miss: int ) -> str: """ _summary_ @@ -2956,13 +2961,16 @@ def _prepare_interaction_filtering_query( Args: include_interactions (list): _description_ exclude_interactions (list): _description_ + max_miss (int): _description_ Returns: str: _description_ """ # nonsensical number to count an interaction if it satisfies an incomplete ("wildcard") interaction nonsense_counter = -10000 - num_of_interactions = len(include_interactions) + len(exclude_interactions) + num_of_interactions = ( + len(include_interactions) + len(exclude_interactions) - max_miss + ) def _prepare_indices_for_query(interactions: list): """ @@ -2995,10 +3003,16 @@ def _prepare_indices_for_query(interactions: list): and_include_interactions, or_include_interactions = ( _prepare_indices_for_query(include_interactions) ) + else: + and_include_interactions = [] + or_include_interactions = [] if exclude_interactions: and_exclude_interactions, or_exclude_interactions = ( _prepare_indices_for_query(exclude_interactions) ) + else: + and_exclude_interactions = [] + or_exclude_interactions = [] # building the query # 1. 
select pose id, call CASE, in paranthesis because grouping with different query @@ -3006,8 +3020,8 @@ def _prepare_indices_for_query(interactions: list): if or_include_interactions or or_exclude_interactions: # add the case statements query += ", CASE " - # 2. list all OR statements - # TODO catch if no OR statements + # 2. list all OR statements + if or_include_interactions: for interactions in or_include_interactions: # iterate the nonsense counter nonsense_counter += 1 @@ -3016,6 +3030,7 @@ def _prepare_indices_for_query(interactions: list): + numlist2str(interactions, ",") + f") THEN {nonsense_counter} " ) + if or_exclude_interactions: for interactions in or_exclude_interactions: # iterate the nonsense counter nonsense_counter += 1 @@ -3024,7 +3039,7 @@ def _prepare_indices_for_query(interactions: list): + numlist2str(interactions, ",") + f") THEN {nonsense_counter} " ) - query += "ELSE interaction_id END " + query += "ELSE interaction_id END " # 3. proceed with all interactions query += "AS filtered_interactions FROM Interactions WHERE " if and_include_interactions: @@ -3043,7 +3058,7 @@ def _prepare_indices_for_query(interactions: list): + ") " ) # 4. 
add grouping and wildcard for total interactions minus max_miss, essentially - query += f") GROUP BY Pose_ID HAVING COUNT(DISTINCT filtered_interactions) = ({num_of_interactions})) I " + query += f") GROUP BY Pose_ID HAVING COUNT(DISTINCT filtered_interactions) >= ({num_of_interactions})) I " return query From 57a2e986328f140de93fb76db9c517f2da06604b Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 15:06:04 -0700 Subject: [PATCH 33/63] updated filter dict to compare to since defaults for ligand filters changed --- test/test_units.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/test/test_units.py b/test/test_units.py index 09127c34..53c6a3d1 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -242,11 +242,11 @@ def test_generate_interactions_prepare_filters(self): "hb_count": None, "react_any": None, "max_miss": 0, - "ligand_name": [], - "ligand_substruct": [], - "ligand_substruct_pos": [], + "ligand_name": None, + "ligand_substruct": None, + "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": "OR", + "ligand_operator": None, } in test_filters assert { @@ -262,11 +262,11 @@ def test_generate_interactions_prepare_filters(self): "hb_count": None, "react_any": None, "max_miss": 0, - "ligand_name": [], - "ligand_substruct": [], - "ligand_substruct_pos": [], + "ligand_name": None, + "ligand_substruct": None, + "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": "OR", + "ligand_operator": None, } in test_filters assert { @@ -282,11 +282,11 @@ def test_generate_interactions_prepare_filters(self): "hb_count": None, "react_any": None, "max_miss": 0, - "ligand_name": [], - "ligand_substruct": [], - "ligand_substruct_pos": [], + "ligand_name": None, + "ligand_substruct": None, + "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": "OR", + "ligand_operator": None, } in test_filters assert { @@ -302,11 +302,11 @@ def 
test_generate_interactions_prepare_filters(self): "hb_count": None, "react_any": None, "max_miss": 0, - "ligand_name": [], - "ligand_substruct": [], - "ligand_substruct_pos": [], + "ligand_name": None, + "ligand_substruct": None, + "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": "OR", + "ligand_operator": None, } in test_filters assert { @@ -322,11 +322,11 @@ def test_generate_interactions_prepare_filters(self): "hb_count": None, "react_any": None, "max_miss": 0, - "ligand_name": [], - "ligand_substruct": [], - "ligand_substruct_pos": [], + "ligand_name": None, + "ligand_substruct": None, + "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": "OR", + "ligand_operator": None, } in test_filters assert len(test_filters) == 5 From 07501b4386205a59952e8d2cb9f0598c5670efdc Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 16:46:25 -0700 Subject: [PATCH 34/63] enumerating interaction combinations now work --- ringtail/ringtailcore.py | 90 +++++++++++++++++++++++++++----------- ringtail/storagemanager.py | 10 ++--- 2 files changed, 68 insertions(+), 32 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index b135aa04..0f20cb38 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -238,6 +238,8 @@ def _prepare_filters_for_storageman(self, interaction_combination): for interaction in itype_interactions: if itype + "-" + interaction[0] not in interaction_combination: filters_dict[itype].remove(interaction) + # we want a match for all interactions when enumerating combinations + filters_dict["max_miss"] = 0 return filters_dict @@ -1296,9 +1298,11 @@ def filter( bookmark_name (str): name for resulting book mark file. 
Default value is 'passing_results' filter_bookmark (str): name of bookmark to perform filtering over options_dict (dict): write options as a dict + return_inter (bool): return an iterable of all of the filtering results Returns: int: number of ligands passing filter + iter (optional): an iterable of all of the filtering results """ @@ -1368,34 +1372,30 @@ def filter( self.storageopts.output_all_poses = False self.logger.info("Filtering results...") - - # get possible permutations of interaction with max_miss excluded - interaction_combs = self._generate_interaction_combinations( - self.filters.max_miss - ) ligands_passed = 0 - """This for comprehension takes all combinations represented in one union of one or multiple, and filters, and goes around until all combinations have been used to filter - - """ + # get possible permutations of interaction with max_miss excluded + if self.filters.max_miss > 0 and self.outputopts.enumerate_interaction_combs: + write_one_bookmark = False + else: + write_one_bookmark = True + with self.storageman: - for ic_idx, combination in enumerate(interaction_combs): - # prepare Filter object with only desired interaction combination for storageManager - filters_dict = self._prepare_filters_for_storageman(combination) - # set storageMan's internal ic_counter to reflect current ic_idx - if len(interaction_combs) > 1: - self.storageman.set_bookmark_suffix(ic_idx) - # ask storageManager to fetch results + # pre-process if filtering to multiple bookmark combinations + if write_one_bookmark: filtered_results = self.storageman.filter_results( - filters_dict, not self.outputopts.enumerate_interaction_combs + self.filters.todict(), ) + # if there were results of the filtering if filtered_results: + # if retuning an iterable with the resulting pose ids if return_iter: return filtered_results result_bookmark_name = self.storageman.get_current_bookmark_name() + # write output log file with self.outputman: self.outputman.write_filters_to_log( 
self.filters.todict(), - combination, + [], f"Morgan Fingerprints butina clustering cutoff: {self.storageman.mfpt_cluster}\nInteraction Fingerprints clustering cutoff: {self.storageman.interaction_cluster}", ) self.outputman.write_results_bookmark_to_log( @@ -1407,18 +1407,58 @@ def filter( self.outputman.log_num_passing_ligands(number_passing) print("\nNumber of ligands passing filters:", number_passing) ligands_passed = number_passing - elif len(interaction_combs) > 1: - self.logger.warning( - f"WARNING: No ligands found passing given interaction combination {combination}" - ) - self.storageman.drop_bookmark(self.storageman.bookmark_name) else: self.logger.warning(f"WARNING: No ligands found passing filter.") self.storageman.drop_bookmark(self.storageman.bookmark_name) - if len(interaction_combs) > 1: - maxmiss_union_results = self.storageman.get_maxmiss_union( - len(interaction_combs) + # else produce a bookmark for each interaction combination + elif not write_one_bookmark: + # TODO in this case max_miss has to be the exact number of interactions in each combo + interaction_combs = self._generate_interaction_combinations( + self.filters.max_miss ) + for ic_idx, combination in enumerate(interaction_combs): + # prepare Filter object with only desired interaction combination for storageManager + filters_dict = self._prepare_filters_for_storageman(combination) + # set storageMan's internal ic_counter to reflect current ic_idx + if len(interaction_combs) > 1: + self.storageman.set_bookmark_suffix(ic_idx) + # ask storageManager to fetch results + filtered_results = self.storageman.filter_results( + filters_dict, + not self.outputopts.enumerate_interaction_combs, + ) + if filtered_results: + if return_iter: + return filtered_results + result_bookmark_name = ( + self.storageman.get_current_bookmark_name() + ) + with self.outputman: + self.outputman.write_filters_to_log( + self.filters.todict(), + combination, + f"Morgan Fingerprints butina clustering cutoff: 
{self.storageman.mfpt_cluster}\nInteraction Fingerprints clustering cutoff: {self.storageman.interaction_cluster}", + ) + self.outputman.write_results_bookmark_to_log( + result_bookmark_name + ) + number_passing = self.outputman.write_filter_log( + filtered_results + ) + self.outputman.log_num_passing_ligands(number_passing) + print( + "\nNumber of ligands passing filters:", number_passing + ) + ligands_passed = number_passing + elif len(interaction_combs) > 1: + self.logger.warning( + f"WARNING: No ligands found passing given interaction combination {combination}" + ) + self.storageman.drop_bookmark(self.storageman.bookmark_name) + if len(interaction_combs) > 1: + maxmiss_union_results = self.storageman.get_maxmiss_union( + len(interaction_combs) + ) with self.outputman: self.outputman.write_maxmiss_union_header() self.outputman.write_results_bookmark_to_log( diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 59b52401..a1276dbb 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -246,7 +246,7 @@ def filter_results(self, all_filters: dict, suppress_output=False) -> iter: ) self.logger.debug(f"Query for filtering results: {filter_results_str}") - # if max_miss is not 0, we want to give each passing view a new name by changing the self.bookmark_name + # if max_miss> and we are enumerating interaction combinations, we want to give each passing view a new name by changing the self.bookmark_name if self.view_suffix is not None: self.current_bookmark_name = self.bookmark_name + "_" + self.view_suffix else: @@ -2377,8 +2377,6 @@ def _get_number_passing_ligands(self, bookmark_name: str | None = None): ) from e def get_maxmiss_union(self, total_combinations: int): - # TODO probably remove as union can happen automatically. 
Then if enumerating_interaction_combinations, - # just create the other bookmarks separately for each interaction combination through the method in the core """Get results that are in union considering max miss Args: @@ -2389,10 +2387,10 @@ def get_maxmiss_union(self, total_combinations: int): """ selection_strs = [] view_strs = [] - outfield_str = self._generate_outfield_string() + outfield_list = self._generate_outfield_string() for i in range(total_combinations): selection_strs.append( - f"SELECT {outfield_str} FROM {self.bookmark_name + '_' + str(i)}" + f"""SELECT {", ".join(outfield_list)} FROM {self.bookmark_name + '_' + str(i)}""" ) view_strs.append(f"SELECT * FROM {self.bookmark_name + '_' + str(i)}") @@ -2787,8 +2785,6 @@ def _generate_result_filtering_query(self, filters_dict): output_query = query_select_string + query view_query = f"SELECT * FROM {filtering_window} R " + query - print(" final query: ", output_query) - print(" view_query: ", view_query) return output_query, view_query From 1b28cb1f8b19167aed2e318ed8cc7ed8ef7e1332 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 16:53:36 -0700 Subject: [PATCH 35/63] removed nonsensical filter value --- test/test_units.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/test_units.py b/test/test_units.py index 53c6a3d1..487dea90 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -569,7 +569,6 @@ def test_bookmark_info(self, dbquery): eworst=-3, vdw_interactions=[("A:ALA:213:", True), ("A:VAL:279:", True)], hb_interactions=[("A:ALA:213:", True)], - ligand_operator="OR", ) curs = dbquery( "SELECT filters FROM Bookmarks WHERE Bookmark_name LIKE 'passing_results'" @@ -589,11 +588,11 @@ def test_bookmark_info(self, dbquery): "hb_count": None, "react_any": None, "max_miss": 0, - "ligand_name": [], - "ligand_substruct": [], - "ligand_substruct_pos": [], + "ligand_name": None, + "ligand_substruct": None, + "ligand_substruct_pos": None, "ligand_max_atoms": None, 
- "ligand_operator": "OR", + "ligand_operator": None, } assert bookmark_filters_db_str == json.dumps(filters) From 329861b44b3adfac623070f21030df2de2f6f111 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 19:17:00 -0700 Subject: [PATCH 36/63] remove bitvector table again --- ringtail/ringtailcore.py | 19 ++-- ringtail/storagemanager.py | 205 +++++-------------------------------- 2 files changed, 32 insertions(+), 192 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index 0f20cb38..e7bb68d2 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -153,25 +153,18 @@ def _add_poses( flexres_pose, ) in poses: # fetch info about pose interactions and format into string with format -::::, joined by commas - pose_bitvector = self.storageman.fetch_interaction_bitvector(Pose_ID) + interactions = self.storageman.fetch_pose_interactions(Pose_ID) # if that pose id has interactions - if pose_bitvector is not None: + if interactions is not None: # make a list of all of them - interaction_indices = [] interactions_list = [] - # for each interaction bit, make into a string according to format above - for idx, bit in enumerate(pose_bitvector): - if bit == 1: - interaction_indices.append(idx) - for int_idx in interaction_indices: - # TODO refactor here if I refactor the method in storageman - interaction_info = self.storageman.fetch_interaction_info_by_index( - int_idx - ) + # for each interaction row, make into a string according to format above + for interaction_info in interactions: interaction = ( interaction_info[0] + "-" + ":".join(interaction_info[1:]) ) interactions_list.append(interaction) + interactions_str = ", ".join(interactions_list) properties["Interactions"].append(interactions_str) @@ -1656,6 +1649,8 @@ def find_similar_ligands(self, query_ligname: str): ) if similar_ligands is not None: + if not hasattr(self, "outputman"): + self.set_output_options() with self.outputman: 
self.outputman.write_find_similar_header(query_ligname, cluster_name) self.outputman.write_results_bookmark_to_log(bookmark_name) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index a1276dbb..62a7a514 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -195,26 +195,14 @@ def insert_interactions(self, Pose_IDs: list, interactions_list, duplicates): # for each pose id, list interaction_rows = [] - interaction_bv_rows = [] for index, Pose_ID in enumerate(Pose_IDs): # add interaction if unique, returns index of interaction - # insert_interaction_index_row will add a column in interaction_bitvectors if necessary pose_interactions = [ ((Pose_ID,) + self._insert_interaction_index_row(interaction_tuple)) for interaction_tuple in interactions_list[index] ] # adds each pose_interaction row to list interaction_rows.extend(pose_interactions) - # create list of indices - pose_bitvector_precursors = [ - interaction[1] for interaction in pose_interactions - ] - # prepend pose id - pose_bitvector_precursors.insert(0, Pose_ID) - interaction_bv_rows.extend([pose_bitvector_precursors]) - # has the form [pose_id, int_ind1, int_ind2, etc] - # then add new row for pose in bitvector table - self._insert_interaction_bitvector_rows(interaction_bv_rows, duplicates) self._insert_interaction_rows(interaction_rows, duplicates) # endregion @@ -517,7 +505,6 @@ def _create_tables(self): self._create_ligands_table() self._create_receptors_table() self._create_interaction_index_table() - self._create_interaction_bitvector_table() self._create_interaction_table() self._create_bookmark_table() self._create_db_properties_table() @@ -1297,120 +1284,6 @@ def _create_interaction_index_table(self): f"Error while creating interaction index table: {e}" ) from e - def _create_interaction_bitvector_table(self): - """Create table of Pose_IDs and their interaction bitvector fingerprint decomposed into columns (one per interaction). 
- - Columns are: - Pose_ID INTEGER FOREIGN KEY from RESULTS(Pose_ID), - int_0 (number corresponds to interaction_id in Interaction_indices table) - int_1 - ... - int_n - - Raises: - DatabaseTableCreationError - """ - - interaction_bv_table = f"""CREATE TABLE Interaction_bitvectors ( - interaction_bv_id INTEGER PRIMARY KEY AUTOINCREMENT, - Pose_ID INTEGER, - FOREIGN KEY (Pose_ID) REFERENCES RESULTS(Pose_ID));""" - - try: - cur = self.conn.cursor() - cur.execute(interaction_bv_table) - cur.close() - self.logger.debug("Interaction bitvector table has been created") - except sqlite3.OperationalError as e: - raise DatabaseTableCreationError( - f"Error while creating interaction bitvector table: {e}." - ) from e - - def _insert_interaction_bitvector_rows( - self, pose_id_interaction_indices: list, duplicates - ): - """One row is one Pose_id, will inserts a 1 in any column where the column name represents an interaction_index that pose_id has. - Pose_ID that is 1-to-1 with Results table. - - Args: - pose_id_interaction_indices (list): list of pose_id, followed by all interaction indices - duplicates (list(int)): list of pose_ids from results table deemed duplicates, can also contain Nones, will be treated according to self.duplicate_handling - - Raises: - DatabaseInsertionError - """ - # I need a for loop unpacking the list of tuples - # each item in the list is a pose id and its corresponding interaction indices - # so here is a first problem, do I do one insert statement for each pose id? 
- # I can do an executemany but then I have to make interaction tuples for all the represented indices - # I could do a: find longest interaction tuple, for each pose id make a list of that length of zeros - # then - # remove pose id - - # tuple of as many 1s as a pose id has interactions, used in the executemany statement, minus 1 since - - # make a for lop to prepare the insert statements, and do not use executemany to begin with - # this will force me to clean up these duplicate handling methods I think - sql_insert_full = """INSERT INTO Interaction_bitvectors (Pose_ID""" - unnamed_params = "(?" - try: - cur = self.conn.cursor() - if not self.duplicate_handling: # add all results - # for each pose id - for pose in pose_id_interaction_indices: - sql_insert_full = """INSERT INTO Interaction_bitvectors (Pose_ID""" - unnamed_params = "(?" - # make list of all interaction indices, remove pose id - interaction_indices: list = pose[1:] - for interaction in sorted(interaction_indices): - # add name of column for given interaction_index - sql_insert_full += f""",int_{interaction}""" - unnamed_params += ",?" 
- # remove the last comma - unnamed_params += ")" - sql_insert_full += ") VALUES " + unnamed_params - # create list of 1s for each interaction - interaction_bits = [1 for _ in range(len(interaction_indices))] - # add pose id to start of list - interaction_bits.insert(0, pose[0]) - # convert list to tuple for sql insert - interaction_bit_tuple = tuple(interaction_bits) - cur.execute(sql_insert_full, interaction_bit_tuple) - else: - # first, add any poses that are not duplicates - non_duplicates = [ - interaction_row - for interaction_row in interaction_rows - if interaction_row[0] not in duplicates - ] - # check if there are duplicates or if duplicates list contains only None - duplicates_exist = bool(duplicates.count(None) != len(duplicates)) - cur.executemany(sql_insert, non_duplicates) - - # only look for values to replace if there are duplicate pose ids - if self.duplicate_handling == "REPLACE" and duplicates_exist: - # delete all rows pertaining to duplicated pose_ids - duplicated_pose_ids = [id for id in duplicates if id is not None] - self._delete_interactions(duplicated_pose_ids) - # insert the interaction tuples for the new pose_ids - duplicates_only = [ - interaction_row - for interaction_row in interaction_rows - if interaction_row[0] in duplicates - ] - cur.executemany(sql_insert, duplicates_only) - - elif self.duplicate_handling == "IGNORE": - # ignore and don't add any poses that are duplicates - pass - self.conn.commit() - cur.close() - - except sqlite3.OperationalError as e: - raise DatabaseInsertionError( - f"Error while inserting an interaction row: {e}" - ) from e - def _create_interaction_table(self): """Create table a "tall-skinny" table of each pose-interaction. This table enables proper handling of duplicates if specified. 
@@ -1555,10 +1428,6 @@ def _insert_interaction_index_row(self, interaction_tuple) -> tuple: # create and insert new interaction id input_tuple = interaction_index + interaction_tuple cur.execute(sql_insert, input_tuple) - # create new column in interaction_bitvector table - cur.execute( - f"""ALTER TABLE Interaction_bitvectors ADD COLUMN int_{str(interaction_index[0])}""" - ) self.conn.commit() else: interaction_index = interaction_index[0] @@ -2123,27 +1992,6 @@ def fetch_interaction_info_by_index(self, interaction_idx): ) return self._run_query(query).fetchone()[1:] # cut off interaction index - def fetch_interaction_bitvector(self, pose_id): - # TODO remove - """Returns tuple containing interaction bitvector line for given pose_id - - Args: - pose_id (int): pose id to fetch interaction bitvector for - - Returns: - tuple: tuple representing interaction bitvector - None: if no interactions in database - """ - # catch if database does not have interactions - table_names = [table[0] for table in self._fetch_existing_table_names()] - if "Interaction_bitvectors" not in table_names: - return None - - query = "SELECT * FROM Interaction_bitvectors WHERE Pose_ID = {0}".format( - pose_id - ) - return self._run_query(query).fetchone()[1:] # cut off pose id - def fetch_pose_interactions(self, Pose_ID): """ Fetch all interactions parameters belonging to a Pose_ID @@ -2804,9 +2652,6 @@ def _prepare_cluster_query(self, unclustered_query: str) -> str | None: Yields: Iterator[str | None]: _description_ """ - self.logger.warning( - "WARNING: Clustering can be memory-constrained. Using overly-permissive filters with clustering may cause issues." - ) if self.interaction_cluster and self.mfpt_cluster: self.logger.warning( "N.B.: If using both interaction and morgan fingerprint clustering, the morgan fingerprint clustering will be performed on the results staus post interaction fingerprint clustering." 
@@ -2903,10 +2748,8 @@ def mp_wrapper(input_tpl): cluster_query_string = "R.Pose_ID = " + " OR R.Pose_ID = ".join( int_rep_poseids ) - # if no more clustering - if self.mfpt_cluster is None: - return cluster_query_string - else: + # if more clustering + if self.mfpt_cluster is not None: # carry the pose ids returned by this cluster to the MFPT clustering unclustered_query = ( f"SELECT R.Pose_ID FROM Results WHERE {cluster_query_string}" @@ -2945,7 +2788,7 @@ def mp_wrapper(input_tpl): cluster_query_string = "R.Pose_ID = " + " OR R.Pose_ID = ".join( fp_rep_poseids ) - + print(" cluster_query_string: ", cluster_query_string) return cluster_query_string def _prepare_interaction_filtering_query( @@ -3016,26 +2859,28 @@ def _prepare_indices_for_query(interactions: list): if or_include_interactions or or_exclude_interactions: # add the case statements query += ", CASE " - # 2. list all OR statements - if or_include_interactions: - for interactions in or_include_interactions: - # iterate the nonsense counter - nonsense_counter += 1 - query += ( - "WHEN interaction_id IN (" - + numlist2str(interactions, ",") - + f") THEN {nonsense_counter} " - ) - if or_exclude_interactions: - for interactions in or_exclude_interactions: - # iterate the nonsense counter - nonsense_counter += 1 - query += ( - "WHEN interaction_id NOT IN (" - + numlist2str(interactions, ",") - + f") THEN {nonsense_counter} " - ) - query += "ELSE interaction_id END " + # 2. 
list all OR statements + if or_include_interactions: + for interactions in or_include_interactions: + # iterate the nonsense counter + nonsense_counter += 1 + query += ( + "WHEN interaction_id IN (" + + numlist2str(interactions, ",") + + f") THEN {nonsense_counter} " + ) + if or_exclude_interactions: + for interactions in or_exclude_interactions: + # iterate the nonsense counter + nonsense_counter += 1 + query += ( + "WHEN interaction_id NOT IN (" + + numlist2str(interactions, ",") + + f") THEN {nonsense_counter} " + ) + query += "ELSE interaction_id END " + else: + query += ", interaction_id " # 3. proceed with all interactions query += "AS filtered_interactions FROM Interactions WHERE " if and_include_interactions: From f02e067871e39e149536caba62fdf6f530b02211 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 19:24:04 -0700 Subject: [PATCH 37/63] switched position of two tests --- test/test_units.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_units.py b/test/test_units.py index 487dea90..4a2e4aeb 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -132,23 +132,23 @@ def test_get_filterdata(self): os.system(("rm " + log_file_name)) - def test_similar_ligands_mfpt(self, monkeypatch): + def test_similar_ligands_interaction(self, monkeypatch): rtc = RingtailCore(db_file="output.db") ligand_name = "287065" - rtc.filter(ebest=-6, mfpt_cluster=0.5) + rtc.filter(ebest=-6, interaction_cluster=0.5) monkeypatch.setattr("builtins.input", lambda _: 0) # provides terminal input number_similar = rtc.find_similar_ligands(ligand_name) - assert number_similar == 8 + assert number_similar == 1 - def test_similar_ligands_interaction(self, monkeypatch): + def test_similar_ligands_mfpt(self, monkeypatch): rtc = RingtailCore(db_file="output.db") ligand_name = "287065" - rtc.filter(ebest=-6, interaction_cluster=0.5) + rtc.filter(ebest=-6, mfpt_cluster=0.5) monkeypatch.setattr("builtins.input", lambda _: 1) # provides 
terminal input number_similar = rtc.find_similar_ligands(ligand_name) - assert number_similar == 1 + assert number_similar == 8 def test_create_rdkitmol(self): bookmark_name = "rdkit_test" From a0b2a4fb76ed7742d74ba700412a2de0fabacefa Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 19:26:56 -0700 Subject: [PATCH 38/63] fixed table ref for mfpt clustering --- ringtail/storagemanager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 62a7a514..5e602765 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2692,7 +2692,6 @@ def mp_wrapper(input_tpl): if self.interaction_cluster: cluster_query = f"SELECT Pose_ID, leff FROM Results WHERE Pose_ID IN ({unclustered_query})" # resulting data - # new poseid_leffs = self._run_query(cluster_query).fetchall() cluster_poseids = ( "(" @@ -2756,7 +2755,7 @@ def mp_wrapper(input_tpl): ) if self.mfpt_cluster: - cluster_query = f"SELECT R.Pose_ID, R.leff, mol_morgan_bfp(Ligands.ligand_rdmol, 2, 1024) FROM Ligands L INNER JOIN Results R ON R.LigName = L.LigName WHERE R.Pose_ID IN ({unclustered_query})" + cluster_query = f"SELECT R.Pose_ID, R.leff, mol_morgan_bfp(L.ligand_rdmol, 2, 1024) FROM Ligands L INNER JOIN Results R ON R.LigName = L.LigName WHERE R.Pose_ID IN ({unclustered_query})" poseid_leff_mfps = self._run_query(cluster_query).fetchall() bclusters = _clusterFps( [DataStructs.CreateFromBinaryText(mol[2]) for mol in poseid_leff_mfps], From 3276f4ac71d48ebc04e39226e61c88777e1d7b9c Mon Sep 17 00:00:00 2001 From: maylinnp Date: Wed, 25 Sep 2024 19:27:33 -0700 Subject: [PATCH 39/63] made similar ligand output context managed by storageman --- ringtail/ringtailcore.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index e7bb68d2..05fb6b83 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -1647,16 
+1647,17 @@ def find_similar_ligands(self, query_ligname: str): similar_ligands, bookmark_name, cluster_name = ( self.storageman.fetch_clustered_similars(query_ligname) ) - - if similar_ligands is not None: - if not hasattr(self, "outputman"): - self.set_output_options() - with self.outputman: - self.outputman.write_find_similar_header(query_ligname, cluster_name) - self.outputman.write_results_bookmark_to_log(bookmark_name) - number_similar = self.outputman.write_filter_log(similar_ligands) - self.outputman.log_num_passing_ligands(number_similar) - print("Number similar ligands:", number_similar) + if similar_ligands is not None: + if not hasattr(self, "outputman"): + self.set_output_options() + with self.outputman: + self.outputman.write_find_similar_header( + query_ligname, cluster_name + ) + self.outputman.write_results_bookmark_to_log(bookmark_name) + number_similar = self.outputman.write_filter_log(similar_ligands) + self.outputman.log_num_passing_ligands(number_similar) + print("Number similar ligands:", number_similar) return number_similar def plot(self, save=True, bookmark_name: str = None): From 4b4f3df4d8c8da64c50f2361d990083412397608 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 11:44:06 -0700 Subject: [PATCH 40/63] fixed bug with ligand filtering --- ringtail/storagemanager.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 5e602765..ab491989 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2411,7 +2411,6 @@ def _process_filters_for_query(self, filters_dict: dict): numerical_filters = [] interaction_filters = [] ligand_filters = {} - output_options = [] energy_filter_col_name = { "eworst": "docking_score", "ebest": "docking_score", @@ -2582,17 +2581,16 @@ def _generate_result_filtering_query(self, filters_dict): if "lig_filters" in processed_filters: lig_filters = processed_filters["lig_filters"] 
ligand_queries = [] - # if straight forward ligand filters, generate partial queries if ( - lig_filters["ligand_substruct"] - or lig_filters["ligand_name"] - or lig_filters["ligand_max_atoms"] + "ligand_substruct" in lig_filters + or "ligand_name" in lig_filters + or "ligand_max_atoms" in lig_filters ): ligand_queries.append( self._generate_ligand_filtering_query(lig_filters) ) # if complex ligand filter, generate partial query - if lig_filters["ligand_substruct_pos"]: + if "ligand_substruct_pos" in lig_filters: ligand_queries.append( self._ligand_substructure_position_filter(lig_filters) ) @@ -2606,7 +2604,7 @@ def _generate_result_filtering_query(self, filters_dict): if lig_query: # add with a join statement unclustered_query += ( - "JOIN (" + lig_query + ") ON R.LigName = L.LigName " + "JOIN (" + lig_query + ") L ON R.LigName = L.LigName " ) if num_query: unclustered_query += "WHERE " + num_query @@ -2626,14 +2624,14 @@ def _generate_result_filtering_query(self, filters_dict): query_select_string = f"""SELECT {", ".join("R." 
+ column for column in outfield_columns)} FROM {filtering_window} R """ # adding if we only want to keep one pose per ligand (will keep first entry) if not self.output_all_poses: - query += " GROUP BY LigName " + query += " GROUP BY R.LigName " # add how to order results if self.order_results: query += "ORDER BY " + self.field_to_column_name[self.order_results] output_query = query_select_string + query view_query = f"SELECT * FROM {filtering_window} R " + query - + print(" output query: ", output_query) return output_query, view_query def _prepare_cluster_query(self, unclustered_query: str) -> str | None: @@ -3113,7 +3111,13 @@ def _generate_ligand_filtering_query(self, ligand_filters: dict) -> str: """ sql_ligand_string = "SELECT L.LigName FROM Ligands L WHERE" - logical_operator = ligand_filters["ligand_operator"] + if "ligand_operator" in ligand_filters: + logical_operator = ligand_filters["ligand_operator"] + else: + self.logger.info( + "A logical operator to combine ligand filters were not provided, will use the default value 'OR'." 
+ ) + logical_operator = "OR" if logical_operator is None: logical_operator = "AND" for kw in ligand_filters.keys(): From 8fbd6b09f91886cc433b112dea8750fbefc120fc Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 11:59:38 -0700 Subject: [PATCH 41/63] added test for enumerated interaction combinations --- test/test_units.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/test/test_units.py b/test/test_units.py index 4a2e4aeb..67758361 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -109,9 +109,42 @@ def test_filter(self): hb_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], vdw_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], max_miss=1, + bookmark_name="union_bookmark", ) - + # make sure correct number of ligands passing + assert count_ligands_passing == 33 + # make sure only one bookmark was created + bookmarks = rtc.get_bookmark_names() + assert len(bookmarks) == 1 + assert bookmarks[0] == "union_bookmark" + rtc.drop_bookmark("union_bookmark") + + def test_enumerate_interaction_combinations(self): + # first test without enumerate, check number of passing union as well as number of bookmarks + rtc = RingtailCore(db_file="output.db") + count_ligands_passing = rtc.filter( + eworst=-6, + hb_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], + vdw_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], + max_miss=1, + enumerate_interaction_combs=True, + bookmark_name="enumerated_bookmark", + ) + # make sure correct number of ligands passing assert count_ligands_passing == 33 + # make sure additional bookmarks were created for the enumerated combinations + bookmarks = rtc.get_bookmark_names() + assert len(bookmarks) == 6 + # check that naming works properly + assert "enumerated_bookmark_0" in bookmarks + assert "enumerated_bookmark_union" in bookmarks + + def test_ligand_filters(self): + # ligand name + # ligand substruct + # ligand substruct pos + # ligand 
operator + pass def test_get_filterdata(self): rtc = RingtailCore(db_file="output.db") From 99028a04efd5a4aedad7b2fdceebddeaec791f63 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 12:19:05 -0700 Subject: [PATCH 42/63] defaulting ligand operator to 'OR' --- ringtail/ringtailoptions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ringtail/ringtailoptions.py b/ringtail/ringtailoptions.py index 6dcb8111..1a99f239 100644 --- a/ringtail/ringtailoptions.py +++ b/ringtail/ringtailoptions.py @@ -625,9 +625,10 @@ def checks(self): if self.ligand_operator not in ["OR", "AND"] and ( self.ligand_substruct or self.ligand_substruct_pos ): - raise OptionError( - f"Given 'ligand_operator' {self.ligand_operator} not allowed. Must be 'OR' or 'AND'." + logger.warning( + f"Given 'ligand_operator' {self.ligand_operator} not allowed with 'ligand_substruct' or 'ligand_substruct_pos'. Will be set to default 'OR'." ) + self.ligand_operator = "OR" if self.max_miss < 0: raise OptionError("'max_miss' must be greater than or equal to 0.") From 3dc4581e6434015db07f4fede355c4d3d8a45ac0 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 14:30:23 -0700 Subject: [PATCH 43/63] cast ligand filter values to string while writing log --- ringtail/outputmanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ringtail/outputmanager.py b/ringtail/outputmanager.py index fc562a0c..c164bcb9 100644 --- a/ringtail/outputmanager.py +++ b/ringtail/outputmanager.py @@ -179,7 +179,7 @@ def write_filters_to_log( v = filters_dict.pop(k) if v is not None: if isinstance(v, list): - v = ", ".join([f for f in v if f != ""]) + v = ", ".join([str(f) for f in v if f != ""]) else: v = " [ none ]" buff.append("# % 7s : %s" % (k, v)) From 93af3f06afa9b45beec8356492a3e4089006e160 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 14:38:52 -0700 Subject: [PATCH 44/63] added ligand filters to pytest --- test/test_units.py | 59 
++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/test/test_units.py b/test/test_units.py index 67758361..8539dbbf 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -117,11 +117,12 @@ def test_filter(self): bookmarks = rtc.get_bookmark_names() assert len(bookmarks) == 1 assert bookmarks[0] == "union_bookmark" - rtc.drop_bookmark("union_bookmark") def test_enumerate_interaction_combinations(self): # first test without enumerate, check number of passing union as well as number of bookmarks rtc = RingtailCore(db_file="output.db") + # get current bookmark count + bookmarks_old = rtc.get_bookmark_names() count_ligands_passing = rtc.filter( eworst=-6, hb_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], @@ -132,19 +133,57 @@ def test_enumerate_interaction_combinations(self): ) # make sure correct number of ligands passing assert count_ligands_passing == 33 + # make sure additional bookmarks were created for the enumerated combinations - bookmarks = rtc.get_bookmark_names() - assert len(bookmarks) == 6 + bookmarks_with_new = rtc.get_bookmark_names() + # This filtering session should produce 6 bookmarks + assert len(bookmarks_with_new) - len(bookmarks_old) == 6 + # check that naming works properly - assert "enumerated_bookmark_0" in bookmarks - assert "enumerated_bookmark_union" in bookmarks + assert "enumerated_bookmark_0" in bookmarks_with_new + assert "enumerated_bookmark_union" in bookmarks_with_new def test_ligand_filters(self): - # ligand name - # ligand substruct - # ligand substruct pos - # ligand operator - pass + rtc = RingtailCore(db_file="output.db") + + # tests for partial names + count_ligands_passing = rtc.filter(ligand_name=["88"]) + assert count_ligands_passing == 7 + + # test substructure search (default 'OR' ligand_operator) + count_ligands_passing = rtc.filter(ligand_substruct=["C=O", "CC(C)(C)"]) + assert count_ligands_passing == 90 + + # test substructure search (default 
'OR' ligand_operator) + count_ligands_passing = rtc.filter( + ligand_substruct=["C=O", "CC(C)(C)"], ligand_operator="AND" + ) + assert count_ligands_passing == 18 + + # test substructure with specified position, currently raises an error because substrcut with pos not found + from ringtail import exceptions as e + + with pytest.raises(e.OptionError) as exc_info: + count_ligands_passing = rtc.filter( + ligand_substruct_pos=["[Oh]C", 0, 100, -5.5, 10.0, 15.5] + ) + assert ( + str(exc_info.value) + == "There are no ligands passing the 'ligand_substruct_pos' filter, please revise your filter query." + ) + + def test_all_filters(self): + rtc = RingtailCore(db_file="output.db") + count_ligands_passing = rtc.filter( + eworst=-6, + hb_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], + vdw_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], + max_miss=1, + bookmark_name="big_query", + ligand_name=["88"], + ) + + assert count_ligands_passing == 1 def test_get_filterdata(self): rtc = RingtailCore(db_file="output.db") From 8596bf58515c3b2dd92392d2a709496c78b1defc Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 20:49:36 -0700 Subject: [PATCH 45/63] added bug fix for ligand filter keywords --- docs/source/changes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/changes.rst b/docs/source/changes.rst index d2cfce83..a06c538f 100644 --- a/docs/source/changes.rst +++ b/docs/source/changes.rst @@ -36,6 +36,7 @@ Bug fixes * The option `duplicate_handling` could previously only be applied during database creation and produced inconsistent table behavior. Option can now be applied at any time results are added to a database, and will create internally consistent tables. **Please note: if you have created tables in the past and invoking the keyword `duplicate_handling` you may have errors in the "Interaction_bitvectors" table (<2.0). 
These errors cannot be recovered, and we recommend you re-make the database with Ringtail 2.0.** * Writing SDFs from filtering bookmarks: will check that bookmark exists and has data before writing, and will now produce SDFs for any bookmarks existing bookmarks. If the bookmark results from a filtering where `max_miss` < 0 it will note if the non-union bookmark is used, and if the base name for such bookmarks is provided it will default to the `basename_union` bookmark for writing the SDFs. * Output from filtering using `max_miss` and `output_all_poses=False`(default) now producing expected behavior of outputting only one pose per ligand. Filtering for interactions `max_miss` allows any given pose for a ligand to miss `max_miss` interactions and still be considered to pass the filter. Previously, in the resulting `union` bookmark and `output_log` text file some ligands would present with more than one pose, although the option to `output_all_poses` was `False` (and thus the expectation would be one pose outputted per ligand). This would give the wrong count for how many ligands passed a filter, as some were counted more than once. +* The use of the keywords `--ligand_name`, `--ligand_substruct`, and `--ligand_substruct_pos` had ambiguous behavior where if they were invoked more than once, only the last filter value would be used (as opposed to concatenating the values). They now will work by supplying multiple values to one keyword, as well as one or more values to two or more keywords. Further, `ligand_substruct_pos` now takes input as one string (`"[C][Oh] 1 1.5 -20 42 -7.1"`)as opposed to one string and five numbers (`"[C][Oh]"" 1 1.5 -20 42 -7.1`). 
Changes in 1.1.0: enhanced database performance *********************************************** From 73094b72ab60300bbdd19437d58860e8bff8c93a Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 21:14:16 -0700 Subject: [PATCH 46/63] fixed test bug in compared dict --- test/test_units.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_units.py b/test/test_units.py index 8539dbbf..d4ab77da 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -637,10 +637,13 @@ def test_fetch_summary_data(self): def test_bookmark_info(self, dbquery): rtc = RingtailCore("output.db") + rtc.add_results_from_files( + file_path="test_data/adgpu/group2", + ) rtc.filter( eworst=-3, - vdw_interactions=[("A:ALA:213:", True), ("A:VAL:279:", True)], - hb_interactions=[("A:ALA:213:", True)], + hb_interactions=[("A:VAL:279:", True), ("A:LYS:162:", True)], + vdw_interactions=[("A:VAL:279:", True)], ) curs = dbquery( "SELECT filters FROM Bookmarks WHERE Bookmark_name LIKE 'passing_results'" @@ -654,8 +657,8 @@ def test_bookmark_info(self, dbquery): "lebest": None, "score_percentile": None, "le_percentile": None, - "vdw_interactions": [["A:ALA:213:", True], ["A:VAL:279:", True]], - "hb_interactions": [["A:ALA:213:", True]], + "vdw_interactions": [["A:VAL:279:", True]], + "hb_interactions": [["A:VAL:279:", True], ["A:LYS:162:", True]], "reactive_interactions": [], "hb_count": None, "react_any": None, @@ -663,8 +666,8 @@ def test_bookmark_info(self, dbquery): "ligand_name": None, "ligand_substruct": None, "ligand_substruct_pos": None, - "ligand_max_atoms": None, "ligand_operator": None, + "ligand_max_atoms": None, } assert bookmark_filters_db_str == json.dumps(filters) From 6c23b86dc83f93a5bee5978e3ecd42a4ec05be08 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 21:27:41 -0700 Subject: [PATCH 47/63] removed debug mode from tests --- test/test_cmdline.py | 141 +++++++++++++++++++++++-------------------- 1 file changed, 74 
insertions(+), 67 deletions(-) diff --git a/test/test_cmdline.py b/test/test_cmdline.py index a3220f08..9512a874 100644 --- a/test/test_cmdline.py +++ b/test/test_cmdline.py @@ -28,12 +28,12 @@ class TestInputs: def test_files(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file test_data/adgpu/group1/127458.dlg.gz --file test_data/adgpu/group1/173101.dlg.gz --file test_data/adgpu/group1/100729.dlg.gz" + "python ../ringtail/cli/rt_process_vs.py write --file test_data/adgpu/group1/127458.dlg.gz --file test_data/adgpu/group1/173101.dlg.gz --file test_data/adgpu/group1/100729.dlg.gz" ) count1 = countrows("SELECT COUNT(*) FROM Ligands") os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file test_data/adgpu/group1/127458.dlg.gz test_data/adgpu/group1/173101.dlg.gz --file test_data/adgpu/group1/100729.dlg.gz --append_results" + "python ../ringtail/cli/rt_process_vs.py write --file test_data/adgpu/group1/127458.dlg.gz test_data/adgpu/group1/173101.dlg.gz --file test_data/adgpu/group1/100729.dlg.gz --append_results" ) count2 = countrows("SELECT COUNT(*) FROM Ligands") @@ -43,14 +43,14 @@ def test_files(self, countrows): def test_file_paths(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_path test_data/adgpu/group1 --file_path test_data/adgpu/group2" + "python ../ringtail/cli/rt_process_vs.py write --file_path test_data/adgpu/group1 --file_path test_data/adgpu/group2" ) count1 = countrows("SELECT COUNT(*) FROM Ligands") os.system("rm output.db") os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_path test_data/adgpu/group1 test_data/adgpu/group2" + "python ../ringtail/cli/rt_process_vs.py write --file_path test_data/adgpu/group1 test_data/adgpu/group2" ) count2 = countrows("SELECT COUNT(*) FROM Ligands") @@ -60,14 +60,14 @@ def test_file_paths(self, countrows): def test_file_list(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d 
--file_list test_data/filelist1.txt --file_list test_data/filelist2.txt" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt --file_list test_data/filelist2.txt" ) count1 = countrows("SELECT COUNT(*) FROM Ligands") os.system("rm output.db") os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt test_data/filelist2.txt" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt test_data/filelist2.txt" ) count2 = countrows("SELECT COUNT(*) FROM Ligands") @@ -77,7 +77,7 @@ def test_file_list(self, countrows): def test_all_file_inputs(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt --file test_data/adgpu/group2/361056.dlg.gz test_data/adgpu/group2/53506.dlg.gz --file_path test_data/adgpu/group3" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt --file test_data/adgpu/group2/361056.dlg.gz test_data/adgpu/group2/53506.dlg.gz --file_path test_data/adgpu/group3" ) count = countrows("SELECT COUNT(*) FROM Ligands") @@ -87,7 +87,7 @@ def test_all_file_inputs(self, countrows): def test_vina_input(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d -m vina --file_path test_data/vina -rf test_data/vina/receptor.pdbqt -sr" + "python ../ringtail/cli/rt_process_vs.py write -m vina --file_path test_data/vina -rf test_data/vina/receptor.pdbqt -sr" ) count = countrows("SELECT COUNT(*) FROM Results") @@ -98,7 +98,7 @@ def test_overwrite(self, countrows): count_old_db = countrows("SELECT COUNT(*) FROM Ligands") os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt --overwrite" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt --overwrite" ) count_new_db = countrows("SELECT COUNT(*) FROM Ligands") assert count_old_db == 2 @@ -111,7 +111,7 @@ def test_overwrite_false(self, 
countrows): assert count_old_db == 3 code = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt" ) assert ( code == 256 @@ -131,7 +131,7 @@ def test_cmdline_config_file(self, countrows): with open(filepath, "w") as f: f.write(json.dumps(data, indent=4)) - os.system("python ../ringtail/cli/rt_process_vs.py write -d --config config.json") + os.system("python ../ringtail/cli/rt_process_vs.py write --config config.json") count = countrows("SELECT COUNT(*) FROM Ligands") @@ -141,17 +141,17 @@ def test_cmdline_config_file(self, countrows): def test_duplicate_handling(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_path test_data/adgpu/group1" + "python ../ringtail/cli/rt_process_vs.py write --file_path test_data/adgpu/group1" ) os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --input_db output.db --file_path test_data/adgpu/group1 --append_results --duplicate_handling ignore" + "python ../ringtail/cli/rt_process_vs.py write --input_db output.db --file_path test_data/adgpu/group1 --append_results --duplicate_handling ignore" ) count = countrows("SELECT COUNT(*) FROM Ligands") assert count == 138 def test_append_results(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --input_db output.db --file_path test_data/adgpu/group2 --append_results" + "python ../ringtail/cli/rt_process_vs.py write --input_db output.db --file_path test_data/adgpu/group2 --append_results" ) count = countrows("SELECT COUNT(*) FROM Ligands") @@ -160,7 +160,7 @@ def test_append_results(self, countrows): def test_save_rec_file(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --input_db output.db --receptor_file test_data/adgpu/4j8m.pdbqt --save_receptor --append_results" + "python ../ringtail/cli/rt_process_vs.py write --input_db output.db --receptor_file 
test_data/adgpu/4j8m.pdbqt --save_receptor --append_results" ) count = countrows( "SELECT COUNT(*) FROM Receptors WHERE receptor_object NOT NULL" @@ -172,7 +172,7 @@ def test_save_rec_file(self, countrows): def test_save_rec_file_gz(self, countrows): os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt --receptor_file test_data/adgpu/4j8m.pdbqt.gz --save_receptor" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt --receptor_file test_data/adgpu/4j8m.pdbqt.gz --save_receptor" ) count = countrows( "SELECT COUNT(*) FROM Receptors WHERE receptor_object NOT NULL" @@ -186,10 +186,10 @@ def test_save_rec_file_gz(self, countrows): class TestOutputs: def test_export_bookmark_csv(self): status1 = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt" ) status2 = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --export_bookmark_csv Ligands" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --export_bookmark_csv Ligands" ) assert status1 == status2 == 0 @@ -200,7 +200,7 @@ def test_export_bookmark_csv(self): def test_export_query_csv(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --export_query_csv 'SELECT * FROM Results'" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --export_query_csv 'SELECT * FROM Results'" ) assert status == 0 @@ -211,7 +211,7 @@ def test_export_query_csv(self): def test_interaction_tolerance(self): status_notol = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file test_data/adgpu/group1/127458.dlg.gz" + "python ../ringtail/cli/rt_process_vs.py write --file test_data/adgpu/group1/127458.dlg.gz" ) conn = sqlite3.connect("output.db") @@ -228,7 +228,7 @@ def test_interaction_tolerance(self): os.system("rm 
output.db") status_tol = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file test_data/adgpu/group1/127458.dlg.gz --interaction_tolerance" + "python ../ringtail/cli/rt_process_vs.py write --file test_data/adgpu/group1/127458.dlg.gz --interaction_tolerance" ) conn = sqlite3.connect("output.db") @@ -244,7 +244,7 @@ def test_interaction_tolerance(self): os.system("rm output.db") status_tol2 = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file test_data/adgpu/group1/127458.dlg.gz --interaction_tolerance 2.0" + "python ../ringtail/cli/rt_process_vs.py write --file test_data/adgpu/group1/127458.dlg.gz --interaction_tolerance 2.0" ) conn = sqlite3.connect("output.db") @@ -269,7 +269,7 @@ def test_interaction_tolerance(self): def test_max_poses(self): os.system("rm output.db") status3 = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt" ) conn = sqlite3.connect("output.db") cur = conn.cursor() @@ -282,7 +282,7 @@ def test_max_poses(self): os.system("rm output.db") status1 = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt --max_poses 1" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt --max_poses 1" ) conn = sqlite3.connect("output.db") cur = conn.cursor() @@ -298,7 +298,7 @@ def test_max_poses(self): os.system("rm output.db") status5 = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt --max_poses 5" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt --max_poses 5" ) conn = sqlite3.connect("output.db") cur = conn.cursor() @@ -318,7 +318,7 @@ def test_max_poses(self): def test_store_all(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt --store_all_poses" + "python 
../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt --store_all_poses" ) conn = sqlite3.connect("output.db") cur = conn.cursor() @@ -341,73 +341,80 @@ class TestFilters: def test_eworst(self): status1 = os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_list test_data/filelist1.txt" + "python ../ringtail/cli/rt_process_vs.py write --file_list test_data/filelist1.txt" ) status2 = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --eworst -15" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --eworst -15" ) assert status1 == status2 == 0 def test_ebest(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --ebest -15" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --ebest -15" ) assert status == 0 def test_leworst(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --leworst -0.4" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --leworst -0.4" ) assert status == 0 def test_lebest(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --leworst -0.4" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --leworst -0.4" ) assert status == 0 def test_epercentile(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --score_percentile 0.1" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --score_percentile 0.1" ) assert status == 0 def test_lepercentile(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --le_percentile 0.1" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --le_percentile 0.1" ) assert status == 0 def test_epercentile_eworst(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --score_percentile 
0.1 --eworst -14" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --score_percentile 0.1 --eworst -14" ) assert status == 0 def test_lepercentile_leworst(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --le_percentile 0.1 --leworst -0.4" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --le_percentile 0.1 --leworst -0.4" ) assert status == 0 def test_name(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input output.db --ligand_name 127458" + "python ../ringtail/cli/rt_process_vs.py read --input output.db --ligand_name 127458" + ) + + assert status == 0 + + def test_ligand_filters(self, countrows): + status = os.system( + """python ../ringtail/cli/rt_process_vs.py read --input output.db --ligand_substruct "NC" --ligand_operator AND""" ) assert status == 0 def test_hbcount(self, countrows): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --hb_count 5" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --hb_count 5" ) count = countrows("SELECT COUNT(*) FROM passing_results") @@ -416,112 +423,112 @@ def test_hbcount(self, countrows): def test_hb1(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -hb A:LYS:162:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -hb A:LYS:162:" ) assert status == 0 def test_hb2(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -hb :LYS:162:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -hb :LYS:162:" ) assert status == 0 def test_hb3(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -hb :LYS::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -hb :LYS::" ) assert status == 0 def test_hb4(self): status = os.system( - "python 
../ringtail/cli/rt_process_vs.py read -d --input_db output.db -hb A:LYS::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -hb A:LYS::" ) assert status == 0 def test_hb5(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -hb A::162:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -hb A::162:" ) assert status == 0 def test_hb6(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -hb A:::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -hb A:::" ) assert status == 0 def test_hb7(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -hb ::162:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -hb ::162:" ) assert status == 0 def test_vdw1(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -vdw A:VAL:279:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -vdw A:VAL:279:" ) assert status == 0 def test_vdw2(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -vdw :VAL:279:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -vdw :VAL:279:" ) assert status == 0 def test_vdw3(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -vdw :VAL::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -vdw :VAL::" ) assert status == 0 def test_vdw4(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -vdw A:VAL::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -vdw A:VAL::" ) assert status == 0 def test_vdw5(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -vdw A::279:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -vdw A::279:" ) 
assert status == 0 def test_vdw6(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -vdw A:::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -vdw A:::" ) assert status == 0 def test_vdw7(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db -vdw ::279:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -vdw ::279:" ) assert status == 0 def test_all_filters(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --eworst -15 --ebest -16 --leworst -0.4 --lebest -0.5 --score_percentile 99 --le_percentile 99 --ligand_name 127458 --hb_count 5 --react_any -hb A:LYS:162: -vdw A:VAL:279:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --eworst -15 --ebest -16 --leworst -0.4 --lebest -0.5 --score_percentile 99 --le_percentile 99 --ligand_name 127458 --hb_count 5 --react_any -hb A:LYS:162: -vdw A:VAL:279:" ) assert status == 0 def test_export_sdf(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -e -4 -sdf . -d " + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db -e -4 -sdf . 
" ) import glob @@ -536,7 +543,7 @@ def test_export_sdf(self): def test_filters_value_error(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --score_percentile 109" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --score_percentile 109" ) # checking that code exited with error since a percentile cannot be above 100 assert status != 0 @@ -545,60 +552,60 @@ def test_filters_value_error(self): def test_react_any(self): # write new db with reactive data os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --output_db output.db --file_path test_data/reactive --receptor_file test_data/reactive/4j8m_m_rigid.pdbqt" + "python ../ringtail/cli/rt_process_vs.py write --output_db output.db --file_path test_data/reactive --receptor_file test_data/reactive/4j8m_m_rigid.pdbqt" ) status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --react_any" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --react_any" ) assert status == 0 def test_react1(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --reactive_interactions A:TYR:212:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --reactive_interactions A:TYR:212:" ) assert status == 0 def test_react2(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --reactive_interactions :TYR:212:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --reactive_interactions :TYR:212:" ) assert status == 0 def test_react3(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --reactive_interactions :TYR::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --reactive_interactions :TYR::" ) assert status == 0 def test_react4(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db 
--reactive_interactions A:TYR::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --reactive_interactions A:TYR::" ) assert status == 0 def test_react5(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --reactive_interactions A::212:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --reactive_interactions A::212:" ) assert status == 0 def test_react6(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --reactive_interactions A:::" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --reactive_interactions A:::" ) assert status == 0 def test_react7(self): status = os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --reactive_interactions ::212:" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --reactive_interactions ::212:" ) assert status == 0 @@ -610,19 +617,19 @@ class TestOtherScripts: def test_rt_compare(self): # first database os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --file_path test_data/adgpu/group1" + "python ../ringtail/cli/rt_process_vs.py write --file_path test_data/adgpu/group1" ) # second database os.system( - "python ../ringtail/cli/rt_process_vs.py write -d --output_db output2.db --file_path test_data/adgpu/group1" + "python ../ringtail/cli/rt_process_vs.py write --output_db output2.db --file_path test_data/adgpu/group1" ) # filter producing 30 ligands os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output.db --eworst -6" + "python ../ringtail/cli/rt_process_vs.py read --input_db output.db --eworst -6" ) # filter producing 5 ligands os.system( - "python ../ringtail/cli/rt_process_vs.py read -d --input_db output2.db --eworst -7" + "python ../ringtail/cli/rt_process_vs.py read --input_db output2.db --eworst -7" ) # should produce 25 ligands os.system( From c14e86732d27eb8e1bac927b95a108d7b93a1b22 Mon 
Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 26 Sep 2024 21:28:39 -0700 Subject: [PATCH 48/63] fixed ligand substruct bug and handling of ligand filters --- ringtail/cloptionparser.py | 39 ++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/ringtail/cloptionparser.py b/ringtail/cloptionparser.py index bc0be89a..bf87a48b 100644 --- a/ringtail/cloptionparser.py +++ b/ringtail/cloptionparser.py @@ -485,7 +485,7 @@ def cmdline_parser(defaults: dict = {}): "-n", "--ligand_name", help="specify ligand name(s). Will combine name filters with OR", - action="store", + action="append", type=str, metavar="STRING", nargs="+", @@ -501,7 +501,7 @@ def cmdline_parser(defaults: dict = {}): ligand_group.add_argument( "--ligand_substruct", help="SMARTS pattern(s) for substructure matching", - action="store", + action="append", type=str, metavar="STRING", nargs="+", @@ -509,7 +509,7 @@ def cmdline_parser(defaults: dict = {}): ligand_group.add_argument( "--ligand_substruct_pos", help="SMARTS, index of atom in SMARTS, cutoff dist, and target XYZ coords", - action="store", + action="append", type=str, metavar="STRING", nargs="+", @@ -845,21 +845,42 @@ def process_options(self, parsed_opts): # make dictionary for ligand filters ligand_kw = Filters.get_filter_keys("ligand") ligand_filters = {} + # parse the ligand filters, depending on how the keywords are used they will be a list of list or list of lists for _type in ligand_kw: ligand_filter_value = getattr(parsed_opts, _type) + # just a simple string if _type == ("ligand_max_atoms"): ligand_filters[_type] = ligand_filter_value continue + # don't include None values if ligand_filter_value is (None): continue ligand_filters[_type] = [] - for filter in ligand_filter_value: - ligand_filters[_type].append(filter) + # the other ligand filters can come as [[filter1,filter2,filter3]] or [[filter1],[filter2, filter3]] + for filter_list in ligand_filter_value: + # if more than one filter in list, 
go through each + if len(filter_list) > 1: + for filter in filter_list: + if _type == "ligand_subtruct_pos": + # make a lits of the six values + ligand_filters[_type].append( + [i for i in filter.split(" ")] + ) + else: + ligand_filters[_type].append(filter) + else: + if _type == "ligand_subtruct_pos": + # if only one item in list, append to ligand list + ligand_filters[_type].append( + [i for i in filter_list[0].split(" ")] + ) + else: + ligand_filters[_type].append(filter_list[0]) ligand_filters["ligand_operator"] = parsed_opts.ligand_operator if ( - ligand_filters["ligand_max_atoms"] is not None - and len(ligand_filters["ligand_max_atoms"]) % 6 != 0 + "ligand_substruct_pos" in ligand_filters + and len(ligand_filters["ligand_substruct_pos"][0]) % 6 != 0 ): msg = "--ligand_substruct_pos needs groups of 6 values:\n" msg += " 1. Ligand SMARTS\n" @@ -868,7 +889,9 @@ def process_options(self, parsed_opts): msg += " 4. X\n" msg += " 5. Y\n" msg += " 6. Z\n" - msg += 'For example --ligand_substruct_pos "[C][Oh]" 1 1.5 -20. 42. 
-7.1' + msg += ( + 'For example --ligand_substruct_pos "[C][Oh] 1 1.5 -20 42 -7.1"' + ) raise OptionError(msg) for k, v in ligand_filters.items(): From a2e4117edc719e78d56aab271255f8694bb777e9 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 27 Sep 2024 09:36:44 -0700 Subject: [PATCH 49/63] ligand operator set to default OR until changed in code --- ringtail/ringtailoptions.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ringtail/ringtailoptions.py b/ringtail/ringtailoptions.py index 1a99f239..dd5a6c4d 100644 --- a/ringtail/ringtailoptions.py +++ b/ringtail/ringtailoptions.py @@ -577,16 +577,16 @@ class Filters(RTOptions): "type": list, "description": "SMARTS pattern(s) for substructure matching, e.g., [''[Oh]C' 0 1.2 -5.5 10.0 15.5'] -> ['smart_string index_of_positioned_atom cutoff_distance x y z'].", }, - "ligand_max_atoms": { - "default": None, - "type": int, - "description": "Maximum number of heavy atoms a ligand may have.", - }, "ligand_operator": { "default": None, "type": str, "description": "Logical join operator for multiple SMARTS.", }, + "ligand_max_atoms": { + "default": None, + "type": int, + "description": "Maximum number of heavy atoms a ligand may have.", + }, } def __init__(self): @@ -595,7 +595,7 @@ def __init__(self): def checks(self): """Ensures all values are internally consistent and valid. Runs once after all values are set initially, then every time a value is changed.""" - if hasattr(self, "ligand_operator"): + if hasattr(self, "ligand_max_atoms"): if self.eworst is not None and self.score_percentile is not None: logger.warning( "Cannot use 'eworst' cutoff with 'score_percentile'. Overiding 'score_percentile' with 'eworst'." @@ -625,9 +625,7 @@ def checks(self): if self.ligand_operator not in ["OR", "AND"] and ( self.ligand_substruct or self.ligand_substruct_pos ): - logger.warning( - f"Given 'ligand_operator' {self.ligand_operator} not allowed with 'ligand_substruct' or 'ligand_substruct_pos'. 
Will be set to default 'OR'." - ) + logger.debug(f"'ligand_operator' set to default 'OR'.") self.ligand_operator = "OR" if self.max_miss < 0: From 07aeb6d270fb97ad46b7822fee55c3d48fc0bd38 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 27 Sep 2024 09:41:22 -0700 Subject: [PATCH 50/63] changed order ligand operator appears in --- ringtail/ringtailoptions.py | 10 +++++----- test/test_units.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ringtail/ringtailoptions.py b/ringtail/ringtailoptions.py index dd5a6c4d..b4e34b86 100644 --- a/ringtail/ringtailoptions.py +++ b/ringtail/ringtailoptions.py @@ -567,6 +567,11 @@ class Filters(RTOptions): "type": list, "description": "Specify ligand name(s). Will combine name filters with 'OR'.", }, + "ligand_operator": { + "default": None, + "type": str, + "description": "Logical join operator for multiple SMARTS.", + }, "ligand_substruct": { "default": None, "type": list, @@ -577,11 +582,6 @@ class Filters(RTOptions): "type": list, "description": "SMARTS pattern(s) for substructure matching, e.g., [''[Oh]C' 0 1.2 -5.5 10.0 15.5'] -> ['smart_string index_of_positioned_atom cutoff_distance x y z'].", }, - "ligand_operator": { - "default": None, - "type": str, - "description": "Logical join operator for multiple SMARTS.", - }, "ligand_max_atoms": { "default": None, "type": int, diff --git a/test/test_units.py b/test/test_units.py index d4ab77da..82ea6572 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -315,10 +315,10 @@ def test_generate_interactions_prepare_filters(self): "react_any": None, "max_miss": 0, "ligand_name": None, + "ligand_operator": None, "ligand_substruct": None, "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": None, } in test_filters assert { @@ -335,10 +335,10 @@ def test_generate_interactions_prepare_filters(self): "react_any": None, "max_miss": 0, "ligand_name": None, + "ligand_operator": None, "ligand_substruct": None, 
"ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": None, } in test_filters assert { @@ -355,10 +355,10 @@ def test_generate_interactions_prepare_filters(self): "react_any": None, "max_miss": 0, "ligand_name": None, + "ligand_operator": None, "ligand_substruct": None, "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": None, } in test_filters assert { @@ -375,10 +375,10 @@ def test_generate_interactions_prepare_filters(self): "react_any": None, "max_miss": 0, "ligand_name": None, + "ligand_operator": None, "ligand_substruct": None, "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": None, } in test_filters assert { @@ -395,10 +395,10 @@ def test_generate_interactions_prepare_filters(self): "react_any": None, "max_miss": 0, "ligand_name": None, + "ligand_operator": None, "ligand_substruct": None, "ligand_substruct_pos": None, "ligand_max_atoms": None, - "ligand_operator": None, } in test_filters assert len(test_filters) == 5 @@ -664,9 +664,9 @@ def test_bookmark_info(self, dbquery): "react_any": None, "max_miss": 0, "ligand_name": None, + "ligand_operator": None, "ligand_substruct": None, "ligand_substruct_pos": None, - "ligand_operator": None, "ligand_max_atoms": None, } assert bookmark_filters_db_str == json.dumps(filters) From a66e6841126a56b176b5f4b5239f02087a8ac332 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 27 Sep 2024 09:52:49 -0700 Subject: [PATCH 51/63] fixed bug with dropping bookmark --- ringtail/ringtailcore.py | 9 ++++----- test/test_units.py | 9 +++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index 05fb6b83..47c35ca0 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -869,10 +869,10 @@ def set_filters( react_any=None, max_miss=None, ligand_name=None, + ligand_operator=None, ligand_substruct=None, ligand_substruct_pos=None, ligand_max_atoms=None, - ligand_operator=None, dict: 
dict = None, ): """ @@ -913,8 +913,8 @@ def set_filters( "hb_count": hb_count, "react_any": react_any, "max_miss": max_miss, - "ligand_name": ligand_name, "ligand_operator": ligand_operator, + "ligand_name": ligand_name, "ligand_substruct": ligand_substruct, "ligand_substruct_pos": ligand_substruct_pos, "ligand_max_atoms": ligand_max_atoms, @@ -1211,10 +1211,10 @@ def filter( react_any=None, max_miss=None, ligand_name=None, + ligand_operator=None, ligand_substruct=None, ligand_substruct_pos=None, ligand_max_atoms=None, - ligand_operator=None, filters_dict: dict | None = None, # other processing options: enumerate_interaction_combs: bool = False, @@ -1313,10 +1313,10 @@ def filter( react_any=react_any, max_miss=max_miss, ligand_name=ligand_name, + ligand_operator=ligand_operator, ligand_substruct=ligand_substruct, ligand_substruct_pos=ligand_substruct_pos, ligand_max_atoms=ligand_max_atoms, - ligand_operator=ligand_operator, dict=filters_dict, ) @@ -1405,7 +1405,6 @@ def filter( self.storageman.drop_bookmark(self.storageman.bookmark_name) # else produce a bookmark for each interaction combination elif not write_one_bookmark: - # TODO in this case max_miss has to be the exact number of interactions in each combo interaction_combs = self._generate_interaction_combinations( self.filters.max_miss ) diff --git a/test/test_units.py b/test/test_units.py index 82ea6572..160d05f4 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -117,6 +117,7 @@ def test_filter(self): bookmarks = rtc.get_bookmark_names() assert len(bookmarks) == 1 assert bookmarks[0] == "union_bookmark" + rtc.drop_bookmark("union_bookmark") def test_enumerate_interaction_combinations(self): # first test without enumerate, check number of passing union as well as number of bookmarks @@ -135,13 +136,13 @@ def test_enumerate_interaction_combinations(self): assert count_ligands_passing == 33 # make sure additional bookmarks were created for the enumerated combinations - bookmarks_with_new = 
rtc.get_bookmark_names() + bookmarks = rtc.get_bookmark_names() # This filtering session should produce 6 bookmarks - assert len(bookmarks_with_new) - len(bookmarks_old) == 6 + assert len(bookmarks) == 6 # check that naming works properly - assert "enumerated_bookmark_0" in bookmarks_with_new - assert "enumerated_bookmark_union" in bookmarks_with_new + assert "enumerated_bookmark_0" in bookmarks + assert "enumerated_bookmark_union" in bookmarks def test_ligand_filters(self): rtc = RingtailCore(db_file="output.db") From 7d91542c9b1e4f2a09c354336426fbfbf20b624f Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 27 Sep 2024 09:54:02 -0700 Subject: [PATCH 52/63] fixed drop bookmark bug, removed some TODOs and fixed a bug in a query --- ringtail/storagemanager.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index ab491989..124355a2 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -39,7 +39,7 @@ class StorageManager: _db_schema_code_compatibility = { "1.0.0": ["1.0.0"], "1.1.0": ["1.1.0"], - "2.0.0": ["2.0.0"], + "2.0.0": ["2.0.0", "2.1.0"], } """Base class for a generic virtual screening database object. 
@@ -1753,14 +1753,15 @@ def drop_bookmark(self, bookmark_name: str): DatabaseInsertionError """ - query_drop = "DROP VIEW IF EXISTS {0}".format(bookmark_name) - query_delete = "DELETE FROM Bookmarks WHERE Bookmark_name = '{0}'".format( - bookmark_name - ) + query_drop = f"DROP VIEW IF EXISTS {bookmark_name}" + query_delete = f"DELETE FROM Bookmarks WHERE Bookmark_name = '{bookmark_name}'" try: - self._run_query(query_drop) - self._run_query(query_delete) + cur = self.conn.execute(query_drop) + cur.execute(query_delete) + self.conn.commit() + cur.close() + self.logger.info(f"Dropped bookmark {bookmark_name}.") except sqlite3.OperationalError as e: raise DatabaseInsertionError( f"Error while attempting to drop bookmark {bookmark_name}" @@ -1978,7 +1979,6 @@ def fetch_single_pose_properties(self, pose_ID: int): return self._run_query(query) def fetch_interaction_info_by_index(self, interaction_idx): - # TODO refactor-> make it work for one or more indices """Returns tuple containing interaction info for given interaction_idx Args: @@ -2125,7 +2125,6 @@ def to_dataframe(self, requested_data: str, table=True) -> pd.DataFrame: return pd.read_sql_query(requested_data, self.conn) def _get_length_of_table(self, table_name: str): - # TODO check if index on table, and use that row if possible """ Finds the rowcount/length of a table based on the rowid @@ -2301,7 +2300,6 @@ def fetch_clustered_similars(self, ligname: str): raise ValueError( f"Given cluster number {cluster_choice} cannot be converted to int. Please be sure you are specifying integer." 
) - # TODO might be able to refactor these queries query_ligand_cluster = cur.execute( f"SELECT {cluster_col_choice} FROM Ligand_clusters WHERE pose_id IN (SELECT Pose_ID FROM Results WHERE LigName LIKE '{ligname}')" ).fetchone() @@ -2600,7 +2598,9 @@ def _generate_result_filtering_query(self, filters_dict): ) if int_query: # add with a join statement - unclustered_query += "JOIN " + int_query + " ON R.Pose_ID = I.Pose_ID " + unclustered_query += ( + "JOIN (" + int_query + ") I ON R.Pose_ID = I.Pose_ID " + ) if lig_query: # add with a join statement unclustered_query += ( @@ -2620,7 +2620,6 @@ def _generate_result_filtering_query(self, filters_dict): # if not clustering, rename query query = unclustered_query # choose columns to be selected from filtering_window - # TODO when to use "DISTINCT" query_select_string = f"""SELECT {", ".join("R." + column for column in outfield_columns)} FROM {filtering_window} R """ # adding if we only want to keep one pose per ligand (will keep first entry) if not self.output_all_poses: @@ -2631,7 +2630,6 @@ def _generate_result_filtering_query(self, filters_dict): output_query = query_select_string + query view_query = f"SELECT * FROM {filtering_window} R " + query - print(" output query: ", output_query) return output_query, view_query def _prepare_cluster_query(self, unclustered_query: str) -> str | None: @@ -2785,7 +2783,6 @@ def mp_wrapper(input_tpl): cluster_query_string = "R.Pose_ID = " + " OR R.Pose_ID = ".join( fp_rep_poseids ) - print(" cluster_query_string: ", cluster_query_string) return cluster_query_string def _prepare_interaction_filtering_query( @@ -2852,7 +2849,7 @@ def _prepare_indices_for_query(interactions: list): # building the query # 1. 
select pose id, call CASE, in paranthesis because grouping with different query - query = "(SELECT Pose_ID FROM (SELECT Pose_ID " + query = "SELECT Pose_ID FROM (SELECT Pose_ID " if or_include_interactions or or_exclude_interactions: # add the case statements query += ", CASE " @@ -2896,7 +2893,7 @@ def _prepare_indices_for_query(interactions: list): + ") " ) # 4. add grouping and wildcard for total interactions minus max_miss, essentially - query += f") GROUP BY Pose_ID HAVING COUNT(DISTINCT filtered_interactions) >= ({num_of_interactions})) I " + query += f") GROUP BY Pose_ID HAVING COUNT(DISTINCT filtered_interactions) >= ({num_of_interactions}) " return query @@ -2979,6 +2976,10 @@ def _ligand_substructure_position_filter(self, ligand_filters_dict: dict): if len(pose_id_list) > 0: queries.append("Pose_ID IN ({0})".format(",".join(pose_id_list))) cur.close() + if not queries: + raise OptionError( + "There are no ligands passing the 'ligand_substruct_pos' filter, please revise your filter query." 
+ ) return queries def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: @@ -3100,7 +3101,6 @@ def _get_interaction_indices(self, interaction_list) -> iter: return self._run_query(sql_string).fetchall() def _generate_ligand_filtering_query(self, ligand_filters: dict) -> str: - # TODO want to clean this one up """write string to select from ligand table Args: From cc6c436a969287e9fdf10c22fe4bd417c2524d7c Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 30 Sep 2024 14:22:02 -0700 Subject: [PATCH 53/63] removed warning for nonunion bookmark if enumerate_intearction_combs is false --- ringtail/ringtailcore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index 47c35ca0..677bde25 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -1558,6 +1558,7 @@ def ligands_rdkit_mol(self, bookmark_name=None, write_nonpassing=False) -> dict: try: max_miss_present = bool( bookmark_filters["max_miss"] > 0 + and not bookmark_filters["enumerate_interaction_combs"] and not "_union" in self.storageman.bookmark_name ) except: From d3619326d3c2bd21a6a21f82e0b0af22e9c939f8 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 3 Oct 2024 15:49:59 -0700 Subject: [PATCH 54/63] updated doc strings and removed todos, redundant print statements, etc --- ringtail/cloptionparser.py | 2 +- ringtail/outputmanager.py | 30 +++++--- ringtail/parsers.py | 1 - ringtail/receptormanager.py | 1 - ringtail/ringtailcore.py | 2 +- ringtail/storagemanager.py | 140 ++++++++++++++++++++---------------- ringtail/util.py | 9 +++ 7 files changed, 112 insertions(+), 73 deletions(-) diff --git a/ringtail/cloptionparser.py b/ringtail/cloptionparser.py index bf87a48b..c37450a2 100644 --- a/ringtail/cloptionparser.py +++ b/ringtail/cloptionparser.py @@ -508,7 +508,7 @@ def cmdline_parser(defaults: dict = {}): ) ligand_group.add_argument( "--ligand_substruct_pos", - help="SMARTS, index of atom in SMARTS, cutoff dist, and target XYZ 
coords", + help='"SMARTS, index of atom in SMARTS, cutoff dist, and target XYZ coords, group by "', action="append", type=str, metavar="STRING", diff --git a/ringtail/outputmanager.py b/ringtail/outputmanager.py index c164bcb9..c5cec804 100644 --- a/ringtail/outputmanager.py +++ b/ringtail/outputmanager.py @@ -35,9 +35,11 @@ def __init__(self, log_file=None, export_sdf_path=None): self.logger = LOGGER def __enter__(self): + """Opening outputmanager as a context manager""" self.open_logfile() def __exit__(self, exc_type, exc_value, traceback): + """Closing outputmanager as a context manager""" self.close_logfile() # -#-#- Log file methods -#-#-# @@ -63,7 +65,7 @@ def open_logfile(self, write_filters_header=True): raise OutputError("Error while creating log file") from e def close_logfile(self): - """Closes the log file properly""" + """Closes the log file properly and reset file pointer to filename""" if self._log_open: self.log_file.close() self.log_file = os.path.basename( @@ -100,7 +102,7 @@ def write_filter_log(self, lines): except Exception as e: raise OutputError("Error occurred during log writing") from e - def _write_log_line(self, line): + def _write_log_line(self, line: str): """write a single row to the log file Args: @@ -115,7 +117,7 @@ def _write_log_line(self, line): except Exception as e: raise OutputError(f"Error writing line {line} to log") from e - def log_num_passing_ligands(self, number_passing_ligands): + def log_num_passing_ligands(self, number_passing_ligands: int): """ Write the number of ligands which pass given filter to log file @@ -128,9 +130,7 @@ def log_num_passing_ligands(self, number_passing_ligands): try: self.log_file.write("\n") self.log_file.write( - "Number passing ligands: {num} \n".format( - num=str(number_passing_ligands) - ) + f"Number passing ligands: {str(number_passing_ligands)} \n" ) self.log_file.write("---------------\n") except Exception as e: @@ -215,10 +215,16 @@ def write_filters_to_log( raise OutputError("Error 
occurred while writing filters to log") from e def write_maxmiss_union_header(self): + """ + Properly formats header for the log file if using max_miss and enumerate_interaction_combs + """ self.log_file.write("\n---------------\n") self.log_file.write("Max Miss Union:\n") def write_find_similar_header(self, query_ligname, cluster_name): + """ + Properly formats header for the log file find_similar_ligands + """ if not self._log_open: self.open_logfile(write_filters_header=False) self.log_file.write("\n---------------\n") @@ -274,6 +280,14 @@ def write_out_mol(self, filename, mol, flexres_mols, properties): raise OutputError("Error occurred while writing SDF from RDKit Mol") from e def write_receptor_pdbqt(self, recname: str, receptor_compbytes): + """ + Writes a pdbqt file from receptor "blob" + + Args: + recname (str): name of receptor to use in output filename + receptor_compbytes (blob): receptor blob + """ + if not recname.endswith(".pdbqt"): recname = recname + ".pdbqt" receptor_str = ReceptorManager.blob2str(receptor_compbytes) @@ -364,9 +378,7 @@ def plot_all_data(self, binned_data): mappable=cm.ScalarMappable( colors.Normalize(vmin=min(bin_counts), vmax=max(bin_counts)), ), - cax=self.ax.inset_axes( - [0.85, 0.1, 0.05, 0.8] - ), # TODO dimensions not quite right + cax=self.ax.inset_axes([0.85, 0.1, 0.05, 0.8]), label="Scatterplot bin count", ) self.ax.set_xlabel("Best docking score / kcal/mol") diff --git a/ringtail/parsers.py b/ringtail/parsers.py index a9e7b415..72d9eea8 100644 --- a/ringtail/parsers.py +++ b/ringtail/parsers.py @@ -12,7 +12,6 @@ from .logutils import LOGGER as logger -# TODO add a second zip method (bz2), clean up duplication between dlg and pdbqt def parse_single_dlg(fname): """Parse an ADGPU DLG file uncompressed or gzipped diff --git a/ringtail/receptormanager.py b/ringtail/receptormanager.py index a846f881..be79c607 100644 --- a/ringtail/receptormanager.py +++ b/ringtail/receptormanager.py @@ -9,7 +9,6 @@ class ReceptorManager: - # 
TODO add b2z method too? """Class with methods dealing with formatting of receptor information""" @staticmethod diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index 677bde25..48c6e581 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -1579,7 +1579,7 @@ def ligands_rdkit_mol(self, bookmark_name=None, write_nonpassing=False) -> dict: self.storageman.bookmark_name + "_union" ) self.logger.warning( - "Requested 'export_sdf_path' with 'max_miss' present in the bookmark filter. Exported SDFs will be for union of interaction combinations." + "Requested 'export_sdf_path' with 'max_miss' and 'enumerate_interaction_combs' used in the filtering process. Exported SDFs will be for union of interaction combinations." ) # if not, raise error else: diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 124355a2..8caefab4 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -4,7 +4,6 @@ # Ringtail storage adaptors # -from traceback import format_exc import sqlite3 import time import json @@ -39,7 +38,7 @@ class StorageManager: _db_schema_code_compatibility = { "1.0.0": ["1.0.0"], "1.1.0": ["1.1.0"], - "2.0.0": ["2.0.0", "2.1.0"], + "2.0.0": ["2.0.0", "2.0.1"], } """Base class for a generic virtual screening database object. @@ -224,7 +223,7 @@ def filter_results(self, all_filters: dict, suppress_output=False) -> iter: # NOTE will cause error when any version int is > 10 # catch version 1.0.0 where returned db_rt_version will be 0 if db_rt_version == 0: - db_rt_version = 100 # TODO update this for new ringtail version and db schema version + db_rt_version = 100 raise StorageError( f"Input database was created with Ringtail v{'.'.join([i for i in db_rt_version[:2]] + [db_rt_version[2:]])}. Confirm that this matches current Ringtail version and use Ringtail update script(s) to update database if needed." 
) @@ -501,6 +500,9 @@ def __init__( # region Methods for inserting into/removing from the database def _create_tables(self): + """ + Creates all tables needed for a Ringtail database of a specific version + """ self._create_results_table() self._create_ligands_table() self._create_receptors_table() @@ -1392,7 +1394,6 @@ def _generate_interaction_tuples(cls, interaction_dictionaries: list): return list(interactions) def _insert_interaction_index_row(self, interaction_tuple) -> tuple: - # change method to _insert_interaction_index """ Writes unique interactions and returns the interaction_id of the given interaction @@ -1535,7 +1536,7 @@ def _insert_cluster_data( self.conn.commit() def _create_indices(self): - """Create index containing possible filter and order by columns + """Create index for specified tables and columns. 'ak' stands for 'alternate key' and is prepended to index name to avoid naming conflicts Raises: StorageError @@ -1554,6 +1555,7 @@ def _create_indices(self): cur.execute( "CREATE INDEX IF NOT EXISTS ak_interactions ON Interactions(Pose_id, interaction_id)" ) + cur.execute("CREATE INDEX IF NOT EXISTS ak_ligands ON Ligands(LigName)") self.conn.commit() cur.close() self.logger.info( @@ -1663,6 +1665,7 @@ def fetch_bookmark(self, bookmark_name: str) -> sqlite3.Cursor: def create_bookmark(self, name, query, temp=False, add_poseID=False, filters={}): """Takes name and selection query and creates a bookmark of name. Bookmarks are Ringtail specific views that whose information is stored in the 'Bookmark' table. 
+ #FIXME bug where ligand filter only results are not added as bookmarks Args: name (str): Name for bookmark which will be created @@ -1712,7 +1715,7 @@ def _create_view(self, name, query): cur.close() except sqlite3.OperationalError as e: raise DatabaseViewCreationError( - "Error ({1}) creating view from query \n{0}".format(query, e) + f"Error ({e}) creating view from query \n{query}" ) from e def _insert_bookmark_info(self, name: str, sqlite_query: str, filters={}): @@ -1930,7 +1933,7 @@ def fetch_flexres_info(self): except sqlite3.OperationalError as e: raise DatabaseQueryError("Error retrieving flexible residue info") from e - def fetch_passing_ligand_output_info(self): + def fetch_passing_ligand_output_info(self) -> iter: """fetch information required by vsmanager for writing out molecules Returns: @@ -1940,7 +1943,7 @@ def fetch_passing_ligand_output_info(self): query = "SELECT LigName, ligand_smile, atom_index_map, hydrogen_parents FROM Ligands WHERE LigName IN (SELECT DISTINCT LigName FROM passing_temp)" return self._run_query(query) - def fetch_single_ligand_output_info(self, ligname): + def fetch_single_ligand_output_info(self, ligname) -> str: """get output information for given ligand Args: @@ -1965,7 +1968,7 @@ def fetch_single_ligand_output_info(self, ligname): f"Error retrieving ligand info for {ligname}" ) from e - def fetch_single_pose_properties(self, pose_ID: int): + def fetch_single_pose_properties(self, pose_ID: int) -> iter: """fetch coordinates for pose given by pose_ID Args: @@ -1978,7 +1981,7 @@ def fetch_single_pose_properties(self, pose_ID: int): query = f"SELECT Pose_ID, docking_score, leff, ligand_coordinates, flexible_res_coordinates FROM Results WHERE Pose_ID={pose_ID}" return self._run_query(query) - def fetch_interaction_info_by_index(self, interaction_idx): + def fetch_interaction_info_by_index(self, interaction_idx) -> tuple: """Returns tuple containing interaction info for given interaction_idx Args: @@ -1992,7 +1995,7 @@ def 
fetch_interaction_info_by_index(self, interaction_idx): ) return self._run_query(query).fetchone()[1:] # cut off interaction index - def fetch_pose_interactions(self, Pose_ID): + def fetch_pose_interactions(self, Pose_ID) -> iter: """ Fetch all interactions parameters belonging to a Pose_ID @@ -2234,7 +2237,7 @@ def get_maxmiss_union(self, total_combinations: int): """ selection_strs = [] view_strs = [] - outfield_list = self._generate_outfield_string() + outfield_list = self._generate_outfield_list() for i in range(total_combinations): selection_strs.append( f"""SELECT {", ".join(outfield_list)} FROM {self.bookmark_name + '_' + str(i)}""" @@ -2373,7 +2376,7 @@ def _calc_percentile_cutoff(self, percentile: float, column="docking_score"): except sqlite3.OperationalError as e: raise StorageError("Error while generating percentile query") from e - def _generate_outfield_string(self): + def _generate_outfield_list(self): """list describing outfields to be written Returns: @@ -2387,9 +2390,7 @@ def _generate_outfield_string(self): for outfield in outfields_list: if outfield not in self._data_kw_groups("outfield_options"): raise OptionError( - "{out_f} is not a valid output option. Please see rt_process_vs.py --help for allowed options".format( - out_f=outfield - ) + f"{outfield} is not a valid output option. 
Please see rt_process_vs.py --help for allowed options" ) return [self.field_to_column_name[field] for field in outfields_list] @@ -2447,12 +2448,10 @@ def _process_filters_for_query(self, filters_dict: dict): ) continue if v > 0: - numerical_filters.append("num_hb > {value}".format(value=v)) + numerical_filters.append(f"num_hb > {v}") else: # if value is negative, it means less than specified number of hydrogen bonds - numerical_filters.append( - "num_hb <= {value}".format(value=-1 * v) - ) + numerical_filters.append(f"num_hb <= {-v}") interaction_name_to_letter = { "vdw_interactions": "V", "hb_interactions": "H", @@ -2498,9 +2497,10 @@ def _generate_result_filtering_query(self, filters_dict): Returns: str: SQLite-formatted string for filtering query """ + # table to filter over filtering_window = "Results" - outfield_columns = self._generate_outfield_string() + outfield_columns = self._generate_outfield_list() num_query = "" int_query = "" lig_query = "" @@ -2523,12 +2523,18 @@ def _generate_result_filtering_query(self, filters_dict): raise OptionError( "Cannot use 'score_percentile' or 'le_percentile' with 'filter_bookmark'." ) - # filtering window can be specified bookmark, or whole database (or other reduced versions of db) + # filtering window can be specified bookmark, as opposed to entire database using Results table filtering_window = self.filter_bookmark # process filter values to lists and dicts that are easily incorporated in sql queries processed_filters = self._process_filters_for_query(filters_dict) + # raise error if no filters are present and no clusterings + if not processed_filters and not clustering: + raise DatabaseQueryError( + "Ringtail query strings are empty, please check filter options." 
+ ) + # check if clustering clustering = bool(self.mfpt_cluster or self.interaction_cluster) # if clustering without filtering @@ -2538,16 +2544,12 @@ def _generate_result_filtering_query(self, filters_dict): unclustered_query = f"SELECT R.Pose_id FROM {filtering_window} R " if not processed_filters and filtering_window == "Results": self.logger.warning( - "If clustering is not performed on a pre-filtered bookmark, the clustering process will be very slow." + "If clustering is not performed on a pre-filtered bookmark, the clustering process can be slow." ) else: # start with empty string, will prepend SELECT statement later unclustered_query = "" - # raise error if no filters are present and no clusterings - if not processed_filters and not clustering: - raise DatabaseQueryError( - "Query strings are empty. Please check filter options and ensure requested interactions are present." - ) + # create query string from filters if present if processed_filters: # start stringing together queries @@ -2556,7 +2558,6 @@ def _generate_result_filtering_query(self, filters_dict): num_query = " AND ".join( ["R." + filter for filter in processed_filters["num_filters"]] ) - # check for interactions and prepare for query if "int_filters" in processed_filters: # if interaction filters are present and valid, two lists of included and excluded interactions are returned @@ -2574,7 +2575,6 @@ def _generate_result_filtering_query(self, filters_dict): exclude_interactions, processed_filters["max_miss"], ) - # check if ligand filters and prepare for query if "lig_filters" in processed_filters: lig_filters = processed_filters["lig_filters"] @@ -2586,16 +2586,17 @@ def _generate_result_filtering_query(self, filters_dict): ): ligand_queries.append( self._generate_ligand_filtering_query(lig_filters) - ) + ) # TODO here? 
# if complex ligand filter, generate partial query if "ligand_substruct_pos" in lig_filters: ligand_queries.append( self._ligand_substructure_position_filter(lig_filters) ) - # join all ligand queries that are not empty + # join all ligand queries that are not empty #TODO this should have ran some stuff already, no? lig_query = " AND ".join( [lig_filter for lig_filter in ligand_queries if lig_filter] - ) + ) # TODO + # if filter queries exist for each group, string them together appropriately if int_query: # add with a join statement unclustered_query += ( @@ -2608,6 +2609,7 @@ def _generate_result_filtering_query(self, filters_dict): ) if num_query: unclustered_query += "WHERE " + num_query + # if clustering is requested, do that before saving view or filtering results for output if clustering: # add appropriate select @@ -2619,6 +2621,7 @@ def _generate_result_filtering_query(self, filters_dict): else: # if not clustering, rename query query = unclustered_query + # choose columns to be selected from filtering_window query_select_string = f"""SELECT {", ".join("R." + column for column in outfield_columns)} FROM {filtering_window} R """ # adding if we only want to keep one pose per ligand (will keep first entry) @@ -2626,7 +2629,7 @@ def _generate_result_filtering_query(self, filters_dict): query += " GROUP BY R.LigName " # add how to order results if self.order_results: - query += "ORDER BY " + self.field_to_column_name[self.order_results] + query += " ORDER BY " + self.field_to_column_name[self.order_results] output_query = query_select_string + query view_query = f"SELECT * FROM {filtering_window} R " + query @@ -2634,31 +2637,28 @@ def _generate_result_filtering_query(self, filters_dict): def _prepare_cluster_query(self, unclustered_query: str) -> str | None: """ - These methods will take (filtered, hopefully) data, then run the cluster query and cluster the filtered data. 
+ These methods will take data returned from unclustered filter query, then run the cluster query and cluster the filtered data. This will output pose_ids that are representative of the clusters, and these pose_ids will be returned so that they can be added to the unclustered query in the main filtering method. - They will only return a simple string since the filters were already applied, so the returning query is now the only query! Args: - unclustered_query (str): _description_ + unclustered_query (str): query containing none or some filters that defines over which ligands the clustering should happen Returns: - str | None: _description_ - - Yields: - Iterator[str | None]: _description_ + str: (reduced) query to include in overall filter query if clustering returned results """ if self.interaction_cluster and self.mfpt_cluster: self.logger.warning( "N.B.: If using both interaction and morgan fingerprint clustering, the morgan fingerprint clustering will be performed on the results staus post interaction fingerprint clustering." 
) - def _clusterFps( - fps, cutoff - ): # https://macinchem.org/2023/03/05/options-for-clustering-large-datasets-of-molecules/ + def _clusterFps(fps, cutoff): """ - fps (): fingerprints - cutoff distance (float) + https://macinchem.org/2023/03/05/options-for-clustering-large-datasets-of-molecules/ + + Args: + fps (): fingerprints + cutoff distance (float) """ # first generate the distance matrix: @@ -2789,15 +2789,15 @@ def _prepare_interaction_filtering_query( self, include_interactions: list, exclude_interactions: list, max_miss: int ) -> str: """ - _summary_ + Method that prepares a partial query for interactions Args: - include_interactions (list): _description_ - exclude_interactions (list): _description_ - max_miss (int): _description_ + include_interactions (list): interactions a pose should have + exclude_interactions (list): interactions a pose should not have + max_miss (int): max number of the provided interactions a pose_id is allowed to miss Returns: - str: _description_ + str: partial query to include in main filter query """ # nonsensical number to count an interaction if it satisfies an incomplete ("wildcard") interaction nonsense_counter = -10000 @@ -2897,7 +2897,20 @@ def _prepare_indices_for_query(interactions: list): return query - def _ligand_substructure_position_filter(self, ligand_filters_dict: dict): + def _ligand_substructure_position_filter(self, ligand_filters_dict: dict) -> str: + """ + Method that takes all ligand filters in the presence of a ligand_substruct_pos filter, and reduces the query to + " IN pose_ids" based on what pose_ids passed the ligand filters + + Args: + ligand_filters_dict (dict): all specified ligand filters + + Raises: + OptionError + + Returns: + str: partial query that identifies pose ids passing the ligand substructure filter + """ queries = [] nr_args_per_group = 6 nr_smarts = int( @@ -2984,7 +2997,7 @@ def _ligand_substructure_position_filter(self, ligand_filters_dict: dict): def 
_generate_interaction_bitvectors(self, pose_ids: str) -> dict: """ - Method to generate bitvector strings from pose_ids + Method to generate a dict of generate bitvector strings from pose_ids Args: pose_ids (str): query formatted list of pose_ids (as tuple) @@ -3014,12 +3027,13 @@ def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: # return dict of pose id as string and bitvector return poseid_bv - def _prepare_interaction_indices_for_filtering(self, interaction_list): + def _prepare_interaction_indices_for_filtering(self, interaction_list: list): """ - _summary_ + Prepare lists of interaction indices where they are grouped by whether or not they should be evaluated as "AND" or "OR", + and whether to be excluded or included in the passing filter poses Args: - interaction_list (): _description_ + interaction_list (list): list of interactions Raises: OptionError @@ -3067,8 +3081,7 @@ def _prepare_interaction_indices_for_filtering(self, interaction_list): return include_interactions, exclude_interactions def _get_interaction_indices(self, interaction_list) -> iter: - """takes list of interaction info for a given ligand, - looks up corresponding interaction index + """takes list of interaction info and looks up corresponding interaction index Args: interaction_list (list): List containing interaction info @@ -3107,7 +3120,7 @@ def _generate_ligand_filtering_query(self, ligand_filters: dict) -> str: ligand_filters (list): List of filters on ligand table Returns: - str: SQLite-formatted query, Dict: dictionary of filters and values + str: SQLite-formatted query """ sql_ligand_string = "SELECT L.LigName FROM Ligands L WHERE" @@ -3287,6 +3300,13 @@ def _set_ringtail_db_schema_version(self, db_version: str = "2.0.0"): ) def check_ringtaildb_version(self): + """ + Checks the database version and confirms whether the code base is compatible with it + + Returns: + bool: whether or not db is compatible with the code base + str: current database version + """ cur 
= self.conn.cursor() db_version = str(cur.execute("PRAGMA user_version").fetchone()[0]) db_schema_ver = ".".join([*db_version]) @@ -3373,7 +3393,7 @@ def update_database_version(self, new_version, consent=False): def _update_db_110_to_200(self): """ - Method to update from database v 1.1.0 to 2.0.0, will remove bitvetor table and create Interaction table + Method to update from database v 1.1.0 to 2.0.0, will remove bitvetor table and create Interactions table Raises: DatabaseConnectionError diff --git a/ringtail/util.py b/ringtail/util.py index 6adfec7c..26f14c3e 100644 --- a/ringtail/util.py +++ b/ringtail/util.py @@ -83,5 +83,14 @@ def caller_info(skip=2): def numlist2str(list: list, separator: str) -> str: + """ + Joines item in a list by specified string separator + + Args: + list (list): list to be joined + separator (str): string item to separate the items in the list + Returns: + str: list as a string separated by separator + """ return separator.join([str(x) for x in list]) From 3ac0f4b38b5747eefee2cfec625794a1f44e4ecc Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 4 Oct 2024 09:46:38 -0700 Subject: [PATCH 55/63] added two semicolons and removed a todo --- ringtail/storagemanager.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 8caefab4..8e46e530 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -1710,7 +1710,7 @@ def _create_view(self, name, query): cur = self.conn.cursor() # drop old view if there is one try: - cur.execute("DROP VIEW IF EXISTS {name}".format(name=name)) + cur.execute(f"DROP VIEW IF EXISTS {name}") cur.execute(query) cur.close() except sqlite3.OperationalError as e: @@ -2586,7 +2586,7 @@ def _generate_result_filtering_query(self, filters_dict): ): ligand_queries.append( self._generate_ligand_filtering_query(lig_filters) - ) # TODO here? 
+ ) # if complex ligand filter, generate partial query if "ligand_substruct_pos" in lig_filters: ligand_queries.append( @@ -3459,8 +3459,9 @@ def _create_connection(self): "Failed to load chemicalite cartridge. Please ensure chemicalite is installed with `conda install -c conda-forge chemicalite`." ) raise e - cursor = con.execute("PRAGMA synchronous = OFF") - cursor.execute("PRAGMA journal_mode = MEMORY") + cursor = con.execute("PRAGMA synchronous = OFF;") + cursor.execute("PRAGMA journal_mode = MEMORY;") + con.commit() cursor.close() except sqlite3.OperationalError as e: raise DatabaseConnectionError( From e6e44cf04c608cc33cf12114ec2d11898a7aa57e Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 4 Oct 2024 09:54:44 -0700 Subject: [PATCH 56/63] added note about visidata and chemicalite bookmarks --- docs/source/database_traversing.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/database_traversing.rst b/docs/source/database_traversing.rst index cd6fff29..d92416bf 100644 --- a/docs/source/database_traversing.rst +++ b/docs/source/database_traversing.rst @@ -8,7 +8,7 @@ View the data contained within the database using a terminal, we recommend using In this example (made with DLGs), the database contains ~3 poses for 9999 discrete ligands. Each of the rows here is a separate table or view within the database. From this screen, you can easily perform the sanity checks outline below. One should note that the number of column displayed on the first screen is 1 greater than the actual number of columns in a table (the number is correct for views). To more fully explore a given table, one may use the arrow keys or mouse to navigate to it, then press ``Enter/Return`` to access that table/view. The user may then scroll horizontally with the arrow keys, or press ``q`` to return up a level. -Using ``vd`` is particularly helpful to examine possible interactions of interest, stored within the ``Interactions`` table. 
+Using ``vd`` is particularly helpful to examine possible interactions of interest, stored within the ``Interaction_indices`` and ``Interactions`` table. To exit, return to the screen shown in the image above by pressing ``q``, then press ``q`` to exit. @@ -19,3 +19,7 @@ There are a few quick checks the user can make to ensure that the data has been - The number of rows in the ``Results`` table should be ~ ``max_poses`` * ``number of files`` and should be less than or equal to that number. For DLGs not every ligand may have up to ``max_poses``, which is why the number of rows is typically smaller than ``max_poses`` * ``number of DLGs``. - No ligand should have more than ``max_poses`` rows in the ``Results`` table. - If storing all poses, the number of rows in the Results table should match the ``number of ligands`` * ``number of output poses``. + +A note about visualizing bookmarks produced by ligand filters +************************************************************* +If using visidata to look at a bookmark produced by using ligand filters, the bookmark will most likely appear blank. This is due to database connection settings in visidata and the use of the sqlite extension ``chemicalite`` (used to aid in the ligand filters). The bookmark will still work as expected when using Ringtail functionality, for example you can write the bookmark to a csv file, or use the bookmark as basis for the next round of filtering. 
From 6ab90091d16be4c4cb9ac027b738e674d53dfa08 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 4 Oct 2024 12:37:55 -0700 Subject: [PATCH 57/63] fixed bug with ligand_substruct_pos and updated docs --- README.md | 8 ++++++ docs/source/api.rst | 15 ++++++------ docs/source/changes.rst | 10 ++++++++ docs/source/cmdline.rst | 12 ++++++--- ringtail/cloptionparser.py | 11 +++++---- ringtail/storagemanager.py | 50 ++++++++++++++++++++++++++++++-------- test/test_units.py | 18 ++++++-------- 7 files changed, 88 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 25cdb433..f42e99f4 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,14 @@ at [Scripps Research](https://www.scripps.edu/). In-depth documentation can be found on [ReadTheDocs](https://ringtail.readthedocs.io/en/latest/). +### New in version 2.0.1 +##### Enhancements to the code base +- The format of the queries produced to filter the database have been completely rewritten, reducing filtering time by at least a factor of 10 compared to 1.1.0. Extra indices were added to three of the tables to support the faster filtering speeds. + +##### Bug fixes +- The use of the keywords `--ligand_name`, `--ligand_substruct`, and `--ligand_substruct_pos` had ambiguous behavior where if they were invoked more than once, only the last filter value would be used (as opposed to concatenating the values). They now will work by supplying multiple values to one keyword, as well as one or more values to two or more keywords. Further, `ligand_substruct_pos` now takes input as one string (`"[C][Oh] 1 1.5 -20 42 -7.1"`)as opposed to one string and five numbers (`"[C][Oh]"" 1 1.5 -20 42 -7.1`). 
### New in version 2.0 ##### Changes in keywords used for the command line tool diff --git a/docs/source/api.rst b/docs/source/api.rst index 8408b8d7..8ed251a8 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -169,16 +169,17 @@ The ``max_miss`` keywords allows the user to filter by given interactions exclud Ligand filters =============== -Several filters pertaining to the SMARTS structure of the ligand can be used. For example, the ``ligand_substruct_pos`` keyword may be used to filter for a specific ligand substructure (specified with a SMARTS string) to be placed within some distance of a given cartesian coordinate. The format for this option is ``"" ``. -ligand_name: Specify ligand name(s). Will combine name filters with 'OR'. -ligand_substruct: SMARTS pattern(s) for substructure matching. -ligand_substruct_pos: SMARTS pattern(s) for substructure matching, e.g., ['[Oh]C', 0, 1.2, -5.5, 10.0, 15.5] -> [smart_string, index_of_positioned_atom, cutoff_distance, x, y, z]. -ligand_max_atoms: Maximum number of heavy atoms a ligand may have. -ligand_operator: Logical join operator for multiple SMARTS. +Several filters pertaining to the SMARTS structure of the ligand can be used. For example, ligands can be filtered for presence of certain substructures specified by their SMARTS string using ``ligand_substruct``, as well as their ligand name containing a specific phrase ``ligand_name``. The ligand name search will include any ligand names that contain the specified phrase, and does not look for exact matches only. Use the keyword ``ligand_operator`` to determine if the ligand filters should be evaluated as this ``OR`` that (default), or combined with ``AND``. ``ligand_max_atoms`` can be used to specify maximum number of heavy atoms a ligand may have. .. 
code-block:: python - rtc.filter(ligand_substruct=["[Oh]C"], ligand_substruct_pos=["[Oh]C", 0, 1.2, -5.5, 10.0, 15.5]) + rtc.filter(ligand_substruct=["[Oh]C", "C=O"], ligand_name="cool_ligand",ligand_operator="AND", ligand_max_atoms=5) + +The ``ligand_substruct_pos`` option may be used to filter for a specific ligand substructure to be placed within some distance of a given cartesian coordinate. The format for this option using the API is as a list of the six elements: ``[""," , , , , ]``. If searching for more than one ``ligand_substruct_pos``, make the value a list of lists. + +.. code-block:: python + + rtc.filter(ligand_name="_1", ligand_substruct_pos=[["C=O", 1, 10, 102, 106, 154], ['[C][Oh]', 1, 10, 102, 106, 154]]) Clustering diff --git a/docs/source/changes.rst b/docs/source/changes.rst index a06c538f..7d4c1fcf 100644 --- a/docs/source/changes.rst +++ b/docs/source/changes.rst @@ -3,6 +3,16 @@ Changes in Ringtail ###################### +Changes in 2.0.1: enhanced filtering speed +****************************************** +Enhancements to the code base +============================== +* The format of the queries produced to filter the database have been completely rewritten, reducing filtering time by at least a factor of 10 compared to 1.1.0. Extra indices were added to three of the tables to support the faster filtering speeds. + +Bug fixes +=========== +* The use of the keywords `--ligand_name`, `--ligand_substruct`, and `--ligand_substruct_pos` had ambiguous behavior where if they were invoked more than once, only the last filter value would be used (as opposed to concatenating the values). They now will work by supplying multiple values to one keyword, as well as one or more values to two or more keywords. Further, `ligand_substruct_pos` now takes input as one string (`"[C][Oh] 1 1.5 -20 42 -7.1"`) as opposed to one string and five numbers (`"[C][Oh]"" 1 1.5 -20 42 -7.1`). 
+ Changes in 2.0: fully developed API *************************************** diff --git a/docs/source/cmdline.rst b/docs/source/cmdline.rst index dfeab2d4..f30071aa 100644 --- a/docs/source/cmdline.rst +++ b/docs/source/cmdline.rst @@ -122,7 +122,7 @@ Interaction filters It is possible to filter the docking results based on different types of interactions (hydrogen bonds onr van der waals) with specific residues. It is further possible to have ligands pass the filters while only fulfilling some of the interaction combinations in union (max number of interactions combinations missed, ``--max_miss``). The available interaction filters are ``--hb_interactions``, ``--vdw_interactions``, and ``--reactive_interactions``. Interaction filters must be specified in the order ``CHAIN:RES:NUM:ATOM_NAME``. Any combination of that information may be used, as long as 3 colons are present and the information ordering between the colons is correct. All desired interactions of a given type (e.g. ``-vdw``) may be specified with a single option tag (``-vdw B:THR:276:,B:HIS:226:``) or separate tags (``-vdw B:THR:276: -vdw B:HIS:226:``). -The ``--max_miss`` option allows the user to filter by given interactions excluding up to ``max_miss`` interactions. This gives :math:`\sum_{m=0}^{m}\frac{n!}{(n-m)!*m!}` combinations for *n* interaction filters and *m* max_miss. By default, results will be given for the union of the interaction conbinations. Use with ``--enumerate_interaction_combs`` to log ligands/poses passing each separate interaction combination (can significantly increase runtime). If ``max_miss > 0`` is used during filtering, a view is created for each combination of interaction filters and is named ``_`` where n is the index of the filter combination in the log file (indexing from 0). +The ``--max_miss`` option allows the user to filter by given interactions excluding up to ``max_miss`` interactions. 
This gives :math:`\sum_{m=0}^{m}\frac{n!}{(n-m)!*m!}` combinations for *n* interaction filters and *m* max_miss. By default, results will be given for the union of the interaction combinations. Use with ``--enumerate_interaction_combs`` to log ligands/poses passing each separate interaction combination (can significantly increase runtime). If ``max_miss > 0`` is used during filtering, a view is created for each combination of interaction filters and is named ``_`` where n is the index of the filter combination in the log file (indexing from 0). ``--react_any`` offers an option to filtering for poses that have reactions with any residue. .. code-block:: bash @@ -131,12 +131,18 @@ The ``--max_miss`` option allows the user to filter by given interactions exclud Ligand filters ================= -The ``--smarts_idxyz`` option may be used to filter for a specific ligand substructure (specified with a SMARTS string) to be placed within some distance of a given cartesian coordinate. The format for this option is ``"" ``. +The docked ligands can be filtered for presence of certain substructures specified by their SMARTS string using ``--ligand_substruct``, as well as their ligand name containing a specific phrase ``--ligand_name``. The ligand name search will include any ligand names that contain the specified phrase, and does not look for exact matches only. +Use the keyword ``--ligand_operator`` to determine if the ligand filters should be evaluated as this ``OR`` that (default), or combined with ``AND``. ``--ligand_max_atoms`` can be used to specify maximum number of heavy atoms a ligand may have. .. 
code-block:: bash - $ rt_process_vs read --input_db output.db --eworst -6 --hb_interactions A:VAL:279: A:LYS:162: --vdw_interactions A:VAL:279: A:LYS:162: --max_miss 1 + $ rt_process_vs read --input_db output.db --ligand_substruct 'C=O' 'CC(C)(C)' --ligand_operator AND --ligand_max_atoms 5 +The ``--ligand_substruct_pos`` option may be used to filter for a specific ligand substructure to be placed within some distance of a given cartesian coordinate. The format for this option is the six elements inside quotes and separated by spaces: ``" ""``. + +.. code-block:: bash + + $ rt_process_vs read --input_db output.db --ligand_name cool_ligand --ligand_substruct_pos "[C][Oh] 1 1.5 -20.3 42 -7.1" Clustering ============ diff --git a/ringtail/cloptionparser.py b/ringtail/cloptionparser.py index c37450a2..5662cb4b 100644 --- a/ringtail/cloptionparser.py +++ b/ringtail/cloptionparser.py @@ -500,7 +500,7 @@ def cmdline_parser(defaults: dict = {}): ) ligand_group.add_argument( "--ligand_substruct", - help="SMARTS pattern(s) for substructure matching", + help="SMARTS pattern(s) for substructure matching, if error delimit each substructure with ''.", action="append", type=str, metavar="STRING", @@ -508,7 +508,7 @@ def cmdline_parser(defaults: dict = {}): ) ligand_group.add_argument( "--ligand_substruct_pos", - help='"SMARTS, index of atom in SMARTS, cutoff dist, and target XYZ coords, group by "', + help='"SMARTS, index of atom in SMARTS, cutoff dist, and target XYZ coords". 
Group each set of six values with "".', action="append", type=str, metavar="STRING", @@ -861,7 +861,7 @@ def process_options(self, parsed_opts): # if more than one filter in list, go through each if len(filter_list) > 1: for filter in filter_list: - if _type == "ligand_subtruct_pos": + if _type == "ligand_substruct_pos": # make a lits of the six values ligand_filters[_type].append( [i for i in filter.split(" ")] @@ -869,7 +869,7 @@ def process_options(self, parsed_opts): else: ligand_filters[_type].append(filter) else: - if _type == "ligand_subtruct_pos": + if _type == "ligand_substruct_pos": # if only one item in list, append to ligand list ligand_filters[_type].append( [i for i in filter_list[0].split(" ")] @@ -878,9 +878,10 @@ def process_options(self, parsed_opts): ligand_filters[_type].append(filter_list[0]) ligand_filters["ligand_operator"] = parsed_opts.ligand_operator + # ligand substruct pos needs six items if ( "ligand_substruct_pos" in ligand_filters - and len(ligand_filters["ligand_substruct_pos"][0]) % 6 != 0 + and len(ligand_filters["ligand_substruct_pos"][0]) % 6 ): msg = "--ligand_substruct_pos needs groups of 6 values:\n" msg += " 1. 
Ligand SMARTS\n" diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 8e46e530..d5bc6742 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -2471,9 +2471,20 @@ def _process_filters_for_query(self, filters_dict: dict): # add react_any flag as interaction filter if not None if filter_key == "react_any" and filter_value: interaction_filters.append(["R", "", "", "", "", True]) + # if filter has to do with ligands and SMARTS if filter_key in Filters.get_filter_keys("ligand"): + if filter_key == "ligand_substruct_pos" and filter_value: + # go through each item and make sure the numbers are cast from string to numbers + for filter in filter_value: + # cast second item to int + filter[1] = int(filter[1]) + # cast last four items to float + for index in range(2, 6): + filter[index] = float(filter[index]) + ligand_filters[filter_key] = filter_value + if filter_key == "max_miss": max_miss = filter_value # put all processed filter in a dict @@ -2485,7 +2496,6 @@ def _process_filters_for_query(self, filters_dict: dict): processed_filters["max_miss"] = max_miss if len(ligand_filters) > 0: processed_filters["lig_filters"] = ligand_filters - return processed_filters def _generate_result_filtering_query(self, filters_dict): @@ -2504,6 +2514,8 @@ def _generate_result_filtering_query(self, filters_dict): num_query = "" int_query = "" lig_query = "" + ligand_substruct_queries = [] + join_stmnt = "" # if filtering over a bookmark (i.e., already filtered results) as opposed to a whole database if self.filter_bookmark is not None: @@ -2589,13 +2601,17 @@ def _generate_result_filtering_query(self, filters_dict): ) # if complex ligand filter, generate partial query if "ligand_substruct_pos" in lig_filters: - ligand_queries.append( - self._ligand_substructure_position_filter(lig_filters) - ) - # join all ligand queries that are not empty #TODO this should have ran some stuff already, no? 
+ for substruct_pos in lig_filters["ligand_substruct_pos"]: + temp_lig_filter = lig_filters + temp_lig_filter["ligand_substruct_pos"] = substruct_pos + ligand_substruct_queries.append( + self._ligand_substructure_position_filter(temp_lig_filter) + ) + join_stmnt = " " + lig_filters["ligand_operator"] + " " + # join all ligand queries that are not empty lig_query = " AND ".join( [lig_filter for lig_filter in ligand_queries if lig_filter] - ) # TODO + ) # if filter queries exist for each group, string them together appropriately if int_query: # add with a join statement @@ -2607,8 +2623,21 @@ def _generate_result_filtering_query(self, filters_dict): unclustered_query += ( "JOIN (" + lig_query + ") L ON R.LigName = L.LigName " ) - if num_query: - unclustered_query += "WHERE " + num_query + # these two queries are joined on the Results table, after the multiple table spanning queries + if num_query or ligand_substruct_queries: + # add condition + unclustered_query += "WHERE " + # add numerical part of query + if num_query: + unclustered_query += num_query + # if both numerical and ligand_substruct_pos handle appropriately + if num_query and ligand_substruct_queries: + unclustered_query += " AND " + join_stmnt.join( + ligand_substruct_queries + ) + # if not, only the ligand_substruct_pos sets the WHERE condition + else: + unclustered_query += join_stmnt.join(ligand_substruct_queries) # if clustering is requested, do that before saving view or filtering results for output if clustering: @@ -2987,13 +3016,14 @@ def _ligand_substructure_position_filter(self, ligand_filters_dict: dict) -> str pose_id_list.append(str(pose_id)) break # add pose only once if len(pose_id_list) > 0: - queries.append("Pose_ID IN ({0})".format(",".join(pose_id_list))) + queries.append("R.Pose_ID IN ({0})".format(",".join(pose_id_list))) cur.close() if not queries: raise OptionError( "There are no ligands passing the 'ligand_substruct_pos' filter, please revise your filter query." 
) - return queries + + return "".join(queries) def _generate_interaction_bitvectors(self, pose_ids: str) -> dict: """ diff --git a/test/test_units.py b/test/test_units.py index 160d05f4..f2639407 100644 --- a/test/test_units.py +++ b/test/test_units.py @@ -161,17 +161,13 @@ def test_ligand_filters(self): ) assert count_ligands_passing == 18 - # test substructure with specified position, currently raises an error because substrcut with pos not found - from ringtail import exceptions as e - - with pytest.raises(e.OptionError) as exc_info: - count_ligands_passing = rtc.filter( - ligand_substruct_pos=["[Oh]C", 0, 100, -5.5, 10.0, 15.5] - ) - assert ( - str(exc_info.value) - == "There are no ligands passing the 'ligand_substruct_pos' filter, please revise your filter query." - ) + count_ligands_passing = rtc.filter( + ligand_substruct_pos=[ + ["[C][Oh]", 1, 10, 102, 106, 154], + ["C=O", 1, 10, 102, 106, 154], + ] + ) + assert count_ligands_passing == 12 def test_all_filters(self): rtc = RingtailCore(db_file="output.db") From 314939139ea76e1d7773d867f8c3b8cefc52fee1 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 4 Oct 2024 12:52:51 -0700 Subject: [PATCH 58/63] updated doc for ligand_max_atoms --- ringtail/cloptionparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ringtail/cloptionparser.py b/ringtail/cloptionparser.py index 5662cb4b..e7d22daa 100644 --- a/ringtail/cloptionparser.py +++ b/ringtail/cloptionparser.py @@ -493,7 +493,7 @@ def cmdline_parser(defaults: dict = {}): ligand_group.add_argument( "-mna", "--ligand_max_atoms", - help="Maximum number of heavy atoms a ligand may have", + help="Maximum number of heavy atoms (non-hydrogens) a ligand may have", action="store", type=int, metavar="INT", From ce24f8bb77dc307b0183458f9698111a040aaaff Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 4 Oct 2024 13:05:08 -0700 Subject: [PATCH 59/63] bug fix: max number of heavy atoms uses correct chemicalite method --- ringtail/storagemanager.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index d5bc6742..d0c0d891 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -3172,7 +3172,7 @@ def _generate_ligand_filtering_query(self, ligand_filters: dict) -> str: name_sql_str = " L.LigName LIKE '%{value}%' OR".format(value=name) sql_ligand_string += name_sql_str if kw == "ligand_max_atoms" and ligand_filters[kw] is not None: - maxatom_sql_str = " mol_num_atms(ligand_rdmol) <= {} {}".format( + maxatom_sql_str = " mol_num_hvyatms(ligand_rdmol) <= {} {}".format( ligand_filters[kw], logical_operator ) sql_ligand_string += maxatom_sql_str From f858b54d5015ace6b7f8f48c0d50bc546885e9f4 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Fri, 4 Oct 2024 13:07:03 -0700 Subject: [PATCH 60/63] updated docs with bug fix --- README.md | 2 +- docs/source/changes.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f42e99f4..f6e437ba 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ In-depth documentation can be found on [ReadTheDocs](https://ringtail.readthedoc ##### Bug fixes - The use of the keywords `--ligand_name`, `--ligand_substruct`, and `--ligand_substruct_pos` had ambiguous behavior where if they were invoked more than once, only the last filter value would be used (as opposed to concatenating the values). They now will work by supplying multiple values to one keyword, as well as one or more values to two or more keywords. Further, `ligand_substruct_pos` now takes input as one string (`"[C][Oh] 1 1.5 -20 42 -7.1"`)as opposed to one string and five numbers (`"[C][Oh]"" 1 1.5 -20 42 -7.1`). - +- `--ligand_max_atoms` counted all atoms in the ligand, including hydrogens. With bug fix it counts only heavy atoms(not hydrogens). 
### New in version 2.0 ##### Changes in keywords used for the command line tool diff --git a/docs/source/changes.rst b/docs/source/changes.rst index 7d4c1fcf..cb36c337 100644 --- a/docs/source/changes.rst +++ b/docs/source/changes.rst @@ -12,6 +12,7 @@ Enhancements to the code base Bug fixes =========== * The use of the keywords `--ligand_name`, `--ligand_substruct`, and `--ligand_substruct_pos` had ambiguous behavior where if they were invoked more than once, only the last filter value would be used (as opposed to concatenating the values). They now will work by supplying multiple values to one keyword, as well as one or more values to two or more keywords. Further, `ligand_substruct_pos` now takes input as one string (`"[C][Oh] 1 1.5 -20 42 -7.1"`)as opposed to one string and five numbers (`"[C][Oh]"" 1 1.5 -20 42 -7.1`). +* `--ligand_max_atoms` counted all atoms in the ligand, including hydrogens. With bug fix it counts only heavy atoms(not hydrogens). Changes in 2.0: fully developed API *************************************** From 5bc9e8372942595674e308194abb74fcc8d6584c Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 7 Oct 2024 08:51:07 -0700 Subject: [PATCH 61/63] updated doc string for ligand filters --- ringtail/ringtailcore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ringtail/ringtailcore.py b/ringtail/ringtailcore.py index 48c6e581..3622f6bd 100644 --- a/ringtail/ringtailcore.py +++ b/ringtail/ringtailcore.py @@ -1246,9 +1246,9 @@ def filter( hb_count (list[tuple]): accept ligands with at least the requested number of HB interactions. If a negative number is provided, then accept ligands with no more than the requested number of interactions. E.g., [('hb_count', 5)] react_any (bool): check if ligand reacted with any residue max_miss (int): Will compute all possible combinations of interaction filters excluding up to max_miss numer of interactions from given set. 
Default will only return union of poses interaction filter combinations. Use with 'enumerate_interaction_combs' for enumeration of poses passing each individual combination of interaction filters. - ligand_name (list[str]): specify ligand name(s). Will combine name filters with OR, e.g., ["lig1", "lig2"] - ligand_substruct (list[str]): SMARTS, index of atom in SMARTS, cutoff dist, and target XYZ coords, e.g., ["ccc", "CN"] - ligand_substruct_pos (list[str]): SMARTS pattern(s) for substructure matching, e.g., ['"[Oh]C" 0 1.2 -5.5 10.0 15.5'] -> ["smart_string index_of_positioned_atom cutoff_distance x y z"] + ligand_name (list[str]): specify ligand name(s). Will combine name filters with OR, e.g., [["lig1", "lig2"]] + ligand_substruct (list[str]): SMARTS pattern(s) for substructure matching, e.g., [["ccc", "CN"]] + ligand_substruct_pos (list[list[type]]): SMARTS, index of atom in SMARTS, cutoff dist, and target XYZ coords, e.g., [["[Oh]C", 0, 1.2, -5.5, 10.0, 15.5]] -> [["smart_string", index_of_positioned_atom, cutoff_distance, x, y, z]] ligand_max_atoms (int): Maximum number of heavy atoms a ligand may have ligand_operator (str): logical join operator for multiple SMARTS (default: OR), either AND or OR filters_dict (dict): provide filters as a dictionary From e756591adcc20cdc9f6f75265df9a0855c382c5f Mon Sep 17 00:00:00 2001 From: maylinnp Date: Mon, 7 Oct 2024 08:52:30 -0700 Subject: [PATCH 62/63] added create indices to database update method --- ringtail/storagemanager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index d0c0d891..39adb23c 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -3400,10 +3400,10 @@ def update_database_version(self, new_version, consent=False): ) cur.execute("ALTER TABLE Bookmarks ADD COLUMN filters") cur.execute( - "CREATE INDEX IF NOT EXISTS allind ON Results(LigName, docking_score, leff, deltas, reference_rmsd,
energies_inter, energies_vdw, energies_electro, energies_intra, nr_interactions, run_number, pose_rank, num_hb)" + "CREATE INDEX IF NOT EXISTS ak_results ON Results(LigName, docking_score, leff, deltas, reference_rmsd, energies_inter, energies_vdw, energies_electro, energies_intra, nr_interactions, run_number, pose_rank, num_hb)" ) cur.execute( - "CREATE INDEX IF NOT EXISTS intind ON Interaction_indices(interaction_type, rec_chain, rec_resname, rec_resid, rec_atom, rec_atomid)" + "CREATE INDEX IF NOT EXISTS ak_intind ON Interaction_indices(interaction_type, rec_chain, rec_resname, rec_resid, rec_atom, rec_atomid)" ) try: self.conn.commit() @@ -3458,6 +3458,12 @@ def _update_db_110_to_200(self): ) # drop old bitvector table cur.execute("""DROP TABLE IF EXISTS Interaction_bitvectors;""") + # create new indices + cur.execute("CREATE INDEX IF NOT EXISTS ak_poseid ON Results(Pose_id)") + cur.execute( + "CREATE INDEX IF NOT EXISTS ak_interactions ON Interactions(Pose_id, interaction_id)" + ) + cur.execute("CREATE INDEX IF NOT EXISTS ak_ligands ON Ligands(LigName)") self.conn.commit() self._set_ringtail_db_schema_version("2.0.0") # set explicit version except sqlite3.OperationalError as e: From 386b838fb1b7c70735472e4f031bfbeef6e7a3b2 Mon Sep 17 00:00:00 2001 From: maylinnp Date: Thu, 10 Oct 2024 08:45:08 -0700 Subject: [PATCH 63/63] updated code version references to 2.1.0 (db version is still 2.0.0) --- docs/source/changes.rst | 4 ++-- docs/source/conf.py | 2 +- ringtail/storagemanager.py | 2 +- setup.py | 17 +++++++++-------- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/source/changes.rst b/docs/source/changes.rst index cb36c337..c78c97a6 100644 --- a/docs/source/changes.rst +++ b/docs/source/changes.rst @@ -3,7 +3,7 @@ Changes in Ringtail ###################### -Changes in 2.0.1: enhanced filtering speed +Changes in 2.1.0: enhanced filtering speed ****************************************** Enhancements to the code base
============================== @@ -14,7 +14,7 @@ Bug fixes * The use of the keywords `--ligand_name`, `--ligand_substruct`, and `--ligand_substruct_pos` had ambiguous behavior where if they were invoked more than once, only the last filter value would be used (as opposed to concatenating the values). They now will work by supplying multiple values to one keyword, as well as one or more values to two or more keywords. Further, `ligand_substruct_pos` now takes input as one string (`"[C][Oh] 1 1.5 -20 42 -7.1"`)as opposed to one string and five numbers (`"[C][Oh]"" 1 1.5 -20 42 -7.1`). * `--ligand_max_atoms` counted all atoms in the ligand, including hydrogens. With bug fix it counts only heavy atoms(not hydrogens). -Changes in 2.0: fully developed API +Changes in 2.x: fully developed API *************************************** Changes in keywords used for the command line tool diff --git a/docs/source/conf.py b/docs/source/conf.py index 1c0567f8..d7ca2a05 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,7 @@ project = "ringtail" copyright = "2024, Forli lab" author = "Forli lab" -release = "2.0.0" +release = "2.1.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/ringtail/storagemanager.py b/ringtail/storagemanager.py index 39adb23c..3d8204be 100644 --- a/ringtail/storagemanager.py +++ b/ringtail/storagemanager.py @@ -38,7 +38,7 @@ class StorageManager: _db_schema_code_compatibility = { "1.0.0": ["1.0.0"], "1.1.0": ["1.1.0"], - "2.0.0": ["2.0.0", "2.0.1"], + "2.0.0": ["2.0.0", "2.1.0"], } """Base class for a generic virtual screening database object. 
diff --git a/setup.py b/setup.py index 3396701b..09fda6d0 100644 --- a/setup.py +++ b/setup.py @@ -19,9 +19,10 @@ def find_files(directory): return matches + setup( name="ringtail", - version="2.0.0", + version="2.1.0", author="Forli Lab", author_email="forli@scripps.edu", url="https://github.com/forlilab/Ringtail", @@ -55,12 +56,12 @@ def find_files(directory): "Topic :: Software Development :: Libraries", ], entry_points={ - 'console_scripts': [ - 'rt_process_vs=ringtail.cli.rt_process_vs:main', - 'rt_compare=ringtail.cli.rt_compare:main', - 'rt_db_v100_to_v110=ringtail.cli.rt_db_v100_to_v110:main', - 'rt_db_to_v200=ringtail.cli.rt_db_to_v200:main', - 'rt_generate_config_file=ringtail.cli.rt_generate_config_file:main' + "console_scripts": [ + "rt_process_vs=ringtail.cli.rt_process_vs:main", + "rt_compare=ringtail.cli.rt_compare:main", + "rt_db_v100_to_v110=ringtail.cli.rt_db_v100_to_v110:main", + "rt_db_to_v200=ringtail.cli.rt_db_to_v200:main", + "rt_generate_config_file=ringtail.cli.rt_generate_config_file:main", ] - } + }, )