From 2606428f5c5848869775a1ba5ffaaacf930f1c29 Mon Sep 17 00:00:00 2001 From: AJ <46843456+amckenna41@users.noreply.github.com> Date: Wed, 8 Nov 2023 18:51:01 +0000 Subject: [PATCH] v2.4.1 - bug fixes, config updates, unit tests, docs --- .github/workflows/build_test.yml | 2 +- CONFIG.md | 18 +- README.md | 242 +++-- TODO.md | 42 +- config/README.md | 2 +- config/absorption.json | 2 +- config/enantioselectivity.json | 2 +- config/localization.json | 2 +- config/thermostability.json | 2 +- data/README.md | 30 +- example_datasets/README.md | 23 +- pySAR/README.md | 209 ++-- pySAR/__init__.py | 4 +- pySAR/descriptors.py | 349 ++++--- pySAR/encoding.py | 275 +++--- pySAR/evaluate.py | 41 +- pySAR/globals_.py | 4 - pySAR/model.py | 82 +- pySAR/plots.py | 10 +- pySAR/pyDSP.py | 122 +-- pySAR/pySAR.py | 242 +++-- pySAR/utils.py | 35 +- setup.cfg | 6 +- setup.py | 1 - tests/README.md | 8 +- tests/test_config/README.md | 8 +- tests/test_config/test_absorption.json | 42 +- .../test_config/test_enantioselectivity.json | 25 +- tests/test_config/test_localization.json | 38 +- tests/test_config/test_thermostability.json | 44 +- tests/test_descriptors.py | 287 +++--- tests/test_encoding.py | 771 ++++++--------- tests/test_model.py | 115 +-- tests/test_pyDSP.py | 570 +++++------ tests/test_pySAR.py | 919 ++++++++++-------- tests/test_utils.py | 298 ++---- 36 files changed, 2449 insertions(+), 2423 deletions(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 08cfc17..38e89a7 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest #platform: [ubuntu-latest, macos-latest, windows-latest] strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] #testing on multiple python versions + python-version: ["3.8", "3.9", "3.10"] #testing on multiple python versions steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/CONFIG.md b/CONFIG.md index f022f29..35e4794 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -1,8 +1,8 @@ # Config file parameters -pySAR works via configuration files that contain the plethora of parameters and variables available for the full pySAR pipeline. The config files are in JSON format and broken into 4 different subsections: "dataset", "model", "descriptors", and "pyDSP". "dataset" outlines parameters to do with the dataset, "model" consists of all ML model related parameters, "descriptors" specifies what protein physiochemical/structural descriptors to use and the metaparameters for some protein descriptors and "pyDSP" is all parameters related to any of the DSP functionalities in pySAR.
+`pySAR` works mainly via JSON configuration files. There are many different customisable parameters for the functionalities in `pySAR` including the metaparameters of some of the available protein descriptors, all Digital Signal Processing (DSP) parameters in the `pyDSP` module, the type of regression model to use and parameters specific to the dataset - a description of each parameter is available in the example below.

-Example configuration file for thermostability.json used in research:
+These config files offer a more straightforward way of making any changes to the `pySAR` pipeline. The names of **all** the parameters as listed in the example config files must remain unchanged - only the value of each parameter should be changed, and any parameters not being used can be set to null. Additionally, you can pass the individual parameter names and values into the `pySAR` and `Encoding` classes when numerically encoding the protein sequences via **kwargs**. An example of the config file used in my research project ([thermostability.json](https://github.com/amckenna41/pySAR/blob/master/config/thermostability.json)), with all of the available parameters, can be seen below.

```json
{
@@ -114,10 +114,10 @@ Example configuration file for thermostability.json used in research:

* `descriptors[descriptors_csv]` - path to a csv file of pre-calculated descriptor values for a dataset, which saves time having to recalculate the features each time.
* `descriptors[moreaubroto_autocorrelation][lag] / descriptors[moran_autocorrelation][lag] / descriptors[geary_autocorrelation][lag]` - The maximum lag value for each of the autocorrelation descriptors. If an invalid value is input then a default of 30 is used.
-* `descriptors[moreaubroto_autocorrelation][properties] / descriptors[moran_autocorrelation][properties] / descriptors[geary_autocorrelation][properties]` - List of protein physiochemical and structural descriptors used in the calculation of each of the autocorrelation descriptors, properties must be a lit of their AAIndex number/accession number. There must be a least 1 property value input.
+* `descriptors[moreaubroto_autocorrelation][properties] / descriptors[moran_autocorrelation][properties] / descriptors[geary_autocorrelation][properties]` - List of protein physiochemical and structural descriptors used in the calculation of each of the autocorrelation descriptors, properties must be a list of their AAIndex numbers/accession numbers. There must be at least 1 property value input.
* `descriptors[moreaubroto_autocorrelation][normalize] / descriptors[moran_autocorrelation][normalize] / descriptors[geary_autocorrelation][normalize]` - rescale/normalize the Autocorrelation values into the range of 0-1.

-* `descriptors[ctd][property]` - list of 1 or more physiochemical properties to use when calculating CTD descriptors. List of available input properties: If no properties input then hydrophobicity used by default.
+* `descriptors[ctd][property]` - list of 1 or more physiochemical properties to use when calculating the CTD descriptors. List of available input properties: hydrophobicity, normalized_vdwv, polarity, charge, secondary_struct, solvent_accessibility, polarizability. If no properties are input then hydrophobicity is used by default.
* `descriptors[ctd][all]` - if True then all 7 of the available physiochemical descriptors will be used when calculating the CTD descriptors. Each property generates 21 features so using all properties will output 147 features. Only 1 property is used by default. 
* `descriptors[sequence_order_coupling_number][maxlag]` - maximum lag; the length of the protein must not be less than maxlag.
@@ -127,17 +127,17 @@ Example configuration file for thermostability.json used in research:
* `descriptors[quasi_sequence_order][weight]` - weighting factor to use when calculating the descriptor.
* `descriptors[quasi_sequence_order][distance_matrix]` - path to the physiochemical distance matrix for calculating quasi sequence order.
-* `descriptors[pseudo_amino_acid_composition][lambda]` - lamda parameter that reflects the rank correlation and should be a non-negative integer and not larger than the length of the protein sequence.
+* `descriptors[pseudo_amino_acid_composition][lambda]` - lambda parameter that reflects the rank correlation and should be a non-negative integer and not larger than the length of the protein sequence.
* `descriptors[pseudo_amino_acid_composition][weight]` - weighting factor to use when calculating the descriptor.
* `descriptors[pseudo_amino_acid_composition][properties]` - 1 or more amino acid index properties from the AAI database used for calculating the sequence-order.
-* `descriptors[amphiphilic_pseudo_amino_acid_composition][lambda]` - lamda parameter that reflects the rank correlation and should be a non-negative integer and not larger than the length of the protein sequence.
+* `descriptors[amphiphilic_pseudo_amino_acid_composition][lambda]` - lambda parameter that reflects the rank correlation and should be a non-negative integer and not larger than the length of the protein sequence.
* `descriptors[amphiphilic_pseudo_amino_acid_composition][weight]` - weighting factor to use when calculating the descriptor.

**DSP Parameters:**

* `pyDSP[use_dsp]` - whether or not to apply Digital Signal Processing (DSP) techniques to the features passed into the model. If true, the values of the next DSP parameters will be applied to the features.
-* `pyDSP[spectrum]` - which frequency output to use from the generated types of signals from DSP to use e.g power, absolute, imaginery, real.
-* `pyDSP[window][type]` - convolutional window to apply to the signal output, pySAR supports: hamming, blackman, blackmanharris, gaussian, bartlett, kaiser, barthann, bohman, chebwin, cosine, exponential, flattop, hann, boxcar, hanning, nuttall, parzen, triang, tukey.
-* `pyDSP[filter][type]` - window filter to apply to the signal output, pySAR supports: savgol, medfilt, symiirorder1, lfilter, hilbert.
+* `pyDSP[spectrum]` - which frequency output/informational spectrum to use from the signals generated via DSP, e.g. power, absolute, imaginary, real.
+* `pyDSP[window][type]` - convolutional window to apply to the signal output, pySAR supports: hamming, blackman, blackmanharris, gaussian, bartlett, kaiser, barthann, bohman, chebwin, cosine, exponential, flattop, hann, boxcar, hanning, nuttall, parzen, triang and tukey.
+* `pyDSP[filter][type]` - window filter to apply to the signal output, pySAR supports: savgol, medfilt, symiirorder1, lfilter and hilbert.

[Back to top](#TOP)
\ No newline at end of file
diff --git a/README.md b/README.md
index 464853d..d19b118 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,8 @@
-

pySARLogo

-# pySAR # +# pySAR - Python Sequence Activity Relationship # [![PyPI](https://img.shields.io/pypi/v/pySAR)](https://pypi.org/project/pySAR/) [![pytest](https://github.com/amckenna41/pySAR/workflows/Building%20and%20Testing/badge.svg)](https://github.com/amckenna41/pySAR/actions?query=workflowBuilding%20and%20Testing) [![CircleCI](https://dl.circleci.com/status-badge/img/gh/amckenna41/pySAR/tree/master.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/amckenna41/pySAR/tree/master) @@ -22,7 +20,6 @@ Table of Contents ================= - * [Introduction](#Introduction) * [Requirements](#requirements) * [Installation](#installation) @@ -37,7 +34,7 @@ Table of Contents Research Article ================ -The research article that accompanied this software is titled: "Machine Learning Based Predictive Model for the Analysis of Sequence Activity Relationships Using Protein Spectra and Protein Descriptors". This research article is uploaded to the repository as [pySAR_research.pdf][pdf]. The article was published in the Journal of Biomedical Informatics and is available [here][article] [[1]](#references). There is also a quick Colab notebook demo of `pySAR` available [here][demo]. +The research article that accompanied this software is titled: "Machine Learning Based Predictive Model for the Analysis of Sequence Activity Relationships Using Protein Spectra and Protein Descriptors" and was published in the Journal of Biomedical Informatics and is available [here][article] [[1]](#references). There is also a quick Colab notebook demo of `pySAR` available [here][demo]. How to cite =========== @@ -45,16 +42,19 @@ How to cite Introduction ============ -`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs) of protein sequences. `pySAR` offers extensive and verbose functionalities that allow you to numerically encode a dataset of protein sequences using a large abundance of available methodologies and features. The software uses physiochemical and biochemical features from the Amino Acid Index (AAI) database [[2]](#references) as well as allowing for the calculation of a range of structural, physiochemical and biochemical protein descriptors.

-After finding the optimal technique and feature set at which to encode your dataset of sequences, `pySAR` can then be used to build a predictive regression model with the training data being that of the encoded sequences, and training labels being the experimentally pre-calculated activity values for each protein sequence. This model maps a set of protein sequences to the sought-after activity value, being able to accurately predict the activity/fitness value of new unseen sequences. The use-case for the software is within the field of protein engineering and Directed Evolution, where a user has a set of experimentally determined activity values for a library of mutant protein sequences and wants to computationally predict the sought activity value for a selection of mutated sequences, in the aim of finding the best sequence that minimises/maximises their activity value.
+`pySAR` is a Python library for analysing Sequence Activity Relationships (SARs)/Sequence Function Relationships (SFRs) of protein sequences. `pySAR` offers extensive and verbose functionalities that allow you to numerically encode a dataset of protein sequences using a large abundance of available methodologies and features. The software uses physiochemical and biochemical features from the Amino Acid Index (AAI) database [[2]](#references), as well as allowing for the calculation of a range of structural, physiochemical and biochemical protein descriptors, via the custom-built [`protpy`][protpy] package. + +After finding the optimal technique and feature set at which to numerically encode your dataset of sequences, `pySAR` can then be used to build a predictive regression ML model with the training data being that of the encoded protein sequences, and training labels being the in vitro experimentally pre-calculated activity values for each protein sequence. This model maps a set of protein sequences to the sought-after activity value, being able to accurately predict the activity/fitness value of new unseen sequences. The use-case for the software is within the field of Protein Engineering, Directed Evolution and or Drug Discovery, where a user has a set of in vitro experimentally determined activity/fitness values for a library of mutant protein sequences and wants to computationally predict the sought activity value for a selection of mutated unseen sequences, in the aim of finding the best sequence that minimises/maximises their activity value.

-Two additional custom-built softwares were created alongside pySAR - [aaindex][aaindex] and [protpy][protpy]. The aaindex software package is used for parsing the amino acid index which is a database of numerical indices representing various physicochemical and biochemical properties of amino acids and pairs of amino acids [[2]](#references). protpy is used for calculating a series of protein physiochemical, biochemical and structural protein descriptors. Both of these software packages are integrated into pySAR but can also be used individually for their respective purposes.
+In the published [research][article], the sought activity/fitness characteristic is the thermostability of proteins from a recombination library designed from parental cytochrome P450s. This thermostability is measured using the T50 metric (the temperature at which 50% of a protein is irreversibly denatured after 10 mins of incubation, ranging from 39.2 to 64.4 degrees C), which we want to maximise [[1]](#references).
+
+Two additional custom-built software packages were created alongside `pySAR` - [`aaindex`][aaindex] and [`protpy`][protpy]. The `aaindex` package is used for parsing the Amino Acid Index, a database of numerical indices representing various physicochemical and biochemical properties of amino acids and pairs of amino acids [[2]](#references). `protpy` is used for calculating a series of physiochemical, biochemical and structural protein descriptors. Both of these software packages are integrated into `pySAR` but can also be used individually for their respective purposes.

Requirements
============
-* [Python][python] >= 3.7
-* [aaindex][aaindex] >= 1.0.4
-* [protpy][protpy] >= 1.0.7
+* [Python][python] >= 3.8
+* [aaindex][aaindex] >= 1.1.1
+* [protpy][protpy] >= 1.1.10
* [numpy][numpy] >= 1.24.2
* [pandas][pandas] >= 1.5.3
* [scikit-learn][sklearn] >= 1.2.1
@@ -81,7 +81,9 @@
cd pySAR
```

Usage
=====
### Config File
-`pySAR` works through JSON configuration files. There are many different customisable parameters for the functionalities in `pySAR` including the metaparameters of each of the available protein descriptors, all Digital Signal Processing (DSP) parameters in the pyDSP module, the type of regression model to use and parameters specific to the dataset. These config files offer a more straightforward way of making any changes to the `pySAR` pipeline. The names of **All** the parameters as listed in the example config files must remain unchanged, only the value of each parameter should be changed, any parameters not being used can be set to null. An example of the config file used in my research project, with most of the available parameters, can be seen below and in config/thermostability.json.
+`pySAR` works mainly via JSON configuration files. There are many different customisable parameters for the functionalities in `pySAR` including the metaparameters of some of the available protein descriptors, all Digital Signal Processing (DSP) parameters in the `pyDSP` module, the type of regression model to use and parameters specific to the dataset - a description of each parameter is available in the [CONFIG.md][config] file.
+
+These config files offer a more straightforward way of making any changes to the `pySAR` pipeline. The names of **all** the parameters as listed in the example config files must remain unchanged - only the value of each parameter should be changed, and any parameters not being used can be set to null. 
Additionally, you can pass in the individual parameter names and values to the `pySAR` and `Encoding` classes when numerically encoding the protein sequences via **kwargs**. An example of the config file used in my research project ([thermostability.json](https://github.com/amckenna41/pySAR/blob/master/config/thermostability.json)), with most of the available parameters, can be seen below and in the example config file - [CONFIG.md][config]. ```json { @@ -99,7 +101,7 @@ Usage }, "descriptors": { - "descriptors_csv": "descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag":30, @@ -124,18 +126,23 @@ Usage } } ``` -
Encoding using all 566 AAIndex indices:
-Encoding protein sequences in dataset using all 566 indices in the AAI database. Each sequence encoded via an index in the AAI can be passed through an additional step where its protein spectra can be generated following an FFT. `pySAR` supports generation of the power, imaginary, real or absolute spectra as well as other DSP functionalities including windowing, convolution and filter functions. In the example below, the encoded sequences will be used to generate a imaginary protein spectra with a blackman window function applied. This will then be used as feature data to build a predictive model that can be used for accurate prediction of the sought activity value of unseen protein sequences. The encoding class also takes only the JSON config file as input which will have all the required parameter values. The output results will show the calculated metric values for each index in the AAI when measuring predicted vs observed activity values for the unseen test sequences.
+### Examples + +
Encoding protein sequences using all 566 AAIndex indices:

+Encoding protein sequences in the dataset using all 566 indices in the AAI1 database. Each sequence encoded via an index in the AAI can be passed through an additional step where its protein spectra can be generated following an FFT. pySAR supports generation of the power, imaginary, real or absolute spectra as well as other DSP functionalities including windowing and filter functions.

+
+In the example below, the encoded sequences will be used to generate an imaginary protein spectrum with a blackman window function applied. This will then be used as feature data to build a predictive regression ML model that can be used for accurate prediction of the sought activity value (thermostability) of unseen protein sequences. The Encoding class also takes the JSON config file as input which will have all the required parameter values. The output results will show the calculated metric values for each index in the AAI when measuring predicted vs observed activity values for the unseen test sequences.
```python +#import encoding module from pySAR.encoding import * -'''test_config.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset1.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -147,18 +154,21 @@ from pySAR.encoding import * { "use_dsp": 1, "spectrum": "imaginary", - "window": "blackman" + "window": { + "type": "blackman" + } } } ''' #create instance of Encoding class, using RF algorithm with its default params -encoding = Encoding(config_file='test_config.json') +encoding = Encoding(config_file='thermostability.json') #encode sequences using all indices in the AAI if input parameter "aai_indices" is empty/None aai_encoding = encoding.aai_encoding() ``` -Output results showing AAI index and its category as well as all the associated metric values for each predictive model: +Output results showing AAI index and its category as well as all the associated metric values for each predictive model. From the results below we can determine that the **CHOP780206** index in the AAI has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of new unseen sequences: + | | Index | Category | R2 | RMSE | MSE | RPD | MAE | Explained Var | |---:|:-----------|:-----------|---------:|--------:|--------:|--------:|--------:|----------------:| | 0 | CHOP780206 | secondary_struct | 0.62737 | 3.85619 | 14.8702 | 1.63818 | 3.16755 | 0.713467 | @@ -169,17 +179,18 @@ Output results showing AAI index and its category as well as all the associated
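The encoding functions return their results as a pandas DataFrame, so the output above can be queried directly to shortlist the best-performing indices - a minimal sketch, assuming the column names match the output table above:

```python
#rank the AAI encoding results by R2 score and inspect the top candidates -
#aai_encoding is assumed to be the results DataFrame from the example above,
#with the same column names as the output table
top_indices = aai_encoding.sort_values(by="R2", ascending=False).head(3)
print(top_indices[["Index", "Category", "R2", "RMSE"]])
```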
Encoding using list of 4 AAI indices, with no DSP functionalities:
-Same procedure as prior, except 4 indices from the AAI are being specifically input into the function, with the encoded sequence output being concatenated together and used as feature data to build the predictive PlsRegression model with its default parameters. The config parameter use_dsp tells the function to not generate the protein spectra or apply any additional DSP processing to the sequences.
+This method follows a similar procedure as the previous step, except 4 indices from the AAI are being specifically input into the function, with the encoded sequence output being concatenated together and used as feature data to build the predictive PLSRegression model with its default parameters. The config parameter use_dsp tells the function to not generate the protein spectra or apply any additional DSP processing to the sequences.
```python +#import encoding module from pySAR.encoding import * -'''test_config2.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset2.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -195,13 +206,14 @@ from pySAR.encoding import * } ''' #create instance of Encoding class, using PLS algorithm with its default params -encoding = Encoding(config_file='test_config2.json') +encoding = Encoding(config_file='thermostability.json') #encode sequences using 4 indices specified by user, use_dsp = False -aai_encoding = encoding.aai_encoding(aai_list=["PONP800102","RICJ880102","ROBB760107","KARS160113"]) +aai_encoding = encoding.aai_encoding(aai_indices=["PONP800102","RICJ880102","ROBB760107","KARS160113"]) ``` -Output DataFrame showing the 4 predictive models built using the PLS algorithm, with the 4 indices from the AAI: +Output DataFrame showing the 4 predictive models built using the PLS algorithm, with the 4 indices from the AAI. From the results below we can determine that the **PONP800102** index in the AAI has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of unseen sequences: + | | Index | Category | R2 | RMSE | MSE | RPD | MAE | Explained Var | |---:|:-----------|:------------|---------:|--------:|---------:|--------:|--------:|----------------:| | 0 | PONP800102 | hydrophobic | 0.74726 | 3.0817 | 9.49688 | 1.98913 | 2.63742 | 0.751032 | @@ -211,18 +223,19 @@ Output DataFrame showing the 4 predictive models built using the PLS algorithm,
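Once a promising index has been identified from these results, a single predictive model can be built from it via the main PySAR class - a short sketch, assuming the encode_aai function accepts the same aai_indices parameter as the encode_aai_descriptor function shown later in this readme:

```python
#import pySAR module
from pySAR.pySAR import *

#create instance of PySAR class, inputting path to configuration file
pySAR = PySAR(config_file="thermostability.json")

#build and evaluate a single model using the best index found above -
#encode_aai and its aai_indices parameter are assumed here to mirror encode_aai_descriptor
results_df = pySAR.encode_aai(aai_indices="PONP800102")
```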
-
Encoding protein sequences using their calculated protein descriptors:
-Calculate the protein descriptor values for a dataset of protein sequences from the 15 available descriptors in the descriptors module. Use each descriptor as a feature set in the building of the predictive models used to predict the activity value of unseen sequences. By default, the function will look for a csv file pointed to by the "descriptors_csv" parameter in the config file that contains the pre-calculated descriptor values for a dataset. If file is not found then all descriptor values will be calculated for the dataset using the descriptors_ module. If a descriptor in the config file is to be used in the feature data, its parameter is set to true/1.
+
Encoding protein sequences using all available protein descriptors:
+Calculate the protein descriptor values for a dataset of protein sequences from the 15 available descriptors in the descriptors module. Use each descriptor as a feature set in the building of the predictive ML models used to predict the activity value of unseen sequences. By default, the function will look for a csv file pointed to by the "descriptors_csv" parameter in the config file that contains the pre-calculated descriptor values for a dataset. If file is not found then all descriptor values will be calculated for the dataset using the descriptors module and custom-built protpy package. ```python +#import encoding module from pySAR.encoding import * -'''test_config3.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset3.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -235,7 +248,7 @@ from pySAR.encoding import * }, "descriptors": { - "descriptors_csv": "precalculated_descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -247,35 +260,36 @@ from pySAR.encoding import * } ''' #create instance of Encoding class using AdaBoost algorithm, using 100 estimators & a learning rate of 1.5 -encoding = Encoding(config_file='test_config3.json') +encoding = Encoding(config_file='thermostability.json') -#building predictive models using all available descriptors -# calculating evaluation metrics values for models and storing into desc_results_df DataFrame +#building predictive models using all available descriptors, calculating evaluation metrics values for +# models and storing into desc_results_df DataFrame desc_results_df = encoding.descriptor_encoding() - ``` -Output results showing the protein descriptor and its group as well as all the associated metric values for each predictive model: +Output results showing the protein descriptor and its group as well as all the associated metric values for each predictive model. From the results below we can determine that the **CTD Distribution** descriptor has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of unseen sequences: + | | Descriptor | Group | R2 | RMSE | MSE | RPD | MAE | Explained Var | |---:|:------------------------|:----------------|---------:|--------:|--------:|--------:|--------:|----------------:| -| 0 | _distribution | CTD | 0.721885 | 3.26159 | 10.638 | 1.89621 | 2.60679 | 0.727389 | -| 1 | _geary_autocorrelation | Autocorrelation | 0.648121 | 3.67418 | 13.4996 | 1.68579 | 2.82868 | 0.666745 | -| 2 | _tripeptide_composition | Composition | 0.616577 | 3.3979 | 11.5457 | 1.61496 | 2.53736 | 0.675571 | -| 3 | _aa_composition | Composition | 0.612824 | 3.37447 | 11.3871 | 1.60711 | 2.79698 | 0.643864 | +| 0 | ctd_d | CTD | 0.721885 | 3.26159 | 10.638 | 1.89621 | 2.60679 | 0.727389 | +| 1 | geary_autocorrelation | Autocorrelation | 0.648121 | 3.67418 | 13.4996 | 1.68579 | 2.82868 | 0.666745 | +| 2 | tripeptide_composition | Composition | 0.616577 | 3.3979 | 11.5457 | 1.61496 | 2.53736 | 0.675571 | +| 3 | amino_acid_composition | Composition | 0.612824 | 3.37447 | 11.3871 | 1.60711 | 2.79698 | 0.643864 | | 4 | ...... | ...... | ...... | ...... | ...... | ...... | ...... | ...... |
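Descriptors can also be evaluated in combination rather than one at a time - a sketch, assuming descriptor_encoding accepts the same desc_combo parameter that aai_descriptor_encoding supports, where desc_combo=2 concatenates and evaluates all pairs of descriptors:

```python
#build models from every combination of 2 descriptors rather than each descriptor
#individually - the desc_combo parameter here is an assumption, mirroring its
#usage with aai_descriptor_encoding
desc_combo_results_df = encoding.descriptor_encoding(desc_combo=2)
```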
Encoding using AAI + protein descriptors:
-Encoding protein sequences in dataset using all 566 indices in the AAI database combined with protein descriptors. All 566 indices can be used in concatenation with 1, 2 or 3 descriptors. E.g: at each iteration the encoded sequences using the indices from the AAI will be used to generate a protein spectra using the power spectrum with no window function applied, this will then be combined with the feature set generated from the dataset's descriptor values and used to build a predictive model that can be used for accurate prediction of the sought activity value of unseen protein sequences. The output results will show the calculated metric values when measuring predicted vs observed activity values for the test sequences.
+Encoding protein sequences in the dataset using ALL 566 indices in the AAI database combined with ALL available protein descriptors. All 566 indices can be used in concatenation with 1, 2 or 3 descriptors. At each iteration the encoded sequences generated from the indices from the AAI will be combined with the feature set generated from the dataset's descriptor values and used to build a predictive regression ML model that can be used for the accurate prediction of the sought activity/fitness value of unseen protein sequences. The output results will show the calculated metric values when measuring predicted vs observed activity values for the test sequences.
```python +#import encoding module from pySAR.encoding import * -'''test_config4.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset4.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -290,7 +304,7 @@ from pySAR.encoding import * }, "descriptors": { - "descriptors_csv": "precalculated_descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -301,22 +315,21 @@ from pySAR.encoding import * }, "pyDSP": { - "use_dsp": 1, + "use_dsp": 0, "spectrum": "power", "window": "" ... } } ''' -#create instance of Encoding class using RF algorithm, using 100 estimators with a learning rate of 1.5 -encoding = Encoding('test_config4.json') - -#building predictive models using all available aa_indices + combination of 2 descriptors, -# calculating evaluation metric values for models and storing into aai_desc_results_df DataFrame -aai_desc_results_df = encoding.aai_descriptor_encoding(desc_combo=2) +#create instance of Encoding class using RF algorithm, using 100 estimators with a learning rate of 1.5 - as listed in config +encoding = Encoding('thermostability.json') +#building predictive models using all available aa_indices + descriptors, calculating evaluation metric values for models and storing into aai_desc_results_df DataFrame +aai_desc_results_df = encoding.aai_descriptor_encoding() ``` -Output results showing AAI index and its category, the protein descriptor and its group as well as the R2 and RMSE values for each predictive model: + +Output results showing AAI index and its category, the protein descriptor and its group as well as all output metric values for each predictive model. From the results below we can determine that the **ARGP820103** index in concatenation with the **Conjoint Triad** descriptor has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of unseen sequences: | | Index | Category | Descriptor | Descriptor Group | R2 | RMSE | |---:|:-----------|:------------|:---------------------------|:---------------------|---------:|--------:| @@ -327,18 +340,19 @@ Output results showing AAI index and its category, the protein descriptor and it | 4 | ..... | ..... | ..... | ..... | ..... | ..... |
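To combine each AAI index with every pair (or triple) of descriptors instead of single descriptors, the desc_combo parameter can be passed to the same function:

```python
#building predictive models using all available aa_indices + combination of 2 descriptors,
#calculating evaluation metric values for models and storing into aai_desc_results_df DataFrame
aai_desc_results_df = encoding.aai_descriptor_encoding(desc_combo=2)
```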
-
Building predictive model from AAI and protein descriptors:
-e.g: the below code will build a PlsRegression model using the AAI index CIDH920105 and the 'amino acid composition' descriptor. The index is passed through a DSP pipeline and is transformed into its informational protein spectra using the power spectra, with a hamming window function applied to the output of the FFT. The concatenated features from the AAI index and the descriptor will be used as the feature data in building the PLS model.
+
Building predictive model from subset of AAI and protein descriptors:

+The below code will build a PLSRegression model using the AAI index CIDH920105 and the amino acid composition descriptor. The index is passed through a DSP pipeline and is transformed into its informational protein spectrum using the power spectrum, with a hamming window function applied to the output of the FFT. The concatenated features from the AAI index and the descriptor will be used as the feature data in building the PLS ML model. This model is then used to assess its predictability by testing on unseen test sequences. The output results will show the calculated metric values when measuring predicted vs observed activity values for the test sequences.
```python -import pySAR as pysar #import pySAR package +#import pySAR module +from pySAR.pySAR import * -'''test_config5.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset5.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... }, "model": @@ -349,7 +363,7 @@ import pySAR as pysar #import pySAR package }, "descriptors": { - "descriptors_csv": "precalculated_descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -367,30 +381,73 @@ import pySAR as pysar #import pySAR package } } ''' -#create instance of PySAR class -pySAR = pysar.PySAR(config_file="test_config5.json") -""" -PySAR parameters: +#create instance of PySAR class, inputting path to configuration file +pySAR = PySAR(config_file="thermostability.json") + +#encode protein sequences using both the CIDH920105 index + aa_composition descriptor +results_df = pySAR.encode_aai_descriptor(aai_indices="CIDH920105", descriptors="amino_acid_composition") +``` + +Output results showing AAI index and its category, the protein descriptor and its group as well as the metric values for the generated predictive model. From the results below we can determine that the **CIDH920105** index in concatenation with the **Amino Acid Composition** descriptor has medium predictability (R2 score) but a high error rate (MSE/RMSE) for our chosen dataset (thermostability) and this feature set combination is not that effective for predicting the thermostability of unseen sequences: + +```python +########################################################################################## +###################################### Parameters ######################################## + +# AAI Indices: CIDH920105 +# Descriptors: amino_acid_composition +# Configuration File: thermostability_config.json +# Dataset: thermostability.txt +# Number of Sequences/Sequence Length: 261 x 466 +# Target Activity: T50 +# Algorithm: PLSRegression +# Model Parameters: {'copy': True, 'max_iter': 500, 'n_components': 2, 'scale': True, +#'tol': 1e-06} +# Test Split: 0.2 +# Feature Space: (261, 486) + +########################################################################################## +######################################## Results ######################################### + +# R2: 0.6720111107323943 +# RMSE: 3.7522525079464457 +# MSE: 14.079398883390391 +# MAE: 3.0713217158459805 +# RPD 1.7461053136208489 +# Explained Variance 0.6721157080699659 + +########################################################################################## +``` +
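The AAI indices and protein descriptors can also each be used on their own when building a single model - a short sketch, assuming the encode_aai and encode_descriptor functions accept the same parameters as encode_aai_descriptor above:

```python
#build and evaluate a model using only the CIDH920105 index - encode_aai is
#assumed to accept the same aai_indices parameter as encode_aai_descriptor
aai_results_df = pySAR.encode_aai(aai_indices="CIDH920105")

#build and evaluate a model using only the amino acid composition descriptor -
#encode_descriptor is assumed to accept the same descriptors parameter
desc_results_df = pySAR.encode_descriptor(descriptors="amino_acid_composition")
```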
-:config_file : str - full path to config file containing all required pySAR parameters. +
Calculate individual descriptor values, e.g. Tripeptide Composition and Geary Autocorrelation:

The individual protein descriptor values for the dataset of protein sequences can be calculated using the custom-built protpy package via the descriptors module. The full list of descriptors can be seen via the function all_descriptors_list() as well as on the protpy repo homepage.

```python
#import descriptors class
from pySAR.descriptors import *

#create instance of descriptors class
desc = Descriptors(config_file="thermostability.json")

#calculate tripeptide composition descriptor
tripeptide_composition = desc.get_tripeptide_composition()

#calculate geary autocorrelation descriptor
geary_autocorrelation = desc.get_geary_autocorrelation()
```
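To see which descriptors are available before calculating any of them, the full list mentioned above can be printed out - a minimal sketch, assuming all_descriptors_list() is exposed via the Descriptors instance:

```python
#print the full list of available protein descriptors - all_descriptors_list()
#is assumed here to be accessible from the Descriptors instance created above
print(desc.all_descriptors_list())
```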
Calculate and export all protein descriptors:

-Prior to evaluating the various available properties and features at which to encode a set of protein sequences, it is reccomened that you pre-calculate all the available descriptors in one go, saving them to a csv for later that `pySAR` will then import from. Output values are stored in csv set by descriptors_csv config parameter. Output will be of the shape N x M, using the default parameters, where N is the number of protein sequences in the dataset and M is the total number of features calculated from all 15 descriptors which varies depending on some descriptor-specific metaparameters.

+Prior to evaluating the various available properties and features at which to encode a set of protein sequences, it is recommended that you pre-calculate all the available descriptors in one go, saving them to a csv that pySAR can later import from. Output values are stored in a csv set by the descriptors_csv config parameter (the name of the exported csv can also be passed into the function via the descriptors_export_filename parameter). Output will be of the shape N x M, where N is the number of protein sequences in the dataset and M is the total number of features calculated from all 15 descriptors, which varies depending on some descriptor-specific metaparameters. For example, using the thermostability dataset, the output will be 261 x 9714.
```python -'''test_config6.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset5.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... }, "model": @@ -399,7 +456,7 @@ Prior to evaluating the various available properties and features at which to en } "descriptors": { - "descriptors_csv": "precalculated_descriptors", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -414,19 +471,19 @@ Prior to evaluating the various available properties and features at which to en } } ''' -from pySAR.descriptors import * #import descriptors class +#import descriptors class +from pySAR.descriptors import * #create instance of descriptors class -desc = Descriptors(config_file="test_config6") - -#calculating all descriptor values and exporting to file named by parameter descriptors_csv -desc.get_all_descriptors(export=True) +desc = Descriptors(config_file="thermostability.json") +#export all descriptors to csv using parameters in config, export=True will export to csv +desc.get_all_descriptors(export=True, descriptors_export_filename="descriptors_thermostability.csv") ```
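Once exported, the csv is re-used on subsequent runs - pySAR imports it automatically when the descriptors_csv config parameter points at the file - and it can also be inspected directly, e.g. with pandas:

```python
#sanity check the exported descriptors csv - for the thermostability dataset
#the expected shape is 261 x 9714, as noted above
import pandas as pd

all_descriptors = pd.read_csv("descriptors_thermostability.csv")
print(all_descriptors.shape)
```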
Get record from AAIndex database:
-A custom-built package called `aaindex` was created for this project to work with all the data in the AAIndex databases, primarily the aaindex1. The AAIndex library offers diverse functionalities for obtaining all data from all records within the aaindex1. Each record is stored in json format and can be retrieved via its accession number. Each record contains the following attributes: description, references, category, notes, correlation coefficient, pmid and values.
+A custom-built package called aaindex was created for this project to work with all the data in the AAIndex databases, primarily the aaindex1. The AAIndex library offers diverse functionalities for obtaining all data from all records within the aaindex1. Each record is stored in json format and can be retrieved via its accession number, and can also be searched via its name/description. Each record contains the following attributes: description, references, category, notes, correlation coefficient, pmid and values.
```python from aaindex import aaindex1 @@ -443,17 +500,22 @@ values = aaindex1['CHOP780206'].values #get amino acid values from record num_record = aaindex1.num_records() #get total number of records record_names = aaindex1.record_names() #get list of all record names amino_acids = aaindex1.amino_acids() #get list of all canonical amino acids +records = aaindex1.search("hydrophobicity") #get all records with hydrophobicity in their title/description ```
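A record's per-amino-acid values can also be used to manually encode a sequence, which is essentially the first step of the AAI encoding strategy used throughout this readme - a minimal sketch using the aaindex1 API above, and assuming the values attribute is a dict keyed by amino acid letter:

```python
#manually encode a short (hypothetical) protein sequence with one AAI record -
#values is assumed to be a dict mapping each amino acid letter to its index value
sequence = "MTIKEMPQPK"
chop_values = aaindex1['CHOP780206'].values
encoded_sequence = [chop_values[aa] for aa in sequence]
print(encoded_sequence)
```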
-Directories -=========== +Directories and Files +===================== * `/config` - configuration files for the example datasets that `pySAR` has been tested with, as well as the thermostability.json config file that was used in the research. These config files should be used as a template for future datasets used with `pySAR`. +* `/data` - data files used in the research proejct including the thermostability dataset, config file and pre-calculated protein descriptors. * `/docs` - documentation for `pySAR` (pending). -* `/example_datasets` - example datasets used for the building and testing of `pySAR`, including the thermostability dataset used in the research. The format of these datasets shoould be used as a template for future datasets used with `pySAR`. +* `/example_datasets` - example datasets used for the building and testing of `pySAR`, including the thermostability dataset used in the research. The format of these datasets should be used as a template for future datasets used with `pySAR`. * `/images` - all images used throughout the repo. * `/pySAR` - source code for `pySAR` software. * `/tests` - unit and integration tests for `pySAR`. +* `pySAR_research.pdf` - published research article. +* `pySAR_research.pptx` - powerpoint demo of the software development process of pySAR. +* `CONFIG.md` - example markdown file describing each of the available parameters in the config files. Issues ====== @@ -478,11 +540,11 @@ If you have any questions or comments, please contact amckenna41@qub.ac.uk or ra License ======= -Distributed under the MIT License. See `LICENSE` for more details. +Distributed under the MIT License. See [`LICENSE`][license] for more details. References ========== -\[1\]: Mckenna, A., & Dubey, S. (2022). Machine learning based predictive model for the analysis of sequence activity relationships using protein spectra and protein descriptors. Journal of Biomedical Informatics, 128(104016), 104016. https://doi.org/10.1016/j.jbi.2022.104016 +\[1\]: Mckenna, A., & Dubey, S. (2022). Machine learning based predictive model for the analysis of sequence activity relationships using protein spectra and protein descriptors. Journal of Biomedical Informatics, 128(104016), 104016. https://doi.org/10.1016/j.jbi.2022.104016

\[2\]: Kawashima, S. and Kanehisa, M., 2000. AAindex: amino acid index database. Nucleic acids research, 28(1), pp.374-374. DOI: 10.1093/nar/27.1.368

\[3\]: Fontaine NT, Cadet XF, Vetrivel I. Novel Descriptors and Digital Signal Processing- Based Method for Protein Sequence Activity Relationship Study. Int J Mol Sci. 2019 Nov 11;20(22):5640. doi: 10.3390/ijms20225640. PMID: 31718061; PMCID: PMC6888668.

\[4\]: Cadet, F., Fontaine, N., Li, G. et al. A machine learning approach for reliable prediction of amino acid interactions and its application in the directed evolution of enantioselective enzymes. Sci Rep 8, 16757 (2018).

@@ -514,4 +576,6 @@ DOI: 10.1021/acs.jcim.0c00073

[pdf]: https://github.com/amckenna41/pySAR/blob/master/pySAR_research.pdf [ppt]: https://github.com/amckenna41/pySAR/blob/master/pySAR_demo.key [demo]: https://colab.research.google.com/drive/1hxtnf8i4q13fB1_2TpJFimS5qfZi9RAo?usp=sharing -[Issues]: https://github.com/amckenna41/pySAR/issues \ No newline at end of file +[Issues]: https://github.com/amckenna41/pySAR/issues +[license]: https://github.com/amckenna41/pySAR/blob/master/LICENSE +[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md \ No newline at end of file diff --git a/TODO.md b/TODO.md index 21ba0c8..d416c3f 100644 --- a/TODO.md +++ b/TODO.md @@ -79,7 +79,7 @@ To Do List: - [X] Could remove aaindex1 & aaindex1.json file from tests/test_data. - [X] Remove .coveralls.yml - [X] Move pySAR demo to Google Colab. -- [ ] In descriptors/readme go into more detail about what each descriptor does. +- [X] In descriptors/readme go into more detail about what each descriptor does. - [ ] Add documentation section in readme. - [X] Add Assertion message to some unit tests. - [X] Change mentions of "descriptor_paramters" to "descriptor_properties". @@ -93,14 +93,14 @@ To Do List: - [X] Reorder software metadata in setup.py to be in order of main func, create __description__ var. - [X] Add download_url to setup.py - url of zipped package. - [X] In some unit tests, may need to use self.assertAlmostEqual instead of self.assertEqual. -- [ ] Remove 'get_' from functions. +- [X] Remove 'get_' from functions. - [X] Make self.params & other dicts in pySAR accessible via dot notation. - [ ] Remove all camel casing function names/vars, change to underscores and lowercase (https://peps.python.org/pep-0008/#function-and-variable-names). - [X] Usage example using fasta import function/module. - [X] Use Map class to allow for config file and parameters accessible via dot notation. - [X] In config files, change 'comp' to 'composition' - [X] Change protpy.aa_composition -> protpy.amino_acid_composition -- [ ] Add normalize parameter to each autocorrelation func & config. +- [X] Add normalize parameter to each autocorrelation func & config. - [X] Change all references of normalized_moreaubroto_autocorrelation to moreaubroto_autocorrelation - [X] Change 'Amp' -> 'amphiphilic' - config.md - [X] Change seq_order_... -> sequence_order... , quasi_seq_order -> quasi_sequence_order in config.md. @@ -118,12 +118,12 @@ To Do List: - [X] Prepend 'ctd_' to the ctd descriptors names, attributes and function names. - [ ] Add reference numbers to comments in descriptor functions - double check existing ones are correct, reorder them. - [X] Change QSOrder to QSO. -- [ ] Rewrite APAAComp descriptor comments to mention its dimensions change with lamda. +- [X] Rewrite APAAComp descriptor comments to mention its dimensions change with lamda. - [X] Wrap all if statements in brackets. - [X] Remove convolution from pyDSP and config. - [X] Move Map class to utils. - [X] Ensure all functions have Parameters and Returns in the comments, even if they are None -- [ ] Add filter function to pyDSP. +- [X] Add filter function to pyDSP. - [X] Change self.spectra to self.spectrum in pyDSP. - [X] In function comments change default = X to default= - [X] For pseudo and amp composition, only test on 1 dataset as takes to long with all of them. @@ -139,7 +139,7 @@ To Do List: - [X] Change aaindex column names from incrementing numbers to - "aa_1", "aa_2" ... - [X] Update aai encoding unit tests to take new naming convention into consideration. 
- [X] Ensure output from encoding funcs is DF not a Series. -- [ ] Remove "Getting X Descriptor" etc? +- [X] Remove "Getting X Descriptor" etc? - [ ] Rename software from pySAR -> pysar. - [X] Go over import_descriptors func. - [X] In test_descriptors, check if double import of descriptors module is needed. @@ -149,7 +149,7 @@ To Do List: - [X] Rerun get all descriptors func on colab to take into account new conjoint triad and CTD column names. - [X] If ["ctd"]["all"] = true this calculates ALL CTD descriptors for all 7 properties, if not true then CTD descriptors are calculated individually. - [X] Remove ctd_comp, distr, trans descriptors, just use parent CTD descriptors and slice from it. -- [ ] Python unit tests using ctd with 1 property, and using all properties, check dimensions - 21 vs 147 (147/21=7). 21 dimensions per property. 3 C, 3 T, 15 D. +- [X] Python unit tests using ctd with 1 property, and using all properties, check dimensions - 21 vs 147 (147/21=7). 21 dimensions per property. 3 C, 3 T, 15 D. - [X] Add spaces to test config files. - [ ] SOCN tests with distance matrix in config empty & non-empty, different SOCN functions. - [X] def quasi_sequence_order() - dimesnion (1,lag). def quasi_sequence_order_all() - dimension (1,lag*2) @@ -163,7 +163,7 @@ To Do List: - [X] Change self.activity -> self.activity_col, set self.activity to the actual column data. - [X] Change all references to config_path to config_file, including dsp_config. - [X] in aai_encoding func in Encoding, reorder columns such that MAE is before RPD. -- [ ] when testing desc and aai + desc endoing, use test config with and without pre-calcualted descriptors csv. +- [X] when testing desc and aai + desc endoing, use test config with and without pre-calcualted descriptors csv. - [X] Ensure example_datasets isnt in software packaging. - [X] Pretty print json when printing parameters in Encoding functions. - [X] Sort by for RMSE and MSE incorrect, smallest values should be first, largest values last. Sort asc instead of sort desc. @@ -177,7 +177,7 @@ To Do List: - [X] CTD columns are repeating twice in output csv. - [X] Remove property key from ctd_comp, ctd_distr, ctd_trans and from config.md - [X] Double check concatenated AAI columns have prefix aai_. Test this in test_encoding unit tests. -- [ ] Only generate and or upload coverage report for one Python version in workflows. +- [X] Only generate and or upload coverage report for one Python version in workflows. - [X] Input X and Y into Model class, initialise in constructor. - [X] Change test_size param to test_split. - [ ] Best params is empty when outputting hyperparameter results. Use default params if params in config is {}. @@ -186,15 +186,15 @@ To Do List: - [ ] Add results from research folder to Google Drive, mention in Research Article section. Mention pre-calculated descriptors from same section. - [X] Remove 2 distance matrices from pySAR/data, now a part of protpy package. - [X] Remove manifest file after removal of pySAR/data. -- [ ] Upload pySAR demo as ppt rather than .key. +- [X] Upload pySAR demo as ppt rather than .key. - [X] Double check what happens when dict not passed into Map class, should error be rasied? Reflect change in aaindex. - [X] Remove get_protein module and references to it. - [X] Add circleci badge back into repo now that it's sorta working. - [X] In hyperparameter tuning results change CV to Number of cross-validation folds etc. - [X] Less verbose output for hyperparameter tuning. 
-- [ ] __str__ of Desscriptor class displays all descriptor names and shapes. +- [X] __str__ of Desscriptor class displays all descriptor names and shapes. - [X] Remove "descriptors" from config, move csv param to "desc_properties", rename desc_properties -> descriptors. -- [X] Organise config, newline for [ and { +- [X] Organise config, newline for [] and {}. - [X] Change all references of lamda to lambda. - [X] Remove cutoff index. - [ ] Unit test desc_combo in test_descriptor @@ -202,11 +202,11 @@ To Do List: - [X] If less than 10 AAI Indices or Descriptors being encoded then print out else dont. Slight error when erroneauous index input this still outputs. Also model_parameters is empty. - [X] Finish encoding terminal outputs from desc and aai + desc. - [X] Check columns generated from aai_encoding follow format aai_X. -- [ ] Unit test columns follow format aai_X... +- [X] Unit test columns follow format aai_X... - [X] In utils.save_results, double check that input parameter doesnt already have an extension on it. - [ ] Complete test_model feature_selection unit tests. - [X] Remove rfft from pyDSP. -- [ ] Finish window and filter unit tests pyDSP. +- [X] Finish window and filter unit tests pyDSP. - [X] pyDSP encode_seqs(), window <> window_type - [X] for aai_desc_encoding in pySAR.py, check list of indices is split up into str. - [X] Test export of results: test output folder is created, import csv, double check columns, length etc, delete folder. @@ -234,7 +234,7 @@ To Do List: - [X] Go through all unit tests, any tests that are wrapped in with.selfAssertRaises()... , remove var assignment and just call function. [Back to top](#TOP) -- [ ] Mention that individual descriptors are explaiend in the protpy package. Mention protpy in pySar demo. +- [X] Mention that individual descriptors are explaiend in the protpy package. Mention protpy in pySar demo. - [X] Use **kwargs in class contstructor to be able to pass in specific parameter values, override the config file, if applicable. - [X] Change all config files to not use_dsp by default. - [X] Change all comment underlining from "------" to "=======". @@ -249,14 +249,14 @@ To Do List: - [X] For pysar.encode_descriptor, pysar.encode_aai and pysar.encode_aai_descriptor functions, there doesnt seem to be any functionality to support list of indices and or descriptors atm. - [X] Encoding functions in pySAR used for concatenating multiple descriptors etc. - [X] Encoding functions in Encoding used for encoding multiple descriptors seperately. -- [ ] For descriptor concatenations, maybe have a concat flag that if set to True will concat the multiple descriptors inoput. -- [ ] Read over and update comments. +- [X] For descriptor concatenations, maybe have a concat flag that if set to True will concat the multiple descriptors inoput. +- [X] Read over and update comments. - [X] In encoding.py functions, if the same index/descriptor is put in twice, ensure it isn't duplicated. - [X] Order indices alphabetically. - [X] Some test outputs when displaying list of parameters have "invalid_aaindex_code" or "invalid_descriptor_name" - [X] Disable tqdm using disbale flag if less than 5 or so AAI indices being calcualted. - [X] Return error if invalid aai indices/descriptors - don't print out parameters text if invalid. -- [ ] Go over files and folders in pypi package, remove tests. +- [X] Go over files and folders in pypi package, remove tests. - [X] Add feature space dimensions - add unit tests. 
- [X] After encoding in pysar.py check class variables have been set. - [X] aai_indices = ["MUNV940104", "ZASB820101"] / aai_descriptor_encoding = pysar.encode_aai_descriptor(aai_indices=aai_indices, descriptors="sequence_order_coupling_number") - puts Index output in []. @@ -264,4 +264,8 @@ To Do List: - [X] Remove textwrapper, change to textwrap.fill - [X] Reorder parameters, have test split at bottom fo encoding parameters text - [X] Add config file to list of parameters in output. -- [ ] Do i need additional pydsp parameter checks which are already in pYSAR. \ No newline at end of file +- [X] Mention number of tests and test cases in /tests readme - 51 tests, 6 test cases. +- [X] Recalculate and reupload descriptors_thermostability.csv. +- [X] Add info about the colunns and dimensions of each descriptors in pre-calculated csv file - fix Issue. +- [X] When calculating all descriptors (get_all_descriptors(export=True)), add some sort of print/tracking functionality. +- [X] Double check all links in readme. \ No newline at end of file diff --git a/config/README.md b/config/README.md index c0e0fc3..faff977 100644 --- a/config/README.md +++ b/config/README.md @@ -1,4 +1,4 @@ -# PySAR: Example Configuration Files +# pySAR: Example Configuration Files * `thermostability.json` - configuration file for using pySAR with the thermostability dataset studied in the research and in the /data folder. * `absorption.json` - configuration file for using pySAR with the absorption example dataset in the /example_datasets folder. diff --git a/config/absorption.json b/config/absorption.json index 4711cca..5c8abc4 100644 --- a/config/absorption.json +++ b/config/absorption.json @@ -1,7 +1,7 @@ { "dataset": { - "dataset": "absorption.txt", + "dataset": "example_datasets/absorption.txt", "sequence_col": "sequence", "activity": "peak" }, diff --git a/config/enantioselectivity.json b/config/enantioselectivity.json index 7cd249c..4d2c718 100644 --- a/config/enantioselectivity.json +++ b/config/enantioselectivity.json @@ -1,7 +1,7 @@ { "dataset": { - "dataset": "enantioselectivity.txt", + "dataset": "example_datasets/enantioselectivity.txt", "sequence_col": "sequence", "activity": "e-value" }, diff --git a/config/localization.json b/config/localization.json index 6bf7c5a..f8ea07f 100644 --- a/config/localization.json +++ b/config/localization.json @@ -1,7 +1,7 @@ { "dataset": { - "dataset": "localization.txt", + "dataset": "example_datasets/localization.txt", "sequence_col": "sequence", "activity": "log_GFP" }, diff --git a/config/thermostability.json b/config/thermostability.json index 3d29ec3..98c5bdb 100644 --- a/config/thermostability.json +++ b/config/thermostability.json @@ -1,7 +1,7 @@ { "dataset": { - "dataset": "thermostability.txt", + "dataset": "data/thermostability.txt", "sequence_col": "sequence", "activity": "T50" }, diff --git a/data/README.md b/data/README.md index 66d2bfe..fc3f292 100644 --- a/data/README.md +++ b/data/README.md @@ -1,15 +1,31 @@ # Data used in pySAR research project Usage -===== -pySAR imports the dataset declared within the configuration file (thermostability.txt) from this data directory as well as the pre-calculated descriptor values csv (descriptors_thermostability.csv), if applicable, which is also instantiated in the config file. An error will throw if the dataset and or descriptors csv is not found within this data directory. Please refer to the CONFIG.md file of where to declare the two aforementioned parameters in the config file. 
+-----
+pySAR imports the dataset declared within the configuration file (thermostability.txt) from this data directory as well as the pre-calculated descriptor values csv (descriptors_thermostability.csv), if applicable, which is also declared in the config file. An error will be thrown if the dataset and/or descriptors csv is not found within this data directory. Please refer to the [CONFIG.md][config] example file for where to declare the two aforementioned parameters in the config file.

Data
-====
-* `thermostability.txt` - dataset studied in the associated work which consists of a dataset to measure the thermostability of various mutants from a recombination library designed from parental cytochrome P450's, measured using the T50 metric (temperature at which 50% of a protein is irreversibly denatured after 10 mins of incubation, ranging from 39.2 to 64.4 degrees C), which represents the protein activity of this dataset. [[1]](#references)
+----
+* `thermostability.txt` - dataset studied in the associated research which consists of a dataset to measure the thermostability of various mutants from a recombination library designed from parental cytochrome P450's, measured using the T50 metric (temperature at which 50% of a protein is irreversibly denatured after 10 mins of incubation, ranging from 39.2 to 64.4 degrees C), which represents the protein activity of this dataset. [[1]](#references)

* `thermostability.json` - configuration file for using pySAR with the thermostability dataset studied in the research.

-* `descriptors_thermostability.csv` - csv of all pre-calculated descriptors for thermostability dataset.
+* `descriptors_thermostability.csv` - csv of all pre-calculated descriptors for thermostability dataset, calculated using the descriptors module and `protpy` package [[2]](#references). The dimensions for the csv file are 261 x 9714 (261 protein sequences with 9714 features), and it uses the default parameters for any descriptor that has metaparameters (autocorrelation, sequence order and pseudo composition). Calculating all available descriptors took ~78 minutes. The columns and dimensions of each descriptor are outlined below:
+
+* Amino Acid Composition - [0:20] (A,C,D,E...)
+* Dipeptide Composition - [20:420] (AA,AC,AD,AE...)
+* Tripeptide Composition - [420:8420] (AAA,AAC,AAD,AAE...)
+* MoreauBroto Autocorrelation - [8420:8660] (MBAuto_CIDH920105_1,MBAuto_CIDH920105_2,MBAuto_CIDH920105_3,MBAuto_CIDH920105_4...)
+* Moran Autocorrelation - [8660:8900] (MAuto_CIDH920105_1,MAuto_CIDH920105_2,MAuto_CIDH920105_3,MAuto_CIDH920105_4...)
+* Geary Autocorrelation - [8900:9140] (GAuto_CIDH920105_1,GAuto_CIDH920105_2,GAuto_CIDH920105_3,GAuto_CIDH920105_4...)
+* CTD - [9140:9161] (CTD_C_01_hydrophobicity,CTD_C_02_hydrophobicity,CTD_C_03_hydrophobicity,CTD_T_12_hydrophobicity...)
+* Conjoint Triad - [9161:9504] (conj_triad_111,conj_triad_112,conj_triad_113,conj_triad_114...)
+* Sequence Order Coupling Number - [9504:9534] (SOCN_SW1,SOCN_SW2,SOCN_SW3,SOCN_SW4...)
+* Quasi Sequence Order - [9534:9584] (QSO_SW1,QSO_SW2,QSO_SW3,QSO_SW4...)
+* Pseudo Amino Acid Composition - [9584:9634] (PAAC_1,PAAC_2,PAAC_3,PAAC_4...)
+* Amphiphilic Pseudo Amino Acid Composition - [9634:9714] (APAAC_1,APAAC_2,APAAC_3,APAAC_4...)

References
-==========
-\[1\]: Li, Y., Drummond, D. A., Sawayama, A. M., Snow, C. D., Bloom, J. D., & Arnold, F. H. (2007). A diverse family of thermostable cytochrome P450s created by recombination of stabilizing fragments. Nature Biotechnology, 25(9), 1051–1056. 
https://doi.org/10.1038/nbt1333
\ No newline at end of file +---------- +\[1\]: Li, Y., Drummond, D. A., Sawayama, A. M., Snow, C. D., Bloom, J. D., & Arnold, F. H. (2007). A diverse family of thermostable cytochrome P450s created by recombination of stabilizing fragments. Nature Biotechnology, 25(9), 1051–1056. https://doi.org/10.1038/nbt1333
+\[2\]: https://github.com/amckenna41/protpy + +[config]: https://github.com/amckenna41/pySAR/blob/master/CONFIG.md \ No newline at end of file diff --git a/example_datasets/README.md b/example_datasets/README.md index 13836a5..8aaa156 100644 --- a/example_datasets/README.md +++ b/example_datasets/README.md @@ -5,13 +5,28 @@ Datasets * `thermostability.txt` - dataset studied in the associated work which consists of a dataset to measure the thermostability of various mutants from a recombination library designed from parental cytochrome P450's, measured using the T50 metric (temperature at which 50% of a protein is irreversibly denatured after 10 mins of incubation, ranging from 39.2 to 64.4 degrees C), which represents the protein activity of this dataset [[1]](#references). -* `absorption.txt` - dataset of 80 blue and red-shifted protein variants of the Gloeobacter Violaceus rhodopsin (GR) protein that were mutated and substituted to tune its peak absorption wavelength. 1-5 mutations were generated in the course of tuning its absorption wavelength, for a total of 81 sequences, with the peak being captured as each sequence's activity ranging from values of 454 to 622 [[2]](#references). +* `absorption.txt` - dataset of 80 blue and red-shifted protein variants of the Gloeobacter Violaceus Rhodopsin (GR) protein that were mutated and substituted to tune its peak absorption wavelength. 1-5 mutations were generated in the course of tuning its absorption wavelength, for a total of 81 sequences, with the peak being captured as each sequence's activity ranging from values of 454 to 622 [[2]](#references). * `enantioselectivity.txt` - dataset consisting of 37 mutants and one WT (wild-type) sequence from the Aspergillus Niger organism and their calculated enantioselectivity. Enantioselectivity refers to the selectivity of a reaction towards one enantiomer and is expressed by the E-value with a range between 0 and 115 [[3]](#references). * `localization.txt` - dataset made up of 248 sequences made up of 2 seperate, 10-block recombination libraries that were designed from 3 parental ChR's (channelrhodopsin). Each chimeric ChR variant in these libraries consist of blocks of sequences from parental ChRs. Genes for these sequences were synthesized and expressed in human embryonic kidney (HEK) cells, and their membrane localization was measured as log_GFP ranging from values of -9.513 to 105 [[4]](#references). -* `descriptors_absorption.csv` - pre-calculated protein descriptors using sequences from absorption test dataset. -* `descriptors_enantioselectivity.csv` - pre-calculated protein descriptors using sequences from enantioselectivity test dataset. -* `descriptors_localization.csv` - pre-calculated protein descriptors using sequences from localization test dataset. +* `descriptors_absorption.csv` - pre-calculated protein descriptors using sequences from absorption test dataset. The dimensions for this csv are 81 x 9714 (81 protein sequences and 9714 features), when using default parameters as in the config file. +* `descriptors_enantioselectivity.csv` - pre-calculated protein descriptors using sequences from enantioselectivity test dataset. The dimensions for this csv are 152 x 9714 (152 protein sequences and 9714 features), when using default parameters as in the config file. +* `descriptors_localization.csv` - pre-calculated protein descriptors using sequences from localization test dataset. 
The dimensions for this csv are 254 x 9714 (254 protein sequences and 9714 features), when using default parameters as in the config file.
+
+Each of the pre-calculated descriptor CSVs has 9714 total features (when using the default parameters); the columns and dimensions of each descriptor are outlined below:
+
+* Amino Acid Composition - [0:20] (A,C,D,E...)
+* Dipeptide Composition - [20:420] (AA,AC,AD,AE...)
+* Tripeptide Composition - [420:8420] (AAA,AAC,AAD,AAE...)
+* MoreauBroto Autocorrelation - [8420:8660] (MBAuto_CIDH920105_1,MBAuto_CIDH920105_2,MBAuto_CIDH920105_3,MBAuto_CIDH920105_4...)
+* Moran Autocorrelation - [8660:8900] (MAuto_CIDH920105_1,MAuto_CIDH920105_2,MAuto_CIDH920105_3,MAuto_CIDH920105_4...)
+* Geary Autocorrelation - [8900:9140] (GAuto_CIDH920105_1,GAuto_CIDH920105_2,GAuto_CIDH920105_3,GAuto_CIDH920105_4...)
+* CTD - [9140:9161] (CTD_C_01_hydrophobicity,CTD_C_02_hydrophobicity,CTD_C_03_hydrophobicity,CTD_T_12_hydrophobicity...)
+* Conjoint Triad - [9161:9504] (conj_triad_111,conj_triad_112,conj_triad_113,conj_triad_114...)
+* Sequence Order Coupling Number - [9504:9534] (SOCN_SW1,SOCN_SW2,SOCN_SW3,SOCN_SW4...)
+* Quasi Sequence Order - [9534:9584] (QSO_SW1,QSO_SW2,QSO_SW3,QSO_SW4...)
+* Pseudo Amino Acid Composition - [9584:9634] (PAAC_1,PAAC_2,PAAC_3,PAAC_4...)
+* Amphiphilic Pseudo Amino Acid Composition - [9634:9714] (APAAC_1,APAAC_2,APAAC_3,APAAC_4...)

References
----------
diff --git a/pySAR/README.md b/pySAR/README.md
index b4de523..579695a 100644
--- a/pySAR/README.md
+++ b/pySAR/README.md
@@ -1,16 +1,16 @@
-# pySAR
+# pySAR - Python Sequence Activity Relationship

[![PyPI](https://img.shields.io/pypi/v/pySAR)](https://pypi.org/project/pySAR/)
[![Platforms](https://img.shields.io/badge/platforms-linux%2C%20macOS%2C%20Windows-green)](https://pypi.org/project/pySAR/)
[![PythonV](https://img.shields.io/pypi/pyversions/pySAR?logo=2)](https://pypi.org/project/pySAR/)
[![License: MIT](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT)
-[![Issues](https://img.shields.io/github/issues/amckenna41/pySAR)](https://github.com/amckenna41/pySAR/issues)
-[![codecov](https://codecov.io/gh/amckenna41/pySAR/branch/master/graph/badge.svg?token=4PQDVGKGYN)](https://codecov.io/gh/amckenna41/pySAR)

Usage
=====
### Config File
-`pySAR` works through JSON configuration files. There are many different customisable parameters for the functionalities in `pySAR` including the metaparameters of each of the available protein descriptors, all Digital Signal Processing (DSP) parameters in the pyDSP module, the type of regression model to use and parameters specific to the dataset. These config files offer a more straightforward way of making any changes to the `pySAR` pipeline. The names of **All** the parameters as listed in the example config files must remain unchanged, only the value of each parameter should be changed, any parameters not being used can be set to null. An example of the config file used in my research project, with most of the available parameters, can be seen below and in config/thermostability.json.
+`pySAR` works mainly via JSON configuration files. 
There are many different customisable parameters for the functionalities in `pySAR` including the metaparameters of some of the available protein descriptors, all Digital Signal Processing (DSP) parameters in the `pyDSP` module, the type of regression model to use and parameters specific to the dataset - a description of each parameter is available on the [CONFIG.md][config] file. + +These config files offer a more straightforward way of making any changes to the `pySAR` pipeline. The names of **All** the parameters as listed in the example config files must remain unchanged, only the value of each parameter should be changed, any parameters not being used can be set to null. Additionally, you can pass in the individual parameter names and values to the `pySAR` and `Encoding` classes when numerically encoding the protein sequences via **kwargs**. An example of the config file used in my research project ([thermostability.json](https://github.com/amckenna41/pySAR/blob/master/config/thermostability.json)), with most of the available parameters, can be seen below and in the example config file - [CONFIG.md][config]. ```json { @@ -28,7 +28,7 @@ Usage }, "descriptors": { - "descriptors_csv": "descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag":30, @@ -52,19 +52,24 @@ Usage } } } - -
Encoding using all 566 AAIndex indices:
-Encoding protein sequences in dataset using all 566 indices in the AAI database. Each sequence encoded via an index in the AAI can be passed through an additional step where its protein spectra can be generated following an FFT. `pySAR` supports generation of the power, imaginary, real or absolute spectra as well as other DSP functionalities including windowing, convolution and filter functions. In the example below, the encoded sequences will be used to generate a imaginary protein spectra with a blackman window function applied. This will then be used as feature data to build a predictive model that can be used for accurate prediction of the sought activity value of unseen protein sequences. The encoding class also takes only the JSON config file as input which will have all the required parameter values. The output results will show the calculated metric values for each index in the AAI when measuring predicted vs observed activity values for the unseen test sequences.
+``` +### Examples + +
Encoding protein sequences using all 566 AAIndex indices:
+Encoding protein sequences in the dataset using all 566 indices in the AAI1 database. Each sequence encoded via an AAI index can be passed through an additional step in which its protein spectra is generated following an FFT. pySAR supports generation of the power, imaginary, real or absolute spectra, as well as other DSP functionalities including windowing and filter functions.
+
+In the example below, the encoded sequences will be used to generate an imaginary protein spectra with a blackman window function applied. This will then be used as feature data to build a predictive regression ML model for accurate prediction of the sought activity value (thermostability) of unseen protein sequences. The Encoding class takes the JSON config file as input, which contains all the required parameter values. The output results will show the calculated metric values for each index in the AAI when measuring predicted vs observed activity values for the unseen test sequences.
```python +#import encoding module from pySAR.encoding import * -'''test_config.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset1.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -76,18 +81,21 @@ from pySAR.encoding import * { "use_dsp": 1, "spectrum": "imaginary", - "window": "blackman" + "window": { + "type": "blackman" + } } } ''' #create instance of Encoding class, using RF algorithm with its default params -encoding = Encoding(config_file='test_config.json') +encoding = Encoding(config_file='thermostability.json') #encode sequences using all indices in the AAI if input parameter "aai_indices" is empty/None aai_encoding = encoding.aai_encoding() ``` -Output results showing AAI index and its category as well as all the associated metric values for each predictive model: +Output results showing AAI index and its category as well as all the associated metric values for each predictive model. From the results below we can determine that the **CHOP780206** index in the AAI has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of new unseen sequences: + | | Index | Category | R2 | RMSE | MSE | RPD | MAE | Explained Var | |---:|:-----------|:-----------|---------:|--------:|--------:|--------:|--------:|----------------:| | 0 | CHOP780206 | secondary_struct | 0.62737 | 3.85619 | 14.8702 | 1.63818 | 3.16755 | 0.713467 | @@ -98,17 +106,18 @@ Output results showing AAI index and its category as well as all the associated
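Since the per-index results are returned as a pandas DataFrame, the strongest indices can be pulled out directly. A minimal sketch, assuming the metrics DataFrame uses the column names shown in the table above:

```python
#sort the encoding results by R2 score and take the 5 best-performing AAI indices
top_indices = aai_encoding.sort_values(by="R2", ascending=False).head(5)
print(top_indices[["Index", "Category", "R2", "RMSE"]])
```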
Encoding using list of 4 AAI indices, with no DSP functionalities:
-Same procedure as prior, except 4 indices from the AAI are being specifically input into the function, with the encoded sequence output being concatenated together and used as feature data to build the predictive PlsRegression model with its default parameters. The config parameter use_dsp tells the function to not generate the protein spectra or apply any additional DSP processing to the sequences.
+This method follows a similar procedure as the previous step, except 4 indices from the AAI are being specifically input into the function, with the encoded sequence output being concatenated together and used as feature data to build the predictive PLSRegression model with its default parameters. The config parameter use_dsp tells the function to not generate the protein spectra or apply any additional DSP processing to the sequences.
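As a follow-on, the best-performing index found above can be used on its own to build the final model via the `PySAR` class. A minimal sketch using the `encode_aai` function referenced elsewhere in these docs (the parameter name is assumed to match the other `PySAR` encoding functions):

```python
from pySAR.pySAR import *

#build and evaluate a model using just the top index from the results above
pySAR = PySAR(config_file='thermostability.json')
best_index_results = pySAR.encode_aai(aai_indices="PONP800102")
```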
```python +#import encoding module from pySAR.encoding import * -'''test_config2.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset2.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -124,13 +133,14 @@ from pySAR.encoding import * } ''' #create instance of Encoding class, using PLS algorithm with its default params -encoding = Encoding(config_file='test_config2.json') +encoding = Encoding(config_file='thermostability.json') #encode sequences using 4 indices specified by user, use_dsp = False -aai_encoding = encoding.aai_encoding(aai_list=["PONP800102","RICJ880102","ROBB760107","KARS160113"]) +aai_encoding = encoding.aai_encoding(aai_indices=["PONP800102","RICJ880102","ROBB760107","KARS160113"]) ``` -Output DataFrame showing the 4 predictive models built using the PLS algorithm, with the 4 indices from the AAI: +Output DataFrame showing the 4 predictive models built using the PLS algorithm, with the 4 indices from the AAI. From the results below we can determine that the **PONP800102** index in the AAI has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of unseen sequences: + | | Index | Category | R2 | RMSE | MSE | RPD | MAE | Explained Var | |---:|:-----------|:------------|---------:|--------:|---------:|--------:|--------:|----------------:| | 0 | PONP800102 | hydrophobic | 0.74726 | 3.0817 | 9.49688 | 1.98913 | 2.63742 | 0.751032 | @@ -140,18 +150,19 @@ Output DataFrame showing the 4 predictive models built using the PLS algorithm,
-
Encoding protein sequences using their calculated protein descriptors:
-Calculate the protein descriptor values for a dataset of protein sequences from the 15 available descriptors in the descriptors module. Use each descriptor as a feature set in the building of the predictive models used to predict the activity value of unseen sequences. By default, the function will look for a csv file pointed to by the "descriptors_csv" parameter in the config file that contains the pre-calculated descriptor values for a dataset. If file is not found then all descriptor values will be calculated for the dataset using the descriptors_ module. If a descriptor in the config file is to be used in the feature data, its parameter is set to true/1.
+
Encoding protein sequences using all available protein descriptors:
+Calculate the protein descriptor values for a dataset of protein sequences from the 15 available descriptors in the descriptors module. Use each descriptor as a feature set in the building of the predictive ML models used to predict the activity value of unseen sequences. By default, the function will look for a csv file pointed to by the "descriptors_csv" parameter in the config file that contains the pre-calculated descriptor values for a dataset. If file is not found then all descriptor values will be calculated for the dataset using the descriptors module and custom-built protpy package. ```python +#import encoding module from pySAR.encoding import * -'''test_config3.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset3.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -164,7 +175,7 @@ from pySAR.encoding import * }, "descriptors": { - "descriptors_csv": "precalculated_descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -176,35 +187,36 @@ from pySAR.encoding import * } ''' #create instance of Encoding class using AdaBoost algorithm, using 100 estimators & a learning rate of 1.5 -encoding = Encoding(config_file='test_config3.json') +encoding = Encoding(config_file='thermostability.json') -#building predictive models using all available descriptors -# calculating evaluation metrics values for models and storing into desc_results_df DataFrame +#building predictive models using all available descriptors, calculating evaluation metrics values for +# models and storing into desc_results_df DataFrame desc_results_df = encoding.descriptor_encoding() - ``` -Output results showing the protein descriptor and its group as well as all the associated metric values for each predictive model: +Output results showing the protein descriptor and its group as well as all the associated metric values for each predictive model. From the results below we can determine that the **CTD Distribution** descriptor has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of unseen sequences: + | | Descriptor | Group | R2 | RMSE | MSE | RPD | MAE | Explained Var | |---:|:------------------------|:----------------|---------:|--------:|--------:|--------:|--------:|----------------:| -| 0 | _distribution | CTD | 0.721885 | 3.26159 | 10.638 | 1.89621 | 2.60679 | 0.727389 | -| 1 | _geary_autocorrelation | Autocorrelation | 0.648121 | 3.67418 | 13.4996 | 1.68579 | 2.82868 | 0.666745 | -| 2 | _tripeptide_composition | Composition | 0.616577 | 3.3979 | 11.5457 | 1.61496 | 2.53736 | 0.675571 | -| 3 | _aa_composition | Composition | 0.612824 | 3.37447 | 11.3871 | 1.60711 | 2.79698 | 0.643864 | +| 0 | ctd_d | CTD | 0.721885 | 3.26159 | 10.638 | 1.89621 | 2.60679 | 0.727389 | +| 1 | geary_autocorrelation | Autocorrelation | 0.648121 | 3.67418 | 13.4996 | 1.68579 | 2.82868 | 0.666745 | +| 2 | tripeptide_composition | Composition | 0.616577 | 3.3979 | 11.5457 | 1.61496 | 2.53736 | 0.675571 | +| 3 | amino_acid_composition | Composition | 0.612824 | 3.37447 | 11.3871 | 1.60711 | 2.79698 | 0.643864 | | 4 | ...... | ...... | ...... | ...... | ...... | ...... | ...... | ...... |
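Similarly, a single descriptor from these results can be used on its own as the feature set via the `PySAR` class. A minimal sketch using the `encode_descriptor` function referenced elsewhere in these docs, with the descriptor name taken from the list of valid descriptors in the descriptors module:

```python
from pySAR.pySAR import *

#build and evaluate a model using only the CTD Distribution descriptor (ctd_d above)
pySAR = PySAR(config_file='thermostability.json')
ctd_distribution_results = pySAR.encode_descriptor(descriptors="ctd_distribution")
```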
Encoding using AAI + protein descriptors:
-Encoding protein sequences in dataset using all 566 indices in the AAI database combined with protein descriptors. All 566 indices can be used in concatenation with 1, 2 or 3 descriptors. E.g: at each iteration the encoded sequences using the indices from the AAI will be used to generate a protein spectra using the power spectrum with no window function applied, this will then be combined with the feature set generated from the dataset's descriptor values and used to build a predictive model that can be used for accurate prediction of the sought activity value of unseen protein sequences. The output results will show the calculated metric values when measuring predicted vs observed activity values for the test sequences.
+Encoding protein sequences in the dataset using ALL 566 indices in the AAI database combined with ALL available protein descriptors. All 566 indices can be used in concatenation with 1, 2 or 3 descriptors. At each iteration the encoded sequences generated from the indices from the AAI will be combined with the feature set generated from the dataset's descriptor values and used to build a predictive regression ML model that can be used for the accurate prediction of the sought activity/fitness value of unseen protein sequences. The output results will show the calculated metric values when measuring predicted vs observed activity values for the test sequences.
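The prose above notes that each index can be combined with 1, 2 or 3 descriptors; an earlier revision of this example used the `desc_combo` parameter to control this, so a sketch of encoding with pairs of descriptors (assuming that parameter is still supported):

```python
#combine each AAI index with every pairwise combination of descriptors
#(desc_combo=3 would use triples; the default of 1 uses each descriptor individually)
aai_desc_pairs_df = encoding.aai_descriptor_encoding(desc_combo=2)
```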
```python +#import encoding module from pySAR.encoding import * -'''test_config4.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset4.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... } "model": @@ -219,7 +231,7 @@ from pySAR.encoding import * }, "descriptors": { - "descriptors_csv": "precalculated_descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -230,22 +242,21 @@ from pySAR.encoding import * }, "pyDSP": { - "use_dsp": 1, + "use_dsp": 0, "spectrum": "power", "window": "" ... } } ''' -#create instance of Encoding class using RF algorithm, using 100 estimators with a learning rate of 1.5 -encoding = Encoding('test_config4.json') - -#building predictive models using all available aa_indices + combination of 2 descriptors, -# calculating evaluation metric values for models and storing into aai_desc_results_df DataFrame -aai_desc_results_df = encoding.aai_descriptor_encoding(desc_combo=2) +#create instance of Encoding class using RF algorithm, using 100 estimators with a learning rate of 1.5 - as listed in config +encoding = Encoding('thermostability.json') +#building predictive models using all available aa_indices + descriptors, calculating evaluation metric values for models and storing into aai_desc_results_df DataFrame +aai_desc_results_df = encoding.aai_descriptor_encoding() ``` -Output results showing AAI index and its category, the protein descriptor and its group as well as the R2 and RMSE values for each predictive model: + +Output results showing AAI index and its category, the protein descriptor and its group as well as all output metric values for each predictive model. From the results below we can determine that the **ARGP820103** index in concatenation with the **Conjoint Triad** descriptor has the highest predictability (R2 score) for our chosen dataset (thermostability) and this generated model can be used for predicting the thermostability of unseen sequences: | | Index | Category | Descriptor | Descriptor Group | R2 | RMSE | |---:|:-----------|:------------|:---------------------------|:---------------------|---------:|--------:| @@ -256,18 +267,19 @@ Output results showing AAI index and its category, the protein descriptor and it | 4 | ..... | ..... | ..... | ..... | ..... | ..... |
-
Building predictive model from AAI and protein descriptors:
-e.g: the below code will build a PlsRegression model using the AAI index CIDH920105 and the 'amino acid composition' descriptor. The index is passed through a DSP pipeline and is transformed into its informational protein spectra using the power spectra, with a hamming window function applied to the output of the FFT. The concatenated features from the AAI index and the descriptor will be used as the feature data in building the PLS model.
+
Building predictive model from subset of AAI and protein descriptors:
+The below code will build a PLSRegression model using the AAI index CIDH920105 and the amino acid composition descriptor. The index is passed through a DSP pipeline and transformed into its informational protein spectra using the power spectra, with a hamming window function applied to the output of the FFT. The concatenated features from the AAI index and the descriptor will be used as the feature data in building the PLS ML model, whose predictability is then assessed by testing it on unseen test sequences. The output results will show the calculated metric values when measuring predicted vs observed activity values for the test sequences.
```python -import pySAR as pysar #import pySAR package +#import pySAR module +from pySAR.pySAR import * -'''test_config5.json +'''thermostability.json { "dataset": { - "dataset": "test_dataset5.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... }, "model": @@ -278,7 +290,7 @@ import pySAR as pysar #import pySAR package }, "descriptors": { - "descriptors_csv": "precalculated_descriptors.csv", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -296,32 +308,73 @@ import pySAR as pysar #import pySAR package } } ''' -#create instance of PySAR class -pySAR = pysar.PySAR(config_file="test_config5.json") -""" -PySAR parameters: +#create instance of PySAR class, inputting path to configuration file +pySAR = PySAR(config_file="thermostability.json") -:config_file : str - full path to config file containing all required pySAR parameters. +#encode protein sequences using both the CIDH920105 index + aa_composition descriptor +results_df = pySAR.encode_aai_descriptor(aai_indices="CIDH920105", descriptors="amino_acid_composition") +``` -""" -#encode protein sequences using both the CIDH920105 index + aa_composition descriptor. -results_df = pySAR.encode_aai_desc(indices="CIDH920105", descriptors="amino_acid_composition") +Output results showing AAI index and its category, the protein descriptor and its group as well as the metric values for the generated predictive model. From the results below we can determine that the **CIDH920105** index in concatenation with the **Amino Acid Composition** descriptor has medium predictability (R2 score) but a high error rate (MSE/RMSE) for our chosen dataset (thermostability) and this feature set combination is not that effective for predicting the thermostability of unseen sequences: + +```python +########################################################################################## +###################################### Parameters ######################################## + +# AAI Indices: CIDH920105 +# Descriptors: amino_acid_composition +# Configuration File: thermostability_config.json +# Dataset: thermostability.txt +# Number of Sequences/Sequence Length: 261 x 466 +# Target Activity: T50 +# Algorithm: PLSRegression +# Model Parameters: {'copy': True, 'max_iter': 500, 'n_components': 2, 'scale': True, +#'tol': 1e-06} +# Test Split: 0.2 +# Feature Space: (261, 486) + +########################################################################################## +######################################## Results ######################################### + +# R2: 0.6720111107323943 +# RMSE: 3.7522525079464457 +# MSE: 14.079398883390391 +# MAE: 3.0713217158459805 +# RPD 1.7461053136208489 +# Explained Variance 0.6721157080699659 + +########################################################################################## ```
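As noted earlier, individual parameter values can also be passed into the `PySAR` class via **kwargs** to override those in the config file. A hypothetical sketch - the keyword name is assumed to mirror the config file schema and is not verified against the class signature:

```python
#override the test split from the config file without editing the JSON
#(keyword name assumed to match the "test_split" config parameter)
pySAR = PySAR(config_file="thermostability.json", test_split=0.3)
```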
-
Calculate and export all protein descriptors:
-Prior to evaluating the various available properties and features at which to encode a set of protein sequences, it is reccomened that you pre-calculate all the available descriptors in one go, saving them to a csv for later that `pySAR` will then import from. Output values are stored in csv set by descriptors_csv config parameter. Output will be of the shape N x M, using the default parameters, where N is the number of protein sequences in the dataset and M is the total number of features calculated from all 15 descriptors which varies depending on some descriptor-specific metaparameters.
+
Calculate individual descriptor values, e.g Tripeptide Composition and Geary Autocorrelation:
+The individual protein descriptor values for the dataset of protein sequences can be calculated using the custom-built protpy package via the descriptor module. The full list of descriptors can be seen via the function all_descriptors_list() as well as on the protpy repo homepage. ```python -from pySAR.descriptors_ import * +#import descriptors class +from pySAR.descriptors import * + +#create instance of descriptors class +desc = Descriptors(config_file="thermostability.json") + +#calculate tripeptide composition descriptor +tripeptide_composition = desc.get_tripeptide_composition() + +#calculate geary autocorrelation descriptor +geary_autocorrelation = desc.get_geary_autocorrelation() +``` +
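Each of these getters returns a pandas DataFrame of shape N x M; per the dimensions documented above, tripeptide composition gives 8000 features per sequence (20^3), which can be sanity-checked directly:

```python
#confirm the expected descriptor dimensions, e.g. 261 x 8000 for tripeptide
#composition on the thermostability dataset
print(tripeptide_composition.shape)
print(geary_autocorrelation.shape)
```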
+ +
Calculate and export all protein descriptors:
+Prior to evaluating the various available properties and features at which to encode a set of protein sequences, it is recommended that you pre-calculate all the available descriptors in one go, saving them to a csv that pySAR can later import from. Output values are stored in a csv set by the descriptors_csv config parameter (the name of the exported csv can also be passed into the function via the descriptors_export_filename parameter). Output will be of the shape N x M, where N is the number of protein sequences in the dataset and M is the total number of features calculated from all 15 descriptors, which varies depending on some descriptor-specific metaparameters. For example, using the thermostability dataset, the output will be 261 x 9714.
-'''test_config6.json +```python +'''thermostability.json { "dataset": { - "dataset": "test_dataset5.txt", - "activity": "sought_activity" + "dataset": "thermostability.txt", + "activity": "T50" ... }, "model": @@ -330,7 +383,7 @@ from pySAR.descriptors_ import * } "descriptors": { - "descriptors_csv": "precalculated_descriptors", + "descriptors_csv": "descriptors_thermostability.csv", "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", @@ -345,14 +398,19 @@ from pySAR.descriptors_ import * } } ''' -#calculating all descriptor values and storing in file named by parameter descriptors_csv -desc = Descriptors("test_config6") +#import descriptors class +from pySAR.descriptors import * + +#create instance of descriptors class +desc = Descriptors(config_file="thermostability.json") +#export all descriptors to csv using parameters in config, export=True will export to csv +desc.get_all_descriptors(export=True, descriptors_export_filename="descriptors_thermostability.csv") ```
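Once exported, the csv can be re-imported and sliced per the column layout documented in the data and example_datasets READMEs; a minimal sketch using pandas (assuming the default column layout):

```python
import pandas as pd

#load the pre-calculated descriptors and pull out one descriptor's feature block,
#e.g. MoreauBroto Autocorrelation occupying columns 8420:8660 with default parameters
all_desc = pd.read_csv("descriptors_thermostability.csv")
moreaubroto_features = all_desc.iloc[:, 8420:8660]
```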
Get record from AAIndex database:
-A custom-built package called `aaindex` was created for this project to work with all the data in the AAIndex databases, primarily the aaindex1. The AAIndex library offers diverse functionalities for obtaining all data from all records within the aaindex1. Each record is stored in json format and can be retrieved via its accession number. Each record contains the following attributes: description, references, category, notes, correlation coefficient, pmid and values.
+A custom-built package called aaindex was created for this project to work with all the data in the AAIndex databases, primarily the aaindex1. The aaindex package offers diverse functionalities for obtaining the data of any record within the aaindex1. Each record is stored in json format and can be retrieved via its accession number, and can also be searched for via its name/description. Each record contains the following attributes: description, references, category, notes, correlation coefficient, pmid and values.
```python from aaindex import aaindex1 @@ -369,6 +427,7 @@ values = aaindex1['CHOP780206'].values #get amino acid values from record num_record = aaindex1.num_records() #get total number of records record_names = aaindex1.record_names() #get list of all record names amino_acids = aaindex1.amino_acids() #get list of all canonical amino acids +records = aaindex1.search("hydrophobicity") #get all records with hydrophobicity in their title/description ```
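To illustrate what pySAR does with these records internally, a record's values can be used to numerically encode a sequence by mapping each residue to its value in the index. A minimal sketch, assuming `values` is a dict keyed by amino acid letter:

```python
#numerically encode a toy sequence using the CHOP780206 index values
#(illustrative only - pySAR performs this encoding step internally)
sequence = "MTIKEMPQPK"
encoded_sequence = [aaindex1['CHOP780206'].values[aa] for aa in sequence]
```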
diff --git a/pySAR/__init__.py b/pySAR/__init__.py index 3da09aa..8e6aa6c 100644 --- a/pySAR/__init__.py +++ b/pySAR/__init__.py @@ -1,6 +1,6 @@ """ pySAR software metadata. """ __name__ = 'pySAR' -__version__ = "2.4.0" +__version__ = "2.4.1" __description__ = 'A Python package used to analysis Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.' __author__ = 'AJ McKenna, https://github.com/amckenna41' __authorEmail__ = 'amckenna41@qub.ac.uk' @@ -10,5 +10,5 @@ __download_url__ = "https://github.com/amckenna41/pySAR/archive/refs/heads/main.zip" __status__ = "Production" __keywords__ = ["bioinformatics", "protein engineering", "python", "pypi", "machine learning", \ - "directed evolution", "sequence activity relationships", "SAR", "aaindex", "protein descriptors"] + "directed evolution", "drug discovery", "sequence activity relationships", "SAR", "aaindex", "protein descriptors"] __test_suite__ = "tests" \ No newline at end of file diff --git a/pySAR/descriptors.py b/pySAR/descriptors.py index e7a184a..a184243 100644 --- a/pySAR/descriptors.py +++ b/pySAR/descriptors.py @@ -8,8 +8,9 @@ import json from json import JSONDecodeError import itertools +import time +from tqdm import tqdm -from .globals_ import DATA_DIR from .utils import * import protpy as protpy @@ -18,40 +19,46 @@ class Descriptors(): Class for calculating a wide variety of protein physiochemical, biochemical and structural descriptors. These descriptors have been used in a wide variety of Bioinformaitcs applications including: protein strucutral and functional class prediction, - protein-protein interactions, subcellular location, secondary structure prediction etc. - They represent the different structural, functional & interaction profiles of proteins - by exploring the features in the groups of composition, correlation and distribution + protein-protein interactions, subcellular location, secondary structure prediction, among + many more. They represent the different structural, functional & interaction profiles of + proteins by exploring the features in the groups of composition, correlation and distribution of the constituent residues and their biochemical and physiochemical properties. A custom-built software package was created to generate these descriptors - protpy, which - is also open-souurce and available here: https://github.com/amckenna41/protpy. The package + is also open-source and available here: https://github.com/amckenna41/protpy. The package takes 1 or more protein sequences, returning the respective descriptor values in a Pandas DataFrame. protpy and this class allows calculation of the following descriptors: Amino Acid Compostion (AAComp), Dipeptide Composition (DPComp), Tripeptide Composition (TPComp), MoreauBroto Autocorrelation (MBAuto), Moran Autocorrelation (MAuto), Geary Autocorrelation - (GAuto), Composition (C), Transition (T), Distribution (D), CTD, Conjoint Triad (CTriad), - Sequence Order Coupling Number (SOCN), Quasi Sequence Order (QSO), Pseudo Amino Acid + (GAuto), Composition (CTD_C), Transition (CTD_T), Distribution (CTD_D), CTD, Conjoint Triad + (CTriad), Sequence Order Coupling Number (SOCN), Quasi Sequence Order (QSO), Pseudo Amino Acid Composition - type 1 (PAAcomp) and Amphiphilic Pseudo Amino Acid Composition - type 2 (APAAComp). Similar to other classes in pySAR, this class works via configuration files which contain the values for all the potential parameters, if applicable, of each descriptor. 
By default, - the class will look for a descriptors csv which is a file of the pre-calcualted descriptor + the class will look for a descriptors csv which is a file of the pre-calculated descriptor values for the specified dataset, if this file doesn't exist, or the parameter value is blank, - then each descriptor will have to be calculated using its respective function. + then each descriptor will have to be calculated using its respective function. It is reccomended that with every new dataset, the Descriptors class should be instantiated with the "all_desc" parameter set to 1 in the config file. This will calculate all the descriptor values for the dataset of protein sequences, storing the result in a csv file, meaning that - this file can be used for future use and the descriptors will not have to be recalculated. + this file can be used for future use and the descriptors will not have to be recalculated each + time. This csv file will be saved to the path and filename according to the "descriptors_csv" + parameter in the config file. Parameters ========== :config_file: str path to configuration file which will contain the various parameter values for all descriptors. If invalid value input then error will be raised. - :protein_seqs : np.ndarray + :protein_seqs: np.ndarray array of protein sequences that descriptors will be calculated for. If set to none or empty then error will be raised. + **kwargs: dict + keyword argument names and values for the dataset filename/path and the descriptors + csv path parameters. The keywords should be the same name and form of those in the + configuration file. The keyword values input take precedence over those in the config files. References ========== @@ -91,7 +98,7 @@ class Descriptors(): evolution". Science. 185 (4154): 862–864. Bibcode:1974Sci...185..862G. doi:10.1126/science.185.4154.862. ISSN 0036-8075. PMID 4843792. S2CID 35388307. 
""" - def __init__(self, config_file="", protein_seqs=None): + def __init__(self, config_file="", protein_seqs=None, **kwargs): self.config_file = config_file self.protein_seqs = protein_seqs @@ -122,28 +129,23 @@ def __init__(self, config_file="", protein_seqs=None): self.dataset_parameters = Map(self.config_parameters["dataset"]) self.desc_parameters = Map(self.config_parameters["descriptors"]) - #create data directory if doesnt exist - if not (os.path.isdir(DATA_DIR)): - os.makedirs(DATA_DIR) + #set dataset and descriptors csv filepath from kwargs, if applicable, or the config file values + self.dataset_filepath = kwargs.get('dataset_filepath') if 'dataset_filepath' in kwargs else self.dataset_parameters["dataset"] + self.descriptors_csv = kwargs.get('descriptors_csv') if 'descriptors_csv' in kwargs else self.desc_parameters.descriptors_csv #import protein sequences from dataset if not directly specified in protein_seqs input param if not (isinstance(self.protein_seqs, pd.Series)): if (self.protein_seqs is None or self.protein_seqs == ""): - dataset_filepath = "" #open dataset and read protein seqs if protein_seqs is empty/None - if (os.path.isfile(self.dataset_parameters["dataset"])): - dataset_filepath = self.dataset_parameters["dataset"] - elif (os.path.isfile(os.path.join(DATA_DIR, self.dataset_parameters["dataset"]))): - dataset_filepath = os.path.join(DATA_DIR, self.dataset_parameters["dataset"]) - else: - raise OSError('Dataset file not found at path: {}.'.format(dataset_filepath)) + if not (os.path.isfile(self.dataset_filepath)): + raise OSError('Dataset file not found at path: {}.'.format(self.dataset_filepath)) #read in dataset csv from filepath mentioned in config try: - data = pd.read_csv(dataset_filepath, sep=",", header=0) + data = pd.read_csv(self.dataset_filepath, sep=",", header=0) self.protein_seqs = data[self.dataset_parameters["sequence_col"]] except: - raise IOError('Error opening dataset file: {}.'.format(dataset_filepath)) + raise IOError('Error opening dataset file: {}.'.format(self.dataset_filepath)) else: #if 1 protein sequence (1 string) input then convert to pandas Series object if (isinstance(self.protein_seqs, str)): @@ -162,8 +164,7 @@ def __init__(self, config_file="", protein_seqs=None): #valid amino acids, if not then raise ValueError invalid_seqs = valid_sequence(self.protein_seqs) if (invalid_seqs != None): - raise ValueError('Invalid Amino Acids found in protein sequence dataset: {}.'. 
- format(invalid_seqs)) + raise ValueError('Invalid Amino Acids found in protein sequence dataset: {}.'.format(invalid_seqs)) #get the total number of inputted protein sequences self.num_seqs = len(self.protein_seqs) @@ -187,14 +188,13 @@ def __init__(self, config_file="", protein_seqs=None): self.all_descriptors = pd.DataFrame() #append extension if just the filename input as descriptors csv - if ((self.desc_parameters.descriptors_csv != '' and self.desc_parameters.descriptors_csv != None) - and (os.path.splitext(self.desc_parameters.descriptors_csv)[1] == '')): - self.desc_parameters.descriptors_csv = self.desc_parameters.descriptors_csv + ".csv" + if ((self.descriptors_csv != '' and self.descriptors_csv != None) + and (os.path.splitext(self.descriptors_csv)[1] == '')): + self.descriptors_csv = self.descriptors_csv + ".csv" #try importing descriptors csv with pre-calculated descriptor values - if (os.path.isfile(self.desc_parameters.descriptors_csv) or - os.path.isfile((os.path.join(DATA_DIR, self.desc_parameters.descriptors_csv)))): - self.import_descriptors(self.desc_parameters.descriptors_csv) + if (os.path.isfile(self.descriptors_csv)): + self.import_descriptors(self.descriptors_csv) #get the total number of inputted protein sequences self.num_seqs = self.all_descriptors.shape[0] @@ -210,9 +210,9 @@ def __init__(self, config_file="", protein_seqs=None): #list of available protein descriptors self.valid_descriptors = [ 'amino_acid_composition', 'dipeptide_composition', 'tripeptide_composition', - 'moreaubroto_autocorrelation','moran_autocorrelation','geary_autocorrelation', + 'moreaubroto_autocorrelation', 'moran_autocorrelation', 'geary_autocorrelation', 'ctd', 'ctd_composition', 'ctd_transition', 'ctd_distribution', 'conjoint_triad', - 'sequence_order_coupling_number','quasi_sequence_order', + 'sequence_order_coupling_number', 'quasi_sequence_order', 'pseudo_amino_acid_composition', 'amphiphilic_pseudo_amino_acid_composition' ] @@ -226,7 +226,7 @@ def import_descriptors(self, descriptor_filepath=""): Parameters ========== - :descriptor_filepath : str + :descriptor_filepath: str filepath to pre-calculated descriptor csv file. 
Returns @@ -239,9 +239,7 @@ def import_descriptors(self, descriptor_filepath=""): #verify descriptors csv exists at filepath if not (os.path.isfile(descriptor_filepath)): - descriptor_filepath = os.path.join(DATA_DIR, descriptor_filepath) - if not (os.path.isfile(descriptor_filepath)): - raise OSError('Descriptors csv file does not exist at filepath: {}.'.format(descriptor_filepath)) + raise OSError('Descriptors csv file does not exist at filepath: {}.'.format(descriptor_filepath)) #import descriptors csv as dataframe try: @@ -267,12 +265,12 @@ def import_descriptors(self, descriptor_filepath=""): tripeptide_composition_dim = (420, 8420) self.tripeptide_composition = descriptor_df.iloc[:,tripeptide_composition_dim[0]:tripeptide_composition_dim[1]] - #dimension of autocorrelation descriptors depends on the lag value and number of properties - norm_moreaubroto_dim = (8420, + #dimension of autocorrelation (moreaubroto, moran and geary) descriptors depends on the lag value and number of properties + moreaubroto_dim = (8420, 8420 + (self.desc_parameters.moreaubroto_autocorrelation["lag"] * len(self.desc_parameters.moreaubroto_autocorrelation["properties"]))) - self.moreaubroto_autocorrelation = descriptor_df.iloc[:,norm_moreaubroto_dim[0]:norm_moreaubroto_dim[1]] + self.moreaubroto_autocorrelation = descriptor_df.iloc[:,moreaubroto_dim[0]:moreaubroto_dim[1]] - moran_auto_dim = (norm_moreaubroto_dim[1], norm_moreaubroto_dim[1] + + moran_auto_dim = (moreaubroto_dim[1], moreaubroto_dim[1] + (self.desc_parameters.moran_autocorrelation["lag"] * len(self.desc_parameters.moran_autocorrelation["properties"]))) self.moran_autocorrelation = descriptor_df.iloc[:,moran_auto_dim[0]: moran_auto_dim[1]] @@ -311,6 +309,7 @@ def import_descriptors(self, descriptor_filepath=""): self.conjoint_triad = descriptor_df.iloc[:,conjoint_triad_dim[0]:conjoint_triad_dim[1]] + #socn value dependant on value of lag and distance matrix socn_lag = self.desc_parameters.sequence_order_coupling_number["lag"] socn_distance_matrix = self.desc_parameters.sequence_order_coupling_number["distance_matrix"] @@ -335,6 +334,7 @@ def import_descriptors(self, descriptor_filepath=""): self.quasi_sequence_order = descriptor_df.iloc[:,quasi_seq_order_dim[0]:quasi_seq_order_dim[1]] + #paac value dependant on lambda value paac_lambda = self.desc_parameters.pseudo_amino_acid_composition["lambda"] pseudo_amino_acid_composition_dim = (quasi_seq_order_dim[1], quasi_seq_order_dim[1] + (20 + paac_lambda)) @@ -367,7 +367,7 @@ def get_amino_acid_composition(self): Returns ======= - :amino_acid_composition : pd.Dataframe + :amino_acid_composition: pd.Dataframe pandas dataframe of AAComp for protein sequence. Dataframe will be of the shape N x 20, where N is the number of protein sequences and 20 is the number of features calculated from the descriptor @@ -380,7 +380,7 @@ def get_amino_acid_composition(self): #initialise dataframe aa_comp_df = pd.DataFrame() - #calculate descriptor value, concatenate descriptor values + #calculate descriptor value for each sequence, concatenate descriptor values for seq in self.protein_seqs: aa_comp_seq = protpy.amino_acid_composition(seq) aa_comp_df = pd.concat([aa_comp_df, aa_comp_seq]) @@ -394,7 +394,7 @@ def get_dipeptide_composition(self): Calculate Dipeptide Composition (DPComp) for protein sequence using the custom-built protpy package. Dipeptide composition is the fraction of each dipeptide type within a protein sequence. 
With dipeptides - being of length 2 and there being 20 canonical amino acids this creates + being of length 2 and there being 20 canonical amino acids, this creates 20^2 different combinations, thus a 400-Dimensional vector will be produced such that: @@ -411,7 +411,7 @@ def get_dipeptide_composition(self): Returns ======= - :dipeptide_composition : pd.Dataframe + :dipeptide_composition: pd.Dataframe pandas Dataframe of dipeptide composition for protein sequence. Dataframe will be of the shape N x 400, where N is the number of protein sequences and 400 is the number of features calculated from the descriptor (20^2 for the 20 canonical @@ -424,7 +424,7 @@ def get_dipeptide_composition(self): #initialise dataframe dipeptide_comp_df = pd.DataFrame() - #calculate descriptor value, concatenate descriptor values + #calculate descriptor value, for each sequence, concatenate descriptor values for seq in self.protein_seqs: dipeptide_comp_seq = protpy.dipeptide_composition(seq) dipeptide_comp_df = pd.concat([dipeptide_comp_df, dipeptide_comp_seq]) @@ -455,7 +455,7 @@ def get_tripeptide_composition(self): Returns ======= - :tripeptide_composition : pd.Dataframe + :tripeptide_composition: pd.Dataframe pandas Dataframe of tripeptide composition for protein sequence. Dataframe will be of the shape N x 8000, where N is the number of protein sequences and 8000 is the number of features calculated from the descriptor (20^3 for the 20 canonical @@ -468,7 +468,7 @@ def get_tripeptide_composition(self): #initialise dataframe tripeptide_comp_df = pd.DataFrame() - #calculate descriptor value, concatenate descriptor values + #calculate descriptor value, for each sequence, concatenate descriptor values for seq in self.protein_seqs: tripeptide_comp_seq = protpy.tripeptide_composition(seq) tripeptide_comp_df = pd.concat([tripeptide_comp_df, tripeptide_comp_seq]) @@ -485,6 +485,7 @@ def get_moreaubroto_autocorrelation(self): describe the level of correlation between two objects (protein or peptide sequences) in terms of their specific structural or physicochemical properties, which are defined based on the distribution of amino acid properties along the sequence. + By default, 8 amino acid properties are used for deriving the descriptors. The derivations and detailed explanations of this type of descriptor is outlind in [4]. The MBAuto descriptor is a type of Autocorrelation descriptor that uses @@ -495,14 +496,14 @@ def get_moreaubroto_autocorrelation(self): is set in the config file. Using the default 8 properties with default lag value of 30, 240 features are generated, the default 8 properties are: - AccNo. CIDH920105 - Normalized Average Hydrophobicity Scales - AccNo. BHAR880101 - Average Flexibility Indices - AccNo. CHAM820101 - Polarizability Parameter - AccNo. CHAM820102 - Free Energy of Solution in Water, kcal/mole - AccNo. CHOC760101 - Residue Accessible Surface Area in Tripeptide - AccNo. BIGC670101 - Residue Volume - AccNo. CHAM810101 - Steric Parameter - AccNo. DAYM780201 - Relative Mutability + AccNo. CIDH920105 - Normalized Average Hydrophobicity Scales. + AccNo. BHAR880101 - Average Flexibility Indices. + AccNo. CHAM820101 - Polarizability Parameter. + AccNo. CHAM820102 - Free Energy of Solution in Water, kcal/mole. + AccNo. CHOC760101 - Residue Accessible Surface Area in Tripeptide. + AccNo. BIGC670101 - Residue Volume. + AccNo. CHAM810101 - Steric Parameter. + AccNo. DAYM780201 - Relative Mutability. 
        Parameters
        ==========
@@ -510,7 +511,7 @@ def get_moreaubroto_autocorrelation(self):
         Returns
         =======
-        :moreaubroto_autocorrelation : pd.Dataframe
+        :moreaubroto_autocorrelation: pd.Dataframe
            pandas Dataframe of MBAuto values for protein sequence. Output will
            be of the shape N x M, where N is the number of protein sequences and
            M is the number of features calculated from the descriptor, calculated
@@ -527,15 +528,15 @@ def get_moreaubroto_autocorrelation(self):
        normalize = self.desc_parameters.moreaubroto_autocorrelation["normalize"]

        #initialise dataframe
-       norm_moreaubroto_df = pd.DataFrame()
+       moreaubroto_df = pd.DataFrame()

-       #calculate descriptor value, concatenate descriptor values
+       #calculate descriptor value, for each sequence, concatenate descriptor values
        for seq in self.protein_seqs:
-           norm_moreaubroto_seq = protpy.moreaubroto_autocorrelation(seq, lag=lag,
+           moreaubroto_seq = protpy.moreaubroto_autocorrelation(seq, lag=lag,
                properties=properties, normalize=normalize)
-           norm_moreaubroto_df = pd.concat([norm_moreaubroto_df, norm_moreaubroto_seq])
+           moreaubroto_df = pd.concat([moreaubroto_df, moreaubroto_seq])

-       self.moreaubroto_autocorrelation = norm_moreaubroto_df
+       self.moreaubroto_autocorrelation = moreaubroto_df

        return self.moreaubroto_autocorrelation

@@ -551,7 +552,7 @@ def get_moran_autocorrelation(self):
        Returns
        =======
-       :moran_autocorrelation : pd.DataFrame
+       :moran_autocorrelation: pd.DataFrame
           pandas Dataframe of MAuto values for protein sequence. Output
           will be of the shape N x M, where N is the number of protein
           sequences and M is the number of features calculated from the descriptor,
@@ -571,7 +572,7 @@ def get_moran_autocorrelation(self):
        #initialise dataframe
        moran_df = pd.DataFrame()

-       #calculate descriptor value, concatenate descriptor values
+       #calculate descriptor value, for each sequence, concatenate descriptor values
        for seq in self.protein_seqs:
            moran_seq = protpy.moran_autocorrelation(seq, lag=lag,
                properties=properties, normalize=normalize)
@@ -595,7 +596,7 @@ def get_geary_autocorrelation(self):
        Returns
        =======
-       :geary_autocorrelation : pd.DataFrame
+       :geary_autocorrelation: pd.DataFrame
           pandas Dataframe of GAuto values for protein sequence. Output
           will be of the shape N x M, where N is the number of protein
           sequences and M is the number of features calculated from the descriptor, calculated
@@ -614,7 +615,7 @@ def get_geary_autocorrelation(self):
        #initialise dataframe
        geary_df = pd.DataFrame()

-       #calculate descriptor value, concatenate descriptor values
+       #calculate descriptor value, for each sequence, concatenate descriptor values
        for seq in self.protein_seqs:
            geary_seq = protpy.geary_autocorrelation(seq, lag=lag,
                properties=properties, normalize=normalize)
@@ -627,7 +628,9 @@ def get_ctd_composition(self):
        """
        Calculate Composition (C_CTD) physiochemical/structural descriptor
-       of protein sequences from the calculated CTD descriptor.
+       of protein sequences from the calculated CTD descriptor. Composition
+       is determined as the number of amino acids of a particular property
+       divided by the total number of amino acids.

        Parameters
        ==========

        Returns
        =======
-       :ctd_composition : pd.DataFrame
+       :ctd_composition: pd.DataFrame
           pandas dataframe of C_CTD values for protein sequence.
           Output will be of the shape N x M, where N is the number of protein
           sequences and M is the (number of physiochemical properties * 3), with 3
@@ -654,14 +657,13 @@ def get_ctd_composition(self):
        #initialise dataframe
        comp_df = pd.DataFrame()

-       #get ctd properties used for calculating descriptor 
+       #get ctd properties used for calculating descriptor
        ctd_property = self.desc_parameters.ctd["property"]
        if not (isinstance(ctd_property, list)):
            ctd_property = ctd_property.split(',')
        all_ctd = self.desc_parameters.ctd["all"]

-       #get composition descriptor from CTD dataframe, dependant on number of props,
-       #3 features per property
+       #get composition descriptor from CTD dataframe, dependent on number of props, 3 features per property
        if (all_ctd):
            comp_df = self.ctd.iloc[:,0:21]
        else:
@@ -674,7 +676,9 @@ def get_ctd_transition(self):
        """
        Calculate Transition (T_CTD) physiochemical/structural descriptor of
-       protein sequences from the calculated CTD descriptor.
+       protein sequences from the calculated CTD descriptor. Transition is
+       determined as the number of transitions from a particular property to
+       a different property divided by (total number of amino acids − 1).

        Parameters
        ==========

        Returns
        =======
-       :ctd_transition : pd.Dataframe
+       :ctd_transition: pd.Dataframe
           pandas Dataframe of T_CTD values for protein sequence.
           Output will be of the shape N x M, where N is the number of protein
           sequences and M is the (number of physiochemical properties * 3), with 3
@@ -707,8 +711,7 @@ def get_ctd_transition(self):
            ctd_property = ctd_property.split(',')
        all_ctd = self.desc_parameters.ctd["all"]

-       #get transition descriptor from CTD dataframe, dependant on number of props,
-       #3 features per property
+       #get transition descriptor from CTD dataframe, dependent on number of props, 3 features per property
        if (all_ctd):
            transition_df = self.ctd.iloc[:,21:42]
        else:
@@ -721,7 +724,9 @@ def get_ctd_distribution(self):
        """
        Calculate Distribution (D_CTD) physiochemical/structural descriptor of
-       protein sequences from the calculated CTD descriptor.
+       protein sequences from the calculated CTD descriptor. Distribution is
+       the chain length within which the first, 25%, 50%, 75% and 100% of the
+       amino acids of a particular property are located.

        Parameters
        ==========

        Returns
        =======
-       :ctd_distribution : pd.Dataframe
+       :ctd_distribution: pd.Dataframe
           pandas Dataframe of D_CTD values for protein sequence.
           Output will be of the shape N x M, where N is the number of protein
           sequences and M is the (number of physiochemical properties * 15), with 15
@@ -754,8 +759,7 @@ def get_ctd_distribution(self):
            ctd_property = ctd_property.split(',')
        all_ctd = self.desc_parameters.ctd["all"]

-       #get distribution descriptor from CTD dataframe, dependant on number of props,
-       #15 features per property
+       #get distribution descriptor from CTD dataframe, dependent on number of props, 15 features per property
        if (all_ctd):
            distribution_df = self.ctd.iloc[:,42:]
        else:
@@ -777,7 +781,7 @@ def get_ctd(self):
        Returns
        =======
-       :ctd : pd.Series
+       :ctd: pd.Series
           pandas Series of CTD values for protein sequence.
           Output will be of the shape N x M, where N is the number of
           protein sequences and M is (number of physiochemical properties * 21),
@@ -809,7 +813,7 @@ def get_ctd(self):
    def get_conjoint_triad(self):
        """
        Calculate Conjoint Triad (CTriad) of protein sequences using the custom-built
-       protpy package. The descriptor mainly considers neighbor relationships in
+       protpy package. The descriptor mainly considers neighbour relationships in
        protein sequences by encoding each protein sequence using the triad (continuous
        three amino acids) frequency distribution extracted from a 7-letter reduced
        alphabet [11]. CTriad calculates 343 different features (7x7x7), with the

        Parameters
        ==========

        Returns
        =======
-       :conjoint_triad : pd.Dataframe
+       :conjoint_triad: pd.Dataframe
           pandas Dataframe of CTriad descriptor values for all protein sequences. Dataframe
           will be of the shape N x 343, where N is the number of protein sequences and
           343 is the number of features calculated from the descriptor for a sequence.
@@ -833,7 +837,7 @@ def get_conjoint_triad(self):
        #initialise dataframe
        conjoint_triad_df = pd.DataFrame()

-       #calculate descriptor value, concatenate descriptor values
+       #calculate descriptor value, for each sequence, concatenate descriptor values
        for seq in self.protein_seqs:
            conjoint_triad_seq = protpy.conjoint_triad(seq)
            conjoint_triad_df = pd.concat([conjoint_triad_df, conjoint_triad_seq])
@@ -846,11 +850,11 @@ def get_sequence_order_coupling_number(self):
        """
        Calculate Sequence Order Coupling Number (SOCN) features for input protein
        sequence using custom-built protpy package. SOCN computes the dissimilarity between amino acid
-       pairs. The distance between amino acid pairs is determined by d which varies
-       between 1 to lag. For each d, it computes the sum of the dissimilarities
-       of all amino acid pairs. The number of output features can be calculated as N * 2,
-       where N = lag, by default this value is 30 which generates an output of M x 60
-       where M is the number of protein sequenes.
+       pairs. The distance between amino acid pairs is determined by d which varies from
+       1 to lag. For each d, it computes the sum of the dissimilarities of all amino acid
+       pairs. The number of output features can be calculated as N * 2, where N = lag, by
+       default this value is 30 which generates an output of M x 60 where M is the number
+       of protein sequences.

        Parameters
        ==========

        Returns
        =======
-       :sequence_order_coupling_number_df : pd.Dataframe
+       :sequence_order_coupling_number_df: pd.Dataframe
           Dataframe of SOCN descriptor values for all protein sequences.
           Output will be of the shape N x M, where N is the number of protein sequences
           and M is the number of features calculated from the descriptor (calculated as
@@ -875,7 +879,7 @@ def get_sequence_order_coupling_number(self):
        lag = self.desc_parameters.sequence_order_coupling_number["lag"]
        distance_matrix = self.desc_parameters.sequence_order_coupling_number["distance_matrix"]

-       #calculate descriptor value, concatenate descriptor values
+       #calculate descriptor value, for each sequence, concatenate descriptor values
        for seq in self.protein_seqs:
            #if no distance matrix present in config then calculate SOCN using both matrices
            if (distance_matrix == "" or distance_matrix == None):
@@ -908,7 +912,7 @@ def get_quasi_sequence_order(self):
        Returns
        =======
-       :quasi_sequence_order_df : pd.Dataframe
+       :quasi_sequence_order_df: pd.Dataframe
           Dataframe of quasi-sequence-order descriptor values for the protein sequences,
           with output shape N x 100 where N is the number of sequences and 100 the number
           of calculated features.
@@ -925,7 +929,7 @@ def get_quasi_sequence_order(self):
        weight = self.desc_parameters.quasi_sequence_order["weight"]
        distance_matrix = self.desc_parameters.quasi_sequence_order["distance_matrix"]

-       #calculate descriptor value, concatenate descriptor values
+       #calculate descriptor value, for each sequence, concatenate descriptor values
        for seq in self.protein_seqs:
            #if no distance matrix present in config then calculate quasi seq order using both matrices
            if (distance_matrix == "" or distance_matrix == None):
@@ -946,7 +950,7 @@ def get_pseudo_amino_acid_composition(self):
        Calculate Pseudo Amino Acid Composition (PAAComp) descriptor using custom-built
        protpy package. PAAComp combines the vanilla amino acid composition descriptor with
        additional local features, such as correlation between residues of a certain distance, as amino
-       acid composition doesn't take into accont sequence order info. The pseudo components
+       acid composition doesn't take into account sequence order info. The pseudo components
        of the descriptor are a series of rank-different correlation factors [10]. The first
        20 components are a weighted sum of the amino acid composition and 30 are physiochemical
        square correlations as dictated by the lambda and properties parameters. This generates

        Parameters
        ==========

        Returns
        =======
-       :pseudo_amino_acid_composition_df : pd.Dataframe
+       :pseudo_amino_acid_composition_df: pd.Dataframe
           Dataframe of pseudo amino acid composition descriptor values for the protein
           sequences of output shape N x (20 + λ), where N is the number of protein
           sequences. With default lambda of 30, the output shape will be N x 50.
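Since the descriptor getters above all follow the same calling pattern, a short usage sketch may be clearer than prose. This is illustrative only - the config path is a placeholder, and it assumes the `Descriptors` class and the protpy-backed getters behave as the docstrings in this patch describe:

```python
# Illustrative sketch only: the config path below is a placeholder and the
# Descriptors API is assumed to match the docstrings in this patch.
from pySAR.descriptors import Descriptors

desc = Descriptors(config_file="config/thermostability.json")

# each getter returns an N x M pandas DataFrame, N = number of protein sequences
mb_auto = desc.get_moreaubroto_autocorrelation()  # M = lag * number of properties
ctd_comp = desc.get_ctd_composition()             # 3 features per CTD property
socn = desc.get_sequence_order_coupling_number()  # 2 * lag features by default
qso = desc.get_quasi_sequence_order()             # N x 100
paa = desc.get_pseudo_amino_acid_composition()    # N x (20 + lambda)

print(mb_auto.shape, ctd_comp.shape, socn.shape, qso.shape, paa.shape)
```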
@@ -977,8 +981,10 @@ def get_pseudo_amino_acid_composition(self):
        weight = self.desc_parameters.pseudo_amino_acid_composition["weight"]
        properties = self.desc_parameters.pseudo_amino_acid_composition["properties"]

-       #calculate descriptor value, concatenate descriptor values
-       for seq in self.protein_seqs:
+       #calculate descriptor value, for each sequence, concatenate descriptor values,
+       #tqdm loop to visualise progress as descriptor can take some time to execute
+       for seq in tqdm(self.protein_seqs, unit=" sequence", position=0,
+           desc="PAAComp", mininterval=30, ncols=90):
            pseudo_amino_acid_composition_seq = protpy.pseudo_amino_acid_composition(seq,
                lamda=lamda, weight=weight, properties=properties)
            pseudo_amino_acid_composition_df = pd.concat([pseudo_amino_acid_composition_df, pseudo_amino_acid_composition_seq])
@@ -1004,7 +1010,7 @@ def get_amphiphilic_pseudo_amino_acid_composition(self):
        Returns
        =======
-       :amphiphilic_pseudo_amino_acid_composition_df : pd.Dataframe
+       :amphiphilic_pseudo_amino_acid_composition_df: pd.Dataframe
           Dataframe of Amphiphilic pseudo amino acid composition descriptor values for
           the protein sequences of output shape N x 80, where N is the number of protein
           sequences and 80 is calculated as (20 + 2*lambda).
@@ -1020,8 +1026,10 @@ def get_amphiphilic_pseudo_amino_acid_composition(self):
        #initialise dataframe
        amphiphilic_pseudo_amino_acid_composition_df = pd.DataFrame()

-       #calculate descriptor value, concatenate descriptor values
-       for seq in self.protein_seqs:
+       #calculate descriptor value, for each sequence, concatenate descriptor values,
+       #tqdm loop to visualise progress as descriptor can take some time to execute
+       for seq in tqdm(self.protein_seqs, unit=" sequence", position=0,
+           desc="APAAComp", mininterval=30, ncols=90):
            amphiphilic_pseudo_amino_acid_composition_seq = protpy.amphiphilic_pseudo_amino_acid_composition(seq,
                lamda=lamda, weight=weight)
            amphiphilic_pseudo_amino_acid_composition_df = pd.concat([amphiphilic_pseudo_amino_acid_composition_df,
@@ -1031,78 +1039,102 @@ def get_amphiphilic_pseudo_amino_acid_composition(self):
        return self.amphiphilic_pseudo_amino_acid_composition

-   def get_all_descriptors(self, export=False):
+   def get_all_descriptors(self, export=False, descriptors_export_filename=""):
        """
        Calculate all individual descriptor values, concatenating each descriptor
        Dataframe into one storing all descriptors. The number of descriptor
-       features calculated is dependant on several additional parameters of some
-       descriptors, including the number of properties and max lag for the
+       features calculated is dependent on several additional meta-parameters of
+       some descriptors, including the number of properties and max lag for the
        Autocorrelation, SOCN and QSO and the number of properties and lamda for
-       PAAComp and the lambda for APAAComp. To export all descriptors to a csv
-       set export=True when calling the function, this saves having to recalculate
-       all the descriptor values when using them in multiple encoding processes,
-       and the descriptors can be imported using the import_descriptors function.
+       PAAComp and the lambda for APAAComp.
+
+       To export all descriptors to a csv set export=True when calling the function,
+       this saves having to recalculate all the descriptor values when using them
+       in multiple encoding processes, and the descriptors can be imported using the
+       import_descriptors function.
By default, the function will save the output + csv to the value at the "descriptors_csv" parameter in the config file, + although the name for this exported csv can be set by the + descriptors_export_filename input parameter. Parameters ========== - :export : bool (default=False) + :export: bool (default=False) if true then all calculated descriptors from the protpy package will be exported to a CSV. This allows for pre-calculated descriptors for a dataset to be easily imported and not have to be recalculated again. + :descriptors_export_filename: str + filepath/filename for the exported csv of all the calculated descriptor + values if input parameter export=True Returns ======= - :all_descriptor_df : pd.DataFrame + :all_descriptor_df: pd.DataFrame concatenated dataframe of all individual descriptors. Using the default attributes and their associated values, the output will be of the shape N x 9714, where N is the number of protein sequences and 9714 is the number of descriptor features. """ - #if descriptor attribute DF is empty then call its respective get_descriptor function - if (getattr(self, "amino_acid_composition").empty): - self.amino_acid_composition = self.get_amino_acid_composition() + print('############################### Exporting all descriptors ################################\n') - if (getattr(self, "dipeptide_composition").empty): - self.dipeptide_composition = self.get_dipeptide_composition() + #start time counter + start = time.time() - if (getattr(self, "tripeptide_composition").empty): - self.tripeptide_composition = self.get_tripeptide_composition() + #iterate over all descriptors, calculating each using their respective function and the protpy package + for descr in tqdm(self.all_descriptors_list(), unit=" descriptor", position=0, + desc="Descriptors", mininterval=30, ncols=90): - if (getattr(self, "moreaubroto_autocorrelation").empty): - self.moreaubroto_autocorrelation = self.get_moreaubroto_autocorrelation() + #if descriptor attribute DF is empty then call its respective get_descriptor function + if (descr == "amino_acid_composition" and getattr(self, "amino_acid_composition").empty): + self.amino_acid_composition = self.get_amino_acid_composition() - if (getattr(self, "moran_autocorrelation").empty): - self.moran_autocorrelation = self.get_moran_autocorrelation() + if (descr == "dipeptide_composition" and getattr(self, "dipeptide_composition").empty): + self.dipeptide_composition = self.get_dipeptide_composition() - if (getattr(self, "geary_autocorrelation").empty): - self.geary_autocorrelation = self.get_geary_autocorrelation() + if (descr == "tripeptide_composition" and getattr(self, "tripeptide_composition").empty): + self.tripeptide_composition = self.get_tripeptide_composition() - if (getattr(self, "ctd").empty): - self.ctd = self.get_ctd() + if (descr == "moreaubroto_autocorrelation" and getattr(self, "moreaubroto_autocorrelation").empty): + self.moreaubroto_autocorrelation = self.get_moreaubroto_autocorrelation() - if (getattr(self, "ctd_composition").empty): - self.ctd_composition = self.get_ctd_composition() + if (descr == "moran_autocorrelation" and getattr(self, "moran_autocorrelation").empty): + self.moran_autocorrelation = self.get_moran_autocorrelation() - if (getattr(self, "ctd_transition").empty): - self.ctd_transition = self.get_ctd_transition() - - if (getattr(self, "ctd_distribution").empty): - self.ctd_distribution = self.get_ctd_distribution() + if (descr == "geary_autocorrelation" and getattr(self, "geary_autocorrelation").empty): + 
self.geary_autocorrelation = self.get_geary_autocorrelation() - if (getattr(self, "conjoint_triad").empty): - self.conjoint_triad = self.get_conjoint_triad() + if (descr == "ctd" and getattr(self, "ctd").empty): + self.ctd = self.get_ctd() + + if (descr == "ctd_composition" and getattr(self, "ctd_composition").empty): + self.ctd_composition = self.get_ctd_composition() + + if (descr == "ctd_transition" and getattr(self, "ctd_transition").empty): + self.ctd_transition = self.get_ctd_transition() + + if (descr == "ctd_distribution" and getattr(self, "ctd_distribution").empty): + self.ctd_distribution = self.get_ctd_distribution() - if (getattr(self, "sequence_order_coupling_number").empty): - self.sequence_order_coupling_number = self.get_sequence_order_coupling_number() + if (descr == "conjoint_triad" and getattr(self, "conjoint_triad").empty): + self.conjoint_triad = self.get_conjoint_triad() - if (getattr(self, "quasi_sequence_order").empty): - self.quasi_sequence_order = self.get_quasi_sequence_order() + if (descr == "sequence_order_coupling_number" and getattr(self, "sequence_order_coupling_number").empty): + self.sequence_order_coupling_number = self.get_sequence_order_coupling_number() - if (getattr(self, "pseudo_amino_acid_composition").empty): - self.pseudo_amino_acid_composition = self.get_pseudo_amino_acid_composition() + if (descr == "quasi_sequence_order" and getattr(self, "quasi_sequence_order").empty): + self.quasi_sequence_order = self.get_quasi_sequence_order() - if (getattr(self, "amphiphilic_pseudo_amino_acid_composition").empty): - self.amphiphilic_pseudo_amino_acid_composition = self.get_amphiphilic_pseudo_amino_acid_composition() + if (descr == "pseudo_amino_acid_composition" and getattr(self, "pseudo_amino_acid_composition").empty): + self.pseudo_amino_acid_composition = self.get_pseudo_amino_acid_composition() + + if (descr == "amphiphilic_pseudo_amino_acid_composition" and getattr(self, "amphiphilic_pseudo_amino_acid_composition").empty): + self.amphiphilic_pseudo_amino_acid_composition = self.get_amphiphilic_pseudo_amino_acid_composition() + + #stop time counter, calculate elapsed time + end = time.time() + elapsed = end - start + + print('\nElapsed time for calculating all descriptors: {0:.2f} minutes.'.format(elapsed/60)) + print('\n##########################################################################################') #append all calculated descriptors to list all_desc = [ @@ -1119,10 +1151,15 @@ def get_all_descriptors(self, export=False): #export pre-calculated descriptor values to a csv, use default name if parameter empty if (export): - if (self.desc_config.descriptors_csv == "" or self.desc_config.descriptors_csv == None): - self.desc_config.descriptors_csv = "descriptors_output.csv" - - self.all_descriptors.to_csv(os.path.join(DATA_DIR, self.desc_config.descriptors_csv), index=0) + if (descriptors_export_filename == ""): + if (self.desc_config.descriptors_csv == "" or self.desc_config.descriptors_csv == None): + self.desc_config.descriptors_csv = "descriptors_output.csv" + self.all_descriptors.to_csv(self.desc_config.descriptors_csv, index=0) + else: + #append extension if not present on filename - export to csv + if (os.path.splitext(os.path.basename(descriptors_export_filename))[1] == ""): + descriptors_export_filename = descriptors_export_filename + ".csv" + self.all_descriptors.to_csv(descriptors_export_filename, index=0) return all_descriptor_df @@ -1134,7 +1171,7 @@ def get_descriptor_encoding(self, descriptor): Parameters ========== - 
:descriptor : str
+       :descriptor: str
           name of descriptor to return. Method can accept the approximate name
           of the descriptor, e.g. 'amino_comp'/'aa_composition' etc will return
           the 'amino_acid_composition' descriptor. This functionality is realised
@@ -1142,7 +1179,7 @@ def get_descriptor_encoding(self, descriptor):
        Returns
        =======
-       :desc_encoding : pd.DataFrame/None
+       :desc_encoding: pd.DataFrame/None
           dataframe of matching descriptor attribute. None returned if no
           matching descriptor found.
        """
@@ -1159,8 +1196,8 @@ def get_descriptor_encoding(self, descriptor):
        if (desc_matches != []):
            desc = desc_matches[0] #set desc to closest descriptor match found
        else:
-           raise ValueError("Could not find a match for the input descriptor ({}) in"
-               " available valid models:\n {}.".format(descriptor, self.valid_descriptors))
+           raise ValueError("Could not find a match for the input descriptor {} in"
+               " list of available valid descriptors:\n{}.".format(descriptor, self.valid_descriptors))

        #if sought descriptor attribute dataframe is empty, call the descriptor's
        # get_descriptor() function, set desc_encoding to descriptor attribute
@@ -1248,17 +1285,19 @@ def all_descriptors_list(self, desc_combo=1):
        Get list of all available descriptor attributes. Using the desc_combo
        input parameter you can get the list of all descriptors, all combinations
        of 2 descriptors or all combinations of 3 descriptors. Default of 1 will
-       mean a list of all available descriptor attributes will be returned.
+       mean a list of all available descriptor attributes will be returned. With
+       15 available descriptors, 105 and 455 combinations of 2 and 3 descriptors
+       will be returned if desc_combo=2 or desc_combo=3, respectively.

        Parameters
        ==========
-       :desc_combo : int (default=1)
+       :desc_combo: int (default=1)
           combination of descriptors to return. A value of 2 or 3 will return
           all combinations of 2 or 3 descriptor attributes etc.

        Returns
        =======
-       :all_descriptors : list
+       :all_descriptors: list
           list of available descriptor attributes.
        """
        #filter out class attributes that are not any of the desired descriptors
@@ -1455,4 +1494,8 @@ def __len__(self):
        return len(self.all_descriptors)

    def __shape__(self):
-       return self.all_descriptors.shape
\ No newline at end of file
+       return self.all_descriptors.shape
+
+   def __sizeof__(self):
+       """ Get size of all_descriptors object that stores all descriptor values. """
+       return self.all_descriptors.__sizeof__()
\ No newline at end of file
diff --git a/pySAR/encoding.py b/pySAR/encoding.py
index 99f8c3e..7a29e63 100644
--- a/pySAR/encoding.py
+++ b/pySAR/encoding.py
@@ -22,13 +22,16 @@ class Encoding(PySAR):
    The use-case of this class is when you have a dataset of protein sequences with
    a sought-after protein activity/fitness value and you want to measure this activity
    value for new and unseen sequences that have not had their activity value
-   experimentally measured. The encoding class allows for evaluation of a variety
-   of potential techniques at which to numerically encode the protein sequences,
-   allowing for the builiding of predictive regression ML models that can ultimately
-   predict the activity value of an unseen protein sequence. The strategies each
-   generate a huge number of potential models built an a plethora of available features
+   experimentally measured. Prior to protein sequences being passed into ML models,
+   the amino acids have to be numerically encoded. The encoding class allows for
+   evaluation of a variety of potential techniques with which to numerically encode the
+   protein sequences, allowing for the building of predictive regression ML models
+   that can ultimately predict the activity value of an unseen protein sequence by
+   mapping a relationship between sequence and activity/function. The strategies each
+   generate a huge number of potential models built on a plethora of available features
    that you can then assess for performance and predictability, selecting the
-   best-performing model out of all those evaluated.
+   best-performing model out of all those evaluated. This best-performing model should
+   then be used when you want to predict the activity/fitness value for new sequences.

    The encoding class inherits from the main PySAR module and allows for a dataset
    of protein sequences to be encoded through 3 main strategies: AAI Indices,
@@ -47,7 +50,7 @@ class Encoding(PySAR):
    Parameters
    ==========
-   :config_file : (str)
+   :config_file: (str)
        path to configuration file with all required parameters for the pySAR encoding
        pipeline.
    **kwargs: dict

    Methods
    =======
    aai_encoding(aai_indices=None, sort_by='R2', output_folder=""):
-       encoding protein sequences using indices from the AAI.
+       encoding protein sequences using indices from the AAI and aaindex package.
    descriptor_encoding(descriptors=None, desc_combo=1, sort_by='R2', output_folder=""):
        encoding protein sequences using protein descriptors from descriptors module and protpy package.
    aai_descriptor_encoding(aai_indices=None, descriptors=None, desc_combo=1, sort_by='R2', output_folder=""):
@@ -69,40 +72,45 @@ def __init__(self, config_file="", **kwargs):

        self.config_file = config_file

-       #pass config file into parent pySAR class
+       #pass config file and kwargs into parent pySAR class
        super().__init__(self.config_file, **kwargs)

    def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""):
        """
        Encoding all protein sequences using each of the available indices in the
-       AAI. The protein spectra of the AAI indices will be generated if use_dsp is true,
-       dictated by the instance attributes: spectrum, window and filter. If not true then
-       the encoced sequences from the AAI will directly be used. Each encoding will be
-       used as the feature data to build the predictive regression models. To date,
-       there are 566 indices in the AAI, therefore 566 total models can be built
-       using this encoding strategy. The metrics evaluated from the model for each AAI
-       encoding combination will be collated into a dataframe, saved and returned, with the
-       results sorted by R2 by default, this can be changed using the sort_by parameter.
+       AAI and aaindex package. The protein spectra of the AAI indices can be generated
+       if use_dsp is true when creating the Encoding instance, also utilised for the
+       DSP spectra are the instance attributes: spectrum, window and filter. If not true
+       then the encoded sequences from the AAI will directly be used - default.
+
+       Each encoding will be used as the feature data to build the predictive regression
+       ML models. To date, there are 566 indices in the AAI, therefore 566 total models
+       can be built using this encoding strategy. The metrics evaluated from the model
+       for each AAI encoding combination will be collated into a dataframe, saved and
+       returned, with the results sorted by R2 by default, this can be changed using
+       the sort_by parameter. You can sort the output dataframe via the other metrics,
+       including: RMSE, MSE, MAE, RPD and Explained Variance.

        Parameters
        ==========
-       :aai_indices : str/list (default=None)
+       :aai_indices: str/list (default=None)
           str/list of aai indices to use for encoding the predictive models, by default
-          ALL AAI indices will be used.
-       :sort_by : str (default=R2)
-          sort output dataframe by specified column/metric value, results sorted by R2 score
-          by default.
-       :output_folder : str (default="")
-          output folder to store results csv to, if empty input it will be stored in
+          ALL AAI indices will be used if parameter remains as None.
+       :sort_by: str (default=R2)
+          sort output dataframe by specified column/metric value, results sorted by R2
+          score by default.
+       :output_folder: str (default="")
+          output folder to store results csv to, if empty then output will be stored in
           the OUTPUT_FOLDER global var.

        Returns
        =======
-       :aaindex_metrics_df : pd.DataFrame
+       :aaindex_metrics_df: pd.DataFrame
           dataframe of calculated metric values from generated predictive models
           encoded using indices in the AAI for the AAI encoding strategy. Output will
-          be of the shape 566 x 8, where 566 is the number of indices that can be used
-          for the encoding and 8 is the results/metric columns.
+          be of the shape X x 8, where X is the number of indices that can be used
+          for the encoding and 8 is the results/metric columns. If no indices are
+          passed in then this shape will be 566 x 8.
        """
        #initialise dataframe to store all output results of AAI encoding
        aaindex_metrics_df = pd.DataFrame(columns=['Index', 'Category', 'R2', 'RMSE',
@@ -123,7 +131,7 @@ def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""):
            all_indices = aaindex1.record_codes()
        elif (isinstance(aai_indices, str)): #if single aai index input, cast to list
            if (',' in aai_indices):
-               all_indices = aai_indices.replace(' ', '').split(',')
+               all_indices = aai_indices.replace(' ', '').split(',') #split comma separated list of indices into list
            else:
                all_indices = [aai_indices]
        elif ((not isinstance(aai_indices, list)) and (not isinstance(aai_indices, str))):
@@ -134,10 +142,10 @@ def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""):
        #remove any duplicates from aai indices list, sort alphabetically
        all_indices = sorted(list(set(all_indices)))

-       #validate each input AAI ascession number is valid, if not raise error
+       #validate each input AAI accession number is valid, if not raise error
        for index in all_indices:
            if not (index in aaindex1.record_codes()):
-               raise ValueError("AAI record ({}) not found in list of available record codes\nInput record codes: {}..".format(index, aai_indices))
+               raise ValueError("AAI record {} not found in list of available record codes.".format(index))

        #create text wrapper for amino acid indices and model parameters text
        line_length = 90
@@ -152,13 +160,12 @@ def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""):
            print(textwrap.fill('# AAI Indices: {}'.format(', '.join(all_indices)), line_length))
        else:
            print('# AAI Indices: {}'.format(len(all_indices)))
-       if (self.use_dsp): 
+       if (self.use_dsp):
            print('# DSP Parameters:\n# Spectrum: {}\n# Window Function: {}\n# Filter Function: {}'.format(
                self.spectrum, self.window_type, self.filter_type))
        print('# Configuration File: {}\n# Dataset: {}\n# Number of Sequences/Sequence Length: {} x {}\n# Target Activity: {}\n# Algorithm: {}'.
format(os.path.basename(self.config_file), os.path.basename(self.dataset), self.num_seqs, self.sequence_length, self.activity_col, repr(temp_model_parameters))) if (self.model_parameters == "" or self.model_parameters is None or self.model_parameters == {}): - # print('# Model Parameters: {}'.format("\n\t".join(tw.wrap(', '.join(temp_model_parameters.model.get_params()))))) print(textwrap.fill('# Model Parameters: {}'.format(temp_model_parameters.model.get_params()), line_length)) else: print(textwrap.fill('# Model Parameters: {}'.format(self.model_parameters), line_length)) @@ -175,15 +182,15 @@ def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""): 5.) Repeat steps 1 - 4 for all indices. 6.) Output results into a final dataframe, save to OUTPUT_DIR and return. ''' - start = time.time() #start time counter + #start time counter + start = time.time() #disable tqdm progress bar if 5 or less aai indices input tqdm_disable = False if (len(all_indices)) <= 5: tqdm_disable = True - #using tqdm package to create a progress bar showing encoding progress, - #file=sys.stdout to stop error where iterations were printing out of order + #using tqdm package to create a progress bar showing encoding progress for index in tqdm(all_indices[:int(len(all_indices))], unit=" indices", position=0, desc="AAI Indices", disable=tqdm_disable, ncols=90): @@ -237,7 +244,7 @@ def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""): end = time.time() elapsed = end - start - print('\nElapsed Time for AAI Encoding: {0:.2f} seconds.'.format(elapsed)) + print('\nElapsed time for AAI Encoding: {0:.2f} seconds.'.format(elapsed)) print('\n##########################################################################################') #set columns in the output dataframe to each of the values/metrics lists @@ -251,7 +258,7 @@ def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""): aaindex_metrics_['MAE'] = mae_ aaindex_metrics_['Explained Variance'] = explained_var_ - #convert index and category from default Object -> String datatypes, + #convert index and category from default Object -> String datatypes aaindex_metrics_['Index'] = aaindex_metrics_['Index'].astype(pd.StringDtype()) aaindex_metrics_['Category'] = aaindex_metrics_['Category'].astype(pd.StringDtype()) @@ -272,46 +279,48 @@ def aai_encoding(self, aai_indices=None, sort_by='R2', output_folder=""): return aaindex_metrics_ - def descriptor_encoding(self, descriptors=None, desc_combo=1, sort_by='R2', output_folder=""): + def descriptor_encoding(self, descriptors=[], desc_combo=1, sort_by='R2', output_folder=""): """ Encoding all protein sequences using the available physiochemical, biochemical and structural descriptors from the custom-built protpy package. The sequences can be encoded using combinations of 1, 2 or 3 of these descriptors, dictated by the desc_combo input parameter: set this to 1, 2 or 3 for what encoding - combination to use, default is 1. Each descriptor encoding will be used as the - feature data to build the predictive regression models. With 15 descriptors - supported by pySAR & protpy this means there can be 15, 105 and 455 total - predictive models built for 1, 2 or 3 descriptors, respecitvely. These totals - may vary depending on the meta-parameters on some of the descriptors e.g the - lag or lambda for the autocorrelation and pseudo amino acid descriptors, - respectively. 
The metrics evaluated from the model for each descriptor
-       encoding combination will be collated into a dataframe and saved and returned,
-       with the results sorted by the R2 score by default, this can be changed using
-       the sort_by parameter.
+       combination to use, default is 1.
+
+       Each descriptor encoding will be used as the feature data to build the predictive
+       regression ML models. These models can then be used to predict the sought-after
+       activity/fitness value for unseen test sequences. With 15 descriptors supported
+       by pySAR & protpy this means there can be 15, 105 and 455 total predictive models
+       built for 1, 2 or 3 descriptors, respectively. These totals may vary depending on
+       the meta-parameters on some of the descriptors e.g. the lag or lambda for the
+       autocorrelation and pseudo amino acid descriptors, respectively. The metrics
+       evaluated from the model for each descriptor encoding combination will be collated
+       into a dataframe and saved and returned, with the results sorted by the R2 score
+       by default, this can be changed using the sort_by parameter.

        Parameters
        ==========
-       :descriptors : str/list (default=None)
+       :descriptors: str/list (default=[])
           str/list of descriptors to use for encoding, by default all available descriptors
           in the protpy package will be used for the encoding.
-       :desc_combo : int (default=1)
-          combination of descriptors to use.
-       :sort_by : str (default=R2)
+       :desc_combo: int (default=1)
+          combination of descriptors to use, default of 1.
+       :sort_by: str (default=R2)
           sort output dataframe by specified column/metric value, results sorted by R2
           score by default.
-       :output_folder : str (default="")
-          output folder to store results csv to, if empty input it will be stored in
-          the OUTPUT_FOLDER global var.
+       :output_folder: str (default="")
+          output folder to store results csv to, if parameter not set then output will
+          be stored in the OUTPUT_FOLDER global var.

        Returns
        =======
-       :desc_metrics_df_ : pd.DataFrame
+       :desc_metrics_df_: pd.DataFrame
           dataframe of calculated metric values from generated predictive models
           encoded using all or selected input descriptors for the descriptors
-          encoding strategy. Output will be of the shape 15 x 8, 105 x 8 or
-          455 x 8 when using a desc_combo value of 1, 2 or 3, respectively
-          representing the number of descriptors that can be used for the
-          encoding and 8 is the results/metric columns.
+          encoding strategy. Output will be of the shape X x 8, where X is the
+          number of descriptors input and 8 is the results/metric columns. By
+          default the output shape will be 15 x 8, but with a desc_combo of 2
+          and 3, the shape will be 105 x 8 and 455 x 8, respectively.
        """
        #create dataframe to store output results from models
        desc_metrics_df = pd.DataFrame(columns=['Descriptor', 'Group', 'R2', 'RMSE',
@@ -327,20 +336,20 @@
        mae_ = []
        explained_var_ = []

-       #create instance of descriptors class
+       #create instance of descriptors class using config file
        desc = Descriptors(self.config_file)

-       #if no descriptors passed into descriptors then use all descriptors by default,
+       #if no descriptors passed into descriptors input param then use all descriptors by default,
        #get list of all descriptors according to desc_combo value
-       if (descriptors == None or descriptors == [] or descriptors == ""):
-           all_descriptors = desc.all_descriptors_list(desc_combo)
+       if ((not isinstance(descriptors, list)) and (not isinstance(descriptors, str))):
+           raise TypeError("Input Descriptor parameter is not of type list or str, got {}.".format(type(descriptors)))
+       elif (descriptors == [] or descriptors == ""):
+           all_descriptors = desc.all_descriptors_list(desc_combo) #using all descriptors
        elif (isinstance(descriptors, str)): #if single descriptor input, cast to list
            if (',' in descriptors):
-               all_descriptors = descriptors.replace(' ', '').split(',')
+               all_descriptors = descriptors.replace(' ', '').split(',') #split comma separated list of descriptors into list
            else:
                all_descriptors = [descriptors]
-       elif ((not isinstance(descriptors, list)) and (not isinstance(descriptors, str))):
-           raise TypeError("Input Descriptor parameter is not of type list or str, got {}.".format(type(descriptors)))
        else:
            if (desc_combo == 2):
                all_descriptors = list(itertools.combinations(descriptors, 2))
@@ -374,9 +383,10 @@
        start = time.time()

        '''
-       1.) Get current descriptor value or combination of descriptors from all_descriptors list.
-       2.) Build model using descriptor features from current descriptor(s).
-       3.) Predict and evaluate the model using the test data.
+       1.) Get current descriptor value or combination of descriptors from all_descriptors list for
+           dataset of protein sequences.
+       2.) Build model using calculated descriptor features from current descriptor(s).
+       3.) Predict and evaluate the model using the test set of protein sequences.
        4.) Append descriptor(s) and calculated metrics to lists.
        5.) Repeat steps 1 - 4 for all descriptors.
        6.) Output results into a final dataframe, save it and return, sorting by sort_by parameter.
@@ -421,19 +431,18 @@ def descriptor_encoding(self, descriptors=None, desc_combo=1, sort_by='R2', outp #set model training data to desc_ dataframe X = desc_ - #get protein activity values + #get protein activity values - training labels Y = self.activity ''' Note: If using the PlsRegression algorithm and there is only 1 feature (1-dimension) - in the feature data X (e.g SOCN) then create a new PLSReg model manually setting the + in the feature data X (e.g SOCN) then create a new PLSReg model and manually set the n_components parameter to 1 instead of the default 2 - this stops the error: 'ValueError - Invalid Number of Components: 2' ''' - #get train/test split, fit model and predict activity of test data if ((X.shape[1] == 1) and (self.algorithm.lower() == "plsregression")): - tmp_model = Model(X, Y, 'plsreg', parameters={'n_components':1}) + tmp_model = Model(X, Y, self.algorithm, parameters={'n_components':1}) X_train, X_test, Y_train, Y_test = tmp_model.train_test_split(test_split=self.test_split) model_fit = tmp_model.fit() Y_pred = tmp_model.predict() @@ -459,7 +468,7 @@ def descriptor_encoding(self, descriptors=None, desc_combo=1, sort_by='R2', outp end = time.time() elapsed = end - start - print('\nElapsed Time for Descriptor Encoding: {0:.2f} seconds.\n'.format(elapsed)) + print('\nElapsed time for Descriptor Encoding: {0:.2f} seconds.\n'.format(elapsed)) print('\n##########################################################################################') #if using combinations of 2 or 3 descriptors, group every 2 or 3 descriptor @@ -498,63 +507,67 @@ def descriptor_encoding(self, descriptors=None, desc_combo=1, sort_by='R2', outp #sort results according to sort_by parameter (R2 by default) desc_metrics_df_ = desc_metrics_df_.sort_values(by=[sort_by], ascending=sort_ascending) - #set save path according to the descriptor combinations type + #set save filename according to the descriptor combinations type if (desc_combo == 2): - save_path = 'desc_combo2_results' + save_filename = 'desc_combo2_results' elif (desc_combo == 3): - save_path = 'desc_combo3_results' + save_filename = 'desc_combo3_results' else: - save_path = 'desc_results' + save_filename = 'desc_results' - #save results dataframe to specified save_path - save_results(desc_metrics_df_, save_path, output_folder=output_folder) + #save results dataframe to specified save_filename + save_results(desc_metrics_df_, save_filename, output_folder=output_folder) return desc_metrics_df_ - def aai_descriptor_encoding(self, aai_indices=None, descriptors=None, desc_combo=1, sort_by='R2', output_folder=""): + def aai_descriptor_encoding(self, aai_indices=[], descriptors=[], desc_combo=1, sort_by='R2', output_folder=""): """ - Encoding all protein sequences using each of the available indices in the AAI in - concatenation with the protein descriptors available via the protpy pacakge. The - sequences can be encoded using 1 AAI + 1 Descriptor, 2 Descriptors or 3 Descriptors, - dictated by the desc_combo input parameter: set this to 1, 2 or 3 for what encoding - combination to use, default is 1. The protein spectra of the AAI indices will be - generated if the config param use_dsp is true, along with the class attributes: - spectrum, window and filter. Each encoding will be used as the feature data to - build the predictive regression models. 
To date, there are 566 indices and
-       pySAR/protpy supports 15 descriptors so the encoding process will generate 8490,
-       ~59000 and ~257000 models, when using 1, 2 or 3 descriptors + AAI indices,
-       respectively. These values may vary depending on the meta-parameters on some of
-       the descriptors such as the lag or lambda for the autocorrelation and pseudo
-       amino acid descriptors, respectively. The metrics evaluated from the model for
-       each AAI + Descriptor encoding combination will be collated into a dataframe and
-       saved and returned, sorted by the R2 score by default.
+       Encoding all protein sequences using each of the available indices in the AAI and
+       aaindex package in concatenation with the protein descriptors available via the
+       protpy package. The sequences can be encoded using 1 AAI + 1 Descriptor, 2
+       Descriptors or 3 Descriptors, dictated by the desc_combo input parameter: set
+       this to 1, 2 or 3 for what encoding combination to use, default is 1. The protein
+       spectra of the AAI indices will be generated if the config param use_dsp is true,
+       also utilised for the DSP transformation are the class attributes: spectrum, window
+       and filter.
+
+       Each numerical encoding will be used as the feature data to build the predictive
+       regression ML models. To date, there are 566 indices and pySAR/protpy supports
+       15 descriptors so the encoding process will generate 8490, ~59000 and ~257000
+       models, when using 1, 2 or 3 descriptors + AAI indices, respectively. These values
+       may vary depending on the meta-parameters on some of the descriptors such as the
+       lag or lambda for the autocorrelation and pseudo amino acid descriptors, respectively.
+       The metrics evaluated from the model, assessing its accuracy and predictability for
+       each AAI + Descriptor encoding combination, will be collated into a dataframe and saved
+       and returned, sorted by the R2 score by default.

        Parameters
        ==========
-       :aai_indices : str/list (default=None)
+       :aai_indices: str/list (default=[])
           str/list of aai indices to use for encoding the predictive models, by default
           ALL AAI indices will be used.
-       :decs_list : list (default=None)
+       :descriptors: list (default=[])
           str/list of descriptors to use for encoding, by default all available descriptors
           in the protpy package will be used for the encoding.
-       :desc_combo : int (default=1)
+       :desc_combo: int (default=1)
           combination of descriptors to use.
-       :sort_by : str (default=R2)
+       :sort_by: str (default=R2)
           sort output dataframe by specified column/metric value, results sorted by R2
           score by default.
-       :output_folder : str (default="")
+       :output_folder: str (default="")
           output folder to store results csv to, if empty it will be stored in
           the OUTPUT_FOLDER global var.

        Returns
        =======
-       :aai_desc_metrics_df_ : pd.DataFrame
+       :aai_desc_metrics_df_: pd.DataFrame
           dataframe of calculated metric values from generated predictive models
           encoded using AAI indices + descriptors encoding strategy. The output will
-          be of shape (566 * 15) x 10, (566 * 105) x 10, or (566 * 455) x 10,
-          depending on the desc_combo param which dictates the combinations of
-          descriptors to use with the indices. 10 represents the results/metrics
-          columns of the dataframe.
+          be of shape (X * Y) x 10, where X is the number of AAI indices input, Y is
+          the number of descriptors input and 10 is the results/metrics columns of
+          the output dataframe. Using the default values and desc_combo of 1, 2 and
+          3, the output shapes will be (566 * 15) x 10, (566 * 105) x 10, or
+          (566 * 455) x 10.
        """
        #create dataframe to store output results from models
        aai_desc_metrics_df = pd.DataFrame(columns=['Index', 'Category', 'Descriptor',\
@@ -572,16 +585,17 @@
        mae_ = []
        explained_var_ = []

+       #raise error if invalid parameter data types input
+       if ((not isinstance(aai_indices, list)) and (not isinstance(aai_indices, str))):
+           raise TypeError("Input AAI parameter is not of type list or str, got {}.".format(type(aai_indices)))
        #if no indices passed into aai_indices then use all indices by default
-       if (aai_indices == None or aai_indices == [] or aai_indices == ""):
+       elif (aai_indices == None or aai_indices == [] or aai_indices == ""):
            all_indices = aaindex1.record_codes()
-       elif (isinstance(aai_indices, str)): #if single descriptor input, cast to list
+       elif (isinstance(aai_indices, str)): #if single descriptor input, cast comma separated indices to list
            if (',' in aai_indices):
                all_indices = aai_indices.replace(' ', '').split(',')
            else:
                all_indices = [aai_indices]
-       elif ((not isinstance(aai_indices, list)) and (not isinstance(aai_indices, str))):
-           raise TypeError("Input AAI parameter is not of type list or str, got {}.".format(type(aai_indices)))
        else:
            all_indices = aai_indices

@@ -591,23 +605,24 @@
        #validate each input AAI accession number is valid, if not raise error
        for index in all_indices:
            if not (index in aaindex1.record_codes()):
-               raise ValueError("AAI record ({}) not found in list of available record codes\nInput record codes: {}..".format(index, aai_indices))
+               raise ValueError("AAI record {} not found in list of available record codes.".format(index))

        #create instance of Descriptors class
        desc = Descriptors(config_file=self.config_file)

+       #raise error if invalid parameter data types input
+       if ((not isinstance(descriptors, list)) and (not isinstance(descriptors, str))):
+           raise TypeError("Input Descriptor parameter is not of type list or str, got {}.".format(type(descriptors)))
        #if no descriptors passed into descriptors then use all descriptors by default,
-       #get list of all descriptors according to desc_combo value
-       if (descriptors == None or descriptors == [] or descriptors == ""):
+       elif (descriptors == None or descriptors == [] or descriptors == ""):
            all_descriptors = desc.all_descriptors_list(desc_combo)
-       elif (isinstance(descriptors, str)): #if single descriptor input, cast to list
+       elif (isinstance(descriptors, str)): #if single descriptor input, cast comma separated descriptors to list
            if (',' in descriptors):
                all_descriptors = descriptors.replace(' ', '').split(',')
            else:
                all_descriptors = [descriptors]
-       elif ((not isinstance(descriptors, list)) and (not isinstance(descriptors, str))):
-           raise TypeError("Input Descriptor parameter is not of type list or str, got {}.".format(type(descriptors)))
        else:
+           #get list of all descriptors according to desc_combo value
            if (desc_combo == 2):
                all_descriptors = list(itertools.combinations(descriptors, 2))
            elif (desc_combo == 3):
                all_descriptors = list(itertools.combinations(descriptors, 3))
            else:
                all_descriptors = descriptors

        #remove any duplicates from descriptors list, sort alphabetically
-       # all_descriptors = sorted(list(set(all_descriptors)))
+       all_descriptors = sorted(list(set(all_descriptors)))

        #create text wrapper for amino acid indices and descriptors text, split to newline if surpasses line length
        line_length = 90
@@ -650,11 +665,11 @@ def aai_descriptor_encoding(self, aai_indices=[], descriptors=[], desc_combo=1, sort_by='R2', output_folder=
        '''
        1.) Get AAI index encoding of protein sequences. If using DSP, create instance
-       of pyDSP class and generate protein spectra from the AAI indices, according to
-       instance parameters: spectrum, window and filter.
+           of pyDSP class and generate protein spectra from the AAI indices, according to
+           instance parameters: spectrum, window and filter.
        2.) Get all descriptor values and concatenate to AAI encoding features.
        3.) Build model using concatenated AAI and Descriptor features as the training data.
-       4.) Predict and evaluate the model using the test data.
+       4.) Predict and evaluate the model using the unseen test protein sequences.
        5.) Append index, descriptor and calculated metrics to lists.
        6.) Repeat steps 1 - 5 for all indices in the AAI.
        7.) Output results into a final dataframe, save it and return, sort by sort_by parameter.
@@ -715,14 +730,14 @@
        '''
        Note: If using the PlsRegression algorithm and there is only 1 feature (1-dimension)
-       in the feature data X then create a new PLSReg model with the n_components
+       in the feature data X (e.g. SOCN) then create a new PLSReg model with the n_components
        parameter set to 1 instead of the default 2 - this stops the error:
        'ValueError - Invalid Number of Components: 2.'
        '''
        #get train/test split, fit model and predict activity of test data
        if (X.shape[1] == 1 and self.algorithm.lower() == "plsregression"):
-           tmp_model = Model(X, Y, 'plsreg', parameters={'n_components':1})
+           tmp_model = Model(X, Y, self.algorithm, parameters={'n_components':1})
            X_train, X_test, Y_train, Y_test = tmp_model.train_test_split(X, Y, self.model_parameters, self.test_split)
            model_fit = tmp_model.fit()
            Y_pred = tmp_model.predict()
@@ -750,7 +765,7 @@
        end = time.time()
        elapsed = end - start

-       print('Elapsed Time for AAI + Descriptor Encoding: {0:.2f} seconds.'.format(elapsed))
+       print('Elapsed time for AAI + Descriptor Encoding: {0:.2f} seconds.'.format(elapsed))
        print('\n###########################################################################')

        #if using combinations of 2 or 3 descriptors, group every 2 or 3 descriptor
@@ -793,16 +808,16 @@
        #sort results according to sort_by parameter (R2 by default)
        aai_desc_metrics_df_ = aai_desc_metrics_df_.sort_values(by=[sort_by], ascending=sort_ascending)

-       #set save path according to the descriptor combinations type
+       #set save filename according to the descriptor combinations type
        if (desc_combo == 2):
-           save_path = 'aai_desc_combo2_results'
+           save_filename = 'aai_desc_combo2_results'
        elif (desc_combo == 3):
-           save_path = 'aai_desc_combo3_results'
+           save_filename = 'aai_desc_combo3_results'
        else:
-           save_path = 'aai_desc_results'
+           save_filename = 'aai_desc_results'

-       #save results dataframe to specified save_path
-       save_results(aai_desc_metrics_df_, save_path, output_folder=output_folder)
+       #save results dataframe to specified save_filename
+       save_results(aai_desc_metrics_df_, save_filename, output_folder=output_folder)

        return aai_desc_metrics_df_
diff --git a/pySAR/evaluate.py b/pySAR/evaluate.py
index 92054bb..cefe356 100644
--- a/pySAR/evaluate.py
+++ b/pySAR/evaluate.py
@@ -15,9 +15,9 @@ class Evaluate():
    Parameters
    ==========
-   :Y_true : np.ndarray
+   :Y_true: np.ndarray
        array of observed activity/fitness values.
-   :Y_pred : np.ndarray
+   :Y_pred: np.ndarray
        array of predicted activity/fitness values.

    Methods
    =======
@@ -46,7 +46,7 @@ def __init__(self, Y_true, Y_pred):
        self.Y_pred = np.array(Y_pred).reshape((-1,1))

        #validate that predicted and observed input arrays are of the same length,
-       #if input predicted and observed arrays are not same shape then raise error
+       #if not same shape then raise error
        if (self.Y_true.shape != self.Y_pred.shape):
            raise ValueError('Observed and predicted values must be of the same shape,\
                Y_true = {} & Y_pred = {}.'.format(Y_true.shape, Y_pred.shape))
@@ -67,33 +67,33 @@ def r2_(self, multioutput='uniform_average'):
        Parameters
        ==========
-       :multioutput : str (default='uniform_average')
+       :multioutput: str (default='uniform_average')
           method that defines aggregating of multiple output scores. Default
           is recommended ('uniform_average'), available values:
           {‘raw_values’, ‘uniform_average’, ‘variance_weighted’}.

        Returns
        =======
-       :r2 : float
+       :r2: float
           R2 (coefficient of determination) score for observed and predicted values.
        """
        return r2_score(self.Y_true, self.Y_pred, multioutput=multioutput)

-   def mse_(self,multioutput='uniform_average'):
+   def mse_(self, multioutput='uniform_average'):
        """
        Calculate MSE (mean square error) regression loss score for observed
        and predicted values.

        Parameters
        ==========
-       :multioutput : str (default='uniform_average')
+       :multioutput: str (default='uniform_average')
           method that defines aggregating of multiple output scores. Default
           is recommended ('uniform_average'), available values:
           {‘raw_values’, ‘uniform_average’, ‘variance_weighted’}.

        Returns
        =======
-       :mse : float
+       :mse: float
           MSE (mean square error) score for observed and predicted values.
        """
        return mean_squared_error(self.Y_true, self.Y_pred, multioutput=multioutput)
@@ -106,14 +106,14 @@ def rmse_(self, multioutput='uniform_average'):
        Parameters
        ==========
-       :multioutput : str (default='uniform_average')
+       :multioutput: str (default='uniform_average')
           method that defines aggregating of multiple output scores. Default
           is recommended ('uniform_average'), available values:
           {‘raw_values’, ‘uniform_average’, ‘variance_weighted’}.

        Returns
        =======
-       :rmse : float
+       :rmse: float
           RMSE score for observed and predicted values.
        """
        return mean_squared_error(self.Y_true, self.Y_pred, squared=False, multioutput=multioutput)
@@ -125,14 +125,14 @@ def mae_(self, multioutput='uniform_average'):
        Parameters
        ==========
-       :multioutput : str (default='uniform_average')
+       :multioutput: str (default='uniform_average')
           method that defines aggregating of multiple output scores. Default
           is recommended ('uniform_average'), available values:
           {‘raw_values’, ‘uniform_average’, ‘variance_weighted’}.

        Returns
        =======
-       :mae : float
+       :mae: float
           If multioutput is ‘raw_values’, then MAE is returned for each output separately.
           If multioutput is ‘uniform_average’ or an ndarray of weights, then the weighted
           average of all output errors is returned.
@@ -152,7 +152,7 @@ def rpd_(self):
        Returns
        =======
-       :rpd : float
+       :rpd: float
           the RPD score for the model.
        """
        return self.Y_true.std() / np.sqrt(self.mse_())
@@ -164,14 +164,14 @@ def explained_var_(self, multioutput='uniform_average'):
        Parameters
        ==========
-       :multioutput : str (default='uniform_average')
+       :multioutput: str (default='uniform_average')
           method that defines aggregating of multiple output scores. Default
           is recommended ('uniform_average'), available values:
           {‘raw_values’, ‘uniform_average’, ‘variance_weighted’}.

        Returns
        =======
-       :explained_var : float
+       :explained_var: float
           The explained variance or ndarray if ‘multioutput’ is ‘raw_values’.
        """
        return explained_variance_score(self.Y_true, self.Y_pred, multioutput=multioutput)
@@ -186,8 +186,9 @@ def max_error_(self):
        Returns
        =======
-       :max_error : float
-          A positive floating point value (the best value is 0.0).
+       :max_error: float
+          A positive floating point value of the maximal residual error
+          (the best value is 0.0).
        """
        return float(max_error(self.Y_true, self.Y_pred))
@@ -202,10 +203,10 @@ def mean_poisson_deviance_(self):
        Returns
        =======
-       :mean_poisson_deviance : float
+       :mean_poisson_deviance: float
           A non-negative floating point value (the best value is 0.0).
        """
-       return mean_poisson_deviance(self.Y_true, self.Y_true)
+       return mean_poisson_deviance(self.Y_true, self.Y_pred)

    def __repr__(self):
        return ".".format(
@@ -214,5 +215,5 @@ def __str__(self):
        return "Instance of Evaluate Class with attribute values: \
            R2: {}, RMSE: {}, MSE: {}, MAE: {}, RPD: {}, Explained Variance: {},\
-           Max Error: {}".format(self.r2, self.rmse, self.mse, self.mae,
+           Max Error: {}.".format(self.r2, self.rmse, self.mse, self.mae,
                self.rpd, self.explained_var, self.max_error)
\ No newline at end of file
diff --git a/pySAR/globals_.py b/pySAR/globals_.py
index 5077c9b..d3543d3 100644
--- a/pySAR/globals_.py
+++ b/pySAR/globals_.py
@@ -5,10 +5,6 @@
import os
from datetime import datetime

-#data dir is the default directory used to store all data required for pySAR
-global DATA_DIR
-DATA_DIR = 'data'
-
#output dir is the default directory used to store all outputs generated
global OUTPUT_DIR
OUTPUT_DIR = 'outputs'
diff --git a/pySAR/model.py b/pySAR/model.py
index ed11940..f2a015a 100644
--- a/pySAR/model.py
+++ b/pySAR/model.py
@@ -32,31 +32,35 @@ class Model():
    Regression, Lasso, Ridge, Support Vector Regression, Stochastic Gradient
    Descent and K Nearest Neighbours (KNN).

+   Once a model object has been built and fitted to the training data and
+   labels, it can then be used for predicting the sought activity/fitness
+   value for unseen test sequences.

    Parameters
    ==========
-   :X : np.ndarray
+   :X: np.ndarray
        training data.
-   :Y : np.ndarray
+   :Y: np.ndarray
        training data labels.
-   :algorithm : str
+   :algorithm: str
        sklearn regression algorithm to build and fit model with. Value can
        be an approximate representation of model name, for example: 'plsreg'
        will initialise an instance of the PLSRegression model etc. Available
        algorithms listed above.
-   :parameters : dict (default={})
+   :parameters: dict (default={})
        parameters to use for specific sklearn model when building regression model,
        by default it is set to {}, meaning all of the models' default parameters are used.
        Refer to sci-kit learn for full list of available input parameters for each model:
        https://scikit-learn.org/stable/index.html.
-   :test_split : float (default=0.2)
+   :test_split: float (default=0.2)
        proportion of the test data to use for building model, default of 0.2 is
-       reccomended.
+       recommended, meaning 80% of the data used for training and 20% for testing.

    Methods
    =======
    get_model():
        build model using inputted parameters.
-   train_test_split(scale = True, test_split = 0.2, random_state=None, shuffle=True):
+   train_test_split(scale=True, test_split=0.2, random_state=None, shuffle=True):
        get train-test split of dataset.
    fit():
        fit model.
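The Model and Evaluate classes above are designed to be used together, so a brief sketch may help tie them to the changes in this patch. The data here is purely synthetic/illustrative, and the calls assume the class signatures shown above:

```python
# Illustrative sketch only: random data stands in for real descriptor/AAI
# features and activity values; class signatures assumed from this patch.
import numpy as np
from pySAR.model import Model
from pySAR.evaluate import Evaluate

X = np.random.rand(100, 20)  # dummy feature matrix (e.g. descriptor values)
Y = np.random.rand(100)      # dummy activity/fitness values

model = Model(X, Y, algorithm="plsregression", parameters={"n_components": 2})
X_train, X_test, Y_train, Y_test = model.train_test_split(test_split=0.2)
model.fit()
Y_pred = model.predict()

# evaluate predicted vs observed activity values
eval_ = Evaluate(Y_test, Y_pred)
print(eval_.r2_(), eval_.rmse_(), eval_.mae_())
```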
@@ -71,7 +75,8 @@ class Model():
    feature_selection(method=""):
        undertake feature selection using technique specified by method input
        parameter to find optimal selection of features for maximum predictability
-       in model.
+       in model. Supported feature selection methods include SelectKBest, chi2,
+       VarianceThreshold, RFE, SelectFromModel and SequentialFeatureSelector.
    """
    def __init__(self, X, Y, algorithm, parameters={}, test_split=0.2):
@@ -94,7 +99,7 @@ def __init__(self, X, Y, algorithm, parameters={}, test_split=0.2):
        #raise error if algorithm parameter isn't string type
        if not(isinstance(self.algorithm, str)):
-           raise TypeError("Algorithm input parameter must be a string.")
+           raise TypeError("Algorithm input parameter must be a string, got type {}.".format(type(self.algorithm)))

        #get closest match of valid model from the input algorithm parameter value using difflib
        model_matches = get_close_matches(self.algorithm.lower().strip(),[item.lower().strip() \
@@ -104,7 +109,7 @@ def __init__(self, X, Y, algorithm, parameters={}, test_split=0.2):
        if (model_matches!=[]):
            self.algorithm = model_matches[0]
        else:
-           raise ValueError('Input algorithm {} not found in list of available valid models {}.'.format(
+           raise ValueError('Input algorithm {} not found in list of available valid models\n{}.'.format(
                self.algorithm, self.valid_models))

        #create instance of algorithm object using its sklearn constructor
@@ -125,7 +130,7 @@ def get_model(self):
        Returns
        =======
-       :model : sklearn.model
+       :model: sklearn.model
           instantiated regression model with default or user-specified parameters.
        """
        parameters = {}

        if (self.algorithm.lower().strip() == 'plsregression'):
            #get parameters of sklearn model and check that user inputted
-           #parameters are available in the model, only use those that are valid.
+           #parameters are available in the model, only use those that are valid
            for k,v in PLSRegression().__dict__.items():
                if (k in list(self.parameters.keys())):
                    parameters[k] = self.parameters[k]
@@ -260,7 +265,7 @@ def get_model(self):
            model = KNeighborsRegressor()
        #no matching valid algorithm/model found
        else:
-           raise ValueError('Input Algorithm ({}) not found in available valid models: {}'.
+           raise ValueError('Input Algorithm {} not found in available valid models:\n{}'.
                format(self.algorithm, self.valid_models))

        return model
@@ -275,19 +280,19 @@ def train_test_split(self, test_split=0.2, scale=True, random_state=None, shuffl
        Parameters
        ==========
-       :scale : bool (default=True)
+       :scale: bool (default=True)
           if true then scale the features such that they are standardised.
-       :test_split : float (default=0.2)
-          proportion of the total dataset to use for testing.
+       :test_split: float (default=0.2)
+          proportion of the total dataset to use for testing, rest used for training.
        :random_state: float (default=None)
           Controls the shuffling applied to the data before applying the split.
           Popular integer random seeds are 0 and 42, None by default.
-       :shuffle : bool (default=True)
+       :shuffle: bool (default=True)
           Whether or not to shuffle the data before splitting.

        Returns
        =======
-       :self.X_train, self.X_test, self.Y_train, self.Y_test : np.ndarray
+       :self.X_train, self.X_test, self.Y_train, self.Y_test: np.ndarray
           split training and test data features and labels.
        """
        #validate that X and Y arrays are of the same size
@@ -337,7 +342,7 @@ def fit(self):
        Returns
        =======
-       :self.model_fit : np.ndarray
+       :self.model_fit: np.ndarray
           fitted sklearn model of type specified by algorithm attribute.
""" self.model_fit = self.model.fit(self.X_train, self.Y_train) @@ -345,7 +350,8 @@ def fit(self): def predict(self): """ - Predict the target values of unseen test data using the model. + Predict the target values of unseen test data using the + trained model. Parameters ========== @@ -353,7 +359,7 @@ def predict(self): Returns ======= - :self.model_fit.predict(self.X_test) : np.ndarray + :self.model_fit.predict(self.X_test): np.ndarray array of predicted target values for unseen test data. """ return self.model_fit.predict(self.X_test) @@ -364,8 +370,10 @@ def save(self, save_folder, model_name="model.pkl"): Parameters ========== - :save_folder : str + :save_folder: str folder to save model to. + :model_name: str + filename for model. Returns ======= @@ -383,7 +391,7 @@ def save(self, save_folder, model_name="model.pkl"): try: with open(save_path, 'wb') as file: pickle.dump(self.model, file) - except (IOError, OSError, pickle.PickleError, pickle.UnpicklingError): + except (pickle.PickleError): print("Error pickling model with path: {}.".format(save_path)) def hyperparameter_tuning(self, param_grid={}, metric='r2', cv=5, n_jobs=None, verbose=2): @@ -393,16 +401,16 @@ def hyperparameter_tuning(self, param_grid={}, metric='r2', cv=5, n_jobs=None, v Parameters ========== - :param_grid : dict (default={}) - dictionary/grid of selected models' parameter and the potential values of each + :param_grid: dict (default={}) + dictionary/grid of selected models' parameters and the potential values of each that you want to tune. - :metric : str (default=r2) + :metric: str (default=r2) scoring metric used to evaluate the performance of the cross-validated model on the test set, R2 by default. List of available scoring metrics can be found in documentation: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter - :cv : int (default=5) - Determines the cross-validation splitting strategy. + :cv: int (default=5) + Determines the cross-validation splitting strategy, a CV fold of 5 is used by default. :n_jobs : int (default=None) Number of jobs to run in parallel. None means 1 job. :verbose: int (default=2) @@ -420,7 +428,7 @@ def hyperparameter_tuning(self, param_grid={}, metric='r2', cv=5, n_jobs=None, v #input metric must be in available scoring metrics, if not raise error if (metric not in sorted(_SCORERS.keys())): - raise UndefinedMetricWarning('Invalid scoring metric, {} not in available Sklearn Scoring Metrics: {}.\n'\ + raise UndefinedMetricWarning('Invalid scoring metric {} not in list of available Sklearn Scoring Metrics:\n{}.'\ .format(metric, _SCORERS.keys())) #cv must be of type int and be between 5 and 10, if not then default of 5 is used @@ -487,7 +495,7 @@ def model_fitted(self): Returns ======= - :True/False : bool + :True/False: bool true if model (self.model) has been fitted, false if not. """ return (self.model_fit != None) @@ -500,13 +508,13 @@ def feature_selection(self, method=""): Parameters ========== - :method : str (default="") + :method: str (default="") feature selection method to use. Returns ======= - :X_new : np.ndarray - best found features from training data. + :X_new: np.ndarray + best found features using training data. 
        References
        ==========
@@ -518,7 +526,7 @@ def feature_selection(self, method=""):

        #get closest valid feature selection method
        feature_matches = get_close_matches(method.lower().strip(), [item.lower().strip() \
-           for item in valid_feature_selection], cutoff=0.5)
+           for item in valid_feature_selection], cutoff=0.6)

        #apply feature selection method according to input parameter
        if (feature_matches != [] and feature_matches[0] == 'selectkbest'):
@@ -617,13 +625,13 @@ def __str__(self):
            type(self.model).__name__, self.parameters, self.model_fitted())

    def __repr__(self):
-       """ Object representation of class instance """
+       """ Object representation of class instance. """
        return type(self.model).__name__

    def __eq__(self, other):
-       """ Checking if 2 sklearn models are the same """
+       """ Checking if 2 sklearn models are the same. """
        return self.model == other.model

    def __sizeof__(self):
-       """ Get size of sklearn model """
+       """ Get size of sklearn model. """
        return self.model.__sizeof__()
\ No newline at end of file
diff --git a/pySAR/plots.py b/pySAR/plots.py
index 93a5b13..99b50ca 100644
--- a/pySAR/plots.py
+++ b/pySAR/plots.py
@@ -14,16 +14,16 @@ def plot_reg(Y_true, Y_pred, r2, output_folder="", show_plot=False):

    Parameters
    ==========
-   :Y_true : np.ndarray
+   :Y_true: np.ndarray
        array of observed values.
-   :Y_pred : np.ndarray
+   :Y_pred: np.ndarray
        array of predicted values.
-   :r2 : float
+   :r2: float
        r2 score value.
-   :output_folder : str (default="")
+   :output_folder: str (default="")
        output folder to store regression plot to, if empty input it
        will be stored in the OUTPUT_FOLDER global var.
-   :show_plot : bool (default=False)
+   :show_plot: bool (default=False)
        whether to display plot or not when function is run, if False the
        plot is just saved to output folder.

diff --git a/pySAR/pyDSP.py b/pySAR/pyDSP.py
index fb35665..142955d 100644
--- a/pySAR/pyDSP.py
+++ b/pySAR/pyDSP.py
@@ -10,9 +10,9 @@
    kaiser, gaussian, barthann, bohman, chebwin, cosine, exponential, boxcar, \
    flattop, nuttall, parzen, tukey, triang
try:
-   from scipy.fftpack import fft, ifft, fftfreq
+   from scipy.fftpack import fft
except:
-   from numpy.fft import fft, ifft, fftfreq
+   from numpy.fft import fft
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import json
@@ -27,29 +27,31 @@ class PyDSP():
    components. When both the function and its Fourier transform are replaced with discretized
    counterparts, it is called the Discrete Fourier transform (DFT). An implementation
    algorithm for the DFT is known as the FFT, which is used here. From the FFT transformations on the
-   encoded protein sequences (encoded via amino acid property values of the AAI), various
-   informational protein spectra can be generated, including the power, real, imaginary and
+   encoded protein sequences (encoded via amino acid property values from records in the AAI),
+   various informational protein spectra can be generated, including the power, real, imaginary and
    absolute spectra. Prior to the FFT, a window function can be applied to the sequences
    which is a mathematical function that applies a weighting to each discrete time series sample
-   in a finite set. By default, the hamming window function is applied; although the function
+   in a finite set. By default, no window function is applied; although the function
    can also accept the blackman, blackmanharris, bartlett, gaussian, barthann,
    bohman, chebwin, cosine, exponential, flattop, hann, boxcar, nuttall, parzen, triang and
    tukey windows.
    A filter function can also be applied, the class accepts the savgol, medfilt, lfilter and
-   hilbert filters.
+   hilbert filters, by default no filter function is applied.

-   In the pipeline of pySAR this class and its functions are onyl used when the 'use_dsp'
-   parameter is set to true in the config files, meaning that the encoded protein sequences
-   are passed through a Digital Signal Processing (DSP) pipeline before being used as
-   training data for the regression models. The protein sequences being numerically encoded
-   is a pre-reqisite to use the functions in this class, meaning sequences cannot be directly
-   input.
+   In the pipeline of pySAR this class and its functions are only used when the 'use_dsp'
+   parameter is set to true in the config files or in the class input parameters, meaning that
+   the encoded protein sequences are passed through a Digital Signal Processing (DSP) pipeline
+   before being used as training data for the regression models. The protein sequences being
+   numerically encoded is a pre-requisite to use the functions in this class, meaning sequences
+   cannot be directly input.

-   The class accepts two main input parameters. The protein_seqs input param if an numpy array
+   The class accepts two main input parameters. The protein_seqs input param is a numpy array
    of numerically pre-encoded protein sequences. The config_file parameter is the filename or
    filepath to the configuration file that contains all the required parameters for the
    DSP encoding strategy/process. The class also accepts a variable number of keyword
    arguments (**kwargs) that will override the config file parameter values of the same name if
-   they are passed in.
+   they are passed in. The only DSP parameter required from the config file for the class's
+   functionality is the spectrum, so if the config_file parameter is not specified but the
+   spectrum is passed in then an error will not be raised.

    Parameters
    ==========
@@ -67,8 +69,8 @@ class PyDSP():
    Methods
    =======
    pre_processing():
-       complete pre-processing steps before completeing DSP functionality.
-   encode_seqs():
+       complete required pre-processing steps before DSP functionality/pipeline.
+   encode_sequences():
        calculate FFT and various informational spectra of protein sequences.
    inverse_fft():
        calculate inverse FFT of protein sequences.
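For illustration, a minimal sketch of how the class is typically driven; the config filename and AAI accession number below are placeholder values, and the sequences must be numerically pre-encoded first:

```python
from pySAR.pySAR import PySAR
from pySAR.pyDSP import PyDSP

#numerically encode the protein sequences first - a pre-requisite for PyDSP
pysar = PySAR(config_file="config/thermostability.json")
encoded_seqs = pysar.get_aai_encoding(aai_indices="CIDH920105")

#the DSP pipeline runs on instantiation, with the chosen informational
#spectrum stored in the spectrum_encoding attribute
pydsp = PyDSP(config_file="config/thermostability.json", protein_seqs=encoded_seqs)
spectral_encoding = pydsp.spectrum_encoding
```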
@@ -116,15 +118,13 @@ def __init__(self, config_file="", protein_seqs=None, **kwargs):
        for seq in protein_seqs:
            if (isinstance(seq, str)):
                raise ValueError("Protein sequences cannot be directly passed into the pyDSP class, you "
-                   "must first encode the protein sequences using a specific aaindex code, "
+                   "must first encode the protein sequences using a specific aaindex record, "
                    "and then pass the resultant encoded sequence to the protein_seqs parameter.")

        #reshape protein sequences to 2 dimensions
        # if (self.protein_seqs.ndim != 2):
-       #     try:
-       #         self.protein_seqs = self.protein_seqs.reshape((-1, 1))
-       #     except:
-       #         raise ValueError('Error reshaping input sequences: {}'.format(protein_seqs))
+       #     self.protein_seqs = self.protein_seqs.reshape((-1, 1))
+
        #set pyDSP parameters from kwargs or json config - use_dsp, spectrum, window function, window filter
        self.dsp_parameters = kwargs.get('dsp_parameters') if 'dsp_parameters' in kwargs else self.config_parameters.pyDSP
@@ -140,7 +140,7 @@ def __init__(self, config_file="", protein_seqs=None, **kwargs):
        self.pre_processing()

        #transform sequences into the various informational protein spectra
-       self.encode_seqs()
+       self.encode_sequences()

    def pre_processing(self):
        """
@@ -180,29 +180,29 @@ def pre_processing(self):

        #list of accepted spectra, window functions and filters
        all_spectra = ['power', 'absolute', 'real', 'imaginary']
+       all_filters = ['savgol', 'medfilt', 'lfilter', 'hilbert']
        all_windows = ['hamming', 'blackman', 'blackmanharris', 'gaussian', 'bartlett',
            'kaiser', 'barthann', 'bohman', 'chebwin', 'cosine', 'exponential',
            'flattop', 'hann', 'boxcar', 'nuttall', 'parzen', 'triang', 'tukey']
-       all_filters = ['savgol', 'medfilt', 'lfilter', 'hilbert']

-       #set required input parameters, raise error if spectrum is none
+       #get approximate spectrum type from input, raise error if spectrum None or invalid
        if (self.spectrum == None):
-           raise ValueError('Invalid input Spectrum type ({}) not available: {}.'.
-               format(self.spectrum, all_spectra))
+           raise ValueError('Spectrum parameter cannot be empty or None.')
        else:
            #get closest correct spectra from user input, if no close match then raise error
            spectra_matches = (get_close_matches(self.spectrum, all_spectra, cutoff=0.4))
            if (spectra_matches == []):
-               raise ValueError('Invalid input Spectrum type ({}) not available: {}.'.
-                   format(self.spectrum, all_spectra))
+               raise ValueError('Invalid input spectrum type {}, not available in list of available spectra:\n{}.'
+                   .format(self.spectrum, all_spectra))
            else:
                self.spectrum = spectra_matches[0] #closest match in array

+       #get approximate window type from input, if None or invalid set window to 1 (no window)
        if (self.window_type == None):
            self.window = 1 #window = 1 is the same as applying no window
        else:
            #get closest correct window function from user input
-           window_matches = (get_close_matches(self.window_type, all_windows, cutoff=0.4))
+           window_matches = (get_close_matches(self.window_type, all_windows, cutoff=0.6))

            #remove any null or None values from window parameters in config
            self.window_parameters = {k: v for k, v in self.window_parameters.items() if v}
@@ -305,13 +305,13 @@ def pre_processing(self):
            else:
                self.window = 1 #window = 1 is the same as applying no window

+       #get approximate filter type from input
        if ((self.filter_type != None) and (self.filter_type != "")):
-           #get closest correct filter from user input
            filter_matches = get_close_matches(self.filter_type, all_filters, cutoff=0.4)

            #set filter attribute according to approximate user input
            if (filter_matches != []):
-               if (filter_matches[0] == 'savgol'): #***
+               if (filter_matches[0] == 'savgol'):
                    self.filter_type = "savgol"
                elif (filter_matches[0] == 'medfilt'):
                    self.filter_type = "medfilt"
@@ -320,14 +320,16 @@
                elif (filter_matches[0] == 'hilbert'):
                    self.filter_type = "hilbert"

-   def encode_seqs(self):
+   def encode_sequences(self):
        """
-       Calculate the FFT of the protein sequences already encoded using
-       the AAI indices, then use the output of the FFT to calculate the various
-       informational protein spectra including the power, absolute, real and
-       imaginary. The spectrum_encoding attribute will be set to the spectrum
-       inputted by user from the 'spectrum' config parameter, if no valid
-       spectrum input as parameter then value error raised.
+       Calculate the DFT of the protein sequences already encoded using
+       the AAI indices, using the FFT algorithm, then use the output of the
+       FFT to calculate the various informational protein spectra including
+       the power, absolute, real and imaginary. The spectrum_encoding
+       attribute will be set to the spectrum inputted by user from the
+       'spectrum' config parameter, if no valid spectrum input as parameter
+       then value error raised. After spectrum calculated, apply any
+       window or filter function, if applicable.

        Parameters
        ==========
@@ -341,7 +343,7 @@
        encoded_seq_copy = np.copy(self.protein_seqs)

        #initialise zero arrays used to store output of both fft, set
-       #datatype to complex number as that is the output type of the transformation
+       #datatype to complex number as that is the output type of the FFT transformation
        encoded_dataset_fft = np.zeros((self.protein_seqs.shape), dtype=complex)

        #initialise zero arrays used to store output frequencies from fft transformations
@@ -353,7 +355,7 @@
            #create temp zeros arrays to store current sequence's fft
            encoded_fft = np.zeros((self.signal_len), dtype=complex)

-           #apply window function to Fourier array, multiplying by 1 if using no window function
+           #apply window function to Fourier array, multiply by 1 if using no window function
            encoded_fft = fft(encoded_seq_copy[seq] * self.window)

            #apply filter to encoded sequences if filter_type not empty in config
@@ -418,9 +420,9 @@ def inverse_fft(self, a, n):

        Parameters
        ==========
-       :a : np.ndarray
+       :a: np.ndarray
            input array of 1D Fourier Transform.
-       :n : int
+       :n: int
            length of the output.
        Returns
@@ -437,24 +439,24 @@ def consensus_freq(self, freqs):

        Parameters
        ==========
-       :freqs : np.ndarray
+       :freqs: np.ndarray
            frequencies of Fourier Transform.

        Returns
        =======
-       :CF : float
+       :CF: float
            consensus frequency found in array of frequencies.
-       :CFi : int
-           index of consensus frequency.
        """
        #raise error if more than one sequence passed into function
        if (freqs.ndim == 2 and freqs.shape[1] != 2):
-           raise ValueError("Only one protein sequence should be passed into the function:"
-               " {}.".format(freqs))
+           raise ValueError("Only one protein sequence should be passed into the function: {}.".format(freqs))

        # CF = PP/N ( peak position/length of largest protein in dataset)
-       CF, CFi = (self.max_freq(freqs))/self.num_seqs
-       return CF, CFi
+       CF = (self.max_freq(freqs)[0])/self.num_seqs
+       return CF

    def max_freq(self, freqs):
        """
@@ -462,20 +464,20 @@ def max_freq(self, freqs):

        Parameters
        ==========
-       :freqs : np.ndarray
+       :freqs: np.ndarray
            frequencies from Fourier Transform.

        Returns
        =======
-       :max_F : float
+       :max_F: float
            maximum frequency found in array of frequencies.
-       :max_FI : int
+       :max_FI: int
            index of maximum frequency.
        """
        #raise error if more than one sequence passed into function
        if (freqs.ndim == 2 and freqs.shape[1] != 2):
-           raise ValueError("Only one protein sequence should be passed into the function:"
-               "{}.".format(freqs))
+           raise ValueError("Only one protein sequence should be passed into the function: {}.".format(freqs))
+
        max_F = max(freqs)
        max_FI = np.argmax(freqs)
        return max_F, max_FI
@@ -538,6 +540,14 @@ def window_type(self):
    def window_type(self, val):
        self._window_type = val

+   @property
+   def filter(self):
+       return self._filter
+
+   @filter.setter
+   def filter(self, val):
+       self._filter = val
+
    @property
    def filter_type(self):
        return self._filter_type
@@ -550,4 +560,4 @@ def __str__(self):
        return "Instance of PyDSP class, using parameters: {}.".format(self.__dict__.keys())

    def __repr__(self):
-       return (''.format(self))
\ No newline at end of file
+       return ('{}'.format(self))
\ No newline at end of file
diff --git a/pySAR/pySAR.py b/pySAR/pySAR.py
index af3ef0f..11dd5d2 100644
--- a/pySAR/pySAR.py
+++ b/pySAR/pySAR.py
@@ -10,7 +10,6 @@
from json import JSONDecodeError
import textwrap

-from .globals_ import DATA_DIR
from aaindex import aaindex1
from .model import Model
from .pyDSP import PyDSP
@@ -24,10 +23,12 @@ class PySAR():
    The PySAR class is the main class for the pySAR software. The class allows for
    the encoding of protein sequences via a plethora of techniques, mainly via AAI Indices
    and or structural, biochemical and physiochemical protein descriptors that are
-   then used as features in the building of predictive regression models created to map the
+   then used as features in the building of predictive regression ML models created to map the
    protein sequences to a sought-after activity/fitness value (activity attribute), this is
    known as a Sequence Activity Relationship (SAR) or Sequence Function Relationship (SFR).
-
+   Creating this mapping from sequence to activity/fitness then allows for the future prediction
+   of the sought activity/fitness value for unseen protein sequences.
+
    Three main encoding strategies are possible in the class and in the software, namely
    using AAI Indices or protein descriptors as well as AAI Indices + Descriptors.
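    A minimal sketch of the three strategies; the config filename, AAI accession
    number and descriptor name below are illustrative placeholder values:
    >>> pysar = PySAR(config_file="config/thermostability.json")
    >>> aai_df = pysar.encode_aai(aai_indices="CIDH920105")
    >>> desc_df = pysar.encode_descriptor(descriptors="amino_acid_composition")
    >>> aai_desc_df = pysar.encode_aai_descriptor(aai_indices="CIDH920105", descriptors="amino_acid_composition")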
Additionally, the protein sequences can be encoded using Digital Signal Processing (DSP) @@ -35,14 +36,14 @@ class PySAR(): via the pyDSP class in the software. This class accepts strings or lists of AAI Indices or descriptors and then passes these through a pipeline to get the required numerical encoding of the respective sequences. The calculated encodings of the sequences are - used as features in the building of the predictive models that will then predict the + used as features in the building of the predictive ML models that will then predict the acitivty values for new unseen protein sequences. After the encoding process, various metrics will be captured and stored in a local output folder according to the OUTPUT_FOLDER global var as well as a regression plot showing how well the model, and the selected protein feature attributes, fit to the test data of unseen protein sequences. - The class has one main input parameter (config_file) that is the filename or filepath + The class has one main input parameter (config_file), that is the filename or filepath to the configuration file that contains all the required parameters for the encoding strategy/process. The class also accepts a variable number of keyword arguments (**kwargs) that will override the config file parameter values of the same name if @@ -65,16 +66,23 @@ class PySAR(): pre-process / clean protein sequence dataset. get_aai_encoding(indices): get AAI encoding for user inputted index/indices. - encode_aai(indices=None, show_plot=False, print_results=True, output_folder=""): - get encoded protein sequences according to user inputted index/indices, applying - DSP if applicable. - get_descriptor_encoding(descriptors): - calculate user inputted descriptor/descriptors according to user input. - encode_descriptor(descriptor=None, show_plot=False, print_results=True, output_folder=""): - get encoded protein sequences according to user inputted descriptor/descriptors. + encode_aai(aai_indices=None, show_plot=False, print_results=True, output_folder=""): + full pipeline for encoding protein sequences according to user specified + index/indices from the respective records in the AAI database using the + get_aai_encoding() function, and outputting the results with all the predictability + metrics. Also applying a DSP pipeline if applicable. + get_descriptor_encoding(descriptors=None): + calculate user inputted descriptor/descriptors using the input protein sequences + and protpy package. + encode_descriptor(descriptors=None, show_plot=False, print_results=True, output_folder=""): + full pipeline for encoding protein sequences according to user inputted descriptor/descriptors, + calculated using the get_descriptor_encoding() function and the protpy package and outputting + the results with all the predictability metrics. encode_aai_descriptor(aai_indices=None, descriptors=None, show_plot=False, print_results=True, output_folder=""): - get encoded protein sequences according to user inputted AA index/indices and - descriptor/descriptors. + full pipeline for encoding protein sequences according to user specified index/indices + in concatenation with descriptor/descriptors using the get_aai_encoding() and + get_descriptor_encoding() functions. Output the results with all the predictability + metrics. output_results(results): print out the predictive model parameters/attributes and its results. 
""" @@ -91,7 +99,7 @@ def __init__(self, config_file="", **kwargs): #open json config file and read in parameters if not (isinstance(config_file, str) or config_file is None): - raise TypeError('JSON config file must be a filepath of type string, not of type {}.'.format(type(config_file))) + raise TypeError('JSON config file must be a filepath of type string, got type {}.'.format(type(config_file))) if (os.path.isfile(self.config_file)): config_filepath = self.config_file elif (os.path.isfile(os.path.join('config', self.config_file))): @@ -106,8 +114,9 @@ def __init__(self, config_file="", **kwargs): #create instance of Map class so parameters can be accessed via dot notation self.config_parameters = Map(self.config_parameters) - self.dataset = kwargs.get('dataset') if 'dataset' in kwargs else self.config_parameters.dataset["dataset"] + #dataset parameters + self.dataset = kwargs.get('dataset') if 'dataset' in kwargs else self.config_parameters.dataset["dataset"] self.sequence_col = kwargs.get('sequence_col') if 'sequence_col' in kwargs else self.config_parameters.dataset["sequence_col"] self.activity_col = kwargs.get('activity_col') if 'activity_col' in kwargs else self.config_parameters.dataset["activity"] @@ -122,7 +131,7 @@ def __init__(self, config_file="", **kwargs): #descriptors parameters self.descriptors = None - #set pyDSP parameters from kwargs or json config - use_dsp, spectrum, window function, window filter + #pyDSP parameters - use_dsp, spectrum, window function, window filter self.use_dsp = kwargs.get('use_dsp') if 'use_dsp' in kwargs else self.config_parameters.pyDSP["use_dsp"] self.dsp_parameters = kwargs.get('dsp_parameters') if 'dsp_parameters' in kwargs else self.config_parameters.pyDSP self.filter_parameters = kwargs.get('filter_parameters') if 'filter_parameters' in kwargs else self.dsp_parameters["filter"] @@ -158,8 +167,7 @@ def __init__(self, config_file="", **kwargs): def read_data(self): """ - Read in dataset according to file name from 'dataset' attribute. By default - the dataset should be stored in DATA_DIR. + Read in dataset according to file name from 'dataset' attribute. Parameters ========== @@ -167,25 +175,19 @@ def read_data(self): Returns ======= - :data (pd.DataFrame): + :data: pd.DataFrame dataframe of imported dataset. """ - filepath = "" - #read in dataset csv if found in path, if not raise error - if (os.path.isfile(os.path.join(DATA_DIR, self.dataset))): - filepath = os.path.join(DATA_DIR, self.dataset) - elif (os.path.isfile(self.dataset)): - filepath = self.dataset - else: - raise OSError('Dataset filepath is not correct: {}.'.format(filepath)) + if not (os.path.isfile(self.dataset)): + raise OSError('Dataset filepath is not correct: {}.'.format(self.dataset)) #read in dataset csv try: - data = pd.read_csv(filepath, sep=",", header=0) + data = pd.read_csv(self.dataset, sep=",", header=0) return data except: - raise IOError('Error opening dataset file: {}.'.format(filepath)) + raise IOError('Error opening dataset file: {}.'.format(self.dataset)) def preprocessing(self): """ @@ -208,7 +210,7 @@ def preprocessing(self): if (sequence_col_matches != []): self.sequence_col = sequence_col_matches[0] else: - raise ValueError('Sequence Column ({}) not present in dataset columns:\n{}'. + raise ValueError('Sequence column ({}) not present in dataset columns:\n{}.'. 
                format(self.sequence_col, self.data.columns))

        #remove any gaps found in sequences in dataset
@@ -217,7 +219,7 @@ def preprocessing(self):
        #verify no invalid amino acids found in sequences, if so then raise error
        invalid_seqs = valid_sequence(self.sequences)
        if (invalid_seqs != None):
-           raise ValueError('Invalid Amino Acids found in protein sequence dataset: {}.'.format(invalid_seqs))
+           raise ValueError('Invalid amino acids found in protein sequence dataset: {}.'.format(invalid_seqs))

        #get closest match for activity column name in dataset
        activity_matches = get_close_matches(self.activity_col, self.data.columns, cutoff=0.6)
@@ -226,56 +228,56 @@ def preprocessing(self):
        if (activity_matches != []):
            self.activity_col = activity_matches[0]
        else:
-           raise ValueError('Activity Column ({}) not present in dataset columns:\n{}'.
+           raise ValueError('Activity column ({}) not present in dataset columns:\n{}.'.
                format(self.activity_col,list(self.data.columns)))

        #remove any +/- infinity values or any Null/NAN's from activity values
        self.data[self.activity_col].replace([np.inf, -np.inf], np.nan, inplace=True)
        self.data[self.activity_col].fillna(0, inplace=True)

-   def get_aai_encoding(self, indices=None):
+   def get_aai_encoding(self, aai_indices=None):
        """
-       Get AAI index encoding values for index specified by indices for each amino
-       acid in each of the protein sequences in dataset. The index/indices should be
-       in the form of the properties accession number which is the 10 length
-       alphanumeric code that represents each property within the AAI database. If
-       multiple indices/accession numbers input then encode protein sequences with
-       each index and concatenate.
+       Get AAI index encoding values for input index/indices and their respective
+       record values from the AAI database. Encode each amino acid in the protein
+       sequences in the dataset to the respective values specified in the AAI record.
+       The index/indices should be in the form of the property's accession number
+       which is the 10-character alphanumeric code that represents each property within
+       the AAI database. If multiple indices/accession numbers input then encode
+       protein sequences with each index and concatenate.

        Parameters
        ==========
-       :indices : str/list (default=None):
+       :aai_indices: str/list (default=None)
            string or list of AAI indices/accession numbers.

        Returns
        =======
-       :encoded_seqs : np.ndarray:
+       :encoded_seqs: np.ndarray
            array of the encoded protein sequences in dataset via user input index/indices.
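
        Usage
        =====
        A minimal sketch, assuming an instantiated PySAR object named pysar; the
        accession numbers are illustrative, with 2 indices input each sequence is
        encoded with both and the two encodings concatenated:
        >>> encoded_seqs = pysar.get_aai_encoding(aai_indices="CIDH920105, BHAR880101")
        >>> encoded_seqs.shape   #(num_seqs, sequence_length * 2)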
""" #validate AAI indices are present in the input parameter, if not raise error - if (indices == None or indices == ""): - raise ValueError('AAI indices input parameter cannot be None or empty: {}.'.format(indices)) + if (aai_indices == None or aai_indices == ""): + raise ValueError('AAI indices input parameter cannot be None or empty: {}.'.format(aai_indices)) #check input indices is of correct type (str/list), if not raise type error - if (not isinstance(indices, str) and (not isinstance(indices, list))): - raise TypeError("Input indices parameter must be a string or list, got {}.".format(type(indices))) + if (not isinstance(aai_indices, str) and (not isinstance(aai_indices, list))): + raise TypeError("Input indices parameter must be a string or list, got {}.".format(type(aai_indices))) #cast index string to list, split multiple indices using comma - if (isinstance(indices, str)): - if (',' in indices): - indices = indices.split(',') #split on ',' just in case multiple indices passed in as str + if (isinstance(aai_indices, str)): + if (',' in aai_indices): + aai_indices = aai_indices.split(',') #split on ',' just in case multiple indices passed in as str else: - indices = [indices] + aai_indices = [aai_indices] #create zeros numpy array to store encoded sequence output - encoded_aai_ = np.zeros((self.num_seqs, self.sequence_length*len(indices))) + encoded_aai_ = np.zeros((self.num_seqs, self.sequence_length*len(aai_indices))) - #if multiple indices used then calculate AAI index encoding for each and - #then concatenate after each calculation - for index in range(0, len(indices)): + #if multiple indices used then calculate AAI index encoding for each and concatenate after each calculation + for index in range(0, len(aai_indices)): - #get values from aaindex record using its accession number - encoded_aai = aaindex1[indices[index]].values + #get values from aaindex record using its accession number and the aaindex package + encoded_aai = aaindex1[aai_indices[index]].values #initialise temp arrays to store encoded sequences temp_seq_vals = [] @@ -296,7 +298,7 @@ def get_aai_encoding(self, indices=None): #convert list of lists into array temp_all_seqs = np.array(temp_all_seqs, dtype="float32") - #in first iteration through indices (index=0) set encoded_aai_ to zero-initialised + #in first iteration through aai_indices (index=0) set encoded_aai_ to zero-initialised #numpy array, else concatenate to the array in previous iteration if (index == 0): encoded_aai_ = temp_all_seqs @@ -307,30 +309,32 @@ def get_aai_encoding(self, indices=None): def encode_aai(self, aai_indices=None, show_plot=False, print_results=True, output_folder=""): """ - Encode using AAI indices from the AAI database. If multiple - indices/accession numbers input then calculate each and concatenate them. - Build predictive model from AAI feature data. The resulting model assets - and its results will be exported to the directory pointed to by the global - variable OUTPUT_DIR. If use_dsp config parameter is true then pass AAI - Indices through a DSP transformation specified by the config's DSP parameters + Full pipeline for encoding proteins sequences in dataset using the input AAI indices + from the AAI database. If multiple indices/accession numbers input then calculate each + and concatenate them. Build predictive regression ML model from encoded AAI feature data + for predicting the activity/fitness values of unseen sequences. 
+
+       The resulting model assets and its results will be exported to the directory pointed to
+       by the global var OUTPUT_DIR. If use_dsp config parameter is true then pass AAI
+       Indices through a DSP transformation pipeline specified by the config's DSP parameters
        (spectrum, window & filter) via the PyDSP module and class.

        Parameters
        ==========
-       :aai_indices : str/list (default=None)
+       :aai_indices: str/list (default=None)
            string or list of indices/accession numbers from the AAI.
-       :show_plot : bool (default=False)
+       :show_plot: bool (default=False)
            display regression plot of best predictive model. If False then the plot
            will just be saved to the output folder, else it'll be displayed & also saved.
-       :print_results : bool (default=True)
+       :print_results: bool (default=True)
            if true, output verbose output of results and parameters from encoding process.
-       :output_folder : str (default="")
+       :output_folder: str (default="")
            output folder to store results csv to, if empty input it will be stored in
            the OUTPUT_FOLDER global var.

        Returns
        =======
-       :aai_df : pd.Dataframe
+       :aai_df: pd.DataFrame
            pandas dataframe storing metrics and results of encoding.
        """
        #validate AAI indices are present in the input parameter
@@ -343,11 +347,9 @@ def encode_aai(self, aai_indices=None, show_plot=False, print_results=True, outp

        self.aai_indices = aai_indices

-       print("self.aai_indices1", self.aai_indices)
        #if list of one element with multiple indices, split them into list of individual elements
        if isinstance(self.aai_indices, list) and len(self.aai_indices) == 1:
            self.aai_indices = self.aai_indices[0].replace(' ', '').split(',')
-       print("self.aai_indices2", self.aai_indices)

        #convert string indices into comma separated list, remove whitespace
        if isinstance(self.aai_indices, str):
@@ -372,7 +374,7 @@ def encode_aai(self, aai_indices=None, show_plot=False, print_results=True, outp
            if (self.spectrum == None or self.spectrum == ""):
                raise ValueError('Spectrum cannot be None or empty: {}.'.format(self.spectrum))
            pyDSP = PyDSP(self.config_file, protein_seqs=encoded_seqs)
-           pyDSP.encode_seqs()
+           pyDSP.encode_sequences()
            X = pd.DataFrame(pyDSP.spectrum_encoding) #set training data to FFT spectrum encoding
        else:
            X = pd.DataFrame(encoded_seqs) #no DSP applied to encoded sequences
@@ -416,8 +418,6 @@ def encode_aai(self, aai_indices=None, show_plot=False, print_results=True, outp
        #create comma separated list of categories
        index_cat = ', '.join(index_cat)

-       print("self.aai_indices3", self.aai_indices)
-
        #create output dataframe, set first row to attribute/metric values
        aai_df = pd.DataFrame(columns=['Index', 'Category', 'R2', 'RMSE', 'MSE', 'MAE', 'RPD', 'Explained Variance'])
        aai_df.loc[0] = [', '.join(self.aai_indices), str(index_cat).strip(), eval.r2, eval.rmse, eval.mse, eval.mae, eval.rpd, eval.explained_var]
@@ -443,22 +443,21 @@ def encode_aai(self, aai_indices=None, show_plot=False, print_results=True, outp

    def get_descriptor_encoding(self, descriptors=None):
        """
-       Calculate inputted descriptor(s), using the Descriptors class and
-       custom-built protpy package, requried for the encoding process.
-       Get closest match to user inputted string or list of descriptors
-       using difflib library. If a single descriptor is input then
-       calculate it and return, if list of descriptors input then
-       calculate each descriptor's value and concatenate.
+       Calculate inputted descriptor(s), using the Descriptors class and custom-built
+       protpy package, required for the encoding process.
Get closest match to user
+       inputted string or list of descriptors using difflib library. If a single
+       descriptor is input then calculate it and return, if list of descriptors input
+       then calculate each descriptor's value and concatenate.

        Parameters
        ==========
-       :descriptors : str/list (default=None)
+       :descriptors: str/list (default=None)
            string or list of protein descriptor names.

        Returns
        =======
-       :encoded_desc : pd.DataFrame
-           pandas dataFrame of calculated descriptor values according to user
+       :encoded_desc: pd.DataFrame
+           pandas dataframe of calculated descriptor values according to user
            inputted descriptor(s).
        """
        #raise error if no descriptors specified in input
@@ -490,7 +489,7 @@
                descr.valid_descriptors, cutoff=0.6)
-           descriptors[de] = desc_matches[0]
-           if (descriptors[de] == []):
-               raise ValueError('No approximate descriptor found from one entered: {}.'.format(de))
+           if (desc_matches == []):
+               raise ValueError('No approximate descriptor found from one input: {}.'.format(de))
+           descriptors[de] = desc_matches[0]
            temp_descriptors.append(desc_matches[0])

        #initialise temp lists and DF to store encoded descriptor values
@@ -501,7 +500,7 @@
        #iterate and get each descriptors' values using Descriptor class and protpy package
        for d in range(0, len(descriptors)):
            encoded_desc_temp = descr.get_descriptor_encoding(descriptors[d])
-           #raise value error if descriptor is empty
+           #raise value error if descriptor is empty/None
            if (encoded_desc_temp.empty):
                raise ValueError('Descriptor cannot be empty or None: {}.'.format(descriptors[d]))
            encoded_desc_vals.append(encoded_desc_temp) #append to array of all descriptor values
@@ -514,31 +513,31 @@

    def encode_descriptor(self, descriptors=None, show_plot=False, print_results=True, output_folder=""):
        """
-       Encode protein sequences using protein physiochemical, biochemical and or
-       structural descriptors, using the Descriptors class and custom-built protpy
-       package, and build predictive model from the descriptor feature/training data.
-
-       If multiple descriptors input then calculate each and concatenate them.
-       The resulting model assets and its results will be exported to the directory
-       pointed to by the global variable OUTPUT_DIR.
+
+       Full pipeline for encoding the protein sequences in the dataset using protein
+       physiochemical, biochemical and or structural descriptors, using the Descriptors
+       class and custom-built protpy package, and build predictive ML regression model
+       from the descriptor feature/training data. This model is then used to predict
+       the activity/fitness value of unseen test sequences. If multiple descriptors input
+       then calculate each and concatenate them. The resulting model assets and its
+       metrics/results will be exported to the directory pointed to by the global variable OUTPUT_DIR.

        Parameters
        ==========
-       :descriptors : str/list (default=None)
+       :descriptors: str/list (default=None)
            string or list of protein descriptor names.
-       :show_plot : bool (default=False)
+       :show_plot: bool (default=False)
            display regression plot of best predictive model. If False then the plot
            will just be saved to the output folder, else it'll be displayed & also saved.
-       :print_results : bool (default=True)
-           if true, output verbose output of results and parameters from encoding
-           process.
+       :print_results: bool (default=True)
+           if true, output verbose output of results and parameters from encoding process.
+       :output_folder: str (default="")
            output folder to store results csv to, if empty input it will be stored in
            the OUTPUT_FOLDER global var.

        Returns
        =======
-       :desc_df : pd.DataFrame
+       :desc_df: pd.DataFrame
            pandas dataframe storing metrics and results of encoding.
        """
        #raise error if no descriptor specified in input
@@ -580,20 +579,17 @@ def encode_descriptor(self, descriptors=None, show_plot=False, print_results=Tru
            if (desc_matches != []):
                self.descriptors[desc] = desc_matches[0]
            else:
-               raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n {}.'.
+               raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n{}.'.
                    format(self.descriptors[desc], descr.valid_descriptors))

            #concatenate encoding of current descriptor to main encoding object
            descriptor_encoding_df = pd.concat([descriptor_encoding_df, self.get_descriptor_encoding(descriptors=self.descriptors[desc])], axis=1)

-       #set training data (X) to descriptor-encoded protein sequences
-       X = descriptor_encoding_df
-
        #set class variable to the training data feature space
-       self.feature_space = X.shape
+       self.feature_space = descriptor_encoding_df.shape

        #create instance of model class of type specified by algorithm parameter using X and Y data
-       self.model = Model(X, self.activity, self.algorithm, parameters=self.model_parameters)
+       self.model = Model(descriptor_encoding_df, self.activity, self.algorithm, parameters=self.model_parameters)

        #updating algorithm attribute
        self.algorithm = repr(self.model)
@@ -647,27 +643,28 @@ def encode_descriptor(self, descriptors=None, show_plot=False, print_results=Tru

    def encode_aai_descriptor(self, aai_indices=None, descriptors=None, show_plot=False, print_results=True, output_folder=""):
        """
-       Encode using both AAI indices and the physiochemical/structural descriptors.
-       The two outputs from the individual encoding strategies, previously described
-       above, will be concatenated together and used in the building of a
-       predictive regression model. The resulting model assets and its results will
-       be exported to the directory pointed to by the global variable OUTPUT_DIR.
-       If the config parameter use_dsp is true then pass AAI Indices through a DSP
-       transformation specified by the DSP parameters (spectrum, window & filter) via
-       the PyDSP class/module.
+       Encode using both AAI indices and the physiochemical/structural descriptors from
+       the get_aai_encoding() and get_descriptor_encoding() functions. The two outputs
+       from the individual encoding strategies, previously described above, will be
+       concatenated together and used in the building of a predictive regression ML
+       model. The resulting model assets and its results will be exported to the
+       directory pointed to by the global variable OUTPUT_DIR. If the config parameter
+       use_dsp is true then pass AAI Indices through a DSP transformation pipeline
+       specified by the DSP parameters (spectrum, window & filter) via the PyDSP
+       class/module.

        Parameters
        ==========
-       :aai_indices : str/list (default=None)
+       :aai_indices: str/list (default=None)
            string or list of indices/accession numbers from the AAI database.
-       :descriptors : str/list (default=None)
+       :descriptors: str/list (default=None)
            string or list of protein descriptor names.
-       :show_plot : bool (default=False)
-           display regression plot of best predictive model. If False then the plot
+       :show_plot: bool (default=False)
+           display regression plot of best predictive model.
If false then the plot
            will just be saved to the output folder, else it'll be displayed & also saved.
-       :print_results : bool (default=True)
+       :print_results: bool (default=True)
            if true, output verbose output of results and parameters from encoding process.
-       :output_folder : str (default="")
+       :output_folder: str (default="")
            output folder to store results csv to, if empty input it will be stored in
            the OUTPUT_FOLDER global var.

@@ -697,8 +694,6 @@ def encode_aai_descriptor(self, aai_indices=None, descriptors=None, show_plot=Fa
        if isinstance(self.descriptors, str):
            self.descriptors = self.descriptors.replace(' ', '').split(',')

-       print("self.aai_indices - 1 ", self.aai_indices)
-
        #if list of one element with multiple indices, split them into list of individual elements
        if isinstance(self.aai_indices, list) and len(self.aai_indices) == 1:
            self.aai_indices = self.aai_indices[0].replace(' ', '').split(',')
@@ -707,7 +702,6 @@
        if isinstance(self.aai_indices, str):
            self.aai_indices = self.aai_indices.replace(' ', '').split(',')

-       print("self.aai_indices - 2 ", self.aai_indices)
        #sort list of indices into alphabetical order
        self.aai_indices.sort()
@@ -747,7 +741,7 @@
            if (desc_matches != []):
                self.descriptors[desc] = desc_matches[0]
            else:
-               raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n {}.'.
+               raise ValueError('Could not find a match for the input descriptor ({}) in list of valid descriptors:\n{}.'.
                    format(self.descriptors[desc], descr.valid_descriptors))

            #concatenate encoding of current descriptor to main encoding object
@@ -801,8 +795,6 @@
        else:
            desc_group = descr.descriptor_groups[self.descriptors]

-       print("self.aai_indices - 3 ", [','.join(self.aai_indices)])
-
        #set output dataframe columns
        aai_desc_df.loc[0] = [', '.join(self.aai_indices), str(index_cat).strip(), ', '.join(self.descriptors), str(desc_group),
            eval.r2, eval.rmse, eval.mse, eval.mae, eval.rpd, eval.explained_var]
@@ -832,7 +824,7 @@ def output_results(self, results):

        Parameters
        ==========
-       :results : dict/pd.Series
+       :results: dict/pd.Series
            dictionary or Series of metrics and their associated values.

        Returns
diff --git a/pySAR/utils.py b/pySAR/utils.py
index 667ce46..008d38d 100644
--- a/pySAR/utils.py
+++ b/pySAR/utils.py
@@ -18,7 +18,7 @@ class Map(dict):

    Parameters
    ==========
-   :dict : dict
+   :dict: dict
        input dictionary to be mapped into dot notation.

    Usage
@@ -83,13 +83,13 @@ def valid_sequence(sequences):

    Parameters
    ==========
-   :sequences : list/np.ndarray
+   :sequences: list/np.ndarray
        list or array of protein sequences.

    Returns
    =======
-   :None or invalid_indices : None/list
-       if no invalid values found in the protein sequences, None returned. if
+   :None or invalid_indices: None/list
+       if no invalid values found in the protein sequences, None returned. If
        invalid values found, list of dicts returned in the form {sequence index:
        invalid value in sequence index}.

@@ -129,16 +129,16 @@ def remove_gaps(sequences):
    The descriptors cannot be calculated if a '-' value is passed into their
    respective functions so gaps need to be removed. Removing the gaps has the
    same effect as setting the value at the index of the sequence to 0 and has no effect
-   on the descriptors calculation. Input can be a string or list/array of sequences.
+   on the descriptor calculation. Input can be a string or list/array of sequences.

    Parameters
    ==========
-   :sequences : str/list/np.ndarray
+   :sequences: str/list/np.ndarray
        string of 1 protein sequence or array/list of protein sequences.

    Returns
    =======
-   :protein_seqs : np.ndarray
+   :protein_seqs: np.ndarray
        returns the same inputted protein sequence(s) but with any gaps ('-') removed.
    """
    #bool needed to ensure correct output format if input is str
@@ -176,12 +176,12 @@ def flatten(array):

    Parameters
    ==========
-   :array : np.ndarray / list
+   :array: np.ndarray/list
        array of arrays or list of lists to be flattened.

    Returns
    =======
-   :flatten(array/list) : np.ndarray/list
+   :flatten(array/list): np.ndarray/list
        flattened 1-dimensional list or array.
    """
    #if input is a string then return input as cannot be flattened
@@ -204,19 +204,19 @@ def flatten(array):
    else:
        return flattened_array

-def zero_padding(sequences):
-   """
+def zero_padding(sequences):
+   """
    Pad sequences in input array with 0's such that every sequence
    is of the same length of max(len(sequences)).

    Parameters
    ==========
-   :sequences : np.ndarray / list
+   :sequences: np.ndarray/list
        array or list of encoded protein sequences.

    Returns
    =======
-   :sequences: np.ndarray / list
+   :sequences: np.ndarray/list
        input sequences but with every sequence in the object now zero padded
        to be the same length.
    """
@@ -242,10 +242,10 @@ def save_results(results, file_name, output_folder=""):

    Parameters
    ==========
-   :results : dict/pd.DataFrame/pd.Series
+   :results: dict/pd.DataFrame/pd.Series
        object of the metrics and results from the encoding process. Ideally should
        be a dataframe/series but function also accepts a dict of results.
-   :file_name : str
+   :file_name: str
        file name to call results file.

    Returns
@@ -262,7 +262,7 @@ def save_results(results, file_name, output_folder=""):
    else:
        output_folder = output_folder + "_" + CURRENT_DATETIME

-   #create output folder if it doesnt exist
+   #create output folder if it doesn't exist
    if not (os.path.isdir(output_folder)):
        os.makedirs(output_folder)

@@ -277,5 +277,4 @@ def save_results(results, file_name, output_folder=""):
        results.reset_index(drop=True, inplace=True)
        results.to_csv(os.path.join(output_folder, file_name))
    else:
-       raise TypeError('Results Object must be of type: dict, pd.Series or pd.DataFrame, got object of type {}.'
-           .format(type(results)))
\ No newline at end of file
+       raise TypeError('Results object must be of type: dict, pd.Series or pd.DataFrame, got object of type {}.'.format(type(results)))
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 7b17aab..81953d7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = PySAR
-version = 2.4.0
+version = 2.4.1
description = Analysing Sequence Activity Relationships (SARs) of protein sequences and their mutants using Machine Learning.
author = AJ McKenna
author_email = amckenna41@qub.ac.uk
@@ -13,6 +13,7 @@ maintainer = AJ McKenna
keywords =
    bioinformatics
    protein engineering
+   drug discovery
    python
    pypi
    machine learning
@@ -33,7 +34,6 @@ classifiers =
    Intended Audience :: Information Technology
    License :: OSI Approved :: MIT License
    Natural Language :: English
-   Programming Language :: Python :: 3.7
    Programming Language :: Python :: 3.8
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
@@ -46,7 +46,7 @@ classifiers =
[options]
zip_safe = false
packages = find:
-python_requires = >=3.6
+python_requires = >=3.8
include_package_data = True

install_requires =
diff --git a/setup.py b/setup.py
index f6925b3..b7d7519 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,6 @@
    'Intended Audience :: Information Technology',
    'License :: OSI Approved :: MIT License',
    'Natural Language :: English',
-   'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Programming Language :: Python :: 3.9',
    'Programming Language :: Python :: 3.10',
diff --git a/tests/README.md b/tests/README.md
index 2f98c21..2b7d4bb 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -1,6 +1,6 @@
# pySAR Tests
-All of the modules and functionalities of pySAR are thoroughly tested using the Python [unittest][unittest] framework.
+All of the modules and functionalities of pySAR are thoroughly tested using the Python [unittest][unittest] framework. `pySAR` has hundreds of individual unit tests, with 51 test functions across 6 test cases, one for each module. Running all unit tests takes approximately X minutes.

Module Tests
------------
@@ -10,7 +10,6 @@ Module Tests
* `test_pyDSP` - tests for pyDSP module and class.
* `test_pySAR` - tests for pySAR module and class.
* `test_utils` - tests for utils module and functionality.
-* `test_evaluate` - tests for evaluate module and functionality.

Running Tests
-------------
@@ -21,9 +20,10 @@
python -m unittest discover tests -v
#-v produces a more verbose and useful output
```

To run a module's specific unittests, make sure you are in the `pySAR` directory and from a terminal/cmd-line run:
```python
-python -m unittest tests.test_MODULE -v
+python -m unittest tests.test_MODULE -b
+#-b output during a passing test is discarded. Output is echoed normally on test fail or error and is added to the failure messages.
```

Directory Folders
diff --git a/tests/test_config/README.md b/tests/test_config/README.md
index d658441..0ca3907 100644
--- a/tests/test_config/README.md
+++ b/tests/test_config/README.md
@@ -1,6 +1,6 @@
# PySAR: Configuration files for testing
-* `thermostability.json` - configuration file for using pySAR with the thermostability dataset in the /data folder.
-* `absorption.json` - configuration file for using pySAR with the absorption example dataset in the /example_datasets folder.
-* `enantioselectivity.json` - configuration file for using pySAR with the enantioselectivity example dataset in the /example_datasets folder.
-* `localization.json` - configuration file for using pySAR with the localization example dataset in the /example_datasets folder.
\ No newline at end of file
+* `test_thermostability.json` - configuration file for testing pySAR with the thermostability dataset.
+* `test_absorption.json` - configuration file for testing pySAR with the absorption example dataset.
+* `test_enantioselectivity.json` - configuration file for testing pySAR with the enantioselectivity example dataset. +* `test_localization.json` - configuration file for testing pySAR with the localization example dataset. \ No newline at end of file diff --git a/tests/test_config/test_absorption.json b/tests/test_config/test_absorption.json index 352de7f..fdf9596 100644 --- a/tests/test_config/test_absorption.json +++ b/tests/test_config/test_absorption.json @@ -8,75 +8,72 @@ "model": { - "algorithm": "plsregression", + "algorithm": "knn", "parameters": {}, "test_split": 0.2 }, - - "descriptors": + + "descriptors": { - "descriptors_csv": "", - "all_desc": 0, - "moreaubroto_autocorrelation": + "descriptors_csv": "descriptors_absorption.csv", + "moreaubroto_autocorrelation": { "lag": 30, - "properties":["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", + "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, "moran_autocorrelation": { "lag": 30, - "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", + "properties":["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "geary_autocorrelation": + "geary_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "ctd": + "ctd": { "property": "hydrophobicity", "all": 0 }, - "sequence_order_coupling_number": + "sequence_order_coupling_number": { "lag": 30, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" }, - "quasi_sequence_order": + "quasi_sequence_order": { "lag": 30, "weight": 0.1, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" }, - "pseudo_amino_acid_composition": + "pseudo_amino_acid_composition": { "lambda": 30, "weight": 0.05, - "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", - "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"] + "properties": [] }, - "amphiphilic_pseudo_amino_acid_composition": + "amphiphilic_pseudo_amino_acid_composition": { "lambda": 30, "weight": 0.5 - } - + } }, "pyDSP": { - "use_dsp": 1, + "use_dsp": 0, "spectrum": "power", "window": { "type": "hamming", "sym": 1, - "beta": 14, + "beta": 10, "alpha": null, "nbar": null, "sll": null, @@ -90,5 +87,6 @@ "delta": 1, "mode": "interp" } - } - } \ No newline at end of file + } + } + \ No newline at end of file diff --git a/tests/test_config/test_enantioselectivity.json b/tests/test_config/test_enantioselectivity.json index 340d254..28e535b 100644 --- a/tests/test_config/test_enantioselectivity.json +++ b/tests/test_config/test_enantioselectivity.json @@ -13,37 +13,36 @@ "test_split": 0.2 }, - "descriptors": + "descriptors": { - "descriptors_csv": "", - "all_desc": 0, - "moreaubroto_autocorrelation": + "descriptors_csv": "descriptors_enantioselectivity.csv", + "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "moran_autocorrelation": + "moran_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "geary_autocorrelation": + "geary_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", 
"CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "ctd": + "ctd": { "property": "hydrophobicity", "all": 0 }, - "sequence_order_coupling_number": + "sequence_order_coupling_number": { "lag": 30, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" @@ -54,13 +53,13 @@ "weight": 0.1, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" }, - "pseudo_amino_acid_composition": + "pseudo_amino_acid_composition": { "lambda": 30, "weight": 0.05, "properties": [] }, - "amphiphilic_pseudo_amino_acid_composition": + "amphiphilic_pseudo_amino_acid_composition": { "lambda": 30, "weight": 0.5 @@ -70,11 +69,11 @@ "pyDSP": { "use_dsp": 0, - "spectrum": "absolute", + "spectrum": "power", "window": { - "type": "blackman", + "type": "hamming", "sym": 1, - "beta": 14, + "beta": 10, "alpha": null, "nbar": null, "sll": null, diff --git a/tests/test_config/test_localization.json b/tests/test_config/test_localization.json index 829973d..41b462f 100644 --- a/tests/test_config/test_localization.json +++ b/tests/test_config/test_localization.json @@ -8,60 +8,58 @@ "model": { - "algorithm": "plsregression", + "algorithm": "adaboostregressor", "parameters": {}, "test_split": 0.2 }, - - "descriptors": - { - "descriptors_csv": "", - "all_desc": 0, - "moreaubroto_autocorrelation": + + "descriptors": + { + "descriptors_csv": "descriptors_localization.csv", + "moreaubroto_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "moran_autocorrelation": + "moran_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "geary_autocorrelation": + "geary_autocorrelation": { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "ctd": + "ctd": { - "property":"hydrophobicity", + "property": "hydrophobicity", "all": 0 }, - "sequence_order_coupling_number": + "sequence_order_coupling_number": { "lag": 30, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" }, - "quasi_sequence_order": + "quasi_sequence_order": { "lag": 30, "weight": 0.1, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" }, - "pseudo_amino_acid_composition": + "pseudo_amino_acid_composition": { "lambda": 30, "weight": 0.05, - "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", - "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"] + "properties": [] }, - "amphiphilic_pseudo_amino_acid_composition": + "amphiphilic_pseudo_amino_acid_composition": { "lambda": 30, "weight": 0.5 @@ -71,11 +69,11 @@ "pyDSP": { "use_dsp": 0, - "spectrum": "imaginary", + "spectrum": "power", "window": { - "type": "bartlett", + "type": "hamming", "sym": 1, - "beta": 14, + "beta": 10, "alpha": null, "nbar": null, "sll": null, diff --git a/tests/test_config/test_thermostability.json b/tests/test_config/test_thermostability.json index aafd127..0d65bc9 100644 --- a/tests/test_config/test_thermostability.json +++ b/tests/test_config/test_thermostability.json @@ -12,67 +12,73 @@ "parameters": {}, "test_split": 0.2 }, - - "descriptors": - { + + "descriptors": + { "descriptors_csv": "tests/test_data/test_thermostability_descriptors.csv", - "all_desc": 0, - "moreaubroto_autocorrelation": { + 
"moreaubroto_autocorrelation": + { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "moran_autocorrelation": { + "moran_autocorrelation": + { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "geary_autocorrelation": { + "geary_autocorrelation": + { "lag": 30, "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], "normalize": 1 }, - "ctd": { + "ctd": + { "property": "hydrophobicity", "all": 0 }, - "sequence_order_coupling_number": { + "sequence_order_coupling_number": + { "lag": 30, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" }, - "quasi_sequence_order": { + "quasi_sequence_order": + { "lag": 30, "weight": 0.1, "distance_matrix": "schneider-wrede-physiochemical-distance-matrix.json" }, - "pseudo_amino_acid_composition": { + "pseudo_amino_acid_composition": + { "lambda": 30, "weight": 0.05, - "properties": ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", - "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"] + "properties": [] }, - "amphiphilic_pseudo_amino_acid_composition": { + "amphiphilic_pseudo_amino_acid_composition": + { "lambda": 30, "weight": 0.5 } - }, "pyDSP": { - "use_dsp": 1, + "use_dsp": 0, "spectrum": "power", "window": { - "type": "blackmanharris", + "type": "hamming", "sym": 1, - "beta": 14, + "beta": 10, "alpha": null, "nbar": null, "sll": null, - "norm": null + "norm": null, + "std": null }, "filter": { "type": null, diff --git a/tests/test_descriptors.py b/tests/test_descriptors.py index 95b9c40..888a089 100644 --- a/tests/test_descriptors.py +++ b/tests/test_descriptors.py @@ -13,8 +13,7 @@ class DescriptorTests(unittest.TestCase): """ - Test suite for testing Descriptors module and functionality - in pySAR package. + Test suite for testing Descriptors module and functionality in pySAR package. Test Cases ========== @@ -50,6 +49,8 @@ class DescriptorTests(unittest.TestCase): testing correct Pseudo Amino Acid Composition descriptor functionality. test_amphiphilic_pseudo_amino_acid_composition: testing correct Amphiphilic Pseudo Amino Acid Composition descriptor functionality. + test_get_all_descriptors: + testing correct functionality for calculating all descriptors for a dataset of sequences. test_get_descriptor_encoding: testing correct descriptor encoding functionality. """ @@ -63,6 +64,7 @@ def setUp(self): os.path.join(config_path, "test_absorption.json"), os.path.join(config_path, "test_localization.json") ] + #path to pre-calculated protein descriptors for thermostability dataset self.test_descriptors_path = os.path.join('tests', 'test_data', 'test_thermostability_descriptors.csv') @@ -74,8 +76,7 @@ def setUp(self): "Q", "R", "S", "T", "V", "W", "Y"] def test_descriptor(self): - """ Test descriptor initialisation process. Verify the initial input parameters - and descriptor attributes are correct. """ + """ Test descriptor initialisation process. Verify the initial input parameters and descriptor attributes are correct. """ #1.) 
desc = descr.Descriptors(config_file=self.all_config_files[0]) #pre-calculated descriptors from thermostability dataset @@ -120,7 +121,7 @@ def test_descriptor(self): self.assertEqual(desc.all_descriptors.shape, (self.num_seqs[0], 9714), 'Attribute shape should be [{}, {}], got {}.'.format(self.num_seqs[0], 9714, desc.all_descriptors.shape)) #3.) - #testing on all 4 datasets/config files + #testing on remaining 3 datasets/config files that don't have a pre-calculated descriptors csv for config in range(1, len(self.all_config_files)): desc = descr.Descriptors(config_file=self.all_config_files[config]) @@ -160,35 +161,29 @@ def test_descriptor(self): descr.Descriptors(config_file="") def test_descriptor_groups(self): - """ Testing the descriptor groups dictionary which stores the specific group - that a descriptor attribute is a member of. """ - #testing on all 4 datasets + """ Testing the descriptor groups dictionary which stores the specific group that a descriptor attribute is a member of. """ + #testing on all 4 datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(config_file=self.all_config_files[dataset]) #1.) self.assertEqual(list(desc.descriptor_groups.keys()), desc.all_descriptors_list(), - "Descriptor groups list is incorrect: {}".format(list(desc.descriptor_groups.keys()))) + "Descriptor groups list is incorrect, got:\n{}.".format(list(desc.descriptor_groups.keys()))) self.assertEqual(list(desc.descriptor_groups.values()).count("Composition"), 3, - "Expected there to be 3 composition groups, got {}.".format( - list(desc.descriptor_groups.values()).count("Composition"))) + "Expected there to be 3 composition groups, got {}.".format(list(desc.descriptor_groups.values()).count("Composition"))) self.assertEqual(list(desc.descriptor_groups.values()).count("Autocorrelation"), 3, - "Expected there to be 3 autocorrelation groups, got {}.".format( - list(desc.descriptor_groups.values()).count("Autocorrelation"))) + "Expected there to be 3 autocorrelation groups, got {}.".format(list(desc.descriptor_groups.values()).count("Autocorrelation"))) self.assertEqual(list(desc.descriptor_groups.values()).count("Conjoint Triad"), 1, - "Expected there to be 1 conjoint triad groups, got {}.".format( - list(desc.descriptor_groups.values()).count("Conjoint Triad"))) + "Expected there to be 1 conjoint triad group, got {}.".format(list(desc.descriptor_groups.values()).count("Conjoint Triad"))) self.assertEqual(list(desc.descriptor_groups.values()).count("Sequence Order"), 2, - "Expected there to be 2 sequence order groups, got {}.".format( - list(desc.descriptor_groups.values()).count("Sequence Order"))) + "Expected there to be 2 sequence order groups, got {}.".format(list(desc.descriptor_groups.values()).count("Sequence Order"))) self.assertEqual(list(desc.descriptor_groups.values()).count("CTD"), 4, - "Expected there to be 4 CTD groups, got {}.".format( - list(desc.descriptor_groups.values()).count("CTD"))) + "Expected there to be 4 CTD groups, got {}.".format(list(desc.descriptor_groups.values()).count("CTD"))) self.assertEqual(list(desc.descriptor_groups.values()).count("Pseudo Composition"), 2, - "Expected there to be 2 pseudo composition groups, got {}.".format( - list(desc.descriptor_groups.values()).count("Pseudo Composition"))) + "Expected there to be 2 pseudo composition groups, got {}.".format(list(desc.descriptor_groups.values()).count("Pseudo Composition"))) self.assertEqual(len(desc.descriptor_groups.keys()),
len(desc.all_descriptors_list()), "Expected {} total descriptor groups, got {}.".format(len(desc.all_descriptors_list()), len(desc.descriptor_groups.keys()))) #2.) + #testing correct descriptor group is returned for each descriptor attribute self.assertEqual(desc.descriptor_groups['amino_acid_composition'], "Composition") self.assertEqual(desc.descriptor_groups['dipeptide_composition'], "Composition") self.assertEqual(desc.descriptor_groups['tripeptide_composition'], "Composition") @@ -208,21 +203,17 @@ def test_descriptor_groups(self): self.assertIsInstance(desc.descriptor_groups, dict, "Expected dict, got {}.".format(type(desc.descriptor_groups))) def test_all_descriptors_list(self): - """ Testing function that returns various combinations of available descriptors - using built-in itertools library. """ - #testing on all 4 datasets + """ Testing function that returns various combinations of available descriptors using built-in itertools library. """ + #testing on all 4 datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(config_file=self.all_config_files[dataset]) desc_list_1 = desc.all_descriptors_list(desc_combo=1) desc_list_2 = desc.all_descriptors_list(desc_combo=2) desc_list_3 = desc.all_descriptors_list(desc_combo=3) #1.) - self.assertEqual(len(desc_list_1), 15, - "Expected 15 descriptor combinations, got {}.".format(len(desc_list_1))) - self.assertEqual(len(desc_list_2), 105, - "Expected 105 descriptor combinations, got {}.".format(len(desc_list_2))) - self.assertEqual(len(desc_list_3), 455, - "Expected 455 descriptor combinations, got {}.".format(len(desc_list_3))) + self.assertEqual(len(desc_list_1), 15, "Expected 15 descriptor combinations, got {}.".format(len(desc_list_1))) + self.assertEqual(len(desc_list_2), 105, "Expected 105 descriptor combinations, got {}.".format(len(desc_list_2))) + self.assertEqual(len(desc_list_3), 455, "Expected 455 descriptor combinations, got {}.".format(len(desc_list_3))) #2.) self.assertIsInstance(desc_list_1, list, "Expected list, got {}.".format(type(desc_list_1))) self.assertIsInstance(desc_list_2, list, "Expected list, got {}.".format(type(desc_list_2))) @@ -277,14 +268,16 @@ def test_descriptor_import(self): self.assertFalse(desc.all_descriptors.empty, "Descriptor dataframe should not be empty.") #2.) with self.assertRaises(OSError): - desc.import_descriptors("invalid_filepath.csv") + desc.import_descriptors("invalid_csv.csv") + desc.import_descriptors("blahblahblah") #3.) with self.assertRaises(TypeError): desc.import_descriptors(1234) + desc.import_descriptors(False) def test_amino_acid_composition(self): """ Testing Amino Acid Composition protein descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.) 
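        #amino acid composition is each canonical amino acid's relative frequency in a
        #sequence, giving the (num_seqs, 20) shape asserted below; a rough equivalent of
        #the calculation being verified (hypothetical helper, not the pySAR implementation):
        #   def aa_composition(seq, amino_acids):
        #       return {aa: seq.count(aa) / len(seq) for aa in amino_acids}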
@@ -292,42 +285,43 @@ def test_amino_acid_composition(self): aa_comp = desc.get_amino_acid_composition() self.assertFalse(aa_comp.empty, 'Descriptor dataframe should not be empty') - self.assertTrue(desc.amino_acid_composition.equals(aa_comp), - 'Output dataframe and class attribute dataframes must be the same.') - self.assertEqual(aa_comp.shape, (self.num_seqs[dataset], 20), 'Descriptor not of correct shape.') - self.assertIsInstance(aa_comp, pd.DataFrame, 'Descriptor should be of type DataFrame.') + self.assertTrue(desc.amino_acid_composition.equals(aa_comp), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(aa_comp.shape, (self.num_seqs[dataset], 20), 'Descriptor not correct shape, got {}.'.format(aa_comp.shape)) + self.assertIsInstance(aa_comp, pd.DataFrame, 'Descriptor should be of type DataFrame, got {}.'.format(type(aa_comp))) self.assertTrue(aa_comp.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(aa_comp).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(aa_comp.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(aa_comp.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(aa_comp.dtypes))) self.assertEqual(self.amino_acids, list(aa_comp.columns), 'Incorrect column values found in output dataframe: {}.'.format(aa_comp.columns)) def test_dipeptide_composition(self): """ Testing Dipeptide Composition protein descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.) 
#get descriptor values dipeptide_comp = desc.get_dipeptide_composition() - self.assertTrue(desc.dipeptide_composition.equals(dipeptide_comp), 'Output dataframe and class attribute dataframes must be the same.') self.assertFalse(dipeptide_comp.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(dipeptide_comp.shape, (self.num_seqs[dataset], 400), 'Descriptor not of correct shape ({}, 400).'.format(self.num_seqs[dataset])) - self.assertIsInstance(dipeptide_comp, pd.DataFrame, 'Descriptor should be of type DataFrame.') + self.assertTrue(desc.dipeptide_composition.equals(dipeptide_comp), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(dipeptide_comp.shape, (self.num_seqs[dataset], 400), 'Descriptor not correct shape, got {}.'.format(dipeptide_comp.shape)) + self.assertIsInstance(dipeptide_comp, pd.DataFrame, 'Descriptor should be of type DataFrame, got {}.'.format(type(dipeptide_comp))) self.assertTrue(dipeptide_comp.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(dipeptide_comp).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(dipeptide_comp.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(dipeptide_comp.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(dipeptide_comp.dtypes))) for col in list(dipeptide_comp.columns): #check all columns follow pattern of XY where x & y are amino acids self.assertTrue(bool(re.match(r'^[A-Z]{2}$', col)), "") - self.assertIn(col[0], self.amino_acids, "") - self.assertIn(col[1], self.amino_acids, "") + self.assertIn(col[0], self.amino_acids, "Column contains an invalid amino acid {}.".format(col[0])) + self.assertIn(col[1], self.amino_acids, "Column contains an invalid amino acid {}.".format(col[1])) @unittest.skip("Descriptor can take quite a bit of time to calculate therefore skipping") def test_tripeptide_composition(self): """ Testing Tripeptide Composition protein descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.) 
@@ -335,58 +329,63 @@ def test_tripeptide_composition(self): tripeptide_comp = desc.get_tripeptide_composition() self.assertFalse(tripeptide_comp.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(tripeptide_comp.shape, (self.num_seqs[dataset], 8000), 'Descriptor not of correct ({}, 8000).'.format(self.num_seqs[dataset])) self.assertTrue(desc.tripeptide_composition.equals(tripeptide_comp), 'Output dataframe and class attribute dataframes must be the same.') - self.assertIsInstance(tripeptide_comp, pd.DataFrame, 'Descriptor should be of type DataFrame.') + self.assertEqual(tripeptide_comp.shape, (self.num_seqs[dataset], 8000), 'Descriptor not correct shape, got {}.'.format(tripeptide_comp.shape)) + self.assertIsInstance(tripeptide_comp, pd.DataFrame, 'Descriptor should be of type DataFrame, got {}.'.format(type(tripeptide_comp))) self.assertTrue(tripeptide_comp.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(tripeptide_comp).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(tripeptide_comp.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(tripeptide_comp.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(tripeptide_comp.dtypes))) for col in list(tripeptide_comp.columns): #check all columns follow pattern of XYZ where x, y & z are amino acids self.assertTrue(bool(re.match(r'^[A-Z]{3}$', col)), "") - self.assertIn(col[0], self.amino_acids, "") - self.assertIn(col[1], self.amino_acids, "") - self.assertIn(col[2], self.amino_acids, "") + self.assertIn(col[0], self.amino_acids, "Column contains an invalid amino acid {}.".format(col[0])) + self.assertIn(col[1], self.amino_acids, "Column contains an invalid amino acid {}.".format(col[1])) + self.assertIn(col[2], self.amino_acids, "Column contains an invalid amino acid {}.".format(col[2])) def test_moreaubroto_autocorrelation(self): """ Testing Moreau-Broto autocorrelation descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.)
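        #with the 8 AAIndex properties and lag of 30 set in the test configs, each
        #autocorrelation descriptor yields 8 * 30 = 240 features, hence the
        #(num_seqs, 240) shape assertions below; the normalized Moreau-Broto value at
        #lag d follows the standard definition (sketch, not pySAR's own code):
        #   ac[d] = sum(p[i] * p[i + d] for i in range(n - d)) / (n - d)
        #where p[i] is the normalized AAIndex property value of residue i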
#get descriptor values - moreaubroto = desc.get_moreaubroto_autocorrelation() - - self.assertFalse(moreaubroto.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(moreaubroto.shape, (self.num_seqs[dataset], 240), 'Descriptor not of correct ({}, 240)'.format(self.num_seqs[dataset])) - self.assertIsInstance(moreaubroto, pd.DataFrame, "Descriptor should be of type DataFrame.") - self.assertTrue(moreaubroto.any().isnull().sum()==0, 'Descriptor should not contain any null values.') - self.assertTrue(all(col == np.float64 for col in list(moreaubroto.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(moreaubroto.dtypes))) + moreau_broto = desc.get_moreaubroto_autocorrelation() + + self.assertFalse(moreau_broto.empty, 'Descriptor dataframe should not be empty.') + self.assertTrue(desc.moreaubroto_autocorrelation.equals(moreau_broto), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(moreau_broto.shape, (self.num_seqs[dataset], 240), 'Descriptor not correct shape, got {}.'.format(moreau_broto.shape)) + self.assertIsInstance(moreau_broto, pd.DataFrame, "Descriptor should be of type DataFrame.") + self.assertTrue(moreau_broto.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(moreau_broto).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') + self.assertTrue(all(col == np.float64 for col in list(moreau_broto.dtypes)), + "Column datatypes should be np.float64, got:\n{}.".format(list(moreau_broto.dtypes))) #check all columns follow pattern of MBAuto_X_Y where x is the accession number of #the AAindex record and y is the count of the descriptor - for col in list(moreaubroto.columns): + for col in list(moreau_broto.columns): self.assertTrue(bool(re.match(r"MBAuto_[A-Z0-9]{10}_[0-9]", col)), "Column name doesn't match expected regex pattern: {}.".format(col)) def test_moran_autocorrelation(self): """ Testing Moran autocorrelation descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.)
#get descriptor values - moran = desc.get_moran_autocorrelation() - - self.assertEqual(moran.shape, (self.num_seqs[dataset], 240), 'Descriptor not of correct ({}, 240).'.format(self.num_seqs[dataset])) - self.assertIsInstance(moran, pd.DataFrame, "Descriptor should be of type DataFrame.") - self.assertFalse(moran.empty, 'Descriptor dataframe should not be empty.') - self.assertTrue(moran.any().isnull().sum()==0, 'Descriptor should not contain any null values.') - self.assertTrue(all(col == np.float64 for col in list(moran.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(moran.dtypes))) + moran_auto = desc.get_moran_autocorrelation() + + self.assertFalse(moran_auto.empty, 'Descriptor dataframe should not be empty.') + self.assertTrue(desc.moran_autocorrelation.equals(moran_auto), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(moran_auto.shape, (self.num_seqs[dataset], 240), 'Descriptor not correct shape, got {}.'.format(moran_auto.shape)) + self.assertIsInstance(moran_auto, pd.DataFrame, "Descriptor should be of type DataFrame.") + self.assertTrue(moran_auto.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(moran_auto).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') + self.assertTrue(all(col == np.float64 for col in list(moran_auto.dtypes)), + "Column datatypes should be np.float64, got:\n{}.".format(list(moran_auto.dtypes))) #check all columns follow pattern of MAuto_X_Y where x is the accession number of #the AAindex record and y is the count of the descriptor - for col in list(moran.columns): + for col in list(moran_auto.columns): self.assertTrue(bool(re.match(r"MAuto_[A-Z0-9]{10}_[0-9]", col)), "Column name doesn't match expected regex pattern: {}.".format(col)) @@ -397,17 +396,19 @@ def test_geary_autocorrelation(self): desc = descr.Descriptors(self.all_config_files[dataset]) #1.)
#get descriptor values - geary = desc.get_geary_autocorrelation() - - self.assertFalse(geary.empty, 'Descriptor dataframe should not be empty') - self.assertEqual(geary.shape, (self.num_seqs[dataset], 240), 'Descriptor not of correct ({}, 240).'.format(self.num_seqs[dataset])) - self.assertIsInstance(geary, pd.DataFrame, "Descriptor should be of type DataFrame.") - self.assertTrue(geary.any().isnull().sum()==0, 'Descriptor should not contain any null values.') - self.assertTrue(all(col == np.float64 for col in list(geary.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(geary.dtypes))) + geary_auto = desc.get_geary_autocorrelation() + + self.assertFalse(geary_auto.empty, 'Descriptor dataframe should not be empty.') + self.assertTrue(desc.geary_autocorrelation.equals(geary_auto), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(geary_auto.shape, (self.num_seqs[dataset], 240), 'Descriptor not correct shape, got {}.'.format(geary_auto.shape)) + self.assertIsInstance(geary_auto, pd.DataFrame, "Descriptor should be of type DataFrame.") + self.assertTrue(geary_auto.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(geary_auto).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') + self.assertTrue(all(col == np.float64 for col in list(geary_auto.dtypes)), + "Column datatypes should be np.float64, got:\n{}.".format(list(geary_auto.dtypes))) #check all columns follow pattern of GAuto_X_Y where x is the accession number of #the AAindex record and y is the count of the descriptor - for col in list(geary.columns): + for col in list(geary_auto.columns): self.assertTrue(bool(re.match(r"GAuto_[A-Z0-9]{10}_[0-9]", col)), "Column name doesn't match expected regex pattern: {}.".format(col)) @@ -416,7 +417,7 @@ def test_ctd(self): ctd_properties = ["hydrophobicity", "normalized_vdwv", "polarity", "charge", "secondary_struct", "solvent_accessibility", "polarizability"] - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.)
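        #for a single physiochemical property CTD yields 3 composition + 3 transition +
        #15 distribution = 21 features, matching the (num_seqs, 21) shape asserted
        #below; the component methods tested afterwards return those 3, 3 and 15
        #feature subsets individually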
@@ -424,11 +425,13 @@ def test_ctd(self): ctd = desc.get_ctd() self.assertFalse(ctd.empty, 'Descriptor dataframe should not be empty') - self.assertEqual(ctd.shape, (self.num_seqs[dataset], 21), 'Descriptor not of correct ({}, 21).'.format(self.num_seqs[dataset])) + self.assertTrue(desc.ctd.equals(ctd), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(ctd.shape, (self.num_seqs[dataset], 21), 'Descriptor not of correct shape, got {}.'.format(ctd.shape)) self.assertIsInstance(ctd, pd.DataFrame, "Descriptor should be of type DataFrame.") self.assertTrue(ctd.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(ctd).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(ctd.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(ctd.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(ctd.dtypes))) #iterate over all columns and check its name follows expected format for col in list(ctd.columns): matching_col = False @@ -444,11 +447,13 @@ def test_ctd(self): ctd_comp = desc.get_ctd_composition() self.assertFalse(ctd_comp.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(ctd_comp.shape, (self.num_seqs[dataset], 3), 'Descriptor not of correct ({}, 3).'.format(self.num_seqs[dataset])) + self.assertTrue(desc.ctd_composition.equals(ctd_comp), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(ctd_comp.shape, (self.num_seqs[dataset], 3), 'Descriptor not of correct shape, got {}.'.format(ctd_comp.shape)) self.assertIsInstance(ctd_comp, pd.DataFrame, "Descriptor should be of type DataFrame.") self.assertTrue(ctd_comp.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(ctd_comp).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(ctd_comp.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(ctd_comp.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(ctd_comp.dtypes))) #iterate over all columns and check its name follows expected format for col in list(ctd_comp.columns): matching_col = False @@ -464,11 +469,13 @@ def test_ctd(self): ctd_trans = desc.get_ctd_transition() self.assertFalse(ctd_trans.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(ctd_trans.shape, (self.num_seqs[dataset], 3), 'Descriptor not of correct ({}, 3).'.format(self.num_seqs[dataset])) + self.assertTrue(desc.ctd_transition.equals(ctd_trans), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(ctd_trans.shape, (self.num_seqs[dataset], 3), 'Descriptor not of correct shape, got {}.'.format(ctd_trans.shape)) self.assertIsInstance(ctd_trans, pd.DataFrame, "Descriptor should be of type DataFrame.") self.assertTrue(ctd_trans.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(ctd_trans).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(ctd_trans.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(ctd_trans.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(ctd_trans.dtypes))) #iterate over all columns and check its name follows expected format for col in list(ctd_trans.columns): matching_col = False @@ -484,11 +491,13 @@ def test_ctd(self): ctd_distr = desc.get_ctd_distribution() self.assertFalse(ctd_distr.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(ctd_distr.shape, (self.num_seqs[dataset], 15), 'Descriptor not of correct ({}, 15).'.format(self.num_seqs[dataset])) + self.assertTrue(desc.ctd_distribution.equals(ctd_distr), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(ctd_distr.shape, (self.num_seqs[dataset], 15), 'Descriptor not of correct shape, got {}.'.format(ctd_distr.shape)) self.assertIsInstance(ctd_distr, pd.DataFrame, "Descriptor should be of type DataFrame.") self.assertTrue(ctd_distr.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(ctd_distr).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(ctd_distr.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(ctd_distr.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(ctd_distr.dtypes))) #iterate over all columns and check its name follows expected format for col in list(ctd_distr.columns): matching_col = False @@ -502,7 +511,7 @@ def test_ctd(self): def test_conjoint_triad(self): """ Testing Conjoint Triad descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.) @@ -510,11 +519,13 @@ def test_conjoint_triad(self): conjoint_triad = desc.get_conjoint_triad() self.assertFalse(conjoint_triad.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(conjoint_triad.shape, (self.num_seqs[dataset], 343), 'Descriptor not of correct shape (1, 343).') + self.assertTrue(desc.conjoint_triad.equals(conjoint_triad), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(conjoint_triad.shape, (self.num_seqs[dataset], 343), 'Descriptor not of correct shape, got {}.'.format(conjoint_triad.shape)) self.assertIsInstance(conjoint_triad, pd.DataFrame, 'Descriptor should be of type DataFrame.') self.assertTrue(conjoint_triad.any().isnull().sum()==0,'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(conjoint_triad).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.int64 for col in list(conjoint_triad.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(conjoint_triad.dtypes))) + "Column datatypes should be np.int64, got:\n{}.".format(list(conjoint_triad.dtypes))) #iterate over all columns and check its name follows expected format for col in list(conjoint_triad.columns): self.assertTrue(bool(re.match(r"conj_triad_[0-9]{3}", col)), @@ -522,7 +533,7 @@ def test_sequence_order_coupling_number(self): """ Testing sequence order coupling number descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0, len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) #1.)
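        #the lag-d sequence order coupling number follows the standard definition
        #(sketch, not pySAR's own code):
        #   tau[d] = sum(dist(seq[i], seq[i + d]) ** 2 for i in range(n - d))
        #where dist is looked up from the schneider-wrede physiochemical distance
        #matrix named in the config; a lag of 30 gives the (num_seqs, 30) shape below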
@@ -530,11 +541,13 @@ def test_sequence_order_coupling_number(self): sequence_order_coupling_number = desc.get_sequence_order_coupling_number() self.assertFalse(sequence_order_coupling_number.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(sequence_order_coupling_number.shape, (self.num_seqs[dataset], 30), 'Descriptor not of correct shape (1, 30).') + self.assertTrue(desc.sequence_order_coupling_number.equals(sequence_order_coupling_number), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(sequence_order_coupling_number.shape, (self.num_seqs[dataset], 30), 'Descriptor not of correct shape, got {}.'.format(sequence_order_coupling_number.shape)) self.assertIsInstance(sequence_order_coupling_number, pd.DataFrame, 'Descriptor should be of type DataFrame.') self.assertTrue(sequence_order_coupling_number.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(sequence_order_coupling_number).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(sequence_order_coupling_number.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(sequence_order_coupling_number.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(sequence_order_coupling_number.dtypes))) #check all columns follow pattern of SOCN_SWX or SOCN_SWXY where x & y are integers between 0 and 9 for col in list(sequence_order_coupling_number.columns): self.assertTrue((bool(re.match(r'SOCN_SW[0-9]', col)) or bool(re.match(r'SOCN_SW[0-9][0-9]', col))), @@ -542,7 +555,7 @@ def test_quasi_sequence_order(self): """ Testing Quasi sequence order descriptor attributes and methods. """ - #run tests on all test datasets + #run tests on all 4 test datasets and config files for dataset in range(0,len(self.all_config_files)): desc = descr.Descriptors(self.all_config_files[dataset]) @@ -550,11 +563,13 @@ def test_quasi_sequence_order(self): quasi_sequence_order = desc.get_quasi_sequence_order() #1.)
self.assertFalse(quasi_sequence_order.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(quasi_sequence_order.shape, (self.num_seqs[dataset], 50), 'Descriptor not of correct shape (1, 100).') + self.assertTrue(desc.quasi_sequence_order.equals(quasi_sequence_order), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(quasi_sequence_order.shape, (self.num_seqs[dataset], 50), 'Descriptor not of correct shape, got {}.'.format(quasi_sequence_order.shape)) self.assertIsInstance(quasi_sequence_order, pd.DataFrame, 'Descriptor should be of type DataFrame.') self.assertTrue(quasi_sequence_order.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(quasi_sequence_order).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(quasi_sequence_order.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(quasi_sequence_order.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(quasi_sequence_order.dtypes))) #check all columns follow pattern of QSO_SWX, where x is an integer between 0 and 9 for col in list(quasi_sequence_order.columns): self.assertTrue((bool(re.match(r'QSO_SW[0-9]', col))), @@ -564,18 +579,20 @@ def test_pseudo_amino_acid_composition(self): def test_pseudo_amino_acid_composition(self): """ Testing Pseudo Amino Acid Composition descriptor attributes and methods. """ - #running unit test on one of the datasets due to length of computation + #running unit test on one of the datasets due to length of computation - thermostability desc = descr.Descriptors(self.all_config_files[0]) #1.) #get descriptor values pseudo_aa_comp = desc.get_pseudo_amino_acid_composition() self.assertFalse(pseudo_aa_comp.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(pseudo_aa_comp.shape, (self.num_seqs[0], 50), 'Descriptor not of correct shape (1,50).') + self.assertTrue(desc.pseudo_amino_acid_composition.equals(pseudo_aa_comp), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(pseudo_aa_comp.shape, (self.num_seqs[0], 50), 'Descriptor not of correct shape, got {}.'.format(pseudo_aa_comp.shape)) self.assertIsInstance(pseudo_aa_comp, pd.DataFrame, 'Descriptor should be of type DataFrame.') self.assertTrue(pseudo_aa_comp.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(pseudo_aa_comp).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(pseudo_aa_comp.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(pseudo_aa_comp.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(pseudo_aa_comp.dtypes))) #check all columns follow pattern of PAACX, where x is an integer between 0 and 9 for col in list(pseudo_aa_comp.columns): self.assertTrue(bool(re.match(r"PAAC[0-9]", col)), @@ -586,48 +603,63 @@ def test_pseudo_amino_acid_composition(self): @unittest.skip("Descriptor can take quite a bit of time to calculate therefore skipping.") def test_amphiphilic_pseudo_amino_acid_composition(self): """ Testing Amphiphilic Pseudo Amino Acid Composition descriptor attributes and methods. """ - #running unit test on one of the datasets due to length of computation + #running unit test on one of the datasets due to length of computation - thermostability desc = descr.Descriptors(self.all_config_files[0]) #1.)
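        #amphiphilic pseudo amino acid composition produces 20 + (2 * lambda) features -
        #the 20 amino acid composition components plus hydrophobicity and hydrophilicity
        #correlation factors for each lag up to lambda - so lambda=30 in the config gives
        #the (num_seqs, 80) shape asserted below, mirroring the 20 + lambda = 50 features
        #of the pseudo amino acid composition tested above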
#get descriptor values amphiphilic_pseudo_aac = desc.get_amphiphilic_pseudo_amino_acid_composition() self.assertFalse(amphiphilic_pseudo_aac.empty, 'Descriptor dataframe should not be empty.') - self.assertEqual(amphiphilic_pseudo_aac.shape, (self.num_seqs[1], 80), 'Descriptor not of correct shape (1, 80).') + self.assertTrue(desc.amphiphilic_pseudo_amino_acid_composition.equals(amphiphilic_pseudo_aac), 'Output dataframe and class attribute dataframes must be the same.') + self.assertEqual(amphiphilic_pseudo_aac.shape, (self.num_seqs[1], 80), 'Descriptor not of correct shape, got {}.'.format(amphiphilic_pseudo_aac.shape)) self.assertIsInstance(amphiphilic_pseudo_aac, pd.DataFrame, 'Descriptor should be of type DataFrame.') self.assertTrue(amphiphilic_pseudo_aac.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(amphiphilic_pseudo_aac).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(amphiphilic_pseudo_aac.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(amphiphilic_pseudo_aac.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(amphiphilic_pseudo_aac.dtypes))) #check all columns follow pattern of APAAC_X, where x is an integer between 0 and 9 for col in list(amphiphilic_pseudo_aac.columns): self.assertTrue(bool(re.match(r"APAAC_[0-9]", col)), "Column doesn't follow correct naming convention: {}.".format(col)) - - # @unittest.skip("Test case requires recalculating all descriptors which is redundant to the above tests") + + def test_get_all_descriptors(self): + """ Testing functionality for calculating all protein descriptors for a dataset of protein sequences. + Only testing on the thermostability dataset/config as its protein descriptors have been + pre-calculated. Testing on the other datasets could take several hours each. """ +#1.) + #only testing on thermostability dataset to access pre-calculated descriptors + desc = descr.Descriptors(self.all_config_files[0]) + all_descriptors = desc.get_all_descriptors() + + self.assertIsInstance(all_descriptors, pd.DataFrame, 'Expected function output to be of type DataFrame, got {}.'.format(type(all_descriptors))) + self.assertEqual(all_descriptors.shape, (261, 9714), "Expected shape of output to be 261 x 9714, got {}.".format(all_descriptors.shape)) + + # @unittest.skip("Test case requires recalculating all descriptors which is redundant to the above tests") def test_get_descriptor_encoding(self): """ Testing get_descriptor_encoding function by passing string of approximate descriptor names in to get encoding. """ -#1.) desc = descr.Descriptors(self.all_config_files[0]) #using thermostability config to access pre-calculated descriptors - +#1.)
aa_comp_desc = desc.get_descriptor_encoding("amino_comp") - self.assertIsInstance(aa_comp_desc, pd.DataFrame, - 'Descriptor attribute should be a dataframe, got {}.'.format(type(aa_comp_desc))) + + self.assertIsInstance(aa_comp_desc, pd.DataFrame, 'Descriptor attribute should be a dataframe, got {}.'.format(type(aa_comp_desc))) self.assertEqual(aa_comp_desc.shape, (self.num_seqs[0], 20), "Attribute shape should be ({}, {}), got {}.".format(self.num_seqs[0], 20, aa_comp_desc.shape)) self.assertTrue(aa_comp_desc.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(aa_comp_desc).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(aa_comp_desc.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(aa_comp_desc.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(aa_comp_desc.dtypes))) self.assertEqual(self.amino_acids, list(aa_comp_desc.columns), 'Incorrect column values found in output dataframe: {}.'.format(aa_comp_desc.columns)) #2.) geary_auto_desc = desc.get_descriptor_encoding("geary_auto") - self.assertIsInstance(geary_auto_desc, pd.DataFrame, - 'Descriptor attribute should be a dataframe, got {}.'.format(type(geary_auto_desc))) + + self.assertIsInstance(geary_auto_desc, pd.DataFrame, 'Descriptor attribute should be a dataframe, got {}.'.format(type(geary_auto_desc))) self.assertEqual(geary_auto_desc.shape, (self.num_seqs[0], 240), "Attribute shape should be ({}, {}), got {}.".format(self.num_seqs[0], 240, geary_auto_desc.shape)) self.assertTrue(geary_auto_desc.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(geary_auto_desc).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(geary_auto_desc.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(geary_auto_desc.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(geary_auto_desc.dtypes))) #check all columns follow pattern of GAuto_X_Y where x is the accession number of #the AAindex record and y is the count of the descriptor for col in list(geary_auto_desc.columns): @@ -635,43 +667,46 @@ def test_get_descriptor_encoding(self): "Column name doesn't match expected regex pattern: {}.".format(col)) #3.)
socn_desc = desc.get_descriptor_encoding("sequence_order_coupling") - self.assertIsInstance(socn_desc, pd.DataFrame, - 'Descriptor attribute should be a dataframe, got {}.'.format(socn_desc)) + + self.assertIsInstance(socn_desc, pd.DataFrame, 'Descriptor attribute should be a dataframe, got {}.'.format(type(socn_desc))) self.assertEqual(socn_desc.shape, (self.num_seqs[0], 30), "Attribute shape should be ({}, {}), got {}.".format(self.num_seqs[0], 30, socn_desc.shape)) self.assertTrue(socn_desc.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(socn_desc).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(socn_desc.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(socn_desc.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(socn_desc.dtypes))) #check all columns follow pattern of SOCN_SWX or SOCN_SWXY where x & y are integers between 0 and 9 for col in list(socn_desc.columns): self.assertTrue((bool(re.match(r'SOCN_SW[0-9]', col)) or bool(re.match(r'SOCN_SW[0-9][0-9]', col))), "Column name doesn't match expected regex pattern: {}.".format(col)) #4.) dipeptide_comp_desc = desc.get_descriptor_encoding("dipeptide") - self.assertIsInstance(dipeptide_comp_desc, pd.DataFrame, - 'Descriptor attribute should be a dataframe, got {}.'.format(dipeptide_comp_desc)) + + self.assertIsInstance(dipeptide_comp_desc, pd.DataFrame, 'Descriptor attribute should be a dataframe, got {}.'.format(type(dipeptide_comp_desc))) self.assertEqual(dipeptide_comp_desc.shape, (self.num_seqs[0], 400), "Attribute shape should be ({}, {}), got {}.".format(self.num_seqs[0], 400, dipeptide_comp_desc.shape)) self.assertTrue(dipeptide_comp_desc.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(dipeptide_comp_desc).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(dipeptide_comp_desc.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(dipeptide_comp_desc.dtypes))) + "Column datatypes should be np.float64, got:\n{}.".format(list(dipeptide_comp_desc.dtypes))) for col in list(dipeptide_comp_desc.columns): #check all columns follow pattern of XY where x & y are amino acids self.assertTrue(bool(re.match(r'^[A-Z]{2}$', col)), "") - self.assertIn(col[0], self.amino_acids, "") - self.assertIn(col[1], self.amino_acids, "") + self.assertIn(col[0], self.amino_acids, "Column contains invalid amino acid: {}.".format(col[0])) + self.assertIn(col[1], self.amino_acids, "Column contains invalid amino acid: {}.".format(col[1])) #5.)
ctd_transition_desc = desc.get_descriptor_encoding("ctd_transition") ctd_properties = ["hydrophobicity", "normalized_vdwv", "polarity", "charge", "secondary_struct", "solvent_accessibility", "polarizability"] - self.assertIsInstance(ctd_transition_desc, pd.DataFrame, - 'Descriptor attribute should be a dataframe, got {}.'.format(ctd_transition_desc)) + + self.assertIsInstance(ctd_transition_desc, pd.DataFrame, 'Descriptor attribute should be a dataframe, got {}.'.format(type(ctd_transition_desc))) self.assertEqual(ctd_transition_desc.shape, (self.num_seqs[0], 3), "Attribute shape should be ({}, {}), got {}.".format(self.num_seqs[0], 3, ctd_transition_desc.shape)) self.assertTrue(ctd_transition_desc.any().isnull().sum()==0, 'Descriptor should not contain any null values.') + self.assertTrue(np.isinf(ctd_transition_desc).values.sum()==0, 'Descriptor should not contain any +/- infinity values.') self.assertTrue(all(col == np.float64 for col in list(ctd_transition_desc.dtypes)), - "Column datatypes should be np.float64, got:\n{}".format(list(ctd_transition_desc.dtypes))) - #check all columns follow pattern of APAAC_X, where x is an integer between 0 and 9 + "Column datatypes should be np.float64, got:\n{}.".format(list(ctd_transition_desc.dtypes))) + #check all columns follow correct format for col in list(ctd_transition_desc.columns): matching_col = False for prop in ctd_properties: diff --git a/tests/test_encoding.py b/tests/test_encoding.py index cf42df8..d44cd09 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -7,20 +7,18 @@ import shutil import unittest from aaindex import aaindex1 +import numpy as np unittest.TestLoader.sortTestMethodsUsing = None -#stop sklearn warnings def warn(*args, **kwargs): - pass +#suppress sklearn warnings import warnings -warnings.warn = warn - +warnings.filterwarnings("ignore") + import pySAR.encoding as pysar_ import pySAR.globals_ as _globals class EncodingTests(unittest.TestCase): """ - Test suite for testing encoding module and functionality - in pySAR package. + Test suite for testing encoding module and functionality in pySAR package. Test Cases ========== @@ -43,10 +41,10 @@ def setUp(self): ] #create instance of Encoding class for each config file - self.test_config1 = pysar_.Encoding(config_file=self.all_config_files[0]) - self.test_config2 = pysar_.Encoding(config_file=self.all_config_files[1]) - self.test_config3 = pysar_.Encoding(config_file=self.all_config_files[2]) - self.test_config4 = pysar_.Encoding(config_file=self.all_config_files[3]) + self.test_config_thermostability = pysar_.Encoding(config_file=self.all_config_files[0]) + self.test_config_enantioselectivity = pysar_.Encoding(config_file=self.all_config_files[1]) + self.test_config_absorption = pysar_.Encoding(config_file=self.all_config_files[2]) + self.test_config_localization = pysar_.Encoding(config_file=self.all_config_files[3]) #list of canonical amino acids self.amino_acids = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"] @@ -84,524 +82,387 @@ def setUp(self): #temporary unit test output folder self.test_output_folder = os.path.join("tests", "test_outputs") - @unittest.skip("") + # @unittest.skip("Skipping aai encoding tests.") def test_aai_encoding(self): """ Testing AAI encoding functionality in Encoding module. """ #1.)
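        #aai encoding numerically encodes each protein sequence using the values of the
        #given AAIndex records, builds a regression model per index and ranks the indices
        #by the chosen sort_by metric; a sketch of the per-index encoding step
        #(illustrative only, assumed record accessor, not the Encoding class internals):
        #   record_values = aaindex1["FAUJ880110"].values   #amino acid -> value mapping
        #   encoded_seq = [record_values[aa] for aa in sequence]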
- test_aai1 = ["FAUJ880110", "GEIM800111"] - test_encoding1 = self.test_config1.aai_encoding(aai_indices=test_aai1, sort_by="R2", output_folder=self.test_output_folder) + test_aai1 = ["FAUJ880110", "GEIM800111"] #thermostability dataset and config + test_encoding_thermostability = self.test_config_thermostability.aai_encoding(aai_indices=test_aai1, sort_by="R2", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding1, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding1))) - self.assertEqual(len(test_encoding1), 2, - "Expected 2 rows in output dataframe, got {}.".format(len(test_encoding1))) - self.assertEqual(set(list(test_encoding1["Index"])), set(test_aai1), - "Output index values don't match expected.") - self.assertEqual(test_encoding1["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding1["Index"].dtype)) - self.assertEqual(test_encoding1["Category"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding1["Category"].dtype)) - self.assertEqual(test_encoding1["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding1["R2"].dtype)) - self.assertEqual(test_encoding1["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding1["RMSE"].dtype)) - self.assertEqual(test_encoding1["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding1["MSE"].dtype)) - self.assertEqual(test_encoding1["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding1["MAE"].dtype)) - self.assertEqual(test_encoding1["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding1["RPD"].dtype)) - self.assertEqual(test_encoding1["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding1["Explained Variance"].dtype)) - for cat in list(test_encoding1["Category"]): + self.assertIsInstance(test_encoding_thermostability, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_thermostability))) + self.assertEqual(len(test_encoding_thermostability), 2, + "Expected 2 rows in output dataframe, got {}.".format(len(test_encoding_thermostability))) + self.assertEqual(set(list(test_encoding_thermostability["Index"])), set(test_aai1), + "Output index values don't match expected, got {}.".format(set(list(test_encoding_thermostability["Index"])))) + for cat in list(test_encoding_thermostability["Category"]): self.assertIn(cat, self.index_categories, - "Category {} not found in list of categories:\n{}".format(cat, self.index_categories)) - for col in test_encoding1.columns: - self.assertIn(col, self.expected_aai_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_encoding_output_columns)) + "Category {} not found in list of categories:\n{}.".format(cat, self.index_categories)) + for col in test_encoding_thermostability.columns: + self.assertIn(col, self.expected_aai_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_encoding_output_columns)) + if (col == "Index" or col == "Category"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_thermostability[col]))) + else: + 
self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_thermostability[col]))) #2.) - test_aai2 = ["FAUJ880110", "GEIM800111", "JOND750102", "MAXF760102"] - test_encoding2 = self.test_config2.aai_encoding(aai_indices=test_aai2, sort_by="RMSE", output_folder=self.test_output_folder) + test_aai2 = ["FAUJ880110", "GEIM800111", "JOND750102", "MAXF760102"] #enantioselectivity dataset and config + test_encoding_enantioselectivity = self.test_config_enantioselectivity.aai_encoding(aai_indices=test_aai2, sort_by="RMSE", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding2, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding2))) - self.assertEqual(len(test_encoding2), 4, - "Expected 4 rows in output dataframe, got {}.".format(len(test_encoding2))) - self.assertEqual(set(list(test_encoding2["Index"])), set(test_aai2), - "Output index values don't match expected.") - self.assertEqual(test_encoding2["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding2["Index"].dtype)) - self.assertEqual(test_encoding2["Category"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding2["Category"].dtype)) - self.assertEqual(test_encoding2["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding2["R2"].dtype)) - self.assertEqual(test_encoding2["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding2["RMSE"].dtype)) - self.assertEqual(test_encoding2["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding2["MSE"].dtype)) - self.assertEqual(test_encoding2["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding2["MAE"].dtype)) - self.assertEqual(test_encoding2["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding2["RPD"].dtype)) - self.assertEqual(test_encoding2["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding2["Explained Variance"].dtype)) - for cat in list(test_encoding2["Category"]): + self.assertIsInstance(test_encoding_enantioselectivity, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_enantioselectivity))) + self.assertEqual(len(test_encoding_enantioselectivity), 4, + "Expected 4 rows in output dataframe, got {}.".format(len(test_encoding_enantioselectivity))) + self.assertEqual(set(list(test_encoding_enantioselectivity["Index"])), set(test_aai2), + "Output index values don't match expected, got {}.".format(set(list(test_encoding_enantioselectivity["Index"])))) + for cat in list(test_encoding_enantioselectivity["Category"]): self.assertIn(cat, self.index_categories, - "Category {} not found in list of categories:\n{}".format(cat, self.index_categories)) - for col in test_encoding2.columns: - self.assertIn(col, self.expected_aai_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_encoding_output_columns)) + "Category {} not found in list of categories:\n{}.".format(cat, self.index_categories)) + for col in test_encoding_enantioselectivity.columns: + self.assertIn(col, self.expected_aai_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, 
self.expected_aai_encoding_output_columns)) + if (col == "Index" or col == "Category"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_enantioselectivity[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_enantioselectivity[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_enantioselectivity[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_enantioselectivity[col]))) #3.) - test_aai3 = ["BIGC670101", "CHOP780211", "DESM900101", "FAUJ880113", "KANM800104"] - test_encoding3 = self.test_config3.aai_encoding(aai_indices=test_aai3, sort_by="MSE", output_folder=self.test_output_folder) + test_aai3 = ["BIGC670101", "CHOP780211", "DESM900101", "FAUJ880113", "KANM800104"] #absorption dataset and config + test_encoding_absorption = self.test_config_absorption.aai_encoding(aai_indices=test_aai3, sort_by="MSE", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding3, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding3))) - self.assertEqual(len(test_encoding3), 5, - "Expected 5 rows in output dataframe, got {}.".format(len(test_encoding3))) - self.assertEqual(set(list(test_encoding3["Index"])), set(test_aai3), - "Output index values don't match expected.") - self.assertEqual(test_encoding3["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding3["Index"].dtype)) - self.assertEqual(test_encoding3["Category"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding3["Category"].dtype)) - self.assertEqual(test_encoding3["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding3["R2"].dtype)) - self.assertEqual(test_encoding3["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding3["RMSE"].dtype)) - self.assertEqual(test_encoding3["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding3["MSE"].dtype)) - self.assertEqual(test_encoding3["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding3["MAE"].dtype)) - self.assertEqual(test_encoding3["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding3["RPD"].dtype)) - self.assertEqual(test_encoding3["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding3["Explained Variance"].dtype)) - for cat in list(test_encoding3["Category"]): + self.assertIsInstance(test_encoding_absorption, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_absorption))) + self.assertEqual(len(test_encoding_absorption), 5, + "Expected 5 rows in output dataframe, got {}.".format(len(test_encoding_absorption))) + self.assertEqual(set(list(test_encoding_absorption["Index"])), set(test_aai3), + "Output index values don't match expected, got {}.".format(set(list(test_encoding_absorption["Index"])))) + for cat in list(test_encoding_absorption["Category"]): self.assertIn(cat, self.index_categories, - "Category {} not found in list of categories:\n{}".format(cat, self.index_categories)) - for col in test_encoding3.columns: - self.assertIn(col, self.expected_aai_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_encoding_output_columns)) + "Category {} 
not found in list of categories:\n{}.".format(cat, self.index_categories)) + for col in test_encoding_absorption.columns: + self.assertIn(col, self.expected_aai_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_encoding_output_columns)) + if (col == "Index" or col == "Category"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_absorption[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_absorption[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_absorption[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_absorption[col]))) #4.) - test_aai4 = [] #passing in no indices into the function will calculate all 566+ indices - test_encoding4 = self.test_config4.aai_encoding(aai_indices=test_aai4, sort_by="MAE", output_folder=self.test_output_folder) + test_aai4 = [] #passing in no indices into the function will calculate all 566+ indices - localization dataset and config + test_encoding_localization = self.test_config_localization.aai_encoding(aai_indices=test_aai4, sort_by="MAE", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding4, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding4))) - self.assertEqual(len(test_encoding4), 566, - "Expected 566 rows in output dataframe, got {}.".format(len(test_encoding4))) - self.assertEqual(set(list(test_encoding4["Index"])), set(aaindex1.record_codes()), - "Output index values don't match expected.") - self.assertEqual(test_encoding4["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding4["Index"].dtype)) - self.assertEqual(test_encoding4["Category"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding4["Category"].dtype)) - self.assertEqual(test_encoding4["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding4["R2"].dtype)) - self.assertEqual(test_encoding4["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding4["RMSE"].dtype)) - self.assertEqual(test_encoding4["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding4["MSE"].dtype)) - self.assertEqual(test_encoding4["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding4["MAE"].dtype)) - self.assertEqual(test_encoding4["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding4["RPD"].dtype)) - self.assertEqual(test_encoding4["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding4["Explained Variance"].dtype)) - for cat in list(test_encoding4["Category"]): + self.assertIsInstance(test_encoding_localization, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_localization))) + self.assertEqual(len(test_encoding_localization), 566, + "Expected 566 rows in output dataframe, got {}.".format(len(test_encoding_localization))) + self.assertEqual(set(list(test_encoding_localization["Index"])), set(aaindex1.record_codes()), + "Output index values don't match expected, got {}.".format(set(list(test_encoding_localization["Index"])))) + for cat in list(test_encoding_localization["Category"]): self.assertIn(cat, self.index_categories, - "Category {} not 
found in list of categories:\n{}".format(cat, self.index_categories)) - for col in test_encoding4.columns: - self.assertIn(col, self.expected_aai_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_encoding_output_columns)) + "Category {} not found in list of categories:\n{}.".format(cat, self.index_categories)) + for col in test_encoding_localization.columns: + self.assertIn(col, self.expected_aai_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_encoding_output_columns)) + if (col == "Index" or col == "Category"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_localization[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_localization[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_localization[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_localization[col]))) #5.) - test_aai5 = ["CHOP780211"] - test_encoding5 = self.test_config3.aai_encoding(aai_indices=test_aai5, sort_by="invalid_metric", output_folder=self.test_output_folder) #R2 will then be used as default metric - - self.assertIsInstance(test_encoding5, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding5))) - self.assertEqual(len(test_encoding5), 1, - "Expected 1 rows in output dataframe, got {}.".format(len(test_encoding5))) - self.assertEqual(set(list(test_encoding5["Index"])), set(test_aai5), - "Output index values don't match expected.") - self.assertEqual(test_encoding5["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding5["Index"].dtype)) - self.assertEqual(test_encoding5["Category"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding5["Category"].dtype)) - self.assertEqual(test_encoding5["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding5["R2"].dtype)) - self.assertEqual(test_encoding5["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding5["RMSE"].dtype)) - self.assertEqual(test_encoding5["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding5["MSE"].dtype)) - self.assertEqual(test_encoding5["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding5["MAE"].dtype)) - self.assertEqual(test_encoding5["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding5["RPD"].dtype)) - self.assertEqual(test_encoding5["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding5["Explained Variance"].dtype)) - for cat in list(test_encoding5["Category"]): - self.assertIn(cat, self.index_categories, - "Category {} not found in list of categories:\n{}".format(cat, self.index_categories)) - for col in test_encoding5.columns: - self.assertIn(col, self.expected_aai_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_encoding_output_columns)) - self.assertTrue(os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME), - "Output dir storing encoding results not found.") - self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "aaindex_results.csv")), - "Output csv storing 
encoding results not found.") -#6.) test_aai6 = "blahblah" test_aai7 = "DESM9001ZZ" with self.assertRaises(ValueError): - self.test_config1.aai_encoding(aai_indices=test_aai6, sort_by="RPD", output_folder=self.test_output_folder) - self.test_config1.aai_encoding(aai_indices=test_aai7, sort_by="RMSE", output_folder=self.test_output_folder) + self.test_config_thermostability.aai_encoding(aai_indices=test_aai6, sort_by="RPD", output_folder=self.test_output_folder) + self.test_config_enantioselectivity.aai_encoding(aai_indices=test_aai7, sort_by="RMSE", output_folder=self.test_output_folder) #7.) test_aai8 = 1234 test_aai9 = True with self.assertRaises(TypeError): - self.test_config2.aai_encoding(aai_indices=test_aai8, sort_by="MSE", output_folder=self.test_output_folder) - self.test_config3.aai_encoding(aai_indices=test_aai9, sort_by="MAE", output_folder=self.test_output_folder) + self.test_config_absorption.aai_encoding(aai_indices=test_aai8, sort_by="MSE", output_folder=self.test_output_folder) + self.test_config_localization.aai_encoding(aai_indices=test_aai9, sort_by="MAE", output_folder=self.test_output_folder) - @unittest.skip("Descriptor encoding functionality can take a lot of time, skipping.") + # @unittest.skip("Descriptor encoding functionality can take a lot of time, skipping.") def test_descriptor_encoding(self): - """ Testing Descriptor encoding functionality in Encoding module. """ + """ Testing Descriptor encoding functionality in Encoding module. """ #1.) test_desc1 = "amino_acid_composition" - test_encoding1 = self.test_config1.descriptor_encoding(descriptors=test_desc1, desc_combo=1, + test_encoding_thermostability = self.test_config_thermostability.descriptor_encoding(descriptors=test_desc1, desc_combo=1, sort_by="R2", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding1, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding1))) - self.assertEqual(len(test_encoding1), 1, - "Expected 1 rows in output dataframe, got {}.".format(len(test_encoding1))) - self.assertEqual(test_encoding1["Descriptor"].values[0], test_desc1, - "Output index values don't match expected, got {}.".format(test_encoding1["Descriptor"].values[0])) - self.assertEqual(test_encoding1["Group"].values[0], "Composition", - "Output group values don't match expected, got {}.".format(test_encoding1["Group"].values[0])) - self.assertEqual(test_encoding1["Descriptor"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding1["Descriptor"].dtype)) - self.assertEqual(test_encoding1["Group"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding1["Group"].dtype)) - self.assertEqual(test_encoding1["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding1["R2"].dtype)) - self.assertEqual(test_encoding1["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding1["RMSE"].dtype)) - self.assertEqual(test_encoding1["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding1["MSE"].dtype)) - self.assertEqual(test_encoding1["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding1["MAE"].dtype)) - self.assertEqual(test_encoding1["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding1["RPD"].dtype)) - self.assertEqual(test_encoding1["Explained Variance"].dtype, float, - "Expected Explained Variance column to be 
type float, got {}.".format(test_encoding1["Explained Variance"].dtype)) - for col in test_encoding1.columns: - self.assertIn(col, self.expected_desc_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_desc_encoding_output_columns)) + self.assertIsInstance(test_encoding_thermostability, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_thermostability))) + self.assertEqual(len(test_encoding_thermostability), 1, + "Expected 1 row in output dataframe, got {}.".format(len(test_encoding_thermostability))) + self.assertEqual(test_encoding_thermostability["Descriptor"].values[0], test_desc1, + "Output index values don't match expected, got {}.".format(test_encoding_thermostability["Descriptor"].values[0])) + self.assertEqual(test_encoding_thermostability["Group"].values[0], "Composition", + "Output group values don't match expected, got {}.".format(test_encoding_thermostability["Group"].values[0])) + for col in test_encoding_thermostability.columns: + self.assertIn(col, self.expected_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_desc_encoding_output_columns)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_thermostability[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_thermostability[col]))) #2.) test_desc2 = "moran_auto" - test_encoding2 = self.test_config2.descriptor_encoding(descriptors=test_desc2, desc_combo=1, - sort_by="MAE", output_folder=self.test_output_folder) + test_encoding_enantioselectivity = self.test_config_enantioselectivity.descriptor_encoding(descriptors=test_desc2, desc_combo=1, + sort_by="MAE", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding2, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding2))) - self.assertEqual(len(test_encoding2), 1, - "Expected 1 rows in output dataframe, got {}.".format(len(test_encoding2))) - self.assertEqual(test_encoding2["Descriptor"].values[0], test_desc2, - "Output index values don't match expected, got {}.".format(test_encoding2["Descriptor"].values[0])) - self.assertEqual(test_encoding2["Group"].values[0], "Autocorrelation", - "Output group values don't match expected, got {}.".format(test_encoding2["Group"].values[0])) - self.assertEqual(test_encoding2["Descriptor"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding2["Descriptor"].dtype)) - self.assertEqual(test_encoding2["Group"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding2["Group"].dtype)) - self.assertEqual(test_encoding2["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding2["R2"].dtype)) - self.assertEqual(test_encoding2["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding2["RMSE"].dtype)) - self.assertEqual(test_encoding2["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding2["MSE"].dtype)) - self.assertEqual(test_encoding2["MAE"].dtype, float, - "Expected MAE column to be type float, got 
{}.".format(test_encoding2["MAE"].dtype)) - self.assertEqual(test_encoding2["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding2["RPD"].dtype)) - self.assertEqual(test_encoding2["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding2["Explained Variance"].dtype)) - for col in test_encoding2.columns: - self.assertIn(col, self.expected_desc_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_desc_encoding_output_columns)) + self.assertIsInstance(test_encoding_enantioselectivity, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_enantioselectivity))) + self.assertEqual(len(test_encoding_enantioselectivity), 1, + "Expected 1 row in output dataframe, got {}.".format(len(test_encoding_enantioselectivity))) + self.assertEqual(test_encoding_enantioselectivity["Descriptor"].values[0], test_desc2, + "Output index values don't match expected, got {}.".format(test_encoding_enantioselectivity["Descriptor"].values[0])) + self.assertEqual(test_encoding_enantioselectivity["Group"].values[0], "Autocorrelation", + "Output group values don't match expected, got {}.".format(test_encoding_enantioselectivity["Group"].values[0])) + for col in test_encoding_enantioselectivity.columns: + self.assertIn(col, self.expected_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_encoding_output_columns)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_enantioselectivity[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_enantioselectivity[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_enantioselectivity[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_enantioselectivity[col]))) #3.) 
test_desc3 = ["ctd", "conjoint_triad", "dipeptide_composition"] - test_encoding3 = self.test_config2.descriptor_encoding(descriptors=test_desc3, desc_combo=1, + test_encoding_absorption = self.test_config_absorption.descriptor_encoding(descriptors=test_desc3, desc_combo=1, sort_by="MSE", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding3, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding3))) - self.assertEqual(len(test_encoding3), 3, - "Expected 3 rows in output dataframe, got {}.".format(len(test_encoding3))) - self.assertEqual(set(list(test_encoding3["Descriptor"])), set(test_desc3), - "Output index values don't match expected, got {}.".format(list(test_encoding3["Descriptor"]))) - self.assertEqual(set(list(test_encoding3["Group"])), set(["Composition", "Conjoint Triad", "CTD"]), - "Output group values don't match expected, got {}.".format(list(test_encoding3["Group"]))) - self.assertEqual(test_encoding3["Descriptor"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding3["Descriptor"].dtype)) - self.assertEqual(test_encoding3["Group"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding3["Group"].dtype)) - self.assertEqual(test_encoding3["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding3["R2"].dtype)) - self.assertEqual(test_encoding3["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding3["RMSE"].dtype)) - self.assertEqual(test_encoding3["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding3["MSE"].dtype)) - self.assertEqual(test_encoding3["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding3["MAE"].dtype)) - self.assertEqual(test_encoding3["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding3["RPD"].dtype)) - self.assertEqual(test_encoding3["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding3["Explained Variance"].dtype)) - for col in test_encoding3.columns: - self.assertIn(col, self.expected_desc_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_desc_encoding_output_columns)) + self.assertIsInstance(test_encoding_absorption, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_absorption))) + self.assertEqual(len(test_encoding_absorption), 3, + "Expected 3 rows in output dataframe, got {}.".format(len(test_encoding_absorption))) + self.assertEqual(set(list(test_encoding_absorption["Descriptor"])), set(test_desc3), + "Output index values don't match expected, got {}.".format(list(test_encoding_absorption["Descriptor"]))) + self.assertEqual(set(list(test_encoding_absorption["Group"])), set(["Composition", "Conjoint Triad", "CTD"]), + "Output group values don't match expected, got {}.".format(list(test_encoding_absorption["Group"]))) + for col in test_encoding_absorption.columns: + self.assertIn(col, self.expected_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_encoding_output_columns)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_absorption[col].values)), + "Column {} expected to be of type string got {}.".format(col, 
type(test_encoding_absorption[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_absorption[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_absorption[col]))) #4.) test_desc4 = [] #no descriptors passed into encoding function will calculate/import all descriptors for dataset - test_encoding4 = self.test_config1.descriptor_encoding(descriptors=test_desc4, desc_combo=1, + test_encoding_thermostability = self.test_config_thermostability.descriptor_encoding(descriptors=test_desc4, desc_combo=1, sort_by="RPD", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding4, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding4))) - self.assertEqual(test_encoding4["Descriptor"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding4["Descriptor"].dtype)) - self.assertEqual(test_encoding4["Group"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding4["Group"].dtype)) - self.assertEqual(test_encoding4["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding4["R2"].dtype)) - self.assertEqual(test_encoding4["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding4["RMSE"].dtype)) - self.assertEqual(test_encoding4["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding4["MSE"].dtype)) - self.assertEqual(test_encoding4["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding4["MAE"].dtype)) - self.assertEqual(test_encoding4["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding4["RPD"].dtype)) - self.assertEqual(test_encoding4["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding4["Explained Variance"].dtype)) - for group in list(test_encoding4["Group"]): + self.assertIsInstance(test_encoding_thermostability, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_thermostability))) + self.assertEqual(len(test_encoding_thermostability), 15, + "Expected 15 rows in output dataframe, got {}.".format(len(test_encoding_thermostability))) + for col in test_encoding_thermostability.columns: + self.assertIn(col, self.expected_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_desc_encoding_output_columns)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_thermostability[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_thermostability[col]))) + for group in list(test_encoding_thermostability["Group"]): self.assertIn(group, self.descriptor_groups, - "Group {} not found in list of groups:\n{}".format(group, self.descriptor_groups)) - for col in test_encoding4.columns: - self.assertIn(col, self.expected_desc_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_desc_encoding_output_columns)) - for desc in list(test_encoding4["Descriptor"]): + "Group {} not found 
in list of groups:\n{}.".format(group, self.descriptor_groups)) + for desc in list(test_encoding_thermostability["Descriptor"]): self.assertIn(desc, self.valid_descriptors, - "Descriptor {} not found in list of available descriptors:\n{}".format(desc, self.valid_descriptors)) + "Descriptor {} not found in list of available descriptors:\n{}.".format(desc, self.valid_descriptors)) self.assertTrue(os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME), "Output dir storing encoding results not found.") self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "desc_results.csv")), "Output csv storing encoding results not found.") #5.) invalid_test_desc5 = "invalid_descriptor_name" + invalid_test_desc6 = "blahblahblah" with self.assertRaises(ValueError): - self.test_config1.descriptor_encoding(descriptors=invalid_test_desc5, desc_combo=1, sort_by="Explained Variance") + self.test_config_thermostability.descriptor_encoding(descriptors=invalid_test_desc5, desc_combo=1, sort_by="MSE") + self.test_config_enantioselectivity.descriptor_encoding(descriptors=invalid_test_desc6, desc_combo=1, sort_by="RMSE") #6.) - invalid_test_desc6 = 12345 - invalid_test_desc7 = True + invalid_test_desc7 = 12345 + invalid_test_desc8 = True with self.assertRaises(TypeError): - self.test_config1.descriptor_encoding(descriptors=invalid_test_desc6, desc_combo=1, sort_by="MAE") - self.test_config1.descriptor_encoding(descriptors=invalid_test_desc7, desc_combo=1, sort_by="RMSE") + self.test_config_absorption.descriptor_encoding(descriptors=invalid_test_desc7, desc_combo=1, sort_by="MAE") + self.test_config_localization.descriptor_encoding(descriptors=invalid_test_desc8, desc_combo=1, sort_by="RPD") # @unittest.skip("AAI + Descriptor encoding functionality can take a lot of time, skipping.") def test_aai_descriptor_encoding(self): - """ Testing AAI + Descriptor encoding functionality in Encoding module. """ + """ Testing AAI + Descriptor encoding functionality in Encoding module. """ #1.) 
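Case 1 below pairs a single AAIndex record with a single descriptor. As a hedged sketch of the call (same assumed `Encoding` construction as in the earlier sketch; the index/descriptor values and keyword names are taken from the test itself):

```python
#Hypothetical sketch of the AAI + descriptor encoding checked in case 1 below.
from pySAR.encoding import Encoding

encoding = Encoding("config/thermostability.json")  #assumed constructor usage

#one AAI index combined with one descriptor -> a single-row results dataframe
aai_desc_results = encoding.aai_descriptor_encoding(
    aai_indices="FAUJ880110",
    descriptors="tripeptide_composition",
    desc_combo=1,
    sort_by="R2",
    output_folder="aai_desc_encoding_output")

#Index/Category describe the AAI record, Descriptor/Group the descriptor
print(aai_desc_results[["Index", "Category", "Descriptor", "Group", "R2"]])
```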
- test_aai1 = "FAUJ880110" - test_desc1 = "ctd" - test_encoding1 = self.test_config1.aai_descriptor_encoding(aai_indices=test_aai1, descriptors=test_desc1, - desc_combo=1, sort_by="R2", output_folder=self.test_output_folder) + test_aai1 = "FAUJ880110" #thermostability + test_desc1 = "tripeptide_composition" + test_encoding_thermostability = self.test_config_thermostability.aai_descriptor_encoding(aai_indices=test_aai1, descriptors=test_desc1, + desc_combo=1, sort_by="R2", output_folder=self.test_output_folder) - print("test_encoding1") - print(test_encoding1) - self.assertIsInstance(test_encoding1, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding1))) - self.assertEqual(len(test_encoding1), 1, - "Expected 1 rows in output dataframe, got {}.".format(len(test_encoding1))) - self.assertEqual(test_encoding1["Index"].values[0], test_aai1, - "Output index values don't match expected, got {}.".format(test_encoding1["Index"].values[0])) - self.assertEqual(test_encoding1["Category"].values[0], "geometry", - "Output group values don't match expected, got {}.".format(test_encoding1["Group"].values[0])) - self.assertEqual(test_encoding1["Descriptor"].values[0], test_desc1, - "Output index values don't match expected, got {}.".format(test_encoding1["Descriptor"].values[0])) - self.assertEqual(test_encoding1["Group"].values[0], "CTD", - "Output group values don't match expected, got {}.".format(test_encoding1["Group"].values[0])) - self.assertEqual(test_encoding1["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding1["Index"].dtype)) - self.assertEqual(test_encoding1["Category"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding1["Category"].dtype)) - self.assertEqual(test_encoding1["Descriptor"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding1["Descriptor"].dtype)) - self.assertEqual(test_encoding1["Group"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding1["Group"].dtype)) - self.assertEqual(test_encoding1["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding1["R2"].dtype)) - self.assertEqual(test_encoding1["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding1["RMSE"].dtype)) - self.assertEqual(test_encoding1["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding1["MSE"].dtype)) - self.assertEqual(test_encoding1["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding1["MAE"].dtype)) - self.assertEqual(test_encoding1["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding1["RPD"].dtype)) - self.assertEqual(test_encoding1["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding1["Explained Variance"].dtype)) - for col in test_encoding1.columns: - self.assertIn(col, self.expected_aai_desc_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_desc_encoding_output_columns)) + self.assertIsInstance(test_encoding_thermostability, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_thermostability))) + self.assertEqual(len(test_encoding_thermostability), 1, + "Expected 1 row in output dataframe, got 
{}.".format(len(test_encoding_thermostability))) + self.assertEqual(test_encoding_thermostability["Index"].values[0], test_aai1, + "Output index values don't match expected, got {}.".format(test_encoding_thermostability["Index"].values[0])) + self.assertEqual(test_encoding_thermostability["Category"].values[0], "geometry", + "Output group values don't match expected, got {}.".format(test_encoding_thermostability["Group"].values[0])) + self.assertEqual(set(list(test_encoding_thermostability["Descriptor"].values)), {"tripeptide_composition"}, + "Output descriptor column values don't match expected, got\n{}.".format(test_encoding_thermostability["Descriptor"])) + self.assertEqual(test_encoding_thermostability["Group"].values[0], "Composition", + "Output group values don't match expected, got {}.".format(test_encoding_thermostability["Group"].values[0])) + for col in test_encoding_thermostability.columns: + self.assertIn(col, self.expected_aai_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_desc_encoding_output_columns)) + if (col == "Index" or col == "Category" or col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_thermostability[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_thermostability[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_thermostability[col]))) #2.) - test_aai2 = "BIGC670101, DAYM780201" - test_desc2 = ["tripeptide_composition", "quasi_sequence_order", "sequence_order_coupling_number"] - test_encoding2 = self.test_config2.aai_descriptor_encoding(aai_indices=test_aai2, descriptors=test_desc2, + test_aai2 = "BIGC670101, DAYM780201" #enantioselectivity + test_desc2 = ["ctd", "quasi_sequence_order", "sequence_order_coupling_number"] + test_encoding_enantioselectivity = self.test_config_enantioselectivity.aai_descriptor_encoding(aai_indices=test_aai2, descriptors=test_desc2, desc_combo=1, sort_by="MSE", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding2, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding2))) - self.assertEqual(len(test_encoding2), 6, - "Expected 6 rows in output dataframe, got {}.".format(len(test_encoding2))) - self.assertEqual(set(list(test_encoding2["Index"])), set(test_aai2.replace(' ', '').split(',')), - "Expected index column to be type string, got {}.".format(test_encoding2["Index"].dtype)) - self.assertEqual(set(list(test_encoding2["Descriptor"])), set(test_desc2), - "Output index values don't match expected.") - self.assertEqual(test_encoding2["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding2["Index"].dtype)) - self.assertEqual(test_encoding2["Category"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding2["Category"].dtype)) - self.assertEqual(test_encoding2["Descriptor"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding2["Descriptor"].dtype)) - self.assertEqual(test_encoding2["Group"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding2["Group"].dtype)) - self.assertEqual(test_encoding2["R2"].dtype, float, - "Expected R2 column to be type float, got 
{}.".format(test_encoding2["R2"].dtype)) - self.assertEqual(test_encoding2["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding2["RMSE"].dtype)) - self.assertEqual(test_encoding2["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding2["MSE"].dtype)) - self.assertEqual(test_encoding2["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding2["MAE"].dtype)) - self.assertEqual(test_encoding2["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding2["RPD"].dtype)) - self.assertEqual(test_encoding2["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding2["Explained Variance"].dtype)) - for cat in list(test_encoding2["Category"]): - self.assertIn(cat, self.index_categories, - "Category {} not found in list of categories:\n{}".format(cat, self.index_categories)) - for group in list(test_encoding2["Group"]): - self.assertIn(group, self.descriptor_groups, - "Group {} not found in list of groups:\n{}".format(group, self.descriptor_groups)) - for col in test_encoding2.columns: - self.assertIn(col, self.expected_aai_desc_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_desc_encoding_output_columns)) + self.assertIsInstance(test_encoding_enantioselectivity, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_enantioselectivity))) + self.assertEqual(len(test_encoding_enantioselectivity), 6, + "Expected 6 rows in output dataframe, got {}.".format(len(test_encoding_enantioselectivity))) + self.assertEqual(set(list(test_encoding_enantioselectivity["Index"])), set(test_aai2.replace(' ', '').split(',')), + "Output Index column does not match expected, got\n{}.".format(test_encoding_enantioselectivity["Index"])) + self.assertEqual(set(list(test_encoding_enantioselectivity["Category"].values)), {'composition', 'geometry'}, + "Output category values don't match expected, got {}.".format(test_encoding_enantioselectivity["Category"].values)) + self.assertEqual(set(list(test_encoding_enantioselectivity["Descriptor"])), set(test_desc2), + "Output descriptor column values don't match expected, got\n{}.".format(test_encoding_enantioselectivity["Descriptor"])) + self.assertEqual(set(list(test_encoding_enantioselectivity["Group"].values)), {"Sequence Order", "CTD"}, + "Output group values don't match expected, got {}.".format(test_encoding_enantioselectivity["Group"].values)) + for col in test_encoding_enantioselectivity.columns: + self.assertIn(col, self.expected_aai_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_desc_encoding_output_columns)) + if (col == "Index" or col == "Category" or col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_enantioselectivity[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_enantioselectivity[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_enantioselectivity[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_enantioselectivity[col]))) #3.) 
- test_aai3 = "GEOR030107, KARS160113, COWR900101" + test_aai3 = "GEOR030107, KARS160113, COWR900101" #absorption test_desc3 = ["amino_acid_composition", "ctd_distribution"] - test_encoding3 = self.test_config3.aai_descriptor_encoding(aai_indices=test_aai3, descriptors=test_desc3, - desc_combo=1, sort_by="MSE", output_folder=self.test_output_folder) #** + test_encoding_absorption = self.test_config_absorption.aai_descriptor_encoding(aai_indices=test_aai3, descriptors=test_desc3, + desc_combo=1, sort_by="MSE", output_folder=self.test_output_folder) - self.assertIsInstance(test_encoding3, pd.DataFrame, - "Expected output to be a dataframe, got {}.".format(type(test_encoding3))) - self.assertEqual(len(test_encoding3), 6, - "Expected 6 rows in output dataframe, got {}.".format(len(test_encoding3))) #** - self.assertEqual(set(list(test_encoding3["Index"])), set(test_aai3.replace(' ', '').split(',')), - "Expected index column to be type string, got {}.".format(list(test_encoding3["Index"]))) - self.assertEqual(set(list(test_encoding3["Descriptor"])), set(test_desc3), - "Output index values don't match expected.") - self.assertEqual(test_encoding3["Index"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding3["Index"].dtype)) - self.assertEqual(test_encoding3["Category"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding3["Category"].dtype)) - self.assertEqual(test_encoding3["Descriptor"].dtype, "string[python]", - "Expected index column to be type string, got {}.".format(test_encoding3["Descriptor"].dtype)) - self.assertEqual(test_encoding3["Group"].dtype, "string[python]", - "Expected category column to be type string, got {}.".format(test_encoding3["Group"].dtype)) - self.assertEqual(test_encoding3["R2"].dtype, float, - "Expected R2 column to be type float, got {}.".format(test_encoding3["R2"].dtype)) - self.assertEqual(test_encoding3["RMSE"].dtype, float, - "Expected RMSE column to be type float, got {}.".format(test_encoding3["RMSE"].dtype)) - self.assertEqual(test_encoding3["MSE"].dtype, float, - "Expected MSE column to be type float, got {}.".format(test_encoding3["MSE"].dtype)) - self.assertEqual(test_encoding3["MAE"].dtype, float, - "Expected MAE column to be type float, got {}.".format(test_encoding3["MAE"].dtype)) - self.assertEqual(test_encoding3["RPD"].dtype, float, - "Expected RPD column to be type float, got {}.".format(test_encoding3["RPD"].dtype)) - self.assertEqual(test_encoding3["Explained Variance"].dtype, float, - "Expected Explained Variance column to be type float, got {}.".format(test_encoding3["Explained Variance"].dtype)) - for cat in list(test_encoding3["Category"]): - self.assertIn(cat, self.index_categories, - "Category {} not found in list of categories:\n{}".format(cat, self.index_categories)) - for group in list(test_encoding3["Group"]): - self.assertIn(group, self.descriptor_groups, - "Group {} not found in list of groups:\n{}".format(group, self.descriptor_groups)) - for col in test_encoding3.columns: - self.assertIn(col, self.expected_aai_desc_encoding_output_columns, - "Column {} not found in list of column:\n{}".format(col, self.expected_aai_desc_encoding_output_columns)) + self.assertIsInstance(test_encoding_absorption, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_absorption))) + self.assertEqual(len(test_encoding_absorption), 6, + "Expected 6 rows in output dataframe, got {}.".format(len(test_encoding_absorption))) + 
self.assertEqual(set(list(test_encoding_absorption["Index"])), set(test_aai3.replace(' ', '').split(',')), + "Output Index column does not match expected, got\n{}.".format(test_encoding_absorption["Index"])) + self.assertEqual(set(list(test_encoding_absorption["Category"].values)), {'hydrophobic', 'meta', 'sec_struct'}, + "Output category values don't match expected, got {}.".format(test_encoding_absorption["Category"].values)) + self.assertEqual(set(list(test_encoding_absorption["Descriptor"])), set(test_desc3), + "Output descriptor column values don't match expected, got\n{}.".format(test_encoding_absorption["Descriptor"])) + self.assertEqual(set(list(test_encoding_absorption["Group"].values)), {"Composition", "CTD"}, + "Output group values don't match expected, got {}.".format(test_encoding_absorption["Group"].values)) + for col in test_encoding_absorption.columns: + self.assertIn(col, self.expected_aai_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_desc_encoding_output_columns)) + if (col == "Index" or col == "Category" or col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_absorption[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_encoding_absorption[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_absorption[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_absorption[col]))) +#4.) + test_aai4 = ["BEGF750103", "CIDH920103", "JOND920101"] #localization + test_desc4 = ["dipeptide_composition", "ctd_transition"] + test_encoding_localization = self.test_config_localization.aai_descriptor_encoding(aai_indices=test_aai4, descriptors=test_desc4, + desc_combo=1, sort_by="MSE", output_folder=self.test_output_folder) + + self.assertIsInstance(test_encoding_localization, pd.DataFrame, + "Expected output to be a dataframe, got {}.".format(type(test_encoding_localization))) + self.assertEqual(len(test_encoding_localization), 6, + "Expected 6 rows in output dataframe, got {}.".format(len(test_encoding_localization))) + self.assertEqual(set(list(test_encoding_localization["Index"])), set(test_aai4), + "Output Index column does not match expected, got\n{}.".format(test_encoding_localization["Index"])) + self.assertEqual(set(list(test_encoding_localization["Category"].values)), {"sec_struct", "composition", "hydrophobic"}, + "Output category values don't match expected, got {}.".format(test_encoding_localization["Category"].values)) + self.assertEqual(set(list(test_encoding_localization["Descriptor"])), {"dipeptide_composition", "ctd_transition"}, + "Output descriptor column values don't match expected, got\n{}.".format(test_encoding_localization["Descriptor"])) + self.assertEqual(set(list(test_encoding_localization["Group"].values)), {"Composition", "CTD"}, + "Output group values don't match expected, got {}.".format(test_encoding_localization["Group"].values)) + for col in test_encoding_localization.columns: + self.assertIn(col, self.expected_aai_desc_encoding_output_columns, + "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_desc_encoding_output_columns)) + if (col == "Index" or col == "Category" or col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_localization[col].values)), + "Column {} expected to be of type string got {}.".format(col, 
type(test_encoding_localization[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_localization[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_localization[col]))) self.assertTrue(os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME), "Output dir storing encoding results not found.") self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "aai_desc_results.csv")), "Output csv storing encoding results not found.") -#4.) - test_aai4 = ["invalid_aai_index"] - test_aai5 = "" - test_desc4 = ["invalid_descriptor_name"] +#5.) + test_aai5 = ["invalid_aai_index"] + test_aai6 = "" + test_desc5 = ["invalid_descriptor_name"] with self.assertRaises(ValueError): - self.test_config1.aai_descriptor_encoding(aai_indices=test_aai4, descriptors=test_desc4, desc_combo=1, sort_by="MSE") - self.test_config1.aai_descriptor_encoding(aai_indices=test_aai5, descriptors=test_desc4, desc_combo=1, sort_by="MSE") + self.test_config_thermostability.aai_descriptor_encoding(aai_indices=test_aai5, descriptors=test_desc5, desc_combo=1, sort_by="MSE") + self.test_config_enantioselectivity.aai_descriptor_encoding(aai_indices=test_aai6, descriptors=test_desc5, desc_combo=1, sort_by="MSE") #6.) - test_aai6 = 12345 - test_desc5 = 1000 + test_aai7 = 12345 + test_desc6 = 1000 + test_desc7 = False with self.assertRaises(TypeError): - self.test_config1.aai_descriptor_encoding(aai_indices=test_aai6, descriptors=test_desc5, desc_combo=1, sort_by="MAE") + self.test_config_absorption.aai_descriptor_encoding(aai_indices=test_aai7, descriptors=test_desc6, desc_combo=1, sort_by="MAE") + self.test_config_localization.aai_descriptor_encoding(aai_indices=test_aai7, descriptors=test_desc7, desc_combo=1, sort_by="MAE") #7.) 
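Case 7 below stays commented out for good reason: empty `aai_indices` and `descriptors` lists mean every AAIndex record gets paired with every descriptor, i.e. 566 × 15 = 8490 model evaluations (the row count the earlier version of this test asserted). A hedged sketch of that exhaustive invocation, reusing the assumed `Encoding` setup from the earlier sketches:

```python
#Hypothetical sketch of the exhaustive run that case 7 below leaves commented out.
from pySAR.encoding import Encoding

encoding = Encoding("config/thermostability.json")  #assumed constructor usage

#empty lists -> all 566 AAIndex records crossed with all 15 descriptors
all_results = encoding.aai_descriptor_encoding(
    aai_indices=[],
    descriptors=[],
    desc_combo=1,
    sort_by="MAE",
    output_folder="aai_desc_encoding_output")

assert len(all_results) == 8490  #566 indices * 15 descriptors
```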
** Below inputs result in all AAI Indices being encoded with all descriptors, commenting out due to time and resource constraints ** - # test_aai7 = [] - # test_desc6 = [] - # test_encoding7 = self.test_config1.aai_descriptor_encoding(aai_indices=test_aai7, descriptors=test_desc6, + # test_aai8 = [] + # test_desc7 = [] + # test_encoding_thermostability = self.test_config_thermostability.aai_descriptor_encoding(aai_indices=test_aai8, descriptors=test_desc7, # desc_combo=1, sort_by="MAE", output_folder=self.test_output_folder) - # self.assertIsInstance(test_encoding7, pd.DataFrame, - # "Expected output to be a dataframe, got {}.".format(type(test_encoding7))) - # self.assertEqual(len(test_encoding7), 8490, - # "Expected 8490 rows in output dataframe, got {}.".format(len(test_encoding7))) - # self.assertEqual(list(test_encoding7["Index"]), test_aai3, - # "Expected index column to be type string, got {}.".format(test_encoding7["Index"].dtype)) - # self.assertEqual(list(test_encoding7["Descriptor"]), test_desc1, + # self.assertIsInstance(test_encoding_thermostability, pd.DataFrame, + # "Expected output to be a dataframe, got {}.".format(type(test_encoding_thermostability))) + # self.assertEqual(len(test_encoding_thermostability), 8490, + # "Expected 8490 rows in output dataframe, got {}.".format(len(test_encoding_thermostability))) + # self.assertEqual(set(list(test_encoding_thermostability["Index"])), set(test_aai3.replace(' ', '').split(',')), + # "Expected index column to be type string, got {}.".format(test_encoding_thermostability["Index"].dtype)) + # self.assertEqual(test_encoding_thermostability["Category"].values[0], ["Composition", "CTD"], + # "Output category values don't match expected, got {}.".format(test_encoding_thermostability["Category"].values[0])) #** + # self.assertEqual(set(list(test_encoding_thermostability["Descriptor"])), set(test_desc3), # "Output index values don't match expected.") - # self.assertEqual(test_encoding7["Index"].dtype, "string[python]", - # "Expected index column to be type string, got {}.".format(test_encoding7["Index"].dtype)) - # self.assertEqual(test_encoding7["Category"].dtype, "string[python]", - # "Expected index column to be type string, got {}.".format(test_encoding7["Category"].dtype)) - # self.assertEqual(test_encoding7["Descriptor"].dtype, "string[python]", - # "Expected index column to be type string, got {}.".format(test_encoding7["Descriptor"].dtype)) - # self.assertEqual(test_encoding7["Group"].dtype, "string[python]", - # "Expected category column to be type string, got {}.".format(test_encoding7["Group"].dtype)) - # self.assertEqual(test_encoding7["R2"].dtype, float, - # "Expected R2 column to be type float, got {}.".format(test_encoding7["R2"].dtype)) - # self.assertEqual(test_encoding7["RMSE"].dtype, float, - # "Expected RMSE column to be type float, got {}.".format(test_encoding7["RMSE"].dtype)) - # self.assertEqual(test_encoding7["MSE"].dtype, float, - # "Expected MSE column to be type float, got {}.".format(test_encoding7["MSE"].dtype)) - # self.assertEqual(test_encoding7["MAE"].dtype, float, - # "Expected MAE column to be type float, got {}.".format(test_encoding7["MAE"].dtype)) - # self.assertEqual(test_encoding7["RPD"].dtype, float, - # "Expected RPD column to be type float, got {}.".format(test_encoding7["RPD"].dtype)) - # self.assertEqual(test_encoding7["Explained Variance"].dtype, float, - # "Expected Explained Variance column to be type float, got {}.".format(test_encoding5["Explained Variance"].dtype)) - # for cat in 
list(test_encoding7["Category"]): - # self.assertIn(cat, self.index_categories, - # "Category {} not found in list of categories:\n{}".format(cat, self.index_categories)) - # for group in list(test_encoding7["Group"]): + # self.assertEqual(test_encoding_thermostability["Group"].values[0], ["Composition", "CTD"], + # "Output group values don't match expected, got {}.".format(test_encoding_thermostability["Group"].values[0])) + # for col in test_encoding_thermostability.columns: + # self.assertIn(col, self.expected_aai_desc_encoding_output_columns, + # "Col {} not found in list of expected columns:\n{}.".format(col, self.expected_aai_desc_encoding_output_columns)) + # if (col == "Index" or col == "Category" or col == "Descriptor" or col == "Group"): + # self.assertTrue(all(isinstance(row, str) for row in list(test_encoding_thermostability[col].values)), + # "Column {} expected to be of type string got {}.".format(col, type(test_encoding_thermostability[col]))) + # else: + # self.assertTrue(all(isinstance(row, np.float64) for row in list(test_encoding_thermostability[col].values)), + # "Column {} expected to be of type np.float64 got {}.".format(col, type(test_encoding_thermostability[col]))) + # for group in list(test_encoding_thermostability["Group"]): # self.assertIn(group, self.descriptor_groups, - # "Group {} not found in list of groups:\n{}".format(group, self.descriptor_groups)) - # for col in test_encoding7.columns: - # self.assertIn(col, self.expected_aai_desc_encoding_output_columns, - # "Column {} not found in list of column:\n{}".format(col, self.expected_aai_desc_encoding_output_columns)) + # "Group {} not found in list of groups:\n{}.".format(group, self.descriptor_groups)) + # for desc in list(test_encoding_thermostability["Descriptor"]): + # self.assertIn(desc, self.valid_descriptors, + # "Descriptor {} not found in list of available descriptors:\n{}.".format(desc, self.valid_descriptors)) + # self.assertTrue(os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME), + # "Output dir storing encoding results not found.") + # self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "aai_desc_results.csv")), + # "Output csv storing encoding results not found.") def tearDown(self): """ Delete any temp files or folders created during testing process. 
""" diff --git a/tests/test_model.py b/tests/test_model.py index 1b8158c..82067aa 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -7,11 +7,11 @@ import numpy as np import shutil #### Suppress Sklearn warnings #### -def warn(*args, **kwargs): - pass import warnings -warnings.warn = warn -warnings.filterwarnings("ignore",category=DeprecationWarning) +# def warn(*args, **kwargs): +# pass +# warnings.warn = warn +warnings.filterwarnings("ignore", category=DeprecationWarning) from sklearn.cross_decomposition import PLSRegression from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor from sklearn.linear_model import Lasso @@ -53,7 +53,8 @@ def setUp(self): self.dummy_Y_2 = np.random.randint(20, size=50) self.dummy_Y_2D = np.random.ranf((50,1)) #50 sequences - self.test_folder = os.path.join('tests', 'test_model_output') #test model folder + #test model folder + self.test_folder = os.path.join('tests', 'test_model_output') os.mkdir(self.test_folder) def test_model(self): @@ -62,17 +63,16 @@ def test_model(self): 'BaggingRegressor', 'DecisionTreeRegressor', 'LinearRegression',\ 'Lasso', 'SVR', 'KNeighborsRegressor', 'GradientBoostingRegressor', 'Ridge'] - #iterate through all available algorithms and test them + #iterate through all available algorithms/models and test each for test_mod in range(0, len(test_models)): model = Model(self.dummy_X, self.dummy_Y, test_models[test_mod]) #1.) #checking model object is of the correct sklearn model datatype self.assertEqual(type(model.model).__name__, test_models[test_mod], - 'Model type is not correct, expected {}, got {}.'.format( - test_models[test_mod], type(model.model).__name__)) + 'Model type is not correct, expected {}, got {}.'.format(test_models[test_mod], type(model.model).__name__)) #2.) #assert that model has not been fitted - self.assertFalse(model.model_fitted(), 'Model should not be fitted on initialisation') + self.assertFalse(model.model_fitted(), 'Model should not be fitted on initialisation.') #3.) #verify that parameters input param = {} meaning the default params for the model are used self.assertEqual(model.parameters, {}, 'Default Parameters attribute should be an empty dict, but got {}.'.format(model.parameters)) @@ -81,94 +81,70 @@ def test_model(self): 'Default test split attribute should be 0.2, but got {}.'.format(model.test_split)) #5.) #verify that input model type is a valid model for the class self.assertTrue(model.algorithm in [item.lower() for item in model.valid_models], - 'Input algorithm {} not in available algorithms:\n {}'.format(model.algorithm, model.valid_models)) + 'Input algorithm {} not in available algorithms:\n{}.'.format(model.algorithm, model.valid_models)) #6.) #verify repr representation of model object is correct self.assertEqual(repr(model), test_models[test_mod], 'Repr function expected to be {}, but got {}.'.format(test_models[test_mod], repr(model))) #7.) #verify algorithm is a regression - self.assertTrue(sklearn.base.is_regressor(model.model), - 'Model type should be a sklearn regressor.') + self.assertTrue(sklearn.base.is_regressor(model.model), 'Model type should be a sklearn regressor.') #8.) 
if (self.dummy_X.ndim == 1): model = Model(self.dummy_X, self.dummy_Y, 'plsreg', parameters={'n_components': 1}) model.train_test_split() model.fit() - self.assertTrue(model.model_fitted(), 'Model has not been fitted') + self.assertTrue(model.model_fitted(), 'Model should be fitted after fit() is called.') def test_model_input_closeness(self): """ Test case for testing the algorithm closeness function used to get the closest available algorithm to the algorithm input into the class. """ #1.) model = Model(self.dummy_X, self.dummy_Y, 'plsreg') - self.assertEqual(model.algorithm, "plsregression", - "Expected algorithm to be plsregression, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "PLSRegression", - "Expected representation of model object to be PLSRegression, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "plsregression", "Expected algorithm to be plsregression, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "PLSRegression", "Expected representation of model object to be PLSRegression, got {}.".format(repr(model))) #2.) model = Model(self.dummy_X, self.dummy_Y, 'randomfor') - self.assertEqual(model.algorithm, "randomforestregressor", - "Expected algorithm to be randomforestregressor, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "RandomForestRegressor", - "Expected representation of model object to be RandomForestRegressor, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "randomforestregressor", "Expected algorithm to be randomforestregressor, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "RandomForestRegressor", "Expected representation of model object to be RandomForestRegressor, got {}.".format(repr(model))) #3.) model = Model(self.dummy_X, self.dummy_Y, 'adaboo') - self.assertEqual(model.algorithm, "adaboostregressor", - "Expected algorithm to be adaboostregressor, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "AdaBoostRegressor", - "Expected representation of model object to be AdaBoostRegressor, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "adaboostregressor", "Expected algorithm to be adaboostregressor, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "AdaBoostRegressor", "Expected representation of model object to be AdaBoostRegressor, got {}.".format(repr(model))) #4.) model = Model(self.dummy_X, self.dummy_Y, 'bagging') - self.assertEqual(model.algorithm, "baggingregressor", - "Expected algorithm to be baggingregressor, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "BaggingRegressor") + self.assertEqual(model.algorithm, "baggingregressor", "Expected algorithm to be baggingregressor, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "BaggingRegressor", "Expected representation of model object to be BaggingRegressor, got {}.".format(repr(model))) #5.) 
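The closeness cases below assert that abbreviated names resolve to the nearest valid algorithm. This diff doesn't show how pySAR implements that resolution; purely as an illustration of the technique, the standard library's `difflib.get_close_matches` achieves a similar effect (a hypothetical stand-in, not pySAR's actual code):

```python
import difflib

#illustrative fuzzy algorithm-name resolution - NOT pySAR's actual implementation
valid_models = ["plsregression", "randomforestregressor", "adaboostregressor",
                "baggingregressor", "decisiontreeregressor", "linearregression",
                "lasso", "svr", "knearestneighbors", "gradientboostingregressor",
                "ridge", "sgd"]

def closest_model(name, cutoff=0.4):
    matches = difflib.get_close_matches(name.lower(), valid_models, n=1, cutoff=cutoff)
    if not matches:
        raise ValueError("No valid algorithm found close to input: {}.".format(name))
    return matches[0]

print(closest_model("decisiontree"))  #decisiontreeregressor
print(closest_model("lass"))          #lasso
```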
model = Model(self.dummy_X, self.dummy_Y, 'decisiontree') - self.assertEqual(model.algorithm, "decisiontreeregressor", - "Expected algorithm to be decisiontreeregressor, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "DecisionTreeRegressor", - "Expected representation of model object to be DecisionTreeRegressor, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "decisiontreeregressor", "Expected algorithm to be decisiontreeregressor, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "DecisionTreeRegressor", "Expected representation of model object to be DecisionTreeRegressor, got {}.".format(repr(model))) #6.) model = Model(self.dummy_X, self.dummy_Y, 'linear') - self.assertEqual(model.algorithm, "linearregression", - "Expected algorithm to be linearregression, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "LinearRegression", - "Expected representation of model object to be LinearRegression, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "linearregression", "Expected algorithm to be linearregression, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "LinearRegression", "Expected representation of model object to be LinearRegression, got {}.".format(repr(model))) #7.) model = Model(self.dummy_X, self.dummy_Y, 'lass') - self.assertEqual(model.algorithm, "lasso", - "Expected algorithm to be lasso, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "Lasso", - "Expected representation of model object to be Lasso, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "lasso", "Expected algorithm to be lasso, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "Lasso", "Expected representation of model object to be Lasso, got {}.".format(repr(model))) #8.) model = Model(self.dummy_X, self.dummy_Y, 'kneighbors') - self.assertEqual(model.algorithm, "knearestneighbors", - "Expected algorithm to be knearestneighbors, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "KNeighborsRegressor", - "Expected representation of model object to be KNeighborsRegressor, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "knearestneighbors", "Expected algorithm to be knearestneighbors, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "KNeighborsRegressor", "Expected representation of model object to be KNeighborsRegressor, got {}.".format(repr(model))) #9.) model = Model(self.dummy_X, self.dummy_Y, 'sv') - self.assertEqual(model.algorithm, "svr", - "Expected algorithm to be svr, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "SVR", - "Expected representation of model object to be SVR, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "svr", "Expected algorithm to be svr, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "SVR", "Expected representation of model object to be SVR, got {}.".format(repr(model))) #10.) model = Model(self.dummy_X, self.dummy_Y, 'rid') - self.assertEqual(model.algorithm, "ridge", - "Expected algorithm to be ridge, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "Ridge", - "Expected representation of model object to be Ridge, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "ridge", "Expected algorithm to be ridge, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "Ridge", "Expected representation of model object to be Ridge, got {}.".format(repr(model))) #11.) 
model = Model(self.dummy_X, self.dummy_Y, 'gbr') - self.assertEqual(model.algorithm, "gbr", - "Expected algorithm to be gbr, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "GradientBoostingRegressor", - "Expected representation of model object to be GradientBoostingRegressor, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "gbr", "Expected algorithm to be gbr, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "GradientBoostingRegressor", "Expected representation of model object to be GradientBoostingRegressor, got {}.".format(repr(model))) #12.) model = Model(self.dummy_X, self.dummy_Y, 'sg') - self.assertEqual(model.algorithm, "sgd", - "Expected algorithm to be sgd, got {}.".format(model.algorithm)) - self.assertEqual(repr(model), "SGDRegressor", - "Expected representation of model object to be SGDRegressor, got {}".format(repr(model))) + self.assertEqual(model.algorithm, "sgd", "Expected algorithm to be sgd, got {}.".format(model.algorithm)) + self.assertEqual(repr(model), "SGDRegressor", "Expected representation of model object to be SGDRegressor, got {}.".format(repr(model))) #13.) with self.assertRaises(ValueError, msg='Value Error raised, invalid model/algorithm name given.'): Model(self.dummy_X, self.dummy_Y, 'abcdefg') @@ -277,7 +253,7 @@ def test_parameters(self): for k, v in model.model.get_params().items(): self.assertIn(k, list(pls_model.get_params()), - "Parameter {} should be in list of parameters:\n{}".format(k, list(pls_model.get_params()))) + "Parameter {} not found in list of available parameters:\n{}.".format(k, list(pls_model.get_params()))) #2.) rf_parameters = {"n_estimators": 200, "max_depth": 50, "min_samples_split": 10} model = Model(self.dummy_X_2, self.dummy_Y_2, algorithm="RandomForest", parameters=rf_parameters) @@ -285,7 +261,7 @@ def test_parameters(self): for k, v in model.model.get_params().items(): self.assertIn(k, list(rf_model.get_params()), - "Parameter {} should be in list of parameters:\n{}".format(k, list(rf_model.get_params()))) + "Parameter {} not found in list of available parameters:\n{}.".format(k, list(rf_model.get_params()))) #3.) knn_parameters = {"n_neighbors": 10, "weights": "distance", "algorithm": "ball_tree"} model = Model(self.dummy_X_2, self.dummy_Y_2, algorithm="KNN", parameters=knn_parameters) @@ -293,7 +269,7 @@ def test_parameters(self): for k, v in model.model.get_params().items(): self.assertIn(k, list(knn_model.get_params()), - "Parameter {} should be in list of parameters:\n{}".format(k, list(knn_model.get_params()))) + "Parameter {} not found in list of available parameters:\n{}.".format(k, list(knn_model.get_params()))) #4.) svr_parameters = {"kernel": "poly", "degree": 5, "coef0": 1} model = Model(self.dummy_X_2, self.dummy_Y_2, algorithm="SVR",parameters=svr_parameters) @@ -301,7 +277,7 @@ def test_parameters(self): for k, v in model.model.get_params().items(): self.assertIn(k, list(svr_model.get_params()), - "Parameter {} should be in list of parameters:\n{}".format(k, list(svr_model.get_params()))) + "Parameter {} not found in list of available parameters:\n{}.".format(k, list(svr_model.get_params()))) #5.) 
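+ #note: the parameters dict is passed through to the underlying sklearn estimator, so each key should appear in the estimator's get_params() output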
ada_parameters = {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"} model = Model(self.dummy_X_2, self.dummy_Y_2, algorithm="AdaBoost", parameters=ada_parameters) @@ -309,7 +285,7 @@ def test_parameters(self): for k, v in model.model.get_params().items(): self.assertIn(k, list(ada_model.get_params()), - "Parameter {} should be in list of parameters:\n{}".format(k, list(ada_model.get_params()))) + "Parameter {} not found in list of available parameters:\n{}.".format(k, list(ada_model.get_params()))) #6.) bagging_parameters = {"n_estimators": 50, "max_samples": 1.5, "max_features": 2} model = Model(self.dummy_X_2, self.dummy_Y_2, algorithm="Bagging", parameters=bagging_parameters) @@ -317,7 +293,7 @@ def test_parameters(self): for k, v in model.model.get_params().items(): self.assertIn(k, list(bagging_model.get_params()), - "Parameter {} should be in list of parameters:\n{}".format(k, list(bagging_model.get_params()))) + "Parameter {} not found in list of available parameters:\n{}.".format(k, list(bagging_model.get_params()))) #7.) lasso_parameters = {"alpha": 1.5, "max_iter": 500, "tol": 0.004} model = Model(self.dummy_X_2, self.dummy_Y_2, algorithm="lasso", parameters=lasso_parameters) @@ -325,10 +301,10 @@ def test_parameters(self): for k, v in model.model.get_params().items(): self.assertIn(k, list(lasso_model.get_params()), - "Parameter {} should be in list of parameters:\n{}".format(k, list(lasso_model.get_params()))) + "Parameter {} not found in list of available parameters:\n{}.".format(k, list(lasso_model.get_params()))) def test_hyperparamter_tuning(self): - """ Testing hyperparamter tuning function. """ + """ Testing hyperparamter tuning functionality. """ #1.) model = Model(self.dummy_X, self.dummy_Y, algorithm="adaboost") X_train, X_test, Y_train, Y_test = model.train_test_split(test_split=0.2) @@ -378,9 +354,10 @@ def test_hyperparamter_tuning(self): #4.) with self.assertRaises(TypeError): model.hyperparameter_tuning(parameters='wrongType') + model.hyperparameter_tuning(parameters=123) def test_feature_selection(self): - """ Testing Feature Selection function. """ + """ Testing Feature Selection functionality. """ pass def tearDown(self): diff --git a/tests/test_pyDSP.py b/tests/test_pyDSP.py index d14ed11..56aa444 100644 --- a/tests/test_pyDSP.py +++ b/tests/test_pyDSP.py @@ -1,29 +1,32 @@ ################################################################################ -################# PyDSP Module Tests ################# +################# PyDSP Module Tests ################## ################################################################################ import os import numpy as np -import json import pySAR.pyDSP as pyDSP_ import pySAR.pySAR as pySAR import unittest +#suppress sklearn warnings +import warnings +warnings.filterwarnings("ignore") class pyDSPTests(unittest.TestCase): """ - Test suite for testing pyDSP module and functionality - in pySAR package. + Test suite for testing pyDSP module and functionality in pySAR package. Test Cases ========== test_pyDSP: testing correct overall pyDSP class and module functionality. test_preprocessing: - testing correct pydsp pre processing functionality. + testing correct pyDSP pre processing functionality. test_protein_spectra: - testing correct pydsp protein_spectra functionality. + testing correct pyDSP protein_spectra functionality. + test_consensus_freq: + testing correct pyDSP consensus frequency functionality. test_max_freq: - testing correct max_freq pydsp functionality. 
+ testing correct pyDSP max_freq functionality. """ def setUp(self): """ Import the 4 config files for each of the 4 datasets used for testing the pyDSP methods. """ @@ -35,8 +38,6 @@ def setUp(self): os.path.join(self.config_path, "test_absorption.json"), os.path.join(self.config_path, "test_localization.json") ] - #create instance of pysar class using thermostability dataset & config - self.pysar = pySAR.PySAR(config_file=self.all_config_files[0]) def test_pyDSP(self): """ Test class input parameters and attributes. """ @@ -45,209 +46,158 @@ def test_pyDSP(self): aa_indices3 = "NAKH900106" aa_indices4 = "QIAN880105" #1.) - encoded_seq1 = self.pysar.get_aai_encoding(aa_indices1) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[0], protein_seqs=encoded_seq1) #test_thermostability + pysar_thermostability = pySAR.PySAR(config_file=self.all_config_files[0]) #thermostability + encoded_seq_thermostability = pysar_thermostability.get_aai_encoding(aa_indices1) + pyDSP_thermostability = pyDSP_.PyDSP(config_file=self.all_config_files[0], protein_seqs=encoded_seq_thermostability) - self.assertEqual(pyDSP.spectrum, "power", - "Output spectrum should be power, got {}.".format(pyDSP.spectrum)) - self.assertEqual(pyDSP.window_type, "blackmanharris", - "Output window function should be blackmanharris, got {}.".format(pyDSP.window_type)) - self.assertIsInstance(pyDSP.window, np.ndarray, - "Output from window function should be a numpy array.") - self.assertIsNone(pyDSP.filter, - "Filter function should be None on class initialisation.") - self.assertIsNone(pyDSP.filter_type, - "Filter type expected to be None, got {}.".format(pyDSP.filter_type)) - self.assertEqual(pyDSP.spectrum_encoding.shape, (self.pysar.num_seqs, self.pysar.sequence_length), - "Spectrum encoding shape expected to be ({}, {}), got {}.".format(self.pysar.num_seqs, self.pysar.sequence_length, pyDSP.spectrum_encoding.shape)) - self.assertEqual(pyDSP.num_seqs, self.pysar.num_seqs, - "num_seqs attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.num_seqs)) - self.assertEqual(pyDSP.signal_len, self.pysar.sequence_length, - "signal_len attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.sequence_length)) - self.assertEqual(pyDSP.fft_power.dtype, 'float64', - "power spectrum expected to be of type float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_power.shape, encoded_seq1.shape, - "FFT encoding with power spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_real.dtype, 'float64', - "real spectrum expected to be of type float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_real.shape, encoded_seq1.shape, - "FFT encoding with real spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_abs.dtype, 'float64', - "absolute spectrum expected to be of type float64, got {}.".format(pyDSP.fft_abs.dtype)) - self.assertEqual(pyDSP.fft_abs.shape, encoded_seq1.shape, - "FFT encoding with absolute spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_imag.dtype, 'float64', - "imaginary spectrum expected to be of type float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_imag.shape, encoded_seq1.shape, - "FFT encoding with imaginary spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - 
self.assertEqual(pyDSP.fft_freqs.shape, encoded_seq1.shape, - "FFT frequencies expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertTrue(pyDSP.spectrum_encoding.any() == pyDSP.fft_power.any(), - "Spectrum encoding attribute should equal that of chosen fft spectrum, power.") - self.assertEqual(pyDSP.fft_power.dtype, "float64", - "Data type of power spectrum should be float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_real.dtype, "float64", - "Data type of real spectrum should be float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_imag.dtype, "float64", - "Data type of imaginary spectrum should be float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_abs.dtype, "float64", - "Data type of absolute spectrum should be float64, got {}.".format(pyDSP.fft_abs.dtype)) + self.assertEqual(pyDSP_thermostability.spectrum, "power", "Expected spectrum to be power, got {}.".format(pyDSP_thermostability.spectrum)) + self.assertEqual(pyDSP_thermostability.window_type, "hamming", "Expected window function to be hamming, got {}.".format(pyDSP_thermostability.window_type)) + self.assertIsInstance(pyDSP_thermostability.window, np.ndarray, "Expected window function to be a numpy array.") + self.assertIsNone(pyDSP_thermostability.filter, "Expected filter function to be None on class initialisation.") + self.assertIsNone(pyDSP_thermostability.filter_type, "Expected filter type to be None, got {}.".format(pyDSP_thermostability.filter_type)) + self.assertEqual(pyDSP_thermostability.spectrum_encoding.shape, (pysar_thermostability.num_seqs, pysar_thermostability.sequence_length), + "Expected spectrum encoding to be ({}, {}), got {}.".format(pysar_thermostability.num_seqs, pysar_thermostability.sequence_length, pyDSP_thermostability.spectrum_encoding.shape)) + self.assertEqual(pyDSP_thermostability.num_seqs, pysar_thermostability.num_seqs, + "Expected num_seqs attribute in pyDSP_thermostability class to be equal to that of pysar attribute: {}.".format(pysar_thermostability.num_seqs)) + self.assertEqual(pyDSP_thermostability.signal_len, pysar_thermostability.sequence_length, + "Expected signal_len attribute in pyDSP_thermostability class to be equal to that of pysar attribute: {}.".format(pysar_thermostability.sequence_length)) + self.assertEqual(pyDSP_thermostability.fft_power.dtype, 'float64', "Expected power spectrum to be of type float64, got {}.".format(pyDSP_thermostability.fft_power.dtype)) + self.assertEqual(pyDSP_thermostability.fft_power.shape, encoded_seq_thermostability.shape, + "Expected FFT encoding with power spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_thermostability.shape)) + self.assertEqual(pyDSP_thermostability.fft_real.dtype, 'float64', "Expected real spectrum to be of type float64, got {}.".format(pyDSP_thermostability.fft_real.dtype)) + self.assertEqual(pyDSP_thermostability.fft_real.shape, encoded_seq_thermostability.shape, + "Expected FFT encoding with real spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_thermostability.shape)) + self.assertEqual(pyDSP_thermostability.fft_abs.dtype, 'float64', "Expected absolute spectrum to be of type float64, got {}.".format(pyDSP_thermostability.fft_abs.dtype)) + self.assertEqual(pyDSP_thermostability.fft_abs.shape, encoded_seq_thermostability.shape, + "Expected FFT encoding with absolute spectrum to be same shape as encoded sequences:
{}.".format(encoded_seq_thermostability.shape)) + self.assertEqual(pyDSP_thermostability.fft_imag.dtype, 'float64', "Expected imaginary spectrum to be of type float64, got {}.".format(pyDSP_thermostability.fft_imag.dtype)) + self.assertEqual(pyDSP_thermostability.fft_imag.shape, encoded_seq_thermostability.shape, + "Expected FFT encoding with imaginary spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_thermostability.shape)) + self.assertEqual(pyDSP_thermostability.fft_freqs.shape, encoded_seq_thermostability.shape, "Expected FFT frequencies to be same shape as encoded sequences: {}.".format(encoded_seq_thermostability.shape)) + self.assertTrue(pyDSP_thermostability.spectrum_encoding.any() == pyDSP_thermostability.fft_power.any(), + "Expected spectrum encoding attribute to be equal to chosen fft spectrum, power.") + self.assertEqual(pyDSP_thermostability.fft_power.dtype, "float64", "Expected data type of power spectrum to be float64, got {}.".format(pyDSP_thermostability.fft_power.dtype)) + self.assertEqual(pyDSP_thermostability.fft_real.dtype, "float64", "Expected data type of real spectrum to be float64, got {}.".format(pyDSP_thermostability.fft_real.dtype)) + self.assertEqual(pyDSP_thermostability.fft_imag.dtype, "float64", "Expected data type of imaginary spectrum to be float64, got {}.".format(pyDSP_thermostability.fft_imag.dtype)) + self.assertEqual(pyDSP_thermostability.fft_abs.dtype, "float64", "Expected data type of absolute spectrum to be float64, got {}.".format(pyDSP_thermostability.fft_abs.dtype)) #2.) - encoded_seq2 = self.pysar.get_aai_encoding(aa_indices2) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[1], protein_seqs=encoded_seq2) #test_enantioselectivity + pysar_enantioselectivity = pySAR.PySAR(config_file=self.all_config_files[1]) #enantioselectivity + encoded_seq_enantioselectivity = pysar_enantioselectivity.get_aai_encoding(aa_indices2) + pyDSP_enantioselectivity = pyDSP_.PyDSP(config_file=self.all_config_files[1], protein_seqs=encoded_seq_enantioselectivity) - self.assertEqual(pyDSP.spectrum, "absolute", - "Output spectrum should be absolute, got {}.".format(pyDSP.spectrum)) - self.assertEqual(pyDSP.window_type, "blackman", - "Output window function should be blackman, got {}.".format(pyDSP.window_type)) - self.assertIsInstance(pyDSP.window, np.ndarray, - "Output from window function should be a numpy array.") - self.assertIsNone(pyDSP.filter, - "Filter function should be None on class initialisation.") - self.assertIsNone(pyDSP.filter_type, - "Filter type expected to be None, got {}.".format(pyDSP.filter_type)) - self.assertEqual(pyDSP.spectrum_encoding.shape, (self.pysar.num_seqs, self.pysar.sequence_length), - "Spectrum encoding shape expected to be ({}, {}), got {}.".format(self.pysar.num_seqs, self.pysar.sequence_length, pyDSP.spectrum_encoding.shape)) - self.assertEqual(pyDSP.num_seqs, self.pysar.num_seqs, - "num_seqs attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.num_seqs)) - self.assertEqual(pyDSP.signal_len, self.pysar.sequence_length, - "signal_len attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.sequence_length)) - self.assertEqual(pyDSP.fft_power.dtype, 'float64', - "power spectrum expected to be of type float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_power.shape, encoded_seq1.shape, - "FFT encoding with power spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - 
self.assertEqual(pyDSP.fft_real.dtype, 'float64', - "real spectrum expected to be of type float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_real.shape, encoded_seq1.shape, - "FFT encoding with real spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_abs.dtype, 'float64', - "absolute spectrum expected to be of type float64, got {}.".format(pyDSP.fft_abs.dtype)) - self.assertEqual(pyDSP.fft_abs.shape, encoded_seq1.shape, - "FFT encoding with absolute spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_imag.dtype, 'float64', - "imaginary spectrum expected to be of type float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_imag.shape, encoded_seq1.shape, - "FFT encoding with imaginary spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_freqs.shape, encoded_seq1.shape, - "FFT frequencies expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertTrue(pyDSP.spectrum_encoding.any() == pyDSP.fft_power.any(), - "Spectrum encoding attribute should equal that of chosen fft spectrum, power.") - self.assertEqual(pyDSP.fft_power.dtype, "float64", - "Data type of power spectrum should be float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_real.dtype, "float64", - "Data type of real spectrum should be float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_imag.dtype, "float64", - "Data type of imaginary spectrum should be float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_abs.dtype, "float64", - "Data type of absolute spectrum should be float64, got {}.".format(pyDSP.fft_abs.dtype)) + self.assertEqual(pyDSP_enantioselectivity.spectrum, "power", "Expected spectrum to be power, got {}.".format(pyDSP_enantioselectivity.spectrum)) + self.assertEqual(pyDSP_enantioselectivity.window_type, "hamming", "Expected window function to be hamming, got {}.".format(pyDSP_enantioselectivity.window_type)) + self.assertIsInstance(pyDSP_enantioselectivity.window, np.ndarray, "Expected window function to be a numpy array.") + self.assertIsNone(pyDSP_enantioselectivity.filter, "Expected filter function to be None on class initialisation.") + self.assertIsNone(pyDSP_enantioselectivity.filter_type, "Expected filter type to be None, got {}.".format(pyDSP_enantioselectivity.filter_type)) + self.assertEqual(pyDSP_enantioselectivity.spectrum_encoding.shape, (pysar_enantioselectivity.num_seqs, pysar_enantioselectivity.sequence_length), + "Expected spectrum encoding to be ({}, {}), got {}.".format(pysar_enantioselectivity.num_seqs, pysar_enantioselectivity.sequence_length, pyDSP_enantioselectivity.spectrum_encoding.shape)) + self.assertEqual(pyDSP_enantioselectivity.num_seqs, pysar_enantioselectivity.num_seqs, + "Expected num_seqs attribute in pyDSP_enantioselectivity class to be equal to that of pysar attribute: {}.".format(pysar_enantioselectivity.num_seqs)) + self.assertEqual(pyDSP_enantioselectivity.signal_len, pysar_enantioselectivity.sequence_length, + "Expected signal_len attribute in pyDSP_enantioselectivity class to be equal to that of pysar attribute: {}.".format(pysar_enantioselectivity.sequence_length)) + self.assertEqual(pyDSP_enantioselectivity.fft_power.dtype, 'float64', "Expected power spectrum to be of type float64, got
{}.".format(pyDSP_enantioselectivity.fft_power.dtype)) + self.assertEqual(pyDSP_enantioselectivity.fft_power.shape, encoded_seq_enantioselectivity.shape, + "Expected FFT encoding with power spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_enantioselectivity.shape)) + self.assertEqual(pyDSP_enantioselectivity.fft_real.dtype, 'float64', "Expected real spectrum to be of type float64, got {}.".format(pyDSP_enantioselectivity.fft_real.dtype)) + self.assertEqual(pyDSP_enantioselectivity.fft_real.shape, encoded_seq_enantioselectivity.shape, + "Expected FFT encoding with real spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_enantioselectivity.shape)) + self.assertEqual(pyDSP_enantioselectivity.fft_abs.dtype, 'float64', "Expected absolute spectrum to be of type float64, got {}.".format(pyDSP_enantioselectivity.fft_abs.dtype)) + self.assertEqual(pyDSP_enantioselectivity.fft_abs.shape, encoded_seq_enantioselectivity.shape, + "Expected FFT encoding with absolute spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_enantioselectivity.shape)) + self.assertEqual(pyDSP_enantioselectivity.fft_imag.dtype, 'float64', "Expected imaginary spectrum to be of type float64, got {}.".format(pyDSP_enantioselectivity.fft_imag.dtype)) + self.assertEqual(pyDSP_enantioselectivity.fft_imag.shape, encoded_seq_enantioselectivity.shape, + "Expected FFT encoding with imaginary spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_enantioselectivity.shape)) + self.assertEqual(pyDSP_enantioselectivity.fft_freqs.shape, encoded_seq_enantioselectivity.shape, "Expected FFT frequencies to be same shape as encoded sequences: {}.".format(encoded_seq_enantioselectivity.shape)) + self.assertTrue(pyDSP_enantioselectivity.spectrum_encoding.any() == pyDSP_enantioselectivity.fft_power.any(), + "Expected spectrum encoding attribute to be equal to chosen fft spectrum, power.") + self.assertEqual(pyDSP_enantioselectivity.fft_power.dtype, "float64", "Expected data type of power spectrum to be float64, got {}.".format(pyDSP_enantioselectivity.fft_power.dtype)) + self.assertEqual(pyDSP_enantioselectivity.fft_real.dtype, "float64", "Expected data type of real spectrum to be float64, got {}.".format(pyDSP_enantioselectivity.fft_real.dtype)) + self.assertEqual(pyDSP_enantioselectivity.fft_imag.dtype, "float64", "Expected data type of imaginary spectrum to be float64, got {}.".format(pyDSP_enantioselectivity.fft_imag.dtype)) + self.assertEqual(pyDSP_enantioselectivity.fft_abs.dtype, "float64", "Expected data type of absolute spectrum to be float64, got {}.".format(pyDSP_enantioselectivity.fft_abs.dtype)) #3.) 
- encoded_seq3 = self.pysar.get_aai_encoding(aa_indices3) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[2], protein_seqs=encoded_seq3) #test_absorption + pysar_absorption = pySAR.PySAR(config_file=self.all_config_files[2]) #absorption + encoded_seq_absorption = pysar_absorption.get_aai_encoding(aa_indices3) + pyDSP_absorption = pyDSP_.PyDSP(config_file=self.all_config_files[2], protein_seqs=encoded_seq_absorption) - self.assertEqual(pyDSP.spectrum, "power", - "Output spectrum should be power, got {}.".format(pyDSP.spectrum)) - self.assertEqual(pyDSP.window_type, "hamming", - "Output window function should be hamming, got {}.".format(pyDSP.window_type)) - self.assertIsInstance(pyDSP.window, np.ndarray, - "Output from window function should be a numpy array.") - self.assertIsNone(pyDSP.filter, - "Filter function should be None on class initialisation.") - self.assertIsNone(pyDSP.filter_type, - "Filter type expected to be None, got {}.".format(pyDSP.filter_type)) - self.assertEqual(pyDSP.spectrum_encoding.shape, (self.pysar.num_seqs, self.pysar.sequence_length), - "Spectrum encoding shape expected to be ({}, {}), got {}.".format(self.pysar.num_seqs, self.pysar.sequence_length, pyDSP.spectrum_encoding.shape)) - self.assertEqual(pyDSP.num_seqs, self.pysar.num_seqs, - "num_seqs attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.num_seqs)) - self.assertEqual(pyDSP.signal_len, self.pysar.sequence_length, - "signal_len attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.sequence_length)) - self.assertEqual(pyDSP.fft_power.dtype, 'float64', - "power spectrum expected to be of type float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_power.shape, encoded_seq1.shape, - "FFT encoding with power spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_real.dtype, 'float64', - "real spectrum expected to be of type float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_real.shape, encoded_seq1.shape, - "FFT encoding with real spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_abs.dtype, 'float64', - "absolute spectrum expected to be of type float64, got {}.".format(pyDSP.fft_abs.dtype)) - self.assertEqual(pyDSP.fft_abs.shape, encoded_seq1.shape, - "FFT encoding with absolute spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_imag.dtype, 'float64', - "imaginary spectrum expected to be of type float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_imag.shape, encoded_seq1.shape, - "FFT encoding with imaginary spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_freqs.shape, encoded_seq1.shape, - "FFT frequencies expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertTrue(pyDSP.spectrum_encoding.any() == pyDSP.fft_power.any(), - "Spectrum encoding attribute should equal that of chosen fft spectrum, power.") - self.assertEqual(pyDSP.fft_power.dtype, "float64", - "Data type of power spectrum should be float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_real.dtype, "float64", - "Data type of real spectrum should be float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_imag.dtype, "float64", - "Data type of imaginary 
spectrum should be float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_abs.dtype, "float64", - "Data type of absolute spectrum should be float64, got {}.".format(pyDSP.fft_abs.dtype)) + self.assertEqual(pyDSP_absorption.spectrum, "power", "Expected spectrum to be power, got {}.".format(pyDSP_absorption.spectrum)) + self.assertEqual(pyDSP_absorption.window_type, "hamming", "Expected window function to be hamming, got {}.".format(pyDSP_absorption.window_type)) + self.assertIsInstance(pyDSP_absorption.window, np.ndarray, "Expected window function to be a numpy array.") + self.assertIsNone(pyDSP_absorption.filter, "Expected filter function to be None on class initialisation.") + self.assertIsNone(pyDSP_absorption.filter_type, "Expected filter type to to be None, got {}.".format(pyDSP_absorption.filter_type)) + self.assertEqual(pyDSP_absorption.spectrum_encoding.shape, (pysar_absorption.num_seqs, pysar_absorption.sequence_length), + "Expected spectrum encoding to be ({}, {}), got {}.".format(pysar_absorption.num_seqs, pysar_absorption.sequence_length, pyDSP_absorption.spectrum_encoding.shape)) + self.assertEqual(pyDSP_absorption.num_seqs, pysar_absorption.num_seqs, + "Expected num_seqs attribute in pyDSP_absorption_thermostability class to be equal to that of pysar attribute: {}.".format(pysar_absorption.num_seqs)) + self.assertEqual(pyDSP_absorption.signal_len, pysar_absorption.sequence_length, + "Expectd signal_len attribute in pyDSP_absorption_thermostability class to be equal that of pysar attribute: {}.".format(pysar_absorption.sequence_length)) + self.assertEqual(pyDSP_absorption.fft_power.dtype, 'float64', "Expected power spectrum to be of type float64, got {}.".format(pyDSP_absorption.fft_power.dtype)) + self.assertEqual(pyDSP_absorption.fft_power.shape, encoded_seq_absorption.shape, + "Expected FFT encoding with power spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_absorption.shape)) + self.assertEqual(pyDSP_absorption.fft_real.dtype, 'float64', "Expected real spectrum to be of type float64, got {}.".format(pyDSP_absorption.fft_real.dtype)) + self.assertEqual(pyDSP_absorption.fft_real.shape, encoded_seq_absorption.shape, + "Expected FFT encoding with real spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_absorption.shape)) + self.assertEqual(pyDSP_absorption.fft_abs.dtype, 'float64', "Expected absolute spectrum to be of type float64, got {}.".format(pyDSP_absorption.fft_abs.dtype)) + self.assertEqual(pyDSP_absorption.fft_abs.shape, encoded_seq_absorption.shape, + "Expected FFT encoding with absolute spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_absorption.shape)) + self.assertEqual(pyDSP_absorption.fft_imag.dtype, 'float64', "Expected imaginary spectrum to be of type float64, got {}.".format(pyDSP_absorption.fft_imag.dtype)) + self.assertEqual(pyDSP_absorption.fft_imag.shape, encoded_seq_absorption.shape, + "Expected FFT encoding with imaginary spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_absorption.shape)) + self.assertEqual(pyDSP_absorption.fft_freqs.shape, encoded_seq_absorption.shape, "Expected FFT frequencies to be same shape as encoded sequences: {}.".format(encoded_seq_absorption.shape)) + self.assertTrue(pyDSP_absorption.spectrum_encoding.any() == pyDSP_absorption.fft_power.any(), + "Expected spectrum encoding attribute to be equal to chosen fft spectrum, power.") + self.assertEqual(pyDSP_absorption.fft_power.dtype, "float64", "Expected data type of power 
spectrum to be float64, got {}.".format(pyDSP_absorption.fft_power.dtype)) + self.assertEqual(pyDSP_absorption.fft_real.dtype, "float64", "Expected data type of real spectrum to be float64, got {}.".format(pyDSP_absorption.fft_real.dtype)) + self.assertEqual(pyDSP_absorption.fft_imag.dtype, "float64", "Expected data type of imaginary spectrum to be float64, got {}.".format(pyDSP_absorption.fft_imag.dtype)) + self.assertEqual(pyDSP_absorption.fft_abs.dtype, "float64", "Expected data type of absolute spectrum to be float64, got {}.".format(pyDSP_absorption.fft_abs.dtype)) #4.) - encoded_seq4 = self.pysar.get_aai_encoding(aa_indices4) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[3], protein_seqs=encoded_seq4) #test_localization + pysar_localization = pySAR.PySAR(config_file=self.all_config_files[3]) #localization + encoded_seq_localization = pysar_localization.get_aai_encoding(aa_indices4) + pyDSP_localization = pyDSP_.PyDSP(config_file=self.all_config_files[3], protein_seqs=encoded_seq_localization) - self.assertEqual(pyDSP.spectrum, "imaginary", - "Output spectrum should be imaginary, got {}.".format(pyDSP.spectrum)) - self.assertEqual(pyDSP.window_type, "bartlett", - "Output window function should be bartlett, got {}.".format(pyDSP.window_type)) - self.assertIsInstance(pyDSP.window, np.ndarray, - "Output from window function should be a numpy array.") - self.assertIsNone(pyDSP.filter, - "Filter function should be None on class initialisation.") - self.assertIsNone(pyDSP.filter_type, - "Filter type expected to be None, got {}.".format(pyDSP.filter_type)) - self.assertEqual(pyDSP.spectrum_encoding.shape, (self.pysar.num_seqs, self.pysar.sequence_length), - "Spectrum encoding shape expected to be ({}, {}), got {}.".format(self.pysar.num_seqs, self.pysar.sequence_length, pyDSP.spectrum_encoding.shape)) - self.assertEqual(pyDSP.num_seqs, self.pysar.num_seqs, - "num_seqs attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.num_seqs)) - self.assertEqual(pyDSP.signal_len, self.pysar.sequence_length, - "signal_len attribute in pyDSP class should equal that of pysar attribute: {}.".format(self.pysar.sequence_length)) - self.assertEqual(pyDSP.fft_power.dtype, 'float64', - "power spectrum expected to be of type float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_power.shape, encoded_seq1.shape, - "FFT encoding with power spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_real.dtype, 'float64', - "real spectrum expected to be of type float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_real.shape, encoded_seq1.shape, - "FFT encoding with real spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_abs.dtype, 'float64', - "absolute spectrum expected to be of type float64, got {}.".format(pyDSP.fft_abs.dtype)) - self.assertEqual(pyDSP.fft_abs.shape, encoded_seq1.shape, - "FFT encoding with absolute spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_imag.dtype, 'float64', - "imaginary spectrum expected to be of type float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_imag.shape, encoded_seq1.shape, - "FFT encoding with imaginary spectrum expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertEqual(pyDSP.fft_freqs.shape, encoded_seq1.shape, - "FFT 
frequencies expected to be same shape as encoded sequences: {}.".format(encoded_seq1.shape)) - self.assertTrue(pyDSP.spectrum_encoding.any() == pyDSP.fft_power.any(), - "Spectrum encoding attribute should equal that of chosen fft spectrum, power.") - self.assertEqual(pyDSP.fft_power.dtype, "float64", - "Data type of power spectrum should be float64, got {}.".format(pyDSP.fft_power.dtype)) - self.assertEqual(pyDSP.fft_real.dtype, "float64", - "Data type of real spectrum should be float64, got {}.".format(pyDSP.fft_real.dtype)) - self.assertEqual(pyDSP.fft_imag.dtype, "float64", - "Data type of imaginary spectrum should be float64, got {}.".format(pyDSP.fft_imag.dtype)) - self.assertEqual(pyDSP.fft_abs.dtype, "float64", - "Data type of absolute spectrum should be float64, got {}.".format(pyDSP.fft_abs.dtype)) + self.assertEqual(pyDSP_localization.spectrum, "power", "Expected spectrum to be power, got {}.".format(pyDSP_localization.spectrum)) + self.assertEqual(pyDSP_localization.window_type, "hamming", "Expected window function to be hamming, got {}.".format(pyDSP_localization.window_type)) + self.assertIsInstance(pyDSP_localization.window, np.ndarray, "Expected window function to be a numpy array.") + self.assertIsNone(pyDSP_localization.filter, "Expected filter function to be None on class initialisation.") + self.assertIsNone(pyDSP_localization.filter_type, "Expected filter type to be None, got {}.".format(pyDSP_localization.filter_type)) + self.assertEqual(pyDSP_localization.spectrum_encoding.shape, (pysar_localization.num_seqs, pysar_localization.sequence_length), + "Expected spectrum encoding to be ({}, {}), got {}.".format(pysar_localization.num_seqs, pysar_localization.sequence_length, pyDSP_localization.spectrum_encoding.shape)) + self.assertEqual(pyDSP_localization.num_seqs, pysar_localization.num_seqs, + "Expected num_seqs attribute in pyDSP_localization class to be equal to that of pysar attribute: {}.".format(pysar_localization.num_seqs)) + self.assertEqual(pyDSP_localization.signal_len, pysar_localization.sequence_length, + "Expected signal_len attribute in pyDSP_localization class to be equal to that of pysar attribute: {}.".format(pysar_localization.sequence_length)) + self.assertEqual(pyDSP_localization.fft_power.dtype, 'float64', "Expected power spectrum to be of type float64, got {}.".format(pyDSP_localization.fft_power.dtype)) + self.assertEqual(pyDSP_localization.fft_power.shape, encoded_seq_localization.shape, + "Expected FFT encoding with power spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_localization.shape)) + self.assertEqual(pyDSP_localization.fft_real.dtype, 'float64', "Expected real spectrum to be of type float64, got {}.".format(pyDSP_localization.fft_real.dtype)) + self.assertEqual(pyDSP_localization.fft_real.shape, encoded_seq_localization.shape, + "Expected FFT encoding with real spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_localization.shape)) + self.assertEqual(pyDSP_localization.fft_abs.dtype, 'float64', "Expected absolute spectrum to be of type float64, got {}.".format(pyDSP_localization.fft_abs.dtype)) + self.assertEqual(pyDSP_localization.fft_abs.shape, encoded_seq_localization.shape, + "Expected FFT encoding with absolute spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_localization.shape)) + self.assertEqual(pyDSP_localization.fft_imag.dtype, 'float64', "Expected imaginary spectrum to be of type float64, got
{}.".format(pyDSP_localization.fft_imag.dtype)) + self.assertEqual(pyDSP_localization.fft_imag.shape, encoded_seq_localization.shape, + "Expected FFT encoding with imaginary spectrum to be same shape as encoded sequences: {}.".format(encoded_seq_localization.shape)) + self.assertEqual(pyDSP_localization.fft_freqs.shape, encoded_seq_localization.shape, "Expected FFT frequencies to be same shape as encoded sequences: {}.".format(encoded_seq_absorption.shape)) + self.assertTrue(pyDSP_localization.spectrum_encoding.any() == pyDSP_localization.fft_power.any(), + "Expected spectrum encoding attribute to be equal to chosen fft spectrum, power.") + self.assertEqual(pyDSP_localization.fft_power.dtype, "float64", "Expected data type of power spectrum to be float64, got {}.".format(pyDSP_localization.fft_power.dtype)) + self.assertEqual(pyDSP_localization.fft_real.dtype, "float64", "Expected data type of real spectrum to be float64, got {}.".format(pyDSP_localization.fft_real.dtype)) + self.assertEqual(pyDSP_localization.fft_imag.dtype, "float64", "Expected data type of imaginary spectrum to be float64, got {}.".format(pyDSP_localization.fft_imag.dtype)) + self.assertEqual(pyDSP_localization.fft_abs.dtype, "float64", "Expected data type of absolute spectrum to be float64, got {}.".format(pyDSP_localization.fft_abs.dtype)) #5.) with self.assertRaises(OSError, msg='OS Error raised, invalid config file path given.'): pyDSP_.PyDSP(config_file="blahblahblah") + pyDSP_.PyDSP(config_file="test_data/notafile.json") #6.) with self.assertRaises(TypeError, msg='Type Error raised, invalid config file path data type given.'): pyDSP_.PyDSP(config_file=4.21) + pyDSP_.PyDSP(config_file=False) #7.) - with self.assertRaises(ValueError, msg='Value Error raised, protein sequences input parameter cant be none.'): + with self.assertRaises(ValueError, msg='Value Error raised, protein sequences input parameter cannot be none or empty.'): pyDSP_.PyDSP(config_file=self.all_config_files[3], protein_seqs=None) - pyDSP_.PyDSP(config_file=self.all_config_files[2], protein_seqs="ABCDEF") + pyDSP_.PyDSP(config_file=self.all_config_files[2], protein_seqs="") - @unittest.skip("") def test_preprocessing(self): """ Testing preprocessing functionality of pyDSP class. """ test_aaindices1 = "COHE430101" @@ -255,144 +205,152 @@ def test_preprocessing(self): test_aaindices3 = "QIAN880107" test_aaindices4 = "ROSG850102" #1.) 
- encoded_seq1 = self.pysar.get_aai_encoding(test_aaindices1) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[0], protein_seqs=encoded_seq1) - pyDSP.pre_processing() - self.assertTrue(np.all((pyDSP.fft_power==0)), - "Power spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_real==0)), - "Real spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_imag==0)), - "Imaginary spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_abs==0)), - "Absolute spectrum should be initialised into zeros array on instantiation.") - self.assertFalse(np.isnan(pyDSP.spectrum_encoding).any(), - 'Sequences after pre-processing step should not contain null values.') + pysar_thermostability = pySAR.PySAR(config_file=self.all_config_files[0]) #thermostability + encoded_seq_thermostability = pysar_thermostability.get_aai_encoding(test_aaindices1) #thermostability + pyDSP_thermostability = pyDSP_.PyDSP(config_file=self.all_config_files[0], protein_seqs=encoded_seq_thermostability) + pyDSP_thermostability.pre_processing() + + self.assertTrue(np.all((pyDSP_thermostability.fft_power==0)), "Expected power spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_thermostability.fft_real==0)), "Expected real spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_thermostability.fft_imag==0)), "Expected imaginary spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_thermostability.fft_abs==0)), "Expected absolute spectrum to be initialised into zeros array.") + self.assertFalse(np.isnan(pyDSP_thermostability.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain null values.') + self.assertFalse(np.isinf(pyDSP_thermostability.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain any +/- infinity values.') #2.) 
- encoded_seq2 = self.pysar.get_aai_encoding(test_aaindices2) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[1], protein_seqs=encoded_seq2) - pyDSP.pre_processing() - self.assertTrue(np.all((pyDSP.fft_power==0)), - "Power spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_real==0)), - "Real spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_imag==0)), - "Imaginary spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_abs==0)), - "Absolute spectrum should be initialised into zeros array on instantiation.") - self.assertFalse(np.isnan(pyDSP.spectrum_encoding).any(), - 'Sequences after pre-processing step should not contain null values.') + pysar_enantioselectivity = pySAR.PySAR(config_file=self.all_config_files[1]) #enantioselectivity + encoded_seq_enantioselectivity = pysar_enantioselectivity.get_aai_encoding(test_aaindices2) #enantioselectivity + pyDSP_enantioselectivity = pyDSP_.PyDSP(config_file=self.all_config_files[1], protein_seqs=encoded_seq_enantioselectivity) + pyDSP_enantioselectivity.pre_processing() + + self.assertTrue(np.all((pyDSP_enantioselectivity.fft_power==0)), "Expected power spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_enantioselectivity.fft_real==0)), "Expected real spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_enantioselectivity.fft_imag==0)), "Expected imaginary spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_enantioselectivity.fft_abs==0)), "Expected absolute spectrum to be initialised into zeros array.") + self.assertFalse(np.isnan(pyDSP_enantioselectivity.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain null values.') + self.assertFalse(np.isinf(pyDSP_enantioselectivity.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain any +/- infinity values.') #3.) 
- encoded_seq3 = self.pysar.get_aai_encoding(test_aaindices3) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[2], protein_seqs=encoded_seq3) - pyDSP.pre_processing() - self.assertTrue(np.all((pyDSP.fft_power==0)), - "Power spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_real==0)), - "Real spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_imag==0)), - "Imaginary spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_abs==0)), - "Absolute spectrum should be initialised into zeros array on instantiation.") - self.assertFalse(np.isnan(pyDSP.spectrum_encoding).any(), - 'Sequences after pre-processing step should not contain null values.') + pysar_absorption = pySAR.PySAR(config_file=self.all_config_files[2]) #absorption + encoded_seq_absorption = pysar_absorption.get_aai_encoding(test_aaindices3) #absorption + pyDSP_absorption = pyDSP_.PyDSP(config_file=self.all_config_files[2], protein_seqs=encoded_seq_absorption) + pyDSP_absorption.pre_processing() + + self.assertTrue(np.all((pyDSP_absorption.fft_power==0)), "Expected power spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_absorption.fft_real==0)), "Expected real spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_absorption.fft_imag==0)), "Expected imaginary spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_absorption.fft_abs==0)), "Expected absolute spectrum to be initialised into zeros array.") + self.assertFalse(np.isnan(pyDSP_absorption.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain null values.') + self.assertFalse(np.isinf(pyDSP_absorption.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain any +/- infinity values.') #4.) 
- encoded_seq4 = self.pysar.get_aai_encoding(test_aaindices4) - pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[3], protein_seqs=encoded_seq4) - pyDSP.pre_processing() - self.assertTrue(np.all((pyDSP.fft_power==0)), - "Power spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_real==0)), - "Real spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_imag==0)), - "Imaginary spectrum should be initialised into zeros array on instantiation.") - self.assertTrue(np.all((pyDSP.fft_abs==0)), - "Absolute spectrum should be initialised into zeros array on instantiation.") - self.assertFalse(np.isnan(pyDSP.spectrum_encoding).any(), - 'Sequences after pre-processing step should not contain null values.') + pysar_localization = pySAR.PySAR(config_file=self.all_config_files[3]) #localization + encoded_seq_localization = pysar_localization.get_aai_encoding(test_aaindices4) #localization + pyDSP_localization = pyDSP_.PyDSP(config_file=self.all_config_files[3], protein_seqs=encoded_seq_localization) + pyDSP_localization.pre_processing() + + self.assertTrue(np.all((pyDSP_localization.fft_power==0)), "Expected power spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_localization.fft_real==0)), "Expected real spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_localization.fft_imag==0)), "Expected imaginary spectrum to be initialised into zeros array.") + self.assertTrue(np.all((pyDSP_localization.fft_abs==0)), "Expected absolute spectrum to be initialised into zeros array.") + self.assertFalse(np.isnan(pyDSP_localization.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain null values.') + self.assertFalse(np.isinf(pyDSP_localization.spectrum_encoding).any(), + 'Expected sequences after pre-processing step to not contain any +/- infinity values.') def test_window(self): - """ Testing window functions that are available in pyDSP module. """ - aa_indices1 = "EISD860101" - aa_indices2 = "GEIM800107" - aa_indices3 = "NAKH900106" - aa_indices4 = "QIAN880105" - all_aaindices = [aa_indices1, aa_indices2, aa_indices3, aa_indices4] - all_windows = ['hamming', 'blackman','blackmanharris','gaussian','bartlett', + """ Testing window functions of pyDSP module. """ + all_aaindices = ["EISD860101", "GEIM800107", "NAKH900106", "QIAN880105"] + all_windows = ['hamming', 'blackman', 'blackmanharris', 'gaussian', 'bartlett', 'kaiser', 'barthann', 'bohman', 'chebwin', 'cosine', 'exponential', 'flattop', 'hann', 'boxcar', 'nuttall', 'parzen', 'triang', 'tukey'] - -#1.) iterate over all config files, all indices and all windows - for config in self.all_config_files: + all_shapes = [(466, ), (398, ), (298, ), (361, )] +#1.) 
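+ #note: the window_type kwarg is assumed to override the window set in each config file; each shape in all_shapes matches the sequence length of the corresponding test dataset (e.g. 466 for thermostability)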
+ #iterate over all config files, all indices and all windows + for config in range(0, len(self.all_config_files)): for index in all_aaindices: for window in all_windows: - encoded_seq = self.pysar.get_aai_encoding(index) - #open config file and parse parameters - with open(config) as f: - parameters = json.load(f) - - #manually set window type in parameters object - parameters["pyDSP"]["window"]["type"] = window - - #create instance of pyDSP class - pyDSP = pyDSP_.PyDSP(config_file=parameters, protein_seqs=encoded_seq) + #create instance of PySAR and PyDSP classes + pysar_ = pySAR.PySAR(config_file=self.all_config_files[config]) + encoded_seq = pysar_.get_aai_encoding(index) + pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[config], protein_seqs=encoded_seq, window_type=window) self.assertEqual(pyDSP.window_type, window, "Expected window type to be {}, got {}.".format(pyDSP.window_type, window)) self.assertIsInstance(pyDSP.window, np.ndarray, "Expected window to be a numpy array, got {}.".format(pyDSP.window)) - self.assertEqual(pyDSP.window.shape, (466,), - "Expected shape of window to be (466,), got {}.".format(pyDSP.window.shape)) + self.assertEqual(pyDSP.window.shape, all_shapes[config], + "Expected shape of window to be {}, got {}.".format(all_shapes[config], pyDSP.window.shape)) self.assertFalse(np.isnan(pyDSP.window).any(), "Expected window output to contain no null/nan values.") - - # @unittest.skip("") + self.assertFalse(np.isinf(pyDSP.window).any(), + 'Expected window output to not contain any +/- infinity values.') + def test_filter(self): - """ Testing filter functions that are available in pyDSP module. """ - aa_indices1 = "EISD860101" - aa_indices2 = "GEIM800107" - aa_indices3 = "NAKH900106" - aa_indices4 = "QIAN880105" - all_aaindices = [aa_indices1, aa_indices2, aa_indices3, aa_indices4] + """ Testing filter functions of pyDSP module. """ + all_aaindices = ["EISD860101", "GEIM800107", "NAKH900106", "QIAN880105"] all_filters = ['savgol'] - -#1.) iterate over all config files, all indices and all filters - for config in self.all_config_files: + all_shapes = [(466, ), (398, ), (298, ), (361, )] +#1.)
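+ #note: the filter_type kwarg is assumed to override the filter set in each config file; savgol is currently the only filter tested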
+ #iterate over all config files, all indices and all filters + for config in range(0, len(self.all_config_files)): for index in all_aaindices: - for filt in all_filters: - encoded_seq = self.pysar.get_aai_encoding(index) - #open config file and parse parameters - with open(config) as f: - parameters = json.load(f) - - #manually set filter type in parameters object - parameters["pyDSP"]["filter"]["type"] = filt - - #create instance of pyDSP class - pyDSP = pyDSP_.PyDSP(config_file=parameters, protein_seqs=encoded_seq) + for filter in all_filters: + #create instance of PySAR and PyDSP classes + pysar_ = pySAR.PySAR(config_file=self.all_config_files[config]) + encoded_seq = pysar_.get_aai_encoding(index) + pyDSP = pyDSP_.PyDSP(config_file=self.all_config_files[config], protein_seqs=encoded_seq, filter_type=filter) - self.assertEqual(pyDSP.filter_type, filt, - "Expected filter type to be {}, got {}.".format(pyDSP.filter_type, filt)) + self.assertEqual(pyDSP.filter_type, filter, + "Expected filter type to be {}, got {}.".format(filter, pyDSP.filter_type)) self.assertIsInstance(pyDSP.filter, np.ndarray, "Expected filter to be a numpy array, got {}.".format(pyDSP.filter)) - self.assertEqual(pyDSP.filter.shape, (466,), - "Expected shape of filter to be (466,), got {}.".format(pyDSP.filter.shape)) + self.assertEqual(pyDSP.filter.shape, all_shapes[config], + "Expected shape of filter to be {}, got {}.".format(all_shapes[config], pyDSP.filter.shape)) self.assertFalse(np.isnan(pyDSP.filter).any(), "Expected filter output to contain no null/nan values.") + self.assertFalse(np.isinf(pyDSP.filter).any(), + 'Expected filter output to not contain any +/- infinity values.') - # @unittest.skip("") def test_max_freq(self): """ Testing max frequency functionality. """ + all_aaindices = ["ISOY800101", "MEEJ800101", "PALJ810116", "QIAN880107"] +#1.) + for config in self.all_config_files: + for index in all_aaindices: + #create instance of PySAR and PyDSP classes + pysar_ = pySAR.PySAR(config_file=config) + encoded_seq = pysar_.get_aai_encoding(index) + pyDSP = pyDSP_.PyDSP(config_file=config, protein_seqs=encoded_seq) + + max_freq_, max_freq_index = pyDSP.max_freq(pyDSP.spectrum_encoding[0]) + self.assertIsInstance(max_freq_, float, + "Expected max frequency attribute to be a float, got {}.".format(type(max_freq_))) + self.assertIsInstance(max_freq_index, np.int64, + "Expected max frequency index attribute to be a np.int64, got {}.".format(type(max_freq_index))) +#2.) + with self.assertRaises(ValueError): + pyDSP.max_freq(pyDSP.spectrum_encoding) + + def test_consensus_freq(self): + """ Testing consensus frequency functionality. """ + all_aaindices = ["ISOY800101", "MEEJ800101", "PALJ810116", "QIAN880107"] #1.)
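+ #consensus_freq, like max_freq, is expected to accept a single 1D spectrum; passing the full 2D spectrum array should raise a ValueError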
- test_aaindices1 = "COHE430101" - encoded_seq1 = self.pysar.get_aai_encoding(test_aaindices1) for config in self.all_config_files: - pyDSP = pyDSP_.PyDSP(config_file=config, protein_seqs=encoded_seq1) - max_freq_, max_freq_index = pyDSP.max_freq(pyDSP.spectrum_encoding[0]) - self.assertIsInstance(max_freq_, float, - "Expected max frequency attribute to be a float, got {}.".format(type(max_freq_))) - self.assertIsInstance(max_freq_index, np.int64, - "Expected max frequency index attribute to be a np.int64, got {}.".format(type(max_freq_index))) + for index in all_aaindices: + #create instance of PySAR and PyDSP classes + pysar_ = pySAR.PySAR(config_file=config) + encoded_seq = pysar_.get_aai_encoding(index) + pyDSP = pyDSP_.PyDSP(config_file=config, protein_seqs=encoded_seq) + + #calculate consensus freq + consensus_freq = pyDSP.consensus_freq(pyDSP.spectrum_encoding[0]) + + self.assertIsInstance(consensus_freq, float, + "Expected consensus frequency attribute to be a float, got {}.".format(type(consensus_freq))) #2.) with self.assertRaises(ValueError): - pyDSP.max_freq(pyDSP.spectrum_encoding) \ No newline at end of file + pyDSP.consensus_freq(pyDSP.spectrum_encoding) \ No newline at end of file diff --git a/tests/test_pySAR.py b/tests/test_pySAR.py index e03e37e..ae8fffb 100644 --- a/tests/test_pySAR.py +++ b/tests/test_pySAR.py @@ -55,13 +55,15 @@ def setUp(self): self.amino_acids = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"] - #temporary unit test output folder + #create temporary unit test output folder to store any pysar assets and results self.test_output_folder = os.path.join("tests", "test_outputs") + if not (os.path.isdir(self.test_output_folder)): + os.makedirs(self.test_output_folder) @unittest.skip("Skipping metadata tests.") def test_pySAR_metadata(self): """ Testing correct pySAR version and metadata. """ - self.assertEqual(pysar_.__version__, "2.4.0", + self.assertEqual(pysar_.__version__, "2.4.1", "pySAR version is not correct, got: {}.".format(pysar_.__version__)) self.assertEqual(pysar_.__name__, "pySAR", "pySAR software name is not correct, got: {}.".format(pysar_.__name__)) @@ -80,152 +82,152 @@ def test_pySAR_metadata(self): self.assertEqual(pysar_.__maintainer__, "AJ McKenna", "pySAR maintainer is not correct, got: {}.".format(pysar_.__license__)) self.assertEqual(pysar_.__keywords__, ["bioinformatics", "protein engineering", "python", \ - "pypi", "machine learning", "directed evolution", "sequence activity relationships", \ - "SAR", "aaindex", "protein descriptors"], - "pySAR keywords is not correct, got: {}.".format(pysar_.__keywords__)) + "pypi", "machine learning", "directed evolution", "drug discovery", "sequence activity relationships", \ + "SAR", "aaindex", "protein descriptors"], "pySAR keywords is not correct, got: {}.".format(pysar_.__keywords__)) def test_pySAR(self): """ Testing pySAR intialisation process and associated methods & attributes. """ #1.) 
- test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability + test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability #testing attribute values, including default values - self.assertEqual(test_pySAR.dataset, (os.path.join('tests', 'test_data', 'test_thermostability.txt')), - 'Dataset attribute does not equal what was input, got {}.'.format(test_pySAR.dataset)) - self.assertEqual(test_pySAR.sequence_col, "sequence", - 'Sequence column attribute is not correct, expected {}, got {}.'.format("sequence", test_pySAR.sequence_col)) - self.assertEqual(test_pySAR.activity_col, "T50", - "Activity attribute name not correct, expected {}, got {}.".format("T50", test_pySAR.activity_col)) - self.assertEqual(test_pySAR.algorithm, "plsregression", - 'Algorithm attribute not correct, expected {}, got {}.'.format("plsregression", test_pySAR.algorithm)) - self.assertEqual(test_pySAR.test_split, 0.2, - 'Test split not expected, expected 0.2, got {}.'.format(test_pySAR.test_split)) - self.assertIsNone(test_pySAR.aai_indices, + self.assertEqual(test_pySAR_thermostability.dataset, (os.path.join('tests', 'test_data', 'test_thermostability.txt')), + 'Dataset attribute does not match expected value, got {}.'.format(test_pySAR_thermostability.dataset)) + self.assertEqual(test_pySAR_thermostability.sequence_col, "sequence", + 'Sequence column attribute is not correct, expected sequence, got {}.'.format(test_pySAR_thermostability.sequence_col)) + self.assertEqual(test_pySAR_thermostability.activity_col, "T50", + "Activity attribute name not correct, expected T50, got {}.".format(test_pySAR_thermostability.activity_col)) + self.assertEqual(test_pySAR_thermostability.algorithm, "plsregression", + 'Algorithm attribute not correct, expected plsregression, got {}.'.format(test_pySAR_thermostability.algorithm)) + self.assertEqual(test_pySAR_thermostability.test_split, 0.2, + 'Test split not correct, expected 0.2, got {}.'.format(test_pySAR_thermostability.test_split)) + self.assertIsNone(test_pySAR_thermostability.aai_indices, "AAI Indices attribute should be none on class initialisation.") - self.assertIsNone(test_pySAR.descriptors, + self.assertIsNone(test_pySAR_thermostability.descriptors, "Descriptors attribute should be none on class initialisation.") - self.assertEqual(test_pySAR.model_parameters, {}, - 'Parameters attribute expected to be empty, got {}.'.format(test_pySAR.model_parameters)) - self.assertIsInstance(test_pySAR.data, pd.DataFrame, - 'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR.data))) - self.assertIsInstance(test_pySAR.sequences, pd.Series, - 'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR.sequences))) - self.assertIsInstance(test_pySAR.activity, pd.Series, - 'Activity expected to be a pd.Series, got {}.'.format(type(test_pySAR.activity))) - self.assertEqual(test_pySAR.data.isnull().sum().sum(), 0, + self.assertEqual(test_pySAR_thermostability.model_parameters, {}, + 'Parameters attribute expected to be empty, got {}.'.format(test_pySAR_thermostability.model_parameters)) + self.assertIsInstance(test_pySAR_thermostability.data, pd.DataFrame, + 'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR_thermostability.data))) + self.assertIsInstance(test_pySAR_thermostability.sequences, pd.Series, + 'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR_thermostability.sequences))) + self.assertIsInstance(test_pySAR_thermostability.activity, pd.Series, + 'Activity expected to be a
pd.Series, got {}.'.format(type(test_pySAR_thermostability.activity)))
+        self.assertEqual(test_pySAR_thermostability.data.isnull().sum().sum(), 0,
             'Expected there to be no NAN/null values in data dataframe.')
-        self.assertEqual(test_pySAR.num_seqs, 261,
-            'Number of sequences expected to be 261, got {}.'.format(test_pySAR.num_seqs))
-        self.assertEqual(test_pySAR.sequence_length, 466,
-            'Sequence length expected to be 466, got {}.'.format(test_pySAR.sequence_length))
-        self.assertEqual(test_pySAR.feature_space, (),
-            'Feature space expected to be an empty tuble, got {}.'.format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_thermostability.num_seqs, 261,
+            'Number of sequences expected to be 261, got {}.'.format(test_pySAR_thermostability.num_seqs))
+        self.assertEqual(test_pySAR_thermostability.sequence_length, 466,
+            'Sequence length expected to be 466, got {}.'.format(test_pySAR_thermostability.sequence_length))
+        self.assertEqual(test_pySAR_thermostability.feature_space, (),
+            'Feature space expected to be an empty tuple, got {}.'.format(test_pySAR_thermostability.feature_space))
 #2.)
-        test_pySAR = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity
+        test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity
         #testing attribute values, including default values
-        self.assertEqual(test_pySAR.dataset, (os.path.join('tests', 'test_data', 'test_enantioselectivity.txt')),
-            'Dataset attribute does not equal what was input, got {}.'.format(test_pySAR.dataset))
-        self.assertEqual(test_pySAR.sequence_col, "sequence",
-            'Sequence column attribute is not correct, expected {}, got {}.'.format("sequence", test_pySAR.sequence_col))
-        self.assertEqual(test_pySAR.activity_col, "e-value",
-            "Activity attribute name not correct, expected {}, got {}.".format("e-value", test_pySAR.activity_col))
-        self.assertEqual(test_pySAR.algorithm, "randomforestregressor",
-            'Algorithm attribute not correct, expected {}, got {}.'.format("randomforestregressor", test_pySAR.algorithm))
-        self.assertEqual(test_pySAR.test_split, 0.2,
-            'Test split not expected, expected 0.2, got {}.'.format(test_pySAR.test_split))
-        self.assertIsNone(test_pySAR.aai_indices,
+        self.assertEqual(test_pySAR_enantioselectivity.dataset, (os.path.join('tests', 'test_data', 'test_enantioselectivity.txt')),
+            'Dataset attribute does not match expected, got {}.'.format(test_pySAR_enantioselectivity.dataset))
+        self.assertEqual(test_pySAR_enantioselectivity.sequence_col, "sequence",
+            'Sequence column attribute is not correct, expected sequence, got {}.'.format(test_pySAR_enantioselectivity.sequence_col))
+        self.assertEqual(test_pySAR_enantioselectivity.activity_col, "e-value",
+            "Activity attribute name not correct, expected e-value, got {}.".format(test_pySAR_enantioselectivity.activity_col))
+        self.assertEqual(test_pySAR_enantioselectivity.algorithm, "randomforestregressor",
+            'Algorithm attribute not correct, expected randomforestregressor, got {}.'.format(test_pySAR_enantioselectivity.algorithm))
+        self.assertEqual(test_pySAR_enantioselectivity.test_split, 0.2,
+            'Test split not expected, expected 0.2, got {}.'.format(test_pySAR_enantioselectivity.test_split))
+        self.assertIsNone(test_pySAR_enantioselectivity.aai_indices,
             "AAI Indices attribute should be none on class initialisation.")
-        self.assertIsNone(test_pySAR.descriptors,
+        self.assertIsNone(test_pySAR_enantioselectivity.descriptors,
             "Descriptors attribute should be none on class initialisation.")
-        self.assertEqual(test_pySAR.model_parameters, {},
-            'Parameters attribute expected to be empty, got {}.'.format(test_pySAR.model_parameters))
-        self.assertIsInstance(test_pySAR.data, pd.DataFrame,
-            'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR.data)))
-        self.assertIsInstance(test_pySAR.sequences, pd.Series,
-            'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR.sequences)))
-        self.assertIsInstance(test_pySAR.activity, pd.Series,
-            'Activity expected to be a pd.Series, got {}.'.format(type(test_pySAR.activity)))
-        self.assertEqual(test_pySAR.data.isnull().sum().sum(), 0,
+        self.assertEqual(test_pySAR_enantioselectivity.model_parameters, {},
+            'Parameters attribute expected to be empty, got {}.'.format(test_pySAR_enantioselectivity.model_parameters))
+        self.assertIsInstance(test_pySAR_enantioselectivity.data, pd.DataFrame,
+            'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR_enantioselectivity.data)))
+        self.assertIsInstance(test_pySAR_enantioselectivity.sequences, pd.Series,
+            'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR_enantioselectivity.sequences)))
+        self.assertIsInstance(test_pySAR_enantioselectivity.activity, pd.Series,
+            'Activity expected to be a pd.Series, got {}.'.format(type(test_pySAR_enantioselectivity.activity)))
+        self.assertEqual(test_pySAR_enantioselectivity.data.isnull().sum().sum(), 0,
             'Expected there to be no NAN/null values in data dataframe.')
-        self.assertEqual(test_pySAR.num_seqs, 152,
-            'Number of sequences expected to be 152, got {}.'.format(test_pySAR.num_seqs))
-        self.assertEqual(test_pySAR.sequence_length, 398,
-            'Sequence length expected to be 398, got {}.'.format(test_pySAR.sequence_length))
-        self.assertEqual(test_pySAR.feature_space, (),
-            'Feature space expected to be an empty tuble, got {}.'.format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_enantioselectivity.num_seqs, 152,
+            'Number of sequences expected to be 152, got {}.'.format(test_pySAR_enantioselectivity.num_seqs))
+        self.assertEqual(test_pySAR_enantioselectivity.sequence_length, 398,
+            'Sequence length expected to be 398, got {}.'.format(test_pySAR_enantioselectivity.sequence_length))
+        self.assertEqual(test_pySAR_enantioselectivity.feature_space, (),
+            'Feature space expected to be an empty tuple, got {}.'.format(test_pySAR_enantioselectivity.feature_space))
 #3.)
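The attribute assertions above, and the matching absorption and localization blocks that follow, define `PySAR`'s post-initialisation state: dataset and model parameters are read straight from the config file, while `aai_indices`, `descriptors` and `feature_space` stay unset until an encoding is run. A minimal usage sketch, assuming the `import pySAR.pySAR as pysar` alias used throughout this test module and the thermostability config shipped in the repo:

```python
import pySAR.pySAR as pysar

test_pySAR = pysar.PySAR(config_file="config/thermostability.json")

print(test_pySAR.dataset, test_pySAR.sequence_col, test_pySAR.activity_col)  #dataset parameters
print(test_pySAR.algorithm, test_pySAR.test_split)                           #e.g. plsregression, 0.2
print(test_pySAR.num_seqs, test_pySAR.sequence_length)                       #e.g. 261, 466

#nothing encoded or fitted yet
assert test_pySAR.aai_indices is None and test_pySAR.descriptors is None
assert test_pySAR.feature_space == ()
```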
- test_pySAR = pysar.PySAR(config_file=self.all_config_files[2]) #absorption + test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption #testing attribute values, including default values - self.assertEqual(test_pySAR.dataset, (os.path.join('tests', 'test_data', 'test_absorption.txt')), - 'Dataset attribute does not equal what was input, got {}.'.format(test_pySAR.dataset)) - self.assertEqual(test_pySAR.sequence_col, "sequence", - 'Sequence column attribute is not correct, expected {}, got {}.'.format("sequence", test_pySAR.sequence_col)) - self.assertEqual(test_pySAR.activity_col, "peak", - "Activity attribute name not correct, expected {}, got {}.".format("peak", test_pySAR.activity_col)) - self.assertEqual(test_pySAR.algorithm, "plsregression", - 'Algorithm attribute not correct, expected {}, got {}.'.format("plsregression", test_pySAR.algorithm)) - self.assertEqual(test_pySAR.test_split, 0.2, - 'Test split not expected, expected 0.2, got {}.'.format(test_pySAR.test_split)) - self.assertIsNone(test_pySAR.aai_indices, + self.assertEqual(test_pySAR_absorption.dataset, (os.path.join('tests', 'test_data', 'test_absorption.txt')), + 'Dataset attribute does not match expected, got {}.'.format(test_pySAR_absorption.dataset)) + self.assertEqual(test_pySAR_absorption.sequence_col, "sequence", + 'Sequence column attribute is not correct, expected sequence, got {}.'.format(test_pySAR_absorption.sequence_col)) + self.assertEqual(test_pySAR_absorption.activity_col, "peak", + "Activity attribute name not correct, expected peak, got {}.".format(test_pySAR_absorption.activity_col)) + self.assertEqual(test_pySAR_absorption.algorithm, "knn", + 'Algorithm attribute not correct, expected knn, got {}.'.format(test_pySAR_absorption.algorithm)) + self.assertEqual(test_pySAR_absorption.test_split, 0.2, + 'Test split not expected, expected 0.2, got {}.'.format(test_pySAR_absorption.test_split)) + self.assertIsNone(test_pySAR_absorption.aai_indices, "AAI Indices attribute should be none on class initialisation.") - self.assertIsNone(test_pySAR.descriptors, + self.assertIsNone(test_pySAR_absorption.descriptors, "Descriptors attribute should be none on class initialisation.") - self.assertEqual(test_pySAR.model_parameters, {}, - 'Parameters attribute expected to be empty, got {}.'.format(test_pySAR.model_parameters)) - self.assertIsInstance(test_pySAR.data, pd.DataFrame, - 'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR.data))) - self.assertIsInstance(test_pySAR.sequences, pd.Series, - 'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR.sequences))) - self.assertIsInstance(test_pySAR.activity, pd.Series, - 'Activity expected to be a pd.Series, got {}.'.format(type(test_pySAR.activity))) - self.assertEqual(test_pySAR.data.isnull().sum().sum(), 0, + self.assertEqual(test_pySAR_absorption.model_parameters, {}, + 'Parameters attribute expected to be empty, got {}.'.format(test_pySAR_absorption.model_parameters)) + self.assertIsInstance(test_pySAR_absorption.data, pd.DataFrame, + 'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR_absorption.data))) + self.assertIsInstance(test_pySAR_absorption.sequences, pd.Series, + 'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR_absorption.sequences))) + self.assertIsInstance(test_pySAR_absorption.activity, pd.Series, + 'Activity expected to be a pd.Series, got {}.'.format(type(test_pySAR_absorption.activity))) + self.assertEqual(test_pySAR_absorption.data.isnull().sum().sum(), 0, 
'Expected there to be no NAN/null values in data dataframe.')
-        self.assertEqual(test_pySAR.num_seqs, 81,
-            'Number of sequences expected to be 81, got {}.'.format(test_pySAR.num_seqs))
-        self.assertEqual(test_pySAR.sequence_length, 298,
-            'Sequence length expected to be 298, got {}.'.format(test_pySAR.sequence_length))
-        self.assertEqual(test_pySAR.feature_space, (),
-            'Feature space expected to be an empty tuble, got {}.'.format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_absorption.num_seqs, 81,
+            'Number of sequences expected to be 81, got {}.'.format(test_pySAR_absorption.num_seqs))
+        self.assertEqual(test_pySAR_absorption.sequence_length, 298,
+            'Sequence length expected to be 298, got {}.'.format(test_pySAR_absorption.sequence_length))
+        self.assertEqual(test_pySAR_absorption.feature_space, (),
+            'Feature space expected to be an empty tuple, got {}.'.format(test_pySAR_absorption.feature_space))
 #4.)
-        test_pySAR = pysar.PySAR(config_file=self.all_config_files[3]) #localization
+        test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[3]) #localization
         #testing attribute values, including default values
-        self.assertEqual(test_pySAR.dataset, (os.path.join('tests', 'test_data', 'test_localization.txt')),
-            'Dataset attribute does not equal what was input, got {}.'.format(test_pySAR.dataset))
-        self.assertEqual(test_pySAR.sequence_col, "sequence",
-            'Sequence column attribute is not correct, expected {}, got {}.'.format("sequence", test_pySAR.sequence_col))
-        self.assertEqual(test_pySAR.activity_col, "log_GFP",
-            "Activity attribute name not correct, expected {}, got {}.".format("log_GFP", test_pySAR.activity_col))
-        self.assertEqual(test_pySAR.algorithm, "plsregression",
-            'Algorithm attribute not correct, expected {}, got {}.'.format("plsregression", test_pySAR.algorithm))
-        self.assertEqual(test_pySAR.test_split, 0.2,
-            'Test split not expected, expected 0.2, got {}.'.format(test_pySAR.test_split))
-        self.assertIsNone(test_pySAR.aai_indices,
+        self.assertEqual(test_pySAR_localization.dataset, (os.path.join('tests', 'test_data', 'test_localization.txt')),
+            'Dataset attribute does not match expected, got {}.'.format(test_pySAR_localization.dataset))
+        self.assertEqual(test_pySAR_localization.sequence_col, "sequence",
+            'Sequence column attribute is not correct, expected sequence, got {}.'.format(test_pySAR_localization.sequence_col))
+        self.assertEqual(test_pySAR_localization.activity_col, "log_GFP",
+            "Activity attribute name not correct, expected log_GFP, got {}.".format(test_pySAR_localization.activity_col))
+        self.assertEqual(test_pySAR_localization.algorithm, "adaboostregressor",
+            'Algorithm attribute not correct, expected adaboostregressor, got {}.'.format(test_pySAR_localization.algorithm))
+        self.assertEqual(test_pySAR_localization.test_split, 0.2,
+            'Test split not expected, expected 0.2, got {}.'.format(test_pySAR_localization.test_split))
+        self.assertIsNone(test_pySAR_localization.aai_indices,
            "AAI Indices attribute should be none on class initialisation.")
-        self.assertIsNone(test_pySAR.descriptors,
+        self.assertIsNone(test_pySAR_localization.descriptors,
            "Descriptors attribute should be none on class initialisation.")
-        self.assertEqual(test_pySAR.model_parameters, {},
-            'Parameters attribute expected to be empty, got {}.'.format(test_pySAR.model_parameters))
-        self.assertIsInstance(test_pySAR.data, pd.DataFrame,
-            'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR.data)))
-        self.assertIsInstance(test_pySAR.sequences, pd.Series,
-            'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR.sequences)))
-        self.assertIsInstance(test_pySAR.activity, pd.Series,
-            'Activity expected to be a pd.Series, got {}.'.format(type(test_pySAR.activity)))
-        self.assertEqual(test_pySAR.data.isnull().sum().sum(), 0,
+        self.assertEqual(test_pySAR_localization.model_parameters, {},
+            'Parameters attribute expected to be empty, got {}.'.format(test_pySAR_localization.model_parameters))
+        self.assertIsInstance(test_pySAR_localization.data, pd.DataFrame,
+            'Data expected to be a DataFrame, got {}.'.format(type(test_pySAR_localization.data)))
+        self.assertIsInstance(test_pySAR_localization.sequences, pd.Series,
+            'Sequences expected to be a pd.Series, got {}.'.format(type(test_pySAR_localization.sequences)))
+        self.assertIsInstance(test_pySAR_localization.activity, pd.Series,
+            'Activity expected to be a pd.Series, got {}.'.format(type(test_pySAR_localization.activity)))
+        self.assertEqual(test_pySAR_localization.data.isnull().sum().sum(), 0,
             'Expected there to be no NAN/null values in data dataframe.')
-        self.assertEqual(test_pySAR.num_seqs, 254,
-            'Number of sequences expected to be 254, got {}.'.format(test_pySAR.num_seqs))
-        self.assertEqual(test_pySAR.sequence_length, 361,
-            'Sequence length expected to be 361, got {}.'.format(test_pySAR.sequence_length))
-        self.assertEqual(test_pySAR.feature_space, (),
-            'Feature space expected to be an empty tuble, got {}.'.format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_localization.num_seqs, 254,
+            'Number of sequences expected to be 254, got {}.'.format(test_pySAR_localization.num_seqs))
+        self.assertEqual(test_pySAR_localization.sequence_length, 361,
+            'Sequence length expected to be 361, got {}.'.format(test_pySAR_localization.sequence_length))
+        self.assertEqual(test_pySAR_localization.feature_space, (),
+            'Feature space expected to be an empty tuple, got {}.'.format(test_pySAR_localization.feature_space))
 #5.)
     #validate that if errorneous input parameters are input, that errors are raised
         with self.assertRaises(OSError, msg='OS Error raised, config file not found.'):
             pysar.PySAR(config_file="blahblahblah")
+            pysar.PySAR(config_file="test_data/nothing.json")
 #6.)
         with self.assertRaises(TypeError, msg='Type Error raised, config file parameter not correct data type.'):
             pysar.PySAR(config_file=101)
@@ -234,11 +236,11 @@ def test_pySAR(self):
     def test_sequences(self):
         """ Testing getting the protein sequences from the dataset. """
 #1.)
-        test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability
-        test_seqs = test_pySAR.sequences
+        test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability
+        test_seqs = test_pySAR_thermostability.sequences

-        self.assertEqual(test_seqs.shape, (test_pySAR._num_seqs, ),
-            'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR._num_seqs, )))
+        self.assertEqual(test_seqs.shape, (test_pySAR_thermostability._num_seqs, ),
+            'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR_thermostability._num_seqs, )))
         self.assertIsInstance(test_seqs, pd.Series,
             'Sequences not of correct type, expected {}, got {}.'.format(pd.Series, type(test_seqs)))
         self.assertTrue(test_seqs[0].startswith("MTIKEMPQPK"),
@@ -246,52 +248,52 @@ def test_sequences(self):
         self.assertEqual(test_seqs.dtype, object,
             'Sequence object expected to be of dtype object, got {}.'.format(test_seqs.dtype))
 #2.)
- test_pySAR_2 = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity - test_seqs = test_pySAR_2.sequences + test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity + test_seqs = test_pySAR_enantioselectivity.sequences - self.assertEqual(test_seqs.shape, (test_pySAR_2._num_seqs, ), - 'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR_2._num_seqs, ))) + self.assertEqual(test_seqs.shape, (test_pySAR_enantioselectivity._num_seqs, ), + 'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR_enantioselectivity._num_seqs, ))) self.assertIsInstance(test_seqs, pd.Series, - 'Sequences not of correct type, expected {}, got {}'.format(pd.Series, type(test_seqs))) + 'Sequences not of correct type, expected {}, got {}.'.format(pd.Series, type(test_seqs))) self.assertTrue(test_seqs[0].startswith("MSAPFAKF"), 'Error in second seqeuence expected it to start with MSAPFAKF.') self.assertEqual(test_seqs.dtype, object, 'Sequence object expected to be of dtype object, got {}.'.format(test_seqs.dtype)) #3.) - test_pySAR_3 = pysar.PySAR(config_file=self.all_config_files[2]) #absorption - test_seqs = test_pySAR_3.sequences + test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption + test_seqs = test_pySAR_absorption.sequences - self.assertEqual(test_seqs.shape, (test_pySAR_3._num_seqs, ), - 'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR_3._num_seqs, ))) + self.assertEqual(test_seqs.shape, (test_pySAR_absorption._num_seqs, ), + 'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR_absorption._num_seqs, ))) self.assertIsInstance(test_seqs, pd.Series, - 'Sequences not of correct type, expected {}, got {}'.format(pd.Series, type(test_seqs))) + 'Sequences not of correct type, expected {}, got {}.'.format(pd.Series, type(test_seqs))) self.assertTrue(test_seqs[0].startswith("MLMTVFSSAP"), 'Error in third seqeuence expected it to start with MLMTVFSSAP.') self.assertEqual(test_seqs.dtype, object, - 'Sequence object expected to be of dtype object, got {}'.format(test_seqs.dtype)) + 'Sequence object expected to be of dtype object, got {}.'.format(test_seqs.dtype)) #4.) 
- test_pySAR_4 = pysar.PySAR(config_file=self.all_config_files[3]) #localization - test_seqs = test_pySAR_4.sequences + test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[3]) #localization + test_seqs = test_pySAR_localization.sequences - self.assertEqual(test_seqs.shape, (test_pySAR_4._num_seqs, ), - 'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR_4._num_seqs, ))) + self.assertEqual(test_seqs.shape, (test_pySAR_localization._num_seqs, ), + 'Shape of the sequences not correct, expected {}, got {}.'.format(test_seqs.shape, (test_pySAR_localization._num_seqs, ))) self.assertIsInstance(test_seqs, pd.Series, - 'Sequences not of correct type, expected {}, got {}'.format(pd.Series, type(test_seqs))) + 'Sequences not of correct type, expected {}, got {}.'.format(pd.Series, type(test_seqs))) self.assertTrue(test_seqs[0].startswith("MSRLVAASWL"), 'Error in third seqeuence expected it to start with MSRLVAASWL.') self.assertEqual(test_seqs.dtype, object, - 'Sequence object expected to be of dtype object, got {}'.format(test_seqs.dtype)) + 'Sequence object expected to be of dtype object, got {}.'.format(test_seqs.dtype)) def test_activity(self): """ Testing function that gets activity from dataset. """ #1.) - test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability - activity = test_pySAR.activity + test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability + activity = test_pySAR_thermostability.activity self.assertIsInstance(activity, pd.Series, 'Output should be a Series, got {}.'.format(type(activity))) - self.assertEqual(activity.shape, (test_pySAR.num_seqs,), - 'Output expected to be shape ({}), got {}.'.format((test_pySAR.num_seqs,), activity.shape)) + self.assertEqual(activity.shape, (test_pySAR_thermostability.num_seqs,), + 'Output expected to be shape {}, got {}.'.format((test_pySAR_thermostability.num_seqs,), activity.shape)) self.assertTrue((activity[:10] == np.array([55.0, 43.0, 49.0, 39.8, 52.9, 48.8, 45.0, 48.3, 61.5, 54.6])).all(), "First 10 elements of activity don't match expected output:\n{}.".format(activity[:10])) self.assertEqual(activity.name, "T50", @@ -299,47 +301,47 @@ def test_activity(self): self.assertTrue(activity.dtypes == np.float64, "Column datatypes should be np.float64, got {}.".format(activity.dtypes)) #2.) 
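Both `sequences` and `activity` are exposed as pandas Series sliced from the parsed dataset, which is the contract `test_sequences` and `test_activity` check per dataset. Roughly, reusing the thermostability instance from the sketch above:

```python
test_seqs = test_pySAR.sequences   #pd.Series of str, one protein sequence per row
activity = test_pySAR.activity     #numeric pd.Series named after activity_col

assert test_seqs.shape == (test_pySAR.num_seqs,)
assert activity.name == test_pySAR.activity_col   #"T50" for thermostability
print(activity[:10])                              #first 10 measured activity values
```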
- test_pySAR_2 = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity - activity_2 = test_pySAR_2.activity - - self.assertIsInstance(activity_2, pd.Series, - 'Output should be a Series, got {}.'.format(type(activity_2))) - self.assertEqual(activity_2.shape, (test_pySAR_2.num_seqs,), - 'Output expected to be shape ({}), got {}.'.format((test_pySAR_2.num_seqs,), activity_2.shape)) - self.assertTrue((activity_2[:10] == np.array([5.0, 23.0, 10.0, 9.0, 12.0, 11.0, 11.0, 21.0, 18.0, 17.0])).all(), - "First 10 elements of activity don't match expected output:\n{}.".format(activity_2[:10])) - self.assertEqual(activity_2.name, "e-value", - "Expected e-value column name for Series, got {}.".format(activity_2.name)) - self.assertTrue(np.float64 == activity_2.dtypes, - "Column datatypes should be np.float64, got {}.".format(activity_2.dtypes)) + test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity + activity_enantioselectivity = test_pySAR_enantioselectivity.activity + + self.assertIsInstance(activity_enantioselectivity, pd.Series, + 'Output should be a Series, got {}.'.format(type(activity_enantioselectivity))) + self.assertEqual(activity_enantioselectivity.shape, (test_pySAR_enantioselectivity.num_seqs,), + 'Output expected to be shape ({}), got {}.'.format((test_pySAR_enantioselectivity.num_seqs,), activity_enantioselectivity.shape)) + self.assertTrue((activity_enantioselectivity[:10] == np.array([5.0, 23.0, 10.0, 9.0, 12.0, 11.0, 11.0, 21.0, 18.0, 17.0])).all(), + "First 10 elements of activity don't match expected output:\n{}.".format(activity_enantioselectivity[:10])) + self.assertEqual(activity_enantioselectivity.name, "e-value", + "Expected e-value column name for Series, got {}.".format(activity_enantioselectivity.name)) + self.assertTrue(np.float64 == activity_enantioselectivity.dtypes, + "Column datatypes should be np.float64, got {}.".format(activity_enantioselectivity.dtypes)) #3.) 
-        test_pySAR_3 = pysar.PySAR(config_file=self.all_config_files[2]) #absorption
-        activity_3 = test_pySAR_3.activity
-
-        self.assertIsInstance(activity_3, pd.Series,
-            'Output should be a Series, got {}.'.format(type(activity_3)))
-        self.assertEqual(activity_3.shape, (test_pySAR_3.num_seqs,),
-            'Output expected to be shape ({}), got {}.'.format((test_pySAR_3.num_seqs,), activity_3.shape))
-        self.assertTrue((activity_3[:10] == np.array([539, 510, 510, 519, 525, 528, 528, 534, 528, 510])).all(),
-            "First 10 elements of activity don't match expected output:\n{}.".format(activity_3[:10]))
-        self.assertEqual(activity_3.name, "peak",
-            "Expected peak column name for Series, got {}.".format(activity_3.name))
-        self.assertTrue(np.int64 == activity_3.dtypes,
-            "Column datatypes should be np.float64, got {}.".format(activity_3.dtypes))
+        test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption
+        activity_absorption = test_pySAR_absorption.activity
+
+        self.assertIsInstance(activity_absorption, pd.Series,
+            'Output should be a Series, got {}.'.format(type(activity_absorption)))
+        self.assertEqual(activity_absorption.shape, (test_pySAR_absorption.num_seqs,),
+            'Output expected to be shape ({}), got {}.'.format((test_pySAR_absorption.num_seqs,), activity_absorption.shape))
+        self.assertTrue((activity_absorption[:10] == np.array([539, 510, 510, 519, 525, 528, 528, 534, 528, 510])).all(),
+            "First 10 elements of activity don't match expected output:\n{}.".format(activity_absorption[:10]))
+        self.assertEqual(activity_absorption.name, "peak",
+            "Expected peak column name for Series, got {}.".format(activity_absorption.name))
+        self.assertTrue(np.int64 == activity_absorption.dtypes,
+            "Column datatypes should be np.int64, got {}.".format(activity_absorption.dtypes))
 #4.)
- test_pySAR_4 = pysar.PySAR(config_file=self.all_config_files[3]) #localization - activity_4 = test_pySAR_4.activity - - self.assertIsInstance(activity_4, pd.Series, - 'Output should be a Series, got {}.'.format(type(activity_4))) - self.assertEqual(activity_4.shape, (test_pySAR_4.num_seqs,), - 'Output expected to be shape ({}), got {}.'.format((test_pySAR_4.num_seqs,), activity_4.shape)) - # self.assertTrue((activity_4[:10] == np.array([-4.626936, -5.599110, -5.715788, -5.335352, -4.187052, -6.732491, -7.135846, -6.128409, -5.319843, -5.092067])).all(), - # "First 10 elements of activity don't match expected output:\n{}.".format(activity_4[:10])) - self.assertEqual(activity_4.name, "log_GFP", - "Expected log_GFP column name for Series, got {}.".format(activity_4.name)) - self.assertTrue(activity_4.dtypes == np.float64, - "Column datatypes should be np.float64, got {}.".format(activity_4.dtypes)) + test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[3]) #localization + activity_localization = test_pySAR_localization.activity + + self.assertIsInstance(activity_localization, pd.Series, + 'Output should be a Series, got {}.'.format(type(activity_localization))) + self.assertEqual(activity_localization.shape, (test_pySAR_localization.num_seqs,), + 'Output expected to be shape ({}), got {}.'.format((test_pySAR_localization.num_seqs,), activity_localization.shape)) + # self.assertTrue((activity_localization[:10] == np.array([-4.626936, -5.599110, -5.715788, -5.335352, -4.187052, -6.732491, -7.135846, -6.128409, -5.319843, -5.092067])).all(), + # "First 10 elements of activity don't match expected output:\n{}.".format(activity_localization[:10])) + self.assertEqual(activity_localization.name, "log_GFP", + "Expected log_GFP column name for Series, got {}.".format(activity_localization.name)) + self.assertTrue(activity_localization.dtypes == np.float64, + "Column datatypes should be np.float64, got {}.".format(activity_localization.dtypes)) def test_get_aai_encoding(self): """ Testing getting the AAI encoding from the database for specific indices. """ @@ -350,77 +352,77 @@ def test_get_aai_encoding(self): error_aaindices = ["ABCD1234", "ABCD12345"] error_aaindices1 = "XYZ4567" #1.) 
- test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability - aai_encoding = test_pySAR.get_aai_encoding(aa_indices) - - self.assertIsInstance(aai_encoding, np.ndarray, - 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding))) - self.assertEqual(aai_encoding.shape[0], test_pySAR.num_seqs, - 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR.num_seqs, aai_encoding.shape[0])) - self.assertEqual(aai_encoding.shape[1], test_pySAR.sequence_length * len(aa_indices), - 'The length of the sequences expected to be {}, got {}.'.format((test_pySAR.sequence_length * len(aa_indices)), str(aai_encoding.shape[1]))) - self.assertEqual(aai_encoding.dtype, np.float32, - 'Datatype of elements in numpy array expected to be dtype np.float32, got {}.'.format(aai_encoding.dtype)) + test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability + aai_encoding_thermostability = test_pySAR_thermostability.get_aai_encoding(aa_indices) + + self.assertIsInstance(aai_encoding_thermostability, np.ndarray, + 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding_thermostability))) + self.assertEqual(aai_encoding_thermostability.shape[0], test_pySAR_thermostability.num_seqs, + 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR_thermostability.num_seqs, aai_encoding_thermostability.shape[0])) + self.assertEqual(aai_encoding_thermostability.shape[1], test_pySAR_thermostability.sequence_length * len(aa_indices), + 'The length of the sequences expected to be {}, got {}.'.format((test_pySAR_thermostability.sequence_length * len(aa_indices)), str(aai_encoding_thermostability.shape[1]))) + self.assertEqual(aai_encoding_thermostability.dtype, np.float32, + 'Datatype of elements in numpy array expected to be dtype np.float32, got {}.'.format(aai_encoding_thermostability.dtype)) self.assertTrue((np.array([0.78, 0.5, 1.02, 0.68, 0.68, 0.78, 0.36, 0.68, 0.36, 0.68], - dtype=np.float32) == aai_encoding[0][:10]).all(), - 'The first 10 elements of the 1st sequence in encoding do not match what was expected:\n{}.'.format(aai_encoding[0][:10])) + dtype=np.float32) == aai_encoding_thermostability[0][:10]).all(), + 'The first 10 elements of the 1st sequence in encoding do not match what was expected:\n{}.'.format(aai_encoding_thermostability[0][:10])) #2.) 
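The shape assertions above encode the key arithmetic of `get_aai_encoding`: each AAI record maps every residue to one scalar, and the per-index encodings are concatenated horizontally, giving a `num_seqs x (sequence_length * n_indices)` array of `np.float32`. For example, with two of the indices already used in this test file:

```python
import numpy as np

indices = "CHOP780207, GEIM800104"                 #two comma-separated AAI accession numbers
aai_encoding = test_pySAR.get_aai_encoding(indices)

n_indices = len(indices.split(","))                #2
assert aai_encoding.shape == (test_pySAR.num_seqs, test_pySAR.sequence_length * n_indices)
assert aai_encoding.dtype == np.float32
```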
- test_pySAR_1 = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity - aai_encoding_1 = test_pySAR_1.get_aai_encoding(aa_indices1) - - self.assertIsInstance(aai_encoding_1, np.ndarray, - 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding_1))) - self.assertEqual(aai_encoding_1.shape[0], test_pySAR_1.num_seqs, - 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR_1.num_seqs, aai_encoding_1.shape[0])) - self.assertEqual(aai_encoding_1.shape[1], test_pySAR_1.sequence_length, - 'The length of the sequences expected to be {}, got {}.'.format(test_pySAR_1.sequence_length, str(aai_encoding_1.shape[1]))) - self.assertEqual(aai_encoding_1.dtype, np.float32, - 'Datatype of elements in numpy array should be of dtype np.float32, got {}.'.format(aai_encoding_1.dtype)) + test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity + aai_encoding_enantioselectivity = test_pySAR_enantioselectivity.get_aai_encoding(aa_indices1) + + self.assertIsInstance(aai_encoding_enantioselectivity, np.ndarray, + 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding_enantioselectivity))) + self.assertEqual(aai_encoding_enantioselectivity.shape[0], test_pySAR_enantioselectivity.num_seqs, + 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR_enantioselectivity.num_seqs, aai_encoding_enantioselectivity.shape[0])) + self.assertEqual(aai_encoding_enantioselectivity.shape[1], test_pySAR_enantioselectivity.sequence_length, + 'The length of the sequences expected to be {}, got {}.'.format(test_pySAR_enantioselectivity.sequence_length, str(aai_encoding_enantioselectivity.shape[1]))) + self.assertEqual(aai_encoding_enantioselectivity.dtype, np.float32, + 'Datatype of elements in numpy array should be of dtype np.float32, got {}.'.format(aai_encoding_enantioselectivity.dtype)) self.assertTrue((np.array([3.79, 7.25, 10.88, 7.21, 2.93, 10.88, 6.11, 2.93, 7.21, 7.25], - dtype=np.float32) == aai_encoding_1[0][:10]).all(), - 'The first 10 elements of the 1st sequence do not match what was expected:\n{}.'.format(aai_encoding_1[0][:10])) + dtype=np.float32) == aai_encoding_enantioselectivity[0][:10]).all(), + 'The first 10 elements of the 1st sequence do not match what was expected:\n{}.'.format(aai_encoding_enantioselectivity[0][:10])) #3.) 
- test_pySAR_2 = pysar.PySAR(config_file=self.all_config_files[2]) #absorption - aai_encoding_2 = test_pySAR_2.get_aai_encoding(aa_indices2) - - self.assertIsInstance(aai_encoding_2, np.ndarray, - 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding_2))) - self.assertEqual(aai_encoding_2.shape[0], test_pySAR_2.num_seqs, - 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR_2.num_seqs, aai_encoding_2.shape[0])) - self.assertEqual(aai_encoding_2.shape[1], test_pySAR_2.sequence_length, - 'The length of the sequences expected to be {}, got {}.'.format(test_pySAR_2.sequence_length, str(aai_encoding_2.shape[1]))) - self.assertEqual(aai_encoding_2.dtype, np.float32, - 'Datatype of elements in numpy array should be of dtype np.float32, got {}.'.format(aai_encoding_2.dtype)) + test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption + aai_encoding_absorption = test_pySAR_absorption.get_aai_encoding(aa_indices2) + + self.assertIsInstance(aai_encoding_absorption, np.ndarray, + 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding_absorption))) + self.assertEqual(aai_encoding_absorption.shape[0], test_pySAR_absorption.num_seqs, + 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR_absorption.num_seqs, aai_encoding_absorption.shape[0])) + self.assertEqual(aai_encoding_absorption.shape[1], test_pySAR_absorption.sequence_length, + 'The length of the sequences expected to be {}, got {}.'.format(test_pySAR_absorption.sequence_length, str(aai_encoding_absorption.shape[1]))) + self.assertEqual(aai_encoding_absorption.dtype, np.float32, + 'Datatype of elements in numpy array should be of dtype np.float32, got {}.'.format(aai_encoding_absorption.dtype)) self.assertTrue((np.array([14.9, 17.6, 14.9, 9.5, 14.3, 18.8, 6.9, 6.9, 9.9, 14.8], - dtype=np.float32)==aai_encoding_2[0][:10]).all(), - 'The first 10 elements of the 1st sequence do not match what was expected:\n{}.'.format(aai_encoding_2[0][:10])) + dtype=np.float32)==aai_encoding_absorption[0][:10]).all(), + 'The first 10 elements of the 1st sequence do not match what was expected:\n{}.'.format(aai_encoding_absorption[0][:10])) #3.) 
- test_pySAR_3 = pysar.PySAR(config_file=self.all_config_files[3]) #localization - aai_encoding_3 = test_pySAR_3.get_aai_encoding(aa_indices3) - - self.assertIsInstance(aai_encoding_3, np.ndarray, - 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding_3))) - self.assertEqual(aai_encoding_3.shape[0], test_pySAR_3.num_seqs, - 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR_3.num_seqs, aai_encoding_3.shape[0])) - self.assertEqual(aai_encoding_3.shape[1], test_pySAR_3.sequence_length * 2, - 'The length of the sequences expected to be {}, got {}.'.format(test_pySAR_3.sequence_length, str(aai_encoding_3.shape[1]))) - self.assertEqual(aai_encoding_3.dtype, np.float32, - 'Datatype of elements in numpy array should be of dtype np.float32, got {}.'.format(aai_encoding_3.dtype)) + test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[3]) #localization + aai_encoding_localization = test_pySAR_localization.get_aai_encoding(aa_indices3) + + self.assertIsInstance(aai_encoding_localization, np.ndarray, + 'AAI Encoding output expected to be a numpy array, got datatype {}.'.format(type(aai_encoding_localization))) + self.assertEqual(aai_encoding_localization.shape[0], test_pySAR_localization.num_seqs, + 'The number of sequences in the dataset expected to be {}, got {}.'.format(test_pySAR_localization.num_seqs, aai_encoding_localization.shape[0])) + self.assertEqual(aai_encoding_localization.shape[1], test_pySAR_localization.sequence_length * 2, + 'The length of the sequences expected to be {}, got {}.'.format(test_pySAR_localization.sequence_length, str(aai_encoding_localization.shape[1]))) + self.assertEqual(aai_encoding_localization.dtype, np.float32, + 'Datatype of elements in numpy array should be of dtype np.float32, got {}.'.format(aai_encoding_localization.dtype)) self.assertTrue((np.array([1.47, 0.77, 1.04, 1.22, 1.05, 1.32, 1.32, 0.77, 1.02, 1.22], - dtype=np.float32)==aai_encoding_3[0][:10]).all(), - 'The first 10 elements of sequence 0 do not match what was expected:\n{}.'.format(aai_encoding_3[0][:10])) + dtype=np.float32)==aai_encoding_localization[0][:10]).all(), + 'The first 10 elements of sequence 0 do not match what was expected:\n{}.'.format(aai_encoding_localization[0][:10])) #4.) with self.assertRaises(ValueError, msg='ValueError: Errorneous indices have been input.'): - test_pySAR_1.get_aai_encoding(error_aaindices) - test_pySAR_1.get_aai_encoding(error_aaindices1) + test_pySAR_thermostability.get_aai_encoding(error_aaindices) + test_pySAR_thermostability.get_aai_encoding(error_aaindices1) #5.) with self.assertRaises(TypeError, msg='TypeError: Errorneous indices datatypes have been input.'): - test_pySAR_1.get_aai_encoding(1235) - test_pySAR_1.get_aai_encoding(40.89) - test_pySAR_1.get_aai_encoding(False) + test_pySAR_enantioselectivity.get_aai_encoding(1235) + test_pySAR_localization.get_aai_encoding(40.89) + test_pySAR_absorption.get_aai_encoding(False) - def test_aai_encoding(self): - """ Testing AAI encoding pipeline. """ + def test_aai_encoding(self): + """ Testing AAI encoding pipeline. """ aa_indices_1 = "NAKH920102" aa_indices_2 = "CHOP780207, GEIM800104" aa_indices_3 = ["CHAM810101, ISOY800103"] @@ -430,110 +432,113 @@ def test_aai_encoding(self): expected_output_cols = ['Index', 'Category', 'R2', 'RMSE', 'MSE', 'RPD', 'MAE', 'Explained Variance'] #1.) 
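`encode_aai` then runs the full pipeline for an encoding: build the AAI features, fit the configured regression model on a train/test split and return a one-row metrics DataFrame, also exporting `aai_results.csv` and a regression plot into a timestamped output folder, which the first block below checks. A rough sketch of the call (folder name illustrative):

```python
aai_results = test_pySAR.encode_aai(aai_indices="NAKH920102", print_results=0,
                                    output_folder="test_outputs")

#'Index'/'Category' columns are strings, the metric columns are np.float64
print(aai_results[['Index', 'Category', 'R2', 'RMSE', 'MAE']])
print(test_pySAR.feature_space)   #(261, 466) for one index on the thermostability set
```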
-        test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability
-        test_aai_ = test_pySAR.encode_aai(aai_indices=aa_indices_1, print_results=0, output_folder=self.test_output_folder)
-
-        self.assertIsInstance(test_aai_, pd.DataFrame,
-            'Output should be a DataFrame, got {}.'.format(type(test_aai_)))
-        self.assertEqual(len(test_aai_.columns), 8,
-            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_.columns)))
-        self.assertEqual(test_aai_['Index'].values[0], "NAKH920102",
-            "Index codes in ouput dataframe don't match expected: {}.".format(test_aai_["Index"]))
-        self.assertEqual(test_aai_['Category'].values[0], "composition",
-            "Category names in ouput dataframe don't match expected: {}.".format(test_aai_["Category"]))
-        self.assertEqual(test_pySAR.feature_space, (261, 466),
-            "Expected feature space dimensions to be 261 x 466, got {}.".format(test_pySAR.feature_space))
-        for col in test_aai_.columns:
+        test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability
+        test_aai_thermostability = test_pySAR_thermostability.encode_aai(aai_indices=aa_indices_1, print_results=0, output_folder=self.test_output_folder)
+
+        self.assertIsInstance(test_aai_thermostability, pd.DataFrame,
+            'Expected output to be a DataFrame, got {}.'.format(type(test_aai_thermostability)))
+        self.assertEqual(len(test_aai_thermostability.columns), 8,
+            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_thermostability.columns)))
+        self.assertEqual(test_aai_thermostability['Index'].values[0], "NAKH920102",
+            "Index codes in output dataframe don't match expected: {}.".format(test_aai_thermostability["Index"]))
+        self.assertEqual(test_aai_thermostability['Category'].values[0], "composition",
+            "Category names in output dataframe don't match expected: {}.".format(test_aai_thermostability["Category"]))
+        self.assertEqual(test_pySAR_thermostability.feature_space, (261, 466),
+            "Expected feature space dimensions to be 261 x 466, got {}.".format(test_pySAR_thermostability.feature_space))
+        for col in test_aai_thermostability.columns:
             self.assertIn(col, expected_output_cols,
-                "Col {} not found in list of expected columns:\n{}".format(col, expected_output_cols))
+                "Col {} not found in list of expected columns:\n{}.".format(col, expected_output_cols))
             if (col == "Index" or col == "Category"):
-                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_thermostability[col].values)),
+                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_thermostability[col])))
             else:
-                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_thermostability[col].values)),
+                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_thermostability[col])))
         self.assertTrue(os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME),
-            "Output dir storing encoding results not found: {}".format(self.test_output_folder + "_" + _globals.CURRENT_DATETIME))
+            "Output dir storing encoding results not found: {}.".format(self.test_output_folder + "_" + _globals.CURRENT_DATETIME))
         self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "aai_results.csv")),
-            "Output csv storing encoding results not found: {}".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "aai_results.csv")))
+            "Output csv storing encoding results not found: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "aai_results.csv")))
         self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png")),
-            "Output regression plot not found: {}".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png")))
+            "Output regression plot not found: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png")))
 #2.)
-        test_aai_ = test_pySAR.encode_aai(aai_indices=aa_indices_2, print_results=0, output_folder=self.test_output_folder)
-        self.assertIsInstance(test_aai_, pd.DataFrame,
-            'Output should be a DataFrame, got {}.'.format(type(test_aai_)))
-        self.assertEqual(len(test_aai_.columns), 8,
-            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_.columns)))
-        self.assertEqual(test_aai_['Index'].values[0], "CHOP780207, GEIM800104",
-            "Index codes in ouput dataframe don't match expected: {}.".format(test_aai_["Index"]))
-        self.assertEqual(test_aai_['Category'].values[0], "sec_struct, sec_struct",
-            "Category names in ouput dataframe don't match expected: {}.".format(test_aai_["Category"]))
-        self.assertEqual(test_pySAR.feature_space, (261, 932),
-            "Expected feature space dimensions to be 261 x 932, got {}.".format(test_pySAR.feature_space))
-        for col in test_aai_.columns:
+        test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity
+        test_aai_enantioselectivity = test_pySAR_enantioselectivity.encode_aai(aai_indices=aa_indices_2, print_results=0, output_folder=self.test_output_folder)
+        self.assertIsInstance(test_aai_enantioselectivity, pd.DataFrame,
+            'Output should be a DataFrame, got {}.'.format(type(test_aai_enantioselectivity)))
+        self.assertEqual(len(test_aai_enantioselectivity.columns), 8,
+            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_enantioselectivity.columns)))
+        self.assertEqual(test_aai_enantioselectivity['Index'].values[0], "CHOP780207, GEIM800104",
+            "Index codes in output dataframe don't match expected: {}.".format(test_aai_enantioselectivity["Index"]))
+        self.assertEqual(test_aai_enantioselectivity['Category'].values[0], "sec_struct, sec_struct",
+            "Category names in output dataframe don't match expected: {}.".format(test_aai_enantioselectivity["Category"]))
+        self.assertEqual(test_pySAR_enantioselectivity.feature_space, (152, 796),
+            "Expected feature space dimensions to be 152 x 796, got {}.".format(test_pySAR_enantioselectivity.feature_space))
+        for col in test_aai_enantioselectivity.columns:
             self.assertIn(col, expected_output_cols,
-                "Col {} not found in list of expected columns:\n{}".format(col, expected_output_cols))
+                "Col {} not found in list of expected columns:\n{}.".format(col, expected_output_cols))
             if (col == "Index" or col == "Category"):
-                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_enantioselectivity[col].values)),
+                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_enantioselectivity[col])))
             else:
-                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_enantioselectivity[col].values)),
+                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_enantioselectivity[col])))
 #3.)
-        test_aai_ = test_pySAR.encode_aai(aai_indices=aa_indices_3, print_results=0, output_folder=self.test_output_folder)
-        self.assertIsInstance(test_aai_, pd.DataFrame,
-            'Output should be a DataFrame, got {}.'.format(type(test_aai_)))
-        self.assertEqual(len(test_aai_.columns), 8,
-            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_.columns)))
-        self.assertEqual(test_aai_['Index'].values[0], "CHAM810101, ISOY800103",
-            "Index codes in ouput dataframe don't match expected: {}.".format(test_aai_["Index"]))
-        self.assertEqual(test_aai_['Category'].values[0], "geometry, sec_struct",
-            "Category names in ouput dataframe don't match expected: {}.".format(test_aai_["Category"]))
-        self.assertEqual(test_pySAR.feature_space, (261, 932),
-            "Expected feature space dimensions to be 261 x 932, got {}.".format(test_pySAR.feature_space))
-        for col in test_aai_.columns:
+        test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption
+        test_aai_absorption = test_pySAR_absorption.encode_aai(aai_indices=aa_indices_3, print_results=0, output_folder=self.test_output_folder)
+        self.assertIsInstance(test_aai_absorption, pd.DataFrame,
+            'Output should be a DataFrame, got {}.'.format(type(test_aai_absorption)))
+        self.assertEqual(len(test_aai_absorption.columns), 8,
+            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_absorption.columns)))
+        self.assertEqual(test_aai_absorption['Index'].values[0], "CHAM810101, ISOY800103",
+            "Index codes in output dataframe don't match expected: {}.".format(test_aai_absorption["Index"]))
+        self.assertEqual(test_aai_absorption['Category'].values[0], "geometry, sec_struct",
+            "Category names in output dataframe don't match expected: {}.".format(test_aai_absorption["Category"]))
+        self.assertEqual(test_pySAR_absorption.feature_space, (81, 596),
+            "Expected feature space dimensions to be 81 x 596, got {}.".format(test_pySAR_absorption.feature_space))
+        for col in test_aai_absorption.columns:
             self.assertIn(col, expected_output_cols,
-                "Col {} not found in list of expected columns:\n{}".format(col, expected_output_cols))
+                "Col {} not found in list of expected columns:\n{}.".format(col, expected_output_cols))
             if (col == "Index" or col == "Category"):
-                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_absorption[col].values)),
+                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_absorption[col])))
             else:
-                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_absorption[col].values)),
+                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_absorption[col])))
 #4.)
-        test_aai_ = test_pySAR.encode_aai(aai_indices=aa_indices_4, print_results=0, output_folder=self.test_output_folder)
-        self.assertIsInstance(test_aai_, pd.DataFrame,
-            'Output should be a DataFrame, got {}.'.format(type(test_aai_)))
-        self.assertEqual(len(test_aai_.columns), 8,
-            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_.columns)))
-        self.assertEqual(test_aai_['Index'].values[0], "PTIO830101, QIAN880136, RACS820110",
-            "Index codes in ouput dataframe don't match expected: {}.".format(test_aai_["Index"]))
-        self.assertEqual(test_aai_['Category'].values[0], "sec_struct, sec_struct, geometry",
-            "Category names in ouput dataframe don't match expected: {}.".format(test_aai_["Category"]))
-        self.assertEqual(test_pySAR.feature_space, (261, 1398),
-            "Expected feature space dimensions to be 261 x 1398, got {}.".format(test_pySAR.feature_space))
-        for col in test_aai_.columns:
+        test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[3]) #localization
+        test_aai_localization = test_pySAR_localization.encode_aai(aai_indices=aa_indices_4, print_results=0, output_folder=self.test_output_folder)
+        self.assertIsInstance(test_aai_localization, pd.DataFrame,
+            'Output should be a DataFrame, got {}.'.format(type(test_aai_localization)))
+        self.assertEqual(len(test_aai_localization.columns), 8,
+            "Expected 8 columns in dataframe output, got {}.".format(len(test_aai_localization.columns)))
+        self.assertEqual(test_aai_localization['Index'].values[0], "PTIO830101, QIAN880136, RACS820110",
+            "Index codes in output dataframe don't match expected: {}.".format(test_aai_localization["Index"]))
+        self.assertEqual(test_aai_localization['Category'].values[0], "sec_struct, sec_struct, geometry",
+            "Category names in output dataframe don't match expected: {}.".format(test_aai_localization["Category"]))
+        self.assertEqual(test_pySAR_localization.feature_space, (254, 1083),
+            "Expected feature space dimensions to be 254 x 1083, got {}.".format(test_pySAR_localization.feature_space))
+        for col in test_aai_localization.columns:
             self.assertIn(col, expected_output_cols,
                 "Col {} not found in list of expected columns:\n{}".format(col, expected_output_cols))
             if (col == "Index" or col == "Category"):
-                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, str) for row in list(test_aai_localization[col].values)),
+                    "Column {} expected to be of type string got {}.".format(col, type(test_aai_localization[col])))
             else:
-                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_[col].values)),
-                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_[col])))
+                self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_localization[col].values)),
+                    "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_localization[col])))
 #5.)
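The two blocks below pin down the input-validation contract for `encode_aai`: malformed or unknown accession numbers raise a `ValueError`, while non-string/non-list inputs raise a `TypeError`. Illustratively:

```python
try:
    test_pySAR.encode_aai(aai_indices="ABCD1234")   #not a valid AAI record number
except ValueError as err:
    print("rejected:", err)

try:
    test_pySAR.encode_aai(aai_indices=123)          #wrong type entirely
except TypeError as err:
    print("rejected:", err)
```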
- with self.assertRaises(ValueError, msg='ValueError: Indices parameter cannot be None or an empty string.'): - test_pySAR.encode_aai(aai_indices=None) - test_pySAR.encode_aai(aai_indices="") - test_pySAR.encode_aai() - test_pySAR.encode_aai(aai_indices=error_aaindices) - test_pySAR.encode_aai(aai_indices=error_aaindices1) + with self.assertRaises(ValueError, msg='ValueError: Indices parameter cannot be None, an empty string or an invalid AAI record number.'): + test_pySAR_thermostability.encode_aai(aai_indices=None) + test_pySAR_thermostability.encode_aai(aai_indices="") + test_pySAR_enantioselectivity.encode_aai() + test_pySAR_enantioselectivity.encode_aai(aai_indices=error_aaindices) + test_pySAR_localization.encode_aai(aai_indices=error_aaindices1) #6.) with self.assertRaises(TypeError, msg='TypeError: Indices must be lists or strings.'): - test_pySAR.encode_aai(aai_indices=123) - test_pySAR.encode_aai(aai_indices=0.90) - test_pySAR.encode_aai(aai_indices=False) - test_pySAR.encode_aai(aai_indices=9000) + test_pySAR_localization.encode_aai(aai_indices=123) + test_pySAR_localization.encode_aai(aai_indices=0.90) + test_pySAR_absorption.encode_aai(aai_indices=False) + test_pySAR_absorption.encode_aai(aai_indices=9000) def test_get_desc_encoding(self): """ Testing Descriptor encoding functionality. """ @@ -541,16 +546,15 @@ def test_get_desc_encoding(self): desc_2 = "ctd_transition" desc_3 = "moranauto, quasi_seq_order" all_desc = [desc_1, desc_2, "geary_auto", "sequence_order_coupling_number"] - - test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability #1.) - desc_encoding = test_pySAR.get_descriptor_encoding(desc_1) - - self.assertIsInstance(desc_encoding, pd.DataFrame, - 'Descriptor Encoding output expected to be a DataFrame, got datatype {}.'.format(type(desc_encoding))) - self.assertEqual(desc_encoding.shape, (test_pySAR.num_seqs, 400), - 'Shape of descriptor encoding expected to be ({}), but got {}.'.format((test_pySAR.num_seqs, 400), desc_encoding.shape)) - for col in list(desc_encoding.columns): + test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability + desc_encoding_thermostability = test_pySAR_thermostability.get_descriptor_encoding(desc_1) + + self.assertIsInstance(desc_encoding_thermostability, pd.DataFrame, + 'Expected encoding output to be a DataFrame, got datatype {}.'.format(type(desc_encoding_thermostability))) + self.assertEqual(desc_encoding_thermostability.shape, (test_pySAR_thermostability.num_seqs, 400), + 'Expected shape of descriptor encoding expected to be {}, but got {}.'.format((test_pySAR_thermostability.num_seqs, 400), desc_encoding_thermostability.shape)) + for col in list(desc_encoding_thermostability.columns): #check all columns follow pattern of XY where x & y are amino acids self.assertTrue(bool(re.match(r'^[A-Z]{2}$', col)), "Column doesn't follow correct naming convention: {}.".format(col)) @@ -558,111 +562,159 @@ def test_get_desc_encoding(self): "1st half of column name {} is not a valid amino acid.".format(col[0])) self.assertIn(col[1], self.amino_acids, "2nd half of column name {} is not a valid amino acid.".format(col[0])) - self.assertTrue(all(col == np.float64 for col in list(desc_encoding.dtypes)), - "Descriptor values expected to be type np.float64, got:\n{}".format(list(desc_encoding.dtypes))) + self.assertTrue(all(col == np.float64 for col in list(desc_encoding_thermostability.dtypes)), + "Descriptor values expected to be type np.float64, 
got:\n{}.".format(list(desc_encoding_thermostability.dtypes))) #2.) - desc_encoding = test_pySAR.get_descriptor_encoding(desc_2) + test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[0]) #enantioselectivity + desc_encoding_enantioselectivity = test_pySAR_enantioselectivity.get_descriptor_encoding(desc_2) - self.assertIsInstance(desc_encoding, pd.DataFrame, - 'Descriptor Encoding output expected to be a DataFrame, got datatype {}.'.format(type(desc_encoding))) - self.assertEqual(desc_encoding.shape, (test_pySAR.num_seqs, 3), - 'Shape of descriptor encoding expected to be ({}), but got {}.'.format((test_pySAR.num_seqs, 3), desc_encoding.shape)) - for col in list(desc_encoding.columns): + self.assertIsInstance(desc_encoding_enantioselectivity, pd.DataFrame, + 'Expected descriptor encoding output to be a DataFrame, got datatype {}.'.format(type(desc_encoding_enantioselectivity))) + self.assertEqual(desc_encoding_enantioselectivity.shape, (test_pySAR_enantioselectivity.num_seqs, 3), + 'Shape of descriptor encoding expected to be {}, but got {}.'.format((test_pySAR_enantioselectivity.num_seqs, 3), desc_encoding_enantioselectivity.shape)) + for col in list(desc_encoding_enantioselectivity.columns): #check all column names follow pattern for CTD descriptor self.assertTrue((bool(re.search(r"CTD_[A-Z]_[0-9]{2}_hydrophobicity", col))), "Column doesn't follow correct naming convention: {}.".format(col)) - self.assertTrue(all(col == np.float64 for col in list(desc_encoding.dtypes)), - "Descriptor values expected to be type np.float64, got:\n{}".format(list(desc_encoding.dtypes))) + self.assertTrue(all(col == np.float64 for col in list(desc_encoding_enantioselectivity.dtypes)), + "Descriptor values expected to be type np.float64, got:\n{}.".format(list(desc_encoding_enantioselectivity.dtypes))) #3.) 
- desc_encoding = test_pySAR.get_descriptor_encoding(desc_3) + test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[0]) #localization + desc_encoding_localization = test_pySAR_localization.get_descriptor_encoding(desc_3) - self.assertIsInstance(desc_encoding, pd.DataFrame, - 'Descriptor Encoding output expected to be a DataFrame, got datatype {}.'.format(type(desc_encoding))) - self.assertEqual(desc_encoding.shape, (test_pySAR.num_seqs, 240+50), #MAuto dim + QSO dim - 'Shape of descriptor encoding expected to be ({}), but got {}.'.format((test_pySAR.num_seqs, 240+50), desc_encoding.shape)) + self.assertIsInstance(desc_encoding_localization, pd.DataFrame, + 'Expected descriptor encoding output to be a DataFrame, got datatype {}.'.format(type(desc_encoding_localization))) + self.assertEqual(desc_encoding_localization.shape, (test_pySAR_localization.num_seqs, 240+50), #MAuto dim + QSO dim + 'Shape of descriptor encoding expected to be {}, but got {}.'.format((test_pySAR_localization.num_seqs, 240+50), desc_encoding_localization.shape)) #check all column names follow pattern for MAuto + QSO descriptors - for col in list(desc_encoding.columns): + for col in list(desc_encoding_localization.columns): self.assertTrue(bool(re.match(r"MAuto_[A-Z0-9]{10}_[0-9]", col)) or bool(re.match(r"QSO_SW[0-9]", col)) \ or bool(re.match(r"QSO_SW[0-9][0-9]", col)), "Column doesn't follow correct naming convention: {}.".format(col)) - self.assertTrue(all(col == np.float64 for col in list(desc_encoding.dtypes)), - "Descriptor values expected to be type np.float64, got:\n{}".format(list(desc_encoding.dtypes))) + self.assertTrue(all(col == np.float64 for col in list(desc_encoding_localization.dtypes)), + "Descriptor values expected to be type np.float64, got:\n{}.".format(list(desc_encoding_localization.dtypes))) #4.) 
-        desc_encoding = test_pySAR.get_descriptor_encoding(all_desc)
+        test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption
+        desc_encoding_absorption = test_pySAR_absorption.get_descriptor_encoding(all_desc)
-        self.assertIsInstance(desc_encoding, pd.DataFrame,
-            'Descriptor Encoding output expected to be a DataFrame, got datatype {}.'.format(type(desc_encoding)))
-        self.assertEqual(desc_encoding.shape, (test_pySAR.num_seqs, 400+3+240+30), #DPComp dim + CTD_T dim + Gauto dim + QSO dim
-            'Shape of descriptor encoding expected to be ({}), but got {}.'.format((test_pySAR.num_seqs, 400+3+240+30), desc_encoding.shape))
+        self.assertIsInstance(desc_encoding_absorption, pd.DataFrame,
+            'Expected descriptor encoding output to be a DataFrame, got datatype {}.'.format(type(desc_encoding_absorption)))
+        self.assertEqual(desc_encoding_absorption.shape, (test_pySAR_absorption.num_seqs, 400+3+240+30), #DPComp dim + CTD_T dim + Gauto dim + QSO dim
+            'Shape of descriptor encoding expected to be {}, but got {}.'.format((test_pySAR_absorption.num_seqs, 400+3+240+30), desc_encoding_absorption.shape))
         #check all column names follow pattern for DPComp + CTD_T + Gauto + QSO descriptors
-        for col in list(desc_encoding.columns):
+        for col in list(desc_encoding_absorption.columns):
             self.assertTrue(bool(re.match(r"GAuto_[A-Z0-9]{10}_[0-9]", col)) or bool(re.match(r'^[A-Z]{2}$', col)) or bool(re.match(r"SOCN_SW[0-9]", col))
                 or bool(re.match(r"QSO_SW[0-9][0-9]", col)) or bool(re.match(r"CTD_T_[0-9]_hydrophobicity", col)) or bool(re.match(r"CTD_T_[0-9]{2}_hydrophobicity", col)),
                 "Column doesn't follow correct naming convention: {}.".format(col))
-        self.assertTrue(all(col == np.float64 for col in list(desc_encoding.dtypes)),
-            "Descriptor values expected to be type np.float64, got:\n{}".format(list(desc_encoding.dtypes)))
+        self.assertTrue(all(col == np.float64 for col in list(desc_encoding_absorption.dtypes)),
+            "Descriptor values expected to be type np.float64, got:\n{}.".format(list(desc_encoding_absorption.dtypes)))
#5.)
         with self.assertRaises(ValueError, msg='ValueError: Descriptor input parameter cannot be None.'):
-            test_pySAR.get_descriptor_encoding(descriptors=None)
-            test_pySAR.get_descriptor_encoding(descriptors="")
-            test_pySAR.get_descriptor_encoding(descriptors=[])
+            test_pySAR_thermostability.get_descriptor_encoding(descriptors=None)
+            test_pySAR_enantioselectivity.get_descriptor_encoding(descriptors="")
+            test_pySAR_enantioselectivity.get_descriptor_encoding(descriptors=[])
#6.)
         with self.assertRaises(TypeError, msg='ValueError: Descriptor input parameter cannot be an invalid descriptor name.'):
-            test_pySAR.get_descriptor_encoding(descriptor=123)
-            test_pySAR.get_descriptor_encoding(descriptor=0.90)
-            test_pySAR.get_descriptor_encoding(descriptor=False)
-            test_pySAR.get_descriptor_encoding(descriptor=9000)
+            test_pySAR_localization.get_descriptor_encoding(descriptor=123)
+            test_pySAR_localization.get_descriptor_encoding(descriptor=0.90)
+            test_pySAR_absorption.get_descriptor_encoding(descriptor=False)
+            test_pySAR_absorption.get_descriptor_encoding(descriptor=9000)

-    def test_desc_encoding(self):
+    def test_desc_encoding(self): #*rewrite and expand tests
         """ Testing Descriptor encoding pipeline.
""" desc_1 = "dipeptide_composition" desc_2 = "ctd_distribution" desc_3 = "seq_order_coupling_number" desc_4 = "moranauto, quasi_seq_order" - all_desc = [desc_1, desc_2, desc_3, "moranauto, quasi_seq_order"] - expected_output_cols = ['Descriptor', 'Group', 'R2', 'RMSE', 'MSE', - 'RPD', 'MAE', 'Explained Variance'] - expected_descriptor_feature_space = [(261, 400), (261, 15), (261, 30), (261, 290)] - - test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability + expected_output_cols = ['Descriptor', 'Group', 'R2', 'RMSE', 'MSE', 'RPD', 'MAE', 'Explained Variance'] #1.) - for de in range(0, len(all_desc)): - test_desc = test_pySAR.encode_descriptor(descriptors=all_desc[de], print_results=0, output_folder=self.test_output_folder) - self.assertIsInstance(test_desc, pd.DataFrame, - 'Output should be a DataFrame, got {}.'.format(type(test_desc))) - self.assertEqual(len(test_desc), 1, - "Expected 1 row in encoding output, got {}.".format(len(test_desc))) - for col in test_desc.columns: - self.assertIn(col, expected_output_cols, - "Col {} not found in list of expected columns:\n{}".format(col, expected_output_cols)) - if (col == "Descriptor" or col == "Group"): - self.assertTrue(all(isinstance(row, str) for row in list(test_desc[col].values)), - "Column {} expected to be of type string got {}.".format(col, type(test_desc[col]))) - else: - self.assertTrue(all(isinstance(row, np.float64) for row in list(test_desc[col].values)), - "Column {} expected to be of type np.float64 got {}.".format(col, type(test_desc[col]))) - self.assertEqual(test_pySAR.feature_space, expected_descriptor_feature_space[de], - "Expected feature space dimensions to be 261 x 466, got {}.".format(test_pySAR.feature_space)) - + test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability + test_desc_thermostability = test_pySAR_thermostability.encode_descriptor(descriptors=desc_1, print_results=0, output_folder=self.test_output_folder) + + self.assertIsInstance(test_desc_thermostability, pd.DataFrame, 'Expected output to be a DataFrame, got {}.'.format(type(test_desc_thermostability))) + self.assertEqual(len(test_desc_thermostability), 1, "Expected 1 row in encoding output, got {}.".format(len(test_desc_thermostability))) + for col in test_desc_thermostability.columns: + self.assertIn(col, expected_output_cols, + "Col {} not found in list of expected columns:\n{}.".format(col, expected_output_cols)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_desc_thermostability[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_desc_thermostability[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_desc_thermostability[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_desc_thermostability[col]))) + self.assertEqual(test_pySAR_thermostability.feature_space, (261, 400), + "Expected feature space dimensions to be 261 x 466, got {}.".format(test_pySAR_thermostability.feature_space)) +#2.) 
+ test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity + test_desc_enantioselectivity = test_pySAR_enantioselectivity.encode_descriptor(descriptors=desc_1, print_results=0, output_folder=self.test_output_folder) + + self.assertIsInstance(test_desc_enantioselectivity, pd.DataFrame, 'Expected output to be a DataFrame, got {}.'.format(type(test_desc_enantioselectivity))) + self.assertEqual(len(test_desc_enantioselectivity), 1, "Expected 1 row in encoding output, got {}.".format(len(test_desc_enantioselectivity))) + for col in test_desc_enantioselectivity.columns: + self.assertIn(col, expected_output_cols, + "Col {} not found in list of expected columns:\n{}.".format(col, expected_output_cols)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_desc_enantioselectivity[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_desc_enantioselectivity[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_desc_enantioselectivity[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_desc_enantioselectivity[col]))) + self.assertEqual(test_pySAR_enantioselectivity.feature_space, (152, 400), + "Expected feature space dimensions to be 152 x 400, got {}.".format(test_pySAR_enantioselectivity.feature_space)) +#3.) + test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption + test_desc_absorption = test_pySAR_absorption.encode_descriptor(descriptors=desc_1, print_results=0, output_folder=self.test_output_folder) + + self.assertIsInstance(test_desc_absorption, pd.DataFrame, 'Expected output to be a DataFrame, got {}.'.format(type(test_desc_absorption))) + self.assertEqual(len(test_desc_absorption), 1, "Expected 1 row in encoding output, got {}.".format(len(test_desc_absorption))) + for col in test_desc_absorption.columns: + self.assertIn(col, expected_output_cols, + "Col {} not found in list of expected columns:\n{}.".format(col, expected_output_cols)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_desc_absorption[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_desc_absorption[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_desc_absorption[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_desc_absorption[col]))) + self.assertEqual(test_pySAR_absorption.feature_space, (81, 400), + "Expected feature space dimensions to be 81 x 400, got {}.".format(test_pySAR_absorption.feature_space)) +#4.) 
+ test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[3]) #localization + test_desc_localization = test_pySAR_localization.encode_descriptor(descriptors=desc_1, print_results=0, output_folder=self.test_output_folder) + + self.assertIsInstance(test_desc_localization, pd.DataFrame, 'Expected output to be a DataFrame, got {}.'.format(type(test_desc_localization))) + self.assertEqual(len(test_desc_localization), 1, "Expected 1 row in encoding output, got {}.".format(len(test_desc_localization))) + for col in test_desc_localization.columns: + self.assertIn(col, expected_output_cols, + "Col {} not found in list of expected columns:\n{}.".format(col, expected_output_cols)) + if (col == "Descriptor" or col == "Group"): + self.assertTrue(all(isinstance(row, str) for row in list(test_desc_localization[col].values)), + "Column {} expected to be of type string got {}.".format(col, type(test_desc_localization[col]))) + else: + self.assertTrue(all(isinstance(row, np.float64) for row in list(test_desc_localization[col].values)), + "Column {} expected to be of type np.float64 got {}.".format(col, type(test_desc_localization[col]))) + self.assertEqual(test_pySAR_localization.feature_space, (254, 400), + "Expected feature space dimensions to be 254 x 400, got {}.".format(test_pySAR_localization.feature_space)) + self.assertTrue(os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME), - "Output dir storing encoding results not found: {}".format(self.test_output_folder + "_" + _globals.CURRENT_DATETIME)) + "Output dir storing encoding results not found: {}.".format(self.test_output_folder + "_" + _globals.CURRENT_DATETIME)) self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "desc_results.csv")), - "Output csv storing encoding results not found: {}".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "desc_results.csv"))) + "Output csv storing encoding results not found: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "desc_results.csv"))) self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png")), - "Output regression plot not found: {}".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png"))) -#2.) + "Output regression plot not found: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png"))) +#5.) with self.assertRaises(ValueError, msg='ValueError: Descriptor parameter cannot be None or an empty string.'): - test_pySAR.encode_descriptor(descriptors=None) - test_pySAR.encode_descriptor(descriptors="") - test_pySAR.encode_descriptor(descriptors="invalid_descriptor") - test_pySAR.encode_descriptor(descriptors="blahblahblah") -#3.) + test_pySAR_thermostability.encode_descriptor(descriptors=None) + test_pySAR_thermostability.encode_descriptor(descriptors="") + test_pySAR_enantioselectivity.encode_descriptor(descriptors="invalid_descriptor") + test_pySAR_enantioselectivity.encode_descriptor(descriptors="blahblahblah") +#6.) 
         with self.assertRaises(TypeError, msg='TypeError: Descriptor parameter has to be a strong or list.'):
-            test_pySAR.encode_descriptor(descriptors=123)
-            test_pySAR.encode_descriptor(descriptors=0.90)
-            test_pySAR.encode_descriptor(descriptors=False)
-            test_pySAR.encode_descriptor(descriptors=9000)
+            test_pySAR_absorption.encode_descriptor(descriptors=123)
+            test_pySAR_absorption.encode_descriptor(descriptors=0.90)
+            test_pySAR_localization.encode_descriptor(descriptors=False)
+            test_pySAR_localization.encode_descriptor(descriptors=9000)

     def test_aai_desc_encoding(self):
         """ Testing AAI + Descriptor encoding functionality. """
@@ -676,10 +728,9 @@ def test_aai_desc_encoding(self):
         desc_4 = ["moran_auto", "quasi_seq_order"]
         expected_output_cols = ['Descriptor', 'Group', 'Index', 'Category', 'R2', 'RMSE',
             'MSE', 'RPD', 'MAE', 'Explained Variance']
-
-        test_pySAR = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability
#1.)
-        test_aai_desc = test_pySAR.encode_aai_descriptor(descriptors=desc_1, aai_indices=aa_indices_1, print_results=0, output_folder=self.test_output_folder)
+        test_pySAR_thermostability = pysar.PySAR(config_file=self.all_config_files[0]) #thermostability
+        test_aai_desc = test_pySAR_thermostability.encode_aai_descriptor(descriptors=desc_1, aai_indices=aa_indices_1, print_results=0, output_folder=self.test_output_folder)
         self.assertIsInstance(test_aai_desc, pd.DataFrame,
             'Expected output to be a DataFrame, got {}.'.format(type(test_aai_desc)))
         self.assertEqual(len(test_aai_desc.columns), 10,
@@ -693,10 +744,12 @@ def test_aai_desc_encoding(self):
         else:
             self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_desc[col].values)),
                 "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_desc[col])))
-        self.assertEqual(test_pySAR.feature_space, (261, 486),
-            "Expected feature space dimensions to be 261 x 486, got {}.".format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_thermostability.feature_space, (261, 486),
+            "Expected feature space dimensions to be 261 x 486, got {}.".format(test_pySAR_thermostability.feature_space))
#2.)
-        test_aai_desc = test_pySAR.encode_aai_descriptor(descriptors=desc_2, aai_indices=aa_indices_2, print_results=0, output_folder=self.test_output_folder)
+        test_pySAR_enantioselectivity = pysar.PySAR(config_file=self.all_config_files[1]) #enantioselectivity
+        test_aai_desc = test_pySAR_enantioselectivity.encode_aai_descriptor(descriptors=desc_2, aai_indices=aa_indices_2, print_results=0, output_folder=self.test_output_folder)
+
         self.assertIsInstance(test_aai_desc, pd.DataFrame, #**add more tests , directly testing output of columns
             'Output expected to be a DataFrame, got {}.'.format(type(test_aai_desc)))
         self.assertEqual(len(test_aai_desc.columns), 10,
@@ -710,10 +763,12 @@ def test_aai_desc_encoding(self):
         else:
             self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_desc[col].values)),
                 "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_desc[col])))
-        self.assertEqual(test_pySAR.feature_space, (261, 481),
-            "Expected feature space dimensions to be 261 x 481, got {}.".format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_enantioselectivity.feature_space, (152, 413),
+            "Expected feature space dimensions to be 152 x 413, got {}.".format(test_pySAR_enantioselectivity.feature_space))
#3.)
-        test_aai_desc = test_pySAR.encode_aai_descriptor(descriptors=desc_3, aai_indices=aa_indices_3, print_results=0, output_folder=self.test_output_folder)
+        test_pySAR_absorption = pysar.PySAR(config_file=self.all_config_files[2]) #absorption
+        test_aai_desc = test_pySAR_absorption.encode_aai_descriptor(descriptors=desc_3, aai_indices=aa_indices_3, print_results=0, output_folder=self.test_output_folder)
+
         self.assertIsInstance(test_aai_desc, pd.DataFrame,
             'Output expected to be a DataFrame, got {}.'.format(type(test_aai_desc)))
         self.assertEqual(len(test_aai_desc.columns), 10,
@@ -727,10 +782,12 @@ def test_aai_desc_encoding(self):
         else:
             self.assertTrue(all(isinstance(row, np.float64) for row in list(test_aai_desc[col].values)),
                 "Column {} expected to be of type np.float64 got {}.".format(col, type(test_aai_desc[col])))
-        self.assertEqual(test_pySAR.feature_space, (261, 809),
-            "Expected feature space dimensions to be 261 x 809, got {}.".format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_absorption.feature_space, (81, 641),
+            "Expected feature space dimensions to be 81 x 641, got {}.".format(test_pySAR_absorption.feature_space))
#4.)
-        test_aai_desc = test_pySAR.encode_aai_descriptor(descriptors=desc_4, aai_indices=aa_indices_4, print_results=0, output_folder=self.test_output_folder)
+        test_pySAR_localization = pysar.PySAR(config_file=self.all_config_files[3]) #localization
+        test_aai_desc = test_pySAR_localization.encode_aai_descriptor(descriptors=desc_4, aai_indices=aa_indices_4, print_results=0, output_folder=self.test_output_folder)
+
         self.assertIsInstance(test_aai_desc, pd.DataFrame,
             'Output expected to be a DataFrame, got {}.'.format(type(test_aai_desc)))
         self.assertEqual(len(test_aai_desc.columns), 10,
@@ -750,29 +807,29 @@ def test_aai_desc_encoding(self):
             "Output csv storing encoding results not found: {}".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "aai_desc_results.csv")))
         self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png")),
             "Output regression plot not found: {}".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, "model_regression_plot.png")))
-        self.assertEqual(test_pySAR.feature_space, (261, 1688),
-            "Expected feature space dimensions to be 261 x 1688, got {}.".format(test_pySAR.feature_space))
+        self.assertEqual(test_pySAR_localization.feature_space, (254, 1373),
+            "Expected feature space dimensions to be 254 x 1373, got {}.".format(test_pySAR_localization.feature_space))
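
A minimal sketch of the combined AAI + descriptor encoding exercised here. The AAIndex accession (`LIFS790103`) and descriptor names are taken from this test module; the config path and output folder are placeholders.

```python
import pySAR.pySAR as pysar  #assumed import alias

test_pySAR = pysar.PySAR(config_file="config/thermostability.json")  #placeholder path

#both parameters accept a string or a list; the feature space is the
#concatenation of the AAI-encoded sequences and the descriptor features
aai_desc = test_pySAR.encode_aai_descriptor(aai_indices="LIFS790103",
    descriptors=["moran_auto", "quasi_seq_order"], print_results=0, output_folder="aai_desc_output")

#one-row DataFrame with the 10 metadata/metric columns asserted above, e.g.
#Descriptor, Group, Index, Category, R2, RMSE, MSE, RPD, MAE, Explained Variance
print(aai_desc.columns.tolist())
print(test_pySAR.feature_space)  #(num_seqs, AAI dims + descriptor dims)
```

#5.)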
with self.assertRaises(ValueError, msg='ValueError: Descriptor and indices parameter cannot both be None or an empty string.'): - test_pySAR.encode_aai_descriptor(descriptors=None) - test_pySAR.encode_aai_descriptor(aai_indices=None) - test_pySAR.encode_aai_descriptor(descriptors="aa_comp") - test_pySAR.encode_aai_descriptor(aai_indices="LIFS790103") - test_pySAR.encode_aai_descriptor(aai_indices=None, descriptors=None) - test_pySAR.encode_aai_descriptor(aai_indices="", descriptors="") - test_pySAR.encode_aai_descriptor(descriptors="invalid_descriptor") - test_pySAR.encode_aai_descriptor(aai_indices="invalid_value") - test_pySAR.encode_aai_descriptor(descriptors="descriptor not found") - test_pySAR.encode_aai_descriptor(aai_indices="blahblahblah") + test_pySAR_thermostability.encode_aai_descriptor(descriptors=None) + test_pySAR_thermostability.encode_aai_descriptor(aai_indices=None) + test_pySAR_thermostability.encode_aai_descriptor(descriptors="aa_comp") + test_pySAR_thermostability.encode_aai_descriptor(aai_indices="LIFS790103") + test_pySAR_enantioselectivity.encode_aai_descriptor(aai_indices=None, descriptors=None) + test_pySAR_enantioselectivity.encode_aai_descriptor(aai_indices="", descriptors="") + test_pySAR_enantioselectivity.encode_aai_descriptor(descriptors="invalid_descriptor") + test_pySAR_enantioselectivity.encode_aai_descriptor(aai_indices="invalid_value") + test_pySAR_localization.encode_aai_descriptor(descriptors="descriptor not found") + test_pySAR_localization.encode_aai_descriptor(aai_indices="blahblahblah") #6.) with self.assertRaises(TypeError, msg='ValueError: Descriptor and indices must be lists or strings.'): - test_pySAR.encode_aai_descriptor(descriptors=123, aai_indices=123) - test_pySAR.encode_aai_descriptor(descriptors=0000, aai_indices=0.90) - test_pySAR.encode_aai_descriptor(descriptors=False, aai_indices=True) - test_pySAR.encode_aai_descriptor(descriptors=2.9, aai_indices=9000) + test_pySAR_localization.encode_aai_descriptor(descriptors=123, aai_indices=123) + test_pySAR_localization.encode_aai_descriptor(descriptors=0000, aai_indices=0.90) + test_pySAR_absorption.encode_aai_descriptor(descriptors=False, aai_indices=True) + test_pySAR_absorption.encode_aai_descriptor(descriptors=2.9, aai_indices=9000) def tearDown(self): - """ Delete any temp files or folders created during testing process. """ + """ Delete any temp files or folders created during test case. """ #removing any of the temp files created such as the results files, if you want to verify the results files # are actually being created thencomment out the below code block if (os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME)): diff --git a/tests/test_utils.py b/tests/test_utils.py index 5f6fb83..9ba47ea 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,11 +1,14 @@ ################################################################################ ################# Utilities Module Tests ################# ################################################################################ + import os import shutil import unittest import numpy as np -# np.warnings.filterwarnings('error', category=np.VisibleDeprecationWarning) +#suppress sklearn warnings +import warnings +warnings.filterwarnings("ignore") import pandas as pd import pySAR.globals_ as _globals @@ -31,36 +34,25 @@ class UtilsTest(unittest.TestCase): testing correct utils.Map class functionality. """ def setUp(self): - """ Import all test datasets. 
""" - try: - self.test_dataset1 = pd.read_csv(os.path.join('tests', 'test_data', + """ Import all test datasets from test_data folder. """ + self.test_dataset1 = pd.read_csv(os.path.join('tests', 'test_data', 'test_thermostability.txt'), sep=",", header=0) - except: - raise IOError('Error reading in test_thermostability.txt.') - try: - self.test_dataset2 = pd.read_csv(os.path.join('tests', 'test_data', + self.test_dataset2 = pd.read_csv(os.path.join('tests', 'test_data', 'test_enantioselectivity.txt'), sep=",", header=0) - except: - raise IOError('Error reading in test_enantioselectivity.txt.') - try: - self.test_dataset3 = pd.read_csv(os.path.join('tests', 'test_data', + self.test_dataset3 = pd.read_csv(os.path.join('tests', 'test_data', 'test_localization.txt'), sep=",", header=0) - except: - raise IOError('Error reading in test_localization.txt.') - try: - self.test_dataset4 = pd.read_csv(os.path.join('tests', 'test_data', + self.test_dataset4 = pd.read_csv(os.path.join('tests', 'test_data', 'test_absorption.txt'), sep=",", header=0) - except: - raise IOError('Error reading in test_absorption.txt.') #append all datasets to a list self.all_test_datasets = [self.test_dataset1, self.test_dataset2, self.test_dataset3, self.test_dataset4] - #temporary unit test output folder + #create temporary unit test output folder self.test_output_folder = os.path.join("tests", "test_outputs") - os.makedirs(self.test_output_folder + "_" + _globals.CURRENT_DATETIME ) - + if not (os.path.isdir(self.test_output_folder)): + os.makedirs(self.test_output_folder) + def test_valid_sequence(self): """ Test Valid/Invalid Sequences utility function. """ invalid_seqs = [["A", "B", "C", "D"], ["E", "F", "J"]] @@ -68,14 +60,10 @@ def test_valid_sequence(self): invalid_seqs2 = [["Z", 2, "Y", "X", 321]] invalid_seqs3 = "XXZXXZXXZ" #1.) - self.assertIsNotNone(utils.valid_sequence(invalid_seqs), - "Valid sequence function should not return None.") - self.assertIsNotNone(utils.valid_sequence(invalid_seqs1), - "Valid sequence function should not return None.") - self.assertIsNotNone(utils.valid_sequence(invalid_seqs2), - "Valid sequence function should not return None.") - self.assertIsNotNone(utils.valid_sequence(invalid_seqs3), - "Valid sequence function should not return None.") + self.assertIsNotNone(utils.valid_sequence(invalid_seqs), "Valid sequence function should not return None.") + self.assertIsNotNone(utils.valid_sequence(invalid_seqs1), "Valid sequence function should not return None.") + self.assertIsNotNone(utils.valid_sequence(invalid_seqs2), "Valid sequence function should not return None.") + self.assertIsNotNone(utils.valid_sequence(invalid_seqs3), "Valid sequence function should not return None.") #2.) 
         self.assertEqual(len(utils.valid_sequence(invalid_seqs)), 2,
             "Expected 2 outputs from from valid sequence function, got {}.".format(len(utils.valid_sequence(invalid_seqs))))
@@ -101,14 +89,10 @@ def test_valid_sequence(self):
         random_seq2 = np.random.randint(0, len(self.test_dataset3))
         random_seq3 = np.random.randint(0, len(self.test_dataset4))

-        self.assertIsNone(utils.valid_sequence(self.test_dataset1['sequence'][random_seq]),
-            "Valid sequence function should return None")
-        self.assertIsNone(utils.valid_sequence(self.test_dataset2['sequence'][random_seq1]),
-            "Valid sequence function should return None")
-        self.assertIsNone(utils.valid_sequence(self.test_dataset3['sequence'][random_seq2]),
-            "Valid sequence function should return None")
-        self.assertIsNone(utils.valid_sequence(self.test_dataset4['sequence'][random_seq3]),
-            "Valid sequence function should return None")
+        self.assertIsNone(utils.valid_sequence(self.test_dataset1['sequence'][random_seq]), "Valid sequence function should return None.")
+        self.assertIsNone(utils.valid_sequence(self.test_dataset2['sequence'][random_seq1]), "Valid sequence function should return None.")
+        self.assertIsNone(utils.valid_sequence(self.test_dataset3['sequence'][random_seq2]), "Valid sequence function should return None.")
+        self.assertIsNone(utils.valid_sequence(self.test_dataset4['sequence'][random_seq3]), "Valid sequence function should return None.")

     def test_remove_gaps(self):
         """ Test utility function that removes any gaps from sequences. """
@@ -118,40 +102,26 @@ def test_remove_gaps(self):
         seq4 = "YUJBVFGHYJ---ASD"
#1.)
         seq1_test = utils.remove_gaps(seq1)
-        self.assertEqual(len(seq1_test), 1,
-            "Expected length of output to be 1, got {}.".format(len(seq1_test)))
-        self.assertEqual(len(seq1_test[0]), 4,
-            "Expected length of output to be 4, got {}.".format(len(seq1_test[0])))
-        self.assertIsInstance(seq1_test, list,
-            "Expected output to be of type list, got {}.".format(type(seq1_test)))
-        self.assertNotIn('-', seq1_test,
-            "There shouldn't be any gaps (-) in the sequence.")
+        self.assertEqual(len(seq1_test), 1, "Expected length of output to be 1, got {}.".format(len(seq1_test)))
+        self.assertEqual(len(seq1_test[0]), 4, "Expected length of output to be 4, got {}.".format(len(seq1_test[0])))
+        self.assertIsInstance(seq1_test, list, "Expected output to be of type list, got {}.".format(type(seq1_test)))
+        self.assertNotIn('-', seq1_test, "Expected there to be no gaps (-) in the sequence.")
#2.)
         seq2_test = utils.remove_gaps(seq2)
-        self.assertEqual(len(seq2_test), 1,
-            "Expected length of output to be 1, got {}.".format(len(seq2_test)))
-        self.assertEqual(len(seq2_test[0]), 6,
-            "Expected length of output to be 6, got {}.".format(len(seq2_test)))
-        self.assertIsInstance(seq2_test, list,
-            "Expected output to be of type list, got {}.".format(type(seq2_test)))
-        self.assertNotIn('-', seq2_test,
-            "There shouldn't be any gaps (-) in the sequence.")
+        self.assertEqual(len(seq2_test), 1, "Expected length of output to be 1, got {}.".format(len(seq2_test)))
+        self.assertEqual(len(seq2_test[0]), 6, "Expected length of output to be 6, got {}.".format(len(seq2_test[0])))
+        self.assertIsInstance(seq2_test, list, "Expected output to be of type list, got {}.".format(type(seq2_test)))
+        self.assertNotIn('-', seq2_test, "Expected there to be no gaps (-) in the sequence.")
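
A short sketch of the behaviour these remove_gaps assertions pin down. The string case uses `seq4` defined above and matches the length assertion in case #4 below; the list case is inferred from the length/type assertions and its input sequence is hypothetical.

```python
import pySAR.utils as utils  #assumed import path, mirroring pySAR.globals_ above

#str in -> str out, gap characters stripped
utils.remove_gaps("YUJBVFGHYJ---ASD")   #-> "YUJBVFGHYJASD" (length 13)

#list in -> list out: each sequence in the list cleaned (illustrative input)
utils.remove_gaps(["AB-CD-"])           #-> ["ABCD"]
```

#3.)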
         seq3_test = utils.remove_gaps(seq3)
-        self.assertEqual(len(seq3_test), 10,
-            "Expected length of output to be 10, got {}.".format(len(seq3_test)))
-        self.assertIsInstance(seq3_test, str,
-            "Expected output to be of type str, got {}.".format(len(seq3_test)))
-        self.assertNotIn('-', seq3_test,
-            "There shouldn't be any gaps (-) in the sequence.")
+        self.assertEqual(len(seq3_test), 10, "Expected length of output to be 10, got {}.".format(len(seq3_test)))
+        self.assertIsInstance(seq3_test, str, "Expected output to be of type str, got {}.".format(type(seq3_test)))
+        self.assertNotIn('-', seq3_test, "Expected there to be no gaps (-) in the sequence.")
#4.)
         seq4_test = utils.remove_gaps(seq4)
-        self.assertEqual(len(seq4_test), 13,
-            "Expected length of output to be 13, got {}.".format(len(seq4_test)))
-        self.assertIsInstance(seq4_test, str,
-            "Expected output to be of type str, got {}.".format(len(seq4_test)))
-        self.assertNotIn('-', seq4_test,
-            "There shouldn't be any gaps (-) in the sequence.")
+        self.assertEqual(len(seq4_test), 13, "Expected length of output to be 13, got {}.".format(len(seq4_test)))
+        self.assertIsInstance(seq4_test, str, "Expected output to be of type str, got {}.".format(type(seq4_test)))
+        self.assertNotIn('-', seq4_test, "Expected there to be no gaps (-) in the sequence.")

     def test_flatten(self):
         """ Test flatten utility function that flattens an array or list. """
@@ -160,48 +130,34 @@ def test_flatten(self):
         seq3 = np.random.randint(10,90,(4,5,2))
         seq4 = ["A", "B", "C", "D", "E", "F"]
         seq5 = "TUVWXYZ"
-#1.)
+#1.)
         flattened_array = utils.flatten(seq1)
-        self.assertEqual(flattened_array.shape, (6,1),
-            "Expected output shape to be (6,1), got {}.".format(flattened_array.shape))
-        self.assertIsInstance(flattened_array, np.ndarray,
-            "Expected output to be of type np.ndarray, got {}.".format(type(flattened_array)))
-        self.assertEqual(flattened_array.ndim, 2,
-            "Expected 2 output dimensions, got {}.".format(flattened_array.ndim))
+        self.assertEqual(flattened_array.shape, (6,1), "Expected output shape to be (6,1), got {}.".format(flattened_array.shape))
+        self.assertIsInstance(flattened_array, np.ndarray, "Expected output to be of type np.ndarray, got {}.".format(type(flattened_array)))
+        self.assertEqual(flattened_array.ndim, 2, "Expected 2 output dimensions, got {}.".format(flattened_array.ndim))
         self.assertTrue((np.array([[1],[2],[3],[4],[5],[6]]) == flattened_array).all(),
-            "Output array doesn't match expected:\n{}".format(flattened_array))
+            "Output array doesn't match expected:\n{}.".format(flattened_array))
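
In summary, the flatten behaviour asserted in these cases amounts to the following; the `utils` import path is assumed from this test module.

```python
import numpy as np
import pySAR.utils as utils  #assumed import path

#2D/3D numpy arrays are flattened into an (n, 1) column vector
utils.flatten(np.array([[1, 2, 3], [4, 5, 6]]))   #-> shape (6, 1)

#flat lists and plain strings pass through unchanged, per cases #4 and #5 below
utils.flatten(["A", "B", "C", "D", "E", "F"])     #-> same list back
utils.flatten("TUVWXYZ")                          #-> "TUVWXYZ"
```

#2.)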
flattened_array_2 = utils.flatten(seq2) - self.assertEqual(flattened_array_2.shape, (9,1), - "Expected output shape to be (9,1), got {}.".format(flattened_array_2.shape)) - self.assertIsInstance(flattened_array_2, np.ndarray, - "Expected output to be of type np.ndarray, got {}.".format(type(flattened_array_2))) - self.assertEqual(flattened_array_2.ndim, 2, - "Expected 2 output dimensions, got {}.".format(flattened_array_2.ndim)) + self.assertEqual(flattened_array_2.shape, (9,1), "Expected output shape to be (9,1), got {}.".format(flattened_array_2.shape)) + self.assertIsInstance(flattened_array_2, np.ndarray, "Expected output to be of type np.ndarray, got {}.".format(type(flattened_array_2))) + self.assertEqual(flattened_array_2.ndim, 2, "Expected 2 output dimensions, got {}.".format(flattened_array_2.ndim)) self.assertTrue((np.array([[1],[2],[3],[4],[5],[6],[7],[8],[9]]) == flattened_array_2).all(), - "Output array doesn't match expected:\n{}".format(flattened_array_2)) + "Output array doesn't match expected:\n{}.".format(flattened_array_2)) #3.) flattened_array_3 = utils.flatten(seq3) - self.assertEqual(flattened_array_3.shape, (40,1), - "Expected output shape to be (40,1), got {}.".format(flattened_array_3.shape)) - self.assertIsInstance(flattened_array_3, np.ndarray, - "Expected output to be of type np.ndarray, got {}.".format(type(flattened_array_3))) - self.assertEqual(flattened_array_3.ndim, 2, - "Expected 2 output dimensions, got {}.".format(flattened_array_3.ndim)) + self.assertEqual(flattened_array_3.shape, (40,1), "Expected output shape to be (40,1), got {}.".format(flattened_array_3.shape)) + self.assertIsInstance(flattened_array_3, np.ndarray, "Expected output to be of type np.ndarray, got {}.".format(type(flattened_array_3))) + self.assertEqual(flattened_array_3.ndim, 2, "Expected 2 output dimensions, got {}.".format(flattened_array_3.ndim)) #4.) flattened_array_4 = utils.flatten(seq4) - self.assertEqual(len(flattened_array_4), 6, - "Expected length of output to be 6, got {}.".format(len(flattened_array_4))) - self.assertIsInstance(flattened_array_4, list, - "Expected output to be of type list, got {}.".format(type(flattened_array_4))) - self.assertEqual(flattened_array_4, seq4, - "Output doesn't match expected sequence {}".format(seq4)) + self.assertEqual(len(flattened_array_4), 6, "Expected length of output to be 6, got {}.".format(len(flattened_array_4))) + self.assertIsInstance(flattened_array_4, list, "Expected output to be of type list, got {}.".format(type(flattened_array_4))) + self.assertEqual(flattened_array_4, seq4, "Output doesn't match expected sequence {}.".format(seq4)) #5.) flattened_array_5 = utils.flatten(seq5) - self.assertEqual(flattened_array_5, seq5, - "Output doesn't match expected sequence {}".format(seq5)) - self.assertIsInstance(flattened_array_5, str, - "Expected output to be of type string, got {}.".format(type(flattened_array_5))) + self.assertEqual(flattened_array_5, seq5, "Output doesn't match expected sequence {}.".format(seq5)) + self.assertIsInstance(flattened_array_5, str, "Expected output to be of type string, got {}.".format(type(flattened_array_5))) def test_zero_padding(self): """ Test zero padding utility function that pads an array or list with 0's. 
""" @@ -217,146 +173,109 @@ def test_zero_padding(self): "Expected length of output to be 361, got {}.".format(len(test_dataset3_padded[seq]))) self.assertIsInstance(test_dataset3_padded[seq], str, "Expected output to be of type string, got {}.".format(type(test_dataset3_padded[seq]))) - + self.assertIsInstance(test_dataset3_padded, pd.Series, "Expected output to be of type Series, got {}.".format(type(test_dataset3_padded))) self.assertEqual(test_dataset3_padded.shape[0], 254, "Expected number of sequences to be 254, got {}.".format(test_dataset3_padded[0])) #2.) padded_seqs1 = utils.zero_padding(seq1) - self.assertEqual(len(padded_seqs1), 2, - "Expected length of output to be 2, got {}.".format(len(padded_seqs1))) - self.assertEqual(len(padded_seqs1[0]), 5, - "Expected length of output to be 5, got {}.".format(len(padded_seqs1[0]))) - self.assertIsInstance(padded_seqs1, np.ndarray, - "Expected output to be of type numpy array, got {}.".format(type(padded_seqs1))) + self.assertEqual(len(padded_seqs1), 2, "Expected length of output to be 2, got {}.".format(len(padded_seqs1))) + self.assertEqual(len(padded_seqs1[0]), 5, "Expected length of output to be 5, got {}.".format(len(padded_seqs1[0]))) + self.assertIsInstance(padded_seqs1, np.ndarray, "Expected output to be of type numpy array, got {}.".format(type(padded_seqs1))) #3.) padded_seqs2 = utils.zero_padding(seq2) - self.assertEqual(len(padded_seqs2), 3, - "Expected length of output to be 3, got {}.".format(len(padded_seqs2))) - self.assertEqual(len(padded_seqs2[0]), 4, - "Expected length of output to be 4, got {}.".format(len(padded_seqs2[0]))) - self.assertIsInstance(padded_seqs2, list, - "Expected output to be of type list, got {}.".format(type(padded_seqs2))) + self.assertEqual(len(padded_seqs2), 3, "Expected length of output to be 3, got {}.".format(len(padded_seqs2))) + self.assertEqual(len(padded_seqs2[0]), 4, "Expected length of output to be 4, got {}.".format(len(padded_seqs2[0]))) + self.assertIsInstance(padded_seqs2, list, "Expected output to be of type list, got {}.".format(type(padded_seqs2))) #4.) padded_seqs3 = utils.zero_padding(seq3) - self.assertEqual(len(padded_seqs3), 4, - "Expected length of output to be 4, got {}.".format(len(padded_seqs3))) - self.assertEqual(padded_seqs3.shape, (4,5,2), - "Expected output to be of shape (4,5,2), got {}.".format(padded_seqs3.shape)) - self.assertIsInstance(padded_seqs3, np.ndarray, - "Expected output to be of type numpy array, got {}.".format(type(padded_seqs3))) - self.assertTrue(padded_seqs3.any() == seq3.any(), - "Expected original and padded sequences to have the same values.") + self.assertEqual(len(padded_seqs3), 4, "Expected length of output to be 4, got {}.".format(len(padded_seqs3))) + self.assertEqual(padded_seqs3.shape, (4,5,2), "Expected output to be of shape (4,5,2), got {}.".format(padded_seqs3.shape)) + self.assertIsInstance(padded_seqs3, np.ndarray, "Expected output to be of type numpy array, got {}.".format(type(padded_seqs3))) + self.assertTrue(padded_seqs3.any() == seq3.any(), "Expected original and padded sequences to have the same values.") def test_save_results(self): """ Testing save results utility function. """ #1.) 
#create dummy test results, save to csv and verify csv has been created & saved test_results = {'R2': 0.56, 'MSE': 0.34, 'RMSE': 0.89} - utils.save_results(test_results, 'test_results', output_folder=self.test_output_folder) - self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results.csv')), - "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results.csv'))) + utils.save_results(test_results, 'test_results', output_folder=os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder))) + self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results.csv')), + "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results.csv'))) #2.) test_results1 = {'MAE': 2.10, 'MSE': 0.99, 'RPD': 1.28} - utils.save_results(test_results1, 'test_results1', output_folder=self.test_output_folder) - self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results1.csv')), - "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results1.csv'))) + utils.save_results(test_results1, 'test_results1', output_folder=os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder))) + self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results1.csv')), + "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results1.csv'))) #3.) test_results2 = pd.DataFrame(np.random.randint(1,100, size=(5,3)), columns=['R2', 'MSE', 'RMSE']) - utils.save_results(test_results2, 'test_results2', output_folder=self.test_output_folder) - self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results2.csv')), - "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results2.csv'))) + utils.save_results(test_results2, 'test_results2', output_folder=os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder))) + self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results2.csv')), + "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results2.csv'))) #4.) 
test_results3 = pd.Series(np.random.randint(1,100), index=['Col1', 'Col2', 'Col3', 'Col4']) - utils.save_results(test_results3, 'test_results3', output_folder=self.test_output_folder) - self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results3.csv')), - "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results3.csv'))) + utils.save_results(test_results3, 'test_results3', output_folder=os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder))) + self.assertTrue(os.path.isfile(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results3.csv')), + "Output results csv not found in output folder: {}.".format(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results3.csv'))) #5.) test_results4 = np.random.randint(1, 100, size=(2,6)) with self.assertRaises(TypeError, msg='Type Error raised, invalid input parameter data type given.'): - utils.save_results(test_results4, 'test_results4') - self.assertFalse(os.path.isfile(os.path.join(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, 'test_results4.csv')), - "") + utils.save_results(test_results4, 'test_results4', output_folder=os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder))) + self.assertFalse(os.path.isfile(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results4.csv')), + "Output results csv should not be found in output folder: {}.".format(os.path.join(self.test_output_folder, os.path.basename(self.test_output_folder) + "_" + _globals.CURRENT_DATETIME, 'test_results4.csv'))) def test_map(self): """ Testing Map class which allows for a dict to be accessed via dot notation. """ #1.) test_map1 = utils.Map({"first_name":"Joe", "last_name":"Bloggs", "country":"Ireland", "city":"Dublin"}) - self.assertIsInstance(test_map1, dict, - "Expected instance to be of type dict, got {}.".format(type(test_map1))) - self.assertEqual(test_map1.first_name, "Joe", - "Expected Joe, got {}.".format(test_map1.first_name)) - self.assertEqual(test_map1.last_name, "Bloggs", - "Expected Bloggs, got {}.".format(test_map1.last_name)) - self.assertEqual(test_map1.country, "Ireland", - "Expected Ireland, got {}.".format(test_map1.country)) - self.assertEqual(test_map1.city, "Dublin", - "Expected Dublin, got {}.".format(test_map1.city)) - self.assertEqual(len(test_map1), 4, - "Expected output length to be 4, got {}.".format(len(test_map1))) + self.assertIsInstance(test_map1, dict, "Expected instance to be of type dict, got {}.".format(type(test_map1))) + self.assertEqual(test_map1.first_name, "Joe", "Expected Joe, got {}.".format(test_map1.first_name)) + self.assertEqual(test_map1.last_name, "Bloggs", "Expected Bloggs, got {}.".format(test_map1.last_name)) + self.assertEqual(test_map1.country, "Ireland", "Expected Ireland, got {}.".format(test_map1.country)) + self.assertEqual(test_map1.city, "Dublin", "Expected Dublin, got {}.".format(test_map1.city)) + self.assertEqual(len(test_map1), 4, "Expected output length to be 4, got {}.".format(len(test_map1))) #2.) 
test_map2 = utils.Map({"first_name":"John", "last_name":"Smith"}, country="Germany", city="Hanover") - self.assertIsInstance(test_map2, dict, - "Expected instance to be of type dict, got {}.".format(type(test_map2))) - self.assertEqual(test_map2.first_name, "John", - "Expected John, got {}.".format(test_map2.first_name)) - self.assertEqual(test_map2.last_name, "Smith", - "Expected Smith, got {}.".format(test_map2.last_name)) - self.assertEqual(test_map2.country, "Germany", - "Expected Germany, got {}.".format(test_map2.country)) - self.assertEqual(test_map2.city, "Hanover", - "Expected Hanover, got {}.".format(test_map2.city)) - self.assertEqual(len(test_map2), 4, - "Expected output length to be 4, got {}.".format(len(test_map2))) + self.assertIsInstance(test_map2, dict, "Expected instance to be of type dict, got {}.".format(type(test_map2))) + self.assertEqual(test_map2.first_name, "John", "Expected John, got {}.".format(test_map2.first_name)) + self.assertEqual(test_map2.last_name, "Smith", "Expected Smith, got {}.".format(test_map2.last_name)) + self.assertEqual(test_map2.country, "Germany", "Expected Germany, got {}.".format(test_map2.country)) + self.assertEqual(test_map2.city, "Hanover", "Expected Hanover, got {}.".format(test_map2.city)) + self.assertEqual(len(test_map2), 4, "Expected output length to be 4, got {}.".format(len(test_map2))) #3.) test_map3 = utils.Map({}) - self.assertIsInstance(test_map3, dict, - "Expected instance to be of type dict, got {}.".format(type(test_map3))) - self.assertEqual(test_map3, {}, - "Expected an empty dict, got {}.".format(test_map3)) - self.assertEqual(len(test_map3), 0, - "Expected output length to be 0, got {}.".format(len(test_map3))) + self.assertIsInstance(test_map3, dict, "Expected instance to be of type dict, got {}.".format(type(test_map3))) + self.assertEqual(test_map3, {}, "Expected an empty dict, got {}.".format(test_map3)) + self.assertEqual(len(test_map3), 0, "Expected output length to be 0, got {}.".format(len(test_map3))) #4.) 
test_map1.language = "Python" test_map1["age"] = 42 - self.assertEqual(test_map1.language, "Python", - "Expected Python, got {}.".format(test_map1.language)) - self.assertEqual(test_map1.age, 42, - "Expected 42, got {}.".format(test_map1.age)) - self.assertEqual(len(test_map1), 6, - "Expected output length to be 6, got {}.".format(len(test_map1))) + self.assertEqual(test_map1.language, "Python", "Expected Python, got {}.".format(test_map1.language)) + self.assertEqual(test_map1.age, 42, "Expected 42, got {}.".format(test_map1.age)) + self.assertEqual(len(test_map1), 6, "Expected output length to be 6, got {}.".format(len(test_map1))) test_map2.language = "C++" test_map2.age = 20 - self.assertEqual(test_map2.language, "C++", - "Expected C++, got {}.".format(test_map2.language)) - self.assertEqual(test_map2.age, 20, - "Expected 20, got {}.".format(test_map2.age)) - self.assertEqual(len(test_map2), 6, - "Expected output length to be 6, got {}.".format(len(test_map2))) + self.assertEqual(test_map2.language, "C++", "Expected C++, got {}.".format(test_map2.language)) + self.assertEqual(test_map2.age, 20, "Expected 20, got {}.".format(test_map2.age)) + self.assertEqual(len(test_map2), 6, "Expected output length to be 6, got {}.".format(len(test_map2))) test_map3.language = "Ruby" test_map3.age = 99 - self.assertEqual(test_map3.language, "Ruby", - "Expected Ruby, got {}.".format(test_map3.language)) - self.assertEqual(test_map3.age, 99, - "Expected 99, got {}.".format(test_map3.age)) - self.assertEqual(len(test_map3), 2, - "Expected output length to be 2, got {}.".format(len(test_map3))) + self.assertEqual(test_map3.language, "Ruby", "Expected Ruby, got {}.".format(test_map3.language)) + self.assertEqual(test_map3.age, 99, "Expected 99, got {}.".format(test_map3.age)) + self.assertEqual(len(test_map3), 2, "Expected output length to be 2, got {}.".format(len(test_map3))) #5.) del test_map1.first_name - self.assertEqual(len(test_map1), 5, - "Expected output length to be 5, got {}.".format(len(test_map1))) + self.assertEqual(len(test_map1), 5, "Expected output length to be 5, got {}.".format(len(test_map1))) del test_map1.country - self.assertEqual(len(test_map1), 4, - "Expected output length to be 4, got {}.".format(len(test_map1))) + self.assertEqual(len(test_map1), 4, "Expected output length to be 4, got {}.".format(len(test_map1))) del test_map3.language - self.assertEqual(len(test_map3), 1, - "Expected output length to be 1, got {}.".format(len(test_map3))) + self.assertEqual(len(test_map3), 1, "Expected output length to be 1, got {}.".format(len(test_map3))) #6.) with self.assertRaises(TypeError): utils.Map(1245) @@ -370,8 +289,5 @@ def tearDown(self): del self.test_dataset3 del self.test_dataset4 - #removing any of the temp files created such as the results files, if - #you want to verify the results files are actually being created then - #comment out the below code block. - if (os.path.isdir(self.test_output_folder + "_" + _globals.CURRENT_DATETIME)): - shutil.rmtree(self.test_output_folder + "_" + _globals.CURRENT_DATETIME, ignore_errors=False, onerror=None) \ No newline at end of file + #removing any of the temp files created such as the results files/outputs + shutil.rmtree(self.test_output_folder , ignore_errors=False, onerror=None) \ No newline at end of file