From 69711236d64b5cc6461f50a1039fe49b61cd101b Mon Sep 17 00:00:00 2001 From: Antonios Sarikas Date: Tue, 28 May 2024 22:29:47 +0300 Subject: [PATCH] Add unit tests for VoxelDataset --- docs/source/changelog.rst | 12 ++ docs/source/conf.py | 1 + docs/source/tutorial.rst | 2 +- src/moxel/data.py | 127 ++++++++++-------- src/moxel/utils.py | 20 ++- tests/test_data.py | 89 +++++++++++- .../clean_names.json | 0 .../clean_voxels.npy | Bin tests/toy_dataset/dummy.csv | 11 ++ .../{dummy_voxels => toy_dataset}/names.json | 0 tests/toy_dataset/test.json | 3 + tests/toy_dataset/train.json | 6 + tests/toy_dataset/validation.json | 3 + .../{dummy_voxels => toy_dataset}/voxels.npy | Bin 14 files changed, 205 insertions(+), 69 deletions(-) rename tests/{dummy_voxels => toy_dataset}/clean_names.json (100%) rename tests/{dummy_voxels => toy_dataset}/clean_voxels.npy (100%) create mode 100644 tests/toy_dataset/dummy.csv rename tests/{dummy_voxels => toy_dataset}/names.json (100%) create mode 100644 tests/toy_dataset/test.json create mode 100644 tests/toy_dataset/train.json create mode 100644 tests/toy_dataset/validation.json rename tests/{dummy_voxels => toy_dataset}/voxels.npy (100%) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 481d405..d66b51b 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,18 @@ |:pushpin:| Changelog ===================== +Version x.x.x +------------- + +.. versionchanged:: x.x.x + + * Parameter ``out_dirname`` replaces ``out_pathname`` in + :func:`utils.voxels_from_files` and :func:`utils.voxels_from_dir`. + +.. 
versionadded:: x.x.x + + * CLI for RetNet + Version 0.1.2 ------------- diff --git a/docs/source/conf.py b/docs/source/conf.py index ff28918..fdce468 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -40,6 +40,7 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = 'sphinx_rtd_theme' +#html_logo = 'images/moxel_logo.svg' #html_static_path = ['_static'] # Path to GitHub repo {group}/{project} (note that `group` is the GitHub user or organization) diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 96cb5e6..736f11e 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -113,7 +113,7 @@ processed. .. code-tab:: python >>> from moxel.utils import voxels_from_dir - >>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_pathname='path/to/batch') + >>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_dirname='path/to/batch') .. code-tab:: console :caption: CLI diff --git a/src/moxel/data.py b/src/moxel/data.py index 4b70141..f7563a9 100644 --- a/src/moxel/data.py +++ b/src/moxel/data.py @@ -1,3 +1,19 @@ +# This file is part of MOXελ. +# Copyright (C) 2023-2024 Antonios P. Sarikas + +# MOXελ is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + r""" Write the docstring of the module. 
""" @@ -5,6 +21,8 @@ import os import json from pathlib import Path +import numpy as np +import pandas as pd import torch from torch.utils.data import Dataset, random_split from . utils import load_json @@ -12,12 +30,33 @@ def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1): r""" - Split a list of materials' names into train, validation and test sets. + Split voxels into train, validation and test sets. .. warning:: + * You should use this function **after** :func:`utils.batch_clean`. - * No directory is created by :func:`prepare_data`. **All ``.json`` - files are stored under the directory containing ``source``**. + * No directory is created by :func:`prepare_data`. + * All ``.json`` files are stored under the directory containing ``source``. + + Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be + used for training, validation and testing. + + Parameters + ---------- + source: str + Pathname to the file holding the names of the materials. + split_ratio: sequence of size 3, default=(0.8, 0.1, 0.1) + The sizes or fractions of splits to be produced. + + * ``train == split_ratio[0]``. + * ``val == split_ratio[1]``. + * ``test == split_ratio[2]``. + + seed : int, default=1 + Controls the randomness of the ``rng`` used for splitting. + + Examples + -------- Before the split:: @@ -25,6 +64,8 @@ def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1): ├──clean_voxels.npy └──clean_names.json + >>> prepare_data('path/to/voxels_data/clean_names.json') # doctest: SKIP + After the split:: voxels_data @@ -33,22 +74,6 @@ def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1): ├──train.json ├──validation.json └──test.json - - Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be - used for training, validation and testing. - - Parameters - ---------- - source: str - Pathname to the file holding the names of the materials - (``clean_names.json``). 
- split_ratio: array-like of shape (3,), default=(0.8, 0.1, 0.1) - The sizes or fractions of splits to be produced. - * ``split_ratio[0] == train``. - * ``split_ratio[1] == validation``. - * ``split_ratio[2] == test``. - seed : int, default=1 - Controls the randomness of the ``rng`` used for splitting. """ rng = torch.Generator().manual_seed(seed) path = Path(source).parent @@ -68,6 +93,11 @@ class VoxelDataset(Dataset): r""" Dataset for voxels. + .. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms + + .. tip:: + See `transforms`_ for implementing your own transforms. + Parameters ---------- path_to_indices: str @@ -76,28 +106,20 @@ class VoxelDataset(Dataset): Pathname to the ``.npy`` file holding the voxels. path_to_names: str, optional Pathname to the ``.json`` file holding the names of the materials. No - effect if ``path_to_Y == None``. + effect if ``path_to_Y=None``. path_to_Y : str, optional Pathname to the ``.csv`` file holding the labels of the voxels. index_col : str, optional - Column name of the ``.csv`` file to be used as row labels. The values - (names) under this column must follow the same naming scheme as in + Column name of the ``.csv`` file to be used as row labels. The names + (values) under this column must follow the same naming scheme as in ``clean_names.json``. labels : list, optional - List containing the names of the properties to be predicted. No effect - if ``path_to_Y == None``. + List containing the column names of the ``.csv`` to be used as labels. + No effect if ``path_to_Y=None``. transform_x : callable, optional - Transforms applied to input. See `transforms`_ for implementing your own - transforms. + Transforms applied to input. transform_y : callable, optional - Transforms applied to output. See `transforms`_ for implementing your - own transforms. No effect if ``pcd_Y == None``. - - .. 
note:: - For example, if you want to perform classification, here you can - pass the one-hot encoder (if the dataset is not already preprocessed). - - .. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms + Transforms applied to output. No effect if ``path_to_Y=None``. """ def __init__( self, path_to_indices, path_to_X, @@ -106,7 +128,7 @@ def __init__( transform_x=None, transform_y=None, ): - if (labels is not None) and (type(labels) != list): + if (labels is not None) and (type(labels) is not list): raise ValueError('labels must be a list!') self.path_to_X = path_to_X @@ -124,17 +146,21 @@ def __init__( self.X = None self.Y = None - @property - def voxel_indices(self): - return self._voxel_indices - def __len__(self): - return len(self.voxel_indices) + return len(self._voxel_indices) def __getitem__(self, idx): - # Account for np.load and multiprocessing. if self.X is None: - self.X = np.load(self.path_to_X, mmap_mode='r') + # Load and add a channel dimension to voxels array. + self.X = np.load(self.path_to_X, mmap_mode='r')[:, None] + + voxel_idx = self._voxel_indices[idx] + sample_x = torch.tensor(self.X[voxel_idx], dtype=torch.float) + + if self.transform_x is not None: + sample_x = self.transform_x(sample_x) + + # Only for labeled datasets. if self.Y is None and self.path_to_Y is not None: self.Y = pd.read_csv( self.path_to_Y, @@ -142,22 +168,13 @@ def __getitem__(self, idx): usecols=[*self.labels, self.index_col], ) - sample_x = self.X[name] - - if self.transform_x is not None: - sample_x = self.transform_x(sample_x) - - # Only for labeled datasets. 
if self.Y is not None: - name = self._voxel_names[idx] - sample_y = self.Y.loc[name].values + name = self._voxel_names[voxel_idx] + sample_y = torch.tensor(self.Y.loc[name].values, dtype=torch.float) if self.transform_y is not None: sample_y = self.transform_y(sample_y) - return ( - torch.tensor(sample_x, dtype=torch.float), - torch.tensor(sample_y, dtype=torch.float) - ) + return sample_x, sample_y - return torch.tensor(sample_x, dtype=torch.float) + return sample_x diff --git a/src/moxel/utils.py b/src/moxel/utils.py index fbec050..462c19a 100644 --- a/src/moxel/utils.py +++ b/src/moxel/utils.py @@ -166,7 +166,7 @@ def calculate(self, cubic_box=False, length=30, potential='lj', n_jobs=None): If ``True``, the simulation box is cubic. length : float, default=30 The size of the cubic box in Å. Takes effect only - if ``cubic_box == True``. + if ``cubic_box=True``. n_jobs : int, optional Number of jobs to run in parallel. If ``None``, then the number returned by ``os.cpu_count()`` is used. @@ -217,7 +217,7 @@ def lj_potential(self, coords): Parameters ---------- coordinates : array_like of shape (3,) - If ``cubic_box == True`` cartesian. Else, fractional. + If ``cubic_box=True`` cartesian. Else, fractional. Returns ------- @@ -234,10 +234,8 @@ def lj_potential(self, coords): self.cutoff, zip_results=False, ) - ''' - Need to check for length of r_ij because of - https://github.com/materialsproject/pymatgen/issues/3794 - ''' + # Need to check for length of r_ij because of + # https://github.com/materialsproject/pymatgen/issues/3794 if len(r_ij) == 0: # No neighbor, zero energy. return 1. @@ -247,7 +245,7 @@ def lj_potential(self, coords): es_j = self._lj_params[indices] x = (0.5 * (es_j[:, 1] + self.sigma)) / r_ij e = 4 * np.sqrt(es_j[:, 0] * self.epsilon) - energy = sum(e * (x**12 - x**6)) + energy = np.sum(e * (x**12 - x**6)) # This should be changed with clipping in future versions. return np.exp(-(1 / 298) * energy) # For numerical stability. 
@@ -276,7 +274,7 @@ def voxels_from_file( cubic_box : bool, default=False If ``True``, the simulation box is cubic. length : float, default=30 - The size of the cubic box in Å. Takes effect only if ``cubic_box == True``. + The size of the cubic box in Å. Takes effect only if ``cubic_box=True``. n_jobs : int, optional Number of jobs to run in parallel. If ``None``, then the number returned by ``os.cpu_count()`` is used. @@ -286,7 +284,7 @@ def voxels_from_file( Returns ------- out : ``array`` or :class:`Grid` - If ``only_voxels == True``, array of shape ``(grid_size,)*3``. + If ``only_voxels=True``, array of shape ``(grid_size,)*3``. Otherwise, :class:`Grid`. Notes @@ -343,7 +341,7 @@ def voxels_from_files( cubic_box : bool, default=False If ``True``, the simulation box is cubic. length : float, default=30 - The size of the cubic box in Å. Takes effect only if ``cubic_box == True``. + The size of the cubic box in Å. Takes effect only if ``cubic_box=True``. n_jobs : int, optional Number of jobs to run in parallel. If ``None``, then the number returned by ``os.cpu_count()`` is used. @@ -417,7 +415,7 @@ def voxels_from_dir( cubic_box : bool, default=False If ``True``, the simulation box is cubic. length : float, default=30 - The size of the cubic box in Å. Takes effect only if ``cubic_box == True``. + The size of the cubic box in Å. Takes effect only if ``cubic_box=True``. n_jobs : int, optional Number of jobs to run in parallel. If ``None``, then the number returned by ``os.cpu_count()`` is used. 
diff --git a/tests/test_data.py b/tests/test_data.py index f581705..b9aac22 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -19,8 +19,12 @@ import unittest import tempfile from itertools import combinations -from moxel.data import prepare_data +import numpy as np +import pandas as pd +import torch +from torch.utils.data import DataLoader from moxel.utils import load_json +from moxel.data import prepare_data, VoxelDataset class TestPrepareData(unittest.TestCase): @@ -64,7 +68,88 @@ def tearDown(self): class TestVoxelsDataset(unittest.TestCase): - ... + def setUp(self): + # The grid size is 5 and the train size is 4. + self.path_to_indices = 'tests/toy_dataset/train.json' + self.path_to_X = 'tests/toy_dataset/clean_voxels.npy' + self.dummy_tfm_x = lambda x: x + 100 + self.ds_indices = load_json(self.path_to_indices) + self.ds_voxels = np.load(self.path_to_X, mmap_mode='r')[:, None] + self.batch_size = 2 + + # For labeled dataset (optional). + self.path_to_names = 'tests/toy_dataset/clean_names.json' + self.path_to_Y = 'tests/toy_dataset/dummy.csv' + self.index_col = 'id' + self.labels = ['y_1', 'y_3'] + self.dummy_tfm_y = lambda x: x - 500 + + def test_unlabeled_dataset(self): + ds = VoxelDataset( + path_to_indices=self.path_to_indices, + path_to_X=self.path_to_X, + transform_x=self.dummy_tfm_x, + ) + + dl = DataLoader(ds, batch_size=self.batch_size) + + # Check that it has correct size. + self.assertEqual(len(ds), 4) + + # Check that it works with the dataloader. + for x in dl: + self.assertEqual(x.shape, (self.batch_size, 1, 5, 5, 5)) # Shape (B, C, D, H, W). + self.assertIs(x.dtype, torch.float) + + # Check that transforms are correctly applied. 
+ for i in range(len(ds)): + transformed_x = ds[i] + raw_x = torch.tensor(self.ds_voxels[self.ds_indices[i]]) + self.assertTrue(torch.equal(transformed_x, self.dummy_tfm_x(raw_x))) + + def test_labeled_dataset(self): + ds = VoxelDataset( + path_to_indices=self.path_to_indices, + path_to_X=self.path_to_X, + path_to_names=self.path_to_names, + path_to_Y=self.path_to_Y, + index_col=self.index_col, + labels=self.labels, + transform_x=self.dummy_tfm_x, + transform_y=self.dummy_tfm_y, + ) + + dl = DataLoader(ds, batch_size=self.batch_size) + + material_names = load_json(self.path_to_names) + + df = pd.read_csv( + self.path_to_Y, + index_col=self.index_col, + usecols=[self.index_col, *self.labels], + ) + + # Check that it has correct size. + self.assertEqual(len(ds), 4) + + # Check that it works with the dataloader. + for x, y in dl: + self.assertEqual(x.shape, (self.batch_size, 1, 5, 5, 5)) # Shape (B, C, D, H, W). + self.assertIs(x.dtype, torch.float) + self.assertEqual(y.shape, (self.batch_size, len(self.labels))) # Shape (B, n_out). + self.assertIs(y.dtype, torch.float) + + # Check that transforms are correctly applied. 
+ for i in range(len(ds)): + idx = self.ds_indices[i] + name = material_names[idx] + + transformed_x, transformed_y = ds[i] + raw_x = torch.tensor(self.ds_voxels[idx]) + raw_y = torch.tensor(df.loc[name, self.labels].values) + + self.assertTrue(torch.equal(transformed_x, self.dummy_tfm_x(raw_x))) + self.assertTrue(torch.equal(transformed_y, self.dummy_tfm_y(raw_y))) if __name__ == '__main__': diff --git a/tests/dummy_voxels/clean_names.json b/tests/toy_dataset/clean_names.json similarity index 100% rename from tests/dummy_voxels/clean_names.json rename to tests/toy_dataset/clean_names.json diff --git a/tests/dummy_voxels/clean_voxels.npy b/tests/toy_dataset/clean_voxels.npy similarity index 100% rename from tests/dummy_voxels/clean_voxels.npy rename to tests/toy_dataset/clean_voxels.npy diff --git a/tests/toy_dataset/dummy.csv b/tests/toy_dataset/dummy.csv new file mode 100644 index 0000000..a1b4b9d --- /dev/null +++ b/tests/toy_dataset/dummy.csv @@ -0,0 +1,11 @@ +id,y_1,y_2,y_3 +COF-5,1,2,3 +IRMOF-1,3,4,5 +MnH28C26(N2Cl)2,10,20,300 +ZIF-69,100,500,300 +ZnHBDC,-2,-4,-2 +ZnMOF-74,-9,-100,-305 +corrupted_1,,, +corrupted_2,,, +corrupted_3,,, +corrupted_4,,, diff --git a/tests/dummy_voxels/names.json b/tests/toy_dataset/names.json similarity index 100% rename from tests/dummy_voxels/names.json rename to tests/toy_dataset/names.json diff --git a/tests/toy_dataset/test.json b/tests/toy_dataset/test.json new file mode 100644 index 0000000..3a7b8b2 --- /dev/null +++ b/tests/toy_dataset/test.json @@ -0,0 +1,3 @@ +[ + 4 +] \ No newline at end of file diff --git a/tests/toy_dataset/train.json b/tests/toy_dataset/train.json new file mode 100644 index 0000000..a845ede --- /dev/null +++ b/tests/toy_dataset/train.json @@ -0,0 +1,6 @@ +[ + 1, + 5, + 2, + 0 +] \ No newline at end of file diff --git a/tests/toy_dataset/validation.json b/tests/toy_dataset/validation.json new file mode 100644 index 0000000..e877aac --- /dev/null +++ b/tests/toy_dataset/validation.json @@ -0,0 +1,3 @@ 
+[ + 3 +] \ No newline at end of file diff --git a/tests/dummy_voxels/voxels.npy b/tests/toy_dataset/voxels.npy similarity index 100% rename from tests/dummy_voxels/voxels.npy rename to tests/toy_dataset/voxels.npy