From 69711236d64b5cc6461f50a1039fe49b61cd101b Mon Sep 17 00:00:00 2001
From: Antonios Sarikas <antonios.sarikas@gmail.com>
Date: Tue, 28 May 2024 22:29:47 +0300
Subject: [PATCH] Add unit tests for VoxelDataset

---
 docs/source/changelog.rst                     |  12 ++
 docs/source/conf.py                           |   1 +
 docs/source/tutorial.rst                      |   2 +-
 src/moxel/data.py                             | 127 ++++++++++--------
 src/moxel/utils.py                            |  20 ++-
 tests/test_data.py                            |  89 +++++++++++-
 .../clean_names.json                          |   0
 .../clean_voxels.npy                          | Bin
 tests/toy_dataset/dummy.csv                   |  11 ++
 .../{dummy_voxels => toy_dataset}/names.json  |   0
 tests/toy_dataset/test.json                   |   3 +
 tests/toy_dataset/train.json                  |   6 +
 tests/toy_dataset/validation.json             |   3 +
 .../{dummy_voxels => toy_dataset}/voxels.npy  | Bin
 14 files changed, 205 insertions(+), 69 deletions(-)
 rename tests/{dummy_voxels => toy_dataset}/clean_names.json (100%)
 rename tests/{dummy_voxels => toy_dataset}/clean_voxels.npy (100%)
 create mode 100644 tests/toy_dataset/dummy.csv
 rename tests/{dummy_voxels => toy_dataset}/names.json (100%)
 create mode 100644 tests/toy_dataset/test.json
 create mode 100644 tests/toy_dataset/train.json
 create mode 100644 tests/toy_dataset/validation.json
 rename tests/{dummy_voxels => toy_dataset}/voxels.npy (100%)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 481d405..d66b51b 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,18 @@
 |:pushpin:| Changelog
 =====================
 
+Version x.x.x
+-------------
+
+.. versionchanged:: x.x.x
+
+   * Parameter ``out_dirname`` replaces ``out_pathname`` in
+     :func:`utils.voxels_from_files` and :func:`utils.voxels_from_dir`.
+
+.. versionadded:: x.x.x
+   
+   * CLI for RetNet
+
 Version 0.1.2
 -------------
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index ff28918..fdce468 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -40,6 +40,7 @@
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
 html_theme = 'sphinx_rtd_theme'
+#html_logo = 'images/moxel_logo.svg'
 #html_static_path = ['_static']
 
 # Path to GitHub repo {group}/{project}  (note that `group` is the GitHub user or organization)
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index 96cb5e6..736f11e 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -113,7 +113,7 @@ processed.
         .. code-tab:: python
 
             >>> from moxel.utils import voxels_from_dir
-            >>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_pathname='path/to/batch')
+            >>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_dirname='path/to/batch')
 
         .. code-tab:: console
             :caption: CLI
diff --git a/src/moxel/data.py b/src/moxel/data.py
index 4b70141..f7563a9 100644
--- a/src/moxel/data.py
+++ b/src/moxel/data.py
@@ -1,3 +1,19 @@
+# This file is part of MOXελ.
+# Copyright (C) 2023-2024 Antonios P. Sarikas
+
+# MOXελ is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
 r"""
 Write the docstring of the module.
 """
@@ -5,6 +21,8 @@
 import os
 import json
 from pathlib import Path
+import numpy as np
+import pandas as pd
 import torch
 from torch.utils.data import Dataset, random_split
 from . utils import load_json
@@ -12,12 +30,33 @@
 
 def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1):
     r"""
-    Split a list of materials' names into train, validation and test sets.
+    Split voxels into train, validation and test sets.
 
     .. warning::
+
         * You should use this function **after** :func:`utils.batch_clean`.
-        * No directory is created by :func:`prepare_data`. **All ``.json``
-        files are stored under the directory containing ``source``**.
+        * No directory is created by :func:`prepare_data`.
+        * All ``.json`` files are stored under the directory containing ``source``.
+
+    Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be
+    used for training, validation and testing.
+
+    Parameters
+    ----------
+    source : str
+        Pathname to the file holding the names of the materials.
+    split_ratio : sequence of size 3, default=(0.8, 0.1, 0.1)
+        The sizes or fractions of splits to be produced.
+
+        * ``train == split_ratio[0]``.
+        * ``val == split_ratio[1]``.
+        * ``test == split_ratio[2]``.
+
+    seed : int, default=1
+        Controls the randomness of the ``rng`` used for splitting.
+
+    Examples
+    --------
 
     Before the split::
 
@@ -25,6 +64,8 @@ def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1):
         ├──clean_voxels.npy
         └──clean_names.json
 
+    >>> prepare_data('path/to/voxels_data/clean_names.json')  # doctest: +SKIP
+
     After the split::
 
         voxels_data
@@ -33,22 +74,6 @@ def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1):
         ├──train.json
         ├──validation.json
         └──test.json
-
-    Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be
-    used for training, validation and testing.
-
-    Parameters
-    ----------
-    source: str
-        Pathname to the file holding the names of the materials
-        (``clean_names.json``).
-    split_ratio: array-like of shape (3,), default=(0.8, 0.1, 0.1)
-        The sizes or fractions of splits to be produced.
-        * ``split_ratio[0] == train``.
-        * ``split_ratio[1] == validation``.
-        * ``split_ratio[2] == test``.
-    seed : int, default=1
-        Controls the randomness of the ``rng`` used for splitting.
     """
     rng = torch.Generator().manual_seed(seed)
     path = Path(source).parent
@@ -68,6 +93,11 @@ class VoxelDataset(Dataset):
     r"""
     Dataset for voxels.
 
+    .. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms
+
+    .. tip::
+        See `transforms`_ for implementing your own transforms.
+
     Parameters
     ----------
     path_to_indices: str
@@ -76,28 +106,20 @@ class VoxelDataset(Dataset):
         Pathname to the ``.npy`` file holding the voxels.
     path_to_names: str, optional
         Pathname to the ``.json`` file holding the names of the materials. No
-        effect if ``path_to_Y == None``.
+        effect if ``path_to_Y=None``.
     path_to_Y : str, optional
         Pathname to the ``.csv`` file holding the labels of the voxels.
     index_col : str, optional
-        Column name of the ``.csv`` file to be used as row labels. The values
-        (names) under this column must follow the same naming scheme as in
+        Column name of the ``.csv`` file to be used as row labels. The names
+        (values) under this column must follow the same naming scheme as in
         ``clean_names.json``.
     labels : list, optional
-        List containing the names of the properties to be predicted. No effect
-        if ``path_to_Y == None``.
+        List containing the column names of the ``.csv`` to be used as labels.
+        No effect if ``path_to_Y=None``.
     transform_x : callable, optional
-        Transforms applied to input. See `transforms`_ for implementing your own
-        transforms.
+        Transforms applied to input.
     transform_y : callable, optional
-        Transforms applied to output.  See `transforms`_ for implementing your
-        own transforms. No effect if ``pcd_Y == None``.
-
-        .. note::
-            For example, if you want to perform classification, here you can
-            pass the one-hot encoder (if the dataset is not already preprocessed).
-
-    .. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms
+        Transforms applied to output. No effect if ``path_to_Y=None``.
     """
     def __init__(
             self, path_to_indices, path_to_X,
@@ -106,7 +128,7 @@ def __init__(
             transform_x=None, transform_y=None,
             ):
 
-        if (labels is not None) and (type(labels) != list):
+        if (labels is not None) and (type(labels) is not list):
             raise ValueError('labels must be a list!')
 
         self.path_to_X = path_to_X
@@ -124,17 +146,21 @@ def __init__(
         self.X = None
         self.Y = None
 
-    @property
-    def voxel_indices(self):
-        return self._voxel_indices
-
     def __len__(self):
-        return len(self.voxel_indices)
+        return len(self._voxel_indices)
 
     def __getitem__(self, idx):
-        # Account for np.load and multiprocessing.
         if self.X is None:
-            self.X = np.load(self.path_to_X, mmap_mode='r')
+            # Load lazily (accounts for np.load and multiprocessing) and add a channel dimension.
+            self.X = np.load(self.path_to_X, mmap_mode='r')[:, None]
+
+        voxel_idx = self._voxel_indices[idx]
+        sample_x = torch.tensor(self.X[voxel_idx], dtype=torch.float)
+
+        if self.transform_x is not None:
+            sample_x = self.transform_x(sample_x)
+
+        # Only for labeled datasets.
         if self.Y is None and self.path_to_Y is not None:
             self.Y = pd.read_csv(
                     self.path_to_Y,
@@ -142,22 +168,13 @@ def __getitem__(self, idx):
                     usecols=[*self.labels, self.index_col],
                     )
 
-        sample_x = self.X[name]
-
-        if self.transform_x is not None:
-            sample_x = self.transform_x(sample_x)
-
-        # Only for labeled datasets.
         if self.Y is not None:
-            name = self._voxel_names[idx]
-            sample_y = self.Y.loc[name].values
+            name = self._voxel_names[voxel_idx]
+            sample_y = torch.tensor(self.Y.loc[name].values, dtype=torch.float)
 
             if self.transform_y is not None:
                 sample_y = self.transform_y(sample_y)
 
-            return (
-                    torch.tensor(sample_x, dtype=torch.float),
-                    torch.tensor(sample_y, dtype=torch.float)
-                    )
+            return sample_x, sample_y
 
-        return torch.tensor(sample_x, dtype=torch.float)
+        return sample_x
diff --git a/src/moxel/utils.py b/src/moxel/utils.py
index fbec050..462c19a 100644
--- a/src/moxel/utils.py
+++ b/src/moxel/utils.py
@@ -166,7 +166,7 @@ def calculate(self, cubic_box=False, length=30, potential='lj', n_jobs=None):
             If ``True``, the simulation box is cubic.
         length : float, default=30
             The size of the cubic box in Å. Takes effect only
-            if ``cubic_box == True``.
+            if ``cubic_box=True``.
         n_jobs : int, optional
             Number of jobs to run in parallel. If ``None``, then the number returned
             by ``os.cpu_count()`` is used.
@@ -217,7 +217,7 @@ def lj_potential(self, coords):
         Parameters
         ----------
         coordinates : array_like of shape (3,)
-            If ``cubic_box == True`` cartesian. Else, fractional.
+            If ``cubic_box=True`` cartesian. Else, fractional.
 
         Returns
         -------
@@ -234,10 +234,8 @@ def lj_potential(self, coords):
                 self.cutoff, zip_results=False,
                 )
 
-        '''
-        Need to check for length of r_ij because of
-        https://github.com/materialsproject/pymatgen/issues/3794
-        '''
+        # Need to check for length of r_ij because of
+        # https://github.com/materialsproject/pymatgen/issues/3794
         if len(r_ij) == 0:  # No neighbor, zero energy.
             return 1.
 
@@ -247,7 +245,7 @@ def lj_potential(self, coords):
         es_j = self._lj_params[indices]
         x = (0.5 * (es_j[:, 1] + self.sigma)) / r_ij
         e = 4 * np.sqrt(es_j[:, 0] * self.epsilon)
-        energy = sum(e * (x**12 - x**6))
+        energy = np.sum(e * (x**12 - x**6))
 
         # This should be changed with clipping in future versions.
         return np.exp(-(1 / 298) * energy)  # For numerical stability.
@@ -276,7 +274,7 @@ def voxels_from_file(
     cubic_box : bool, default=False
         If ``True``, the simulation box is cubic.
     length : float, default=30
-        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
+        The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
     n_jobs : int, optional
         Number of jobs to run in parallel. If ``None``, then the number returned
         by ``os.cpu_count()`` is used.
@@ -286,7 +284,7 @@ def voxels_from_file(
     Returns
     -------
     out : ``array`` or :class:`Grid`
-        If ``only_voxels == True``, array of shape ``(grid_size,)*3``.
+        If ``only_voxels=True``, array of shape ``(grid_size,)*3``.
         Otherwise, :class:`Grid`.
 
     Notes
@@ -343,7 +341,7 @@ def voxels_from_files(
     cubic_box : bool, default=False
         If ``True``, the simulation box is cubic.
     length : float, default=30
-        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
+        The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
     n_jobs : int, optional
         Number of jobs to run in parallel. If ``None``, then the number returned
         by ``os.cpu_count()`` is used.
@@ -417,7 +415,7 @@ def voxels_from_dir(
     cubic_box : bool, default=False
         If ``True``, the simulation box is cubic.
     length : float, default=30
-        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
+        The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
     n_jobs : int, optional
         Number of jobs to run in parallel. If ``None``, then the number returned
         by ``os.cpu_count()`` is used.
diff --git a/tests/test_data.py b/tests/test_data.py
index f581705..b9aac22 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -19,8 +19,12 @@
 import unittest
 import tempfile
 from itertools import combinations
-from moxel.data import prepare_data
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import DataLoader
 from moxel.utils import load_json
+from moxel.data import prepare_data, VoxelDataset
 
 
 class TestPrepareData(unittest.TestCase):
@@ -64,7 +68,88 @@ def tearDown(self):
 
 
 class TestVoxelsDataset(unittest.TestCase):
-    ...
+    def setUp(self):
+        # The grid size is 5 and the train size is 4.
+        self.path_to_indices = 'tests/toy_dataset/train.json'
+        self.path_to_X = 'tests/toy_dataset/clean_voxels.npy'
+        self.dummy_tfm_x = lambda x: x + 100
+        self.ds_indices = load_json(self.path_to_indices)
+        self.ds_voxels = np.load(self.path_to_X, mmap_mode='r')[:, None]
+        self.batch_size = 2
+
+        # For labeled dataset (optional).
+        self.path_to_names = 'tests/toy_dataset/clean_names.json'
+        self.path_to_Y = 'tests/toy_dataset/dummy.csv'
+        self.index_col = 'id'
+        self.labels = ['y_1', 'y_3']
+        self.dummy_tfm_y = lambda x: x - 500
+
+    def test_unlabeled_dataset(self):
+        ds = VoxelDataset(
+                path_to_indices=self.path_to_indices,
+                path_to_X=self.path_to_X,
+                transform_x=self.dummy_tfm_x,
+                )
+
+        dl = DataLoader(ds, batch_size=self.batch_size)
+
+        # Check that it has correct size.
+        self.assertEqual(len(ds), 4)
+
+        # Check that it works with the dataloader.
+        for x in dl:
+            self.assertEqual(x.shape, (self.batch_size, 1, 5, 5, 5))  # Shape (B, C, D, H, W).
+            self.assertIs(x.dtype, torch.float)
+
+        # Check that transforms are correctly applied.
+        for i in range(len(ds)):
+            transformed_x = ds[i]
+            raw_x = torch.tensor(self.ds_voxels[self.ds_indices[i]])
+            self.assertTrue(torch.equal(transformed_x, self.dummy_tfm_x(raw_x)))
+
+    def test_labeled_dataset(self):
+        ds = VoxelDataset(
+                path_to_indices=self.path_to_indices,
+                path_to_X=self.path_to_X,
+                path_to_names=self.path_to_names,
+                path_to_Y=self.path_to_Y,
+                index_col=self.index_col,
+                labels=self.labels,
+                transform_x=self.dummy_tfm_x,
+                transform_y=self.dummy_tfm_y,
+                )
+
+        dl = DataLoader(ds, batch_size=self.batch_size)
+
+        material_names = load_json(self.path_to_names)
+
+        df = pd.read_csv(
+                self.path_to_Y,
+                index_col=self.index_col,
+                usecols=[self.index_col, *self.labels],
+                )
+
+        # Check that it has correct size.
+        self.assertEqual(len(ds), 4)
+
+        # Check that it works with the dataloader.
+        for x, y in dl:
+            self.assertEqual(x.shape, (self.batch_size, 1, 5, 5, 5))  # Shape (B, C, D, H, W).
+            self.assertIs(x.dtype, torch.float)
+            self.assertEqual(y.shape, (self.batch_size, len(self.labels)))  # Shape (B, n_out).
+            self.assertIs(y.dtype, torch.float)
+
+        # Check that transforms are correctly applied.
+        for i in range(len(ds)):
+            idx = self.ds_indices[i]
+            name = material_names[idx]
+
+            transformed_x, transformed_y = ds[i]
+            raw_x = torch.tensor(self.ds_voxels[idx])
+            raw_y = torch.tensor(df.loc[name, self.labels].values)
+
+            self.assertTrue(torch.equal(transformed_x, self.dummy_tfm_x(raw_x)))
+            self.assertTrue(torch.equal(transformed_y, self.dummy_tfm_y(raw_y)))
 
 
 if __name__ == '__main__':
diff --git a/tests/dummy_voxels/clean_names.json b/tests/toy_dataset/clean_names.json
similarity index 100%
rename from tests/dummy_voxels/clean_names.json
rename to tests/toy_dataset/clean_names.json
diff --git a/tests/dummy_voxels/clean_voxels.npy b/tests/toy_dataset/clean_voxels.npy
similarity index 100%
rename from tests/dummy_voxels/clean_voxels.npy
rename to tests/toy_dataset/clean_voxels.npy
diff --git a/tests/toy_dataset/dummy.csv b/tests/toy_dataset/dummy.csv
new file mode 100644
index 0000000..a1b4b9d
--- /dev/null
+++ b/tests/toy_dataset/dummy.csv
@@ -0,0 +1,11 @@
+id,y_1,y_2,y_3
+COF-5,1,2,3
+IRMOF-1,3,4,5
+MnH28C26(N2Cl)2,10,20,300
+ZIF-69,100,500,300
+ZnHBDC,-2,-4,-2
+ZnMOF-74,-9,-100,-305
+corrupted_1,,,
+corrupted_2,,,
+corrupted_3,,,
+corrupted_4,,,
diff --git a/tests/dummy_voxels/names.json b/tests/toy_dataset/names.json
similarity index 100%
rename from tests/dummy_voxels/names.json
rename to tests/toy_dataset/names.json
diff --git a/tests/toy_dataset/test.json b/tests/toy_dataset/test.json
new file mode 100644
index 0000000..3a7b8b2
--- /dev/null
+++ b/tests/toy_dataset/test.json
@@ -0,0 +1,3 @@
+[
+    4
+]
\ No newline at end of file
diff --git a/tests/toy_dataset/train.json b/tests/toy_dataset/train.json
new file mode 100644
index 0000000..a845ede
--- /dev/null
+++ b/tests/toy_dataset/train.json
@@ -0,0 +1,6 @@
+[
+    1,
+    5,
+    2,
+    0
+]
\ No newline at end of file
diff --git a/tests/toy_dataset/validation.json b/tests/toy_dataset/validation.json
new file mode 100644
index 0000000..e877aac
--- /dev/null
+++ b/tests/toy_dataset/validation.json
@@ -0,0 +1,3 @@
+[
+    3
+]
\ No newline at end of file
diff --git a/tests/dummy_voxels/voxels.npy b/tests/toy_dataset/voxels.npy
similarity index 100%
rename from tests/dummy_voxels/voxels.npy
rename to tests/toy_dataset/voxels.npy