Add unit tests for VoxelDataset

adosar · May 28, 2024 · 6971123 · 6971123
1 parent fd8c992
commit 6971123
Show file tree

Hide file tree

Showing 14 changed files with 205 additions and 69 deletions.
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,6 +1,18 @@
 |:pushpin:| Changelog
 =====================
 
+Version x.x.x
+-------------
+
+.. versionchanged:: x.x.x
+
+   * Parameter ``out_dirname`` replaces ``out_pathname`` in
+     :func:`utils.voxels_from_files` and :func:`utils.voxels_from_dir`.
+
+.. versionadded:: x.x.x
+
+   * CLI for RetNet
+
 Version 0.1.2
 -------------
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -40,6 +40,7 @@
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
 html_theme = 'sphinx_rtd_theme'
+#html_logo = 'images/moxel_logo.svg'
 #html_static_path = ['_static']
 
 # Path to GitHub repo {group}/{project}  (note that `group` is the GitHub user or organization)

diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
@@ -113,7 +113,7 @@ processed.
         .. code-tab:: python
 
             >>> from moxel.utils import voxels_from_dir
-            >>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_pathname='path/to/batch')
+            >>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_dirname='path/to/batch')
 
         .. code-tab:: console
             :caption: CLI

diff --git a/src/moxel/data.py b/src/moxel/data.py
@@ -1,30 +1,71 @@
+# This file is part of MOXελ.
+# Copyright (C) 2023-2024 Antonios P. Sarikas
+
+# MOXελ is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
 r"""
 Write the docstring of the module.
 """
 
 import os
 import json
 from pathlib import Path
+import numpy as np
+import pandas as pd
 import torch
 from torch.utils.data import Dataset, random_split
 from . utils import load_json
 
 
 def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1):
     r"""
-    Split a list of materials' names into train, validation and test sets.
+    Split voxels into train, validation and test sets.
 
     .. warning::
+
         * You should use this function **after** :func:`utils.batch_clean`.
-        * No directory is created by :func:`prepare_data`. **All ``.json``
-        files are stored under the directory containing ``source``**.
+        * No directory is created by :func:`prepare_data`.
+        * All ``.json`` files are stored under the directory containing ``source``.
+
+    Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be
+    used for training, validation and testing.
+
+    Parameters
+    ----------
+    source : str
+        Pathname to the file holding the names of the materials.
+    split_ratio : sequence of size 3, default=(0.8, 0.1, 0.1)
+        The sizes or fractions of splits to be produced.
+
+        * ``train == split_ratio[0]``.
+        * ``val == split_ratio[1]``.
+        * ``test == split_ratio[2]``.
+
+    seed : int, default=1
+        Controls the randomness of the ``rng`` used for splitting.
+
+    Examples
+    --------
 
     Before the split::
 
         voxels_data
         ├──clean_voxels.npy
         └──clean_names.json
 
+    >>> prepare_data('path/to/voxels_data/clean_names.json')  # doctest: +SKIP
+
     After the split::
 
         voxels_data
@@ -33,22 +74,6 @@ def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1):
         ├──train.json
         ├──validation.json
         └──test.json
-
-    Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be
-    used for training, validation and testing.
-
-    Parameters
-    ----------
-    source: str
-        Pathname to the file holding the names of the materials
-        (``clean_names.json``).
-    split_ratio: array-like of shape (3,), default=(0.8, 0.1, 0.1)
-        The sizes or fractions of splits to be produced.
-        * ``split_ratio[0] == train``.
-        * ``split_ratio[1] == validation``.
-        * ``split_ratio[2] == test``.
-    seed : int, default=1
-        Controls the randomness of the ``rng`` used for splitting.
     """
     rng = torch.Generator().manual_seed(seed)
     path = Path(source).parent
@@ -68,6 +93,11 @@ class VoxelDataset(Dataset):
     r"""
     Dataset for voxels.
 
+    .. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms
+
+    .. tip::
+        See `transforms`_ for implementing your own transforms.
+
     Parameters
     ----------
     path_to_indices: str
@@ -76,28 +106,20 @@ class VoxelDataset(Dataset):
         Pathname to the ``.npy`` file holding the voxels.
     path_to_names: str, optional
         Pathname to the ``.json`` file holding the names of the materials. No
-        effect if ``path_to_Y == None``.
+        effect if ``path_to_Y=None``.
     path_to_Y : str, optional
         Pathname to the ``.csv`` file holding the labels of the voxels.
     index_col : str, optional
-        Column name of the ``.csv`` file to be used as row labels. The values
-        (names) under this column must follow the same naming scheme as in
+        Column name of the ``.csv`` file to be used as row labels. The names
+        (values) under this column must follow the same naming scheme as in
         ``clean_names.json``.
     labels : list, optional
-        List containing the names of the properties to be predicted. No effect
-        if ``path_to_Y == None``.
+        List containing the column names of the ``.csv`` to be used as labels.
+        No effect if ``path_to_Y=None``.
     transform_x : callable, optional
-        Transforms applied to input. See `transforms`_ for implementing your own
-        transforms.
+        Transforms applied to input.
     transform_y : callable, optional
-        Transforms applied to output.  See `transforms`_ for implementing your
-        own transforms. No effect if ``pcd_Y == None``.
-
-        .. note::
-            For example, if you want to perform classification, here you can
-            pass the one-hot encoder (if the dataset is not already preprocessed).
-
-    .. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms
+        Transforms applied to output. No effect if ``path_to_Y=None``.
     """
     def __init__(
             self, path_to_indices, path_to_X,
@@ -106,7 +128,7 @@ def __init__(
             transform_x=None, transform_y=None,
             ):
 
-        if (labels is not None) and (type(labels) != list):
+        if (labels is not None) and (type(labels) is not list):
             raise ValueError('labels must be a list!')
 
         self.path_to_X = path_to_X
@@ -124,40 +146,35 @@ def __init__(
         self.X = None
         self.Y = None
 
-    @property
-    def voxel_indices(self):
-        return self._voxel_indices
-
     def __len__(self):
-        return len(self.voxel_indices)
+        return len(self._voxel_indices)
 
     def __getitem__(self, idx):
-        # Account for np.load and multiprocessing.
         if self.X is None:
-            self.X = np.load(self.path_to_X, mmap_mode='r')
+            # Load and add a channel dimension to voxels array.
+            self.X = np.load(self.path_to_X, mmap_mode='r')[:, None]
+
+        voxel_idx = self._voxel_indices[idx]
+        sample_x = torch.tensor(self.X[voxel_idx], dtype=torch.float)
+
+        if self.transform_x is not None:
+            sample_x = self.transform_x(sample_x)
+
+        # Only for labeled datasets.
         if self.Y is None and self.path_to_Y is not None:
             self.Y = pd.read_csv(
                     self.path_to_Y,
                     index_col=self.index_col,
                     usecols=[*self.labels, self.index_col],
                     )
 
-        sample_x = self.X[name]
-
-        if self.transform_x is not None:
-            sample_x = self.transform_x(sample_x)
-
-        # Only for labeled datasets.
         if self.Y is not None:
-            name = self._voxel_names[idx]
-            sample_y = self.Y.loc[name].values
+            name = self._voxel_names[voxel_idx]
+            sample_y = torch.tensor(self.Y.loc[name].values, dtype=torch.float)
 
             if self.transform_y is not None:
                 sample_y = self.transform_y(sample_y)
 
-            return (
-                    torch.tensor(sample_x, dtype=torch.float),
-                    torch.tensor(sample_y, dtype=torch.float)
-                    )
+            return sample_x, sample_y
 
-        return torch.tensor(sample_x, dtype=torch.float)
+        return sample_x
diff --git a/src/moxel/utils.py b/src/moxel/utils.py
@@ -166,7 +166,7 @@ def calculate(self, cubic_box=False, length=30, potential='lj', n_jobs=None):
             If ``True``, the simulation box is cubic.
         length : float, default=30
             The size of the cubic box in Å. Takes effect only
-            if ``cubic_box == True``.
+            if ``cubic_box=True``.
         n_jobs : int, optional
             Number of jobs to run in parallel. If ``None``, then the number returned
             by ``os.cpu_count()`` is used.
@@ -217,7 +217,7 @@ def lj_potential(self, coords):
         Parameters
         ----------
         coordinates : array_like of shape (3,)
-            If ``cubic_box == True`` cartesian. Else, fractional.
+            If ``cubic_box=True`` cartesian. Else, fractional.
 
         Returns
         -------
@@ -234,10 +234,8 @@ def lj_potential(self, coords):
                 self.cutoff, zip_results=False,
                 )
 
-        '''
-        Need to check for length of r_ij because of
-        https://github.com/materialsproject/pymatgen/issues/3794
-        '''
+        # Need to check for length of r_ij because of
+        # https://github.com/materialsproject/pymatgen/issues/3794
         if len(r_ij) == 0:  # No neighbor, zero energy.
             return 1.
 
@@ -247,7 +245,7 @@ def lj_potential(self, coords):
         es_j = self._lj_params[indices]
         x = (0.5 * (es_j[:, 1] + self.sigma)) / r_ij
         e = 4 * np.sqrt(es_j[:, 0] * self.epsilon)
-        energy = sum(e * (x**12 - x**6))
+        energy = np.sum(e * (x**12 - x**6))
 
         # This should be changed with clipping in future versions.
         return np.exp(-(1 / 298) * energy)  # For numerical stability.
@@ -276,7 +274,7 @@ def voxels_from_file(
     cubic_box : bool, default=False
         If ``True``, the simulation box is cubic.
     length : float, default=30
-        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
+        The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
     n_jobs : int, optional
         Number of jobs to run in parallel. If ``None``, then the number returned
         by ``os.cpu_count()`` is used.
@@ -286,7 +284,7 @@ def voxels_from_file(
     Returns
     -------
     out : ``array`` or :class:`Grid`
-        If ``only_voxels == True``, array of shape ``(grid_size,)*3``.
+        If ``only_voxels=True``, array of shape ``(grid_size,)*3``.
         Otherwise, :class:`Grid`.
 
     Notes
@@ -343,7 +341,7 @@ def voxels_from_files(
     cubic_box : bool, default=False
         If ``True``, the simulation box is cubic.
     length : float, default=30
-        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
+        The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
     n_jobs : int, optional
         Number of jobs to run in parallel. If ``None``, then the number returned
         by ``os.cpu_count()`` is used.
@@ -417,7 +415,7 @@ def voxels_from_dir(
     cubic_box : bool, default=False
         If ``True``, the simulation box is cubic.
     length : float, default=30
-        The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
+        The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
     n_jobs : int, optional
         Number of jobs to run in parallel. If ``None``, then the number returned
         by ``os.cpu_count()`` is used.