Skip to content

Commit

Permalink
Add unit tests for VoxelDataset
Browse files Browse the repository at this point in the history
  • Loading branch information
adosar committed May 28, 2024
1 parent fd8c992 commit 6971123
Show file tree
Hide file tree
Showing 14 changed files with 205 additions and 69 deletions.
12 changes: 12 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
|:pushpin:| Changelog
=====================

Version x.x.x
-------------

.. versionchanged:: x.x.x

* Parameter ``out_dirname`` replaces ``out_pathname`` in
:func:`utils.voxels_from_files` and :func:`utils.voxels_from_dir`.

.. versionadded:: x.x.x

* CLI for RetNet

Version 0.1.2
-------------

Expand Down
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'sphinx_rtd_theme'
#html_logo = 'images/moxel_logo.svg'
#html_static_path = ['_static']

# Path to GitHub repo {group}/{project} (note that `group` is the GitHub user or organization)
Expand Down
2 changes: 1 addition & 1 deletion docs/source/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ processed.
.. code-tab:: python

>>> from moxel.utils import voxels_from_dir
>>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_pathname='path/to/batch')
>>> voxels_from_dir('path/to/CIFs/', grid_size=5, out_dirname='path/to/batch')

.. code-tab:: console
:caption: CLI
Expand Down
127 changes: 72 additions & 55 deletions src/moxel/data.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,71 @@
# This file is part of MOXελ.
# Copyright (C) 2023-2024 Antonios P. Sarikas

# MOXελ is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

r"""
Utilities for splitting voxels into train, validation and test sets, and a
PyTorch ``Dataset`` for loading them.
"""

import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from . utils import load_json


def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1):
r"""
Split a list of materials' names into train, validation and test sets.
Split voxels into train, validation and test sets.
.. warning::
* You should use this function **after** :func:`utils.batch_clean`.
* No directory is created by :func:`prepare_data`. **All ``.json``
files are stored under the directory containing ``source``**.
* No directory is created by :func:`prepare_data`.
* All ``.json`` files are stored under the directory containing ``source``.
Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be
used for training, validation and testing.
Parameters
----------
source : str
Pathname to the file holding the names of the materials.
split_ratio : sequence of size 3, default=(0.8, 0.1, 0.1)
The sizes or fractions of splits to be produced.
* ``train == split_ratio[0]``.
* ``val == split_ratio[1]``.
* ``test == split_ratio[2]``.
seed : int, default=1
Controls the randomness of the ``rng`` used for splitting.
Examples
--------
Before the split::
voxels_data
├──clean_voxels.npy
└──clean_names.json
>>> prepare_data('path/to/voxels_data/clean_names.json') # doctest: +SKIP
After the split::
voxels_data
Expand All @@ -33,22 +74,6 @@ def prepare_data(source, split_ratio=(0.8, 0.1, 0.1), seed=1):
├──train.json
├──validation.json
└──test.json
Each ``.json`` file stores the indices of ``clean_voxels.npy`` that will be
used for training, validation and testing.
Parameters
----------
source: str
Pathname to the file holding the names of the materials
(``clean_names.json``).
split_ratio: array-like of shape (3,), default=(0.8, 0.1, 0.1)
The sizes or fractions of splits to be produced.
* ``split_ratio[0] == train``.
* ``split_ratio[1] == validation``.
* ``split_ratio[2] == test``.
seed : int, default=1
Controls the randomness of the ``rng`` used for splitting.
"""
rng = torch.Generator().manual_seed(seed)
path = Path(source).parent
Expand All @@ -68,6 +93,11 @@ class VoxelDataset(Dataset):
r"""
Dataset for voxels.
.. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms
.. tip::
See `transforms`_ for implementing your own transforms.
Parameters
----------
path_to_indices : str
Expand All @@ -76,28 +106,20 @@ class VoxelDataset(Dataset):
Pathname to the ``.npy`` file holding the voxels.
path_to_names : str, optional
Pathname to the ``.json`` file holding the names of the materials. No
effect if ``path_to_Y == None``.
effect if ``path_to_Y=None``.
path_to_Y : str, optional
Pathname to the ``.csv`` file holding the labels of the voxels.
index_col : str, optional
Column name of the ``.csv`` file to be used as row labels. The values
(names) under this column must follow the same naming scheme as in
Column name of the ``.csv`` file to be used as row labels. The names
(values) under this column must follow the same naming scheme as in
``clean_names.json``.
labels : list, optional
List containing the names of the properties to be predicted. No effect
if ``path_to_Y == None``.
List containing the column names of the ``.csv`` to be used as labels.
No effect if ``path_to_Y=None``.
transform_x : callable, optional
Transforms applied to input. See `transforms`_ for implementing your own
transforms.
Transforms applied to input.
transform_y : callable, optional
Transforms applied to output. See `transforms`_ for implementing your
own transforms. No effect if ``pcd_Y == None``.
.. note::
For example, if you want to perform classification, here you can
pass the one-hot encoder (if the dataset is not already preprocessed).
.. _transforms: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms
Transforms applied to output. No effect if ``path_to_Y=None``.
"""
def __init__(
self, path_to_indices, path_to_X,
Expand All @@ -106,7 +128,7 @@ def __init__(
transform_x=None, transform_y=None,
):

if (labels is not None) and (type(labels) != list):
if (labels is not None) and (type(labels) is not list):
raise ValueError('labels must be a list!')

self.path_to_X = path_to_X
Expand All @@ -124,40 +146,35 @@ def __init__(
self.X = None
self.Y = None

@property
def voxel_indices(self):
return self._voxel_indices

def __len__(self):
return len(self.voxel_indices)
return len(self._voxel_indices)

def __getitem__(self, idx):
# Account for np.load and multiprocessing.
if self.X is None:
self.X = np.load(self.path_to_X, mmap_mode='r')
# Load and add a channel dimension to voxels array.
self.X = np.load(self.path_to_X, mmap_mode='r')[:, None]

voxel_idx = self._voxel_indices[idx]
sample_x = torch.tensor(self.X[voxel_idx], dtype=torch.float)

if self.transform_x is not None:
sample_x = self.transform_x(sample_x)

# Only for labeled datasets.
if self.Y is None and self.path_to_Y is not None:
self.Y = pd.read_csv(
self.path_to_Y,
index_col=self.index_col,
usecols=[*self.labels, self.index_col],
)

sample_x = self.X[name]

if self.transform_x is not None:
sample_x = self.transform_x(sample_x)

# Only for labeled datasets.
if self.Y is not None:
name = self._voxel_names[idx]
sample_y = self.Y.loc[name].values
name = self._voxel_names[voxel_idx]
sample_y = torch.tensor(self.Y.loc[name].values, dtype=torch.float)

if self.transform_y is not None:
sample_y = self.transform_y(sample_y)

return (
torch.tensor(sample_x, dtype=torch.float),
torch.tensor(sample_y, dtype=torch.float)
)
return sample_x, sample_y

return torch.tensor(sample_x, dtype=torch.float)
return sample_x
20 changes: 9 additions & 11 deletions src/moxel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def calculate(self, cubic_box=False, length=30, potential='lj', n_jobs=None):
If ``True``, the simulation box is cubic.
length : float, default=30
The size of the cubic box in Å. Takes effect only
if ``cubic_box == True``.
if ``cubic_box=True``.
n_jobs : int, optional
Number of jobs to run in parallel. If ``None``, then the number returned
by ``os.cpu_count()`` is used.
Expand Down Expand Up @@ -217,7 +217,7 @@ def lj_potential(self, coords):
Parameters
----------
coords : array_like of shape (3,)
If ``cubic_box == True`` cartesian. Else, fractional.
Cartesian coordinates if ``cubic_box=True``, fractional coordinates otherwise.
Returns
-------
Expand All @@ -234,10 +234,8 @@ def lj_potential(self, coords):
self.cutoff, zip_results=False,
)

'''
Need to check for length of r_ij because of
https://github.com/materialsproject/pymatgen/issues/3794
'''
# Need to check for length of r_ij because of
# https://github.com/materialsproject/pymatgen/issues/3794
if len(r_ij) == 0: # No neighbor, zero energy.
return 1.

Expand All @@ -247,7 +245,7 @@ def lj_potential(self, coords):
es_j = self._lj_params[indices]
x = (0.5 * (es_j[:, 1] + self.sigma)) / r_ij
e = 4 * np.sqrt(es_j[:, 0] * self.epsilon)
energy = sum(e * (x**12 - x**6))
energy = np.sum(e * (x**12 - x**6))

# This should be changed with clipping in future versions.
return np.exp(-(1 / 298) * energy) # For numerical stability.
Expand Down Expand Up @@ -276,7 +274,7 @@ def voxels_from_file(
cubic_box : bool, default=False
If ``True``, the simulation box is cubic.
length : float, default=30
The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
n_jobs : int, optional
Number of jobs to run in parallel. If ``None``, then the number returned
by ``os.cpu_count()`` is used.
Expand All @@ -286,7 +284,7 @@ def voxels_from_file(
Returns
-------
out : ``array`` or :class:`Grid`
If ``only_voxels == True``, array of shape ``(grid_size,)*3``.
If ``only_voxels=True``, array of shape ``(grid_size,)*3``.
Otherwise, :class:`Grid`.
Notes
Expand Down Expand Up @@ -343,7 +341,7 @@ def voxels_from_files(
cubic_box : bool, default=False
If ``True``, the simulation box is cubic.
length : float, default=30
The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
n_jobs : int, optional
Number of jobs to run in parallel. If ``None``, then the number returned
by ``os.cpu_count()`` is used.
Expand Down Expand Up @@ -417,7 +415,7 @@ def voxels_from_dir(
cubic_box : bool, default=False
If ``True``, the simulation box is cubic.
length : float, default=30
The size of the cubic box in Å. Takes effect only if ``cubic_box == True``.
The size of the cubic box in Å. Takes effect only if ``cubic_box=True``.
n_jobs : int, optional
Number of jobs to run in parallel. If ``None``, then the number returned
by ``os.cpu_count()`` is used.
Expand Down
Loading

0 comments on commit 6971123

Please sign in to comment.