From 6d9938157b7522b7eb7b0dd801dc273acb127be2 Mon Sep 17 00:00:00 2001 From: Martin van der Schelling <61459087+mpvanderschelling@users.noreply.github.com> Date: Tue, 25 Jun 2024 11:26:30 +0200 Subject: [PATCH] Fix data indexing issue and add column renaming functionality --- .../_experimental/_jobqueue2.py | 4 +- .../experimentdata/_experimental/_newdata2.py | 30 +- .../_experimental/_newexperimentdata2.py | 20 +- tests/newdata/experimentdata/__init__.py | 0 tests/newdata/experimentdata/conftest.py | 130 +++ .../newdata/experimentdata/test__jobqueue.py | 43 + .../experimentdata/test_experimentdata.py | 737 ++++++++++++++++++ tests/newdata/test_data.py | 11 +- 8 files changed, 956 insertions(+), 19 deletions(-) create mode 100644 tests/newdata/experimentdata/__init__.py create mode 100644 tests/newdata/experimentdata/conftest.py create mode 100644 tests/newdata/experimentdata/test__jobqueue.py create mode 100644 tests/newdata/experimentdata/test_experimentdata.py diff --git a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py index 3c88308b..82721ace 100644 --- a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py +++ b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py @@ -73,8 +73,8 @@ def __add__(self, __o: Index | str) -> Index: # Make a copy of other.jobs and modify its index other_jobs_copy = deepcopy(__o) - other_jobs_copy.jobs.index = range( - len(other_jobs_copy)) + self.jobs.index[-1] + 1 + other_jobs_copy.jobs.index = pd.Index(range( + len(other_jobs_copy))) + self.jobs.index[-1] + 1 return Index(pd.concat([self.jobs, other_jobs_copy.jobs])) diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py index 4bff29cd..26df0982 100644 --- a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py +++ b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py @@ -148,8 +148,9 @@ def __repr__(self) -> str: # Properties # ============================================================================= + @property - def indices(self) -> List[int]: + def indices(self) -> pd.Index: """ Get the indices of the data. @@ -158,7 +159,7 @@ def indices(self) -> List[int]: List[int] The list of indices. """ - return list(self.data.keys()) + return pd.Index(list(self.data.keys())) @property def names(self) -> List[str]: @@ -187,8 +188,9 @@ def is_empty(self) -> bool: # Initialization # ============================================================================= + @classmethod - def from_indices(cls, rows: Iterable[int]) -> _Data: + def from_indices(cls, rows: Iterable[int] | pd.Index) -> _Data: """ Create a _Data object from a list of indices. @@ -426,7 +428,7 @@ def n_best_samples(self, nosamples: int, key: str) -> pd.DataFrame: df = self.to_dataframe() return df.nsmallest(n=nosamples, columns=key) - def add_column(self, key: str): + def add_column(self, key: str, exist_ok: bool = True): """ Add a new column to the data with missing values. @@ -436,8 +438,23 @@ def add_column(self, key: str): The key for the new column. """ for row in self.data: + if not exist_ok and key in self.data[row]: + raise KeyError(f"Key '{key}' already exists in the data.") self.data[row][key] = MISSING_VALUE + def rename_columns(self, mapping: Dict[str, str]): + """ + Rename columns in the data. + + Parameters + ---------- + mapping : Dict[str, str] + The mapping of old to new column names. 
+ """ + for row in self.data: + for old_key, new_key in mapping.items(): + self.data[row][new_key] = self.data[row].pop(old_key) + def remove(self, rows: Iterable[int]): """ Remove specific rows from the data. @@ -513,7 +530,8 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data: # ============================================================================= -def _data_factory(data: DataTypes) -> _Data: +def _data_factory(data: DataTypes, + keys: Optional[Iterable[str]] = None) -> _Data: if data is None: return _Data() @@ -527,7 +545,7 @@ def _data_factory(data: DataTypes) -> _Data: return _Data.from_file(Path(data)) elif isinstance(data, np.ndarray): - return _Data.from_numpy(data) + return _Data.from_numpy(data, keys=keys) else: raise TypeError( diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py index 0a2cc770..7851f30f 100644 --- a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py +++ b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py @@ -110,8 +110,12 @@ def __init__(self, self.project_dir = _project_dir_factory(project_dir) - self._input_data = _data_factory(input_data) - self._output_data = _data_factory(output_data) + if isinstance(input_data, np.ndarray) and isinstance(domain, Domain): + self._input_data = _data_factory(input_data, domain.names) + self._output_data = _data_factory(output_data, domain.output_names) + else: + self._input_data = _data_factory(input_data) + self._output_data = _data_factory(output_data) # Create empty output_data from indices if output_data is empty if self._output_data.is_empty(): @@ -134,9 +138,8 @@ def __init__(self, # For backwards compatibility; if the output_data has # only one column, rename it to 'y' - # TODO: Fix this for newdata2 if self._output_data.names == [0]: - self._output_data.columns.set_columnnames(['y']) + self._output_data.rename_columns({0: 'y'}) def __len__(self): """The len() method returns the number of datapoints""" @@ -944,7 +947,7 @@ def _set_experiment_sample(self, self._output_data.set_data( row=experiment_sample.job_number, value=value, - column=column) + key=column) self._jobs.mark(experiment_sample._jobnumber, status=Status.FINISHED) @@ -997,11 +1000,10 @@ def _set_error(self, index: int) -> None: index index of the experiment_sample to mark as error """ - # self.jobs.mark_as_error(index) self._jobs.mark(index, status=Status.ERROR) - self._output_data.set_data( - index, - value=['ERROR' for _ in self._output_data.names]) + for column in self._output_data.names: + self._output_data.set_data( + index, value='ERROR', key=column) @_access_file def _write_error(self, index: int): diff --git a/tests/newdata/experimentdata/__init__.py b/tests/newdata/experimentdata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/newdata/experimentdata/conftest.py b/tests/newdata/experimentdata/conftest.py new file mode 100644 index 00000000..68189f88 --- /dev/null +++ b/tests/newdata/experimentdata/conftest.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from f3dasm._src.design.parameter import (_CategoricalParameter, + _ContinuousParameter, + _DiscreteParameter) +from f3dasm._src.experimentdata._experimental._newexperimentdata2 import \ + ExperimentData +from f3dasm.design import Domain, make_nd_continuous_domain + +SEED = 42 + + +@pytest.fixture(scope="package") +def seed() -> 
int: + return SEED + + +@pytest.fixture(scope="package") +def domain() -> Domain: + + space = { + 'x1': _ContinuousParameter(-5.12, 5.12), + 'x2': _DiscreteParameter(-3, 3), + 'x3': _CategoricalParameter(["red", "green", "blue"]) + } + + return Domain(space=space) + + +@pytest.fixture(scope="package") +def domain_continuous() -> Domain: + return make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + + +@pytest.fixture(scope="package") +def experimentdata(domain: Domain) -> ExperimentData: + e_data = ExperimentData(domain) + e_data.sample(sampler='random', n_samples=10, seed=SEED) + return e_data + + +@pytest.fixture(scope="package") +def experimentdata2(domain: Domain) -> ExperimentData: + return ExperimentData.from_sampling(sampler='random', domain=domain, n_samples=10, seed=SEED) + + +@pytest.fixture(scope="package") +def experimentdata_continuous(domain_continuous: Domain) -> ExperimentData: + return ExperimentData.from_sampling(sampler='random', domain=domain_continuous, n_samples=10, seed=SEED) + + +@pytest.fixture(scope="package") +def experimentdata_expected() -> ExperimentData: + domain_continuous = make_nd_continuous_domain( + bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + data = ExperimentData.from_sampling( + sampler='random', domain=domain_continuous, n_samples=10, seed=SEED) + for es, output in zip(data, np.zeros((10, 1))): + es.store(name='y', object=float(output)) + data._set_experiment_sample(es) + data.add(input_data=np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]), + output_data=np.array([[0.0], [0.0]]), domain=data.domain) + + # data._input_data.data = data._input_data.data.round(6) + return data + + +@pytest.fixture(scope="package") +def experimentdata_expected_no_output() -> ExperimentData: + domain_continuous = make_nd_continuous_domain( + bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + data = ExperimentData.from_sampling( + sampler='random', domain=domain_continuous, n_samples=10, seed=SEED) + data.add(input_data=np.array( + [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]), domain=domain_continuous) + + # data._input_data.data = data._input_data.data.round(6) + + return data + + +@pytest.fixture(scope="package") +def experimentdata_expected_only_domain() -> ExperimentData: + domain_continuous = make_nd_continuous_domain( + bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3) + return ExperimentData(domain=domain_continuous) + + +@pytest.fixture(scope="package") +def numpy_array(domain_continuous: Domain) -> np.ndarray: + rng = np.random.default_rng(SEED) + return rng.random((10, len(domain_continuous))) + + +@pytest.fixture(scope="package") +def numpy_output_array(domain_continuous: Domain) -> np.ndarray: + return np.zeros((10, 1)) + + +@pytest.fixture(scope="package") +def xarray_dataset(domain_continuous: Domain) -> xr.Dataset: + rng = np.random.default_rng(SEED) + # np.random.seed(SEED) + input_data = rng.random((10, len(domain_continuous))) + input_names = domain_continuous.names + + output_data = pd.DataFrame() + output_names = output_data.columns.to_list() + + return xr.Dataset({'input': xr.DataArray(input_data, dims=['iterations', 'input_dim'], coords={ + 'iterations': range(len(input_data)), 'input_dim': input_names}), + 'output': xr.DataArray(output_data, dims=['iterations', 'output_dim'], coords={ + 'iterations': range(len(output_data)), 'output_dim': output_names})}) + + +@pytest.fixture(scope="package") +def pandas_dataframe(domain_continuous: Domain) -> pd.DataFrame: + # 
np.random.seed(SEED) + rng = np.random.default_rng(SEED) + return pd.DataFrame(rng.random((10, len(domain_continuous))), columns=domain_continuous.names) + + +@pytest.fixture(scope="package") +def continuous_parameter() -> _ContinuousParameter: + return _ContinuousParameter(lower_bound=0., upper_bound=1.) diff --git a/tests/newdata/experimentdata/test__jobqueue.py b/tests/newdata/experimentdata/test__jobqueue.py new file mode 100644 index 00000000..52010733 --- /dev/null +++ b/tests/newdata/experimentdata/test__jobqueue.py @@ -0,0 +1,43 @@ +import pandas as pd + +from f3dasm._src.experimentdata._experimental._jobqueue2 import \ + Index as _JobQueue + +# from f3dasm._src.experimentdata._jobqueue import _JobQueue + + +def test_select_all_with_matching_status(): + # Create a job queue with some jobs + job_queue = _JobQueue() + job_queue.jobs = pd.Series( + ['in progress', 'running', 'completed', 'in progress', 'failed']) + + # Select all jobs with status 'in progress' + selected_jobs = job_queue.select_all('in progress') + + # Check if the selected jobs match the expected result + assert (selected_jobs.jobs == ['in progress', 'in progress']).all() + + +def test_select_all_with_no_matching_status(): + # Create a job queue with some jobs + job_queue = _JobQueue() + job_queue.jobs = pd.Series( + ['in progress', 'running', 'completed', 'in progress', 'failed']) + + # Select all jobs with status 'cancelled' + selected_jobs = job_queue.select_all('cancelled') + + # Check if the selected jobs match the expected result + assert selected_jobs.jobs.empty + + +def test_select_all_with_empty_job_queue(): + # Create an empty job queue + job_queue = _JobQueue() + + # Select all jobs with status 'in progress' + selected_jobs = job_queue.select_all('in progress') + + # Check if the selected jobs match the expected result + assert selected_jobs.jobs.empty diff --git a/tests/newdata/experimentdata/test_experimentdata.py b/tests/newdata/experimentdata/test_experimentdata.py new file mode 100644 index 00000000..026945e6 --- /dev/null +++ b/tests/newdata/experimentdata/test_experimentdata.py @@ -0,0 +1,737 @@ +from __future__ import annotations + +import csv +import pickle +from pathlib import Path +from typing import Iterable + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from f3dasm import ExperimentSample +from f3dasm._src.design.parameter import _ContinuousParameter +from f3dasm._src.experimentdata._experimental._jobqueue2 import \ + Index as _JobQueue +from f3dasm._src.experimentdata._experimental._newdata2 import DataTypes, _Data +from f3dasm._src.experimentdata._experimental._newexperimentdata2 import \ + ExperimentData +from f3dasm.design import Domain, Status, make_nd_continuous_domain + +pytestmark = pytest.mark.smoke + +SEED = 42 + + +def test_check_experimentdata(experimentdata: ExperimentData): + assert isinstance(experimentdata, ExperimentData) + +# Write test functions + + +def test_experiment_data_init(experimentdata: ExperimentData, domain: Domain): + assert experimentdata.domain == domain + assert experimentdata.project_dir == Path.cwd() + # Add more assertions as needed + + +def test_experiment_data_add(experimentdata: ExperimentData, + experimentdata2: ExperimentData, domain: Domain): + experimentdata_total = ExperimentData(domain) + experimentdata_total.add_experiments(experimentdata) + experimentdata_total.add_experiments(experimentdata2) + assert experimentdata_total == experimentdata + experimentdata2 + + +def test_experiment_data_len_empty(domain: 
Domain): + experiment_data = ExperimentData(domain) + assert len(experiment_data) == 0 # Update with the expected length + + +def test_experiment_data_len_equals_input_data(experimentdata: ExperimentData): + assert len(experimentdata) == len(experimentdata._input_data) + + +@pytest.mark.parametrize("slice_type", [3, [0, 1, 3]]) +def test_experiment_data_select(slice_type: int | Iterable[int], experimentdata: ExperimentData): + input_data = experimentdata._input_data[slice_type] + output_data = experimentdata._output_data[slice_type] + jobs = experimentdata._jobs[slice_type] + constructed_experimentdata = ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=experimentdata.domain) + assert constructed_experimentdata == experimentdata.select(slice_type) + +# Constructors +# ====================================================================================== + + +def test_from_file(experimentdata_continuous: ExperimentData, seed: int, tmp_path: Path): + # experimentdata_continuous.filename = tmp_path / 'test001' + experimentdata_continuous.store(tmp_path / 'experimentdata') + + experimentdata_from_file = ExperimentData.from_file( + tmp_path / 'experimentdata') + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experimentdata_continuous._input_data.to_dataframe(), experimentdata_from_file._input_data.to_dataframe(), check_dtype=False, atol=1e-6) + pd.testing.assert_frame_equal(experimentdata_continuous._output_data.to_dataframe(), + experimentdata_from_file._output_data.to_dataframe()) + pd.testing.assert_series_equal( + experimentdata_continuous._jobs.jobs, experimentdata_from_file._jobs.jobs) + # assert experimentdata_continuous.input_data == experimentdata_from_file.input_data + assert experimentdata_continuous._output_data == experimentdata_from_file._output_data + assert experimentdata_continuous.domain == experimentdata_from_file.domain + assert experimentdata_continuous._jobs == experimentdata_from_file._jobs + + +def test_from_file_wrong_name(experimentdata_continuous: ExperimentData, seed: int, tmp_path: Path): + experimentdata_continuous.filename = tmp_path / 'test001' + experimentdata_continuous.store() + + with pytest.raises(FileNotFoundError): + _ = ExperimentData.from_file(tmp_path / 'experimentdata') + + +def test_from_sampling(experimentdata_continuous: ExperimentData, seed: int): + # sampler = RandomUniform(domain=experimentdata_continuous.domain, number_of_samples=10, seed=seed) + experimentdata_from_sampling = ExperimentData.from_sampling(sampler='random', + domain=experimentdata_continuous.domain, + n_samples=10, seed=seed) + assert experimentdata_from_sampling == experimentdata_continuous + + +@pytest.fixture +def sample_csv_inputdata(tmp_path): + # Create sample CSV files for testing + input_csv_file = tmp_path / 'experimentdata_data.csv' + + # Create sample input and output dataframes + input_data = pd.DataFrame( + {'input_col1': [1, 2, 3], 'input_col2': [4, 5, 6]}) + + return input_csv_file, input_data + + +@pytest.fixture +def sample_csv_outputdata(tmp_path): + # Create sample CSV files for testing + output_csv_file = tmp_path / 'experimentdata_output.csv' + + # Create sample input and output dataframes + output_data = pd.DataFrame( + {'output_col1': [7, 8, 9], 'output_col2': [10, 11, 12]}) + + return output_csv_file, output_data + + +def test_from_object(experimentdata_continuous: ExperimentData): + input_data = experimentdata_continuous._input_data + output_data = 
experimentdata_continuous._output_data + jobs = experimentdata_continuous._jobs + domain = experimentdata_continuous.domain + experiment_data = ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) + assert experiment_data == ExperimentData( + input_data=input_data, output_data=output_data, jobs=jobs, domain=domain) + assert experiment_data == experimentdata_continuous + +# Exporters +# ====================================================================================== + + +def test_to_numpy(experimentdata_continuous: ExperimentData, numpy_array: np.ndarray): + x, y = experimentdata_continuous.to_numpy() + + # cast x to floats + x = x.astype(float) + # assert if x and numpy_array have all the same values + assert np.allclose(x, numpy_array) + + +def test_to_xarray(experimentdata_continuous: ExperimentData, xarray_dataset: xr.Dataset): + exported_dataset = experimentdata_continuous.to_xarray() + # assert if xr_dataset is equal to xarray + assert exported_dataset.equals(xarray_dataset) + + +def test_to_pandas(experimentdata_continuous: ExperimentData, pandas_dataframe: pd.DataFrame): + exported_dataframe, _ = experimentdata_continuous.to_pandas() + # assert if pandas_dataframe is equal to exported_dataframe + pd.testing.assert_frame_equal( + exported_dataframe, pandas_dataframe, atol=1e-6, check_dtype=False) +# Exporters +# ====================================================================================== + + +def test_add_new_input_column(experimentdata: ExperimentData, + continuous_parameter: _ContinuousParameter): + kwargs = {'low': continuous_parameter.lower_bound, + 'high': continuous_parameter.upper_bound} + experimentdata.add_input_parameter( + name='test', type='float', **kwargs) + assert 'test' in experimentdata._input_data.names + + +def test_add_new_output_column(experimentdata: ExperimentData): + experimentdata.add_output_parameter(name='test', is_disk=False) + assert 'test' in experimentdata._output_data.names + + +def test_set_error(experimentdata_continuous: ExperimentData): + experimentdata_continuous._set_error(3) + assert experimentdata_continuous._jobs.jobs[3] == Status.ERROR + + +# Helper function to create a temporary CSV file with sample data +def create_sample_csv_input(file_path): + data = [ + ["x0", "x1", "x2"], + [0.77395605, 0.43887844, 0.85859792], + [0.69736803, 0.09417735, 0.97562235], + [0.7611397, 0.78606431, 0.12811363], + [0.45038594, 0.37079802, 0.92676499], + [0.64386512, 0.82276161, 0.4434142], + [0.22723872, 0.55458479, 0.06381726], + [0.82763117, 0.6316644, 0.75808774], + [0.35452597, 0.97069802, 0.89312112], + [0.7783835, 0.19463871, 0.466721], + [0.04380377, 0.15428949, 0.68304895], + [0.000000, 0.000000, 0.000000], + [1.000000, 1.000000, 1.000000], + ] + with open(file_path, mode='w', newline='') as file: + writer = csv.writer(file) + writer.writerows(data) + + +def create_sample_csv_output(file_path): + data = [ + ["y"], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + + ] + with open(file_path, mode='w', newline='') as file: + writer = csv.writer(file) + writer.writerows(data) + +# Pytest fixture to create a temporary CSV file + + +def create_domain_pickle(filepath): + domain = make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3) + domain.store(filepath) + + +def create_jobs_pickle_finished(filepath): + domain = make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + 
dimensionality=3) + + _data_input = _Data.from_dataframe(pd_input()) + _data_output = _Data.from_dataframe(pd_output()) + experimentdata = ExperimentData( + domain=domain, input_data=_data_input, output_data=_data_output) + experimentdata._jobs.store(filepath) + + +def create_jobs_pickle_open(filepath): + domain = make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3) + + _data_input = _Data.from_dataframe(pd_input()) + experimentdata = ExperimentData(domain=domain, input_data=_data_input) + experimentdata._jobs.store(filepath) + + +def path_domain(tmp_path): + domain_file_path = tmp_path / "test_domain.pkl" + create_domain_pickle(domain_file_path) + return domain_file_path + + +def str_domain(tmp_path): + domain_file_path = tmp_path / "test_domain.pkl" + create_domain_pickle(domain_file_path) + return str(domain_file_path) + + +def path_jobs_finished(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_finished(jobs_file_path) + return jobs_file_path + + +def str_jobs_finished(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_finished(jobs_file_path) + return str(jobs_file_path) + + +def path_jobs_open(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_open(jobs_file_path) + return jobs_file_path + + +def str_jobs_open(tmp_path): + jobs_file_path = tmp_path / "test_jobs.pkl" + create_jobs_pickle_open(jobs_file_path) + return str(jobs_file_path) + + +def path_input(tmp_path): + csv_file_path = tmp_path / "test_input.csv" + create_sample_csv_input(csv_file_path) + return csv_file_path + + +def str_input(tmp_path): + csv_file_path = tmp_path / "test_input.csv" + create_sample_csv_input(csv_file_path) + return str(csv_file_path) + + +def path_output(tmp_path: Path): + csv_file_path = tmp_path / "test_output.csv" + create_sample_csv_output(csv_file_path) + return csv_file_path + + +def str_output(tmp_path: Path): + csv_file_path = tmp_path / "test_output.csv" + create_sample_csv_output(csv_file_path) + return str(csv_file_path) + +# Pytest test function for reading and monkeypatching a CSV file + + +def numpy_input(): + return np.array([ + [0.77395605, 0.43887844, 0.85859792], + [0.69736803, 0.09417735, 0.97562235], + [0.7611397, 0.78606431, 0.12811363], + [0.45038594, 0.37079802, 0.92676499], + [0.64386512, 0.82276161, 0.4434142], + [0.22723872, 0.55458479, 0.06381726], + [0.82763117, 0.6316644, 0.75808774], + [0.35452597, 0.97069802, 0.89312112], + [0.7783835, 0.19463871, 0.466721], + [0.04380377, 0.15428949, 0.68304895], + [0.000000, 0.000000, 0.000000], + [1.000000, 1.000000, 1.000000], + ]) + + +def numpy_output(): + return np.array([ + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + + ]) + + +def pd_input(): + return pd.DataFrame([ + [0.77395605, 0.43887844, 0.85859792], + [0.69736803, 0.09417735, 0.97562235], + [0.7611397, 0.78606431, 0.12811363], + [0.45038594, 0.37079802, 0.92676499], + [0.64386512, 0.82276161, 0.4434142], + [0.22723872, 0.55458479, 0.06381726], + [0.82763117, 0.6316644, 0.75808774], + [0.35452597, 0.97069802, 0.89312112], + [0.7783835, 0.19463871, 0.466721], + [0.04380377, 0.15428949, 0.68304895], + [0.000000, 0.000000, 0.000000], + [1.000000, 1.000000, 1.000000], + ], columns=["x0", "x1", "x2"]) + + +def pd_output(): + return pd.DataFrame([ + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + [0.0], + + ], columns=["y"]) + + +def 
data_input(): + return _Data.from_dataframe(pd_input()) + + +def data_output(): + return _Data.from_dataframe(pd_output()) + + +@pytest.mark.parametrize("input_data", [path_input, str_input, pd_input(), data_input(), numpy_input()]) +@pytest.mark.parametrize("output_data", [path_output, str_output, pd_output(), data_output()]) +@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3), None, path_domain, str_domain]) +@pytest.mark.parametrize("jobs", [None, path_jobs_finished, str_jobs_finished]) +def test_init_with_output(input_data: DataTypes, output_data: DataTypes, domain: Domain | str | Path | None, + jobs: _JobQueue | str | Path | None, + experimentdata_expected: ExperimentData, monkeypatch, tmp_path: Path): + + # if input_data is Callable + if callable(input_data): + input_data = input_data(tmp_path) + expected_data_input = pd.read_csv(input_data) + + # if output_data is Callable + if callable(output_data): + output_data = output_data(tmp_path) + expected_data_output = pd.read_csv(output_data) + + if callable(domain): + domain = domain(tmp_path) + expected_domain = Domain.from_file(domain) + + if callable(jobs): + jobs = jobs(tmp_path) + expected_jobs = _JobQueue.from_file(jobs).jobs + + # monkeypatch pd.read_csv to return the expected_data DataFrame + def mock_read_csv(*args, **kwargs): + + path = args[0] + if isinstance(args[0], str): + path = Path(path) + + if path == tmp_path / "test_input.csv": + return expected_data_input + + elif path == tmp_path / "test_output.csv": + return expected_data_output + + else: + raise ValueError("Unexpected file path") + + def mock_load_pickle(*args, **kwargs): + return expected_domain + + def mock_pd_read_pickle(*args, **kwargs): + path = args[0] + + if isinstance(path, str): + path = Path(path) + + if path == tmp_path / "test_jobs.pkl": + return expected_jobs + + else: + raise ValueError("Unexpected jobs file path") + + monkeypatch.setattr(pd, "read_csv", mock_read_csv) + monkeypatch.setattr(pickle, "load", mock_load_pickle) + monkeypatch.setattr(pd, "read_pickle", mock_pd_read_pickle) + + if isinstance(input_data, np.ndarray) and domain is None: + with pytest.raises(ValueError): + ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + return + # Initialize ExperimentData with the CSV file + experiment_data = ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experiment_data._input_data.to_dataframe(), experimentdata_expected._input_data.to_dataframe(), check_dtype=False, atol=1e-6) + pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), + experimentdata_expected._output_data.to_dataframe(), check_dtype=False) + + +@pytest.mark.parametrize("input_data", [pd_input(), path_input, str_input, data_input(), numpy_input()]) +@pytest.mark.parametrize("output_data", [None]) +@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3), None, path_domain, str_domain]) +@pytest.mark.parametrize("jobs", [None, path_jobs_open, str_jobs_open]) +def test_init_without_output(input_data: DataTypes, output_data: DataTypes, domain: Domain, jobs: _JobQueue, + experimentdata_expected_no_output: ExperimentData, monkeypatch, tmp_path): + + # if input_data is Callable + if callable(input_data): + input_data = 
input_data(tmp_path) + expected_data_input = pd.read_csv(input_data) + + # if output_data is Callable + if callable(output_data): + output_data = output_data(tmp_path) + expected_data_output = pd.read_csv(output_data) + + if callable(domain): + domain = domain(tmp_path) + expected_domain = Domain.from_file(domain) + + if callable(jobs): + jobs = jobs(tmp_path) + expected_jobs = _JobQueue.from_file(jobs).jobs + + # monkeypatch pd.read_csv to return the expected_data DataFrame + def mock_read_csv(*args, **kwargs): + + path = args[0] + if isinstance(args[0], str): + path = Path(path) + + if path == tmp_path / "test_input.csv": + return expected_data_input + + elif path == tmp_path / "test_output.csv": + return expected_data_output + + else: + raise ValueError("Unexpected file path") + + def mock_load_pickle(*args, **kwargs): + return expected_domain + + def mock_pd_read_pickle(*args, **kwargs): + path = args[0] + + if isinstance(path, str): + path = Path(path) + + if path == tmp_path / "test_jobs.pkl": + return expected_jobs + + monkeypatch.setattr(pd, "read_csv", mock_read_csv) + monkeypatch.setattr(pickle, "load", mock_load_pickle) + monkeypatch.setattr(pd, "read_pickle", mock_pd_read_pickle) + + if isinstance(input_data, np.ndarray) and domain is None: + with pytest.raises(ValueError): + ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + return + + # Initialize ExperimentData with the CSV file + experiment_data = ExperimentData(domain=domain, input_data=input_data, + output_data=output_data, jobs=jobs) + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experiment_data._input_data.to_dataframe(), experimentdata_expected_no_output._input_data.to_dataframe(), atol=1e-6, check_dtype=False) + pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), + experimentdata_expected_no_output._output_data.to_dataframe()) + pd.testing.assert_series_equal( + experiment_data._jobs.jobs, experimentdata_expected_no_output._jobs.jobs) + # assert experiment_data.domain == experimentdata_expected_no_output.domain + assert experiment_data._jobs == experimentdata_expected_no_output._jobs + + +@pytest.mark.parametrize("input_data", [None]) +@pytest.mark.parametrize("output_data", [None]) +@pytest.mark.parametrize("domain", [make_nd_continuous_domain(bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), + dimensionality=3), path_domain, str_domain]) +def test_init_only_domain(input_data: DataTypes, output_data: DataTypes, domain: Domain | str | Path, + experimentdata_expected_only_domain: ExperimentData, + monkeypatch, tmp_path): + + # if input_data is Callable + if callable(input_data): + input_data = input_data(tmp_path) + expected_data_input = pd.read_csv(input_data) + + # if output_data is Callable + if callable(output_data): + output_data = output_data(tmp_path) + expected_data_output = pd.read_csv(output_data) + + if callable(domain): + domain = domain(tmp_path) + expected_domain = Domain.from_file(domain) + + # monkeypatch pd.read_csv to return the expected_data DataFrame + def mock_read_csv(*args, **kwargs): + + path = args[0] + if isinstance(args[0], str): + path = Path(path) + + if path == tmp_path / "test_input.csv": + return expected_data_input + + elif path == tmp_path / "test_output.csv": + return expected_data_output + + else: + raise ValueError("Unexpected file path") + + def mock_load_pickle(*args, **kwargs): + return expected_domain + + monkeypatch.setattr(pd, "read_csv", 
mock_read_csv) + monkeypatch.setattr(pickle, "load", mock_load_pickle) + + # Initialize ExperimentData with the CSV file + experiment_data = ExperimentData(domain=domain, input_data=input_data, + output_data=output_data) + + # Check if the input_data attribute of ExperimentData matches the expected_data + pd.testing.assert_frame_equal( + experiment_data._input_data.to_dataframe(), experimentdata_expected_only_domain._input_data.to_dataframe(), check_dtype=False) + pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), + experimentdata_expected_only_domain._output_data.to_dataframe(), check_dtype=False) + assert experiment_data._input_data == experimentdata_expected_only_domain._input_data + assert experiment_data._output_data == experimentdata_expected_only_domain._output_data + assert experiment_data.domain == experimentdata_expected_only_domain.domain + assert experiment_data._jobs == experimentdata_expected_only_domain._jobs + + assert experiment_data == experimentdata_expected_only_domain + + +@pytest.mark.parametrize("input_data", [[0.1, 0.2], {"a": 0.1, "b": 0.2}, 0.2, 2]) +def test_invalid_type(input_data): + with pytest.raises(TypeError): + ExperimentData(input_data=input_data) + + +def test_add_invalid_type(experimentdata: ExperimentData): + with pytest.raises(TypeError): + experimentdata + 1 + + +def test_add_two_different_domains(experimentdata: ExperimentData, experimentdata_continuous: ExperimentData): + with pytest.raises(ValueError): + experimentdata + experimentdata_continuous + + +def test_repr_html(experimentdata: ExperimentData, monkeypatch): + assert isinstance(experimentdata._repr_html_(), str) + + +def test_store(experimentdata: ExperimentData, tmp_path: Path): + experimentdata.store(tmp_path / "test") + assert (tmp_path / "test" / "experiment_data" / "input.csv").exists() + assert (tmp_path / "test" / "experiment_data" / "output.csv").exists() + assert (tmp_path / "test" / "experiment_data" / "domain.pkl").exists() + assert (tmp_path / "test" / "experiment_data" / "jobs.pkl").exists() + + +def test_store_give_no_filename(experimentdata: ExperimentData, tmp_path: Path): + experimentdata.set_project_dir(tmp_path / 'test2') + experimentdata.store() + assert (tmp_path / "test2" / "experiment_data" / "input.csv").exists() + assert (tmp_path / "test2" / "experiment_data" / "output.csv").exists() + assert (tmp_path / "test2" / "experiment_data" / "domain.pkl").exists() + assert (tmp_path / "test2" / "experiment_data" / "jobs.pkl").exists() + + +@pytest.mark.parametrize("mode", ["sequential", "parallel", "typo"]) +def test_evaluate_mode(mode: str, experimentdata_continuous: ExperimentData, tmp_path: Path): + experimentdata_continuous.filename = tmp_path / 'test009' + + if mode == "typo": + with pytest.raises(ValueError): + experimentdata_continuous.evaluate("ackley", mode=mode, kwargs={ + "scale_bounds": np.array([[0., 1.], [0., 1.], [0., 1.]]), 'seed': SEED}) + else: + experimentdata_continuous.evaluate("ackley", mode=mode, kwargs={ + "scale_bounds": np.array([[0., 1.], [0., 1.], [0., 1.]]), 'seed': SEED}) + + +def test_get_input_data(experimentdata_expected_no_output: ExperimentData): + input_data = experimentdata_expected_no_output.get_input_data() + df, _ = input_data.to_pandas() + pd.testing.assert_frame_equal(df, pd_input(), check_dtype=False, atol=1e-6) + assert experimentdata_expected_no_output._input_data == input_data._input_data + + +@pytest.mark.parametrize("selection", ["x0", ["x0"], ["x0", "x2"]]) +def 
test_get_input_data_selection(experimentdata_expected_no_output: ExperimentData, selection: Iterable[str] | str): + input_data = experimentdata_expected_no_output.get_input_data(selection) + df, _ = input_data.to_pandas() + if isinstance(selection, str): + selection = [selection] + selected_pd = pd_input()[selection] + pd.testing.assert_frame_equal( + df, selected_pd, check_dtype=False, atol=1e-6) + + +def test_get_output_data(experimentdata_expected: ExperimentData): + output_data = experimentdata_expected.get_output_data() + _, df = output_data.to_pandas() + pd.testing.assert_frame_equal(df, pd_output(), check_dtype=False) + assert experimentdata_expected._output_data == output_data._output_data + + +@pytest.mark.parametrize("selection", ["y", ["y"]]) +def test_get_output_data_selection(experimentdata_expected: ExperimentData, selection: Iterable[str] | str): + output_data = experimentdata_expected.get_output_data(selection) + _, df = output_data.to_pandas() + if isinstance(selection, str): + selection = [selection] + selected_pd = pd_output()[selection] + pd.testing.assert_frame_equal(df, selected_pd, check_dtype=False) + + +def test_iter_behaviour(experimentdata_continuous: ExperimentData): + for i in experimentdata_continuous: + assert isinstance(i, ExperimentSample) + + selected_experimentdata = experimentdata_continuous.select([0, 2, 4]) + for i in selected_experimentdata: + assert isinstance(i, ExperimentSample) + + +def test_select_with_status_open(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('open') + assert all(job == Status.OPEN for job in selected_data._jobs.jobs) + + +def test_select_with_status_in_progress(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('in progress') + assert all(job == Status.IN_PROGRESS for job in selected_data._jobs.jobs) + + +def test_select_with_status_finished(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('finished') + assert all(job == Status.FINISHED for job in selected_data._jobs.jobs) + + +def test_select_with_status_error(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('error') + assert all(job == Status.ERROR for job in selected_data._jobs.jobs) + + +def test_select_with_status_invalid_status(experimentdata: ExperimentData): + with pytest.raises(ValueError): + _ = experimentdata.select_with_status('invalid_status') + + +if __name__ == "__main__": # pragma: no cover + pytest.main() diff --git a/tests/newdata/test_data.py b/tests/newdata/test_data.py index fb5f0cba..6c5abe52 100644 --- a/tests/newdata/test_data.py +++ b/tests/newdata/test_data.py @@ -1,5 +1,4 @@ -from copy import deepcopy -from typing import Any, Dict, List +from typing import Any, Dict import numpy as np import pandas as pd @@ -175,6 +174,14 @@ def test_select_columns_single(): assert selected_data.data == expected_data +def test_rename_columns(): + input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} + data = _Data(input_data) + data.rename_columns({"a": "x", "b": "y"}) + expected_data = {0: {"x": 1, "y": 2, "c": 3}, 1: {"x": 4, "y": 5, "c": 6}} + assert data.data == expected_data + + def test_drop(): input_data = {0: {"a": 1, "b": 2, "c": 3}, 1: {"a": 4, "b": 5, "c": 6}} data = _Data(input_data)
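Usage note (not part of the patch): the sketch below illustrates how the newly added rename_columns method and the exist_ok flag on add_column are expected to behave. The dict-of-dicts layout and the MISSING_VALUE stand-in are assumptions mirroring the _Data structure exercised by test_rename_columns above, not the library's actual internals.

# Standalone sketch of the behaviour added in this patch (assumed layout:
# data maps row index -> dict of column name -> value; MISSING_VALUE is a stand-in).
MISSING_VALUE = float("nan")

data = {0: {"a": 1, "b": 2, "c": 3},
        1: {"a": 4, "b": 5, "c": 6}}

def rename_columns(data, mapping):
    # Pop each old key per row and re-insert its value under the new key.
    for row in data:
        for old_key, new_key in mapping.items():
            data[row][new_key] = data[row].pop(old_key)

def add_column(data, key, exist_ok=True):
    # With exist_ok=False, refuse to touch a column that already exists.
    for row in data:
        if not exist_ok and key in data[row]:
            raise KeyError(f"Key '{key}' already exists in the data.")
        data[row][key] = MISSING_VALUE

rename_columns(data, {"a": "x", "b": "y"})
assert data == {0: {"x": 1, "y": 2, "c": 3}, 1: {"x": 4, "y": 5, "c": 6}}

add_column(data, "z")                 # adds 'z' filled with MISSING_VALUE
add_column(data, "z", exist_ok=True)  # overwrites silently, matching the default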