Fix data indexing issue and add column renaming functionality

bessagroup · Jun 25, 2024 · 6d99381 · 6d99381
1 parent 9be5928
commit 6d99381
Show file tree

Hide file tree

Showing 8 changed files with 956 additions and 19 deletions.
diff --git a/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py b/src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py
@@ -73,8 +73,8 @@ def __add__(self, __o: Index | str) -> Index:
 
  # Make a copy of other.jobs and modify its index
  other_jobs_copy = deepcopy(__o)
- other_jobs_copy.jobs.index = range(
- len(other_jobs_copy)) + self.jobs.index[-1] + 1
+ other_jobs_copy.jobs.index = pd.Index(range(
+ len(other_jobs_copy))) + self.jobs.index[-1] + 1
 
  return Index(pd.concat([self.jobs, other_jobs_copy.jobs]))
 

diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
@@ -148,8 +148,9 @@ def __repr__(self) -> str:
 # Properties
 # =============================================================================
 
+
  @property
- def indices(self) -> List[int]:
+ def indices(self) -> pd.Index:
  """
  Get the indices of the data.
 
@@ -158,7 +159,7 @@ def indices(self) -> List[int]:
  List[int]
  The list of indices.
  """
- return list(self.data.keys())
+ return pd.Index(list(self.data.keys()))
 
  @property
  def names(self) -> List[str]:
@@ -187,8 +188,9 @@ def is_empty(self) -> bool:
 # Initialization
 # =============================================================================
 
+
  @classmethod
- def from_indices(cls, rows: Iterable[int]) -> _Data:
+ def from_indices(cls, rows: Iterable[int] | pd.Index) -> _Data:
  """
  Create a _Data object from a list of indices.
 
@@ -426,7 +428,7 @@ def n_best_samples(self, nosamples: int, key: str) -> pd.DataFrame:
  df = self.to_dataframe()
  return df.nsmallest(n=nosamples, columns=key)
 
- def add_column(self, key: str):
+ def add_column(self, key: str, exist_ok: bool = True):
  """
  Add a new column to the data with missing values.
 
@@ -436,8 +438,23 @@ def add_column(self, key: str):
  The key for the new column.
  """
  for row in self.data:
+ if not exist_ok and key in self.data[row]:
+ raise KeyError(f"Key '{key}' already exists in the data.")
  self.data[row][key] = MISSING_VALUE
 
+ def rename_columns(self, mapping: Dict[str, str]):
+     """
+     Rename columns in the data.
+
+     All names of a row are remapped in one pass, so overlapping
+     mappings (e.g. swapping two columns) do not overwrite each other
+     and a renamed column keeps its position within the row.
+
+     Parameters
+     ----------
+     mapping : Dict[str, str]
+         The mapping of old to new column names.
+
+     Raises
+     ------
+     KeyError
+         If an old column name is missing from any row. The check runs
+         before anything is modified, so the data is left untouched.
+     """
+     records = [self.data[row] for row in self.data]
+
+     # Validate first so that a failure cannot leave the data
+     # half-renamed.
+     for record in records:
+         for old_key in mapping:
+             if old_key not in record:
+                 raise KeyError(old_key)
+
+     for record in records:
+         renamed = {mapping.get(key, key): value
+                    for key, value in record.items()}
+         # Mutate the existing row object rather than replacing it, so
+         # other references to the row stay valid.
+         record.clear()
+         record.update(renamed)
+
  def remove(self, rows: Iterable[int]):
  """
  Remove specific rows from the data.
@@ -513,7 +530,8 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
 # =============================================================================
 
 
-def _data_factory(data: DataTypes) -> _Data:
+def _data_factory(data: DataTypes,
+ keys: Optional[Iterable[str]] = None) -> _Data:
  if data is None:
  return _Data()
 
@@ -527,7 +545,7 @@ def _data_factory(data: DataTypes) -> _Data:
  return _Data.from_file(Path(data))
 
  elif isinstance(data, np.ndarray):
- return _Data.from_numpy(data)
+ return _Data.from_numpy(data, keys=keys)
 
  else:
  raise TypeError(

diff --git a/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py b/src/f3dasm/_src/experimentdata/_experimental/_newexperimentdata2.py
@@ -110,8 +110,12 @@ def __init__(self,
 
  self.project_dir = _project_dir_factory(project_dir)
 
- self._input_data = _data_factory(input_data)
- self._output_data = _data_factory(output_data)
+ if isinstance(input_data, np.ndarray) and isinstance(domain, Domain):
+ self._input_data = _data_factory(input_data, domain.names)
+ self._output_data = _data_factory(output_data, domain.output_names)
+ else:
+ self._input_data = _data_factory(input_data)
+ self._output_data = _data_factory(output_data)
 
  # Create empty output_data from indices if output_data is empty
  if self._output_data.is_empty():
@@ -134,9 +138,8 @@ def __init__(self,
 
  # For backwards compatibility; if the output_data has
  # only one column, rename it to 'y'
- # TODO: Fix this for newdata2
  if self._output_data.names == [0]:
- self._output_data.columns.set_columnnames(['y'])
+ self._output_data.rename_columns({0: 'y'})
 
  def __len__(self):
  """The len() method returns the number of datapoints"""
@@ -944,7 +947,7 @@ def _set_experiment_sample(self,
 
  self._output_data.set_data(
  row=experiment_sample.job_number, value=value,
- column=column)
+ key=column)
 
  self._jobs.mark(experiment_sample._jobnumber, status=Status.FINISHED)
 
@@ -997,11 +1000,10 @@ def _set_error(self, index: int) -> None:
  index
  index of the experiment_sample to mark as error
  """
- # self.jobs.mark_as_error(index)
  self._jobs.mark(index, status=Status.ERROR)
- self._output_data.set_data(
- index,
- value=['ERROR' for _ in self._output_data.names])
+ for column in self._output_data.names:
+ self._output_data.set_data(
+  index, value='ERROR', key=column)
 
  @_access_file
  def _write_error(self, index: int):

diff --git a/tests/newdata/experimentdata/__init__.py b/tests/newdata/experimentdata/__init__.py
diff --git a/tests/newdata/experimentdata/conftest.py b/tests/newdata/experimentdata/conftest.py
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import pytest
+import xarray as xr
+
+from f3dasm._src.design.parameter import (_CategoricalParameter,
+ _ContinuousParameter,
+ _DiscreteParameter)
+from f3dasm._src.experimentdata._experimental._newexperimentdata2 import \
+ ExperimentData
+from f3dasm.design import Domain, make_nd_continuous_domain
+
+# Seed shared by every fixture in this module that draws random numbers.
+SEED = 42
+
+
+@pytest.fixture(scope="package")
+def seed() -> int:
+    """Expose the module-level ``SEED`` constant as a fixture."""
+    return SEED
+
+
+@pytest.fixture(scope="package")
+def domain() -> Domain:
+    """Build a domain holding one continuous, one discrete and one
+    categorical parameter."""
+    return Domain(space={
+        'x1': _ContinuousParameter(-5.12, 5.12),
+        'x2': _DiscreteParameter(-3, 3),
+        'x3': _CategoricalParameter(["red", "green", "blue"]),
+    })
+
+
+@pytest.fixture(scope="package")
+def domain_continuous() -> Domain:
+    """Build a 3-dimensional continuous domain, passing the bounds row
+    ``[0., 1.]`` for each dimension."""
+    unit_bounds = np.array([[0., 1.]] * 3)
+    return make_nd_continuous_domain(bounds=unit_bounds, dimensionality=3)
+
+
+@pytest.fixture(scope="package")
+def experimentdata(domain: Domain) -> ExperimentData:
+    """Create an ExperimentData on ``domain`` and draw 10 random samples
+    into it using the shared seed."""
+    sampled = ExperimentData(domain)
+    sampled.sample(sampler='random', n_samples=10, seed=SEED)
+    return sampled
+
+
+@pytest.fixture(scope="package")
+def experimentdata2(domain: Domain) -> ExperimentData:
+    """Create 10 random samples on ``domain`` through the
+    ``from_sampling`` constructor."""
+    return ExperimentData.from_sampling(
+        sampler='random',
+        domain=domain,
+        n_samples=10,
+        seed=SEED,
+    )
+
+
+@pytest.fixture(scope="package")
+def experimentdata_continuous(domain_continuous: Domain) -> ExperimentData:
+    """Create 10 random samples on the continuous domain fixture."""
+    return ExperimentData.from_sampling(
+        sampler='random',
+        domain=domain_continuous,
+        n_samples=10,
+        seed=SEED,
+    )
+
+
+@pytest.fixture(scope="package")
+def experimentdata_expected() -> ExperimentData:
+    """Build the reference ExperimentData with inputs and outputs.
+
+    Ten random samples are drawn on a 3-dimensional continuous domain,
+    each sample gets the output ``y = 0.0`` stored, and two extra rows
+    (all zeros and all ones, both with output 0.0) are appended.
+    """
+    domain_continuous = make_nd_continuous_domain(
+        bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3)
+    data = ExperimentData.from_sampling(
+        sampler='random', domain=domain_continuous, n_samples=10, seed=SEED)
+    for es, output in zip(data, np.zeros((10, 1))):
+        # ``output`` is a shape-(1,) array; index the element explicitly
+        # because implicit array-to-float conversion for ndim > 0 is
+        # deprecated since NumPy 1.25.
+        es.store(name='y', object=float(output[0]))
+        data._set_experiment_sample(es)
+    data.add(input_data=np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]),
+             output_data=np.array([[0.0], [0.0]]), domain=data.domain)
+
+    # data._input_data.data = data._input_data.data.round(6)
+    return data
+
+
+@pytest.fixture(scope="package")
+def experimentdata_expected_no_output() -> ExperimentData:
+    """Build the reference ExperimentData that carries inputs only.
+
+    Ten random samples are drawn on a 3-dimensional continuous domain and
+    two extra input rows (all zeros and all ones) are appended without
+    any output data.
+    """
+    unit_bounds = np.array([[0., 1.], [0., 1.], [0., 1.]])
+    continuous = make_nd_continuous_domain(
+        bounds=unit_bounds, dimensionality=3)
+    expected = ExperimentData.from_sampling(
+        sampler='random', domain=continuous, n_samples=10, seed=SEED)
+
+    extra_inputs = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
+    expected.add(input_data=extra_inputs, domain=continuous)
+
+    # data._input_data.data = data._input_data.data.round(6)
+
+    return expected
+
+
+@pytest.fixture(scope="package")
+def experimentdata_expected_only_domain() -> ExperimentData:
+    """Build an ExperimentData that is given a 3-dimensional continuous
+    domain and nothing else."""
+    unit_bounds = np.array([[0., 1.], [0., 1.], [0., 1.]])
+    continuous = make_nd_continuous_domain(
+        bounds=unit_bounds, dimensionality=3)
+    return ExperimentData(domain=continuous)
+
+
+@pytest.fixture(scope="package")
+def numpy_array(domain_continuous: Domain) -> np.ndarray:
+    """Return a seeded random array with 10 rows and one column per
+    entry of ``domain_continuous``."""
+    n_columns = len(domain_continuous)
+    generator = np.random.default_rng(SEED)
+    return generator.random((10, n_columns))
+
+
+@pytest.fixture(scope="package")
+def numpy_output_array(domain_continuous: Domain) -> np.ndarray:
+    """Return a (10, 1) array of zeros.
+
+    ``domain_continuous`` is requested as a fixture but not read here.
+    """
+    return np.zeros(shape=(10, 1))
+
+
+@pytest.fixture(scope="package")
+def xarray_dataset(domain_continuous: Domain) -> xr.Dataset:
+    """Assemble an xarray Dataset with a seeded random 'input' array and
+    an 'output' array built from an empty DataFrame."""
+    generator = np.random.default_rng(SEED)
+    input_values = generator.random((10, len(domain_continuous)))
+    empty_output = pd.DataFrame()
+
+    input_array = xr.DataArray(
+        input_values,
+        dims=['iterations', 'input_dim'],
+        coords={'iterations': range(len(input_values)),
+                'input_dim': domain_continuous.names})
+    output_array = xr.DataArray(
+        empty_output,
+        dims=['iterations', 'output_dim'],
+        coords={'iterations': range(len(empty_output)),
+                'output_dim': empty_output.columns.to_list()})
+
+    return xr.Dataset({'input': input_array, 'output': output_array})
+
+
+@pytest.fixture(scope="package")
+def pandas_dataframe(domain_continuous: Domain) -> pd.DataFrame:
+    """Return a seeded random DataFrame with 10 rows whose columns are
+    named after ``domain_continuous.names``."""
+    generator = np.random.default_rng(SEED)
+    values = generator.random((10, len(domain_continuous)))
+    return pd.DataFrame(values, columns=domain_continuous.names)
+
+
+@pytest.fixture(scope="package")
+def continuous_parameter() -> _ContinuousParameter:
+    """Return a continuous parameter with bounds 0.0 and 1.0."""
+    unit_interval = _ContinuousParameter(lower_bound=0., upper_bound=1.)
+    return unit_interval
diff --git a/tests/newdata/experimentdata/test__jobqueue.py b/tests/newdata/experimentdata/test__jobqueue.py
@@ -0,0 +1,43 @@
+import pandas as pd
+
+from f3dasm._src.experimentdata._experimental._jobqueue2 import \
+ Index as _JobQueue
+
+# from f3dasm._src.experimentdata._jobqueue import _JobQueue
+
+
+def test_select_all_with_matching_status():
+    """select_all keeps exactly the jobs whose status equals the query."""
+    statuses = ['in progress', 'running', 'completed', 'in progress',
+                'failed']
+    queue = _JobQueue()
+    queue.jobs = pd.Series(statuses)
+
+    matching = queue.select_all('in progress')
+
+    # Two of the five jobs carry the requested status.
+    assert (matching.jobs == ['in progress', 'in progress']).all()
+
+
+def test_select_all_with_no_matching_status():
+    """select_all yields an empty selection when no job has the status."""
+    statuses = ['in progress', 'running', 'completed', 'in progress',
+                'failed']
+    queue = _JobQueue()
+    queue.jobs = pd.Series(statuses)
+
+    matching = queue.select_all('cancelled')
+
+    # 'cancelled' does not occur among the statuses above.
+    assert matching.jobs.empty
+
+
+def test_select_all_with_empty_job_queue():
+    """select_all on a freshly constructed queue yields an empty
+    selection."""
+    queue = _JobQueue()
+
+    matching = queue.select_all('in progress')
+
+    assert matching.jobs.empty