Skip to content

Commit

Permalink
Fix data indexing issue and add column renaming functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
mpvanderschelling committed Jun 25, 2024
1 parent 9be5928 commit 6d99381
Show file tree
Hide file tree
Showing 8 changed files with 956 additions and 19 deletions.
4 changes: 2 additions & 2 deletions src/f3dasm/_src/experimentdata/_experimental/_jobqueue2.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ def __add__(self, __o: Index | str) -> Index:

# Make a copy of other.jobs and modify its index
other_jobs_copy = deepcopy(__o)
other_jobs_copy.jobs.index = range(
len(other_jobs_copy)) + self.jobs.index[-1] + 1
other_jobs_copy.jobs.index = pd.Index(range(
len(other_jobs_copy))) + self.jobs.index[-1] + 1

return Index(pd.concat([self.jobs, other_jobs_copy.jobs]))

Expand Down
30 changes: 24 additions & 6 deletions src/f3dasm/_src/experimentdata/_experimental/_newdata2.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,9 @@ def __repr__(self) -> str:
# Properties
# =============================================================================


@property
def indices(self) -> List[int]:
def indices(self) -> pd.Index:
"""
Get the indices of the data.
Expand All @@ -158,7 +159,7 @@ def indices(self) -> List[int]:
pd.Index
The indices of the data.
"""
return list(self.data.keys())
return pd.Index(list(self.data.keys()))

@property
def names(self) -> List[str]:
Expand Down Expand Up @@ -187,8 +188,9 @@ def is_empty(self) -> bool:
# Initialization
# =============================================================================


@classmethod
def from_indices(cls, rows: Iterable[int]) -> _Data:
def from_indices(cls, rows: Iterable[int] | pd.Index) -> _Data:
"""
Create a _Data object from a list of indices.
Expand Down Expand Up @@ -426,7 +428,7 @@ def n_best_samples(self, nosamples: int, key: str) -> pd.DataFrame:
df = self.to_dataframe()
return df.nsmallest(n=nosamples, columns=key)

def add_column(self, key: str):
def add_column(self, key: str, exist_ok: bool = True):
"""
Add a new column to the data with missing values.
Expand All @@ -436,8 +438,23 @@ def add_column(self, key: str):
The key for the new column.
"""
for row in self.data:
if not exist_ok and key in self.data[row]:
raise KeyError(f"Key '{key}' already exists in the data.")
self.data[row][key] = MISSING_VALUE

def rename_columns(self, mapping: Dict[str, str]):
    """
    Rename columns in the data.

    All renames of a row are applied simultaneously: the old values are
    taken out first and only then stored under their new names. This keeps
    mappings that swap or chain names (e.g. ``{'a': 'b', 'b': 'a'}``) from
    overwriting a value that still has to be moved.

    Parameters
    ----------
    mapping : Dict[str, str]
        The mapping of old to new column names.

    Raises
    ------
    KeyError
        If an old column name is not present in a row. The offending row
        is left untouched.
    """
    for row in self.data:
        record = self.data[row]

        # Validate before mutating so a failing row is never half-renamed
        for old_key in mapping:
            if old_key not in record:
                raise KeyError(old_key)

        # Phase 1: detach every old value; phase 2: attach the new names.
        # Doing this in two phases makes overlapping old/new names safe.
        values = [record.pop(old_key) for old_key in mapping]
        for new_key, value in zip(mapping.values(), values):
            record[new_key] = value

def remove(self, rows: Iterable[int]):
"""
Remove specific rows from the data.
Expand Down Expand Up @@ -513,7 +530,8 @@ def _convert_dict_to_data(dictionary: Dict[str, Any]) -> _Data:
# =============================================================================


def _data_factory(data: DataTypes) -> _Data:
def _data_factory(data: DataTypes,
keys: Optional[Iterable[str]] = None) -> _Data:
if data is None:
return _Data()

Expand All @@ -527,7 +545,7 @@ def _data_factory(data: DataTypes) -> _Data:
return _Data.from_file(Path(data))

elif isinstance(data, np.ndarray):
return _Data.from_numpy(data)
return _Data.from_numpy(data, keys=keys)

else:
raise TypeError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,12 @@ def __init__(self,

self.project_dir = _project_dir_factory(project_dir)

self._input_data = _data_factory(input_data)
self._output_data = _data_factory(output_data)
if isinstance(input_data, np.ndarray) and isinstance(domain, Domain):
self._input_data = _data_factory(input_data, domain.names)
self._output_data = _data_factory(output_data, domain.output_names)
else:
self._input_data = _data_factory(input_data)
self._output_data = _data_factory(output_data)

# Create empty output_data from indices if output_data is empty
if self._output_data.is_empty():
Expand All @@ -134,9 +138,8 @@ def __init__(self,

# For backwards compatibility; if the output_data has
# only one column, rename it to 'y'
# TODO: Fix this for newdata2
if self._output_data.names == [0]:
self._output_data.columns.set_columnnames(['y'])
self._output_data.rename_columns({0: 'y'})

def __len__(self):
"""The len() method returns the number of datapoints"""
Expand Down Expand Up @@ -944,7 +947,7 @@ def _set_experiment_sample(self,

self._output_data.set_data(
row=experiment_sample.job_number, value=value,
column=column)
key=column)

self._jobs.mark(experiment_sample._jobnumber, status=Status.FINISHED)

Expand Down Expand Up @@ -997,11 +1000,10 @@ def _set_error(self, index: int) -> None:
index
index of the experiment_sample to mark as error
"""
# self.jobs.mark_as_error(index)
self._jobs.mark(index, status=Status.ERROR)
self._output_data.set_data(
index,
value=['ERROR' for _ in self._output_data.names])
for column in self._output_data.names:
self._output_data.set_data(
index, value='ERROR', key=column)

@_access_file
def _write_error(self, index: int):
Expand Down
Empty file.
130 changes: 130 additions & 0 deletions tests/newdata/experimentdata/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
from __future__ import annotations

import numpy as np
import pandas as pd
import pytest
import xarray as xr

from f3dasm._src.design.parameter import (_CategoricalParameter,
_ContinuousParameter,
_DiscreteParameter)
from f3dasm._src.experimentdata._experimental._newexperimentdata2 import \
ExperimentData
from f3dasm.design import Domain, make_nd_continuous_domain

# Seed passed to every sampler / random generator in the fixtures below
SEED = 42


@pytest.fixture(scope="package")
def seed() -> int:
    """Expose the module-level ``SEED`` constant as a fixture."""
    return SEED


@pytest.fixture(scope="package")
def domain() -> Domain:
    """
    Build a Domain with three inputs of different parameter kinds.

    'x1' is continuous, 'x2' is discrete and 'x3' is categorical.
    """
    return Domain(space={
        'x1': _ContinuousParameter(-5.12, 5.12),
        'x2': _DiscreteParameter(-3, 3),
        'x3': _CategoricalParameter(["red", "green", "blue"]),
    })


@pytest.fixture(scope="package")
def domain_continuous() -> Domain:
    """Build a 3-dimensional continuous domain with bounds [0, 1] each."""
    unit_bounds = np.array([[0., 1.], [0., 1.], [0., 1.]])
    return make_nd_continuous_domain(bounds=unit_bounds, dimensionality=3)


@pytest.fixture(scope="package")
def experimentdata(domain: Domain) -> ExperimentData:
    """
    Create an ExperimentData from the mixed domain and sample it in place.

    Ten samples are requested from the 'random' sampler with ``SEED``.
    """
    sampled = ExperimentData(domain)
    sampled.sample(sampler='random', n_samples=10, seed=SEED)
    return sampled


@pytest.fixture(scope="package")
def experimentdata2(domain: Domain) -> ExperimentData:
    """
    Create an ExperimentData via ``from_sampling`` on the mixed domain.

    Ten samples are requested from the 'random' sampler with ``SEED``.
    """
    return ExperimentData.from_sampling(
        sampler='random',
        domain=domain,
        n_samples=10,
        seed=SEED,
    )


@pytest.fixture(scope="package")
def experimentdata_continuous(domain_continuous: Domain) -> ExperimentData:
    """
    Create an ExperimentData via ``from_sampling`` on the continuous domain.

    Ten samples are requested from the 'random' sampler with ``SEED``.
    """
    return ExperimentData.from_sampling(
        sampler='random',
        domain=domain_continuous,
        n_samples=10,
        seed=SEED,
    )


@pytest.fixture(scope="package")
def experimentdata_expected() -> ExperimentData:
    """
    Build the reference ExperimentData including output values.

    Ten samples are drawn with the 'random' sampler and ``SEED`` from a
    3-dimensional continuous domain with bounds [0, 1]. Every sample gets
    an output 'y' of 0.0 stored, after which two extra rows (inputs all
    zeros and all ones, outputs zero) are added.
    """
    domain_continuous = make_nd_continuous_domain(
        bounds=np.array([[0., 1.], [0., 1.], [0., 1.]]), dimensionality=3)
    data = ExperimentData.from_sampling(
        sampler='random', domain=domain_continuous, n_samples=10, seed=SEED)

    for es, output in zip(data, np.zeros((10, 1))):
        # `output` is a length-1 array; index it explicitly because calling
        # float() on an array with ndim > 0 is deprecated since NumPy 1.25.
        es.store(name='y', object=float(output[0]))
        data._set_experiment_sample(es)

    data.add(input_data=np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]),
             output_data=np.array([[0.0], [0.0]]), domain=data.domain)

    return data


@pytest.fixture(scope="package")
def experimentdata_expected_no_output() -> ExperimentData:
    """
    Build the reference ExperimentData without any output values.

    Ten samples are drawn with the 'random' sampler and ``SEED`` from a
    3-dimensional continuous domain with bounds [0, 1]; two extra input
    rows (all zeros and all ones) are then added.
    """
    unit_bounds = np.array([[0., 1.], [0., 1.], [0., 1.]])
    domain_continuous = make_nd_continuous_domain(
        bounds=unit_bounds, dimensionality=3)

    expected = ExperimentData.from_sampling(
        sampler='random', domain=domain_continuous, n_samples=10, seed=SEED)

    extra_inputs = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
    expected.add(input_data=extra_inputs, domain=domain_continuous)

    return expected


@pytest.fixture(scope="package")
def experimentdata_expected_only_domain() -> ExperimentData:
    """
    Build an ExperimentData that is given only a domain.

    The domain is 3-dimensional and continuous with bounds [0, 1].
    """
    unit_bounds = np.array([[0., 1.], [0., 1.], [0., 1.]])
    return ExperimentData(
        domain=make_nd_continuous_domain(
            bounds=unit_bounds, dimensionality=3))


@pytest.fixture(scope="package")
def numpy_array(domain_continuous: Domain) -> np.ndarray:
    """
    Return a seeded random array with 10 rows.

    The number of columns equals ``len(domain_continuous)``.
    """
    generator = np.random.default_rng(SEED)
    shape = (10, len(domain_continuous))
    return generator.random(shape)


@pytest.fixture(scope="package")
def numpy_output_array(domain_continuous: Domain) -> np.ndarray:
    """Return a (10, 1) array of zeros to use as output data."""
    output_shape = (10, 1)
    return np.zeros(output_shape)


@pytest.fixture(scope="package")
def xarray_dataset(domain_continuous: Domain) -> xr.Dataset:
    """
    Return an xarray Dataset with an 'input' and an 'output' variable.

    'input' holds seeded random values with 10 rows and one column per
    name in ``domain_continuous.names``; 'output' is built from an empty
    DataFrame.
    """
    generator = np.random.default_rng(SEED)
    input_values = generator.random((10, len(domain_continuous)))
    empty_output = pd.DataFrame()

    input_array = xr.DataArray(
        input_values,
        dims=['iterations', 'input_dim'],
        coords={
            'iterations': range(len(input_values)),
            'input_dim': domain_continuous.names,
        },
    )
    output_array = xr.DataArray(
        empty_output,
        dims=['iterations', 'output_dim'],
        coords={
            'iterations': range(len(empty_output)),
            'output_dim': empty_output.columns.to_list(),
        },
    )

    return xr.Dataset({'input': input_array, 'output': output_array})


@pytest.fixture(scope="package")
def pandas_dataframe(domain_continuous: Domain) -> pd.DataFrame:
    """
    Return a seeded random DataFrame with 10 rows.

    Columns are labelled with ``domain_continuous.names``.
    """
    generator = np.random.default_rng(SEED)
    values = generator.random((10, len(domain_continuous)))
    return pd.DataFrame(values, columns=domain_continuous.names)


@pytest.fixture(scope="package")
def continuous_parameter() -> _ContinuousParameter:
    """Return a continuous parameter with bounds 0.0 and 1.0."""
    bounds = {'lower_bound': 0., 'upper_bound': 1.}
    return _ContinuousParameter(**bounds)
43 changes: 43 additions & 0 deletions tests/newdata/experimentdata/test__jobqueue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd

from f3dasm._src.experimentdata._experimental._jobqueue2 import \
Index as _JobQueue

# from f3dasm._src.experimentdata._jobqueue import _JobQueue


def test_select_all_with_matching_status():
    """Check that select_all('in progress') yields the two matching jobs."""
    statuses = [
        'in progress', 'running', 'completed', 'in progress', 'failed']
    queue = _JobQueue()
    queue.jobs = pd.Series(statuses)

    matching = queue.select_all('in progress')

    # Both remaining entries must carry the requested status
    assert (matching.jobs == ['in progress', 'in progress']).all()


def test_select_all_with_no_matching_status():
    """Check that select_all with an absent status yields no jobs."""
    statuses = [
        'in progress', 'running', 'completed', 'in progress', 'failed']
    queue = _JobQueue()
    queue.jobs = pd.Series(statuses)

    # 'cancelled' does not occur in the queue
    matching = queue.select_all('cancelled')

    assert matching.jobs.empty


def test_select_all_with_empty_job_queue():
    """Check that select_all on a default-constructed queue yields no jobs."""
    queue = _JobQueue()

    matching = queue.select_all('in progress')

    assert matching.jobs.empty
Loading

0 comments on commit 6d99381

Please sign in to comment.