diff --git a/.github/workflows/push_to_main.yml b/.github/workflows/push_to_main.yml
index 1e676447..6983132f 100644
--- a/.github/workflows/push_to_main.yml
+++ b/.github/workflows/push_to_main.yml
@@ -41,37 +41,6 @@ jobs:
       - name: Test with pytest (excluding abaqus tests)
         run: |
           python -m pytest -S abaqus
-  publish-sphinx-documentation:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
-          cache: "pip" # caching pip dependencies
-      - name: Install package dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-      - name: Install documentation requirements
-        run: |
-          pip install -r docs/requirements.txt
-      - name: Build documentation
-        run: |
-          sphinx-build -b html ./docs/source ./docs/build/html
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v3
-        with:
-          name: html-docs
-          path: docs/build/html/
-      - name: Deploy
-        uses: peaceiris/actions-gh-pages@v3
-        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: docs/build/html
-          force_orphan: false
   build-package:
     strategy:
       matrix:
diff --git a/VERSION b/VERSION
index 7b5753f5..b000a6a0 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.4.6
\ No newline at end of file
+1.4.7
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2bc55b0b..25930cac 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -25,8 +25,8 @@
 project = 'f3dasm'
 author = 'Martin van der Schelling'
 copyright = '2022, Martin van der Schelling'
-version = '1.4.6'
-release = '1.4.6'
+version = '1.4.7'
+release = '1.4.7'
 
 # -- General configuration ----------------------------------------------------
diff --git a/docs/source/rst_doc_files/classes/datageneration/datagenerator.rst b/docs/source/rst_doc_files/classes/datageneration/datagenerator.rst
index ccca38e5..02b90ff9 100644
--- a/docs/source/rst_doc_files/classes/datageneration/datagenerator.rst
+++ b/docs/source/rst_doc_files/classes/datageneration/datagenerator.rst
@@ -38,7 +38,7 @@ There are three methods available of handling the :class:`~f3dasm.ExperimentSam
 * :code:`sequential`: regular for-loop over each of the :class:`~f3dasm.ExperimentSample` objects in order
 * :code:`parallel`: utilizing the multiprocessing capabilities (with the `pathos `_ multiprocessing library), each :class:`~f3dasm.ExperimentSample` object is run in a separate core
 * :code:`cluster`: utilizing the multiprocessing capabilities, each :class:`~f3dasm.ExperimentSample` object is run in a separate node. After completion of a sample, the node will automatically pick the next available sample. More information on this mode can be found in the :ref:`cluster-mode` section.
-
+* :code:`cluster_parallel`: combination of the :code:`cluster` and :code:`parallel` modes. Each node will run multiple samples in parallel.
 
 Implement your simulator
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/source/rst_doc_files/classes/sampling/sampling.rst b/docs/source/rst_doc_files/classes/sampling/sampling.rst
index 168777e5..96e46baa 100644
--- a/docs/source/rst_doc_files/classes/sampling/sampling.rst
+++ b/docs/source/rst_doc_files/classes/sampling/sampling.rst
@@ -21,9 +21,6 @@ The function should return the samples (``input_data``) in one of the following
 * A :class:`~pandas.DataFrame` object
 * A :class:`~numpy.ndarray` object
 
-.. note::
-
-    ...
 
 .. _implemented samplers:
@@ -36,7 +33,6 @@ To use the sampler in the data-driven process, you should pass the function to t
 .. code-block:: python
 
     from f3dasm.design import ExperimentData, Domain
-    from
 
     domain = Domain(...)
 
     # Create the ExperimentData object
@@ -63,4 +59,5 @@ Name Method
 ``"random"``             Random Uniform sampling                                                `numpy.random.uniform `_
 ``"latin"``              Latin Hypercube sampling                                               `SALib.latin `_
 ``"sobol"``              Sobol Sequence sampling                                                `SALib.sobol_sequence `_
+``"grid"``               Grid Search sampling                                                   `itertools.product `_
 ======================== ====================================================================== ===========================================================================================================
diff --git a/docs/source/rst_doc_files/general/overview.rst b/docs/source/rst_doc_files/general/overview.rst
index 6a269985..b488ef04 100644
--- a/docs/source/rst_doc_files/general/overview.rst
+++ b/docs/source/rst_doc_files/general/overview.rst
@@ -3,9 +3,9 @@
 Overview
 ========
 
-.. _design-of-experiments: https://bessagroup.github.io/f3dasm/classes/design/experimentdata.html
-.. _sampling: https://bessagroup.github.io/f3dasm/classes/sampling/sampling.html
-.. _optimizing: https://bessagroup.github.io/f3dasm/classes/optimization/optimizers.html
+.. _design-of-experiments: https://f3dasm.readthedocs.io/en/latest/rst_doc_files/classes/design/experimentdata.html
+.. _sampling: https://f3dasm.readthedocs.io/en/latest/rst_doc_files/classes/sampling/sampling.html
+.. _optimizing: https://f3dasm.readthedocs.io/en/latest/rst_doc_files/classes/optimization/optimizers.html
 .. _parallelizing:
 .. _TORQUE system: https://hpc-wiki.info/hpc/Torque
diff --git a/paper/paper.md b/paper/paper.md
index e6fbc284..cbc6b9ba 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -116,7 +116,7 @@ An overview of the different levels of abstraction is given in \autoref{fig:f3da
 
 # Documentation and collaborative development
 
-To improve the usability of the `f3dasm` framework, thorough documentation has been included with the Sphinx package. Documentation for this package can be accessed on [the homepage](https://bessagroup.github.io/f3dasm/) and will be kept up to date with the latest release of the package.
+To improve the usability of the `f3dasm` framework, thorough documentation has been included with the Sphinx package. Documentation for this package can be accessed on [the homepage](https://f3dasm.readthedocs.io/en/latest/) and will be kept up to date with the latest release of the package.
 
 The `f3dasm` framework relies on the collaborative efforts of scientists and developers to expand its capabilities. Therefore, it is essential to have a well-defined software development process in place. This is achieved by maintaining strict branching policies, and incorporating comprehensive testing suites and automatic continuous integration with GitHub Workflows.
@@ -138,7 +138,7 @@ To maintain the integrity of the framework, various (automatic) validation proce
 
 # Availability
 
-`f3dasm` and its extensions are available as a `pip` package and is compatible with Python 3.8 to 3.10 and all major operating systems (Linux, MacOS and Windows). Detailed installation instructions can be found on the ['Getting Started'](https://bessagroup.github.io/f3dasm/general/gettingstarted.html) documentation page.
+`f3dasm` and its extensions are available as a `pip` package and are compatible with Python 3.8 to 3.10 and all major operating systems (Linux, MacOS and Windows). Detailed installation instructions can be found on the ['Getting Started'](https://f3dasm.readthedocs.io/en/latest/rst_doc_files/general/gettingstarted.html) documentation page.
 
 # Acknowledgements
diff --git a/src/f3dasm/_src/datageneration/datagenerator.py b/src/f3dasm/_src/datageneration/datagenerator.py
index 5d2cf2e5..acd5fa02 100644
--- a/src/f3dasm/_src/datageneration/datagenerator.py
+++ b/src/f3dasm/_src/datageneration/datagenerator.py
@@ -12,7 +12,7 @@
 import sys
 from abc import abstractmethod
 from functools import partial
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 if sys.version_info < (3, 8):  # NOQA
     from typing_extensions import Protocol  # NOQA
@@ -187,3 +187,69 @@ def add_post_process(self, func: Callable, **kwargs):
         The keyword arguments to pass to the post-processing function
         """
         self.post_process = partial(func, **kwargs)
+
+
+def convert_function(f: Callable,
+                     input: List[str],
+                     output: Optional[List[str]] = None,
+                     kwargs: Optional[Dict[str, Any]] = None,
+                     to_disk: Optional[List[str]] = None) -> DataGenerator:
+    """
+    Converts a given function `f` into a `DataGenerator` object.
+
+    Parameters
+    ----------
+    f : Callable
+        The function to be converted.
+    input : List[str]
+        A list of argument names required by the function.
+    output : Optional[List[str]], optional
+        A list of names for the return values of the function.
+        Defaults to None.
+    kwargs : Optional[Dict[str, Any]], optional
+        Additional keyword arguments passed to the function. Defaults to None.
+    to_disk : Optional[List[str]], optional
+        The list of output names where the value needs to be stored on disk.
+        Defaults to None.
+
+    Returns
+    -------
+    DataGenerator
+        A converted `DataGenerator` object.
+
+    Notes
+    -----
+    The function `f` can have any number of arguments and any number of
+    return values as long as they are consistent with the `input` and
+    `output` arguments that are given to this function.
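+
+    Examples
+    --------
+    A minimal sketch of wrapping a plain function; the parameter names
+    ``x0``, ``x1`` and ``y`` are illustrative only:
+
+    >>> def my_function(x0, x1):
+    ...     return x0 + x1
+    >>> data_generator = convert_function(
+    ...     f=my_function, input=['x0', 'x1'], output=['y'])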
+    """
+
+    kwargs = kwargs if kwargs is not None else {}
+    to_disk = to_disk if to_disk is not None else []
+    output = output if output is not None else []
+
+    class TempDataGenerator(DataGenerator):
+        def execute(self, **_kwargs) -> None:
+            _input = {input_name: self.experiment_sample.get(input_name)
+                      for input_name in input}
+            _output = f(**_input, **kwargs)
+
+            # if no output names are given, there is nothing to store
+            if not output:
+                return
+
+            if len(output) == 1:
+                _output = (_output,)
+
+            for name, value in zip(output, _output):
+                if name in to_disk:
+                    self.experiment_sample.store(name=name,
+                                                 object=value,
+                                                 to_disk=True)
+                else:
+                    self.experiment_sample.store(name=name,
+                                                 object=value,
+                                                 to_disk=False)
+
+    return TempDataGenerator()
diff --git a/src/f3dasm/_src/datageneration/functions/adapters/augmentor.py b/src/f3dasm/_src/datageneration/functions/adapters/augmentor.py
index 518e57c7..5f827e56 100644
--- a/src/f3dasm/_src/datageneration/functions/adapters/augmentor.py
+++ b/src/f3dasm/_src/datageneration/functions/adapters/augmentor.py
@@ -77,6 +77,10 @@ def augment(self, input: np.ndarray) -> np.ndarray:
 
         scale = abs(self.noise * yy)
 
+        if isinstance(input, float):
+            # convert to numpy float
+            input = np.float64(input)
+
         noise: np.ndarray = np.random.normal(
             loc=0.0, scale=scale, size=input.shape)
         y_noise = input + float(noise)
diff --git a/src/f3dasm/_src/datageneration/functions/adapters/pybenchfunction.py b/src/f3dasm/_src/datageneration/functions/adapters/pybenchfunction.py
index c505d7a4..fde70a5f 100644
--- a/src/f3dasm/_src/datageneration/functions/adapters/pybenchfunction.py
+++ b/src/f3dasm/_src/datageneration/functions/adapters/pybenchfunction.py
@@ -89,9 +89,15 @@ def _configure_offset(self):
 
         unscaled_offset = np.atleast_1d(
             [
+                # np.random.uniform(
+                #     low=-abs(g[d] - self.scale_bounds[d, 0]),
+                #     high=abs(g[d] - self.scale_bounds[d, 1]))
+
+                # This is added so we only create offsets in one quadrant
+
                 np.random.uniform(
                     low=-abs(g[d] - self.scale_bounds[d, 0]),
-                    high=abs(g[d] - self.scale_bounds[d, 1]))
+                    high=0.0)
                 for d in range(self.dimensionality)
             ]
         )
diff --git a/src/f3dasm/_src/datageneration/functions/function.py b/src/f3dasm/_src/datageneration/functions/function.py
index fcfc85e9..5379a328 100644
--- a/src/f3dasm/_src/datageneration/functions/function.py
+++ b/src/f3dasm/_src/datageneration/functions/function.py
@@ -101,8 +101,8 @@ def execute(self, experiment_sample: ExperimentSample) -> ExperimentSample:
             x = x._value
         if isinstance(x, ArrayBox):
             x = x._value
-
-        experiment_sample["y"] = float(self(x).ravel().astype(np.float32))
+        y = np.nan_to_num(self(x), nan=np.nan)
+        experiment_sample["y"] = float(y.ravel().astype(np.float64))
         return experiment_sample
 
     def _run(
diff --git a/src/f3dasm/_src/datageneration/functions/pybenchfunction.py b/src/f3dasm/_src/datageneration/functions/pybenchfunction.py
index 6cd7070c..c3831565 100644
--- a/src/f3dasm/_src/datageneration/functions/pybenchfunction.py
+++ b/src/f3dasm/_src/datageneration/functions/pybenchfunction.py
@@ -269,7 +269,7 @@ class Bartels(PyBenchFunction):
     continuous = True
     convex = False
     separable = False
-    differentiable = False
+    differentiable = True
     multimodal = True
     randomized_term = False
     parametric = False
@@ -660,7 +660,7 @@ class BukinN6(PyBenchFunction):
     continuous = True
     convex = True
     separable = False
-    differentiable = False
+    differentiable = True
     multimodal = True
     randomized_term = False
     parametric = False
@@ -737,7 +737,7 @@ class CrossInTray(PyBenchFunction):
     continuous = True
     convex = False
     separable = False
-    differentiable = False
+    differentiable = True
     multimodal = True
     randomized_term = False
     parametric = False
@@ -1249,7 +1249,7 @@ class HolderTable(PyBenchFunction):
     continuous = True
     convex = False
     separable = False
-    differentiable = False
+    differentiable = True
     multimodal = True
     randomized_term = False
     parametric = False
@@ -1650,7 +1650,7 @@ class Powell(PyBenchFunction):
     continuous = True
     convex = True
     separable = True
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
@@ -2116,7 +2116,7 @@ class Schwefel(PyBenchFunction):
     continuous = True
     convex = False
     separable = True
-    differentiable = False
+    differentiable = True
     multimodal = True
     randomized_term = False
     parametric = False
@@ -2155,7 +2155,7 @@ class Schwefel2_20(PyBenchFunction):
     continuous = True
     convex = True
     separable = True
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
@@ -2193,7 +2193,7 @@ class Schwefel2_21(PyBenchFunction):
     continuous = True
     convex = True
     separable = True
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
@@ -2231,7 +2231,7 @@ class Schwefel2_22(PyBenchFunction):
     continuous = True
     convex = True
     separable = True
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
@@ -2270,7 +2270,7 @@ class Schwefel2_23(PyBenchFunction):
     continuous = True
     convex = True
     separable = True
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
@@ -2490,7 +2490,7 @@ class Sphere(PyBenchFunction):
     continuous = True
     convex = True
     separable = True
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
@@ -2750,7 +2750,7 @@ class XinSheYangN2(PyBenchFunction):
     continuous = False
     convex = False
     separable = False
-    differentiable = False
+    differentiable = True
     multimodal = True
     randomized_term = False
     parametric = False
@@ -2828,7 +2828,7 @@ class XinSheYangN4(PyBenchFunction):
     continuous = True
     convex = True
     separable = False
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
@@ -2866,7 +2866,7 @@ class Zakharov(PyBenchFunction):
     continuous = True
     convex = False
     separable = False
-    differentiable = False
+    differentiable = True
     multimodal = False
     randomized_term = False
     parametric = False
diff --git a/src/f3dasm/_src/design/domain.py b/src/f3dasm/_src/design/domain.py
index b4a6b745..a66dcf16 100644
--- a/src/f3dasm/_src/design/domain.py
+++ b/src/f3dasm/_src/design/domain.py
@@ -58,14 +58,34 @@
     def __len__(self) -> int:
         """The len() method returns the number of parameters"""
         return len(self.space)
 
-    def __eq__(self, other: Domain) -> bool:
+    def __eq__(self, __o: Domain) -> bool:
         """Custom equality comparison for Domain objects."""
-        if not isinstance(other, Domain):
-            return TypeError(f"Cannot compare Domain with \
-                {type(other.__name__)}")
+        if not isinstance(__o, Domain):
+            raise TypeError(f"Cannot compare Domain with \
+                {type(__o).__name__}")
         return (
-            self.space == other.space)
+            self.space == __o.space)
+
+    def __add__(self, __o: Domain) -> Domain:
+        if not isinstance(__o, Domain):
+            raise TypeError(f"Cannot add Domain with {type(__o).__name__}")
+
+        combined_space = {}
+        # Merge the parameters for keys that are present in both domains
+        for key in self.space.keys():
+            if key in __o.space:
+                combined_space[key] = self.space[key] + __o.space[key]
+            else:
+                combined_space[key] = self.space[key]
+
+        # Add the parameters from __o that are not present in self
+        for key in __o.space.keys():
+            if key not in self.space:
+                combined_space[key] = __o.space[key]
+
+        return Domain(space=combined_space,
+                      output_space={**self.output_space, **__o.output_space})
 
     def items(self) -> Iterator[_Parameter]:
         """Return an iterator over the items of the parameters"""
@@ -428,7 +448,7 @@ def add(self, name: str,
             f"Unknown type {type}!"
             f"Possible types are: 'float', 'int', 'category', 'constant'.")
 
-    def add_output(self, name: str, to_disk: bool):
+    def add_output(self, name: str, to_disk: bool, exist_ok: bool = False):
         """Add a new output parameter to the domain.
 
         Parameters
         ----------
@@ -446,9 +466,11 @@
         {'param1': OutputParameter(to_disk=True)}
         """
         if name in self.output_space:
-            raise KeyError(
-                f"Parameter {name} already exists in the domain! \
-                    Choose a different name.")
+            if not exist_ok:
+                raise KeyError(
+                    f"Parameter {name} already exists in the domain! \
+                        Choose a different name.")
+            return
 
         self.output_space[name] = _OutputParameter(to_disk)
 
     # Getters
@@ -724,6 +746,41 @@ def select(self, names: str | Iterable[str]) -> Domain:
 
         return Domain(space={key: self.space[key] for key in names})
 
+    def drop_output(self, names: str | Iterable[str]) -> Domain:
+        """Drop a subset of output parameters from the domain.
+
+        Parameters
+        ----------
+        names : str or Iterable[str]
+            The names of the output parameters to drop.
+
+        Returns
+        -------
+        Domain
+            A new domain with the dropped output parameters.
+
+        Example
+        -------
+        >>> domain = Domain()
+        >>> domain.output_space = {
+        ...     'param1': _OutputParameter(to_disk=True),
+        ...     'param2': _OutputParameter(to_disk=True),
+        ...     'param3': _OutputParameter(to_disk=True)
+        ... }
+        >>> domain.drop_output(['param1', 'param3'])
+        Domain({'param2': _OutputParameter(to_disk=True)})
+        """
+        if isinstance(names, str):
+            names = [names]
+
+        return Domain(
+            space=self.space,
+            output_space={key: self.output_space[key]
+                          for key in self.output_space
+                          if key not in names})
+
 # Miscellaneous
 # =============================================================================
@@ -753,7 +810,6 @@ def _check_output(self, names: List[str]):
         """
         for output_name in names:
             if not self.is_in_output(output_name):
-                print(f"Output {output_name} not in domain. Adding it.")
                 self.add_output(output_name, to_disk=False)
 
     def is_in_output(self, output_name: str) -> bool:
diff --git a/src/f3dasm/_src/design/parameter.py b/src/f3dasm/_src/design/parameter.py
index a1b3d171..93fedecd 100644
--- a/src/f3dasm/_src/design/parameter.py
+++ b/src/f3dasm/_src/design/parameter.py
@@ -66,6 +66,27 @@ class _ConstantParameter(_Parameter):
     def __post_init__(self):
         self._check_hashable()
 
+    def __add__(self, __o: _Parameter
+                ) -> _ConstantParameter | _CategoricalParameter:
+        if isinstance(__o, _ConstantParameter):
+            if self.value == __o.value:
+                return self
+            else:
+                return _CategoricalParameter(
+                    categories=[self.value, __o.value])
+
+        if isinstance(__o, _CategoricalParameter):
+            return self.to_categorical() + __o
+
+        if isinstance(__o, _DiscreteParameter):
+            return self.to_categorical() + __o
+
+        if isinstance(__o, _ContinuousParameter):
+            raise ValueError("Cannot add continuous parameter to constant!")
+
+    def to_categorical(self) -> _CategoricalParameter:
+        return _CategoricalParameter(categories=[self.value])
+
     def _check_hashable(self):
         """Check if the value is hashable."""
         try:
@@ -121,6 +142,31 @@ def __post_init__(self):
         self._check_types()
         self._check_range()
 
+    def __add__(self, __o: _Parameter) -> _ContinuousParameter:
+        if not isinstance(__o, _ContinuousParameter):
+            raise ValueError(
+                "Cannot add non-continuous parameter to continuous!")
+
+        if self.log != __o.log:
+            raise ValueError(
+                "Cannot add continuous parameters with different log scales!")
+
+        if self.lower_bound == __o.lower_bound \
+                and self.upper_bound == __o.upper_bound:
+            # If both lower and upper bounds are the same,
+            # return the first object
+            return self
+
+        if self.lower_bound > __o.upper_bound \
+                or __o.lower_bound > self.upper_bound:
+            # If the ranges do not coincide, raise ValueError
+            raise ValueError("Ranges do not coincide, cannot add")
+
+        # For other scenarios, join the ranges
+        return _ContinuousParameter(
+            lower_bound=min(self.lower_bound, __o.lower_bound),
+            upper_bound=max(self.upper_bound, __o.upper_bound),
+            log=self.log)
+
     def _check_types(self):
         """Check if the boundaries are actually floats"""
         if isinstance(self.lower_bound, int):
@@ -169,6 +215,22 @@ def __post_init__(self):
         self._check_types()
         self._check_range()
 
+    def __add__(self, __o: _Parameter) -> _DiscreteParameter:
+        if isinstance(__o, _DiscreteParameter):
+            if self.lower_bound == __o.lower_bound and \
+                    self.upper_bound == __o.upper_bound and \
+                    self.step == __o.step:
+                return self
+            raise ValueError(
+                "Cannot add discrete parameters with different ranges!")
+
+        if isinstance(__o, _CategoricalParameter):
+            return __o + self
+
+        if isinstance(__o, _ConstantParameter):
+            return __o.to_categorical() + self
+
+        if isinstance(__o, _ContinuousParameter):
+            raise ValueError("Cannot add continuous parameter to discrete!")
+
     def _check_types(self):
         """Check if the boundaries are actually ints"""
         if not isinstance(self.lower_bound, int) or not isinstance(
@@ -205,6 +267,27 @@ class _CategoricalParameter(_Parameter):
     def __post_init__(self):
         self._check_duplicates()
 
+    def __add__(self, __o: _Parameter) -> _CategoricalParameter:
+        if isinstance(__o, _CategoricalParameter):
+            # join unique categories
+            joint_categories = list(set(self.categories + __o.categories))
+
+        if isinstance(__o, _ConstantParameter):
+            joint_categories = list(set(self.categories + [__o.value]))
+
+        if isinstance(__o, _DiscreteParameter):
+            roll_out_discrete = list(range(
+                __o.lower_bound, __o.upper_bound, __o.step))
+            joint_categories = list(set(self.categories + roll_out_discrete))
+
+        if isinstance(__o, _ContinuousParameter):
+            raise ValueError(
+                "Cannot add continuous parameter to categorical!")
+
+        return _CategoricalParameter(joint_categories)
+
+    def __eq__(self, __o: _CategoricalParameter) -> bool:
+        return set(self.categories) == set(__o.categories)
+
     def _check_duplicates(self):
         """Check if there are duplicates in the categories list"""
         if len(self.categories) != len(set(self.categories)):
diff --git a/src/f3dasm/_src/design/samplers.py b/src/f3dasm/_src/design/samplers.py
index 8d9468d0..e5f5016f 100644
--- a/src/f3dasm/_src/design/samplers.py
+++ b/src/f3dasm/_src/design/samplers.py
@@ -6,6 +6,14 @@
 from __future__ import annotations
 
 # Standard
+import sys
+from itertools import product
+
+if sys.version_info < (3, 8):  # NOQA
+    from typing_extensions import Literal  # NOQA
+else:
+    from typing import Literal
+
 from typing import Optional
 
 # Third-party
@@ -25,10 +33,12 @@
 #
 # =============================================================================
 
+SamplerNames = Literal['random', 'latin', 'sobol', 'grid']
 
 # Factory function
 # =============================================================================
 
+
 def _sampler_factory(sampler: str, domain: Domain) -> Sampler:
     if sampler.lower() == 'random':
         return RandomUniform(domain)
@@ -39,6 +49,9 @@ def _sampler_factory(sampler: str, domain: Domain) -> Sampler:
     elif sampler.lower() == 'sobol':
         return SobolSequence(domain)
 
+    elif sampler.lower() == 'grid':
+        return GridSampler(domain)
+
     else:
         raise KeyError(f"Sampler {sampler} not found!"
                        f"Available built-in samplers are: 'random',"
@@ -144,7 +157,8 @@ def get_samples(self, numsamples: Optional[int] = None) -> pd.DataFrame:
         empty_frame = self.domain._create_empty_dataframe()
 
         # Then, create a new frame from the samples and columnnames
-        samples_frame = pd.DataFrame(data=samples, columns=columnnames)
+        samples_frame = pd.DataFrame(
+            data=samples, columns=columnnames, dtype=object)
         df = pd.concat([empty_frame, samples_frame], sort=True)
 
         return df
@@ -283,3 +297,56 @@ def sample_continuous(self, numsamples: int) -> np.ndarray:
         # stretch samples
         samples = self._stretch_samples(samples)
         return samples
+
+
+class GridSampler(Sampler):
+    """Sampling via Grid Sampling
+
+    All combinations of the discrete and categorical parameters are
+    sampled. The argument number_of_samples is ignored.
+
+    Notes
+    -----
+    This sampler is at the moment only applicable for
+    discrete and categorical parameters.
+    """
+
+    def get_samples(self, numsamples: Optional[int] = None) -> pd.DataFrame:
+        """Receive samples of the search space
+
+        Parameters
+        ----------
+        numsamples
+            number of samples (ignored for grid sampling)
+
+        Returns
+        -------
+            Data objects with the samples
+        """
+
+        self.set_seed(self.seed)
+
+        # If numsamples is None, take the object attribute number_of_samples
+        if numsamples is None:
+            numsamples = self.number_of_samples
+
+        continuous = self.domain.get_continuous_parameters()
+
+        if continuous:
+            raise ValueError("Grid sampling is only possible for domains \
+                              strictly with only discrete and \
+                              categorical parameters")
+
+        discrete = self.domain.get_discrete_parameters()
+        categorical = self.domain.get_categorical_parameters()
+
+        _iterdict = {}
+
+        for k, v in categorical.items():
+            _iterdict[k] = v.categories
+
+        for k, v in discrete.items():
+            _iterdict[k] = range(v.lower_bound, v.upper_bound + 1)
+
+        return pd.DataFrame(list(product(*_iterdict.values())),
+                            columns=_iterdict, dtype=object)
diff --git a/src/f3dasm/_src/experimentdata/_data.py b/src/f3dasm/_src/experimentdata/_data.py
index eb72db7a..e6d0cc55 100644
--- a/src/f3dasm/_src/experimentdata/_data.py
+++ b/src/f3dasm/_src/experimentdata/_data.py
@@ -70,6 +70,10 @@ def __getitem__(self, index: int | Iterable[int]) -> _Data:
         -------
             A subset of the data.
         """
+        # If the object is empty, return itself
+        if self.is_empty():
+            return self
+
         if isinstance(index, int):
             index = [index]
         return _Data(data=self.data.loc[index].copy(),
@@ -236,7 +240,7 @@ def to_numpy(self) -> np.ndarray:
         np.ndarray
             numpy array with the data.
         """
-        return self.data.to_numpy()
+        return self.data.to_numpy(dtype=np.float32)
 
     def to_xarray(self, label: str) -> xr.DataArray:
         """Export the _Data object to a xarray DataArray.
@@ -264,7 +268,7 @@ def to_dataframe(self) -> pd.DataFrame:
         """
         df = deepcopy(self.data)
         df.columns = self.names
-        return df
+        return df.astype(object)
 
     def combine_data_to_multiindex(self, other: _Data,
                                    jobs_df: pd.DataFrame) -> pd.DataFrame:
@@ -347,6 +351,29 @@ def select_columns(self, columns: Iterable[str] | str) -> _Data:
         return _Data(
             self.data[self.columns.iloc(columns)], columns=_selected_columns)
 
+    def drop(self, columns: Iterable[str] | str) -> _Data:
+        """Drop the selected columns from the data.
+
+        Parameters
+        ----------
+        columns : Iterable[str] | str
+            The columns to drop.
+
+        Returns
+        -------
+        _Data
+            The data without the selected columns
+        """
+        if isinstance(columns, str):
+            columns = [columns]
+        _selected_columns = _Columns(
+            {
+                name: None for name in self.columns.columns
+                if name not in columns})
+        return _Data(
+            data=self.data.drop(columns=self.columns.iloc(columns)),
+            columns=_selected_columns)
+
 # Append and remove data
 # =============================================================================
@@ -377,7 +404,14 @@ def add_empty_rows(self, number_of_rows: int):
             np.nan, index=new_indices, columns=self.data.columns)
         self.data = pd.concat([self.data, empty_data], ignore_index=False)
 
-    def add_column(self, name: str):
+    def add_column(self, name: str, exist_ok: bool = False):
+        if name in self.columns.names:
+            if not exist_ok:
+                raise ValueError(
+                    f"Column {name} already exists in the data. "
+                    "Set exist_ok to True to allow skipping existing columns.")
+            return
+
         if self.data.columns.empty:
             new_columns_index = 0
         else:
@@ -392,6 +426,16 @@ def remove(self, indices: List[int]):
     def round(self, decimals: int):
         self.data = self.data.round(decimals=decimals)
 
+    def overwrite(self, indices: Iterable[int],
+                  other: _Data | Dict[str, Any]):
+        if isinstance(other, Dict):
+            other = _convert_dict_to_data(other)
+
+        for other_column in other.columns.names:
+            if other_column not in self.columns.names:
+                self.add_column(other_column)
+
+        self.data.update(other.data.set_index(pd.Index(indices)))
+
 # Getters and setters
 # =============================================================================
@@ -440,6 +484,16 @@ def is_empty(self) -> bool:
         """Check if the data is empty."""
         return self.data.empty
 
+    def get_index_with_nan(self) -> pd.Index:
+        """Get the indices with NaN values.
+
+        Returns
+        -------
+        pd.Index
+            The indices with NaN values.
+        """
+        return self.indices[self.data.isna().any(axis=1)]
+
     def has_columnnames(self, names: Iterable[str]) -> bool:
         return set(names).issubset(self.names)
diff --git a/src/f3dasm/_src/experimentdata/_io.py b/src/f3dasm/_src/experimentdata/_io.py
index d5cdcf9a..388edac9 100644
--- a/src/f3dasm/_src/experimentdata/_io.py
+++ b/src/f3dasm/_src/experimentdata/_io.py
@@ -14,6 +14,7 @@
 from typing import Any, Mapping, Optional, Type
 
 # Third-party
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import xarray as xr
@@ -39,6 +40,9 @@
 OUTPUT_DATA_FILENAME = "output"
 JOBS_FILENAME = "jobs"
 
+RESOLUTION_MATPLOTLIB_FIGURE = 300
+MAX_TRIES = 10
+
 # Storing methods
 # =============================================================================
@@ -116,7 +120,7 @@ def load(self) -> Any:
 
 class NumpyStore(_Store):
     """Class to store and load objects using the numpy protocol"""
-    suffix: int = '.npy'
+    suffix: str = '.npy'
 
     def store(self) -> None:
         """
@@ -138,7 +142,7 @@ def load(self) -> np.ndarray:
 
 class PandasStore(_Store):
     """Class to store and load objects using the pandas protocol"""
-    suffix: int = '.csv'
+    suffix: str = '.csv'
 
     def store(self) -> None:
         """
@@ -160,7 +164,7 @@ def load(self) -> pd.DataFrame:
 
 class XarrayStore(_Store):
     """Class to store and load objects using the xarray protocol"""
-    suffix: int = '.nc'
+    suffix: str = '.nc'
 
     def store(self) -> None:
         """
@@ -180,12 +184,52 @@ def load(self) -> xr.DataArray | xr.Dataset:
         return xr.open_dataset(self.path.with_suffix(self.suffix))
 
 
+class FigureStore(_Store):
+    """Class to store and load objects using the matplotlib protocol"""
+    suffix: str = '.png'
+
+    def store(self) -> None:
+        """
+        Store the figure to disk as a png file
+
+        Notes
+        -----
+        - The figure is saved with a resolution of 300 dpi.
+        - The figure is saved with tight bounding boxes.
+        """
+        self.object.savefig(self.path.with_suffix(
+            self.suffix), dpi=RESOLUTION_MATPLOTLIB_FIGURE,
+            bbox_inches='tight')
+
+    def load(self) -> np.ndarray:
+        """
+        Load the image as a numpy array from disk
+        using the matplotlib `plt.imread` function.
+
+        Returns
+        -------
+        np.ndarray
+            The loaded image in the form of a numpy array
+
+        Notes
+        -----
+        The returned array has shape
+
+        - (M, N) for grayscale images.
+        - (M, N, 3) for RGB images.
+        - (M, N, 4) for RGBA images.
+
+        Images are returned as float arrays (0-1).
+        """
+        return plt.imread(self.path.with_suffix(self.suffix))
+
+
 STORE_TYPE_MAPPING: Mapping[Type, _Store] = {
     np.ndarray: NumpyStore,
     pd.DataFrame: PandasStore,
     pd.Series: PandasStore,
     xr.DataArray: XarrayStore,
-    xr.Dataset: XarrayStore
+    xr.Dataset: XarrayStore,
+    plt.Figure: FigureStore,
 }
 
 # Loading and saving functions
diff --git a/src/f3dasm/_src/experimentdata/_jobqueue.py b/src/f3dasm/_src/experimentdata/_jobqueue.py
index 10ecbe11..438b6c4d 100644
--- a/src/f3dasm/_src/experimentdata/_jobqueue.py
+++ b/src/f3dasm/_src/experimentdata/_jobqueue.py
@@ -63,7 +63,7 @@ def __init__(self, jobs: Optional[pd.Series] = None):
 
         self.jobs: pd.Series = jobs
 
-    def __add__(self, other: _JobQueue | int) -> _JobQueue:
+    def __add__(self, other: _JobQueue | str) -> _JobQueue:
         """Add two JobQueue objects together.
 
         Parameters
         ----------
@@ -76,10 +76,10 @@
         JobQueue
             JobQueue object containing the added jobs.
         """
-        if isinstance(other, int):
-            # make _JobQueue from the jobnumber
+        if isinstance(other, str):
+            # make a one-element _JobQueue from the status string
             other = _JobQueue(
-                pd.Series([Status.OPEN], index=[0], dtype='string'))
+                pd.Series(other, index=[0], dtype='string'))
 
         try:
             last_index = self.jobs.index[-1]
@@ -167,6 +167,24 @@ def reset(self) -> None:
         """Resets the job queue."""
         self.jobs = pd.Series(dtype='string')
 
+    # Select
+    # =========================================================================
+
+    def select_all(self, status: str) -> _JobQueue:
+        """Selects all jobs with a certain status.
+
+        Parameters
+        ----------
+        status : str
+            Status of the jobs to select
+
+        Returns
+        -------
+        JobQueue
+            JobQueue object containing the selected jobs.
+        """
+        return _JobQueue(self.jobs[self.jobs == status])
+
     # Export
     # =========================================================================
@@ -233,6 +251,16 @@ def add(self, number_of_jobs: int = 1, status: str = Status.OPEN):
         jobs_to_add = pd.Series(status, index=new_indices, dtype='string')
         self.jobs = pd.concat([self.jobs, jobs_to_add], ignore_index=False)
 
+    def overwrite(
+            self, indices: Iterable[int],
+            other: _JobQueue | str) -> None:
+
+        if isinstance(other, str):
+            other = _JobQueue(
+                pd.Series([other], index=[0], dtype='string'))
+
+        self.jobs.update(other.jobs.set_axis(indices))
+
     # Mark
     # =========================================================================
diff --git a/src/f3dasm/_src/experimentdata/_newdata.py b/src/f3dasm/_src/experimentdata/_newdata.py
index 14b6de51..b8a45d01 100644
--- a/src/f3dasm/_src/experimentdata/_newdata.py
+++ b/src/f3dasm/_src/experimentdata/_newdata.py
@@ -640,6 +640,13 @@ def round(self, decimals: int):
         self.data = [[round(value, decimals) for value in row]
                      for row in self.data]
 
+    def overwrite(self, data: _Data, indices: Iterable[int]):
+        # TODO: Implement this method!
+        ...
+
+# Getters and setters
+# =============================================================================
+
     def get_data_dict(self, index: int) -> Dict[str, Any]:
         """
         Get the data as a dictionary.
diff --git a/src/f3dasm/_src/experimentdata/experimentdata.py b/src/f3dasm/_src/experimentdata/experimentdata.py
index 5ae1f1c6..f0974c7c 100644
--- a/src/f3dasm/_src/experimentdata/experimentdata.py
+++ b/src/f3dasm/_src/experimentdata/experimentdata.py
@@ -14,6 +14,7 @@
 import traceback
 from functools import wraps
 from pathlib import Path
+from time import sleep
-from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
-                    Tuple, Type)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Literal,
+                    Optional, Tuple, Type)
@@ -35,13 +36,13 @@
 from ..datageneration.datagenerator import DataGenerator
 from ..datageneration.functions.function_factory import _datagenerator_factory
 from ..design.domain import Domain, _domain_factory
-from ..design.samplers import Sampler, _sampler_factory
+from ..design.samplers import Sampler, SamplerNames, _sampler_factory
 from ..logger import logger
 from ..optimization import Optimizer
 from ..optimization.optimizer_factory import _optimizer_factory
 from ._data import DataTypes, _Data, _data_factory
 from ._io import (DOMAIN_FILENAME, EXPERIMENTDATA_SUBFOLDER,
-                  INPUT_DATA_FILENAME, JOBS_FILENAME, LOCK_FILENAME,
+                  INPUT_DATA_FILENAME, JOBS_FILENAME, LOCK_FILENAME, MAX_TRIES,
                   OUTPUT_DATA_FILENAME, _project_dir_factory)
 from ._jobqueue import NoOpenJobsError, Status, _jobs_factory
 from .experimentsample import ExperimentSample
@@ -149,6 +150,9 @@ def __init__(self,
 
     def __len__(self):
         """The len() method returns the number of datapoints"""
+        if self._input_data.is_empty():
+            return len(self._output_data)
+
         return len(self._input_data)
 
     def __iter__(self) -> Iterator[Tuple[Dict[str, Any]]]:
@@ -164,23 +168,19 @@
         return self.get_experiment_sample(index)
 
     def __add__(self,
-                other: ExperimentData | ExperimentSample) -> ExperimentData:
+                __o: ExperimentData | ExperimentSample) -> ExperimentData:
         """The + operator combines two ExperimentData objects"""
-        # Check if the domains are the same
-        if not isinstance(other, (ExperimentData, ExperimentSample)):
+        if not isinstance(__o, (ExperimentData, ExperimentSample)):
             raise TypeError(
                 f"Can only add ExperimentData or "
-                f"ExperimentSample objects, not {type(other)}")
-
-        if isinstance(other, ExperimentData) and self.domain != other.domain:
-            raise ValueError(
-                "Cannot add ExperimentData objects with different domains")
+                f"ExperimentSample objects, not {type(__o)}")
 
         return ExperimentData(
-            input_data=self._input_data + other._input_data,
-            output_data=self._output_data + other._output_data,
-            jobs=self._jobs + other._jobs, domain=self.domain,
+            input_data=self._input_data + __o._input_data,
+            output_data=self._output_data + __o._output_data,
+            jobs=self._jobs + __o._jobs, domain=self.domain + __o.domain,
             project_dir=self.project_dir)
 
     def __eq__(self, __o: ExperimentData) -> bool:
@@ -216,17 +216,35 @@ def wrapper_func(self: ExperimentData, *args, **kwargs) -> None:
             (self.
              project_dir / EXPERIMENTDATA_SUBFOLDER / LOCK_FILENAME)
             .with_suffix('.lock'))
+
+        # If the lock has been acquired:
         with lock:
-            self = ExperimentData.from_file(self.project_dir)
-            value = operation(self, *args, **kwargs)
-            self.store()
+            tries = 0
+            while tries < MAX_TRIES:
+                try:
+                    self = ExperimentData.from_file(self.project_dir)
+                    value = operation(self, *args, **kwargs)
+                    self.store()
+                    break
+
+                # Racing conditions can occur when the file is empty
+                # and the file is being read at the same time
+                except pd.errors.EmptyDataError:
+                    tries += 1
+                    logger.debug((
+                        f"EmptyDataError occurred, retrying"
+                        f" {tries}/{MAX_TRIES}"))
+                    sleep(1)
+            else:
+                raise pd.errors.EmptyDataError()
 
         return value
     return wrapper_func
 
     # Properties
     # =========================================================================
 
     @property
     def index(self) -> pd.Index:
         """Returns an iterable of the job number of the experiments
@@ -235,12 +253,15 @@
         pd.Index
             The job number of all the experiments in pandas Index format
         """
+        if self._input_data.is_empty():
+            return self._output_data.indices
+
         return self._input_data.indices
 
     # Alternative Constructors
     # =========================================================================
 
     @classmethod
     def from_file(cls: Type[ExperimentData],
                   project_dir: Path | str) -> ExperimentData:
         """Create an ExperimentData object from .csv and .json files.
@@ -277,8 +298,9 @@ def from_sampling(cls, sampler: Sampler | str, domain: Domain | DictConfig,
 
         Parameters
         ----------
-        sampler : Sampler
-            Sampler object containing the sampling strategy.
+        sampler : Sampler | str
+            Sampler object containing the sampling strategy or one of the
+            built-in sampler names.
         domain : Domain | DictConfig
             Domain object containing the domain of the experiment or hydra
             DictConfig object containing the configuration.
@@ -291,6 +313,17 @@
         -------
         ExperimentData
             ExperimentData object containing the sampled data.
+
+        Note
+        ----
+        If a string is passed for the sampler argument, it should be one
+        of the built-in samplers:
+
+        * 'random' : Random sampling
+        * 'latin' : Latin Hypercube Sampling
+        * 'sobol' : Sobol Sequence Sampling
+        * 'grid' : Grid Search Sampling
         """
         experimentdata = cls(domain=domain)
         experimentdata.sample(sampler=sampler, n_samples=n_samples, seed=seed)
@@ -311,12 +344,17 @@ def from_yaml(cls, config: DictConfig) -> ExperimentData:
         ExperimentData
             ExperimentData object containing the loaded data.
         """
+        # Option 0: Both existing and sampling
+        if 'from_file' in config and 'from_sampling' in config:
+            return cls.from_file(config.from_file) + cls.from_sampling(
+                **config.from_sampling)
+
         # Option 1: From existing ExperimentData files
         if 'from_file' in config:
             return cls.from_file(config.from_file)
 
         # Option 2: Sample from the domain
-        elif 'from_sampling' in config:
+        if 'from_sampling' in config:
             return cls.from_sampling(**config.from_sampling)
 
         else:
@@ -377,6 +415,53 @@ def select(self, indices: int | Iterable[int]) -> ExperimentData:
                 jobs=self._jobs[indices], domain=self.domain,
                 project_dir=self.project_dir)
 
+    def drop_output(self, names: Iterable[str] | str) -> ExperimentData:
+        """Drop a column from the output data
+
+        Parameters
+        ----------
+        names : Iterable[str] | str
+            The names of the columns to drop.
+
+        Returns
+        -------
+        ExperimentData
+            The ExperimentData object with the column dropped.
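+
+        Examples
+        --------
+        A hypothetical call; the output column name is illustrative only:
+
+        >>> experiment_data = experiment_data.drop_output('y_simulated')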
+        """
+        return ExperimentData(
+            input_data=self._input_data,
+            output_data=self._output_data.drop(names),
+            jobs=self._jobs,
+            domain=self.domain.drop_output(names),
+            project_dir=self.project_dir)
+
+    def select_with_status(self, status: Literal['open', 'in progress',
+                                                 'finished', 'error']
+                           ) -> ExperimentData:
+        """Select a subset of the ExperimentData object with a given status
+
+        Parameters
+        ----------
+        status : Literal['open', 'in progress', 'finished', 'error']
+            The status to select.
+
+        Returns
+        -------
+        ExperimentData
+            The selected ExperimentData object with only the selected status.
+
+        Raises
+        ------
+        ValueError
+            Raised when an invalid status is specified
+        """
+        if status not in [s.value for s in Status]:
+            raise ValueError(f"Invalid status {status} given. "
+                             f"\nChoose from values: "
+                             f"{', '.join([s.value for s in Status])}")
+
+        _indices = self._jobs.select_all(status).indices
+        return self.select(_indices)
+
     def get_input_data(self,
                        parameter_names: Optional[str | Iterable[str]] = None
                        ) -> ExperimentData:
@@ -578,26 +663,33 @@ def add(self, domain: Optional[Domain] = None,
         jobs : Optional[Path | str], optional
             jobs of the added object, by default None
         """
-        self._add_experiments(ExperimentData(
+        self.add_experiments(ExperimentData(
             domain=domain, input_data=input_data,
             output_data=output_data, jobs=jobs))
 
-    def _add_experiments(self,
-                         experiment_sample: ExperimentSample | ExperimentData
-                         ) -> None:
+    def add_experiments(self,
+                        experiment_sample: ExperimentSample | ExperimentData
+                        ) -> None:
         """
         Add an ExperimentSample or ExperimentData to the ExperimentData
         attribute.
 
         Parameters
         ----------
         experiment_sample : ExperimentSample or ExperimentData
             Experiment(s) to add.
+
+        Raises
+        ------
+        ValueError
+            If, after checking, the indices of the input and output data
+            objects are not equal.
         """
 
         if isinstance(experiment_sample, ExperimentData):
             experiment_sample._reset_index()
+            self.domain += experiment_sample.domain
 
         self._input_data += experiment_sample._input_data
         self._output_data += experiment_sample._output_data
@@ -613,7 +705,101 @@
 
         # Apparently you need to cast the types again
         # TODO: Breaks if values are NaN or infinite
-        self._input_data.cast_types(self.domain)
+        # self._input_data.cast_types(self.domain)
+
+    def overwrite(
+            self, indices: Iterable[int],
+            domain: Optional[Domain] = None,
+            input_data: Optional[DataTypes] = None,
+            output_data: Optional[DataTypes] = None,
+            jobs: Optional[Path | str] = None,
+            add_if_not_exist: bool = False
+    ) -> None:
+        """Overwrite the ExperimentData object.
+
+        Parameters
+        ----------
+        indices : Iterable[int]
+            The indices to overwrite.
+        domain : Optional[Domain], optional
+            Domain of the new object, by default None
+        input_data : Optional[DataTypes], optional
+            input parameters of the new object, by default None
+        output_data : Optional[DataTypes], optional
+            output parameters of the new object, by default None
+        jobs : Optional[Path | str], optional
+            jobs of the new object, by default None
+        add_if_not_exist : bool, optional
+            If True, the new objects are added if the requested indices
+            do not exist in the current ExperimentData object, by default False
+        """
+
+        # Be careful: if a job has output data and gets overwritten with a
+        # job that has no output data, the status is set to open. But the job
+        # will still have the output data!
+
+        # This is usually not a problem, because the output data will be
+        # immediately overwritten in optimization.
+
+        self._overwrite_experiments(
+            indices=indices,
+            experiment_sample=ExperimentData(
+                domain=domain, input_data=input_data,
+                output_data=output_data, jobs=jobs),
+            add_if_not_exist=add_if_not_exist)
+
+    def _overwrite_experiments(
+            self, indices: Iterable[int],
+            experiment_sample: ExperimentSample | ExperimentData,
+            add_if_not_exist: bool) -> None:
+        """
+        Overwrite the ExperimentData object at the given indices.
+
+        Parameters
+        ----------
+        indices : Iterable[int]
+            The indices to overwrite.
+        experiment_sample : ExperimentSample | ExperimentData
+            The new experiment(s) to overwrite with.
+        add_if_not_exist : bool
+            If True, the new objects are added if the requested indices
+            do not exist in the current ExperimentData object.
+        """
+        if not all(pd.Index(indices).isin(self.index)):
+            if add_if_not_exist:
+                self.add_experiments(experiment_sample)
+                return
+            else:
+                raise ValueError(
+                    f"The given indices {indices} do not exist in the current "
+                    f"ExperimentData object. "
+                    f"If you want to add the new experiments, "
+                    f"set add_if_not_exist to True.")
+
+        self._input_data.overwrite(
+            indices=indices, other=experiment_sample._input_data)
+        self._output_data.overwrite(
+            indices=indices, other=experiment_sample._output_data)
+
+        self._jobs.overwrite(
+            indices=indices, other=experiment_sample._jobs)
+
+        if isinstance(experiment_sample, ExperimentData):
+            self.domain += experiment_sample.domain
+
+    @_access_file
+    def overwrite_disk(
+            self, indices: Iterable[int],
+            domain: Optional[Domain] = None,
+            input_data: Optional[DataTypes] = None,
+            output_data: Optional[DataTypes] = None,
+            jobs: Optional[Path | str] = None,
+            add_if_not_exist: bool = False
+    ) -> None:
+        self.overwrite(indices=indices, domain=domain, input_data=input_data,
+                       output_data=output_data, jobs=jobs,
+                       add_if_not_exist=add_if_not_exist)
 
     def add_input_parameter(
         self, name: str,
@@ -633,7 +819,8 @@ def add_input_parameter(
         self._input_data.add_column(name)
         self.domain.add(name=name, type=type, **kwargs)
 
-    def add_output_parameter(self, name: str, is_disk: bool) -> None:
+    def add_output_parameter(
+            self, name: str, is_disk: bool, exist_ok: bool = False) -> None:
         """Add a new output column to the ExperimentData object.
 
         Parameters
         ----------
         name
             name of the new output column
         is_disk
             Whether the output column will be stored on disk or not
+        exist_ok
+            If True, it will not raise an error if the output column already
+            exists, by default False
         """
-        self._output_data.add_column(name)
-        self.domain.add_output(name, is_disk)
+        self._output_data.add_column(name, exist_ok=exist_ok)
+        self.domain.add_output(name=name, to_disk=is_disk, exist_ok=exist_ok)
 
     def remove_rows_bottom(self, number_of_rows: int):
         """
@@ -671,7 +861,11 @@ def _reset_index(self) -> None:
         Reset the index of the ExperimentData object.
         """
         self._input_data.reset_index()
-        self._output_data.reset_index(self._input_data.indices)
+
+        if self._input_data.is_empty():
+            self._output_data.reset_index()
+        else:
+            self._output_data.reset_index(self._input_data.indices)
+
         self._jobs.reset_index()
 
     # ExperimentSample
@@ -878,10 +1072,25 @@ def mark_all_error_open(self) -> None:
         Mark all the experiments that have the status 'error' open
         """
         self._jobs.mark_all_error_open()
+
+    def mark_all_in_progress_open(self) -> None:
+        """
+        Mark all the experiments that have the status 'in progress' open
+        """
+        self._jobs.mark_all_in_progress_open()
+
+    def mark_all_nan_open(self) -> None:
+        """
+        Mark all the experiments that have 'nan' in output open
+        """
+        indices = self._output_data.get_index_with_nan()
+        self.mark(indices=indices, status='open')
+
     # Datageneration
     # =========================================================================
 
-    def evaluate(self, data_generator: DataGenerator, mode: str = 'sequential',
+    def evaluate(self, data_generator: DataGenerator,
+                 mode: Literal['sequential', 'parallel',
+                               'cluster', 'cluster_parallel'] = 'sequential',
                  kwargs: Optional[dict] = None) -> None:
         """Run any function over the entirety of the experiments
 
         Parameters
         ----------
         data_generator : DataGenerator
             data generator to use
-        mode, optional
-            operational mode, by default 'sequential'
+        mode : str, optional
+            operational mode, by default 'sequential'. Choose between:
+
+            * 'sequential' : Run the operation sequentially
+            * 'parallel' : Run the operation on multiple cores
+            * 'cluster' : Run the operation on the cluster
+            * 'cluster_parallel' : Run the operation on the cluster in parallel
+
         kwargs, optional
             Any keyword arguments that need to be supplied to the function,
             by default None
@@ -913,6 +1128,8 @@
             return self._run_multiprocessing(data_generator, kwargs)
         elif mode.lower() == "cluster":
             return self._run_cluster(data_generator, kwargs)
+        elif mode.lower() == "cluster_parallel":
+            return self._run_cluster_parallel(data_generator, kwargs)
         else:
             raise ValueError("Invalid parallelization mode specified.")
@@ -989,19 +1206,32 @@ def _run_multiprocessing(self, data_generator: DataGenerator,
             except NoOpenJobsError:
                 break
 
-        def f(options: Dict[str, Any]) -> Any:
-            logger.debug(
-                "Running experiment_sample"
-                f"{options['experiment_sample'].job_number}")
-            return data_generator._run(**options)
+        def f(options: Dict[str, Any]) -> Tuple[ExperimentSample, int]:
+            try:
+                logger.debug(
+                    f"Running experiment_sample "
+                    f"{options['experiment_sample'].job_number}")
+
+                return (data_generator._run(**options), 0)  # no *args!
+
+            except Exception as e:
+                error_msg = (
+                    f"Error in experiment_sample "
+                    f"{options['experiment_sample'].job_number}: {e}")
+                error_traceback = traceback.format_exc()
+                logger.error(f"{error_msg}\n{error_traceback}")
+                return (options['experiment_sample'], 1)
 
         with mp.Pool() as pool:
            # maybe implement pool.starmap_async ?
-            _experiment_samples: List[ExperimentSample] = pool.starmap(
-                f, options)
+            _experiment_samples: List[
+                Tuple[ExperimentSample, int]] = pool.starmap(f, options)
 
-        for _experiment_sample in _experiment_samples:
-            self._set_experiment_sample(_experiment_sample)
+        for _experiment_sample, exit_code in _experiment_samples:
+            if exit_code == 0:
+                self._set_experiment_sample(_experiment_sample)
+            else:
+                self._set_error(_experiment_sample.job_number)
 
     def _run_cluster(self, data_generator: DataGenerator, kwargs: dict):
         """Run the operation on the cluster
@@ -1035,9 +1265,9 @@ def _run_cluster(self, data_generator: DataGenerator, kwargs: dict):
                 _experiment_sample = data_generator._run(
                     experiment_sample, **kwargs)
                 self._write_experiment_sample(_experiment_sample)
-            except Exception as e:
-                error_msg = "Error in experiment_sample "
-                f"{experiment_sample._jobnumber}: {e}"
+            except Exception:
+                n = experiment_sample.job_number
+                error_msg = f"Error in experiment_sample {n}: "
                 error_traceback = traceback.format_exc()
                 logger.error(f"{error_msg}\n{error_traceback}")
                 self._write_error(experiment_sample._jobnumber)
@@ -1048,68 +1278,160 @@
         (self.project_dir / EXPERIMENTDATA_SUBFOLDER / LOCK_FILENAME
          ).with_suffix('.lock').unlink(missing_ok=True)
 
+    def _run_cluster_parallel(
+            self, data_generator: DataGenerator, kwargs: dict):
+        """Run the operation on the cluster and parallelize it over cores
+
+        Parameters
+        ----------
+        data_generator : DataGenerator
+            data generator to run for every entry in the ExperimentData object
+        kwargs : dict
+            Any keyword arguments that need to be supplied to the function
+
+        Raises
+        ------
+        NoOpenJobsError
+            Raised when there are no open jobs left
+        """
+        # Retrieve the updated experimentdata object from disc
+        try:
+            self = self.from_file(self.project_dir)
+        except FileNotFoundError:  # If not found, store current
+            self.store()
+
+        no_jobs = False
+
+        while True:
+            es_list = []
+            for core in range(mp.cpu_count()):
+                try:
+                    es_list.append(self._get_open_job_data())
+                except NoOpenJobsError:
+                    logger.debug("No Open jobs left!")
+                    no_jobs = True
+                    break
+
+            d = self.select([e.job_number for e in es_list])
+
+            d._run_multiprocessing(
+                data_generator=data_generator, kwargs=kwargs)
+
+            # TODO access resource first!
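+            # Write the evaluated batch back to the files on disk.
+            # overwrite_disk is decorated with @_access_file, so it
+            # acquires the lock file before reading and writing.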
+            self.overwrite_disk(
+                indices=d.index, input_data=d._input_data,
+                output_data=d._output_data, jobs=d._jobs,
+                domain=d.domain, add_if_not_exist=False)
+
+            if no_jobs:
+                break
+
+        self = self.from_file(self.project_dir)
+        # Remove the lockfile from disk
+        (self.project_dir / EXPERIMENTDATA_SUBFOLDER / LOCK_FILENAME
+         ).with_suffix('.lock').unlink(missing_ok=True)
+
     # Optimization
     # =========================================================================
 
     def optimize(self, optimizer: Optimizer | str,
                  data_generator: DataGenerator | str,
-                 iterations: int, kwargs: Optional[Dict[str, Any]] = None,
+                 iterations: int,
+                 kwargs: Optional[Dict[str, Any]] = None,
                  hyperparameters: Optional[Dict[str, Any]] = None,
-                 x0_selection: str = 'best') -> None:
+                 x0_selection: Literal['best', 'random',
+                                       'last',
+                                       'new'] | ExperimentData = 'best',
+                 sampler: Optional[Sampler | str] = 'random',
+                 overwrite: bool = False,
+                 callback: Optional[Callable] = None) -> None:
         """Optimize the experimentdata object
 
         Parameters
         ----------
-        optimizer : Optimizer
-            Optimizer object to use
+        optimizer : Optimizer | str
+            Optimizer object
         data_generator : DataGenerator | str
-            Data generator object to use
+            DataGenerator object
         iterations : int
-            Number of iterations to run
+            number of iterations
         kwargs : Dict[str, Any], optional
-            Any additional keyword arguments that need to be supplied to \
-            the data generator, by default None
+            any additional keyword arguments that will be passed to
+            the DataGenerator
         hyperparameters : Dict[str, Any], optional
-            Any additional hyperparameters that need to be supplied to the \
-            optimizer, by default None
-        x0_selection : str, optional
-            How to select the initial design, by default 'best'
+            any additional keyword arguments that will be passed to
+            the optimizer
+        x0_selection : str | ExperimentData
+            How to select the initial design. By default 'best'
+            The following x0_selections are available:
+
+            * 'best': Select the best designs from the current experimentdata
+            * 'random': Select random designs from the current experimentdata
+            * 'last': Select the last designs from the current experimentdata
+            * 'new': Create new random designs from the current experimentdata
+
+            If the x0_selection is 'new', new designs are sampled with the
+            sampler provided. The number of designs selected is equal to the
+            population size of the optimizer.
+
+            If an ExperimentData object is passed as x0_selection,
+            the optimizer will use the input_data and output_data from this
+            object as initial samples.
+        sampler : Sampler, optional
+            If x0_selection = 'new', the sampler to use. By default 'random'
+        overwrite : bool, optional
+            If True, the optimizer will overwrite the current data. By default
+            False
+        callback : Callable, optional
+            A callback function that is called after every iteration. It has
+            the following signature:
+
+            ``callback(intermediate_result: ExperimentData)``
+
+            where the first argument is a parameter containing an
+            `ExperimentData` object with the current iterate(s).
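+
+            A hypothetical example of such a callback (the name and body
+            are illustrative only):
+
+            >>> def my_callback(intermediate_result):
+            ...     print(len(intermediate_result))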
         Raises
         ------
         ValueError
             Raised when invalid x0_selection is specified
-        ValueError
-            Raised when invalid optimizer type is specified
-
-        Note
-        ----
-        The following x0_selections are available:
-
-        * 'best': Select the best designs from the current experimentdata
-        * 'random': Select random designs from the current experimentdata
-        * 'last': Select the last designs from the current experimentdata
-
-        The number of designs selected is equal to the \
-        population size of the optimizer
         """
+        # Create the data generator object if a string reference is passed
         if isinstance(data_generator, str):
             data_generator: DataGenerator = _datagenerator_factory(
-                data_generator, self.domain, kwargs)
+                data_generator=data_generator,
+                domain=self.domain, kwargs=kwargs)

+        # Create the optimizer object if a string reference is passed
         if isinstance(optimizer, str):
             optimizer: Optimizer = _optimizer_factory(
                 optimizer, self.domain, hyperparameters)

+        # Create the sampler object if a string reference is passed
+        if isinstance(sampler, str):
+            sampler: Sampler = _sampler_factory(sampler, self.domain)
+
         if optimizer.type == 'scipy':
             self._iterate_scipy(
-                optimizer, data_generator, iterations, kwargs, x0_selection)
+                optimizer=optimizer, data_generator=data_generator,
+                iterations=iterations, kwargs=kwargs,
+                x0_selection=x0_selection,
+                sampler=sampler,
+                overwrite=overwrite,
+                callback=callback)
         else:
             self._iterate(
-                optimizer, data_generator, iterations, kwargs, x0_selection)
+                optimizer=optimizer, data_generator=data_generator,
+                iterations=iterations, kwargs=kwargs,
+                x0_selection=x0_selection,
+                sampler=sampler,
+                overwrite=overwrite,
+                callback=callback)

     def _iterate(self, optimizer: Optimizer, data_generator: DataGenerator,
-                 iterations: int, kwargs: dict, x0_selection: str):
+                 iterations: int, kwargs: Dict[str, Any], x0_selection: str,
+                 sampler: Sampler, overwrite: bool,
+                 callback: Callable):
         """Internal representation of the iteration process

         Parameters
@@ -1120,29 +1442,86 @@ def _iterate(self, optimizer: Optimizer, data_generator: DataGenerator,
             DataGenerator object
         iterations : int
             number of iterations
-        kwargs : dict, optional
-            any additional keyword arguments that will be passed to \
-            the DataGenerator, by default None
-        x0_selection : str
-            How to select the initial design
+        kwargs : Dict[str, Any]
+            any additional keyword arguments that will be passed to
+            the DataGenerator
+        x0_selection : str | ExperimentData
+            How to select the initial design.
+            The following x0_selections are available:
+
+            * 'best': Select the best designs from the current experimentdata
+            * 'random': Select random designs from the current experimentdata
+            * 'last': Select the last designs from the current experimentdata
+            * 'new': Create new random designs with the provided sampler
+
+            If the x0_selection is 'new', new designs are sampled with the
+            sampler provided. The number of designs selected is equal to the
+            population size of the optimizer.
+
+            If an ExperimentData object is passed as x0_selection,
+            the optimizer will use the input_data and output_data from this
+            object as initial samples.
+
+        sampler : Sampler
+            If x0_selection = 'new', the sampler to use
+        overwrite : bool
+            If True, the optimizer will overwrite the current data.
+        callback : Callable
+            A callback function that is called after every iteration. It has
+            the following signature:
+
+            ``callback(intermediate_result: ExperimentData)``
+
+            where ``intermediate_result`` is an `ExperimentData` object
+            containing the current iterate(s).

         Raises
         ------
         ValueError
             Raised when invalid x0_selection is specified
+        """
+        last_index = self.index[-1] if not self.index.empty else -1
+
+        if isinstance(x0_selection, str):
+            if x0_selection == 'new':
+
+                if iterations < optimizer.hyperparameters.population:
+                    raise ValueError(
+                        f'For creating new samples, the total number of '
+                        f'requested iterations ({iterations}) cannot be '
+                        f'smaller than the population size '
+                        f'({optimizer.hyperparameters.population})')
+
+                init_samples = ExperimentData.from_sampling(
+                    domain=self.domain,
+                    sampler=sampler,
+                    n_samples=optimizer.hyperparameters.population,
+                    seed=optimizer.seed)
+
+                init_samples.evaluate(
+                    data_generator=data_generator, kwargs=kwargs,
+                    mode='sequential')
+
+                if callback is not None:
+                    callback(init_samples)
+
+                if overwrite:
+                    _indices = init_samples.index + last_index + 1
+                    self._overwrite_experiments(
+                        experiment_sample=init_samples,
+                        indices=_indices,
+                        add_if_not_exist=True)

-        Note
-        ----
-        The following x0_selections are available:
+                else:
+                    self.add_experiments(init_samples)

-        * 'best': Select the best designs from the current experimentdata
-        * 'random': Select random designs from the current experimentdata
-        * 'last': Select the last designs from the current experimentdata
+                x0_selection = 'last'
+                iterations -= optimizer.hyperparameters.population
+
+        x0 = x0_factory(experiment_data=self, mode=x0_selection,
+                        n_samples=optimizer.hyperparameters.population)
+        optimizer.set_data(x0)

-        The number of designs selected is equal to the \
-        population size of the optimizer
-        """
-        optimizer.set_x0(self, mode=x0_selection)
         optimizer._check_number_of_datapoints()

         optimizer._construct_model(data_generator)
@@ -1154,20 +1533,33 @@ def _iterate(self, optimizer: Optimizer, data_generator: DataGenerator,

             # If new_samples is a tuple of input_data and output_data
             if isinstance(new_samples, tuple):
-                self.add(domain=self.domain,
-                         input_data=new_samples[0], output_data=new_samples[1])
+                new_samples = ExperimentData(
+                    domain=self.domain,
+                    input_data=new_samples[0],
+                    output_data=new_samples[1],
+                )
+            # If applicable, evaluate the new designs:
+            new_samples.evaluate(
+                data_generator, mode='sequential', kwargs=kwargs)

-            else:
-                self._add_experiments(new_samples)
+            if callback is not None:
+                callback(new_samples)

-            # If applicable, evaluate the new designs:
-            self.evaluate(data_generator, mode='sequential', kwargs=kwargs)
+            if overwrite:
+                _indices = new_samples.index + last_index + 1
+                self._overwrite_experiments(experiment_sample=new_samples,
+                                            indices=_indices,
+                                            add_if_not_exist=True)
+
+            else:
+                self.add_experiments(new_samples)

             optimizer.set_data(self)

-        # Remove overiterations
-        self.remove_rows_bottom(number_of_overiterations(
-            iterations, population=optimizer.hyperparameters.population))
+        if not overwrite:
+            # Remove overiterations
+            self.remove_rows_bottom(number_of_overiterations(
+                iterations, population=optimizer.hyperparameters.population))

         # Reset the optimizer
         optimizer.reset(ExperimentData(domain=self.domain))
@@ -1175,77 +1567,139 @@ def _iterate_scipy(self, optimizer: Optimizer,
                        data_generator: DataGenerator,
                        iterations: int, kwargs: dict,
-                       x0_selection: str):
-        """Internal represenation of the iteration process for s
-        cipy-optimize algorithms
+                       x0_selection: str | ExperimentData,
+                       sampler: Sampler, overwrite: bool,
+                       callback: Callable):
+        """Internal representation of the iteration process for scipy-minimize
+        optimizers.

         Parameters
         ----------
-        optimizer : _Optimizer
+        optimizer : Optimizer
             Optimizer object
         data_generator : DataGenerator
             DataGenerator object
         iterations : int
             number of iterations
-        kwargs : dict, optional
-            any additional keyword arguments that will be passed \
-            to the DataGenerator, by default None
-        x0_selection : str
-            How to select the initial design
+        kwargs : Dict[str, Any]
+            any additional keyword arguments that will be passed to
+            the DataGenerator
+        x0_selection : str | ExperimentData
+            How to select the initial design.
+            The following x0_selections are available:
+
+            * 'best': Select the best designs from the current experimentdata
+            * 'random': Select random designs from the current experimentdata
+            * 'last': Select the last designs from the current experimentdata
+            * 'new': Create new random designs with the provided sampler
+
+            If the x0_selection is 'new', new designs are sampled with the
+            sampler provided. The number of designs selected is equal to the
+            population size of the optimizer.
+
+            If an ExperimentData object is passed as x0_selection,
+            the optimizer will use the input_data and output_data from this
+            object as initial samples.
+
+        sampler : Sampler
+            If x0_selection = 'new', the sampler to use
+        overwrite : bool
+            If True, the optimizer will overwrite the current data.
+        callback : Callable
+            A callback function that is called after every iteration. It has
+            the following signature:
+
+            ``callback(intermediate_result: ExperimentData)``
+
+            where ``intermediate_result`` is an `ExperimentData` object
+            containing the current iterate(s).
Raises ------ ValueError Raised when invalid x0_selection is specified + """ + last_index = self.index[-1] if not self.index.empty else -1 + n_data_before_iterate = len(self) - Note - ---- - The following x0_selections are available: + if isinstance(x0_selection, str): + if x0_selection == 'new': - * 'best': Select the best designs from the current experimentdata - * 'random': Select random designs from the current experimentdata - * 'last': Select the last designs from the current experimentdata + if iterations < optimizer.hyperparameters.population: + raise ValueError( + f'For creating new samples, the total number of ' + f'requested iterations ({iterations}) cannot be ' + f'smaller than the population size ' + f'({optimizer.hyperparameters.population})') - The number of designs selected is equal to the \ - population size of the optimizer - """ + init_samples = ExperimentData.from_sampling( + domain=self.domain, + sampler=sampler, + n_samples=optimizer.hyperparameters.population, + seed=optimizer.seed) + + init_samples.evaluate( + data_generator=data_generator, kwargs=kwargs, + mode='sequential') + + if callback is not None: + callback(init_samples) + + if overwrite: + _indices = init_samples.index + last_index + 1 + self._overwrite_experiments( + experiment_sample=init_samples, + indices=_indices, + add_if_not_exist=True) + + else: + self.add_experiments(init_samples) + + x0_selection = 'last' + + x0 = x0_factory(experiment_data=self, mode=x0_selection, + n_samples=optimizer.hyperparameters.population) + optimizer.set_data(x0) - optimizer.set_x0(self, mode=x0_selection) - n_data_before_iterate = len(self) optimizer._check_number_of_datapoints() optimizer.run_algorithm(iterations, data_generator) - # Do not add the first element, as this is already in the sampled data - self._add_experiments(optimizer.data.select(optimizer.data.index[1:])) + new_samples: ExperimentData = optimizer.data.select( + optimizer.data.index[1:]) + new_samples.evaluate(data_generator, mode='sequential', kwargs=kwargs) + + if callback is not None: + callback(new_samples) - # TODO: At the end, the data should have - # n_data_before_iterate + iterations amount of elements! - # If x_new is empty, repeat best x0 to fill up total iteration - if len(self) == n_data_before_iterate: - repeated_x, repeated_y = self.get_n_best_output( - n_samples=1).to_numpy() - # repeated_last_element = self.get_n_best_output( - # n_samples=1).to_numpy()[0].ravel() + if overwrite: + self.add_experiments( + optimizer.data.select([optimizer.data.index[-1]])) - for repetition in range(iterations): - # self._add_experiments( - # ExperimentSample.from_numpy(repeated_last_element, - # domain=self.domain)) + elif not overwrite: + # Do not add the first element, as this is already + # in the sampled data + self.add_experiments(new_samples) - self.add( - domain=self.domain, input_data=repeated_x, - output_data=repeated_y) + # TODO: At the end, the data should have + # n_data_before_iterate + iterations amount of elements! 
+            # If x_new is empty, repeat best x0 to fill up total iteration
+            if len(self) == n_data_before_iterate:
+                repeated_sample = self.get_n_best_output(
+                    n_samples=1)

-        # Repeat last iteration to fill up total iteration
-        if len(self) < n_data_before_iterate + iterations:
-            last_design = self.get_experiment_sample(len(self)-1)
+                for repetition in range(iterations):
+                    self.add_experiments(repeated_sample)

-            while len(self) < n_data_before_iterate + iterations:
-                self._add_experiments(last_design)
+            # Repeat last iteration to fill up total iteration
+            if len(self) < n_data_before_iterate + iterations:
+                last_design = self.get_experiment_sample(len(self)-1)
+
+                while len(self) < n_data_before_iterate + iterations:
+                    self.add_experiments(last_design)

         # Evaluate the function on the extra iterations
-        self.evaluate(data_generator, mode='sequential')
+        self.evaluate(data_generator, mode='sequential', kwargs=kwargs)

         # Reset the optimizer
         optimizer.reset(ExperimentData(domain=self.domain))
@@ -1253,27 +1707,25 @@

     # Sampling
     # =========================================================================

-    def sample(self, sampler: Sampler | str, n_samples: int = 1,
+    def sample(self, sampler: Sampler | SamplerNames, n_samples: int = 1,
                seed: Optional[int] = None) -> None:
         """Sample data from the domain providing the sampler strategy

         Parameters
         ----------
-        sampler : Sampler or str
+        sampler : Sampler | str
             Sampler callable or string of built-in sampler
+            If a string is passed, it should be one of the built-in samplers:
+
+            * 'random' : Random sampling
+            * 'latin' : Latin Hypercube Sampling
+            * 'sobol' : Sobol Sequence Sampling
+            * 'grid' : Grid Search Sampling
         n_samples : int, optional
             Number of samples to generate, by default 1
         seed : Optional[int], optional
             Seed to use for the sampler, by default None

-        Note
-        ----
-        If a string is passed, it should be one of the built-in samplers:
-
-        * 'random' : Random sampling
-        * 'latin' : Latin Hypercube Sampling
-        * 'sobol' : Sobol Sequence Sampling
-
         Raises
         ------
         ValueError
@@ -1299,3 +1751,50 @@ def set_project_dir(self, project_dir: Path | str):
             Path to the project directory
         """
         self.project_dir = _project_dir_factory(project_dir)
+
+
+def x0_factory(experiment_data: ExperimentData,
+               mode: str | ExperimentData, n_samples: int):
+    """Select the initial population from the given experiment data
+
+    Parameters
+    ----------
+    experiment_data : ExperimentData
+        Data to be used for the initial population
+    mode : str | ExperimentData
+        Mode of selecting the initial population.
+        The following modes are available:
+
+        - best: select the best n samples
+        - random: select n random samples
+        - last: select the last n samples
+
+        If an ExperimentData object is passed, it is used directly.
+    n_samples : int
+        Number of samples to select
+
+    Returns
+    -------
+    ExperimentData
+        The selected initial population
+
+    Raises
+    ------
+    ValueError
+        Raises when the mode is not recognized
+    """
+    if isinstance(mode, ExperimentData):
+        x0 = mode
+
+    elif mode == 'best':
+        x0 = experiment_data.get_n_best_output(n_samples)
+
+    elif mode == 'random':
+        x0 = experiment_data.select(
+            np.random.choice(
+                experiment_data.index,
+                size=n_samples, replace=False))
+
+    elif mode == 'last':
+        x0 = experiment_data.select(
+            experiment_data.index[-n_samples:])
+
+    else:
+        raise ValueError(
+            f'Unknown selection mode {mode}, use best, random or last')
+
+    x0._reset_index()
+    return x0
diff --git a/src/f3dasm/_src/experimentdata/experimentsample.py b/src/f3dasm/_src/experimentdata/experimentsample.py
index 67dcb8e4..1bc21d8a 100644
--- a/src/f3dasm/_src/experimentdata/experimentsample.py
+++ b/src/f3dasm/_src/experimentdata/experimentsample.py
@@ -9,7 +9,14 @@
 from __future__ import annotations

 # Standard
+import sys
 from pathlib import Path
+
+if sys.version_info < (3, 8):  # NOQA
+    from typing_extensions import Literal  # NOQA
+else:
+    from typing import Literal
+
 from typing import Any, Dict, Optional, Tuple, Type

 # Third-party
@@ -195,7 +202,7 @@ def __setitem__(self, key: str, value: Any):
         self._dict_output[key] = (value, False)

     def __repr__(self) -> str:
-        return (f"ExperimentSample({self.job_number} :"
+        return (f"ExperimentSample({self.job_number} ({self.jobs}) :"
                 f"{self.input_data} - {self.output_data})")

     @property
@@ -257,8 +264,26 @@ def job_number(self) -> int:
         """
         return self._jobnumber

+    @property
+    def jobs(self) -> Literal['finished', 'open']:
+        """Retrieve the job status.
+
+        Returns
+        -------
+        Literal['finished', 'open']
+            'finished' if the output data contains values that are not all
+            NaN, 'open' otherwise.
+        """
+        # Check if the output contains values or not all nan
+        has_all_nan = np.all(np.isnan(list(self._output_data.values())))
+
+        if self._output_data and not has_all_nan:
+            status = 'finished'
+        else:
+            status = 'open'
+
+        return status
+
     # Alias
-    jobs = job_number
     _jobs = jobs

     # Export
diff --git a/src/f3dasm/_src/hydra_utils.py b/src/f3dasm/_src/hydra_utils.py
new file mode 100644
index 00000000..4e3be697
--- /dev/null
+++ b/src/f3dasm/_src/hydra_utils.py
@@ -0,0 +1,62 @@
+"""
+This module defines utility functions for the Hydra configuration system.
+"""
+# Modules
+# =============================================================================
+
+# Standard
+from copy import deepcopy
+
+# Third-party
+from omegaconf import OmegaConf
+
+# Local
+from .experimentdata.experimentsample import ExperimentSample
+
+# Authorship & Credits
+# =============================================================================
+__author__ = 'Martin van der Schelling (M.P.vanderSchelling@tudelft.nl)'
+__credits__ = ['Martin van der Schelling']
+__status__ = 'Alpha'
+# =============================================================================
+#
+# =============================================================================
+
+
+def update_config_with_experiment_sample(
+        config: OmegaConf, experiment_sample: ExperimentSample) -> OmegaConf:
+    """
+    Update the config with the values from the experiment sample
+
+    Parameters
+    ----------
+    config : OmegaConf
+        The configuration to update
+    experiment_sample : ExperimentSample
+        The experiment sample to update the configuration with
+
+    Returns
+    -------
+    OmegaConf
+        The updated configuration
+
+    Notes
+    -----
+    The function will update the configuration with the values from the
+    experiment sample. The function will only update the configuration with
+    values that are present in the experiment sample. If the experiment sample
+    contains values that are not present in the configuration, they will be
+    ignored. Keys can be nested using dots, e.g. 'a.b' will update the
+    value of 'b' nested under the key 'a'.
+
+    The function will return a new configuration object with the
+    updated values. The original configuration object will not be modified.
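+
+    Examples
+    --------
+    A minimal sketch of the intended behaviour (the configuration keys and
+    values are hypothetical):
+
+    >>> config = OmegaConf.create({'model': {'lr': 0.1}})
+    >>> # an experiment sample whose input data contains {'model.lr': 0.5}
+    >>> # yields an updated copy in which config.model.lr == 0.5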
+ """ + cfg = deepcopy(config) + for key, value in experiment_sample.to_dict().items(): + try: + OmegaConf.update(cfg, key, value) + except AttributeError: + continue + + return cfg diff --git a/src/f3dasm/_src/optimization/adapters/scipy_implementations.py b/src/f3dasm/_src/optimization/adapters/scipy_implementations.py index d2d4273a..1839c8b5 100644 --- a/src/f3dasm/_src/optimization/adapters/scipy_implementations.py +++ b/src/f3dasm/_src/optimization/adapters/scipy_implementations.py @@ -1,6 +1,9 @@ # Modules # ============================================================================= +# Standard +import warnings + # Third-party core import autograd.numpy as np from scipy.optimize import minimize @@ -19,12 +22,15 @@ # # ============================================================================= +warnings.filterwarnings( + "ignore", message="^OptimizeWarning: Unknown solver options.*") + class _SciPyOptimizer(Optimizer): type: str = 'scipy' def _callback(self, xk: np.ndarray, *args, **kwargs) -> None: - self.data._add_experiments( + self.data.add_experiments( ExperimentSample.from_numpy(xk, domain=self.domain)) def update_step(self): @@ -56,8 +62,7 @@ def fun(x): minimize( fun=fun, method=self.method, - # TODO: #89 Fix this with the newest gradient method! - jac='3-point', + jac=data_generator.dfdx, x0=self.data.get_n_best_output(1).to_numpy()[0].ravel(), callback=self._callback, options=self.hyperparameters.__dict__, diff --git a/src/f3dasm/_src/optimization/cg.py b/src/f3dasm/_src/optimization/cg.py index b2502bf7..84fbc8b8 100644 --- a/src/f3dasm/_src/optimization/cg.py +++ b/src/f3dasm/_src/optimization/cg.py @@ -28,7 +28,7 @@ class CG_Parameters(OptimizerParameters): class CG(_SciPyOptimizer): """CG""" - + require_gradients: bool = True method: str = "CG" hyperparameters: CG_Parameters = CG_Parameters() diff --git a/src/f3dasm/_src/optimization/lbfgsb.py b/src/f3dasm/_src/optimization/lbfgsb.py index c240f29c..071fbb63 100644 --- a/src/f3dasm/_src/optimization/lbfgsb.py +++ b/src/f3dasm/_src/optimization/lbfgsb.py @@ -33,7 +33,7 @@ class LBFGSB_Parameters(OptimizerParameters): class LBFGSB(_SciPyOptimizer): """L-BFGS-B""" - + require_gradients: bool = True method: str = "L-BFGS-B" hyperparameters: LBFGSB_Parameters = LBFGSB_Parameters() diff --git a/src/f3dasm/_src/optimization/neldermead.py b/src/f3dasm/_src/optimization/neldermead.py index dc501f05..f3a9df93 100644 --- a/src/f3dasm/_src/optimization/neldermead.py +++ b/src/f3dasm/_src/optimization/neldermead.py @@ -33,7 +33,7 @@ class NelderMead_Parameters(OptimizerParameters): class NelderMead(_SciPyOptimizer): """Nelder-Mead""" - + require_gradients: bool = False method: str = "Nelder-Mead" hyperparameters: NelderMead_Parameters = NelderMead_Parameters() diff --git a/src/f3dasm/_src/optimization/optimizer.py b/src/f3dasm/_src/optimization/optimizer.py index d7cf2678..7f2ceb64 100644 --- a/src/f3dasm/_src/optimization/optimizer.py +++ b/src/f3dasm/_src/optimization/optimizer.py @@ -73,6 +73,7 @@ class OptimizerParameters: class Optimizer: type: ClassVar[str] = 'any' + require_gradients: ClassVar[bool] = False hyperparameters: OptimizerParameters = OptimizerParameters() def __init__( @@ -86,17 +87,17 @@ def __init__( Domain indicating the search-space of the optimization parameters seed : Optional[int], optional Seed of the random number generator for stochastic optimization - processes, by default None, set to random + processes, by default None, set to random name : Optional[str], optional Name of the optimization object, 
by default None, - it will use the name of the class + it will use the name of the class Note ---- Any additional keyword arguments will be used to overwrite - the default hyperparameters of the optimizer. + the default hyperparameters of the optimizer. """ # Check if **hyperparameters is empty @@ -152,55 +153,15 @@ def set_seed(self): def reset(self, data: ExperimentData): """Reset the optimizer to its initial state""" - self.data = data + self.set_data(data) self.__post_init__() def set_data(self, data: ExperimentData): """Set the data attribute to the given data""" self.data = data - def set_x0(self, experiment_data: ExperimentData, mode: str): - """Set the initial population to the best n samples of the given data - - Parameters - ---------- - experiment_data : ExperimentData - Data to be used for the initial population - mode : str - Mode of selecting the initial population, by default 'best' - - Raises - ------ - ValueError - Raises when the mode is not recognized - - Note - ---- - The following modes are available: - - best: select the best n samples - - random: select n random samples - - last: select the last n samples - """ - if mode.lower() == 'best': - x0 = experiment_data.get_n_best_output( - self.hyperparameters.population) - - elif mode.lower() == 'random': - x0 = experiment_data.select( - np.random.choice( - experiment_data.index, - self.hyperparameters.population, replace=False)) - - elif mode.lower() == 'last': - x0 = experiment_data.select( - experiment_data.index[-self.hyperparameters.population:]) - - else: - raise ValueError( - f'Unknown selection mode {mode}, use best, random or last') - - x0._reset_index() - self.data = x0 + def add_experiments(self, experiments: ExperimentData): + ... def get_name(self) -> str: """Get the name of the optimizer diff --git a/src/f3dasm/_src/optimization/randomsearch.py b/src/f3dasm/_src/optimization/randomsearch.py index 05399a1f..445cd851 100644 --- a/src/f3dasm/_src/optimization/randomsearch.py +++ b/src/f3dasm/_src/optimization/randomsearch.py @@ -35,7 +35,7 @@ class RandomSearch_Parameters(OptimizerParameters): class RandomSearch(Optimizer): """Naive random search""" - + require_gradients: bool = False hyperparameters: RandomSearch_Parameters = RandomSearch_Parameters() def set_seed(self): diff --git a/src/f3dasm/datageneration/__init__.py b/src/f3dasm/datageneration/__init__.py index 657e0a71..a5275f56 100644 --- a/src/f3dasm/datageneration/__init__.py +++ b/src/f3dasm/datageneration/__init__.py @@ -4,7 +4,7 @@ # Modules # ============================================================================= -from .._src.datageneration.datagenerator import DataGenerator +from .._src.datageneration.datagenerator import DataGenerator, convert_function # Authorship & Credits # ============================================================================= diff --git a/src/f3dasm/design.py b/src/f3dasm/design.py index cfd2b6c0..420c6328 100644 --- a/src/f3dasm/design.py +++ b/src/f3dasm/design.py @@ -9,6 +9,7 @@ from ._src.design.parameter import (PARAMETERS, _CategoricalParameter, _ConstantParameter, _ContinuousParameter, _DiscreteParameter, _Parameter) +from ._src.design.samplers import Sampler, SamplerNames from ._src.experimentdata._data import _Data from ._src.experimentdata._jobqueue import NoOpenJobsError, Status, _JobQueue @@ -34,4 +35,6 @@ 'Status', '_Data', '_JobQueue', + 'Sampler', + 'SamplerNames', ] diff --git a/src/f3dasm/hydra.py b/src/f3dasm/hydra.py new file mode 100644 index 00000000..0051d4f7 --- /dev/null +++ 
b/src/f3dasm/hydra.py @@ -0,0 +1,21 @@ +""" +Module for hydra utilities +""" +# Modules +# ============================================================================= + +# Local +from ._src.hydra_utils import update_config_with_experiment_sample + +# Authorship & Credits +# ============================================================================= +__author__ = 'Martin van der Schelling (M.P.vanderSchelling@tudelft.nl)' +__credits__ = ['Martin van der Schelling'] +__status__ = 'Stable' +# ============================================================================= +# +# ============================================================================= + +__all__ = [ + 'update_config_with_experiment_sample', +] diff --git a/tests/datageneration/__init__.py b/tests/datageneration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/datageneration/conftest.py b/tests/datageneration/conftest.py new file mode 100644 index 00000000..6523c98e --- /dev/null +++ b/tests/datageneration/conftest.py @@ -0,0 +1,35 @@ +from typing import Callable + +import pytest + +from f3dasm import ExperimentData +from f3dasm.design import Domain + + +@pytest.fixture(scope="package") +def experiment_data() -> ExperimentData: + domain = Domain() + domain.add_float('x', low=0.0, high=1.0) + + experiment_data = ExperimentData(domain=domain) + + experiment_data.sample(sampler='random', n_samples=10, seed=2023) + return experiment_data + + +def example_function(x: int, s: int): + return x + s, x - s + + +def example_function2(x: int): + return x, -x + + +@pytest.fixture(scope="package") +def function_1() -> Callable: + return example_function + + +@pytest.fixture(scope="package") +def function_2() -> Callable: + return example_function2 diff --git a/tests/datageneration/test_datagenerator.py b/tests/datageneration/test_datagenerator.py new file mode 100644 index 00000000..dadd361b --- /dev/null +++ b/tests/datageneration/test_datagenerator.py @@ -0,0 +1,28 @@ +from typing import Callable + +import pytest + +from f3dasm import ExperimentData +from f3dasm.datageneration import DataGenerator, convert_function + +pytestmark = pytest.mark.smoke + + +def test_convert_function( + experiment_data: ExperimentData, function_1: Callable): + data_generator = convert_function(f=function_1, input=['x'], output=[ + 'y0', 'y1'], kwargs={'s': 103}) + + assert isinstance(data_generator, DataGenerator) + + experiment_data.evaluate(data_generator) + + +def test_convert_function2( + experiment_data: ExperimentData, function_2: Callable): + data_generator = convert_function(f=function_2, input=['x'], output=[ + 'y0', 'y1']) + + assert isinstance(data_generator, DataGenerator) + + experiment_data.evaluate(data_generator) diff --git a/tests/design/test_data.py b/tests/design/test_data.py index 738f10c6..7fd5e7e5 100644 --- a/tests/design/test_data.py +++ b/tests/design/test_data.py @@ -99,3 +99,26 @@ def test_compatible_columns_add(): g = _Data(dg) _ = f + g + + +def test_overwrite_data(sample_data: _Data): + overwrite_data = _Data(pd.DataFrame( + {'input1': [5, 6, 7], 'input2': [8, 9, 10]})) + + sample_data.overwrite(other=overwrite_data, indices=[0, 1, 2]) + + pd.testing.assert_frame_equal(sample_data.data, overwrite_data.data, + check_dtype=False, atol=1e-6) + + +def test_overwrite_data2(sample_data: _Data): + overwrite_data = _Data(pd.DataFrame( + {'input1': [5, 6, ], 'input2': [8, 9]})) + + sample_data.overwrite(other=overwrite_data, indices=[1, 2]) + + ground_truth = _Data(pd.DataFrame( + {'input1': [1, 5, 6], 
'input2': [4, 8, 9]})) + + pd.testing.assert_frame_equal(sample_data.data, ground_truth.data, + check_dtype=False, atol=1e-6) diff --git a/tests/design/test_jobqueue.py b/tests/design/test_jobqueue.py index 781c2342..336d79e6 100644 --- a/tests/design/test_jobqueue.py +++ b/tests/design/test_jobqueue.py @@ -1,14 +1,17 @@ +from typing import Iterable + import pandas as pd import pytest -from f3dasm.design import NoOpenJobsError, _JobQueue, Status +from f3dasm.design import NoOpenJobsError, Status, _JobQueue pytestmark = pytest.mark.smoke @pytest.fixture def sample_job_queue(): - jobs = pd.Series(['open', 'open', 'in progress', 'finished'], dtype='string') + jobs = pd.Series(['open', 'open', 'in progress', + 'finished'], dtype='string') job_queue = _JobQueue(jobs) yield job_queue @@ -29,16 +32,19 @@ def test_job_queue_initialization(sample_job_queue: _JobQueue): def test_job_queue_repr_html(sample_job_queue: _JobQueue): assert isinstance(sample_job_queue._repr_html_(), str) + def test_job_queue_remove(sample_job_queue: _JobQueue): sample_job_queue.remove([1, 3]) - expected_jobs = pd.Series(['open', 'in progress'], index=[0, 2], dtype='string') + expected_jobs = pd.Series(['open', 'in progress'], index=[ + 0, 2], dtype='string') assert sample_job_queue.jobs.equals(expected_jobs) def test_job_queue_add(): job_queue = _JobQueue() job_queue.add(5, 'open') - assert job_queue.jobs.equals(pd.Series(['open', 'open', 'open', 'open', 'open'], dtype='string')) + assert job_queue.jobs.equals( + pd.Series(['open', 'open', 'open', 'open', 'open'], dtype='string')) def test_job_queue_reset(sample_job_queue: _JobQueue): @@ -53,7 +59,8 @@ def test_job_queue_get_open_job(sample_job_queue: _JobQueue): def test_job_queue_get_open_job_no_jobs(): - jobs = pd.Series(['finished', 'finished', 'in progress', 'finished'], dtype='string') + jobs = pd.Series(['finished', 'finished', 'in progress', + 'finished'], dtype='string') job_queue = _JobQueue(jobs) with pytest.raises(NoOpenJobsError): job_queue.get_open_job() @@ -79,7 +86,8 @@ def test_job_queue_mark_as_error(sample_job_queue: _JobQueue): def test_job_queue_mark_all_in_progress_open(sample_job_queue: _JobQueue): sample_job_queue.mark_all_in_progress_open() - assert sample_job_queue.jobs.equals(pd.Series(['open', 'open', 'open', 'finished'], dtype='string')) + assert sample_job_queue.jobs.equals( + pd.Series(['open', 'open', 'open', 'finished'], dtype='string')) def test_job_queue_is_all_finished(sample_job_queue: _JobQueue): diff --git a/tests/design/test_space.py b/tests/design/test_space.py index 26352099..76e7850b 100644 --- a/tests/design/test_space.py +++ b/tests/design/test_space.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from f3dasm.design import (_CategoricalParameter, _ContinuousParameter, - _DiscreteParameter) +from f3dasm.design import (_CategoricalParameter, _ConstantParameter, + _ContinuousParameter, _DiscreteParameter) pytestmark = pytest.mark.smoke @@ -119,5 +119,63 @@ def test_duplicates_categories_categorical_space(): _ = _CategoricalParameter(categories=categories) +@pytest.mark.parametrize("args", [((0., 5.), (-1., 3.), (-1., 5.),), + ((0., 5.), (1., 3.), (0., 5.),), + ((-1., 3.), (0., 5.), (-1., 5.),), + ((0., 5.), (0., 5.), (0., 5.),)]) +def test_add_continuous(args): + a, b, expected = args + param_a = _ContinuousParameter(*a) + param_b = _ContinuousParameter(*b) + + assert param_a + param_b == _ContinuousParameter(*expected) + + +@pytest.mark.parametrize("args", [((0., 5.), (6., 10.)),]) +def test_faulty_continuous_ranges(args): + 
a, b = args
+    param_a = _ContinuousParameter(*a)
+    param_b = _ContinuousParameter(*b)
+    with pytest.raises(ValueError):
+        param_a + param_b
+
+
+def test_faulty_continuous_log():
+    a = _ContinuousParameter(1., 5., log=True)
+    b = _ContinuousParameter(0., 5., log=False)
+    with pytest.raises(ValueError):
+        a + b
+
+
+@pytest.mark.parametrize("args", [(('test1', 'test2'), ('test3',), ('test1', 'test2', 'test3'),),
+                                  (('test1', 'test3'), ('test3',),
+                                   ('test1', 'test3'),)])
+def test_add_categorical(args):
+    a, b, expected = args
+    param_a = _CategoricalParameter(list(a))
+    param_b = _CategoricalParameter(list(b))
+
+    assert param_a + param_b == _CategoricalParameter(list(expected))
+
+
+@pytest.mark.parametrize(
+    "args",
+    [(_CategoricalParameter(['test1', 'test2']), _ConstantParameter('test3'), _CategoricalParameter(['test1', 'test2', 'test3']),),
+     (_CategoricalParameter(['test1', 'test2']), _DiscreteParameter(
+         1, 3), _CategoricalParameter(['test1', 'test2', 1, 2]),),
+     (_CategoricalParameter(['test1', 'test2']), _ConstantParameter(
+         'test1'), _CategoricalParameter(['test1', 'test2']),),
+     (_CategoricalParameter(['test1', 'test2']), _CategoricalParameter([
+         'test1']), _CategoricalParameter(['test1', 'test2']),),
+     (_ConstantParameter('test3'), _CategoricalParameter(
+         ['test1', 'test2']), _CategoricalParameter(['test1', 'test2', 'test3']))
+     ])
+def test_add_combination(args):
+    a, b, expected = args
+    assert a + b == expected
+
+
 if __name__ == "__main__":  # pragma: no cover
     pytest.main()
diff --git a/tests/experimentdata/test__jobqueue.py b/tests/experimentdata/test__jobqueue.py
new file mode 100644
index 00000000..b38268bc
--- /dev/null
+++ b/tests/experimentdata/test__jobqueue.py
@@ -0,0 +1,36 @@
+import pandas as pd
+
+from f3dasm._src.experimentdata._jobqueue import _JobQueue
+
+
+def test_select_all_with_matching_status():
+    # Create a job queue with some jobs
+    job_queue = _JobQueue()
+    job_queue.jobs = pd.Series(['in progress', 'running', 'completed', 'in progress', 'failed'])
+
+    # Select all jobs with status 'in progress'
+    selected_jobs = job_queue.select_all('in progress')
+
+    # Check if the selected jobs match the expected result
+    assert (selected_jobs.jobs == ['in progress', 'in progress']).all()
+
+def test_select_all_with_no_matching_status():
+    # Create a job queue with some jobs
+    job_queue = _JobQueue()
+    job_queue.jobs = pd.Series(['in progress', 'running', 'completed', 'in progress', 'failed'])
+
+    # Select all jobs with status 'cancelled'
+    selected_jobs = job_queue.select_all('cancelled')
+
+    # Check if the selected jobs match the expected result
+    assert selected_jobs.jobs.empty
+
+def test_select_all_with_empty_job_queue():
+    # Create an empty job queue
+    job_queue = _JobQueue()
+
+    # Select all jobs with status 'in progress'
+    selected_jobs = job_queue.select_all('in progress')
+
+    # Check if the selected jobs match the expected result
+    assert selected_jobs.jobs.empty
diff --git a/tests/experimentdata/test_experimentdata.py b/tests/experimentdata/test_experimentdata.py
index c53b0be6..10142ffd 100644
--- a/tests/experimentdata/test_experimentdata.py
+++ b/tests/experimentdata/test_experimentdata.py
@@ -35,8 +35,8 @@ def test_experiment_data_init(experimentdata: ExperimentData, domain: Domain):

 def test_experiment_data_add(experimentdata: ExperimentData, experimentdata2: ExperimentData, domain: Domain):
     experimentdata_total = ExperimentData(domain)
-    experimentdata_total._add_experiments(experimentdata)
-
experimentdata_total._add_experiments(experimentdata2) + experimentdata_total.add_experiments(experimentdata) + experimentdata_total.add_experiments(experimentdata2) assert experimentdata_total == experimentdata + experimentdata2 @@ -48,6 +48,7 @@ def test_experiment_data_len_empty(domain: Domain): def test_experiment_data_len_equals_input_data(experimentdata: ExperimentData): assert len(experimentdata) == len(experimentdata._input_data) + @pytest.mark.parametrize("slice_type", [3, [0, 1, 3]]) def test_experiment_data_select(slice_type: int | Iterable[int], experimentdata: ExperimentData): input_data = experimentdata._input_data[slice_type] @@ -70,7 +71,7 @@ def test_from_file(experimentdata_continuous: ExperimentData, seed: int, tmp_pat # Check if the input_data attribute of ExperimentData matches the expected_data pd.testing.assert_frame_equal( - experimentdata_continuous._input_data.to_dataframe(), experimentdata_from_file._input_data.to_dataframe()) + experimentdata_continuous._input_data.to_dataframe(), experimentdata_from_file._input_data.to_dataframe(), check_dtype=False, atol=1e-6) pd.testing.assert_frame_equal(experimentdata_continuous._output_data.to_dataframe(), experimentdata_from_file._output_data.to_dataframe()) pd.testing.assert_series_equal( @@ -138,6 +139,9 @@ def test_from_object(experimentdata_continuous: ExperimentData): def test_to_numpy(experimentdata_continuous: ExperimentData, numpy_array: np.ndarray): x, y = experimentdata_continuous.to_numpy() + + # cast x to floats + x = x.astype(float) # assert if x and numpy_array have all the same values assert np.allclose(x, numpy_array) @@ -151,7 +155,8 @@ def test_to_xarray(experimentdata_continuous: ExperimentData, xarray_dataset: xr def test_to_pandas(experimentdata_continuous: ExperimentData, pandas_dataframe: pd.DataFrame): exported_dataframe, _ = experimentdata_continuous.to_pandas() # assert if pandas_dataframe is equal to exported_dataframe - assert exported_dataframe.equals(pandas_dataframe) + pd.testing.assert_frame_equal( + exported_dataframe, pandas_dataframe, atol=1e-6, check_dtype=False) # Exporters # ====================================================================================== @@ -460,10 +465,9 @@ def mock_pd_read_pickle(*args, **kwargs): # Check if the input_data attribute of ExperimentData matches the expected_data pd.testing.assert_frame_equal( - experiment_data._input_data.to_dataframe(), experimentdata_expected._input_data.to_dataframe(), check_dtype=False) + experiment_data._input_data.to_dataframe(), experimentdata_expected._input_data.to_dataframe(), check_dtype=False, atol=1e-6) pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), experimentdata_expected._output_data.to_dataframe(), check_dtype=False) - assert experiment_data == experimentdata_expected @pytest.mark.parametrize("input_data", [pd_input(), path_input, str_input, data_input(), numpy_input()]) @@ -536,18 +540,14 @@ def mock_pd_read_pickle(*args, **kwargs): # Check if the input_data attribute of ExperimentData matches the expected_data pd.testing.assert_frame_equal( - experiment_data._input_data.to_dataframe(), experimentdata_expected_no_output._input_data.to_dataframe()) + experiment_data._input_data.to_dataframe(), experimentdata_expected_no_output._input_data.to_dataframe(), atol=1e-6, check_dtype=False) pd.testing.assert_frame_equal(experiment_data._output_data.to_dataframe(), experimentdata_expected_no_output._output_data.to_dataframe()) pd.testing.assert_series_equal( experiment_data._jobs.jobs, 
experimentdata_expected_no_output._jobs.jobs) - assert experiment_data._input_data == experimentdata_expected_no_output._input_data - assert experiment_data._output_data == experimentdata_expected_no_output._output_data - assert experiment_data.domain == experimentdata_expected_no_output.domain + # assert experiment_data.domain == experimentdata_expected_no_output.domain assert experiment_data._jobs == experimentdata_expected_no_output._jobs - assert experiment_data == experimentdata_expected_no_output - @pytest.mark.parametrize("input_data", [None]) @pytest.mark.parametrize("output_data", [None]) @@ -663,7 +663,7 @@ def test_evaluate_mode(mode: str, experimentdata_continuous: ExperimentData, tmp def test_get_input_data(experimentdata_expected_no_output: ExperimentData): input_data = experimentdata_expected_no_output.get_input_data() df, _ = input_data.to_pandas() - pd.testing.assert_frame_equal(df, pd_input()) + pd.testing.assert_frame_equal(df, pd_input(), check_dtype=False, atol=1e-6) assert experimentdata_expected_no_output._input_data == input_data._input_data @@ -674,13 +674,14 @@ def test_get_input_data_selection(experimentdata_expected_no_output: ExperimentD if isinstance(selection, str): selection = [selection] selected_pd = pd_input()[selection] - pd.testing.assert_frame_equal(df, selected_pd) + pd.testing.assert_frame_equal( + df, selected_pd, check_dtype=False, atol=1e-6) def test_get_output_data(experimentdata_expected: ExperimentData): output_data = experimentdata_expected.get_output_data() _, df = output_data.to_pandas() - pd.testing.assert_frame_equal(df, pd_output()) + pd.testing.assert_frame_equal(df, pd_output(), check_dtype=False) assert experimentdata_expected._output_data == output_data._output_data @@ -691,7 +692,7 @@ def test_get_output_data_selection(experimentdata_expected: ExperimentData, sele if isinstance(selection, str): selection = [selection] selected_pd = pd_output()[selection] - pd.testing.assert_frame_equal(df, selected_pd) + pd.testing.assert_frame_equal(df, selected_pd, check_dtype=False) def test_iter_behaviour(experimentdata_continuous: ExperimentData): @@ -703,5 +704,30 @@ def test_iter_behaviour(experimentdata_continuous: ExperimentData): assert isinstance(i, ExperimentSample) +def test_select_with_status_open(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('open') + assert all(job == Status.OPEN for job in selected_data._jobs.jobs) + + +def test_select_with_status_in_progress(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('in progress') + assert all(job == Status.IN_PROGRESS for job in selected_data._jobs.jobs) + + +def test_select_with_status_finished(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('finished') + assert all(job == Status.FINISHED for job in selected_data._jobs.jobs) + + +def test_select_with_status_error(experimentdata: ExperimentData): + selected_data = experimentdata.select_with_status('error') + assert all(job == Status.ERROR for job in selected_data._jobs.jobs) + + +def test_select_with_status_invalid_status(experimentdata: ExperimentData): + with pytest.raises(ValueError): + _ = experimentdata.select_with_status('invalid_status') + + if __name__ == "__main__": # pragma: no cover pytest.main() diff --git a/tests/optimization/test_all_optimizers.py b/tests/optimization/test_all_optimizers.py index 26ff0886..ef1c23a8 100644 --- a/tests/optimization/test_all_optimizers.py +++ b/tests/optimization/test_all_optimizers.py 
@@ -82,7 +82,9 @@ def test_all_optimizers_3_functions(seed: int, data_generator: DataGenerator, op @pytest.mark.parametrize("iterations", [10, 23, 66, 86]) @pytest.mark.parametrize("optimizer", OPTIMIZERS) @pytest.mark.parametrize("data_generator", ["sphere"]) -def test_optimizer_iterations(iterations: int, data_generator: str, optimizer: str): +@pytest.mark.parametrize("x0_selection", ["best", "new"]) +def test_optimizer_iterations(iterations: int, data_generator: str, + optimizer: str, x0_selection: str): numsamples = 40 # initial samples seed = 42 @@ -111,12 +113,24 @@ def test_optimizer_iterations(iterations: int, data_generator: str, optimizer: s data.evaluate(data_generator, mode='sequential', kwargs={'seed': seed, 'noise': None, 'scale_bounds': np.tile([-1.0, 1.0], (dim, 1)), }) - data.optimize(optimizer=optimizer, data_generator=data_generator, - iterations=iterations, kwargs={'seed': seed, 'noise': None, - 'scale_bounds': np.tile([-1.0, 1.0], (dim, 1)), }, - hyperparameters={'seed': seed}) + _optimizer = _optimizer_factory(optimizer, domain=domain) - assert len(data) == (iterations + numsamples) + if x0_selection == "new" and iterations < _optimizer.hyperparameters.population: + with pytest.raises(ValueError): + data.optimize(optimizer=optimizer, data_generator=data_generator, + iterations=iterations, kwargs={'seed': seed, 'noise': None, + 'scale_bounds': np.tile([-1.0, 1.0], (dim, 1)), }, + hyperparameters={'seed': seed}, + x0_selection=x0_selection) + else: + + data.optimize(optimizer=optimizer, data_generator=data_generator, + iterations=iterations, kwargs={'seed': seed, 'noise': None, + 'scale_bounds': np.tile([-1.0, 1.0], (dim, 1)), }, + hyperparameters={'seed': seed}, + x0_selection=x0_selection) + + assert len(data) == (iterations + numsamples) if __name__ == "__main__": # pragma: no cover
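     pytest.main()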