Commit 1b46daa

Merge pull request #201 from SciCatProject/dataset-query
Add experimental ScicatClient.query_datasets
2 parents 1b554e0 + af912bb commit 1b46daa
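
For orientation, here is a minimal usage sketch of the new method. The server URL, token, and proposal ID are made up, and the low-level ``ScicatClient`` is assumed to be reachable through the high-level client's ``scicat`` attribute:

    from scitacean import Client

    # Hypothetical SciCat URL and token; substitute your own deployment's values.
    client = Client.from_token(
        url="https://scicat.example.com/api/v3",
        token="secret-token",
    )

    # Query the low-level client (assumed to be `client.scicat`) for the five
    # newest datasets of a made-up proposal.
    datasets = client.scicat.query_datasets(
        {"proposalId": "abc.123"},
        limit=5,
        order="creationTime:desc",
    )
    for dset in datasets:
        print(dset.pid, dset.datasetName)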

4 files changed: +343 −5 lines changed

docs/release-notes.rst

Lines changed: 2 additions & 0 deletions

@@ -41,6 +41,8 @@ Security
 Features
 ~~~~~~~~

+* Added experimental :meth:`client.ScicatClient.query_datasets` for querying datasets by field.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~

pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -98,11 +98,11 @@ extend-include = ["*.ipynb"]
 extend-exclude = [".*", "__pycache__", "build", "dist", "venv"]

 [tool.ruff.lint]
-select = ["B", "C4", "D", "DTZ", "E", "F", "G", "I", "FBT003", "PERF", "PGH", "PT", "PYI", "RUF", "S", "T20", "W"]
+select = ["B", "C4", "D", "DTZ", "E", "F", "G", "I", "FBT003", "PERF", "PGH", "PT", "PYI", "RUF", "S", "T20", "UP", "W"]
 ignore = [
     "D105", # most magic methods don't need docstrings as their purpose is always the same
     "E741", "E742", "E743", # do not use names ‘l’, ‘O’, or ‘I’; they are not a problem with a proper font
-    "UP038", # does not seem to work and leads to slower code
+    "UP038", # leads to slower code
     # Conflict with ruff format, see
     # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
     "COM812", "COM819", "D206", "D300", "E111", "E114", "E117", "ISC001", "ISC002", "Q000", "Q001", "Q002", "Q003", "W191",

src/scitacean/client.py

Lines changed: 118 additions & 3 deletions

@@ -6,6 +6,7 @@

 import dataclasses
 import datetime
+import json
 import re
 import warnings
 from collections.abc import Callable, Iterable, Iterator
@@ -15,6 +16,7 @@
 from urllib.parse import quote_plus

 import httpx
+import pydantic

 from . import model
 from ._base_model import convert_download_to_user_model
@@ -708,6 +710,112 @@ def get_dataset_model(
             **dset_json,
         )

+    def query_datasets(
+        self,
+        fields: dict[str, Any],
+        *,
+        limit: int | None = None,
+        order: str | None = None,
+        strict_validation: bool = False,
+    ) -> list[model.DownloadDataset]:
+        """Query for datasets in SciCat.
+
+        Attention
+        ---------
+        This function is experimental and may change or be removed in the future.
+        It is currently unclear how best to implement querying because SciCat
+        provides multiple, very different APIs and there are plans for supporting
+        queries via the Mongo query language directly.
+
+        See `issue #177 <https://github.com/SciCatProject/scitacean/issues/177>`_
+        for a discussion.
+
+        Parameters
+        ----------
+        fields:
+            Fields to query for.
+            Returned datasets must match all fields exactly.
+            See the examples below.
+        limit:
+            Maximum number of results to return.
+            Requires ``order`` to be specified.
+            If not given, all matching datasets are returned.
+        order:
+            Specify the order of results.
+            For example, ``"creationTime:asc"`` and ``"creationTime:desc"`` return
+            results in ascending or descending order of creation time, respectively.
+        strict_validation:
+            If ``True``, the datasets must pass validation.
+            If ``False``, datasets are still returned if validation fails.
+            Note that some dataset fields may have a bad value or type.
+            A warning will be logged if validation fails.
+
+        Returns
+        -------
+        :
+            A list of dataset models that match the query.
+
+        Examples
+        --------
+        Get all datasets belonging to proposal ``abc.123``:
+
+        .. code-block:: python
+
+            scicat_client.query_datasets({'proposalId': 'abc.123'})
+
+        Get all datasets that belong to proposal ``abc.123``
+        **and** have the name ``"ds name"`` (the name and proposal must match exactly):
+
+        .. code-block:: python
+
+            scicat_client.query_datasets({
+                'proposalId': 'abc.123',
+                'datasetName': 'ds name'
+            })
+
+        Return only the newest 5 datasets for proposal ``bc.123``:
+
+        .. code-block:: python
+
+            scicat_client.query_datasets(
+                {'proposalId': 'bc.123'},
+                limit=5,
+                order="creationTime:desc",
+            )
+        """
+        # Use a pydantic model to support serializing custom types to JSON.
+        params_model = pydantic.create_model(  # type: ignore[call-overload]
+            "QueryParams", **{key: (type(field), ...) for key, field in fields.items()}
+        )
+        params = {"fields": params_model(**fields).model_dump_json()}
+
+        limits: dict[str, str | int] = {}
+        if order is not None:
+            limits["order"] = order
+        if limit is not None:
+            if order is None:
+                raise ValueError("`order` is required when `limit` is specified.")
+            limits["limit"] = limit
+        if limits:
+            params["limits"] = json.dumps(limits)
+
+        dsets_json = self._call_endpoint(
+            cmd="get",
+            url="datasets/fullquery",
+            params=params,
+            operation="query_datasets",
+        )
+        if not dsets_json:
+            return []
+        return [
+            model.construct(
+                model.DownloadDataset,
+                _strict_validation=strict_validation,
+                **dset_json,
+            )
+            for dset_json in dsets_json
+        ]
+
     def get_orig_datablocks(
         self, pid: PID, strict_validation: bool = False
     ) -> list[model.DownloadOrigDatablock]:
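
To make the new query serialization concrete, here is a standalone sketch (with hypothetical field values) of how ``query_datasets`` assembles the two query parameters it sends to ``datasets/fullquery``:

    import datetime
    import json

    import pydantic

    # Hypothetical query: match a proposal ID and an exact creation time.
    fields = {
        "proposalId": "abc.123",
        "creationTime": datetime.datetime(2024, 5, 2, 12, 0, tzinfo=datetime.timezone.utc),
    }

    # A throwaway pydantic model handles JSON encoding of non-string values
    # such as datetimes.
    QueryParams = pydantic.create_model(
        "QueryParams", **{key: (type(value), ...) for key, value in fields.items()}
    )
    params = {"fields": QueryParams(**fields).model_dump_json()}

    # `limit` and `order` are combined into a second JSON-encoded parameter.
    params["limits"] = json.dumps({"order": "creationTime:desc", "limit": 5})

    print(params)
    # e.g. {'fields': '{"proposalId":"abc.123","creationTime":"2024-05-02T12:00:00Z"}',
    #       'limits': '{"order": "creationTime:desc", "limit": 5}'}
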
@@ -1010,7 +1118,12 @@ def validate_dataset_model(
             raise ValueError(f"Dataset {dset} did not pass validation in SciCat.")

     def _send_to_scicat(
-        self, *, cmd: str, url: str, data: model.BaseModel | None = None
+        self,
+        *,
+        cmd: str,
+        url: str,
+        data: model.BaseModel | None = None,
+        params: dict[str, str] | None = None,
     ) -> httpx.Response:
         if self._token is not None:
             token = self._token.get_str()
@@ -1029,6 +1142,7 @@
             content=data.model_dump_json(exclude_none=True)
             if data is not None
             else None,
+            params=params,
             headers=headers,
             timeout=self._timeout.seconds,
         )
@@ -1047,14 +1161,15 @@ def _call_endpoint(
         *,
         cmd: str,
         url: str,
-        data: model.BaseModel | None = None,
         operation: str,
+        data: model.BaseModel | None = None,
+        params: dict[str, str] | None = None,
     ) -> Any:
         full_url = _url_concat(self._base_url, url)
         logger = get_logger()
         logger.info("Calling SciCat API at %s for operation '%s'", full_url, operation)

-        response = self._send_to_scicat(cmd=cmd, url=full_url, data=data)
+        response = self._send_to_scicat(cmd=cmd, url=full_url, data=data, params=params)
         if not response.is_success:
             logger.error(
                 "SciCat API call to %s failed: %s %s: %s",
