Skip to content

Commit

Permalink
Introduce a new inventory JSON/dict format that preserves more types
Browse files Browse the repository at this point in the history
It doesn't convert anymore to string primitive types or collections.

Signed-off-by: Alexis Jeandet <[email protected]>
  • Loading branch information
jeandet committed Feb 25, 2025
1 parent 9f39dd9 commit 5eab456
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 33 deletions.
3 changes: 1 addition & 2 deletions speasy/core/impex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,8 +914,7 @@ def _concatenate_variables(variables: Dict[str, SpeasyVariable], product_id) ->
if len(variables) == 0:
return None
elif len(variables) == 1:
result = list(variables.values())[0].copy()
return result
return list(variables.values())[0]

axes = []
columns = []
Expand Down
49 changes: 35 additions & 14 deletions speasy/core/inventory/indexes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from types import NoneType

Check notice

Code scanning / CodeQL

Unused import Note

Import of 'NoneType' is not used.
from typing import Optional, Union

__INDEXES_TYPES__ = {}
Expand Down Expand Up @@ -85,13 +86,15 @@ def __contains__(self, item: str or ComponentIndex):
return True
return False


class ArgumentIndex(SpeasyIndex):
def __init__(self, name: str, provider: str, uid: str, meta: Optional[dict] = None):
super().__init__(name, provider, uid, meta)

def __repr__(self):
return f'<ArgumentIndex: {self.spz_name()}>'


class ArgumentListIndex(SpeasyIndex):
def __init__(self, name: str, provider: str, uid: str, meta: Optional[dict] = None):
super().__init__(name, provider, uid, meta)
Expand All @@ -103,7 +106,7 @@ def _arguments(self):
def __repr__(self):
return f'<ArgumentListIndex: {self.spz_name()}>'

def __getitem__(self, item)->ArgumentIndex:
def __getitem__(self, item) -> ArgumentIndex:
return self._arguments[item]

def __len__(self):
Expand All @@ -112,6 +115,7 @@ def __len__(self):
def __iter__(self):
return self._arguments.__iter__()


class TemplatedParameterIndex(ParameterIndex):
__spz_arguments__: ArgumentListIndex

Expand Down Expand Up @@ -145,33 +149,48 @@ def __contains__(self, item: str or ParameterIndex):
return False


def to_dict(inventory_tree: SpeasyIndex or str):
def to_dict(inventory_tree: SpeasyIndex or str, version: int = 1):
if isinstance(inventory_tree, SpeasyIndex):
return {key: to_dict(value) for key, value in inventory_tree.__dict__.items()}
elif type(inventory_tree) is not str:
return str(inventory_tree)
return {key: to_dict(value, version=version) for key, value in inventory_tree.__dict__.items()}
elif version <= 1:
if type(inventory_tree) is not str:
inventory_tree = str(inventory_tree)
else:
if type(inventory_tree) in [list, tuple, set]:
return type(inventory_tree)([to_dict(value, version) for value in inventory_tree])
if type(inventory_tree) is dict:
return {key: to_dict(value, version) for key, value in inventory_tree.items()}
if type(inventory_tree) not in [str, int, float, bool, type(None)]:
return str(inventory_tree)

return inventory_tree


def from_dict(inventory_tree: dict or str):
if type(inventory_tree) is str:
return inventory_tree
def from_dict(inventory_tree: dict or str, version: int = 1):
if version <= 1:
if type(inventory_tree) is str:
return inventory_tree
else:
if type(inventory_tree) in [str, int, float, bool, type(None), list, tuple, set]:
return inventory_tree
if type(inventory_tree) is dict and "__spz_type__" not in inventory_tree:
return inventory_tree
idx_type = inventory_tree.pop("__spz_type__")
idx_name = inventory_tree.pop("__spz_name__")
idx_provider = inventory_tree.pop("__spz_provider__")
idx_uid = inventory_tree.pop("__spz_uid__")
idx_meta = {key: from_dict(value) for key, value in inventory_tree.items()}
idx_meta = {key: from_dict(value, version) for key, value in inventory_tree.items()}
root = __INDEXES_TYPES__.get(idx_type, SpeasyIndex)(name=idx_name, provider=idx_provider, uid=idx_uid,
meta=idx_meta)
return root


def to_json(inventory_tree: SpeasyIndex, sort_keys=True):
return json.dumps(to_dict(inventory_tree), sort_keys=sort_keys)
def to_json(inventory_tree: SpeasyIndex, sort_keys=True, version: int = 1):
return json.dumps(to_dict(inventory_tree, version), sort_keys=sort_keys)


def from_json(inventory_tree: str):
return from_dict(json.loads(inventory_tree))
def from_json(inventory_tree: str, version: int = 1):
return from_dict(json.loads(inventory_tree), version)


def make_inventory_node(parent, ctor, name, provider, uid, **meta):
Expand All @@ -191,4 +210,6 @@ def inventory_has_changed(orig, new):
return True
return False

AnyProductIndex = Union[ParameterIndex, TemplatedParameterIndex, DatasetIndex, TimetableIndex, CatalogIndex, ComponentIndex]

AnyProductIndex = Union[
ParameterIndex, TemplatedParameterIndex, DatasetIndex, TimetableIndex, CatalogIndex, ComponentIndex]
5 changes: 3 additions & 2 deletions speasy/core/proxy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

log = logging.getLogger(__name__)
PROXY_ALLOWED_KWARGS = ['disable_proxy']
MINIMUM_REQUIRED_PROXY_VERSION = Version("0.11.0")
MINIMUM_REQUIRED_PROXY_VERSION = Version("0.12.0")
_CURRENT_PROXY_SERVER_VERSION = None

if proxy_cfg.url() == "" or proxy_cfg.enabled() == False:
Expand Down Expand Up @@ -113,13 +113,14 @@ def get(provider: str, **kwargs):
kwargs['provider'] = provider
kwargs['format'] = 'python_dict'
kwargs['zstd_compression'] = zstd_compression
kwargs['version'] = 2
headers = {}
if saved_inventory is not None:
headers["If-Modified-Since"] = parser.parse(saved_inventory.build_date).ctime()
resp = http.get(f"{url}/get_inventory", params=kwargs, headers=headers)
log.debug(f"Asking {provider} inventory from proxy {resp.url}, {resp.headers}")
if resp.status_code == 200:
inventory = inventory_from_dict(pickle.loads(decompress(resp.bytes)))
inventory = inventory_from_dict(pickle.loads(decompress(resp.bytes)), version=2)
index.set("proxy_inventories", provider, inventory)
index.set("proxy_inventories_save_date", provider, datetime.utcnow())
return inventory
Expand Down
19 changes: 12 additions & 7 deletions speasy/data_providers/cda/_inventory_builder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
_MASTERS_CDF_PATH = f"{cda_cfg.inventory_data_path()}/masters_cdf/"
_XML_CATALOG_PATH = f"{cda_cfg.inventory_data_path()}/all.xml"

_CDAWEB_INVENTORY_ = "cdaweb-inventory"
_CDAWEB_INVENTORY_TREE_ = "tree_v2"
_CDAWEB_INVENTORY_LAST_MODIFIED_MASTERS_ = "masters-last-modified"
_CDAWEB_INVENTORY_LAST_MODIFIED_XML_ = "last_modified_xml"


def _ensure_path_exists(path: str):
dirname = os.path.dirname(path)
Expand All @@ -37,21 +42,21 @@ def _download_and_extract_master_cdf(masters_url: str):

def update_master_cdf(masters_url: str = "https://spdf.gsfc.nasa.gov/pub/software/cdawlib/0MASTERS/master.tar"):
last_modified = http.head(masters_url).headers['last-modified']
if index.get("cdaweb-inventory", "masters-last-modified", "") != last_modified:
if index.get(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_MASTERS_, "") != last_modified:
_clean_master_cdf_folder()
_download_and_extract_master_cdf(masters_url)
index.set("cdaweb-inventory", "masters-last-modified", last_modified)
index.set(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_MASTERS_, last_modified)
return True
return False


def update_xml_catalog(xml_catalog_url: str = "https://spdf.gsfc.nasa.gov/pub/catalogs/all.xml"):
last_modified = http.head(xml_catalog_url).headers['last-modified']
if index.get("cdaweb-inventory", "xml_catalog-last-modified", "") != last_modified:
if index.get(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_XML_, "") != last_modified:
_ensure_path_exists(_XML_CATALOG_PATH)
with open(_XML_CATALOG_PATH, 'w') as f:
f.write(http.get(xml_catalog_url).text)
index.set("cdaweb-inventory", "xml_catalog-last-modified", last_modified)
index.set(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_XML_, last_modified)
return True
return False

Expand All @@ -60,11 +65,11 @@ def build_inventory(root: SpeasyIndex = None, xml_catalog_url: str = "https://sp
masters_url: str = "https://spdf.gsfc.nasa.gov/pub/software/cdawlib/0MASTERS/master.tar"):
needs_rebuild = update_xml_catalog(xml_catalog_url)
needs_rebuild |= update_master_cdf(masters_url)
if needs_rebuild or not index.contains("cdaweb-inventory", "tree"):
if needs_rebuild or not index.contains(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_TREE_):
root = load_xml_catalog(xml_file_path=_XML_CATALOG_PATH, root=root)
update_tree(root=root, master_cdf_dir=_MASTERS_CDF_PATH)
index.set("cdaweb-inventory", "tree", to_dict(root))
index.set(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_TREE_, to_dict(root, version=2))
else:
t = from_dict(index.get("cdaweb-inventory", "tree"))
t = from_dict(index.get(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_TREE_), version=2)
root.__dict__ = t.__dict__
return root
31 changes: 23 additions & 8 deletions speasy/data_providers/csa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from tempfile import TemporaryDirectory
from typing import Optional, Tuple, Dict

import numpy as np
from astroquery.utils.tap.core import TapPlus

from speasy.core import any_files, AllowedKwargs, fix_name, EnsureUTCDateTime
Expand All @@ -21,6 +22,18 @@
log = logging.getLogger(__name__)


def _only_primitive_types(d: dict) -> dict:
for k, v in d.items():
if not isinstance(v, (int, bool, str, type(None), list, tuple, set)):
if isinstance(v, np.integer):
d[k] = int(v)
elif isinstance(v, np.floating):
d[k] = float(v)
elif isinstance(v, np.bool):
d[k] = bool(v)
return d


def to_dataset_and_variable(index_or_str: ParameterIndex or str) -> Tuple[str, str]:
if type(index_or_str) is str:
parts = index_or_str.split('/')
Expand All @@ -38,7 +51,7 @@ def register_dataset(instruments, datasets, dataset):
name = fix_name(meta['dataset_id'])
node = make_inventory_node(instruments[dataset['instruments']], DatasetIndex, name=name,
provider="csa",
uid=meta['dataset_id'], **meta)
uid=meta['dataset_id'], **_only_primitive_types(meta))
datasets[meta['dataset_id']] = node


Expand All @@ -49,7 +62,7 @@ def register_observatory(missions, observatories, observatory):
name=fix_name(name),
provider="csa",
uid=name,
**meta)
**_only_primitive_types(meta))
observatories[name] = node


Expand All @@ -58,7 +71,7 @@ def register_mission(inventory_tree, missions, mission):
name = meta.pop('name')
node = make_inventory_node(inventory_tree, SpeasyIndex, name=fix_name(name),
provider="csa",
uid=name, **meta)
uid=name, **_only_primitive_types(meta))
missions[name] = node


Expand All @@ -68,7 +81,7 @@ def register_instrument(observatories, instruments, instrument):
node = make_inventory_node(observatories.get(instrument['observatories'], observatories['MULTIPLE']),
SpeasyIndex, name=fix_name(name),
provider="csa",
uid=name, **meta)
uid=name, **_only_primitive_types(meta))
instruments[name] = node


Expand All @@ -81,15 +94,17 @@ def register_param(datasets, parameter):
meta['stop_date'] = parent_dataset.stop_date
name = fix_name(meta['parameter_id'])
make_inventory_node(parent_dataset, ParameterIndex, name=name,
provider="csa", uid=f"{parameter['dataset_id']}/{parameter['parameter_id']}", **meta)
provider="csa", uid=f"{parameter['dataset_id']}/{parameter['parameter_id']}",
**_only_primitive_types(meta))


def build_inventory(root: SpeasyIndex, tapurl="https://csa.esac.esa.int/csa-sl-tap/tap/"):
CSA = TapPlus(url=tapurl)
missions_req = CSA.launch_job_async("SELECT * FROM csa.v_mission")
observatories_req = CSA.launch_job_async("SELECT * FROM csa.v_observatory")
instruments_req = CSA.launch_job_async("SELECT * FROM csa.v_instrument")
datasets_req = CSA.launch_job_async("SELECT * FROM csa.v_dataset WHERE dataset_id like '%GRMB' OR (is_cef='true' AND is_istp='true')")
datasets_req = CSA.launch_job_async(
"SELECT * FROM csa.v_dataset WHERE dataset_id like '%GRMB' OR (is_cef='true' AND is_istp='true')")
parameters_req = CSA.launch_job_async("SELECT * FROM csa.v_parameter WHERE data_type='Data' AND value_type<>'CHAR'")
missions = {}
observatories = {}
Expand Down Expand Up @@ -135,7 +150,7 @@ def _dataset_range(self, dataset: str or DatasetIndex) -> DateTimeRange:
def _dl_variable(self,
dataset: str, variable: str,
start_time: datetime, stop_time: datetime, extra_http_headers: Dict[str, str] or None = None) -> \
Optional[SpeasyVariable]:
Optional[SpeasyVariable]:

# https://csa.esac.esa.int/csa-sl-tap/data?RETRIEVAL_TYPE=product&&DATASET_ID=C3_CP_PEA_LERL_DEFlux&START_DATE=2001-06-10T22:12:14Z&END_DATE=2001-06-11T06:12:14Z&DELIVERY_FORMAT=CDF_ISTP&DELIVERY_INTERVAL=all
ds_range = self._dataset_range(dataset)
Expand Down Expand Up @@ -238,5 +253,5 @@ def get_data(self, product, start_time: datetime, stop_time: datetime,

def get_variable(self, dataset: str, variable: str, start_time: datetime or str, stop_time: datetime or str,
**kwargs) -> \
Optional[SpeasyVariable]:
Optional[SpeasyVariable]:
return self.get_data(f"{dataset}/{variable}", start_time, stop_time, **kwargs)
54 changes: 54 additions & 0 deletions tests/test_inventories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import unittest
import os

Check notice

Code scanning / CodeQL

Unused import Note test

Import of 'os' is not used.
from ddt import ddt, data, unpack

import speasy as spz
from speasy.core.inventory.indexes import from_dict, to_dict, SpeasyIndex
from speasy.core.dataprovider import DataProvider


def compare_inventories(inventory1: SpeasyIndex, inventory2: SpeasyIndex):
if inventory1.spz_name() != inventory2.spz_name():
print(f"Name mismatch: {inventory1.spz_name()} != {inventory2.spz_name()}")
return False
for key in inventory1.__dict__.keys():
if key not in inventory2.__dict__:
print(f"Key missing: {key}")
return False
value1 = inventory1.__dict__[key]
value2 = inventory2.__dict__[key]
if isinstance(value1, SpeasyIndex) and isinstance(value2, SpeasyIndex):
if not compare_inventories(value1, value2):
return False
elif value1 != value2:
print(f"Value mismatch: {value1} != {value2}")
return False
return True


@ddt
class FromDictAndToDictPreserveInventory(unittest.TestCase):

def assertInventoryEqual(self, inventory1: SpeasyIndex, inventory2: SpeasyIndex):
if inventory1.spz_name() != inventory2.spz_name():
self.fail(f"Name mismatch: {inventory1.spz_name()} != {inventory2.spz_name()}")
for key in inventory1.__dict__.keys():
if key not in inventory2.__dict__:
self.fail(f"Key missing: {key}")
value1 = inventory1.__dict__[key]
value2 = inventory2.__dict__[key]
if isinstance(value1, SpeasyIndex) and isinstance(value2, SpeasyIndex):
self.assertInventoryEqual(value1, value2)
elif value1 != value2:
self.fail(f"Value mismatch: {value1}({type(value1)}) != {value2}({type(value2)}) for key {key}")

@data(
(spz.amda,),
(spz.cda,),
(spz.ssc,),
(spz.csa,),
)
@unpack
def test_from_dict_and_to_dict_preserve_inventory(self, provider: DataProvider):
inventory = provider._inventory(provider_name=provider.provider_name, disable_proxy=True)
self.assertInventoryEqual(inventory, from_dict(to_dict(inventory, version=2), version=2))

0 comments on commit 5eab456

Please sign in to comment.