Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce a new inventory JSON/dict format that preserves more primitive types #200

Merged
merged 1 commit into from
Feb 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions speasy/core/impex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,8 +914,7 @@ def _concatenate_variables(variables: Dict[str, SpeasyVariable], product_id) ->
if len(variables) == 0:
return None
elif len(variables) == 1:
result = list(variables.values())[0].copy()
return result
return list(variables.values())[0]

axes = []
columns = []
Expand Down
48 changes: 34 additions & 14 deletions speasy/core/inventory/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,15 @@ def __contains__(self, item: str or ComponentIndex):
return True
return False


class ArgumentIndex(SpeasyIndex):
def __init__(self, name: str, provider: str, uid: str, meta: Optional[dict] = None):
super().__init__(name, provider, uid, meta)

def __repr__(self):
return f'<ArgumentIndex: {self.spz_name()}>'


class ArgumentListIndex(SpeasyIndex):
def __init__(self, name: str, provider: str, uid: str, meta: Optional[dict] = None):
super().__init__(name, provider, uid, meta)
Expand All @@ -103,7 +105,7 @@ def _arguments(self):
def __repr__(self):
return f'<ArgumentListIndex: {self.spz_name()}>'

def __getitem__(self, item)->ArgumentIndex:
def __getitem__(self, item) -> ArgumentIndex:
return self._arguments[item]

def __len__(self):
Expand All @@ -112,6 +114,7 @@ def __len__(self):
def __iter__(self):
return self._arguments.__iter__()


class TemplatedParameterIndex(ParameterIndex):
__spz_arguments__: ArgumentListIndex

Expand Down Expand Up @@ -145,33 +148,48 @@ def __contains__(self, item: str or ParameterIndex):
return False


def to_dict(inventory_tree: SpeasyIndex or str):
def to_dict(inventory_tree: SpeasyIndex or str, version: int = 1):
if isinstance(inventory_tree, SpeasyIndex):
return {key: to_dict(value) for key, value in inventory_tree.__dict__.items()}
elif type(inventory_tree) is not str:
return str(inventory_tree)
return {key: to_dict(value, version=version) for key, value in inventory_tree.__dict__.items()}
elif version <= 1:
if type(inventory_tree) is not str:
inventory_tree = str(inventory_tree)
else:
if type(inventory_tree) in [list, tuple, set]:
return type(inventory_tree)([to_dict(value, version) for value in inventory_tree])
if type(inventory_tree) is dict:
return {key: to_dict(value, version) for key, value in inventory_tree.items()}
if type(inventory_tree) not in [str, int, float, bool, type(None)]:
return str(inventory_tree)

return inventory_tree


def from_dict(inventory_tree: dict or str):
if type(inventory_tree) is str:
return inventory_tree
def from_dict(inventory_tree: dict or str, version: int = 1):
if version <= 1:
if type(inventory_tree) is str:
return inventory_tree
else:
if type(inventory_tree) in [str, int, float, bool, type(None), list, tuple, set]:
return inventory_tree
if type(inventory_tree) is dict and "__spz_type__" not in inventory_tree:
return inventory_tree
idx_type = inventory_tree.pop("__spz_type__")
idx_name = inventory_tree.pop("__spz_name__")
idx_provider = inventory_tree.pop("__spz_provider__")
idx_uid = inventory_tree.pop("__spz_uid__")
idx_meta = {key: from_dict(value) for key, value in inventory_tree.items()}
idx_meta = {key: from_dict(value, version) for key, value in inventory_tree.items()}
root = __INDEXES_TYPES__.get(idx_type, SpeasyIndex)(name=idx_name, provider=idx_provider, uid=idx_uid,
meta=idx_meta)
return root


def to_json(inventory_tree: SpeasyIndex, sort_keys=True):
return json.dumps(to_dict(inventory_tree), sort_keys=sort_keys)
def to_json(inventory_tree: SpeasyIndex, sort_keys=True, version: int = 1):
return json.dumps(to_dict(inventory_tree, version), sort_keys=sort_keys)


def from_json(inventory_tree: str):
return from_dict(json.loads(inventory_tree))
def from_json(inventory_tree: str, version: int = 1):
return from_dict(json.loads(inventory_tree), version)


def make_inventory_node(parent, ctor, name, provider, uid, **meta):
Expand All @@ -191,4 +209,6 @@ def inventory_has_changed(orig, new):
return True
return False

AnyProductIndex = Union[ParameterIndex, TemplatedParameterIndex, DatasetIndex, TimetableIndex, CatalogIndex, ComponentIndex]

AnyProductIndex = Union[
ParameterIndex, TemplatedParameterIndex, DatasetIndex, TimetableIndex, CatalogIndex, ComponentIndex]
5 changes: 3 additions & 2 deletions speasy/core/proxy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

log = logging.getLogger(__name__)
PROXY_ALLOWED_KWARGS = ['disable_proxy']
MINIMUM_REQUIRED_PROXY_VERSION = Version("0.11.0")
MINIMUM_REQUIRED_PROXY_VERSION = Version("0.12.0")
_CURRENT_PROXY_SERVER_VERSION = None

if proxy_cfg.url() == "" or proxy_cfg.enabled() == False:
Expand Down Expand Up @@ -113,13 +113,14 @@ def get(provider: str, **kwargs):
kwargs['provider'] = provider
kwargs['format'] = 'python_dict'
kwargs['zstd_compression'] = zstd_compression
kwargs['version'] = 2
headers = {}
if saved_inventory is not None:
headers["If-Modified-Since"] = parser.parse(saved_inventory.build_date).ctime()
resp = http.get(f"{url}/get_inventory", params=kwargs, headers=headers)
log.debug(f"Asking {provider} inventory from proxy {resp.url}, {resp.headers}")
if resp.status_code == 200:
inventory = inventory_from_dict(pickle.loads(decompress(resp.bytes)))
inventory = inventory_from_dict(pickle.loads(decompress(resp.bytes)), version=2)
index.set("proxy_inventories", provider, inventory)
index.set("proxy_inventories_save_date", provider, datetime.utcnow())
return inventory
Expand Down
19 changes: 12 additions & 7 deletions speasy/data_providers/cda/_inventory_builder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
_MASTERS_CDF_PATH = f"{cda_cfg.inventory_data_path()}/masters_cdf/"
_XML_CATALOG_PATH = f"{cda_cfg.inventory_data_path()}/all.xml"

_CDAWEB_INVENTORY_ = "cdaweb-inventory"
_CDAWEB_INVENTORY_TREE_ = "tree_v2"
_CDAWEB_INVENTORY_LAST_MODIFIED_MASTERS_ = "masters-last-modified"
_CDAWEB_INVENTORY_LAST_MODIFIED_XML_ = "last_modified_xml"


def _ensure_path_exists(path: str):
dirname = os.path.dirname(path)
Expand All @@ -37,21 +42,21 @@ def _download_and_extract_master_cdf(masters_url: str):

def update_master_cdf(masters_url: str = "https://spdf.gsfc.nasa.gov/pub/software/cdawlib/0MASTERS/master.tar"):
last_modified = http.head(masters_url).headers['last-modified']
if index.get("cdaweb-inventory", "masters-last-modified", "") != last_modified:
if index.get(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_MASTERS_, "") != last_modified:
_clean_master_cdf_folder()
_download_and_extract_master_cdf(masters_url)
index.set("cdaweb-inventory", "masters-last-modified", last_modified)
index.set(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_MASTERS_, last_modified)
return True
return False


def update_xml_catalog(xml_catalog_url: str = "https://spdf.gsfc.nasa.gov/pub/catalogs/all.xml"):
last_modified = http.head(xml_catalog_url).headers['last-modified']
if index.get("cdaweb-inventory", "xml_catalog-last-modified", "") != last_modified:
if index.get(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_XML_, "") != last_modified:
_ensure_path_exists(_XML_CATALOG_PATH)
with open(_XML_CATALOG_PATH, 'w') as f:
f.write(http.get(xml_catalog_url).text)
index.set("cdaweb-inventory", "xml_catalog-last-modified", last_modified)
index.set(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_LAST_MODIFIED_XML_, last_modified)
return True
return False

Expand All @@ -60,11 +65,11 @@ def build_inventory(root: SpeasyIndex = None, xml_catalog_url: str = "https://sp
masters_url: str = "https://spdf.gsfc.nasa.gov/pub/software/cdawlib/0MASTERS/master.tar"):
needs_rebuild = update_xml_catalog(xml_catalog_url)
needs_rebuild |= update_master_cdf(masters_url)
if needs_rebuild or not index.contains("cdaweb-inventory", "tree"):
if needs_rebuild or not index.contains(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_TREE_):
root = load_xml_catalog(xml_file_path=_XML_CATALOG_PATH, root=root)
update_tree(root=root, master_cdf_dir=_MASTERS_CDF_PATH)
index.set("cdaweb-inventory", "tree", to_dict(root))
index.set(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_TREE_, to_dict(root, version=2))
else:
t = from_dict(index.get("cdaweb-inventory", "tree"))
t = from_dict(index.get(_CDAWEB_INVENTORY_, _CDAWEB_INVENTORY_TREE_), version=2)
root.__dict__ = t.__dict__
return root
31 changes: 23 additions & 8 deletions speasy/data_providers/csa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from tempfile import TemporaryDirectory
from typing import Optional, Tuple, Dict

import numpy as np
from astroquery.utils.tap.core import TapPlus

from speasy.core import any_files, AllowedKwargs, fix_name, EnsureUTCDateTime
Expand All @@ -21,6 +22,18 @@
log = logging.getLogger(__name__)


def _only_primitive_types(d: dict) -> dict:
for k, v in d.items():
if not isinstance(v, (int, bool, str, type(None), list, tuple, set)):
if isinstance(v, np.integer):
d[k] = int(v)
elif isinstance(v, np.floating):
d[k] = float(v)
elif isinstance(v, (np.bool_,bool)):
d[k] = bool(v)
return d


def to_dataset_and_variable(index_or_str: ParameterIndex or str) -> Tuple[str, str]:
if type(index_or_str) is str:
parts = index_or_str.split('/')
Expand All @@ -38,7 +51,7 @@ def register_dataset(instruments, datasets, dataset):
name = fix_name(meta['dataset_id'])
node = make_inventory_node(instruments[dataset['instruments']], DatasetIndex, name=name,
provider="csa",
uid=meta['dataset_id'], **meta)
uid=meta['dataset_id'], **_only_primitive_types(meta))
datasets[meta['dataset_id']] = node


Expand All @@ -49,7 +62,7 @@ def register_observatory(missions, observatories, observatory):
name=fix_name(name),
provider="csa",
uid=name,
**meta)
**_only_primitive_types(meta))
observatories[name] = node


Expand All @@ -58,7 +71,7 @@ def register_mission(inventory_tree, missions, mission):
name = meta.pop('name')
node = make_inventory_node(inventory_tree, SpeasyIndex, name=fix_name(name),
provider="csa",
uid=name, **meta)
uid=name, **_only_primitive_types(meta))
missions[name] = node


Expand All @@ -68,7 +81,7 @@ def register_instrument(observatories, instruments, instrument):
node = make_inventory_node(observatories.get(instrument['observatories'], observatories['MULTIPLE']),
SpeasyIndex, name=fix_name(name),
provider="csa",
uid=name, **meta)
uid=name, **_only_primitive_types(meta))
instruments[name] = node


Expand All @@ -81,15 +94,17 @@ def register_param(datasets, parameter):
meta['stop_date'] = parent_dataset.stop_date
name = fix_name(meta['parameter_id'])
make_inventory_node(parent_dataset, ParameterIndex, name=name,
provider="csa", uid=f"{parameter['dataset_id']}/{parameter['parameter_id']}", **meta)
provider="csa", uid=f"{parameter['dataset_id']}/{parameter['parameter_id']}",
**_only_primitive_types(meta))


def build_inventory(root: SpeasyIndex, tapurl="https://csa.esac.esa.int/csa-sl-tap/tap/"):
CSA = TapPlus(url=tapurl)
missions_req = CSA.launch_job_async("SELECT * FROM csa.v_mission")
observatories_req = CSA.launch_job_async("SELECT * FROM csa.v_observatory")
instruments_req = CSA.launch_job_async("SELECT * FROM csa.v_instrument")
datasets_req = CSA.launch_job_async("SELECT * FROM csa.v_dataset WHERE dataset_id like '%GRMB' OR (is_cef='true' AND is_istp='true')")
datasets_req = CSA.launch_job_async(
"SELECT * FROM csa.v_dataset WHERE dataset_id like '%GRMB' OR (is_cef='true' AND is_istp='true')")
parameters_req = CSA.launch_job_async("SELECT * FROM csa.v_parameter WHERE data_type='Data' AND value_type<>'CHAR'")
missions = {}
observatories = {}
Expand Down Expand Up @@ -135,7 +150,7 @@ def _dataset_range(self, dataset: str or DatasetIndex) -> DateTimeRange:
def _dl_variable(self,
dataset: str, variable: str,
start_time: datetime, stop_time: datetime, extra_http_headers: Dict[str, str] or None = None) -> \
Optional[SpeasyVariable]:
Optional[SpeasyVariable]:

# https://csa.esac.esa.int/csa-sl-tap/data?RETRIEVAL_TYPE=product&&DATASET_ID=C3_CP_PEA_LERL_DEFlux&START_DATE=2001-06-10T22:12:14Z&END_DATE=2001-06-11T06:12:14Z&DELIVERY_FORMAT=CDF_ISTP&DELIVERY_INTERVAL=all
ds_range = self._dataset_range(dataset)
Expand Down Expand Up @@ -238,5 +253,5 @@ def get_data(self, product, start_time: datetime, stop_time: datetime,

def get_variable(self, dataset: str, variable: str, start_time: datetime or str, stop_time: datetime or str,
**kwargs) -> \
Optional[SpeasyVariable]:
Optional[SpeasyVariable]:
return self.get_data(f"{dataset}/{variable}", start_time, stop_time, **kwargs)
53 changes: 53 additions & 0 deletions tests/test_inventories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import unittest
from ddt import ddt, data, unpack

import speasy as spz
from speasy.core.inventory.indexes import from_dict, to_dict, SpeasyIndex
from speasy.core.dataprovider import DataProvider


def compare_inventories(inventory1: SpeasyIndex, inventory2: SpeasyIndex):
if inventory1.spz_name() != inventory2.spz_name():
print(f"Name mismatch: {inventory1.spz_name()} != {inventory2.spz_name()}")
return False
for key in inventory1.__dict__.keys():
if key not in inventory2.__dict__:
print(f"Key missing: {key}")
return False
value1 = inventory1.__dict__[key]
value2 = inventory2.__dict__[key]
if isinstance(value1, SpeasyIndex) and isinstance(value2, SpeasyIndex):
if not compare_inventories(value1, value2):
return False
elif value1 != value2:
print(f"Value mismatch: {value1} != {value2}")
return False
return True


@ddt
class FromDictAndToDictPreserveInventory(unittest.TestCase):

def assertInventoryEqual(self, inventory1: SpeasyIndex, inventory2: SpeasyIndex):
if inventory1.spz_name() != inventory2.spz_name():
self.fail(f"Name mismatch: {inventory1.spz_name()} != {inventory2.spz_name()}")
for key in inventory1.__dict__.keys():
if key not in inventory2.__dict__:
self.fail(f"Key missing: {key}")
value1 = inventory1.__dict__[key]
value2 = inventory2.__dict__[key]
if isinstance(value1, SpeasyIndex) and isinstance(value2, SpeasyIndex):
self.assertInventoryEqual(value1, value2)
elif value1 != value2:
self.fail(f"Value mismatch: {value1}({type(value1)}) != {value2}({type(value2)}) for key {key}")

@data(
(spz.amda,),
(spz.cda,),
(spz.ssc,),
(spz.csa,),
)
@unpack
def test_from_dict_and_to_dict_preserve_inventory(self, provider: DataProvider):
inventory = provider._inventory(provider_name=provider.provider_name, disable_proxy=True)
self.assertInventoryEqual(inventory, from_dict(to_dict(inventory, version=2), version=2))
Loading