Skip to content

Commit 997bf92

Browse files
authored
Stac validation (#9)
* Adjust STAC output for validator
* Change time extent to tuple
* Fix mypy grumbling, add TemporalExtent
* Fix flake8 grumbling
1 parent e3f3a30 commit 997bf92

File tree

3 files changed

+64
-9
lines changed

3 files changed

+64
-9
lines changed

export_client_cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
@click.option(
1818
"-r",
1919
"--refresh-dir",
20-
help="If specified, will read the manifest.json out of an existing directory and re-execute the query to update results."
20+
help=("If specified, will read the manifest.json out of an existing "
21+
"directory and re-execute the query to update results.")
2122
)
2223
@click.option(
2324
"-t",

isamples_export_client/duckdb_utilities.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import datetime
12
import json
23
from typing import Optional
34

@@ -14,15 +15,37 @@ def __repr__(self):
1415
return f"GeoFeaturesResult geo_json={self.geo_json}, bbox={self.bbox}"
1516

1617

18+
class TemporalExtent(tuple):
    """A two-element tuple ``(t0, t1)`` holding the bounds of a time interval.

    Either bound may be ``None``. Because this is a ``tuple`` subclass it
    compares, unpacks and indexes exactly like a plain 2-tuple.
    """

    def __new__(cls, t0: Optional[datetime.datetime], t1: Optional[datetime.datetime]):
        # Tuples are immutable, so the contents must be supplied in __new__.
        # Passing ``cls`` (rather than hard-coding TemporalExtent) keeps
        # subclasses working: Sub(t0, t1) yields a Sub, not a TemporalExtent.
        return super().__new__(cls, (t0, t1))
22+
23+
1724
def read_geo_features_from_jsonl(filename: str) -> Optional[GeoFeaturesResult]:
    """Compute the spatial envelope of the sample locations in a JSON-lines file.

    Runs a DuckDB spatial query that aggregates every record with a non-null
    ``produced_by.sampling_site.sample_location.longitude`` into one envelope,
    then selects that envelope's extent (``bb``) and its GeoJSON form
    (``poly``).

    Args:
        filename: Path of the newline-delimited JSON file to query.

    Returns:
        A ``GeoFeaturesResult`` built from the ``bb`` and ``poly`` columns, or
        ``None`` if the query yields no row.
    """
    location_prefix = "produced_by.sampling_site.sample_location."
    # The path is embedded in a SQL string literal, so double any single
    # quotes to keep a quote in the file name from breaking the statement.
    sql_filename = filename.replace("'", "''")
    q = ("select ST_Extent(envelope)as bb, ST_AsGEOJSON(envelope) as poly "
         f"from (select ST_Envelope_Agg(ST_Point({location_prefix}longitude, {location_prefix}latitude)) as envelope "
         f"from '{sql_filename}' where {location_prefix}longitude is not null)")
    con = duckdb.connect()
    try:
        con.install_extension("spatial")
        con.load_extension("spatial")
        spatial_results = con.sql(q).fetchone()
    finally:
        # Always release the connection, even if the query fails.
        con.close()
    if spatial_results is None:
        return None
    # NOTE(review): an aggregate query normally returns one row even when no
    # record matches, in which case both columns would be NULL -- confirm
    # that GeoFeaturesResult tolerates (None, None).
    return GeoFeaturesResult(spatial_results[0], spatial_results[1])
38+
39+
40+
def get_temporal_extent_from_jsonl(filename: str) -> TemporalExtent:
    """Find the earliest and latest ``produced_by.result_time`` in a JSON-lines file.

    Loads the file into a DuckDB table with the session time zone set to UTC,
    casts each non-null ``produced_by.result_time`` to ``TIMESTAMPTZ`` and
    takes the minimum and maximum.

    Args:
        filename: Path of the newline-delimited JSON file to query.

    Returns:
        ``TemporalExtent(min_t, max_t)`` from the query result, or
        ``TemporalExtent(None, None)`` if the query yields no row.
    """
    # The path is embedded in a SQL string literal, so double any single
    # quotes to keep a quote in the file name from breaking the statement.
    sql_filename = filename.replace("'", "''")
    con = duckdb.connect()
    try:
        q = f"SET TimeZone='UTC'; CREATE TABLE samples AS SELECT * FROM read_json('{sql_filename}', format='newline_delimited');"
        con.sql(q)
        q = ("SELECT min(produced_by.result_time::TIMESTAMPTZ) as min_t, "
             "max(produced_by.result_time::TIMESTAMPTZ) as max_t "
             "from samples where produced_by.result_time is not null")
        result = con.sql(q).fetchone()
    finally:
        # Always release the connection (and its in-memory table).
        con.close()
    if result is not None:
        return TemporalExtent(result[0], result[1])
    return TemporalExtent(None, None)

isamples_export_client/export_client.py

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010
import requests
1111
from requests import Session, Response
1212

13-
from isamples_export_client.duckdb_utilities import GeoFeaturesResult, read_geo_features_from_jsonl
13+
from isamples_export_client.duckdb_utilities import (
14+
GeoFeaturesResult,
15+
TemporalExtent,
16+
read_geo_features_from_jsonl,
17+
get_temporal_extent_from_jsonl
18+
)
1419
from isamples_export_client.geoparquet_utilities import write_geoparquet_from_json_lines
1520

1621
GEOPARQUET = "geoparquet"
@@ -26,6 +31,7 @@
2631

2732
STAC_COLLECTION_TYPE = "Collection"
2833
STAC_VERSION = "1.0.0"
34+
STAC_DEFAULT_LICENSE = "CC-BY-4.0" # https://spdx.org/licenses/CC-BY-4.0.html
2935
COLLECTION_ID = "isamples-stac-collection-"
3036
COLLECTION_DESCRIPTION = """The Internet of Samples (iSamples) is a multi-disciplinary and multi-institutional
3137
project funded by the National Science Foundation to design, develop, and promote service infrastructure to uniquely,
@@ -41,6 +47,13 @@ def datetime_to_solr_format(dt):
4147
return dt.strftime(SOLR_TIME_FORMAT)
4248

4349

50+
class JsonDateTimeEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``datetime.datetime`` values as ISO 8601 strings.

    Pass it as ``cls=JsonDateTimeEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, o):
        """Return a JSON-serializable stand-in for ``o``.

        ``datetime.datetime`` instances become ISO 8601 strings truncated to
        whole seconds; any UTC offset on the value is kept in the output.
        Every other type is deferred to the base encoder, which raises
        ``TypeError``.
        """
        if isinstance(o, datetime.datetime):
            return o.isoformat(timespec="seconds")
        # super() follows the MRO, so this stays correct if the encoder is
        # combined with other JSONEncoder mixins.
        return super().default(o)
55+
56+
4457
class ExportJobStatus(Enum):
4558
CREATED = "created"
4659
STARTED = "started"
@@ -185,7 +198,15 @@ def write_manifest(self, query: str, uuid: str, tstarted: datetime.datetime, num
185198
f.write(json.dumps(manifests, indent=4))
186199
return manifest_path
187200

188-
def write_stac(self, uuid: str, tstarted: datetime.datetime, geo_result: GeoFeaturesResult, solr_query: str, json_file_path: str, parquet_file_path: str) -> str:
201+
def write_stac(
202+
self,
203+
uuid: str,
204+
tstarted: datetime.datetime,
205+
geo_result: GeoFeaturesResult,
206+
temporal_result: TemporalExtent,
207+
solr_query: str,
208+
json_file_path: str,
209+
parquet_file_path: str) -> str:
189210
assets_dict = {
190211
}
191212
description_string = (
@@ -210,8 +231,17 @@ def write_stac(self, uuid: str, tstarted: datetime.datetime, geo_result: GeoFeat
210231
"type": STAC_COLLECTION_TYPE,
211232
"id": f"iSamples Export Service result {uuid}",
212233
"collection": f"{COLLECTION_TITLE} {uuid}",
213-
"geometry": geo_result.geo_json_dict,
214-
"bbox": geo_result.bbox,
234+
"license": STAC_DEFAULT_LICENSE,
235+
"extent": {
236+
"spatial": {
237+
"bbox": [geo_result.bbox,]
238+
},
239+
"temporal": {
240+
"interval": [
241+
temporal_result
242+
]
243+
}
244+
},
215245
"properties": {
216246
"datetime": datetime_to_solr_format(tstarted)
217247
},
@@ -305,8 +335,8 @@ def write_stac(self, uuid: str, tstarted: datetime.datetime, geo_result: GeoFeat
305335
"assets": assets_dict
306336
}
307337
stac_path = ExportClient._stac_file_path(self._destination_directory)
308-
with open(stac_path, "w") as f:
309-
f.write(json.dumps(stac_item, indent=4))
338+
with open(stac_path, "w", encoding="UTF-8") as f:
339+
json.dump(stac_item, f, indent=4, ensure_ascii=False, cls=JsonDateTimeEncoder)
310340
return stac_path
311341

312342
def perform_full_download(self):
@@ -334,13 +364,14 @@ def perform_full_download(self):
334364
manifest_path = self.write_manifest(self._query, uuid, tstarted, num_results)
335365
logging.info(f"Successfully wrote manifest file to {manifest_path}")
336366
geo_result = read_geo_features_from_jsonl(filename)
367+
temporal_result = get_temporal_extent_from_jsonl(filename)
337368
parquet_filename = None
338369
if self.is_geoparquet:
339370
parquet_filename = write_geoparquet_from_json_lines(filename)
340371
query_string = status_json.get("query").replace("'", "\"")
341372
solr_query_dict = json.loads(query_string)
342373
query = solr_query_dict.pop("q")
343-
stac_path = self.write_stac(uuid, tstarted, geo_result, query, filename, parquet_filename)
374+
stac_path = self.write_stac(uuid, tstarted, geo_result, temporal_result, query, filename, parquet_filename)
344375
logging.info(f"Successfully wrote stac item to {stac_path}")
345376
break
346377
except Exception as e:

0 commit comments

Comments
 (0)