Skip to content

Commit f5dfd2c

Browse files
authored
Merge pull request #201 from lanl/genesis-datacard
Genesis datacard reader
2 parents c420f48 + 2b5d658 commit f5dfd2c

File tree

4 files changed

+64
-7
lines changed

4 files changed

+64
-7
lines changed

dsi/core.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class Terminal():
2929
PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader', 'file_writer', 'collection_reader']
3030
VALID_ENV = ['Hostname', 'SystemKernel', 'GitInfo']
3131
VALID_READERS = ['Bueno', 'Csv', 'YAML1', 'TOML1', 'Parquet', 'Schema', 'JSON', 'MetadataReader1', 'Ensemble', 'Cloverleaf', 'Dict']
32-
VALID_DATACARDS = ['Oceans11Datacard', 'DublinCoreDatacard', 'SchemaOrgDatacard', 'GoogleDatacard']
32+
VALID_DATACARDS = ['Oceans11Datacard', 'DublinCoreDatacard', 'SchemaOrgDatacard', 'GoogleDatacard', 'GenesisDatacard']
3333
VALID_WRITERS = ['ER_Diagram', 'Table_Plot', 'Csv_Writer', 'Parquet_Writer']
3434
VALID_PLUGINS = VALID_ENV + VALID_READERS + VALID_WRITERS + VALID_DATACARDS
3535
VALID_BACKENDS = ['Gufi', 'Sqlite', 'DuckDB', 'SqlAlchemy', 'HPSS']
@@ -87,8 +87,8 @@ def static_munge(prefix, implementations):
8787

8888
self.user_wrapper = False
8989
self.new_tables = None
90-
self.dsi_tables = ["runtable", "filesystem", "oceans11_datacard", "dublin_core_datacard", "schema_org_datacard", "google_datacard"]
91-
90+
self.dsi_tables = ["runtable", "filesystem", "oceans11_datacard", "dublin_core_datacard",
91+
"schema_org_datacard", "google_datacard", "genesis_datacard"]
9292
self.logger = logging.getLogger(self.__class__.__name__)
9393
self.debug_level = debug
9494
if debug == 1 or debug == 2:

dsi/dsi.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ def list_readers(self):
134134
print("SchemaOrgDatacard : Loads dataset metadata adhering to schema.org (JSON)")
135135
print("GoogleDatacard : Loads dataset metadata adhering to the Google Data Cards Playbook (YAML)")
136136
print("Oceans11Datacard : Loads dataset metadata for Oceans11 DSI data server (oceans11.lanl.gov) (YAML)")
137+
print("GenesisDatacard : Loads dataset metadata for LANL Genesis data standard (CSV)")
137138
print()
138139

139140
def read(self, filenames, reader_name, table_name = None):
@@ -157,6 +158,7 @@ def read(self, filenames, reader_name, table_name = None):
157158
- "SchemaOrgDatacard" → .json
158159
- "GoogleDatacard" → .yaml or .yml
159160
- "Oceans11Datacard" → .yaml or .yml
161+
- "GenesisDatacard" → .xlsx
160162
161163
`reader_name` : str
162164
Name of the DSI Reader to use for loading the data.
@@ -236,6 +238,8 @@ def read(self, filenames, reader_name, table_name = None):
236238
self.t.load_module('plugin', 'SchemaOrgDatacard', 'reader', filenames=filenames)
237239
elif reader_name.lower() == "googledatacard":
238240
self.t.load_module('plugin', 'GoogleDatacard', 'reader', filenames=filenames)
241+
elif reader_name.lower() == "genesisdatacard":
242+
self.t.load_module('plugin', 'GenesisDatacard', 'reader', filenames=filenames)
239243
elif reader_name.lower() == "bueno":
240244
self.t.load_module('plugin', 'Bueno', 'reader', filenames=filenames)
241245
elif reader_name.lower() == "csv":
@@ -266,7 +270,7 @@ def read(self, filenames, reader_name, table_name = None):
266270
if correct_reader == False:
267271
print("read() ERROR: Please check your spelling of the 'reader_name' argument as it does not exist in DSI\n")
268272
elg = "Collection, CSV, Parquet, YAML1, TOML1, JSON, Ensemble, Cloverleaf, Bueno, DublinCoreDatacard, SchemaOrgDatacard"
269-
sys.exit(f"Eligible readers are: {elg}, GoogleDatacard, Oceans11Datacard")
273+
sys.exit(f"Eligible readers are: {elg}, GoogleDatacard, Oceans11Datacard, GenesisDatacard")
270274

271275
table_keys = [k for k in self.t.new_tables if k not in ("dsi_relations", "dsi_units")]
272276
if self.schema_read == True:

dsi/plugins/file_reader.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from hashlib import sha1
44
import json
55
from math import isnan
6-
from pandas import DataFrame, read_csv, concat
6+
from pandas import DataFrame, read_csv, concat, read_excel
77
import re
88
import yaml
99
try: import tomllib
@@ -943,6 +943,58 @@ def add_rows(self) -> None:
943943
self.datacard_data["google_datacard"] = temp_data
944944
self.set_schema_2(self.datacard_data)
945945

946+
class GenesisDatacard(FileReader):
    """
    DSI Reader that stores a dataset's data card as a row in the `genesis_datacard` table.
    Input datacard should follow template in `examples/test/template_dc_genesis.xlsx`
    """
    def __init__(self, filenames, **kwargs):
        """
        `filenames` : str or list of str
            File name(s) of Excel data card files to ingest. Each file must adhere to the
            LANL Genesis metadata standard. The Excel file must only have one sheet.
        """
        super().__init__(filenames, **kwargs)
        # Normalize to a list so add_rows() can always iterate over file names.
        if isinstance(filenames, str):
            self.datacard_files = [filenames]
        else:
            self.datacard_files = filenames
        self.genesis_data = OrderedDict()

    def add_rows(self) -> None:
        """
        Flattens data in the input data card as a row in the `genesis_datacard` table

        Only rows whose 'Requirement Level' is "mandatory" (case-insensitive) are ingested.
        Each such row is stored under its 'Metadata Element' name, or under its
        'Supporting Element' name when the 'Metadata Element' cell is blank.
        Values from every input file are appended to the same per-element lists.

        `return`: None.

        `raises`: ValueError
            If a file cannot be read as an Excel sheet, or if it is missing any of the required columns.
        """
        required_columns = ["Metadata Element", "Supporting Element", "Requirement Level", "LANL Input Example"]
        temp_data = OrderedDict()
        for filename in self.datacard_files:
            try:
                # sheet_name = 0 selects the first sheet of the workbook
                temp_df = read_excel(filename, sheet_name = 0)
            except Exception as err:
                # Keep the original cause attached so the real read failure stays visible.
                raise ValueError(f"Error reading in {filename} for the Genesis data card reader") from err

            if not set(required_columns).issubset(temp_df.columns.tolist()):
                raise ValueError(f"The required metadata columns are {', '.join(required_columns)}")

            for _, row in temp_df.iterrows():
                level = row["Requirement Level"]
                # Blank Excel cells are read as NaN (a float), which has no .lower();
                # treat any non-string requirement level as "not mandatory".
                if not isinstance(level, str) or level.lower() != "mandatory":
                    continue

                value = row["LANL Input Example"]
                metadata = row["Metadata Element"]
                supporting = row["Supporting Element"]
                if isinstance(metadata, str) and metadata.strip():
                    temp_data.setdefault(metadata, []).append(value)
                elif isinstance(supporting, str) and supporting.strip():
                    temp_data.setdefault(supporting, []).append(value)

        self.genesis_data["genesis_datacard"] = temp_data
        self.set_schema_2(self.genesis_data)
997+
946998
class MetadataReader1(FileReader):
947999
"""
9481000
DSI Reader that reads in an individual or a set of JSON metadata files
@@ -999,4 +1051,4 @@ def add_rows(self) -> None:
9991051
filename = self.target_table_prefix + "__" + filename
10001052
self.metadata_file_data[filename] = json_data
10011053

1002-
self.set_schema_2(self.metadata_file_data)
1054+
self.set_schema_2(self.metadata_file_data)

requirements.extras.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ nbformat>=5.10.2
44
graphviz>=0.20.3
55
opencv-python>=4.9.0.80
66
duckdb>=1.2.0
7-
xmltodict>=0.14.2
7+
xmltodict>=0.14.2
8+
openpyxl>=3.1.5

0 commit comments

Comments
 (0)