Merge pull request #201 from lanl/genesis-datacard

jpulidojr · web-flow · commit f5dfd2cf4955 · 2025-11-18T13:39:58.000-07:00
Genesis datacard reader
diff --git a/dsi/core.py b/dsi/core.py
@@ -29,7 +29,7 @@ class Terminal():
     PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader', 'file_writer', 'collection_reader']
     VALID_ENV = ['Hostname', 'SystemKernel', 'GitInfo']
     VALID_READERS = ['Bueno', 'Csv', 'YAML1', 'TOML1', 'Parquet', 'Schema', 'JSON', 'MetadataReader1', 'Ensemble', 'Cloverleaf', 'Dict']
-    VALID_DATACARDS = ['Oceans11Datacard', 'DublinCoreDatacard', 'SchemaOrgDatacard', 'GoogleDatacard']
+    VALID_DATACARDS = ['Oceans11Datacard', 'DublinCoreDatacard', 'SchemaOrgDatacard', 'GoogleDatacard', 'GenesisDatacard']
     VALID_WRITERS = ['ER_Diagram', 'Table_Plot', 'Csv_Writer', 'Parquet_Writer']
     VALID_PLUGINS = VALID_ENV + VALID_READERS + VALID_WRITERS + VALID_DATACARDS
     VALID_BACKENDS = ['Gufi', 'Sqlite', 'DuckDB', 'SqlAlchemy', 'HPSS']
@@ -87,8 +87,8 @@ def static_munge(prefix, implementations):
 
         self.user_wrapper = False
         self.new_tables = None
-        self.dsi_tables = ["runtable", "filesystem", "oceans11_datacard", "dublin_core_datacard", "schema_org_datacard", "google_datacard"]
-
+        self.dsi_tables = ["runtable", "filesystem", "oceans11_datacard", "dublin_core_datacard", 
+                           "schema_org_datacard", "google_datacard", "genesis_datacard"]
         self.logger = logging.getLogger(self.__class__.__name__)
         self.debug_level = debug
         if debug == 1 or debug == 2:
diff --git a/dsi/dsi.py b/dsi/dsi.py
@@ -134,6 +134,7 @@ def list_readers(self):
         print("SchemaOrgDatacard    : Loads dataset metadata adhering to schema.org (JSON)")
         print("GoogleDatacard       : Loads dataset metadata adhering to the Google Data Cards Playbook (YAML)")
         print("Oceans11Datacard     : Loads dataset metadata for Oceans11 DSI data server (oceans11.lanl.gov) (YAML)")
+        print("GenesisDatacard      : Loads dataset metadata for LANL Genesis data standard (CSV)")
         print()
 
     def read(self, filenames, reader_name, table_name = None):
@@ -157,6 +158,7 @@ def read(self, filenames, reader_name, table_name = None):
                 - "SchemaOrgDatacard"    → .json
                 - "GoogleDatacard"       → .yaml or .yml
                 - "Oceans11Datacard"     → .yaml or .yml
+                - "GenesisDatacard"      → .xlsx
 
         `reader_name` : str
             Name of the DSI Reader to use for loading the data. 
@@ -236,6 +238,8 @@ def read(self, filenames, reader_name, table_name = None):
                         self.t.load_module('plugin', 'SchemaOrgDatacard', 'reader', filenames=filenames)
                     elif reader_name.lower() == "googledatacard":
                         self.t.load_module('plugin', 'GoogleDatacard', 'reader', filenames=filenames)
+                    elif reader_name.lower() == "genesisdatacard":
+                        self.t.load_module('plugin', 'GenesisDatacard', 'reader', filenames=filenames)
                     elif reader_name.lower() == "bueno":
                         self.t.load_module('plugin', 'Bueno', 'reader', filenames=filenames)
                     elif reader_name.lower() == "csv":
@@ -266,7 +270,7 @@ def read(self, filenames, reader_name, table_name = None):
             if correct_reader == False:
                 print("read() ERROR: Please check your spelling of the 'reader_name' argument as it does not exist in DSI\n")
                 elg = "Collection, CSV, Parquet, YAML1, TOML1, JSON, Ensemble, Cloverleaf, Bueno, DublinCoreDatacard, SchemaOrgDatacard"
-                sys.exit(f"Eligible readers are: {elg}, GoogleDatacard, Oceans11Datacard")
+                sys.exit(f"Eligible readers are: {elg}, GoogleDatacard, Oceans11Datacard, GenesisDatacard")
 
         table_keys = [k for k in self.t.new_tables if k not in ("dsi_relations", "dsi_units")]
         if self.schema_read == True:
diff --git a/dsi/plugins/file_reader.py b/dsi/plugins/file_reader.py
@@ -3,7 +3,7 @@
 from hashlib import sha1
 import json
 from math import isnan
-from pandas import DataFrame, read_csv, concat
+from pandas import DataFrame, read_csv, concat, read_excel
 import re
 import yaml
 try: import tomllib
@@ -943,6 +943,58 @@ def add_rows(self) -> None:
         self.datacard_data["google_datacard"] = temp_data
         self.set_schema_2(self.datacard_data)
 
+class GenesisDatacard(FileReader):
+    """
+    DSI Reader that stores a dataset's data card as a row in the `genesis_datacard` table.
+    Input datacard should follow template in `examples/test/template_dc_genesis.xlsx`
+    """
+    def __init__(self, filenames, **kwargs):
+        """
+        `filenames` : str or list of str
+            File name(s) of Excel data card files to ingest. Each file must adhere to the
+            LANL Genesis metadata standard. Only the first sheet of each file is read.
+        """
+        super().__init__(filenames, **kwargs)
+        self.datacard_files = [filenames] if isinstance(filenames, str) else filenames
+        self.genesis_data = OrderedDict()
+
+    @staticmethod
+    def _clean(cell) -> str:
+        """Return `cell` stripped if it is a string, else "" (pandas reads blank cells as NaN)."""
+        return cell.strip() if isinstance(cell, str) else ""
+
+    def add_rows(self) -> None:
+        """
+        Flattens the mandatory elements of each input data card into the `genesis_datacard` table.
+        A mandatory row is keyed by its Metadata Element, or by its Supporting Element when the
+        Metadata Element cell is blank; its value is the row's LANL Input Example cell.
+
+        `return`: None.
+        `raises`: ValueError if a file cannot be read as Excel or lacks a required column.
+        """
+        required_columns = ["Metadata Element", "Supporting Element", "Requirement Level", "LANL Input Example"]
+        temp_data = OrderedDict()
+        for filename in self.datacard_files:
+            try:
+                temp_df = read_excel(filename, sheet_name = 0)
+            except Exception as e:
+                # Chain the cause so the underlying failure (missing file, bad format) stays visible
+                raise ValueError(f"Error reading in {filename} for the Genesis data card reader") from e
+
+            if not set(required_columns).issubset(temp_df.columns):
+                raise ValueError(f"The required metadata columns are {', '.join(required_columns)}")
+
+            for _, row in temp_df.iterrows():
+                # Non-text Requirement Level cells (blank -> NaN) previously crashed on .lower()
+                if self._clean(row["Requirement Level"]).lower() != "mandatory":
+                    continue
+                element = self._clean(row["Metadata Element"]) or self._clean(row["Supporting Element"])
+                if element:
+                    temp_data.setdefault(element, []).append(row["LANL Input Example"])
+
+        self.genesis_data["genesis_datacard"] = temp_data
+        self.set_schema_2(self.genesis_data)
+        
 class MetadataReader1(FileReader):
     """
     DSI Reader that reads in an individual or a set of JSON metadata files
@@ -999,4 +1051,4 @@ def add_rows(self) -> None:
                 filename = self.target_table_prefix + "__" + filename
             self.metadata_file_data[filename] = json_data
 
-        self.set_schema_2(self.metadata_file_data)
+        self.set_schema_2(self.metadata_file_data)
diff --git a/requirements.extras.txt b/requirements.extras.txt
@@ -4,4 +4,5 @@ nbformat>=5.10.2
 graphviz>=0.20.3
 opencv-python>=4.9.0.80
 duckdb>=1.2.0
-xmltodict>=0.14.2
+xmltodict>=0.14.2
+openpyxl>=3.1.5