Skip to content

Commit 9df5329

Browse files
committed
Update sqlite/duckdb to overwrite an existing db schema and add tests
1 parent 321e0e8 commit 9df5329

File tree

6 files changed

+152
-26
lines changed

6 files changed

+152
-26
lines changed

dsi/backends/duckdb.py

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,56 @@ def ingest_artifacts(self, collection, isVerbose=False):
175175
"""
176176
artifacts = collection
177177

178+
self.cur.execute("BEGIN TRANSACTION")
179+
180+
if self.list() is not None and list(artifacts.keys()) == ["dsi_relations"]:
181+
pk_list = artifacts["dsi_relations"]["primary_key"]
182+
fk_list = artifacts["dsi_relations"]["foreign_key"]
183+
pk_tables = set(t[0] for t in pk_list)
184+
fk_tables = set(t[0] for t in fk_list if t[0] != None)
185+
all_schema_tables = pk_tables.union(fk_tables)
186+
db_tables = [t[0] for t in self.list() if t[0] != "dsi_units"]
187+
188+
# check if tables from dsi_relations are all in the db
189+
if all_schema_tables.issubset(set(db_tables)):
190+
circ, _ = self.check_table_relations(all_schema_tables, artifacts["dsi_relations"])
191+
if circ:
192+
return (ValueError, f"A complex schema with a circular dependency cannot be ingested into a DuckDB backend.")
193+
194+
drop_order = all_schema_tables
195+
collect = self.process_artifacts()
196+
if "dsi_relations" in collect.keys():
197+
curr_pk_tables = set(t[0] for t in collect["dsi_relations"]["primary_key"])
198+
curr_fk_tables = set(t[0] for t in collect["dsi_relations"]["foreign_key"] if t[0] != None)
199+
curr_schema_tables = curr_pk_tables.union(curr_fk_tables)
200+
201+
# need to drop and reingest all tables in old schema and new schema
202+
all_schema_tables = all_schema_tables.union(curr_schema_tables)
203+
204+
_, ord_tables1 = self.check_table_relations(all_schema_tables, collect["dsi_relations"])
205+
drop_order = ord_tables1
206+
207+
for table in drop_order:
208+
self.cur.execute(f'DROP TABLE IF EXISTS "{table}";')
209+
try:
210+
self.con.commit()
211+
except Exception as e:
212+
self.cur.execute("ROLLBACK")
213+
self.cur.execute("CHECKPOINT")
214+
return (duckdb.Error, e)
215+
216+
#do not reingest tables not in old or new schema as they will be the same
217+
non_schema_tables = set(db_tables) - all_schema_tables
218+
for t in non_schema_tables:
219+
del collect[t]
220+
221+
collect["dsi_relations"] = artifacts["dsi_relations"]
222+
artifacts = collect
223+
224+
else:
225+
print("WARNING: Complex schemas can only be ingested if all referenced data tables are loaded into DSI.")
226+
227+
178228
table_order = artifacts.keys()
179229
if "dsi_relations" in artifacts.keys():
180230
circular, ordered_tables = self.check_table_relations(artifacts.keys(), artifacts["dsi_relations"])
@@ -184,10 +234,8 @@ def ingest_artifacts(self, collection, isVerbose=False):
184234
else:
185235
table_order = list(reversed(ordered_tables)) # ingest primary key tables first then children
186236

187-
self.cur.execute("BEGIN TRANSACTION")
188237
if self.runTable:
189-
runTable_create = "CREATE TABLE IF NOT EXISTS runTable " \
190-
"(run_id INTEGER PRIMARY KEY, run_timestamp TEXT UNIQUE);"
238+
runTable_create = "CREATE TABLE IF NOT EXISTS runTable (run_id INTEGER PRIMARY KEY, run_timestamp TEXT UNIQUE);"
191239
self.cur.execute(runTable_create)
192240

193241
sequence_run_id = "CREATE SEQUENCE IF NOT EXISTS seq_run_id START 1;"
@@ -387,13 +435,16 @@ def notebook(self, interactive=False):
387435
def read_to_artifact(self):
388436
return self.process_artifacts()
389437

390-
def process_artifacts(self):
438+
def process_artifacts(self, only_units_relations = False):
391439
"""
392440
Reads data from the DuckDB database into a nested OrderedDict.
393441
Keys are table names, and values are OrderedDicts containing table data.
394442
395443
If the database contains PK/FK relationships, they are stored in a special `dsi_relations` table.
396444
445+
`only_units_relations` : bool, default=False
446+
**USERS SHOULD IGNORE THIS FLAG.** Used internally by duckdb.py.
447+
397448
`return` : OrderedDict
398449
A nested OrderedDict containing all data from the DuckDB database.
399450
"""
@@ -404,20 +455,22 @@ def process_artifacts(self):
404455
SELECT table_name FROM information_schema.tables
405456
WHERE table_schema = 'main' AND table_type = 'BASE TABLE'
406457
""").fetchall()
407-
for item in tableList:
408-
tableName = self.duckdb_compatible_name(item[0])
409458

410-
tableInfo = self.cur.execute(f"PRAGMA table_info({tableName});").fetchdf()
411-
colDict = OrderedDict((self.duckdb_compatible_name(col), []) for col in tableInfo['name'])
459+
if only_units_relations == False:
460+
for item in tableList:
461+
tableName = self.duckdb_compatible_name(item[0])
462+
463+
tableInfo = self.cur.execute(f"PRAGMA table_info({tableName});").fetchdf()
464+
colDict = OrderedDict((self.duckdb_compatible_name(col), []) for col in tableInfo['name'])
412465

413-
data = self.cur.execute(f"SELECT * FROM {tableName};").fetchall()
414-
for row in data:
415-
for colName, val in zip(colDict.keys(), row):
416-
if val == "NULL":
417-
colDict[colName].append(None)
418-
else:
419-
colDict[colName].append(val)
420-
artifact[tableName] = colDict
466+
data = self.cur.execute(f"SELECT * FROM {tableName};").fetchall()
467+
for row in data:
468+
for colName, val in zip(colDict.keys(), row):
469+
if val == "NULL":
470+
colDict[colName].append(None)
471+
else:
472+
colDict[colName].append(val)
473+
artifact[tableName] = colDict
421474

422475
pk_list = []
423476
fkData = self.cur.execute(f"""
@@ -743,6 +796,8 @@ def list(self):
743796
SELECT table_name FROM information_schema.tables
744797
WHERE table_schema = 'main' AND table_type = 'BASE TABLE'
745798
""").fetchall()
799+
if not tableList:
800+
return None
746801
tableList = [self.duckdb_compatible_name(table[0]) for table in tableList]
747802

748803
info_list = []
@@ -839,12 +894,13 @@ def summary_helper(self, table_name):
839894
col_info = self.cur.execute(f"PRAGMA table_info({table_name})").fetchall()
840895

841896
numeric_types = {'INTEGER', 'REAL', 'FLOAT', 'NUMERIC', 'DECIMAL', 'DOUBLE', 'BIGINT'}
842-
headers = ['column', 'type', 'min', 'max', 'avg', 'std_dev']
897+
headers = ['column', 'type', 'unique', 'min', 'max', 'avg', 'std_dev']
843898
rows = []
844899

845900
for col in col_info:
846901
col_name = col[1]
847902
col_type = col[2].upper()
903+
unique_vals = self.cur.execute(f"SELECT COUNT(DISTINCT {col_name}) FROM {table_name};").fetchone()[0]
848904
is_primary = col[5] > 0
849905
display_name = f"{col_name}*" if is_primary else col_name
850906

@@ -863,7 +919,7 @@ def summary_helper(self, table_name):
863919

864920
if avg_val != None and std_dev == None:
865921
std_dev = 0
866-
rows.append([display_name, col_type, min_val, max_val, avg_val, std_dev])
922+
rows.append([display_name, col_type, unique_vals, min_val, max_val, avg_val, std_dev])
867923

868924
return headers, rows
869925

@@ -1007,7 +1063,7 @@ def visit(node):
10071063
if any(visit(node) for node in list(graph.keys())):
10081064
return True, None # Circular dependency detected
10091065

1010-
# Step 3: Order tables from least dependencies to most (if no circular dependencies)
1066+
# Order tables from least dependencies to most (if no circular dependencies)
10111067
in_degree = {table: 0 for table in tables}
10121068
for child in graph:
10131069
for parent in graph[child]:

dsi/backends/sqlite.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,12 +208,12 @@ def ingest_artifacts(self, collection, isVerbose=False):
208208
create_stmt += f"{col[1]} {col[2]}, "
209209

210210
if fk_dict:
211-
fk_stmt = "FOREIGN KEY "
211+
fk_stmt = ""
212212
for k, v in fk_dict.items():
213213
if k not in create_stmt:
214214
msg = f"Input schema references a nonexistent column, {k}, in the foreign_key section of {table}"
215215
raise ValueError(msg)
216-
fk_stmt += f"({k}) REFERENCES {v[0]}({v[1]}), "
216+
fk_stmt += f"FOREIGN KEY ({k}) REFERENCES {v[0]}({v[1]}), "
217217
create_stmt += fk_stmt
218218
create_stmt = create_stmt[:-2] + ");"
219219

dsi/dsi.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -304,10 +304,10 @@ def read(self, filenames, reader_name, table_name = None):
304304
sys.exit(f"read() ERROR: {e}")
305305
self.t.active_metadata = OrderedDict()
306306

307-
if len(table_keys) > 1:
308-
print(f"Loaded {filenames} into tables: {', '.join(table_keys)}")
309-
else:
307+
if len(table_keys) == 1:
310308
print(f"Loaded {filenames} into the table {table_keys[0]}")
309+
else:
310+
print(f"Loaded {filenames} into tables: {', '.join(table_keys)}")
311311

312312
def query(self, statement, collection = False, update = False):
313313
"""

dsi/tests/test_dsi.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import textwrap
66
from pandas import DataFrame
77
from collections import OrderedDict
8+
import hashlib
89

910
def test_list_functions():
1011
test = DSI()
@@ -888,6 +889,31 @@ def test_query_update_schema_sqlite_backend():
888889
assert data['i'].tolist() == [123,234]
889890
assert data['new_col'].tolist() == ["test1", "test1"]
890891

892+
def test_overwrite_schema_sqlite_backend():
893+
dbpath = 'data.db'
894+
if os.path.exists(dbpath):
895+
os.remove(dbpath)
896+
897+
test = DSI(filename=dbpath, backend_name= "Sqlite")
898+
test.schema(filename="examples/test/yaml1_schema.json")
899+
test.read(filenames=["examples/test/student_test1.yml", "examples/test/student_test2.yml"], reader_name='YAML1')
900+
test.write(filename="full_erd.png", writer_name="ER_Diagram")
901+
902+
test.schema(filename="examples/test/yaml1_circular_schema.json")
903+
test.write(filename="new_erd.png", writer_name="ER_Diagram")
904+
905+
def file_hash(path):
906+
sha = hashlib.sha256()
907+
with open(path, "rb") as f:
908+
sha.update(f.read())
909+
return sha.hexdigest()
910+
911+
hash1 = file_hash("full_erd.png")
912+
hash2 = file_hash("new_erd.png")
913+
914+
assert hash1 != hash2
915+
916+
891917

892918
# DUCKDB
893919
# DUCKDB
@@ -1730,4 +1756,46 @@ def test_query_update_schema_duckdb_backend():
17301756

17311757
data = test.get_table("math", collection=True, update=True)
17321758
assert data['specification'].tolist() == [123,234]
1733-
assert data['new_col'].tolist() == ["test1", "test1"]
1759+
assert data['new_col'].tolist() == ["test1", "test1"]
1760+
1761+
def test_overwrite_schema_duckdb_backend():
1762+
dbpath = 'data.db'
1763+
if os.path.exists(dbpath):
1764+
os.remove(dbpath)
1765+
1766+
test = DSI(filename=dbpath, backend_name= "DuckDB")
1767+
test.schema(filename="examples/test/yaml1_schema.json")
1768+
test.read(filenames=["examples/test/student_test1.yml", "examples/test/student_test2.yml"], reader_name='YAML1')
1769+
test.write(filename="full_erd.png", writer_name="ER_Diagram")
1770+
1771+
#loophole to assign new schema since there isnt another schema file that can be used with yaml data (circular wont work here)
1772+
new_schema = OrderedDict({'primary_key': [('address', 'i'), ('math', 'specification')], 'foreign_key': [('math', 'b'), (None, None)]})
1773+
test.read(filenames=new_schema, reader_name="Collection", table_name="dsi_relations") #loophole to assign new schema since
1774+
test.write(filename="new_erd.png", writer_name="ER_Diagram")
1775+
1776+
def file_hash(path):
1777+
sha = hashlib.sha256()
1778+
with open(path, "rb") as f:
1779+
sha.update(f.read())
1780+
return sha.hexdigest()
1781+
1782+
hash1 = file_hash("full_erd.png")
1783+
hash2 = file_hash("new_erd.png")
1784+
1785+
assert hash1 != hash2
1786+
1787+
def test_fail_overwrite_schema_duckdb_backend():
1788+
dbpath = 'data.db'
1789+
if os.path.exists(dbpath):
1790+
os.remove(dbpath)
1791+
1792+
test = DSI(filename=dbpath, backend_name= "DuckDB")
1793+
test.schema(filename="examples/test/yaml1_schema.json")
1794+
test.read(filenames=["examples/test/student_test1.yml", "examples/test/student_test2.yml"], reader_name='YAML1')
1795+
test.write(filename="full_erd.png", writer_name="ER_Diagram")
1796+
1797+
try:
1798+
test.schema(filename="examples/test/yaml1_circular_schema.json") # should not allow circular dependency overwrite
1799+
assert False
1800+
except:
1801+
assert True

examples/test/coreterminal.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
# a.load_module('plugin', 'Schema', 'reader', filename="example_schema.json")
1111
a.load_module('plugin', 'Schema', 'reader', filename="yaml1_schema.json")
12+
# a.load_module('plugin', 'Schema', 'reader', filename="yaml1_circular_schema.json")
1213

1314
a.load_module('plugin', 'YAML1', 'reader', filenames=["student_test1.yml", "student_test2.yml"])
1415
# a.load_module('plugin', 'TOML1', 'reader', filenames=["results.toml", "results1.toml"], target_table_prefix = "results")

examples/test/dsi_example.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
# test.list_writers()
1111

1212
''' Example uses of loading DSI readers '''
13-
# test.schema(filename="yaml1_schema.json") # must be loaded first
13+
# test.schema(filename="yaml1_circular_schema.json") # must be loaded first
14+
test.schema(filename="yaml1_schema.json") # must be loaded first
1415
# test.schema(filename="example_schema.json") # must be loaded first
1516

1617
test.read(filenames=["student_test1.yml", "student_test2.yml"], reader_name='YAML1')

0 commit comments

Comments
 (0)