Commit e148f05

sqlite3 copy uses inefficient method when sqlite3 cli not available
1 parent e4c3ff2 commit e148f05

File tree

4 files changed: +75 -44 lines changed

parsons/databases/sqlite/sqlite.py
parsons/etl/etl.py
test/test_databases/test_dbsync.py
test/test_databases/test_sqlite.py

parsons/databases/sqlite/sqlite.py

Lines changed: 55 additions & 12 deletions
@@ -1,18 +1,20 @@
+import datetime
+import logging
+import pickle
+import shutil
 import sqlite3
-from pathlib import Path
 import subprocess
-import datetime
-from typing import Optional, Literal, Union
 from collections.abc import Iterator
-from parsons.utilities import files
-from parsons.etl.table import Table
-import pickle
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Literal, Optional, Union
+
 import petl
+
 from parsons.databases.database_connector import DatabaseConnector
 from parsons.databases.table import BaseTable
-from contextlib import contextmanager
-
-import logging
+from parsons.etl.table import Table
+from parsons.utilities import files
 
 # Max number of rows that we query at a time, so we can avoid loading huge
 # data sets into memory.
@@ -183,6 +185,7 @@ def copy(
         table_name: str,
         if_exists: str = "fail",
         strict_length: bool = False,
+        force_python_sdk: bool = False,
     ):
         """
         Copy a :ref:`parsons-table` to Sqlite.
@@ -200,6 +203,8 @@ def copy(
                 the created table's column sizes will be sized to exactly fit the current data,
                 or if their size will be rounded up to account for future values being larger
                 then the current dataset. Defaults to ``False``.
+            force_python_sdk: bool
+                Use the python SDK to import data to sqlite3, even if the sqlite3 cli utility is available for more efficient loading. Defaults to False.
        """
 
        with self.connection() as connection:
@@ -211,13 +216,51 @@ def copy(
             self.query_with_connection(sql, connection, commit=False, return_values=False)
             logger.info(f"{table_name} created.")
 
-        csv_file_path = tbl.to_csv()
-
-        self._cli_command(f'".import --csv --skip 1 {csv_file_path} {table_name}"')
+        # Use the sqlite3 command line for csv import if possible, as it is much more efficient
+        if shutil.which("sqlite3") and not force_python_sdk:
+            csv_file_path = tbl.to_csv()
+            self._cli_command(f'".import --csv --skip 1 {csv_file_path} {table_name}"')
+        else:
+            self.import_table_iteratively(tbl, table_name, if_exists)
 
         logger.info(f"{len(tbl)} rows copied to {table_name}.")
 
+    def import_table_iteratively(
+        self, tbl: Table, table_name: str, if_exists: str, chunksize=10000
+    ) -> None:
+        """Import a CSV row by row using the python sqlite3 API.
+
+        Iterates over chunks of length `chunksize`
+
+        It is generally more efficient to use the sqlite3 CLI to
+        import a CSV, but not all machines have the shell utility
+        available, so we can fall back to this method.
+        """
+        chunked_tbls = tbl.chunk(chunksize)
+        insert_sql = "INSERT INTO {} ({}) VALUES ({});".format(
+            table_name,
+            ", ".join(tbl.columns),
+            ", ".join(["?" for _ in tbl.columns]),
+        )
+        with self.connection() as connection:
+            with self.cursor(connection) as cursor:
+                for chunked_tbl in chunked_tbls:
+                    cursor.executemany(
+                        insert_sql,
+                        tuple([tuple(row.values()) for row in chunked_tbl]),
+                    )
+
     def _cli_command(self, command: str) -> None:
+        """Use the sqlite3 command line utility to run a command.
+
+        Certain commands are only possible via the shell utility and
+        not via the python API, such as the CSV import command.
+
+        sqlite3 comes as part of the python stdlib, but the shell
+        utility is not available by default on all systems. Windows
+        machines in particular generally don't have the sqlite3
+        utility unless it is explicitly installed.
+        """
         db_path = Path(self.db_path).resolve()
         full_command = f"sqlite3 {db_path} {command}"
         resp = subprocess.run(
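
For readers without the repo handy, the fallback pattern this commit adds reduces to two stdlib pieces: shutil.which() to detect whether the sqlite3 shell utility is on PATH, and executemany() to insert each chunk of rows in a single call. Below is a minimal, self-contained sketch of that pattern using only the standard library; the database path, table name, sample rows, and chunk size are illustrative and not taken from the commit.

import shutil
import sqlite3

# Sample data, columns, and paths below are hypothetical, not from the commit.
rows = [{"pk": i, "value": i * 2} for i in range(25)]
columns = list(rows[0].keys())
chunksize = 10  # kept small for the demo; the commit's default is 10000

connection = sqlite3.connect("example.db")  # hypothetical database path
connection.execute("CREATE TABLE IF NOT EXISTS tbl1 (pk INTEGER, value INTEGER)")

if shutil.which("sqlite3"):
    # The efficient path: parsons shells out via _cli_command, roughly
    #   sqlite3 example.db ".import --csv --skip 1 data.csv tbl1"
    print("sqlite3 CLI found on PATH; the .import path would be preferred")

# The fallback path: a parameterized INSERT with one "?" per column
insert_sql = "INSERT INTO tbl1 ({}) VALUES ({})".format(
    ", ".join(columns), ", ".join("?" for _ in columns)
)

# executemany() takes a whole chunk of row tuples per call, which bounds
# memory use while avoiding one round of Python overhead per row
for start in range(0, len(rows), chunksize):
    chunk = rows[start : start + chunksize]
    connection.executemany(insert_sql, [tuple(r.values()) for r in chunk])
connection.commit()

print(connection.execute("SELECT COUNT(*) FROM tbl1").fetchone()[0])  # 25
connection.close()

The same trade-off drives import_table_iteratively above: chunks of 10000 rows are large enough to amortize per-call overhead but small enough to avoid loading an entire table into memory, consistent with the file's existing cap on rows queried at a time.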

parsons/etl/etl.py

Lines changed: 1 addition & 1 deletion
@@ -896,7 +896,7 @@ def concat(self, *tables, missing=None):
 
         self.table = petl.cat(self.table, *petl_tables, missing=missing)
 
-    def chunk(self, rows):
+    def chunk(self, rows: int):
         """
         Divides a Parsons table into smaller tables of a specified row count. If the table
         cannot be divided evenly, then the final table will only include the remainder.
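
The only change here is the type hint, but since the new sqlite3 fallback leans on chunk(), a short usage sketch may help. It assumes only the behavior documented above (tables of the requested row count, with the remainder in the final table) and the iteration pattern the sqlite.py change itself uses; the sample data is made up.

from parsons import Table

tbl = Table([{"pk": i, "value": i * 2} for i in range(25)])

# Per the docstring: 25 rows chunked by 10 should yield 10, 10, and 5
for chunked_tbl in tbl.chunk(10):
    print(len(chunked_tbl))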

test/test_databases/test_dbsync.py

Lines changed: 10 additions & 28 deletions
@@ -1,14 +1,12 @@
 import os
-import unittest
-from parsons import Postgres, DBSync, Table, Redshift
-from parsons.databases.database_connector import DatabaseConnector
 import tempfile
-from parsons.databases.sqlite import Sqlite
+import unittest
 from abc import ABC
 from typing import Optional, Type
 
 from parsons import DBSync, Postgres, Redshift, Table
 from parsons.databases.database_connector import DatabaseConnector
+from parsons.databases.sqlite import Sqlite
 from test.test_databases.fakes import FakeDatabase
 from test.utils import assert_matching_tables
 
@@ -46,9 +44,7 @@ def setUp(self):
             f"{self.temp_schema}.source_table" if self.temp_schema else "source_table"
         )
         self.destination_table = (
-            f"{self.temp_schema}.destination_table"
-            if self.temp_schema
-            else "destination_table"
+            f"{self.temp_schema}.destination_table" if self.temp_schema else "destination_table"
         )
 
         # Create source table
@@ -71,9 +67,7 @@ def tearDown(self):
 
     def assert_matching_tables(self) -> None:
         source = self.source_db.query(f"SELECT * FROM {self.source_table}")
-        destination = self.destination_db.query(
-            f"SELECT * FROM {self.destination_table}"
-        )
+        destination = self.destination_db.query(f"SELECT * FROM {self.destination_table}")
         assert_matching_tables(source, destination)
 
     def table_sync_full(self, if_exists: str, **kwargs):
@@ -109,19 +103,15 @@ def test_table_sync_full_empty_table(self):
     def test_table_sync_full_chunk(self):
         # Test chunking in full sync.
         self.db_sync.chunk_size = 10
-        self.db_sync.table_sync_full(
-            self.source_table, self.destination_table, if_exists="drop"
-        )
+        self.db_sync.table_sync_full(self.source_table, self.destination_table, if_exists="drop")
         self.assert_matching_tables()
 
     def test_table_sync_incremental(self):
         # Test that incremental sync
 
         self.destination_db.copy(self.table1, self.destination_table)
         self.source_db.copy(self.table2, self.source_table, if_exists="append")
-        self.db_sync.table_sync_incremental(
-            self.source_table, self.destination_table, "pk"
-        )
+        self.db_sync.table_sync_incremental(self.source_table, self.destination_table, "pk")
         self.assert_matching_tables()
 
     def test_table_sync_incremental_chunk(self):
@@ -130,17 +120,13 @@ def test_table_sync_incremental_chunk(self):
         self.db_sync.chunk_size = 10
         self.destination_db.copy(self.table1, self.destination_table)
         self.source_db.copy(self.table2, self.source_table, if_exists="append")
-        self.db_sync.table_sync_incremental(
-            self.source_table, self.destination_table, "pk"
-        )
+        self.db_sync.table_sync_incremental(self.source_table, self.destination_table, "pk")
 
         self.assert_matching_tables()
 
     def test_table_sync_incremental_create_destination_table(self):
         # Test that an incremental sync works if the destination table does not exist.
-        self.db_sync.table_sync_incremental(
-            self.source_table, self.destination_table, "pk"
-        )
+        self.db_sync.table_sync_incremental(self.source_table, self.destination_table, "pk")
         self.assert_matching_tables()
 
     def test_table_sync_incremental_empty_table(self):
@@ -210,9 +196,7 @@ def initialize_db_connections(self) -> None:
 
 # These tests interact directly with the Postgres database. In order to run, set the
 # env to LIVE_TEST='TRUE'.
-@unittest.skipIf(
-    not os.environ.get("LIVE_TEST"), "Skipping because not running live test"
-)
+@unittest.skipIf(not os.environ.get("LIVE_TEST"), "Skipping because not running live test")
 class TestPostgresDBSync(TestDBSync):
     db = Postgres
     setup_sql = f"""
@@ -226,8 +210,6 @@ class TestPostgresDBSync(TestDBSync):
 
 # These tests interact directly with the Postgres database. In order to run, set the
 # env to LIVE_TEST='TRUE'.
-@unittest.skipIf(
-    not os.environ.get("LIVE_TEST"), "Skipping because not running live test"
-)
+@unittest.skipIf(not os.environ.get("LIVE_TEST"), "Skipping because not running live test")
 class TestRedshiftDBSync(TestPostgresDBSync):
     db = Redshift

test/test_databases/test_sqlite.py

Lines changed: 9 additions & 3 deletions
@@ -1,11 +1,12 @@
+import tempfile
+import unittest
+
 from parsons import Table
 from parsons.databases.sqlite import Sqlite
 from test.utils import assert_matching_tables
-import unittest
-import tempfile
 
 
-class TestSqliteCreateStatement(unittest.TestCase):
+class TestSqlite(unittest.TestCase):
     def setUp(self):
         temp_db = tempfile.mkstemp(suffix=".db")[1]
         self.sqlite = Sqlite(temp_db)
@@ -17,6 +18,11 @@ def test_copy(self) -> None:
         tbl1_read = self.sqlite.query("select * from tbl1")
         assert_matching_tables(self.tbl, tbl1_read)
 
+    def test_copy_no_cli(self) -> None:
+        self.sqlite.copy(self.tbl, "tbl1", if_exists="drop", force_python_sdk=True)
+        tbl1_read = self.sqlite.query("select * from tbl1")
+        assert_matching_tables(self.tbl, tbl1_read)
+
     def test_copy_append(self) -> None:
         self.sqlite.copy(self.tbl, "tbl1", if_exists="drop")
         self.sqlite.copy(self.tbl, "tbl1", if_exists="append")
