
Commit 4f0c1aa

Merge pull request #51 from piercefreeman/feature/optimize-insert

Optimize insertion speed

2 parents c6ba7f4 + 188c4c7

2 files changed: +135 −25 lines

Lines changed: 45 additions & 0 deletions

```diff
@@ -0,0 +1,45 @@
+import time
+from typing import Sequence
+
+import pytest
+
+from iceaxe.__tests__.conf_models import UserDemo
+from iceaxe.logging import CONSOLE, LOGGER
+from iceaxe.session import DBConnection
+
+
+def generate_test_users(count: int) -> Sequence[UserDemo]:
+    """
+    Generate a sequence of test users for bulk insertion.
+
+    :param count: Number of users to generate
+    :return: Sequence of UserDemo instances
+    """
+    return [
+        UserDemo(name=f"User {i}", email=f"user{i}@example.com") for i in range(count)
+    ]
+
+
+@pytest.mark.asyncio
+@pytest.mark.integration_tests
+async def test_bulk_insert_performance(db_connection: DBConnection):
+    """
+    Test the performance of bulk inserting 500k records.
+    """
+    NUM_USERS = 500_000
+    users = generate_test_users(NUM_USERS)
+    LOGGER.info(f"Generated {NUM_USERS} test users")
+
+    start_time = time.time()
+
+    await db_connection.insert(users)
+
+    total_time = time.time() - start_time
+    records_per_second = NUM_USERS / total_time
+
+    CONSOLE.print("\nBulk Insert Performance:")
+    CONSOLE.print(f"Total time: {total_time:.2f} seconds")
+    CONSOLE.print(f"Records per second: {records_per_second:.2f}")
+
+    result = await db_connection.conn.fetchval("SELECT COUNT(*) FROM userdemo")
+    assert result == NUM_USERS
```
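For context, the `UserDemo` fixture imported from `iceaxe.__tests__.conf_models` is roughly the following shape. This is a sketch inferred from how the test uses the model (a `name`/`email` table named `userdemo` with an autoincrementing primary key), not the authoritative definition, and it assumes `TableBase` and `Field` are exported from the package root:

```python
# Hypothetical sketch of the UserDemo test model; the real definition lives
# in iceaxe/__tests__/conf_models.py and may differ in detail.
from iceaxe import Field, TableBase


class UserDemo(TableBase):
    id: int | None = Field(primary_key=True, default=None)  # autoincrement; skipped on insert
    name: str
    email: str
```

With a Postgres-backed `db_connection` fixture available, `pytest -m integration_tests -k test_bulk_insert_performance` should run just this benchmark.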

iceaxe/session.py

Lines changed: 90 additions & 25 deletions

```diff
@@ -1,6 +1,7 @@
 from collections import defaultdict
 from contextlib import asynccontextmanager
 from json import loads as json_loads
+from math import ceil
 from typing import (
     Any,
     Literal,
@@ -33,6 +34,9 @@
 
 TableType = TypeVar("TableType", bound=TableBase)
 
+# PostgreSQL has a limit of 65535 parameters per query
+PG_MAX_PARAMETERS = 65535
+
 
 class DBConnection:
     """
@@ -235,37 +239,98 @@ async def insert(self, objects: Sequence[TableBase]):
         if not objects:
             return
 
-        for model, model_objects in self._aggregate_models_by_table(objects):
-            table_name = QueryIdentifier(model.get_table_name())
-            fields = {
-                field: info
-                for field, info in model.model_fields.items()
-                if (not info.exclude and not info.autoincrement)
-            }
-            field_string = ", ".join(f'"{field}"' for field in fields)
-            primary_key = self._get_primary_key(model)
-
-            placeholders = ", ".join(f"${i}" for i in range(1, len(fields) + 1))
-            query = f"INSERT INTO {table_name} ({field_string}) VALUES ({placeholders})"
-            if primary_key:
-                query += f" RETURNING {primary_key}"
+        # Reuse a single transaction for all inserts
+        async with self._ensure_transaction():
+            for model, model_objects in self._aggregate_models_by_table(objects):
+                # For each table, build batched insert queries
+                table_name = QueryIdentifier(model.get_table_name())
+                fields = {
+                    field: info
+                    for field, info in model.model_fields.items()
+                    if (not info.exclude and not info.autoincrement)
+                }
+                primary_key = self._get_primary_key(model)
+                field_names = list(
+                    fields.keys()
+                )  # Iterate over these in order for each row
+                field_identifiers = ", ".join(f'"{f}"' for f in field_names)
+
+                # Calculate max batch size based on number of fields
+                # Each row uses len(fields) parameters, so max_batch_size * len(fields) <= PG_MAX_PARAMETERS
+                max_batch_size = PG_MAX_PARAMETERS // len(fields)
+                # Cap at 5000 rows per batch to avoid excessive memory usage
+                max_batch_size = min(max_batch_size, 5000)
+
+                total = len(model_objects)
+                num_batches = ceil(total / max_batch_size)
+
+                for batch_idx in range(num_batches):
+                    start_idx = batch_idx * max_batch_size
+                    end_idx = (batch_idx + 1) * max_batch_size
+                    batch_objects = model_objects[start_idx:end_idx]
+
+                    # Build the multi-row VALUES clause
+                    # e.g. for 3 rows with 2 columns, we'd want:
+                    # VALUES ($1, $2), ($3, $4), ($5, $6)
+                    num_rows = len(batch_objects)
+                    if not num_rows:
+                        continue
 
-            async with self._ensure_transaction():
-                for obj in model_objects:
-                    obj_values = obj.model_dump()
-                    values = [
-                        info.to_db_value(obj_values[field])
-                        for field, info in fields.items()
-                    ]
-                    result = await self.conn.fetchrow(query, *values)
+                    # placeholders per row: ($1, $2, ...)
+                    # but we have to shift the placeholder index for each row
+                    placeholders: list[str] = []
+                    values: list[Any] = []
+                    param_index = 1
+
+                    for obj in batch_objects:
+                        obj_values = obj.model_dump()
+                        row_values = []
+                        for field in field_names:
+                            info = fields[field]
+                            row_values.append(info.to_db_value(obj_values[field]))
+                        values.extend(row_values)
+                        row_placeholder = (
+                            "("
+                            + ", ".join(
+                                f"${p}"
+                                for p in range(
+                                    param_index, param_index + len(field_names)
+                                )
+                            )
+                            + ")"
+                        )
+                        placeholders.append(row_placeholder)
+                        param_index += len(field_names)
+
+                    placeholders_clause = ", ".join(placeholders)
+
+                    query = f"""
+                        INSERT INTO {table_name} ({field_identifiers})
+                        VALUES {placeholders_clause}
+                    """
+                    if primary_key:
+                        query += f" RETURNING {primary_key}"
+
+                    # Insert them in one go
+                    if primary_key:
+                        rows = await self.conn.fetch(query, *values)
+                        # 'rows' should be a list of Record objects, one per inserted row
+                        # Update each object in the same order
+                        for obj, row in zip(batch_objects, rows):
+                            setattr(obj, primary_key, row[primary_key])
+                    else:
+                        # No need to fetch anything if there's no primary key
+                        await self.conn.execute(query, *values)
 
-                    if primary_key and result:
-                        setattr(obj, primary_key, result[primary_key])
-                    obj.clear_modified_attributes()
+                    # Mark as unmodified
+                    for obj in batch_objects:
+                        obj.clear_modified_attributes()
 
+        # Register modification callbacks outside the main insert loop
         for obj in objects:
             obj.register_modified_callback(self.modification_tracker.track_modification)
 
+        # Clear modification status
         self.modification_tracker.clear_status(objects)
 
     @overload
```
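As a worked example of the sizing logic above, applied to the 500k-row test: `UserDemo` contributes two insertable columns (`name` and `email`; the autoincrementing `id` is excluded by the `fields` filter). A minimal sketch of the arithmetic, reusing the constants from `insert()`:

```python
from math import ceil

PG_MAX_PARAMETERS = 65535  # PostgreSQL's per-query bind-parameter ceiling
NUM_USERS = 500_000
num_fields = 2  # name, email

# Same sizing rule as insert(): stay under the parameter cap, then cap at 5000 rows
max_batch_size = min(PG_MAX_PARAMETERS // num_fields, 5000)  # 32767 -> capped to 5000
num_batches = ceil(NUM_USERS / max_batch_size)  # 100 batches

# Each batch binds 5000 rows * 2 columns = 10,000 parameters in one INSERT,
# versus 500,000 single-row statements before this change.
print(max_batch_size, num_batches)  # 5000 100
```

The speedup comes from amortizing per-statement and network round-trip overhead across thousands of rows at a time, all inside a single transaction.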
