|
6 | 6 | from typing import Iterable, Optional
|
7 | 7 |
|
8 | 8 | from django.db import models
|
9 |
| -from django.db.models import Q, QuerySet |
| 9 | +from django.db.models import Max, Min, Q, QuerySet |
10 | 10 | from opensearchpy.helpers import bulk, parallel_bulk
|
11 | 11 | from opensearchpy.helpers.document import Document as DSLDocument
|
12 | 12 |
|
@@ -101,29 +101,38 @@ def get_indexing_queryset(
|
101 | 101 | """Divide the queryset into chunks."""
|
102 | 102 | chunk_size = self.django.queryset_pagination
|
103 | 103 | qs = self.get_queryset(db_alias=db_alias, filter_=filter_, exclude=exclude, count=count)
|
104 |
| - qs = qs.order_by("pk") if not qs.query.is_sliced else qs |
105 | 104 | count = qs.count()
|
106 | 105 | model = self.django.model.__name__
|
107 | 106 | action = action.present_participle.title()
|
108 | 107 |
|
109 |
| - i = 0 |
| 108 | + if self.django.order_indexing_queryset and not qs.query.is_ordered: |
| 109 | + qs = qs.order_by("pk") |
| 110 | + |
| 111 | + # To avoid loading large querysets into memory or |
| 112 | + # materializing them in temporary tables in the database, |
| 113 | + # the queryset can be divided into batches of chunk_size rows. |
| 114 | + result = qs.aggregate(min_pk=Min("pk"), max_pk=Max("pk")) |
| 115 | + min_value = result["min_pk"] |
| 116 | + max_value = result["max_pk"] + 1 |
| 117 | + |
110 | 118 | done = 0
|
| 119 | + current_batch = 0 |
| 120 | + total_batches = (max_value - min_value + chunk_size - 1) // chunk_size |
111 | 121 | start = time.time()
|
112 | 122 | if verbose:
|
113 | 123 | stdout.write(f"{action} {model}: 0% ({self._eta(start, done, count)})\r")
|
114 |
| - while done < count: |
115 |
| - if verbose: |
116 |
| - stdout.write(f"{action} {model}: {round(i / count * 100)}% ({self._eta(start, done, count)})\r") |
117 | 124 |
|
118 |
| - for obj in qs[i : i + chunk_size]: |
| 125 | + for pk_offset in range(min_value, max_value, chunk_size): |
| 126 | + current_batch += 1 |
| 127 | + max_pk = min(pk_offset + self.django.queryset_pagination, max_value) |
| 128 | + batch_qs = qs.filter(pk__gte=pk_offset, pk__lt=max_pk) |
| 129 | + stdout.write(f"Processing batch {current_batch}/{total_batches} with pk from {pk_offset} to {max_pk - 1}\n") |
| 130 | + for obj in batch_qs: |
119 | 131 | done += 1
|
| 132 | + if done % chunk_size == 0: |
| 133 | + stdout.write(f"{action} {model}: {round(done / count * 100)}% ({self._eta(start, done, count)})\r") |
120 | 134 | yield obj
|
121 | 135 |
|
122 |
| - i = min(i + chunk_size, count) |
123 |
| - |
124 |
| - if verbose: |
125 |
| - stdout.write(f"{action} {count} {model}: OK \n") |
126 |
| - |
127 | 136 | def init_prepare(self):
|
128 | 137 | """Initialise the data model preparers once here.
|
129 | 138 |
|
|
0 commit comments