Skip to content

Commit 9d1383f

Browse files
abdellatif-codocqcoumes
authored and committed
feat: Automatically divide queryset in batches
1 parent 4f04a28 commit 9d1383f

File tree

3 files changed

+30
-12
lines changed

3 files changed

+30
-12
lines changed

django_opensearch_dsl/documents.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from typing import Iterable, Optional
77

88
from django.db import models
9-
from django.db.models import Q, QuerySet
9+
from django.db.models import Max, Min, Q, QuerySet
1010
from opensearchpy.helpers import bulk, parallel_bulk
1111
from opensearchpy.helpers.document import Document as DSLDocument
1212

@@ -101,29 +101,38 @@ def get_indexing_queryset(
101101
"""Divide the queryset into chunks."""
102102
chunk_size = self.django.queryset_pagination
103103
qs = self.get_queryset(db_alias=db_alias, filter_=filter_, exclude=exclude, count=count)
104-
qs = qs.order_by("pk") if not qs.query.is_sliced else qs
105104
count = qs.count()
106105
model = self.django.model.__name__
107106
action = action.present_participle.title()
108107

109-
i = 0
108+
if self.django.order_indexing_queryset and not qs.query.is_ordered:
109+
qs = qs.order_by("pk")
110+
111+
# In order to avoid loading big querysets into memory or
112+
# loading them in temporary tables in the database,
113+
# we have the possibility to divide the queryset using batch_size.
114+
result = qs.aggregate(min_pk=Min("pk"), max_pk=Max("pk"))
115+
min_value = result["min_pk"]
116+
max_value = result["max_pk"] + 1
117+
110118
done = 0
119+
current_batch = 0
120+
total_batches = (max_value - min_value + chunk_size - 1) // chunk_size
111121
start = time.time()
112122
if verbose:
113123
stdout.write(f"{action} {model}: 0% ({self._eta(start, done, count)})\r")
114-
while done < count:
115-
if verbose:
116-
stdout.write(f"{action} {model}: {round(i / count * 100)}% ({self._eta(start, done, count)})\r")
117124

118-
for obj in qs[i : i + chunk_size]:
125+
for pk_offset in range(min_value, max_value, chunk_size):
126+
current_batch += 1
127+
max_pk = min(pk_offset + self.django.queryset_pagination, max_value)
128+
batch_qs = qs.filter(pk__gte=pk_offset, pk__lt=max_pk)
129+
stdout.write(f"Processing batch {current_batch}/{total_batches} with pk from {pk_offset} to {max_pk - 1}\n")
130+
for obj in batch_qs:
119131
done += 1
132+
if done % chunk_size == 0:
133+
stdout.write(f"{action} {model}: {round(done / count * 100)}% ({self._eta(start, done, count)})\r")
120134
yield obj
121135

122-
i = min(i + chunk_size, count)
123-
124-
if verbose:
125-
stdout.write(f"{action} {count} {model}: OK \n")
126-
127136
def init_prepare(self):
128137
"""Initialise the data model preparers once here.
129138

django_opensearch_dsl/management/commands/opensearch.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def _manage_document(
154154
refresh,
155155
missing,
156156
database,
157+
batch_size,
157158
**options,
158159
): # noqa
159160
"""Manage the creation and deletion of indices."""
@@ -437,6 +438,13 @@ def add_arguments(self, parser):
437438
default=False,
438439
help="When used with 'index' action, only index documents not indexed yet.",
439440
)
441+
subparser.add_argument(
442+
"-b",
443+
"--batch-size",
444+
type=int,
445+
default=None,
446+
help="Specify the batch size for processing documents.",
447+
)
440448

441449
self.usage = parser.format_usage()
442450

django_opensearch_dsl/registries.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def register_document(self, document):
5050
"ignore_signals": getattr(django_meta, "ignore_signals", False),
5151
"auto_refresh": getattr(django_meta, "auto_refresh", DODConfig.auto_refresh_enabled()),
5252
"related_models": getattr(django_meta, "related_models", []),
53+
"order_indexing_queryset": getattr(django_meta, "order_indexing_queryset", True),
5354
}
5455
)
5556
if not django_attr.model: # pragma: no cover

0 commit comments

Comments (0)