@@ -1128,13 +1128,31 @@ def _process_search(
1128
1128
stream_slice : Mapping [str , Any ] = None ,
1129
1129
stream_state : Mapping [str , Any ] = None ,
1130
1130
next_page_token : Mapping [str , Any ] = None ,
1131
+ last_id = None ,
1131
1132
) -> Tuple [List , requests .Response ]:
1132
1133
stream_records = {}
1133
1134
properties_list = list (self .properties .keys ())
1135
+ if last_id == None :
1136
+ last_id = 0
1137
+ # The search query below uses the following criteria:
1138
+ # - Last modified >= timestamp of previous sync
1139
+ # - Last modified <= timestamp of current sync to avoid open ended queries
1140
+ # - Object primary key >= last_id with initial value 0, then max(last_id) returned from previous pagination loop
1141
+ # - Sort results by primary key ASC
1142
+ # Note: Although results return out of chronological order, sorting on primary key ensures retrieval of *all* records
1143
+ # once the final pagination loop completes. This is preferable to sorting by a non-unique value, such as
1144
+ # last modified date, which may result in an infinite loop in some edge cases.
1145
+ key = self .primary_key
1146
+ if key == "id" :
1147
+ key = "hs_object_id"
1134
1148
payload = (
1135
1149
{
1136
- "filters" : [{"value" : int (self ._state .timestamp () * 1000 ), "propertyName" : self .last_modified_field , "operator" : "GTE" }],
1137
- "sorts" : [{"propertyName" : self .last_modified_field , "direction" : "ASCENDING" }],
1150
+ "filters" : [
1151
+ {"value" : int (self ._state .timestamp () * 1000 ), "propertyName" : self .last_modified_field , "operator" : "GTE" },
1152
+ {"value" : int (self ._init_sync .timestamp () * 1000 ), "propertyName" : self .last_modified_field , "operator" : "LTE" },
1153
+ {"value" : last_id , "propertyName" : key , "operator" : "GTE" },
1154
+ ],
1155
+ "sorts" : [{"propertyName" : key , "direction" : "ASCENDING" }],
1138
1156
"properties" : properties_list ,
1139
1157
"limit" : 100 ,
1140
1158
}
@@ -1168,6 +1186,16 @@ def _read_associations(self, records: Iterable) -> Iterable[Mapping[str, Any]]:
1168
1186
current_record [_slice ] = associations_list
1169
1187
return records_by_pk .values ()
1170
1188
1189
def get_max(self, val1, val2):
    """Return the larger of two primary-key values.

    The values are compared numerically when both can be converted to
    ``int``; otherwise they are compared as strings.

    Args:
        val1: First candidate value (int, numeric string, other string, or None).
        val2: Second candidate value (same kinds as ``val1``).

    Returns:
        The other operand, unchanged, when one of the values is ``None``
        (``None`` when both are). Otherwise an ``int`` when both values are
        integer-like, else the lexicographically larger ``str``.
    """
    # A missing value never wins: without this guard int(None) raises
    # TypeError, and str(None) == "None" would sort above any digit string.
    if val1 is None:
        return val2
    if val2 is None:
        return val1

    try:
        # Numeric comparison, so that e.g. "9" < "12".
        return max(int(val1), int(val2))
    except (TypeError, ValueError):
        # Non-numeric ids: fall back to string comparison.
        return max(str(val1), str(val2))
1198
+
1171
1199
def read_records (
1172
1200
self ,
1173
1201
sync_mode : SyncMode ,
@@ -1178,14 +1206,13 @@ def read_records(
1178
1206
stream_state = stream_state or {}
1179
1207
pagination_complete = False
1180
1208
next_page_token = None
1209
+ last_id = None
1210
+ max_last_id = None
1181
1211
1182
- latest_cursor = None
1183
1212
while not pagination_complete :
1184
1213
if self .state :
1185
1214
records , raw_response = self ._process_search (
1186
- next_page_token = next_page_token ,
1187
- stream_state = stream_state ,
1188
- stream_slice = stream_slice ,
1215
+ next_page_token = next_page_token , stream_state = stream_state , stream_slice = stream_slice , last_id = max_last_id
1189
1216
)
1190
1217
if self .associations :
1191
1218
records = self ._read_associations (records )
@@ -1200,8 +1227,7 @@ def read_records(
1200
1227
records = self .record_unnester .unnest (records )
1201
1228
1202
1229
for record in records :
1203
- cursor = self ._field_to_datetime (record [self .updated_at_field ])
1204
- latest_cursor = max (cursor , latest_cursor ) if latest_cursor else cursor
1230
+ last_id = self .get_max (record [self .primary_key ], last_id ) if last_id else record [self .primary_key ]
1205
1231
yield record
1206
1232
1207
1233
next_page_token = self .next_page_token (raw_response )
@@ -1211,13 +1237,13 @@ def read_records(
1211
1237
# Hubspot documentation states that the search endpoints are limited to 10,000 total results
1212
1238
# for any given query. Attempting to page beyond 10,000 will result in a 400 error.
1213
1239
# https://developers.hubspot.com/docs/api/crm/search. We stop getting data at 10,000 and
1214
- # start a new search query with the latest state that has been collected.
1215
- self ._update_state ( latest_cursor = latest_cursor )
1240
+ # start a new search query with the latest id that has been collected.
1241
+ max_last_id = self .get_max ( max_last_id , last_id ) if max_last_id else last_id
1216
1242
next_page_token = None
1217
1243
1218
1244
# Since Search stream does not have slices is safe to save the latest
1219
1245
# state as the initial sync date
1220
- self ._update_state (latest_cursor = latest_cursor , is_last_record = True )
1246
+ self ._update_state (latest_cursor = self . _init_sync , is_last_record = True )
1221
1247
# Always return an empty generator just in case no records were ever yielded
1222
1248
yield from []
1223
1249
0 commit comments