Skip to content

Commit dc2d462

Browse files
tw4l authored and ikreymer committed
add auto pause expiry support:
- stop crawls that have been paused for too long - add 'paused_crawl_limit_minutes' to Helm chart - add paused time and expiry to crawlconfig API response - set to 'stopped_pause_expired' state - ui: add support for 'Stopped: Paused Too Long' for stopped_pause_expired - use 'paused_at' in CrawlJob to indicate crawl is paused and when
1 parent 8e38648 commit dc2d462

File tree

11 files changed

+94
-22
lines changed

11 files changed

+94
-22
lines changed

backend/btrixcloud/crawlconfigs.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import re
1212
import os
1313
import traceback
14-
from datetime import datetime
14+
from datetime import datetime, timedelta
1515
from uuid import UUID, uuid4
1616
import urllib.parse
1717

@@ -89,6 +89,8 @@ class CrawlConfigOps:
8989
crawler_images_map: dict[str, str]
9090
crawler_image_pull_policy_map: dict[str, str]
9191

92+
paused_expiry_delta: timedelta
93+
9294
def __init__(
9395
self,
9496
dbclient,
@@ -115,6 +117,10 @@ def __init__(
115117
"DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
116118
)
117119

120+
self.paused_expiry_delta = timedelta(
121+
minutes=int(os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", "10080"))
122+
)
123+
118124
self.router = APIRouter(
119125
prefix="/crawlconfigs",
120126
tags=["crawlconfigs"],
@@ -750,6 +756,12 @@ async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
750756
crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
751757
crawlconfig.lastCrawlStopping = crawl.stopping
752758
crawlconfig.lastCrawlPausing = crawl.pausing
759+
crawlconfig.lastCrawlPausedAt = crawl.pausedAt
760+
crawlconfig.lastCrawlPausedExpiry = None
761+
if crawl.pausedAt:
762+
crawlconfig.lastCrawlPausedExpiry = (
763+
crawl.pausedAt + self.paused_expiry_delta
764+
)
753765
crawlconfig.isCrawlRunning = True
754766

755767
async def get_crawl_config_out(self, cid: UUID, org: Organization):

backend/btrixcloud/crawlmanager.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import secrets
55

66
from typing import Optional, Dict, Tuple
7-
from datetime import timedelta
7+
from datetime import datetime, timedelta
88

99
from fastapi import HTTPException
1010

@@ -386,9 +386,13 @@ async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict:
386386

387387
return await self.delete_crawl_job(crawl_id)
388388

389-
async def pause_resume_crawl(self, crawl_id: str, pause: bool) -> dict:
389+
async def pause_resume_crawl(
390+
self, crawl_id: str, paused_at: Optional[datetime] = None
391+
) -> dict:
390392
"""pause or unpause a crawl"""
391-
return await self._patch_job(crawl_id, {"paused": int(pause)})
393+
return await self._patch_job(
394+
crawl_id, {"pausedAt": date_to_str(paused_at) if paused_at else ""}
395+
)
392396

393397
async def delete_crawl_configs_for_org(self, org: str) -> None:
394398
"""Delete all crawl configs for given org"""

backend/btrixcloud/crawls.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -778,13 +778,21 @@ async def pause_crawl(
778778
raise HTTPException(status_code=400, detail="not_a_crawl")
779779

780780
result = None
781+
782+
if pause:
783+
paused_at = dt_now()
784+
else:
785+
paused_at = None
786+
781787
try:
782-
result = await self.crawl_manager.pause_resume_crawl(crawl_id, pause=pause)
788+
result = await self.crawl_manager.pause_resume_crawl(
789+
crawl_id, paused_at=paused_at
790+
)
783791

784792
if result.get("success"):
785793
await self.crawls.find_one_and_update(
786794
{"_id": crawl_id, "type": "crawl", "oid": org.id},
787-
{"$set": {"pausing": pause}},
795+
{"$set": {"pausing": pause, "pausedAt": paused_at}},
788796
)
789797

790798
return {"success": True}

backend/btrixcloud/models.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ class UserOrgInfoOut(BaseModel):
238238
TYPE_SUCCESSFUL_STATES = Literal[
239239
"complete",
240240
"stopped_by_user",
241+
"stopped_pause_expired",
241242
"stopped_storage_quota_reached",
242243
"stopped_time_quota_reached",
243244
"stopped_org_readonly",
@@ -481,6 +482,8 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
481482

482483
lastCrawlStopping: Optional[bool] = False
483484
lastCrawlPausing: Optional[bool] = False
485+
lastCrawlPausedAt: Optional[datetime] = None
486+
lastCrawlPausedExpiry: Optional[datetime] = None
484487
profileName: Optional[str] = None
485488
firstSeed: Optional[str] = None
486489
seedCount: int = 0
@@ -867,6 +870,7 @@ class CrawlOut(BaseMongoModel):
867870
profileName: Optional[str] = None
868871
stopping: Optional[bool] = False
869872
pausing: Optional[bool] = False
873+
pausedAt: Optional[datetime] = None
870874
manual: bool = False
871875
cid_rev: Optional[int] = None
872876
scale: Scale = 1

backend/btrixcloud/operator/crawls.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import os
55
import math
66
from pprint import pprint
7-
from typing import Optional, Any, Sequence
8-
from datetime import datetime
7+
from typing import Optional, Any, Sequence, cast
8+
from datetime import datetime, timedelta
99
from uuid import UUID
1010

1111
import json
@@ -79,6 +79,7 @@
7979

8080
# pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
8181
# pylint: disable=invalid-name, too-many-lines, too-many-return-statements
82+
# pylint: disable=too-many-instance-attributes
8283
# ============================================================================
8384
class CrawlOperator(BaseOperator):
8485
"""CrawlOperator Handler"""
@@ -93,6 +94,8 @@ class CrawlOperator(BaseOperator):
9394

9495
min_avail_storage_ratio: float
9596

97+
paused_expires_delta: timedelta
98+
9699
def __init__(self, *args):
97100
super().__init__(*args)
98101

@@ -110,6 +113,13 @@ def __init__(self, *args):
110113
os.environ.get("CRAWLER_MIN_AVAIL_STORAGE_RATIO") or 0
111114
)
112115

116+
# time in minutes before paused crawl is stopped - default is 7 days
117+
paused_crawl_limit_minutes = int(
118+
os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", "10080")
119+
)
120+
121+
self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes)
122+
113123
def init_routes(self, app):
114124
"""init routes for this operator"""
115125

@@ -160,7 +170,7 @@ async def sync_crawls(self, data: MCSyncData):
160170
scale=spec.get("scale", 1),
161171
started=data.parent["metadata"]["creationTimestamp"],
162172
stopping=spec.get("stopping", False),
163-
paused=spec.get("paused", False),
173+
paused_at=str_to_date(spec.get("pausedAt")),
164174
timeout=spec.get("timeout") or 0,
165175
max_crawl_size=int(spec.get("maxCrawlSize") or 0),
166176
scheduled=spec.get("manual") != "1",
@@ -265,11 +275,25 @@ async def sync_crawls(self, data: MCSyncData):
265275
status.scale = 1
266276

267277
# stopping paused crawls
268-
if crawl.paused and crawl.stopping:
269-
status.stopReason = "stopped_by_user"
270-
status.stopping = True
271-
print(f"Paused crawl stopped by user, id: {crawl.id}")
272-
await self.mark_finished(crawl, status, "stopped_by_user")
278+
if crawl.paused_at:
279+
stop_reason: Optional[StopReason] = None
280+
state: Optional[TYPE_NON_RUNNING_STATES] = None
281+
# Check if pause expiry limit is reached and if so, stop crawl
282+
if dt_now() >= (crawl.paused_at + self.paused_expires_delta):
283+
print(f"Paused crawl expiry reached, stopping crawl, id: {crawl.id}")
284+
stop_reason = "stopped_pause_expired"
285+
state = "stopped_pause_expired"
286+
287+
# Check if paused crawl was stopped manually
288+
elif crawl.stopping:
289+
print(f"Paused crawl stopped by user, id: {crawl.id}")
290+
stop_reason = "stopped_by_user"
291+
state = "stopped_by_user"
292+
293+
if stop_reason and state:
294+
status.stopping = True
295+
status.stopReason = stop_reason
296+
await self.mark_finished(crawl, status, state)
273297

274298
children = self._load_redis(params, status, data.children)
275299

@@ -335,7 +359,9 @@ async def sync_crawls(self, data: MCSyncData):
335359

336360
for i in range(0, status.scale):
337361
children.extend(
338-
self._load_crawler(params, i, status, data.children, crawl.paused)
362+
self._load_crawler(
363+
params, i, status, data.children, bool(crawl.paused_at)
364+
)
339365
)
340366

341367
return {
@@ -858,12 +884,12 @@ async def sync_crawl_state(
858884
):
859885
# mark as waiting (if already running)
860886
await self.set_state(
861-
"waiting_capacity" if not crawl.paused else "paused",
887+
"waiting_capacity" if not crawl.paused_at else "paused",
862888
status,
863889
crawl,
864890
allowed_from=(
865891
RUNNING_AND_STARTING_ONLY
866-
if not crawl.paused
892+
if not crawl.paused_at
867893
else RUNNING_AND_WAITING_STATES
868894
),
869895
)
@@ -884,7 +910,7 @@ async def sync_crawl_state(
884910

885911
# crawler pods already shut down, remove redis pause key
886912
# for simple resume later
887-
if crawl.paused:
913+
if crawl.paused_at:
888914
await redis.delete(f"{crawl.id}:paused")
889915

890916
elif crawler_running and not redis:
@@ -896,7 +922,7 @@ async def sync_crawl_state(
896922
return status
897923

898924
# only get here if at least one crawler pod is running
899-
if crawl.paused:
925+
if crawl.paused_at:
900926
await redis.set(f"{crawl.id}:paused", "1")
901927

902928
# update lastActiveTime if crawler is running

backend/btrixcloud/operator/models.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Operator Models"""
22

33
from collections import defaultdict
4+
from datetime import datetime
45
from uuid import UUID
56
from typing import Optional, DefaultDict, Literal, Annotated, Any
67
from pydantic import BaseModel, Field
@@ -17,6 +18,7 @@
1718

1819
StopReason = Literal[
1920
"stopped_by_user",
21+
"stopped_pause_expired",
2022
"time-limit",
2123
"size-limit",
2224
"stopped_storage_quota_reached",
@@ -76,7 +78,7 @@ class CrawlSpec(BaseModel):
7678
started: str
7779
crawler_channel: str
7880
stopping: bool = False
79-
paused: bool = False
81+
paused_at: Optional[datetime] = None
8082
scheduled: bool = False
8183
timeout: int = 0
8284
max_crawl_size: int = 0

chart/app-templates/crawl_job.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,5 @@ spec:
3636

3737
proxyId: "{{ proxy_id }}"
3838

39-
paused: {{ paused or 0 }}
39+
pausedAt: "{{ pausedAt }}"
4040

chart/templates/configmap.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ data:
2020

2121
INVITE_EXPIRE_SECONDS: "{{ .Values.invite_expire_seconds }}"
2222

23+
PAUSED_CRAWL_LIMIT_MINUTES: "{{ .Values.paused_crawl_limit_minutes }}"
24+
2325
REGISTRATION_ENABLED: "{{ .Values.registration_enabled | default 0 }}"
2426

2527
REGISTER_TO_ORG_ID: "{{ .Values.registration_org_id }}"

chart/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ allow_dupe_invites: "0"
8282
# number of seconds before pending invites expire - default is 7 days
8383
invite_expire_seconds: 604800
8484

85+
# number of minutes before paused crawls are stopped - default is 7 days
86+
paused_crawl_limit_minutes: 10080
87+
8588
# base url for replayweb.page
8689
rwp_base_url: "https://cdn.jsdelivr.net/npm/[email protected]/"
8790

frontend/src/features/archived-items/crawl-status.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,16 @@ export class CrawlStatus extends TailwindElement {
241241
label = msg("Stopped");
242242
break;
243243

244+
case "stopped_pause_expired":
245+
color = "var(--warning)";
246+
icon = html`<sl-icon
247+
name="dash-square-fill"
248+
slot="prefix"
249+
style="color: ${color}"
250+
></sl-icon>`;
251+
label = msg("Stopped: Paused Too Long");
252+
break;
253+
244254
case "stopped_storage_quota_reached":
245255
color = "var(--warning)";
246256
icon = html`<sl-icon

0 commit comments

Comments (0)