
Commit 68f2ab9

feat(replay): query IP for trace connected errors for replay summary (v2) (#98479)

Redo of #97737. The old PR was reverted because its trace ID query was too slow: it made 2 Snuba queries per trace ID (bad). This PR instead makes 2 queries total for all trace IDs. MERGE WED

Co-authored-by: Andrew Liu <[email protected]>

1 parent e064033 · commit 68f2ab9
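
The performance fix is batching: instead of looping over trace IDs, the new query path filters on the whole list at once. A minimal sketch of the idea using snuba_sdk — the helper name here is illustrative, not the commit's actual code:

    from snuba_sdk import Column, Condition, Op

    # Hypothetical helper: one IN condition covers every trace ID, so the
    # number of Snuba queries no longer scales with len(trace_ids).
    def trace_filter(trace_ids: list[str]) -> Condition:
        return Condition(Column("trace_id"), Op.IN, trace_ids)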

File tree

5 files changed: +541 −157 lines

src/sentry/options/defaults.py

Lines changed: 11 additions & 0 deletions

@@ -511,6 +511,17 @@
     default=False,
     flags=FLAG_AUTOMATOR_MODIFIABLE,
 )
+# Trace sampling rates for replay summary endpoint.
+register(
+    "replay.endpoints.project_replay_summary.trace_sample_rate_post",
+    default=0.0,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)
+register(
+    "replay.endpoints.project_replay_summary.trace_sample_rate_get",
+    default=0.0,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)
 
 # User Feedback Options
 register(
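
These options are floats rather than booleans because they drive sentry_sdk's dynamic trace sampling: the endpoint (next file) passes the option value through custom_sampling_context, where a traces_sampler can read it. A minimal sketch of that wiring, assuming a standard sentry_sdk setup — this sampler is illustrative, not part of the commit:

    import sentry_sdk

    def traces_sampler(sampling_context):
        # Keys passed via start_transaction(custom_sampling_context=...) are
        # merged into sampling_context, so the option value surfaces here.
        rate = sampling_context.get("sample_rate")
        return float(rate) if rate is not None else 0.0

    sentry_sdk.init(
        dsn="...",  # placeholder DSN
        traces_sampler=traces_sampler,
    )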

src/sentry/replays/endpoints/project_replay_summary.py

Lines changed: 124 additions & 80 deletions

@@ -1,12 +1,13 @@
 import logging
 from typing import Any
 
+import sentry_sdk
 from django.conf import settings
 from drf_spectacular.utils import extend_schema
 from rest_framework.request import Request
 from rest_framework.response import Response
 
-from sentry import features
+from sentry import features, options
 from sentry.api.api_owners import ApiOwner
 from sentry.api.api_publish_status import ApiPublishStatus
 from sentry.api.base import region_silo_endpoint

@@ -24,7 +25,7 @@
 )
 from sentry.seer.seer_setup import has_seer_access
 from sentry.seer.signed_seer_api import make_signed_seer_api_request
-from sentry.utils import json
+from sentry.utils import json, metrics
 
 logger = logging.getLogger(__name__)
 

@@ -62,9 +63,15 @@ class ProjectReplaySummaryEndpoint(ProjectEndpoint):
     }
     permission_classes = (ReplaySummaryPermission,)
 
-    def __init__(self, **options) -> None:
+    def __init__(self, **kw) -> None:
         storage.initialize_client()
-        super().__init__(**options)
+        self.sample_rate_post = options.get(
+            "replay.endpoints.project_replay_summary.trace_sample_rate_post"
+        )
+        self.sample_rate_get = options.get(
+            "replay.endpoints.project_replay_summary.trace_sample_rate_get"
+        )
+        super().__init__(**kw)
 
     def make_seer_request(self, path: str, post_body: dict[str, Any]) -> Response:
         """Make a POST request to a Seer endpoint. Raises HTTPError and logs non-200 status codes."""

@@ -133,91 +140,128 @@ def has_replay_summary_access(self, project: Project, request: Request) -> bool:
 
     def get(self, request: Request, project: Project, replay_id: str) -> Response:
         """Poll for the status of a replay summary task in Seer."""
-        if not self.has_replay_summary_access(project, request):
-            return self.respond(
-                {"detail": "Replay summaries are not available for this organization."}, status=403
-            )
-
-        # We skip checking Seer permissions here for performance, and because summaries can't be created without them anyway.
-
-        # Request Seer for the state of the summary task.
-        return self.make_seer_request(
-            SEER_POLL_STATE_ENDPOINT_PATH,
-            {
-                "replay_id": replay_id,
-            },
-        )
+        with sentry_sdk.start_transaction(
+            name="replays.endpoints.project_replay_summary.get",
+            op="replays.endpoints.project_replay_summary.get",
+            custom_sampling_context={"sample_rate": self.sample_rate_get},
+        ):
+            if not self.has_replay_summary_access(project, request):
+                return self.respond(
+                    {"detail": "Replay summaries are not available for this organization."},
+                    status=403,
+                )
+
+            # We skip checking Seer permissions here for performance, and because summaries can't be created without them anyway.
+
+            # Request Seer for the state of the summary task.
+            return self.make_seer_request(
+                SEER_POLL_STATE_ENDPOINT_PATH,
+                {
+                    "replay_id": replay_id,
+                },
+            )
 
     def post(self, request: Request, project: Project, replay_id: str) -> Response:
         """Download replay segment data and parse it into logs. Then post to Seer to start a summary task."""
-        if not self.has_replay_summary_access(project, request):
-            return self.respond(
-                {"detail": "Replay summaries are not available for this organization."}, status=403
-            )
-
-        filter_params = self.get_filter_params(request, project)
-        num_segments = request.data.get("num_segments", 0)
-        temperature = request.data.get("temperature", None)
-
-        # Limit data with the frontend's segment count, to keep summaries consistent with the video displayed in the UI.
-        # While the replay is live, the FE and BE may have different counts.
-        if num_segments > MAX_SEGMENTS_TO_SUMMARIZE:
-            logger.warning(
-                "Replay Summary: hit max segment limit.",
-                extra={
-                    "replay_id": replay_id,
-                    "project_id": project.id,
-                    "organization_id": project.organization.id,
-                    "segment_limit": MAX_SEGMENTS_TO_SUMMARIZE,
-                },
-            )
-            num_segments = MAX_SEGMENTS_TO_SUMMARIZE
-
-        # Fetch the replay's error and trace IDs from the replay_id.
-        snuba_response = query_replay_instance(
-            project_id=project.id,
-            replay_id=replay_id,
-            start=filter_params["start"],
-            end=filter_params["end"],
-            organization=project.organization,
-            request_user_id=request.user.id,
-        )
-        processed_response = process_raw_response(
-            snuba_response,
-            fields=request.query_params.getlist("field"),
-        )
-        error_ids = processed_response[0].get("error_ids", []) if processed_response else []
-        trace_ids = processed_response[0].get("trace_ids", []) if processed_response else []
-
-        # Fetch error details.
-        replay_errors = fetch_error_details(project_id=project.id, error_ids=error_ids)
-        trace_connected_errors = fetch_trace_connected_errors(
-            project=project,
-            trace_ids=trace_ids,
-            start=filter_params["start"],
-            end=filter_params["end"],
-        )
-        error_events = replay_errors + trace_connected_errors
-
-        # Download segment data.
-        # XXX: For now this is capped to 100 and blocking. DD shows no replays with >25 segments, but we should still stress test and figure out how to deal with large replays.
-        segment_md = fetch_segments_metadata(project.id, replay_id, 0, num_segments)
-        segment_data = iter_segment_data(segment_md)
-
-        # Combine replay and error data and parse into logs.
-        logs = get_summary_logs(segment_data, error_events, project.id)
-
-        # Post to Seer to start a summary task.
-        # XXX: Request isn't streaming. Limitation of Seer authentication. Would be much faster if we
-        # could stream the request data since the GCS download will (likely) dominate latency.
-        return self.make_seer_request(
-            SEER_START_TASK_ENDPOINT_PATH,
-            {
-                "logs": logs,
-                "num_segments": num_segments,
-                "replay_id": replay_id,
-                "organization_id": project.organization.id,
-                "project_id": project.id,
-                "temperature": temperature,
-            },
-        )
+        with sentry_sdk.start_transaction(
+            name="replays.endpoints.project_replay_summary.post",
+            op="replays.endpoints.project_replay_summary.post",
+            custom_sampling_context={"sample_rate": self.sample_rate_post},
+        ):
+            if not self.has_replay_summary_access(project, request):
+                return self.respond(
+                    {"detail": "Replay summaries are not available for this organization."},
+                    status=403,
+                )
+
+            filter_params = self.get_filter_params(request, project)
+            num_segments = request.data.get("num_segments", 0)
+            temperature = request.data.get("temperature", None)
+
+            # Limit data with the frontend's segment count, to keep summaries consistent with the video displayed in the UI.
+            # While the replay is live, the FE and BE may have different counts.
+            if num_segments > MAX_SEGMENTS_TO_SUMMARIZE:
+                logger.warning(
+                    "Replay Summary: hit max segment limit.",
+                    extra={
+                        "replay_id": replay_id,
+                        "project_id": project.id,
+                        "organization_id": project.organization.id,
+                        "segment_limit": MAX_SEGMENTS_TO_SUMMARIZE,
+                    },
+                )
+                num_segments = MAX_SEGMENTS_TO_SUMMARIZE
+
+            # Fetch the replay's error and trace IDs from the replay_id.
+            snuba_response = query_replay_instance(
+                project_id=project.id,
+                replay_id=replay_id,
+                start=filter_params["start"],
+                end=filter_params["end"],
+                organization=project.organization,
+                request_user_id=request.user.id,
+            )
+            processed_response = process_raw_response(
+                snuba_response,
+                fields=request.query_params.getlist("field"),
+            )
+            error_ids = processed_response[0].get("error_ids", []) if processed_response else []
+            trace_ids = processed_response[0].get("trace_ids", []) if processed_response else []
+
+            # Fetch same-trace errors.
+            trace_connected_errors = fetch_trace_connected_errors(
+                project=project,
+                trace_ids=trace_ids,
+                start=filter_params["start"],
+                end=filter_params["end"],
+                limit=100,
+            )
+            trace_connected_error_ids = {x["id"] for x in trace_connected_errors}
+
+            # Fetch directly linked errors, if they weren't returned by the trace query.
+            replay_errors = fetch_error_details(
+                project_id=project.id,
+                error_ids=[x for x in error_ids if x not in trace_connected_error_ids],
+            )
+
+            error_events = replay_errors + trace_connected_errors
+
+            metrics.distribution(
+                "replays.endpoints.project_replay_summary.direct_errors",
+                value=len(replay_errors),
+            )
+            metrics.distribution(
+                "replays.endpoints.project_replay_summary.trace_connected_errors",
+                value=len(trace_connected_errors),
+            )
+            metrics.distribution(
+                "replays.endpoints.project_replay_summary.num_trace_ids",
+                value=len(trace_ids),
+            )
+
+            # Download segment data.
+            # XXX: For now this is capped to 100 and blocking. DD shows no replays with >25 segments, but we should still stress test and figure out how to deal with large replays.
+            segment_md = fetch_segments_metadata(project.id, replay_id, 0, num_segments)
+            segment_data = iter_segment_data(segment_md)
+
+            # Combine replay and error data and parse into logs.
+            logs = get_summary_logs(segment_data, error_events, project.id)
+
+            # Post to Seer to start a summary task.
+            # XXX: Request isn't streaming. Limitation of Seer authentication. Would be much faster if we
+            # could stream the request data since the GCS download will (likely) dominate latency.
+            return self.make_seer_request(
+                SEER_START_TASK_ENDPOINT_PATH,
+                {
+                    "logs": logs,
+                    "num_segments": num_segments,
+                    "replay_id": replay_id,
+                    "organization_id": project.organization.id,
+                    "project_id": project.id,
+                    "temperature": temperature,
+                },
+            )
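
The commit message's "2 queries total" lives in fetch_trace_connected_errors, whose implementation is in a changed file not shown in this excerpt. A hypothetical sketch of how it could use the new query_trace_connected_events helper (next file); the column list and the trace:[...] search syntax are assumptions:

    def fetch_trace_connected_errors_sketch(snuba_params, trace_ids, limit=100):
        # One query per dataset over the full trace ID list, instead of one
        # query per trace ID per dataset as in the reverted v1.
        if not trace_ids:
            return []
        query = f"trace:[{','.join(trace_ids)}]"
        events = []
        for dataset_label in ("errors", "issuePlatform"):
            result = query_trace_connected_events(
                dataset_label=dataset_label,
                selected_columns=["id", "title", "timestamp", "trace"],
                query=query,
                snuba_params=snuba_params,
                limit=limit,
            )
            events.extend(result["data"])
        return events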

src/sentry/replays/query.py

Lines changed: 55 additions & 1 deletion

@@ -2,7 +2,7 @@
 
 from collections.abc import Generator, Sequence
 from datetime import datetime
-from typing import Any
+from typing import Any, Literal
 
 from snuba_sdk import (
     Column,

@@ -34,6 +34,8 @@
     make_full_aggregation_query,
     query_using_optimized_search,
 )
+from sentry.search.events.types import SnubaParams
+from sentry.snuba.utils import get_dataset
 from sentry.utils.snuba import raw_snql_query
 
 MAX_PAGE_SIZE = 100

@@ -902,3 +904,55 @@ def compute_has_viewed(viewed_by_id: int | None) -> Function:
         ],
         alias="has_viewed",
     )
+
+
+def query_trace_connected_events(
+    dataset_label: Literal["errors", "issuePlatform", "discover"],
+    selected_columns: list[str],
+    query: str | None,
+    snuba_params: SnubaParams,
+    equations: list[str] | None = None,
+    orderby: list[str] | None = None,
+    offset: int = 0,
+    limit: int = 10,
+    referrer: str = "api.replay.details-page",
+) -> dict[str, Any]:
+    """
+    Query for trace-connected events, with a reusable query configuration for replays.
+
+    Args:
+        dataset_label: The Snuba dataset to query against
+        selected_columns: List of columns to select
+        query: Optional query string
+        snuba_params: Snuba parameters including project IDs, time range, etc.
+        equations: Optional list of equations
+        orderby: Optional ordering specification
+        offset: Pagination offset
+        limit: Pagination limit
+        referrer: Referrer string for tracking
+
+    Returns:
+        Query result from the dataset
+    """
+    query_details = {
+        "selected_columns": selected_columns,
+        "query": query,
+        "snuba_params": snuba_params,
+        "equations": equations,
+        "orderby": orderby,
+        "offset": offset,
+        "limit": limit,
+        "referrer": referrer,
+        "auto_fields": True,
+        "auto_aggregations": True,
+        "use_aggregate_conditions": True,
+        "allow_metric_aggregates": False,
+        "transform_alias_to_input_format": True,
+    }
+
+    dataset = get_dataset(dataset_label)
+
+    if dataset is None:
+        raise ValueError(f"Unknown dataset: {dataset_label}")
+
+    return dataset.query(**query_details)
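
A minimal, hypothetical call site for the new helper. The SnubaParams fields used here (start, end, projects, organization) are assumptions based on its typical construction; the endpoint above derives its parameters from the request's filter params:

    from datetime import datetime, timedelta, timezone

    from sentry.search.events.types import SnubaParams

    now = datetime.now(timezone.utc)
    response = query_trace_connected_events(
        dataset_label="errors",
        selected_columns=["id", "title"],
        query="trace:4b25bc58ad33428eb5cff0b7c6a8bae8",  # example trace ID
        snuba_params=SnubaParams(
            start=now - timedelta(days=7),
            end=now,
            projects=[project],  # assumed to be in scope
            organization=project.organization,
        ),
        limit=100,
    )
    rows = response["data"]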
