@@ -1,12 +1,13 @@
 import logging
 from typing import Any

+import sentry_sdk
 from django.conf import settings
 from drf_spectacular.utils import extend_schema
 from rest_framework.request import Request
 from rest_framework.response import Response

-from sentry import features
+from sentry import features, options
 from sentry.api.api_owners import ApiOwner
 from sentry.api.api_publish_status import ApiPublishStatus
 from sentry.api.base import region_silo_endpoint
@@ -24,7 +25,7 @@
 )
 from sentry.seer.seer_setup import has_seer_access
 from sentry.seer.signed_seer_api import make_signed_seer_api_request
-from sentry.utils import json
+from sentry.utils import json, metrics

 logger = logging.getLogger(__name__)

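Note: the widened imports above (sentry_sdk, options, metrics) support the sampling and telemetry changes in the hunks below. The two option keys read in __init__ need to be registered with Sentry's option store elsewhere in this PR. A minimal sketch of that registration, assuming the conventional pattern in src/sentry/options/defaults.py (the key names come from this diff; the Float type, 0.0 defaults, and flag choice are assumptions):

register(
    "replay.endpoints.project_replay_summary.trace_sample_rate_post",
    type=Float,
    default=0.0,  # assumed default: sample no traces until the option is raised
    flags=FLAG_AUTOMATOR_MODIFIABLE,
)
register(
    "replay.endpoints.project_replay_summary.trace_sample_rate_get",
    type=Float,
    default=0.0,  # assumed default
    flags=FLAG_AUTOMATOR_MODIFIABLE,
)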
@@ -62,9 +63,15 @@ class ProjectReplaySummaryEndpoint(ProjectEndpoint):
     }
     permission_classes = (ReplaySummaryPermission,)

-    def __init__(self, **options) -> None:
+    def __init__(self, **kw) -> None:
         storage.initialize_client()
-        super().__init__(**options)
+        self.sample_rate_post = options.get(
+            "replay.endpoints.project_replay_summary.trace_sample_rate_post"
+        )
+        self.sample_rate_get = options.get(
+            "replay.endpoints.project_replay_summary.trace_sample_rate_get"
+        )
+        super().__init__(**kw)

     def make_seer_request(self, path: str, post_body: dict[str, Any]) -> Response:
         """Make a POST request to a Seer endpoint. Raises HTTPError and logs non-200 status codes."""
@@ -133,91 +140,128 @@ def has_replay_summary_access(self, project: Project, request: Request) -> bool:

     def get(self, request: Request, project: Project, replay_id: str) -> Response:
         """Poll for the status of a replay summary task in Seer."""
-        if not self.has_replay_summary_access(project, request):
-            return self.respond(
-                {"detail": "Replay summaries are not available for this organization."}, status=403
-            )

-        # We skip checking Seer permissions here for performance, and because summaries can't be created without them anyway.
+        with sentry_sdk.start_transaction(
+            name="replays.endpoints.project_replay_summary.get",
+            op="replays.endpoints.project_replay_summary.get",
+            custom_sampling_context={"sample_rate": self.sample_rate_get},
+        ):

-        # Request Seer for the state of the summary task.
-        return self.make_seer_request(
-            SEER_POLL_STATE_ENDPOINT_PATH,
-            {
-                "replay_id": replay_id,
-            },
-        )
+            if not self.has_replay_summary_access(project, request):
+                return self.respond(
+                    {"detail": "Replay summaries are not available for this organization."},
+                    status=403,
+                )
+
+            # We skip checking Seer permissions here for performance, and because summaries can't be created without them anyway.
+
+            # Request Seer for the state of the summary task.
+            return self.make_seer_request(
+                SEER_POLL_STATE_ENDPOINT_PATH,
+                {
+                    "replay_id": replay_id,
+                },
+            )

     def post(self, request: Request, project: Project, replay_id: str) -> Response:
         """Download replay segment data and parse it into logs. Then post to Seer to start a summary task."""
-        if not self.has_replay_summary_access(project, request):
-            return self.respond(
-                {"detail": "Replay summaries are not available for this organization."}, status=403
+
+        with sentry_sdk.start_transaction(
+            name="replays.endpoints.project_replay_summary.post",
+            op="replays.endpoints.project_replay_summary.post",
+            custom_sampling_context={"sample_rate": self.sample_rate_post},
+        ):
+
+            if not self.has_replay_summary_access(project, request):
+                return self.respond(
+                    {"detail": "Replay summaries are not available for this organization."},
+                    status=403,
+                )
+
+            filter_params = self.get_filter_params(request, project)
+            num_segments = request.data.get("num_segments", 0)
+            temperature = request.data.get("temperature", None)
+
+            # Limit data with the frontend's segment count, to keep summaries consistent with the video displayed in the UI.
+            # While the replay is live, the FE and BE may have different counts.
+            if num_segments > MAX_SEGMENTS_TO_SUMMARIZE:
+                logger.warning(
+                    "Replay Summary: hit max segment limit.",
+                    extra={
+                        "replay_id": replay_id,
+                        "project_id": project.id,
+                        "organization_id": project.organization.id,
+                        "segment_limit": MAX_SEGMENTS_TO_SUMMARIZE,
+                    },
+                )
+                num_segments = MAX_SEGMENTS_TO_SUMMARIZE
+
+            # Fetch the replay's error and trace IDs from the replay_id.
+            snuba_response = query_replay_instance(
+                project_id=project.id,
+                replay_id=replay_id,
+                start=filter_params["start"],
+                end=filter_params["end"],
+                organization=project.organization,
+                request_user_id=request.user.id,
+            )
+            processed_response = process_raw_response(
+                snuba_response,
+                fields=request.query_params.getlist("field"),
             )
+            error_ids = processed_response[0].get("error_ids", []) if processed_response else []
+            trace_ids = processed_response[0].get("trace_ids", []) if processed_response else []

-        filter_params = self.get_filter_params(request, project)
-        num_segments = request.data.get("num_segments", 0)
-        temperature = request.data.get("temperature", None)
+            # Fetch same-trace errors.
+            trace_connected_errors = fetch_trace_connected_errors(
+                project=project,
+                trace_ids=trace_ids,
+                start=filter_params["start"],
+                end=filter_params["end"],
+                limit=100,
+            )
+            trace_connected_error_ids = {x["id"] for x in trace_connected_errors}

-        # Limit data with the frontend's segment count, to keep summaries consistent with the video displayed in the UI.
-        # While the replay is live, the FE and BE may have different counts.
-        if num_segments > MAX_SEGMENTS_TO_SUMMARIZE:
-            logger.warning(
-                "Replay Summary: hit max segment limit.",
-                extra={
+            # Fetch directly linked errors, if they weren't returned by the trace query.
+            replay_errors = fetch_error_details(
+                project_id=project.id,
+                error_ids=[x for x in error_ids if x not in trace_connected_error_ids],
+            )
+
+            error_events = replay_errors + trace_connected_errors
+
+            metrics.distribution(
+                "replays.endpoints.project_replay_summary.direct_errors",
+                value=len(replay_errors),
+            )
+            metrics.distribution(
+                "replays.endpoints.project_replay_summary.trace_connected_errors",
+                value=len(trace_connected_errors),
+            )
+            metrics.distribution(
+                "replays.endpoints.project_replay_summary.num_trace_ids",
+                value=len(trace_ids),
+            )
+
+            # Download segment data.
+            # XXX: For now this is capped to 100 and blocking. DD shows no replays with >25 segments, but we should still stress test and figure out how to deal with large replays.
+            segment_md = fetch_segments_metadata(project.id, replay_id, 0, num_segments)
+            segment_data = iter_segment_data(segment_md)
+
+            # Combine replay and error data and parse into logs.
+            logs = get_summary_logs(segment_data, error_events, project.id)
+
+            # Post to Seer to start a summary task.
+            # XXX: Request isn't streaming. Limitation of Seer authentication. Would be much faster if we
+            # could stream the request data since the GCS download will (likely) dominate latency.
+            return self.make_seer_request(
+                SEER_START_TASK_ENDPOINT_PATH,
+                {
+                    "logs": logs,
+                    "num_segments": num_segments,
                     "replay_id": replay_id,
-                    "project_id": project.id,
                     "organization_id": project.organization.id,
-                    "segment_limit": MAX_SEGMENTS_TO_SUMMARIZE,
+                    "project_id": project.id,
+                    "temperature": temperature,
                 },
             )
-            num_segments = MAX_SEGMENTS_TO_SUMMARIZE
-
-        # Fetch the replay's error and trace IDs from the replay_id.
-        snuba_response = query_replay_instance(
-            project_id=project.id,
-            replay_id=replay_id,
-            start=filter_params["start"],
-            end=filter_params["end"],
-            organization=project.organization,
-            request_user_id=request.user.id,
-        )
-        processed_response = process_raw_response(
-            snuba_response,
-            fields=request.query_params.getlist("field"),
-        )
-        error_ids = processed_response[0].get("error_ids", []) if processed_response else []
-        trace_ids = processed_response[0].get("trace_ids", []) if processed_response else []
-
-        # Fetch error details.
-        replay_errors = fetch_error_details(project_id=project.id, error_ids=error_ids)
-        trace_connected_errors = fetch_trace_connected_errors(
-            project=project,
-            trace_ids=trace_ids,
-            start=filter_params["start"],
-            end=filter_params["end"],
-        )
-        error_events = replay_errors + trace_connected_errors
-
-        # Download segment data.
-        # XXX: For now this is capped to 100 and blocking. DD shows no replays with >25 segments, but we should still stress test and figure out how to deal with large replays.
-        segment_md = fetch_segments_metadata(project.id, replay_id, 0, num_segments)
-        segment_data = iter_segment_data(segment_md)
-
-        # Combine replay and error data and parse into logs.
-        logs = get_summary_logs(segment_data, error_events, project.id)
-
-        # Post to Seer to start a summary task.
-        # XXX: Request isn't streaming. Limitation of Seer authentication. Would be much faster if we
-        # could stream the request data since the GCS download will (likely) dominate latency.
-        return self.make_seer_request(
-            SEER_START_TASK_ENDPOINT_PATH,
-            {
-                "logs": logs,
-                "num_segments": num_segments,
-                "replay_id": replay_id,
-                "organization_id": project.organization.id,
-                "project_id": project.id,
-                "temperature": temperature,
-            },
-        )
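Note: custom_sampling_context is how the per-endpoint rate reaches the SDK's sampling decision. In sentry-python 1.x, keys passed to start_transaction(custom_sampling_context=...) are merged into the sampling_context handed to a traces_sampler configured at init time. A minimal sketch of a sampler that honors the rate (the fallback value and the init call are illustrative, not part of this PR):

import sentry_sdk

def traces_sampler(sampling_context):
    # Prefer the per-endpoint rate passed via custom_sampling_context.
    rate = sampling_context.get("sample_rate")
    if rate is not None:
        return float(rate)
    return 0.0  # illustrative fallback: sample nothing else

sentry_sdk.init(traces_sampler=traces_sampler)

Routing the rate through the sampling context, rather than having the sampler call options.get itself, keeps each endpoint in control of its own sampling knob.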