Skip to content

Commit d4803c3

Browse files
angelachennpre-commit-ci[bot]edgarrmondragon
authored
feat: add CommitDiffsStream, PullRequestCommitDiffsStream, refactor diff logic into GitHubDiffStream (#372)
This PR adds child streams for fetching diffs of the respective parent streams. Referenced [Parent-Child Streams](https://gitlab.com/meltano/sdk/-/blob/main/docs/parent_streams.md) and followed the same format as changes in #345. - `CommitDiffsStream` -> `CommitsStream` as parent - `PullRequestCommitDiffsStream` -> `PullRequestCommitsStream` as parent - renamed `PullRequestCommits` to `PullRequestCommitsStream` to match convention Like #345, parent context is also added to relevant streams to give users more flexibility in using or joining their tables. I've tested changes on my end successfully and as always, please advise if I missed anything. Thank you! Latest commit refactors the repetition between the 3 diff streams into a reusable block (`GithubDiffStream`) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Edgar Ramírez Mondragón <[email protected]>
1 parent 062b659 commit d4803c3

File tree

3 files changed

+135
-42
lines changed

3 files changed

+135
-42
lines changed

tap_github/client.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,52 @@ def calculate_sync_cost(
306306
return {"rest": 1, "graphql": 0, "search": 0}
307307

308308

309+
class GitHubDiffStream(GitHubRestStream):
310+
"""Base class for GitHub diff streams."""
311+
312+
@property
313+
def http_headers(self) -> dict:
314+
"""Return the http headers needed for diff requests."""
315+
headers = super().http_headers
316+
headers["Accept"] = "application/vnd.github.v3.diff"
317+
return headers
318+
319+
def parse_response(self, response: requests.Response) -> Iterable[dict]:
320+
"""Parse the response to yield the diff text instead of an object
321+
and prevent buffer overflow."""
322+
if response.status_code != 200:
323+
contents = response.json()
324+
self.logger.info(
325+
"Skipping %s due to %d error: %s",
326+
self.name.replace("_", " "),
327+
response.status_code,
328+
contents["message"],
329+
)
330+
yield {
331+
"success": False,
332+
"error_message": contents["message"],
333+
}
334+
return
335+
336+
if content_length_str := response.headers.get("Content-Length"):
337+
content_length = int(content_length_str)
338+
max_size = 41_943_040 # 40 MiB
339+
if content_length > max_size:
340+
self.logger.info(
341+
"Skipping %s. The diff size (%.2f MiB) exceeded the maximum"
342+
" size limit of 40 MiB.",
343+
self.name.replace("_", " "),
344+
content_length / 1024 / 1024,
345+
)
346+
yield {
347+
"success": False,
348+
"error_message": "Diff exceeded the maximum size limit of 40 MiB.",
349+
}
350+
return
351+
352+
yield {"diff": response.text, "success": True}
353+
354+
309355
class GitHubGraphqlStream(GraphQLStream, GitHubRestStream):
310356
"""GitHub Graphql stream class."""
311357

tap_github/repository_streams.py

Lines changed: 83 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from singer_sdk.exceptions import FatalAPIError
1111
from singer_sdk.helpers.jsonpath import extract_jsonpath
1212

13-
from tap_github.client import GitHubGraphqlStream, GitHubRestStream
13+
from tap_github.client import GitHubDiffStream, GitHubGraphqlStream, GitHubRestStream
1414
from tap_github.schema_objects import (
1515
files_object,
1616
label_object,
@@ -1079,6 +1079,14 @@ def post_process(self, row: dict, context: dict | None = None) -> dict:
10791079
row["commit_timestamp"] = row["commit"]["committer"]["date"]
10801080
return row
10811081

1082+
def get_child_context(self, record: dict, context: dict | None) -> dict:
1083+
return {
1084+
"org": context["org"] if context else None,
1085+
"repo": context["repo"] if context else None,
1086+
"repo_id": context["repo_id"] if context else None,
1087+
"commit_id": record["sha"],
1088+
}
1089+
10821090
schema = th.PropertiesList(
10831091
th.Property("org", th.StringType),
10841092
th.Property("repo", th.StringType),
@@ -1162,6 +1170,37 @@ class CommitCommentsStream(GitHubRestStream):
11621170
).to_dict()
11631171

11641172

1173+
class CommitDiffsStream(GitHubDiffStream):
1174+
name = "commit_diffs"
1175+
path = "/repos/{org}/{repo}/commits/{commit_id}"
1176+
primary_keys: ClassVar[list[str]] = ["commit_id"]
1177+
parent_stream_type = CommitsStream
1178+
ignore_parent_replication_key = False
1179+
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
1180+
1181+
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
1182+
row = super().post_process(row, context)
1183+
if context is not None:
1184+
# Get commit ID (sha) from context
1185+
row["org"] = context["org"]
1186+
row["repo"] = context["repo"]
1187+
row["repo_id"] = context["repo_id"]
1188+
row["commit_id"] = context["commit_id"]
1189+
return row
1190+
1191+
schema = th.PropertiesList(
1192+
# Parent keys
1193+
th.Property("org", th.StringType),
1194+
th.Property("repo", th.StringType),
1195+
th.Property("repo_id", th.IntegerType),
1196+
th.Property("commit_id", th.StringType),
1197+
# Rest
1198+
th.Property("diff", th.StringType),
1199+
th.Property("success", th.BooleanType),
1200+
th.Property("error_message", th.StringType),
1201+
).to_dict()
1202+
1203+
11651204
class LabelsStream(GitHubRestStream):
11661205
"""Defines 'labels' stream."""
11671206

@@ -1355,14 +1394,23 @@ def get_child_context(self, record: dict, context: dict | None) -> dict:
13551394
).to_dict()
13561395

13571396

1358-
class PullRequestCommits(GitHubRestStream):
1397+
class PullRequestCommitsStream(GitHubRestStream):
13591398
name = "pull_request_commits"
13601399
path = "/repos/{org}/{repo}/pulls/{pull_number}/commits"
13611400
ignore_parent_replication_key = False
13621401
primary_keys: ClassVar[list[str]] = ["node_id"]
13631402
parent_stream_type = PullRequestsStream
13641403
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
13651404

1405+
def get_child_context(self, record: dict, context: dict | None) -> dict:
1406+
return {
1407+
"org": context["org"] if context else None,
1408+
"repo": context["repo"] if context else None,
1409+
"repo_id": context["repo_id"] if context else None,
1410+
"pull_number": context["pull_number"] if context else None,
1411+
"commit_id": record["sha"],
1412+
}
1413+
13661414
schema = th.PropertiesList(
13671415
# Parent keys
13681416
th.Property("org", th.StringType),
@@ -1443,7 +1491,7 @@ def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict
14431491
return row
14441492

14451493

1446-
class PullRequestDiffsStream(GitHubRestStream):
1494+
class PullRequestDiffsStream(GitHubDiffStream):
14471495
name = "pull_request_diffs"
14481496
path = "/repos/{org}/{repo}/pulls/{pull_number}"
14491497
primary_keys: ClassVar[list[str]] = ["pull_id"]
@@ -1453,53 +1501,48 @@ class PullRequestDiffsStream(GitHubRestStream):
14531501
# Known Github API errors
14541502
tolerated_http_errors: ClassVar[list[int]] = [404, 406, 422, 502]
14551503

1456-
@property
1457-
def http_headers(self) -> dict:
1458-
headers = super().http_headers
1459-
headers["Accept"] = "application/vnd.github.v3.diff"
1460-
return headers
1504+
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
1505+
row = super().post_process(row, context)
1506+
if context is not None:
1507+
# Get PR ID from context
1508+
row["org"] = context["org"]
1509+
row["repo"] = context["repo"]
1510+
row["repo_id"] = context["repo_id"]
1511+
row["pull_number"] = context["pull_number"]
1512+
row["pull_id"] = context["pull_id"]
1513+
return row
14611514

1462-
def parse_response(self, response: requests.Response) -> Iterable[dict]:
1463-
"""Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1464-
if response.status_code != 200:
1465-
contents = response.json()
1466-
self.logger.info(
1467-
"Skipping PR due to %d error: %s",
1468-
response.status_code,
1469-
contents["message"],
1470-
)
1471-
yield {
1472-
"success": False,
1473-
"error_message": contents["message"],
1474-
}
1475-
return
1515+
schema = th.PropertiesList(
1516+
# Parent keys
1517+
th.Property("org", th.StringType),
1518+
th.Property("repo", th.StringType),
1519+
th.Property("repo_id", th.IntegerType),
1520+
th.Property("pull_number", th.IntegerType),
1521+
th.Property("pull_id", th.IntegerType),
1522+
# Rest
1523+
th.Property("diff", th.StringType),
1524+
th.Property("success", th.BooleanType),
1525+
th.Property("error_message", th.StringType),
1526+
).to_dict()
14761527

1477-
if content_length_str := response.headers.get("Content-Length"):
1478-
content_length = int(content_length_str)
1479-
max_size = 41_943_040 # 40 MiB
1480-
if content_length > max_size:
1481-
self.logger.info(
1482-
"Skipping PR. The diff size (%.2f MiB) exceeded the maximum size "
1483-
"limit of 40 MiB.",
1484-
content_length / 1024 / 1024,
1485-
)
1486-
yield {
1487-
"success": False,
1488-
"error_message": "Diff exceeded the maximum size limit of 40 MiB.",
1489-
}
1490-
return
14911528

1492-
yield {"diff": response.text, "success": True}
1529+
class PullRequestCommitDiffsStream(GitHubDiffStream):
1530+
name = "pull_request_commit_diffs"
1531+
path = "/repos/{org}/{repo}/commits/{commit_id}"
1532+
primary_keys: ClassVar[list[str]] = ["commit_id"]
1533+
parent_stream_type = PullRequestCommitsStream
1534+
ignore_parent_replication_key = False
1535+
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
14931536

14941537
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
14951538
row = super().post_process(row, context)
14961539
if context is not None:
1497-
# Get PR ID from context
1540+
# Get commit ID (sha) from context
14981541
row["org"] = context["org"]
14991542
row["repo"] = context["repo"]
15001543
row["repo_id"] = context["repo_id"]
15011544
row["pull_number"] = context["pull_number"]
1502-
row["pull_id"] = context["pull_id"]
1545+
row["commit_id"] = context["commit_id"]
15031546
return row
15041547

15051548
schema = th.PropertiesList(
@@ -1508,7 +1551,7 @@ def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict
15081551
th.Property("repo", th.StringType),
15091552
th.Property("repo_id", th.IntegerType),
15101553
th.Property("pull_number", th.IntegerType),
1511-
th.Property("pull_id", th.IntegerType),
1554+
th.Property("commit_id", th.StringType),
15121555
# Rest
15131556
th.Property("diff", th.StringType),
15141557
th.Property("success", th.BooleanType),

tap_github/streams.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
BranchesStream,
1717
CollaboratorsStream,
1818
CommitCommentsStream,
19+
CommitDiffsStream,
1920
CommitsStream,
2021
CommunityProfileStream,
2122
ContributorsStream,
@@ -34,7 +35,8 @@
3435
ProjectCardsStream,
3536
ProjectColumnsStream,
3637
ProjectsStream,
37-
PullRequestCommits,
38+
PullRequestCommitDiffsStream,
39+
PullRequestCommitsStream,
3840
PullRequestDiffsStream,
3941
PullRequestsStream,
4042
ReadmeHtmlStream,
@@ -82,6 +84,7 @@ def __init__(self, valid_queries: set[str], streams: list[type[Stream]]) -> None
8284
CollaboratorsStream,
8385
CommitCommentsStream,
8486
CommitsStream,
87+
CommitDiffsStream,
8588
CommunityProfileStream,
8689
ContributorsStream,
8790
DependenciesStream,
@@ -98,7 +101,8 @@ def __init__(self, valid_queries: set[str], streams: list[type[Stream]]) -> None
98101
ProjectCardsStream,
99102
ProjectColumnsStream,
100103
ProjectsStream,
101-
PullRequestCommits,
104+
PullRequestCommitsStream,
105+
PullRequestCommitDiffsStream,
102106
PullRequestDiffsStream,
103107
PullRequestsStream,
104108
ReadmeHtmlStream,

0 commit comments

Comments
 (0)