get-data.py
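"""Collect the authenticated user's open and recently closed GitHub issues
and pull requests via the search API and save them to github-activity.csv.

Requires a GitHub token in the ACCESS_TOKEN environment variable, e.g.:

    ACCESS_TOKEN=<token> python get-data.py

An optional .repoignore file can list regular expressions for repositories
to exclude.
"""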
import calendar
import os
import re
import sys
from datetime import date, timedelta
import pandas as pd
from ghapi.core import GhApi
from rich.console import Console
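# force_terminal keeps rich's colour output even when stdout is not a TTY
# (e.g. in CI logs).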
console = Console(force_terminal=True)
def make_clickable_url(name, url):
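    # Wrap a display name in an HTML anchor so HTML-aware viewers render it
    # as a clickable link.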
return f'<a href="{url}" rel="noopener noreferrer" target="_blank">{name}</a>'
def get_last_month():
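    # First and last day of the previous calendar month, as YYYY-MM-DD strings.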
year = date.today().year
month = date.today().month
if month == 1:
month = 12
year -= 1
else:
month -= 1
month_start = date(year, month, 1)
month_end = date(year, month, calendar.monthrange(year, month)[1])
return month_start.strftime("%Y-%m-%d"), month_end.strftime("%Y-%m-%d")
def get_last_week():
    # First and last day (Monday to Sunday) of the previous ISO week, as
    # YYYY-MM-DD strings. Computing it relative to today's weekday avoids
    # the off-by-one that week-number arithmetic hits in 53-week years.
    today = date.today()
    week_start = today - timedelta(days=today.weekday(), weeks=1)
    week_end = week_start + timedelta(days=6)
    return week_start.strftime("%Y-%m-%d"), week_end.strftime("%Y-%m-%d")
def perform_search(query, page_num=1):
    # Query the GitHub search API; returns None if the request fails (for
    # example, when the search rate limit is hit).
    try:
        return gh.search.issues_and_pull_requests(
            query,
            sort="updated",
            order="desc",
            per_page=100,
            page=page_num,
        )
    except Exception:
        return None
def process_results(items, dest_df, filter_name, ignored_repos):
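    # Append search results to dest_df, skipping ignored repositories and
    # merging filter labels for items that are already present (matched by
    # title).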
for item in items:
        repo_full_name = "/".join(item["repository_url"].split("/")[-2:])
        # Skip any repository whose full name matches a pattern from .repoignore
        if any(re.match(pattern, repo_full_name) for pattern in ignored_repos):
            continue
# Find the set of the filters being applied in this query
filter_name_set = set(filter_name.split(":"))
# Establish if this item already exists in the DataFrame, based on title
existing_indx = dest_df.index[dest_df["raw_title"] == item["title"]].tolist()
if existing_indx:
# Find the set of the filters already applied
row_filter_set = set(dest_df.loc[existing_indx[0], "filter"].split(":"))
# Find the difference between the two sets of filters
set_diff = filter_name_set.difference(row_filter_set)
# Update filters in the row
dest_df.loc[existing_indx[0], "filter"] = ":".join(
list(row_filter_set) + list(set_diff)
)
else:
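            # New item: build a one-row frame and append it; repo_url is
            # rewritten from the API URL to the public github.com URL.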
details = pd.DataFrame(
{
"number": item["number"],
"raw_title": item["title"],
"link": (
item["pull_request"]["html_url"]
if "pull_request" in item.keys()
else item["html_url"]
),
"repo_name": repo_full_name,
"repo_url": item["repository_url"]
.replace("api.", "")
.replace("repos/", ""),
"state": item["state"],
"created_at": item["created_at"],
"updated_at": item["updated_at"],
"closed_at": item["closed_at"],
"pull_request": "pull_request" in item.keys(),
"filter": filter_name,
},
index=[0],
)
dest_df = pd.concat([dest_df, details], ignore_index=True)
dest_df.reset_index(inplace=True, drop=True)
return dest_df
token = os.environ.get("ACCESS_TOKEN")
if token is None:
    raise ValueError("ACCESS_TOKEN must be set!")
gh = GhApi(token=token)
# Identify the authenticated user; a failure here usually means a bad token
# or an exhausted rate limit.
try:
    result = gh.users.get_authenticated()
    username = result["login"]
except Exception:
    console.print("[bold red]Authentication failed or you are rate limited! :scream:")
    sys.exit(1)
# Optional .repoignore file: one regular expression per line naming
# repositories to skip. Blank lines are dropped so an empty pattern cannot
# match every repository.
ignored_repos = []
if os.path.exists(".repoignore"):
    with open(".repoignore") as f:
        ignored_repos = [line.strip() for line in f if line.strip()]
month_start, month_end = get_last_month()
week_start, week_end = get_last_week()
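# Schema for the collected activity; "title" and "repository" are filled in
# at the end as clickable HTML links.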
columns = [
"number",
"raw_title",
"link",
"repo_name",
"repo_url",
"created_at",
"updated_at",
"closed_at",
"pull_request",
"filter",
"title",
"repository",
]
df = pd.DataFrame(columns=columns)
df["pull_request"] = df["pull_request"].astype(bool)
queries = {
f"is:issue is:open assignee:{username}": "assigned",
f"is:pr is:open assignee:{username}": "assigned",
f"is:issue is:open author:{username}": "created",
f"is:pr is:open author:{username}": "created",
f"is:pr is:open review-requested:{username}": "review_requested",
f"is:issue assignee:{username} closed:{month_start}..{month_end}": "assigned:closed_last_month",
f"is:issue author:{username} closed:{month_start}..{month_end}": "created:closed_last_month",
f"is:pr assignee:{username} closed:{month_start}..{month_end}": "assigned:closed_last_month",
f"is:pr author:{username} closed:{month_start}..{month_end}": "created:closed_last_month",
f"is:issue assignee:{username} closed:{week_start}..{week_end}": "assigned:closed_last_week",
f"is:issue author:{username} closed:{week_start}..{week_end}": "created:closed_last_week",
f"is:pr assignee:{username} closed:{week_start}..{week_end}": "assigned:closed_last_week",
f"is:pr author:{username} closed:{week_start}..{week_end}": "created:closed_last_week",
f"is:issue assignee:{username} updated:{week_start}..{week_end}": "assigned:updated_last_week",
f"is:issue author:{username} updated:{week_start}..{week_end}": "created:updated_last_week",
f"is:pr assignee:{username} updated:{week_start}..{week_end}": "assigned:updated_last_week",
f"is:pr author:{username} updated:{week_start}..{week_end}": "created:updated_last_week",
f"is:issue author:{username} created:{week_start}..{week_end}": "created:created_last_week",
f"is:pr author:{username} created:{week_start}..{week_end}": "created:created_last_week",
}
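# Run every query, paging through results 100 at a time (the search API
# returns at most 1000 results per query).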
for search_query, filter_name in queries.items():
console.print(f"[bold blue]Query params:[/bold blue] {search_query}")
    result = perform_search(search_query)
    if result is None:
        console.print("[bold yellow]Query failed or was rate limited. Skipping...")
        continue
    # Ceiling division: 100 results per page.
    total_pages = (result["total_count"] + 99) // 100
    with console.status("[bold yellow]Processing query..."):
        df = process_results(result["items"], df, filter_name, ignored_repos)
        for i in range(2, total_pages + 1):
            result = perform_search(search_query, page_num=i)
            if result is None:
                break
            df = process_results(result["items"], df, filter_name, ignored_repos)
    console.print("[bold yellow]Query processed!")
console.print("[bold blue]Saving results to CSV file...")
df["title"] = df.apply(lambda x: make_clickable_url(x["raw_title"], x["link"]), axis=1)
df["repository"] = df.apply(
lambda x: make_clickable_url(x["repo_name"], x["repo_url"]), axis=1
)
df.sort_values("repo_name", inplace=True)
df.to_csv("github-activity.csv", index=False)
console.print("[bold green]Done!")