Skip to content

Commit fe6f06c

Browse files
authored
Add support for GitHub issue classification to the HTTP service (#2330)
1 parent 7e58f6d commit fe6f06c

File tree

6 files changed

+247
-1
lines changed

6 files changed

+247
-1
lines changed

docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ services:
3030
image: mozilla/bugbug-http-service
3131
environment:
3232
- BUGBUG_BUGZILLA_TOKEN
33+
- BUGBUG_GITHUB_TOKEN
3334
- PORT=8000
3435
ports:
3536
- target: 8000
@@ -44,6 +45,7 @@ services:
4445
image: mozilla/bugbug-http-service-bg-worker
4546
environment:
4647
- BUGBUG_BUGZILLA_TOKEN
48+
- BUGBUG_GITHUB_TOKEN
4749

4850
bugbug-spawn-pipeline:
4951
build:

http_service/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
### Local development
2+
3+
**To start the service locally, run the following commands.**
4+
5+
Start Redis:
6+
7+
docker-compose up redis
8+
9+
Build the http service image:
10+
11+
docker build -t mozilla/bugbug-http-service -f Dockerfile .
12+
13+
Start the http service:
14+
15+
docker-compose up bugbug-http-service
16+
17+
Build the background worker image:
18+
19+
docker build -t mozilla/bugbug-http-service-bg-worker --build-arg TAG=latest -f Dockerfile.bg_worker .
20+
21+
Run the background worker:
22+
23+
docker-compose up bugbug-http-service-bg-worker

http_service/bugbug_http/app.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from bugbug_http.models import (
3333
MODELS_NAMES,
3434
classify_bug,
35+
classify_issue,
3536
get_config_specific_groups,
3637
schedule_tests,
3738
)
@@ -80,6 +81,8 @@
8081
VALIDATOR = Validator()
8182

8283
BUGZILLA_TOKEN = os.environ.get("BUGBUG_BUGZILLA_TOKEN")
84+
GITHUB_TOKEN = os.environ.get("BUGBUG_GITHUB_TOKEN")
85+
8386
BUGZILLA_API_URL = (
8487
libmozdata.config.get("Bugzilla", "URL", "https://bugzilla.mozilla.org")
8588
+ "/rest/bug"
@@ -212,6 +215,27 @@ def schedule_bug_classification(model_name: str, bug_ids: Sequence[int]) -> None
212215
)
213216

214217

218+
def schedule_issue_classification(
    model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> None:
    """Schedule the classification of a list of GitHub issue numbers.

    Args:
        model_name: name of the model to classify the issues with.
        owner: GitHub repository owner (user or organization).
        repo: GitHub repository name.
        issue_nums: issue numbers to classify.
    """
    job_id = get_job_id()

    # Set the mapping before queuing to avoid some race conditions
    job_id_mapping = {}
    for issue_num in issue_nums:
        key = JobInfo(classify_issue, model_name, owner, repo, issue_num).mapping_key
        job_id_mapping[key] = job_id

    redis_conn.mset(job_id_mapping)

    schedule_job(
        JobInfo(classify_issue, model_name, owner, repo, issue_nums),
        job_id=job_id,
        # NOTE(review): reuses the Bugzilla job timeout; presumably issue
        # classification jobs have a comparable duration — confirm.
        timeout=BUGZILLA_JOB_TIMEOUT,
    )
237+
238+
215239
def is_pending(job):
216240
# Check if there is a job
217241
job_id = redis_conn.get(job.mapping_key)
@@ -275,6 +299,23 @@ def get_bugs_last_change_time(bug_ids):
275299
return bugs
276300

277301

302+
def get_github_issues_update_time(
    owner: str, repo: str, issue_nums: Sequence[int]
) -> dict:
    """Fetch each issue from the GitHub API and return a mapping of
    issue number -> `updated_at` timestamp (as returned by the API).

    Raises an HTTP error if any of the per-issue requests fails.
    """
    auth_headers = {"Authorization": f"token {GITHUB_TOKEN}"}

    update_times = {}
    for num in issue_nums:
        issue_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{num}"
        resp = utils.get_session("github").get(issue_url, headers=auth_headers)
        resp.raise_for_status()

        payload = resp.json()
        update_times[payload["number"]] = payload["updated_at"]

    return update_times
317+
318+
278319
def is_prediction_invalidated(job, change_time):
279320
# First get the saved change time
280321
saved_change_time = redis_conn.get(job.change_time_key)
@@ -407,6 +448,84 @@ def model_prediction(model_name, bug_id):
407448
return compress_response(data, status_code)
408449

409450

451+
@application.route(
    "/<model_name>/predict/github/<string:owner>/<string:repo>/<int:issue_num>"
)
@cross_origin()
def model_prediction_github(model_name, owner, repo, issue_num):
    """
    ---
    get:
      description: Classify a single issue using given model, answer either 200 if the issue is processed or 202 if the issue is being processed
      summary: Classify a single issue
      parameters:
      - name: model_name
        in: path
        schema: ModelName
      - name: owner
        in: path
        schema:
          type: string
        example: webcompat
      - name: repo
        in: path
        schema:
          type: string
        example: web-bugs
      - name: issue_num
        in: path
        schema:
          type: integer
        example: 123456
      responses:
        200:
          description: A single issue prediction
          content:
            application/json:
              schema: BugPrediction
        202:
          description: A temporary answer for the issue being processed
          content:
            application/json:
              schema: NotAvailableYet
        401:
          description: API key is missing
          content:
            application/json:
              schema: UnauthorizedError
    """
    headers = request.headers

    auth = headers.get(API_TOKEN)

    if not auth:
        return jsonify(UnauthorizedError().dump({})), 401
    else:
        LOGGER.info("Request with API TOKEN %r", auth)

    if model_name not in MODELS_NAMES:
        return jsonify({"error": f"Model {model_name} doesn't exist"}), 404

    # Get the latest change date from GitHub for the issue
    update_time = get_github_issues_update_time(owner, repo, [issue_num])

    job = JobInfo(classify_issue, model_name, owner, repo, issue_num)

    # Drop the cached prediction if the issue changed since it was computed.
    issue_change_time = update_time.get(issue_num)
    if issue_change_time and is_prediction_invalidated(job, issue_change_time):
        clean_prediction_cache(job)

    status_code = 200
    data = get_result(job)

    # No result yet: schedule a classification job (unless one is already
    # pending) and tell the caller to retry later.
    if not data:
        if not is_pending(job):
            schedule_issue_classification(model_name, owner, repo, [issue_num])
        status_code = 202
        data = {"ready": False}

    return compress_response(data, status_code)
527+
528+
410529
@application.route("/<model_name>/predict/batch", methods=["POST"])
411530
@cross_origin()
412531
def batch_prediction(model_name):

http_service/bugbug_http/models.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from libmozdata.bugzilla import Bugzilla
1616
from redis import Redis
1717

18-
from bugbug import bugzilla, repository, test_scheduling
18+
from bugbug import bugzilla, github, repository, test_scheduling
1919
from bugbug.model import Model
2020
from bugbug.utils import get_hgmo_stack
2121
from bugbug_http.readthrough_cache import ReadthroughTTLCache
@@ -26,6 +26,7 @@
2626
MODELS_NAMES = [
2727
"defectenhancementtask",
2828
"component",
29+
"needsdiagnosis",
2930
"regression",
3031
"stepstoreproduce",
3132
"spambug",
@@ -110,6 +111,64 @@ def classify_bug(model_name: str, bug_ids: Sequence[int], bugzilla_token: str) -
110111
return "OK"
111112

112113

114+
def classify_issue(
    model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> str:
    """Classify GitHub issues with the given model and store the results
    in Redis. Returns "OK" on success, "NOK" when no issue could be
    fetched or the model is missing.
    """
    from bugbug_http.app import JobInfo

    issue_ids_set = set(map(int, issue_nums))

    issues = {
        issue_num: github.fetch_issue_by_number(owner, repo, issue_num, True)
        for issue_num in issue_nums
    }
    # Drop issues the fetch could not retrieve, so that they are correctly
    # reported as missing below. Without this filter, every requested
    # number is a dict key and `missing_issues` is always empty.
    # NOTE(review): assumes fetch_issue_by_number returns a falsy value
    # (e.g. None) for unavailable issues — confirm.
    issues = {num: issue for num, issue in issues.items() if issue}

    missing_issues = issue_ids_set.difference(issues.keys())

    for issue_id in missing_issues:
        job = JobInfo(classify_issue, model_name, owner, repo, issue_id)

        # TODO: Find a better error format
        setkey(job.result_key, orjson.dumps({"available": False}))

    if not issues:
        return "NOK"

    model = MODEL_CACHE.get(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting" % model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single issue which could make the whole
    # job to fail. What should we do here?
    probs = model.classify(list(issues.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, issue_id in enumerate(issues.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        job = JobInfo(classify_issue, model_name, owner, repo, issue_id)
        setkey(job.result_key, orjson.dumps(data), compress=True)

        # Save the issue's last change time so predictions can be
        # invalidated when the issue is updated.
        setkey(job.change_time_key, issues[issue_id]["updated_at"].encode())

    return "OK"
170+
171+
113172
@lru_cache(maxsize=None)
114173
def get_known_tasks() -> Tuple[str, ...]:
115174
with open("known_tasks", "r") as f:

http_service/docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ services:
66
image: mozilla/bugbug-http-service
77
environment:
88
- BUGBUG_BUGZILLA_TOKEN
9+
- BUGBUG_GITHUB_TOKEN
910
- REDIS_URL=redis://redis:6379/0
1011
- PORT=8000
1112
- PULSE_USER
@@ -26,6 +27,7 @@ services:
2627
image: mozilla/bugbug-http-service-bg-worker
2728
environment:
2829
- BUGBUG_BUGZILLA_TOKEN
30+
- BUGBUG_GITHUB_TOKEN
2931
- REDIS_URL=redis://redis:6379/0
3032
- BUGBUG_ALLOW_MISSING_MODELS
3133
- BUGBUG_REPO_DIR

http_service/tests/test_bug_classification.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,47 @@ def do_request():
6161
assert retrieve_compressed_reponse(rv) == result
6262

6363

64+
def test_model_predict_id_github(client, jobs, add_result, responses):
    """End-to-end check of the GitHub issue prediction endpoint: 202 while
    the classification job is pending, 200 with the prediction once a
    result has been stored."""
    issue_id = "12345"
    expected = {
        "prob": [0.11845558881759644, 0.8815444111824036],
        "index": 1,
        "class": 1,
        "extra_data": {},
    }

    # Mock the GitHub API call the service makes to read the issue's
    # last update time.
    responses.add(
        responses.GET,
        f"https://api.github.com/repos/webcompat/web-bugs/issues/{issue_id}",
        status=200,
        json={"number": issue_id, "updated_at": time.time()},
    )

    def query_endpoint():
        return client.get(
            "/needsdiagnosis/predict/github/webcompat/web-bugs/12345",
            headers={API_TOKEN: "test"},
        )

    # First call: the classification job has only just been scheduled.
    first = query_endpoint()
    assert first.status_code == 202
    assert retrieve_compressed_reponse(first) == {"ready": False}

    # A second call while the job is still pending must not enqueue a
    # duplicate job.
    second = query_endpoint()
    assert second.status_code == 202
    assert retrieve_compressed_reponse(second) == {"ready": False}
    assert len(jobs) == 1

    # Simulate the background worker finishing: store the prediction.
    result_keys = next(iter(jobs.values()))
    add_result(result_keys[0], expected)

    final = query_endpoint()
    assert final.status_code == 200
    assert retrieve_compressed_reponse(final) == expected
103+
104+
64105
def test_model_predict_batch(client, jobs, add_result, add_change_time, responses):
65106
bug_ids = [123, 456]
66107
result = {

0 commit comments

Comments
 (0)