
Commit d3658be

Merge pull request #721 from deepset-ai/model_confidence_documentation
Added comments on how temperature parameter is automatically applied and stored
2 parents: c2394dd + da87668

File tree

2 files changed: +17 -10 lines


examples/question_answering_confidence.py

Lines changed: 15 additions & 8 deletions
@@ -82,6 +82,8 @@ def question_answering_confidence():
     model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter((torch.ones(1) * 1.0).to(device=device))
 
     # 6b. ...or we can run the evaluator on the dev set and use it to calibrate confidence scores with a technique called temperature scaling.
+    # It will align the confidence scores with the model's accuracy based on the dev set data by tuning the temperature parameter.
+    # During the calibration, this parameter is automatically set internally as an attribute of the prediction head.
     evaluator_dev = Evaluator(
         data_loader=data_silo.get_data_loader("dev"),
         tasks=data_silo.processor.tasks,
@@ -90,7 +92,7 @@ def question_answering_confidence():
     result_dev = evaluator_dev.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True)
     # evaluator_dev.log_results(result_dev, "Dev", logging=False, steps=len(data_silo.get_data_loader("dev")))
 
-    # 7. Finally, run the evaluator on the test set to see how good the calibration of the confidence scores was
+    # 7. Optionally, run the evaluator on the test set to see how well the confidence scores are aligned with the model's accuracy
     evaluator_test = Evaluator(
         data_loader=data_silo.get_data_loader("test"),
         tasks=data_silo.processor.tasks,
@@ -99,23 +101,28 @@ def question_answering_confidence():
     result_test = evaluator_test.eval(model, return_preds_and_labels=True)[0]
     logger.info("Grouping predictions by confidence score and calculating metrics for each bin.")
     em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(result_test["preds"], result_test["labels"], num_bins=10)
-    for bin in range(10):
-        logger.info(f"Bin {bin} - exact match: {em_per_bin[bin]}, average confidence score: {confidence_per_bin[bin]}")
+    for bin_number in range(10):
+        logger.info(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}")
 
-    # 8. Hooray! You have a model with calibrated confidence scores. Store it:
+    # 8. Hooray! You have a model with calibrated confidence scores.
+    # Store the model and the temperature parameter will be stored automatically as an attribute of the prediction head.
     save_dir = Path("../saved_models/qa-confidence-tutorial")
     model.save(save_dir)
     processor.save(save_dir)
 
-    # 8. When making a prediction with the calibrated model, we could filter out predictions where the model is not confident enough
+    # 9. When making a prediction with the calibrated model, we could filter out predictions where the model is not confident enough
+    # To this end, load the stored model, which will automatically load the stored temperature parameter.
+    # The confidence scores are automatically adjusted based on this temperature parameter.
+    # For each prediction, we can check the model's confidence and decide whether to output the prediction or not.
+    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
+    logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}")
+
     QA_input = [
         {
             "questions": ["Who counted the game among the best ever made?"],
             "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
         }]
-
-    model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
-    result = model.inference_from_dicts(dicts=QA_input, return_json=False)[0]
+    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
     if result.prediction[0].confidence > 0.9:
         print(result.prediction[0].answer)
     else:

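For context on the technique the added comments refer to: temperature scaling divides a model's logits by a single learned temperature before the softmax, and tuning that temperature on dev-set predictions pulls the confidence scores toward the model's actual accuracy. The snippet below is a minimal, self-contained sketch of that idea in plain PyTorch; it is not FARM's internal implementation, and the names dev_logits, dev_labels, and tune_temperature are illustrative placeholders.

# Minimal temperature-scaling sketch (plain PyTorch, not FARM internals).
import torch

def tune_temperature(dev_logits: torch.Tensor, dev_labels: torch.Tensor) -> torch.Tensor:
    # A single scalar temperature, initialized to 1.0 (i.e. no scaling).
    temperature = torch.nn.Parameter(torch.ones(1))
    optimizer = torch.optim.LBFGS([temperature], lr=0.01, max_iter=50)
    nll = torch.nn.CrossEntropyLoss()

    def closure():
        # Minimize NLL of temperature-scaled logits on the dev set.
        optimizer.zero_grad()
        loss = nll(dev_logits / temperature, dev_labels)
        loss.backward()
        return loss

    optimizer.step(closure)
    return temperature.detach()

# Stand-in data: over-confident random logits for a 5-class toy problem.
dev_logits = torch.randn(128, 5) * 3
dev_labels = torch.randint(0, 5, (128,))
temperature = tune_temperature(dev_logits, dev_labels)

# Calibrated confidences: divide logits by the tuned temperature before the softmax.
calibrated_conf = torch.softmax(dev_logits / temperature, dim=-1).max(dim=-1).values
print(f"Tuned temperature: {temperature.item():.3f}")

In the example script above, the analogous tuning is triggered by calibrate_conf_scores=True in the dev-set evaluator call, and the resulting value is stored on model.prediction_heads[0].temperature_for_confidence, which is why saving and reloading the model preserves the calibration.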
setup.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def get_dependency_links(filename):
 
 setup(
     name="farm",
-    version="0.6.2",
+    version="0.7.0",
     author="Timo Moeller, Malte Pietsch, Branden Chan, Tanay Soni, Bogdan Kostic, Julian Risch",
     author_email="[email protected]",
     description="Framework for finetuning and evaluating transformer based language models",
@@ -55,7 +55,7 @@ def get_dependency_links(filename):
     keywords="BERT NLP deep-learning language-model transformer qa question-answering transfer-learning",
     license="Apache",
     url="https://github.com/deepset-ai/FARM",
-    download_url="https://github.com/deepset-ai/FARM/archive/0.6.2.tar.gz",
+    download_url="https://github.com/deepset-ai/FARM/archive/0.7.0.tar.gz",
     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     dependency_links=dependency_links,
     install_requires=parsed_requirements,
