Merge pull request #3 from VectorInstitute/develop
Add results for correctness_answer metric
xeon27 authored Apr 24, 2024
2 parents 29e4b18 + 43ef363 commit ac683ea
Showing 8 changed files with 43 additions and 27 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -38,7 +38,7 @@ Not all of the elements will necessarily be available. Some evaluation can be pe
 
 #### Evaluation With Ground Truth
 
-- Compare Generated and GT Answers (*answer_correctness*): Many evaluation techniques compare the generated answer (**A**) with the GT answer (**A\***).
+- Compare Generated and GT Answers (*correctness_answer*): Many evaluation techniques compare the generated answer (**A**) with the GT answer (**A\***).
 
 
 ## 🧑🏿‍💻 Developing
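To make the README bullet concrete: *correctness_answer* scores a generated answer **A** against the ground-truth answer **A\***. The sketch below is a toy token-overlap comparison, not the LLM-judge implementation this repo actually uses; it only illustrates what an A-versus-A* score measures.

```python
# Toy illustration only: token-overlap F1 between a generated answer (A) and a
# ground-truth answer (A*). veval's correctness_answer relies on an LLM judge;
# this sketch just shows the kind of comparison the README bullet describes.
def token_f1(answer: str, gt_answer: str) -> float:
    pred_tokens = set(answer.lower().split())
    gold_tokens = set(gt_answer.lower().split())
    if not pred_tokens or not gold_tokens:
        return 0.0
    overlap = pred_tokens & gold_tokens
    if not overlap:
        return 0.0
    precision = len(overlap) / len(pred_tokens)
    recall = len(overlap) / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


print(token_f1("Aspirin reduces fever.", "Aspirin lowers fever."))  # ~0.67
```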
veval/metrics/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -2,5 +2,5 @@
     relevance_query_answer,
     relevance_query_context,
     groundedness_context_answer,
-    answer_correctness
+    correctness_answer
 )
veval/metrics/template.py (6 changes: 3 additions & 3 deletions)
@@ -42,12 +42,12 @@ def relevance_query_context(
         context=context,
     )
 
-def answer_correctness(
+def correctness_answer(
     query: List[str],
     answer: List[str],
     gt_answer: List[str],
 ) -> float:
-    return LLMJudgeMetrics().answer_correctness(
+    return LLMJudgeMetrics().correctness_answer(
         query=query,
         answer=answer,
         gt_answer=gt_answer
@@ -151,7 +151,7 @@ def relevance_query_context(
         )
         return score.get("context_relevancy")
 
-    def answer_correctness(
+    def correctness_answer(
         self,
         query: List[str],
         answer: List[str],
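Given the batch-style signature shown above (parallel lists of queries, generated answers, and ground-truth answers returning a float), a call to the renamed wrapper would presumably look like the sketch below. The example inputs are invented, and an LLM-judge backend is assumed to be configured.

```python
# Hypothetical usage of the renamed wrapper; the signature (parallel lists in,
# float out) comes from the diff above, the inputs are made up, and the
# LLM-judge backend is assumed to be configured separately.
from veval.metrics import correctness_answer

score = correctness_answer(
    query=["Does aspirin reduce fever?"],
    answer=["Yes, aspirin is an antipyretic and reduces fever."],
    gt_answer=["Yes. Aspirin lowers fever."],
)
print(f"correctness_answer: {score:.3f}")
```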
veval/registry.py (4 changes: 2 additions & 2 deletions)
@@ -4,7 +4,7 @@
     relevance_query_answer,
     groundedness_context_answer,
     relevance_query_context,
-    answer_correctness,
+    correctness_answer,
 )
 
 
@@ -13,7 +13,7 @@
     "relevance_query_answer": relevance_query_answer,
     "groundedness_context_answer": groundedness_context_answer,
     "relevance_query_context": relevance_query_context,
-    "answer_correctness": answer_correctness,
+    "correctness_answer": correctness_answer,
 }
 HIGHER_IS_BETTER_REGISTRY = {}
 REQUIRES_GROUND_TRUTH = {}
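The registry maps the string names used in task configs to the metric callables, so renaming the key here keeps name-based lookups consistent with the function rename. A minimal dispatch sketch follows; the dictionary's actual identifier lies outside the visible hunk, so `METRIC_REGISTRY` is a stand-in name.

```python
# Name-based dispatch through the mapping shown in veval/registry.py.
# "METRIC_REGISTRY" is a stand-in identifier (the real dict name is not visible
# in this hunk); the callables are re-exported by veval/metrics/__init__.py.
from veval.metrics import (
    relevance_query_answer,
    groundedness_context_answer,
    relevance_query_context,
    correctness_answer,
)

METRIC_REGISTRY = {
    "relevance_query_answer": relevance_query_answer,
    "groundedness_context_answer": groundedness_context_answer,
    "relevance_query_context": relevance_query_context,
    "correctness_answer": correctness_answer,
}


def run_metric(name: str, **inputs) -> float:
    """Resolve a metric by its registered name and call it with keyword args."""
    return METRIC_REGISTRY[name](**inputs)
```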
veval/tasks/pubmedqa/pubmedqa.yaml (7 changes: 6 additions & 1 deletion)
@@ -23,4 +23,9 @@ metric_list:
   - metric: relevance_query_context
     args:
       - query
-      - context
+      - context
+  - metric: correctness_answer
+    args:
+      - query
+      - answer
+      - gt_answer
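The new `metric_list` entry declares `correctness_answer` together with the inputs it needs (`query`, `answer`, `gt_answer`), which is what makes it a ground-truth metric. A rough sketch of reading this config follows; it assumes `metric_list` sits at the top level of the task YAML, and the loop is illustrative glue rather than veval's actual task runner.

```python
# Illustrative parsing of the task config above. The metric_list structure
# mirrors the diff; how veval really binds these args to metric calls is not
# shown in this commit, so treat this loop as a guess at the wiring.
import yaml

with open("veval/tasks/pubmedqa/pubmedqa.yaml") as f:
    task_cfg = yaml.safe_load(f)

for entry in task_cfg["metric_list"]:
    print(f"{entry['metric']} expects inputs: {entry['args']}")
    # e.g. correctness_answer expects inputs: ['query', 'answer', 'gt_answer']
```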
veval/tasks/pubmedqa/results.json (21 changes: 12 additions & 9 deletions)
@@ -1,21 +1,24 @@
 {
     "basic_rag": {
         "vector_retriever": {
-            "relevance_query_answer": 0.44129510519008974,
-            "groundedness_context_answer": 0.75,
-            "relevance_query_context": 0.03815020446139827
+            "relevance_query_answer": 0.4259400605403941,
+            "groundedness_context_answer": 0.7948717948717948,
+            "relevance_query_context": 0.03815020446139827,
+            "correctness_answer": 0.35946493662493934
         }
     },
     "rerank_rag": {
         "vector_retriever": {
-            "relevance_query_answer": 0.3437782048107566,
-            "groundedness_context_answer": 0.6666666666666666,
-            "relevance_query_context": 0.030686041548959497
+            "relevance_query_answer": 0.3426819871420391,
+            "groundedness_context_answer": 0.8,
+            "relevance_query_context": 0.030012390230146888,
+            "correctness_answer": 0.21825412465880867
         },
         "reranker": {
-            "relevance_query_answer": 0.1690786564806392,
-            "groundedness_context_answer": 0.5714285714285714,
-            "relevance_query_context": 0.06618154068154068
+            "relevance_query_answer": 0.1532415999867033,
+            "groundedness_context_answer": 1.0,
+            "relevance_query_context": 0.05700815850815851,
+            "correctness_answer": 0.21376968227387938
         }
     }
 }
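With every pipeline/component now reporting the new metric, the updated results file can be inspected programmatically; the short sketch below just walks the JSON structure shown above and prints the added scores.

```python
# Walk the results structure (pipeline -> component -> metric -> score) and
# print the newly added correctness_answer values from the diff above.
import json

with open("veval/tasks/pubmedqa/results.json") as f:
    results = json.load(f)

for pipeline, components in results.items():
    for component, metrics in components.items():
        print(f"{pipeline}/{component}: correctness_answer = {metrics.get('correctness_answer')}")
# basic_rag/vector_retriever: correctness_answer = 0.35946493662493934
# rerank_rag/vector_retriever: correctness_answer = 0.21825412465880867
# rerank_rag/reranker: correctness_answer = 0.21376968227387938
```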
veval/tasks/wikieval/results.json (21 changes: 12 additions & 9 deletions)
@@ -1,21 +1,24 @@
 {
     "basic_rag": {
         "vector_retriever": {
-            "relevance_query_answer": 0.7927767474040405,
-            "groundedness_context_answer": 0.3,
-            "relevance_query_context": 0.034626632490707715
+            "relevance_query_answer": 0.8297511377816242,
+            "groundedness_context_answer": 0.4166666666666667,
+            "relevance_query_context": 0.03485995336918988,
+            "correctness_answer": 0.5439944340161982
         }
     },
     "rerank_rag": {
         "vector_retriever": {
-            "relevance_query_answer": 0.1906669029753638,
-            "groundedness_context_answer": 0.7,
-            "relevance_query_context": 0.025259509118086262
+            "relevance_query_answer": 0.16941168246296598,
+            "groundedness_context_answer": 0.7777777777777778,
+            "relevance_query_context": 0.02237391283031606,
+            "correctness_answer": 0.23251316837266248
         },
         "reranker": {
-            "relevance_query_answer": 0.19066759578754763,
-            "groundedness_context_answer": 0.3333333333333333,
-            "relevance_query_context": 0.04587373737373737
+            "relevance_query_answer": 0.16962720707322923,
+            "groundedness_context_answer": 0.5,
+            "relevance_query_context": 0.040923122199437986,
+            "correctness_answer": 0.21998312295164335
         }
     }
 }
veval/tasks/wikieval/wikieval.yaml (7 changes: 6 additions & 1 deletion)
@@ -22,4 +22,9 @@ metric_list:
   - metric: relevance_query_context
     args:
       - query
-      - context
+      - context
+  - metric: correctness_answer
+    args:
+      - query
+      - answer
+      - gt_answer
