"""Examples using the `evaluate` API to evaluate a target system on a dataset."""
import logging
logging.basicConfig(level=logging.INFO)
# ruff: noqa: E402
# mypy: ignore-errors
# Prerequisites for the following examples:
from typing import Sequence
from langsmith import Client
from langsmith.evaluation import evaluate, evaluate_existing
from langsmith.schemas import Example, Run
client = Client()
client.clone_public_dataset(
"https://smith.langchain.com/public/419dcab2-1d66-4b94-8901-0357ead390df/d"
)
dataset_name = "Evaluate Examples"

# Example 1: Evaluate your target system on a dataset
## Example (row)-level evaluator
def accuracy(run: Run, example: Example):
"""Row-level evaluator for accuracy."""
pred = run.outputs["output"]
expected = example.outputs["answer"]
return {"score": expected.lower() == pred.lower()}
## Summary evaluators - define your custom aggregation logic
def precision(runs: Sequence[Run], examples: Sequence[Example]):
"""Batch-level evaluator for precision."""
# TP / (TP + FP)
predictions = [run.outputs["output"].lower() for run in runs]
expected = [example.outputs["answer"].lower() for example in examples]
# yes and no are the only possible answers
tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"])
fp = sum([p == "yes" and e == "no" for p, e in zip(predictions, expected)])
return {"score": tp / (tp + fp)}
## The target system / thing you want to evaluate
def predict(inputs: dict) -> dict:
"""This can be any function or just an API call to your app."""
return {"output": "Yes"}
results = evaluate(
    predict,
    data=dataset_name,
    evaluators=[accuracy],
    summary_evaluators=[precision],
)
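
## A minimal sketch of confirming what was just created: the experiment name is
## captured just below and reused later by `evaluate_existing`, so logging it
## here simply makes that link explicit.
logging.info("Created experiment: %s", results.experiment_name)
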
# Example 2: Evaluate over only a subset of the examples
experiment_name = results.experiment_name
examples = client.list_examples(dataset_name=dataset_name, limit=5)
results = evaluate(
    predict,
    data=examples,
    evaluators=[accuracy],
    summary_evaluators=[precision],
    experiment_prefix="My Experiment",
)
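
## Hedged variant: `data=` accepts a plain sequence of examples, so ordinary
## Python filtering works as well. Materializing and slicing the list is shown
## purely as an illustration; the `limit` argument above is the more direct route.
subset = list(client.list_examples(dataset_name=dataset_name))[:2]
results = evaluate(
    predict,
    data=subset,
    evaluators=[accuracy],
    experiment_prefix="My Experiment",
)
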
# Example 3: Stream each prediction to debug more easily and eagerly
results = evaluate(
    predict,
    data=dataset_name,
    evaluators=[accuracy],
    summary_evaluators=[precision],
    blocking=False,
)
# Consume the stream: each result is yielded as soon as its prediction and
# row-level evaluations finish, so failures surface without waiting for the run.
for i, result in enumerate(results):
    pass

# Example 4: Add evaluation results to an existing set of runs (within an experiment)
def predicted_length(run: Run, example: Example):
"""Row-level evaluator for the length of the prediction."""
return {"score": len(next(iter(run.outputs)))}
def recall(runs: Sequence[Run], examples: Sequence[Example]):
"""Batch-level evaluator for recall."""
predictions = [run.outputs["output"].lower() for run in runs]
expected = [example.outputs["answer"].lower() for example in examples]
tp = sum([p == e for p, e in zip(predictions, expected) if p == "yes"])
fn = sum([p == "no" and e == "yes" for p, e in zip(predictions, expected)])
return {"score": tp / (tp + fn)}
evaluate_existing(
    experiment_name,
    data=dataset_name,
    evaluators=[predicted_length],
    summary_evaluators=[recall],
)

# Example 5: Use off-the-shelf evaluators from LangChain
from langsmith.evaluation import LangChainStringEvaluator
def prepare_criteria_data(run: Run, example: Example):
"""Prepare the data for the criteria evaluator."""
return {
"prediction": run.outputs["output"],
"reference": example.outputs["answer"],
"input": str(example.inputs),
}
results = evaluate(
    predict,
    data=dataset_name,
    evaluators=[
        accuracy,
        # Loads the evaluator from LangChain
        LangChainStringEvaluator("embedding_distance"),
        LangChainStringEvaluator(
            "labeled_criteria",
            config={
                "criteria": {
                    "usefulness": "The prediction is useful if it is correct"
                    " and/or asks a useful followup question."
                },
            },
            prepare_data=prepare_criteria_data,
        ),
    ],
    summary_evaluators=[precision],
)
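
## Hedged variant: "string_distance" is another standard LangChain string
## evaluator name (it also needs the `rapidfuzz` package at runtime). Like
## "embedding_distance" above it is used without a prepare_data hook, relying
## on the same default field mapping; verify against your installed versions.
results = evaluate(
    predict,
    data=dataset_name,
    evaluators=[LangChainStringEvaluator("string_distance")],
    summary_evaluators=[precision],
)
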
# Example 6: Evaluate a LangChain object
from langchain_core.runnables import chain as as_runnable
@as_runnable
def nested_predict(inputs):
"""Object can be nested."""
return {"output": "Yes"}
@as_runnable
def lc_predict(inputs):
"""LangChain runnable object."""
return nested_predict.invoke(inputs)
results = evaluate(
    lc_predict.invoke,
    data=dataset_name,
    evaluators=[accuracy],
    summary_evaluators=[precision],
)
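
## Hedged sketch: runnables compose with the `|` operator, and the composed
## pipeline is evaluated the same way through its .invoke method. The
## post-processing lambda here is an assumed stand-in for a real downstream step.
pipeline = lc_predict | (lambda out: {"output": out["output"].strip()})
results = evaluate(
    pipeline.invoke,
    data=dataset_name,
    evaluators=[accuracy],
    summary_evaluators=[precision],
)
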
# TODO: Comparing against an EXISTING test run
# I think this should be a separate API honestly to remove ambiguity,
# since you may want to mix ground truth "correctness" metrics with
# other metrics that are not directly comparable to the ground truth.
# Maybe something like
# def preferred(target: Run, baseline: Run):
#     return {"score": target.outputs["output"] > baseline.outputs["output"]}
# evaluate_relative(predict, baseline=experiment_name/id/class,
#     evaluators=[predicted_length])
# TODO: Evaluate against RAW PREDICTIONS
# If I have a set of predictions like {"output": "foo"}
# Evaluate COULD create a new experiment with those predictions and then
# call the evaluators.
# This could be useful if you wanted to just run inference separately.