[Python] evaluate() #542

Merged
merged 36 commits into main from wfh/eval2.0 on Mar 27, 2024
Changes from 28 commits
Commits (36)
a3b9826
V2 API Test
hinthornw Mar 23, 2024
e553790
Merge branch 'main' into wfh/eval2.0
hinthornw Mar 25, 2024
0f74a98
Wrapper test
hinthornw Mar 25, 2024
671310e
Rename + add subclasses
hinthornw Mar 25, 2024
aef9065
Add example
hinthornw Mar 25, 2024
4e0c221
Update docs
hinthornw Mar 25, 2024
078b897
gs
hinthornw Mar 25, 2024
10d2398
Add example
hinthornw Mar 25, 2024
bff5758
Add ability to do a prepare_data fn
hinthornw Mar 25, 2024
cbbc5d2
Add examples
hinthornw Mar 25, 2024
b9567d3
context_run
hinthornw Mar 26, 2024
3a0d5d8
Always trace
hinthornw Mar 26, 2024
d4c4648
Context run
hinthornw Mar 26, 2024
d0dcc96
Pass client
hinthornw Mar 26, 2024
319bb62
Nits
hinthornw Mar 26, 2024
e566093
pyp
hinthornw Mar 26, 2024
9c8fb54
Merge branch 'main' into wfh/eval2.0
hinthornw Mar 26, 2024
df06219
Update
hinthornw Mar 26, 2024
26ec444
Use experiment-prefix
hinthornw Mar 26, 2024
b516b7f
Prep data
hinthornw Mar 26, 2024
c497800
format
hinthornw Mar 26, 2024
aee4091
Client
hinthornw Mar 26, 2024
a5d5581
doctest
hinthornw Mar 26, 2024
9871522
Warn beta
hinthornw Mar 26, 2024
20f9722
rm examples
hinthornw Mar 26, 2024
4b06c34
Update
hinthornw Mar 26, 2024
b2dbda4
dep
hinthornw Mar 26, 2024
59fab3f
Integration tests add tiktoken
hinthornw Mar 26, 2024
24ccb1b
Update
hinthornw Mar 26, 2024
eb3ff0d
Add comments
hinthornw Mar 26, 2024
0fe3a19
comment
hinthornw Mar 26, 2024
483b26b
default shallow
hinthornw Mar 27, 2024
490a2d5
test
hinthornw Mar 27, 2024
4ba3988
rapidly fuzz
hinthornw Mar 27, 2024
fe9f922
Bump
hinthornw Mar 27, 2024
7c93475
sesh
hinthornw Mar 27, 2024
15 changes: 14 additions & 1 deletion .github/actions/python-integration-tests/action.yml
@@ -10,6 +10,9 @@ inputs:
openai-api-key:
description: "OpenAI API key"
required: false
anthropic-api-key:
description: "Anthropic API key"
required: false
runs:
using: "composite"
steps:
@@ -30,7 +33,7 @@ runs:
- name: Install dependencies
run: |
poetry install --with dev
poetry run pip install -U langchain
poetry run pip install -U langchain langchain_anthropic tiktoken
shell: bash
working-directory: python

@@ -42,3 +45,13 @@ runs:
run: make integration_tests_fast
shell: bash
working-directory: python

- name: Run doctest
env:
LANGCHAIN_TRACING_V2: "true"
LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }}
OPENAI_API_KEY: ${{ inputs.openai-api-key }}
ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
run: make doctest
shell: bash
working-directory: python
1 change: 1 addition & 0 deletions .github/workflows/integration_tests.yml
Expand Up @@ -49,6 +49,7 @@ jobs:
python-version: 3.11
langchain-api-key: ${{ secrets.LANGCHAIN_API_KEY }}
openai-api-key: ${{ secrets.OPENAI_API_KEY }}
anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}

js_integration_test:
name: JS Integration Test
3 changes: 3 additions & 0 deletions python/Makefile
@@ -12,6 +12,9 @@ integration_tests:
integration_tests_fast:
poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests

doctest:
poetry run pytest -n auto --durations=10 --doctest-modules langsmith

lint:
poetry run ruff .
poetry run mypy .
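The new doctest target runs pytest with --doctest-modules across the langsmith package, so examples embedded in docstrings are executed as tests in CI (this is what the "Run doctest" step in the composite action above invokes). A minimal sketch of the kind of docstring it collects (a hypothetical helper, not part of this diff):

def add(a: int, b: int) -> int:
    """Add two integers.

    >>> add(2, 3)
    5
    """
    return a + b
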
3 changes: 2 additions & 1 deletion python/langsmith/beta/__init__.py
@@ -1,5 +1,6 @@
"""Beta functionality prone to change."""

from langsmith.beta._evals import compute_test_metrics, convert_runs_to_test
from langsmith.beta._utils import warn_beta

__all__ = ["convert_runs_to_test", "compute_test_metrics"]
__all__ = ["convert_runs_to_test", "compute_test_metrics", "warn_beta"]
30 changes: 24 additions & 6 deletions python/langsmith/client.py
@@ -49,11 +49,12 @@
from langsmith import env as ls_env
from langsmith import schemas as ls_schemas
from langsmith import utils as ls_utils
from langsmith.evaluation import evaluator as ls_evaluator

if TYPE_CHECKING:
import pandas as pd # type: ignore

from langsmith.evaluation import evaluator as ls_evaluator

logger = logging.getLogger(__name__)
_urllib3_logger = logging.getLogger("urllib3.connectionpool")

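Moving the evaluator import under TYPE_CHECKING (paired with the local import inside _select_eval_results further down) is the standard way to keep the type hints while breaking a runtime circular import between client.py and the evaluation package. An illustrative sketch of the pattern, not the exact client.py code:

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    # Imported only while type checking, so there is no circular import at runtime.
    from langsmith.evaluation import evaluator as ls_evaluator


def select_results(results: "ls_evaluator.EvaluationResults") -> List["ls_evaluator.EvaluationResult"]:
    # Deferred import, executed only when the function is actually called.
    from langsmith.evaluation import evaluator as ls_evaluator  # noqa: F811

    if isinstance(results, ls_evaluator.EvaluationResult):
        return [results]
    return list(results["results"])
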
@@ -728,7 +729,9 @@ def request_with_retries(
args = list(e.args)
msg = args[1] if len(args) > 1 else ""
msg = msg.replace("session", "session (project)")
emsg = "\n".join([args[0]] + [msg] + args[2:])
emsg = "\n".join(
[str(args[0])] + [msg] + [str(arg) for arg in args[2:]]
)
raise ls_utils.LangSmithError(
f"Failed to {request_method} {url} in LangSmith API. {emsg}"
) from e
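The join now coerces each exception arg to str, since the args of a wrapped requests exception are not guaranteed to be strings. A small sketch of the failure this guards against (illustrative values, not taken from the SDK):

args = [ValueError("boom"), "session (project) not found", 42]

# "\n".join([args[0]] + [args[1]] + args[2:])  # TypeError: sequence item 0: expected str instance
emsg = "\n".join([str(args[0])] + [args[1]] + [str(arg) for arg in args[2:]])
print(emsg)  # prints the three parts, one per line
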
@@ -3144,11 +3147,20 @@ def _resolve_example_id(
def _select_eval_results(
self,
results: Union[ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults],
*,
fn_name: Optional[str] = None,
) -> List[ls_evaluator.EvaluationResult]:
from langsmith.evaluation import evaluator as ls_evaluator # noqa: F811

if isinstance(results, ls_evaluator.EvaluationResult):
results_ = [results]
elif isinstance(results, dict) and "results" in results:
results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
elif isinstance(results, dict):
if "results" in results:
results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
else:
results_ = [
ls_evaluator.EvaluationResult(**{"key": fn_name, **results})
]
else:
raise TypeError(
f"Invalid evaluation result type {type(results)}."
@@ -3208,15 +3220,20 @@ def _log_evaluation_feedback(
evaluator_response: Union[
ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults
],
run: ls_schemas.Run,
run: Optional[ls_schemas.Run] = None,
source_info: Optional[Dict[str, Any]] = None,
project_id: Optional[ID_TYPE] = None,
) -> List[ls_evaluator.EvaluationResult]:
results = self._select_eval_results(evaluator_response)
for res in results:
source_info_ = source_info or {}
if res.evaluator_info:
source_info_ = {**res.evaluator_info, **source_info_}
run_id_ = res.target_run_id if res.target_run_id else run.id
run_id_ = None
if res.target_run_id:
run_id_ = res.target_run_id
elif run is not None:
run_id_ = run.id
self.create_feedback(
run_id_,
res.key,
@@ -3227,6 +3244,7 @@
source_info=source_info_,
source_run_id=res.source_run_id,
feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
project_id=project_id,
)
return results

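run is now optional and a project_id can be forwarded to create_feedback. The run id is resolved in priority order; when neither a target run nor a run is supplied it stays None, and the feedback is presumably scoped by project_id instead (e.g. for experiment-level summary metrics). A condensed restatement of that resolution, as the diff reads:

run_id_ = None
if res.target_run_id:
    run_id_ = res.target_run_id  # the evaluator explicitly pointed at a run
elif run is not None:
    run_id_ = run.id  # fall back to the run that was evaluated
# Otherwise run_id_ stays None and project_id (if given) scopes the feedback.
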
5 changes: 5 additions & 0 deletions python/langsmith/evaluation/__init__.py
@@ -1,11 +1,13 @@
"""Evaluation Helpers."""

from langsmith.evaluation._runner import evaluate, evaluate_existing
from langsmith.evaluation.evaluator import (
EvaluationResult,
EvaluationResults,
RunEvaluator,
run_evaluator,
)
from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator
from langsmith.evaluation.string_evaluator import StringEvaluator

__all__ = [
@@ -14,4 +16,7 @@
"EvaluationResults",
"RunEvaluator",
"StringEvaluator",
"evaluate",
"evaluate_existing",
"LangChainStringEvaluator",
]
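
Taken together, these exports form the new top-level entry point this PR introduces. A minimal usage sketch; the dataset name, target function, and keyword arguments below are assumptions drawn from the commit messages rather than the full docstrings:

from langsmith.evaluation import evaluate

def my_app(inputs: dict) -> dict:
    # Stand-in for the system under test.
    return {"output": inputs["question"].upper()}

def exact_match(run, example):
    # Custom evaluator; a bare dict result is accepted per the client.py changes above.
    return {"score": int(run.outputs["output"] == example.outputs["answer"])}

results = evaluate(
    my_app,
    data="my-dataset",  # assumed: name of an existing LangSmith dataset
    evaluators=[exact_match],
    experiment_prefix="evaluate-demo",  # assumed keyword, per the "Use experiment-prefix" commit
)

evaluate_existing appears to follow the same shape but re-scores the runs of a prior experiment instead of invoking a target.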