[Python] Add beta evaluate() (#542)
- [X] base `evaluate` API (usage sketch below)
- [X] wrapper to support off-the-shelf (OTS) evaluators from LangChain
- [X] add examples
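A minimal usage sketch of the new API, assuming a dataset named "my-dataset" whose examples have a `question` input and an `answer` output (the dataset name, field names, and evaluator are hypothetical; evaluators may return an `EvaluationResult` or a plain dict, per the client changes below):

    from langsmith.evaluation import evaluate

    def predict(inputs: dict) -> dict:
        # Hypothetical target: stand-in for a real model or chain call.
        return {"output": inputs["question"].strip()}

    def exact_match(run, example) -> dict:
        # Compare the run's output against the example's reference output.
        return {
            "key": "exact_match",
            "score": run.outputs["output"] == example.outputs["answer"],
        }

    results = evaluate(
        predict,
        data="my-dataset",
        evaluators=[exact_match],
    )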
hinthornw committed Mar 27, 2024
1 parent 660d29c commit 545b7fa
Showing 15 changed files with 2,093 additions and 63 deletions.
15 changes: 14 additions & 1 deletion .github/actions/python-integration-tests/action.yml
@@ -10,6 +10,9 @@ inputs:
  openai-api-key:
    description: "OpenAI API key"
    required: false
+  anthropic-api-key:
+    description: "Anthropic API key"
+    required: false
runs:
  using: "composite"
  steps:
@@ -30,7 +33,7 @@ runs:
    - name: Install dependencies
      run: |
        poetry install --with dev
-        poetry run pip install -U langchain
+        poetry run pip install -U langchain langchain_anthropic tiktoken rapidfuzz
      shell: bash
      working-directory: python

@@ -42,3 +45,13 @@ runs:
      run: make integration_tests_fast
      shell: bash
      working-directory: python
+
+    - name: Run doctest
+      env:
+        LANGCHAIN_TRACING_V2: "true"
+        LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }}
+        OPENAI_API_KEY: ${{ inputs.openai-api-key }}
+        ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
+      run: make doctest
+      shell: bash
+      working-directory: python
1 change: 1 addition & 0 deletions .github/workflows/integration_tests.yml
@@ -49,6 +49,7 @@ jobs:
          python-version: 3.11
          langchain-api-key: ${{ secrets.LANGCHAIN_API_KEY }}
          openai-api-key: ${{ secrets.OPENAI_API_KEY }}
+          anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}

  js_integration_test:
    name: JS Integration Test
3 changes: 3 additions & 0 deletions python/Makefile
@@ -12,6 +12,9 @@ integration_tests:
integration_tests_fast:
	poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests

+doctest:
+	poetry run pytest -n auto --durations=10 --doctest-modules langsmith
+
lint:
	poetry run ruff .
	poetry run mypy .
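The new doctest target runs pytest's doctest collection over the langsmith package, so examples embedded in module docstrings are executed in CI (hence the API keys wired into the doctest step above). An illustrative docstring, not taken from this repo, in the style --doctest-modules picks up:

    def reverse_text(text: str) -> str:
        """Reverse a string.

        >>> reverse_text("abc")
        'cba'
        """
        return text[::-1]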
3 changes: 2 additions & 1 deletion python/langsmith/beta/__init__.py
@@ -1,5 +1,6 @@
"""Beta functionality prone to change."""

from langsmith.beta._evals import compute_test_metrics, convert_runs_to_test
+from langsmith.beta._utils import warn_beta

-__all__ = ["convert_runs_to_test", "compute_test_metrics"]
+__all__ = ["convert_runs_to_test", "compute_test_metrics", "warn_beta"]
30 changes: 24 additions & 6 deletions python/langsmith/client.py
@@ -49,11 +49,12 @@
from langsmith import env as ls_env
from langsmith import schemas as ls_schemas
from langsmith import utils as ls_utils
-from langsmith.evaluation import evaluator as ls_evaluator

if TYPE_CHECKING:
    import pandas as pd  # type: ignore

+    from langsmith.evaluation import evaluator as ls_evaluator

logger = logging.getLogger(__name__)
_urllib3_logger = logging.getLogger("urllib3.connectionpool")

@@ -728,7 +729,9 @@ def request_with_retries(
            args = list(e.args)
            msg = args[1] if len(args) > 1 else ""
            msg = msg.replace("session", "session (project)")
-            emsg = "\n".join([args[0]] + [msg] + args[2:])
+            emsg = "\n".join(
+                [str(args[0])] + [msg] + [str(arg) for arg in args[2:]]
+            )
            raise ls_utils.LangSmithError(
                f"Failed to {request_method} {url} in LangSmith API. {emsg}"
            ) from e
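The old join assumed every element of e.args was already a string, but exception args frequently carry nested exception objects, in which case str.join raises its own TypeError and masks the real error. An illustrative repro of the failure mode the str() coercion avoids:

    args = [ConnectionError("connection reset"), "while requesting /runs"]
    "\n".join(args)                    # TypeError: sequence item 0: expected str instance
    "\n".join(str(a) for a in args)    # 'connection reset\nwhile requesting /runs'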
@@ -3144,11 +3147,20 @@ def _resolve_example_id(
    def _select_eval_results(
        self,
        results: Union[ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults],
+        *,
+        fn_name: Optional[str] = None,
    ) -> List[ls_evaluator.EvaluationResult]:
+        from langsmith.evaluation import evaluator as ls_evaluator  # noqa: F811
+
        if isinstance(results, ls_evaluator.EvaluationResult):
            results_ = [results]
-        elif isinstance(results, dict) and "results" in results:
-            results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
+        elif isinstance(results, dict):
+            if "results" in results:
+                results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
+            else:
+                results_ = [
+                    ls_evaluator.EvaluationResult(**{"key": fn_name, **results})
+                ]
        else:
            raise TypeError(
                f"Invalid evaluation result type {type(results)}."
@@ -3208,15 +3220,20 @@ def _log_evaluation_feedback(
        evaluator_response: Union[
            ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults
        ],
-        run: ls_schemas.Run,
+        run: Optional[ls_schemas.Run] = None,
        source_info: Optional[Dict[str, Any]] = None,
+        project_id: Optional[ID_TYPE] = None,
    ) -> List[ls_evaluator.EvaluationResult]:
        results = self._select_eval_results(evaluator_response)
        for res in results:
            source_info_ = source_info or {}
            if res.evaluator_info:
                source_info_ = {**res.evaluator_info, **source_info_}
-            run_id_ = res.target_run_id if res.target_run_id else run.id
+            run_id_ = None
+            if res.target_run_id:
+                run_id_ = res.target_run_id
+            elif run is not None:
+                run_id_ = run.id
            self.create_feedback(
                run_id_,
                res.key,
@@ -3227,6 +3244,7 @@
                source_info=source_info_,
                source_run_id=res.source_run_id,
                feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
+                project_id=project_id,
            )
        return results

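Taken together, _select_eval_results now accepts three result shapes, and _log_evaluation_feedback can resolve the feedback target from the result itself when no run is passed. An illustrative summary of the accepted shapes (keys and values here are made up):

    from langsmith.evaluation import EvaluationResult

    # 1. A single EvaluationResult:
    EvaluationResult(key="exact_match", score=1)

    # 2. An EvaluationResults dict with an explicit "results" list:
    {"results": [EvaluationResult(key="f1", score=0.5)]}

    # 3. A bare dict, now wrapped as EvaluationResult(**{"key": fn_name, **results}):
    {"score": 0.7, "comment": "close enough"}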
5 changes: 5 additions & 0 deletions python/langsmith/evaluation/__init__.py
@@ -1,11 +1,13 @@
"""Evaluation Helpers."""

+from langsmith.evaluation._runner import evaluate, evaluate_existing
from langsmith.evaluation.evaluator import (
    EvaluationResult,
    EvaluationResults,
    RunEvaluator,
    run_evaluator,
)
+from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator
from langsmith.evaluation.string_evaluator import StringEvaluator

__all__ = [
@@ -14,4 +16,7 @@
"EvaluationResults",
"RunEvaluator",
"StringEvaluator",
"evaluate",
"evaluate_existing",
"LangChainStringEvaluator",
]
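LangChainStringEvaluator is the wrapper that exposes LangChain's off-the-shelf string evaluators to evaluate(). A sketch, assuming the evaluator name and config are passed through to LangChain's load_evaluator (the dataset name and target are hypothetical):

    from langsmith.evaluation import LangChainStringEvaluator, evaluate

    # "criteria" with a conciseness criterion is a LangChain OTS evaluator;
    # the (name, config=...) constructor shape is an assumption of this sketch.
    conciseness = LangChainStringEvaluator("criteria", config={"criteria": "conciseness"})

    results = evaluate(
        lambda inputs: {"output": inputs["question"][:100]},  # hypothetical target
        data="my-dataset",                                    # hypothetical dataset
        evaluators=[conciseness],
    )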
