[Python] Add beta evaluate() (#542)
- [X] base `evaluate` API (usage sketch below)
- [X] wrapper to support off-the-shelf (OTS) evaluators from LangChain
- [X] add examples
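A minimal usage sketch of the new API, assuming a dataset named "my-dataset" whose examples have a `question` input and an `answer` output (the dataset name, field names, and evaluator are hypothetical; evaluators may return an `EvaluationResult` or a plain dict, per the client changes below):

    from langsmith.evaluation import evaluate

    def predict(inputs: dict) -> dict:
        # Hypothetical target: stand-in for a real model or chain call.
        return {"output": inputs["question"].strip()}

    def exact_match(run, example) -> dict:
        # Compare the run's output against the example's reference output.
        return {
            "key": "exact_match",
            "score": run.outputs["output"] == example.outputs["answer"],
        }

    results = evaluate(
        predict,
        data="my-dataset",
        evaluators=[exact_match],
    )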
hinthornw committed Mar 27, 2024
1 parent 660d29c commit 545b7fa
Showing 15 changed files with 2,093 additions and 63 deletions.
15 changes: 14 additions & 1 deletion .github/actions/python-integration-tests/action.yml
@@ -10,6 +10,9 @@ inputs:
  openai-api-key:
    description: "OpenAI API key"
    required: false
+  anthropic-api-key:
+    description: "Anthropic API key"
+    required: false
runs:
  using: "composite"
  steps:
@@ -30,7 +33,7 @@ runs:
    - name: Install dependencies
      run: |
        poetry install --with dev
-        poetry run pip install -U langchain
+        poetry run pip install -U langchain langchain_anthropic tiktoken rapidfuzz
      shell: bash
      working-directory: python

@@ -42,3 +45,13 @@ runs:
      run: make integration_tests_fast
      shell: bash
      working-directory: python
+
+    - name: Run doctest
+      env:
+        LANGCHAIN_TRACING_V2: "true"
+        LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }}
+        OPENAI_API_KEY: ${{ inputs.openai-api-key }}
+        ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
+      run: make doctest
+      shell: bash
+      working-directory: python
1 change: 1 addition & 0 deletions .github/workflows/integration_tests.yml
@@ -49,6 +49,7 @@ jobs:
          python-version: 3.11
          langchain-api-key: ${{ secrets.LANGCHAIN_API_KEY }}
          openai-api-key: ${{ secrets.OPENAI_API_KEY }}
+          anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}

  js_integration_test:
    name: JS Integration Test
3 changes: 3 additions & 0 deletions python/Makefile
@@ -12,6 +12,9 @@ integration_tests:
integration_tests_fast:
	poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests

+doctest:
+	poetry run pytest -n auto --durations=10 --doctest-modules langsmith
+
lint:
	poetry run ruff .
	poetry run mypy .
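The new doctest target runs pytest's doctest collection over the langsmith package, so examples embedded in module docstrings are executed in CI (hence the API keys wired into the doctest step above). An illustrative docstring, not taken from this repo, in the style --doctest-modules picks up:

    def reverse_text(text: str) -> str:
        """Reverse a string.

        >>> reverse_text("abc")
        'cba'
        """
        return text[::-1]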
3 changes: 2 additions & 1 deletion python/langsmith/beta/__init__.py
@@ -1,5 +1,6 @@
"""Beta functionality prone to change."""

from langsmith.beta._evals import compute_test_metrics, convert_runs_to_test
+from langsmith.beta._utils import warn_beta

-__all__ = ["convert_runs_to_test", "compute_test_metrics"]
+__all__ = ["convert_runs_to_test", "compute_test_metrics", "warn_beta"]
30 changes: 24 additions & 6 deletions python/langsmith/client.py
@@ -49,11 +49,12 @@
from langsmith import env as ls_env
from langsmith import schemas as ls_schemas
from langsmith import utils as ls_utils
-from langsmith.evaluation import evaluator as ls_evaluator

if TYPE_CHECKING:
    import pandas as pd  # type: ignore

+    from langsmith.evaluation import evaluator as ls_evaluator

logger = logging.getLogger(__name__)
_urllib3_logger = logging.getLogger("urllib3.connectionpool")

@@ -728,7 +729,9 @@ def request_with_retries(
            args = list(e.args)
            msg = args[1] if len(args) > 1 else ""
            msg = msg.replace("session", "session (project)")
-            emsg = "\n".join([args[0]] + [msg] + args[2:])
+            emsg = "\n".join(
+                [str(args[0])] + [msg] + [str(arg) for arg in args[2:]]
+            )
            raise ls_utils.LangSmithError(
                f"Failed to {request_method} {url} in LangSmith API. {emsg}"
            ) from e
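The old join assumed every element of e.args was already a string, but exception args frequently carry nested exception objects, in which case str.join raises its own TypeError and masks the real error. An illustrative repro of the failure mode the str() coercion avoids:

    args = [ConnectionError("connection reset"), "while requesting /runs"]
    "\n".join(args)                    # TypeError: sequence item 0: expected str instance
    "\n".join(str(a) for a in args)    # 'connection reset\nwhile requesting /runs'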
@@ -3144,11 +3147,20 @@ def _resolve_example_id(
    def _select_eval_results(
        self,
        results: Union[ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults],
+        *,
+        fn_name: Optional[str] = None,
    ) -> List[ls_evaluator.EvaluationResult]:
+        from langsmith.evaluation import evaluator as ls_evaluator  # noqa: F811
+
        if isinstance(results, ls_evaluator.EvaluationResult):
            results_ = [results]
-        elif isinstance(results, dict) and "results" in results:
-            results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
+        elif isinstance(results, dict):
+            if "results" in results:
+                results_ = cast(List[ls_evaluator.EvaluationResult], results["results"])
+            else:
+                results_ = [
+                    ls_evaluator.EvaluationResult(**{"key": fn_name, **results})
+                ]
        else:
            raise TypeError(
                f"Invalid evaluation result type {type(results)}."
@@ -3208,15 +3220,20 @@ def _log_evaluation_feedback(
        evaluator_response: Union[
            ls_evaluator.EvaluationResult, ls_evaluator.EvaluationResults
        ],
-        run: ls_schemas.Run,
+        run: Optional[ls_schemas.Run] = None,
        source_info: Optional[Dict[str, Any]] = None,
+        project_id: Optional[ID_TYPE] = None,
    ) -> List[ls_evaluator.EvaluationResult]:
        results = self._select_eval_results(evaluator_response)
        for res in results:
            source_info_ = source_info or {}
            if res.evaluator_info:
                source_info_ = {**res.evaluator_info, **source_info_}
-            run_id_ = res.target_run_id if res.target_run_id else run.id
+            run_id_ = None
+            if res.target_run_id:
+                run_id_ = res.target_run_id
+            elif run is not None:
+                run_id_ = run.id
            self.create_feedback(
                run_id_,
                res.key,
@@ -3227,6 +3244,7 @@
                source_info=source_info_,
                source_run_id=res.source_run_id,
                feedback_source_type=ls_schemas.FeedbackSourceType.MODEL,
+                project_id=project_id,
            )
        return results

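Taken together, _select_eval_results now accepts three result shapes, and _log_evaluation_feedback can resolve the feedback target from the result itself when no run is passed. An illustrative summary of the accepted shapes (keys and values here are made up):

    from langsmith.evaluation import EvaluationResult

    # 1. A single EvaluationResult:
    EvaluationResult(key="exact_match", score=1)

    # 2. An EvaluationResults dict with an explicit "results" list:
    {"results": [EvaluationResult(key="f1", score=0.5)]}

    # 3. A bare dict, now wrapped as EvaluationResult(**{"key": fn_name, **results}):
    {"score": 0.7, "comment": "close enough"}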
5 changes: 5 additions & 0 deletions python/langsmith/evaluation/__init__.py
@@ -1,11 +1,13 @@
"""Evaluation Helpers."""

+from langsmith.evaluation._runner import evaluate, evaluate_existing
from langsmith.evaluation.evaluator import (
    EvaluationResult,
    EvaluationResults,
    RunEvaluator,
    run_evaluator,
)
+from langsmith.evaluation.integrations._langchain import LangChainStringEvaluator
from langsmith.evaluation.string_evaluator import StringEvaluator

__all__ = [
@@ -14,4 +16,7 @@
"EvaluationResults",
"RunEvaluator",
"StringEvaluator",
"evaluate",
"evaluate_existing",
"LangChainStringEvaluator",
]
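LangChainStringEvaluator is the wrapper that exposes LangChain's off-the-shelf string evaluators to evaluate(). A sketch, assuming the evaluator name and config are passed through to LangChain's load_evaluator (the dataset name and target are hypothetical):

    from langsmith.evaluation import LangChainStringEvaluator, evaluate

    # "criteria" with a conciseness criterion is a LangChain OTS evaluator;
    # the (name, config=...) constructor shape is an assumption of this sketch.
    conciseness = LangChainStringEvaluator("criteria", config={"criteria": "conciseness"})

    results = evaluate(
        lambda inputs: {"output": inputs["question"][:100]},  # hypothetical target
        data="my-dataset",                                    # hypothetical dataset
        evaluators=[conciseness],
    )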
