Commit

Update

hinthornw committed Mar 26, 2024
1 parent 20f9722 commit 4b06c34
Showing 4 changed files with 84 additions and 83 deletions.
2 changes: 1 addition & 1 deletion python/Makefile
@@ -13,7 +13,7 @@ integration_tests_fast:
poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests

doctest:
poetry run pytest --doctest-modules langsmith/evaluation/_runner.py
poetry run pytest -n auto --durations=10 --doctest-modules langsmith

lint:
poetry run ruff .
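For context, the expanded doctest target leans on pytest's `--doctest-modules` collection, which executes the `>>>` examples embedded in docstrings across the whole `langsmith` package instead of only `_runner.py`. A minimal sketch of the kind of docstring example this target now picks up (the module and function names are hypothetical, not part of this commit):

    # hypothetical module: langsmith/example_utils.py
    def normalize_score(score: float, max_score: float = 10.0) -> float:
        """Scale a raw score into the 0-1 range.

        >>> normalize_score(7.5)
        0.75
        """
        return score / max_score

Running `make doctest` from the `python/` directory would then execute this example alongside the package's other docstring examples, in parallel thanks to `-n auto`.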
2 changes: 1 addition & 1 deletion python/langsmith/evaluation/_runner.py
@@ -28,11 +28,11 @@
from typing_extensions import TypedDict

import langsmith
from langsmith import beta as ls_beta
from langsmith import env as ls_env
from langsmith import run_helpers as rh
from langsmith import run_trees, schemas
from langsmith import utils as ls_utils
from langsmith import beta as ls_beta
from langsmith.evaluation.evaluator import (
EvaluationResult,
EvaluationResults,
4 changes: 4 additions & 0 deletions python/langsmith/evaluation/evaluator.py
@@ -194,6 +194,10 @@ def __call__(
""" # noqa: E501
return self.evaluate_run(run, example)

def __repr__(self) -> str:
"""String representation of the DynamicRunEvaluator object."""
return f"<DynamicRunEvaluator {self.func.__name__}>"


def run_evaluator(
func: Callable[
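The new `__repr__` is what makes evaluator objects legible in doctests and logs: instead of the default object repr, the wrapper now reports the name of the function it wraps. A quick sketch of the effect, mirroring the `as_run_evaluator` example in the `_langchain.py` docstring below (the exact name shown depends on the wrapped function):

    from langsmith.evaluation import LangChainStringEvaluator

    evaluator = LangChainStringEvaluator("exact_match")
    run_evaluator = evaluator.as_run_evaluator()

    # Previously: <langsmith.evaluation.evaluator.DynamicRunEvaluator object at 0x...>
    # Now something like: <DynamicRunEvaluator ...>, ending in the wrapped function's name
    print(repr(run_evaluator))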
159 changes: 78 additions & 81 deletions python/langsmith/evaluation/integrations/_langchain.py
@@ -37,89 +37,86 @@ class LangChainStringEvaluator:
Examples:
Creating a simple LangChainStringEvaluator:
.. code-block:: python
evaluator = LangChainStringEvaluator("exact_match")
>>> evaluator = LangChainStringEvaluator("exact_match")
Converting a LangChainStringEvaluator to a RunEvaluator:
.. code-block:: python
from langsmith.evaluation import LangChainStringEvaluator
evaluator = LangChainStringEvaluator(
"criteria",
config={
"criteria": {
"usefulness": "The prediction is useful if"
" it is correct and/or asks a useful followup question."
},
)
run_evaluator = evaluator.as_run_evaluator()
>>> from langsmith.evaluation import LangChainStringEvaluator
>>> evaluator = LangChainStringEvaluator(
... "criteria",
... config={
... "criteria": {
... "usefulness": "The prediction is useful if"
... " it is correct and/or asks a useful followup question."
... },
... }
... )
>>> run_evaluator = evaluator.as_run_evaluator()
>>> run_evaluator # doctest: +ELLIPSIS
<DynamicRunEvaluator ...>
Using the `evaluate` API with different evaluators:
.. code-block:: python
from langchain_anthropic import ChatAnthropic
import langsmith
from langsmith.evaluation import LangChainStringEvaluator, evaluate
# Criteria evaluator
criteria_evaluator = LangChainStringEvaluator(
"criteria", config={
"criteria": {
"usefulness": "The prediction is useful if it is correct"
" and/or asks a useful followup question."
},
"llm": ChatAnthropic(model="claude-3-opus-20240229")
}
)
# Embedding distance evaluator
embedding_evaluator = LangChainStringEvaluator("embedding_distance")
# Exact match evaluator
exact_match_evaluator = LangChainStringEvaluator("exact_match")
# Regex match evaluator
regex_match_evaluator = LangChainStringEvaluator(
"regex_match", config={
"flags": re.IGNORECASE
}
)
# Scoring evaluator
scoring_evaluator = LangChainStringEvaluator(
"scoring", config={
"criteria": {
"accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
},
"normalize_by": 10
}
)
# String distance evaluator
string_distance_evaluator = LangChainStringEvaluator(
"string_distance", config={
"distance_metric": "levenshtein"
}
)
results = evaluate(
lambda inputs: {"prediction": "foo"},
data="my-dataset",
evaluators=[
embedding_evaluator,
criteria_evaluator,
exact_match_evaluator,
regex_match_evaluator,
scoring_evaluator,
string_distance_evaluator
],
batch_evaluators=[equal_length],
)
>>> def prepare_data(run: Run, example: Example):
... # Convert the evaluation data into the format expected by the evaluator
... # Only required for datasets with multiple inputs/output keys
... return {
... "prediction": run.outputs["prediction"],
... "reference": example.outputs["answer"],
... "input": str(example.inputs),
... }
...
>>> import re
>>> from langchain_anthropic import ChatAnthropic
>>> import langsmith
>>> from langsmith.evaluation import LangChainStringEvaluator, evaluate
>>> criteria_evaluator = LangChainStringEvaluator(
... "criteria", config={
... "criteria": {
... "usefulness": "The prediction is useful if it is correct"
... " and/or asks a useful followup question."
... },
... "llm": ChatAnthropic(model="claude-3-opus-20240229")
... },
... prepare_data=prepare_data
... )
>>> embedding_evaluator = LangChainStringEvaluator("embedding_distance")
>>> exact_match_evaluator = LangChainStringEvaluator("exact_match")
>>> regex_match_evaluator = LangChainStringEvaluator(
... "regex_match", config={
... "flags": re.IGNORECASE
... },
... prepare_data=prepare_data
... )
>>> scoring_evaluator = LangChainStringEvaluator(
... "labeled_score_string", config={
... "criteria": {
... "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
... },
... "normalize_by": 10
... },
... prepare_data=prepare_data
... )
>>> string_distance_evaluator = LangChainStringEvaluator(
... "string_distance", config={
... "distance_metric": "levenshtein"
... },
... prepare_data=prepare_data
... )
>>> from langsmith import Client
>>> client = Client()
>>> results = evaluate(
... lambda inputs: {"prediction": "foo"},
... data=client.list_examples(dataset_name="Evaluate Examples", limit=1),
... evaluators=[
... embedding_evaluator,
... criteria_evaluator,
... exact_match_evaluator,
... regex_match_evaluator,
... scoring_evaluator,
... string_distance_evaluator
... ],
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501

def __init__(
@@ -187,7 +184,7 @@ def prepare_evaluator_inputs(
) -> SingleEvaluatorInput:
if run.outputs and len(run.outputs) > 1:
raise ValueError(
"The evaluator only supports a single output. "
f"Evaluator {self.evaluator} only supports a single output. "
"Please ensure that the run has a single output."
" Or create a custom evaluator yourself:\n\n"
f"{customization_error_str}"
@@ -199,7 +196,7 @@ def prepare_evaluator_inputs(
and len(example.outputs) > 1
):
raise ValueError(
"The evaluator only supports a single output. "
f"Evaluator {self.evaluator} nly supports a single output. "
"Please ensure that the example has a single output."
" Or create a custom evaluator yourself:\n\n"
f"{customization_error_str}"
@@ -211,7 +208,7 @@ def prepare_evaluator_inputs(
and len(example.inputs) > 1
):
raise ValueError(
"The evaluator only supports a single input. "
f"Evaluator {self.evaluator} only supports a single input. "
"Please ensure that the example has a single input."
" Or create a custom evaluator yourself:\n\n"
f"{customization_error_str}"
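The sharpened error messages above all point to the same escape hatch: when a run or example carries more than one input or output, either pass `prepare_data` (as the updated docstring shows) or write a custom evaluator that selects the fields itself. A hedged sketch of the latter using the `run_evaluator` decorator defined in `evaluator.py` above (assuming it is re-exported from `langsmith.evaluation`, like the other helpers in the docstring); the output and reference keys are hypothetical:

    from langsmith.evaluation import run_evaluator

    @run_evaluator
    def answer_matches_reference(run, example):
        # Select the relevant fields explicitly instead of relying on the
        # single-input/single-output assumption of LangChainStringEvaluator.
        prediction = run.outputs["answer"]        # hypothetical output key
        reference = example.outputs["expected"]   # hypothetical reference key
        score = int(prediction.strip().lower() == reference.strip().lower())
        return {"key": "answer_matches_reference", "score": score}

Such an evaluator can then be passed in the same `evaluators=[...]` list as the wrapped LangChain evaluators.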
