From 4b06c345dd42e6500f498db9e28a16bde7f9cdda Mon Sep 17 00:00:00 2001
From: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com>
Date: Tue, 26 Mar 2024 09:17:48 -0700
Subject: [PATCH] Update

---
 python/Makefile                                  |   2 +-
 python/langsmith/evaluation/_runner.py           |   2 +-
 python/langsmith/evaluation/evaluator.py         |   4 +
 .../evaluation/integrations/_langchain.py        | 159 +++++++++---------
 4 files changed, 84 insertions(+), 83 deletions(-)

diff --git a/python/Makefile b/python/Makefile
index 15d45e0a..a50228cc 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -13,7 +13,7 @@ integration_tests_fast:
 	poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests
 
 doctest:
-	poetry run pytest --doctest-modules langsmith/evaluation/_runner.py
+	poetry run pytest -n auto --durations=10 --doctest-modules langsmith
 
 lint:
 	poetry run ruff .

diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index cae607e2..03390dd2 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -28,11 +28,11 @@ from typing_extensions import TypedDict
 
 import langsmith
+from langsmith import beta as ls_beta
 from langsmith import env as ls_env
 from langsmith import run_helpers as rh
 from langsmith import run_trees, schemas
 from langsmith import utils as ls_utils
-from langsmith import beta as ls_beta
 from langsmith.evaluation.evaluator import (
     EvaluationResult,
     EvaluationResults,

diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py
index 24f280ef..e56e00bd 100644
--- a/python/langsmith/evaluation/evaluator.py
+++ b/python/langsmith/evaluation/evaluator.py
@@ -194,6 +194,10 @@ def __call__(
         """  # noqa: E501
         return self.evaluate_run(run, example)
 
+    def __repr__(self) -> str:
+        """String representation of the DynamicRunEvaluator object."""
+        return f"<DynamicRunEvaluator {getattr(self.func, '__name__', 'anonymous')}>"
+
 
 def run_evaluator(
     func: Callable[

diff --git a/python/langsmith/evaluation/integrations/_langchain.py b/python/langsmith/evaluation/integrations/_langchain.py
index 39da628f..43a03d64 100644
--- a/python/langsmith/evaluation/integrations/_langchain.py
+++ b/python/langsmith/evaluation/integrations/_langchain.py
@@ -37,89 +37,86 @@ class LangChainStringEvaluator:
 
     Examples:
         Creating a simple LangChainStringEvaluator:
 
-            .. code-block:: python
-
-                evaluator = LangChainStringEvaluator("exact_match")
+            >>> evaluator = LangChainStringEvaluator("exact_match")
 
         Converting a LangChainStringEvaluator to a RunEvaluator:
 
-            .. code-block:: python
-
-                from langsmith.evaluation import LangChainStringEvaluator
-
-                evaluator = LangChainStringEvaluator(
-                    "criteria",
-                    config={
-                        "criteria": {
-                            "usefulness": "The prediction is useful if"
-                            " it is correct and/or asks a useful followup question."
-                        },
-                )
-                run_evaluator = evaluator.as_run_evaluator()
+            >>> from langsmith.evaluation import LangChainStringEvaluator
+            >>> evaluator = LangChainStringEvaluator(
+            ...     "criteria",
+            ...     config={
+            ...         "criteria": {
+            ...             "usefulness": "The prediction is useful if"
+            ...             " it is correct and/or asks a useful followup question."
+            ...         },
+            ...     }
+            ... )
+            >>> run_evaluator = evaluator.as_run_evaluator()
+            >>> run_evaluator  # doctest: +ELLIPSIS
+            <DynamicRunEvaluator ...>
 
         Using the `evaluate` API with different evaluators:
 
-            .. code-block:: python
-
-                from langchain_anthropic import ChatAnthropic
-
-                import langsmith
-                from langsmith.evaluation import LangChainStringEvaluator, evaluate
-
-                # Criteria evaluator
-                criteria_evaluator = LangChainStringEvaluator(
-                    "criteria", config={
-                        "criteria": {
-                            "usefulness": "The prediction is useful if it is correct"
-                            " and/or asks a useful followup question."
-                        },
-                        "llm": ChatAnthropic(model="claude-3-opus-20240229")
-                    }
-                )
-
-                # Embedding distance evaluator
-                embedding_evaluator = LangChainStringEvaluator("embedding_distance")
-
-                # Exact match evaluator
-                exact_match_evaluator = LangChainStringEvaluator("exact_match")
-
-                # Regex match evaluator
-                regex_match_evaluator = LangChainStringEvaluator(
-                    "regex_match", config={
-                        "flags": re.IGNORECASE
-                    }
-                )
-
-                # Scoring evaluator
-                scoring_evaluator = LangChainStringEvaluator(
-                    "scoring", config={
-                        "criteria": {
-                            "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
-                        },
-                        "normalize_by": 10
-                    }
-                )
-
-                # String distance evaluator
-                string_distance_evaluator = LangChainStringEvaluator(
-                    "string_distance", config={
-                        "distance_metric": "levenshtein"
-                    }
-                )
-
-                results = evaluate(
-                    lambda inputs: {"prediction": "foo"},
-                    data="my-dataset",
-                    evaluators=[
-                        embedding_evaluator,
-                        criteria_evaluator,
-                        exact_match_evaluator,
-                        regex_match_evaluator,
-                        scoring_evaluator,
-                        string_distance_evaluator
-                    ],
-                    batch_evaluators=[equal_length],
-                )
+            >>> from langsmith.schemas import Example, Run
+            >>> def prepare_data(run: Run, example: Example):
+            ...     # Convert the evaluation data into the format expected by the evaluator.
+            ...     # Only required for datasets with multiple input/output keys.
+            ...     return {
+            ...         "prediction": run.outputs["prediction"],
+            ...         "reference": example.outputs["answer"],
+            ...         "input": str(example.inputs),
+            ...     }
+            ...
+            >>> import re
+            >>> from langchain_anthropic import ChatAnthropic
+            >>> import langsmith
+            >>> from langsmith.evaluation import LangChainStringEvaluator, evaluate
+            >>> criteria_evaluator = LangChainStringEvaluator(
+            ...     "criteria", config={
+            ...         "criteria": {
+            ...             "usefulness": "The prediction is useful if it is correct"
+            ...             " and/or asks a useful followup question."
+            ...         },
+            ...         "llm": ChatAnthropic(model="claude-3-opus-20240229")
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> embedding_evaluator = LangChainStringEvaluator("embedding_distance")
+            >>> exact_match_evaluator = LangChainStringEvaluator("exact_match")
+            >>> regex_match_evaluator = LangChainStringEvaluator(
+            ...     "regex_match", config={
+            ...         "flags": re.IGNORECASE
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> scoring_evaluator = LangChainStringEvaluator(
+            ...     "labeled_score_string", config={
+            ...         "criteria": {
+            ...             "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
+            ...         },
+            ...         "normalize_by": 10
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> string_distance_evaluator = LangChainStringEvaluator(
+            ...     "string_distance", config={
+            ...         "distance_metric": "levenshtein"
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> from langsmith import Client
+            >>> client = Client()
+            >>> results = evaluate(
+            ...     lambda inputs: {"prediction": "foo"},
+            ...     data=client.list_examples(dataset_name="Evaluate Examples", limit=1),
+            ...     evaluators=[
+            ...         embedding_evaluator,
+            ...         criteria_evaluator,
+            ...         exact_match_evaluator,
+            ...         regex_match_evaluator,
+            ...         scoring_evaluator,
+            ...         string_distance_evaluator
+            ...     ],
+            ... )  # doctest: +ELLIPSIS
+            View the evaluation results for experiment:...
     """  # noqa: E501
 
     def __init__(
@@ -187,7 +184,7 @@ def prepare_evaluator_inputs(
         ) -> SingleEvaluatorInput:
             if run.outputs and len(run.outputs) > 1:
                 raise ValueError(
-                    "The evaluator only supports a single output. "
+                    f"Evaluator {self.evaluator} only supports a single output. "
                     "Please ensure that the run has a single output."
                     " Or create a custom evaluator yourself:\n\n"
                     f"{customization_error_str}"
@@ -199,7 +196,7 @@ def prepare_evaluator_inputs(
                 and len(example.outputs) > 1
             ):
                 raise ValueError(
-                    "The evaluator only supports a single output. "
+                    f"Evaluator {self.evaluator} only supports a single output. "
                     "Please ensure that the example has a single output."
                     " Or create a custom evaluator yourself:\n\n"
                     f"{customization_error_str}"
@@ -211,7 +208,7 @@ def prepare_evaluator_inputs(
                 and len(example.inputs) > 1
             ):
                 raise ValueError(
-                    "The evaluator only supports a single input. "
+                    f"Evaluator {self.evaluator} only supports a single input. "
                     "Please ensure that the example has a single input."
                     " Or create a custom evaluator yourself:\n\n"
                     f"{customization_error_str}"
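
A quick note for reviewers (not part of the patch): the reworded errors above
steer users toward writing a custom evaluator whenever a run or example has
more than one input/output key. A minimal sketch of that escape hatch, using
the existing `run_evaluator` decorator; the dataset key "answer" and the name
`exact_answer` are illustrative assumptions, not something this diff defines:

    from langsmith.evaluation import run_evaluator
    from langsmith.schemas import Example, Run

    @run_evaluator
    def exact_answer(run: Run, example: Example) -> dict:
        # Pick exactly one key from each side ourselves, which avoids the
        # "only supports a single output" ValueError raised above.
        prediction = (run.outputs or {}).get("prediction")
        reference = (example.outputs or {}).get("answer")  # assumed dataset key
        return {
            "key": "exact_answer",
            "score": int(prediction is not None and prediction == reference),
        }

With the `__repr__` added in this patch, the wrapped object also prints a
readable name (e.g. `<DynamicRunEvaluator exact_answer>`) instead of the
default object repr, which makes evaluator lists easier to debug.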