From 4b06c345dd42e6500f498db9e28a16bde7f9cdda Mon Sep 17 00:00:00 2001
From: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com>
Date: Tue, 26 Mar 2024 09:17:48 -0700
Subject: [PATCH] Update

---
 python/Makefile                                  |   2 +-
 python/langsmith/evaluation/_runner.py           |   2 +-
 python/langsmith/evaluation/evaluator.py         |   4 +
 .../evaluation/integrations/_langchain.py        | 159 +++++++++---------
 4 files changed, 84 insertions(+), 83 deletions(-)

diff --git a/python/Makefile b/python/Makefile
index 15d45e0a..a50228cc 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -13,7 +13,7 @@ integration_tests_fast:
 	poetry run pytest -n auto --durations=10 -v --cov=langsmith --cov-report=term-missing --cov-report=html --cov-config=.coveragerc tests/integration_tests
 
 doctest:
-	poetry run pytest --doctest-modules langsmith/evaluation/_runner.py
+	poetry run pytest -n auto --durations=10 --doctest-modules langsmith
 
 lint:
 	poetry run ruff .

diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index cae607e2..03390dd2 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -28,11 +28,11 @@ from typing_extensions import TypedDict
 
 import langsmith
+from langsmith import beta as ls_beta
 from langsmith import env as ls_env
 from langsmith import run_helpers as rh
 from langsmith import run_trees, schemas
 from langsmith import utils as ls_utils
-from langsmith import beta as ls_beta
 from langsmith.evaluation.evaluator import (
     EvaluationResult,
     EvaluationResults,

diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py
index 24f280ef..e56e00bd 100644
--- a/python/langsmith/evaluation/evaluator.py
+++ b/python/langsmith/evaluation/evaluator.py
@@ -194,6 +194,10 @@ def __call__(
         """  # noqa: E501
         return self.evaluate_run(run, example)
 
+    def __repr__(self) -> str:
+        """String representation of the DynamicRunEvaluator object."""
+        return f"<DynamicRunEvaluator {getattr(self.func, '__name__', 'anonymous')}>"
+
 
 def run_evaluator(
     func: Callable[

diff --git a/python/langsmith/evaluation/integrations/_langchain.py b/python/langsmith/evaluation/integrations/_langchain.py
index 39da628f..43a03d64 100644
--- a/python/langsmith/evaluation/integrations/_langchain.py
+++ b/python/langsmith/evaluation/integrations/_langchain.py
@@ -37,89 +37,86 @@ class LangChainStringEvaluator:
 
     Examples:
         Creating a simple LangChainStringEvaluator:
 
-            .. code-block:: python
-
-                evaluator = LangChainStringEvaluator("exact_match")
+            >>> evaluator = LangChainStringEvaluator("exact_match")
 
         Converting a LangChainStringEvaluator to a RunEvaluator:
 
-            .. code-block:: python
-
-                from langsmith.evaluation import LangChainStringEvaluator
-
-                evaluator = LangChainStringEvaluator(
-                    "criteria",
-                    config={
-                        "criteria": {
-                            "usefulness": "The prediction is useful if"
-                            " it is correct and/or asks a useful followup question."
-                        },
-                )
-                run_evaluator = evaluator.as_run_evaluator()
+            >>> from langsmith.evaluation import LangChainStringEvaluator
+            >>> evaluator = LangChainStringEvaluator(
+            ...     "criteria",
+            ...     config={
+            ...         "criteria": {
+            ...             "usefulness": "The prediction is useful if"
+            ...             " it is correct and/or asks a useful followup question."
+            ...         },
+            ...     }
+            ... )
+            >>> run_evaluator = evaluator.as_run_evaluator()
+            >>> run_evaluator  # doctest: +ELLIPSIS
+            <DynamicRunEvaluator ...>
 
         Using the `evaluate` API with different evaluators:
 
-            .. code-block:: python
-
-                from langchain_anthropic import ChatAnthropic
-
-                import langsmith
-                from langsmith.evaluation import LangChainStringEvaluator, evaluate
-
-                # Criteria evaluator
-                criteria_evaluator = LangChainStringEvaluator(
-                    "criteria", config={
-                        "criteria": {
-                            "usefulness": "The prediction is useful if it is correct"
-                            " and/or asks a useful followup question."
-                        },
-                        "llm": ChatAnthropic(model="claude-3-opus-20240229")
-                    }
-                )
-
-                # Embedding distance evaluator
-                embedding_evaluator = LangChainStringEvaluator("embedding_distance")
-
-                # Exact match evaluator
-                exact_match_evaluator = LangChainStringEvaluator("exact_match")
-
-                # Regex match evaluator
-                regex_match_evaluator = LangChainStringEvaluator(
-                    "regex_match", config={
-                        "flags": re.IGNORECASE
-                    }
-                )
-
-                # Scoring evaluator
-                scoring_evaluator = LangChainStringEvaluator(
-                    "scoring", config={
-                        "criteria": {
-                            "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
-                        },
-                        "normalize_by": 10
-                    }
-                )
-
-                # String distance evaluator
-                string_distance_evaluator = LangChainStringEvaluator(
-                    "string_distance", config={
-                        "distance_metric": "levenshtein"
-                    }
-                )
-
-                results = evaluate(
-                    lambda inputs: {"prediction": "foo"},
-                    data="my-dataset",
-                    evaluators=[
-                        embedding_evaluator,
-                        criteria_evaluator,
-                        exact_match_evaluator,
-                        regex_match_evaluator,
-                        scoring_evaluator,
-                        string_distance_evaluator
-                    ],
-                    batch_evaluators=[equal_length],
-                )
+            >>> from langsmith.schemas import Example, Run
+            >>> def prepare_data(run: Run, example: Example):
+            ...     # Convert the evaluation data into the format expected by the evaluator.
+            ...     # Only required for datasets with multiple input/output keys.
+            ...     return {
+            ...         "prediction": run.outputs["prediction"],
+            ...         "reference": example.outputs["answer"],
+            ...         "input": str(example.inputs),
+            ...     }
+            ...
+            >>> import re
+            >>> from langchain_anthropic import ChatAnthropic
+            >>> import langsmith
+            >>> from langsmith.evaluation import LangChainStringEvaluator, evaluate
+            >>> criteria_evaluator = LangChainStringEvaluator(
+            ...     "criteria", config={
+            ...         "criteria": {
+            ...             "usefulness": "The prediction is useful if it is correct"
+            ...             " and/or asks a useful followup question."
+            ...         },
+            ...         "llm": ChatAnthropic(model="claude-3-opus-20240229")
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> embedding_evaluator = LangChainStringEvaluator("embedding_distance")
+            >>> exact_match_evaluator = LangChainStringEvaluator("exact_match")
+            >>> regex_match_evaluator = LangChainStringEvaluator(
+            ...     "regex_match", config={
+            ...         "flags": re.IGNORECASE
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> scoring_evaluator = LangChainStringEvaluator(
+            ...     "labeled_score_string", config={
+            ...         "criteria": {
+            ...             "accuracy": "Score 1: Completely inaccurate\nScore 5: Somewhat accurate\nScore 10: Completely accurate"
+            ...         },
+            ...         "normalize_by": 10
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> string_distance_evaluator = LangChainStringEvaluator(
+            ...     "string_distance", config={
+            ...         "distance_metric": "levenshtein"
+            ...     },
+            ...     prepare_data=prepare_data
+            ... )
+            >>> from langsmith import Client
+            >>> client = Client()
+            >>> results = evaluate(
+            ...     lambda inputs: {"prediction": "foo"},
+            ...     data=client.list_examples(dataset_name="Evaluate Examples", limit=1),
+            ...     evaluators=[
+            ...         embedding_evaluator,
+            ...         criteria_evaluator,
+            ...         exact_match_evaluator,
+            ...         regex_match_evaluator,
+            ...         scoring_evaluator,
+            ...         string_distance_evaluator
+            ...     ],
+            ... )  # doctest: +ELLIPSIS
+            View the evaluation results for experiment:...
     """  # noqa: E501
 
     def __init__(
@@ -187,7 +184,7 @@ def prepare_evaluator_inputs(
         ) -> SingleEvaluatorInput:
             if run.outputs and len(run.outputs) > 1:
                 raise ValueError(
-                    "The evaluator only supports a single output. "
+                    f"Evaluator {self.evaluator} only supports a single output. "
                     "Please ensure that the run has a single output."
                     " Or create a custom evaluator yourself:\n\n"
                     f"{customization_error_str}"
@@ -199,7 +196,7 @@ def prepare_evaluator_inputs(
                 and len(example.outputs) > 1
             ):
                 raise ValueError(
-                    "The evaluator only supports a single output. "
+                    f"Evaluator {self.evaluator} only supports a single output. "
                     "Please ensure that the example has a single output."
                     " Or create a custom evaluator yourself:\n\n"
                     f"{customization_error_str}"
@@ -211,7 +208,7 @@ def prepare_evaluator_inputs(
                 and len(example.inputs) > 1
             ):
                 raise ValueError(
-                    "The evaluator only supports a single input. "
+                    f"Evaluator {self.evaluator} only supports a single input. "
                     "Please ensure that the example has a single input."
                     " Or create a custom evaluator yourself:\n\n"
                     f"{customization_error_str}"
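
A quick note for reviewers (not part of the patch): the reworded errors above
steer users toward writing a custom evaluator whenever a run or example has
more than one input/output key. A minimal sketch of that escape hatch, using
the existing `run_evaluator` decorator; the dataset key "answer" and the name
`exact_answer` are illustrative assumptions, not something this diff defines:

    from langsmith.evaluation import run_evaluator
    from langsmith.schemas import Example, Run

    @run_evaluator
    def exact_answer(run: Run, example: Example) -> dict:
        # Pick exactly one key from each side ourselves, which avoids the
        # "only supports a single output" ValueError raised above.
        prediction = (run.outputs or {}).get("prediction")
        reference = (example.outputs or {}).get("answer")  # assumed dataset key
        return {
            "key": "exact_answer",
            "score": int(prediction is not None and prediction == reference),
        }

With the `__repr__` added in this patch, the wrapped object also prints a
readable name (e.g. `<DynamicRunEvaluator exact_answer>`) instead of the
default object repr, which makes evaluator lists easier to debug.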