mirror of https://github.com/hwchase17/langchain
[Breaking] Update Evaluation Functionality (#7388)
- Migrate from deprecated langchainplus_sdk to the `langsmith` package
- Update the `run_on_dataset()` API to use an eval config
- Update a number of evaluators, as well as the loading logic
- Update docstrings / reference docs
- Update tracer to share a single HTTP session
pull/7651/head
parent 224199083b
commit a673a51efa
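
The headline change is that `run_on_dataset()` now takes an evaluation config instead of a list of pre-built run evaluators, and the public entry points move from `langchain.client` to `langchain.smith`. A minimal sketch of the new call pattern, distilled from the module docstrings added later in this diff (the dataset name is a placeholder):

.. code-block:: python

    from langsmith import Client

    from langchain.chains import LLMChain
    from langchain.chat_models import ChatOpenAI
    # Previously: from langchain.client import run_on_dataset
    from langchain.smith import RunEvalConfig, run_on_dataset


    def construct_chain() -> LLMChain:
        # A fresh chain per example avoids cross-contamination between runs (e.g. via memory).
        llm = ChatOpenAI(temperature=0)
        return LLMChain.from_string(llm, "What's the answer to {your_input_key}")


    # Evaluators are now selected via a config object rather than passed as run_evaluators.
    evaluation_config = RunEvalConfig(
        evaluators=["qa", RunEvalConfig.Criteria("helpfulness")]
    )

    client = Client()
    run_on_dataset(
        client,
        "<my_dataset_name>",  # placeholder dataset name
        construct_chain,
        evaluation=evaluation_config,
    )
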
@@ -1,16 +0,0 @@
"""LangChain + Client."""
from langchain.client.runner_utils import (
    InputFormatError,
    arun_on_dataset,
    arun_on_examples,
    run_on_dataset,
    run_on_examples,
)

__all__ = [
    "InputFormatError",
    "arun_on_dataset",
    "run_on_dataset",
    "arun_on_examples",
    "run_on_examples",
]
@@ -1,759 +0,0 @@
|
||||
"""Utilities for running language models or Chains over datasets."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Coroutine,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
|
||||
from langchainplus_sdk.schemas import Example
|
||||
|
||||
from langchain.callbacks.base import BaseCallbackHandler
|
||||
from langchain.callbacks.manager import Callbacks
|
||||
from langchain.callbacks.tracers.base import BaseTracer
|
||||
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
|
||||
from langchain.callbacks.tracers.langchain import LangChainTracer
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chat_models.base import BaseChatModel
|
||||
from langchain.llms.base import BaseLLM
|
||||
from langchain.schema import (
|
||||
ChatResult,
|
||||
LLMResult,
|
||||
)
|
||||
from langchain.schema.language_model import BaseLanguageModel
|
||||
from langchain.schema.messages import (
|
||||
BaseMessage,
|
||||
HumanMessage,
|
||||
get_buffer_string,
|
||||
messages_from_dict,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MODEL_OR_CHAIN_FACTORY = Union[Callable[[], Chain], BaseLanguageModel]
|
||||
|
||||
|
||||
class InputFormatError(Exception):
|
||||
"""Raised when the input format is invalid."""
|
||||
|
||||
|
||||
def _get_prompts(inputs: Dict[str, Any]) -> List[str]:
|
||||
"""Get prompts from inputs.
|
||||
|
||||
Args:
|
||||
inputs: The input dictionary.
|
||||
|
||||
Returns:
|
||||
A list of prompts.
|
||||
Raises:
|
||||
InputFormatError: If the input format is invalid.
|
||||
"""
|
||||
if not inputs:
|
||||
raise InputFormatError("Inputs should not be empty.")
|
||||
|
||||
prompts = []
|
||||
if "prompt" in inputs:
|
||||
if not isinstance(inputs["prompt"], str):
|
||||
raise InputFormatError(
|
||||
"Expected string for 'prompt', got"
|
||||
f" {type(inputs['prompt']).__name__}"
|
||||
)
|
||||
prompts = [inputs["prompt"]]
|
||||
elif "prompts" in inputs:
|
||||
if not isinstance(inputs["prompts"], list) or not all(
|
||||
isinstance(i, str) for i in inputs["prompts"]
|
||||
):
|
||||
raise InputFormatError(
|
||||
"Expected list of strings for 'prompts',"
|
||||
f" got {type(inputs['prompts']).__name__}"
|
||||
)
|
||||
prompts = inputs["prompts"]
|
||||
elif len(inputs) == 1:
|
||||
prompt_ = next(iter(inputs.values()))
|
||||
if isinstance(prompt_, str):
|
||||
prompts = [prompt_]
|
||||
elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
|
||||
prompts = prompt_
|
||||
else:
|
||||
raise InputFormatError(f"LLM Run expects string prompt input. Got {inputs}")
|
||||
else:
|
||||
raise InputFormatError(
|
||||
f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
|
||||
)
|
||||
|
||||
return prompts
|
||||
|
||||
|
||||
def _get_messages(inputs: Dict[str, Any]) -> List[List[BaseMessage]]:
|
||||
"""Get Chat Messages from inputs.
|
||||
|
||||
Args:
|
||||
inputs: The input dictionary.
|
||||
|
||||
Returns:
|
||||
A list of chat messages.
|
||||
Raises:
|
||||
InputFormatError: If the input format is invalid.
|
||||
"""
|
||||
if not inputs:
|
||||
raise InputFormatError("Inputs should not be empty.")
|
||||
|
||||
if "messages" in inputs:
|
||||
single_input = inputs["messages"]
|
||||
elif len(inputs) == 1:
|
||||
single_input = next(iter(inputs.values()))
|
||||
else:
|
||||
raise InputFormatError(f"Chat Run expects 'messages' in inputs. Got {inputs}")
|
||||
if isinstance(single_input, list) and all(
|
||||
isinstance(i, dict) for i in single_input
|
||||
):
|
||||
raw_messages = [single_input]
|
||||
elif isinstance(single_input, list) and all(
|
||||
isinstance(i, list) for i in single_input
|
||||
):
|
||||
raw_messages = single_input
|
||||
else:
|
||||
raise InputFormatError(
|
||||
f"Chat Run expects List[dict] or List[List[dict]] 'messages'"
|
||||
f" input. Got {inputs}"
|
||||
)
|
||||
return [messages_from_dict(batch) for batch in raw_messages]
|
||||
|
||||
|
||||
async def _arun_llm(
|
||||
llm: BaseLanguageModel,
|
||||
inputs: Dict[str, Any],
|
||||
*,
|
||||
tags: Optional[List[str]] = None,
|
||||
callbacks: Callbacks = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Union[LLMResult, ChatResult]:
|
||||
"""Asynchronously run the language model.
|
||||
|
||||
Args:
|
||||
llm: The language model to run.
|
||||
inputs: The input dictionary.
|
||||
tags: Optional tags to add to the run.
|
||||
callbacks: Optional callbacks to use during the run.
|
||||
input_mapper: Optional function to map inputs to the expected format.
|
||||
|
||||
Returns:
|
||||
The LLMResult or ChatResult.
|
||||
Raises:
|
||||
ValueError: If the LLM type is unsupported.
|
||||
InputFormatError: If the input format is invalid.
|
||||
"""
|
||||
if input_mapper is not None:
|
||||
if not isinstance(llm, (BaseLLM, BaseChatModel)):
|
||||
raise ValueError(f"Unsupported LLM type {type(llm).__name__}")
|
||||
llm_output = await llm.agenerate(
|
||||
input_mapper(inputs), callbacks=callbacks, tags=tags
|
||||
)
|
||||
elif isinstance(llm, BaseLLM):
|
||||
try:
|
||||
llm_prompts = _get_prompts(inputs)
|
||||
llm_output = await llm.agenerate(
|
||||
llm_prompts, callbacks=callbacks, tags=tags
|
||||
)
|
||||
except InputFormatError:
|
||||
llm_messages = _get_messages(inputs)
|
||||
buffer_strings = [get_buffer_string(messages) for messages in llm_messages]
|
||||
llm_output = await llm.agenerate(
|
||||
buffer_strings, callbacks=callbacks, tags=tags
|
||||
)
|
||||
elif isinstance(llm, BaseChatModel):
|
||||
try:
|
||||
messages = _get_messages(inputs)
|
||||
llm_output = await llm.agenerate(messages, callbacks=callbacks, tags=tags)
|
||||
except InputFormatError:
|
||||
prompts = _get_prompts(inputs)
|
||||
converted_messages: List[List[BaseMessage]] = [
|
||||
[HumanMessage(content=prompt)] for prompt in prompts
|
||||
]
|
||||
llm_output = await llm.agenerate(
|
||||
converted_messages, callbacks=callbacks, tags=tags
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported LLM type {type(llm)}")
|
||||
return llm_output
|
||||
|
||||
|
||||
async def _arun_llm_or_chain(
|
||||
example: Example,
|
||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||
n_repetitions: int,
|
||||
*,
|
||||
tags: Optional[List[str]] = None,
|
||||
callbacks: Optional[List[BaseCallbackHandler]] = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
|
||||
"""Asynchronously run the Chain or language model.
|
||||
|
||||
Args:
|
||||
example: The example to run.
|
||||
llm_or_chain_factory: The Chain or language model constructor to run.
|
||||
n_repetitions: The number of times to run the model on each example.
|
||||
tags: Optional tags to add to the run.
|
||||
callbacks: Optional callbacks to use during the run.
|
||||
input_mapper: Optional function to map the input to the expected format.
|
||||
|
||||
Returns:
|
||||
A list of outputs.
|
||||
"""
|
||||
if callbacks:
|
||||
previous_example_ids = [
|
||||
getattr(tracer, "example_id", None) for tracer in callbacks
|
||||
]
|
||||
for tracer in callbacks:
|
||||
if hasattr(tracer, "example_id"):
|
||||
tracer.example_id = example.id
|
||||
else:
|
||||
previous_example_ids = None
|
||||
outputs = []
|
||||
for _ in range(n_repetitions):
|
||||
try:
|
||||
if isinstance(llm_or_chain_factory, BaseLanguageModel):
|
||||
output: Any = await _arun_llm(
|
||||
llm_or_chain_factory,
|
||||
example.inputs,
|
||||
tags=tags,
|
||||
callbacks=callbacks,
|
||||
input_mapper=input_mapper,
|
||||
)
|
||||
else:
|
||||
chain = llm_or_chain_factory()
|
||||
if input_mapper is not None:
|
||||
inputs_ = input_mapper(example.inputs)
|
||||
else:
|
||||
inputs_ = example.inputs
|
||||
if len(inputs_) == 1:
|
||||
inputs_ = next(iter(inputs_.values()))
|
||||
output = await chain.acall(inputs_, callbacks=callbacks, tags=tags)
|
||||
outputs.append(output)
|
||||
except Exception as e:
|
||||
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
|
||||
outputs.append({"Error": str(e)})
|
||||
if callbacks and previous_example_ids:
|
||||
for example_id, tracer in zip(previous_example_ids, callbacks):
|
||||
if hasattr(tracer, "example_id"):
|
||||
tracer.example_id = example_id
|
||||
return outputs
|
||||
|
||||
|
||||
async def _gather_with_concurrency(
|
||||
n: int,
|
||||
initializer: Callable[[], Coroutine[Any, Any, Any]],
|
||||
*async_funcs: Callable[
|
||||
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
|
||||
],
|
||||
) -> List[Any]:
|
||||
"""Run coroutines with a concurrency limit.
|
||||
|
||||
Args:
|
||||
n: The maximum number of concurrent tasks.
|
||||
initializer: A coroutine that initializes shared resources for the tasks.
|
||||
async_funcs: The async_funcs to be run concurrently.
|
||||
|
||||
Returns:
|
||||
A list of results from the coroutines.
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(n)
|
||||
job_state = {"num_processed": 0}
|
||||
|
||||
callback_queue: asyncio.Queue[Sequence[BaseCallbackHandler]] = asyncio.Queue()
|
||||
for _ in range(n):
|
||||
callback_queue.put_nowait(await initializer())
|
||||
|
||||
async def run_coroutine_with_semaphore(
|
||||
async_func: Callable[
|
||||
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
|
||||
]
|
||||
) -> Any:
|
||||
async with semaphore:
|
||||
callbacks = await callback_queue.get()
|
||||
try:
|
||||
result = await async_func(callbacks, job_state)
|
||||
finally:
|
||||
callback_queue.put_nowait(callbacks)
|
||||
return result
|
||||
|
||||
results = await asyncio.gather(
|
||||
*(run_coroutine_with_semaphore(function) for function in async_funcs)
|
||||
)
|
||||
while callback_queue:
|
||||
try:
|
||||
callbacks = callback_queue.get_nowait()
|
||||
except asyncio.QueueEmpty:
|
||||
break
|
||||
for callback in callbacks:
|
||||
if isinstance(callback, (LangChainTracer, EvaluatorCallbackHandler)):
|
||||
callback.wait_for_futures()
|
||||
return results
|
||||
|
||||
|
||||
async def _callbacks_initializer(
|
||||
project_name: Optional[str],
|
||||
client: LangChainPlusClient,
|
||||
run_evaluators: Sequence[RunEvaluator],
|
||||
evaluation_handler_collector: List[EvaluatorCallbackHandler],
|
||||
) -> List[BaseTracer]:
|
||||
"""
|
||||
Initialize a tracer to share across tasks.
|
||||
|
||||
Args:
|
||||
project_name: The project name for the tracer.
|
||||
client: The client to use for the tracer.
|
||||
run_evaluators: The evaluators to run.
|
||||
evaluation_handler_collector: A list to collect the evaluators.
|
||||
Used to wait for the evaluators to finish.
|
||||
|
||||
Returns:
|
||||
The callbacks for this thread.
|
||||
"""
|
||||
callbacks: List[BaseTracer] = []
|
||||
if project_name:
|
||||
callbacks.append(LangChainTracer(project_name=project_name))
|
||||
evaluator_project_name = f"{project_name}-evaluators" if project_name else None
|
||||
if run_evaluators:
|
||||
callback = EvaluatorCallbackHandler(
|
||||
client=client,
|
||||
evaluators=run_evaluators,
|
||||
# We already have concurrency, don't want to overload the machine
|
||||
max_workers=1,
|
||||
project_name=evaluator_project_name,
|
||||
)
|
||||
callbacks.append(callback)
|
||||
evaluation_handler_collector.append(callback)
|
||||
return callbacks
|
||||
|
||||
|
||||
async def arun_on_examples(
|
||||
examples: Iterator[Example],
|
||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||
*,
|
||||
concurrency_level: int = 5,
|
||||
num_repetitions: int = 1,
|
||||
project_name: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
client: Optional[LangChainPlusClient] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Asynchronously run the chain on examples and store traces
|
||||
to the specified project name.
|
||||
|
||||
Args:
|
||||
examples: Examples to run the model or chain over.
|
||||
llm_or_chain_factory: Language model or Chain constructor to run
|
||||
over the dataset. The Chain constructor is used to permit
|
||||
independent calls on each example without carrying over state.
|
||||
concurrency_level: The number of async tasks to run concurrently.
|
||||
num_repetitions: Number of times to run the model on each example.
|
||||
This is useful when testing success rates or generating confidence
|
||||
intervals.
|
||||
project_name: Project name to use when tracing runs.
|
||||
Defaults to {dataset_name}-{chain class name}-{datetime}.
|
||||
verbose: Whether to print progress.
|
||||
client: Client to use to read the dataset. If not provided, a new
|
||||
client will be created using the credentials in the environment.
|
||||
tags: Tags to add to each run in the project.
|
||||
run_evaluators: Evaluators to run on the results of the chain.
|
||||
input_mapper: function to map to the inputs dictionary from an Example
|
||||
to the format expected by the model to be evaluated. This is useful if
|
||||
your model needs to deserialize more complex schema or if your dataset
|
||||
has inputs with keys that differ from what is expected by your chain
|
||||
or agent.
|
||||
|
||||
Returns:
|
||||
A dictionary mapping example ids to the model outputs.
|
||||
"""
|
||||
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
|
||||
client_ = client or LangChainPlusClient()
|
||||
results: Dict[str, List[Any]] = {}
|
||||
|
||||
async def process_example(
|
||||
example: Example, callbacks: List[BaseCallbackHandler], job_state: dict
|
||||
) -> None:
|
||||
"""Process a single example."""
|
||||
result = await _arun_llm_or_chain(
|
||||
example,
|
||||
llm_or_chain_factory,
|
||||
num_repetitions,
|
||||
tags=tags,
|
||||
callbacks=callbacks,
|
||||
input_mapper=input_mapper,
|
||||
)
|
||||
results[str(example.id)] = result
|
||||
job_state["num_processed"] += 1
|
||||
if verbose:
|
||||
print(
|
||||
f"Processed examples: {job_state['num_processed']}",
|
||||
end="\r",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
evaluation_handlers: List[EvaluatorCallbackHandler] = []
|
||||
await _gather_with_concurrency(
|
||||
concurrency_level,
|
||||
functools.partial(
|
||||
_callbacks_initializer,
|
||||
project_name=project_name,
|
||||
client=client_,
|
||||
evaluation_handler_collector=evaluation_handlers,
|
||||
run_evaluators=run_evaluators or [],
|
||||
),
|
||||
*(functools.partial(process_example, e) for e in examples),
|
||||
)
|
||||
for handler in evaluation_handlers:
|
||||
handler.wait_for_futures()
|
||||
return results
|
||||
|
||||
|
||||
def run_llm(
|
||||
llm: BaseLanguageModel,
|
||||
inputs: Dict[str, Any],
|
||||
callbacks: Callbacks,
|
||||
*,
|
||||
tags: Optional[List[str]] = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Union[LLMResult, ChatResult]:
|
||||
"""
|
||||
Run the language model on the example.
|
||||
|
||||
Args:
|
||||
llm: The language model to run.
|
||||
inputs: The input dictionary.
|
||||
callbacks: The callbacks to use during the run.
|
||||
tags: Optional tags to add to the run.
|
||||
input_mapper: function to map to the inputs dictionary from an Example
|
||||
Returns:
|
||||
The LLMResult or ChatResult.
|
||||
Raises:
|
||||
ValueError: If the LLM type is unsupported.
|
||||
InputFormatError: If the input format is invalid.
|
||||
"""
|
||||
if input_mapper is not None:
|
||||
if not isinstance(llm, (BaseLLM, BaseChatModel)):
|
||||
raise ValueError(f"Unsupported LLM type {type(llm).__name__}")
|
||||
llm_output = llm.generate(input_mapper(inputs), callbacks=callbacks, tags=tags)
|
||||
elif isinstance(llm, BaseLLM):
|
||||
try:
|
||||
llm_prompts = _get_prompts(inputs)
|
||||
llm_output = llm.generate(llm_prompts, callbacks=callbacks, tags=tags)
|
||||
except InputFormatError:
|
||||
llm_messages = _get_messages(inputs)
|
||||
buffer_strings = [get_buffer_string(messages) for messages in llm_messages]
|
||||
llm_output = llm.generate(buffer_strings, callbacks=callbacks)
|
||||
elif isinstance(llm, BaseChatModel):
|
||||
try:
|
||||
messages = _get_messages(inputs)
|
||||
llm_output = llm.generate(messages, callbacks=callbacks, tags=tags)
|
||||
except InputFormatError:
|
||||
prompts = _get_prompts(inputs)
|
||||
converted_messages: List[List[BaseMessage]] = [
|
||||
[HumanMessage(content=prompt)] for prompt in prompts
|
||||
]
|
||||
llm_output = llm.generate(
|
||||
converted_messages, callbacks=callbacks, tags=tags
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported LLM type {type(llm)}")
|
||||
return llm_output
|
||||
|
||||
|
||||
def run_llm_or_chain(
|
||||
example: Example,
|
||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||
n_repetitions: int,
|
||||
*,
|
||||
tags: Optional[List[str]] = None,
|
||||
callbacks: Optional[List[BaseCallbackHandler]] = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
|
||||
"""
|
||||
Run the Chain or language model synchronously.
|
||||
|
||||
Args:
|
||||
example: The example to run.
|
||||
llm_or_chain_factory: The Chain or language model constructor to run.
|
||||
n_repetitions: The number of times to run the model on each example.
|
||||
tags: Optional tags to add to the run.
|
||||
callbacks: Optional callbacks to use during the run.
|
||||
|
||||
Returns:
|
||||
Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
|
||||
The outputs of the model or chain.
|
||||
"""
|
||||
if callbacks:
|
||||
previous_example_ids = [
|
||||
getattr(tracer, "example_id", None) for tracer in callbacks
|
||||
]
|
||||
for tracer in callbacks:
|
||||
if hasattr(tracer, "example_id"):
|
||||
tracer.example_id = example.id
|
||||
else:
|
||||
previous_example_ids = None
|
||||
outputs = []
|
||||
for _ in range(n_repetitions):
|
||||
try:
|
||||
if isinstance(llm_or_chain_factory, BaseLanguageModel):
|
||||
output: Any = run_llm(
|
||||
llm_or_chain_factory,
|
||||
example.inputs,
|
||||
callbacks,
|
||||
tags=tags,
|
||||
input_mapper=input_mapper,
|
||||
)
|
||||
else:
|
||||
chain = llm_or_chain_factory()
|
||||
if input_mapper is not None:
|
||||
inputs_ = input_mapper(example.inputs)
|
||||
else:
|
||||
inputs_ = example.inputs
|
||||
if len(inputs_) == 1:
|
||||
inputs_ = next(iter(inputs_.values()))
|
||||
output = chain(inputs_, callbacks=callbacks, tags=tags)
|
||||
outputs.append(output)
|
||||
except Exception as e:
|
||||
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
|
||||
outputs.append({"Error": str(e)})
|
||||
if callbacks and previous_example_ids:
|
||||
for example_id, tracer in zip(previous_example_ids, callbacks):
|
||||
if hasattr(tracer, "example_id"):
|
||||
tracer.example_id = example_id
|
||||
return outputs
|
||||
|
||||
|
||||
def run_on_examples(
|
||||
examples: Iterator[Example],
|
||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||
*,
|
||||
num_repetitions: int = 1,
|
||||
project_name: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
client: Optional[LangChainPlusClient] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Run the Chain or language model on examples and store
|
||||
traces to the specified project name.
|
||||
|
||||
Args:
|
||||
examples: Examples to run the model or chain over.
|
||||
llm_or_chain_factory: Language model or Chain constructor to run
|
||||
over the dataset. The Chain constructor is used to permit
|
||||
independent calls on each example without carrying over state.
|
||||
num_repetitions: Number of times to run the model on each example.
|
||||
This is useful when testing success rates or generating confidence
|
||||
intervals.
|
||||
project_name: Name of the project to store the traces in.
|
||||
Defaults to {dataset_name}-{chain class name}-{datetime}.
|
||||
verbose: Whether to print progress.
|
||||
client: Client to use to access the dataset. If None, a new client
|
||||
will be created using the credentials in the environment.
|
||||
tags: Tags to add to each run in the project.
|
||||
run_evaluators: Evaluators to run on the results of the chain.
|
||||
input_mapper: A function to map to the inputs dictionary from an Example
|
||||
to the format expected by the model to be evaluated. This is useful if
|
||||
your model needs to deserialize more complex schema or if your dataset
|
||||
has inputs with keys that differ from what is expected by your chain
|
||||
or agent.
|
||||
|
||||
Returns:
|
||||
A dictionary mapping example ids to the model outputs.
|
||||
"""
|
||||
results: Dict[str, Any] = {}
|
||||
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
|
||||
client_ = client or LangChainPlusClient()
|
||||
tracer = LangChainTracer(project_name=project_name)
|
||||
evaluator_project_name = f"{project_name}-evaluators"
|
||||
evalution_handler = EvaluatorCallbackHandler(
|
||||
evaluators=run_evaluators or [],
|
||||
client=client_,
|
||||
project_name=evaluator_project_name,
|
||||
)
|
||||
callbacks: List[BaseCallbackHandler] = [tracer, evalution_handler]
|
||||
for i, example in enumerate(examples):
|
||||
result = run_llm_or_chain(
|
||||
example,
|
||||
llm_or_chain_factory,
|
||||
num_repetitions,
|
||||
tags=tags,
|
||||
callbacks=callbacks,
|
||||
input_mapper=input_mapper,
|
||||
)
|
||||
if verbose:
|
||||
print(f"{i+1} processed", flush=True, end="\r")
|
||||
results[str(example.id)] = result
|
||||
tracer.wait_for_futures()
|
||||
evalution_handler.wait_for_futures()
|
||||
return results
|
||||
|
||||
|
||||
def _get_project_name(
|
||||
project_name: Optional[str],
|
||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||
dataset_name: Optional[str],
|
||||
) -> str:
|
||||
"""
|
||||
Get the project name.
|
||||
|
||||
Args:
|
||||
project_name: The project name if manually specified.
|
||||
llm_or_chain_factory: The Chain or language model constructor.
|
||||
dataset_name: The dataset name.
|
||||
|
||||
Returns:
|
||||
The project name.
|
||||
"""
|
||||
if project_name is not None:
|
||||
return project_name
|
||||
current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
||||
if isinstance(llm_or_chain_factory, BaseLanguageModel):
|
||||
model_name = llm_or_chain_factory.__class__.__name__
|
||||
else:
|
||||
model_name = llm_or_chain_factory().__class__.__name__
|
||||
dataset_prefix = f"{dataset_name}-" if dataset_name else ""
|
||||
return f"{dataset_prefix}{model_name}-{current_time}"
|
||||
|
||||
|
||||
async def arun_on_dataset(
|
||||
dataset_name: str,
|
||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||
*,
|
||||
concurrency_level: int = 5,
|
||||
num_repetitions: int = 1,
|
||||
project_name: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
client: Optional[LangChainPlusClient] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Asynchronously run the Chain or language model on a dataset
|
||||
and store traces to the specified project name.
|
||||
|
||||
Args:
|
||||
dataset_name: Name of the dataset to run the chain on.
|
||||
llm_or_chain_factory: Language model or Chain constructor to run
|
||||
over the dataset. The Chain constructor is used to permit
|
||||
independent calls on each example without carrying over state.
|
||||
concurrency_level: The number of async tasks to run concurrently.
|
||||
num_repetitions: Number of times to run the model on each example.
|
||||
This is useful when testing success rates or generating confidence
|
||||
intervals.
|
||||
project_name: Name of the project to store the traces in.
|
||||
Defaults to {dataset_name}-{chain class name}-{datetime}.
|
||||
verbose: Whether to print progress.
|
||||
client: Client to use to read the dataset. If not provided, a new
|
||||
client will be created using the credentials in the environment.
|
||||
tags: Tags to add to each run in the session.
|
||||
run_evaluators: Evaluators to run on the results of the chain.
|
||||
input_mapper: A function to map to the inputs dictionary from an Example
|
||||
to the format expected by the model to be evaluated. This is useful if
|
||||
your model needs to deserialize more complex schema or if your dataset
|
||||
has inputs with keys that differ from what is expected by your chain
|
||||
or agent.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the run's project name and the resulting model outputs.
|
||||
"""
|
||||
client_ = client or LangChainPlusClient()
|
||||
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
|
||||
dataset = client_.read_dataset(dataset_name=dataset_name)
|
||||
examples = client_.list_examples(dataset_id=str(dataset.id))
|
||||
results = await arun_on_examples(
|
||||
examples,
|
||||
llm_or_chain_factory,
|
||||
concurrency_level=concurrency_level,
|
||||
num_repetitions=num_repetitions,
|
||||
project_name=project_name,
|
||||
verbose=verbose,
|
||||
client=client_,
|
||||
tags=tags,
|
||||
run_evaluators=run_evaluators,
|
||||
input_mapper=input_mapper,
|
||||
)
|
||||
return {
|
||||
"project_name": project_name,
|
||||
"results": results,
|
||||
}
|
||||
|
||||
|
||||
def run_on_dataset(
|
||||
dataset_name: str,
|
||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||
*,
|
||||
num_repetitions: int = 1,
|
||||
project_name: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
client: Optional[LangChainPlusClient] = None,
|
||||
tags: Optional[List[str]] = None,
|
||||
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
|
||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Run the Chain or language model on a dataset and store traces
|
||||
to the specified project name.
|
||||
|
||||
Args:
|
||||
dataset_name: Name of the dataset to run the chain on.
|
||||
llm_or_chain_factory: Language model or Chain constructor to run
|
||||
over the dataset. The Chain constructor is used to permit
|
||||
independent calls on each example without carrying over state.
|
||||
num_repetitions: Number of times to run the model on each example.
|
||||
This is useful when testing success rates or generating confidence
|
||||
intervals.
|
||||
project_name: Name of the project to store the traces in.
|
||||
Defaults to {dataset_name}-{chain class name}-{datetime}.
|
||||
verbose: Whether to print progress.
|
||||
client: Client to use to access the dataset. If None, a new client
|
||||
will be created using the credentials in the environment.
|
||||
tags: Tags to add to each run in the session.
|
||||
run_evaluators: Evaluators to run on the results of the chain.
|
||||
input_mapper: A function to map to the inputs dictionary from an Example
|
||||
to the format expected by the model to be evaluated. This is useful if
|
||||
your model needs to deserialize more complex schema or if your dataset
|
||||
has inputs with keys that differ from what is expected by your chain
|
||||
or agent.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the run's project name and the resulting model outputs.
|
||||
"""
|
||||
client_ = client or LangChainPlusClient()
|
||||
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
|
||||
dataset = client_.read_dataset(dataset_name=dataset_name)
|
||||
examples = client_.list_examples(dataset_id=str(dataset.id))
|
||||
results = run_on_examples(
|
||||
examples,
|
||||
llm_or_chain_factory,
|
||||
num_repetitions=num_repetitions,
|
||||
project_name=project_name,
|
||||
verbose=verbose,
|
||||
tags=tags,
|
||||
run_evaluators=run_evaluators,
|
||||
client=client_,
|
||||
input_mapper=input_mapper,
|
||||
)
|
||||
return {
|
||||
"project_name": project_name,
|
||||
"results": results,
|
||||
}
|
@@ -1,34 +0,0 @@
"""Evaluation classes that interface with traced runs and datasets."""
from langchain.evaluation.run_evaluators.base import (
    RunEvaluatorChain,
    RunEvaluatorInputMapper,
    RunEvaluatorOutputParser,
)
from langchain.evaluation.run_evaluators.implementations import (
    ChoicesOutputParser,
    StringRunEvaluatorInputMapper,
    get_criteria_evaluator,
    get_qa_evaluator,
    get_trajectory_evaluator,
)
from langchain.evaluation.run_evaluators.loading import (
    load_run_evaluator_for_model,
    load_run_evaluators_for_model,
)
from langchain.evaluation.run_evaluators.string_run_evaluator import (
    StringRunEvaluatorChain,
)

__all__ = [
    "RunEvaluatorChain",
    "RunEvaluatorInputMapper",
    "RunEvaluatorOutputParser",
    "get_qa_evaluator",
    "get_criteria_evaluator",
    "get_trajectory_evaluator",
    "StringRunEvaluatorInputMapper",
    "ChoicesOutputParser",
    "StringRunEvaluatorChain",
    "load_run_evaluators_for_model",
    "load_run_evaluator_for_model",
]
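
Worth noting for readers of this diff: the functional helpers removed here (`get_qa_evaluator`, `get_criteria_evaluator`, `get_trajectory_evaluator`) are superseded by the declarative `RunEvalConfig` introduced in the new modules below. A rough sketch of the equivalent selection under the new API, assuming the evaluator names shown in the new module docstrings:

.. code-block:: python

    from langchain.smith import RunEvalConfig

    # Roughly replaces get_qa_evaluator(llm, ...) and get_criteria_evaluator(llm, "helpfulness", ...);
    # the underlying eval chains are now resolved when run_on_dataset is called.
    evaluation_config = RunEvalConfig(
        evaluators=[
            "qa",
            RunEvalConfig.Criteria("helpfulness"),
        ]
    )
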
@@ -1,108 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchainplus_sdk import EvaluationResult, RunEvaluator
|
||||
from langchainplus_sdk.schemas import Example, Run
|
||||
|
||||
from langchain.callbacks.manager import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
)
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.schema import RUN_KEY, BaseOutputParser
|
||||
|
||||
|
||||
class RunEvaluatorInputMapper:
|
||||
"""Map the inputs of a run to the inputs of an evaluation."""
|
||||
|
||||
@abstractmethod
|
||||
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
|
||||
"""Maps the Run and Optional[Example] to a dictionary"""
|
||||
|
||||
def __call__(self, run: Run, example: Optional[Example] = None) -> Any:
|
||||
"""Maps the Run and Optional[Example] to a dictionary"""
|
||||
return self.map(run, example)
|
||||
|
||||
|
||||
class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
|
||||
"""Parse the output of a run."""
|
||||
|
||||
eval_chain_output_key: str = "text"
|
||||
|
||||
def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
|
||||
"""Parse the output of a run."""
|
||||
text = output[self.eval_chain_output_key]
|
||||
return self.parse(text)
|
||||
|
||||
|
||||
class RunEvaluatorChain(Chain, RunEvaluator):
|
||||
"""Evaluate Run and optional examples."""
|
||||
|
||||
input_mapper: RunEvaluatorInputMapper
|
||||
"""Maps the Run and Optional example to a dictionary for the eval chain."""
|
||||
eval_chain: Chain
|
||||
"""The evaluation chain."""
|
||||
output_parser: RunEvaluatorOutputParser
|
||||
"""Parse the output of the eval chain into feedback."""
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
return ["run", "example"]
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
return ["feedback"]
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Call the evaluation chain."""
|
||||
run: Run = inputs["run"]
|
||||
example: Optional[Example] = inputs.get("example")
|
||||
chain_input = self.input_mapper.map(run, example)
|
||||
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
|
||||
callbacks = _run_manager.get_child()
|
||||
chain_output = self.eval_chain(
|
||||
chain_input, callbacks=callbacks, include_run_info=True
|
||||
)
|
||||
run_info = chain_output[RUN_KEY]
|
||||
feedback = self.output_parser.parse_chain_output(chain_output)
|
||||
feedback.evaluator_info[RUN_KEY] = run_info
|
||||
return {"feedback": feedback}
|
||||
|
||||
async def _acall(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
run: Run = inputs["run"]
|
||||
example: Optional[Example] = inputs.get("example")
|
||||
chain_input = self.input_mapper.map(run, example)
|
||||
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
|
||||
callbacks = _run_manager.get_child()
|
||||
chain_output = await self.eval_chain.acall(
|
||||
chain_input,
|
||||
callbacks=callbacks,
|
||||
include_run_info=True,
|
||||
)
|
||||
run_info = chain_output[RUN_KEY]
|
||||
feedback = self.output_parser.parse_chain_output(chain_output)
|
||||
feedback.evaluator_info[RUN_KEY] = run_info
|
||||
return {"feedback": feedback}
|
||||
|
||||
def evaluate_run(
|
||||
self, run: Run, example: Optional[Example] = None
|
||||
) -> EvaluationResult:
|
||||
"""Evaluate an example."""
|
||||
return self({"run": run, "example": example})["feedback"]
|
||||
|
||||
async def aevaluate_run(
|
||||
self, run: Run, example: Optional[Example] = None
|
||||
) -> EvaluationResult:
|
||||
"""Evaluate an example."""
|
||||
result = await self.acall({"run": run, "example": example})
|
||||
return result["feedback"]
|
@@ -1,306 +0,0 @@
|
||||
from typing import Any, Dict, Mapping, Optional, Sequence, Union
|
||||
|
||||
from langchainplus_sdk.evaluation import EvaluationResult
|
||||
from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from langchain.chat_models.base import BaseChatModel
|
||||
from langchain.evaluation.agents.trajectory_eval_chain import (
|
||||
TrajectoryEvalChain,
|
||||
TrajectoryOutputParser,
|
||||
)
|
||||
from langchain.evaluation.criteria.eval_chain import (
|
||||
CriteriaEvalChain,
|
||||
CriteriaResultOutputParser,
|
||||
)
|
||||
from langchain.evaluation.qa.eval_chain import QAEvalChain
|
||||
from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
|
||||
from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
|
||||
from langchain.evaluation.run_evaluators.base import (
|
||||
RunEvaluatorChain,
|
||||
RunEvaluatorInputMapper,
|
||||
RunEvaluatorOutputParser,
|
||||
)
|
||||
from langchain.prompts.prompt import PromptTemplate
|
||||
from langchain.schema import BasePromptTemplate
|
||||
from langchain.schema.language_model import BaseLanguageModel
|
||||
from langchain.tools.base import BaseTool
|
||||
|
||||
_QA_PROMPTS = {
|
||||
"qa": QA_DEFAULT_PROMPT,
|
||||
"sql": SQL_PROMPT,
|
||||
}
|
||||
|
||||
|
||||
class StringRunEvaluatorInputMapper(RunEvaluatorInputMapper, BaseModel):
|
||||
"""Maps the Run and Optional[Example] to a dictionary."""
|
||||
|
||||
prediction_map: Dict[str, str]
|
||||
"""Map from run outputs to the evaluation inputs."""
|
||||
input_map: Dict[str, str]
|
||||
"""Map from run inputs to the evaluation inputs."""
|
||||
answer_map: Optional[Dict[str, str]] = None
|
||||
"""Map from example outputs to the evaluation inputs."""
|
||||
|
||||
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
|
||||
"""Maps the Run and Optional[Example] to a dictionary"""
|
||||
if run.outputs is None and self.prediction_map:
|
||||
raise ValueError(f"Run {run.id} has no outputs.")
|
||||
if self.answer_map and (not example or not example.outputs):
|
||||
raise ValueError("This evaluator requires references, but none were given.")
|
||||
outputs = run.outputs or {}
|
||||
data = {value: outputs[key] for key, value in self.prediction_map.items()}
|
||||
data.update({value: run.inputs[key] for key, value in self.input_map.items()})
|
||||
if self.answer_map and example and example.outputs:
|
||||
data.update(
|
||||
{value: example.outputs[key] for key, value in self.answer_map.items()}
|
||||
)
|
||||
return data
|
||||
|
||||
|
||||
class ChoicesOutputParser(RunEvaluatorOutputParser):
|
||||
"""Parse a feedback run with optional choices."""
|
||||
|
||||
evaluation_name: str
|
||||
choices_map: Optional[Dict[str, int]] = None
|
||||
|
||||
@property
|
||||
def _type(self) -> str:
|
||||
return "choices_run_eval"
|
||||
|
||||
def parse(self, text: str) -> EvaluationResult:
|
||||
"""Parse the last line of the text and return an evaluation result."""
|
||||
lines = text.strip().split()
|
||||
value = lines[-1].strip()
|
||||
score = self.choices_map.get(value) if self.choices_map else None
|
||||
comment = " ".join(lines[:-1]) if len(lines) > 1 else None
|
||||
return EvaluationResult(
|
||||
key=self.evaluation_name,
|
||||
score=score,
|
||||
value=value,
|
||||
comment=comment,
|
||||
)
|
||||
|
||||
|
||||
def get_qa_evaluator(
|
||||
llm: BaseLanguageModel,
|
||||
*,
|
||||
prompt: Union[PromptTemplate, str] = QA_DEFAULT_PROMPT,
|
||||
input_key: str = "input",
|
||||
prediction_key: str = "output",
|
||||
answer_key: str = "output",
|
||||
evaluation_name: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> RunEvaluatorChain:
|
||||
"""Get an eval chain that compares response against ground truth."""
|
||||
if isinstance(prompt, str):
|
||||
prompt = _QA_PROMPTS[prompt]
|
||||
eval_chain = QAEvalChain.from_llm(llm=llm, prompt=prompt, **kwargs)
|
||||
input_mapper = kwargs.pop(
|
||||
"input_mapper",
|
||||
StringRunEvaluatorInputMapper(
|
||||
input_map={input_key: "query"},
|
||||
prediction_map={prediction_key: "result"},
|
||||
answer_map={answer_key: "answer"},
|
||||
),
|
||||
)
|
||||
evaluation_name = evaluation_name or "Correctness"
|
||||
output_parser = kwargs.pop(
|
||||
"output_parser",
|
||||
ChoicesOutputParser(
|
||||
evaluation_name=evaluation_name,
|
||||
choices_map={"CORRECT": 1, "INCORRECT": 0},
|
||||
),
|
||||
)
|
||||
tags = kwargs.pop("tags", [])
|
||||
return RunEvaluatorChain(
|
||||
eval_chain=eval_chain,
|
||||
input_mapper=input_mapper,
|
||||
output_parser=output_parser,
|
||||
tags=tags + [evaluation_name],
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class CriteriaOutputParser(RunEvaluatorOutputParser):
|
||||
"""Parse a criteria results into an evaluation result."""
|
||||
|
||||
evaluation_name: str
|
||||
|
||||
@property
|
||||
def _type(self) -> str:
|
||||
return "criteria"
|
||||
|
||||
def parse(self, parsed_output: Union[str, dict]) -> EvaluationResult:
|
||||
"""Parse the last line of the text and return an evaluation result."""
|
||||
if isinstance(parsed_output, str):
|
||||
parsed_output_ = CriteriaResultOutputParser().parse(parsed_output)
|
||||
else:
|
||||
parsed_output_ = parsed_output
|
||||
return EvaluationResult(
|
||||
key=self.evaluation_name,
|
||||
score=parsed_output_["score"],
|
||||
value=parsed_output_["value"],
|
||||
comment=parsed_output_["reasoning"],
|
||||
)
|
||||
|
||||
|
||||
def get_criteria_evaluator(
|
||||
llm: BaseLanguageModel,
|
||||
criteria: Union[Mapping[str, str], Sequence[str], str],
|
||||
*,
|
||||
input_key: str = "input",
|
||||
prediction_key: str = "output",
|
||||
prompt: Optional[BasePromptTemplate] = None,
|
||||
evaluation_name: Optional[str] = None,
|
||||
requires_reference: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> RunEvaluatorChain:
|
||||
"""Get an eval chain for grading a model's response against a map of criteria."""
|
||||
input_mapper = kwargs.pop(
|
||||
"input_mapper",
|
||||
StringRunEvaluatorInputMapper(
|
||||
input_map={input_key: "input"},
|
||||
prediction_map={prediction_key: "output"},
|
||||
),
|
||||
)
|
||||
criteria_ = CriteriaEvalChain.resolve_criteria(criteria)
|
||||
evaluation_name = evaluation_name or " ".join(criteria_.keys())
|
||||
parser = kwargs.pop(
|
||||
"output_parser",
|
||||
CriteriaOutputParser(
|
||||
choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name
|
||||
),
|
||||
)
|
||||
tags = kwargs.pop("tags", [])
|
||||
eval_chain = CriteriaEvalChain.from_llm(
|
||||
llm=llm,
|
||||
criteria=criteria_,
|
||||
prompt=prompt,
|
||||
requires_reference=requires_reference,
|
||||
**kwargs,
|
||||
)
|
||||
return RunEvaluatorChain(
|
||||
eval_chain=eval_chain,
|
||||
input_mapper=input_mapper,
|
||||
output_parser=parser,
|
||||
tags=tags + [evaluation_name],
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class TrajectoryRunEvalOutputParser(RunEvaluatorOutputParser, TrajectoryOutputParser):
|
||||
evaluation_name: str = "Agent Trajectory"
|
||||
"""The name assigned to the evaluation feedback."""
|
||||
evaluator_info: dict = Field(default_factory=dict)
|
||||
"""Additional information to log as feedback metadata."""
|
||||
|
||||
@property
|
||||
def _type(self) -> str:
|
||||
return "agent_trajectory_run_eval"
|
||||
|
||||
def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
|
||||
"""Parse the output of a run."""
|
||||
return EvaluationResult(
|
||||
key=self.evaluation_name,
|
||||
score=int(output["score"]),
|
||||
comment=output["reasoning"],
|
||||
evaluator_info=self.evaluator_info,
|
||||
)
|
||||
|
||||
|
||||
class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
|
||||
"""Maps the Run and Optional[Example] to a dictionary."""
|
||||
|
||||
agent_input_key: str = "input"
|
||||
"""The key to load from the agent executor's run input dictionary."""
|
||||
agent_output_key: str = "output"
|
||||
"""The key to load from the agent executor's run output dictionary."""
|
||||
tool_input_key: str = "input"
|
||||
"""The key to load from the tool executor's run input dictionary."""
|
||||
tool_output_key: str = "output"
|
||||
"""The key to load from the tool executor's run output dictionary."""
|
||||
reference_output_key: Optional[str] = None
|
||||
"""The key to use for selecting the reference answer."""
|
||||
|
||||
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
|
||||
"""Maps the Run and Optional[Example] to a dictionary"""
|
||||
if run.child_runs is None:
|
||||
raise ValueError("Run must have child runs to be evaluated.")
|
||||
if run.outputs is None:
|
||||
raise ValueError("Run must have outputs to be evaluated.")
|
||||
reference = ""
|
||||
if example is not None and example.outputs:
|
||||
if self.reference_output_key is not None:
|
||||
reference = example.outputs[self.reference_output_key]
|
||||
elif "output" in example.outputs:
|
||||
reference = example.outputs["output"]
|
||||
elif len(example.outputs) == 1:
|
||||
reference = next(iter(example.outputs.values()))
|
||||
else:
|
||||
raise ValueError("Could not infer the reference answer from ")
|
||||
|
||||
question = run.inputs[self.agent_input_key]
|
||||
tool_runs = [
|
||||
run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
|
||||
]
|
||||
agent_steps = []
|
||||
for i, run_ in enumerate(tool_runs, 1):
|
||||
tool_output = (
|
||||
f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
|
||||
if run_.outputs
|
||||
else (f"Tool error: {run_.error}" if run_.error else "No output")
|
||||
)
|
||||
agent_steps.append(
|
||||
f"""Step {i}:
|
||||
Tool used: {run_.name}
|
||||
Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
|
||||
Tool output: {tool_output}"""
|
||||
)
|
||||
|
||||
return {
|
||||
"question": question,
|
||||
"agent_trajectory": "\n\n".join(agent_steps),
|
||||
"answer": run.outputs[self.agent_output_key],
|
||||
"reference": reference,
|
||||
}
|
||||
|
||||
|
||||
def get_trajectory_evaluator(
|
||||
llm: BaseChatModel,
|
||||
agent_tools: Sequence[BaseTool],
|
||||
*,
|
||||
input_key: str = "input",
|
||||
prediction_key: str = "output",
|
||||
tool_input_key: str = "input",
|
||||
tool_output_key: str = "output",
|
||||
reference_output_key: Optional[str] = None,
|
||||
evaluation_name: str = "Agent Trajectory",
|
||||
**kwargs: Any,
|
||||
) -> RunEvaluatorChain:
|
||||
"""Get an eval chain for grading a model's response against a map of criteria."""
|
||||
input_mapper = kwargs.pop(
|
||||
"input_mapper",
|
||||
TrajectoryInputMapper(
|
||||
agent_input_key=input_key,
|
||||
agent_output_key=prediction_key,
|
||||
tool_input_key=tool_input_key,
|
||||
tool_output_key=tool_output_key,
|
||||
reference_output_key=reference_output_key,
|
||||
),
|
||||
)
|
||||
parser = kwargs.pop(
|
||||
"output_parser",
|
||||
TrajectoryRunEvalOutputParser(evaluation_name=evaluation_name),
|
||||
)
|
||||
eval_chain = TrajectoryEvalChain.from_llm(
|
||||
llm=llm, agent_tools=agent_tools, return_reasoning=True, **kwargs
|
||||
)
|
||||
tags = kwargs.pop("tags", [])
|
||||
return RunEvaluatorChain(
|
||||
eval_chain=eval_chain,
|
||||
input_mapper=input_mapper,
|
||||
output_parser=parser,
|
||||
tags=tags + [evaluation_name],
|
||||
**kwargs,
|
||||
)
|
@@ -1,115 +0,0 @@
|
||||
""""Loading helpers for run evaluators."""
|
||||
from typing import Any, List, Optional, Sequence, Union
|
||||
|
||||
from langchainplus_sdk import RunEvaluator
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.evaluation.loading import load_evaluator
|
||||
from langchain.evaluation.run_evaluators.string_run_evaluator import (
|
||||
StringRunEvaluatorChain,
|
||||
)
|
||||
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
|
||||
from langchain.tools.base import Tool
|
||||
|
||||
|
||||
def load_run_evaluator_for_model(
|
||||
evaluator: EvaluatorType,
|
||||
model: Union[Chain, BaseLanguageModel, Tool],
|
||||
*,
|
||||
input_key: Optional[str] = None,
|
||||
prediction_key: Optional[str] = None,
|
||||
reference_key: Optional[str] = None,
|
||||
eval_llm: Optional[BaseLanguageModel] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[RunEvaluator]:
|
||||
"""Load evaluators specified by a list of evaluator types.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
evaluator: EvaluatorType
|
||||
The evaluator type to load.
|
||||
model : Union[Chain, BaseLanguageModel, Tool]
|
||||
The model to evaluate. Used to infer how to parse the run.
|
||||
input_key : Optional[str], a chain run's input key to map
|
||||
to the evaluator's input
|
||||
prediction_key : Optional[str], the key in the run's outputs to
|
||||
represent the Chain prediction
|
||||
reference_key : Optional[str], the key in the dataset example (row)
|
||||
outputs to represent the reference, or ground-truth label
|
||||
eval_llm : BaseLanguageModel, optional
|
||||
The language model to use for evaluation, if none is provided, a default
|
||||
ChatOpenAI gpt-4 model will be used.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to all evaluators.
|
||||
|
||||
Returns
|
||||
-------
|
||||
RunEvaluator
|
||||
The loaded Run evaluator.
|
||||
"""
|
||||
evaluator_ = load_evaluator(evaluator, llm=eval_llm, **kwargs)
|
||||
if isinstance(evaluator_, StringEvaluator):
|
||||
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
|
||||
model,
|
||||
evaluator_,
|
||||
input_key=input_key,
|
||||
prediction_key=prediction_key,
|
||||
reference_key=reference_key,
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(f"Run evaluator for {evaluator} is not implemented")
|
||||
return run_evaluator
|
||||
|
||||
|
||||
def load_run_evaluators_for_model(
|
||||
evaluators: Sequence[EvaluatorType],
|
||||
model: Union[Chain, BaseLanguageModel, Tool],
|
||||
*,
|
||||
input_key: Optional[str] = None,
|
||||
prediction_key: Optional[str] = None,
|
||||
reference_key: Optional[str] = None,
|
||||
eval_llm: Optional[BaseLanguageModel] = None,
|
||||
config: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[RunEvaluator]:
|
||||
"""Load evaluators specified by a list of evaluator types.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
evaluators : Sequence[EvaluatorType]
|
||||
The list of evaluator types to load.
|
||||
model : Union[Chain, BaseLanguageModel, Tool]
|
||||
The model to evaluate. Used to infer how to parse the run.
|
||||
input_key : Optional[str], a chain run's input key to map
|
||||
to the evaluator's input
|
||||
prediction_key : Optional[str], the key in the run's outputs to
|
||||
represent the Chain prediction
|
||||
reference_key : Optional[str], the key in the dataset example (row)
|
||||
outputs to represent the reference, or ground-truth label
|
||||
eval_llm : BaseLanguageModel, optional
|
||||
The language model to use for evaluation, if none is provided, a default
|
||||
ChatOpenAI gpt-4 model will be used.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to all evaluators.
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[RunEvaluator]
|
||||
The loaded Run evaluators.
|
||||
"""
|
||||
run_evaluators = []
|
||||
for evaluator in evaluators:
|
||||
_kwargs = config.get(evaluator, {}) if config else {}
|
||||
run_evaluators.append(
|
||||
load_run_evaluator_for_model(
|
||||
evaluator,
|
||||
model,
|
||||
input_key=input_key,
|
||||
prediction_key=prediction_key,
|
||||
reference_key=reference_key,
|
||||
eval_llm=eval_llm,
|
||||
**{**kwargs, **_kwargs},
|
||||
)
|
||||
)
|
||||
return run_evaluators
|
@@ -0,0 +1,102 @@
"""LangSmith utilities.

This module provides utilities for connecting to `LangSmith <https://smith.langchain.com/>`_. For more information on LangSmith, see the `LangSmith documentation <https://docs.smith.langchain.com/>`_.

**Evaluation**

LangSmith helps you evaluate Chains and other language model application components using a number of LangChain evaluators.
An example of this is shown below, assuming you've created a LangSmith dataset called ``<my_dataset_name>``:

.. code-block:: python

    from langsmith import Client
    from langchain.chat_models import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            RunEvalConfig.Criteria("helpfulness"),
            RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        "<my_dataset_name>",
        construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = RunEvalConfig(
        custom_evaluators=[MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        "<my_dataset_name>",
        construct_chain,
        evaluation=evaluation_config,
    )

**Primary Functions**

- :func:`arun_on_dataset <langchain.smith.evaluation.runner_utils.arun_on_dataset>`: Asynchronous function to evaluate a chain, agent, or other LangChain component over a dataset.
- :func:`run_on_dataset <langchain.smith.evaluation.runner_utils.run_on_dataset>`: Function to evaluate a chain, agent, or other LangChain component over a dataset.
- :class:`RunEvalConfig <langchain.smith.evaluation.config.RunEvalConfig>`: Class representing the configuration for running evaluation. You can select evaluators by :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>` or config, or you can pass in ``custom_evaluators``.
"""  # noqa: E501
from langchain.smith.evaluation import (
    RunEvalConfig,
    arun_on_dataset,
    run_on_dataset,
)

__all__ = [
    "arun_on_dataset",
    "run_on_dataset",
    "ChoicesOutputParser",
    "RunEvalConfig",
]
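
The docstring above only demonstrates the synchronous entry point. A minimal sketch of the asynchronous variant listed under Primary Functions, assuming it accepts the same arguments as `run_on_dataset` plus a `concurrency_level` like the pre-migration `arun_on_dataset` shown earlier in this diff:

.. code-block:: python

    import asyncio

    from langsmith import Client

    from langchain.chains import LLMChain
    from langchain.chat_models import ChatOpenAI
    from langchain.smith import RunEvalConfig, arun_on_dataset


    def construct_chain() -> LLMChain:
        llm = ChatOpenAI(temperature=0)
        return LLMChain.from_string(llm, "What's the answer to {your_input_key}")


    async def main() -> None:
        client = Client()
        await arun_on_dataset(
            client,
            "<my_dataset_name>",  # placeholder dataset name
            construct_chain,
            evaluation=RunEvalConfig(evaluators=["qa"]),
            concurrency_level=5,  # assumption: carried over from the old async runner shown above
        )


    asyncio.run(main())
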
@@ -0,0 +1,69 @@
"""LangSmith evaluation utilities.

This module provides utilities for evaluating Chains and other language model
applications using LangChain evaluators and LangSmith.

For more information on the LangSmith API, see the `LangSmith API documentation <https://docs.smith.langchain.com/docs/>`_.

**Example**

.. code-block:: python

    from langsmith import Client
    from langchain.chat_models import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import EvaluatorType, RunEvalConfig, run_on_dataset

    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    evaluation_config = RunEvalConfig(
        evaluators=[
            EvaluatorType.QA,  # "Correctness" against a reference answer
            EvaluatorType.EMBEDDING_DISTANCE,
            RunEvalConfig.Criteria("helpfulness"),
            RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        "<my_dataset_name>",
        construct_chain,
        evaluation=evaluation_config
    )

**Attributes**

- ``arun_on_dataset``: Asynchronous function to evaluate a chain or other LangChain component over a dataset.
- ``run_on_dataset``: Function to evaluate a chain or other LangChain component over a dataset.
- ``RunEvalConfig``: Class representing the configuration for running evaluation.
- ``StringRunEvaluatorChain``: Class representing a string run evaluator chain.
- ``InputFormatError``: Exception raised when the input format is incorrect.

"""  # noqa: E501


from langchain.smith.evaluation.config import RunEvalConfig
from langchain.smith.evaluation.runner_utils import (
    InputFormatError,
    arun_on_dataset,
    run_on_dataset,
)
from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain

__all__ = [
    "InputFormatError",
    "arun_on_dataset",
    "run_on_dataset",
    "StringRunEvaluatorChain",
    "RunEvalConfig",
]
@@ -0,0 +1,228 @@
"""Configuration for run evaluators."""

from typing import Any, Dict, List, Optional, Union

from langsmith import RunEvaluator
from pydantic import BaseModel, Field

from langchain.embeddings.base import Embeddings
from langchain.evaluation.criteria.eval_chain import CRITERIA_TYPE
from langchain.evaluation.embedding_distance.base import (
    EmbeddingDistance as EmbeddingDistanceEnum,
)
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.evaluation.string_distance.base import (
    StringDistance as StringDistanceEnum,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.prompt_template import BasePromptTemplate


class EvalConfig(BaseModel):
    """Configuration for a given run evaluator.

    Parameters
    ----------
    evaluator_type : EvaluatorType
        The type of evaluator to use.

    Methods
    -------
    get_kwargs()
        Get the keyword arguments for the evaluator configuration.

    """

    evaluator_type: EvaluatorType

    def get_kwargs(self) -> Dict[str, Any]:
        """Get the keyword arguments for the load_evaluator call.

        Returns
        -------
        Dict[str, Any]
            The keyword arguments for the load_evaluator call.

        """
        return self.dict(exclude={"evaluator_type"}, exclude_none=True)


class RunEvalConfig(BaseModel):
    """Configuration for a run evaluation.

    Parameters
    ----------
    evaluators : List[Union[EvaluatorType, EvalConfig]]
        Configurations for which evaluators to apply to the dataset run.
        Each can be the string of an :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
        as EvaluatorType.QA, the evaluator type string ("qa"), or a configuration for a
        given evaluator (e.g., :class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`).

    custom_evaluators : Optional[List[Union[RunEvaluator, StringEvaluator]]]
        Custom evaluators to apply to the dataset run.

    reference_key : Optional[str]
        The key in the dataset run to use as the reference string.
        If not provided, it will be inferred automatically.

    prediction_key : Optional[str]
        The key from the traced run's outputs dictionary to use to
        represent the prediction. If not provided, it will be inferred
        automatically.

    input_key : Optional[str]
        The key from the traced run's inputs dictionary to use to represent the
        input. If not provided, it will be inferred automatically.

    eval_llm : Optional[BaseLanguageModel]
        The language model to pass to any evaluators that use a language model.
    """  # noqa: E501

    evaluators: List[Union[EvaluatorType, EvalConfig]] = Field(default_factory=list)
    """Configurations for which evaluators to apply to the dataset run.
    Each can be the string of an
    :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
    as `EvaluatorType.QA`, the evaluator type string ("qa"), or a configuration for a
    given evaluator
    (e.g.,
    :class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`)."""  # noqa: E501
    custom_evaluators: Optional[List[Union[RunEvaluator, StringEvaluator]]] = None
    """Custom evaluators to apply to the dataset run."""
    reference_key: Optional[str] = None
    """The key in the dataset run to use as the reference string.
    If not provided, we will attempt to infer automatically."""
    prediction_key: Optional[str] = None
    """The key from the traced run's outputs dictionary to use to
    represent the prediction. If not provided, it will be inferred
    automatically."""
|
||||
input_key: Optional[str] = None
|
||||
"""The key from the traced run's inputs dictionary to use to represent the
|
||||
input. If not provided, it will be inferred automatically."""
|
||||
eval_llm: Optional[BaseLanguageModel] = None
|
||||
"""The language model to pass to any evaluators that require one."""
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
class Criteria(EvalConfig):
|
||||
"""Configuration for a reference-free criteria evaluator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
criteria : Optional[CRITERIA_TYPE]
|
||||
The criteria to evaluate.
|
||||
llm : Optional[BaseLanguageModel]
|
||||
The language model to use for the evaluation chain.
|
||||
|
||||
"""
|
||||
|
||||
criteria: Optional[CRITERIA_TYPE] = None
|
||||
llm: Optional[BaseLanguageModel] = None
|
||||
evaluator_type: EvaluatorType = EvaluatorType.CRITERIA
|
||||
|
||||
def __init__(
|
||||
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
|
||||
) -> None:
|
||||
super().__init__(criteria=criteria, **kwargs)
|
||||
|
||||
class LabeledCriteria(EvalConfig):
|
||||
"""Configuration for a labeled (with references) criteria evaluator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
criteria : Optional[CRITERIA_TYPE]
|
||||
The criteria to evaluate.
|
||||
llm : Optional[BaseLanguageModel]
|
||||
The language model to use for the evaluation chain.
|
||||
"""
|
||||
|
||||
criteria: Optional[CRITERIA_TYPE] = None
|
||||
llm: Optional[BaseLanguageModel] = None
|
||||
evaluator_type: EvaluatorType = EvaluatorType.LABELED_CRITERIA
|
||||
|
||||
def __init__(
|
||||
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
|
||||
) -> None:
|
||||
super().__init__(criteria=criteria, **kwargs)
|
||||
|
||||
class EmbeddingDistance(EvalConfig):
|
||||
"""Configuration for an embedding distance evaluator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
embeddings : Optional[Embeddings]
|
||||
The embeddings to use for computing the distance.
|
||||
|
||||
distance_metric : Optional[EmbeddingDistanceEnum]
|
||||
The distance metric to use for computing the distance.
|
||||
|
||||
"""
|
||||
|
||||
evaluator_type: EvaluatorType = EvaluatorType.EMBEDDING_DISTANCE
|
||||
embeddings: Optional[Embeddings] = None
|
||||
distance_metric: Optional[EmbeddingDistanceEnum] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
class StringDistance(EvalConfig):
|
||||
"""Configuration for a string distance evaluator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
distance : Optional[StringDistanceEnum]
|
||||
The string distance metric to use.
|
||||
|
||||
"""
|
||||
|
||||
evaluator_type: EvaluatorType = EvaluatorType.STRING_DISTANCE
|
||||
distance: Optional[StringDistanceEnum] = None
|
||||
|
||||
class QA(EvalConfig):
|
||||
"""Configuration for a QA evaluator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
prompt : Optional[BasePromptTemplate]
|
||||
The prompt template to use for generating the question.
|
||||
llm : Optional[BaseLanguageModel]
|
||||
The language model to use for the evaluation chain.
|
||||
"""
|
||||
|
||||
evaluator_type: EvaluatorType = EvaluatorType.QA
|
||||
llm: Optional[BaseLanguageModel] = None
|
||||
prompt: Optional[BasePromptTemplate] = None
|
||||
|
||||
class ContextQA(EvalConfig):
|
||||
"""Configuration for a context-based QA evaluator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
prompt : Optional[BasePromptTemplate]
|
||||
The prompt template to use for generating the question.
|
||||
llm : Optional[BaseLanguageModel]
|
||||
The language model to use for the evaluation chain.
|
||||
|
||||
"""
|
||||
|
||||
evaluator_type: EvaluatorType = EvaluatorType.CONTEXT_QA
|
||||
llm: Optional[BaseLanguageModel] = None
|
||||
prompt: Optional[BasePromptTemplate] = None
|
||||
|
||||
    class CoTQA(EvalConfig):
        """Configuration for a chain-of-thought QA evaluator.

        Parameters
        ----------
        prompt : Optional[BasePromptTemplate]
            The prompt template to use for generating the question.
        llm : Optional[BaseLanguageModel]
            The language model to use for the evaluation chain.

        """

        evaluator_type: EvaluatorType = EvaluatorType.COT_QA
        llm: Optional[BaseLanguageModel] = None
        prompt: Optional[BasePromptTemplate] = None

    # TODO: Trajectory
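A hedged sketch of how these nested configuration classes might be combined in practice (the criteria text, reference key, and grader model are illustrative assumptions, not values from this commit):

from langchain.chat_models import ChatOpenAI
from langchain.smith import RunEvalConfig

eval_config = RunEvalConfig(
    evaluators=[
        # Fully specified evaluator configs...
        RunEvalConfig.LabeledCriteria(
            {"cites-source": "Does the answer name the source it relied on?"}
        ),
        RunEvalConfig.EmbeddingDistance(),
        # ...or bare evaluator type strings.
        "qa",
    ],
    reference_key="some_output",  # dataset column holding the reference answer
    eval_llm=ChatOpenAI(temperature=0),  # shared grader model for LLM-based evaluators
)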
|
File diff suppressed because it is too large
@ -1,81 +0,0 @@
|
||||
import sys
from typing import Iterator
from uuid import uuid4

import pytest
from langchainplus_sdk import LangChainPlusClient as Client

from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.client.runner_utils import run_on_dataset
from langchain.evaluation import EvaluatorType
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.llms.openai import OpenAI


@pytest.fixture(
    scope="module",
)
def dataset_name() -> Iterator[str]:
    import pandas as pd

    client = Client()
    df = pd.DataFrame(
        [
            {"question": "5", "answer": 5.0},
            {"question": "5 + 3", "answer": 8.0},
            {"question": "2^3.171", "answer": 9.006708689094099},
            {"question": " 2 ^3.171 ", "answer": 9.006708689094099},
        ]
    )

    uid = str(uuid4())[-8:]
    _dataset_name = f"lcp integration tests - {uid}"
    client.upload_dataframe(
        df,
        name=_dataset_name,
        input_keys=["question"],
        output_keys=["answer"],
        description="Integration test dataset",
    )
    yield _dataset_name


def test_chat_model(dataset_name: str) -> None:
    llm = ChatOpenAI(temperature=0)
    evaluators = load_run_evaluators_for_model(
        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
    )
    results = run_on_dataset(
        dataset_name,
        llm,
        run_evaluators=evaluators,
    )
    print("CHAT", results, file=sys.stderr)


def test_llm(dataset_name: str) -> None:
    llm = OpenAI(temperature=0)
    evaluators = load_run_evaluators_for_model(
        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
    )
    results = run_on_dataset(
        dataset_name,
        llm,
        run_evaluators=evaluators,
    )
    print("LLM", results, file=sys.stderr)


def test_chain(dataset_name: str) -> None:
    llm = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
    evaluators = load_run_evaluators_for_model(
        [EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer"
    )
    results = run_on_dataset(
        dataset_name,
        lambda: chain,
        run_evaluators=evaluators,
    )
    print("CHAIN", results, file=sys.stderr)
|
@ -0,0 +1,429 @@
|
||||
from typing import Iterator, List
from uuid import uuid4

import pytest
from langsmith import Client as Client
from langsmith.schemas import DataType

from langchain.callbacks.tracers.evaluation import wait_for_all_evaluators
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType
from langchain.llms.openai import OpenAI
from langchain.schema.messages import BaseMessage, HumanMessage
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.smith.evaluation import InputFormatError


def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
    # Assert that all runs completed, all feedback completed, and that the
    # chain or llm passes for the feedback provided.
    runs = list(client.list_runs(project_name=_project_name, execution_order=1))
    assert len(runs) == 4
    wait_for_all_evaluators()
    feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
    assert len(feedback) == 8
    assert all([f.score == 1 for f in feedback])


@pytest.fixture
def eval_project_name() -> str:
    return f"lcp integration tests - {str(uuid4())[-8:]}"


@pytest.fixture(scope="module")
def client() -> Client:
    return Client()


@pytest.fixture(
    scope="module",
)
def kv_dataset_name() -> Iterator[str]:
    import pandas as pd

    client = Client()
    df = pd.DataFrame(
        {
            "some_input": [
                "What's the capital of California?",
                "What's the capital of Nevada?",
                "What's the capital of Oregon?",
                "What's the capital of Washington?",
            ],
            "other_input": [
                "a",
                "b",
                "c",
                "d",
            ],
            "some_output": ["Sacramento", "Carson City", "Salem", "Olympia"],
            "other_output": ["e", "f", "g", "h"],
        }
    )

    uid = str(uuid4())[-8:]
    _dataset_name = f"lcp kv dataset integration tests - {uid}"
    client.upload_dataframe(
        df,
        name=_dataset_name,
        input_keys=["some_input", "other_input"],
        output_keys=["some_output", "other_output"],
        description="Integration test dataset",
    )
    yield _dataset_name


def test_chat_model(
    kv_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = ChatOpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    with pytest.raises(ValueError, match="Must specify reference_key"):
        run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
    eval_config = RunEvalConfig(
        evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
        reference_key="some_output",
    )
    with pytest.raises(
        InputFormatError, match="Example inputs do not match language model"
    ):
        run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)

    def input_mapper(d: dict) -> List[BaseMessage]:
        return [HumanMessage(content=d["some_input"])]

    run_on_dataset(
        client,
        kv_dataset_name,
        llm,
        evaluation=eval_config,
        input_mapper=input_mapper,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
    llm = OpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    with pytest.raises(ValueError, match="Must specify reference_key"):
        run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
    eval_config = RunEvalConfig(
        evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
        reference_key="some_output",
    )
    with pytest.raises(
        InputFormatError, match="Example inputs do not match language model"
    ):
        run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)

    def input_mapper(d: dict) -> str:
        return d["some_input"]

    run_on_dataset(
        client,
        kv_dataset_name,
        llm,
        evaluation=eval_config,
        input_mapper=input_mapper,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
    llm = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    with pytest.raises(ValueError, match="Must specify reference_key"):
        run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
    eval_config = RunEvalConfig(
        evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
        reference_key="some_output",
    )
    with pytest.raises(
        InputFormatError, match="Example inputs do not match chain input keys"
    ):
        run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)

    def input_mapper(d: dict) -> dict:
        return {"input": d["some_input"]}

    with pytest.raises(
        InputFormatError,
        match=" match the chain's expected input keys.",
    ):
        run_on_dataset(
            client,
            kv_dataset_name,
            lambda: chain,
            evaluation=eval_config,
            input_mapper=input_mapper,
        )

    def right_input_mapper(d: dict) -> dict:
        return {"question": d["some_input"]}

    run_on_dataset(
        client,
        kv_dataset_name,
        lambda: chain,
        evaluation=eval_config,
        input_mapper=right_input_mapper,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


### Testing Chat Datasets


@pytest.fixture(
    scope="module",
)
def chat_dataset_name() -> Iterator[str]:
    def _create_message(txt: str, role: str = "human") -> List[dict]:
        return [{"type": role, "data": {"content": txt}}]

    import pandas as pd

    client = Client()
    df = pd.DataFrame(
        {
            "input": [
                _create_message(txt)
                for txt in (
                    "What's the capital of California?",
                    "What's the capital of Nevada?",
                    "What's the capital of Oregon?",
                    "What's the capital of Washington?",
                )
            ],
            "output": [
                _create_message(txt, role="ai")[0]
                for txt in ("Sacramento", "Carson City", "Salem", "Olympia")
            ],
        }
    )

    uid = str(uuid4())[-8:]
    _dataset_name = f"lcp chat dataset integration tests - {uid}"
    ds = client.create_dataset(
        _dataset_name, description="Integration test dataset", data_type=DataType.chat
    )
    for row in df.itertuples():
        client.create_example(
            dataset_id=ds.id,
            inputs={"input": row.input},
            outputs={"output": row.output},
        )
    yield _dataset_name


def test_chat_model_on_chat_dataset(
    chat_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = ChatOpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    run_on_dataset(
        client,
        chat_dataset_name,
        llm,
        evaluation=eval_config,
        project_name=eval_project_name,
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_llm_on_chat_dataset(
    chat_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = OpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    run_on_dataset(
        client,
        chat_dataset_name,
        llm,
        evaluation=eval_config,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_chain_on_chat_dataset(chat_dataset_name: str, client: Client) -> None:
    llm = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    with pytest.raises(
        ValueError, match="Cannot evaluate a chain on dataset with data_type=chat"
    ):
        run_on_dataset(
            client,
            chat_dataset_name,
            lambda: chain,
            evaluation=eval_config,
        )


@pytest.fixture(
    scope="module",
)
def llm_dataset_name() -> Iterator[str]:
    import pandas as pd

    client = Client()
    df = pd.DataFrame(
        {
            "input": [
                "What's the capital of California?",
                "What's the capital of Nevada?",
                "What's the capital of Oregon?",
                "What's the capital of Washington?",
            ],
            "output": ["Sacramento", "Carson City", "Salem", "Olympia"],
        }
    )

    uid = str(uuid4())[-8:]
    _dataset_name = f"lcp llm dataset integration tests - {uid}"
    client.upload_dataframe(
        df,
        name=_dataset_name,
        input_keys=["input"],
        output_keys=["output"],
        description="Integration test dataset",
        data_type=DataType.llm,
    )
    yield _dataset_name


def test_chat_model_on_llm_dataset(
    llm_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = ChatOpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    run_on_dataset(
        client,
        llm_dataset_name,
        llm,
        evaluation=eval_config,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_llm_on_llm_dataset(
    llm_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = OpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    run_on_dataset(
        client,
        llm_dataset_name,
        llm,
        evaluation=eval_config,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_chain_on_llm_dataset(llm_dataset_name: str, client: Client) -> None:
    llm = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    with pytest.raises(
        ValueError, match="Cannot evaluate a chain on dataset with data_type=llm"
    ):
        run_on_dataset(
            client,
            llm_dataset_name,
            lambda: chain,
            evaluation=eval_config,
        )


@pytest.fixture(
    scope="module",
)
def kv_singleio_dataset_name() -> Iterator[str]:
    import pandas as pd

    client = Client()
    df = pd.DataFrame(
        {
            "the wackiest input": [
                "What's the capital of California?",
                "What's the capital of Nevada?",
                "What's the capital of Oregon?",
                "What's the capital of Washington?",
            ],
            "unthinkable output": ["Sacramento", "Carson City", "Salem", "Olympia"],
        }
    )

    uid = str(uuid4())[-8:]
    _dataset_name = f"lcp singleio kv dataset integration tests - {uid}"
    client.upload_dataframe(
        df,
        name=_dataset_name,
        input_keys=["the wackiest input"],
        output_keys=["unthinkable output"],
        description="Integration test dataset",
    )
    yield _dataset_name


def test_chat_model_on_kv_singleio_dataset(
    kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = ChatOpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    run_on_dataset(
        client,
        kv_singleio_dataset_name,
        llm,
        evaluation=eval_config,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_llm_on_kv_singleio_dataset(
    kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = OpenAI(temperature=0)
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    run_on_dataset(
        client,
        kv_singleio_dataset_name,
        llm,
        evaluation=eval_config,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)


def test_chain_on_kv_singleio_dataset(
    kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
    llm = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
    run_on_dataset(
        client,
        kv_singleio_dataset_name,
        lambda: chain,
        evaluation=eval_config,
        project_name=eval_project_name,
        tags=["shouldpass"],
    )
    _check_all_feedback_passed(eval_project_name, client)
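The `input_mapper` arguments exercised above adapt arbitrary dataset columns to whatever the evaluated chain or model expects; a minimal sketch for the multi-key fixture (the chain's `question` input key is an assumption, as in the tests above):

def kv_to_chain_inputs(example_inputs: dict) -> dict:
    # Keep only the column the chain cares about and rename it to the
    # chain's prompt variable before the run is traced and evaluated.
    return {"question": example_inputs["some_input"]}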
|
@ -1,54 +0,0 @@
|
||||
"""Test run evaluator implementations basic functionality."""
|
||||
|
||||
from uuid import UUID
|
||||
|
||||
import pytest
|
||||
from langchainplus_sdk.schemas import Example, Run
|
||||
|
||||
from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator
|
||||
from tests.unit_tests.llms.fake_llm import FakeLLM
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def run() -> Run:
|
||||
return Run(
|
||||
id=UUID("f77cd087-48f7-4c62-9e0e-297842202107"),
|
||||
name="My Run",
|
||||
inputs={"input": "What is the answer to life, the universe, and everything?"},
|
||||
outputs={"output": "The answer is 42."},
|
||||
start_time="2021-07-20T15:00:00.000000+00:00",
|
||||
end_time="2021-07-20T15:00:00.000000+00:00",
|
||||
run_type="chain",
|
||||
execution_order=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example() -> Example:
|
||||
return Example(
|
||||
id=UUID("f77cd087-48f7-4c62-9e0e-297842202106"),
|
||||
dataset_id=UUID("f77cd087-48f7-4c62-9e0e-297842202105"),
|
||||
inputs={"input": "What is the answer to life, the universe, and everything?"},
|
||||
outputs={"output": "The answer is 42."},
|
||||
created_at="2021-07-20T15:00:00.000000+00:00",
|
||||
)
|
||||
|
||||
|
||||
def test_get_qa_evaluator(run: Run, example: Example) -> None:
|
||||
"""Test get_qa_evaluator."""
|
||||
eval_llm = FakeLLM(
|
||||
queries={"a": "This checks out.\nCORRECT"}, sequential_responses=True
|
||||
)
|
||||
qa_evaluator = get_qa_evaluator(eval_llm)
|
||||
res = qa_evaluator.evaluate_run(run, example)
|
||||
assert res.value == "CORRECT"
|
||||
assert res.score == 1
|
||||
|
||||
|
||||
def test_get_criteria_evaluator(run: Run, example: Example) -> None:
|
||||
"""Get a criteria evaluator."""
|
||||
eval_llm = FakeLLM(queries={"a": "This checks out.\nY"}, sequential_responses=True)
|
||||
criteria_evaluator = get_criteria_evaluator(eval_llm, criteria="conciseness")
|
||||
res = criteria_evaluator.evaluate_run(run, example)
|
||||
assert res.value == "Y"
|
||||
assert res.score == 1
|
@ -1,114 +0,0 @@
|
||||
"""Test the loading function for evaluators."""
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
|
||||
from langchain.evaluation.loading import load_evaluators
|
||||
from langchain.evaluation.run_evaluators.string_run_evaluator import (
|
||||
StringRunEvaluatorChain,
|
||||
)
|
||||
from langchain.evaluation.schema import StringEvaluator
|
||||
from tests.unit_tests.chains.test_base import FakeChain
|
||||
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
|
||||
from tests.unit_tests.llms.fake_llm import FakeLLM
|
||||
|
||||
|
||||
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
|
||||
def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
|
||||
"""Test loading evaluators."""
|
||||
fake_llm = FakeLLM(
|
||||
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
|
||||
)
|
||||
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
|
||||
if not isinstance(evaluator, StringEvaluator):
|
||||
raise ValueError("Evaluator is not a string evaluator")
|
||||
model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
|
||||
kwargs = {}
|
||||
if evaluator.requires_reference:
|
||||
kwargs["reference_key"] = "generations"
|
||||
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
|
||||
model, evaluator, **kwargs
|
||||
)
|
||||
callback = RunCollectorCallbackHandler()
|
||||
model.predict("Foo input", callbacks=[callback])
|
||||
run = callback.traced_runs[0]
|
||||
example = MagicMock()
|
||||
example.inputs = {}
|
||||
example.outputs = {"generations": "Foo output"}
|
||||
result = run_evaluator._prepare_input({"run": run, "example": example})
|
||||
assert result["input"] == "Foo input"
|
||||
assert result["prediction"] == "Foo output"
|
||||
if evaluator.requires_reference:
|
||||
assert "reference" in result
|
||||
assert result["reference"] == "Foo output"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
|
||||
def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
|
||||
"""Test loading evaluators."""
|
||||
fake_llm = FakeLLM(
|
||||
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
|
||||
)
|
||||
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
|
||||
if not isinstance(evaluator, StringEvaluator):
|
||||
raise ValueError("Evaluator is not a string evaluator")
|
||||
model = FakeChatModel()
|
||||
kwargs = {}
|
||||
if evaluator.requires_reference:
|
||||
kwargs["reference_key"] = "generations"
|
||||
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
|
||||
model, evaluator, **kwargs
|
||||
)
|
||||
callback = RunCollectorCallbackHandler()
|
||||
model.predict("Foo input", callbacks=[callback])
|
||||
run = callback.traced_runs[0]
|
||||
example = MagicMock()
|
||||
example.inputs = {}
|
||||
example.outputs = {"generations": "Another fake response"}
|
||||
result = run_evaluator._prepare_input({"run": run, "example": example})
|
||||
assert result["input"] == "Human: Foo input"
|
||||
assert result["prediction"] == "AI: fake response"
|
||||
if evaluator.requires_reference:
|
||||
assert "reference" in result
|
||||
assert result["reference"] == "Another fake response"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
|
||||
def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
|
||||
model = FakeChain(
|
||||
the_input_keys=["an_input", "another_input"],
|
||||
)
|
||||
fake_llm = FakeChatModel()
|
||||
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
|
||||
if not isinstance(evaluator, StringEvaluator):
|
||||
raise ValueError("Evaluator is not a string evaluator")
|
||||
# No input key
|
||||
with pytest.raises(ValueError, match="multiple input keys"):
|
||||
StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
|
||||
with pytest.raises(ValueError, match="does not have specified"):
|
||||
StringRunEvaluatorChain.from_model_and_evaluator(
|
||||
model, evaluator, input_key="some_input"
|
||||
)
|
||||
kwargs = {}
|
||||
if evaluator.requires_reference:
|
||||
kwargs["reference_key"] = "label_column"
|
||||
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
|
||||
model, evaluator, input_key="an_input", **kwargs
|
||||
)
|
||||
callback = RunCollectorCallbackHandler()
|
||||
model(
|
||||
{"an_input": "Foo input", "another_input": "Another fake response"},
|
||||
callbacks=[callback],
|
||||
)
|
||||
run = callback.traced_runs[0]
|
||||
example = MagicMock()
|
||||
example.inputs = {}
|
||||
example.outputs = {"label_column": "Another fake response"}
|
||||
result = run_evaluator._prepare_input({"run": run, "example": example})
|
||||
assert result["input"] == "Foo input"
|
||||
assert result["prediction"] == "baz"
|
||||
if evaluator.requires_reference:
|
||||
assert "reference" in result
|
||||
assert result["reference"] == "Another fake response"
|
@ -0,0 +1,347 @@
|
||||
"""Test the LangSmith evaluation helpers."""
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from langsmith.client import Client
|
||||
from langsmith.schemas import Dataset, Example
|
||||
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.transform import TransformChain
|
||||
from langchain.schema.language_model import BaseLanguageModel
|
||||
from langchain.smith.evaluation.runner_utils import (
|
||||
InputFormatError,
|
||||
_get_messages,
|
||||
_get_prompt,
|
||||
_run_llm,
|
||||
_run_llm_or_chain,
|
||||
_validate_example_inputs_for_chain,
|
||||
_validate_example_inputs_for_language_model,
|
||||
arun_on_dataset,
|
||||
)
|
||||
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
|
||||
from tests.unit_tests.llms.fake_llm import FakeLLM
|
||||
|
||||
_CREATED_AT = datetime(2015, 1, 1, 0, 0, 0)
|
||||
_TENANT_ID = "7a3d2b56-cd5b-44e5-846f-7eb6e8144ce4"
|
||||
_EXAMPLE_MESSAGE = {
|
||||
"data": {"content": "Foo", "example": False, "additional_kwargs": {}},
|
||||
"type": "human",
|
||||
}
|
||||
_VALID_MESSAGES = [
|
||||
{"messages": [_EXAMPLE_MESSAGE], "other_key": "value"},
|
||||
{"messages": [], "other_key": "value"},
|
||||
{
|
||||
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]],
|
||||
"other_key": "value",
|
||||
},
|
||||
{"any_key": [_EXAMPLE_MESSAGE]},
|
||||
{"any_key": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]]},
|
||||
]
|
||||
_VALID_PROMPTS = [
|
||||
{"prompts": ["foo"], "other_key": "value"},
|
||||
{"prompt": "foo", "other_key": ["bar", "baz"]},
|
||||
{"some_key": "foo"},
|
||||
{"some_key": ["foo"]},
|
||||
]
|
||||
|
||||
_INVALID_PROMPTS = (
|
||||
[
|
||||
{"prompts": "foo"},
|
||||
{"prompt": ["foo"]},
|
||||
{"some_key": 3},
|
||||
{"some_key": "foo", "other_key": "bar"},
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"inputs",
|
||||
_VALID_MESSAGES,
|
||||
)
|
||||
def test__get_messages_valid(inputs: Dict[str, Any]) -> None:
|
||||
{"messages": []}
|
||||
_get_messages(inputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"inputs",
|
||||
_VALID_PROMPTS,
|
||||
)
|
||||
def test__get_prompts_valid(inputs: Dict[str, Any]) -> None:
|
||||
_get_prompt(inputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"inputs",
|
||||
_VALID_PROMPTS,
|
||||
)
|
||||
def test__validate_example_inputs_for_language_model(inputs: Dict[str, Any]) -> None:
|
||||
mock_ = mock.MagicMock()
|
||||
mock_.inputs = inputs
|
||||
_validate_example_inputs_for_language_model(mock_, None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"inputs",
|
||||
_INVALID_PROMPTS,
|
||||
)
|
||||
def test__validate_example_inputs_for_language_model_invalid(
|
||||
inputs: Dict[str, Any]
|
||||
) -> None:
|
||||
mock_ = mock.MagicMock()
|
||||
mock_.inputs = inputs
|
||||
with pytest.raises(InputFormatError):
|
||||
_validate_example_inputs_for_language_model(mock_, None)
|
||||
|
||||
|
||||
def test__validate_example_inputs_for_chain_single_input() -> None:
|
||||
mock_ = mock.MagicMock()
|
||||
mock_.inputs = {"foo": "bar"}
|
||||
chain = mock.MagicMock()
|
||||
chain.input_keys = ["def not foo"]
|
||||
_validate_example_inputs_for_chain(mock_, chain, None)
|
||||
|
||||
|
||||
def test__validate_example_inputs_for_chain_input_mapper() -> None:
|
||||
mock_ = mock.MagicMock()
|
||||
mock_.inputs = {"foo": "bar", "baz": "qux"}
|
||||
chain = mock.MagicMock()
|
||||
chain.input_keys = ["not foo", "not baz", "not qux"]
|
||||
|
||||
def wrong_output_format(inputs: dict) -> str:
|
||||
assert "foo" in inputs
|
||||
assert "baz" in inputs
|
||||
return "hehe"
|
||||
|
||||
with pytest.raises(InputFormatError, match="must be a dictionary"):
|
||||
_validate_example_inputs_for_chain(mock_, chain, wrong_output_format)
|
||||
|
||||
def wrong_output_keys(inputs: dict) -> dict:
|
||||
assert "foo" in inputs
|
||||
assert "baz" in inputs
|
||||
return {"not foo": "foo", "not baz": "baz"}
|
||||
|
||||
with pytest.raises(InputFormatError, match="keys that match"):
|
||||
_validate_example_inputs_for_chain(mock_, chain, wrong_output_keys)
|
||||
|
||||
def input_mapper(inputs: dict) -> dict:
|
||||
assert "foo" in inputs
|
||||
assert "baz" in inputs
|
||||
return {"not foo": inputs["foo"], "not baz": inputs["baz"], "not qux": "qux"}
|
||||
|
||||
_validate_example_inputs_for_chain(mock_, chain, input_mapper)
|
||||
|
||||
|
||||
def test__validate_example_inputs_for_chain_multi_io() -> None:
|
||||
mock_ = mock.MagicMock()
|
||||
mock_.inputs = {"foo": "bar", "baz": "qux"}
|
||||
chain = mock.MagicMock()
|
||||
chain.input_keys = ["foo", "baz"]
|
||||
_validate_example_inputs_for_chain(mock_, chain, None)
|
||||
|
||||
|
||||
def test__validate_example_inputs_for_chain_single_input_multi_expect() -> None:
|
||||
mock_ = mock.MagicMock()
|
||||
mock_.inputs = {"foo": "bar"}
|
||||
chain = mock.MagicMock()
|
||||
chain.input_keys = ["def not foo", "oh here is another"]
|
||||
with pytest.raises(
|
||||
InputFormatError, match="Example inputs do not match chain input keys."
|
||||
):
|
||||
_validate_example_inputs_for_chain(mock_, chain, None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("inputs", _INVALID_PROMPTS)
|
||||
def test__get_prompts_invalid(inputs: Dict[str, Any]) -> None:
|
||||
with pytest.raises(InputFormatError):
|
||||
_get_prompt(inputs)
|
||||
|
||||
|
||||
def test_run_llm_or_chain_with_input_mapper() -> None:
|
||||
example = Example(
|
||||
id=uuid.uuid4(),
|
||||
created_at=_CREATED_AT,
|
||||
inputs={"the wrong input": "1", "another key": "2"},
|
||||
outputs={"output": "2"},
|
||||
dataset_id=str(uuid.uuid4()),
|
||||
)
|
||||
|
||||
def run_val(inputs: dict) -> dict:
|
||||
assert "the right input" in inputs
|
||||
return {"output": "2"}
|
||||
|
||||
mock_chain = TransformChain(
|
||||
input_variables=["the right input"],
|
||||
output_variables=["output"],
|
||||
transform=run_val,
|
||||
)
|
||||
|
||||
def input_mapper(inputs: dict) -> dict:
|
||||
assert "the wrong input" in inputs
|
||||
return {"the right input": inputs["the wrong input"]}
|
||||
|
||||
result = _run_llm_or_chain(
|
||||
example, lambda: mock_chain, n_repetitions=1, input_mapper=input_mapper
|
||||
)
|
||||
assert len(result) == 1
|
||||
assert result[0] == {"output": "2", "the right input": "1"}
|
||||
bad_result = _run_llm_or_chain(
|
||||
example,
|
||||
lambda: mock_chain,
|
||||
n_repetitions=1,
|
||||
)
|
||||
assert len(bad_result) == 1
|
||||
assert "Error" in bad_result[0]
|
||||
|
||||
# Try with LLM
|
||||
def llm_input_mapper(inputs: dict) -> str:
|
||||
assert "the wrong input" in inputs
|
||||
return "the right input"
|
||||
|
||||
mock_llm = FakeLLM(queries={"the right input": "somenumber"})
|
||||
result = _run_llm_or_chain(
|
||||
example, mock_llm, n_repetitions=1, input_mapper=llm_input_mapper
|
||||
)
|
||||
assert len(result) == 1
|
||||
llm_result = result[0]
|
||||
assert isinstance(llm_result, str)
|
||||
assert llm_result == "somenumber"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"inputs",
|
||||
[
|
||||
{"one_key": [_EXAMPLE_MESSAGE], "other_key": "value"},
|
||||
{
|
||||
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE], _EXAMPLE_MESSAGE],
|
||||
"other_key": "value",
|
||||
},
|
||||
{"prompts": "foo"},
|
||||
{},
|
||||
],
|
||||
)
|
||||
def test__get_messages_invalid(inputs: Dict[str, Any]) -> None:
|
||||
with pytest.raises(InputFormatError):
|
||||
_get_messages(inputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("inputs", _VALID_PROMPTS + _VALID_MESSAGES)
|
||||
def test_run_llm_all_formats(inputs: Dict[str, Any]) -> None:
|
||||
llm = FakeLLM()
|
||||
_run_llm(llm, inputs, mock.MagicMock())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("inputs", _VALID_MESSAGES + _VALID_PROMPTS)
|
||||
def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
|
||||
llm = FakeChatModel()
|
||||
_run_llm(llm, inputs, mock.MagicMock())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
dataset = Dataset(
|
||||
id=uuid.uuid4(),
|
||||
name="test",
|
||||
description="Test dataset",
|
||||
owner_id="owner",
|
||||
created_at=_CREATED_AT,
|
||||
tenant_id=_TENANT_ID,
|
||||
)
|
||||
uuids = [
|
||||
"0c193153-2309-4704-9a47-17aee4fb25c8",
|
||||
"0d11b5fd-8e66-4485-b696-4b55155c0c05",
|
||||
"90d696f0-f10d-4fd0-b88b-bfee6df08b84",
|
||||
"4ce2c6d8-5124-4c0c-8292-db7bdebcf167",
|
||||
"7b5a524c-80fa-4960-888e-7d380f9a11ee",
|
||||
]
|
||||
examples = [
|
||||
Example(
|
||||
id=uuids[0],
|
||||
created_at=_CREATED_AT,
|
||||
inputs={"input": "1"},
|
||||
outputs={"output": "2"},
|
||||
dataset_id=str(uuid.uuid4()),
|
||||
),
|
||||
Example(
|
||||
id=uuids[1],
|
||||
created_at=_CREATED_AT,
|
||||
inputs={"input": "3"},
|
||||
outputs={"output": "4"},
|
||||
dataset_id=str(uuid.uuid4()),
|
||||
),
|
||||
Example(
|
||||
id=uuids[2],
|
||||
created_at=_CREATED_AT,
|
||||
inputs={"input": "5"},
|
||||
outputs={"output": "6"},
|
||||
dataset_id=str(uuid.uuid4()),
|
||||
),
|
||||
Example(
|
||||
id=uuids[3],
|
||||
created_at=_CREATED_AT,
|
||||
inputs={"input": "7"},
|
||||
outputs={"output": "8"},
|
||||
dataset_id=str(uuid.uuid4()),
|
||||
),
|
||||
Example(
|
||||
id=uuids[4],
|
||||
created_at=_CREATED_AT,
|
||||
inputs={"input": "9"},
|
||||
outputs={"output": "10"},
|
||||
dataset_id=str(uuid.uuid4()),
|
||||
),
|
||||
]
|
||||
|
||||
def mock_read_dataset(*args: Any, **kwargs: Any) -> Dataset:
|
||||
return dataset
|
||||
|
||||
def mock_list_examples(*args: Any, **kwargs: Any) -> Iterator[Example]:
|
||||
return iter(examples)
|
||||
|
||||
async def mock_arun_chain(
|
||||
example: Example,
|
||||
llm_or_chain: Union[BaseLanguageModel, Chain],
|
||||
n_repetitions: int,
|
||||
tags: Optional[List[str]] = None,
|
||||
callbacks: Optional[Any] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Dict[str, Any]]:
|
||||
return [
|
||||
{"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
|
||||
]
|
||||
|
||||
def mock_create_project(*args: Any, **kwargs: Any) -> None:
|
||||
pass
|
||||
|
||||
with mock.patch.object(
|
||||
Client, "read_dataset", new=mock_read_dataset
|
||||
), mock.patch.object(Client, "list_examples", new=mock_list_examples), mock.patch(
|
||||
"langchain.smith.evaluation.runner_utils._arun_llm_or_chain",
|
||||
new=mock_arun_chain,
|
||||
), mock.patch.object(
|
||||
Client, "create_project", new=mock_create_project
|
||||
):
|
||||
client = Client(api_url="http://localhost:1984", api_key="123")
|
||||
chain = mock.MagicMock()
|
||||
chain.input_keys = ["foothing"]
|
||||
num_repetitions = 3
|
||||
results = await arun_on_dataset(
|
||||
dataset_name="test",
|
||||
llm_or_chain_factory=lambda: chain,
|
||||
concurrency_level=2,
|
||||
project_name="test_project",
|
||||
num_repetitions=num_repetitions,
|
||||
client=client,
|
||||
)
|
||||
|
||||
expected = {
|
||||
uuid_: [
|
||||
{"result": f"Result for example {uuid.UUID(uuid_)}"}
|
||||
for _ in range(num_repetitions)
|
||||
]
|
||||
for uuid_ in uuids
|
||||
}
|
||||
assert results["results"] == expected
|