Add String Distance and Embedding Evaluators (#7123)

Add string evaluator and pairwise string evaluator implementations for:
- Embedding distance
- String distance

Update docs
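
A minimal usage sketch of the two new evaluator families (editorial example, not part of the diff; `load_evaluator` constructs a default `ChatOpenAI`/`OpenAIEmbeddings`, so an OpenAI API key is assumed, and the string distance chains require `pip install rapidfuzz`):

```python
from langchain.evaluation import load_evaluator

# Reference-based string distance (Levenshtein by default); the scoring
# itself makes no LLM calls.
string_eval = load_evaluator("string_distance")
print(string_eval.evaluate_strings(prediction="Hello", reference="Hi"))
# {'score': 4}

# Reference-based embedding distance (cosine by default).
embedding_eval = load_evaluator("embedding_distance")
print(embedding_eval.evaluate_strings(prediction="Hello", reference="Hi"))
# e.g. {'score': 0.09}
```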
William FH 11 months ago committed by GitHub
parent fb6e63dc36
commit 4789c99bc2

@ -165,28 +165,35 @@ Classes
callbacks.aim_callback.AimCallbackHandler
callbacks.argilla_callback.ArgillaCallbackHandler
callbacks.arize_callback.ArizeCallbackHandler
callbacks.arthur_callback.ArthurCallbackHandler
callbacks.base.AsyncCallbackHandler
callbacks.base.BaseCallbackHandler
callbacks.base.BaseCallbackManager
callbacks.clearml_callback.ClearMLCallbackHandler
callbacks.comet_ml_callback.CometCallbackHandler
callbacks.file.FileCallbackHandler
callbacks.flyte_callback.FlyteCallbackHandler
callbacks.human.HumanApprovalCallbackHandler
callbacks.human.HumanRejectedException
callbacks.infino_callback.InfinoCallbackHandler
callbacks.manager.AsyncCallbackManager
callbacks.manager.AsyncCallbackManagerForChainRun
callbacks.manager.AsyncCallbackManagerForLLMRun
callbacks.manager.AsyncCallbackManagerForRetrieverRun
callbacks.manager.AsyncCallbackManagerForToolRun
callbacks.manager.AsyncParentRunManager
callbacks.manager.AsyncRunManager
callbacks.manager.BaseRunManager
callbacks.manager.CallbackManager
callbacks.manager.CallbackManagerForChainRun
callbacks.manager.CallbackManagerForLLMRun
callbacks.manager.CallbackManagerForRetrieverRun
callbacks.manager.CallbackManagerForToolRun
callbacks.manager.ParentRunManager
callbacks.manager.RunManager
callbacks.mlflow_callback.MlflowCallbackHandler
callbacks.openai_info.OpenAICallbackHandler
callbacks.promptlayer_callback.PromptLayerCallbackHandler
callbacks.stdout.StdOutCallbackHandler
callbacks.streaming_aiter.AsyncIteratorCallbackHandler
callbacks.streaming_aiter_final_only.AsyncFinalIteratorCallbackHandler
@ -229,6 +236,8 @@ Functions
callbacks.aim_callback.import_aim
callbacks.clearml_callback.import_clearml
callbacks.comet_ml_callback.import_comet_ml
callbacks.flyte_callback.analyze_text
callbacks.flyte_callback.import_flytekit
callbacks.infino_callback.import_infino
callbacks.manager.env_var_is_set
callbacks.manager.get_openai_callback
@ -283,9 +292,11 @@ Classes
chains.base.Chain
chains.combine_documents.base.AnalyzeDocumentChain
chains.combine_documents.base.BaseCombineDocumentsChain
chains.combine_documents.map_reduce.CombineDocsProtocol
chains.combine_documents.map_reduce.MapReduceDocumentsChain
chains.combine_documents.map_rerank.MapRerankDocumentsChain
chains.combine_documents.reduce.AsyncCombineDocsProtocol
chains.combine_documents.reduce.CombineDocsProtocol
chains.combine_documents.reduce.ReduceDocumentsChain
chains.combine_documents.refine.RefineDocumentsChain
chains.combine_documents.stuff.StuffDocumentsChain
chains.constitutional_ai.base.ConstitutionalChain
@ -299,8 +310,10 @@ Classes
chains.flare.prompts.FinishedOutputParser
chains.graph_qa.base.GraphQAChain
chains.graph_qa.cypher.GraphCypherQAChain
chains.graph_qa.hugegraph.HugeGraphQAChain
chains.graph_qa.kuzu.KuzuQAChain
chains.graph_qa.nebulagraph.NebulaGraphQAChain
chains.graph_qa.sparql.GraphSparqlQAChain
chains.hyde.base.HypotheticalDocumentEmbedder
chains.llm.LLMChain
chains.llm_bash.base.LLMBashChain
@ -363,7 +376,6 @@ Functions
.. autosummary::
:toctree: chains
chains.combine_documents.base.format_document
chains.graph_qa.cypher.extract_cypher
chains.loading.load_chain
chains.loading.load_chain_from_config
@ -415,6 +427,7 @@ Classes
chat_models.fake.FakeListChatModel
chat_models.google_palm.ChatGooglePalm
chat_models.google_palm.ChatGooglePalmError
chat_models.human.HumanInputChatModel
chat_models.openai.ChatOpenAI
chat_models.promptlayer_openai.PromptLayerChatOpenAI
chat_models.vertexai.ChatVertexAI
@ -513,6 +526,7 @@ Classes
document_loaders.blob_loaders.youtube_audio.YoutubeAudioLoader
document_loaders.blockchain.BlockchainDocumentLoader
document_loaders.blockchain.BlockchainType
document_loaders.brave_search.BraveSearchLoader
document_loaders.chatgpt.ChatGPTLoader
document_loaders.college_confidential.CollegeConfidentialLoader
document_loaders.confluence.ConfluenceLoader
@ -520,6 +534,7 @@ Classes
document_loaders.conllu.CoNLLULoader
document_loaders.csv_loader.CSVLoader
document_loaders.csv_loader.UnstructuredCSVLoader
document_loaders.cube_semantic.CubeSemanticLoader
document_loaders.dataframe.DataFrameLoader
document_loaders.diffbot.DiffbotLoader
document_loaders.directory.DirectoryLoader
@ -736,6 +751,7 @@ Classes
embeddings.self_hosted.SelfHostedEmbeddings
embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceEmbeddings
embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceInstructEmbeddings
embeddings.spacy_embeddings.SpacyEmbeddings
embeddings.tensorflow_hub.TensorflowHubEmbeddings
embeddings.vertexai.VertexAIEmbeddings
@ -790,6 +806,9 @@ Classes
evaluation.comparison.eval_chain.PairwiseStringResultOutputParser
evaluation.criteria.eval_chain.CriteriaEvalChain
evaluation.criteria.eval_chain.CriteriaResultOutputParser
evaluation.embedding_distance.base.EmbeddingDistance
evaluation.embedding_distance.base.EmbeddingDistanceEvalChain
evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain
evaluation.qa.eval_chain.ContextQAEvalChain
evaluation.qa.eval_chain.CotQAEvalChain
evaluation.qa.eval_chain.QAEvalChain
@ -799,10 +818,16 @@ Classes
evaluation.run_evaluators.implementations.ChoicesOutputParser
evaluation.run_evaluators.implementations.CriteriaOutputParser
evaluation.run_evaluators.implementations.StringRunEvaluatorInputMapper
evaluation.run_evaluators.implementations.TrajectoryEvalOutputParser
evaluation.run_evaluators.implementations.TrajectoryInputMapper
evaluation.run_evaluators.implementations.TrajectoryRunEvalOutputParser
evaluation.schema.AgentTrajectoryEvaluator
evaluation.schema.EvaluatorType
evaluation.schema.LLMEvalChain
evaluation.schema.PairwiseStringEvaluator
evaluation.schema.StringEvaluator
evaluation.string_distance.base.PairwiseStringDistanceEvalChain
evaluation.string_distance.base.StringDistance
evaluation.string_distance.base.StringDistanceEvalChain
Functions
--------------
@ -812,6 +837,8 @@ Functions
:toctree: evaluation
evaluation.loading.load_dataset
evaluation.loading.load_evaluator
evaluation.loading.load_evaluators
evaluation.run_evaluators.implementations.get_criteria_evaluator
evaluation.run_evaluators.implementations.get_qa_evaluator
evaluation.run_evaluators.implementations.get_trajectory_evaluator
@ -1057,6 +1084,7 @@ Functions
llms.aviary.get_completions
llms.aviary.get_models
llms.base.create_base_retry_decorator
llms.base.get_prompts
llms.base.update_cache
llms.cohere.completion_with_retry
@ -1069,6 +1097,7 @@ Functions
llms.openai.completion_with_retry
llms.openai.update_token_usage
llms.utils.enforce_stop_tokens
llms.vertexai.completion_with_retry
llms.vertexai.is_codey_model
:mod:`langchain.load`: Load
@ -1241,7 +1270,6 @@ Classes
:toctree: prompts
:template: class.rst
prompts.base.BasePromptTemplate
prompts.base.StringPromptTemplate
prompts.base.StringPromptValue
prompts.chat.AIMessagePromptTemplate
@ -1348,7 +1376,7 @@ Classes
retrievers.multi_query.LineListOutputParser
retrievers.multi_query.MultiQueryRetriever
retrievers.pinecone_hybrid_search.PineconeHybridSearchRetriever
retrievers.pupmed.PubMedRetriever
retrievers.pubmed.PubMedRetriever
retrievers.remote_retriever.RemoteLangChainRetriever
retrievers.self_query.base.SelfQueryRetriever
retrievers.self_query.chroma.ChromaTranslator
@ -1400,28 +1428,29 @@ Classes
:toctree: schema
:template: class.rst
schema.AIMessage
schema.AgentFinish
schema.BaseChatMessageHistory
schema.BaseDocumentTransformer
schema.BaseLLMOutputParser
schema.BaseMemory
schema.BaseMessage
schema.BaseOutputParser
schema.BaseRetriever
schema.ChatGeneration
schema.ChatMessage
schema.ChatResult
schema.Document
schema.FunctionMessage
schema.Generation
schema.HumanMessage
schema.LLMResult
schema.NoOpOutputParser
schema.OutputParserException
schema.PromptValue
schema.RunInfo
schema.SystemMessage
schema.agent.AgentFinish
schema.document.BaseDocumentTransformer
schema.document.Document
schema.memory.BaseChatMessageHistory
schema.memory.BaseMemory
schema.messages.AIMessage
schema.messages.BaseMessage
schema.messages.ChatMessage
schema.messages.FunctionMessage
schema.messages.HumanMessage
schema.messages.SystemMessage
schema.output.ChatGeneration
schema.output.ChatResult
schema.output.Generation
schema.output.LLMResult
schema.output.RunInfo
schema.output_parser.BaseLLMOutputParser
schema.output_parser.BaseOutputParser
schema.output_parser.NoOpOutputParser
schema.output_parser.OutputParserException
schema.prompt.PromptValue
schema.prompt_template.BasePromptTemplate
schema.retriever.BaseRetriever
Functions
--------------
@ -1430,9 +1459,10 @@ Functions
.. autosummary::
:toctree: schema
schema.get_buffer_string
schema.messages_from_dict
schema.messages_to_dict
schema.messages.get_buffer_string
schema.messages.messages_from_dict
schema.messages.messages_to_dict
schema.prompt_template.format_document
:mod:`langchain.server`: Server
================================
@ -1535,6 +1565,8 @@ Classes
tools.bing_search.tool.BingSearchRun
tools.brave_search.tool.BraveSearch
tools.convert_to_openai.FunctionDescription
tools.dataforseo_api_search.tool.DataForSeoAPISearchResults
tools.dataforseo_api_search.tool.DataForSeoAPISearchRun
tools.ddg_search.tool.DuckDuckGoSearchResults
tools.ddg_search.tool.DuckDuckGoSearchRun
tools.file_management.copy.CopyFileTool
@ -1708,6 +1740,7 @@ Classes
utilities.bibtex.BibtexparserWrapper
utilities.bing_search.BingSearchAPIWrapper
utilities.brave_search.BraveSearchWrapper
utilities.dataforseo_api_search.DataForSeoAPIWrapper
utilities.duckduckgo_search.DuckDuckGoSearchAPIWrapper
utilities.google_places_api.GooglePlacesAPIWrapper
utilities.google_search.GoogleSearchAPIWrapper
@ -1805,12 +1838,17 @@ Classes
vectorstores.faiss.FAISS
vectorstores.hologres.Hologres
vectorstores.lancedb.LanceDB
vectorstores.marqo.Marqo
vectorstores.matching_engine.MatchingEngine
vectorstores.milvus.Milvus
vectorstores.mongodb_atlas.MongoDBAtlasVectorSearch
vectorstores.myscale.MyScale
vectorstores.myscale.MyScaleSettings
vectorstores.opensearch_vector_search.OpenSearchVectorSearch
vectorstores.pgembedding.BaseModel
vectorstores.pgembedding.CollectionStore
vectorstores.pgembedding.EmbeddingStore
vectorstores.pgembedding.PGEmbedding
vectorstores.pgvector.BaseModel
vectorstores.pgvector.CollectionStore
vectorstores.pgvector.DistanceStrategy

@ -3,32 +3,63 @@
This module contains off-the-shelf evaluation chains for grading the output of
LangChain primitives such as language models and chains.
To load an evaluator, you can use the :func:`load_evaluators <langchain.evaluation.loading.load_evaluators>` function with the
**Loading an evaluator**
To load an evaluator, you can use the :func:`load_evaluators <langchain.evaluation.loading.load_evaluators>` or
:func:`load_evaluator <langchain.evaluation.loading.load_evaluator>` functions with the
names of the evaluators to load.
.. code-block:: python
from langchain.evaluation import load_evaluator
evaluator = load_evaluator("qa")
evaluator.evaluate_strings(
prediction="We sold more than 40,000 units last week",
input="How many units did we sell last week?",
reference="We sold 32,378 units",
)
The evaluator must be one of :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`.
**Datasets**
To load one of the LangChain HuggingFace datasets, you can use the :func:`load_dataset <langchain.evaluation.loading.load_dataset>` function with the
name of the dataset to load.
Some common use cases for evaluation include:
.. code-block:: python
from langchain.evaluation import load_dataset
ds = load_dataset("llm-math")
**Some common use cases for evaluation include:**
- Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain <langchain.evaluation.qa.eval_chain.QAEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>`
- Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain <langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain>`
- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain <langchain.evaluation.criteria.eval_chain.CriteriaEvalChain>`
- Computing semantic difference between a prediction and reference: :class:`EmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain>` or between two predictions: :class:`PairwiseEmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain>`
- Measuring the string distance between a prediction and reference: :class:`StringDistanceEvalChain <langchain.evaluation.string_distance.base.StringDistanceEvalChain>` or between two predictions: :class:`PairwiseStringDistanceEvalChain <langchain.evaluation.string_distance.base.PairwiseStringDistanceEvalChain>` (sketched below)
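For example, the pairwise variants compare two predictions directly (an editorial sketch; the pairwise string distance chain requires the ``rapidfuzz`` package, and the pairwise embedding chain defaults to OpenAI embeddings, so an API key is assumed):
.. code-block:: python
    from langchain.evaluation import (
        PairwiseEmbeddingDistanceEvalChain,
        PairwiseStringDistanceEvalChain,
    )
    string_chain = PairwiseStringDistanceEvalChain()
    string_chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
    # {'score': 4}
    embedding_chain = PairwiseEmbeddingDistanceEvalChain()
    embedding_chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
    # e.g. {'score': 0.1}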
**Low-level API**
This module also contains low-level APIs for creating custom evaluators for
specific evaluation tasks. These include:
These evaluators implement one of the following interfaces:
- :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`: Evaluate a prediction string against a reference label and/or input context.
- :class:`PairwiseStringEvaluator <langchain.evaluation.schema.PairwiseStringEvaluator>`: Evaluate two prediction strings against each other.
Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs.
- :class:`AgentTrajectoryEvaluator <langchain.evaluation.schema.AgentTrajectoryEvaluator>`: Evaluate the full sequence of actions
taken by an agent.
- :class:`PairwiseStringEvaluator <langchain.evaluation.schema.PairwiseStringEvaluator>`: Evaluate two prediction strings against each other. Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs.
- :class:`AgentTrajectoryEvaluator <langchain.evaluation.schema.AgentTrajectoryEvaluator>`: Evaluate the full sequence of actions taken by an agent.
These interfaces enable easier composability and usage within a higher level evaluation framework.
""" # noqa: E501
from langchain.evaluation.agents import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria import CriteriaEvalChain
from langchain.evaluation.embedding_distance import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import (
@ -37,6 +68,11 @@ from langchain.evaluation.schema import (
PairwiseStringEvaluator,
StringEvaluator,
)
from langchain.evaluation.string_distance import (
PairwiseStringDistanceEvalChain,
StringDistance,
StringDistanceEvalChain,
)
__all__ = [
"EvaluatorType",
@ -48,6 +84,12 @@ __all__ = [
"PairwiseStringEvaluator",
"TrajectoryEvalChain",
"CriteriaEvalChain",
"EmbeddingDistance",
"EmbeddingDistanceEvalChain",
"PairwiseEmbeddingDistanceEvalChain",
"StringDistance",
"StringDistanceEvalChain",
"PairwiseStringDistanceEvalChain",
"load_evaluators",
"load_evaluator",
"load_dataset",

@ -77,40 +77,42 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
the sequence of actions taken and their outcomes.
Example:
.. code-block:: python
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain
from langchain.tools import tool
@tool
def geography_answers(country: str, question: str) -> str:
\"\"\"Very helpful answers to geography questions.\"\"\"
return f"{country}? IDK - We may never know {question}."
llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
agent = initialize_agent(
tools=[geography_answers],
llm=llm,
agent=AgentType.OPENAI_FUNCTIONS,
return_intermediate_steps=True,
)
question = "How many dwell in the largest minor region in Argentina?"
response = agent(question)
.. code-block:: python
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=[geography_answers], return_reasoning=True
)
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain
from langchain.tools import tool
result = eval_chain.evaluate_agent_trajectory(
input=question,
agent_trajectory=response["intermediate_steps"],
prediction=response["output"],
reference="Paris",
)
print(result["score"])
# 0
@tool
def geography_answers(country: str, question: str) -> str:
\"\"\"Very helpful answers to geography questions.\"\"\"
return f"{country}? IDK - We may never know {question}."
llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
agent = initialize_agent(
tools=[geography_answers],
llm=llm,
agent=AgentType.OPENAI_FUNCTIONS,
return_intermediate_steps=True,
)
question = "How many dwell in the largest minor region in Argentina?"
response = agent(question)
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=[geography_answers], return_reasoning=True
)
result = eval_chain.evaluate_agent_trajectory(
input=question,
agent_trajectory=response["intermediate_steps"],
prediction=response["output"],
reference="Paris",
)
print(result["score"])
# 0
""" # noqa: E501
agent_tools: Optional[List[BaseTool]] = None
@ -336,7 +338,8 @@ The following is the expected answer. Use this to measure correctness:
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
dict: The evaluation result.
dict: The evaluation result, which includes the score and optionally
the reasoning for reaching that score.
"""
inputs = {
"question": input,
@ -367,7 +370,8 @@ The following is the expected answer. Use this to measure correctness:
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
dict: The evaluation result.
dict: The evaluation result, which includes the score and optionally
the reasoning for reaching that score.
"""
inputs = {
"question": input,

@ -52,7 +52,8 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
"""A chain for comparing the output of two models.
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs.
Example:
>>> from langchain.chat_models import ChatOpenAI

@ -92,10 +92,37 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
--------
>>> from langchain.chat_models import ChatAnthropic
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = ChatAnthropic()
>>> llm = ChatAnthropic(temperature=0)
>>> criteria = {"my-custom-criterion": "Is the submission the most amazing ever?"}
>>> chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
"""
>>> evaluator = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
>>> evaluator.evaluate_strings(prediction="Imagine an ice cream flavor for the color aquamarine", input="Tell me an idea")
{
'reasoning': 'Here is my step-by-step reasoning for the given criteria:\\n\\nThe criterion is: "Is the submission the most amazing ever?" This is a subjective criterion and open to interpretation. The submission suggests an aquamarine-colored ice cream flavor which is creative but may or may not be considered the most amazing idea ever conceived. There are many possible amazing ideas and this one ice cream flavor suggestion may or may not rise to that level for every person. \\n\\nN',
'value': 'N',
'score': 0,
}
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> criteria = "correctness"
>>> evaluator = CriteriaEvalChain.from_llm(
... llm=llm,
... criteria=criteria,
... requires_reference=True,
... )
>>> evaluator.evaluate_strings(
... prediction="The answer is 4",
... input="How many apples are there?",
... reference="There are 3 apples",
... )
{
'score': 0,
'reasoning': 'The criterion for this task is the correctness of the submission. The submission states that there are 4 apples, but the reference indicates that there are actually 3 apples. Therefore, the submission is not correct, accurate, or factual according to the given criterion.\\n\\nN',
'value': 'N',
}
""" # noqa: E501
output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
"""The parser to use to map the output to a structured result."""

@ -0,0 +1,12 @@
"""Evaluators that measure embedding distances."""
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
__all__ = [
"EmbeddingDistance",
"EmbeddingDistanceEvalChain",
"PairwiseEmbeddingDistanceEvalChain",
]

@ -0,0 +1,438 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.math_utils import cosine_similarity
class EmbeddingDistance(str, Enum):
"""Embedding Distance Metric.
Attributes:
COSINE: Cosine distance metric.
EUCLIDEAN: Euclidean distance metric.
MANHATTAN: Manhattan distance metric.
CHEBYSHEV: Chebyshev distance metric.
HAMMING: Hamming distance metric.
"""
COSINE = "cosine"
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"
CHEBYSHEV = "chebyshev"
HAMMING = "hamming"
class _EmbeddingDistanceChainMixin(Chain):
"""Shared functionality for embedding distance evaluators.
Attributes:
embeddings (Embeddings): The embedding objects to vectorize the outputs.
distance_metric (EmbeddingDistance): The distance metric to use
for comparing the embeddings.
"""
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
class Config:
"""Permit embeddings to go unvalidated."""
arbitrary_types_allowed: bool = True
@property
def output_keys(self) -> List[str]:
"""Return the output keys of the chain.
Returns:
List[str]: The output keys.
"""
return ["score"]
@root_validator
def _validate_distance_metric(cls, values: dict) -> dict:
"""Validate the distance metric.
Args:
values (dict): The values to validate.
Returns:
dict: The validated values.
"""
values["distance_metric"] = values["distance_metric"].lower()
return values
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
Args:
metric (EmbeddingDistance): The metric name.
Returns:
Any: The metric function.
"""
metrics = {
EmbeddingDistance.COSINE: self._cosine_distance,
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
EmbeddingDistance.HAMMING: self._hamming_distance,
}
if metric in metrics:
return metrics[metric]
else:
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""Compute the cosine distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.ndarray: The cosine distance.
"""
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Euclidean distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Euclidean distance.
"""
return np.linalg.norm(a - b)
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Manhattan distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Manhattan distance.
"""
return np.sum(np.abs(a - b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Chebyshev distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Chebyshev distance.
"""
return np.max(np.abs(a - b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Hamming distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Hamming distance.
"""
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
"""Compute the score based on the distance metric.
Args:
vectors (np.ndarray): The input vectors.
Returns:
float: The computed score.
"""
metric = self._get_metric(self.distance_metric)
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
return score
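# Editorial note (not part of the diff): with the default cosine metric the
# score is 1 - cos_sim(vectors[0], vectors[1]); e.g. orthogonal unit vectors
# [1, 0] and [0, 1] score 1.0, while identical vectors score 0.0.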
class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
"""Use embedding distances to score semantic difference between
a prediction and reference.
Examples:
>>> chain = EmbeddingDistanceEvalChain()
>>> result = chain.evaluate_strings(prediction="Hello", reference="Hi")
>>> print(result)
{'score': 0.5}
"""
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if a reference is required, False otherwise.
"""
return True
@property
def input_keys(self) -> List[str]:
"""Return the input keys of the chain.
Returns:
List[str]: The input keys.
"""
return ["prediction", "reference"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Compute the score for a prediction and reference.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (Optional[CallbackManagerForChainRun], optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Asynchronously compute the score for a prediction and reference.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (AsyncCallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between a prediction and
reference.
Args:
prediction (str): The prediction string.
reference (str): The reference string (required).
callbacks (Callbacks, optional): The callbacks to use.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the prediction and reference.
"""
return self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
async def _aevaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance between
a prediction and reference.
Args:
prediction (str): The prediction string.
reference (str): The reference string (required).
callbacks (Callbacks, optional): The callbacks to use.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the prediction and reference.
"""
return await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
class PairwiseEmbeddingDistanceEvalChain(
_EmbeddingDistanceChainMixin, PairwiseStringEvaluator
):
"""Use embedding distances to score semantic difference between two predictions.
Examples:
>>> chain = PairwiseEmbeddingDistanceEvalChain()
>>> result = chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
>>> print(result)
{'score': 0.5}
"""
@property
def input_keys(self) -> List[str]:
"""Return the input keys of the chain.
Returns:
List[str]: The input keys.
"""
return ["prediction", "prediction_b"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Compute the score for two predictions.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (CallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Asynchronously compute the score for two predictions.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (AsyncCallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def _evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
result = self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}
async def _aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
result = await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}

@ -1,25 +1,46 @@
"""Loading datasets and evaluators."""
from typing import Any, Dict, List, Optional, Sequence, Type
from typing import Any, Dict, List, Optional, Sequence, Type, Union
from langchain.chains.base import Chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel
def load_dataset(uri: str) -> List[Dict]:
"""Load a dataset from the LangChainDatasets HuggingFace org.
"""Load a dataset from the `LangChainDatasets HuggingFace org <https://huggingface.co/LangChainDatasets>`_.
Args:
uri: The uri of the dataset to load.
Returns:
A list of dictionaries, each representing a row in the dataset.
"""
**Prerequisites**
.. code-block:: shell
pip install datasets
Examples
--------
.. code-block:: python
from langchain.evaluation import load_dataset
ds = load_dataset("llm-math")
""" # noqa: E501
try:
from datasets import load_dataset
except ImportError:
@ -32,13 +53,17 @@ def load_dataset(uri: str) -> List[Dict]:
return [d for d in dataset["train"]]
_EVALUATOR_MAP: Dict[EvaluatorType, Type[LLMEvalChain]] = {
_EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
EvaluatorType.QA: QAEvalChain,
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.STRING_DISTANCE: StringDistanceEvalChain,
EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,
EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
}
@ -66,8 +91,8 @@ def load_evaluator(
Examples
--------
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> evaluator = _load_evaluator("qa", llm=llm)
>>> from langchain.evaluation import load_evaluator, EvaluatorType
>>> evaluator = load_evaluator(EvaluatorType.QA)
"""
llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
if evaluator not in _EVALUATOR_MAP:
@ -75,7 +100,11 @@ def load_evaluator(
f"Unknown evaluator type: {evaluator}"
f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
)
return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs)
evaluator_cls = _EVALUATOR_MAP[evaluator]
if issubclass(evaluator_cls, LLMEvalChain):
return evaluator_cls.from_llm(llm=llm, **kwargs)
else:
return evaluator_cls(**kwargs)
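# Editorial note (not part of the diff): for evaluators that are not LLM
# chains, kwargs are forwarded straight to the chain constructor, e.g.
# (assuming rapidfuzz is installed):
#
#     from langchain.evaluation import StringDistance
#     load_evaluator("string_distance", distance=StringDistance.JARO)
#
# Note that the default ChatOpenAI above is constructed regardless, so an
# OpenAI API key is still assumed.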
def load_evaluators(
@ -107,10 +136,9 @@ def load_evaluators(
Examples
--------
.. code-block:: python
from langchain.evaluation import load_evaluators, EvaluatorType
evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
>>> from langchain.evaluation import load_evaluators, EvaluatorType
>>> evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
>>> loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
"""
llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
loaded = []

@ -167,6 +167,11 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
"""Whether the chain requires an input string."""
return True
class Config:
"""Configuration for the QAEvalChain."""
extra = Extra.ignore
@classmethod
def _validate_input_vars(cls, prompt: PromptTemplate) -> None:
expected_input_vars = {"query", "context", "result"}

@ -77,7 +77,7 @@ class RunEvaluatorChain(Chain, RunEvaluator):
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: AsyncCallbackManagerForChainRun | None = None,
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")

@ -33,6 +33,14 @@ class EvaluatorType(str, Enum):
CRITERIA = "criteria"
"""The criteria evaluator, which evaluates a model based on a
custom set of criteria."""
STRING_DISTANCE = "string_distance"
"""Compare predictions to a reference answer using string edit distances."""
PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
"""Compare predictions based on string edit distances."""
EMBEDDING_DISTANCE = "embedding_distance"
"""Compare a prediction to a reference label using embedding distance."""
PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
"""Compare two predictions using embedding distance."""
class LLMEvalChain(Chain):
@ -89,7 +97,8 @@ class _EvalArgsMixin:
class StringEvaluator(_EvalArgsMixin, ABC):
"""Protocol for evaluating strings."""
"""Grade, tag, or otherwise evaluate predictions relative to their inputs
and/or reference labels."""
@property
def evaluation_name(self) -> str:
@ -204,7 +213,7 @@ class StringEvaluator(_EvalArgsMixin, ABC):
class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
"""A protocol for comparing the output of two models."""
"""Compare the output of two models (or two outputs of the same model)."""
@abstractmethod
def _evaluate_string_pairs(

@ -0,0 +1,12 @@
"""String distance evaluators."""
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistance,
StringDistanceEvalChain,
)
__all__ = [
"PairwiseStringDistanceEvalChain",
"StringDistance",
"StringDistanceEvalChain",
]

@ -0,0 +1,376 @@
"""String distance evaluators based on the RapidFuzz library."""
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
def _load_rapidfuzz() -> Any:
"""
Load the RapidFuzz library.
Raises:
ImportError: If the rapidfuzz library is not installed.
Returns:
Any: The rapidfuzz.distance module.
"""
try:
import rapidfuzz
except ImportError:
raise ImportError(
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
)
return rapidfuzz.distance
class StringDistance(str, Enum):
"""Distance metric to use."""
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
JARO = "jaro"
JARO_WINKLER = "jaro_winkler"
class _RapidFuzzChainMixin(Chain):
"""Shared methods for the rapidfuzz string distance evaluators."""
distance: StringDistance = Field(default=StringDistance.LEVENSHTEIN)
@root_validator
def validate_dependencies(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate that the rapidfuzz library is installed.
Args:
values (Dict[str, Any]): The input values.
Returns:
Dict[str, Any]: The validated values.
"""
_load_rapidfuzz()
return values
@property
def output_keys(self) -> List[str]:
"""
Get the output keys.
Returns:
List[str]: The output keys.
"""
return ["score"]
@staticmethod
def _get_metric(distance: str) -> Callable:
"""
Get the distance metric function based on the distance type.
Args:
distance (str): The distance type.
Returns:
Callable: The distance metric function.
Raises:
ValueError: If the distance metric is invalid.
"""
rf_distance = _load_rapidfuzz()
if distance == StringDistance.DAMERAU_LEVENSHTEIN:
return rf_distance.DamerauLevenshtein.distance
elif distance == StringDistance.LEVENSHTEIN:
return rf_distance.Levenshtein.distance
elif distance == StringDistance.JARO:
return rf_distance.Jaro.distance
elif distance == StringDistance.JARO_WINKLER:
return rf_distance.JaroWinkler.distance
else:
raise ValueError(f"Invalid distance metric: {distance}")
@property
def metric(self) -> Callable:
"""
Get the distance metric function.
Returns:
Callable: The distance metric function.
"""
return _RapidFuzzChainMixin._get_metric(self.distance)
class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
"""Compute string distances between the prediction and the reference."""
@property
def requires_input(self) -> bool:
"""
Check if input is required.
Returns:
bool: True if input is required, False otherwise.
"""
return False
@property
def requires_reference(self) -> bool:
"""
Check if reference is required.
Returns:
bool: True if reference is required, False otherwise.
"""
return True
@property
def input_keys(self) -> List[str]:
"""
Get the input keys.
Returns:
List[str]: The input keys.
"""
return ["reference", "prediction"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Compute the string distance between the prediction and the reference.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (Optional[CallbackManagerForChainRun]):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["reference"], inputs["prediction"])}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Asynchronously compute the string distance between the prediction
and the reference.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (Optional[AsyncCallbackManagerForChainRun], optional):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["reference"], inputs["prediction"])}
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""
Evaluate the string distance between the prediction and the reference.
Args:
prediction (str): The prediction string.
reference (Optional[str], optional): The reference string.
input (Optional[str], optional): The input string.
callbacks (Callbacks, optional): The callbacks to use.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
return {"score": result["score"]}
async def _aevaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""
Asynchronously evaluate the string distance between the
prediction and the reference.
Args:
prediction (str): The prediction string.
reference (Optional[str], optional): The reference string.
input (Optional[str], optional): The input string.
callbacks (Callbacks, optional): The callbacks to use.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
return {"score": result["score"]}
class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvaluator):
"""Compute string edit distances between two predictions."""
@property
def input_keys(self) -> List[str]:
"""
Get the input keys.
Returns:
List[str]: The input keys.
"""
return ["prediction", "prediction_b"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Compute the string distance between two predictions.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (CallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Asynchronously compute the string distance between two predictions.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (AsyncCallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
def _evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""
Evaluate the string distance between two predictions.
Args:
prediction (str): The first prediction string.
prediction_b (str): The second prediction string.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}
async def _aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""
Asynchronously evaluate the string distance between two predictions.
Args:
prediction (str): The first prediction string.
prediction_b (str): The second prediction string.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}

poetry.lock

@ -8920,6 +8920,111 @@ packaging = "*"
[package.extras]
test = ["pytest (>=6,!=7.0.0,!=7.0.1)", "pytest-cov (>=3.0.0)", "pytest-qt"]
[[package]]
name = "rapidfuzz"
version = "3.1.1"
description = "rapid fuzzy string matching"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:17e4cbe6632aae7c35101c4b7c498e83f6eacf61be0def4ff98167df30dc69ca"},
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:167dbce2da6bb5b73d43e53434c5a9d7d1214b658b315420e44044782f4c482b"},
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdee4f4d04761ce167538adbefa01a64e7cab949d89aa09df39ef0d5e859fb2a"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e77ed7d0bd8d9be530c462c921904ada8d3417671eed749784c5a315af334d"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fdd2ab5ab56fcaf839a9f58caa8756dbfeba0b3dc187850b763d0a1e6ee9c97a"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0843c53d54d5b7d6122d8f1d7574d8c91a7aacc5c316f74d6e33d98aec82949d"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b3e953dcef0302eeb4fe8c7c4907e50d175199fc07da05ad6bd1d8d141ff138"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec5523d5c08c639cd4e301d42f3ad7c6fb061a1f1cd6b5b627e59af345edfed7"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b4995792e106c3f1ab6f56dd6089918b065888e2e55a71e3fea8d0f66bf30989"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cdbf9a76ea47f14026daaed43a2c2150ab0e9a4d5396909f028380f33e61c522"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f25d1975e846d07990cf946a5927a932aa7cccd308ae9979b03a58ff1cd80087"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e0755f5ac6c3d1dc2505eb2e6eaf5508ff17b42c084406714fbabf2d50d098b6"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:de784bbe06d32e66617cd20766c37aae2438902d54b3fa608d2e0a929ca705f4"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win32.whl", hash = "sha256:ef6c38040d868dcc0132fad377aafeb5b2da71354759e77f41ae599316df2dee"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c74fde444bcd13ef3a803c578b28f33b4f9edf368f46ca3de57fda456065967"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:e549da8d68ad4ee385c918ea8b9efeda875df9edf6c6b48df927bd061c00bfef"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:58ca539cc6ce385d650138a9b1908b05622c2dd08a23d5aea4890523ef3774d5"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91946c496e6f380939dbea14ff6ce6de87480445c09d03964f5374101462594b"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f2024f83a9300440e845b441e71726471f7567021c1d80796ca02e71c5f0dc2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17b017f9e1b88dfd6d9b03170ef8e86477de0d9d37fbfcbe72ca070cacbe1b65"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6772eb7cc4429f1eae5a9b41e5b0b1af8f0d50727c6e338d9ad5bceee01da5a"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c089ce856919e03f4dd8f9168d60ac580d30cd0451fd60dcdef73010eca68973"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f2cd9a3760080876fc59edb26926e51d6db44dea65e85f1eb04aa5f58c3bc41"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f32791ee045a7b3d6a56208a55d996d5f7a32fdb688f5c5ee899cb7589539eb"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:68d910048b36613701ea671de68f701e2c1ba2839295238def840ff1fc1b15f4"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6f767d4823002e65c06ea273f952fda2b88775e1c2d508564f04d32cdd7f65b2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:10313075642a9f1f948d356f4f0803ae28a496d7967b466b9cae1a4be8aa4df3"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1465ea085154378e69bf4bc5e27bdac5c94684416882ace31865232adc9239a2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:53e3c588e7ea158fa80095dd0ff53f49e2ede9a8d71a3a5b964ca045d845a9b9"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win32.whl", hash = "sha256:cb08db5c122fea4196483b82f7596e50ef9cab1770f7696c197bf0815ac4dd17"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b7c65112c87568274d399ad7a62902cef17801c2bd047b162e79e43758b3ce27"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:ea3e46a534de97a6cad2018cb950492a0fcacad380e35440ce3c1c8fef96a261"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a8bb256b34fcad4f3fa00be6b57fe35bcb54f031911195929145c67d9738ffec"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51f21f37aec6bc117e9083181ddc3cbbcbf56b6506492b128d8e836d3545ca80"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a371846f45ed9d24927a8d5222884536c1e171543396b36250fafb2e848bc92"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25eea5c8006b6c8747ca204675c9e939f3c4d27167fb43b2aa211443d34f9abd"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:db5e71e5a810d2f1163c914e01b3ba241409a98286ac4850ff26076115ae401b"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c07e16ab38e717931319cff1340debbf2ef940a1cda4eb70e323079b62df306"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aadc5a8b9859737a8f87831215b7fab0c04afeb960bb987c528421a4e6dfb8b6"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:0de229cb613be060580c71c1674acbde57921c7ed33d7a726e071a2562924113"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:b1bf8aba99b267aad0a01dfb44ee39803676007724abcfb72129c350476b2341"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d3264e4a02e4148e30078104fb0c1b6c8eb166ddc5ebe843a22433f58f87dc47"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:712331c1c70c79a219c2ac233b4e25e75ffad51042840d147d5e94519c7d8a1a"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:6ede2d42ad55bd4e7a3394e98c5f58ddace78775493391732d32be61268a4116"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:32a5c47b5153f25eb512dbb91f9850225d2dcfb3404a1c48406726c7732b0726"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:51bb8f7aa4fe45618e75cdccf08491c752a7f137ffbf7d3afd1809791ac8c326"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:788fb03c5acb5b48f5f918f4cbb5dc072498becf018c64e7e27d6b76e63e68b8"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dc7f25e20781c8d42e813516ee4ff9043ecce4a8e25fc94ee6732a83d81c1c99"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4a751f216fd1222a4a8c7ceff5180872a156202c3bdca1b337e5a5b09298dfd"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83b48b789f2da1688882cba595c40179194ab15ec17ea1d4c9de9ee239649904"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a6f5cd9f1282da49b8d0747c40f3fea2d64ab5e4c2cc2295baf87ff7a0d062"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5fe8054c244bf63be2380efc275edd86da3a706460d42911dc3ff914f3260a5"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d4d509e9aa011e1be5e4da7c5062dc4fc3688714687110536925980b3d03ac6"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ccc1b5b467766110085c80bb9311d233fccc8ed1ce965aebba3125e1bab04cba"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7e181411958d04d5b437a0981e87815e8f1b1909f5ae0e339246d3bc464f53e7"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:c53cf36cdb10819b7154fefdbffbef442ba567d9c1ca74a7e76fd759ace45e6c"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:851b44130393139cb336aa54c681d595d75a3160b7be330f3acc0c3b9dabce70"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49d900da023eeb3bfbe9feee126312eb9fd0458129aa5a581e4d8d8bf4483d14"},
{file = "rapidfuzz-3.1.1-cp38-cp38-win32.whl", hash = "sha256:6c0e96821029c46847df4ff266ea283a2b6163a4f76a4567f9986934e9c4410c"},
{file = "rapidfuzz-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7af18372f576e36e93f4662bdf64043ac23dfa02d7f768d7e7e1d0211bb9cb35"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8b966344ed4122a71ab8ccdca2954db1ce0d8049cb9bcac58db07558f9d9ec32"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a293370448f2e46fdc6e086ac99923015bdc53973a65d3df35aefc685e1a5809"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:351d253fdee62d6d0e80c75f0505accc1ce8cc73a50779c60986ef21c92f20f9"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e951c874a0e5b375b2af9b5f264eefc679c0685c166ee0641e703ef0795509b"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4019def8a18bc867ac61f08a542bf474a7a9b3f662f5d5cd169c9135866562f5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:086a2d84c2e497e3ab160ccf164e319bca874d9383d008fcadf91ede8ac7997f"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6d4da453fbd8793ebb11bed396f8a4b9041d6227bf055903447305dd7942312f"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10f56af1d46fbeaaa0dc50901c2dc439c7a455cfdac2f1acf6cffeb65ae82c48"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7726f67e4a0b2b4392f03aa62e16b12a697156c6735df27b21bd3ab561b01659"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d72916d27fb88741bfb576b0b0639354ca00f5e91046171c985262c68a86bbb5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c85bb6946fb02231d1e60ab45c36ecee04ecf7f725e094f5beee798b6b7d36d"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:fb7049dff52cded65184a3d2ff45cfd226bff7314f49a8f4b83f943eea9181a7"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:408007b4bc5a0a0cb9bfcdcc8cffa9b71fec6ee53ccdf9c26b57539f7e264ab5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win32.whl", hash = "sha256:9dc7154889937ca5a004d17f62b4798e0af52f69c38eb3112dbdb52b006d4419"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:16c506bac2e0a6f6581b334a7802c2f0d8343ec1d77e5cf9452c33d6219abef8"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:5e11e11880951e767342b56627ab2dc9d3ef90e2605b656e9b5e6e0beadaaf0f"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a8b8f32463781e4703965c9cf7a609a19a74478f332e0d62cd9d0e7a9db91321"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b408ac3c7f8c3414bfd5c6044ca4bb385b390bcf5eae3ad884cef48628c131ae"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ff1a517de2b1e80ddf1a3037a6ebca9925154c1af70751518d50d5c332e1ec8"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1e23665be5918f979180130babedab9317fbb34cdae237c7defad7e86bc684e"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:15260263a0c7bffac934a53b6622d77e06e10929ee4d2e62ac6f70c13988f351"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f7acc5c9c7cf567372de5b6c817f93db508e7b9bd7f29bd6187df8d2cc60ced5"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79f5a3ab7ff6c46336f38690f0564bc7689cefa180257ed9078c42f75b10c9d2"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:362e366e79fcc9a8866b41f20ef4d2987a06f8b134096e659594c059aa8a6d88"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:819d9317c3d86b508d87ab1bca5867f3abc18b902c822bc57366ccc6330a030b"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4a64ddfb7084b678da7778c1263aee2baae5a2ca55ec5589a022defc38103eb1"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8243bb4bb4db7c3501932ced6a978b284e19c3619b6802455e47bfd0905adb81"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39c7d0dbd77a7f28ff85a1dff2afb2ed73e5cd81cca3f654450ed339a271c0ab"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4afab735bb0ac3ec9bafcc35376ed336d26af6140c4d81e4c869e77df77ecd5"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69d503a7641b5a63aa53c7aca0b857d38f48cd7bae39f8563679b324e3d2d47a"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ef3ad80458e47723812976a2ea1282ff207ad20e6cb19da1917f76699bd5aaa5"},
{file = "rapidfuzz-3.1.1.tar.gz", hash = "sha256:a06a08be3cb7d7df7993dd16e84aaf59bd5a7ff98a9f1b3e893d18b273a71c64"},
]

[package.extras]
full = ["numpy"]

[[package]]
name = "ratelimiter"
version = "1.2.0.post0"
@@ -12410,7 +12515,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
@@ -12420,4 +12525,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "cc95f4e0d4bee4ba19cf539be5ffd81f1ddb33229ace936ef3b6cbd4122493ca"
content-hash = "6e2acbd4f760e92454f9f9e29840679fbd59b8662a99bcb89e2251a5b8736e6d"

@@ -116,6 +116,7 @@ streamlit = {version = "^1.18.0", optional = true, python = ">=3.8.1,<3.9.7 || >
psychicapi = {version = "^0.8.0", optional = true}
cassio = {version = "^0.0.7", optional = true}
rdflib = {version = "^6.3.2", optional = true}
rapidfuzz = {version = "^3.1.1", optional = true}

[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
@@ -346,7 +347,8 @@ extended_testing = [
    "scikit-learn",
    "streamlit",
    "pyspark",
    "openai"
    "openai",
    "rapidfuzz"
]

[[tool.poetry.source]]

@@ -0,0 +1,123 @@
from typing import Tuple

import numpy as np
import pytest

from langchain.evaluation.embedding_distance import (
    EmbeddingDistance,
    PairwiseEmbeddingDistanceEvalChain,
)


@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
    """Create two random vectors."""
    vector_a = np.array(
        [
            0.5488135,
            0.71518937,
            0.60276338,
            0.54488318,
            0.4236548,
            0.64589411,
            0.43758721,
            0.891773,
            0.96366276,
            0.38344152,
        ]
    )
    vector_b = np.array(
        [
            0.79172504,
            0.52889492,
            0.56804456,
            0.92559664,
            0.07103606,
            0.0871293,
            0.0202184,
            0.83261985,
            0.77815675,
            0.87001215,
        ]
    )
    return vector_a, vector_b


@pytest.fixture
def chain() -> PairwiseEmbeddingDistanceEvalChain:
    """Create a PairwiseEmbeddingDistanceEvalChain."""
    return PairwiseEmbeddingDistanceEvalChain()


@pytest.mark.requires("scipy")
def test_cosine_similarity(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the cosine similarity."""
    chain.distance_metric = EmbeddingDistance.COSINE
    result = chain._compute_score(np.array(vectors))
    expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
        np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
    )
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_euclidean_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the euclidean distance."""
    from scipy.spatial.distance import euclidean

    chain.distance_metric = EmbeddingDistance.EUCLIDEAN
    result = chain._compute_score(np.array(vectors))
    expected = euclidean(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_manhattan_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the manhattan distance."""
    from scipy.spatial.distance import cityblock

    chain.distance_metric = EmbeddingDistance.MANHATTAN
    result = chain._compute_score(np.array(vectors))
    expected = cityblock(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_chebyshev_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the chebyshev distance."""
    from scipy.spatial.distance import chebyshev

    chain.distance_metric = EmbeddingDistance.CHEBYSHEV
    result = chain._compute_score(np.array(vectors))
    expected = chebyshev(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_hamming_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the hamming distance."""
    from scipy.spatial.distance import hamming

    chain.distance_metric = EmbeddingDistance.HAMMING
    result = chain._compute_score(np.array(vectors))
    expected = hamming(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingDistanceEvalChain) -> None:
    """Test the embedding distance."""
    result = chain.evaluate_string_pairs(
        prediction="A single cat", prediction_b="A single cat"
    )
    assert np.isclose(result["score"], 0.0)

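For orientation, here is a minimal usage sketch of the new pairwise embedding evaluator (not part of this diff). It uses only names exercised by the tests above; the assumption that an OpenAI API key is configured follows from the openai/tiktoken marker on the last test, and the example strings are arbitrary.

# Usage sketch (not part of this diff). Assumes OPENAI_API_KEY is set, since
# the chain appears to default to OpenAI embeddings per the test marker above.
from langchain.evaluation.embedding_distance import (
    EmbeddingDistance,
    PairwiseEmbeddingDistanceEvalChain,
)

chain = PairwiseEmbeddingDistanceEvalChain()
chain.distance_metric = EmbeddingDistance.COSINE  # same attribute the tests set
result = chain.evaluate_string_pairs(
    prediction="A single cat", prediction_b="A pair of dogs"
)
print(result["score"])  # ~0.0 for near-identical inputs; larger means farther apart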
@@ -0,0 +1,51 @@
import pytest

from langchain.evaluation.string_distance import (
    PairwiseStringDistanceEvalChain,
    StringDistance,
    StringDistanceEvalChain,
)


@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
def test_zero_distance(distance: StringDistance) -> None:
    eval_chain = StringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = eval_chain.evaluate_strings(prediction=string, reference=string)
    assert "score" in result
    assert result["score"] == 0


@pytest.mark.asyncio
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
async def test_zero_distance_async(distance: StringDistance) -> None:
    eval_chain = StringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = await eval_chain.aevaluate_strings(prediction=string, reference=string)
    assert "score" in result
    assert result["score"] == 0


@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
def test_zero_distance_pairwise(distance: StringDistance) -> None:
    eval_chain = PairwiseStringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = eval_chain.evaluate_string_pairs(prediction=string, prediction_b=string)
    assert "score" in result
    assert result["score"] == 0


@pytest.mark.asyncio
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
async def test_zero_distance_pairwise_async(distance: StringDistance) -> None:
    eval_chain = PairwiseStringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = await eval_chain.aevaluate_string_pairs(
        prediction=string, prediction_b=string
    )
    assert "score" in result
    assert result["score"] == 0

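Similarly, a sketch of the reference-based string-distance evaluator (not part of this diff). It requires the optional rapidfuzz dependency added in pyproject.toml above; iterating over StringDistance avoids hard-coding member names, which this diff does not show.

# Usage sketch (not part of this diff). Requires the optional rapidfuzz
# dependency added above; metric names are enumerated rather than assumed.
from langchain.evaluation.string_distance import (
    StringDistance,
    StringDistanceEvalChain,
)

for distance in StringDistance:
    eval_chain = StringDistanceEvalChain(distance=distance)
    result = eval_chain.evaluate_strings(
        prediction="The cat sat on the mat", reference="The cat sat on a mat"
    )
    # Per the tests above, identical strings score 0; the scale of nonzero
    # scores depends on the chosen metric.
    print(distance, result["score"])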
@@ -2,20 +2,27 @@
import pytest

from langchain.embeddings.fake import FakeEmbeddings
from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("evaluator_type", EvaluatorType)
def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
    """Test loading evaluators."""
    fake_llm = FakeChatModel()
    load_evaluators([evaluator_type], llm=fake_llm)
    embeddings = FakeEmbeddings(size=32)
    load_evaluators([evaluator_type], llm=fake_llm, embeddings=embeddings)
    # Test as string
    load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore
    load_evaluators(
        [evaluator_type.value],  # type: ignore
        llm=fake_llm,
        embeddings=embeddings,
    )


def test_criteria_eval_chain_requires_reference() -> None:

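Finally, since load_evaluators also accepts plain strings (the evaluator_type.value branch in the test above), a quick way to discover the accepted aliases is to enumerate the registry. This sketch uses only names visible in this diff.

# Sketch (not part of this diff): list the string aliases accepted by
# load_evaluators, which should now include the new distance evaluators.
from langchain.evaluation.loading import EvaluatorType

for evaluator_type in EvaluatorType:
    print(evaluator_type.value)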