Add String Distance and Embedding Evaluators (#7123)

Add string evaluator and pairwise string evaluator implementations for:
- Embedding distance
- String distance

Update docs
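
A minimal usage sketch of the two new evaluator families (editorial example, not part of the diff; `load_evaluator` constructs a default `ChatOpenAI`/`OpenAIEmbeddings`, so an OpenAI API key is assumed, and the string distance chains require `pip install rapidfuzz`):

```python
from langchain.evaluation import load_evaluator

# Reference-based string distance (Levenshtein by default); the scoring
# itself makes no LLM calls.
string_eval = load_evaluator("string_distance")
print(string_eval.evaluate_strings(prediction="Hello", reference="Hi"))
# {'score': 4}

# Reference-based embedding distance (cosine by default).
embedding_eval = load_evaluator("embedding_distance")
print(embedding_eval.evaluate_strings(prediction="Hello", reference="Hi"))
# e.g. {'score': 0.09}
```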
William FH 11 months ago committed by GitHub
parent fb6e63dc36
commit 4789c99bc2

@ -165,28 +165,35 @@ Classes
callbacks.aim_callback.AimCallbackHandler
callbacks.argilla_callback.ArgillaCallbackHandler
callbacks.arize_callback.ArizeCallbackHandler
callbacks.arthur_callback.ArthurCallbackHandler
callbacks.base.AsyncCallbackHandler
callbacks.base.BaseCallbackHandler
callbacks.base.BaseCallbackManager
callbacks.clearml_callback.ClearMLCallbackHandler
callbacks.comet_ml_callback.CometCallbackHandler
callbacks.file.FileCallbackHandler
callbacks.flyte_callback.FlyteCallbackHandler
callbacks.human.HumanApprovalCallbackHandler
callbacks.human.HumanRejectedException
callbacks.infino_callback.InfinoCallbackHandler
callbacks.manager.AsyncCallbackManager
callbacks.manager.AsyncCallbackManagerForChainRun
callbacks.manager.AsyncCallbackManagerForLLMRun
callbacks.manager.AsyncCallbackManagerForRetrieverRun
callbacks.manager.AsyncCallbackManagerForToolRun
callbacks.manager.AsyncParentRunManager
callbacks.manager.AsyncRunManager
callbacks.manager.BaseRunManager
callbacks.manager.CallbackManager
callbacks.manager.CallbackManagerForChainRun
callbacks.manager.CallbackManagerForLLMRun
callbacks.manager.CallbackManagerForRetrieverRun
callbacks.manager.CallbackManagerForToolRun
callbacks.manager.ParentRunManager
callbacks.manager.RunManager
callbacks.mlflow_callback.MlflowCallbackHandler
callbacks.openai_info.OpenAICallbackHandler
callbacks.promptlayer_callback.PromptLayerCallbackHandler
callbacks.stdout.StdOutCallbackHandler
callbacks.streaming_aiter.AsyncIteratorCallbackHandler
callbacks.streaming_aiter_final_only.AsyncFinalIteratorCallbackHandler
@ -229,6 +236,8 @@ Functions
callbacks.aim_callback.import_aim
callbacks.clearml_callback.import_clearml
callbacks.comet_ml_callback.import_comet_ml
callbacks.flyte_callback.analyze_text
callbacks.flyte_callback.import_flytekit
callbacks.infino_callback.import_infino
callbacks.manager.env_var_is_set
callbacks.manager.get_openai_callback
@ -283,9 +292,11 @@ Classes
chains.base.Chain
chains.combine_documents.base.AnalyzeDocumentChain
chains.combine_documents.base.BaseCombineDocumentsChain
chains.combine_documents.map_reduce.CombineDocsProtocol
chains.combine_documents.map_reduce.MapReduceDocumentsChain
chains.combine_documents.map_rerank.MapRerankDocumentsChain
chains.combine_documents.reduce.AsyncCombineDocsProtocol
chains.combine_documents.reduce.CombineDocsProtocol
chains.combine_documents.reduce.ReduceDocumentsChain
chains.combine_documents.refine.RefineDocumentsChain
chains.combine_documents.stuff.StuffDocumentsChain
chains.constitutional_ai.base.ConstitutionalChain
@ -299,8 +310,10 @@ Classes
chains.flare.prompts.FinishedOutputParser
chains.graph_qa.base.GraphQAChain
chains.graph_qa.cypher.GraphCypherQAChain
chains.graph_qa.hugegraph.HugeGraphQAChain
chains.graph_qa.kuzu.KuzuQAChain
chains.graph_qa.nebulagraph.NebulaGraphQAChain
chains.graph_qa.sparql.GraphSparqlQAChain
chains.hyde.base.HypotheticalDocumentEmbedder
chains.llm.LLMChain
chains.llm_bash.base.LLMBashChain
@ -363,7 +376,6 @@ Functions
.. autosummary::
:toctree: chains
chains.combine_documents.base.format_document
chains.graph_qa.cypher.extract_cypher
chains.loading.load_chain
chains.loading.load_chain_from_config
@ -415,6 +427,7 @@ Classes
chat_models.fake.FakeListChatModel
chat_models.google_palm.ChatGooglePalm
chat_models.google_palm.ChatGooglePalmError
chat_models.human.HumanInputChatModel
chat_models.openai.ChatOpenAI
chat_models.promptlayer_openai.PromptLayerChatOpenAI
chat_models.vertexai.ChatVertexAI
@ -513,6 +526,7 @@ Classes
document_loaders.blob_loaders.youtube_audio.YoutubeAudioLoader
document_loaders.blockchain.BlockchainDocumentLoader
document_loaders.blockchain.BlockchainType
document_loaders.brave_search.BraveSearchLoader
document_loaders.chatgpt.ChatGPTLoader
document_loaders.college_confidential.CollegeConfidentialLoader
document_loaders.confluence.ConfluenceLoader
@ -520,6 +534,7 @@ Classes
document_loaders.conllu.CoNLLULoader
document_loaders.csv_loader.CSVLoader
document_loaders.csv_loader.UnstructuredCSVLoader
document_loaders.cube_semantic.CubeSemanticLoader
document_loaders.dataframe.DataFrameLoader
document_loaders.diffbot.DiffbotLoader
document_loaders.directory.DirectoryLoader
@ -736,6 +751,7 @@ Classes
embeddings.self_hosted.SelfHostedEmbeddings
embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceEmbeddings
embeddings.self_hosted_hugging_face.SelfHostedHuggingFaceInstructEmbeddings
embeddings.spacy_embeddings.SpacyEmbeddings
embeddings.tensorflow_hub.TensorflowHubEmbeddings
embeddings.vertexai.VertexAIEmbeddings
@ -790,6 +806,9 @@ Classes
evaluation.comparison.eval_chain.PairwiseStringResultOutputParser
evaluation.criteria.eval_chain.CriteriaEvalChain
evaluation.criteria.eval_chain.CriteriaResultOutputParser
evaluation.embedding_distance.base.EmbeddingDistance
evaluation.embedding_distance.base.EmbeddingDistanceEvalChain
evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain
evaluation.qa.eval_chain.ContextQAEvalChain
evaluation.qa.eval_chain.CotQAEvalChain
evaluation.qa.eval_chain.QAEvalChain
@ -799,10 +818,16 @@ Classes
evaluation.run_evaluators.implementations.ChoicesOutputParser
evaluation.run_evaluators.implementations.CriteriaOutputParser
evaluation.run_evaluators.implementations.StringRunEvaluatorInputMapper
evaluation.run_evaluators.implementations.TrajectoryEvalOutputParser
evaluation.run_evaluators.implementations.TrajectoryInputMapper
evaluation.run_evaluators.implementations.TrajectoryRunEvalOutputParser
evaluation.schema.AgentTrajectoryEvaluator
evaluation.schema.EvaluatorType
evaluation.schema.LLMEvalChain
evaluation.schema.PairwiseStringEvaluator
evaluation.schema.StringEvaluator
evaluation.string_distance.base.PairwiseStringDistanceEvalChain
evaluation.string_distance.base.StringDistance
evaluation.string_distance.base.StringDistanceEvalChain
Functions
--------------
@ -812,6 +837,8 @@ Functions
:toctree: evaluation
evaluation.loading.load_dataset
evaluation.loading.load_evaluator
evaluation.loading.load_evaluators
evaluation.run_evaluators.implementations.get_criteria_evaluator
evaluation.run_evaluators.implementations.get_qa_evaluator
evaluation.run_evaluators.implementations.get_trajectory_evaluator
@ -1057,6 +1084,7 @@ Functions
llms.aviary.get_completions
llms.aviary.get_models
llms.base.create_base_retry_decorator
llms.base.get_prompts
llms.base.update_cache
llms.cohere.completion_with_retry
@ -1069,6 +1097,7 @@ Functions
llms.openai.completion_with_retry
llms.openai.update_token_usage
llms.utils.enforce_stop_tokens
llms.vertexai.completion_with_retry
llms.vertexai.is_codey_model
:mod:`langchain.load`: Load
@ -1241,7 +1270,6 @@ Classes
:toctree: prompts
:template: class.rst
prompts.base.BasePromptTemplate
prompts.base.StringPromptTemplate
prompts.base.StringPromptValue
prompts.chat.AIMessagePromptTemplate
@ -1348,7 +1376,7 @@ Classes
retrievers.multi_query.LineListOutputParser
retrievers.multi_query.MultiQueryRetriever
retrievers.pinecone_hybrid_search.PineconeHybridSearchRetriever
retrievers.pupmed.PubMedRetriever
retrievers.pubmed.PubMedRetriever
retrievers.remote_retriever.RemoteLangChainRetriever
retrievers.self_query.base.SelfQueryRetriever
retrievers.self_query.chroma.ChromaTranslator
@ -1400,28 +1428,29 @@ Classes
:toctree: schema
:template: class.rst
schema.AIMessage
schema.AgentFinish
schema.BaseChatMessageHistory
schema.BaseDocumentTransformer
schema.BaseLLMOutputParser
schema.BaseMemory
schema.BaseMessage
schema.BaseOutputParser
schema.BaseRetriever
schema.ChatGeneration
schema.ChatMessage
schema.ChatResult
schema.Document
schema.FunctionMessage
schema.Generation
schema.HumanMessage
schema.LLMResult
schema.NoOpOutputParser
schema.OutputParserException
schema.PromptValue
schema.RunInfo
schema.SystemMessage
schema.agent.AgentFinish
schema.document.BaseDocumentTransformer
schema.document.Document
schema.memory.BaseChatMessageHistory
schema.memory.BaseMemory
schema.messages.AIMessage
schema.messages.BaseMessage
schema.messages.ChatMessage
schema.messages.FunctionMessage
schema.messages.HumanMessage
schema.messages.SystemMessage
schema.output.ChatGeneration
schema.output.ChatResult
schema.output.Generation
schema.output.LLMResult
schema.output.RunInfo
schema.output_parser.BaseLLMOutputParser
schema.output_parser.BaseOutputParser
schema.output_parser.NoOpOutputParser
schema.output_parser.OutputParserException
schema.prompt.PromptValue
schema.prompt_template.BasePromptTemplate
schema.retriever.BaseRetriever
Functions
--------------
@ -1430,9 +1459,10 @@ Functions
.. autosummary::
:toctree: schema
schema.get_buffer_string
schema.messages_from_dict
schema.messages_to_dict
schema.messages.get_buffer_string
schema.messages.messages_from_dict
schema.messages.messages_to_dict
schema.prompt_template.format_document
:mod:`langchain.server`: Server
================================
@ -1535,6 +1565,8 @@ Classes
tools.bing_search.tool.BingSearchRun
tools.brave_search.tool.BraveSearch
tools.convert_to_openai.FunctionDescription
tools.dataforseo_api_search.tool.DataForSeoAPISearchResults
tools.dataforseo_api_search.tool.DataForSeoAPISearchRun
tools.ddg_search.tool.DuckDuckGoSearchResults
tools.ddg_search.tool.DuckDuckGoSearchRun
tools.file_management.copy.CopyFileTool
@ -1708,6 +1740,7 @@ Classes
utilities.bibtex.BibtexparserWrapper
utilities.bing_search.BingSearchAPIWrapper
utilities.brave_search.BraveSearchWrapper
utilities.dataforseo_api_search.DataForSeoAPIWrapper
utilities.duckduckgo_search.DuckDuckGoSearchAPIWrapper
utilities.google_places_api.GooglePlacesAPIWrapper
utilities.google_search.GoogleSearchAPIWrapper
@ -1805,12 +1838,17 @@ Classes
vectorstores.faiss.FAISS
vectorstores.hologres.Hologres
vectorstores.lancedb.LanceDB
vectorstores.marqo.Marqo
vectorstores.matching_engine.MatchingEngine
vectorstores.milvus.Milvus
vectorstores.mongodb_atlas.MongoDBAtlasVectorSearch
vectorstores.myscale.MyScale
vectorstores.myscale.MyScaleSettings
vectorstores.opensearch_vector_search.OpenSearchVectorSearch
vectorstores.pgembedding.BaseModel
vectorstores.pgembedding.CollectionStore
vectorstores.pgembedding.EmbeddingStore
vectorstores.pgembedding.PGEmbedding
vectorstores.pgvector.BaseModel
vectorstores.pgvector.CollectionStore
vectorstores.pgvector.DistanceStrategy

@ -3,32 +3,63 @@
This module contains off-the-shelf evaluation chains for grading the output of
LangChain primitives such as language models and chains.
To load an evaluator, you can use the :func:`load_evaluators <langchain.evaluation.loading.load_evaluators>` function with the
**Loading an evaluator**
To load an evaluator, you can use the :func:`load_evaluators <langchain.evaluation.loading.load_evaluators>` or
:func:`load_evaluator <langchain.evaluation.loading.load_evaluator>` functions with the
names of the evaluators to load.
.. code-block:: python
from langchain.evaluation import load_evaluator
evaluator = load_evaluator("qa")
evaluator.evaluate_strings(
prediction="We sold more than 40,000 units last week",
input="How many units did we sell last week?",
reference="We sold 32,378 units",
)
The evaluator must be one of :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`.
**Datasets**
To load one of the LangChain HuggingFace datasets, you can use the :func:`load_dataset <langchain.evaluation.loading.load_dataset>` function with the
name of the dataset to load.
Some common use cases for evaluation include:
.. code-block:: python
from langchain.evaluation import load_dataset
ds = load_dataset("llm-math")
**Some common use cases for evaluation include:**
- Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain <langchain.evaluation.qa.eval_chain.QAEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>`
- Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain <langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain>`
- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain <langchain.evaluation.criteria.eval_chain.CriteriaEvalChain>`
- Computing semantic difference between a prediction and reference: :class:`EmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain>` or between two predictions: :class:`PairwiseEmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain>`
- Measuring the string distance between a prediction and reference: :class:`StringDistanceEvalChain <langchain.evaluation.string_distance.base.StringDistanceEvalChain>` or between two predictions: :class:`PairwiseStringDistanceEvalChain <langchain.evaluation.string_distance.base.PairwiseStringDistanceEvalChain>` (sketched below)
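For example, the pairwise variants compare two predictions directly (an editorial sketch; the pairwise string distance chain requires the ``rapidfuzz`` package, and the pairwise embedding chain defaults to OpenAI embeddings, so an API key is assumed):
.. code-block:: python
    from langchain.evaluation import (
        PairwiseEmbeddingDistanceEvalChain,
        PairwiseStringDistanceEvalChain,
    )
    string_chain = PairwiseStringDistanceEvalChain()
    string_chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
    # {'score': 4}
    embedding_chain = PairwiseEmbeddingDistanceEvalChain()
    embedding_chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
    # e.g. {'score': 0.1}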
**Low-level API**
This module also contains low-level APIs for creating custom evaluators for
specific evaluation tasks. These include:
These evaluators implement one of the following interfaces:
- :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`: Evaluate a prediction string against a reference label and/or input context.
- :class:`PairwiseStringEvaluator <langchain.evaluation.schema.PairwiseStringEvaluator>`: Evaluate two prediction strings against each other.
Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs.
- :class:`AgentTrajectoryEvaluator <langchain.evaluation.schema.AgentTrajectoryEvaluator>`: Evaluate the full sequence of actions
taken by an agent.
- :class:`PairwiseStringEvaluator <langchain.evaluation.schema.PairwiseStringEvaluator>`: Evaluate two prediction strings against each other. Useful for scoring preferences, measuring similarity between two chain or llm agents, or comparing outputs on similar inputs.
- :class:`AgentTrajectoryEvaluator <langchain.evaluation.schema.AgentTrajectoryEvaluator>`: Evaluate the full sequence of actions taken by an agent.
These interfaces enable easier composability and usage within a higher level evaluation framework.
""" # noqa: E501
from langchain.evaluation.agents import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria import CriteriaEvalChain
from langchain.evaluation.embedding_distance import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import (
@ -37,6 +68,11 @@ from langchain.evaluation.schema import (
PairwiseStringEvaluator,
StringEvaluator,
)
from langchain.evaluation.string_distance import (
PairwiseStringDistanceEvalChain,
StringDistance,
StringDistanceEvalChain,
)
__all__ = [
"EvaluatorType",
@ -48,6 +84,12 @@ __all__ = [
"PairwiseStringEvaluator",
"TrajectoryEvalChain",
"CriteriaEvalChain",
"EmbeddingDistance",
"EmbeddingDistanceEvalChain",
"PairwiseEmbeddingDistanceEvalChain",
"StringDistance",
"StringDistanceEvalChain",
"PairwiseStringDistanceEvalChain",
"load_evaluators",
"load_evaluator",
"load_dataset",

@ -77,40 +77,42 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
the sequence of actions taken and their outcomes.
Example:
.. code-block:: python
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain
from langchain.tools import tool
@tool
def geography_answers(country: str, question: str) -> str:
\"\"\"Very helpful answers to geography questions.\"\"\"
return f"{country}? IDK - We may never know {question}."
llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
agent = initialize_agent(
tools=[geography_answers],
llm=llm,
agent=AgentType.OPENAI_FUNCTIONS,
return_intermediate_steps=True,
)
question = "How many dwell in the largest minor region in Argentina?"
response = agent(question)
.. code-block:: python
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=[geography_answers], return_reasoning=True
)
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain
from langchain.tools import tool
result = eval_chain.evaluate_agent_trajectory(
input=question,
agent_trajectory=response["intermediate_steps"],
prediction=response["output"],
reference="Paris",
)
print(result["score"])
# 0
@tool
def geography_answers(country: str, question: str) -> str:
\"\"\"Very helpful answers to geography questions.\"\"\"
return f"{country}? IDK - We may never know {question}."
llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
agent = initialize_agent(
tools=[geography_answers],
llm=llm,
agent=AgentType.OPENAI_FUNCTIONS,
return_intermediate_steps=True,
)
question = "How many dwell in the largest minor region in Argentina?"
response = agent(question)
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=[geography_answers], return_reasoning=True
)
result = eval_chain.evaluate_agent_trajectory(
input=question,
agent_trajectory=response["intermediate_steps"],
prediction=response["output"],
reference="Paris",
)
print(result["score"])
# 0
""" # noqa: E501
agent_tools: Optional[List[BaseTool]] = None
@ -336,7 +338,8 @@ The following is the expected answer. Use this to measure correctness:
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
dict: The evaluation result.
dict: The evaluation result, which includes the score and optionally
the reasoning for reaching that score.
"""
inputs = {
"question": input,
@ -367,7 +370,8 @@ The following is the expected answer. Use this to measure correctness:
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
dict: The evaluation result.
dict: The evaluation result, which includes the score and optionally
the reasoning for reaching that score.
"""
inputs = {
"question": input,

@ -52,7 +52,8 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
"""A chain for comparing the output of two models.
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs.
Example:
>>> from langchain.chat_models import ChatOpenAI

@ -92,10 +92,37 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
--------
>>> from langchain.chat_models import ChatAnthropic
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = ChatAnthropic()
>>> llm = ChatAnthropic(temperature=0)
>>> criteria = {"my-custom-criterion": "Is the submission the most amazing ever?"}
>>> chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
"""
>>> evaluator = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
>>> evaluator.evaluate_strings(prediction="Imagine an ice cream flavor for the color aquamarine", input="Tell me an idea")
{
'reasoning': 'Here is my step-by-step reasoning for the given criteria:\\n\\nThe criterion is: "Is the submission the most amazing ever?" This is a subjective criterion and open to interpretation. The submission suggests an aquamarine-colored ice cream flavor which is creative but may or may not be considered the most amazing idea ever conceived. There are many possible amazing ideas and this one ice cream flavor suggestion may or may not rise to that level for every person. \\n\\nN',
'value': 'N',
'score': 0,
}
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> criteria = "correctness"
>>> evaluator = CriteriaEvalChain.from_llm(
... llm=llm,
... criteria=criteria,
... requires_reference=True,
... )
>>> evaluator.evaluate_strings(
... prediction="The answer is 4",
... input="How many apples are there?",
... reference="There are 3 apples",
... )
{
'score': 0,
'reasoning': 'The criterion for this task is the correctness of the submission. The submission states that there are 4 apples, but the reference indicates that there are actually 3 apples. Therefore, the submission is not correct, accurate, or factual according to the given criterion.\\n\\nN',
'value': 'N',
}
""" # noqa: E501
output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
"""The parser to use to map the output to a structured result."""

@ -0,0 +1,12 @@
"""Evaluators that measure embedding distances."""
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
__all__ = [
"EmbeddingDistance",
"EmbeddingDistanceEvalChain",
"PairwiseEmbeddingDistanceEvalChain",
]

@ -0,0 +1,438 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.math_utils import cosine_similarity
class EmbeddingDistance(str, Enum):
"""Embedding Distance Metric.
Attributes:
COSINE: Cosine distance metric.
EUCLIDEAN: Euclidean distance metric.
MANHATTAN: Manhattan distance metric.
CHEBYSHEV: Chebyshev distance metric.
HAMMING: Hamming distance metric.
"""
COSINE = "cosine"
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"
CHEBYSHEV = "chebyshev"
HAMMING = "hamming"
class _EmbeddingDistanceChainMixin(Chain):
"""Shared functionality for embedding distance evaluators.
Attributes:
embeddings (Embeddings): The embedding objects to vectorize the outputs.
distance_metric (EmbeddingDistance): The distance metric to use
for comparing the embeddings.
"""
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
class Config:
"""Permit embeddings to go unvalidated."""
arbitrary_types_allowed: bool = True
@property
def output_keys(self) -> List[str]:
"""Return the output keys of the chain.
Returns:
List[str]: The output keys.
"""
return ["score"]
@root_validator
def _validate_distance_metric(cls, values: dict) -> dict:
"""Validate the distance metric.
Args:
values (dict): The values to validate.
Returns:
dict: The validated values.
"""
values["distance_metric"] = values["distance_metric"].lower()
return values
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
Args:
metric (EmbeddingDistance): The metric name.
Returns:
Any: The metric function.
"""
metrics = {
EmbeddingDistance.COSINE: self._cosine_distance,
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
EmbeddingDistance.HAMMING: self._hamming_distance,
}
if metric in metrics:
return metrics[metric]
else:
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""Compute the cosine distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.ndarray: The cosine distance.
"""
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Euclidean distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Euclidean distance.
"""
return np.linalg.norm(a - b)
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Manhattan distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Manhattan distance.
"""
return np.sum(np.abs(a - b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Chebyshev distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Chebyshev distance.
"""
return np.max(np.abs(a - b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
"""Compute the Hamming distance between two vectors.
Args:
a (np.ndarray): The first vector.
b (np.ndarray): The second vector.
Returns:
np.floating: The Hamming distance.
"""
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
"""Compute the score based on the distance metric.
Args:
vectors (np.ndarray): The input vectors.
Returns:
float: The computed score.
"""
metric = self._get_metric(self.distance_metric)
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
return score
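# Editorial note (not part of the diff): with the default cosine metric the
# score is 1 - cos_sim(vectors[0], vectors[1]); e.g. orthogonal unit vectors
# [1, 0] and [0, 1] score 1.0, while identical vectors score 0.0.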
class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
"""Use embedding distances to score semantic difference between
a prediction and reference.
Examples:
>>> chain = EmbeddingDistanceEvalChain()
>>> result = chain.evaluate_strings(prediction="Hello", reference="Hi")
>>> print(result)
{'score': 0.5}
"""
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if a reference is required, False otherwise.
"""
return True
@property
def input_keys(self) -> List[str]:
"""Return the input keys of the chain.
Returns:
List[str]: The input keys.
"""
return ["prediction", "reference"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Compute the score for a prediction and reference.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (Optional[CallbackManagerForChainRun], optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Asynchronously compute the score for a prediction and reference.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (AsyncCallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between a prediction and
reference.
Args:
prediction (str): The prediction string.
reference (str): The reference string (required).
callbacks (Callbacks, optional): The callbacks to use.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the prediction and reference.
"""
return self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
async def _aevaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance between
a prediction and reference.
Args:
prediction (str): The prediction string.
reference (str): The reference string (required).
callbacks (Callbacks, optional): The callbacks to use.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the prediction and reference.
"""
return await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
class PairwiseEmbeddingDistanceEvalChain(
_EmbeddingDistanceChainMixin, PairwiseStringEvaluator
):
"""Use embedding distances to score semantic difference between two predictions.
Examples:
>>> chain = PairwiseEmbeddingDistanceEvalChain()
>>> result = chain.evaluate_string_pairs(prediction="Hello", prediction_b="Hi")
>>> print(result)
{'score': 0.5}
"""
@property
def input_keys(self) -> List[str]:
"""Return the input keys of the chain.
Returns:
List[str]: The input keys.
"""
return ["prediction", "prediction_b"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Compute the score for two predictions.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (CallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Asynchronously compute the score for two predictions.
Args:
inputs (Dict[str, Any]): The input data.
run_manager (AsyncCallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The computed score.
"""
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def _evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
result = self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}
async def _aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
result = await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}

@ -1,25 +1,46 @@
"""Loading datasets and evaluators."""
from typing import Any, Dict, List, Optional, Sequence, Type
from typing import Any, Dict, List, Optional, Sequence, Type, Union
from langchain.chains.base import Chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel
def load_dataset(uri: str) -> List[Dict]:
"""Load a dataset from the LangChainDatasets HuggingFace org.
"""Load a dataset from the `LangChainDatasets HuggingFace org <https://huggingface.co/LangChainDatasets>`_.
Args:
uri: The uri of the dataset to load.
Returns:
A list of dictionaries, each representing a row in the dataset.
"""
**Prerequisites**
.. code-block:: shell
pip install datasets
Examples
--------
.. code-block:: python
from langchain.evaluation import load_dataset
ds = load_dataset("llm-math")
""" # noqa: E501
try:
from datasets import load_dataset
except ImportError:
@ -32,13 +53,17 @@ def load_dataset(uri: str) -> List[Dict]:
return [d for d in dataset["train"]]
_EVALUATOR_MAP: Dict[EvaluatorType, Type[LLMEvalChain]] = {
_EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
EvaluatorType.QA: QAEvalChain,
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.STRING_DISTANCE: StringDistanceEvalChain,
EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,
EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
}
@ -66,8 +91,8 @@ def load_evaluator(
Examples
--------
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> evaluator = _load_evaluator("qa", llm=llm)
>>> from langchain.evaluation import load_evaluator, EvaluatorType
>>> evaluator = load_evaluator(EvaluatorType.QA)
"""
llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
if evaluator not in _EVALUATOR_MAP:
@ -75,7 +100,11 @@ def load_evaluator(
f"Unknown evaluator type: {evaluator}"
f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
)
return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs)
evaluator_cls = _EVALUATOR_MAP[evaluator]
if issubclass(evaluator_cls, LLMEvalChain):
return evaluator_cls.from_llm(llm=llm, **kwargs)
else:
return evaluator_cls(**kwargs)
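# Editorial note (not part of the diff): for evaluators that are not LLM
# chains, kwargs are forwarded straight to the chain constructor, e.g.
# (assuming rapidfuzz is installed):
#
#     from langchain.evaluation import StringDistance
#     load_evaluator("string_distance", distance=StringDistance.JARO)
#
# Note that the default ChatOpenAI above is constructed regardless, so an
# OpenAI API key is still assumed.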
def load_evaluators(
@ -107,10 +136,9 @@ def load_evaluators(
Examples
--------
.. code-block:: python
from langchain.evaluation import load_evaluators, EvaluatorType
evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
>>> from langchain.evaluation import load_evaluators, EvaluatorType
>>> evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
>>> loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
"""
llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
loaded = []

@ -167,6 +167,11 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
"""Whether the chain requires an input string."""
return True
class Config:
"""Configuration for the QAEvalChain."""
extra = Extra.ignore
@classmethod
def _validate_input_vars(cls, prompt: PromptTemplate) -> None:
expected_input_vars = {"query", "context", "result"}

@ -77,7 +77,7 @@ class RunEvaluatorChain(Chain, RunEvaluator):
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: AsyncCallbackManagerForChainRun | None = None,
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")

@ -33,6 +33,14 @@ class EvaluatorType(str, Enum):
CRITERIA = "criteria"
"""The criteria evaluator, which evaluates a model based on a
custom set of criteria."""
STRING_DISTANCE = "string_distance"
"""Compare predictions to a reference answer using string edit distances."""
PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
"""Compare predictions based on string edit distances."""
EMBEDDING_DISTANCE = "embedding_distance"
"""Compare a prediction to a reference label using embedding distance."""
PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
"""Compare two predictions using embedding distance."""
class LLMEvalChain(Chain):
@ -89,7 +97,8 @@ class _EvalArgsMixin:
class StringEvaluator(_EvalArgsMixin, ABC):
"""Protocol for evaluating strings."""
"""Grade, tag, or otherwise evaluate predictions relative to their inputs
and/or reference labels."""
@property
def evaluation_name(self) -> str:
@ -204,7 +213,7 @@ class StringEvaluator(_EvalArgsMixin, ABC):
class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
"""A protocol for comparing the output of two models."""
"""Compare the output of two models (or two outputs of the same model)."""
@abstractmethod
def _evaluate_string_pairs(

@ -0,0 +1,12 @@
"""String distance evaluators."""
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistance,
StringDistanceEvalChain,
)
__all__ = [
"PairwiseStringDistanceEvalChain",
"StringDistance",
"StringDistanceEvalChain",
]

@ -0,0 +1,376 @@
"""String distance evaluators based on the RapidFuzz library."""
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
def _load_rapidfuzz() -> Any:
"""
Load the RapidFuzz library.
Raises:
ImportError: If the rapidfuzz library is not installed.
Returns:
Any: The rapidfuzz.distance module.
"""
try:
import rapidfuzz
except ImportError:
raise ImportError(
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
)
return rapidfuzz.distance
class StringDistance(str, Enum):
"""Distance metric to use."""
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
JARO = "jaro"
JARO_WINKLER = "jaro_winkler"
class _RapidFuzzChainMixin(Chain):
"""Shared methods for the rapidfuzz string distance evaluators."""
distance: StringDistance = Field(default=StringDistance.LEVENSHTEIN)
@root_validator
def validate_dependencies(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate that the rapidfuzz library is installed.
Args:
values (Dict[str, Any]): The input values.
Returns:
Dict[str, Any]: The validated values.
"""
_load_rapidfuzz()
return values
@property
def output_keys(self) -> List[str]:
"""
Get the output keys.
Returns:
List[str]: The output keys.
"""
return ["score"]
@staticmethod
def _get_metric(distance: str) -> Callable:
"""
Get the distance metric function based on the distance type.
Args:
distance (str): The distance type.
Returns:
Callable: The distance metric function.
Raises:
ValueError: If the distance metric is invalid.
"""
rf_distance = _load_rapidfuzz()
if distance == StringDistance.DAMERAU_LEVENSHTEIN:
return rf_distance.DamerauLevenshtein.distance
elif distance == StringDistance.LEVENSHTEIN:
return rf_distance.Levenshtein.distance
elif distance == StringDistance.JARO:
return rf_distance.Jaro.distance
elif distance == StringDistance.JARO_WINKLER:
return rf_distance.JaroWinkler.distance
else:
raise ValueError(f"Invalid distance metric: {distance}")
@property
def metric(self) -> Callable:
"""
Get the distance metric function.
Returns:
Callable: The distance metric function.
"""
return _RapidFuzzChainMixin._get_metric(self.distance)
class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
"""Compute string distances between the prediction and the reference."""
@property
def requires_input(self) -> bool:
"""
Check if input is required.
Returns:
bool: True if input is required, False otherwise.
"""
return False
@property
def requires_reference(self) -> bool:
"""
Check if reference is required.
Returns:
bool: True if reference is required, False otherwise.
"""
return True
@property
def input_keys(self) -> List[str]:
"""
Get the input keys.
Returns:
List[str]: The input keys.
"""
return ["reference", "prediction"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Compute the string distance between the prediction and the reference.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (Optional[CallbackManagerForChainRun]):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["reference"], inputs["prediction"])}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Asynchronously compute the string distance between the prediction
and the reference.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (Optional[AsyncCallbackManagerForChainRun], optional):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["reference"], inputs["prediction"])}
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""
Evaluate the string distance between the prediction and the reference.
Args:
prediction (str): The prediction string.
reference (Optional[str], optional): The reference string.
input (Optional[str], optional): The input string.
callbacks (Callbacks, optional): The callbacks to use.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
return {"score": result["score"]}
async def _aevaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""
Asynchronously evaluate the string distance between the
prediction and the reference.
Args:
prediction (str): The prediction string.
reference (Optional[str], optional): The reference string.
input (Optional[str], optional): The input string.
callbacks (Callbacks, optional): The callbacks to use.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
)
return {"score": result["score"]}
class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvaluator):
"""Compute string edit distances between two predictions."""
@property
def input_keys(self) -> List[str]:
"""
Get the input keys.
Returns:
List[str]: The input keys.
"""
return ["prediction", "prediction_b"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Compute the string distance between two predictions.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (CallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""
Asynchronously compute the string distance between two predictions.
Args:
inputs (Dict[str, Any]): The input values.
run_manager (AsyncCallbackManagerForChainRun, optional):
The callback manager.
Returns:
Dict[str, Any]: The evaluation results containing the score.
"""
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
def _evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""
Evaluate the string distance between two predictions.
Args:
prediction (str): The first prediction string.
prediction_b (str): The second prediction string.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}
async def _aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> dict:
"""
Asynchronously evaluate the string distance between two predictions.
Args:
prediction (str): The first prediction string.
prediction_b (str): The second prediction string.
callbacks (Callbacks, optional): The callbacks to use.
tags (List[str], optional): Tags to apply to traces.
metadata (Dict[str, Any], optional): Metadata to apply to traces.
**kwargs: Additional keyword arguments.
Returns:
dict: The evaluation results containing the score.
"""
result = await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
tags=tags,
metadata=metadata,
)
return {"score": result["score"]}

poetry.lock

@ -8920,6 +8920,111 @@ packaging = "*"
[package.extras]
test = ["pytest (>=6,!=7.0.0,!=7.0.1)", "pytest-cov (>=3.0.0)", "pytest-qt"]
[[package]]
name = "rapidfuzz"
version = "3.1.1"
description = "rapid fuzzy string matching"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:17e4cbe6632aae7c35101c4b7c498e83f6eacf61be0def4ff98167df30dc69ca"},
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:167dbce2da6bb5b73d43e53434c5a9d7d1214b658b315420e44044782f4c482b"},
{file = "rapidfuzz-3.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdee4f4d04761ce167538adbefa01a64e7cab949d89aa09df39ef0d5e859fb2a"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e77ed7d0bd8d9be530c462c921904ada8d3417671eed749784c5a315af334d"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fdd2ab5ab56fcaf839a9f58caa8756dbfeba0b3dc187850b763d0a1e6ee9c97a"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0843c53d54d5b7d6122d8f1d7574d8c91a7aacc5c316f74d6e33d98aec82949d"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b3e953dcef0302eeb4fe8c7c4907e50d175199fc07da05ad6bd1d8d141ff138"},
{file = "rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec5523d5c08c639cd4e301d42f3ad7c6fb061a1f1cd6b5b627e59af345edfed7"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b4995792e106c3f1ab6f56dd6089918b065888e2e55a71e3fea8d0f66bf30989"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cdbf9a76ea47f14026daaed43a2c2150ab0e9a4d5396909f028380f33e61c522"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f25d1975e846d07990cf946a5927a932aa7cccd308ae9979b03a58ff1cd80087"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e0755f5ac6c3d1dc2505eb2e6eaf5508ff17b42c084406714fbabf2d50d098b6"},
{file = "rapidfuzz-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:de784bbe06d32e66617cd20766c37aae2438902d54b3fa608d2e0a929ca705f4"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win32.whl", hash = "sha256:ef6c38040d868dcc0132fad377aafeb5b2da71354759e77f41ae599316df2dee"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c74fde444bcd13ef3a803c578b28f33b4f9edf368f46ca3de57fda456065967"},
{file = "rapidfuzz-3.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:e549da8d68ad4ee385c918ea8b9efeda875df9edf6c6b48df927bd061c00bfef"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:58ca539cc6ce385d650138a9b1908b05622c2dd08a23d5aea4890523ef3774d5"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91946c496e6f380939dbea14ff6ce6de87480445c09d03964f5374101462594b"},
{file = "rapidfuzz-3.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f2024f83a9300440e845b441e71726471f7567021c1d80796ca02e71c5f0dc2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17b017f9e1b88dfd6d9b03170ef8e86477de0d9d37fbfcbe72ca070cacbe1b65"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6772eb7cc4429f1eae5a9b41e5b0b1af8f0d50727c6e338d9ad5bceee01da5a"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c089ce856919e03f4dd8f9168d60ac580d30cd0451fd60dcdef73010eca68973"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f2cd9a3760080876fc59edb26926e51d6db44dea65e85f1eb04aa5f58c3bc41"},
{file = "rapidfuzz-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f32791ee045a7b3d6a56208a55d996d5f7a32fdb688f5c5ee899cb7589539eb"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:68d910048b36613701ea671de68f701e2c1ba2839295238def840ff1fc1b15f4"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6f767d4823002e65c06ea273f952fda2b88775e1c2d508564f04d32cdd7f65b2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:10313075642a9f1f948d356f4f0803ae28a496d7967b466b9cae1a4be8aa4df3"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1465ea085154378e69bf4bc5e27bdac5c94684416882ace31865232adc9239a2"},
{file = "rapidfuzz-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:53e3c588e7ea158fa80095dd0ff53f49e2ede9a8d71a3a5b964ca045d845a9b9"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win32.whl", hash = "sha256:cb08db5c122fea4196483b82f7596e50ef9cab1770f7696c197bf0815ac4dd17"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b7c65112c87568274d399ad7a62902cef17801c2bd047b162e79e43758b3ce27"},
{file = "rapidfuzz-3.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:ea3e46a534de97a6cad2018cb950492a0fcacad380e35440ce3c1c8fef96a261"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a8bb256b34fcad4f3fa00be6b57fe35bcb54f031911195929145c67d9738ffec"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51f21f37aec6bc117e9083181ddc3cbbcbf56b6506492b128d8e836d3545ca80"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a371846f45ed9d24927a8d5222884536c1e171543396b36250fafb2e848bc92"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:25eea5c8006b6c8747ca204675c9e939f3c4d27167fb43b2aa211443d34f9abd"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:db5e71e5a810d2f1163c914e01b3ba241409a98286ac4850ff26076115ae401b"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c07e16ab38e717931319cff1340debbf2ef940a1cda4eb70e323079b62df306"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aadc5a8b9859737a8f87831215b7fab0c04afeb960bb987c528421a4e6dfb8b6"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:0de229cb613be060580c71c1674acbde57921c7ed33d7a726e071a2562924113"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:b1bf8aba99b267aad0a01dfb44ee39803676007724abcfb72129c350476b2341"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d3264e4a02e4148e30078104fb0c1b6c8eb166ddc5ebe843a22433f58f87dc47"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:712331c1c70c79a219c2ac233b4e25e75ffad51042840d147d5e94519c7d8a1a"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:6ede2d42ad55bd4e7a3394e98c5f58ddace78775493391732d32be61268a4116"},
{file = "rapidfuzz-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:32a5c47b5153f25eb512dbb91f9850225d2dcfb3404a1c48406726c7732b0726"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:51bb8f7aa4fe45618e75cdccf08491c752a7f137ffbf7d3afd1809791ac8c326"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:788fb03c5acb5b48f5f918f4cbb5dc072498becf018c64e7e27d6b76e63e68b8"},
{file = "rapidfuzz-3.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dc7f25e20781c8d42e813516ee4ff9043ecce4a8e25fc94ee6732a83d81c1c99"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4a751f216fd1222a4a8c7ceff5180872a156202c3bdca1b337e5a5b09298dfd"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83b48b789f2da1688882cba595c40179194ab15ec17ea1d4c9de9ee239649904"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a6f5cd9f1282da49b8d0747c40f3fea2d64ab5e4c2cc2295baf87ff7a0d062"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5fe8054c244bf63be2380efc275edd86da3a706460d42911dc3ff914f3260a5"},
{file = "rapidfuzz-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d4d509e9aa011e1be5e4da7c5062dc4fc3688714687110536925980b3d03ac6"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ccc1b5b467766110085c80bb9311d233fccc8ed1ce965aebba3125e1bab04cba"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7e181411958d04d5b437a0981e87815e8f1b1909f5ae0e339246d3bc464f53e7"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:c53cf36cdb10819b7154fefdbffbef442ba567d9c1ca74a7e76fd759ace45e6c"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:851b44130393139cb336aa54c681d595d75a3160b7be330f3acc0c3b9dabce70"},
{file = "rapidfuzz-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49d900da023eeb3bfbe9feee126312eb9fd0458129aa5a581e4d8d8bf4483d14"},
{file = "rapidfuzz-3.1.1-cp38-cp38-win32.whl", hash = "sha256:6c0e96821029c46847df4ff266ea283a2b6163a4f76a4567f9986934e9c4410c"},
{file = "rapidfuzz-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7af18372f576e36e93f4662bdf64043ac23dfa02d7f768d7e7e1d0211bb9cb35"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8b966344ed4122a71ab8ccdca2954db1ce0d8049cb9bcac58db07558f9d9ec32"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a293370448f2e46fdc6e086ac99923015bdc53973a65d3df35aefc685e1a5809"},
{file = "rapidfuzz-3.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:351d253fdee62d6d0e80c75f0505accc1ce8cc73a50779c60986ef21c92f20f9"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e951c874a0e5b375b2af9b5f264eefc679c0685c166ee0641e703ef0795509b"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4019def8a18bc867ac61f08a542bf474a7a9b3f662f5d5cd169c9135866562f5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:086a2d84c2e497e3ab160ccf164e319bca874d9383d008fcadf91ede8ac7997f"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6d4da453fbd8793ebb11bed396f8a4b9041d6227bf055903447305dd7942312f"},
{file = "rapidfuzz-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10f56af1d46fbeaaa0dc50901c2dc439c7a455cfdac2f1acf6cffeb65ae82c48"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7726f67e4a0b2b4392f03aa62e16b12a697156c6735df27b21bd3ab561b01659"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d72916d27fb88741bfb576b0b0639354ca00f5e91046171c985262c68a86bbb5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c85bb6946fb02231d1e60ab45c36ecee04ecf7f725e094f5beee798b6b7d36d"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:fb7049dff52cded65184a3d2ff45cfd226bff7314f49a8f4b83f943eea9181a7"},
{file = "rapidfuzz-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:408007b4bc5a0a0cb9bfcdcc8cffa9b71fec6ee53ccdf9c26b57539f7e264ab5"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win32.whl", hash = "sha256:9dc7154889937ca5a004d17f62b4798e0af52f69c38eb3112dbdb52b006d4419"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:16c506bac2e0a6f6581b334a7802c2f0d8343ec1d77e5cf9452c33d6219abef8"},
{file = "rapidfuzz-3.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:5e11e11880951e767342b56627ab2dc9d3ef90e2605b656e9b5e6e0beadaaf0f"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a8b8f32463781e4703965c9cf7a609a19a74478f332e0d62cd9d0e7a9db91321"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b408ac3c7f8c3414bfd5c6044ca4bb385b390bcf5eae3ad884cef48628c131ae"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ff1a517de2b1e80ddf1a3037a6ebca9925154c1af70751518d50d5c332e1ec8"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1e23665be5918f979180130babedab9317fbb34cdae237c7defad7e86bc684e"},
{file = "rapidfuzz-3.1.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:15260263a0c7bffac934a53b6622d77e06e10929ee4d2e62ac6f70c13988f351"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f7acc5c9c7cf567372de5b6c817f93db508e7b9bd7f29bd6187df8d2cc60ced5"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79f5a3ab7ff6c46336f38690f0564bc7689cefa180257ed9078c42f75b10c9d2"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:362e366e79fcc9a8866b41f20ef4d2987a06f8b134096e659594c059aa8a6d88"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:819d9317c3d86b508d87ab1bca5867f3abc18b902c822bc57366ccc6330a030b"},
{file = "rapidfuzz-3.1.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4a64ddfb7084b678da7778c1263aee2baae5a2ca55ec5589a022defc38103eb1"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8243bb4bb4db7c3501932ced6a978b284e19c3619b6802455e47bfd0905adb81"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39c7d0dbd77a7f28ff85a1dff2afb2ed73e5cd81cca3f654450ed339a271c0ab"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4afab735bb0ac3ec9bafcc35376ed336d26af6140c4d81e4c869e77df77ecd5"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69d503a7641b5a63aa53c7aca0b857d38f48cd7bae39f8563679b324e3d2d47a"},
{file = "rapidfuzz-3.1.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ef3ad80458e47723812976a2ea1282ff207ad20e6cb19da1917f76699bd5aaa5"},
{file = "rapidfuzz-3.1.1.tar.gz", hash = "sha256:a06a08be3cb7d7df7993dd16e84aaf59bd5a7ff98a9f1b3e893d18b273a71c64"},
]

[package.extras]
full = ["numpy"]

[[package]]
name = "ratelimiter"
version = "1.2.0.post0"
@@ -12410,7 +12515,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "telethon", "tqdm", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
@@ -12420,4 +12525,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "cc95f4e0d4bee4ba19cf539be5ffd81f1ddb33229ace936ef3b6cbd4122493ca"
content-hash = "6e2acbd4f760e92454f9f9e29840679fbd59b8662a99bcb89e2251a5b8736e6d"

@@ -116,6 +116,7 @@ streamlit = {version = "^1.18.0", optional = true, python = ">=3.8.1,<3.9.7 || >
psychicapi = {version = "^0.8.0", optional = true}
cassio = {version = "^0.0.7", optional = true}
rdflib = {version = "^6.3.2", optional = true}
rapidfuzz = {version = "^3.1.1", optional = true}

[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
@@ -346,7 +347,8 @@ extended_testing = [
    "scikit-learn",
    "streamlit",
    "pyspark",
    "openai"
    "openai",
    "rapidfuzz"
]

[[tool.poetry.source]]

@@ -0,0 +1,123 @@
from typing import Tuple

import numpy as np
import pytest

from langchain.evaluation.embedding_distance import (
    EmbeddingDistance,
    PairwiseEmbeddingDistanceEvalChain,
)


@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
    """Create two random vectors."""
    vector_a = np.array(
        [
            0.5488135,
            0.71518937,
            0.60276338,
            0.54488318,
            0.4236548,
            0.64589411,
            0.43758721,
            0.891773,
            0.96366276,
            0.38344152,
        ]
    )
    vector_b = np.array(
        [
            0.79172504,
            0.52889492,
            0.56804456,
            0.92559664,
            0.07103606,
            0.0871293,
            0.0202184,
            0.83261985,
            0.77815675,
            0.87001215,
        ]
    )
    return vector_a, vector_b


@pytest.fixture
def chain() -> PairwiseEmbeddingDistanceEvalChain:
    """Create a PairwiseEmbeddingDistanceEvalChain."""
    return PairwiseEmbeddingDistanceEvalChain()


@pytest.mark.requires("scipy")
def test_cosine_similarity(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the cosine similarity."""
    chain.distance_metric = EmbeddingDistance.COSINE
    result = chain._compute_score(np.array(vectors))
    expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
        np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
    )
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_euclidean_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the euclidean distance."""
    from scipy.spatial.distance import euclidean

    chain.distance_metric = EmbeddingDistance.EUCLIDEAN
    result = chain._compute_score(np.array(vectors))
    expected = euclidean(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_manhattan_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the manhattan distance."""
    from scipy.spatial.distance import cityblock

    chain.distance_metric = EmbeddingDistance.MANHATTAN
    result = chain._compute_score(np.array(vectors))
    expected = cityblock(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_chebyshev_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the chebyshev distance."""
    from scipy.spatial.distance import chebyshev

    chain.distance_metric = EmbeddingDistance.CHEBYSHEV
    result = chain._compute_score(np.array(vectors))
    expected = chebyshev(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_hamming_distance(
    chain: PairwiseEmbeddingDistanceEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the hamming distance."""
    from scipy.spatial.distance import hamming

    chain.distance_metric = EmbeddingDistance.HAMMING
    result = chain._compute_score(np.array(vectors))
    expected = hamming(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingDistanceEvalChain) -> None:
    """Test the embedding distance."""
    result = chain.evaluate_string_pairs(
        prediction="A single cat", prediction_b="A single cat"
    )
    assert np.isclose(result["score"], 0.0)

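For orientation, here is a minimal usage sketch of the new pairwise embedding evaluator (not part of this diff). It uses only names exercised by the tests above; the assumption that an OpenAI API key is configured follows from the openai/tiktoken marker on the last test, and the example strings are arbitrary.

# Usage sketch (not part of this diff). Assumes OPENAI_API_KEY is set, since
# the chain appears to default to OpenAI embeddings per the test marker above.
from langchain.evaluation.embedding_distance import (
    EmbeddingDistance,
    PairwiseEmbeddingDistanceEvalChain,
)

chain = PairwiseEmbeddingDistanceEvalChain()
chain.distance_metric = EmbeddingDistance.COSINE  # same attribute the tests set
result = chain.evaluate_string_pairs(
    prediction="A single cat", prediction_b="A pair of dogs"
)
print(result["score"])  # ~0.0 for near-identical inputs; larger means farther apart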
@@ -0,0 +1,51 @@
import pytest

from langchain.evaluation.string_distance import (
    PairwiseStringDistanceEvalChain,
    StringDistance,
    StringDistanceEvalChain,
)


@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
def test_zero_distance(distance: StringDistance) -> None:
    eval_chain = StringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = eval_chain.evaluate_strings(prediction=string, reference=string)
    assert "score" in result
    assert result["score"] == 0


@pytest.mark.asyncio
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
async def test_zero_distance_async(distance: StringDistance) -> None:
    eval_chain = StringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = await eval_chain.aevaluate_strings(prediction=string, reference=string)
    assert "score" in result
    assert result["score"] == 0


@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
def test_zero_distance_pairwise(distance: StringDistance) -> None:
    eval_chain = PairwiseStringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = eval_chain.evaluate_string_pairs(prediction=string, prediction_b=string)
    assert "score" in result
    assert result["score"] == 0


@pytest.mark.asyncio
@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("distance", list(StringDistance))
async def test_zero_distance_pairwise_async(distance: StringDistance) -> None:
    eval_chain = PairwiseStringDistanceEvalChain(distance=distance)
    string = "三人行则必有我师"
    result = await eval_chain.aevaluate_string_pairs(
        prediction=string, prediction_b=string
    )
    assert "score" in result
    assert result["score"] == 0

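Similarly, a sketch of the reference-based string-distance evaluator (not part of this diff). It requires the optional rapidfuzz dependency added in pyproject.toml above; iterating over StringDistance avoids hard-coding member names, which this diff does not show.

# Usage sketch (not part of this diff). Requires the optional rapidfuzz
# dependency added above; metric names are enumerated rather than assumed.
from langchain.evaluation.string_distance import (
    StringDistance,
    StringDistanceEvalChain,
)

for distance in StringDistance:
    eval_chain = StringDistanceEvalChain(distance=distance)
    result = eval_chain.evaluate_strings(
        prediction="The cat sat on the mat", reference="The cat sat on a mat"
    )
    # Per the tests above, identical strings score 0; the scale of nonzero
    # scores depends on the chosen metric.
    print(distance, result["score"])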
@@ -2,20 +2,27 @@
import pytest

from langchain.embeddings.fake import FakeEmbeddings
from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.mark.requires("rapidfuzz")
@pytest.mark.parametrize("evaluator_type", EvaluatorType)
def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
    """Test loading evaluators."""
    fake_llm = FakeChatModel()
    load_evaluators([evaluator_type], llm=fake_llm)
    embeddings = FakeEmbeddings(size=32)
    load_evaluators([evaluator_type], llm=fake_llm, embeddings=embeddings)
    # Test as string
    load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore
    load_evaluators(
        [evaluator_type.value],  # type: ignore
        llm=fake_llm,
        embeddings=embeddings,
    )


def test_criteria_eval_chain_requires_reference() -> None:

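Finally, since load_evaluators also accepts plain strings (the evaluator_type.value branch in the test above), a quick way to discover the accepted aliases is to enumerate the registry. This sketch uses only names visible in this diff.

# Sketch (not part of this diff): list the string aliases accepted by
# load_evaluators, which should now include the new distance evaluators.
from langchain.evaluation.loading import EvaluatorType

for evaluator_type in EvaluatorType:
    print(evaluator_type.value)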