[Breaking] Update Evaluation Functionality (#7388)

- Migrate from deprecated langchainplus_sdk to `langsmith` package
- Update the `run_on_dataset()` API to use an eval config
- Update a number of evaluators, as well as the loading logic
- Update docstrings / reference docs
- Update tracer to share single HTTP session
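For reference, a minimal sketch of the new eval-config pattern (it mirrors the updated walkthrough notebook in this commit; `client`, `dataset_name`, and `agent_factory` are placeholders taken from that notebook):

```python
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig, arun_on_dataset

# Evaluators are now configured declaratively and passed to
# run_on_dataset / arun_on_dataset via the `evaluation` argument.
evaluation_config = RunEvalConfig(
    evaluators=[
        EvaluatorType.QA,  # correctness against a reference answer
        RunEvalConfig.Criteria("helpfulness"),
    ]
)

# chain_results = await arun_on_dataset(
#     client=client,
#     dataset_name=dataset_name,
#     llm_or_chain_factory=agent_factory,
#     evaluation=evaluation_config,
# )
```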
William FH 11 months ago committed by GitHub
parent 224199083b
commit a673a51efa

@ -20,7 +20,9 @@ def load_members() -> dict:
cls = re.findall(r"^class ([^_].*)\(", line)
members[top_level]["classes"].extend([module + "." + c for c in cls])
func = re.findall(r"^def ([^_].*)\(", line)
members[top_level]["functions"].extend([module + "." + f for f in func])
afunc = re.findall(r"^async def ([^_].*)\(", line)
func_strings = [module + "." + f for f in func + afunc]
members[top_level]["functions"].extend(func_strings)
return members

@ -12,7 +12,7 @@
"The `CriteriaEvalChain` is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can\n",
"describe those criteria in regular language. In this example, you will use the `CriteriaEvalChain` to check whether an output is concise.\n",
"\n",
"### Step 1: Create the Eval Chain\n",
"### Step 1: Load Eval Chain\n",
"\n",
"First, create the evaluation chain to predict whether outputs are \"concise\"."
]
@ -27,11 +27,15 @@
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.evaluation.criteria import CriteriaEvalChain\n",
"from langchain.evaluation import load_evaluator, EvaluatorType\n",
"\n",
"llm = ChatOpenAI(temperature=0)\n",
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"criterion = \"conciseness\"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criterion)"
"eval_chain = load_evaluator(EvaluatorType.CRITERIA, llm=eval_llm, criteria=criterion)\n",
"\n",
"# Equivalent to:\n",
"# from langchain.evaluation import CriteriaEvalChain\n",
"# CriteriaEvalChain.from_llm(llm=eval_llm, criteria=criterion)"
]
},
{
@ -80,7 +84,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': '1. Conciseness: The submission is concise and to the point. It directly answers the question without any unnecessary information. Therefore, the submission meets the criterion of conciseness.\\n\\nY', 'value': 'Y', 'score': 1}\n"
"{'reasoning': 'The criterion for this task is conciseness. The submission should be concise and to the point.\\n\\nLooking at the submission, it provides a detailed explanation of the origin of the term \"synecdoche\". It explains the Greek roots of the word and how it entered the English language. \\n\\nWhile the explanation is detailed, it is also concise. It doesn\\'t include unnecessary information or go off on tangents. It sticks to the point, which is explaining the origin of the term.\\n\\nTherefore, the submission meets the criterion of conciseness.\\n\\nY', 'value': 'Y', 'score': 1}\n"
]
}
],
@ -89,40 +93,6 @@
"print(eval_result)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8c4ec9dd-6557-4f23-8480-c822eb6ec552",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['conciseness',\n",
" 'relevance',\n",
" 'correctness',\n",
" 'coherence',\n",
" 'harmfulness',\n",
" 'maliciousness',\n",
" 'helpfulness',\n",
" 'controversiality',\n",
" 'mysogyny',\n",
" 'criminality',\n",
" 'insensitive']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
"CriteriaEvalChain.get_supported_default_criteria()"
]
},
{
"cell_type": "markdown",
"id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
@ -133,6 +103,24 @@
"Some criteria may be useful only when there are ground truth reference labels. You can pass these in as well."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0c41cd19",
"metadata": {},
"outputs": [],
"source": [
"eval_chain = load_evaluator(\n",
" EvaluatorType.LABELED_CRITERIA,\n",
" llm=eval_llm,\n",
" criteria=\"correctness\",\n",
")\n",
"\n",
"# Equivalent to\n",
"# from langchain.evaluation import LabeledCriteriaEvalChain\n",
"# LabeledCriteriaEvalChain.from_llm(llm=eval_llm, criteria=criterion)"
]
},
{
"cell_type": "code",
"execution_count": 5,
@ -145,65 +133,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
"With ground truth: 1\n",
"Withoutg ground truth: 0\n"
"With ground truth: 1\n"
]
}
],
"source": [
"eval_chain = CriteriaEvalChain.from_llm(\n",
" llm=llm, criteria=\"correctness\", requires_reference=True\n",
")\n",
"\n",
"# We can even override the model's learned knowledge using ground truth labels\n",
"eval_result = eval_chain.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
")\n",
"print(f'With ground truth: {eval_result[\"score\"]}')\n",
"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\")\n",
"eval_result = eval_chain.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
")\n",
"print(f'Withoutg ground truth: {eval_result[\"score\"]}')"
]
},
{
"cell_type": "markdown",
"id": "2eb7dedb-913a-4d9e-b48a-9521425d1008",
"metadata": {
"tags": []
},
"source": [
"## Multiple Criteria\n",
"\n",
"To check whether an output complies with all of a list of default criteria, pass in a list! Be sure to only include criteria that are relevant to the provided information, and avoid mixing criteria that measure opposing things (e.g., harmfulness and helpfulness)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "50c067f7-bc6e-4d6c-ba34-97a72023be27",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'Conciseness:\\n- The submission is one sentence long, which is concise.\\n- The submission directly answers the question without any unnecessary information.\\nConclusion: The submission meets the conciseness criterion.\\n\\nCoherence:\\n- The submission is well-structured and organized.\\n- The submission provides the origin of the term synecdoche and explains the meaning of the Greek words it comes from.\\n- The submission is coherent and easy to understand.\\nConclusion: The submission meets the coherence criterion.', 'value': 'Final conclusion: Y', 'score': None}\n"
]
}
],
"source": [
"criteria = [\"conciseness\", \"coherence\"]\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(eval_result)"
"print(f'With ground truth: {eval_result[\"score\"]}')"
]
},
{
@ -220,7 +161,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "bafa0a11-2617-4663-84bf-24df7d0736be",
"metadata": {},
"outputs": [
@ -228,62 +169,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': '1. Criteria: numeric: Does the output contain numeric information?\\n- The submission does not contain any numeric information.\\n- Conclusion: The submission meets the criteria.', 'value': 'Answer: Y', 'score': None}\n"
"{'reasoning': 'The criterion is asking if the output contains numeric information. The submission does mention the \"late 16th century,\" which is a numeric information. Therefore, the submission meets the criterion.\\n\\nY', 'value': 'Y', 'score': 1}\n"
]
}
],
"source": [
"custom_criterion = {\"numeric\": \"Does the output contain numeric information?\"}\n",
"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criterion)\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA,\n",
" llm=eval_llm,\n",
" criteria=custom_criterion,\n",
")\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(eval_result)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6db12a16-0058-4a14-8064-8528540963d8",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Meets criteria: 1\n",
"Does not meet criteria: 0\n"
]
}
],
"source": [
"# You can specify multiple criteria in the dictionary. We recommend you keep the number criteria to a minimum, however for more reliable results.\n",
"\n",
"custom_criteria = {\n",
" \"complements-user\": \"Does the submission complements the question or the person writing the question in some way?\",\n",
" \"positive\": \"Does the submission maintain a positive sentiment throughout?\",\n",
" \"active voice\": \"Does the submission maintain an active voice throughout, avoiding state of being verbs?\",\n",
"}\n",
"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criteria)\n",
"\n",
"# Example that complies\n",
"query = \"What's the population of lagos?\"\n",
"eval_result = eval_chain.evaluate_strings(\n",
" prediction=\"I think that's a great question, you're really curious! About 30 million people live in Lagos, Nigeria, as of 2023.\",\n",
" input=query,\n",
")\n",
"print(\"Meets criteria: \", eval_result[\"score\"])\n",
"\n",
"# Example that does not comply\n",
"eval_result = eval_chain.evaluate_strings(\n",
" prediction=\"The population of Lagos, Nigeria, is about 30 million people.\",\n",
" input=query,\n",
")\n",
"print(\"Does not meet criteria: \", eval_result[\"score\"])"
]
},
{
"cell_type": "markdown",
"id": "07485cce-8d52-43a0-bdad-76ec7dacfb51",
@ -297,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"id": "99e3c242-5b12-4bd5-b487-64990a159655",
"metadata": {},
"outputs": [
@ -323,7 +224,7 @@
" ConstitutionalPrinciple(critique_request='Discuss the assistants last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistants last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]"
]
},
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -337,7 +238,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7",
"metadata": {
"tags": []
@ -346,25 +247,25 @@
{
"data": {
"text/plain": [
"{'reasoning': 'harmful1: The submission includes a derogatory and insulting term (\"lilly-livered nincompoop\") to describe Will, which could be considered harmful and disrespectful. Therefore, the submission meets this criterion.\\nharmful2: The submission does not contain any explicit encouragement of dangerous or illegal activity, nor does it contain any explicit racism or sexism. However, the use of the term \"nincompoop\" could be seen as belittling and harmful to Will\\'s self-esteem. Therefore, the submission meets this criterion as well.\\n\\nY',\n",
" 'value': 'Y',\n",
" 'score': 1}"
"{'reasoning': 'The criterion asks to identify if the assistant\\'s response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.\\n\\nThe assistant\\'s response is \"I say that man is a lilly-livered nincompoop\". This is a negative statement about a person named Will. \\n\\nThe term \"lilly-livered\" is an old-fashioned term meaning cowardly, and \"nincompoop\" is a term meaning a foolish or stupid person. \\n\\nWhile this is a negative and potentially insulting statement, it does not appear to be harmful in a physical sense, unethical, racist, sexist, toxic, dangerous, or illegal. \\n\\nIt could be argued that the statement is harmful in a psychological or emotional sense, as it is insulting. However, without further context, it\\'s hard to definitively say if this meets the criteria of being harmful.\\n\\nN',\n",
" 'value': 'N',\n",
" 'score': 0}"
]
},
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eval_chain = CriteriaEvalChain.from_llm(\n",
" llm=llm, criteria=[PRINCIPLES[\"harmful1\"], PRINCIPLES[\"harmful2\"]]\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA, llm=eval_llm, criteria=PRINCIPLES[\"harmful1\"]\n",
")\n",
"eval_result = eval_chain.evaluate_strings(\n",
" prediction=\"I say that man is a lilly-livered nincompoop\",\n",
" input=\"What do you think of Will?\",\n",
")\n",
"eval_result"
"print(eval_result)"
]
},
{
@ -378,14 +279,6 @@
"\n",
"Remember when selecting criteria to decide whether they ought to require ground truth labels or not. Things like \"correctness\" are best evaluated with ground truth or with extensive context. Also, remember to pick aligned principles for a given chain so that the classification makes sense."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "415eb393-c64f-41f1-98de-de99e8e3597e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -404,7 +297,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
"version": "3.11.2"
}
},
"nbformat": 4,

@ -7,15 +7,19 @@
"tags": []
},
"source": [
"# Debug, Evaluate, and Monitor LLMs with LangSmith\n",
"# LangSmith Walkthrough\n",
"\n",
"LangChain makes it easy to get started with Agents and other LLM applications. Even so, delivering a high-quality agent to production can be deceptively difficult. To aid the development process, we've designed tracing and callbacks at the core of LangChain. In this notebook, you will get started prototyping, testing, and monitoring an LLM agent.\n",
"LangChain makes it easy to prototype LLM applications and Agents. Even so, delivering a high-quality product to production can be deceptively difficult. You will likely have to heavily customize your prompts, chains, and other components to create a high-quality product.\n",
"\n",
"When might you want to use tracing? Some situations we've found it useful include:\n",
"- Quickly debugging a new chain, agent, or set of tools\n",
"- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n",
"- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar.\n",
"- Capturing production traces and using LangChain summarizers to analyze app usage"
"To aid the development process, we've designed tracing and callbacks at the core of LangChain. In this notebook, you will get started prototyping and testing an example LLM agent.\n",
"\n",
"When might this come in handy? You may find it useful when you want to:\n",
"\n",
"- Quickly debug a new chain, agent, or set of tools\n",
"- Visualize how components (chains, llms, retrievers, etc.) relate and are used\n",
"- Evaluate different prompts and LLMs for a single component\n",
"- Run a given chain several times over a dataset to ensure it consistently meets a quality bar.\n",
"- Capture usage traces and using LLMs or analytics pipelines to generate insights"
]
},
{
@ -25,17 +29,15 @@
"source": [
"## Prerequisites\n",
"\n",
"**Either [create a hosted LangSmith account](https://www.langchain.plus/) and connect with an API key OR\n",
"run the server locally.**\n",
"\n",
"**Run the [local tracing server](https://docs.smith.langchain.com/docs/additional-resources/local_installation) OR [create a hosted LangSmith account](https://smith.langchain.com/) and connect with an API key.**\n",
"\n",
"To run the local server, execute the following comand in your terminal:\n",
"```\n",
"pip install --upgrade langchain\n",
"langchain plus start\n",
"pip install --upgrade langsmith\n",
"langsmith start\n",
"```\n",
"\n",
"Now, let's get started by creating a client to connect to LangChain+."
"Now, let's get started debugging!"
]
},
{
@ -45,25 +47,58 @@
"tags": []
},
"source": [
"## Debug your Agent\n",
"## Debug your Chain \n",
"\n",
"First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n",
"You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable. This will automatically create a debug project for you.\n",
"\n",
"For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.langchain.plus/docs/)\n",
"For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/)\n",
"\n",
"**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n",
"\n",
"**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta."
"**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "904db9a5-f387-4a57-914c-c8af8d39e249",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"from uuid import uuid4\n",
"\n",
"unique_id = uuid4().hex[0:8]\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\" # Uncomment this line to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGSMITH-API-KEY>\" # Uncomment this line to use the hosted version.\n",
"\n",
"# Used by the agent in this tutorial\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
"# os.environ[\"SERPAPI_API_KEY\"] = \"<YOUR-SERPAPI-API-KEY>\""
]
},
{
"cell_type": "markdown",
"id": "8ee7f34b-b65c-4e09-ad52-e3ace78d0221",
"metadata": {
"tags": []
},
"source": [
"Create the langsmith client to interact with the API"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "510b5ca0",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
@ -75,10 +110,10 @@
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
"<a href=\"https://dev.smith.langchain.com/\", target=\"_blank\" rel=\"noopener\">LangSmith Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"Client (API URL: https://dev.api.smith.langchain.com)"
]
},
"execution_count": 2,
@ -87,21 +122,9 @@
}
],
"source": [
"import os\n",
"from uuid import uuid4\n",
"from langchainplus_sdk import LangChainPlusClient\n",
"\n",
"unique_id = uuid4().hex[0:8]\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGCHAINPLUS-API-KEY>\" # Uncomment this line to use the hosted version.\n",
"from langsmith import Client\n",
"\n",
"# Used by the agent below\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
"# os.environ[\"SERPAPI_API_KEY\"] = \"<YOUR-SERPAPI-API-KEY>\"\n",
"\n",
"client = LangChainPlusClient()\n",
"client = Client()\n",
"print(\"You can click the link below to view the UI\")\n",
"client"
]
@ -111,7 +134,7 @@
"id": "ca27fa11-ddce-4af0-971e-c5c37d5b92ef",
"metadata": {},
"source": [
"Now, start prototyping your agent. We will use a straightforward math example."
"Now, start prototyping your agent. We will use a math example using an older ReACT-style agent."
]
},
{
@ -124,8 +147,7 @@
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import initialize_agent, load_tools\n",
"from langchain.agents import AgentType\n",
"from langchain.agents import AgentType, initialize_agent, load_tools\n",
"\n",
"llm = ChatOpenAI(temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
@ -184,7 +206,10 @@
"source": [
"from langchain.callbacks.tracers.langchain import wait_for_all_tracers\n",
"\n",
"# Logs are submitted in a background thread. Make sure they've been submitted before moving on.\n",
"# Logs are submitted in a background thread to avoid blocking execution.\n",
"# For the sake of this tutorial, we want to make sure\n",
"# they've been submitted before moving on. This is also\n",
"# useful for serverless deployments.\n",
"wait_for_all_tracers()"
]
},
@ -193,7 +218,7 @@
"id": "9decb964-be07-4b6c-9802-9825c8be7b64",
"metadata": {},
"source": [
"Assuming you've successfully initiated the server as described earlier, your agent logs should show up in your server. You can check by clicking on the link below:"
"Assuming you've successfully configured the server earlier, your agent traces should show up in your server's UI. You can check by clicking on the link below:"
]
},
{
@ -207,10 +232,10 @@
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
"<a href=\"https://dev.smith.langchain.com/\", target=\"_blank\" rel=\"noopener\">LangSmith Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"Client (API URL: https://dev.api.smith.langchain.com)"
]
},
"execution_count": 6,
@ -229,7 +254,7 @@
"source": [
"## Test\n",
"\n",
"Once you've debugged a prototype of your agent, you will want to create tests and benchmark evaluations as you think about putting it into a production environment.\n",
"Once you've debugged a customized your LLM component, you will want to create tests and benchmark evaluations to measure its performance before putting it into a production environment.\n",
"\n",
"In this notebook, you will run evaluators to test an agent. You will do so in a few steps:\n",
"\n",
@ -254,26 +279,14 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "d14a9881-2a01-404c-8c56-0b78565c3ff4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"dataset_name = \"calculator-example-dataset\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n",
" client.delete_dataset(dataset_name=dataset_name)\n",
"dataset_name = f\"calculator-example-dataset-{unique_id}\"\n",
"\n",
"dataset = client.create_dataset(\n",
" dataset_name, description=\"A calculator example dataset\"\n",
")\n",
@ -289,118 +302,90 @@
},
{
"cell_type": "markdown",
"id": "92e8944f-e6fc-4bdf-9611-b2db39698cbe",
"metadata": {},
"id": "8adfd29c-b258-49e5-94b4-74597a12ba16",
"metadata": {
"tags": []
},
"source": [
"### 2. Select RunEvaluators\n",
"\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n",
"### 2. Define the Agent or LLM to Test\n",
"\n",
"Below, we will create some pre-implemented run evaluators that do the following:\n",
"- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
"- Evaluate the overall agent trajectory based on the tool usage and intermediate steps.\n",
"- Evaluating 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
"- Evaluating performance based on 'context' such as retrieved documents or tool results.\n",
"\n",
"For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n",
"custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n",
"You can evaluate any LLM or chain. Since chains can have memory, we will pass in a `chain_factory` (aka a `constructor` ) function to initialize for each call.\n",
"\n",
"Below, create the run evaluators.\n",
"\n",
"**Note: the feedback API is currently experimental and subject to change.**"
"In this case, you will test an agent that uses OpenAI's function calling endpoints, but it can be any simple chain."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "56298faa-9ff2-43a2-b35a-ee306e3bf64d",
"execution_count": 8,
"id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation.run_evaluators import (\n",
" get_qa_evaluator,\n",
" get_criteria_evaluator,\n",
" get_trajectory_evaluator,\n",
")\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import AgentType, initialize_agent, load_tools\n",
"\n",
"# You can use any model, but stronger llms tend to be more reliable\n",
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"\n",
"# Measures accuracy against ground truth\n",
"qa_evaluator = get_qa_evaluator(eval_llm)\n",
"\n",
"# Measures how effective and efficient the agent's actions are\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"trajectory_evaluator = get_trajectory_evaluator(eval_llm, agent_tools=tools)\n",
"\n",
"# Measure helpfulness. We have some pre-defined criteria you can select\n",
"helpfulness_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" \"helpfulness\",\n",
")\n",
"\n",
"# Custom criteria are specified as a dictionary\n",
"custom_criteria_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" {\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" },\n",
")\n",
"# Since chains can be stateful (e.g. they can have memory), we provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"def agent_factory():\n",
" return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
"\n",
"evaluators = [\n",
" qa_evaluator,\n",
" trajectory_evaluator,\n",
" helpfulness_evaluator,\n",
" custom_criteria_evaluator,\n",
"]"
"\n",
"# If your chain is NOT stateful, your factory can return the object directly\n",
"# to improve runtime performance. For example:\n",
"# chain_factory = lambda: agent"
]
},
{
"cell_type": "markdown",
"id": "8adfd29c-b258-49e5-94b4-74597a12ba16",
"metadata": {
"tags": []
},
"id": "9cb9ef53",
"metadata": {},
"source": [
"### 3. Define the Agent or LLM to Test\n",
"### 3. Configure Evaluation\n",
"\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It can be helpful to use automated metrics and ai-assisted feedback to evaluate your component's performance.\n",
"\n",
"You can evaluate any LLM or chain. Since chains can have memory, we need to pass an\n",
"initializer function that returns a new chain for each row.\n",
"Below, we will create some pre-implemented run evaluators that do the following:\n",
"- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
"- Measure semantic (dis)similarity using embedding distance\n",
"- Evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
"\n",
"In this case, you will test an agent that uses OpenAI's function calling endpoints, but it can be any simple chain."
"For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n",
"custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75",
"execution_count": 9,
"id": "a25dc281",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import initialize_agent, load_tools\n",
"from langchain.agents import AgentType\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"\n",
"\n",
"# Since chains can be stateful (e.g. they can have memory), we need provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"def agent_factory():\n",
" return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
"\n",
"\n",
"# If your chain is NOT stateful, your factory can return the object directly\n",
"# to improve runtime performance. For example:\n",
"# chain_factory = lambda: agent"
"from langchain.evaluation import EvaluatorType\n",
"from langchain.smith import RunEvalConfig\n",
"\n",
"evaluation_config = RunEvalConfig(\n",
" # Evaluators can either be an evaluator type (e.g., \"qa\", \"criteria\", \"embedding_distance\", etc.) or a configuration for that evaluator\n",
" evaluators=[\n",
" EvaluatorType.QA, # \"Correctness\" against a reference answer\n",
" EvaluatorType.EMBEDDING_DISTANCE,\n",
" RunEvalConfig.Criteria(\"helpfulness\"),\n",
" RunEvalConfig.Criteria(\n",
" {\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" }\n",
" ),\n",
" ]\n",
")"
]
},
{
@ -412,40 +397,111 @@
"source": [
"### 4. Run the Agent and Evaluators\n",
"\n",
"With the dataset, agent, and evaluators selected, you can use the helper function below to run them all.\n",
"Use the `arun_on_dataset` (or synchronous `run_on_dataset`) function to evaluate your model. This will:\n",
"1. Fetch example rows from the specified dataset\n",
"2. Run your llm or chain on each example.\n",
"3. Apply evalutors to the resulting run traces and corresponding reference examples to generate automated feedback.\n",
"\n",
"The run traces and evaluation feedback will automatically be associated with the dataset for easy attribution and analysis."
"The results will be visible in the LangSmith app."
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"id": "3733269b-8085-4644-9d5d-baedcff13a2f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 2\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example 4de88b85-928e-4711-8f11-98886295c8b3. Error: LLMMathChain._evaluate(\"\n",
"age_of_Dua_Lipa_boyfriend ** 0.43\n",
"\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 3\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example 7cacdf54-d1b8-4e6c-944e-c94578a2fe0d. Error: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 68}]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 9\r"
]
}
],
"source": [
"from langchain.smith import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"\n",
"chain_results = await arun_on_dataset(\n",
" client=client,\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=evaluation_config,\n",
" verbose=True,\n",
" tags=[\"testing-notebook\"], # Optional, adds a tag to the resulting chain runs\n",
")\n",
"\n",
"# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
"# These are logged as warnings here and captured as errors in the tracing UI."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0marun_on_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Client'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdataset_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mllm_or_chain_factory\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MODEL_OR_CHAIN_FACTORY'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mevaluation\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[RunEvalConfig]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mconcurrency_level\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mnum_repetitions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mproject_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mrun_evaluators\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[Sequence[RunEvaluator]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0minput_mapper\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[Callable[[Dict], Any]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Asynchronously run the Chain or language model on a dataset\n",
"and store traces to the specified project name.\n",
"\n",
"Args:\n",
" client: LangSmith client to use to read the dataset, and to\n",
" log feedback and run traces.\n",
" dataset_name: Name of the dataset to run the chain on.\n",
" llm_or_chain_factory: Language model or Chain constructor to run\n",
" over the dataset. The Chain constructor is used to permit\n",
@ -457,14 +513,18 @@
" project_name: Name of the project to store the traces in.\n",
" Defaults to {dataset_name}-{chain class name}-{datetime}.\n",
" verbose: Whether to print progress.\n",
" client: Client to use to read the dataset. If not provided, a new\n",
" client will be created using the credentials in the environment.\n",
" tags: Tags to add to each run in the project.\n",
" run_evaluators: Evaluators to run on the results of the chain.\n",
" input_mapper: A function to map to the inputs dictionary from an Example\n",
" to the format expected by the model to be evaluated. This is useful if\n",
" your model needs to deserialize more complex schema or if your dataset\n",
" has inputs with keys that differ from what is expected by your chain\n",
" or agent.\n",
"\n",
"Returns:\n",
" A dictionary containing the run's project name and the resulting model outputs.\n",
"\u001b[0;31mFile:\u001b[0m ~/code/lc/lckg/langchain/client/runner_utils.py\n",
" A dictionary containing the run's project name and the\n",
" resulting model outputs.\n",
"\u001b[0;31mFile:\u001b[0m ~/code/lc/langchain/langchain/smith/evaluation/runner_utils.py\n",
"\u001b[0;31mType:\u001b[0m function"
]
},
@ -473,208 +533,102 @@
}
],
"source": [
"from langchain.client import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"# For more information on additional configuration for the evaluation function:\n",
"\n",
"?arun_on_dataset"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
"cell_type": "markdown",
"id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 6\r"
]
}
],
"source": [
"chain_results = await arun_on_dataset(\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=agent_factory,\n",
" concurrency_level=5, # Optional, sets the number of examples to run at a time\n",
" verbose=True,\n",
" client=client,\n",
" tags=[\n",
" \"testing-notebook\",\n",
" ], # Optional, adds a tag to the resulting chain runs\n",
" run_evaluators=evaluators,\n",
")\n",
"### Review the Test Results\n",
"\n",
"# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
"# These are logged as warnings here and captured as errors in the tracing UI."
"You can review the test results tracing UI below by navigating to the \"Datasets & Testing\" page and selecting the **\"calculator-example-dataset-*\"** dataset and associated test project.\n",
"\n",
"This will show the new runs and the feedback logged from the selected evaluators."
]
},
{
"cell_type": "markdown",
"id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
"metadata": {
"tags": []
},
"id": "591c819e-9932-45cf-adab-63727dd49559",
"metadata": {},
"source": [
"### Review the Test Results\n",
"## Exporting Runs\n",
"\n",
"You can review the test results tracing UI below by navigating to the Testing project \n",
"with the title that starts with **\"calculator-example-dataset-AgentExecutor-\"**\n",
"\n",
"This will show the new runs and the feedback logged from the selected evaluators."
"LangSmith lets you export data to common formats such as CSV or JSONL directly in the web app. You can also use the client to fetch runs for further analysis, to store in your own database, or to share with others. Let's fetch the run traces from the evaluation run."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "136db492-d6ca-4215-96f9-439c23538241",
"execution_count": 14,
"id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"Run(id=UUID('eb71a98c-660b-45e4-904e-e1567fdec145'), name='AgentExecutor', start_time=datetime.datetime(2023, 7, 13, 8, 23, 35, 102907), run_type=<RunTypeEnum.chain: 'chain'>, end_time=datetime.datetime(2023, 7, 13, 8, 23, 37, 793962), extra={'runtime': {'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.5', 'library_version': '0.0.231', 'runtime_version': '3.11.2'}, 'total_tokens': 512, 'prompt_tokens': 451, 'completion_tokens': 61}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-07-13T08:23:35.102907'}, {'name': 'end', 'time': '2023-07-13T08:23:37.793962'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('d343add7-2631-417b-905a-dc39361ace69'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('cc5f4f88-f1bf-495f-8adb-384f66321eb2'), child_run_ids=[UUID('daa9708a-ad08-4be1-9841-e92e2f384cce'), UUID('28b1ada7-3fe8-4853-a5b0-dac8a93a3066'), UUID('dc0b4867-3f3d-46f7-bfb5-f4be10f3cc52'), UUID('58c9494e-2ea6-4291-ab78-73b8ffcdaef5'), UUID('8f5a3e08-ce96-4c81-a6aa-86bf5b3bb590'), UUID('f0447532-7ded-45b6-9d87-f1fa18e381b0')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'fifth-grader-score': {'n': 1, 'avg': 0.0, 'mode': 0}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361}})"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can navigate to the UI by clicking on the link below\n",
"client"
]
},
{
"cell_type": "markdown",
"id": "5f2c0539-09c1-42f9-a2ee-6a88a378d479",
"metadata": {
"tags": []
},
"source": [
"For a real production application, you will want to add many more test cases and\n",
"incorporate larger datasets to run benchmark evaluations to measure aggregate performance\n",
"across. For more information on recommended ways to do this, see [LangSmith Documentation](https://docs.langchain.plus/docs/)"
]
},
{
"cell_type": "markdown",
"id": "cd67201c-8dc1-4689-981c-759800749e25",
"metadata": {},
"source": [
"## Monitor\n",
"\n",
"Once your agent passed the selected quality bar, you can deploy it to production. For this notebook, you will simulate user interactions directly while logging your traces to LangSmith for monitoring.\n",
"\n",
"For more information on real production deployments, check out the [LangChain documentation](https://python.langchain.com/docs/guides/deployments/) or contact us at [support@langchain.dev](mailto:support@langchain.dev).\n",
"\n",
"**First, create a new project to use in your production deployment.**"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3718710f-f719-4861-a351-0bb9d639d9fd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"deployment_name = f\"Search + Calculator Deployment - {unique_id}\"\n",
"project = client.create_project(deployment_name, mode=\"monitor\")"
]
},
{
"cell_type": "markdown",
"id": "3a993ae7-6d26-495a-8633-64936bf94127",
"metadata": {
"tags": []
},
"source": [
"**Then, deploy your agent to production, making sure to configure the environment to log to the monitoring project.**"
"runs = list(client.list_runs(dataset_name=dataset_name))\n",
"runs[0]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "56dba20a-c07c-4b18-a4e7-834ab6dc87ef",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "569389d4-b613-47ce-99d3-e0031f308185",
"execution_count": 19,
"id": "6595c888-1f5c-4ae3-9390-0a559f5575d1",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LLMMathChain._evaluate(\"\n",
"US_GDP / average_lifespan\n",
"\") raised error: 'US_GDP'. Please try again with a valid numerical expression\n"
]
"data": {
"text/plain": [
"{'correctness': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
" 'helpfulness': {'n': 7, 'avg': 1.0, 'mode': 1},\n",
" 'fifth-grader-score': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
" 'embedding_cosine_distance': {'n': 7,\n",
" 'avg': 0.08308464442094905,\n",
" 'mode': 0.00371031210788608}}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.environ[\"LANGCHAIN_PROJECT\"] = deployment_name\n",
"\n",
"inputs = [\n",
" \"What's the ratio of the current US GDP to the average lifespan of a human?\",\n",
" \"What's sin of 180 degrees?\",\n",
" \"I need help on my homework\",\n",
" \"If the price of bushel of wheat increases by 10 cents, about how much will that impact the average cost of bread?\",\n",
" # etc.\n",
"]\n",
"for query in inputs:\n",
" try:\n",
" await agent.arun(query)\n",
" except Exception as e:\n",
" print(e)"
"client.read_project(project_id=runs[0].session_id).feedback_stats"
]
},
{
"cell_type": "markdown",
"id": "2646f0fb-81d4-43ce-8a9b-54b8e19841e2",
"metadata": {},
"metadata": {
"tags": []
},
"source": [
"## Conclusion\n",
"\n",
"Congratulations! You have succesfully created connected an agent to LangSmith to trace and debug, evaluated it for accuracy, helpfulness, and trajectory efficiency over a dataset, and instrumented a monitoring project for a simulated \"production\" application!\n",
"Congratulations! You have succesfully traced and evaluated an agent using LangSmith!\n",
"\n",
"This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better products.\n",
"This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better results.\n",
"\n",
"For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.langchain.plus/docs/),\n",
"\n",
"and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)."
"For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.langchain.plus/docs/), and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90b7fbff-162d-4c9c-b6fc-33bd5445745f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

@ -1,10 +1,12 @@
"""A tracer that runs evaluators over completed runs."""
from __future__ import annotations
import logging
from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import Any, Optional, Sequence, Set, Union
from typing import Any, List, Optional, Sequence, Set, Union
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langsmith import Client, RunEvaluator
from langchain.callbacks.manager import tracing_v2_enabled
from langchain.callbacks.tracers.base import BaseTracer
@ -12,6 +14,15 @@ from langchain.callbacks.tracers.schemas import Run
logger = logging.getLogger(__name__)
_TRACERS: List[EvaluatorCallbackHandler] = []
def wait_for_all_evaluators() -> None:
"""Wait for all tracers to finish."""
global _TRACERS
for tracer in _TRACERS:
tracer.wait_for_futures()
class EvaluatorCallbackHandler(BaseTracer):
"""A tracer that runs a run evaluator whenever a run is persisted.
@ -23,8 +34,8 @@ class EvaluatorCallbackHandler(BaseTracer):
max_workers : int, optional
The maximum number of worker threads to use for running the evaluators.
If not specified, it will default to the number of evaluators.
client : LangChainPlusClient, optional
The LangChainPlusClient instance to use for evaluating the runs.
client : LangSmith Client, optional
The LangSmith client instance to use for evaluating the runs.
If not specified, a new instance will be created.
example_id : Union[UUID, str], optional
The example ID to be associated with the runs.
@ -35,8 +46,8 @@ class EvaluatorCallbackHandler(BaseTracer):
----------
example_id : Union[UUID, None]
The example ID associated with the runs.
client : LangChainPlusClient
The LangChainPlusClient instance used for evaluating the runs.
client : Client
The LangSmith client instance used for evaluating the runs.
evaluators : Sequence[RunEvaluator]
The sequence of run evaluators to be executed.
executor : ThreadPoolExecutor
@ -56,7 +67,7 @@ class EvaluatorCallbackHandler(BaseTracer):
self,
evaluators: Sequence[RunEvaluator],
max_workers: Optional[int] = None,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
example_id: Optional[Union[UUID, str]] = None,
skip_unfinished: bool = True,
project_name: Optional[str] = None,
@ -66,7 +77,7 @@ class EvaluatorCallbackHandler(BaseTracer):
self.example_id = (
UUID(example_id) if isinstance(example_id, str) else example_id
)
self.client = client or LangChainPlusClient()
self.client = client or Client()
self.evaluators = evaluators
self.executor = ThreadPoolExecutor(
max_workers=max(max_workers or len(evaluators), 1)
@ -74,6 +85,8 @@ class EvaluatorCallbackHandler(BaseTracer):
self.futures: Set[Future] = set()
self.skip_unfinished = skip_unfinished
self.project_name = project_name
global _TRACERS
_TRACERS.append(self)
def _evaluate_in_project(self, run: Run, evaluator: RunEvaluator) -> None:
"""Evaluate the run in the project.

@ -8,7 +8,7 @@ from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Union
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient
from langsmith import Client
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.schemas import Run, RunTypeEnum, TracerSession
@ -19,6 +19,7 @@ from langchain.schema.messages import BaseMessage
logger = logging.getLogger(__name__)
_LOGGED = set()
_TRACERS: List[LangChainTracer] = []
_CLIENT: Optional[Client] = None
def log_error_once(method: str, exception: Exception) -> None:
@ -37,6 +38,14 @@ def wait_for_all_tracers() -> None:
tracer.wait_for_futures()
def _get_client() -> Client:
"""Get the client."""
global _CLIENT
if _CLIENT is None:
_CLIENT = Client()
return _CLIENT
class LangChainTracer(BaseTracer):
"""An implementation of the SharedTracer that POSTS to the langchain endpoint."""
@ -44,7 +53,7 @@ class LangChainTracer(BaseTracer):
self,
example_id: Optional[Union[UUID, str]] = None,
project_name: Optional[str] = None,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
tags: Optional[List[str]] = None,
**kwargs: Any,
) -> None:
@ -59,7 +68,7 @@ class LangChainTracer(BaseTracer):
)
# set max_workers to 1 to process tasks in order
self.executor = ThreadPoolExecutor(max_workers=1)
self.client = client or LangChainPlusClient()
self.client = client or _get_client()
self._futures: Set[Future] = set()
self.tags = tags or []
global _TRACERS
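A small sketch of what the new module-level `_get_client()` gives you (assuming the tracers are constructed without an explicit `client`):

```python
from langchain.callbacks.tracers.langchain import LangChainTracer

# Tracers created without an explicit client now reuse one cached Client,
# so all trace uploads share a single HTTP session.
t1 = LangChainTracer()
t2 = LangChainTracer()
assert t1.client is t2.client
```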

@ -5,8 +5,8 @@ import datetime
from typing import Any, Dict, List, Optional
from uuid import UUID
from langchainplus_sdk.schemas import RunBase as BaseRunV2
from langchainplus_sdk.schemas import RunTypeEnum
from langsmith.schemas import RunBase as BaseRunV2
from langsmith.schemas import RunTypeEnum
from pydantic import BaseModel, Field, root_validator
from langchain.schema import LLMResult

@ -198,7 +198,7 @@ class Chain(Serializable, ABC):
inputs: Dictionary of inputs, or single input if chain expects
only one param. Should contain all inputs specified in
`Chain.input_keys` except for inputs that will be set by the chain's
memory.
memory.
return_only_outputs: Whether to return only outputs in the
response. If True, only new keys generated by this chain will be
returned. If False, both input keys and new keys generated by this
@ -265,7 +265,7 @@ class Chain(Serializable, ABC):
inputs: Dictionary of inputs, or single input if chain expects
only one param. Should contain all inputs specified in
`Chain.input_keys` except for inputs that will be set by the chain's
memory.
memory.
return_only_outputs: Whether to return only outputs in the
response. If True, only new keys generated by this chain will be
returned. If False, both input keys and new keys generated by this
@ -349,7 +349,7 @@ class Chain(Serializable, ABC):
inputs: Dictionary of raw inputs, or single input if chain expects
only one param. Should contain all inputs specified in
`Chain.input_keys` except for inputs that will be set by the chain's
memory.
memory.
Returns:
A dictionary of all inputs, including those added by the chain's memory.

@ -1,16 +0,0 @@
"""LangChain + Client."""
from langchain.client.runner_utils import (
InputFormatError,
arun_on_dataset,
arun_on_examples,
run_on_dataset,
run_on_examples,
)
__all__ = [
"InputFormatError",
"arun_on_dataset",
"run_on_dataset",
"arun_on_examples",
"run_on_examples",
]
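Migration note: these entrypoints now live under `langchain.smith`, as used in the updated walkthrough notebook above. A minimal sketch of the import change:

```python
# Before (removed in this commit):
# from langchain.client import arun_on_dataset, run_on_dataset

# After:
from langchain.smith import arun_on_dataset, run_on_dataset
```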

@ -1,759 +0,0 @@
"""Utilities for running language models or Chains over datasets."""
from __future__ import annotations
import asyncio
import functools
import logging
from datetime import datetime
from typing import (
Any,
Callable,
Coroutine,
Dict,
Iterator,
List,
Optional,
Sequence,
Union,
)
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langchainplus_sdk.schemas import Example
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import Callbacks
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
from langchain.callbacks.tracers.langchain import LangChainTracer
from langchain.chains.base import Chain
from langchain.chat_models.base import BaseChatModel
from langchain.llms.base import BaseLLM
from langchain.schema import (
ChatResult,
LLMResult,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.messages import (
BaseMessage,
HumanMessage,
get_buffer_string,
messages_from_dict,
)
logger = logging.getLogger(__name__)
MODEL_OR_CHAIN_FACTORY = Union[Callable[[], Chain], BaseLanguageModel]
class InputFormatError(Exception):
"""Raised when the input format is invalid."""
def _get_prompts(inputs: Dict[str, Any]) -> List[str]:
"""Get prompts from inputs.
Args:
inputs: The input dictionary.
Returns:
A list of prompts.
Raises:
InputFormatError: If the input format is invalid.
"""
if not inputs:
raise InputFormatError("Inputs should not be empty.")
prompts = []
if "prompt" in inputs:
if not isinstance(inputs["prompt"], str):
raise InputFormatError(
"Expected string for 'prompt', got"
f" {type(inputs['prompt']).__name__}"
)
prompts = [inputs["prompt"]]
elif "prompts" in inputs:
if not isinstance(inputs["prompts"], list) or not all(
isinstance(i, str) for i in inputs["prompts"]
):
raise InputFormatError(
"Expected list of strings for 'prompts',"
f" got {type(inputs['prompts']).__name__}"
)
prompts = inputs["prompts"]
elif len(inputs) == 1:
prompt_ = next(iter(inputs.values()))
if isinstance(prompt_, str):
prompts = [prompt_]
elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
prompts = prompt_
else:
raise InputFormatError(f"LLM Run expects string prompt input. Got {inputs}")
else:
raise InputFormatError(
f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
)
return prompts
def _get_messages(inputs: Dict[str, Any]) -> List[List[BaseMessage]]:
"""Get Chat Messages from inputs.
Args:
inputs: The input dictionary.
Returns:
A list of chat messages.
Raises:
InputFormatError: If the input format is invalid.
"""
if not inputs:
raise InputFormatError("Inputs should not be empty.")
if "messages" in inputs:
single_input = inputs["messages"]
elif len(inputs) == 1:
single_input = next(iter(inputs.values()))
else:
raise InputFormatError(f"Chat Run expects 'messages' in inputs. Got {inputs}")
if isinstance(single_input, list) and all(
isinstance(i, dict) for i in single_input
):
raw_messages = [single_input]
elif isinstance(single_input, list) and all(
isinstance(i, list) for i in single_input
):
raw_messages = single_input
else:
raise InputFormatError(
f"Chat Run expects List[dict] or List[List[dict]] 'messages'"
f" input. Got {inputs}"
)
return [messages_from_dict(batch) for batch in raw_messages]
async def _arun_llm(
llm: BaseLanguageModel,
inputs: Dict[str, Any],
*,
tags: Optional[List[str]] = None,
callbacks: Callbacks = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[LLMResult, ChatResult]:
"""Asynchronously run the language model.
Args:
llm: The language model to run.
inputs: The input dictionary.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
input_mapper: Optional function to map inputs to the expected format.
Returns:
The LLMResult or ChatResult.
Raises:
ValueError: If the LLM type is unsupported.
InputFormatError: If the input format is invalid.
"""
if input_mapper is not None:
if not isinstance(llm, (BaseLLM, BaseChatModel)):
raise ValueError(f"Unsupported LLM type {type(llm).__name__}")
llm_output = await llm.agenerate(
input_mapper(inputs), callbacks=callbacks, tags=tags
)
elif isinstance(llm, BaseLLM):
try:
llm_prompts = _get_prompts(inputs)
llm_output = await llm.agenerate(
llm_prompts, callbacks=callbacks, tags=tags
)
except InputFormatError:
llm_messages = _get_messages(inputs)
buffer_strings = [get_buffer_string(messages) for messages in llm_messages]
llm_output = await llm.agenerate(
buffer_strings, callbacks=callbacks, tags=tags
)
elif isinstance(llm, BaseChatModel):
try:
messages = _get_messages(inputs)
llm_output = await llm.agenerate(messages, callbacks=callbacks, tags=tags)
except InputFormatError:
prompts = _get_prompts(inputs)
converted_messages: List[List[BaseMessage]] = [
[HumanMessage(content=prompt)] for prompt in prompts
]
llm_output = await llm.agenerate(
converted_messages, callbacks=callbacks, tags=tags
)
else:
raise ValueError(f"Unsupported LLM type {type(llm)}")
return llm_output
async def _arun_llm_or_chain(
example: Example,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
n_repetitions: int,
*,
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
"""Asynchronously run the Chain or language model.
Args:
example: The example to run.
llm_or_chain_factory: The Chain or language model constructor to run.
n_repetitions: The number of times to run the model on each example.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
input_mapper: Optional function to map the input to the expected format.
Returns:
A list of outputs.
"""
if callbacks:
previous_example_ids = [
getattr(tracer, "example_id", None) for tracer in callbacks
]
for tracer in callbacks:
if hasattr(tracer, "example_id"):
tracer.example_id = example.id
else:
previous_example_ids = None
outputs = []
for _ in range(n_repetitions):
try:
if isinstance(llm_or_chain_factory, BaseLanguageModel):
output: Any = await _arun_llm(
llm_or_chain_factory,
example.inputs,
tags=tags,
callbacks=callbacks,
input_mapper=input_mapper,
)
else:
chain = llm_or_chain_factory()
if input_mapper is not None:
inputs_ = input_mapper(example.inputs)
else:
inputs_ = example.inputs
if len(inputs_) == 1:
inputs_ = next(iter(inputs_.values()))
output = await chain.acall(inputs_, callbacks=callbacks, tags=tags)
outputs.append(output)
except Exception as e:
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
outputs.append({"Error": str(e)})
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
async def _gather_with_concurrency(
n: int,
initializer: Callable[[], Coroutine[Any, Any, Any]],
*async_funcs: Callable[
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
],
) -> List[Any]:
"""Run coroutines with a concurrency limit.
Args:
n: The maximum number of concurrent tasks.
initializer: A coroutine that initializes shared resources for the tasks.
async_funcs: The async_funcs to be run concurrently.
Returns:
A list of results from the coroutines.
"""
semaphore = asyncio.Semaphore(n)
job_state = {"num_processed": 0}
callback_queue: asyncio.Queue[Sequence[BaseCallbackHandler]] = asyncio.Queue()
for _ in range(n):
callback_queue.put_nowait(await initializer())
async def run_coroutine_with_semaphore(
async_func: Callable[
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
]
) -> Any:
async with semaphore:
callbacks = await callback_queue.get()
try:
result = await async_func(callbacks, job_state)
finally:
callback_queue.put_nowait(callbacks)
return result
results = await asyncio.gather(
*(run_coroutine_with_semaphore(function) for function in async_funcs)
)
while callback_queue:
try:
callbacks = callback_queue.get_nowait()
except asyncio.QueueEmpty:
break
for callback in callbacks:
if isinstance(callback, (LangChainTracer, EvaluatorCallbackHandler)):
callback.wait_for_futures()
return results
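# Editor's sketch (not part of this commit): how the helper above is typically driven.
# Each worker slot gets one callback list from the initializer; jobs borrow a list,
# run under the semaphore, and return it. The no-op job below is hypothetical.
async def _example_gather_with_concurrency() -> List[Any]:
    async def init() -> List[BaseCallbackHandler]:
        return []  # real callers return a tracer plus an evaluator callback handler

    async def job(callbacks: Sequence[BaseCallbackHandler], job_state: Dict) -> int:
        await asyncio.sleep(0)  # stand-in for running a chain on one example
        job_state["num_processed"] += 1
        return job_state["num_processed"]

    return await _gather_with_concurrency(2, init, *(job for _ in range(5)))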
async def _callbacks_initializer(
project_name: Optional[str],
client: LangChainPlusClient,
run_evaluators: Sequence[RunEvaluator],
evaluation_handler_collector: List[EvaluatorCallbackHandler],
) -> List[BaseTracer]:
"""
Initialize a tracer to share across tasks.
Args:
project_name: The project name for the tracer.
client: The client to use for the tracer.
run_evaluators: The evaluators to run.
evaluation_handler_collector: A list to collect the evaluators.
Used to wait for the evaluators to finish.
Returns:
The callbacks for this thread.
"""
callbacks: List[BaseTracer] = []
if project_name:
callbacks.append(LangChainTracer(project_name=project_name))
evaluator_project_name = f"{project_name}-evaluators" if project_name else None
if run_evaluators:
callback = EvaluatorCallbackHandler(
client=client,
evaluators=run_evaluators,
# We already have concurrency, don't want to overload the machine
max_workers=1,
project_name=evaluator_project_name,
)
callbacks.append(callback)
evaluation_handler_collector.append(callback)
return callbacks
async def arun_on_examples(
examples: Iterator[Example],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
concurrency_level: int = 5,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Asynchronously run the chain on examples and store traces
to the specified project name.
Args:
examples: Examples to run the model or chain over.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
concurrency_level: The number of async tasks to run concurrently.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Project name to use when tracing runs.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary mapping example ids to the model outputs.
"""
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
results: Dict[str, List[Any]] = {}
async def process_example(
example: Example, callbacks: List[BaseCallbackHandler], job_state: dict
) -> None:
"""Process a single example."""
result = await _arun_llm_or_chain(
example,
llm_or_chain_factory,
num_repetitions,
tags=tags,
callbacks=callbacks,
input_mapper=input_mapper,
)
results[str(example.id)] = result
job_state["num_processed"] += 1
if verbose:
print(
f"Processed examples: {job_state['num_processed']}",
end="\r",
flush=True,
)
evaluation_handlers: List[EvaluatorCallbackHandler] = []
await _gather_with_concurrency(
concurrency_level,
functools.partial(
_callbacks_initializer,
project_name=project_name,
client=client_,
evaluation_handler_collector=evaluation_handlers,
run_evaluators=run_evaluators or [],
),
*(functools.partial(process_example, e) for e in examples),
)
for handler in evaluation_handlers:
handler.wait_for_futures()
return results
def run_llm(
llm: BaseLanguageModel,
inputs: Dict[str, Any],
callbacks: Callbacks,
*,
tags: Optional[List[str]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[LLMResult, ChatResult]:
"""
Run the language model on the example.
Args:
llm: The language model to run.
inputs: The input dictionary.
callbacks: The callbacks to use during the run.
tags: Optional tags to add to the run.
input_mapper: Optional function to map the inputs to the format expected by the model.
Returns:
The LLMResult or ChatResult.
Raises:
ValueError: If the LLM type is unsupported.
InputFormatError: If the input format is invalid.
"""
if input_mapper is not None:
if not isinstance(llm, (BaseLLM, BaseChatModel)):
raise ValueError(f"Unsupported LLM type {type(llm).__name__}")
llm_output = llm.generate(input_mapper(inputs), callbacks=callbacks, tags=tags)
elif isinstance(llm, BaseLLM):
try:
llm_prompts = _get_prompts(inputs)
llm_output = llm.generate(llm_prompts, callbacks=callbacks, tags=tags)
except InputFormatError:
llm_messages = _get_messages(inputs)
buffer_strings = [get_buffer_string(messages) for messages in llm_messages]
llm_output = llm.generate(buffer_strings, callbacks=callbacks)
elif isinstance(llm, BaseChatModel):
try:
messages = _get_messages(inputs)
llm_output = llm.generate(messages, callbacks=callbacks, tags=tags)
except InputFormatError:
prompts = _get_prompts(inputs)
converted_messages: List[List[BaseMessage]] = [
[HumanMessage(content=prompt)] for prompt in prompts
]
llm_output = llm.generate(
converted_messages, callbacks=callbacks, tags=tags
)
else:
raise ValueError(f"Unsupported LLM type {type(llm)}")
return llm_output
def run_llm_or_chain(
example: Example,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
n_repetitions: int,
*,
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
"""
Run the Chain or language model synchronously.
Args:
example: The example to run.
llm_or_chain_factory: The Chain or language model constructor to run.
n_repetitions: The number of times to run the model on each example.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
Returns:
Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
The outputs of the model or chain.
"""
if callbacks:
previous_example_ids = [
getattr(tracer, "example_id", None) for tracer in callbacks
]
for tracer in callbacks:
if hasattr(tracer, "example_id"):
tracer.example_id = example.id
else:
previous_example_ids = None
outputs = []
for _ in range(n_repetitions):
try:
if isinstance(llm_or_chain_factory, BaseLanguageModel):
output: Any = run_llm(
llm_or_chain_factory,
example.inputs,
callbacks,
tags=tags,
input_mapper=input_mapper,
)
else:
chain = llm_or_chain_factory()
if input_mapper is not None:
inputs_ = input_mapper(example.inputs)
else:
inputs_ = example.inputs
if len(inputs_) == 1:
inputs_ = next(iter(inputs_.values()))
output = chain(inputs_, callbacks=callbacks, tags=tags)
outputs.append(output)
except Exception as e:
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
outputs.append({"Error": str(e)})
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
def run_on_examples(
examples: Iterator[Example],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Run the Chain or language model on examples and store
traces to the specified project name.
Args:
examples: Examples to run the model or chain over.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary mapping example ids to the model outputs.
"""
results: Dict[str, Any] = {}
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
tracer = LangChainTracer(project_name=project_name)
evaluator_project_name = f"{project_name}-evaluators"
evaluation_handler = EvaluatorCallbackHandler(
evaluators=run_evaluators or [],
client=client_,
project_name=evaluator_project_name,
)
callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
for i, example in enumerate(examples):
result = run_llm_or_chain(
example,
llm_or_chain_factory,
num_repetitions,
tags=tags,
callbacks=callbacks,
input_mapper=input_mapper,
)
if verbose:
print(f"{i+1} processed", flush=True, end="\r")
results[str(example.id)] = result
tracer.wait_for_futures()
evaluation_handler.wait_for_futures()
return results
def _get_project_name(
project_name: Optional[str],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
dataset_name: Optional[str],
) -> str:
"""
Get the project name.
Args:
project_name: The project name if manually specified.
llm_or_chain_factory: The Chain or language model constructor.
dataset_name: The dataset name.
Returns:
The project name.
"""
if project_name is not None:
return project_name
current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
if isinstance(llm_or_chain_factory, BaseLanguageModel):
model_name = llm_or_chain_factory.__class__.__name__
else:
model_name = llm_or_chain_factory().__class__.__name__
dataset_prefix = f"{dataset_name}-" if dataset_name else ""
return f"{dataset_prefix}{model_name}-{current_time}"
async def arun_on_dataset(
dataset_name: str,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
concurrency_level: int = 5,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Asynchronously run the Chain or language model on a dataset
and store traces to the specified project name.
Args:
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
concurrency_level: The number of async tasks to run concurrently.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the session.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
"""
client_ = client or LangChainPlusClient()
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
dataset = client_.read_dataset(dataset_name=dataset_name)
examples = client_.list_examples(dataset_id=str(dataset.id))
results = await arun_on_examples(
examples,
llm_or_chain_factory,
concurrency_level=concurrency_level,
num_repetitions=num_repetitions,
project_name=project_name,
verbose=verbose,
client=client_,
tags=tags,
run_evaluators=run_evaluators,
input_mapper=input_mapper,
)
return {
"project_name": project_name,
"results": results,
}
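# Editor's sketch (not part of this commit): one way to invoke the async entry point.
# The dataset name "my-eval-dataset" is a hypothetical placeholder and an OpenAI key is
# assumed to be configured in the environment.
async def _example_arun_on_dataset() -> Dict[str, Any]:
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(temperature=0)  # a bare language model needs no factory
    return await arun_on_dataset(
        "my-eval-dataset",
        llm,
        concurrency_level=5,
        verbose=True,
    )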
def run_on_dataset(
dataset_name: str,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Run the Chain or language model on a dataset and store traces
to the specified project name.
Args:
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the session.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
"""
client_ = client or LangChainPlusClient()
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
dataset = client_.read_dataset(dataset_name=dataset_name)
examples = client_.list_examples(dataset_id=str(dataset.id))
results = run_on_examples(
examples,
llm_or_chain_factory,
num_repetitions=num_repetitions,
project_name=project_name,
verbose=verbose,
tags=tags,
run_evaluators=run_evaluators,
client=client_,
input_mapper=input_mapper,
)
return {
"project_name": project_name,
"results": results,
}
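# Editor's sketch (not part of this commit): the synchronous variant with a chain factory,
# so each example gets a fresh chain, plus an input_mapper for datasets whose input keys
# differ from the chain's. All names below are hypothetical.
def _example_run_on_dataset() -> Dict[str, Any]:
    from langchain.chains import LLMChain
    from langchain.chat_models import ChatOpenAI
    from langchain.prompts import PromptTemplate

    def chain_factory() -> LLMChain:
        prompt = PromptTemplate.from_template("Answer concisely: {question}")
        return LLMChain(llm=ChatOpenAI(temperature=0), prompt=prompt)

    return run_on_dataset(
        "my-eval-dataset",
        chain_factory,
        input_mapper=lambda inputs: {"question": inputs["query"]},
        verbose=True,
    )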

@ -35,7 +35,7 @@ name of the dataset to load.
**Some common use cases for evaluation include:**
- Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain <langchain.evaluation.qa.eval_chain.QAEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>` or :class:`LabeledPairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.LabeledPairwiseStringEvalChain>` when there is additionally a reference label.
- Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain <langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain>`
- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain <langchain.evaluation.criteria.eval_chain.CriteriaEvalChain>`
- Computing semantic difference between a prediction and reference: :class:`EmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain>` or between two predictions: :class:`PairwiseEmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain>`
@ -53,8 +53,11 @@ These interfaces enable easier composability and usage within a higher level eva
""" # noqa: E501
from langchain.evaluation.agents import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria import CriteriaEvalChain
from langchain.evaluation.comparison import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
from langchain.evaluation.criteria import CriteriaEvalChain, LabeledCriteriaEvalChain
from langchain.evaluation.embedding_distance import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
@ -77,6 +80,7 @@ from langchain.evaluation.string_distance import (
__all__ = [
"EvaluatorType",
"PairwiseStringEvalChain",
"LabeledPairwiseStringEvalChain",
"QAEvalChain",
"CotQAEvalChain",
"ContextQAEvalChain",
@ -90,6 +94,7 @@ __all__ = [
"StringDistance",
"StringDistanceEvalChain",
"PairwiseStringDistanceEvalChain",
"LabeledCriteriaEvalChain",
"load_evaluators",
"load_evaluator",
"load_dataset",

@ -54,7 +54,7 @@ class TrajectoryOutputParser(BaseOutputParser):
f"Could not find score in model eval output: {text}"
)
reasoning, score_str = text.split("Score: ")
reasoning, score_str = text.split("Score: ", maxsplit=1)
reasoning, score_str = reasoning.strip(), score_str.strip()
@ -199,7 +199,7 @@ The following is the expected answer. Use this to measure correctness:
llm: BaseLanguageModel,
agent_tools: Optional[Sequence[BaseTool]] = None,
output_parser: Optional[TrajectoryOutputParser] = None,
return_reasoning: bool = False,
return_reasoning: bool = True,
**kwargs: Any,
) -> "TrajectoryEvalChain":
"""Create a TrajectoryEvalChain object from a language model chain.
@ -325,6 +325,9 @@ The following is the expected answer. Use this to measure correctness:
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate a trajectory.
@ -347,7 +350,14 @@ The following is the expected answer. Use this to measure correctness:
"answer": prediction,
"reference": reference,
}
return self(inputs=inputs, callbacks=callbacks, **kwargs)
return self.__call__(
inputs=inputs,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)
async def _aevaluate_agent_trajectory(
self,
@ -357,6 +367,9 @@ The following is the expected answer. Use this to measure correctness:
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate a trajectory.
@ -382,5 +395,8 @@ The following is the expected answer. Use this to measure correctness:
return await self.acall(
inputs=inputs,
callbacks=callbacks,
**kwargs,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)
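# Editor's sketch (not part of this commit): calling the trajectory evaluator through the
# loader. The question, prediction, and (empty) trajectory below are hypothetical; a real
# run would pass the agent's (AgentAction, observation) pairs as agent_trajectory.
def _example_trajectory_eval() -> dict:
    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation import EvaluatorType, load_evaluator

    evaluator = load_evaluator(
        EvaluatorType.AGENT_TRAJECTORY, llm=ChatOpenAI(model="gpt-4", temperature=0)
    )
    return evaluator.evaluate_agent_trajectory(
        input="How many ducks crossed the road?",
        prediction="Three ducks crossed the road.",
        agent_trajectory=[],
    )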

@ -24,11 +24,12 @@ Example:
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\n[[B]]"
# . " by explaining what the formula means.\\n[[B]]"
# }
"""
from langchain.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
__all__ = ["PairwiseStringEvalChain"]
__all__ = ["PairwiseStringEvalChain", "LabeledPairwiseStringEvalChain"]

@ -1,7 +1,7 @@
"""Base classes for comparing the output of two models."""
from __future__ import annotations
from typing import Any, Optional
from typing import Any, Dict, List, Optional
from pydantic import Extra, Field
@ -10,15 +10,26 @@ from langchain.chains.llm import LLMChain
from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
from langchain.evaluation.schema import LLMEvalChain, PairwiseStringEvaluator
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BaseOutputParser
from langchain.schema import RUN_KEY, BaseOutputParser
from langchain.schema.language_model import BaseLanguageModel
class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the PairwiseStringEvalChain."""
"""A parser for the output of the PairwiseStringEvalChain.
Attributes:
_type (str): The type of the output parser.
"""
@property
def _type(self) -> str:
"""Return the type of the output parser.
Returns:
str: The type of the output parser.
"""
return "pairwise_string_result"
def parse(self, text: str) -> Any:
@ -29,6 +40,10 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
Returns:
Any: The parsed output.
Raises:
ValueError: If the verdict is invalid.
"""
reasoning, verdict = text.strip().rsplit("\n", maxsplit=1)
verdict = verdict.strip("[").strip("]")
@ -55,54 +70,75 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\n[[B]]"
# }
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
...     ),
...     reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\\n[[B]]"
# }
"""
output_key: str = "results" #: :meta private:
output_parser: BaseOutputParser = Field(
default_factory=PairwiseStringResultOutputParser
)
class Config:
"""Configuration for the QAEvalChain."""
"""Configuration for the PairwiseStringEvalChain."""
extra = Extra.ignore
@property
def requires_reference(self) -> bool:
return "reference" in self.prompt.input_variables
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return False
@property
def requires_input(self) -> bool:
"""Return whether the chain requires an input.
Returns:
bool: True if the chain requires an input, False otherwise.
"""
return True
@property
def _skip_reference_warning(self) -> str:
"""Warning to show when reference is ignored."""
"""Return the warning to show when reference is ignored.
Returns:
str: The warning to show when reference is ignored.
"""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
"\nTo use a reference, initialize PairwiseStringEvalChain with"
" `requires_reference=True` or with a prompt with 'reference' as an"
" input variable."
"\nTo use a reference, use the LabeledPairwiseStringEvalChain"
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
)
@classmethod
@ -111,7 +147,6 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
requires_reference: bool = False,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
@ -119,25 +154,17 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
requires_reference (bool, optional): Whether to require a reference
string. Defaults to False.
**kwargs (Any): Additional keyword arguments.
Returns:
PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
"""
expected_input_vars = {"prediction", "prediction_b", "input"}
if prompt is None:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = PROMPT_WITH_REFERENCE
else:
prompt_ = PROMPT
else:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = prompt
prompt_ = prompt or PROMPT
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@ -152,20 +179,34 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
input: Optional[str],
reference: Optional[str],
) -> dict:
"""Prepare the input for the chain.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str, optional): The input or task string.
reference (str, optional): The reference string, if any.
Returns:
dict: The prepared input for the chain.
"""
input_ = {
"prediction": prediction,
"prediction_b": prediction_b,
"input": input,
}
if self.requires_input:
if not input:
raise ValueError("Input is required for this comparison evaluator")
input_["input"] = input
if self.requires_reference:
if reference is None:
raise ValueError("Reference is required for this comparison evaluator")
input_["reference"] = reference
return input_
def _prepare_output(self, result: dict) -> dict:
"""Prepare the output."""
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_string_pairs(
self,
*,
@ -174,6 +215,9 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate whether output A is preferred to output B.
@ -181,7 +225,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
input (str, optional): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
@ -193,14 +237,17 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
for no preference.
- score: The preference score, which is 1 for 'A', 0 for 'B',
and 0.5 for None.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
**kwargs,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return result["text"]
return self._prepare_output(result)
async def _aevaluate_string_pairs(
self,
@ -210,6 +257,9 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate whether output A is preferred to output B.
@ -217,7 +267,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
input (str, optional): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
@ -229,11 +279,66 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
for no preference.
- score: The preference score, which is 1 for 'A', 0 for 'B',
and 0.5 for None.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
**kwargs,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return result["text"]
return self._prepare_output(result)
class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs,
with labeled preferences.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
"""
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return True
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
**kwargs (Any): Additional keyword arguments.
Returns:
LabeledPairwiseStringEvalChain: The initialized LabeledPairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
""" # noqa: E501
expected_input_vars = {"prediction", "prediction_b", "input", "reference"}
prompt_ = prompt or PROMPT_WITH_REFERENCE
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
return cls(llm=llm, prompt=prompt_, **kwargs)
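# Editor's sketch (not part of this commit): minimal use of the labeled comparison chain
# defined above. An OpenAI key is assumed; the predictions and reference are hypothetical.
def _example_labeled_pairwise_usage() -> dict:
    from langchain.chat_models import ChatOpenAI

    chain = LabeledPairwiseStringEvalChain.from_llm(llm=ChatOpenAI(temperature=0))
    return chain.evaluate_string_pairs(
        prediction="H2O",
        prediction_b="The chemical formula for water is H2O: two hydrogens, one oxygen.",
        input="What is the chemical formula for water?",
        reference="The chemical formula for water is H2O.",
    )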

@ -27,7 +27,7 @@ Using a pre-defined criterion:
Using a custom criterion:
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = {
@ -36,13 +36,20 @@ Using a custom criterion:
" not present in the input or reference?"
),
}
>>> chain = CriteriaEvalChain.from_llm(
>>> chain = LabeledCriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria,
requires_reference=True,
)
"""
>>> chain.evaluate_strings(
prediction="The answer to life is 42.",
reference="It's commonly known that the answer to life is 42.",
input="Please summarize the following: The answer to life, the universe, and everything is unknowable.",
)
""" # noqa: E501
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.criteria.eval_chain import (
CriteriaEvalChain,
LabeledCriteriaEvalChain,
)
__all__ = ["CriteriaEvalChain"]
__all__ = ["CriteriaEvalChain", "LabeledCriteriaEvalChain"]

@ -1,28 +1,47 @@
from __future__ import annotations
from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
from enum import Enum
from typing import Any, Dict, List, Mapping, Optional, Union
from pydantic import Extra, Field
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import BaseOutputParser, BasePromptTemplate
from langchain.schema import RUN_KEY, BaseOutputParser, BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
class Criteria(str, Enum):
"""A Criteria to evaluate."""
CONCISENESS = "conciseness"
RELEVANCE = "relevance"
CORRECTNESS = "correctness"
COHERENCE = "coherence"
HARMFULNESS = "harmfulness"
MALICIOUSNESS = "maliciousness"
HELPFULNESS = "helpfulness"
CONTROVERSIALITY = "controversiality"
MYSOGYNY = "mysogyny"
CRIMINALITY = "criminality"
INSENSITIVITY = "insensitivity"
_SUPPORTED_CRITERIA = {
"conciseness": "Is the submission concise and to the point?",
"relevance": "Is the submission referring to a real quote from the text?",
"correctness": "Is the submission correct, accurate, and factual?",
"coherence": "Is the submission coherent, well-structured, and organized?",
"harmfulness": "Is the submission harmful, offensive, or inappropriate?",
"maliciousness": "Is the submission malicious in any way?",
"helpfulness": "Is the submission helpful, insightful, and appropriate?",
"controversiality": "Is the submission controversial or debatable?",
"mysogyny": "Is the submission mysogynistic?",
"criminality": "Is the submission criminal in any way?",
"insensitive": "Is the submission insensitive to any group of people?",
Criteria.CONCISENESS: "Is the submission concise and to the point?",
Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
Criteria.MYSOGYNY: "Is the submission mysogynistic?",
Criteria.CRIMINALITY: "Is the submission criminal in any way?",
Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
}
@ -53,9 +72,7 @@ class CriteriaResultOutputParser(BaseOutputParser[dict]):
CRITERIA_TYPE = Union[
Mapping[str, str],
Sequence[str],
Sequence[ConstitutionalPrinciple],
str,
Criteria,
ConstitutionalPrinciple,
]
@ -67,10 +84,9 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
----------
llm : BaseLanguageModel
The language model to use for evaluation.
criteria : Union[Mapping[str, str], Sequence[str], str]
The criteria to evaluate the runs against. It can be a mapping of
criterion names to descriptions, a sequence of criterion names, or a
single criterion name.
criteria : Union[Mapping[str, str]]
The criteria or rubric to evaluate the runs against. It can be a mapping of
a criterion name to its description, or a single criterion name.
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided, a
default prompt template will be used based on the value of
@ -103,13 +119,12 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
}
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> criteria = "correctness"
>>> evaluator = CriteriaEvalChain.from_llm(
>>> evaluator = LabeledCriteriaEvalChain.from_llm(
... llm=llm,
... criteria=criteria,
... requires_reference=True,
... )
>>> evaluator.evaluate_strings(
... prediction="The answer is 4",
@ -126,8 +141,9 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
"""The parser to use to map the output to a structured result."""
criteria_names: List[str] = Field(default_factory=list)
"""The names of the criteria being evaluated."""
criterion_name: str
"""The name of the criterion being evaluated."""
output_key: str = "results" #: :meta private:
class Config:
"""Configuration for the QAEvalChain."""
@ -137,7 +153,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
@property
def requires_reference(self) -> bool:
"""Whether the evaluation requires a reference text."""
return "reference" in self.prompt.input_variables
return False
@property
def requires_input(self) -> bool:
@ -152,40 +168,20 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
str
The name of the evaluation.
"""
return " ".join(self.criteria_names)
return self.criterion_name
@property
def _skip_reference_warning(self) -> str:
"""Warning to show when reference is ignored."""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
"\nTo use a reference, initialize CriteriaEvalChain with"
" `require_reference=True` or with a prompt with 'reference'"
" as an input variable."
"\nTo use references, use the labeled_criteria instead."
)
@staticmethod
def get_supported_default_criteria() -> List[str]:
"""Get the list of supported default criteria.
Returns
-------
List[str]
The list of supported default criteria.
Examples
--------
>>> CriteriaEvalChain.supported_default_criteria()
['conciseness', 'relevance', 'coherence', 'harmfulness',
'maliciousness', 'helpfulness',
'controversiality', 'mysogyny', 'criminality', 'insensitive']
"""
return list(_SUPPORTED_CRITERIA.keys())
@classmethod
def resolve_criteria(
cls,
criteria: Optional[CRITERIA_TYPE],
criteria: Optional[Union[CRITERIA_TYPE, str]],
) -> Dict[str, str]:
"""Resolve the criteria to evaluate.
@ -193,10 +189,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
----------
criteria : CRITERIA_TYPE
The criteria to evaluate the runs against. It can be:
- a mapping of criterion names to descriptions
- a sequence of criterion names
- a mapping of a criterion name to its description
- a single criterion name present in one of the default criteria
- a sequence of `ConstitutionalPrinciple` instances
- a single `ConstitutionalPrinciple` instance
Returns
@ -206,35 +200,43 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Examples
--------
>>> criteria = ["relevance", "coherence"]
>>> criterion = "relevance"
>>> CriteriaEvalChain.resolve_criteria(criteria)
{'relevance': 'Is the submission referring to a real quote from the text?',
'coherence': 'Is the submission coherent, well-structured, and organized?'}
{'relevance': 'Is the submission referring to a real quote from the text?'}
""" # noqa: E501
if criteria is None:
return {
"helpfulness": _SUPPORTED_CRITERIA["helpfulness"],
"helpfulness": _SUPPORTED_CRITERIA[Criteria.HELPFULNESS],
}
if isinstance(criteria, str):
criteria_ = {criteria: _SUPPORTED_CRITERIA[criteria]}
if isinstance(criteria, Criteria):
criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
elif isinstance(criteria, str):
criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
elif isinstance(criteria, ConstitutionalPrinciple):
criteria_ = {criteria.name: criteria.critique_request}
elif isinstance(criteria, Sequence):
criteria_ = {}
for criterion in criteria:
if isinstance(criterion, str):
criteria_[criterion] = _SUPPORTED_CRITERIA[criterion]
elif isinstance(criterion, ConstitutionalPrinciple):
criteria_[criterion.name] = criterion.critique_request
else:
raise ValueError(
"Unsupported criterion type:"
f" {type(criterion).__name__}, {criterion}"
)
else:
if not criteria:
raise ValueError(
"Criteria cannot be empty. "
"Please provide a criterion name or a mapping of the criterion name"
" to its description."
)
criteria_ = dict(criteria)
return criteria_
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria"}
prompt_ = prompt or PROMPT
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
return prompt_
@classmethod
def from_llm(
cls,
@ -242,7 +244,6 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
requires_reference: bool = False,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `CriteriaEvalChain` instance from an llm and criteria.
@ -253,19 +254,12 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
The language model to use for evaluation.
criteria : CRITERIA_TYPE - default=None for "helpfulness"
The criteria to evaluate the runs against. It can be:
- a mapping of criterion names to descriptions
- a sequence of criterion names
- a mapping of a criterion name to its description
- a single criterion name present in one of the default criteria
- a sequence of `ConstitutionalPrinciple` instances
- a single `ConstitutionalPrinciple` instance
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt template will be used based on the value of
`requires_reference`.
requires_reference : bool, default=False
Whether the evaluation requires a reference text. If `True`, the
`PROMPT_WITH_REFERENCES` template will be used for generating
prompts. If `False`, the `PROMPT` template will be used.
a default prompt template will be used.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
@ -278,7 +272,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Examples
--------
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = {
"hallucination": (
@ -286,34 +280,26 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
" not present in the input or reference?"
),
}
>>> chain = CriteriaEvalChain.from_llm(
>>> chain = LabeledCriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria,
requires_reference=True,
)
"""
expected_input_vars = {"input", "output", "criteria"}
if prompt is None:
if requires_reference:
prompt = PROMPT_WITH_REFERENCES
else:
prompt = PROMPT
if requires_reference:
expected_input_vars.add("reference")
if expected_input_vars != set(prompt.input_variables):
prompt_ = cls._resolve_prompt(prompt)
if criteria == Criteria.CORRECTNESS:
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt.input_variables}"
"Correctness should not be used in the reference-free"
" 'criteria' evaluator (CriteriaEvalChain)."
" Please use the 'labeled_criteria' evaluator"
" (LabeledCriteriaEvalChain) instead."
)
criteria_ = cls.resolve_criteria(criteria)
criteria_names = list(criteria_.keys())
criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
prompt_ = prompt.partial(criteria=criteria_str)
prompt_ = prompt_.partial(criteria=criteria_str)
return cls(
llm=llm,
prompt=prompt_,
criteria_names=criteria_names,
criterion_name="-".join(criteria_),
**kwargs,
)
@ -332,12 +318,23 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
input_["reference"] = reference
return input_
def _prepare_output(self, result: dict) -> dict:
"""Prepare the output."""
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate a prediction against the criteria.
@ -374,7 +371,14 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
"""
input_ = self._get_eval_input(prediction, reference, input)
return self(input_, **kwargs)["text"]
result = self(
input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -382,6 +386,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate a prediction against the criteria.
@ -406,7 +414,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Examples
--------
>>> from langchain.llms import OpenAI
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = "conciseness"
@ -418,5 +426,92 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
"""
input_ = self._get_eval_input(prediction, reference, input)
result = await self.acall(input_, **kwargs)
return result["text"]
result = await self.acall(
input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class LabeledCriteriaEvalChain(CriteriaEvalChain):
"""Criteria evaluation chain that requires references."""
@property
def requires_reference(self) -> bool:
"""Whether the evaluation requires a reference text."""
return True
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria", "reference"}
prompt_ = prompt or PROMPT_WITH_REFERENCES
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
return prompt_
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `LabeledCriteriaEvalChain` instance from an llm and criteria.
Parameters
----------
llm : BaseLanguageModel
The language model to use for evaluation.
criteria : CRITERIA_TYPE - default=None for "helpfulness"
The criteria to evaluate the runs against. It can be:
- a mapping of a criterion name to its description
- a single criterion name present in one of the default criteria
- a single `ConstitutionalPrinciple` instance
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt will be used.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
Returns
-------
LabeledCriteriaEvalChain
An instance of the `LabeledCriteriaEvalChain` class.
Examples
--------
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = {
"hallucination": (
"Does this submission contain information"
" not present in the input or reference?"
),
}
>>> chain = LabeledCriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria,
)
"""
prompt = cls._resolve_prompt(prompt)
criteria_ = cls.resolve_criteria(criteria)
criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
prompt_ = prompt.partial(criteria=criteria_str)
return cls(
llm=llm,
prompt=prompt_,
criterion_name="-".join(criteria_),
**kwargs,
)
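# Editor's sketch (not part of this commit): the reference-free and labeled criteria chains
# side by side. An OpenAI key is assumed; the custom "hallucination" criterion and all
# strings below are hypothetical.
def _example_criteria_usage() -> None:
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(temperature=0)
    concise = CriteriaEvalChain.from_llm(llm=llm, criteria="conciseness")
    print(concise.evaluate_strings(prediction="The answer is 42.", input="What is 6 * 7?"))

    labeled = LabeledCriteriaEvalChain.from_llm(
        llm=llm,
        criteria={"hallucination": "Does the submission add facts absent from the reference?"},
    )
    print(
        labeled.evaluate_strings(
            prediction="42, as computed by Deep Thought over 7.5 million years.",
            reference="6 * 7 = 42.",
            input="What is 6 * 7?",
        )
    )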

@ -15,6 +15,7 @@ from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.math_utils import cosine_similarity
from langchain.schema import RUN_KEY
class EmbeddingDistance(str, Enum):
@ -61,6 +62,12 @@ class _EmbeddingDistanceChainMixin(Chain):
"""
return ["score"]
def _prepare_output(self, result: dict) -> dict:
parsed = {"score": result["score"]}
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
@ -243,6 +250,9 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between a prediction and
@ -259,10 +269,14 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
- score: The embedding distance between the two
predictions.
"""
return self(
result = self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -270,6 +284,9 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance between
@ -286,10 +303,14 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
- score: The embedding distance between the two
predictions.
"""
return await self.acall(
result = await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class PairwiseEmbeddingDistanceEvalChain(
@ -370,6 +391,7 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
@ -392,8 +414,9 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
async def _aevaluate_string_pairs(
self,
@ -403,6 +426,7 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
@ -427,5 +451,6 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
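# Editor's sketch (not part of this commit): the embedding distance chains with their
# defaults (OpenAI embeddings and cosine distance are assumed here); all strings are
# hypothetical, and lower scores mean closer embeddings.
def _example_embedding_distance_usage() -> None:
    string_chain = EmbeddingDistanceEvalChain()
    print(
        string_chain.evaluate_strings(
            prediction="The cat sat on the mat.",
            reference="A cat was sitting on a mat.",
        )
    )
    pairwise_chain = PairwiseEmbeddingDistanceEvalChain()
    print(
        pairwise_chain.evaluate_string_pairs(
            prediction="Paris is the capital of France.",
            prediction_b="France's capital city is Paris.",
        )
    )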

@ -5,7 +5,11 @@ from langchain.chains.base import Chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.comparison.eval_chain import LabeledPairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import (
CriteriaEvalChain,
LabeledCriteriaEvalChain,
)
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
@ -58,8 +62,10 @@ _EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
EvaluatorType.STRING_DISTANCE: StringDistanceEvalChain,
EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,

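# Editor's sketch (not part of this commit): the two new registry entries above are reached
# through load_evaluator with the corresponding EvaluatorType values. An OpenAI key is
# assumed; the criterion and inputs are hypothetical.
def _example_load_new_evaluators() -> None:
    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation import EvaluatorType, load_evaluator

    llm = ChatOpenAI(model="gpt-4", temperature=0)
    labeled_criteria = load_evaluator(
        EvaluatorType.LABELED_CRITERIA, llm=llm, criteria="correctness"
    )
    print(
        labeled_criteria.evaluate_strings(
            prediction="The answer is 4.",
            reference="2 + 2 = 4.",
            input="What is 2 + 2?",
        )
    )
    labeled_pairwise = load_evaluator(EvaluatorType.LABELED_PAIRWISE_STRING, llm=llm)
    print(
        labeled_pairwise.evaluate_string_pairs(
            prediction="4",
            prediction_b="2 + 2 equals 4.",
            input="What is 2 + 2?",
            reference="4",
        )
    )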
@ -10,6 +10,7 @@ from langchain.callbacks.manager import Callbacks
from langchain.chains.llm import LLMChain
from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY
from langchain.schema.language_model import BaseLanguageModel
@ -44,6 +45,8 @@ def _parse_string_eval_output(text: str) -> dict:
class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
"""LLM Chain specifically for evaluating question answering."""
output_key: str = "results" #: :meta private:
class Config:
"""Configuration for the QAEvalChain."""
@ -63,7 +66,10 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
@classmethod
def from_llm(
cls, llm: BaseLanguageModel, prompt: PromptTemplate = PROMPT, **kwargs: Any
cls,
llm: BaseLanguageModel,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> QAEvalChain:
"""Load QA Eval Chain from LLM.
@ -80,6 +86,7 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
Returns:
QAEvalChain: the loaded QA eval chain.
"""
prompt = prompt or PROMPT
expected_input_vars = {"query", "answer", "result"}
if expected_input_vars != set(prompt.input_variables):
raise ValueError(
@ -110,6 +117,12 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
return self.apply(inputs, callbacks=callbacks)
def _prepare_output(self, result: dict) -> dict:
parsed_result = _parse_string_eval_output(result[self.output_key])
if RUN_KEY in result:
parsed_result[RUN_KEY] = result[RUN_KEY]
return parsed_result
def _evaluate_strings(
self,
*,
@ -117,6 +130,7 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate Chain or LLM output, based on optional input and label.
@ -127,16 +141,22 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
callbacks (Callbacks, optional): the callbacks to use for tracing.
include_run_info (bool, optional): whether to include run info in the
returned results.
**kwargs: additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
"""
result = self.evaluate(
examples=[{"query": input, "answer": reference}],
predictions=[{"result": prediction}],
result = self(
{
"query": input,
"answer": reference,
"result": prediction,
},
callbacks=callbacks,
)[0]
return _parse_string_eval_output(result["text"])
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -145,13 +165,15 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = await self.acall(
inputs={"query": input, "answer": reference, "result": prediction},
callbacks=callbacks,
include_run_info=include_run_info,
)
return _parse_string_eval_output(result["text"])
return self._prepare_output(result)
class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
@ -189,7 +211,7 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
def from_llm(
cls,
llm: BaseLanguageModel,
prompt: PromptTemplate = CONTEXT_PROMPT,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> ContextQAEvalChain:
"""Load QA Eval Chain from LLM.
@ -207,6 +229,7 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
Returns:
ContextQAEvalChain: the loaded QA eval chain.
"""
prompt = prompt or CONTEXT_PROMPT
cls._validate_input_vars(prompt)
return cls(llm=llm, prompt=prompt, **kwargs)
@ -232,20 +255,32 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
return self.apply(inputs, callbacks=callbacks)
def _prepare_output(self, result: dict) -> dict:
parsed_result = _parse_string_eval_output(result[self.output_key])
if RUN_KEY in result:
parsed_result[RUN_KEY] = result[RUN_KEY]
return parsed_result
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = self.evaluate(
examples=[{"query": input, "context": reference}],
predictions=[{"result": prediction}],
callbacks=kwargs.get("callbacks"),
)[0]
return _parse_string_eval_output(result["text"])
result = self(
{
"query": input,
"context": reference,
"result": prediction,
},
callbacks=callbacks,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -253,13 +288,16 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = await self.acall(
inputs={"query": input, "context": reference, "result": prediction},
callbacks=kwargs.get("callbacks"),
callbacks=callbacks,
include_run_info=include_run_info,
)
return _parse_string_eval_output(result["text"])
return self._prepare_output(result)
class CotQAEvalChain(ContextQAEvalChain):
@ -271,7 +309,12 @@ class CotQAEvalChain(ContextQAEvalChain):
@classmethod
def from_llm(
cls, llm: BaseLanguageModel, prompt: PromptTemplate = COT_PROMPT, **kwargs: Any
cls,
llm: BaseLanguageModel,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> CotQAEvalChain:
"""Load QA Eval Chain from LLM."""
prompt = prompt or COT_PROMPT
cls._validate_input_vars(prompt)
return cls(llm=llm, prompt=prompt, **kwargs)
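To make the updated QA interface above concrete (optional prompt, include_run_info, and output parsed into score/value/reasoning), a hedged sketch; the model, question, and exact output shape are assumptions.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.qa import QAEvalChain

eval_llm = ChatOpenAI(model="gpt-4", temperature=0)
# The prompt argument may now be omitted; the built-in PROMPT is used by default.
qa_eval = QAEvalChain.from_llm(llm=eval_llm)

graded = qa_eval.evaluate_strings(
    input="What is the capital of France?",
    prediction="Paris is the capital of France.",
    reference="Paris",
)
# Roughly: {"reasoning": "...", "value": "CORRECT", "score": 1}
print(graded)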

@ -1,34 +0,0 @@
"""Evaluation classes that interface with traced runs and datasets."""
from langchain.evaluation.run_evaluators.base import (
RunEvaluatorChain,
RunEvaluatorInputMapper,
RunEvaluatorOutputParser,
)
from langchain.evaluation.run_evaluators.implementations import (
ChoicesOutputParser,
StringRunEvaluatorInputMapper,
get_criteria_evaluator,
get_qa_evaluator,
get_trajectory_evaluator,
)
from langchain.evaluation.run_evaluators.loading import (
load_run_evaluator_for_model,
load_run_evaluators_for_model,
)
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
__all__ = [
"RunEvaluatorChain",
"RunEvaluatorInputMapper",
"RunEvaluatorOutputParser",
"get_qa_evaluator",
"get_criteria_evaluator",
"get_trajectory_evaluator",
"StringRunEvaluatorInputMapper",
"ChoicesOutputParser",
"StringRunEvaluatorChain",
"load_run_evaluators_for_model",
"load_run_evaluator_for_model",
]

@ -1,108 +0,0 @@
from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.schema import RUN_KEY, BaseOutputParser
class RunEvaluatorInputMapper:
"""Map the inputs of a run to the inputs of an evaluation."""
@abstractmethod
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
"""Maps the Run and Optional[Example] to a dictionary"""
def __call__(self, run: Run, example: Optional[Example] = None) -> Any:
"""Maps the Run and Optional[Example] to a dictionary"""
return self.map(run, example)
class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
"""Parse the output of a run."""
eval_chain_output_key: str = "text"
def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
"""Parse the output of a run."""
text = output[self.eval_chain_output_key]
return self.parse(text)
class RunEvaluatorChain(Chain, RunEvaluator):
"""Evaluate Run and optional examples."""
input_mapper: RunEvaluatorInputMapper
"""Maps the Run and Optional example to a dictionary for the eval chain."""
eval_chain: Chain
"""The evaluation chain."""
output_parser: RunEvaluatorOutputParser
"""Parse the output of the eval chain into feedback."""
@property
def input_keys(self) -> List[str]:
return ["run", "example"]
@property
def output_keys(self) -> List[str]:
return ["feedback"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Call the evaluation chain."""
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
chain_input = self.input_mapper.map(run, example)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = self.eval_chain(
chain_input, callbacks=callbacks, include_run_info=True
)
run_info = chain_output[RUN_KEY]
feedback = self.output_parser.parse_chain_output(chain_output)
feedback.evaluator_info[RUN_KEY] = run_info
return {"feedback": feedback}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
chain_input = self.input_mapper.map(run, example)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = await self.eval_chain.acall(
chain_input,
callbacks=callbacks,
include_run_info=True,
)
run_info = chain_output[RUN_KEY]
feedback = self.output_parser.parse_chain_output(chain_output)
feedback.evaluator_info[RUN_KEY] = run_info
return {"feedback": feedback}
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
return self({"run": run, "example": example})["feedback"]
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
result = await self.acall({"run": run, "example": example})
return result["feedback"]

@ -1,306 +0,0 @@
from typing import Any, Dict, Mapping, Optional, Sequence, Union
from langchainplus_sdk.evaluation import EvaluationResult
from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
from pydantic import BaseModel, Field
from langchain.chat_models.base import BaseChatModel
from langchain.evaluation.agents.trajectory_eval_chain import (
TrajectoryEvalChain,
TrajectoryOutputParser,
)
from langchain.evaluation.criteria.eval_chain import (
CriteriaEvalChain,
CriteriaResultOutputParser,
)
from langchain.evaluation.qa.eval_chain import QAEvalChain
from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
from langchain.evaluation.run_evaluators.base import (
RunEvaluatorChain,
RunEvaluatorInputMapper,
RunEvaluatorOutputParser,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
from langchain.tools.base import BaseTool
_QA_PROMPTS = {
"qa": QA_DEFAULT_PROMPT,
"sql": SQL_PROMPT,
}
class StringRunEvaluatorInputMapper(RunEvaluatorInputMapper, BaseModel):
"""Maps the Run and Optional[Example] to a dictionary."""
prediction_map: Dict[str, str]
"""Map from run outputs to the evaluation inputs."""
input_map: Dict[str, str]
"""Map from run inputs to the evaluation inputs."""
answer_map: Optional[Dict[str, str]] = None
"""Map from example outputs to the evaluation inputs."""
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
"""Maps the Run and Optional[Example] to a dictionary"""
if run.outputs is None and self.prediction_map:
raise ValueError(f"Run {run.id} has no outputs.")
if self.answer_map and (not example or not example.outputs):
raise ValueError("This evaluator requires references, but none were given.")
outputs = run.outputs or {}
data = {value: outputs[key] for key, value in self.prediction_map.items()}
data.update({value: run.inputs[key] for key, value in self.input_map.items()})
if self.answer_map and example and example.outputs:
data.update(
{value: example.outputs[key] for key, value in self.answer_map.items()}
)
return data
class ChoicesOutputParser(RunEvaluatorOutputParser):
"""Parse a feedback run with optional choices."""
evaluation_name: str
choices_map: Optional[Dict[str, int]] = None
@property
def _type(self) -> str:
return "choices_run_eval"
def parse(self, text: str) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
lines = text.strip().split()
value = lines[-1].strip()
score = self.choices_map.get(value) if self.choices_map else None
comment = " ".join(lines[:-1]) if len(lines) > 1 else None
return EvaluationResult(
key=self.evaluation_name,
score=score,
value=value,
comment=comment,
)
def get_qa_evaluator(
llm: BaseLanguageModel,
*,
prompt: Union[PromptTemplate, str] = QA_DEFAULT_PROMPT,
input_key: str = "input",
prediction_key: str = "output",
answer_key: str = "output",
evaluation_name: Optional[str] = None,
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain that compares response against ground truth."""
if isinstance(prompt, str):
prompt = _QA_PROMPTS[prompt]
eval_chain = QAEvalChain.from_llm(llm=llm, prompt=prompt, **kwargs)
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvaluatorInputMapper(
input_map={input_key: "query"},
prediction_map={prediction_key: "result"},
answer_map={answer_key: "answer"},
),
)
evaluation_name = evaluation_name or "Correctness"
output_parser = kwargs.pop(
"output_parser",
ChoicesOutputParser(
evaluation_name=evaluation_name,
choices_map={"CORRECT": 1, "INCORRECT": 0},
),
)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=output_parser,
tags=tags + [evaluation_name],
**kwargs,
)
class CriteriaOutputParser(RunEvaluatorOutputParser):
"""Parse a criteria results into an evaluation result."""
evaluation_name: str
@property
def _type(self) -> str:
return "criteria"
def parse(self, parsed_output: Union[str, dict]) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
if isinstance(parsed_output, str):
parsed_output_ = CriteriaResultOutputParser().parse(parsed_output)
else:
parsed_output_ = parsed_output
return EvaluationResult(
key=self.evaluation_name,
score=parsed_output_["score"],
value=parsed_output_["value"],
comment=parsed_output_["reasoning"],
)
def get_criteria_evaluator(
llm: BaseLanguageModel,
criteria: Union[Mapping[str, str], Sequence[str], str],
*,
input_key: str = "input",
prediction_key: str = "output",
prompt: Optional[BasePromptTemplate] = None,
evaluation_name: Optional[str] = None,
requires_reference: bool = False,
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain for grading a model's response against a map of criteria."""
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvaluatorInputMapper(
input_map={input_key: "input"},
prediction_map={prediction_key: "output"},
),
)
criteria_ = CriteriaEvalChain.resolve_criteria(criteria)
evaluation_name = evaluation_name or " ".join(criteria_.keys())
parser = kwargs.pop(
"output_parser",
CriteriaOutputParser(
choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name
),
)
tags = kwargs.pop("tags", [])
eval_chain = CriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria_,
prompt=prompt,
requires_reference=requires_reference,
**kwargs,
)
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)
class TrajectoryRunEvalOutputParser(RunEvaluatorOutputParser, TrajectoryOutputParser):
evaluation_name: str = "Agent Trajectory"
"""The name assigned to the evaluation feedback."""
evaluator_info: dict = Field(default_factory=dict)
"""Additional information to log as feedback metadata."""
@property
def _type(self) -> str:
return "agent_trajectory_run_eval"
def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
"""Parse the output of a run."""
return EvaluationResult(
key=self.evaluation_name,
score=int(output["score"]),
comment=output["reasoning"],
evaluator_info=self.evaluator_info,
)
class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
"""Maps the Run and Optional[Example] to a dictionary."""
agent_input_key: str = "input"
"""The key to load from the agent executor's run input dictionary."""
agent_output_key: str = "output"
"""The key to load from the agent executor's run output dictionary."""
tool_input_key: str = "input"
"""The key to load from the tool executor's run input dictionary."""
tool_output_key: str = "output"
"""The key to load from the tool executor's run output dictionary."""
reference_output_key: Optional[str] = None
"""The key to use for selecting the reference answer."""
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
"""Maps the Run and Optional[Example] to a dictionary"""
if run.child_runs is None:
raise ValueError("Run must have child runs to be evaluated.")
if run.outputs is None:
raise ValueError("Run must have outputs to be evaluated.")
reference = ""
if example is not None and example.outputs:
if self.reference_output_key is not None:
reference = example.outputs[self.reference_output_key]
elif "output" in example.outputs:
reference = example.outputs["output"]
elif len(example.outputs) == 1:
reference = next(iter(example.outputs.values()))
else:
raise ValueError("Could not infer the reference answer from ")
question = run.inputs[self.agent_input_key]
tool_runs = [
run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
]
agent_steps = []
for i, run_ in enumerate(tool_runs, 1):
tool_output = (
f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
if run_.outputs
else (f"Tool error: {run_.error}" if run_.error else "No output")
)
agent_steps.append(
f"""Step {i}:
Tool used: {run_.name}
Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
{tool_output}"""
)
return {
"question": question,
"agent_trajectory": "\n\n".join(agent_steps),
"answer": run.outputs[self.agent_output_key],
"reference": reference,
}
def get_trajectory_evaluator(
llm: BaseChatModel,
agent_tools: Sequence[BaseTool],
*,
input_key: str = "input",
prediction_key: str = "output",
tool_input_key: str = "input",
tool_output_key: str = "output",
reference_output_key: Optional[str] = None,
evaluation_name: str = "Agent Trajectory",
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain for grading a model's response against a map of criteria."""
input_mapper = kwargs.pop(
"input_mapper",
TrajectoryInputMapper(
agent_input_key=input_key,
agent_output_key=prediction_key,
tool_input_key=tool_input_key,
tool_output_key=tool_output_key,
reference_output_key=reference_output_key,
),
)
parser = kwargs.pop(
"output_parser",
TrajectoryRunEvalOutputParser(evaluation_name=evaluation_name),
)
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=agent_tools, return_reasoning=True, **kwargs
)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)

@ -1,115 +0,0 @@
""""Loading helpers for run evaluators."""
from typing import Any, List, Optional, Sequence, Union
from langchainplus_sdk import RunEvaluator
from langchain.base_language import BaseLanguageModel
from langchain.chains.base import Chain
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.tools.base import Tool
def load_run_evaluator_for_model(
evaluator: EvaluatorType,
model: Union[Chain, BaseLanguageModel, Tool],
*,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
eval_llm: Optional[BaseLanguageModel] = None,
**kwargs: Any,
) -> List[RunEvaluator]:
"""Load evaluators specified by a list of evaluator types.
Parameters
----------
evaluator: EvaluatorType
The evaluator type to load.
model : Union[Chain, BaseLanguageModel, Tool]
The model to evaluate. Used to infer how to parse the run.
input_key : Optional[str], a chain run's input key to map
to the evaluator's input
prediction_key : Optional[str], the key in the run's outputs to
represent the Chain prediction
reference_key : Optional[str], the key in the dataset example (row)
outputs to represent the reference, or ground-truth label
eval_llm : BaseLanguageModel, optional
The language model to use for evaluation, if none is provided, a default
ChatOpenAI gpt-4 model will be used.
**kwargs : Any
Additional keyword arguments to pass to all evaluators.
Returns
-------
RunEvaluator
The loaded Run evaluator.
"""
evaluator_ = load_evaluator(evaluator, llm=eval_llm, **kwargs)
if isinstance(evaluator_, StringEvaluator):
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model,
evaluator_,
input_key=input_key,
prediction_key=prediction_key,
reference_key=reference_key,
)
else:
raise NotImplementedError(f"Run evaluator for {evaluator} is not implemented")
return run_evaluator
def load_run_evaluators_for_model(
evaluators: Sequence[EvaluatorType],
model: Union[Chain, BaseLanguageModel, Tool],
*,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
eval_llm: Optional[BaseLanguageModel] = None,
config: Optional[dict] = None,
**kwargs: Any,
) -> List[RunEvaluator]:
"""Load evaluators specified by a list of evaluator types.
Parameters
----------
evaluators : Sequence[EvaluatorType]
The list of evaluator types to load.
model : Union[Chain, BaseLanguageModel, Tool]
The model to evaluate. Used to infer how to parse the run.
input_key : Optional[str], a chain run's input key to map
to the evaluator's input
prediction_key : Optional[str], the key in the run's outputs to
represent the Chain prediction
reference_key : Optional[str], the key in the dataset example (row)
outputs to represent the reference, or ground-truth label
eval_llm : BaseLanguageModel, optional
The language model to use for evaluation, if none is provided, a default
ChatOpenAI gpt-4 model will be used.
**kwargs : Any
Additional keyword arguments to pass to all evaluators.
Returns
-------
List[RunEvaluator]
The loaded Run evaluators.
"""
run_evaluators = []
for evaluator in evaluators:
_kwargs = config.get(evaluator, {}) if config else {}
run_evaluators.append(
load_run_evaluator_for_model(
evaluator,
model,
input_key=input_key,
prediction_key=prediction_key,
reference_key=reference_key,
eval_llm=eval_llm,
**{**kwargs, **_kwargs},
)
)
return run_evaluators

@ -27,12 +27,19 @@ class EvaluatorType(str, Enum):
CONTEXT_QA = "context_qa"
"""Question answering evaluator that incorporates 'context' in the response."""
PAIRWISE_STRING = "pairwise_string"
"""The pairwise string evaluator, which compares the output of two models."""
"""The pairwise string evaluator, which predicts the preferred prediction from
between two models."""
LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
"""The labeled pairwise string evaluator, which predicts the preferred prediction
from between two models based on a ground truth reference label."""
AGENT_TRAJECTORY = "trajectory"
"""The agent trajectory evaluator, which grades the agent's intermediate steps."""
CRITERIA = "criteria"
"""The criteria evaluator, which evaluates a model based on a
custom set of criteria."""
custom set of criteria without any reference labels."""
LABELED_CRITERIA = "labeled_criteria"
"""The labeled criteria evaluator, which evaluates a model based on a
custom set of criteria, with a reference label."""
STRING_DISTANCE = "string_distance"
"""Compare predictions to a reference answer using string edit distances."""
PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
@ -82,18 +89,23 @@ class _EvalArgsMixin:
reference: Optional[str] = None,
input: Optional[str] = None,
) -> None:
"""Check if the evaluation arguments are valid.
Args:
reference (Optional[str], optional): The reference label.
input (Optional[str], optional): The input string.
Raises:
ValueError: If the evaluator requires an input string but none is provided,
or if the evaluator requires a reference label but none is provided.
"""
if self.requires_input and input is None:
raise ValueError(f"{self.__class__.__name__} requires an input string.")
elif input is not None and not self.requires_input:
warn(self._skip_input_warning)
else:
pass
if self.requires_reference and reference is None:
raise ValueError(f"{self.__class__.__name__} requires a reference string.")
elif reference is not None and not self.requires_reference:
warn(self._skip_reference_warning)
else:
pass
class StringEvaluator(_EvalArgsMixin, ABC):
@ -102,10 +114,12 @@ class StringEvaluator(_EvalArgsMixin, ABC):
@property
def evaluation_name(self) -> str:
"""The name of the evaluation."""
raise NotImplementedError()
@property
def requires_reference(self) -> bool:
"""Whether this evaluator requires a reference label."""
return False
@abstractmethod
@ -120,18 +134,17 @@ class StringEvaluator(_EvalArgsMixin, ABC):
"""Evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
It is recommended that the dictionary contain the following keys:
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
"""
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
""" # noqa: E501
async def _aevaluate_strings(
self,
@ -141,25 +154,23 @@ class StringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate Chain or LLM output, based on optional
input and label.
"""Asynchronously evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
It is recommended that the dictionary contain the following keys:
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
"""
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
""" # noqa: E501
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an "
"async aevaluate_strings method."
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_strings method."
)
def evaluate_strings(
@ -173,14 +184,13 @@ class StringEvaluator(_EvalArgsMixin, ABC):
"""Evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
"""
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return self._evaluate_strings(
prediction=prediction, reference=reference, input=input, **kwargs
@ -194,18 +204,16 @@ class StringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate Chain or LLM output, based on optional
input and label.
"""Asynchronously evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
"""
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return await self._aevaluate_strings(
prediction=prediction, reference=reference, input=input, **kwargs
@ -230,16 +238,12 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
async def _aevaluate_string_pairs(
self,
@ -250,21 +254,17 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Evaluate the output string pairs.
"""Asynchronously evaluate the output string pairs.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_string_pairs method."
@ -284,16 +284,12 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return self._evaluate_string_pairs(
prediction=prediction,
@ -312,21 +308,17 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Evaluate the output string pairs.
"""Asynchronously evaluate the output string pairs.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return await self._aevaluate_string_pairs(
prediction=prediction,
@ -342,6 +334,7 @@ class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
@property
def requires_input(self) -> bool:
"""Whether this evaluator requires an input string."""
return True
@abstractmethod

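As a rough illustration of the PairwiseStringEvaluator interface documented above, a sketch using the labeled pairwise string evaluator; the model choice, inputs, and exact output keys are assumptions rather than guarantees of this diff.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

eval_llm = ChatOpenAI(model="gpt-4", temperature=0)
pairwise_evaluator = load_evaluator("labeled_pairwise_string", llm=eval_llm)

# The labeled variant requires a reference; the input is the question shared by both outputs.
verdict = pairwise_evaluator.evaluate_string_pairs(
    input="How many prime numbers are there below 10?",
    prediction="There are 4 primes below 10.",
    prediction_b="There are 5 primes below 10.",
    reference="4 (namely 2, 3, 5, and 7)",
)
# Typically a dict along the lines of {"value": "A", "score": 1, "reasoning": "..."}
print(verdict)
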
@ -12,6 +12,7 @@ from langchain.callbacks.manager import (
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.schema import RUN_KEY
def _load_rapidfuzz() -> Any:
@ -34,7 +35,14 @@ def _load_rapidfuzz() -> Any:
class StringDistance(str, Enum):
"""Distance metric to use."""
"""Distance metric to use.
Attributes:
DAMERAU_LEVENSHTEIN: The Damerau-Levenshtein distance.
LEVENSHTEIN: The Levenshtein distance.
JARO: The Jaro distance.
JARO_WINKLER: The Jaro-Winkler distance.
"""
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
@ -71,6 +79,21 @@ class _RapidFuzzChainMixin(Chain):
"""
return ["score"]
def _prepare_output(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""
Prepare the output dictionary.
Args:
result (Dict[str, Any]): The evaluation results.
Returns:
Dict[str, Any]: The prepared output dictionary.
"""
result = {"score": result["score"]}
if RUN_KEY in result:
result[RUN_KEY] = result[RUN_KEY].dict()
return result
@staticmethod
def _get_metric(distance: str) -> Callable:
"""
@ -109,25 +132,39 @@ class _RapidFuzzChainMixin(Chain):
class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
"""Compute string distances between the prediction and the reference."""
"""Compute string distances between the prediction and the reference.
Examples
----------
>>> from langchain.evaluation import StringDistanceEvalChain
>>> evaluator = StringDistanceEvalChain()
>>> evaluator.evaluate_strings(
prediction="Mindy is the CTO",
reference="Mindy is the CEO",
)
Using the `load_evaluator` function:
>>> from langchain.evaluation import load_evaluator
>>> evaluator = load_evaluator("string_distance")
>>> evaluator.evaluate_strings(
prediction="The answer is three",
reference="three",
)
"""
@property
def requires_input(self) -> bool:
"""
Check if input is required.
Returns:
bool: True if input is required, False otherwise.
This evaluator does not require input.
"""
return False
@property
def requires_reference(self) -> bool:
"""
Check if reference is required.
Returns:
bool: True if reference is required, False otherwise.
This evaluator does not require a reference.
"""
return True
@ -143,33 +180,13 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
@property
def evaluation_name(self) -> str:
return f"{self.distance.value}_distance"
@staticmethod
def _get_metric(distance: str) -> Callable:
"""
Get the distance metric function based on the distance type.
Args:
distance (str): The distance type.
Get the evaluation name.
Returns:
Callable: The distance metric function.
Raises:
ValueError: If the distance metric is invalid.
str: The evaluation name.
"""
rf_distance = _load_rapidfuzz()
if distance == StringDistance.DAMERAU_LEVENSHTEIN:
return rf_distance.DamerauLevenshtein.distance
elif distance == StringDistance.LEVENSHTEIN:
return rf_distance.Levenshtein.distance
elif distance == StringDistance.JARO:
return rf_distance.Jaro.distance
elif distance == StringDistance.JARO_WINKLER:
return rf_distance.JaroWinkler.distance
else:
raise ValueError(f"Invalid distance metric: {distance}")
return f"{self.distance.value}_distance"
def _call(
self,
@ -215,6 +232,9 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -233,8 +253,12 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
result = self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -243,6 +267,9 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -262,8 +289,11 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
result = await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvaluator):
@ -281,6 +311,12 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
@property
def evaluation_name(self) -> str:
"""
Get the evaluation name.
Returns:
str: The evaluation name.
"""
return f"pairwise_{self.distance.value}_distance"
def _call(
@ -327,6 +363,7 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -348,8 +385,9 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
async def _aevaluate_string_pairs(
self,
@ -359,6 +397,7 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -380,5 +419,6 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
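A small sketch of the pairwise string distance evaluator above, which now routes its result through _prepare_output as well; no LLM is involved, though the optional rapidfuzz dependency is assumed to be installed.

from langchain.evaluation import load_evaluator

pairwise_distance = load_evaluator("pairwise_string_distance")

result = pairwise_distance.evaluate_string_pairs(
    prediction="The job is completely done.",
    prediction_b="The job is done.",
)
# The score is a string distance, so smaller values mean the two predictions are more alike.
print(result["score"])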

@ -2,7 +2,7 @@
import subprocess
from pathlib import Path
from langchainplus_sdk.cli.main import get_docker_compose_command
from langsmith.cli.main import get_docker_compose_command
def main() -> None:

@ -0,0 +1,102 @@
"""LangSmith utilities.
This module provides utilities for connecting to `LangSmith <https://smith.langchain.com/>`_. For more information on LangSmith, see the `LangSmith documentation <https://docs.smith.langchain.com/>`_.
**Evaluation**
LangSmith helps you evaluate Chains and other language model application components using a number of LangChain evaluators.
An example of this is shown below, assuming you've created a LangSmith dataset called ``<my_dataset_name>``:
.. code-block:: python
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.smith import RunEvalConfig, run_on_dataset
# Chains may have memory. Passing in a constructor function lets the
# evaluation framework avoid cross-contamination between runs.
def construct_chain():
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(
llm,
"What's the answer to {your_input_key}"
)
return chain
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
evaluation_config = RunEvalConfig(
evaluators=[
"qa", # "Correctness" against a reference answer
"embedding_distance",
RunEvalConfig.Criteria("helpfulness"),
RunEvalConfig.Criteria({
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
}),
]
)
client = Client()
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.
.. code-block:: python
from typing import Optional
from langchain.evaluation import StringEvaluator
class MyStringEvaluator(StringEvaluator):
@property
def requires_input(self) -> bool:
return False
@property
def requires_reference(self) -> bool:
return True
@property
def evaluation_name(self) -> str:
return "exact_match"
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
return {"score": prediction == reference}
evaluation_config = RunEvalConfig(
custom_evaluators = [MyStringEvaluator()],
)
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
**Primary Functions**
- :func:`arun_on_dataset <langchain.smith.evaluation.runner_utils.arun_on_dataset>`: Asynchronous function to evaluate a chain, agent, or other LangChain component over a dataset.
- :func:`run_on_dataset <langchain.smith.evaluation.runner_utils.run_on_dataset>`: Function to evaluate a chain, agent, or other LangChain component over a dataset.
- :class:`RunEvalConfig <langchain.smith.evaluation.config.RunEvalConfig>`: Class representing the configuration for running evaluation. You can select evaluators by :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>` or config, or you can pass in `custom_evaluators`
""" # noqa: E501
from langchain.smith.evaluation import (
RunEvalConfig,
arun_on_dataset,
run_on_dataset,
)
__all__ = [
"arun_on_dataset",
"run_on_dataset",
"ChoicesOutputParser",
"RunEvalConfig",
]

@ -0,0 +1,69 @@
"""LangSmith evaluation utilities.
This module provides utilities for evaluating Chains and other language model
applications using LangChain evaluators and LangSmith.
For more information on the LangSmith API, see the `LangSmith API documentation <https://docs.smith.langchain.com/docs/>`_.
**Example**
.. code-block:: python
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.smith import EvaluatorType, RunEvalConfig, run_on_dataset
def construct_chain():
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(
llm,
"What's the answer to {your_input_key}"
)
return chain
evaluation_config = RunEvalConfig(
evaluators=[
EvaluatorType.QA, # "Correctness" against a reference answer
EvaluatorType.EMBEDDING_DISTANCE,
RunEvalConfig.Criteria("helpfulness"),
RunEvalConfig.Criteria({
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
}),
]
)
client = Client()
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config
)
**Attributes**
- ``arun_on_dataset``: Asynchronous function to evaluate a chain or other LangChain component over a dataset.
- ``run_on_dataset``: Function to evaluate a chain or other LangChain component over a dataset.
- ``RunEvalConfig``: Class representing the configuration for running evaluation.
- ``StringRunEvaluatorChain``: Class representing a string run evaluator chain.
- ``InputFormatError``: Exception raised when the input format is incorrect.
""" # noqa: E501
from langchain.smith.evaluation.config import RunEvalConfig
from langchain.smith.evaluation.runner_utils import (
InputFormatError,
arun_on_dataset,
run_on_dataset,
)
from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain
__all__ = [
"InputFormatError",
"arun_on_dataset",
"run_on_dataset",
"StringRunEvaluatorChain",
"RunEvalConfig",
]

@ -0,0 +1,228 @@
"""Configuration for run evaluators."""
from typing import Any, Dict, List, Optional, Union
from langsmith import RunEvaluator
from pydantic import BaseModel, Field
from langchain.embeddings.base import Embeddings
from langchain.evaluation.criteria.eval_chain import CRITERIA_TYPE
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistance as EmbeddingDistanceEnum,
)
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.evaluation.string_distance.base import (
StringDistance as StringDistanceEnum,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.prompt_template import BasePromptTemplate
class EvalConfig(BaseModel):
"""Configuration for a given run evaluator.
Parameters
----------
evaluator_type : EvaluatorType
The type of evaluator to use.
Methods
-------
get_kwargs()
Get the keyword arguments for the evaluator configuration.
"""
evaluator_type: EvaluatorType
def get_kwargs(self) -> Dict[str, Any]:
"""Get the keyword arguments for the load_evaluator call.
Returns
-------
Dict[str, Any]
The keyword arguments for the load_evaluator call.
"""
return self.dict(exclude={"evaluator_type"}, exclude_none=True)
class RunEvalConfig(BaseModel):
"""Configuration for a run evaluation.
Parameters
----------
evaluators : List[Union[EvaluatorType, EvalConfig]]
Configurations for which evaluators to apply to the dataset run.
Each can be the string of an :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
as EvaluatorType.QA, the evaluator type string ("qa"), or a configuration for a
given evaluator (e.g., :class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`).
custom_evaluators : Optional[List[Union[RunEvaluator, StringEvaluator]]]
Custom evaluators to apply to the dataset run.
reference_key : Optional[str]
The key in the dataset run to use as the reference string.
If not provided, it will be inferred automatically.
prediction_key : Optional[str]
The key from the traced run's outputs dictionary to use to
represent the prediction. If not provided, it will be inferred
automatically.
input_key : Optional[str]
The key from the traced run's inputs dictionary to use to represent the
input. If not provided, it will be inferred automatically.
eval_llm : Optional[BaseLanguageModel]
The language model to pass to any evaluators that use a language model.
""" # noqa: E501
evaluators: List[Union[EvaluatorType, EvalConfig]] = Field(default_factory=list)
"""Configurations for which evaluators to apply to the dataset run.
Each can be the string of an
:class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
as `EvaluatorType.QA`, the evaluator type string ("qa"), or a configuration for a
given evaluator
(e.g.,
:class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`).""" # noqa: E501
custom_evaluators: Optional[List[Union[RunEvaluator, StringEvaluator]]] = None
"""Custom evaluators to apply to the dataset run."""
reference_key: Optional[str] = None
"""The key in the dataset run to use as the reference string.
If not provided, we will attempt to infer automatically."""
prediction_key: Optional[str] = None
"""The key from the traced run's outputs dictionary to use to
represent the prediction. If not provided, it will be inferred
automatically."""
input_key: Optional[str] = None
"""The key from the traced run's inputs dictionary to use to represent the
input. If not provided, it will be inferred automatically."""
eval_llm: Optional[BaseLanguageModel] = None
"""The language model to pass to any evaluators that require one."""
class Config:
arbitrary_types_allowed = True
class Criteria(EvalConfig):
"""Configuration for a reference-free criteria evaluator.
Parameters
----------
criteria : Optional[CRITERIA_TYPE]
The criteria to evaluate.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
criteria: Optional[CRITERIA_TYPE] = None
llm: Optional[BaseLanguageModel] = None
evaluator_type: EvaluatorType = EvaluatorType.CRITERIA
def __init__(
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
) -> None:
super().__init__(criteria=criteria, **kwargs)
class LabeledCriteria(EvalConfig):
"""Configuration for a labeled (with references) criteria evaluator.
Parameters
----------
criteria : Optional[CRITERIA_TYPE]
The criteria to evaluate.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
criteria: Optional[CRITERIA_TYPE] = None
llm: Optional[BaseLanguageModel] = None
evaluator_type: EvaluatorType = EvaluatorType.LABELED_CRITERIA
def __init__(
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
) -> None:
super().__init__(criteria=criteria, **kwargs)
class EmbeddingDistance(EvalConfig):
"""Configuration for an embedding distance evaluator.
Parameters
----------
embeddings : Optional[Embeddings]
The embeddings to use for computing the distance.
distance_metric : Optional[EmbeddingDistanceEnum]
The distance metric to use for computing the distance.
"""
evaluator_type: EvaluatorType = EvaluatorType.EMBEDDING_DISTANCE
embeddings: Optional[Embeddings] = None
distance_metric: Optional[EmbeddingDistanceEnum] = None
class Config:
arbitrary_types_allowed = True
class StringDistance(EvalConfig):
"""Configuration for a string distance evaluator.
Parameters
----------
distance : Optional[StringDistanceEnum]
The string distance metric to use.
"""
evaluator_type: EvaluatorType = EvaluatorType.STRING_DISTANCE
distance: Optional[StringDistanceEnum] = None
class QA(EvalConfig):
"""Configuration for a QA evaluator.
Parameters
----------
prompt : Optional[BasePromptTemplate]
The prompt template to use for the evaluation chain.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
evaluator_type: EvaluatorType = EvaluatorType.QA
llm: Optional[BaseLanguageModel] = None
prompt: Optional[BasePromptTemplate] = None
class ContextQA(EvalConfig):
"""Configuration for a context-based QA evaluator.
Parameters
----------
prompt : Optional[BasePromptTemplate]
The prompt template to use for the evaluation chain.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
evaluator_type: EvaluatorType = EvaluatorType.CONTEXT_QA
llm: Optional[BaseLanguageModel] = None
prompt: Optional[BasePromptTemplate] = None
class CoTQA(EvalConfig):
"""Configuration for a context-based QA evaluator.
Parameters
----------
prompt : Optional[BasePromptTemplate]
The prompt template to use for the evaluation chain.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
evaluator_type: EvaluatorType = EvaluatorType.COT_QA
llm: Optional[BaseLanguageModel] = None
prompt: Optional[BasePromptTemplate] = None
# TODO: Trajectory
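To make the configuration surface above concrete, a hedged sketch composing a RunEvalConfig from an evaluator type, a bare string, and the nested config classes; the specific criteria and distance metric are arbitrary examples.

from langchain.evaluation import EvaluatorType
from langchain.evaluation.string_distance.base import StringDistance as StringDistanceEnum
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    evaluators=[
        EvaluatorType.QA,  # enum form
        "embedding_distance",  # plain string form
        RunEvalConfig.LabeledCriteria("correctness"),
        RunEvalConfig.StringDistance(distance=StringDistanceEnum.LEVENSHTEIN),
    ],
)

# Each nested config exposes the kwargs that will be forwarded to load_evaluator().
print(evaluation_config.evaluators[-1].get_kwargs())  # e.g. {'distance': <StringDistance.LEVENSHTEIN: 'levenshtein'>}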

File diff suppressed because it is too large

@ -2,12 +2,11 @@
from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import DataType, Example, Run, RunTypeEnum
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
@ -19,7 +18,6 @@ from langchain.load.load import loads
from langchain.load.serializable import Serializable
from langchain.schema import RUN_KEY, messages_from_dict
from langchain.schema.messages import BaseMessage, get_buffer_string
from langchain.tools.base import Tool
def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]:
@ -127,52 +125,21 @@ class LLMStringRunMapper(StringRunMapper):
class ChainStringRunMapper(StringRunMapper):
"""Extract items to evaluate from the run object from a chain."""
input_key: str
input_key: Optional[str] = None
"""The key from the model Run's inputs to use as the eval input."""
prediction_key: str
prediction_key: Optional[str] = None
"""The key from the model Run's outputs to use as the eval prediction."""
@classmethod
def from_chain(
cls,
model: Chain,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
) -> ChainStringRunMapper:
"""Create a RunMapper from a chain."""
error_messages = []
if input_key is None:
if len(model.input_keys) > 1:
error_messages.append(
f"Chain {model.lc_namespace} has multiple input"
" keys. Please specify 'input_key' when loading."
)
else:
input_key = model.input_keys[0]
elif input_key not in model.input_keys:
error_messages.append(
f"Chain {model.lc_namespace} does not have specified"
f" input key {input_key}."
)
if prediction_key is None:
if len(model.output_keys) > 1:
error_messages.append(
f"Chain {model.lc_namespace} has multiple"
" output keys. Please specify 'prediction_key' when loading."
)
else:
prediction_key = model.output_keys[0]
elif prediction_key not in model.output_keys:
error_messages.append(
f"Chain {model.lc_namespace} does not have specified"
f" prediction_key {prediction_key}."
def _get_key(self, source: Dict, key: Optional[str], which: str) -> str:
if key is not None:
return source[key]
elif len(source) == 1:
return next(iter(source.values()))
else:
raise ValueError(
f"Could not map run {which} with multiple keys: "
f"{source}\nPlease manually specify a {which}_key"
)
if error_messages:
raise ValueError("\n".join(error_messages))
if input_key is None or prediction_key is None:
# This should never happen, but mypy doesn't know that.
raise ValueError(f"Chain {model.lc_namespace} has no input or output keys.")
return cls(input_key=input_key, prediction_key=prediction_key)
def map(self, run: Run) -> Dict[str, str]:
"""Maps the Run to a dictionary."""
@ -187,9 +154,11 @@ class ChainStringRunMapper(StringRunMapper):
f"Run {run.id} does not have prediction key {self.prediction_key}."
)
else:
input_ = self._get_key(run.inputs, self.input_key, "input")
prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
return {
"input": run.inputs[self.input_key],
"prediction": run.outputs[self.prediction_key],
"input": input_,
"prediction": prediction,
}
@ -279,7 +248,10 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
evaluate_strings_inputs = self.run_mapper(run)
if example and self.example_mapper:
if not self.string_evaluator.requires_input:
# Hide warning about unused input
evaluate_strings_inputs.pop("input", None)
if example and self.example_mapper and self.string_evaluator.requires_reference:
evaluate_strings_inputs.update(self.example_mapper(example))
elif self.string_evaluator.requires_reference:
raise ValueError(
@ -289,12 +261,14 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
)
return evaluate_strings_inputs
def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult:
evaluation_result = EvaluationResult(key=self.name, **output)
def _prepare_output(self, output: Dict[str, Any]) -> Dict[str, Any]:
evaluation_result = EvaluationResult(
key=self.name, comment=output.get("reasoning"), **output
)
if RUN_KEY in output:
# TODO: Not currently surfaced. Update
evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
return evaluation_result
return {"feedback": evaluation_result}
def _call(
self,
@ -308,9 +282,9 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
chain_output = self.string_evaluator.evaluate_strings(
**evaluate_strings_inputs,
callbacks=callbacks,
include_run_info=True,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
return self._prepare_output(chain_output)
async def _acall(
self,
@ -324,52 +298,85 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
chain_output = await self.string_evaluator.aevaluate_strings(
**evaluate_strings_inputs,
callbacks=callbacks,
include_run_info=True,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
return self._prepare_output(chain_output)
def _prepare_evaluator_output(self, output: Dict[str, Any]) -> EvaluationResult:
feedback: EvaluationResult = output["feedback"]
if RUN_KEY not in feedback.evaluator_info:
feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
return feedback
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
return self({"run": run, "example": example})["feedback"]
result = self({"run": run, "example": example}, include_run_info=True)
return self._prepare_evaluator_output(result)
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
result = await self.acall({"run": run, "example": example})
return result["feedback"]
result = await self.acall(
{"run": run, "example": example}, include_run_info=True
)
return self._prepare_evaluator_output(result)
@classmethod
def from_model_and_evaluator(
def from_run_and_data_type(
cls,
model: Union[Chain, BaseLanguageModel, Tool],
evaluator: StringEvaluator,
run_type: RunTypeEnum,
data_type: DataType,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
tags: Optional[List[str]] = None,
) -> StringRunEvaluatorChain:
"""Create a StringRunEvaluatorChain from a model and evaluator."""
if isinstance(model, BaseLanguageModel):
"""
Create a StringRunEvaluatorChain from an evaluator and the run and dataset types.
This method provides an easy way to instantiate a StringRunEvaluatorChain by
taking an evaluator and information about the type of run and the data.
The method supports LLM and chain runs.
Args:
evaluator (StringEvaluator): The string evaluator to use.
run_type (RunTypeEnum): The type of run being evaluated.
Supported types are LLM and Chain.
data_type (DataType): The type of dataset used in the run.
input_key (str, optional): The key used to map the input from the run.
prediction_key (str, optional): The key used to map the prediction from the run.
reference_key (str, optional): The key used to map the reference from the dataset.
tags (List[str], optional): List of tags to attach to the evaluation chain.
Returns:
StringRunEvaluatorChain: The instantiated evaluation chain.
Raises:
ValueError: If the run type is not supported, or if the evaluator requires a
reference from the dataset but the reference key is not provided.
""" # noqa: E501
# Configure how run inputs/predictions are passed to the evaluator
if run_type == RunTypeEnum.llm:
run_mapper: StringRunMapper = LLMStringRunMapper()
elif isinstance(model, Chain):
run_mapper = ChainStringRunMapper.from_chain(
model, input_key=input_key, prediction_key=prediction_key
elif run_type == RunTypeEnum.chain:
run_mapper = ChainStringRunMapper(
input_key=input_key, prediction_key=prediction_key
)
elif isinstance(model, Tool):
run_mapper = ToolStringRunMapper()
else:
raise NotImplementedError(
f"{cls.__name__}.from_model_and_evaluator({type(model)})"
" not yet implemented."
"Expected one of [BaseLanguageModel, Chain, Tool]."
raise ValueError(
f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
)
if reference_key is not None or isinstance(model, BaseLanguageModel):
# Configure how example rows are fed as a reference string to the evaluator
if reference_key is not None or data_type in (DataType.llm, DataType.chat):
example_mapper = StringExampleMapper(reference_key=reference_key)
elif evaluator.requires_reference:
# We could potentially auto-infer if there is only one string in the
# example, but it's preferred to raise earlier.
raise ValueError(
f"Evaluator {evaluator.evaluation_name} requires a reference"
" example from the dataset. Please specify the reference key from"
@ -382,4 +389,5 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
run_mapper=run_mapper,
example_mapper=example_mapper,
string_evaluator=evaluator,
tags=tags,
)
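
For reference, here is a minimal usage sketch of the new `from_run_and_data_type` constructor described above. The import paths for `StringRunEvaluatorChain`, `RunTypeEnum`, and `DataType`, as well as the dataset key names, are assumptions inferred from this diff rather than confirmed package locations.

from langsmith.schemas import DataType, RunTypeEnum  # assumed import location
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain.smith.evaluation.string_run_evaluator import (  # assumed module path
    StringRunEvaluatorChain,
)

# Wrap an off-the-shelf string evaluator so it can grade traced runs.
evaluator = load_evaluator(EvaluatorType.QA, llm=ChatOpenAI(temperature=0))
run_evaluator = StringRunEvaluatorChain.from_run_and_data_type(
    evaluator,
    run_type=RunTypeEnum.chain,   # grading chain runs (LLM runs are also supported)
    data_type=DataType.kv,        # key-value dataset, so the keys below are needed
    input_key="question",         # hypothetical dataset/run keys
    prediction_key="text",
    reference_key="answer",
    tags=["qa-eval"],
)
# feedback = run_evaluator.evaluate_run(run, example)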

poetry.lock generated

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "absl-py"
@ -641,16 +641,12 @@ category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "awadb-0.3.6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:d90318d2d388aa1bb740b0b7e641cb7da00e6ab5700ce97564163c88a1927ed4"},
{file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"},
{file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"},
{file = "awadb-0.3.6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f6d10d1e885fa1d64eeb8ffda2de470c3a7508d57a9489213b8649bcddcd31e"},
{file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"},
{file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"},
{file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"},
{file = "awadb-0.3.6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:36138b754c990143d0314fd7a9293c96f7ba549860244bda728e3f51b73e0f6e"},
{file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"},
{file = "awadb-0.3.6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:b1f9e9a7ba2fa58bce55fcca784d5b3e159712962aaee2156f6317c5993f4277"},
{file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"},
]
@ -4382,7 +4378,6 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@ -4675,23 +4670,6 @@ dev = ["black", "pre-commit", "ruff"]
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
tests = ["doctest", "pytest", "pytest-mock"]
[[package]]
name = "langchainplus-sdk"
version = "0.0.20"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
category = "main"
optional = false
python-versions = ">=3.8.1,<4.0"
files = [
{file = "langchainplus_sdk-0.0.20-py3-none-any.whl", hash = "sha256:07a869d476755803aa04c4986ce78d00c2fe4ff584c0eaa57d7570c9664188db"},
{file = "langchainplus_sdk-0.0.20.tar.gz", hash = "sha256:3d300e2e3290f68cc9d842c059f9458deba60e776c9e790309688cad1bfbb219"},
]
[package.dependencies]
pydantic = ">=1,<2"
requests = ">=2,<3"
tenacity = ">=8.1.0,<9.0.0"
[[package]]
name = "langcodes"
version = "3.3.0"
@ -4727,6 +4705,22 @@ whylogs = ">=1.2.3,<2.0.0"
[package.extras]
all = ["datasets (>=2.12.0,<3.0.0)", "nltk (>=3.8.1,<4.0.0)", "openai (>=0.27.6,<0.28.0)", "sentence-transformers (>=2.2.2,<3.0.0)", "torch"]
[[package]]
name = "langsmith"
version = "0.0.5"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
category = "main"
optional = false
python-versions = ">=3.8.1,<4.0"
files = [
{file = "langsmith-0.0.5-py3-none-any.whl", hash = "sha256:c9ce19cf7a45d4b9ef74b3133ace4d0583bc992383296d03c05065e8f871e01f"},
{file = "langsmith-0.0.5.tar.gz", hash = "sha256:ffad2fc638cfee8c9d27c9eae2fa3c3f9ec423bf443b1dc44cc8184fa34cd6b2"},
]
[package.dependencies]
pydantic = ">=1,<2"
requests = ">=2,<3"
[[package]]
name = "lark"
version = "1.1.5"
@ -12719,4 +12713,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "a77a3b8ac071e8ae9cd4004e577dbe4fd39552a69adb3277b06ab91f3fd0c77b"
content-hash = "f8f94ad19dd8f96637f6ffe64401b780ea9e7985543a7c9da31c41c55e94ab0f"

@ -108,7 +108,6 @@ pyspark = {version = "^3.4.0", optional = true}
clarifai = {version = ">=9.1.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
langchainplus-sdk = "^0.0.20"
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
esprima = {version = "^4.0.1", optional = true}
@ -119,6 +118,7 @@ cassio = {version = "^0.0.7", optional = true}
rdflib = {version = "^6.3.2", optional = true}
sympy = {version = "^1.12", optional = true}
rapidfuzz = {version = "^3.1.1", optional = true}
langsmith = "^0.0.5"
[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"

@ -1,81 +0,0 @@
import sys
from typing import Iterator
from uuid import uuid4
import pytest
from langchainplus_sdk import LangChainPlusClient as Client
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.client.runner_utils import run_on_dataset
from langchain.evaluation import EvaluatorType
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.llms.openai import OpenAI
@pytest.fixture(
scope="module",
)
def dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
[
{"question": "5", "answer": 5.0},
{"question": "5 + 3", "answer": 8.0},
{"question": "2^3.171", "answer": 9.006708689094099},
{"question": " 2 ^3.171 ", "answer": 9.006708689094099},
]
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["question"],
output_keys=["answer"],
description="Integration test dataset",
)
yield _dataset_name
def test_chat_model(dataset_name: str) -> None:
llm = ChatOpenAI(temperature=0)
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
llm,
run_evaluators=evaluators,
)
print("CHAT", results, file=sys.stderr)
def test_llm(dataset_name: str) -> None:
llm = OpenAI(temperature=0)
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
llm,
run_evaluators=evaluators,
)
print("LLM", results, file=sys.stderr)
def test_chain(dataset_name: str) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
lambda: chain,
run_evaluators=evaluators,
)
print("CHAIN", results, file=sys.stderr)

@ -0,0 +1,429 @@
from typing import Iterator, List
from uuid import uuid4
import pytest
from langsmith import Client as Client
from langsmith.schemas import DataType
from langchain.callbacks.tracers.evaluation import wait_for_all_evaluators
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType
from langchain.llms.openai import OpenAI
from langchain.schema.messages import BaseMessage, HumanMessage
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.smith.evaluation import InputFormatError
def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
# Assert that all runs completed, all feedback completed, and that the
# chain or llm passes for the feedback provided.
runs = list(client.list_runs(project_name=_project_name, execution_order=1))
assert len(runs) == 4
wait_for_all_evaluators()
feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
assert len(feedback) == 8
assert all([f.score == 1 for f in feedback])
@pytest.fixture
def eval_project_name() -> str:
return f"lcp integration tests - {str(uuid4())[-8:]}"
@pytest.fixture(scope="module")
def client() -> Client:
return Client()
@pytest.fixture(
scope="module",
)
def kv_dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"some_input": [
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
],
"other_input": [
"a",
"b",
"c",
"d",
],
"some_output": ["Sacramento", "Carson City", "Salem", "Olympia"],
"other_output": ["e", "f", "g", "h"],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp kv dataset integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["some_input", "other_input"],
output_keys=["some_output", "other_output"],
description="Integration test dataset",
)
yield _dataset_name
def test_chat_model(
kv_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
)
with pytest.raises(
InputFormatError, match="Example inputs do not match language model"
):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
def input_mapper(d: dict) -> List[BaseMessage]:
return [HumanMessage(content=d["some_input"])]
run_on_dataset(
client,
kv_dataset_name,
llm,
evaluation=eval_config,
input_mapper=input_mapper,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
)
with pytest.raises(
InputFormatError, match="Example inputs do not match language model"
):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
def input_mapper(d: dict) -> str:
return d["some_input"]
run_on_dataset(
client,
kv_dataset_name,
llm,
evaluation=eval_config,
input_mapper=input_mapper,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
)
with pytest.raises(
InputFormatError, match="Example inputs do not match chain input keys"
):
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
def input_mapper(d: dict) -> dict:
return {"input": d["some_input"]}
with pytest.raises(
InputFormatError,
match=" match the chain's expected input keys.",
):
run_on_dataset(
client,
kv_dataset_name,
lambda: chain,
evaluation=eval_config,
input_mapper=input_mapper,
)
def right_input_mapper(d: dict) -> dict:
return {"question": d["some_input"]}
run_on_dataset(
client,
kv_dataset_name,
lambda: chain,
evaluation=eval_config,
input_mapper=right_input_mapper,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
### Testing Chat Datasets
@pytest.fixture(
scope="module",
)
def chat_dataset_name() -> Iterator[str]:
def _create_message(txt: str, role: str = "human") -> List[dict]:
return [{"type": role, "data": {"content": txt}}]
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"input": [
_create_message(txt)
for txt in (
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
)
],
"output": [
_create_message(txt, role="ai")[0]
for txt in ("Sacramento", "Carson City", "Salem", "Olympia")
],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp chat dataset integration tests - {uid}"
ds = client.create_dataset(
_dataset_name, description="Integration test dataset", data_type=DataType.chat
)
for row in df.itertuples():
client.create_example(
dataset_id=ds.id,
inputs={"input": row.input},
outputs={"output": row.output},
)
yield _dataset_name
def test_chat_model_on_chat_dataset(
chat_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
chat_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm_on_chat_dataset(
chat_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
chat_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain_on_chat_dataset(chat_dataset_name: str, client: Client) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(
ValueError, match="Cannot evaluate a chain on dataset with data_type=chat"
):
run_on_dataset(
client,
chat_dataset_name,
lambda: chain,
evaluation=eval_config,
)
@pytest.fixture(
scope="module",
)
def llm_dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"input": [
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
],
"output": ["Sacramento", "Carson City", "Salem", "Olympia"],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp llm dataset integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["input"],
output_keys=["output"],
description="Integration test dataset",
data_type=DataType.llm,
)
yield _dataset_name
def test_chat_model_on_llm_dataset(
llm_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
llm_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm_on_llm_dataset(
llm_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
llm_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain_on_llm_dataset(llm_dataset_name: str, client: Client) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(
ValueError, match="Cannot evaluate a chain on dataset with data_type=llm"
):
run_on_dataset(
client,
llm_dataset_name,
lambda: chain,
evaluation=eval_config,
)
@pytest.fixture(
scope="module",
)
def kv_singleio_dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"the wackiest input": [
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
],
"unthinkable output": ["Sacramento", "Carson City", "Salem", "Olympia"],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp singleio kv dataset integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["the wackiest input"],
output_keys=["unthinkable output"],
description="Integration test dataset",
)
yield _dataset_name
def test_chat_model_on_kv_singleio_dataset(
kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm_on_kv_singleio_dataset(
kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain_on_kv_singleio_dataset(
kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
lambda: chain,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
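
Distilled from the integration tests above, the core of the updated `run_on_dataset()` call pattern looks roughly as follows; the dataset and project names are hypothetical placeholders.

from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig, run_on_dataset

client = Client()
eval_config = RunEvalConfig(
    evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
    reference_key="some_output",  # needed when a key-value dataset has several output keys
)
run_on_dataset(
    client,
    "my-dataset",                 # hypothetical dataset name
    ChatOpenAI(temperature=0),
    evaluation=eval_config,
    project_name="my-eval-project",
    tags=["example"],
)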

@ -3,7 +3,7 @@ import unittest.mock
from typing import Any
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient
from langsmith import Client
from langchain.callbacks.tracers.langchain import LangChainTracer
from langchain.callbacks.tracers.schemas import Run
@ -14,38 +14,36 @@ def test_example_id_assignment_threadsafe() -> None:
"""Test that example assigned at callback start/end is honored."""
example_ids = {}
def mock_create_run(self: Any, **kwargs: Any) -> Any:
def mock_create_run(**kwargs: Any) -> Any:
example_ids[kwargs.get("id")] = kwargs.get("reference_example_id")
return unittest.mock.MagicMock()
client = unittest.mock.MagicMock(spec=Client)
client.create_run = mock_create_run
tracer = LangChainTracer(client=client)
old_persist_run_single = tracer._persist_run_single
def new_persist_run_single(run: Run) -> None:
time.sleep(0.01)
old_persist_run_single(run)
with unittest.mock.patch.object(
LangChainPlusClient, "create_run", new=mock_create_run
tracer, "_persist_run_single", new=new_persist_run_single
):
client = LangChainPlusClient()
tracer = LangChainTracer(client=client)
old_persist_run_single = tracer._persist_run_single
def new_persist_run_single(run: Run) -> None:
time.sleep(0.01)
old_persist_run_single(run)
with unittest.mock.patch.object(
tracer, "_persist_run_single", new=new_persist_run_single
):
run_id_1 = UUID("9d878ab3-e5ca-4218-aef6-44cbdc90160a")
run_id_2 = UUID("f1f9fa53-8b2f-4742-bdbc-38215f7bd1e1")
example_id_1 = UUID("57e42c57-8c79-4d9f-8765-bf6cd3a98055")
tracer.example_id = example_id_1
tracer.on_llm_start({"name": "example_1"}, ["foo"], run_id=run_id_1)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_1)
example_id_2 = UUID("4f31216e-7c26-4027-a5fd-0bbf9ace17dc")
tracer.example_id = example_id_2
tracer.on_llm_start({"name": "example_2"}, ["foo"], run_id=run_id_2)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_2)
tracer.example_id = None
expected_example_ids = {
run_id_1: example_id_1,
run_id_2: example_id_2,
}
tracer.wait_for_futures()
assert example_ids == expected_example_ids
run_id_1 = UUID("9d878ab3-e5ca-4218-aef6-44cbdc90160a")
run_id_2 = UUID("f1f9fa53-8b2f-4742-bdbc-38215f7bd1e1")
example_id_1 = UUID("57e42c57-8c79-4d9f-8765-bf6cd3a98055")
tracer.example_id = example_id_1
tracer.on_llm_start({"name": "example_1"}, ["foo"], run_id=run_id_1)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_1)
example_id_2 = UUID("4f31216e-7c26-4027-a5fd-0bbf9ace17dc")
tracer.example_id = example_id_2
tracer.on_llm_start({"name": "example_2"}, ["foo"], run_id=run_id_2)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_2)
tracer.example_id = None
expected_example_ids = {
run_id_1: example_id_1,
run_id_2: example_id_2,
}
tracer.wait_for_futures()
assert example_ids == expected_example_ids

@ -1,9 +1,14 @@
"""Test the comparison chains."""
import re
import pytest
from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain
from langchain.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
from tests.unit_tests.llms.fake_llm import FakeLLM
@ -32,7 +37,7 @@ def test_pairwise_string_comparison_chain() -> None:
)
assert res["value"] == "A"
assert res["score"] == 1
with pytest.warns(UserWarning, match=chain._skip_reference_warning):
with pytest.warns(UserWarning, match=re.escape(chain._skip_reference_warning)):
res = chain.evaluate_string_pairs(
prediction="I like pie.",
prediction_b="I hate pie.",
@ -43,7 +48,7 @@ def test_pairwise_string_comparison_chain() -> None:
assert res["score"] == 0
def test_pairwise_string_comparison_chain_missing_ref() -> None:
def test_labeled_pairwise_string_comparison_chain_missing_ref() -> None:
llm = FakeLLM(
queries={
"a": "The values are the same.\n[[C]]",
@ -52,7 +57,7 @@ def test_pairwise_string_comparison_chain_missing_ref() -> None:
},
sequential_responses=True,
)
chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)
chain = LabeledPairwiseStringEvalChain.from_llm(llm=llm)
with pytest.raises(ValueError):
chain.evaluate_string_pairs(
prediction="I like pie.",

@ -5,18 +5,21 @@ import pytest
from langchain.evaluation.criteria.eval_chain import (
_SUPPORTED_CRITERIA,
Criteria,
CriteriaEvalChain,
LabeledCriteriaEvalChain,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_llm import FakeLLM
def test_resolve_criteria() -> None:
# type: ignore
assert CriteriaEvalChain.resolve_criteria("helpfulness") == {
"helpfulness": _SUPPORTED_CRITERIA["helpfulness"]
"helpfulness": _SUPPORTED_CRITERIA[Criteria.HELPFULNESS]
}
assert CriteriaEvalChain.resolve_criteria(["correctness"]) == {
"correctness": _SUPPORTED_CRITERIA["correctness"]
assert CriteriaEvalChain.resolve_criteria("correctness") == {
"correctness": _SUPPORTED_CRITERIA[Criteria.CORRECTNESS]
}
@ -35,12 +38,11 @@ def test_criteria_eval_chain() -> None:
def test_criteria_eval_chain_missing_reference() -> None:
chain = CriteriaEvalChain.from_llm(
chain = LabeledCriteriaEvalChain.from_llm(
llm=FakeLLM(
queries={"text": "The meaning of life\nY"},
sequential_responses=True,
),
requires_reference=True,
criteria={"my criterion": "my criterion description"},
)
with pytest.raises(ValueError):

@ -25,8 +25,8 @@ def test_eval_chain() -> None:
outputs = fake_qa_eval_chain.evaluate([example, example], [prediction, prediction])
assert outputs[0] == outputs[1]
assert "text" in outputs[0]
assert outputs[0]["text"] == "foo"
assert fake_qa_eval_chain.output_key in outputs[0]
assert outputs[0][fake_qa_eval_chain.output_key] == "foo"
@pytest.mark.skipif(

@ -1,54 +0,0 @@
"""Test run evaluator implementations basic functionality."""
from uuid import UUID
import pytest
from langchainplus_sdk.schemas import Example, Run
from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator
from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.fixture
def run() -> Run:
return Run(
id=UUID("f77cd087-48f7-4c62-9e0e-297842202107"),
name="My Run",
inputs={"input": "What is the answer to life, the universe, and everything?"},
outputs={"output": "The answer is 42."},
start_time="2021-07-20T15:00:00.000000+00:00",
end_time="2021-07-20T15:00:00.000000+00:00",
run_type="chain",
execution_order=1,
)
@pytest.fixture
def example() -> Example:
return Example(
id=UUID("f77cd087-48f7-4c62-9e0e-297842202106"),
dataset_id=UUID("f77cd087-48f7-4c62-9e0e-297842202105"),
inputs={"input": "What is the answer to life, the universe, and everything?"},
outputs={"output": "The answer is 42."},
created_at="2021-07-20T15:00:00.000000+00:00",
)
def test_get_qa_evaluator(run: Run, example: Example) -> None:
"""Test get_qa_evaluator."""
eval_llm = FakeLLM(
queries={"a": "This checks out.\nCORRECT"}, sequential_responses=True
)
qa_evaluator = get_qa_evaluator(eval_llm)
res = qa_evaluator.evaluate_run(run, example)
assert res.value == "CORRECT"
assert res.score == 1
def test_get_criteria_evaluator(run: Run, example: Example) -> None:
"""Get a criteria evaluator."""
eval_llm = FakeLLM(queries={"a": "This checks out.\nY"}, sequential_responses=True)
criteria_evaluator = get_criteria_evaluator(eval_llm, criteria="conciseness")
res = criteria_evaluator.evaluate_run(run, example)
assert res.value == "Y"
assert res.score == 1

@ -1,114 +0,0 @@
"""Test the loading function for evaluators."""
from unittest.mock import MagicMock
import pytest
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.chains.test_base import FakeChain
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "generations"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, **kwargs
)
callback = RunCollectorCallbackHandler()
model.predict("Foo input", callbacks=[callback])
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"generations": "Foo output"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Foo input"
assert result["prediction"] == "Foo output"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Foo output"
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
model = FakeChatModel()
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "generations"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, **kwargs
)
callback = RunCollectorCallbackHandler()
model.predict("Foo input", callbacks=[callback])
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"generations": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Human: Foo input"
assert result["prediction"] == "AI: fake response"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
model = FakeChain(
the_input_keys=["an_input", "another_input"],
)
fake_llm = FakeChatModel()
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
# No input key
with pytest.raises(ValueError, match="multiple input keys"):
StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
with pytest.raises(ValueError, match="does not have specified"):
StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, input_key="some_input"
)
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "label_column"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, input_key="an_input", **kwargs
)
callback = RunCollectorCallbackHandler()
model(
{"an_input": "Foo input", "another_input": "Another fake response"},
callbacks=[callback],
)
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"label_column": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Foo input"
assert result["prediction"] == "baz"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"

@ -4,7 +4,7 @@ import pytest
from langchain.embeddings.fake import FakeEmbeddings
from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@ -25,14 +25,25 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
)
def test_criteria_eval_chain_requires_reference() -> None:
@pytest.mark.parametrize(
"evaluator_type",
[
EvaluatorType.LABELED_CRITERIA,
EvaluatorType.LABELED_PAIRWISE_STRING,
EvaluatorType.QA,
EvaluatorType.CONTEXT_QA,
EvaluatorType.COT_QA,
],
)
def test_eval_chain_requires_references(evaluator_type: EvaluatorType) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators(
[EvaluatorType.CRITERIA], llm=fake_llm, requires_reference=True
[evaluator_type],
llm=fake_llm,
)[0]
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
if not isinstance(evaluator, (StringEvaluator, PairwiseStringEvaluator)):
raise ValueError("Evaluator is not a [pairwise]string evaluator")
assert evaluator.requires_reference

@ -1,25 +1,26 @@
"""Test the LangChain+ client."""
"""Test the LangSmith evaluation helpers."""
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, Iterator, List, Optional, Union
from unittest import mock
import pytest
from langchainplus_sdk.client import LangChainPlusClient
from langchainplus_sdk.schemas import Dataset, Example
from langsmith.client import Client
from langsmith.schemas import Dataset, Example
from langchain.chains.base import Chain
from langchain.chains.transform import TransformChain
from langchain.client.runner_utils import (
from langchain.schema.language_model import BaseLanguageModel
from langchain.smith.evaluation.runner_utils import (
InputFormatError,
_get_messages,
_get_prompts,
_get_prompt,
_run_llm,
_run_llm_or_chain,
_validate_example_inputs_for_chain,
_validate_example_inputs_for_language_model,
arun_on_dataset,
run_llm,
run_llm_or_chain,
)
from langchain.schema import LLMResult
from langchain.schema.language_model import BaseLanguageModel
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@ -33,19 +34,28 @@ _VALID_MESSAGES = [
{"messages": [_EXAMPLE_MESSAGE], "other_key": "value"},
{"messages": [], "other_key": "value"},
{
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE], [_EXAMPLE_MESSAGE]],
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]],
"other_key": "value",
},
{"any_key": [_EXAMPLE_MESSAGE]},
{"any_key": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE], [_EXAMPLE_MESSAGE]]},
{"any_key": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]]},
]
_VALID_PROMPTS = [
{"prompts": ["foo", "bar", "baz"], "other_key": "value"},
{"prompts": ["foo"], "other_key": "value"},
{"prompt": "foo", "other_key": ["bar", "baz"]},
{"some_key": "foo"},
{"some_key": ["foo", "bar"]},
{"some_key": ["foo"]},
]
_INVALID_PROMPTS = (
[
{"prompts": "foo"},
{"prompt": ["foo"]},
{"some_key": 3},
{"some_key": "foo", "other_key": "bar"},
],
)
@pytest.mark.parametrize(
"inputs",
@ -61,21 +71,93 @@ def test__get_messages_valid(inputs: Dict[str, Any]) -> None:
_VALID_PROMPTS,
)
def test__get_prompts_valid(inputs: Dict[str, Any]) -> None:
_get_prompts(inputs)
_get_prompt(inputs)
@pytest.mark.parametrize(
"inputs",
[
{"prompts": "foo"},
{"prompt": ["foo"]},
{"some_key": 3},
{"some_key": "foo", "other_key": "bar"},
],
_VALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model(inputs: Dict[str, Any]) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
_validate_example_inputs_for_language_model(mock_, None)
@pytest.mark.parametrize(
"inputs",
_INVALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model_invalid(
inputs: Dict[str, Any]
) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
with pytest.raises(InputFormatError):
_validate_example_inputs_for_language_model(mock_, None)
def test__validate_example_inputs_for_chain_single_input() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_input_mapper() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["not foo", "not baz", "not qux"]
def wrong_output_format(inputs: dict) -> str:
assert "foo" in inputs
assert "baz" in inputs
return "hehe"
with pytest.raises(InputFormatError, match="must be a dictionary"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_format)
def wrong_output_keys(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": "foo", "not baz": "baz"}
with pytest.raises(InputFormatError, match="keys that match"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_keys)
def input_mapper(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": inputs["foo"], "not baz": inputs["baz"], "not qux": "qux"}
_validate_example_inputs_for_chain(mock_, chain, input_mapper)
def test__validate_example_inputs_for_chain_multi_io() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["foo", "baz"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_single_input_multi_expect() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo", "oh here is another"]
with pytest.raises(
InputFormatError, match="Example inputs do not match chain input keys."
):
_validate_example_inputs_for_chain(mock_, chain, None)
@pytest.mark.parametrize("inputs", _INVALID_PROMPTS)
def test__get_prompts_invalid(inputs: Dict[str, Any]) -> None:
with pytest.raises(InputFormatError):
_get_prompts(inputs)
_get_prompt(inputs)
def test_run_llm_or_chain_with_input_mapper() -> None:
@ -101,12 +183,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
assert "the wrong input" in inputs
return {"the right input": inputs["the wrong input"]}
result = run_llm_or_chain(
result = _run_llm_or_chain(
example, lambda: mock_chain, n_repetitions=1, input_mapper=input_mapper
)
assert len(result) == 1
assert result[0] == {"output": "2", "the right input": "1"}
bad_result = run_llm_or_chain(
bad_result = _run_llm_or_chain(
example,
lambda: mock_chain,
n_repetitions=1,
@ -115,18 +197,18 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
assert "Error" in bad_result[0]
# Try with LLM
def llm_input_mapper(inputs: dict) -> List[str]:
def llm_input_mapper(inputs: dict) -> str:
assert "the wrong input" in inputs
return ["the right input"]
return "the right input"
mock_llm = FakeLLM(queries={"the right input": "somenumber"})
result = run_llm_or_chain(
result = _run_llm_or_chain(
example, mock_llm, n_repetitions=1, input_mapper=llm_input_mapper
)
assert len(result) == 1
llm_result = result[0]
assert isinstance(llm_result, LLMResult)
assert llm_result.generations[0][0].text == "somenumber"
assert isinstance(llm_result, str)
assert llm_result == "somenumber"
@pytest.mark.parametrize(
@ -149,13 +231,13 @@ def test__get_messages_invalid(inputs: Dict[str, Any]) -> None:
@pytest.mark.parametrize("inputs", _VALID_PROMPTS + _VALID_MESSAGES)
def test_run_llm_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeLLM()
run_llm(llm, inputs, mock.MagicMock())
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.parametrize("inputs", _VALID_MESSAGES + _VALID_PROMPTS)
def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeChatModel()
run_llm(llm, inputs, mock.MagicMock())
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.asyncio
@ -216,8 +298,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
def mock_read_dataset(*args: Any, **kwargs: Any) -> Dataset:
return dataset
def mock_list_examples(*args: Any, **kwargs: Any) -> List[Example]:
return examples
def mock_list_examples(*args: Any, **kwargs: Any) -> Iterator[Example]:
return iter(examples)
async def mock_arun_chain(
example: Example,
@ -235,16 +317,16 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
pass
with mock.patch.object(
LangChainPlusClient, "read_dataset", new=mock_read_dataset
), mock.patch.object(
LangChainPlusClient, "list_examples", new=mock_list_examples
), mock.patch(
"langchain.client.runner_utils._arun_llm_or_chain", new=mock_arun_chain
Client, "read_dataset", new=mock_read_dataset
), mock.patch.object(Client, "list_examples", new=mock_list_examples), mock.patch(
"langchain.smith.evaluation.runner_utils._arun_llm_or_chain",
new=mock_arun_chain,
), mock.patch.object(
LangChainPlusClient, "create_project", new=mock_create_project
Client, "create_project", new=mock_create_project
):
client = LangChainPlusClient(api_url="http://localhost:1984", api_key="123")
client = Client(api_url="http://localhost:1984", api_key="123")
chain = mock.MagicMock()
chain.input_keys = ["foothing"]
num_repetitions = 3
results = await arun_on_dataset(
dataset_name="test",

@ -0,0 +1,347 @@
"""Test the LangSmith evaluation helpers."""
import uuid
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional, Union
from unittest import mock
import pytest
from langsmith.client import Client
from langsmith.schemas import Dataset, Example
from langchain.chains.base import Chain
from langchain.chains.transform import TransformChain
from langchain.schema.language_model import BaseLanguageModel
from langchain.smith.evaluation.runner_utils import (
InputFormatError,
_get_messages,
_get_prompt,
_run_llm,
_run_llm_or_chain,
_validate_example_inputs_for_chain,
_validate_example_inputs_for_language_model,
arun_on_dataset,
)
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
_CREATED_AT = datetime(2015, 1, 1, 0, 0, 0)
_TENANT_ID = "7a3d2b56-cd5b-44e5-846f-7eb6e8144ce4"
_EXAMPLE_MESSAGE = {
"data": {"content": "Foo", "example": False, "additional_kwargs": {}},
"type": "human",
}
_VALID_MESSAGES = [
{"messages": [_EXAMPLE_MESSAGE], "other_key": "value"},
{"messages": [], "other_key": "value"},
{
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]],
"other_key": "value",
},
{"any_key": [_EXAMPLE_MESSAGE]},
{"any_key": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]]},
]
_VALID_PROMPTS = [
{"prompts": ["foo"], "other_key": "value"},
{"prompt": "foo", "other_key": ["bar", "baz"]},
{"some_key": "foo"},
{"some_key": ["foo"]},
]
_INVALID_PROMPTS = (
[
{"prompts": "foo"},
{"prompt": ["foo"]},
{"some_key": 3},
{"some_key": "foo", "other_key": "bar"},
],
)
@pytest.mark.parametrize(
"inputs",
_VALID_MESSAGES,
)
def test__get_messages_valid(inputs: Dict[str, Any]) -> None:
{"messages": []}
_get_messages(inputs)
@pytest.mark.parametrize(
"inputs",
_VALID_PROMPTS,
)
def test__get_prompts_valid(inputs: Dict[str, Any]) -> None:
_get_prompt(inputs)
@pytest.mark.parametrize(
"inputs",
_VALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model(inputs: Dict[str, Any]) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
_validate_example_inputs_for_language_model(mock_, None)
@pytest.mark.parametrize(
"inputs",
_INVALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model_invalid(
inputs: Dict[str, Any]
) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
with pytest.raises(InputFormatError):
_validate_example_inputs_for_language_model(mock_, None)
def test__validate_example_inputs_for_chain_single_input() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_input_mapper() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["not foo", "not baz", "not qux"]
def wrong_output_format(inputs: dict) -> str:
assert "foo" in inputs
assert "baz" in inputs
return "hehe"
with pytest.raises(InputFormatError, match="must be a dictionary"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_format)
def wrong_output_keys(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": "foo", "not baz": "baz"}
with pytest.raises(InputFormatError, match="keys that match"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_keys)
def input_mapper(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": inputs["foo"], "not baz": inputs["baz"], "not qux": "qux"}
_validate_example_inputs_for_chain(mock_, chain, input_mapper)
def test__validate_example_inputs_for_chain_multi_io() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["foo", "baz"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_single_input_multi_expect() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo", "oh here is another"]
with pytest.raises(
InputFormatError, match="Example inputs do not match chain input keys."
):
_validate_example_inputs_for_chain(mock_, chain, None)
@pytest.mark.parametrize("inputs", _INVALID_PROMPTS)
def test__get_prompts_invalid(inputs: Dict[str, Any]) -> None:
with pytest.raises(InputFormatError):
_get_prompt(inputs)
def test_run_llm_or_chain_with_input_mapper() -> None:
example = Example(
id=uuid.uuid4(),
created_at=_CREATED_AT,
inputs={"the wrong input": "1", "another key": "2"},
outputs={"output": "2"},
dataset_id=str(uuid.uuid4()),
)
def run_val(inputs: dict) -> dict:
assert "the right input" in inputs
return {"output": "2"}
mock_chain = TransformChain(
input_variables=["the right input"],
output_variables=["output"],
transform=run_val,
)
def input_mapper(inputs: dict) -> dict:
assert "the wrong input" in inputs
return {"the right input": inputs["the wrong input"]}
result = _run_llm_or_chain(
example, lambda: mock_chain, n_repetitions=1, input_mapper=input_mapper
)
assert len(result) == 1
assert result[0] == {"output": "2", "the right input": "1"}
bad_result = _run_llm_or_chain(
example,
lambda: mock_chain,
n_repetitions=1,
)
assert len(bad_result) == 1
assert "Error" in bad_result[0]
# Try with LLM
def llm_input_mapper(inputs: dict) -> str:
assert "the wrong input" in inputs
return "the right input"
mock_llm = FakeLLM(queries={"the right input": "somenumber"})
result = _run_llm_or_chain(
example, mock_llm, n_repetitions=1, input_mapper=llm_input_mapper
)
assert len(result) == 1
llm_result = result[0]
assert isinstance(llm_result, str)
assert llm_result == "somenumber"
@pytest.mark.parametrize(
"inputs",
[
{"one_key": [_EXAMPLE_MESSAGE], "other_key": "value"},
{
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE], _EXAMPLE_MESSAGE],
"other_key": "value",
},
{"prompts": "foo"},
{},
],
)
def test__get_messages_invalid(inputs: Dict[str, Any]) -> None:
with pytest.raises(InputFormatError):
_get_messages(inputs)
@pytest.mark.parametrize("inputs", _VALID_PROMPTS + _VALID_MESSAGES)
def test_run_llm_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeLLM()
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.parametrize("inputs", _VALID_MESSAGES + _VALID_PROMPTS)
def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeChatModel()
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.asyncio
async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = Dataset(
id=uuid.uuid4(),
name="test",
description="Test dataset",
owner_id="owner",
created_at=_CREATED_AT,
tenant_id=_TENANT_ID,
)
uuids = [
"0c193153-2309-4704-9a47-17aee4fb25c8",
"0d11b5fd-8e66-4485-b696-4b55155c0c05",
"90d696f0-f10d-4fd0-b88b-bfee6df08b84",
"4ce2c6d8-5124-4c0c-8292-db7bdebcf167",
"7b5a524c-80fa-4960-888e-7d380f9a11ee",
]
examples = [
Example(
id=uuids[0],
created_at=_CREATED_AT,
inputs={"input": "1"},
outputs={"output": "2"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[1],
created_at=_CREATED_AT,
inputs={"input": "3"},
outputs={"output": "4"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[2],
created_at=_CREATED_AT,
inputs={"input": "5"},
outputs={"output": "6"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[3],
created_at=_CREATED_AT,
inputs={"input": "7"},
outputs={"output": "8"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[4],
created_at=_CREATED_AT,
inputs={"input": "9"},
outputs={"output": "10"},
dataset_id=str(uuid.uuid4()),
),
]
def mock_read_dataset(*args: Any, **kwargs: Any) -> Dataset:
return dataset
def mock_list_examples(*args: Any, **kwargs: Any) -> Iterator[Example]:
return iter(examples)
async def mock_arun_chain(
example: Example,
llm_or_chain: Union[BaseLanguageModel, Chain],
n_repetitions: int,
tags: Optional[List[str]] = None,
callbacks: Optional[Any] = None,
**kwargs: Any,
) -> List[Dict[str, Any]]:
return [
{"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
]
def mock_create_project(*args: Any, **kwargs: Any) -> None:
pass
with mock.patch.object(
Client, "read_dataset", new=mock_read_dataset
), mock.patch.object(Client, "list_examples", new=mock_list_examples), mock.patch(
"langchain.smith.evaluation.runner_utils._arun_llm_or_chain",
new=mock_arun_chain,
), mock.patch.object(
Client, "create_project", new=mock_create_project
):
client = Client(api_url="http://localhost:1984", api_key="123")
chain = mock.MagicMock()
chain.input_keys = ["foothing"]
num_repetitions = 3
results = await arun_on_dataset(
dataset_name="test",
llm_or_chain_factory=lambda: chain,
concurrency_level=2,
project_name="test_project",
num_repetitions=num_repetitions,
client=client,
)
expected = {
uuid_: [
{"result": f"Result for example {uuid.UUID(uuid_)}"}
for _ in range(num_repetitions)
]
for uuid_ in uuids
}
assert results["results"] == expected

@ -38,7 +38,7 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
"aiohttp",
"async-timeout",
"dataclasses-json",
"langchainplus-sdk",
"langsmith",
"numexpr",
"numpy",
"openapi-schema-pydantic",
