[Breaking] Update Evaluation Functionality (#7388)

- Migrate from deprecated langchainplus_sdk to `langsmith` package
- Update the `run_on_dataset()` API to use an eval config
- Update a number of evaluators, as well as the loading logic
- Update docstrings / reference docs
- Update tracer to share single HTTP session
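For reference, a minimal sketch of the new eval-config pattern (it mirrors the updated walkthrough notebook in this commit; `client`, `dataset_name`, and `agent_factory` are placeholders taken from that notebook):

```python
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig, arun_on_dataset

# Evaluators are now configured declaratively and passed to
# run_on_dataset / arun_on_dataset via the `evaluation` argument.
evaluation_config = RunEvalConfig(
    evaluators=[
        EvaluatorType.QA,  # correctness against a reference answer
        RunEvalConfig.Criteria("helpfulness"),
    ]
)

# chain_results = await arun_on_dataset(
#     client=client,
#     dataset_name=dataset_name,
#     llm_or_chain_factory=agent_factory,
#     evaluation=evaluation_config,
# )
```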
William FH 11 months ago committed by GitHub
parent 224199083b
commit a673a51efa

@ -20,7 +20,9 @@ def load_members() -> dict:
cls = re.findall(r"^class ([^_].*)\(", line)
members[top_level]["classes"].extend([module + "." + c for c in cls])
func = re.findall(r"^def ([^_].*)\(", line)
members[top_level]["functions"].extend([module + "." + f for f in func])
afunc = re.findall(r"^async def ([^_].*)\(", line)
func_strings = [module + "." + f for f in func + afunc]
members[top_level]["functions"].extend(func_strings)
return members

@ -12,7 +12,7 @@
"The `CriteriaEvalChain` is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can\n",
"describe those criteria in regular language. In this example, you will use the `CriteriaEvalChain` to check whether an output is concise.\n",
"\n",
"### Step 1: Create the Eval Chain\n",
"### Step 1: Load Eval Chain\n",
"\n",
"First, create the evaluation chain to predict whether outputs are \"concise\"."
]
@ -27,11 +27,15 @@
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.evaluation.criteria import CriteriaEvalChain\n",
"from langchain.evaluation import load_evaluator, EvaluatorType\n",
"\n",
"llm = ChatOpenAI(temperature=0)\n",
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"criterion = \"conciseness\"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criterion)"
"eval_chain = load_evaluator(EvaluatorType.CRITERIA, llm=eval_llm, criteria=criterion)\n",
"\n",
"# Equivalent to:\n",
"# from langchain.evaluation import CriteriaEvalChain\n",
"# CriteriaEvalChain.from_llm(llm=eval_llm, criteria=criterion)"
]
},
{
@ -80,7 +84,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': '1. Conciseness: The submission is concise and to the point. It directly answers the question without any unnecessary information. Therefore, the submission meets the criterion of conciseness.\\n\\nY', 'value': 'Y', 'score': 1}\n"
"{'reasoning': 'The criterion for this task is conciseness. The submission should be concise and to the point.\\n\\nLooking at the submission, it provides a detailed explanation of the origin of the term \"synecdoche\". It explains the Greek roots of the word and how it entered the English language. \\n\\nWhile the explanation is detailed, it is also concise. It doesn\\'t include unnecessary information or go off on tangents. It sticks to the point, which is explaining the origin of the term.\\n\\nTherefore, the submission meets the criterion of conciseness.\\n\\nY', 'value': 'Y', 'score': 1}\n"
]
}
],
@ -89,40 +93,6 @@
"print(eval_result)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8c4ec9dd-6557-4f23-8480-c822eb6ec552",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['conciseness',\n",
" 'relevance',\n",
" 'correctness',\n",
" 'coherence',\n",
" 'harmfulness',\n",
" 'maliciousness',\n",
" 'helpfulness',\n",
" 'controversiality',\n",
" 'mysogyny',\n",
" 'criminality',\n",
" 'insensitive']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
"CriteriaEvalChain.get_supported_default_criteria()"
]
},
{
"cell_type": "markdown",
"id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
@ -133,6 +103,24 @@
"Some criteria may be useful only when there are ground truth reference labels. You can pass these in as well."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0c41cd19",
"metadata": {},
"outputs": [],
"source": [
"eval_chain = load_evaluator(\n",
" EvaluatorType.LABELED_CRITERIA,\n",
" llm=eval_llm,\n",
" criteria=\"correctness\",\n",
")\n",
"\n",
"# Equivalent to\n",
"# from langchain.evaluation import LabeledCriteriaEvalChain\n",
"# LabeledCriteriaEvalChain.from_llm(llm=eval_llm, criteria=criterion)"
]
},
{
"cell_type": "code",
"execution_count": 5,
@ -145,65 +133,18 @@
"name": "stdout",
"output_type": "stream",
"text": [
"With ground truth: 1\n",
"Withoutg ground truth: 0\n"
"With ground truth: 1\n"
]
}
],
"source": [
"eval_chain = CriteriaEvalChain.from_llm(\n",
" llm=llm, criteria=\"correctness\", requires_reference=True\n",
")\n",
"\n",
"# We can even override the model's learned knowledge using ground truth labels\n",
"eval_result = eval_chain.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
")\n",
"print(f'With ground truth: {eval_result[\"score\"]}')\n",
"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\")\n",
"eval_result = eval_chain.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
")\n",
"print(f'Withoutg ground truth: {eval_result[\"score\"]}')"
]
},
{
"cell_type": "markdown",
"id": "2eb7dedb-913a-4d9e-b48a-9521425d1008",
"metadata": {
"tags": []
},
"source": [
"## Multiple Criteria\n",
"\n",
"To check whether an output complies with all of a list of default criteria, pass in a list! Be sure to only include criteria that are relevant to the provided information, and avoid mixing criteria that measure opposing things (e.g., harmfulness and helpfulness)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "50c067f7-bc6e-4d6c-ba34-97a72023be27",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'Conciseness:\\n- The submission is one sentence long, which is concise.\\n- The submission directly answers the question without any unnecessary information.\\nConclusion: The submission meets the conciseness criterion.\\n\\nCoherence:\\n- The submission is well-structured and organized.\\n- The submission provides the origin of the term synecdoche and explains the meaning of the Greek words it comes from.\\n- The submission is coherent and easy to understand.\\nConclusion: The submission meets the coherence criterion.', 'value': 'Final conclusion: Y', 'score': None}\n"
]
}
],
"source": [
"criteria = [\"conciseness\", \"coherence\"]\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(eval_result)"
"print(f'With ground truth: {eval_result[\"score\"]}')"
]
},
{
@ -220,7 +161,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "bafa0a11-2617-4663-84bf-24df7d0736be",
"metadata": {},
"outputs": [
@ -228,62 +169,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': '1. Criteria: numeric: Does the output contain numeric information?\\n- The submission does not contain any numeric information.\\n- Conclusion: The submission meets the criteria.', 'value': 'Answer: Y', 'score': None}\n"
"{'reasoning': 'The criterion is asking if the output contains numeric information. The submission does mention the \"late 16th century,\" which is a numeric information. Therefore, the submission meets the criterion.\\n\\nY', 'value': 'Y', 'score': 1}\n"
]
}
],
"source": [
"custom_criterion = {\"numeric\": \"Does the output contain numeric information?\"}\n",
"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criterion)\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA,\n",
" llm=eval_llm,\n",
" criteria=custom_criterion,\n",
")\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(eval_result)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6db12a16-0058-4a14-8064-8528540963d8",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Meets criteria: 1\n",
"Does not meet criteria: 0\n"
]
}
],
"source": [
"# You can specify multiple criteria in the dictionary. We recommend you keep the number criteria to a minimum, however for more reliable results.\n",
"\n",
"custom_criteria = {\n",
" \"complements-user\": \"Does the submission complements the question or the person writing the question in some way?\",\n",
" \"positive\": \"Does the submission maintain a positive sentiment throughout?\",\n",
" \"active voice\": \"Does the submission maintain an active voice throughout, avoiding state of being verbs?\",\n",
"}\n",
"\n",
"eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criteria)\n",
"\n",
"# Example that complies\n",
"query = \"What's the population of lagos?\"\n",
"eval_result = eval_chain.evaluate_strings(\n",
" prediction=\"I think that's a great question, you're really curious! About 30 million people live in Lagos, Nigeria, as of 2023.\",\n",
" input=query,\n",
")\n",
"print(\"Meets criteria: \", eval_result[\"score\"])\n",
"\n",
"# Example that does not comply\n",
"eval_result = eval_chain.evaluate_strings(\n",
" prediction=\"The population of Lagos, Nigeria, is about 30 million people.\",\n",
" input=query,\n",
")\n",
"print(\"Does not meet criteria: \", eval_result[\"score\"])"
]
},
{
"cell_type": "markdown",
"id": "07485cce-8d52-43a0-bdad-76ec7dacfb51",
@ -297,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"id": "99e3c242-5b12-4bd5-b487-64990a159655",
"metadata": {},
"outputs": [
@ -323,7 +224,7 @@
" ConstitutionalPrinciple(critique_request='Discuss the assistants last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistants last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]"
]
},
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -337,7 +238,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7",
"metadata": {
"tags": []
@ -346,25 +247,25 @@
{
"data": {
"text/plain": [
"{'reasoning': 'harmful1: The submission includes a derogatory and insulting term (\"lilly-livered nincompoop\") to describe Will, which could be considered harmful and disrespectful. Therefore, the submission meets this criterion.\\nharmful2: The submission does not contain any explicit encouragement of dangerous or illegal activity, nor does it contain any explicit racism or sexism. However, the use of the term \"nincompoop\" could be seen as belittling and harmful to Will\\'s self-esteem. Therefore, the submission meets this criterion as well.\\n\\nY',\n",
" 'value': 'Y',\n",
" 'score': 1}"
"{'reasoning': 'The criterion asks to identify if the assistant\\'s response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.\\n\\nThe assistant\\'s response is \"I say that man is a lilly-livered nincompoop\". This is a negative statement about a person named Will. \\n\\nThe term \"lilly-livered\" is an old-fashioned term meaning cowardly, and \"nincompoop\" is a term meaning a foolish or stupid person. \\n\\nWhile this is a negative and potentially insulting statement, it does not appear to be harmful in a physical sense, unethical, racist, sexist, toxic, dangerous, or illegal. \\n\\nIt could be argued that the statement is harmful in a psychological or emotional sense, as it is insulting. However, without further context, it\\'s hard to definitively say if this meets the criteria of being harmful.\\n\\nN',\n",
" 'value': 'N',\n",
" 'score': 0}"
]
},
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eval_chain = CriteriaEvalChain.from_llm(\n",
" llm=llm, criteria=[PRINCIPLES[\"harmful1\"], PRINCIPLES[\"harmful2\"]]\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA, llm=eval_llm, criteria=PRINCIPLES[\"harmful1\"]\n",
")\n",
"eval_result = eval_chain.evaluate_strings(\n",
" prediction=\"I say that man is a lilly-livered nincompoop\",\n",
" input=\"What do you think of Will?\",\n",
")\n",
"eval_result"
"print(eval_result)"
]
},
{
@ -378,14 +279,6 @@
"\n",
"Remember when selecting criteria to decide whether they ought to require ground truth labels or not. Things like \"correctness\" are best evaluated with ground truth or with extensive context. Also, remember to pick aligned principles for a given chain so that the classification makes sense."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "415eb393-c64f-41f1-98de-de99e8e3597e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -404,7 +297,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
"version": "3.11.2"
}
},
"nbformat": 4,

@ -7,15 +7,19 @@
"tags": []
},
"source": [
"# Debug, Evaluate, and Monitor LLMs with LangSmith\n",
"# LangSmith Walkthrough\n",
"\n",
"LangChain makes it easy to get started with Agents and other LLM applications. Even so, delivering a high-quality agent to production can be deceptively difficult. To aid the development process, we've designed tracing and callbacks at the core of LangChain. In this notebook, you will get started prototyping, testing, and monitoring an LLM agent.\n",
"LangChain makes it easy to prototype LLM applications and Agents. Even so, delivering a high-quality product to production can be deceptively difficult. You will likely have to heavily customize your prompts, chains, and other components to create a high-quality product.\n",
"\n",
"When might you want to use tracing? Some situations we've found it useful include:\n",
"- Quickly debugging a new chain, agent, or set of tools\n",
"- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n",
"- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar.\n",
"- Capturing production traces and using LangChain summarizers to analyze app usage"
"To aid the development process, we've designed tracing and callbacks at the core of LangChain. In this notebook, you will get started prototyping and testing an example LLM agent.\n",
"\n",
"When might this come in handy? You may find it useful when you want to:\n",
"\n",
"- Quickly debug a new chain, agent, or set of tools\n",
"- Visualize how components (chains, llms, retrievers, etc.) relate and are used\n",
"- Evaluate different prompts and LLMs for a single component\n",
"- Run a given chain several times over a dataset to ensure it consistently meets a quality bar.\n",
"- Capture usage traces and using LLMs or analytics pipelines to generate insights"
]
},
{
@ -25,17 +29,15 @@
"source": [
"## Prerequisites\n",
"\n",
"**Either [create a hosted LangSmith account](https://www.langchain.plus/) and connect with an API key OR\n",
"run the server locally.**\n",
"\n",
"**Run the [local tracing server](https://docs.smith.langchain.com/docs/additional-resources/local_installation) OR [create a hosted LangSmith account](https://smith.langchain.com/) and connect with an API key.**\n",
"\n",
"To run the local server, execute the following comand in your terminal:\n",
"```\n",
"pip install --upgrade langchain\n",
"langchain plus start\n",
"pip install --upgrade langsmith\n",
"langsmith start\n",
"```\n",
"\n",
"Now, let's get started by creating a client to connect to LangChain+."
"Now, let's get started debugging!"
]
},
{
@ -45,25 +47,58 @@
"tags": []
},
"source": [
"## Debug your Agent\n",
"## Debug your Chain \n",
"\n",
"First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n",
"You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable. This will automatically create a debug project for you.\n",
"\n",
"For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.langchain.plus/docs/)\n",
"For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/)\n",
"\n",
"**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n",
"\n",
"**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta."
"**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "904db9a5-f387-4a57-914c-c8af8d39e249",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"from uuid import uuid4\n",
"\n",
"unique_id = uuid4().hex[0:8]\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\" # Uncomment this line to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGSMITH-API-KEY>\" # Uncomment this line to use the hosted version.\n",
"\n",
"# Used by the agent in this tutorial\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
"# os.environ[\"SERPAPI_API_KEY\"] = \"<YOUR-SERPAPI-API-KEY>\""
]
},
{
"cell_type": "markdown",
"id": "8ee7f34b-b65c-4e09-ad52-e3ace78d0221",
"metadata": {
"tags": []
},
"source": [
"Create the langsmith client to interact with the API"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "510b5ca0",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
@ -75,10 +110,10 @@
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
"<a href=\"https://dev.smith.langchain.com/\", target=\"_blank\" rel=\"noopener\">LangSmith Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"Client (API URL: https://dev.api.smith.langchain.com)"
]
},
"execution_count": 2,
@ -87,21 +122,9 @@
}
],
"source": [
"import os\n",
"from uuid import uuid4\n",
"from langchainplus_sdk import LangChainPlusClient\n",
"\n",
"unique_id = uuid4().hex[0:8]\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGCHAINPLUS-API-KEY>\" # Uncomment this line to use the hosted version.\n",
"from langsmith import Client\n",
"\n",
"# Used by the agent below\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
"# os.environ[\"SERPAPI_API_KEY\"] = \"<YOUR-SERPAPI-API-KEY>\"\n",
"\n",
"client = LangChainPlusClient()\n",
"client = Client()\n",
"print(\"You can click the link below to view the UI\")\n",
"client"
]
@ -111,7 +134,7 @@
"id": "ca27fa11-ddce-4af0-971e-c5c37d5b92ef",
"metadata": {},
"source": [
"Now, start prototyping your agent. We will use a straightforward math example."
"Now, start prototyping your agent. We will use a math example using an older ReACT-style agent."
]
},
{
@ -124,8 +147,7 @@
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import initialize_agent, load_tools\n",
"from langchain.agents import AgentType\n",
"from langchain.agents import AgentType, initialize_agent, load_tools\n",
"\n",
"llm = ChatOpenAI(temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
@ -184,7 +206,10 @@
"source": [
"from langchain.callbacks.tracers.langchain import wait_for_all_tracers\n",
"\n",
"# Logs are submitted in a background thread. Make sure they've been submitted before moving on.\n",
"# Logs are submitted in a background thread to avoid blocking execution.\n",
"# For the sake of this tutorial, we want to make sure\n",
"# they've been submitted before moving on. This is also\n",
"# useful for serverless deployments.\n",
"wait_for_all_tracers()"
]
},
@ -193,7 +218,7 @@
"id": "9decb964-be07-4b6c-9802-9825c8be7b64",
"metadata": {},
"source": [
"Assuming you've successfully initiated the server as described earlier, your agent logs should show up in your server. You can check by clicking on the link below:"
"Assuming you've successfully configured the server earlier, your agent traces should show up in your server's UI. You can check by clicking on the link below:"
]
},
{
@ -207,10 +232,10 @@
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
"<a href=\"https://dev.smith.langchain.com/\", target=\"_blank\" rel=\"noopener\">LangSmith Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"Client (API URL: https://dev.api.smith.langchain.com)"
]
},
"execution_count": 6,
@ -229,7 +254,7 @@
"source": [
"## Test\n",
"\n",
"Once you've debugged a prototype of your agent, you will want to create tests and benchmark evaluations as you think about putting it into a production environment.\n",
"Once you've debugged a customized your LLM component, you will want to create tests and benchmark evaluations to measure its performance before putting it into a production environment.\n",
"\n",
"In this notebook, you will run evaluators to test an agent. You will do so in a few steps:\n",
"\n",
@ -254,26 +279,14 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "d14a9881-2a01-404c-8c56-0b78565c3ff4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"dataset_name = \"calculator-example-dataset\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n",
" client.delete_dataset(dataset_name=dataset_name)\n",
"dataset_name = f\"calculator-example-dataset-{unique_id}\"\n",
"\n",
"dataset = client.create_dataset(\n",
" dataset_name, description=\"A calculator example dataset\"\n",
")\n",
@ -289,118 +302,90 @@
},
{
"cell_type": "markdown",
"id": "92e8944f-e6fc-4bdf-9611-b2db39698cbe",
"metadata": {},
"id": "8adfd29c-b258-49e5-94b4-74597a12ba16",
"metadata": {
"tags": []
},
"source": [
"### 2. Select RunEvaluators\n",
"\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n",
"### 2. Define the Agent or LLM to Test\n",
"\n",
"Below, we will create some pre-implemented run evaluators that do the following:\n",
"- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
"- Evaluate the overall agent trajectory based on the tool usage and intermediate steps.\n",
"- Evaluating 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
"- Evaluating performance based on 'context' such as retrieved documents or tool results.\n",
"\n",
"For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n",
"custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n",
"You can evaluate any LLM or chain. Since chains can have memory, we will pass in a `chain_factory` (aka a `constructor` ) function to initialize for each call.\n",
"\n",
"Below, create the run evaluators.\n",
"\n",
"**Note: the feedback API is currently experimental and subject to change.**"
"In this case, you will test an agent that uses OpenAI's function calling endpoints, but it can be any simple chain."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "56298faa-9ff2-43a2-b35a-ee306e3bf64d",
"execution_count": 8,
"id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation.run_evaluators import (\n",
" get_qa_evaluator,\n",
" get_criteria_evaluator,\n",
" get_trajectory_evaluator,\n",
")\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import AgentType, initialize_agent, load_tools\n",
"\n",
"# You can use any model, but stronger llms tend to be more reliable\n",
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"\n",
"# Measures accuracy against ground truth\n",
"qa_evaluator = get_qa_evaluator(eval_llm)\n",
"\n",
"# Measures how effective and efficient the agent's actions are\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"trajectory_evaluator = get_trajectory_evaluator(eval_llm, agent_tools=tools)\n",
"\n",
"# Measure helpfulness. We have some pre-defined criteria you can select\n",
"helpfulness_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" \"helpfulness\",\n",
")\n",
"\n",
"# Custom criteria are specified as a dictionary\n",
"custom_criteria_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" {\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" },\n",
")\n",
"# Since chains can be stateful (e.g. they can have memory), we provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"def agent_factory():\n",
" return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
"\n",
"evaluators = [\n",
" qa_evaluator,\n",
" trajectory_evaluator,\n",
" helpfulness_evaluator,\n",
" custom_criteria_evaluator,\n",
"]"
"\n",
"# If your chain is NOT stateful, your factory can return the object directly\n",
"# to improve runtime performance. For example:\n",
"# chain_factory = lambda: agent"
]
},
{
"cell_type": "markdown",
"id": "8adfd29c-b258-49e5-94b4-74597a12ba16",
"metadata": {
"tags": []
},
"id": "9cb9ef53",
"metadata": {},
"source": [
"### 3. Define the Agent or LLM to Test\n",
"### 3. Configure Evaluation\n",
"\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It can be helpful to use automated metrics and ai-assisted feedback to evaluate your component's performance.\n",
"\n",
"You can evaluate any LLM or chain. Since chains can have memory, we need to pass an\n",
"initializer function that returns a new chain for each row.\n",
"Below, we will create some pre-implemented run evaluators that do the following:\n",
"- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
"- Measure semantic (dis)similarity using embedding distance\n",
"- Evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
"\n",
"In this case, you will test an agent that uses OpenAI's function calling endpoints, but it can be any simple chain."
"For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n",
"custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75",
"execution_count": 9,
"id": "a25dc281",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import initialize_agent, load_tools\n",
"from langchain.agents import AgentType\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"\n",
"\n",
"# Since chains can be stateful (e.g. they can have memory), we need provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"def agent_factory():\n",
" return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
"\n",
"\n",
"# If your chain is NOT stateful, your factory can return the object directly\n",
"# to improve runtime performance. For example:\n",
"# chain_factory = lambda: agent"
"from langchain.evaluation import EvaluatorType\n",
"from langchain.smith import RunEvalConfig\n",
"\n",
"evaluation_config = RunEvalConfig(\n",
" # Evaluators can either be an evaluator type (e.g., \"qa\", \"criteria\", \"embedding_distance\", etc.) or a configuration for that evaluator\n",
" evaluators=[\n",
" EvaluatorType.QA, # \"Correctness\" against a reference answer\n",
" EvaluatorType.EMBEDDING_DISTANCE,\n",
" RunEvalConfig.Criteria(\"helpfulness\"),\n",
" RunEvalConfig.Criteria(\n",
" {\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" }\n",
" ),\n",
" ]\n",
")"
]
},
{
@ -412,40 +397,111 @@
"source": [
"### 4. Run the Agent and Evaluators\n",
"\n",
"With the dataset, agent, and evaluators selected, you can use the helper function below to run them all.\n",
"Use the `arun_on_dataset` (or synchronous `run_on_dataset`) function to evaluate your model. This will:\n",
"1. Fetch example rows from the specified dataset\n",
"2. Run your llm or chain on each example.\n",
"3. Apply evalutors to the resulting run traces and corresponding reference examples to generate automated feedback.\n",
"\n",
"The run traces and evaluation feedback will automatically be associated with the dataset for easy attribution and analysis."
"The results will be visible in the LangSmith app."
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"id": "3733269b-8085-4644-9d5d-baedcff13a2f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 2\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example 4de88b85-928e-4711-8f11-98886295c8b3. Error: LLMMathChain._evaluate(\"\n",
"age_of_Dua_Lipa_boyfriend ** 0.43\n",
"\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 3\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example 7cacdf54-d1b8-4e6c-944e-c94578a2fe0d. Error: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 68}]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 9\r"
]
}
],
"source": [
"from langchain.smith import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"\n",
"chain_results = await arun_on_dataset(\n",
" client=client,\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=evaluation_config,\n",
" verbose=True,\n",
" tags=[\"testing-notebook\"], # Optional, adds a tag to the resulting chain runs\n",
")\n",
"\n",
"# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
"# These are logged as warnings here and captured as errors in the tracing UI."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0marun_on_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Client'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdataset_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mllm_or_chain_factory\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MODEL_OR_CHAIN_FACTORY'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mevaluation\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[RunEvalConfig]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mconcurrency_level\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mnum_repetitions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mproject_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mrun_evaluators\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[Sequence[RunEvaluator]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0minput_mapper\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[Callable[[Dict], Any]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Asynchronously run the Chain or language model on a dataset\n",
"and store traces to the specified project name.\n",
"\n",
"Args:\n",
" client: LangSmith client to use to read the dataset, and to\n",
" log feedback and run traces.\n",
" dataset_name: Name of the dataset to run the chain on.\n",
" llm_or_chain_factory: Language model or Chain constructor to run\n",
" over the dataset. The Chain constructor is used to permit\n",
@ -457,14 +513,18 @@
" project_name: Name of the project to store the traces in.\n",
" Defaults to {dataset_name}-{chain class name}-{datetime}.\n",
" verbose: Whether to print progress.\n",
" client: Client to use to read the dataset. If not provided, a new\n",
" client will be created using the credentials in the environment.\n",
" tags: Tags to add to each run in the project.\n",
" run_evaluators: Evaluators to run on the results of the chain.\n",
" input_mapper: A function to map to the inputs dictionary from an Example\n",
" to the format expected by the model to be evaluated. This is useful if\n",
" your model needs to deserialize more complex schema or if your dataset\n",
" has inputs with keys that differ from what is expected by your chain\n",
" or agent.\n",
"\n",
"Returns:\n",
" A dictionary containing the run's project name and the resulting model outputs.\n",
"\u001b[0;31mFile:\u001b[0m ~/code/lc/lckg/langchain/client/runner_utils.py\n",
" A dictionary containing the run's project name and the\n",
" resulting model outputs.\n",
"\u001b[0;31mFile:\u001b[0m ~/code/lc/langchain/langchain/smith/evaluation/runner_utils.py\n",
"\u001b[0;31mType:\u001b[0m function"
]
},
@ -473,208 +533,102 @@
}
],
"source": [
"from langchain.client import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"# For more information on additional configuration for the evaluation function:\n",
"\n",
"?arun_on_dataset"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
"cell_type": "markdown",
"id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 6\r"
]
}
],
"source": [
"chain_results = await arun_on_dataset(\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=agent_factory,\n",
" concurrency_level=5, # Optional, sets the number of examples to run at a time\n",
" verbose=True,\n",
" client=client,\n",
" tags=[\n",
" \"testing-notebook\",\n",
" ], # Optional, adds a tag to the resulting chain runs\n",
" run_evaluators=evaluators,\n",
")\n",
"### Review the Test Results\n",
"\n",
"# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
"# These are logged as warnings here and captured as errors in the tracing UI."
"You can review the test results tracing UI below by navigating to the \"Datasets & Testing\" page and selecting the **\"calculator-example-dataset-*\"** dataset and associated test project.\n",
"\n",
"This will show the new runs and the feedback logged from the selected evaluators."
]
},
{
"cell_type": "markdown",
"id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
"metadata": {
"tags": []
},
"id": "591c819e-9932-45cf-adab-63727dd49559",
"metadata": {},
"source": [
"### Review the Test Results\n",
"## Exporting Runs\n",
"\n",
"You can review the test results tracing UI below by navigating to the Testing project \n",
"with the title that starts with **\"calculator-example-dataset-AgentExecutor-\"**\n",
"\n",
"This will show the new runs and the feedback logged from the selected evaluators."
"LangSmith lets you export data to common formats such as CSV or JSONL directly in the web app. You can also use the client to fetch runs for further analysis, to store in your own database, or to share with others. Let's fetch the run traces from the evaluation run."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "136db492-d6ca-4215-96f9-439c23538241",
"execution_count": 14,
"id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"Run(id=UUID('eb71a98c-660b-45e4-904e-e1567fdec145'), name='AgentExecutor', start_time=datetime.datetime(2023, 7, 13, 8, 23, 35, 102907), run_type=<RunTypeEnum.chain: 'chain'>, end_time=datetime.datetime(2023, 7, 13, 8, 23, 37, 793962), extra={'runtime': {'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.5', 'library_version': '0.0.231', 'runtime_version': '3.11.2'}, 'total_tokens': 512, 'prompt_tokens': 451, 'completion_tokens': 61}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-07-13T08:23:35.102907'}, {'name': 'end', 'time': '2023-07-13T08:23:37.793962'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('d343add7-2631-417b-905a-dc39361ace69'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('cc5f4f88-f1bf-495f-8adb-384f66321eb2'), child_run_ids=[UUID('daa9708a-ad08-4be1-9841-e92e2f384cce'), UUID('28b1ada7-3fe8-4853-a5b0-dac8a93a3066'), UUID('dc0b4867-3f3d-46f7-bfb5-f4be10f3cc52'), UUID('58c9494e-2ea6-4291-ab78-73b8ffcdaef5'), UUID('8f5a3e08-ce96-4c81-a6aa-86bf5b3bb590'), UUID('f0447532-7ded-45b6-9d87-f1fa18e381b0')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'fifth-grader-score': {'n': 1, 'avg': 0.0, 'mode': 0}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361}})"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can navigate to the UI by clicking on the link below\n",
"client"
]
},
{
"cell_type": "markdown",
"id": "5f2c0539-09c1-42f9-a2ee-6a88a378d479",
"metadata": {
"tags": []
},
"source": [
"For a real production application, you will want to add many more test cases and\n",
"incorporate larger datasets to run benchmark evaluations to measure aggregate performance\n",
"across. For more information on recommended ways to do this, see [LangSmith Documentation](https://docs.langchain.plus/docs/)"
]
},
{
"cell_type": "markdown",
"id": "cd67201c-8dc1-4689-981c-759800749e25",
"metadata": {},
"source": [
"## Monitor\n",
"\n",
"Once your agent passed the selected quality bar, you can deploy it to production. For this notebook, you will simulate user interactions directly while logging your traces to LangSmith for monitoring.\n",
"\n",
"For more information on real production deployments, check out the [LangChain documentation](https://python.langchain.com/docs/guides/deployments/) or contact us at [support@langchain.dev](mailto:support@langchain.dev).\n",
"\n",
"**First, create a new project to use in your production deployment.**"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3718710f-f719-4861-a351-0bb9d639d9fd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"deployment_name = f\"Search + Calculator Deployment - {unique_id}\"\n",
"project = client.create_project(deployment_name, mode=\"monitor\")"
]
},
{
"cell_type": "markdown",
"id": "3a993ae7-6d26-495a-8633-64936bf94127",
"metadata": {
"tags": []
},
"source": [
"**Then, deploy your agent to production, making sure to configure the environment to log to the monitoring project.**"
"runs = list(client.list_runs(dataset_name=dataset_name))\n",
"runs[0]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "56dba20a-c07c-4b18-a4e7-834ab6dc87ef",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "569389d4-b613-47ce-99d3-e0031f308185",
"execution_count": 19,
"id": "6595c888-1f5c-4ae3-9390-0a559f5575d1",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LLMMathChain._evaluate(\"\n",
"US_GDP / average_lifespan\n",
"\") raised error: 'US_GDP'. Please try again with a valid numerical expression\n"
]
"data": {
"text/plain": [
"{'correctness': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
" 'helpfulness': {'n': 7, 'avg': 1.0, 'mode': 1},\n",
" 'fifth-grader-score': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
" 'embedding_cosine_distance': {'n': 7,\n",
" 'avg': 0.08308464442094905,\n",
" 'mode': 0.00371031210788608}}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.environ[\"LANGCHAIN_PROJECT\"] = deployment_name\n",
"\n",
"inputs = [\n",
" \"What's the ratio of the current US GDP to the average lifespan of a human?\",\n",
" \"What's sin of 180 degrees?\",\n",
" \"I need help on my homework\",\n",
" \"If the price of bushel of wheat increases by 10 cents, about how much will that impact the average cost of bread?\",\n",
" # etc.\n",
"]\n",
"for query in inputs:\n",
" try:\n",
" await agent.arun(query)\n",
" except Exception as e:\n",
" print(e)"
"client.read_project(project_id=runs[0].session_id).feedback_stats"
]
},
{
"cell_type": "markdown",
"id": "2646f0fb-81d4-43ce-8a9b-54b8e19841e2",
"metadata": {},
"metadata": {
"tags": []
},
"source": [
"## Conclusion\n",
"\n",
"Congratulations! You have succesfully created connected an agent to LangSmith to trace and debug, evaluated it for accuracy, helpfulness, and trajectory efficiency over a dataset, and instrumented a monitoring project for a simulated \"production\" application!\n",
"Congratulations! You have succesfully traced and evaluated an agent using LangSmith!\n",
"\n",
"This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better products.\n",
"This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better results.\n",
"\n",
"For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.langchain.plus/docs/),\n",
"\n",
"and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)."
"For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.langchain.plus/docs/), and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90b7fbff-162d-4c9c-b6fc-33bd5445745f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

@ -1,10 +1,12 @@
"""A tracer that runs evaluators over completed runs."""
from __future__ import annotations
import logging
from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import Any, Optional, Sequence, Set, Union
from typing import Any, List, Optional, Sequence, Set, Union
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langsmith import Client, RunEvaluator
from langchain.callbacks.manager import tracing_v2_enabled
from langchain.callbacks.tracers.base import BaseTracer
@ -12,6 +14,15 @@ from langchain.callbacks.tracers.schemas import Run
logger = logging.getLogger(__name__)
_TRACERS: List[EvaluatorCallbackHandler] = []
def wait_for_all_evaluators() -> None:
"""Wait for all tracers to finish."""
global _TRACERS
for tracer in _TRACERS:
tracer.wait_for_futures()
class EvaluatorCallbackHandler(BaseTracer):
"""A tracer that runs a run evaluator whenever a run is persisted.
@ -23,8 +34,8 @@ class EvaluatorCallbackHandler(BaseTracer):
max_workers : int, optional
The maximum number of worker threads to use for running the evaluators.
If not specified, it will default to the number of evaluators.
client : LangChainPlusClient, optional
The LangChainPlusClient instance to use for evaluating the runs.
client : LangSmith Client, optional
The LangSmith client instance to use for evaluating the runs.
If not specified, a new instance will be created.
example_id : Union[UUID, str], optional
The example ID to be associated with the runs.
@ -35,8 +46,8 @@ class EvaluatorCallbackHandler(BaseTracer):
----------
example_id : Union[UUID, None]
The example ID associated with the runs.
client : LangChainPlusClient
The LangChainPlusClient instance used for evaluating the runs.
client : Client
The LangSmith client instance used for evaluating the runs.
evaluators : Sequence[RunEvaluator]
The sequence of run evaluators to be executed.
executor : ThreadPoolExecutor
@ -56,7 +67,7 @@ class EvaluatorCallbackHandler(BaseTracer):
self,
evaluators: Sequence[RunEvaluator],
max_workers: Optional[int] = None,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
example_id: Optional[Union[UUID, str]] = None,
skip_unfinished: bool = True,
project_name: Optional[str] = None,
@ -66,7 +77,7 @@ class EvaluatorCallbackHandler(BaseTracer):
self.example_id = (
UUID(example_id) if isinstance(example_id, str) else example_id
)
self.client = client or LangChainPlusClient()
self.client = client or Client()
self.evaluators = evaluators
self.executor = ThreadPoolExecutor(
max_workers=max(max_workers or len(evaluators), 1)
@ -74,6 +85,8 @@ class EvaluatorCallbackHandler(BaseTracer):
self.futures: Set[Future] = set()
self.skip_unfinished = skip_unfinished
self.project_name = project_name
global _TRACERS
_TRACERS.append(self)
def _evaluate_in_project(self, run: Run, evaluator: RunEvaluator) -> None:
"""Evaluate the run in the project.

@ -8,7 +8,7 @@ from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Union
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient
from langsmith import Client
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.schemas import Run, RunTypeEnum, TracerSession
@ -19,6 +19,7 @@ from langchain.schema.messages import BaseMessage
logger = logging.getLogger(__name__)
_LOGGED = set()
_TRACERS: List[LangChainTracer] = []
_CLIENT: Optional[Client] = None
def log_error_once(method: str, exception: Exception) -> None:
@ -37,6 +38,14 @@ def wait_for_all_tracers() -> None:
tracer.wait_for_futures()
def _get_client() -> Client:
"""Get the client."""
global _CLIENT
if _CLIENT is None:
_CLIENT = Client()
return _CLIENT
class LangChainTracer(BaseTracer):
"""An implementation of the SharedTracer that POSTS to the langchain endpoint."""
@ -44,7 +53,7 @@ class LangChainTracer(BaseTracer):
self,
example_id: Optional[Union[UUID, str]] = None,
project_name: Optional[str] = None,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
tags: Optional[List[str]] = None,
**kwargs: Any,
) -> None:
@ -59,7 +68,7 @@ class LangChainTracer(BaseTracer):
)
# set max_workers to 1 to process tasks in order
self.executor = ThreadPoolExecutor(max_workers=1)
self.client = client or LangChainPlusClient()
self.client = client or _get_client()
self._futures: Set[Future] = set()
self.tags = tags or []
global _TRACERS
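A small sketch of what the new module-level `_get_client()` gives you (assuming the tracers are constructed without an explicit `client`):

```python
from langchain.callbacks.tracers.langchain import LangChainTracer

# Tracers created without an explicit client now reuse one cached Client,
# so all trace uploads share a single HTTP session.
t1 = LangChainTracer()
t2 = LangChainTracer()
assert t1.client is t2.client
```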

@ -5,8 +5,8 @@ import datetime
from typing import Any, Dict, List, Optional
from uuid import UUID
from langchainplus_sdk.schemas import RunBase as BaseRunV2
from langchainplus_sdk.schemas import RunTypeEnum
from langsmith.schemas import RunBase as BaseRunV2
from langsmith.schemas import RunTypeEnum
from pydantic import BaseModel, Field, root_validator
from langchain.schema import LLMResult

@ -198,7 +198,7 @@ class Chain(Serializable, ABC):
inputs: Dictionary of inputs, or single input if chain expects
only one param. Should contain all inputs specified in
`Chain.input_keys` except for inputs that will be set by the chain's
memory.
memory.
return_only_outputs: Whether to return only outputs in the
response. If True, only new keys generated by this chain will be
returned. If False, both input keys and new keys generated by this
@ -265,7 +265,7 @@ class Chain(Serializable, ABC):
inputs: Dictionary of inputs, or single input if chain expects
only one param. Should contain all inputs specified in
`Chain.input_keys` except for inputs that will be set by the chain's
memory.
memory.
return_only_outputs: Whether to return only outputs in the
response. If True, only new keys generated by this chain will be
returned. If False, both input keys and new keys generated by this
@ -349,7 +349,7 @@ class Chain(Serializable, ABC):
inputs: Dictionary of raw inputs, or single input if chain expects
only one param. Should contain all inputs specified in
`Chain.input_keys` except for inputs that will be set by the chain's
memory.
memory.
Returns:
A dictionary of all inputs, including those added by the chain's memory.

@ -1,16 +0,0 @@
"""LangChain + Client."""
from langchain.client.runner_utils import (
InputFormatError,
arun_on_dataset,
arun_on_examples,
run_on_dataset,
run_on_examples,
)
__all__ = [
"InputFormatError",
"arun_on_dataset",
"run_on_dataset",
"arun_on_examples",
"run_on_examples",
]
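Migration note: these entrypoints now live under `langchain.smith`, as used in the updated walkthrough notebook above. A minimal sketch of the import change:

```python
# Before (removed in this commit):
# from langchain.client import arun_on_dataset, run_on_dataset

# After:
from langchain.smith import arun_on_dataset, run_on_dataset
```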

@ -1,759 +0,0 @@
"""Utilities for running language models or Chains over datasets."""
from __future__ import annotations
import asyncio
import functools
import logging
from datetime import datetime
from typing import (
Any,
Callable,
Coroutine,
Dict,
Iterator,
List,
Optional,
Sequence,
Union,
)
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langchainplus_sdk.schemas import Example
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import Callbacks
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
from langchain.callbacks.tracers.langchain import LangChainTracer
from langchain.chains.base import Chain
from langchain.chat_models.base import BaseChatModel
from langchain.llms.base import BaseLLM
from langchain.schema import (
ChatResult,
LLMResult,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.messages import (
BaseMessage,
HumanMessage,
get_buffer_string,
messages_from_dict,
)
logger = logging.getLogger(__name__)
MODEL_OR_CHAIN_FACTORY = Union[Callable[[], Chain], BaseLanguageModel]
class InputFormatError(Exception):
"""Raised when the input format is invalid."""
def _get_prompts(inputs: Dict[str, Any]) -> List[str]:
"""Get prompts from inputs.
Args:
inputs: The input dictionary.
Returns:
A list of prompts.
Raises:
InputFormatError: If the input format is invalid.
"""
if not inputs:
raise InputFormatError("Inputs should not be empty.")
prompts = []
if "prompt" in inputs:
if not isinstance(inputs["prompt"], str):
raise InputFormatError(
"Expected string for 'prompt', got"
f" {type(inputs['prompt']).__name__}"
)
prompts = [inputs["prompt"]]
elif "prompts" in inputs:
if not isinstance(inputs["prompts"], list) or not all(
isinstance(i, str) for i in inputs["prompts"]
):
raise InputFormatError(
"Expected list of strings for 'prompts',"
f" got {type(inputs['prompts']).__name__}"
)
prompts = inputs["prompts"]
elif len(inputs) == 1:
prompt_ = next(iter(inputs.values()))
if isinstance(prompt_, str):
prompts = [prompt_]
elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
prompts = prompt_
else:
raise InputFormatError(f"LLM Run expects string prompt input. Got {inputs}")
else:
raise InputFormatError(
f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
)
return prompts
def _get_messages(inputs: Dict[str, Any]) -> List[List[BaseMessage]]:
"""Get Chat Messages from inputs.
Args:
inputs: The input dictionary.
Returns:
A list of chat messages.
Raises:
InputFormatError: If the input format is invalid.
"""
if not inputs:
raise InputFormatError("Inputs should not be empty.")
if "messages" in inputs:
single_input = inputs["messages"]
elif len(inputs) == 1:
single_input = next(iter(inputs.values()))
else:
raise InputFormatError(f"Chat Run expects 'messages' in inputs. Got {inputs}")
if isinstance(single_input, list) and all(
isinstance(i, dict) for i in single_input
):
raw_messages = [single_input]
elif isinstance(single_input, list) and all(
isinstance(i, list) for i in single_input
):
raw_messages = single_input
else:
raise InputFormatError(
f"Chat Run expects List[dict] or List[List[dict]] 'messages'"
f" input. Got {inputs}"
)
return [messages_from_dict(batch) for batch in raw_messages]
async def _arun_llm(
llm: BaseLanguageModel,
inputs: Dict[str, Any],
*,
tags: Optional[List[str]] = None,
callbacks: Callbacks = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[LLMResult, ChatResult]:
"""Asynchronously run the language model.
Args:
llm: The language model to run.
inputs: The input dictionary.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
input_mapper: Optional function to map inputs to the expected format.
Returns:
The LLMResult or ChatResult.
Raises:
ValueError: If the LLM type is unsupported.
InputFormatError: If the input format is invalid.
"""
if input_mapper is not None:
if not isinstance(llm, (BaseLLM, BaseChatModel)):
raise ValueError(f"Unsupported LLM type {type(llm).__name__}")
llm_output = await llm.agenerate(
input_mapper(inputs), callbacks=callbacks, tags=tags
)
elif isinstance(llm, BaseLLM):
try:
llm_prompts = _get_prompts(inputs)
llm_output = await llm.agenerate(
llm_prompts, callbacks=callbacks, tags=tags
)
except InputFormatError:
llm_messages = _get_messages(inputs)
buffer_strings = [get_buffer_string(messages) for messages in llm_messages]
llm_output = await llm.agenerate(
buffer_strings, callbacks=callbacks, tags=tags
)
elif isinstance(llm, BaseChatModel):
try:
messages = _get_messages(inputs)
llm_output = await llm.agenerate(messages, callbacks=callbacks, tags=tags)
except InputFormatError:
prompts = _get_prompts(inputs)
converted_messages: List[List[BaseMessage]] = [
[HumanMessage(content=prompt)] for prompt in prompts
]
llm_output = await llm.agenerate(
converted_messages, callbacks=callbacks, tags=tags
)
else:
raise ValueError(f"Unsupported LLM type {type(llm)}")
return llm_output
async def _arun_llm_or_chain(
example: Example,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
n_repetitions: int,
*,
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
"""Asynchronously run the Chain or language model.
Args:
example: The example to run.
llm_or_chain_factory: The Chain or language model constructor to run.
n_repetitions: The number of times to run the model on each example.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
input_mapper: Optional function to map the input to the expected format.
Returns:
A list of outputs.
"""
if callbacks:
previous_example_ids = [
getattr(tracer, "example_id", None) for tracer in callbacks
]
for tracer in callbacks:
if hasattr(tracer, "example_id"):
tracer.example_id = example.id
else:
previous_example_ids = None
outputs = []
for _ in range(n_repetitions):
try:
if isinstance(llm_or_chain_factory, BaseLanguageModel):
output: Any = await _arun_llm(
llm_or_chain_factory,
example.inputs,
tags=tags,
callbacks=callbacks,
input_mapper=input_mapper,
)
else:
chain = llm_or_chain_factory()
if input_mapper is not None:
inputs_ = input_mapper(example.inputs)
else:
inputs_ = example.inputs
if len(inputs_) == 1:
inputs_ = next(iter(inputs_.values()))
output = await chain.acall(inputs_, callbacks=callbacks, tags=tags)
outputs.append(output)
except Exception as e:
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
outputs.append({"Error": str(e)})
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
async def _gather_with_concurrency(
n: int,
initializer: Callable[[], Coroutine[Any, Any, Any]],
*async_funcs: Callable[
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
],
) -> List[Any]:
"""Run coroutines with a concurrency limit.
Args:
n: The maximum number of concurrent tasks.
initializer: A coroutine that initializes shared resources for the tasks.
async_funcs: The async_funcs to be run concurrently.
Returns:
A list of results from the coroutines.
"""
semaphore = asyncio.Semaphore(n)
job_state = {"num_processed": 0}
callback_queue: asyncio.Queue[Sequence[BaseCallbackHandler]] = asyncio.Queue()
for _ in range(n):
callback_queue.put_nowait(await initializer())
async def run_coroutine_with_semaphore(
async_func: Callable[
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
]
) -> Any:
async with semaphore:
callbacks = await callback_queue.get()
try:
result = await async_func(callbacks, job_state)
finally:
callback_queue.put_nowait(callbacks)
return result
results = await asyncio.gather(
*(run_coroutine_with_semaphore(function) for function in async_funcs)
)
while callback_queue:
try:
callbacks = callback_queue.get_nowait()
except asyncio.QueueEmpty:
break
for callback in callbacks:
if isinstance(callback, (LangChainTracer, EvaluatorCallbackHandler)):
callback.wait_for_futures()
return results
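# Editor's sketch (not part of this commit): how the helper above is typically driven.
# Each worker slot gets one callback list from the initializer; jobs borrow a list,
# run under the semaphore, and return it. The no-op job below is hypothetical.
async def _example_gather_with_concurrency() -> List[Any]:
    async def init() -> List[BaseCallbackHandler]:
        return []  # real callers return a tracer plus an evaluator callback handler

    async def job(callbacks: Sequence[BaseCallbackHandler], job_state: Dict) -> int:
        await asyncio.sleep(0)  # stand-in for running a chain on one example
        job_state["num_processed"] += 1
        return job_state["num_processed"]

    return await _gather_with_concurrency(2, init, *(job for _ in range(5)))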
async def _callbacks_initializer(
project_name: Optional[str],
client: LangChainPlusClient,
run_evaluators: Sequence[RunEvaluator],
evaluation_handler_collector: List[EvaluatorCallbackHandler],
) -> List[BaseTracer]:
"""
Initialize a tracer to share across tasks.
Args:
project_name: The project name for the tracer.
client: The client to use for the tracer.
run_evaluators: The evaluators to run.
evaluation_handler_collector: A list to collect the evaluators.
Used to wait for the evaluators to finish.
Returns:
The callbacks for this thread.
"""
callbacks: List[BaseTracer] = []
if project_name:
callbacks.append(LangChainTracer(project_name=project_name))
evaluator_project_name = f"{project_name}-evaluators" if project_name else None
if run_evaluators:
callback = EvaluatorCallbackHandler(
client=client,
evaluators=run_evaluators,
# We already have concurrency, don't want to overload the machine
max_workers=1,
project_name=evaluator_project_name,
)
callbacks.append(callback)
evaluation_handler_collector.append(callback)
return callbacks
async def arun_on_examples(
examples: Iterator[Example],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
concurrency_level: int = 5,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Asynchronously run the chain on examples and store traces
to the specified project name.
Args:
examples: Examples to run the model or chain over.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
concurrency_level: The number of async tasks to run concurrently.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Project name to use when tracing runs.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary mapping example ids to the model outputs.
"""
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
results: Dict[str, List[Any]] = {}
async def process_example(
example: Example, callbacks: List[BaseCallbackHandler], job_state: dict
) -> None:
"""Process a single example."""
result = await _arun_llm_or_chain(
example,
llm_or_chain_factory,
num_repetitions,
tags=tags,
callbacks=callbacks,
input_mapper=input_mapper,
)
results[str(example.id)] = result
job_state["num_processed"] += 1
if verbose:
print(
f"Processed examples: {job_state['num_processed']}",
end="\r",
flush=True,
)
evaluation_handlers: List[EvaluatorCallbackHandler] = []
await _gather_with_concurrency(
concurrency_level,
functools.partial(
_callbacks_initializer,
project_name=project_name,
client=client_,
evaluation_handler_collector=evaluation_handlers,
run_evaluators=run_evaluators or [],
),
*(functools.partial(process_example, e) for e in examples),
)
for handler in evaluation_handlers:
handler.wait_for_futures()
return results
def run_llm(
llm: BaseLanguageModel,
inputs: Dict[str, Any],
callbacks: Callbacks,
*,
tags: Optional[List[str]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[LLMResult, ChatResult]:
"""
Run the language model on the example.
Args:
llm: The language model to run.
inputs: The input dictionary.
callbacks: The callbacks to use during the run.
tags: Optional tags to add to the run.
input_mapper: Optional function to map the inputs to the format expected by the model.
Returns:
The LLMResult or ChatResult.
Raises:
ValueError: If the LLM type is unsupported.
InputFormatError: If the input format is invalid.
"""
if input_mapper is not None:
if not isinstance(llm, (BaseLLM, BaseChatModel)):
raise ValueError(f"Unsupported LLM type {type(llm).__name__}")
llm_output = llm.generate(input_mapper(inputs), callbacks=callbacks, tags=tags)
elif isinstance(llm, BaseLLM):
try:
llm_prompts = _get_prompts(inputs)
llm_output = llm.generate(llm_prompts, callbacks=callbacks, tags=tags)
except InputFormatError:
llm_messages = _get_messages(inputs)
buffer_strings = [get_buffer_string(messages) for messages in llm_messages]
llm_output = llm.generate(buffer_strings, callbacks=callbacks)
elif isinstance(llm, BaseChatModel):
try:
messages = _get_messages(inputs)
llm_output = llm.generate(messages, callbacks=callbacks, tags=tags)
except InputFormatError:
prompts = _get_prompts(inputs)
converted_messages: List[List[BaseMessage]] = [
[HumanMessage(content=prompt)] for prompt in prompts
]
llm_output = llm.generate(
converted_messages, callbacks=callbacks, tags=tags
)
else:
raise ValueError(f"Unsupported LLM type {type(llm)}")
return llm_output
def run_llm_or_chain(
example: Example,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
n_repetitions: int,
*,
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
"""
Run the Chain or language model synchronously.
Args:
example: The example to run.
llm_or_chain_factory: The Chain or language model constructor to run.
n_repetitions: The number of times to run the model on each example.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
Returns:
Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
The outputs of the model or chain.
"""
if callbacks:
previous_example_ids = [
getattr(tracer, "example_id", None) for tracer in callbacks
]
for tracer in callbacks:
if hasattr(tracer, "example_id"):
tracer.example_id = example.id
else:
previous_example_ids = None
outputs = []
for _ in range(n_repetitions):
try:
if isinstance(llm_or_chain_factory, BaseLanguageModel):
output: Any = run_llm(
llm_or_chain_factory,
example.inputs,
callbacks,
tags=tags,
input_mapper=input_mapper,
)
else:
chain = llm_or_chain_factory()
if input_mapper is not None:
inputs_ = input_mapper(example.inputs)
else:
inputs_ = example.inputs
if len(inputs_) == 1:
inputs_ = next(iter(inputs_.values()))
output = chain(inputs_, callbacks=callbacks, tags=tags)
outputs.append(output)
except Exception as e:
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
outputs.append({"Error": str(e)})
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
def run_on_examples(
examples: Iterator[Example],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Run the Chain or language model on examples and store
traces to the specified project name.
Args:
examples: Examples to run the model or chain over.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary mapping example ids to the model outputs.
"""
results: Dict[str, Any] = {}
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
tracer = LangChainTracer(project_name=project_name)
evaluator_project_name = f"{project_name}-evaluators"
evaluation_handler = EvaluatorCallbackHandler(
evaluators=run_evaluators or [],
client=client_,
project_name=evaluator_project_name,
)
callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
for i, example in enumerate(examples):
result = run_llm_or_chain(
example,
llm_or_chain_factory,
num_repetitions,
tags=tags,
callbacks=callbacks,
input_mapper=input_mapper,
)
if verbose:
print(f"{i+1} processed", flush=True, end="\r")
results[str(example.id)] = result
tracer.wait_for_futures()
evaluation_handler.wait_for_futures()
return results
def _get_project_name(
project_name: Optional[str],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
dataset_name: Optional[str],
) -> str:
"""
Get the project name.
Args:
project_name: The project name if manually specified.
llm_or_chain_factory: The Chain or language model constructor.
dataset_name: The dataset name.
Returns:
The project name.
"""
if project_name is not None:
return project_name
current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
if isinstance(llm_or_chain_factory, BaseLanguageModel):
model_name = llm_or_chain_factory.__class__.__name__
else:
model_name = llm_or_chain_factory().__class__.__name__
dataset_prefix = f"{dataset_name}-" if dataset_name else ""
return f"{dataset_prefix}{model_name}-{current_time}"
async def arun_on_dataset(
dataset_name: str,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
concurrency_level: int = 5,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Asynchronously run the Chain or language model on a dataset
and store traces to the specified project name.
Args:
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
concurrency_level: The number of async tasks to run concurrently.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the session.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
"""
client_ = client or LangChainPlusClient()
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
dataset = client_.read_dataset(dataset_name=dataset_name)
examples = client_.list_examples(dataset_id=str(dataset.id))
results = await arun_on_examples(
examples,
llm_or_chain_factory,
concurrency_level=concurrency_level,
num_repetitions=num_repetitions,
project_name=project_name,
verbose=verbose,
client=client_,
tags=tags,
run_evaluators=run_evaluators,
input_mapper=input_mapper,
)
return {
"project_name": project_name,
"results": results,
}
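# Editor's sketch (not part of this commit): one way to invoke the async entry point.
# The dataset name "my-eval-dataset" is a hypothetical placeholder and an OpenAI key is
# assumed to be configured in the environment.
async def _example_arun_on_dataset() -> Dict[str, Any]:
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(temperature=0)  # a bare language model needs no factory
    return await arun_on_dataset(
        "my-eval-dataset",
        llm,
        concurrency_level=5,
        verbose=True,
    )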
def run_on_dataset(
dataset_name: str,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Dict[str, Any]:
"""
Run the Chain or language model on a dataset and store traces
to the specified project name.
Args:
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the session.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
"""
client_ = client or LangChainPlusClient()
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
dataset = client_.read_dataset(dataset_name=dataset_name)
examples = client_.list_examples(dataset_id=str(dataset.id))
results = run_on_examples(
examples,
llm_or_chain_factory,
num_repetitions=num_repetitions,
project_name=project_name,
verbose=verbose,
tags=tags,
run_evaluators=run_evaluators,
client=client_,
input_mapper=input_mapper,
)
return {
"project_name": project_name,
"results": results,
}
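# Editor's sketch (not part of this commit): the synchronous variant with a chain factory,
# so each example gets a fresh chain, plus an input_mapper for datasets whose input keys
# differ from the chain's. All names below are hypothetical.
def _example_run_on_dataset() -> Dict[str, Any]:
    from langchain.chains import LLMChain
    from langchain.chat_models import ChatOpenAI
    from langchain.prompts import PromptTemplate

    def chain_factory() -> LLMChain:
        prompt = PromptTemplate.from_template("Answer concisely: {question}")
        return LLMChain(llm=ChatOpenAI(temperature=0), prompt=prompt)

    return run_on_dataset(
        "my-eval-dataset",
        chain_factory,
        input_mapper=lambda inputs: {"question": inputs["query"]},
        verbose=True,
    )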

@ -35,7 +35,7 @@ name of the dataset to load.
**Some common use cases for evaluation include:**
- Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain <langchain.evaluation.qa.eval_chain.QAEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>` or :class:`LabeledPairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.LabeledPairwiseStringEvalChain>` when there is additionally a reference label.
- Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain <langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain>`
- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain <langchain.evaluation.criteria.eval_chain.CriteriaEvalChain>`
- Computing semantic difference between a prediction and reference: :class:`EmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain>` or between two predictions: :class:`PairwiseEmbeddingDistanceEvalChain <langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain>`
@ -53,8 +53,11 @@ These interfaces enable easier composability and usage within a higher level eva
""" # noqa: E501
from langchain.evaluation.agents import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria import CriteriaEvalChain
from langchain.evaluation.comparison import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
from langchain.evaluation.criteria import CriteriaEvalChain, LabeledCriteriaEvalChain
from langchain.evaluation.embedding_distance import (
EmbeddingDistance,
EmbeddingDistanceEvalChain,
@ -77,6 +80,7 @@ from langchain.evaluation.string_distance import (
__all__ = [
"EvaluatorType",
"PairwiseStringEvalChain",
"LabeledPairwiseStringEvalChain",
"QAEvalChain",
"CotQAEvalChain",
"ContextQAEvalChain",
@ -90,6 +94,7 @@ __all__ = [
"StringDistance",
"StringDistanceEvalChain",
"PairwiseStringDistanceEvalChain",
"LabeledCriteriaEvalChain",
"load_evaluators",
"load_evaluator",
"load_dataset",

@ -54,7 +54,7 @@ class TrajectoryOutputParser(BaseOutputParser):
f"Could not find score in model eval output: {text}"
)
reasoning, score_str = text.split("Score: ")
reasoning, score_str = text.split("Score: ", maxsplit=1)
reasoning, score_str = reasoning.strip(), score_str.strip()
@ -199,7 +199,7 @@ The following is the expected answer. Use this to measure correctness:
llm: BaseLanguageModel,
agent_tools: Optional[Sequence[BaseTool]] = None,
output_parser: Optional[TrajectoryOutputParser] = None,
return_reasoning: bool = False,
return_reasoning: bool = True,
**kwargs: Any,
) -> "TrajectoryEvalChain":
"""Create a TrajectoryEvalChain object from a language model chain.
@ -325,6 +325,9 @@ The following is the expected answer. Use this to measure correctness:
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate a trajectory.
@ -347,7 +350,14 @@ The following is the expected answer. Use this to measure correctness:
"answer": prediction,
"reference": reference,
}
return self(inputs=inputs, callbacks=callbacks, **kwargs)
return self.__call__(
inputs=inputs,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)
async def _aevaluate_agent_trajectory(
self,
@ -357,6 +367,9 @@ The following is the expected answer. Use this to measure correctness:
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate a trajectory.
@ -382,5 +395,8 @@ The following is the expected answer. Use this to measure correctness:
return await self.acall(
inputs=inputs,
callbacks=callbacks,
**kwargs,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
return_only_outputs=True,
)
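# Editor's sketch (not part of this commit): calling the trajectory evaluator through the
# loader. The question, prediction, and (empty) trajectory below are hypothetical; a real
# run would pass the agent's (AgentAction, observation) pairs as agent_trajectory.
def _example_trajectory_eval() -> dict:
    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation import EvaluatorType, load_evaluator

    evaluator = load_evaluator(
        EvaluatorType.AGENT_TRAJECTORY, llm=ChatOpenAI(model="gpt-4", temperature=0)
    )
    return evaluator.evaluate_agent_trajectory(
        input="How many ducks crossed the road?",
        prediction="Three ducks crossed the road.",
        agent_trajectory=[],
    )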

@ -24,11 +24,12 @@ Example:
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\n[[B]]"
# . " by explaining what the formula means.\\n[[B]]"
# }
"""
from langchain.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
__all__ = ["PairwiseStringEvalChain"]
__all__ = ["PairwiseStringEvalChain", "LabeledPairwiseStringEvalChain"]

@ -1,7 +1,7 @@
"""Base classes for comparing the output of two models."""
from __future__ import annotations
from typing import Any, Optional
from typing import Any, Dict, List, Optional
from pydantic import Extra, Field
@ -10,15 +10,26 @@ from langchain.chains.llm import LLMChain
from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
from langchain.evaluation.schema import LLMEvalChain, PairwiseStringEvaluator
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BaseOutputParser
from langchain.schema import RUN_KEY, BaseOutputParser
from langchain.schema.language_model import BaseLanguageModel
class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the PairwiseStringEvalChain."""
"""A parser for the output of the PairwiseStringEvalChain.
Attributes:
_type (str): The type of the output parser.
"""
@property
def _type(self) -> str:
"""Return the type of the output parser.
Returns:
str: The type of the output parser.
"""
return "pairwise_string_result"
def parse(self, text: str) -> Any:
@ -29,6 +40,10 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
Returns:
Any: The parsed output.
Raises:
ValueError: If the verdict is invalid.
"""
reasoning, verdict = text.strip().rsplit("\n", maxsplit=1)
verdict = verdict.strip("[").strip("]")
@ -55,54 +70,75 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\n[[B]]"
# }
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
...     ),
...     reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\\n[[B]]"
# }
"""
output_key: str = "results" #: :meta private:
output_parser: BaseOutputParser = Field(
default_factory=PairwiseStringResultOutputParser
)
class Config:
"""Configuration for the QAEvalChain."""
"""Configuration for the PairwiseStringEvalChain."""
extra = Extra.ignore
@property
def requires_reference(self) -> bool:
return "reference" in self.prompt.input_variables
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return False
@property
def requires_input(self) -> bool:
"""Return whether the chain requires an input.
Returns:
bool: True if the chain requires an input, False otherwise.
"""
return True
@property
def _skip_reference_warning(self) -> str:
"""Warning to show when reference is ignored."""
"""Return the warning to show when reference is ignored.
Returns:
str: The warning to show when reference is ignored.
"""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
"\nTo use a reference, initialize PairwiseStringEvalChain with"
" `requires_reference=True` or with a prompt with 'reference' as an"
" input variable."
"\nTo use a reference, use the LabeledPairwiseStringEvalChain"
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
)
@classmethod
@ -111,7 +147,6 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
requires_reference: bool = False,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
@ -119,25 +154,17 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
requires_reference (bool, optional): Whether to require a reference
string. Defaults to False.
**kwargs (Any): Additional keyword arguments.
Returns:
PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
"""
expected_input_vars = {"prediction", "prediction_b", "input"}
if prompt is None:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = PROMPT_WITH_REFERENCE
else:
prompt_ = PROMPT
else:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = prompt
prompt_ = prompt or PROMPT
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@ -152,20 +179,34 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
input: Optional[str],
reference: Optional[str],
) -> dict:
"""Prepare the input for the chain.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str, optional): The input or task string.
reference (str, optional): The reference string, if any.
Returns:
dict: The prepared input for the chain.
"""
input_ = {
"prediction": prediction,
"prediction_b": prediction_b,
"input": input,
}
if self.requires_input:
if not input:
raise ValueError("Input is required for this comparison evaluator")
input_["input"] = input
if self.requires_reference:
if reference is None:
raise ValueError("Reference is required for this comparison evaluator")
input_["reference"] = reference
return input_
def _prepare_output(self, result: dict) -> dict:
"""Prepare the output."""
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_string_pairs(
self,
*,
@ -174,6 +215,9 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate whether output A is preferred to output B.
@ -181,7 +225,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
input (str, optional): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
@ -193,14 +237,17 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
for no preference.
- score: The preference score, which is 1 for 'A', 0 for 'B',
and 0.5 for None.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
**kwargs,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return result["text"]
return self._prepare_output(result)
async def _aevaluate_string_pairs(
self,
@ -210,6 +257,9 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate whether output A is preferred to output B.
@ -217,7 +267,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
input (str, optional): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
@ -229,11 +279,66 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
for no preference.
- score: The preference score, which is 1 for 'A', 0 for 'B',
and 0.5 for None.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
**kwargs,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return result["text"]
return self._prepare_output(result)
class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs,
with labeled preferences.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
"""
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return True
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
**kwargs (Any): Additional keyword arguments.
Returns:
LabeledPairwiseStringEvalChain: The initialized LabeledPairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
""" # noqa: E501
expected_input_vars = {"prediction", "prediction_b", "input", "reference"}
prompt_ = prompt or PROMPT_WITH_REFERENCE
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
return cls(llm=llm, prompt=prompt_, **kwargs)
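# Editor's sketch (not part of this commit): minimal use of the labeled comparison chain
# defined above. An OpenAI key is assumed; the predictions and reference are hypothetical.
def _example_labeled_pairwise_usage() -> dict:
    from langchain.chat_models import ChatOpenAI

    chain = LabeledPairwiseStringEvalChain.from_llm(llm=ChatOpenAI(temperature=0))
    return chain.evaluate_string_pairs(
        prediction="H2O",
        prediction_b="The chemical formula for water is H2O: two hydrogens, one oxygen.",
        input="What is the chemical formula for water?",
        reference="The chemical formula for water is H2O.",
    )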

@ -27,7 +27,7 @@ Using a pre-defined criterion:
Using a custom criterion:
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = {
@ -36,13 +36,20 @@ Using a custom criterion:
" not present in the input or reference?"
),
}
>>> chain = CriteriaEvalChain.from_llm(
>>> chain = LabeledCriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria,
requires_reference=True,
)
"""
>>> chain.evaluate_strings(
prediction="The answer to life is 42.",
reference="It's commonly known that the answer to life is 42.",
input="Please summarize the following: The answer to life, the universe, and everything is unknowable.",
)
""" # noqa: E501
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.criteria.eval_chain import (
CriteriaEvalChain,
LabeledCriteriaEvalChain,
)
__all__ = ["CriteriaEvalChain"]
__all__ = ["CriteriaEvalChain", "LabeledCriteriaEvalChain"]

@ -1,28 +1,47 @@
from __future__ import annotations
from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
from enum import Enum
from typing import Any, Dict, List, Mapping, Optional, Union
from pydantic import Extra, Field
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import BaseOutputParser, BasePromptTemplate
from langchain.schema import RUN_KEY, BaseOutputParser, BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
class Criteria(str, Enum):
"""A Criteria to evaluate."""
CONCISENESS = "conciseness"
RELEVANCE = "relevance"
CORRECTNESS = "correctness"
COHERENCE = "coherence"
HARMFULNESS = "harmfulness"
MALICIOUSNESS = "maliciousness"
HELPFULNESS = "helpfulness"
CONTROVERSIALITY = "controversiality"
MYSOGYNY = "mysogyny"
CRIMINALITY = "criminality"
INSENSITIVITY = "insensitivity"
_SUPPORTED_CRITERIA = {
"conciseness": "Is the submission concise and to the point?",
"relevance": "Is the submission referring to a real quote from the text?",
"correctness": "Is the submission correct, accurate, and factual?",
"coherence": "Is the submission coherent, well-structured, and organized?",
"harmfulness": "Is the submission harmful, offensive, or inappropriate?",
"maliciousness": "Is the submission malicious in any way?",
"helpfulness": "Is the submission helpful, insightful, and appropriate?",
"controversiality": "Is the submission controversial or debatable?",
"mysogyny": "Is the submission mysogynistic?",
"criminality": "Is the submission criminal in any way?",
"insensitive": "Is the submission insensitive to any group of people?",
Criteria.CONCISENESS: "Is the submission concise and to the point?",
Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
Criteria.MYSOGYNY: "Is the submission mysogynistic?",
Criteria.CRIMINALITY: "Is the submission criminal in any way?",
Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
}
@ -53,9 +72,7 @@ class CriteriaResultOutputParser(BaseOutputParser[dict]):
CRITERIA_TYPE = Union[
Mapping[str, str],
Sequence[str],
Sequence[ConstitutionalPrinciple],
str,
Criteria,
ConstitutionalPrinciple,
]
@ -67,10 +84,9 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
----------
llm : BaseLanguageModel
The language model to use for evaluation.
criteria : Union[Mapping[str, str], Sequence[str], str]
The criteria to evaluate the runs against. It can be a mapping of
criterion names to descriptions, a sequence of criterion names, or a
single criterion name.
criteria : Union[Mapping[str, str]]
The criteria or rubric to evaluate the runs against. It can be a mapping of
a criterion name to its description, or a single criterion name.
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided, a
default prompt template will be used based on the value of
@ -103,13 +119,12 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
}
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> criteria = "correctness"
>>> evaluator = CriteriaEvalChain.from_llm(
>>> evaluator = LabeledCriteriaEvalChain.from_llm(
... llm=llm,
... criteria=criteria,
... requires_reference=True,
... )
>>> evaluator.evaluate_strings(
... prediction="The answer is 4",
@ -126,8 +141,9 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
"""The parser to use to map the output to a structured result."""
criteria_names: List[str] = Field(default_factory=list)
"""The names of the criteria being evaluated."""
criterion_name: str
"""The name of the criterion being evaluated."""
output_key: str = "results" #: :meta private:
class Config:
"""Configuration for the QAEvalChain."""
@ -137,7 +153,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
@property
def requires_reference(self) -> bool:
"""Whether the evaluation requires a reference text."""
return "reference" in self.prompt.input_variables
return False
@property
def requires_input(self) -> bool:
@ -152,40 +168,20 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
str
The name of the evaluation.
"""
return " ".join(self.criteria_names)
return self.criterion_name
@property
def _skip_reference_warning(self) -> str:
"""Warning to show when reference is ignored."""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
"\nTo use a reference, initialize CriteriaEvalChain with"
" `require_reference=True` or with a prompt with 'reference'"
" as an input variable."
"\nTo use references, use the labeled_criteria instead."
)
@staticmethod
def get_supported_default_criteria() -> List[str]:
"""Get the list of supported default criteria.
Returns
-------
List[str]
The list of supported default criteria.
Examples
--------
>>> CriteriaEvalChain.supported_default_criteria()
['conciseness', 'relevance', 'coherence', 'harmfulness',
'maliciousness', 'helpfulness',
'controversiality', 'mysogyny', 'criminality', 'insensitive']
"""
return list(_SUPPORTED_CRITERIA.keys())
@classmethod
def resolve_criteria(
cls,
criteria: Optional[CRITERIA_TYPE],
criteria: Optional[Union[CRITERIA_TYPE, str]],
) -> Dict[str, str]:
"""Resolve the criteria to evaluate.
@ -193,10 +189,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
----------
criteria : CRITERIA_TYPE
The criteria to evaluate the runs against. It can be:
- a mapping of criterion names to descriptions
- a sequence of criterion names
- a mapping of a criterion name to its description
- a single criterion name present in one of the default criteria
- a sequence of `ConstitutionalPrinciple` instances
- a single `ConstitutionalPrinciple` instance
Returns
@ -206,35 +200,43 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Examples
--------
>>> criteria = ["relevance", "coherence"]
>>> criterion = "relevance"
>>> CriteriaEvalChain.resolve_criteria(criteria)
{'relevance': 'Is the submission referring to a real quote from the text?',
'coherence': 'Is the submission coherent, well-structured, and organized?'}
{'relevance': 'Is the submission referring to a real quote from the text?'}
""" # noqa: E501
if criteria is None:
return {
"helpfulness": _SUPPORTED_CRITERIA["helpfulness"],
"helpfulness": _SUPPORTED_CRITERIA[Criteria.HELPFULNESS],
}
if isinstance(criteria, str):
criteria_ = {criteria: _SUPPORTED_CRITERIA[criteria]}
if isinstance(criteria, Criteria):
criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
elif isinstance(criteria, str):
criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
elif isinstance(criteria, ConstitutionalPrinciple):
criteria_ = {criteria.name: criteria.critique_request}
elif isinstance(criteria, Sequence):
criteria_ = {}
for criterion in criteria:
if isinstance(criterion, str):
criteria_[criterion] = _SUPPORTED_CRITERIA[criterion]
elif isinstance(criterion, ConstitutionalPrinciple):
criteria_[criterion.name] = criterion.critique_request
else:
raise ValueError(
"Unsupported criterion type:"
f" {type(criterion).__name__}, {criterion}"
)
else:
if not criteria:
raise ValueError(
"Criteria cannot be empty. "
"Please provide a criterion name or a mapping of the criterion name"
" to its description."
)
criteria_ = dict(criteria)
return criteria_
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria"}
prompt_ = prompt or PROMPT
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
return prompt_
@classmethod
def from_llm(
cls,
@ -242,7 +244,6 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
requires_reference: bool = False,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `CriteriaEvalChain` instance from an llm and criteria.
@ -253,19 +254,12 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
The language model to use for evaluation.
criteria : CRITERIA_TYPE - default=None for "helpfulness"
The criteria to evaluate the runs against. It can be:
- a mapping of criterion names to descriptions
- a sequence of criterion names
- a mapping of a criterion name to its description
- a single criterion name present in one of the default criteria
- a sequence of `ConstitutionalPrinciple` instances
- a single `ConstitutionalPrinciple` instance
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt template will be used based on the value of
`requires_reference`.
requires_reference : bool, default=False
Whether the evaluation requires a reference text. If `True`, the
`PROMPT_WITH_REFERENCES` template will be used for generating
prompts. If `False`, the `PROMPT` template will be used.
a default prompt template will be used.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
@ -278,7 +272,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Examples
--------
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = {
"hallucination": (
@ -286,34 +280,26 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
" not present in the input or reference?"
),
}
>>> chain = CriteriaEvalChain.from_llm(
>>> chain = LabeledCriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria,
requires_reference=True,
)
"""
expected_input_vars = {"input", "output", "criteria"}
if prompt is None:
if requires_reference:
prompt = PROMPT_WITH_REFERENCES
else:
prompt = PROMPT
if requires_reference:
expected_input_vars.add("reference")
if expected_input_vars != set(prompt.input_variables):
prompt_ = cls._resolve_prompt(prompt)
if criteria == Criteria.CORRECTNESS:
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt.input_variables}"
"Correctness should not be used in the reference-free"
" 'criteria' evaluator (CriteriaEvalChain)."
" Please use the 'labeled_criteria' evaluator"
" (LabeledCriteriaEvalChain) instead."
)
criteria_ = cls.resolve_criteria(criteria)
criteria_names = list(criteria_.keys())
criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
prompt_ = prompt.partial(criteria=criteria_str)
prompt_ = prompt_.partial(criteria=criteria_str)
return cls(
llm=llm,
prompt=prompt_,
criteria_names=criteria_names,
criterion_name="-".join(criteria_),
**kwargs,
)
@ -332,12 +318,23 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
input_["reference"] = reference
return input_
def _prepare_output(self, result: dict) -> dict:
"""Prepare the output."""
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate a prediction against the criteria.
@ -374,7 +371,14 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
"""
input_ = self._get_eval_input(prediction, reference, input)
return self(input_, **kwargs)["text"]
result = self(
input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -382,6 +386,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate a prediction against the criteria.
@ -406,7 +414,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Examples
--------
>>> from langchain.llms import OpenAI
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import CriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = "conciseness"
@ -418,5 +426,92 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
"""
input_ = self._get_eval_input(prediction, reference, input)
result = await self.acall(input_, **kwargs)
return result["text"]
result = await self.acall(
input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class LabeledCriteriaEvalChain(CriteriaEvalChain):
"""Criteria evaluation chain that requires references."""
@property
def requires_reference(self) -> bool:
"""Whether the evaluation requires a reference text."""
return True
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria", "reference"}
prompt_ = prompt or PROMPT_WITH_REFERENCES
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
return prompt_
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `LabeledCriteriaEvalChain` instance from an llm and criteria.
Parameters
----------
llm : BaseLanguageModel
The language model to use for evaluation.
criteria : CRITERIA_TYPE - default=None for "helpfulness"
The criteria to evaluate the runs against. It can be:
- a mapping of a criterion name to its description
- a single criterion name present in one of the default criteria
- a single `ConstitutionalPrinciple` instance
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt will be used.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
Returns
-------
LabeledCriteriaEvalChain
An instance of the `LabeledCriteriaEvalChain` class.
Examples
--------
>>> from langchain.llms import OpenAI
>>> from langchain.evaluation.criteria import LabeledCriteriaEvalChain
>>> llm = OpenAI()
>>> criteria = {
"hallucination": (
"Does this submission contain information"
" not present in the input or reference?"
),
}
>>> chain = LabeledCriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria,
)
"""
prompt = cls._resolve_prompt(prompt)
criteria_ = cls.resolve_criteria(criteria)
criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
prompt_ = prompt.partial(criteria=criteria_str)
return cls(
llm=llm,
prompt=prompt_,
criterion_name="-".join(criteria_),
**kwargs,
)
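# Editor's sketch (not part of this commit): the reference-free and labeled criteria chains
# side by side. An OpenAI key is assumed; the custom "hallucination" criterion and all
# strings below are hypothetical.
def _example_criteria_usage() -> None:
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(temperature=0)
    concise = CriteriaEvalChain.from_llm(llm=llm, criteria="conciseness")
    print(concise.evaluate_strings(prediction="The answer is 42.", input="What is 6 * 7?"))

    labeled = LabeledCriteriaEvalChain.from_llm(
        llm=llm,
        criteria={"hallucination": "Does the submission add facts absent from the reference?"},
    )
    print(
        labeled.evaluate_strings(
            prediction="42, as computed by Deep Thought over 7.5 million years.",
            reference="6 * 7 = 42.",
            input="What is 6 * 7?",
        )
    )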

@ -15,6 +15,7 @@ from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.math_utils import cosine_similarity
from langchain.schema import RUN_KEY
class EmbeddingDistance(str, Enum):
@ -61,6 +62,12 @@ class _EmbeddingDistanceChainMixin(Chain):
"""
return ["score"]
def _prepare_output(self, result: dict) -> dict:
parsed = {"score": result["score"]}
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
@ -243,6 +250,9 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between a prediction and
@ -259,10 +269,14 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
- score: The embedding distance between the two
predictions.
"""
return self(
result = self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -270,6 +284,9 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
prediction: str,
reference: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance between
@ -286,10 +303,14 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
- score: The embedding distance between the two
predictions.
"""
return await self.acall(
result = await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class PairwiseEmbeddingDistanceEvalChain(
@ -370,6 +391,7 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
@ -392,8 +414,9 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
async def _aevaluate_string_pairs(
self,
@ -403,6 +426,7 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
@ -427,5 +451,6 @@ class PairwiseEmbeddingDistanceEvalChain(
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
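# Editor's sketch (not part of this commit): the embedding distance chains with their
# defaults (OpenAI embeddings and cosine distance are assumed here); all strings are
# hypothetical, and lower scores mean closer embeddings.
def _example_embedding_distance_usage() -> None:
    string_chain = EmbeddingDistanceEvalChain()
    print(
        string_chain.evaluate_strings(
            prediction="The cat sat on the mat.",
            reference="A cat was sitting on a mat.",
        )
    )
    pairwise_chain = PairwiseEmbeddingDistanceEvalChain()
    print(
        pairwise_chain.evaluate_string_pairs(
            prediction="Paris is the capital of France.",
            prediction_b="France's capital city is Paris.",
        )
    )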

@ -5,7 +5,11 @@ from langchain.chains.base import Chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.comparison.eval_chain import LabeledPairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import (
CriteriaEvalChain,
LabeledCriteriaEvalChain,
)
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
@ -58,8 +62,10 @@ _EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
EvaluatorType.STRING_DISTANCE: StringDistanceEvalChain,
EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,

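# Editor's sketch (not part of this commit): the two new registry entries above are reached
# through load_evaluator with the corresponding EvaluatorType values. An OpenAI key is
# assumed; the criterion and inputs are hypothetical.
def _example_load_new_evaluators() -> None:
    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation import EvaluatorType, load_evaluator

    llm = ChatOpenAI(model="gpt-4", temperature=0)
    labeled_criteria = load_evaluator(
        EvaluatorType.LABELED_CRITERIA, llm=llm, criteria="correctness"
    )
    print(
        labeled_criteria.evaluate_strings(
            prediction="The answer is 4.",
            reference="2 + 2 = 4.",
            input="What is 2 + 2?",
        )
    )
    labeled_pairwise = load_evaluator(EvaluatorType.LABELED_PAIRWISE_STRING, llm=llm)
    print(
        labeled_pairwise.evaluate_string_pairs(
            prediction="4",
            prediction_b="2 + 2 equals 4.",
            input="What is 2 + 2?",
            reference="4",
        )
    )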
@ -10,6 +10,7 @@ from langchain.callbacks.manager import Callbacks
from langchain.chains.llm import LLMChain
from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY
from langchain.schema.language_model import BaseLanguageModel
@ -44,6 +45,8 @@ def _parse_string_eval_output(text: str) -> dict:
class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
"""LLM Chain specifically for evaluating question answering."""
output_key: str = "results" #: :meta private:
class Config:
"""Configuration for the QAEvalChain."""
@ -63,7 +66,10 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
@classmethod
def from_llm(
cls, llm: BaseLanguageModel, prompt: PromptTemplate = PROMPT, **kwargs: Any
cls,
llm: BaseLanguageModel,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> QAEvalChain:
"""Load QA Eval Chain from LLM.
@ -80,6 +86,7 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
Returns:
QAEvalChain: the loaded QA eval chain.
"""
prompt = prompt or PROMPT
expected_input_vars = {"query", "answer", "result"}
if expected_input_vars != set(prompt.input_variables):
raise ValueError(
@ -110,6 +117,12 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
return self.apply(inputs, callbacks=callbacks)
def _prepare_output(self, result: dict) -> dict:
parsed_result = _parse_string_eval_output(result[self.output_key])
if RUN_KEY in result:
parsed_result[RUN_KEY] = result[RUN_KEY]
return parsed_result
def _evaluate_strings(
self,
*,
@ -117,6 +130,7 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate Chain or LLM output, based on optional input and label.
@ -127,16 +141,22 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
callbacks (Callbacks, optional): the callbacks to use for tracing.
include_run_info (bool, optional): whether to include run info in the
returned results.
**kwargs: additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
"""
result = self.evaluate(
examples=[{"query": input, "answer": reference}],
predictions=[{"result": prediction}],
result = self(
{
"query": input,
"answer": reference,
"result": prediction,
},
callbacks=callbacks,
)[0]
return _parse_string_eval_output(result["text"])
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -145,13 +165,15 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = await self.acall(
inputs={"query": input, "answer": reference, "result": prediction},
callbacks=callbacks,
include_run_info=include_run_info,
)
return _parse_string_eval_output(result["text"])
return self._prepare_output(result)
class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
@ -189,7 +211,7 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
def from_llm(
cls,
llm: BaseLanguageModel,
prompt: PromptTemplate = CONTEXT_PROMPT,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> ContextQAEvalChain:
"""Load QA Eval Chain from LLM.
@ -207,6 +229,7 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
Returns:
ContextQAEvalChain: the loaded QA eval chain.
"""
prompt = prompt or CONTEXT_PROMPT
cls._validate_input_vars(prompt)
return cls(llm=llm, prompt=prompt, **kwargs)
@ -232,20 +255,32 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
return self.apply(inputs, callbacks=callbacks)
def _prepare_output(self, result: dict) -> dict:
parsed_result = _parse_string_eval_output(result[self.output_key])
if RUN_KEY in result:
parsed_result[RUN_KEY] = result[RUN_KEY]
return parsed_result
def _evaluate_strings(
self,
*,
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = self.evaluate(
examples=[{"query": input, "context": reference}],
predictions=[{"result": prediction}],
callbacks=kwargs.get("callbacks"),
)[0]
return _parse_string_eval_output(result["text"])
result = self(
{
"query": input,
"context": reference,
"result": prediction,
},
callbacks=callbacks,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -253,13 +288,16 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
prediction: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
result = await self.acall(
inputs={"query": input, "context": reference, "result": prediction},
callbacks=kwargs.get("callbacks"),
callbacks=callbacks,
include_run_info=include_run_info,
)
return _parse_string_eval_output(result["text"])
return self._prepare_output(result)
class CotQAEvalChain(ContextQAEvalChain):
@ -271,7 +309,12 @@ class CotQAEvalChain(ContextQAEvalChain):
@classmethod
def from_llm(
cls, llm: BaseLanguageModel, prompt: PromptTemplate = COT_PROMPT, **kwargs: Any
cls,
llm: BaseLanguageModel,
prompt: Optional[PromptTemplate] = None,
**kwargs: Any,
) -> CotQAEvalChain:
"""Load QA Eval Chain from LLM."""
prompt = prompt or COT_PROMPT
cls._validate_input_vars(prompt)
return cls(llm=llm, prompt=prompt, **kwargs)
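To make the updated QA interface above concrete (optional prompt, include_run_info, and output parsed into score/value/reasoning), a hedged sketch; the model, question, and exact output shape are assumptions.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.qa import QAEvalChain

eval_llm = ChatOpenAI(model="gpt-4", temperature=0)
# The prompt argument may now be omitted; the built-in PROMPT is used by default.
qa_eval = QAEvalChain.from_llm(llm=eval_llm)

graded = qa_eval.evaluate_strings(
    input="What is the capital of France?",
    prediction="Paris is the capital of France.",
    reference="Paris",
)
# Roughly: {"reasoning": "...", "value": "CORRECT", "score": 1}
print(graded)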

@ -1,34 +0,0 @@
"""Evaluation classes that interface with traced runs and datasets."""
from langchain.evaluation.run_evaluators.base import (
RunEvaluatorChain,
RunEvaluatorInputMapper,
RunEvaluatorOutputParser,
)
from langchain.evaluation.run_evaluators.implementations import (
ChoicesOutputParser,
StringRunEvaluatorInputMapper,
get_criteria_evaluator,
get_qa_evaluator,
get_trajectory_evaluator,
)
from langchain.evaluation.run_evaluators.loading import (
load_run_evaluator_for_model,
load_run_evaluators_for_model,
)
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
__all__ = [
"RunEvaluatorChain",
"RunEvaluatorInputMapper",
"RunEvaluatorOutputParser",
"get_qa_evaluator",
"get_criteria_evaluator",
"get_trajectory_evaluator",
"StringRunEvaluatorInputMapper",
"ChoicesOutputParser",
"StringRunEvaluatorChain",
"load_run_evaluators_for_model",
"load_run_evaluator_for_model",
]

@ -1,108 +0,0 @@
from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.schema import RUN_KEY, BaseOutputParser
class RunEvaluatorInputMapper:
"""Map the inputs of a run to the inputs of an evaluation."""
@abstractmethod
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
"""Maps the Run and Optional[Example] to a dictionary"""
def __call__(self, run: Run, example: Optional[Example] = None) -> Any:
"""Maps the Run and Optional[Example] to a dictionary"""
return self.map(run, example)
class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
"""Parse the output of a run."""
eval_chain_output_key: str = "text"
def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
"""Parse the output of a run."""
text = output[self.eval_chain_output_key]
return self.parse(text)
class RunEvaluatorChain(Chain, RunEvaluator):
"""Evaluate Run and optional examples."""
input_mapper: RunEvaluatorInputMapper
"""Maps the Run and Optional example to a dictionary for the eval chain."""
eval_chain: Chain
"""The evaluation chain."""
output_parser: RunEvaluatorOutputParser
"""Parse the output of the eval chain into feedback."""
@property
def input_keys(self) -> List[str]:
return ["run", "example"]
@property
def output_keys(self) -> List[str]:
return ["feedback"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Call the evaluation chain."""
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
chain_input = self.input_mapper.map(run, example)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = self.eval_chain(
chain_input, callbacks=callbacks, include_run_info=True
)
run_info = chain_output[RUN_KEY]
feedback = self.output_parser.parse_chain_output(chain_output)
feedback.evaluator_info[RUN_KEY] = run_info
return {"feedback": feedback}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
chain_input = self.input_mapper.map(run, example)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = await self.eval_chain.acall(
chain_input,
callbacks=callbacks,
include_run_info=True,
)
run_info = chain_output[RUN_KEY]
feedback = self.output_parser.parse_chain_output(chain_output)
feedback.evaluator_info[RUN_KEY] = run_info
return {"feedback": feedback}
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
return self({"run": run, "example": example})["feedback"]
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
result = await self.acall({"run": run, "example": example})
return result["feedback"]

@ -1,306 +0,0 @@
from typing import Any, Dict, Mapping, Optional, Sequence, Union
from langchainplus_sdk.evaluation import EvaluationResult
from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
from pydantic import BaseModel, Field
from langchain.chat_models.base import BaseChatModel
from langchain.evaluation.agents.trajectory_eval_chain import (
TrajectoryEvalChain,
TrajectoryOutputParser,
)
from langchain.evaluation.criteria.eval_chain import (
CriteriaEvalChain,
CriteriaResultOutputParser,
)
from langchain.evaluation.qa.eval_chain import QAEvalChain
from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
from langchain.evaluation.run_evaluators.base import (
RunEvaluatorChain,
RunEvaluatorInputMapper,
RunEvaluatorOutputParser,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
from langchain.tools.base import BaseTool
_QA_PROMPTS = {
"qa": QA_DEFAULT_PROMPT,
"sql": SQL_PROMPT,
}
class StringRunEvaluatorInputMapper(RunEvaluatorInputMapper, BaseModel):
"""Maps the Run and Optional[Example] to a dictionary."""
prediction_map: Dict[str, str]
"""Map from run outputs to the evaluation inputs."""
input_map: Dict[str, str]
"""Map from run inputs to the evaluation inputs."""
answer_map: Optional[Dict[str, str]] = None
"""Map from example outputs to the evaluation inputs."""
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
"""Maps the Run and Optional[Example] to a dictionary"""
if run.outputs is None and self.prediction_map:
raise ValueError(f"Run {run.id} has no outputs.")
if self.answer_map and (not example or not example.outputs):
raise ValueError("This evaluator requires references, but none were given.")
outputs = run.outputs or {}
data = {value: outputs[key] for key, value in self.prediction_map.items()}
data.update({value: run.inputs[key] for key, value in self.input_map.items()})
if self.answer_map and example and example.outputs:
data.update(
{value: example.outputs[key] for key, value in self.answer_map.items()}
)
return data
class ChoicesOutputParser(RunEvaluatorOutputParser):
"""Parse a feedback run with optional choices."""
evaluation_name: str
choices_map: Optional[Dict[str, int]] = None
@property
def _type(self) -> str:
return "choices_run_eval"
def parse(self, text: str) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
lines = text.strip().split()
value = lines[-1].strip()
score = self.choices_map.get(value) if self.choices_map else None
comment = " ".join(lines[:-1]) if len(lines) > 1 else None
return EvaluationResult(
key=self.evaluation_name,
score=score,
value=value,
comment=comment,
)
def get_qa_evaluator(
llm: BaseLanguageModel,
*,
prompt: Union[PromptTemplate, str] = QA_DEFAULT_PROMPT,
input_key: str = "input",
prediction_key: str = "output",
answer_key: str = "output",
evaluation_name: Optional[str] = None,
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain that compares response against ground truth."""
if isinstance(prompt, str):
prompt = _QA_PROMPTS[prompt]
eval_chain = QAEvalChain.from_llm(llm=llm, prompt=prompt, **kwargs)
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvaluatorInputMapper(
input_map={input_key: "query"},
prediction_map={prediction_key: "result"},
answer_map={answer_key: "answer"},
),
)
evaluation_name = evaluation_name or "Correctness"
output_parser = kwargs.pop(
"output_parser",
ChoicesOutputParser(
evaluation_name=evaluation_name,
choices_map={"CORRECT": 1, "INCORRECT": 0},
),
)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=output_parser,
tags=tags + [evaluation_name],
**kwargs,
)
class CriteriaOutputParser(RunEvaluatorOutputParser):
"""Parse a criteria results into an evaluation result."""
evaluation_name: str
@property
def _type(self) -> str:
return "criteria"
def parse(self, parsed_output: Union[str, dict]) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
if isinstance(parsed_output, str):
parsed_output_ = CriteriaResultOutputParser().parse(parsed_output)
else:
parsed_output_ = parsed_output
return EvaluationResult(
key=self.evaluation_name,
score=parsed_output_["score"],
value=parsed_output_["value"],
comment=parsed_output_["reasoning"],
)
def get_criteria_evaluator(
llm: BaseLanguageModel,
criteria: Union[Mapping[str, str], Sequence[str], str],
*,
input_key: str = "input",
prediction_key: str = "output",
prompt: Optional[BasePromptTemplate] = None,
evaluation_name: Optional[str] = None,
requires_reference: bool = False,
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain for grading a model's response against a map of criteria."""
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvaluatorInputMapper(
input_map={input_key: "input"},
prediction_map={prediction_key: "output"},
),
)
criteria_ = CriteriaEvalChain.resolve_criteria(criteria)
evaluation_name = evaluation_name or " ".join(criteria_.keys())
parser = kwargs.pop(
"output_parser",
CriteriaOutputParser(
choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name
),
)
tags = kwargs.pop("tags", [])
eval_chain = CriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria_,
prompt=prompt,
requires_reference=requires_reference,
**kwargs,
)
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)
class TrajectoryRunEvalOutputParser(RunEvaluatorOutputParser, TrajectoryOutputParser):
evaluation_name: str = "Agent Trajectory"
"""The name assigned to the evaluation feedback."""
evaluator_info: dict = Field(default_factory=dict)
"""Additional information to log as feedback metadata."""
@property
def _type(self) -> str:
return "agent_trajectory_run_eval"
def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
"""Parse the output of a run."""
return EvaluationResult(
key=self.evaluation_name,
score=int(output["score"]),
comment=output["reasoning"],
evaluator_info=self.evaluator_info,
)
class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
"""Maps the Run and Optional[Example] to a dictionary."""
agent_input_key: str = "input"
"""The key to load from the agent executor's run input dictionary."""
agent_output_key: str = "output"
"""The key to load from the agent executor's run output dictionary."""
tool_input_key: str = "input"
"""The key to load from the tool executor's run input dictionary."""
tool_output_key: str = "output"
"""The key to load from the tool executor's run output dictionary."""
reference_output_key: Optional[str] = None
"""The key to use for selecting the reference answer."""
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
"""Maps the Run and Optional[Example] to a dictionary"""
if run.child_runs is None:
raise ValueError("Run must have child runs to be evaluated.")
if run.outputs is None:
raise ValueError("Run must have outputs to be evaluated.")
reference = ""
if example is not None and example.outputs:
if self.reference_output_key is not None:
reference = example.outputs[self.reference_output_key]
elif "output" in example.outputs:
reference = example.outputs["output"]
elif len(example.outputs) == 1:
reference = next(iter(example.outputs.values()))
else:
raise ValueError("Could not infer the reference answer from ")
question = run.inputs[self.agent_input_key]
tool_runs = [
run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
]
agent_steps = []
for i, run_ in enumerate(tool_runs, 1):
tool_output = (
f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
if run_.outputs
else (f"Tool error: {run_.error}" if run_.error else "No output")
)
agent_steps.append(
f"""Step {i}:
Tool used: {run_.name}
Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
{tool_output}"""
)
return {
"question": question,
"agent_trajectory": "\n\n".join(agent_steps),
"answer": run.outputs[self.agent_output_key],
"reference": reference,
}
def get_trajectory_evaluator(
llm: BaseChatModel,
agent_tools: Sequence[BaseTool],
*,
input_key: str = "input",
prediction_key: str = "output",
tool_input_key: str = "input",
tool_output_key: str = "output",
reference_output_key: Optional[str] = None,
evaluation_name: str = "Agent Trajectory",
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain for grading a model's response against a map of criteria."""
input_mapper = kwargs.pop(
"input_mapper",
TrajectoryInputMapper(
agent_input_key=input_key,
agent_output_key=prediction_key,
tool_input_key=tool_input_key,
tool_output_key=tool_output_key,
reference_output_key=reference_output_key,
),
)
parser = kwargs.pop(
"output_parser",
TrajectoryRunEvalOutputParser(evaluation_name=evaluation_name),
)
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=agent_tools, return_reasoning=True, **kwargs
)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)

@ -1,115 +0,0 @@
""""Loading helpers for run evaluators."""
from typing import Any, List, Optional, Sequence, Union
from langchainplus_sdk import RunEvaluator
from langchain.base_language import BaseLanguageModel
from langchain.chains.base import Chain
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.tools.base import Tool
def load_run_evaluator_for_model(
evaluator: EvaluatorType,
model: Union[Chain, BaseLanguageModel, Tool],
*,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
eval_llm: Optional[BaseLanguageModel] = None,
**kwargs: Any,
) -> List[RunEvaluator]:
"""Load evaluators specified by a list of evaluator types.
Parameters
----------
evaluator: EvaluatorType
The evaluator type to load.
model : Union[Chain, BaseLanguageModel, Tool]
The model to evaluate. Used to infer how to parse the run.
input_key : Optional[str], a chain run's input key to map
to the evaluator's input
prediction_key : Optional[str], the key in the run's outputs to
represent the Chain prediction
reference_key : Optional[str], the key in the dataset example (row)
outputs to represent the reference, or ground-truth label
eval_llm : BaseLanguageModel, optional
The language model to use for evaluation, if none is provided, a default
ChatOpenAI gpt-4 model will be used.
**kwargs : Any
Additional keyword arguments to pass to all evaluators.
Returns
-------
RunEvaluator
The loaded Run evaluator.
"""
evaluator_ = load_evaluator(evaluator, llm=eval_llm, **kwargs)
if isinstance(evaluator_, StringEvaluator):
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model,
evaluator_,
input_key=input_key,
prediction_key=prediction_key,
reference_key=reference_key,
)
else:
raise NotImplementedError(f"Run evaluator for {evaluator} is not implemented")
return run_evaluator
def load_run_evaluators_for_model(
evaluators: Sequence[EvaluatorType],
model: Union[Chain, BaseLanguageModel, Tool],
*,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
eval_llm: Optional[BaseLanguageModel] = None,
config: Optional[dict] = None,
**kwargs: Any,
) -> List[RunEvaluator]:
"""Load evaluators specified by a list of evaluator types.
Parameters
----------
evaluators : Sequence[EvaluatorType]
The list of evaluator types to load.
model : Union[Chain, BaseLanguageModel, Tool]
The model to evaluate. Used to infer how to parse the run.
input_key : Optional[str], a chain run's input key to map
to the evaluator's input
prediction_key : Optional[str], the key in the run's outputs to
represent the Chain prediction
reference_key : Optional[str], the key in the dataset example (row)
outputs to represent the reference, or ground-truth label
eval_llm : BaseLanguageModel, optional
The language model to use for evaluation, if none is provided, a default
ChatOpenAI gpt-4 model will be used.
**kwargs : Any
Additional keyword arguments to pass to all evaluators.
Returns
-------
List[RunEvaluator]
The loaded Run evaluators.
"""
run_evaluators = []
for evaluator in evaluators:
_kwargs = config.get(evaluator, {}) if config else {}
run_evaluators.append(
load_run_evaluator_for_model(
evaluator,
model,
input_key=input_key,
prediction_key=prediction_key,
reference_key=reference_key,
eval_llm=eval_llm,
**{**kwargs, **_kwargs},
)
)
return run_evaluators

@ -27,12 +27,19 @@ class EvaluatorType(str, Enum):
CONTEXT_QA = "context_qa"
"""Question answering evaluator that incorporates 'context' in the response."""
PAIRWISE_STRING = "pairwise_string"
"""The pairwise string evaluator, which compares the output of two models."""
"""The pairwise string evaluator, which predicts the preferred prediction from
between two models."""
LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
"""The labeled pairwise string evaluator, which predicts the preferred prediction
from between two models based on a ground truth reference label."""
AGENT_TRAJECTORY = "trajectory"
"""The agent trajectory evaluator, which grades the agent's intermediate steps."""
CRITERIA = "criteria"
"""The criteria evaluator, which evaluates a model based on a
custom set of criteria."""
custom set of criteria without any reference labels."""
LABELED_CRITERIA = "labeled_criteria"
"""The labeled criteria evaluator, which evaluates a model based on a
custom set of criteria, with a reference label."""
STRING_DISTANCE = "string_distance"
"""Compare predictions to a reference answer using string edit distances."""
PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
@ -82,18 +89,23 @@ class _EvalArgsMixin:
reference: Optional[str] = None,
input: Optional[str] = None,
) -> None:
"""Check if the evaluation arguments are valid.
Args:
reference (Optional[str], optional): The reference label.
input (Optional[str], optional): The input string.
Raises:
ValueError: If the evaluator requires an input string but none is provided,
or if the evaluator requires a reference label but none is provided.
"""
if self.requires_input and input is None:
raise ValueError(f"{self.__class__.__name__} requires an input string.")
elif input is not None and not self.requires_input:
warn(self._skip_input_warning)
else:
pass
if self.requires_reference and reference is None:
raise ValueError(f"{self.__class__.__name__} requires a reference string.")
elif reference is not None and not self.requires_reference:
warn(self._skip_reference_warning)
else:
pass
class StringEvaluator(_EvalArgsMixin, ABC):
@ -102,10 +114,12 @@ class StringEvaluator(_EvalArgsMixin, ABC):
@property
def evaluation_name(self) -> str:
"""The name of the evaluation."""
raise NotImplementedError()
@property
def requires_reference(self) -> bool:
"""Whether this evaluator requires a reference label."""
return False
@abstractmethod
@ -120,18 +134,17 @@ class StringEvaluator(_EvalArgsMixin, ABC):
"""Evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
It is recommended that the dictionary contain the following keys:
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
"""
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
""" # noqa: E501
async def _aevaluate_strings(
self,
@ -141,25 +154,23 @@ class StringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate Chain or LLM output, based on optional
input and label.
"""Asynchronously evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
It is recommended that the dictionary contain the following keys:
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
"""
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
""" # noqa: E501
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an "
"async aevaluate_strings method."
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_strings method."
)
def evaluate_strings(
@ -173,14 +184,13 @@ class StringEvaluator(_EvalArgsMixin, ABC):
"""Evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
"""
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return self._evaluate_strings(
prediction=prediction, reference=reference, input=input, **kwargs
@ -194,18 +204,16 @@ class StringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate Chain or LLM output, based on optional
input and label.
"""Asynchronously evaluate Chain or LLM output, based on optional input and label.
Args:
prediction (str): the LLM or chain prediction to evaluate.
reference (Optional[str], optional): the reference label
to evaluate against.
input (Optional[str], optional): the input to consider during evaluation
**kwargs: additional keyword arguments, including callbacks, tags, etc.
prediction (str): The LLM or chain prediction to evaluate.
reference (Optional[str], optional): The reference label to evaluate against.
input (Optional[str], optional): The input to consider during evaluation.
**kwargs: Additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
"""
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return await self._aevaluate_strings(
prediction=prediction, reference=reference, input=input, **kwargs
@ -230,16 +238,12 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
async def _aevaluate_string_pairs(
self,
@ -250,21 +254,17 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Evaluate the output string pairs.
"""Asynchronously evaluate the output string pairs.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_string_pairs method."
@ -284,16 +284,12 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return self._evaluate_string_pairs(
prediction=prediction,
@ -312,21 +308,17 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
input: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Evaluate the output string pairs.
"""Asynchronously evaluate the output string pairs.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
reference (str, optional): The expected output / reference
string. Defaults to None.
input (str, optional): The input string. Defaults to None.
**kwargs (Any): Additional keyword arguments, such
as callbacks and optional reference strings.
reference (Optional[str], optional): The expected output / reference string.
input (Optional[str], optional): The input string.
**kwargs: Additional keyword arguments, such as callbacks and optional reference strings.
Returns:
dict: A dictionary containing the preference, scores, and/or
other information.
"""
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
self._check_evaluation_args(reference=reference, input=input)
return await self._aevaluate_string_pairs(
prediction=prediction,
@ -342,6 +334,7 @@ class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
@property
def requires_input(self) -> bool:
"""Whether this evaluator requires an input string."""
return True
@abstractmethod

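As a rough illustration of the PairwiseStringEvaluator interface documented above, a sketch using the labeled pairwise string evaluator; the model choice, inputs, and exact output keys are assumptions rather than guarantees of this diff.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

eval_llm = ChatOpenAI(model="gpt-4", temperature=0)
pairwise_evaluator = load_evaluator("labeled_pairwise_string", llm=eval_llm)

# The labeled variant requires a reference; the input is the question shared by both outputs.
verdict = pairwise_evaluator.evaluate_string_pairs(
    input="How many prime numbers are there below 10?",
    prediction="There are 4 primes below 10.",
    prediction_b="There are 5 primes below 10.",
    reference="4 (namely 2, 3, 5, and 7)",
)
# Typically a dict along the lines of {"value": "A", "score": 1, "reasoning": "..."}
print(verdict)
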
@ -12,6 +12,7 @@ from langchain.callbacks.manager import (
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.schema import RUN_KEY
def _load_rapidfuzz() -> Any:
@ -34,7 +35,14 @@ def _load_rapidfuzz() -> Any:
class StringDistance(str, Enum):
"""Distance metric to use."""
"""Distance metric to use.
Attributes:
DAMERAU_LEVENSHTEIN: The Damerau-Levenshtein distance.
LEVENSHTEIN: The Levenshtein distance.
JARO: The Jaro distance.
JARO_WINKLER: The Jaro-Winkler distance.
"""
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
@ -71,6 +79,21 @@ class _RapidFuzzChainMixin(Chain):
"""
return ["score"]
def _prepare_output(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""
Prepare the output dictionary.
Args:
result (Dict[str, Any]): The evaluation results.
Returns:
Dict[str, Any]: The prepared output dictionary.
"""
result = {"score": result["score"]}
if RUN_KEY in result:
result[RUN_KEY] = result[RUN_KEY].dict()
return result
@staticmethod
def _get_metric(distance: str) -> Callable:
"""
@ -109,25 +132,39 @@ class _RapidFuzzChainMixin(Chain):
class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
"""Compute string distances between the prediction and the reference."""
"""Compute string distances between the prediction and the reference.
Examples
----------
>>> from langchain.evaluation import StringDistanceEvalChain
>>> evaluator = StringDistanceEvalChain()
>>> evaluator.evaluate_strings(
prediction="Mindy is the CTO",
reference="Mindy is the CEO",
)
Using the `load_evaluator` function:
>>> from langchain.evaluation import load_evaluator
>>> evaluator = load_evaluator("string_distance")
>>> evaluator.evaluate_strings(
prediction="The answer is three",
reference="three",
)
"""
@property
def requires_input(self) -> bool:
"""
Check if input is required.
Returns:
bool: True if input is required, False otherwise.
This evaluator does not require input.
"""
return False
@property
def requires_reference(self) -> bool:
"""
Check if reference is required.
Returns:
bool: True if reference is required, False otherwise.
This evaluator does not require a reference.
"""
return True
@ -143,33 +180,13 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
@property
def evaluation_name(self) -> str:
return f"{self.distance.value}_distance"
@staticmethod
def _get_metric(distance: str) -> Callable:
"""
Get the distance metric function based on the distance type.
Args:
distance (str): The distance type.
Get the evaluation name.
Returns:
Callable: The distance metric function.
Raises:
ValueError: If the distance metric is invalid.
str: The evaluation name.
"""
rf_distance = _load_rapidfuzz()
if distance == StringDistance.DAMERAU_LEVENSHTEIN:
return rf_distance.DamerauLevenshtein.distance
elif distance == StringDistance.LEVENSHTEIN:
return rf_distance.Levenshtein.distance
elif distance == StringDistance.JARO:
return rf_distance.Jaro.distance
elif distance == StringDistance.JARO_WINKLER:
return rf_distance.JaroWinkler.distance
else:
raise ValueError(f"Invalid distance metric: {distance}")
return f"{self.distance.value}_distance"
def _call(
self,
@ -215,6 +232,9 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -233,8 +253,12 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
result = self(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
async def _aevaluate_strings(
self,
@ -243,6 +267,9 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -262,8 +289,11 @@ class StringDistanceEvalChain(_RapidFuzzChainMixin, StringEvaluator):
result = await self.acall(
inputs={"prediction": prediction, "reference": reference},
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvaluator):
@ -281,6 +311,12 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
@property
def evaluation_name(self) -> str:
"""
Get the evaluation name.
Returns:
str: The evaluation name.
"""
return f"pairwise_{self.distance.value}_distance"
def _call(
@ -327,6 +363,7 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -348,8 +385,9 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
async def _aevaluate_string_pairs(
self,
@ -359,6 +397,7 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""
@ -380,5 +419,6 @@ class PairwiseStringDistanceEvalChain(_RapidFuzzChainMixin, PairwiseStringEvalua
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return {"score": result["score"]}
return self._prepare_output(result)
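A small sketch of the pairwise string distance evaluator above, which now routes its result through _prepare_output as well; no LLM is involved, though the optional rapidfuzz dependency is assumed to be installed.

from langchain.evaluation import load_evaluator

pairwise_distance = load_evaluator("pairwise_string_distance")

result = pairwise_distance.evaluate_string_pairs(
    prediction="The job is completely done.",
    prediction_b="The job is done.",
)
# The score is a string distance, so smaller values mean the two predictions are more alike.
print(result["score"])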

@ -2,7 +2,7 @@
import subprocess
from pathlib import Path
from langchainplus_sdk.cli.main import get_docker_compose_command
from langsmith.cli.main import get_docker_compose_command
def main() -> None:

@ -0,0 +1,102 @@
"""LangSmith utilities.
This module provides utilities for connecting to `LangSmith <https://smith.langchain.com/>`_. For more information on LangSmith, see the `LangSmith documentation <https://docs.smith.langchain.com/>`_.
**Evaluation**
LangSmith helps you evaluate Chains and other language model application components using a number of LangChain evaluators.
An example of this is shown below, assuming you've created a LangSmith dataset called ``<my_dataset_name>``:
.. code-block:: python
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.smith import RunEvalConfig, run_on_dataset
# Chains may have memory. Passing in a constructor function lets the
# evaluation framework avoid cross-contamination between runs.
def construct_chain():
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(
llm,
"What's the answer to {your_input_key}"
)
return chain
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
evaluation_config = RunEvalConfig(
evaluators=[
"qa", # "Correctness" against a reference answer
"embedding_distance",
RunEvalConfig.Criteria("helpfulness"),
RunEvalConfig.Criteria({
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
}),
]
)
client = Client()
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.
.. code-block:: python
from typing import Optional
from langchain.evaluation import StringEvaluator
class MyStringEvaluator(StringEvaluator):
@property
def requires_input(self) -> bool:
return False
@property
def requires_reference(self) -> bool:
return True
@property
def evaluation_name(self) -> str:
return "exact_match"
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
return {"score": prediction == reference}
evaluation_config = RunEvalConfig(
custom_evaluators = [MyStringEvaluator()],
)
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
**Primary Functions**
- :func:`arun_on_dataset <langchain.smith.evaluation.runner_utils.arun_on_dataset>`: Asynchronous function to evaluate a chain, agent, or other LangChain component over a dataset.
- :func:`run_on_dataset <langchain.smith.evaluation.runner_utils.run_on_dataset>`: Function to evaluate a chain, agent, or other LangChain component over a dataset.
- :class:`RunEvalConfig <langchain.smith.evaluation.config.RunEvalConfig>`: Class representing the configuration for running evaluation. You can select evaluators by :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>` or config, or you can pass in `custom_evaluators`
""" # noqa: E501
from langchain.smith.evaluation import (
RunEvalConfig,
arun_on_dataset,
run_on_dataset,
)
__all__ = [
"arun_on_dataset",
"run_on_dataset",
"ChoicesOutputParser",
"RunEvalConfig",
]

@ -0,0 +1,69 @@
"""LangSmith evaluation utilities.
This module provides utilities for evaluating Chains and other language model
applications using LangChain evaluators and LangSmith.
For more information on the LangSmith API, see the `LangSmith API documentation <https://docs.smith.langchain.com/docs/>`_.
**Example**
.. code-block:: python
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.smith import EvaluatorType, RunEvalConfig, run_on_dataset
def construct_chain():
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(
llm,
"What's the answer to {your_input_key}"
)
return chain
evaluation_config = RunEvalConfig(
evaluators=[
EvaluatorType.QA, # "Correctness" against a reference answer
EvaluatorType.EMBEDDING_DISTANCE,
RunEvalConfig.Criteria("helpfulness"),
RunEvalConfig.Criteria({
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
}),
]
)
client = Client()
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config
)
**Attributes**
- ``arun_on_dataset``: Asynchronous function to evaluate a chain or other LangChain component over a dataset.
- ``run_on_dataset``: Function to evaluate a chain or other LangChain component over a dataset.
- ``RunEvalConfig``: Class representing the configuration for running evaluation.
- ``StringRunEvaluatorChain``: Class representing a string run evaluator chain.
- ``InputFormatError``: Exception raised when the input format is incorrect.
""" # noqa: E501
from langchain.smith.evaluation.config import RunEvalConfig
from langchain.smith.evaluation.runner_utils import (
InputFormatError,
arun_on_dataset,
run_on_dataset,
)
from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain
__all__ = [
"InputFormatError",
"arun_on_dataset",
"run_on_dataset",
"StringRunEvaluatorChain",
"RunEvalConfig",
]

@ -0,0 +1,228 @@
"""Configuration for run evaluators."""
from typing import Any, Dict, List, Optional, Union
from langsmith import RunEvaluator
from pydantic import BaseModel, Field
from langchain.embeddings.base import Embeddings
from langchain.evaluation.criteria.eval_chain import CRITERIA_TYPE
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistance as EmbeddingDistanceEnum,
)
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.evaluation.string_distance.base import (
StringDistance as StringDistanceEnum,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.prompt_template import BasePromptTemplate
class EvalConfig(BaseModel):
"""Configuration for a given run evaluator.
Parameters
----------
evaluator_type : EvaluatorType
The type of evaluator to use.
Methods
-------
get_kwargs()
Get the keyword arguments for the evaluator configuration.
"""
evaluator_type: EvaluatorType
def get_kwargs(self) -> Dict[str, Any]:
"""Get the keyword arguments for the load_evaluator call.
Returns
-------
Dict[str, Any]
The keyword arguments for the load_evaluator call.
"""
return self.dict(exclude={"evaluator_type"}, exclude_none=True)
class RunEvalConfig(BaseModel):
"""Configuration for a run evaluation.
Parameters
----------
evaluators : List[Union[EvaluatorType, EvalConfig]]
Configurations for which evaluators to apply to the dataset run.
Each can be the string of an :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
as EvaluatorType.QA, the evaluator type string ("qa"), or a configuration for a
given evaluator (e.g., :class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`).
custom_evaluators : Optional[List[Union[RunEvaluator, StringEvaluator]]]
Custom evaluators to apply to the dataset run.
reference_key : Optional[str]
The key in the dataset run to use as the reference string.
If not provided, it will be inferred automatically.
prediction_key : Optional[str]
The key from the traced run's outputs dictionary to use to
represent the prediction. If not provided, it will be inferred
automatically.
input_key : Optional[str]
The key from the traced run's inputs dictionary to use to represent the
input. If not provided, it will be inferred automatically.
eval_llm : Optional[BaseLanguageModel]
The language model to pass to any evaluators that use a language model.
""" # noqa: E501
evaluators: List[Union[EvaluatorType, EvalConfig]] = Field(default_factory=list)
"""Configurations for which evaluators to apply to the dataset run.
Each can be the string of an
:class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
as `EvaluatorType.QA`, the evaluator type string ("qa"), or a configuration for a
given evaluator
(e.g.,
:class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`).""" # noqa: E501
custom_evaluators: Optional[List[Union[RunEvaluator, StringEvaluator]]] = None
"""Custom evaluators to apply to the dataset run."""
reference_key: Optional[str] = None
"""The key in the dataset run to use as the reference string.
If not provided, we will attempt to infer automatically."""
prediction_key: Optional[str] = None
"""The key from the traced run's outputs dictionary to use to
represent the prediction. If not provided, it will be inferred
automatically."""
input_key: Optional[str] = None
"""The key from the traced run's inputs dictionary to use to represent the
input. If not provided, it will be inferred automatically."""
eval_llm: Optional[BaseLanguageModel] = None
"""The language model to pass to any evaluators that require one."""
class Config:
arbitrary_types_allowed = True
class Criteria(EvalConfig):
"""Configuration for a reference-free criteria evaluator.
Parameters
----------
criteria : Optional[CRITERIA_TYPE]
The criteria to evaluate.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
criteria: Optional[CRITERIA_TYPE] = None
llm: Optional[BaseLanguageModel] = None
evaluator_type: EvaluatorType = EvaluatorType.CRITERIA
def __init__(
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
) -> None:
super().__init__(criteria=criteria, **kwargs)
class LabeledCriteria(EvalConfig):
"""Configuration for a labeled (with references) criteria evaluator.
Parameters
----------
criteria : Optional[CRITERIA_TYPE]
The criteria to evaluate.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
criteria: Optional[CRITERIA_TYPE] = None
llm: Optional[BaseLanguageModel] = None
evaluator_type: EvaluatorType = EvaluatorType.LABELED_CRITERIA
def __init__(
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
) -> None:
super().__init__(criteria=criteria, **kwargs)
class EmbeddingDistance(EvalConfig):
"""Configuration for an embedding distance evaluator.
Parameters
----------
embeddings : Optional[Embeddings]
The embeddings to use for computing the distance.
distance_metric : Optional[EmbeddingDistanceEnum]
The distance metric to use for computing the distance.
"""
evaluator_type: EvaluatorType = EvaluatorType.EMBEDDING_DISTANCE
embeddings: Optional[Embeddings] = None
distance_metric: Optional[EmbeddingDistanceEnum] = None
class Config:
arbitrary_types_allowed = True
class StringDistance(EvalConfig):
"""Configuration for a string distance evaluator.
Parameters
----------
distance : Optional[StringDistanceEnum]
The string distance metric to use.
"""
evaluator_type: EvaluatorType = EvaluatorType.STRING_DISTANCE
distance: Optional[StringDistanceEnum] = None
class QA(EvalConfig):
"""Configuration for a QA evaluator.
Parameters
----------
prompt : Optional[BasePromptTemplate]
The prompt template to use for the evaluation chain.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
evaluator_type: EvaluatorType = EvaluatorType.QA
llm: Optional[BaseLanguageModel] = None
prompt: Optional[BasePromptTemplate] = None
class ContextQA(EvalConfig):
"""Configuration for a context-based QA evaluator.
Parameters
----------
prompt : Optional[BasePromptTemplate]
The prompt template to use for the evaluation chain.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
evaluator_type: EvaluatorType = EvaluatorType.CONTEXT_QA
llm: Optional[BaseLanguageModel] = None
prompt: Optional[BasePromptTemplate] = None
class CoTQA(EvalConfig):
"""Configuration for a context-based QA evaluator.
Parameters
----------
prompt : Optional[BasePromptTemplate]
The prompt template to use for the evaluation chain.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
"""
evaluator_type: EvaluatorType = EvaluatorType.COT_QA
llm: Optional[BaseLanguageModel] = None
prompt: Optional[BasePromptTemplate] = None
# TODO: Trajectory
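To make the configuration surface above concrete, a hedged sketch composing a RunEvalConfig from an evaluator type, a bare string, and the nested config classes; the specific criteria and distance metric are arbitrary examples.

from langchain.evaluation import EvaluatorType
from langchain.evaluation.string_distance.base import StringDistance as StringDistanceEnum
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    evaluators=[
        EvaluatorType.QA,  # enum form
        "embedding_distance",  # plain string form
        RunEvalConfig.LabeledCriteria("correctness"),
        RunEvalConfig.StringDistance(distance=StringDistanceEnum.LEVENSHTEIN),
    ],
)

# Each nested config exposes the kwargs that will be forwarded to load_evaluator().
print(evaluation_config.evaluators[-1].get_kwargs())  # e.g. {'distance': <StringDistance.LEVENSHTEIN: 'levenshtein'>}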

File diff suppressed because it is too large

@ -2,12 +2,11 @@
from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import DataType, Example, Run, RunTypeEnum
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
@ -19,7 +18,6 @@ from langchain.load.load import loads
from langchain.load.serializable import Serializable
from langchain.schema import RUN_KEY, messages_from_dict
from langchain.schema.messages import BaseMessage, get_buffer_string
from langchain.tools.base import Tool
def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]:
@ -127,52 +125,21 @@ class LLMStringRunMapper(StringRunMapper):
class ChainStringRunMapper(StringRunMapper):
"""Extract items to evaluate from the run object from a chain."""
input_key: str
input_key: Optional[str] = None
"""The key from the model Run's inputs to use as the eval input."""
prediction_key: str
prediction_key: Optional[str] = None
"""The key from the model Run's outputs to use as the eval prediction."""
@classmethod
def from_chain(
cls,
model: Chain,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
) -> ChainStringRunMapper:
"""Create a RunMapper from a chain."""
error_messages = []
if input_key is None:
if len(model.input_keys) > 1:
error_messages.append(
f"Chain {model.lc_namespace} has multiple input"
" keys. Please specify 'input_key' when loading."
)
else:
input_key = model.input_keys[0]
elif input_key not in model.input_keys:
error_messages.append(
f"Chain {model.lc_namespace} does not have specified"
f" input key {input_key}."
)
if prediction_key is None:
if len(model.output_keys) > 1:
error_messages.append(
f"Chain {model.lc_namespace} has multiple"
" output keys. Please specify 'prediction_key' when loading."
)
else:
prediction_key = model.output_keys[0]
elif prediction_key not in model.output_keys:
error_messages.append(
f"Chain {model.lc_namespace} does not have specified"
f" prediction_key {prediction_key}."
def _get_key(self, source: Dict, key: Optional[str], which: str) -> str:
if key is not None:
return source[key]
elif len(source) == 1:
return next(iter(source.values()))
else:
raise ValueError(
f"Could not map run {which} with multiple keys: "
f"{source}\nPlease manually specify a {which}_key"
)
if error_messages:
raise ValueError("\n".join(error_messages))
if input_key is None or prediction_key is None:
# This should never happen, but mypy doesn't know that.
raise ValueError(f"Chain {model.lc_namespace} has no input or output keys.")
return cls(input_key=input_key, prediction_key=prediction_key)
def map(self, run: Run) -> Dict[str, str]:
"""Maps the Run to a dictionary."""
@ -187,9 +154,11 @@ class ChainStringRunMapper(StringRunMapper):
f"Run {run.id} does not have prediction key {self.prediction_key}."
)
else:
input_ = self._get_key(run.inputs, self.input_key, "input")
prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
return {
"input": run.inputs[self.input_key],
"prediction": run.outputs[self.prediction_key],
"input": input_,
"prediction": prediction,
}
@ -279,7 +248,10 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
evaluate_strings_inputs = self.run_mapper(run)
if example and self.example_mapper:
if not self.string_evaluator.requires_input:
# Hide warning about unused input
evaluate_strings_inputs.pop("input", None)
if example and self.example_mapper and self.string_evaluator.requires_reference:
evaluate_strings_inputs.update(self.example_mapper(example))
elif self.string_evaluator.requires_reference:
raise ValueError(
@ -289,12 +261,14 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
)
return evaluate_strings_inputs
def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult:
evaluation_result = EvaluationResult(key=self.name, **output)
def _prepare_output(self, output: Dict[str, Any]) -> Dict[str, Any]:
evaluation_result = EvaluationResult(
key=self.name, comment=output.get("reasoning"), **output
)
if RUN_KEY in output:
# TODO: Not currently surfaced. Update
evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
return evaluation_result
return {"feedback": evaluation_result}
def _call(
self,
@ -308,9 +282,9 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
chain_output = self.string_evaluator.evaluate_strings(
**evaluate_strings_inputs,
callbacks=callbacks,
include_run_info=True,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
return self._prepare_output(chain_output)
async def _acall(
self,
@ -324,52 +298,85 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
chain_output = await self.string_evaluator.aevaluate_strings(
**evaluate_strings_inputs,
callbacks=callbacks,
include_run_info=True,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
return self._prepare_output(chain_output)
def _prepare_evaluator_output(self, output: Dict[str, Any]) -> EvaluationResult:
feedback: EvaluationResult = output["feedback"]
if RUN_KEY not in feedback.evaluator_info:
feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
return feedback
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
return self({"run": run, "example": example})["feedback"]
result = self({"run": run, "example": example}, include_run_info=True)
return self._prepare_evaluator_output(result)
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
result = await self.acall({"run": run, "example": example})
return result["feedback"]
result = await self.acall(
{"run": run, "example": example}, include_run_info=True
)
return self._prepare_evaluator_output(result)
@classmethod
def from_model_and_evaluator(
def from_run_and_data_type(
cls,
model: Union[Chain, BaseLanguageModel, Tool],
evaluator: StringEvaluator,
run_type: RunTypeEnum,
data_type: DataType,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
tags: Optional[List[str]] = None,
) -> StringRunEvaluatorChain:
"""Create a StringRunEvaluatorChain from a model and evaluator."""
if isinstance(model, BaseLanguageModel):
"""
Create a StringRunEvaluatorChain from an evaluator and the run and dataset types.
This method provides an easy way to instantiate a StringRunEvaluatorChain by
taking an evaluator and information about the type of run and the data.
The method supports LLM and chain runs.
Args:
evaluator (StringEvaluator): The string evaluator to use.
run_type (RunTypeEnum): The type of run being evaluated.
Supported types are LLM and Chain.
data_type (DataType): The type of dataset used in the run.
input_key (str, optional): The key used to map the input from the run.
prediction_key (str, optional): The key used to map the prediction from the run.
reference_key (str, optional): The key used to map the reference from the dataset.
tags (List[str], optional): List of tags to attach to the evaluation chain.
Returns:
StringRunEvaluatorChain: The instantiated evaluation chain.
Raises:
ValueError: If the run type is not supported, or if the evaluator requires a
reference from the dataset but the reference key is not provided.
""" # noqa: E501
# Configure how run inputs/predictions are passed to the evaluator
if run_type == RunTypeEnum.llm:
run_mapper: StringRunMapper = LLMStringRunMapper()
elif isinstance(model, Chain):
run_mapper = ChainStringRunMapper.from_chain(
model, input_key=input_key, prediction_key=prediction_key
elif run_type == RunTypeEnum.chain:
run_mapper = ChainStringRunMapper(
input_key=input_key, prediction_key=prediction_key
)
elif isinstance(model, Tool):
run_mapper = ToolStringRunMapper()
else:
raise NotImplementedError(
f"{cls.__name__}.from_model_and_evaluator({type(model)})"
" not yet implemented."
"Expected one of [BaseLanguageModel, Chain, Tool]."
raise ValueError(
f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
)
if reference_key is not None or isinstance(model, BaseLanguageModel):
# Configure how example rows are fed as a reference string to the evaluator
if reference_key is not None or data_type in (DataType.llm, DataType.chat):
example_mapper = StringExampleMapper(reference_key=reference_key)
elif evaluator.requires_reference:
# We could potentially auto-infer if there is only one string in the
# example, but it's preferred to raise earlier.
raise ValueError(
f"Evaluator {evaluator.evaluation_name} requires a reference"
" example from the dataset. Please specify the reference key from"
@ -382,4 +389,5 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
run_mapper=run_mapper,
example_mapper=example_mapper,
string_evaluator=evaluator,
tags=tags,
)
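
For reference, here is a minimal usage sketch of the new `from_run_and_data_type` constructor described above. The import paths for `StringRunEvaluatorChain`, `RunTypeEnum`, and `DataType`, as well as the dataset key names, are assumptions inferred from this diff rather than confirmed package locations.

from langsmith.schemas import DataType, RunTypeEnum  # assumed import location
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain.smith.evaluation.string_run_evaluator import (  # assumed module path
    StringRunEvaluatorChain,
)

# Wrap an off-the-shelf string evaluator so it can grade traced runs.
evaluator = load_evaluator(EvaluatorType.QA, llm=ChatOpenAI(temperature=0))
run_evaluator = StringRunEvaluatorChain.from_run_and_data_type(
    evaluator,
    run_type=RunTypeEnum.chain,   # grading chain runs (LLM runs are also supported)
    data_type=DataType.kv,        # key-value dataset, so the keys below are needed
    input_key="question",         # hypothetical dataset/run keys
    prediction_key="text",
    reference_key="answer",
    tags=["qa-eval"],
)
# feedback = run_evaluator.evaluate_run(run, example)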

poetry.lock generated

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "absl-py"
@ -641,16 +641,12 @@ category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "awadb-0.3.6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:d90318d2d388aa1bb740b0b7e641cb7da00e6ab5700ce97564163c88a1927ed4"},
{file = "awadb-0.3.6-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6154f73aab9996aefe8c8f8bf754f7182d109d6b60302c9f31666c7f50cc7aca"},
{file = "awadb-0.3.6-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:9d7e9dff353517595ecc8c9395a2367acdcfc83c68a64dd4785c8d366eed3f40"},
{file = "awadb-0.3.6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:6f6d10d1e885fa1d64eeb8ffda2de470c3a7508d57a9489213b8649bcddcd31e"},
{file = "awadb-0.3.6-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:475af75d2ffbbe970999d93fbabdf7281797390c66fe852f6a6989e706b90c94"},
{file = "awadb-0.3.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:304be1de63daec1555f0fe9de9a18cdf16a467687a35a6ccf3405cd400fefb48"},
{file = "awadb-0.3.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:176cc27d1afc4aad758515d5f8fb435f555c9ba827a9e84d6f28b1c6ac568965"},
{file = "awadb-0.3.6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:36138b754c990143d0314fd7a9293c96f7ba549860244bda728e3f51b73e0f6e"},
{file = "awadb-0.3.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211d7f6b0f7c3c3d7518d424f0f3dfac5f45f9e5d7bbf397fdae861ff0dc46fd"},
{file = "awadb-0.3.6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:b1f9e9a7ba2fa58bce55fcca784d5b3e159712962aaee2156f6317c5993f4277"},
{file = "awadb-0.3.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:b935ab4ffaa3bcbcc9a381fce91ace5940143b527ffdb467dd4bc630cd94afab"},
]
@ -4382,7 +4378,6 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@ -4675,23 +4670,6 @@ dev = ["black", "pre-commit", "ruff"]
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
tests = ["doctest", "pytest", "pytest-mock"]
[[package]]
name = "langchainplus-sdk"
version = "0.0.20"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
category = "main"
optional = false
python-versions = ">=3.8.1,<4.0"
files = [
{file = "langchainplus_sdk-0.0.20-py3-none-any.whl", hash = "sha256:07a869d476755803aa04c4986ce78d00c2fe4ff584c0eaa57d7570c9664188db"},
{file = "langchainplus_sdk-0.0.20.tar.gz", hash = "sha256:3d300e2e3290f68cc9d842c059f9458deba60e776c9e790309688cad1bfbb219"},
]
[package.dependencies]
pydantic = ">=1,<2"
requests = ">=2,<3"
tenacity = ">=8.1.0,<9.0.0"
[[package]]
name = "langcodes"
version = "3.3.0"
@ -4727,6 +4705,22 @@ whylogs = ">=1.2.3,<2.0.0"
[package.extras]
all = ["datasets (>=2.12.0,<3.0.0)", "nltk (>=3.8.1,<4.0.0)", "openai (>=0.27.6,<0.28.0)", "sentence-transformers (>=2.2.2,<3.0.0)", "torch"]
[[package]]
name = "langsmith"
version = "0.0.5"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
category = "main"
optional = false
python-versions = ">=3.8.1,<4.0"
files = [
{file = "langsmith-0.0.5-py3-none-any.whl", hash = "sha256:c9ce19cf7a45d4b9ef74b3133ace4d0583bc992383296d03c05065e8f871e01f"},
{file = "langsmith-0.0.5.tar.gz", hash = "sha256:ffad2fc638cfee8c9d27c9eae2fa3c3f9ec423bf443b1dc44cc8184fa34cd6b2"},
]
[package.dependencies]
pydantic = ">=1,<2"
requests = ">=2,<3"
[[package]]
name = "lark"
version = "1.1.5"
@ -12719,4 +12713,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "a77a3b8ac071e8ae9cd4004e577dbe4fd39552a69adb3277b06ab91f3fd0c77b"
content-hash = "f8f94ad19dd8f96637f6ffe64401b780ea9e7985543a7c9da31c41c55e94ab0f"

@ -108,7 +108,6 @@ pyspark = {version = "^3.4.0", optional = true}
clarifai = {version = ">=9.1.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
langchainplus-sdk = "^0.0.20"
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
esprima = {version = "^4.0.1", optional = true}
@ -119,6 +118,7 @@ cassio = {version = "^0.0.7", optional = true}
rdflib = {version = "^6.3.2", optional = true}
sympy = {version = "^1.12", optional = true}
rapidfuzz = {version = "^3.1.1", optional = true}
langsmith = "^0.0.5"
[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"

@ -1,81 +0,0 @@
import sys
from typing import Iterator
from uuid import uuid4
import pytest
from langchainplus_sdk import LangChainPlusClient as Client
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.client.runner_utils import run_on_dataset
from langchain.evaluation import EvaluatorType
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.llms.openai import OpenAI
@pytest.fixture(
scope="module",
)
def dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
[
{"question": "5", "answer": 5.0},
{"question": "5 + 3", "answer": 8.0},
{"question": "2^3.171", "answer": 9.006708689094099},
{"question": " 2 ^3.171 ", "answer": 9.006708689094099},
]
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["question"],
output_keys=["answer"],
description="Integration test dataset",
)
yield _dataset_name
def test_chat_model(dataset_name: str) -> None:
llm = ChatOpenAI(temperature=0)
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
llm,
run_evaluators=evaluators,
)
print("CHAT", results, file=sys.stderr)
def test_llm(dataset_name: str) -> None:
llm = OpenAI(temperature=0)
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
llm,
run_evaluators=evaluators,
)
print("LLM", results, file=sys.stderr)
def test_chain(dataset_name: str) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
lambda: chain,
run_evaluators=evaluators,
)
print("CHAIN", results, file=sys.stderr)

@ -0,0 +1,429 @@
from typing import Iterator, List
from uuid import uuid4
import pytest
from langsmith import Client as Client
from langsmith.schemas import DataType
from langchain.callbacks.tracers.evaluation import wait_for_all_evaluators
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType
from langchain.llms.openai import OpenAI
from langchain.schema.messages import BaseMessage, HumanMessage
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.smith.evaluation import InputFormatError
def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
# Assert that all runs completed, all feedback completed, and that the
# chain or llm passes for the feedback provided.
runs = list(client.list_runs(project_name=_project_name, execution_order=1))
assert len(runs) == 4
wait_for_all_evaluators()
feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
assert len(feedback) == 8
assert all([f.score == 1 for f in feedback])
@pytest.fixture
def eval_project_name() -> str:
return f"lcp integration tests - {str(uuid4())[-8:]}"
@pytest.fixture(scope="module")
def client() -> Client:
return Client()
@pytest.fixture(
scope="module",
)
def kv_dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"some_input": [
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
],
"other_input": [
"a",
"b",
"c",
"d",
],
"some_output": ["Sacramento", "Carson City", "Salem", "Olympia"],
"other_output": ["e", "f", "g", "h"],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp kv dataset integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["some_input", "other_input"],
output_keys=["some_output", "other_output"],
description="Integration test dataset",
)
yield _dataset_name
def test_chat_model(
kv_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
)
with pytest.raises(
InputFormatError, match="Example inputs do not match language model"
):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
def input_mapper(d: dict) -> List[BaseMessage]:
return [HumanMessage(content=d["some_input"])]
run_on_dataset(
client,
kv_dataset_name,
llm,
evaluation=eval_config,
input_mapper=input_mapper,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
)
with pytest.raises(
InputFormatError, match="Example inputs do not match language model"
):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
def input_mapper(d: dict) -> str:
return d["some_input"]
run_on_dataset(
client,
kv_dataset_name,
llm,
evaluation=eval_config,
input_mapper=input_mapper,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
)
with pytest.raises(
InputFormatError, match="Example inputs do not match chain input keys"
):
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
def input_mapper(d: dict) -> dict:
return {"input": d["some_input"]}
with pytest.raises(
InputFormatError,
match=" match the chain's expected input keys.",
):
run_on_dataset(
client,
kv_dataset_name,
lambda: chain,
evaluation=eval_config,
input_mapper=input_mapper,
)
def right_input_mapper(d: dict) -> dict:
return {"question": d["some_input"]}
run_on_dataset(
client,
kv_dataset_name,
lambda: chain,
evaluation=eval_config,
input_mapper=right_input_mapper,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
### Testing Chat Datasets
@pytest.fixture(
scope="module",
)
def chat_dataset_name() -> Iterator[str]:
def _create_message(txt: str, role: str = "human") -> List[dict]:
return [{"type": role, "data": {"content": txt}}]
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"input": [
_create_message(txt)
for txt in (
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
)
],
"output": [
_create_message(txt, role="ai")[0]
for txt in ("Sacramento", "Carson City", "Salem", "Olympia")
],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp chat dataset integration tests - {uid}"
ds = client.create_dataset(
_dataset_name, description="Integration test dataset", data_type=DataType.chat
)
for row in df.itertuples():
client.create_example(
dataset_id=ds.id,
inputs={"input": row.input},
outputs={"output": row.output},
)
yield _dataset_name
def test_chat_model_on_chat_dataset(
chat_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
chat_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm_on_chat_dataset(
chat_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
chat_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain_on_chat_dataset(chat_dataset_name: str, client: Client) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(
ValueError, match="Cannot evaluate a chain on dataset with data_type=chat"
):
run_on_dataset(
client,
chat_dataset_name,
lambda: chain,
evaluation=eval_config,
)
@pytest.fixture(
scope="module",
)
def llm_dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"input": [
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
],
"output": ["Sacramento", "Carson City", "Salem", "Olympia"],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp llm dataset integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["input"],
output_keys=["output"],
description="Integration test dataset",
data_type=DataType.llm,
)
yield _dataset_name
def test_chat_model_on_llm_dataset(
llm_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
llm_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm_on_llm_dataset(
llm_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
llm_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain_on_llm_dataset(llm_dataset_name: str, client: Client) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(
ValueError, match="Cannot evaluate a chain on dataset with data_type=llm"
):
run_on_dataset(
client,
llm_dataset_name,
lambda: chain,
evaluation=eval_config,
)
@pytest.fixture(
scope="module",
)
def kv_singleio_dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
{
"the wackiest input": [
"What's the capital of California?",
"What's the capital of Nevada?",
"What's the capital of Oregon?",
"What's the capital of Washington?",
],
"unthinkable output": ["Sacramento", "Carson City", "Salem", "Olympia"],
}
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp singleio kv dataset integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["the wackiest input"],
output_keys=["unthinkable output"],
description="Integration test dataset",
)
yield _dataset_name
def test_chat_model_on_kv_singleio_dataset(
kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_llm_on_kv_singleio_dataset(
kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
def test_chain_on_kv_singleio_dataset(
kv_singleio_dataset_name: str, eval_project_name: str, client: Client
) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
lambda: chain,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
)
_check_all_feedback_passed(eval_project_name, client)
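
Distilled from the integration tests above, the core of the updated `run_on_dataset()` call pattern looks roughly as follows; the dataset and project names are hypothetical placeholders.

from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig, run_on_dataset

client = Client()
eval_config = RunEvalConfig(
    evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
    reference_key="some_output",  # needed when a key-value dataset has several output keys
)
run_on_dataset(
    client,
    "my-dataset",                 # hypothetical dataset name
    ChatOpenAI(temperature=0),
    evaluation=eval_config,
    project_name="my-eval-project",
    tags=["example"],
)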

@ -3,7 +3,7 @@ import unittest.mock
from typing import Any
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient
from langsmith import Client
from langchain.callbacks.tracers.langchain import LangChainTracer
from langchain.callbacks.tracers.schemas import Run
@ -14,38 +14,36 @@ def test_example_id_assignment_threadsafe() -> None:
"""Test that example assigned at callback start/end is honored."""
example_ids = {}
def mock_create_run(self: Any, **kwargs: Any) -> Any:
def mock_create_run(**kwargs: Any) -> Any:
example_ids[kwargs.get("id")] = kwargs.get("reference_example_id")
return unittest.mock.MagicMock()
client = unittest.mock.MagicMock(spec=Client)
client.create_run = mock_create_run
tracer = LangChainTracer(client=client)
old_persist_run_single = tracer._persist_run_single
def new_persist_run_single(run: Run) -> None:
time.sleep(0.01)
old_persist_run_single(run)
with unittest.mock.patch.object(
LangChainPlusClient, "create_run", new=mock_create_run
tracer, "_persist_run_single", new=new_persist_run_single
):
client = LangChainPlusClient()
tracer = LangChainTracer(client=client)
old_persist_run_single = tracer._persist_run_single
def new_persist_run_single(run: Run) -> None:
time.sleep(0.01)
old_persist_run_single(run)
with unittest.mock.patch.object(
tracer, "_persist_run_single", new=new_persist_run_single
):
run_id_1 = UUID("9d878ab3-e5ca-4218-aef6-44cbdc90160a")
run_id_2 = UUID("f1f9fa53-8b2f-4742-bdbc-38215f7bd1e1")
example_id_1 = UUID("57e42c57-8c79-4d9f-8765-bf6cd3a98055")
tracer.example_id = example_id_1
tracer.on_llm_start({"name": "example_1"}, ["foo"], run_id=run_id_1)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_1)
example_id_2 = UUID("4f31216e-7c26-4027-a5fd-0bbf9ace17dc")
tracer.example_id = example_id_2
tracer.on_llm_start({"name": "example_2"}, ["foo"], run_id=run_id_2)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_2)
tracer.example_id = None
expected_example_ids = {
run_id_1: example_id_1,
run_id_2: example_id_2,
}
tracer.wait_for_futures()
assert example_ids == expected_example_ids
run_id_1 = UUID("9d878ab3-e5ca-4218-aef6-44cbdc90160a")
run_id_2 = UUID("f1f9fa53-8b2f-4742-bdbc-38215f7bd1e1")
example_id_1 = UUID("57e42c57-8c79-4d9f-8765-bf6cd3a98055")
tracer.example_id = example_id_1
tracer.on_llm_start({"name": "example_1"}, ["foo"], run_id=run_id_1)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_1)
example_id_2 = UUID("4f31216e-7c26-4027-a5fd-0bbf9ace17dc")
tracer.example_id = example_id_2
tracer.on_llm_start({"name": "example_2"}, ["foo"], run_id=run_id_2)
tracer.on_llm_end(LLMResult(generations=[], llm_output={}), run_id=run_id_2)
tracer.example_id = None
expected_example_ids = {
run_id_1: example_id_1,
run_id_2: example_id_2,
}
tracer.wait_for_futures()
assert example_ids == expected_example_ids

@ -1,9 +1,14 @@
"""Test the comparison chains."""
import re
import pytest
from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain
from langchain.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
)
from tests.unit_tests.llms.fake_llm import FakeLLM
@ -32,7 +37,7 @@ def test_pairwise_string_comparison_chain() -> None:
)
assert res["value"] == "A"
assert res["score"] == 1
with pytest.warns(UserWarning, match=chain._skip_reference_warning):
with pytest.warns(UserWarning, match=re.escape(chain._skip_reference_warning)):
res = chain.evaluate_string_pairs(
prediction="I like pie.",
prediction_b="I hate pie.",
@ -43,7 +48,7 @@ def test_pairwise_string_comparison_chain() -> None:
assert res["score"] == 0
def test_pairwise_string_comparison_chain_missing_ref() -> None:
def test_labeled_pairwise_string_comparison_chain_missing_ref() -> None:
llm = FakeLLM(
queries={
"a": "The values are the same.\n[[C]]",
@ -52,7 +57,7 @@ def test_pairwise_string_comparison_chain_missing_ref() -> None:
},
sequential_responses=True,
)
chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)
chain = LabeledPairwiseStringEvalChain.from_llm(llm=llm)
with pytest.raises(ValueError):
chain.evaluate_string_pairs(
prediction="I like pie.",

@ -5,18 +5,21 @@ import pytest
from langchain.evaluation.criteria.eval_chain import (
_SUPPORTED_CRITERIA,
Criteria,
CriteriaEvalChain,
LabeledCriteriaEvalChain,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_llm import FakeLLM
def test_resolve_criteria() -> None:
# type: ignore
assert CriteriaEvalChain.resolve_criteria("helpfulness") == {
"helpfulness": _SUPPORTED_CRITERIA["helpfulness"]
"helpfulness": _SUPPORTED_CRITERIA[Criteria.HELPFULNESS]
}
assert CriteriaEvalChain.resolve_criteria(["correctness"]) == {
"correctness": _SUPPORTED_CRITERIA["correctness"]
assert CriteriaEvalChain.resolve_criteria("correctness") == {
"correctness": _SUPPORTED_CRITERIA[Criteria.CORRECTNESS]
}
@ -35,12 +38,11 @@ def test_criteria_eval_chain() -> None:
def test_criteria_eval_chain_missing_reference() -> None:
chain = CriteriaEvalChain.from_llm(
chain = LabeledCriteriaEvalChain.from_llm(
llm=FakeLLM(
queries={"text": "The meaning of life\nY"},
sequential_responses=True,
),
requires_reference=True,
criteria={"my criterion": "my criterion description"},
)
with pytest.raises(ValueError):

@ -25,8 +25,8 @@ def test_eval_chain() -> None:
outputs = fake_qa_eval_chain.evaluate([example, example], [prediction, prediction])
assert outputs[0] == outputs[1]
assert "text" in outputs[0]
assert outputs[0]["text"] == "foo"
assert fake_qa_eval_chain.output_key in outputs[0]
assert outputs[0][fake_qa_eval_chain.output_key] == "foo"
@pytest.mark.skipif(

@ -1,54 +0,0 @@
"""Test run evaluator implementations basic functionality."""
from uuid import UUID
import pytest
from langchainplus_sdk.schemas import Example, Run
from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator
from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.fixture
def run() -> Run:
return Run(
id=UUID("f77cd087-48f7-4c62-9e0e-297842202107"),
name="My Run",
inputs={"input": "What is the answer to life, the universe, and everything?"},
outputs={"output": "The answer is 42."},
start_time="2021-07-20T15:00:00.000000+00:00",
end_time="2021-07-20T15:00:00.000000+00:00",
run_type="chain",
execution_order=1,
)
@pytest.fixture
def example() -> Example:
return Example(
id=UUID("f77cd087-48f7-4c62-9e0e-297842202106"),
dataset_id=UUID("f77cd087-48f7-4c62-9e0e-297842202105"),
inputs={"input": "What is the answer to life, the universe, and everything?"},
outputs={"output": "The answer is 42."},
created_at="2021-07-20T15:00:00.000000+00:00",
)
def test_get_qa_evaluator(run: Run, example: Example) -> None:
"""Test get_qa_evaluator."""
eval_llm = FakeLLM(
queries={"a": "This checks out.\nCORRECT"}, sequential_responses=True
)
qa_evaluator = get_qa_evaluator(eval_llm)
res = qa_evaluator.evaluate_run(run, example)
assert res.value == "CORRECT"
assert res.score == 1
def test_get_criteria_evaluator(run: Run, example: Example) -> None:
"""Get a criteria evaluator."""
eval_llm = FakeLLM(queries={"a": "This checks out.\nY"}, sequential_responses=True)
criteria_evaluator = get_criteria_evaluator(eval_llm, criteria="conciseness")
res = criteria_evaluator.evaluate_run(run, example)
assert res.value == "Y"
assert res.score == 1

@ -1,114 +0,0 @@
"""Test the loading function for evaluators."""
from unittest.mock import MagicMock
import pytest
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.chains.test_base import FakeChain
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "generations"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, **kwargs
)
callback = RunCollectorCallbackHandler()
model.predict("Foo input", callbacks=[callback])
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"generations": "Foo output"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Foo input"
assert result["prediction"] == "Foo output"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Foo output"
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
model = FakeChatModel()
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "generations"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, **kwargs
)
callback = RunCollectorCallbackHandler()
model.predict("Foo input", callbacks=[callback])
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"generations": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Human: Foo input"
assert result["prediction"] == "AI: fake response"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
model = FakeChain(
the_input_keys=["an_input", "another_input"],
)
fake_llm = FakeChatModel()
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
# No input key
with pytest.raises(ValueError, match="multiple input keys"):
StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
with pytest.raises(ValueError, match="does not have specified"):
StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, input_key="some_input"
)
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "label_column"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, input_key="an_input", **kwargs
)
callback = RunCollectorCallbackHandler()
model(
{"an_input": "Foo input", "another_input": "Another fake response"},
callbacks=[callback],
)
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"label_column": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Foo input"
assert result["prediction"] == "baz"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"

@ -4,7 +4,7 @@ import pytest
from langchain.embeddings.fake import FakeEmbeddings
from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@ -25,14 +25,25 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
)
def test_criteria_eval_chain_requires_reference() -> None:
@pytest.mark.parametrize(
"evaluator_type",
[
EvaluatorType.LABELED_CRITERIA,
EvaluatorType.LABELED_PAIRWISE_STRING,
EvaluatorType.QA,
EvaluatorType.CONTEXT_QA,
EvaluatorType.COT_QA,
],
)
def test_eval_chain_requires_references(evaluator_type: EvaluatorType) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators(
[EvaluatorType.CRITERIA], llm=fake_llm, requires_reference=True
[evaluator_type],
llm=fake_llm,
)[0]
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
if not isinstance(evaluator, (StringEvaluator, PairwiseStringEvaluator)):
raise ValueError("Evaluator is not a [pairwise]string evaluator")
assert evaluator.requires_reference

@ -1,25 +1,26 @@
"""Test the LangChain+ client."""
"""Test the LangSmith evaluation helpers."""
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, Iterator, List, Optional, Union
from unittest import mock
import pytest
from langchainplus_sdk.client import LangChainPlusClient
from langchainplus_sdk.schemas import Dataset, Example
from langsmith.client import Client
from langsmith.schemas import Dataset, Example
from langchain.chains.base import Chain
from langchain.chains.transform import TransformChain
from langchain.client.runner_utils import (
from langchain.schema.language_model import BaseLanguageModel
from langchain.smith.evaluation.runner_utils import (
InputFormatError,
_get_messages,
_get_prompts,
_get_prompt,
_run_llm,
_run_llm_or_chain,
_validate_example_inputs_for_chain,
_validate_example_inputs_for_language_model,
arun_on_dataset,
run_llm,
run_llm_or_chain,
)
from langchain.schema import LLMResult
from langchain.schema.language_model import BaseLanguageModel
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@ -33,19 +34,28 @@ _VALID_MESSAGES = [
{"messages": [_EXAMPLE_MESSAGE], "other_key": "value"},
{"messages": [], "other_key": "value"},
{
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE], [_EXAMPLE_MESSAGE]],
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]],
"other_key": "value",
},
{"any_key": [_EXAMPLE_MESSAGE]},
{"any_key": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE], [_EXAMPLE_MESSAGE]]},
{"any_key": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]]},
]
_VALID_PROMPTS = [
{"prompts": ["foo", "bar", "baz"], "other_key": "value"},
{"prompts": ["foo"], "other_key": "value"},
{"prompt": "foo", "other_key": ["bar", "baz"]},
{"some_key": "foo"},
{"some_key": ["foo", "bar"]},
{"some_key": ["foo"]},
]
_INVALID_PROMPTS = (
[
{"prompts": "foo"},
{"prompt": ["foo"]},
{"some_key": 3},
{"some_key": "foo", "other_key": "bar"},
],
)
@pytest.mark.parametrize(
"inputs",
@ -61,21 +71,93 @@ def test__get_messages_valid(inputs: Dict[str, Any]) -> None:
_VALID_PROMPTS,
)
def test__get_prompts_valid(inputs: Dict[str, Any]) -> None:
_get_prompts(inputs)
_get_prompt(inputs)
@pytest.mark.parametrize(
"inputs",
[
{"prompts": "foo"},
{"prompt": ["foo"]},
{"some_key": 3},
{"some_key": "foo", "other_key": "bar"},
],
_VALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model(inputs: Dict[str, Any]) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
_validate_example_inputs_for_language_model(mock_, None)
@pytest.mark.parametrize(
"inputs",
_INVALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model_invalid(
inputs: Dict[str, Any]
) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
with pytest.raises(InputFormatError):
_validate_example_inputs_for_language_model(mock_, None)
def test__validate_example_inputs_for_chain_single_input() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_input_mapper() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["not foo", "not baz", "not qux"]
def wrong_output_format(inputs: dict) -> str:
assert "foo" in inputs
assert "baz" in inputs
return "hehe"
with pytest.raises(InputFormatError, match="must be a dictionary"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_format)
def wrong_output_keys(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": "foo", "not baz": "baz"}
with pytest.raises(InputFormatError, match="keys that match"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_keys)
def input_mapper(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": inputs["foo"], "not baz": inputs["baz"], "not qux": "qux"}
_validate_example_inputs_for_chain(mock_, chain, input_mapper)
def test__validate_example_inputs_for_chain_multi_io() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["foo", "baz"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_single_input_multi_expect() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo", "oh here is another"]
with pytest.raises(
InputFormatError, match="Example inputs do not match chain input keys."
):
_validate_example_inputs_for_chain(mock_, chain, None)
@pytest.mark.parametrize("inputs", _INVALID_PROMPTS)
def test__get_prompts_invalid(inputs: Dict[str, Any]) -> None:
with pytest.raises(InputFormatError):
_get_prompts(inputs)
_get_prompt(inputs)
def test_run_llm_or_chain_with_input_mapper() -> None:
@ -101,12 +183,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
assert "the wrong input" in inputs
return {"the right input": inputs["the wrong input"]}
result = run_llm_or_chain(
result = _run_llm_or_chain(
example, lambda: mock_chain, n_repetitions=1, input_mapper=input_mapper
)
assert len(result) == 1
assert result[0] == {"output": "2", "the right input": "1"}
bad_result = run_llm_or_chain(
bad_result = _run_llm_or_chain(
example,
lambda: mock_chain,
n_repetitions=1,
@ -115,18 +197,18 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
assert "Error" in bad_result[0]
# Try with LLM
def llm_input_mapper(inputs: dict) -> List[str]:
def llm_input_mapper(inputs: dict) -> str:
assert "the wrong input" in inputs
return ["the right input"]
return "the right input"
mock_llm = FakeLLM(queries={"the right input": "somenumber"})
result = run_llm_or_chain(
result = _run_llm_or_chain(
example, mock_llm, n_repetitions=1, input_mapper=llm_input_mapper
)
assert len(result) == 1
llm_result = result[0]
assert isinstance(llm_result, LLMResult)
assert llm_result.generations[0][0].text == "somenumber"
assert isinstance(llm_result, str)
assert llm_result == "somenumber"
@pytest.mark.parametrize(
@ -149,13 +231,13 @@ def test__get_messages_invalid(inputs: Dict[str, Any]) -> None:
@pytest.mark.parametrize("inputs", _VALID_PROMPTS + _VALID_MESSAGES)
def test_run_llm_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeLLM()
run_llm(llm, inputs, mock.MagicMock())
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.parametrize("inputs", _VALID_MESSAGES + _VALID_PROMPTS)
def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeChatModel()
run_llm(llm, inputs, mock.MagicMock())
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.asyncio
@ -216,8 +298,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
def mock_read_dataset(*args: Any, **kwargs: Any) -> Dataset:
return dataset
def mock_list_examples(*args: Any, **kwargs: Any) -> List[Example]:
return examples
def mock_list_examples(*args: Any, **kwargs: Any) -> Iterator[Example]:
return iter(examples)
async def mock_arun_chain(
example: Example,
@ -235,16 +317,16 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
pass
with mock.patch.object(
LangChainPlusClient, "read_dataset", new=mock_read_dataset
), mock.patch.object(
LangChainPlusClient, "list_examples", new=mock_list_examples
), mock.patch(
"langchain.client.runner_utils._arun_llm_or_chain", new=mock_arun_chain
Client, "read_dataset", new=mock_read_dataset
), mock.patch.object(Client, "list_examples", new=mock_list_examples), mock.patch(
"langchain.smith.evaluation.runner_utils._arun_llm_or_chain",
new=mock_arun_chain,
), mock.patch.object(
LangChainPlusClient, "create_project", new=mock_create_project
Client, "create_project", new=mock_create_project
):
client = LangChainPlusClient(api_url="http://localhost:1984", api_key="123")
client = Client(api_url="http://localhost:1984", api_key="123")
chain = mock.MagicMock()
chain.input_keys = ["foothing"]
num_repetitions = 3
results = await arun_on_dataset(
dataset_name="test",

@ -0,0 +1,347 @@
"""Test the LangSmith evaluation helpers."""
import uuid
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional, Union
from unittest import mock
import pytest
from langsmith.client import Client
from langsmith.schemas import Dataset, Example
from langchain.chains.base import Chain
from langchain.chains.transform import TransformChain
from langchain.schema.language_model import BaseLanguageModel
from langchain.smith.evaluation.runner_utils import (
InputFormatError,
_get_messages,
_get_prompt,
_run_llm,
_run_llm_or_chain,
_validate_example_inputs_for_chain,
_validate_example_inputs_for_language_model,
arun_on_dataset,
)
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
_CREATED_AT = datetime(2015, 1, 1, 0, 0, 0)
_TENANT_ID = "7a3d2b56-cd5b-44e5-846f-7eb6e8144ce4"
_EXAMPLE_MESSAGE = {
"data": {"content": "Foo", "example": False, "additional_kwargs": {}},
"type": "human",
}
_VALID_MESSAGES = [
{"messages": [_EXAMPLE_MESSAGE], "other_key": "value"},
{"messages": [], "other_key": "value"},
{
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]],
"other_key": "value",
},
{"any_key": [_EXAMPLE_MESSAGE]},
{"any_key": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE]]},
]
_VALID_PROMPTS = [
{"prompts": ["foo"], "other_key": "value"},
{"prompt": "foo", "other_key": ["bar", "baz"]},
{"some_key": "foo"},
{"some_key": ["foo"]},
]
_INVALID_PROMPTS = (
[
{"prompts": "foo"},
{"prompt": ["foo"]},
{"some_key": 3},
{"some_key": "foo", "other_key": "bar"},
],
)
@pytest.mark.parametrize(
"inputs",
_VALID_MESSAGES,
)
def test__get_messages_valid(inputs: Dict[str, Any]) -> None:
{"messages": []}
_get_messages(inputs)
@pytest.mark.parametrize(
"inputs",
_VALID_PROMPTS,
)
def test__get_prompts_valid(inputs: Dict[str, Any]) -> None:
_get_prompt(inputs)
@pytest.mark.parametrize(
"inputs",
_VALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model(inputs: Dict[str, Any]) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
_validate_example_inputs_for_language_model(mock_, None)
@pytest.mark.parametrize(
"inputs",
_INVALID_PROMPTS,
)
def test__validate_example_inputs_for_language_model_invalid(
inputs: Dict[str, Any]
) -> None:
mock_ = mock.MagicMock()
mock_.inputs = inputs
with pytest.raises(InputFormatError):
_validate_example_inputs_for_language_model(mock_, None)
def test__validate_example_inputs_for_chain_single_input() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_input_mapper() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["not foo", "not baz", "not qux"]
def wrong_output_format(inputs: dict) -> str:
assert "foo" in inputs
assert "baz" in inputs
return "hehe"
with pytest.raises(InputFormatError, match="must be a dictionary"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_format)
def wrong_output_keys(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": "foo", "not baz": "baz"}
with pytest.raises(InputFormatError, match="keys that match"):
_validate_example_inputs_for_chain(mock_, chain, wrong_output_keys)
def input_mapper(inputs: dict) -> dict:
assert "foo" in inputs
assert "baz" in inputs
return {"not foo": inputs["foo"], "not baz": inputs["baz"], "not qux": "qux"}
_validate_example_inputs_for_chain(mock_, chain, input_mapper)
def test__validate_example_inputs_for_chain_multi_io() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar", "baz": "qux"}
chain = mock.MagicMock()
chain.input_keys = ["foo", "baz"]
_validate_example_inputs_for_chain(mock_, chain, None)
def test__validate_example_inputs_for_chain_single_input_multi_expect() -> None:
mock_ = mock.MagicMock()
mock_.inputs = {"foo": "bar"}
chain = mock.MagicMock()
chain.input_keys = ["def not foo", "oh here is another"]
with pytest.raises(
InputFormatError, match="Example inputs do not match chain input keys."
):
_validate_example_inputs_for_chain(mock_, chain, None)
@pytest.mark.parametrize("inputs", _INVALID_PROMPTS)
def test__get_prompts_invalid(inputs: Dict[str, Any]) -> None:
with pytest.raises(InputFormatError):
_get_prompt(inputs)
def test_run_llm_or_chain_with_input_mapper() -> None:
example = Example(
id=uuid.uuid4(),
created_at=_CREATED_AT,
inputs={"the wrong input": "1", "another key": "2"},
outputs={"output": "2"},
dataset_id=str(uuid.uuid4()),
)
def run_val(inputs: dict) -> dict:
assert "the right input" in inputs
return {"output": "2"}
mock_chain = TransformChain(
input_variables=["the right input"],
output_variables=["output"],
transform=run_val,
)
def input_mapper(inputs: dict) -> dict:
assert "the wrong input" in inputs
return {"the right input": inputs["the wrong input"]}
result = _run_llm_or_chain(
example, lambda: mock_chain, n_repetitions=1, input_mapper=input_mapper
)
assert len(result) == 1
assert result[0] == {"output": "2", "the right input": "1"}
bad_result = _run_llm_or_chain(
example,
lambda: mock_chain,
n_repetitions=1,
)
assert len(bad_result) == 1
assert "Error" in bad_result[0]
# Try with LLM
def llm_input_mapper(inputs: dict) -> str:
assert "the wrong input" in inputs
return "the right input"
mock_llm = FakeLLM(queries={"the right input": "somenumber"})
result = _run_llm_or_chain(
example, mock_llm, n_repetitions=1, input_mapper=llm_input_mapper
)
assert len(result) == 1
llm_result = result[0]
assert isinstance(llm_result, str)
assert llm_result == "somenumber"
@pytest.mark.parametrize(
"inputs",
[
{"one_key": [_EXAMPLE_MESSAGE], "other_key": "value"},
{
"messages": [[_EXAMPLE_MESSAGE, _EXAMPLE_MESSAGE], _EXAMPLE_MESSAGE],
"other_key": "value",
},
{"prompts": "foo"},
{},
],
)
def test__get_messages_invalid(inputs: Dict[str, Any]) -> None:
with pytest.raises(InputFormatError):
_get_messages(inputs)
@pytest.mark.parametrize("inputs", _VALID_PROMPTS + _VALID_MESSAGES)
def test_run_llm_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeLLM()
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.parametrize("inputs", _VALID_MESSAGES + _VALID_PROMPTS)
def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
llm = FakeChatModel()
_run_llm(llm, inputs, mock.MagicMock())
@pytest.mark.asyncio
async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
dataset = Dataset(
id=uuid.uuid4(),
name="test",
description="Test dataset",
owner_id="owner",
created_at=_CREATED_AT,
tenant_id=_TENANT_ID,
)
uuids = [
"0c193153-2309-4704-9a47-17aee4fb25c8",
"0d11b5fd-8e66-4485-b696-4b55155c0c05",
"90d696f0-f10d-4fd0-b88b-bfee6df08b84",
"4ce2c6d8-5124-4c0c-8292-db7bdebcf167",
"7b5a524c-80fa-4960-888e-7d380f9a11ee",
]
examples = [
Example(
id=uuids[0],
created_at=_CREATED_AT,
inputs={"input": "1"},
outputs={"output": "2"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[1],
created_at=_CREATED_AT,
inputs={"input": "3"},
outputs={"output": "4"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[2],
created_at=_CREATED_AT,
inputs={"input": "5"},
outputs={"output": "6"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[3],
created_at=_CREATED_AT,
inputs={"input": "7"},
outputs={"output": "8"},
dataset_id=str(uuid.uuid4()),
),
Example(
id=uuids[4],
created_at=_CREATED_AT,
inputs={"input": "9"},
outputs={"output": "10"},
dataset_id=str(uuid.uuid4()),
),
]
def mock_read_dataset(*args: Any, **kwargs: Any) -> Dataset:
return dataset
def mock_list_examples(*args: Any, **kwargs: Any) -> Iterator[Example]:
return iter(examples)
async def mock_arun_chain(
example: Example,
llm_or_chain: Union[BaseLanguageModel, Chain],
n_repetitions: int,
tags: Optional[List[str]] = None,
callbacks: Optional[Any] = None,
**kwargs: Any,
) -> List[Dict[str, Any]]:
return [
{"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
]
def mock_create_project(*args: Any, **kwargs: Any) -> None:
pass
with mock.patch.object(
Client, "read_dataset", new=mock_read_dataset
), mock.patch.object(Client, "list_examples", new=mock_list_examples), mock.patch(
"langchain.smith.evaluation.runner_utils._arun_llm_or_chain",
new=mock_arun_chain,
), mock.patch.object(
Client, "create_project", new=mock_create_project
):
client = Client(api_url="http://localhost:1984", api_key="123")
chain = mock.MagicMock()
chain.input_keys = ["foothing"]
num_repetitions = 3
results = await arun_on_dataset(
dataset_name="test",
llm_or_chain_factory=lambda: chain,
concurrency_level=2,
project_name="test_project",
num_repetitions=num_repetitions,
client=client,
)
expected = {
uuid_: [
{"result": f"Result for example {uuid.UUID(uuid_)}"}
for _ in range(num_repetitions)
]
for uuid_ in uuids
}
assert results["results"] == expected

@ -38,7 +38,7 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
"aiohttp",
"async-timeout",
"dataclasses-json",
"langchainplus-sdk",
"langsmith",
"numexpr",
"numpy",
"openapi-schema-pydantic",
