
Test a ReAct agent with Pytest/Vitest and LangSmith

This tutorial shows you how to use LangSmith's integrations with the popular testing tools Pytest and Vitest/Jest to evaluate your LLM application. We will build a ReAct agent that answers questions about publicly traded stocks and write a comprehensive test suite for it.

Setup

This tutorial uses LangGraph for agent orchestration, OpenAI's GPT-4o as the model, Tavily for search, E2B's code interpreter, and Polygon to retrieve stock data, but it can be adapted to other frameworks, models, and tools with only minor modifications. Tavily, E2B, and Polygon are all free to sign up for.

Installation

First, install the packages required to build the agent:

pip install -U langgraph langchain[openai] langchain-community e2b-code-interpreter

Next, install the testing framework:

# Make sure you have langsmith>=0.3.1
pip install -U "langsmith[pytest]"

Environment variables

Set the following environment variables:

export LANGSMITH_TRACING=true
export LANGSMITH_API_KEY=<YOUR_LANGSMITH_API_KEY>

export OPENAI_API_KEY=<YOUR_OPENAI_API_KEY>
export TAVILY_API_KEY=<YOUR_TAVILY_API_KEY>
export E2B_API_KEY=<YOUR_E2B_API_KEY>
export POLYGON_API_KEY=<YOUR_POLYGON_API_KEY>
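
If you are working in a notebook rather than a shell, here is a minimal sketch of setting the same variables programmatically (standard library only; prompts for any key that is not already set):

import getpass
import os

# Assumed equivalent to the exports above; prompts only for missing keys.
for var in [
    "LANGSMITH_API_KEY",
    "OPENAI_API_KEY",
    "TAVILY_API_KEY",
    "E2B_API_KEY",
    "POLYGON_API_KEY",
]:
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")
os.environ["LANGSMITH_TRACING"] = "true"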

Create your app

To define our ReAct agent, we will use LangGraph/LangGraph.js for orchestration and LangChain/LangChain.js for the LLM and tools.

Define tools

First, we will define the tools our agent will use. There will be 3 tools:

  • A search tool using Tavily
  • A code interpreter tool using E2B
  • A stock information tool using Polygon
from e2b_code_interpreter import Sandbox
from langchain_community.tools import TavilySearchResults
from langchain_community.tools.polygon.aggregates import PolygonAggregates
from langchain_community.utilities.polygon import PolygonAPIWrapper
from typing_extensions import Annotated, Literal, TypedDict

# Define search tool
search_tool = TavilySearchResults(
    max_results=5,
    include_raw_content=True,
)

# Define code tool
def code_tool(code: str) -> str:
    """Execute python code and return the result."""
    sbx = Sandbox()
    execution = sbx.run_code(code)
    if execution.error:
        return f"Error: {execution.error}"
    return f"Results: {execution.results}, Logs: {execution.logs}"

# Define input schema for stock ticker tool
class TickerToolInput(TypedDict):
    """Input format for the ticker tool.

    The tool will pull data in aggregate blocks (timespan_multiplier * timespan) from the from_date to the to_date.
    """

    ticker: Annotated[str, ..., "The ticker symbol of the stock"]
    timespan: Annotated[
        Literal["minute", "hour", "day", "week", "month", "quarter", "year"],
        ...,
        "The size of the time window.",
    ]
    timespan_multiplier: Annotated[int, ..., "The multiplier for the time window"]
    from_date: Annotated[str, ..., "The date to start pulling data from, YYYY-MM-DD format - ONLY include the year, month, and day"]
    to_date: Annotated[str, ..., "The date to stop pulling data, YYYY-MM-DD format - ONLY include the year, month, and day"]

api_wrapper = PolygonAPIWrapper()
polygon_aggregates = PolygonAggregates(api_wrapper=api_wrapper)

# Define stock ticker tool (wraps the Polygon aggregates tool)
def ticker_tool(query: TickerToolInput) -> str:
    """Pull data for the ticker."""
    return polygon_aggregates.invoke(query)
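
Before wiring the tools into an agent, you can sanity-check them in isolation. An optional, minimal sketch (it assumes the E2B and Tavily keys above are set and will make real API calls):

# Optional smoke test of the tools defined above.
print(code_tool("print(1 + 1)"))  # runs the snippet in an E2B sandbox
print(search_tool.invoke("latest AAPL earnings news"))  # Tavily search results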

Define agent

Now that we have all of our tools defined, we can create our agent with LangGraph's create_react_agent/createReactAgent.

from typing import Optional
from typing_extensions import Annotated, TypedDict

from langgraph.prebuilt import create_react_agent

class AgentOutputFormat(TypedDict):
    numeric_answer: Annotated[Optional[float], ..., "The numeric answer, if the user asked for one"]
    text_answer: Annotated[Optional[str], ..., "The text answer, if the user asked for one"]
    reasoning: Annotated[str, ..., "The reasoning behind the answer"]

agent = create_react_agent(
    model="openai:gpt-4o-mini",
    tools=[code_tool, search_tool, polygon_aggregates],
    response_format=AgentOutputFormat,
    state_modifier="You are a financial expert. Respond to the users query accurately",
)
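
Before writing tests, it can help to run the agent once end to end and inspect the shape of its output, since the tests below rely on the structured_response key. A minimal sketch (the exact answer will vary; all of the API keys above must be set):

result = agent.invoke(
    {"messages": [{"role": "user", "content": "What was Apple's average closing price last week?"}]}
)
print(result["structured_response"])  # AgentOutputFormat dict: numeric_answer, text_answer, reasoning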

Write tests

Now that we have defined our agent, let's write a few tests to ensure basic functionality. In this tutorial we will test that the agent's tool calling works, that the agent knows to ignore irrelevant questions, and that it can answer complex questions that require using all of the tools.

We first need to set up a test file and add the imports needed at the top of the file.

Create a tests/test_agent.py file.

from app import agent, polygon_aggregates, search_tool # import from wherever your agent is defined
import pytest
from langsmith import testing as t

Test 1: Handle off-topic questions

The first test is a simple check that the agent does not use tools on irrelevant queries.

@pytest.mark.langsmith
@pytest.mark.parametrize(  # <-- Can still use all normal pytest markers
    "query",
    ["Hello!", "How are you doing?"],
)
def test_no_tools_on_offtopic_query(query: str) -> None:
    """Test that the agent does not use tools on offtopic queries."""
    # Log the test example
    t.log_inputs({"query": query})
    expected = []
    t.log_reference_outputs({"tool_calls": expected})

    # Call the agent's model node directly instead of running the ReACT loop.
    result = agent.nodes["agent"].invoke(
        {"messages": [{"role": "user", "content": query}]}
    )
    actual = result["messages"][0].tool_calls
    t.log_outputs({"tool_calls": actual})

    # Check that no tool calls were made.
    assert actual == expected

Test 2: Simple tool calling

For tool calling, we will verify that the agent calls the correct tool with the correct parameters.

@pytest.mark.langsmith
def test_searches_for_correct_ticker() -> None:
    """Test that the model looks up the correct ticker on simple query."""
    # Log the test example
    query = "What is the price of Apple?"
    t.log_inputs({"query": query})
    expected = "AAPL"
    t.log_reference_outputs({"ticker": expected})

    # Call the agent's model node directly instead of running the full ReACT loop.
    result = agent.nodes["agent"].invoke(
        {"messages": [{"role": "user", "content": query}]}
    )
    tool_calls = result["messages"][0].tool_calls
    if tool_calls[0]["name"] == polygon_aggregates.name:
        actual = tool_calls[0]["args"]["ticker"]
    else:
        actual = None
    t.log_outputs({"ticker": actual})

    # Check that the right ticker was queried
    assert actual == expected

Test 3: Complex tool calling

Some tool calls are easier to test than others. For the ticker lookup, we can assert that the correct ticker is searched. For the coding tool, the inputs and outputs are much less constrained, and there are many ways to get the right answer. In this case it is simpler to test that the tool is used correctly by running the full agent and asserting that it both calls the coding tool and arrives at the correct answer.

@pytest.mark.langsmith
def test_executes_code_when_needed() -> None:
    query = (
        "In the past year Facebook stock went up by 66.76%, "
        "Apple by 25.24%, Google by 37.11%, Amazon by 47.52%, "
        "Netflix by 78.31%. Whats the avg return in the past "
        "year of the FAANG stocks, expressed as a percentage?"
    )
    t.log_inputs({"query": query})
    expected = 50.988
    t.log_reference_outputs({"response": expected})

    # Test that the agent executes code when needed
    result = agent.invoke({"messages": [{"role": "user", "content": query}]})
    t.log_outputs({"result": result["structured_response"].get("numeric_answer")})

    # Grab all the tool calls made by the LLM
    tool_calls = [
        tc["name"]
        for msg in result["messages"]
        for tc in getattr(msg, "tool_calls", [])
    ]

    # This will log the number of steps taken by the agent, which is useful for
    # determining how efficiently the agent gets to an answer.
    t.log_feedback(key="num_steps", score=len(result["messages"]) - 1)

    # Assert that the code tool was used
    assert "code_tool" in tool_calls

    # Assert that a numeric answer was provided:
    assert result["structured_response"].get("numeric_answer") is not None

    # Assert that the answer is correct
    assert abs(result["structured_response"]["numeric_answer"] - expected) <= 0.01

Test 4: LLM-as-a-judge

We will make sure the agent's answer is grounded in the search results by running an LLM-as-a-judge evaluation. To trace the LLM-as-a-judge call separately from the agent's trace, we use the trace_feedback context manager provided by LangSmith in Python, and the wrapEvaluator function in TypeScript.

from typing_extensions import Annotated, TypedDict

from langchain.chat_models import init_chat_model

class Grade(TypedDict):
    """Evaluate the groundedness of an answer in source documents."""

    score: Annotated[
        bool,
        ...,
        "Return True if the answer is fully grounded in the source documents, otherwise False.",
    ]

judge_llm = init_chat_model("gpt-4o").with_structured_output(Grade)

@pytest.mark.langsmith
def test_grounded_in_source_info() -> None:
    """Test that response is grounded in the tool outputs."""
    query = "How did Nvidia stock do in 2024 according to analysts?"
    t.log_inputs({"query": query})

    result = agent.invoke({"messages": [{"role": "user", "content": query}]})

    # Grab all the search calls made by the LLM
    search_results = "\n\n".join(
        msg.content
        for msg in result["messages"]
        if msg.type == "tool" and msg.name == search_tool.name
    )
    t.log_outputs(
        {
            "response": result["structured_response"].get("text_answer"),
            "search_results": search_results,
        }
    )

    # Trace the feedback LLM run separately from the agent run.
    with t.trace_feedback():
        # Instructions for the LLM judge
        instructions = (
            "Grade the following ANSWER. "
            "The ANSWER should be fully grounded in (i.e. supported by) the source DOCUMENTS. "
            "Return True if the ANSWER is fully grounded in the DOCUMENTS. "
            "Return False if the ANSWER is not grounded in the DOCUMENTS."
        )
        answer_and_docs = (
            f"ANSWER: {result['structured_response'].get('text_answer', '')}\n"
            f"DOCUMENTS:\n{search_results}"
        )

        # Run the judge LLM
        grade = judge_llm.invoke(
            [
                {"role": "system", "content": instructions},
                {"role": "user", "content": answer_and_docs},
            ]
        )
        t.log_feedback(key="groundedness", score=grade["score"])

    assert grade["score"]

Run tests

Once your config file is set up (only needed if you are using Vitest or Jest), you can run your tests with the commands below.

Config file for Vitest/Jest
ls.vitest.config.ts
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    include: ["**/*.eval.?(c|m)[jt]s"],
    reporters: ["langsmith/vitest/reporter"],
    setupFiles: ["dotenv/config"],
  },
});

Python
pytest --langsmith-output tests
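
For Vitest, the equivalent run command will look roughly like the following (this assumes the npx launcher and the config file shown above; adjust for Jest or your package manager):

TypeScript (Vitest)
npx vitest run --config ls.vitest.config.ts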

Reference code

Remember to also add the config file for Vitest/Jest to your project.

Agent

Agent code
from typing import Optional

from e2b_code_interpreter import Sandbox
from langchain_community.tools import PolygonAggregates, TavilySearchResults
from langchain_community.utilities.polygon import PolygonAPIWrapper
from langgraph.prebuilt import create_react_agent
from typing_extensions import Annotated, TypedDict

search_tool = TavilySearchResults(
    max_results=5,
    include_raw_content=True,
)

def code_tool(code: str) -> str:
    """Execute python code and return the result."""
    sbx = Sandbox()
    execution = sbx.run_code(code)
    if execution.error:
        return f"Error: {execution.error}"
    return f"Results: {execution.results}, Logs: {execution.logs}"

polygon_aggregates = PolygonAggregates(api_wrapper=PolygonAPIWrapper())

class AgentOutputFormat(TypedDict):
    numeric_answer: Annotated[
        Optional[float], ..., "The numeric answer, if the user asked for one"
    ]
    text_answer: Annotated[
        Optional[str], ..., "The text answer, if the user asked for one"
    ]
    reasoning: Annotated[str, ..., "The reasoning behind the answer"]

agent = create_react_agent(
    model="openai:gpt-4o-mini",
    tools=[code_tool, search_tool, polygon_aggregates],
    response_format=AgentOutputFormat,
    state_modifier="You are a financial expert. Respond to the users query accurately",
)

Tests

Test code
# from app import agent, polygon_aggregates, search_tool  # import from wherever your agent is defined
import pytest
from langchain.chat_models import init_chat_model
from langsmith import testing as t
from typing_extensions import Annotated, TypedDict

@pytest.mark.langsmith
@pytest.mark.parametrize(  # <-- Can still use all normal pytest markers
    "query",
    ["Hello!", "How are you doing?"],
)
def test_no_tools_on_offtopic_query(query: str) -> None:
    """Test that the agent does not use tools on offtopic queries."""
    # Log the test example
    t.log_inputs({"query": query})
    expected = []
    t.log_reference_outputs({"tool_calls": expected})

    # Call the agent's model node directly instead of running the ReACT loop.
    result = agent.nodes["agent"].invoke(
        {"messages": [{"role": "user", "content": query}]}
    )
    actual = result["messages"][0].tool_calls
    t.log_outputs({"tool_calls": actual})

    # Check that no tool calls were made.
    assert actual == expected

@pytest.mark.langsmith
def test_searches_for_correct_ticker() -> None:
    """Test that the model looks up the correct ticker on simple query."""
    # Log the test example
    query = "What is the price of Apple?"
    t.log_inputs({"query": query})
    expected = "AAPL"
    t.log_reference_outputs({"ticker": expected})

    # Call the agent's model node directly instead of running the full ReACT loop.
    result = agent.nodes["agent"].invoke(
        {"messages": [{"role": "user", "content": query}]}
    )
    tool_calls = result["messages"][0].tool_calls
    if tool_calls[0]["name"] == polygon_aggregates.name:
        actual = tool_calls[0]["args"]["ticker"]
    else:
        actual = None
    t.log_outputs({"ticker": actual})

    # Check that the right ticker was queried
    assert actual == expected

@pytest.mark.langsmith
def test_executes_code_when_needed() -> None:
    query = (
        "In the past year Facebook stock went up by 66.76%, "
        "Apple by 25.24%, Google by 37.11%, Amazon by 47.52%, "
        "Netflix by 78.31%. Whats the avg return in the past "
        "year of the FAANG stocks, expressed as a percentage?"
    )
    t.log_inputs({"query": query})
    expected = 50.988
    t.log_reference_outputs({"response": expected})

    # Test that the agent executes code when needed
    result = agent.invoke({"messages": [{"role": "user", "content": query}]})
    t.log_outputs({"result": result["structured_response"].get("numeric_answer")})

    # Grab all the tool calls made by the LLM
    tool_calls = [
        tc["name"]
        for msg in result["messages"]
        for tc in getattr(msg, "tool_calls", [])
    ]

    # This will log the number of steps taken by the agent, which is useful for
    # determining how efficiently the agent gets to an answer.
    t.log_feedback(key="num_steps", score=len(result["messages"]) - 1)

    # Assert that the code tool was used
    assert "code_tool" in tool_calls

    # Assert that a numeric answer was provided:
    assert result["structured_response"].get("numeric_answer") is not None

    # Assert that the answer is correct
    assert abs(result["structured_response"]["numeric_answer"] - expected) <= 0.01

class Grade(TypedDict):
    """Evaluate the groundedness of an answer in source documents."""

    score: Annotated[
        bool,
        ...,
        "Return True if the answer is fully grounded in the source documents, otherwise False.",
    ]

judge_llm = init_chat_model("gpt-4o").with_structured_output(Grade)

@pytest.mark.langsmith
def test_grounded_in_source_info() -> None:
    """Test that response is grounded in the tool outputs."""
    query = "How did Nvidia stock do in 2024 according to analysts?"
    t.log_inputs({"query": query})

    result = agent.invoke({"messages": [{"role": "user", "content": query}]})

    # Grab all the search calls made by the LLM
    search_results = "\n\n".join(
        msg.content
        for msg in result["messages"]
        if msg.type == "tool" and msg.name == search_tool.name
    )
    t.log_outputs(
        {
            "response": result["structured_response"].get("text_answer"),
            "search_results": search_results,
        }
    )

    # Trace the feedback LLM run separately from the agent run.
    with t.trace_feedback():
        # Instructions for the LLM judge
        instructions = (
            "Grade the following ANSWER. "
            "The ANSWER should be fully grounded in (i.e. supported by) the source DOCUMENTS. "
            "Return True if the ANSWER is fully grounded in the DOCUMENTS. "
            "Return False if the ANSWER is not grounded in the DOCUMENTS."
        )
        answer_and_docs = (
            f"ANSWER: {result['structured_response'].get('text_answer', '')}\n"
            f"DOCUMENTS:\n{search_results}"
        )

        # Run the judge LLM
        grade = judge_llm.invoke(
            [
                {"role": "system", "content": instructions},
                {"role": "user", "content": answer_and_docs},
            ]
        )
        t.log_feedback(key="groundedness", score=grade["score"])

    assert grade["score"]
