Skip to content

Guide: Testing Agents

Statistical Testing with SPRT

from largestack._test.assertions import SPRT

# Sequential testing: feed one pass/fail outcome at a time into the SPRT and
# stop as soon as it reports a verdict, instead of always running a fixed batch.
max_runs = 100  # upper bound on agent executions if no verdict is ever reached

sprt = SPRT(h0_rate=0.7, h1_rate=0.9)  # H0: 70% pass, H1: 90% pass
for run in range(1, max_runs + 1):
    success = run_agent_test()
    verdict = sprt.update(success)
    if verdict:
        print(f"Verdict after {run} runs: {verdict}")
        break  # PASS or FAIL — no need to run more
else:
    # Reached only when the loop finished without `break`: the run budget was
    # spent and no verdict was returned. Report it rather than ending silently.
    print(f"No verdict after {max_runs} runs: inconclusive")
# Typically terminates in 22 runs instead of 100 (78% savings)

Record/Replay for CI

from largestack._test.recorder import Recorder

# Record live interactions
# NOTE(review): `await` is only valid inside an `async def` (or an async-aware
# REPL/notebook), so this snippet must run from within a coroutine.
with Recorder("tests/fixtures/research.json") as rec:
    result = await agent.run("AI trends")
    # NOTE(review): `messages`, `response` and `model` are not defined anywhere
    # in this snippet, and `result` is never used. Presumably they are meant to
    # come from the agent call above; confirm against the Recorder API.
    rec.record(messages, response, model)

# CI replay: read responses back from the recorded fixture instead of calling
# the live API (deterministic, no API key needed).
from largestack._test.replayer import Replayer

fixture_path = "tests/fixtures/research.json"
with Replayer(fixture_path) as rep:
    # Returns recorded LLMResponse
    response = rep.next_response()

CI/CD Quality Gates

from largestack._test.ci_gates import QualityGate

# Each entry maps a metric name to a (comparison operator, limit) pair.
thresholds = {
    "task_completion": (">=", 0.85),
    "tool_correctness": (">=", 0.90),
    "cost_per_run": ("<=", 1.00),
}

# Measured values for the run being gated, keyed by the same metric names.
observed = {
    "task_completion": 0.92,
    "tool_correctness": 0.95,
    "cost_per_run": 0.45,
}

gate = QualityGate(thresholds=thresholds)
results = gate.check(observed)
report = gate.format_report(results)
print(report)
# Quality Gate: PASSED ✅
#   ✅ task_completion: 0.92 (threshold: 0.85)
#   ✅ tool_correctness: 0.95 (threshold: 0.90)
#   ✅ cost_per_run: 0.45 (threshold: 1.0)

6 Agent Metrics

from largestack._test.eval_metrics import AgentMetrics

# Evaluate `result` against the tools it was expected to use and the optimal
# number of steps.
# NOTE(review): `result` is not defined in this snippet; presumably it is the
# value returned by an earlier agent run. Confirm.
expected_tool_names = ["web_search", "calculator"]
metrics = AgentMetrics.evaluate(
    result,
    expected_tools=expected_tool_names,
    optimal_steps=3,
)
# Sample value of `metrics`:
# {"task_completion": 1.0, "tool_correctness": 1.0, "step_efficiency": 0.75}