
Assertions Cookbook

pytest-aitest uses plain pytest assertions. There's no custom DSL — you write Python:

assert result.success
assert result.tool_was_called("transfer")
assert "balance" in result.final_response

This page shows how to assert everything you'd want to check, organized by category.
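
For reference, here is how those fragments fit into a complete test, using the aitest_run and agent fixtures that appear in the examples below. The prompt, tool name, and expected wording are illustrative:

async def test_transfer(aitest_run, agent):
    # Run the agent against a prompt and get a result object back
    result = await aitest_run(agent, "Transfer $200 from checking to savings")

    # Then assert on it with plain Python
    assert result.success
    assert result.tool_was_called("transfer")
    assert "transferred" in result.final_response.lower()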

Tool Call Assertions

Was a tool called?

assert result.tool_was_called("get_balance")
assert not result.tool_was_called("delete_account")

How many times?

assert result.tool_call_count("get_balance") == 2
assert result.tool_call_count("transfer") >= 1

Tool call order

Verify tools were called in a specific sequence:

names = [c.name for c in result.all_tool_calls]
assert names.index("get_balance") < names.index("transfer")

Or check an exact sequence of calls:

names = [c.name for c in result.all_tool_calls]
assert names == ["get_balance", "transfer", "get_balance"]
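
If you only care about relative order and want to ignore unrelated calls in between, a small helper you can define yourself keeps the assertion from breaking when extra tools are called. This is a sketch built on the all_tool_calls list above; tool names are illustrative:

def called_in_order(result, *expected):
    # Consume the call names left to right; each `in` check advances the iterator,
    # so this passes only if the expected names appear in this order (gaps allowed).
    names = iter(c.name for c in result.all_tool_calls)
    return all(name in names for name in expected)

assert called_in_order(result, "get_balance", "transfer")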

Tool parameters

Check what arguments were passed to a tool:

# Argument from the first call to the tool
assert result.tool_call_arg("transfer", "amount") == 200

# All calls to a tool
for call in result.tool_calls_for("transfer"):
    assert call.arguments["from_account"] == "checking"

# Nested parameters (no dot-notation helper; index into the nested dict manually)
call = result.tool_calls_for("create_user")[0]
assert call.arguments["address"]["city"] == "Paris"

Parameter pattern matching

import re

call = result.tool_calls_for("search")[0]
assert re.match(r"\d{4}-\d{2}-\d{2}", call.arguments["date"])

Tool results

Inspect what a tool returned:

import json

call = result.tool_calls_for("get_balance")[0]
data = json.loads(call.result)
assert data["balance"] >= 0

Use JSONPath for complex tool results (install jsonpath-ng):

from jsonpath_ng import parse

call = result.tool_calls_for("get_user")[0]
data = json.loads(call.result)
matches = parse("$.accounts[*].type").find(data)
assert any(m.value == "checking" for m in matches)

Only expected tools were called

allowed = {"get_balance", "get_transactions"}
assert result.tool_names_called <= allowed

No tools were called

assert len(result.all_tool_calls) == 0

Output Assertions

Contains text

assert "balance" in result.final_response.lower()
assert "$500" in result.final_response

Does NOT contain text

assert "error" not in result.final_response.lower()
assert "sorry" not in result.final_response.lower()

Regex match

import re

assert re.search(r"\$\d+\.\d{2}", result.final_response)

All responses (multi-turn)

all_text = " ".join(result.all_responses)
assert "transferred" in all_text.lower()

Semantic assertions

Use the built-in llm_assert fixture (powered by the pydantic-evals LLM judge) for meaning-based checks:

async def test_response_quality(aitest_run, agent, llm_assert):
    result = await aitest_run(agent, "Show me my balances")
    assert llm_assert(
        result.final_response,
        "includes both checking and savings account balances"
    )

Multi-Dimension Scoring

Use the llm_score fixture for rubric-based evaluation across multiple dimensions:

from pytest_aitest import ScoringDimension, assert_score

RUBRIC = [
    ScoringDimension("accuracy", "Correct and factual content"),
    ScoringDimension("completeness", "Covers all requested topics"),
    ScoringDimension("clarity", "Well-organized and readable"),
]

async def test_output_quality(aitest_run, agent, llm_score):
    result = await aitest_run(agent, "Explain retry patterns")
    scores = llm_score(result.final_response, RUBRIC)
    assert_score(scores, min_total=10)  # 10/15

Threshold variants

# Percentage threshold
assert_score(scores, min_pct=0.7)  # 70% of max

# Per-dimension minimums
assert_score(scores, min_dimensions={"accuracy": 4, "completeness": 3})

# Combined
assert_score(scores, min_total=10, min_dimensions={"accuracy": 4})

Weighted dimensions

RUBRIC = [
    ScoringDimension("accuracy", "Factual correctness", weight=2.0),
    ScoringDimension("style", "Writing quality", weight=0.5),
]
scores = llm_score(content, RUBRIC)
print(scores.weighted_score)  # 0.0-1.0 weighted average

See the Multi-Dimension Scoring guide for full details.

Image Assertions

Check if images were returned

screenshots = result.tool_images_for("screenshot")
assert len(screenshots) > 0
assert screenshots[-1].media_type == "image/png"
assert len(screenshots[-1].data) > 1000  # Reasonable size

AI-graded image evaluation

Use the llm_assert_image fixture to have a vision LLM evaluate an image:

async def test_chart_quality(aitest_run, agent, llm_assert_image):
    result = await aitest_run(agent, "Create a bar chart")
    screenshots = result.tool_images_for("screenshot")
    assert llm_assert_image(
        screenshots[-1],
        "shows a bar chart with labeled axes"
    )

Image properties

screenshots = result.tool_images_for("screenshot")
for img in screenshots:
    print(f"Type: {img.media_type}, Size: {len(img.data)} bytes")

See the Image Assertions guide for complete documentation.

Performance Assertions

Max duration

assert result.duration_ms < 10_000  # Under 10 seconds

Max tokens

total_tokens = result.token_usage.get("prompt", 0) + result.token_usage.get("completion", 0)
assert total_tokens < 5000

Max cost

assert result.cost_usd < 0.01  # Under 1 cent

Max turns

assert len(result.turns) <= 6
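
Budgets are often checked together. Here is a sketch combining the checks above into one test; the prompt and thresholds are illustrative:

async def test_within_budget(aitest_run, agent):
    result = await aitest_run(agent, "Show me my balances")
    assert result.success

    # Latency, token, cost, and turn budgets in one place
    assert result.duration_ms < 10_000
    total_tokens = result.token_usage.get("prompt", 0) + result.token_usage.get("completion", 0)
    assert total_tokens < 5000
    assert result.cost_usd < 0.01
    assert len(result.turns) <= 6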

Error Assertions

No errors

assert result.success
assert result.error is None

Tool errors

Check that no tool call produced an error:

assert all(c.error is None for c in result.all_tool_calls)

Expected error handling

Verify the agent handled an error gracefully:

async def test_insufficient_funds(aitest_run, agent):
    result = await aitest_run(agent, "Transfer $1M from empty account")
    # Agent should succeed (handle the error), not crash
    assert result.success
    assert "insufficient" in result.final_response.lower()

Clarification Detection

Requires ClarificationDetection(enabled=True) on the agent. See Agents for setup.

Agent didn't ask questions

assert not result.asked_for_clarification

Count clarification requests

assert result.clarification_count == 0

Inspect clarification details

if result.clarification_stats:
    print(f"Asked {result.clarification_stats.count} time(s)")
    print(f"At turns: {result.clarification_stats.turn_indices}")
    print(f"Examples: {result.clarification_stats.examples}")

CLI Server Assertions

When testing CLI tools via CLIServer, tool results contain JSON with exit_code, stdout, and stderr:

import json

# Get the CLI execution result
call = result.tool_calls_for("git_execute")[0]
cli_result = json.loads(call.result)

# Exit code
assert cli_result["exit_code"] == 0

# Stdout content
assert "main" in cli_result["stdout"]

# Stderr is empty (no errors)
assert cli_result["stderr"] == ""

Regex on CLI output

import re

call = result.tool_calls_for("git_execute")[0]
cli_result = json.loads(call.result)
assert re.search(r"commit [a-f0-9]{7}", cli_result["stdout"])

Session Assertions

Verifying context continuity

@pytest.mark.session("banking-flow")
class TestBankingWorkflow:
    async def test_check_balance(self, aitest_run, agent):
        result = await aitest_run(agent, "What's my checking balance?")
        assert result.success
        assert not result.is_session_continuation

    async def test_transfer(self, aitest_run, agent):
        result = await aitest_run(agent, "Transfer $100 to savings")
        assert result.success
        assert result.is_session_continuation
        assert result.session_context_count > 0

Data extraction between session tests

Extract values from tool results and use them in later tests:

@pytest.mark.session("user-flow")
class TestUserWorkflow:
    user_id: str

    async def test_create(self, aitest_run, agent):
        result = await aitest_run(agent, "Create a user named Alice")
        assert result.success
        # Extract from tool result
        call = result.tool_calls_for("create_user")[0]
        data = json.loads(call.result)
        self.__class__.user_id = data["id"]

    async def test_lookup(self, aitest_run, agent):
        result = await aitest_run(
            agent, f"Find user {self.user_id}"
        )
        assert result.tool_was_called("get_user")

Boolean Combinators

Use Python's and, or, not — no special syntax needed:

# ANY of these tools was called (OR)
assert result.tool_was_called("get_balance") or result.tool_was_called("get_all_balances")

# ALL of these tools were called (AND)
assert result.tool_was_called("get_balance") and result.tool_was_called("transfer")

# This tool was NOT called (NOT)
assert not result.tool_was_called("delete_account")

For complex conditions, use any() / all():

required_tools = ["get_balance", "transfer", "get_transactions"]
assert all(result.tool_was_called(t) for t in required_tools)

optional_tools = ["get_exchange_rate", "convert_currency"]
assert any(result.tool_was_called(t) for t in optional_tools)

Skill Assertions

Skill references were used

When an agent has a skill with references, verify the agent accessed them:

assert result.tool_was_called("read_skill_reference")

Specific reference was read

call = result.tool_calls_for("read_skill_reference")[0]
assert call.arguments["filename"] == "pricing-rules.md"