`pydantic_evals.online_capability`

Online evaluation capability for pydantic-ai agents.

Provides an OnlineEvaluation capability that attaches evaluators to agent runs, dispatching them asynchronously in the background after each run completes.

OnlineEvaluation `dataclass`

Bases: AbstractCapability[AgentDepsT]

Capability that runs online evaluators on agent run results.

Dispatches evaluators asynchronously in the background after each completed agent run. Non-blocking — the agent run returns without waiting for evaluators to finish.

Note

OnlineEvaluation wraps [agent.run()][pydantic_ai.Agent.run], [agent.run_stream()][pydantic_ai.Agent.run_stream], and [agent.iter()][pydantic_ai.Agent.iter] when the run reaches a final result. For streaming runs, evaluators are dispatched only after the final result is available and the surrounding context manager exits.

Example:

from dataclasses import dataclass

from pydantic_ai import Agent
from pydantic_evals.evaluators import Evaluator, EvaluatorContext
from pydantic_evals.online_capability import OnlineEvaluation


@dataclass
class OutputNotEmpty(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return bool(ctx.output)


agent = Agent(
    'openai:gpt-5.2',
    name='assistant',
    capabilities=[OnlineEvaluation(evaluators=[OutputNotEmpty()])],
)

Source code in pydantic_evals/pydantic_evals/online_capability.py

@dataclass(kw_only=True)
class OnlineEvaluation(AbstractCapability[AgentDepsT]):
    """Capability that runs online evaluators on agent run results.

    Dispatches evaluators asynchronously in the background after each completed
    agent run. Non-blocking — the agent run returns without waiting for evaluators
    to finish.

    !!! note
        [`OnlineEvaluation`][pydantic_evals.online_capability.OnlineEvaluation]
        wraps [`agent.run()`][pydantic_ai.Agent.run],
        [`agent.run_stream()`][pydantic_ai.Agent.run_stream], and
        [`agent.iter()`][pydantic_ai.Agent.iter] when the run reaches a
        final result.
        For streaming runs, evaluators are dispatched only after the final
        result is available and the surrounding context manager exits.

    Example:
    ```python
    from dataclasses import dataclass

    from pydantic_ai import Agent
    from pydantic_evals.evaluators import Evaluator, EvaluatorContext
    from pydantic_evals.online_capability import OnlineEvaluation


    @dataclass
    class OutputNotEmpty(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return bool(ctx.output)


    agent = Agent(
        'openai:gpt-5.2',
        name='assistant',
        capabilities=[OnlineEvaluation(evaluators=[OutputNotEmpty()])],
    )
    ```
    """

    evaluators: Sequence[Evaluator | OnlineEvaluator]
    """Evaluators to run after each agent run."""

    config: OnlineEvalConfig | None = None
    """Optional config override. Defaults to the global `DEFAULT_CONFIG`."""

    _online_evaluators: list[OnlineEvaluator] = field(init=False, repr=False)
    _resolved_config: OnlineEvalConfig = field(init=False, repr=False)

    def __post_init__(self) -> None:
        self._online_evaluators = [
            e if isinstance(e, OnlineEvaluator) else OnlineEvaluator(evaluator=e) for e in self.evaluators
        ]
        self._resolved_config = self.config if self.config is not None else DEFAULT_CONFIG

    @classmethod
    def get_serialization_name(cls) -> str | None:
        return None

    async def wrap_run(
        self,
        ctx: RunContext[AgentDepsT],
        *,
        handler: WrapRunHandler,
    ) -> AgentRunResult[Any]:
        config = self._resolved_config

        # Skip if disabled or already inside an evaluation context (e.g. Dataset.evaluate)
        if not config.should_evaluate():
            return await handler()

        # Use the raw prompt so sampling and evaluation see the same inputs value.
        inputs = ctx.prompt

        # Determine which evaluators are sampled (before running the agent)
        sampled = _online_internal.sample_evaluators(
            self._online_evaluators,
            config,
            inputs,
        )
        if not sampled:
            return await handler()

        # Merge config and run metadata
        metadata: dict[str, Any] | None = None
        if config.metadata is not None or ctx.metadata is not None:
            metadata = {**(config.metadata or {}), **(ctx.metadata or {})}

        # Use the agent's declared name when available so evaluation events can be
        # filtered per-agent. Fall back to the generic 'agent' label when unset.
        agent_name = ctx.agent.name if ctx.agent is not None else None
        target = agent_name or 'agent'
        span_reference = _parse_traceparent(logfire_api.get_context().get('traceparent'))

        # Run the agent with span tree capture and attribute/metric tracking.
        # `get_eval_context_kwargs` is bound by the `with` once `run_task` enters; pre-init
        # to `None` only to satisfy pyright's flow analysis on the except path.
        get_eval_context_kwargs: Callable[[], dict[str, Any]] | None = None
        try:
            with _task_run.run_task() as get_eval_context_kwargs:
                result = await handler()
        except Exception as e:
            error_evaluators = [ev for ev in sampled if ev.run_on_errors]
            if error_evaluators and get_eval_context_kwargs is not None:
                context = EvaluatorContext(
                    name=ctx.run_id,
                    inputs=inputs,
                    output=e,
                    expected_output=None,
                    metadata=metadata,
                    **get_eval_context_kwargs(),
                )
                _online_internal.dispatch_async(
                    _online_internal.dispatch_evaluators(error_evaluators, context, span_reference, target, config)
                )
            raise

        context = EvaluatorContext(
            name=ctx.run_id,
            inputs=inputs,
            output=result.output,
            expected_output=None,
            metadata=metadata,
            **get_eval_context_kwargs(),
        )
        _online_internal.dispatch_async(
            _online_internal.dispatch_evaluators(sampled, context, span_reference, target, config)
        )

        return result

evaluators `instance-attribute`

evaluators: Sequence[Evaluator | OnlineEvaluator]

Evaluators to run after each agent run.

config `class-attribute` `instance-attribute`

config: OnlineEvalConfig | None = None

Optional config override. Defaults to the global DEFAULT_CONFIG.

pydantic_evals.online_capability

OnlineEvaluation dataclass

evaluators instance-attribute

config class-attribute instance-attribute

`pydantic_evals.online_capability`

OnlineEvaluation `dataclass`

evaluators `instance-attribute`

config `class-attribute` `instance-attribute`