Skip to content

pydantic_evals.online_capability

Online evaluation capability for pydantic-ai agents.

Provides an OnlineEvaluation capability that attaches evaluators to agent runs, dispatching them asynchronously in the background after each run completes.

OnlineEvaluation dataclass

Bases: AbstractCapability[AgentDepsT]

Capability that runs online evaluators on agent run results.

Dispatches evaluators asynchronously in the background after each completed agent run. Non-blocking — the agent run returns without waiting for evaluators to finish.

Note

OnlineEvaluation wraps agent.run(), agent.run_stream(), and agent.iter() when the run reaches a final result. For streaming runs, evaluators are dispatched only after the final result is available and the surrounding context manager exits.

Example:

from dataclasses import dataclass

from pydantic_ai import Agent
from pydantic_evals.evaluators import Evaluator, EvaluatorContext
from pydantic_evals.online_capability import OnlineEvaluation


# Minimal custom evaluator: reports whether the run produced a truthy output.
@dataclass
class OutputNotEmpty(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # `bool(...)` is False for `None` and for empty strings/containers.
        return bool(ctx.output)


# Register the evaluator on the agent through the `OnlineEvaluation` capability.
agent = Agent(
    'openai:gpt-5.2',
    name='assistant',
    capabilities=[OnlineEvaluation(evaluators=[OutputNotEmpty()])],
)

Source code in pydantic_evals/pydantic_evals/online_capability.py
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
@dataclass(kw_only=True)
class OnlineEvaluation(AbstractCapability[AgentDepsT]):
    """Agent capability that evaluates run results online, in the background.

    Once an agent run completes, the sampled evaluators are handed off for
    asynchronous background execution. The run itself is never held up: its
    result is returned without waiting for any evaluator to finish.

    !!! note
        [`OnlineEvaluation`][pydantic_evals.online_capability.OnlineEvaluation]
        wraps [`agent.run()`][pydantic_ai.Agent.run],
        [`agent.run_stream()`][pydantic_ai.Agent.run_stream], and
        [`agent.iter()`][pydantic_ai.Agent.iter] when the run reaches a
        final result.
        For streaming runs, evaluators are dispatched only after the final
        result is available and the surrounding context manager exits.

    Example:
    ```python
    from dataclasses import dataclass

    from pydantic_ai import Agent
    from pydantic_evals.evaluators import Evaluator, EvaluatorContext
    from pydantic_evals.online_capability import OnlineEvaluation


    @dataclass
    class OutputNotEmpty(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return bool(ctx.output)


    agent = Agent(
        'openai:gpt-5.2',
        name='assistant',
        capabilities=[OnlineEvaluation(evaluators=[OutputNotEmpty()])],
    )
    ```
    """

    evaluators: Sequence[Evaluator | OnlineEvaluator]
    """Evaluators applied after every agent run."""

    config: OnlineEvalConfig | None = None
    """Optional config override; the global `DEFAULT_CONFIG` is used when unset."""

    _online_evaluators: list[OnlineEvaluator] = field(init=False, repr=False)
    _resolved_config: OnlineEvalConfig = field(init=False, repr=False)

    def __post_init__(self) -> None:
        """Normalize `evaluators` to `OnlineEvaluator`s and resolve the effective config."""
        wrapped: list[OnlineEvaluator] = []
        for candidate in self.evaluators:
            if isinstance(candidate, OnlineEvaluator):
                wrapped.append(candidate)
            else:
                # Plain evaluators are wrapped so every entry has the same type.
                wrapped.append(OnlineEvaluator(evaluator=candidate))
        self._online_evaluators = wrapped

        if self.config is None:
            self._resolved_config = DEFAULT_CONFIG
        else:
            self._resolved_config = self.config

    @classmethod
    def get_serialization_name(cls) -> str | None:
        """Return `None` as this capability's serialization name.

        NOTE(review): presumably this opts the capability out of name-based
        serialization — confirm against `AbstractCapability`.
        """
        return None

    async def wrap_run(
        self,
        ctx: RunContext[AgentDepsT],
        *,
        handler: WrapRunHandler,
    ) -> AgentRunResult[Any]:
        """Run the agent via `handler`, then dispatch the sampled evaluators in the background.

        Args:
            ctx: Run context; supplies the prompt, run id, run metadata and agent.
            handler: Awaitable callback that performs the actual agent run.

        Returns:
            The result produced by `handler`, unchanged.

        Raises:
            Exception: Whatever `handler` raises is re-raised. Before that, sampled
                evaluators with `run_on_errors` set are dispatched with the
                exception as the evaluation output.
        """
        cfg = self._resolved_config

        # Nothing to do when evaluation is disabled or we are already inside an
        # evaluation context (e.g. Dataset.evaluate).
        if not cfg.should_evaluate():
            return await handler()

        # The raw prompt is used so that sampling and evaluation see the same inputs value.
        prompt = ctx.prompt

        # Sampling is decided up front, before the agent runs.
        active = _online_internal.sample_evaluators(self._online_evaluators, cfg, prompt)
        if not active:
            return await handler()

        # Combine config-level and run-level metadata; run metadata wins on key clashes.
        if cfg.metadata is None and ctx.metadata is None:
            merged_metadata: dict[str, Any] | None = None
        else:
            merged_metadata = {**(cfg.metadata or {}), **(ctx.metadata or {})}

        # Prefer the agent's declared name so evaluation events can be filtered
        # per-agent; an unset name falls back to the generic 'agent' label.
        agent = ctx.agent
        declared_name = None if agent is None else agent.name
        target = declared_name if declared_name else 'agent'

        traceparent = logfire_api.get_context().get('traceparent')
        span_reference = _parse_traceparent(traceparent)

        # The agent runs inside `run_task`, which captures the span tree and tracks
        # attributes/metrics. The getter is bound by the `with` once `run_task` has
        # been entered; it starts as `None` only to keep pyright's flow analysis
        # happy on the except path.
        context_kwargs_getter: Callable[[], dict[str, Any]] | None = None
        try:
            with _task_run.run_task() as context_kwargs_getter:
                result = await handler()
        except Exception as exc:
            on_error = [candidate for candidate in active if candidate.run_on_errors]
            if not on_error or context_kwargs_getter is None:
                raise

            # Evaluators that opted in receive the exception itself as the output.
            failure_context = EvaluatorContext(
                name=ctx.run_id,
                inputs=prompt,
                output=exc,
                expected_output=None,
                metadata=merged_metadata,
                **context_kwargs_getter(),
            )
            pending = _online_internal.dispatch_evaluators(on_error, failure_context, span_reference, target, cfg)
            _online_internal.dispatch_async(pending)
            raise
        else:
            success_context = EvaluatorContext(
                name=ctx.run_id,
                inputs=prompt,
                output=result.output,
                expected_output=None,
                metadata=merged_metadata,
                **context_kwargs_getter(),
            )
            pending = _online_internal.dispatch_evaluators(active, success_context, span_reference, target, cfg)
            _online_internal.dispatch_async(pending)
            return result

evaluators instance-attribute

Evaluators to run after each agent run.

config class-attribute instance-attribute

config: OnlineEvalConfig | None = None

Optional config override. Defaults to the global DEFAULT_CONFIG.