Configure generators

Goal: pick and configure a generator implementation for your run.

When to use this:

Use this guide when generation is the main variable you are changing and you already know the task you want to run.

Procedure

Use this chooser when generation is the variable you are changing and the rest of the runtime should remain stable.

flowchart TD
    A["What should produce the candidate?"] --> B{"Deterministic local example?"}
    B -->|Yes| C["builtin/demo_generator"]
    B -->|No| D{"External model or graph runtime?"}
    D -->|Yes| E["Provider adapter"]
    D -->|No| F["Custom Generator"]

The choice is mainly about where candidate production lives, not whether Themis still owns fan-out, storage, and inspection.

Use the builtin demo generator for deterministic tutorials, smoke tests, and local examples.

Use provider adapters when Themis should still own fan-out, reduction, storage, and inspection, but an external model or graph runtime should produce the candidate output.

Prompt-focused experiments:

- Set GenerationConfig.prompt_spec when you want prompt instructions, prefixes, suffixes, or generic prompt blocks to be part of the experiment identity.
- PromptSpec.blocks is intentionally generic prompt material, not an example-specific feature.
- Prompt specs flow into GenerateContext, so custom generators can consume them directly.
- Provider-backed adapters such as OpenAI also consume prompt specs, so a prompt change invalidates generation-stage cache reuse, as expected.

Review these example sources:

from __future__ import annotations

from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.models import Case, Dataset, GenerationResult


class CustomGenerator:
    """Minimal hand-written generator illustrating the Generator protocol.

    Every case receives the same hard-coded answer, so the produced output
    never depends on the case input or on the generation context.
    """

    # Identity attributes exposed at class level.
    component_id = "generator/custom_example"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the constant fingerprint string of this generator."""
        fixed_fingerprint = "custom-example-generator"
        return fixed_fingerprint

    async def generate(self, case: Case, ctx: object) -> GenerationResult:
        """Build one candidate for ``case``; ``ctx`` is accepted but unused.

        The candidate id is derived from ``case.case_id`` and the final
        output is always ``{"answer": "4"}``.
        """
        del ctx  # deliberately ignored by this example
        candidate_name = f"{case.case_id}-candidate"
        fixed_output = {"answer": "4"}
        return GenerationResult(
            candidate_id=candidate_name,
            final_output=fixed_output,
        )


def run_example() -> dict[str, object]:
    """Run a one-case experiment driven by a ``CustomGenerator`` instance.

    Returns:
        A dict holding the run id (``"run_id"``) and the run status value
        (``"status"``).
    """

    generation_cfg = GenerationConfig(generator=CustomGenerator())
    evaluation_cfg = EvaluationConfig(
        metrics=["builtin/exact_match"], parsers=["builtin/json_identity"]
    )
    storage_cfg = StorageConfig(store="memory")

    # Single arithmetic case whose expected output matches what the
    # custom generator always returns.
    sample_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])

    experiment = Experiment(
        generation=generation_cfg,
        evaluation=evaluation_cfg,
        storage=storage_cfg,
        datasets=[sample_dataset],
    )
    outcome = experiment.run()
    return {"run_id": outcome.run_id, "status": outcome.status.value}


if __name__ == "__main__":
    # Script entry point: run the custom-generator example and show its summary.
    summary = run_example()
    print(summary)

from __future__ import annotations

from types import SimpleNamespace

from themis import Experiment, InMemoryRunStore
from themis.adapters import openai
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.models import Case, Dataset


class _FakeResponses:
    """Fake object with an async ``create`` method returning a canned response."""

    async def create(self, **kwargs: object) -> object:
        """Return a fixed response namespace that echoes ``kwargs``.

        The namespace carries an id, the output text "4", token usage
        counts, a headers dict, and a ``model_dump`` callable that reports
        the keyword arguments this call received.
        """

        def _dump(mode="json"):
            # ``mode`` is accepted but does not influence the result.
            return {"request": kwargs, "output_text": "4"}

        token_usage = SimpleNamespace(input_tokens=3, output_tokens=1)
        response_headers = {"x-ratelimit-limit-requests": "60"}
        return SimpleNamespace(
            id="resp-1",
            output_text="4",
            usage=token_usage,
            headers=response_headers,
            model_dump=_dump,
        )


class _FakeClient:
    """Fake client exposing a ``responses`` attribute for injection."""

    @property
    def responses(self) -> _FakeResponses:
        """Create and return a new ``_FakeResponses`` on every access."""
        endpoint = _FakeResponses()
        return endpoint


def run_example() -> dict[str, object]:
    """Run the OpenAI adapter end to end using an injected fake client.

    Returns:
        A dict with the run id, the run status value, and the sorted
        artifact keys of the first generated candidate (an empty list when
        that candidate has no artifacts).

    Raises:
        RuntimeError: If the run yields no case results, or the first case
            has no generated candidates.
    """

    run_store = InMemoryRunStore()
    fake_client = _FakeClient()
    generator = openai(
        "gpt-fake",
        client=fake_client,
        instructions="Answer with only the final number.",
    )

    generation_cfg = GenerationConfig(generator=generator)
    evaluation_cfg = EvaluationConfig()
    storage_cfg = StorageConfig(store="memory")
    sample_case = Case(case_id="case-1", input={"question": "2+2"})
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])
    experiment = Experiment(
        generation=generation_cfg,
        evaluation=evaluation_cfg,
        storage=storage_cfg,
        datasets=[sample_dataset],
        seeds=[7],
    )

    outcome = experiment.run(store=run_store)

    # Fail loudly if the run did not produce the structure read below.
    if not outcome.cases:
        raise RuntimeError("OpenAI example expected at least one case result")
    first_case = outcome.cases[0]
    if not first_case.generated_candidates:
        raise RuntimeError("OpenAI example expected at least one generated candidate")

    first_candidate = first_case.generated_candidates[0]
    if first_candidate.artifacts is None:
        artifact_keys = []
    else:
        artifact_keys = sorted(first_candidate.artifacts)

    return {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
        "artifact_keys": artifact_keys,
    }


if __name__ == "__main__":
    # Script entry point: run the OpenAI adapter example and show its summary.
    summary = run_example()
    print(summary)

from __future__ import annotations

from types import SimpleNamespace

from themis import Experiment
from themis.adapters import vllm
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.models import Case, Dataset


class _FakeResponses:
    """Fake object with an async ``create`` method returning a canned response."""

    async def create(self, **kwargs: object) -> object:
        """Return a fixed response namespace that echoes ``kwargs``.

        The namespace carries an id, the output text "4", token usage
        counts, a headers dict, and a ``model_dump`` callable that reports
        the keyword arguments this call received.
        """

        def _dump(mode="json"):
            # ``mode`` is accepted but does not influence the result.
            return {"request": kwargs, "output_text": "4"}

        token_usage = SimpleNamespace(input_tokens=3, output_tokens=1)
        response_headers = {"x-ratelimit-limit-requests": "120"}
        return SimpleNamespace(
            id="vllm-resp-1",
            output_text="4",
            usage=token_usage,
            headers=response_headers,
            model_dump=_dump,
        )


class _FakeChatCompletions:
    """Fake object with an async ``create`` method returning a canned chat reply."""

    async def create(self, **kwargs: object) -> object:
        """Return a fixed chat-completion namespace that echoes ``kwargs``.

        The namespace holds an id, one choice whose message content is "4",
        prompt/completion token counts, a headers dict, and a ``model_dump``
        callable that reports the keyword arguments this call received.
        """

        def _dump(mode="json"):
            # ``mode`` is accepted but does not influence the result.
            return {
                "request": kwargs,
                "choices": [{"message": {"content": "4"}}],
            }

        reply = SimpleNamespace(content="4")
        only_choice = SimpleNamespace(message=reply)
        token_usage = SimpleNamespace(prompt_tokens=3, completion_tokens=1)
        return SimpleNamespace(
            id="chat-1",
            choices=[only_choice],
            usage=token_usage,
            headers={"x-ratelimit-limit-requests": "120"},
            model_dump=_dump,
        )


class _FakeChat:
    """Fake ``chat`` namespace exposing a ``completions`` attribute."""

    @property
    def completions(self) -> _FakeChatCompletions:
        """Create and return a new ``_FakeChatCompletions`` on every access."""
        endpoint = _FakeChatCompletions()
        return endpoint


class _FakeClient:
    """Fake client exposing both ``responses`` and ``chat`` attributes."""

    @property
    def responses(self) -> _FakeResponses:
        """Create and return a new ``_FakeResponses`` on every access."""
        responses_endpoint = _FakeResponses()
        return responses_endpoint

    @property
    def chat(self) -> _FakeChat:
        """Create and return a new ``_FakeChat`` on every access."""
        chat_namespace = _FakeChat()
        return chat_namespace


def run_example() -> dict[str, object]:
    """Run the vLLM adapter end to end using an injected fake client.

    Returns:
        A dict with the run id, the run status value, and the ``api_mode``
        attribute read back from the generator.
    """

    fake_client = _FakeClient()
    generator = vllm(
        "fake-vllm",
        base_url="http://localhost:8000/v1",
        client=fake_client,
        api_mode="chat_completions",
    )

    generation_cfg = GenerationConfig(generator=generator)
    evaluation_cfg = EvaluationConfig()
    storage_cfg = StorageConfig(store="memory")
    sample_case = Case(case_id="case-1", input={"question": "2+2"})
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])
    experiment = Experiment(
        generation=generation_cfg,
        evaluation=evaluation_cfg,
        storage=storage_cfg,
        datasets=[sample_dataset],
        seeds=[7],
    )

    outcome = experiment.run()
    summary = {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
        "api_mode": generator.api_mode,
    }
    return summary


if __name__ == "__main__":
    # Script entry point: run the vLLM adapter example and show its summary.
    summary = run_example()
    print(summary)

from __future__ import annotations

from themis import Experiment
from themis.adapters import langgraph
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.models import Case, Dataset


class _FakeGraph:
    """Fake graph exposing ``ainvoke`` and ``astream_events`` with canned data."""

    async def ainvoke(self, payload: object) -> object:
        """Return a dict with the fixed answer "4" and ``payload`` echoed back."""
        response = {"answer": "4"}
        response["input"] = payload
        return response

    async def astream_events(self, payload: object, *, version: str):
        """Yield a single "plan" step event; ``version`` is accepted but unused."""
        del version  # keyword is part of the signature only
        step_data = {"input": payload, "output": {"proposed_answer": "4"}}
        plan_event = dict(name="plan", event="step", data=step_data)
        yield plan_event


def run_example() -> dict[str, object]:
    """Run the LangGraph adapter end to end against a fake graph.

    Returns:
        A dict with the run id, the run status value, and the number of
        trace entries on the first generated candidate of the first case
        (0 when its trace is empty or ``None``).
    """

    fake_graph = _FakeGraph()
    generator = langgraph(fake_graph, graph_id="fake-graph", output_key="answer")

    generation_cfg = GenerationConfig(generator=generator)
    evaluation_cfg = EvaluationConfig()
    storage_cfg = StorageConfig(store="memory")
    sample_case = Case(case_id="case-1", input={"question": "2+2"})
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])
    experiment = Experiment(
        generation=generation_cfg,
        evaluation=evaluation_cfg,
        storage=storage_cfg,
        datasets=[sample_dataset],
        seeds=[7],
    )

    outcome = experiment.run()
    first_candidate = outcome.cases[0].generated_candidates[0]
    recorded_trace = first_candidate.trace or []  # a missing trace counts as empty
    trace_steps = len(recorded_trace)
    return {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
        "trace_steps": trace_steps,
    }


if __name__ == "__main__":
    # Script entry point: run the LangGraph adapter example and show its summary.
    summary = run_example()
    print(summary)

Provider-backed examples should be runnable with injected fake clients in docs/tests, even when real deployments need optional extras.

Variants

Variant	Best when	Tradeoff	Related APIs / commands
Builtin deterministic runs	You want tutorials, smoke tests, or fixture-backed examples without external providers	Not representative of production model behavior	`builtin/demo_generator`
Provider-backed generation	An external endpoint or graph runtime should generate outputs while Themis owns the rest of the run	Requires provider extras, clients, or service setup	`themis.adapters.openai(...)`, `themis.adapters.vllm(...)`, `themis.adapters.langgraph(...)`
Fully custom generation	Candidate production logic belongs entirely in your own code	Highest implementation effort	`Generator`
Prompt-only experiment change	The generator stays fixed and prompt material is the only experiment variable	Less useful when provider or generator behavior also needs to change	`GenerationConfig.prompt_spec`, `PromptSpec.blocks`

Expected result

You should know which generator style matches your run and what prerequisites or optional extras are required.

Configure generators

Procedure

Variants

Expected result

Troubleshooting