Skip to content

Author custom components

Goal: plug your own runtime behavior into Themis without changing the orchestration core.

When to use this:

Use this guide when the built-in components come close to what you need but are not sufficient, and the gap belongs at a runtime extension point.

Procedure

Start with the smallest protocol that solves your need:

  • Generator for candidate production
  • Parser for reduced-output normalization
  • CandidateReducer for selection or synthesis after fan-out
  • PureMetric, LLMMetric, SelectionMetric, or TraceMetric for scoring

Review the runnable examples:

from __future__ import annotations

from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.models import Case, Dataset, GenerationResult


class CustomGenerator:
    """Small example generator that satisfies the Generator protocol.

    Every case receives the same fixed answer payload, ``{"answer": "4"}``.
    """

    # Identity attributes exposed alongside ``fingerprint()``.
    component_id = "generator/custom_example"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the constant fingerprint string of this generator."""
        return "custom-example-generator"

    async def generate(self, case: Case, ctx: object) -> GenerationResult:
        """Build one candidate for ``case``.

        Args:
            case: The case being answered; only its ``case_id`` is read.
            ctx: Generation context; deliberately discarded.

        Returns:
            A ``GenerationResult`` whose id derives from the case id and
            whose ``final_output`` is the fixed answer payload.
        """
        del ctx
        candidate_id = f"{case.case_id}-candidate"
        fixed_output = {"answer": "4"}
        return GenerationResult(candidate_id=candidate_id, final_output=fixed_output)


def run_example() -> dict[str, object]:
    """Run a single-case experiment that uses a ``CustomGenerator`` instance.

    Returns:
        A dict holding the run's ``run_id`` and the ``value`` of its status.
    """

    generation = GenerationConfig(generator=CustomGenerator())
    evaluation = EvaluationConfig(
        metrics=["builtin/exact_match"], parsers=["builtin/json_identity"]
    )
    storage = StorageConfig(store="memory")

    # One arithmetic question whose expected payload matches the generator's
    # fixed output.
    only_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    sample = Dataset(dataset_id="sample", cases=[only_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample],
    )
    outcome = experiment.run()
    return {"run_id": outcome.run_id, "status": outcome.status.value}


if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    summary = run_example()
    print(summary)
from __future__ import annotations

from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.contexts import ParseContext
from themis.core.models import Case, Dataset, ParsedOutput, ReducedCandidate


class AnswerStringParser:
    """Parser example that reduces a candidate's final output to a string.

    When the output is a ``dict`` with an ``"answer"`` key, that entry is
    stringified; any other output is stringified as a whole.
    """

    # Identity attributes exposed alongside ``fingerprint()``.
    component_id = "parser/answer_string"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the constant fingerprint string of this parser."""
        return "parser-answer-string"

    def parse(self, candidate: ReducedCandidate, ctx: ParseContext) -> ParsedOutput:
        """Normalize ``candidate.final_output`` into a string ``ParsedOutput``.

        Args:
            candidate: The reduced candidate whose ``final_output`` is read.
            ctx: Parse context; deliberately discarded.

        Returns:
            A ``ParsedOutput`` with ``format="string"``.
        """
        del ctx
        payload = candidate.final_output
        has_answer = isinstance(payload, dict) and "answer" in payload
        # Unwrap the answer field when present; otherwise use the raw payload.
        text = str(payload["answer"] if has_answer else payload)
        return ParsedOutput(value=text, format="string")


def run_example() -> dict[str, object]:
    """Run a single-case experiment that parses with ``AnswerStringParser``.

    Returns:
        A dict holding the run's ``run_id``, the ``value`` of its status, and
        the metric ids of the scores recorded for the first case.
    """

    generation = GenerationConfig(
        generator="builtin/demo_generator", reducer="builtin/majority_vote"
    )
    evaluation = EvaluationConfig(
        metrics=["builtin/exact_match"], parsers=[AnswerStringParser()]
    )
    storage = StorageConfig(store="memory")

    # The expected output is a plain string because the custom parser emits
    # string values.
    only_case = Case(case_id="case-1", input={"question": "2+2"}, expected_output="4")
    sample = Dataset(dataset_id="sample", cases=[only_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample],
    )
    outcome = experiment.run()

    summary: dict[str, object] = {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
    }
    summary["score_ids"] = [entry.metric_id for entry in outcome.cases[0].scores]
    return summary


if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    summary = run_example()
    print(summary)
from __future__ import annotations

from collections.abc import Mapping

from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.contexts import GenerateContext, ReduceContext
from themis.core.models import Case, Dataset, GenerationResult, ReducedCandidate


class SeededGenerator:
    """Generator example whose answer depends on the seed's parity.

    Odd seeds yield ``"4"``; even seeds (and a missing seed, treated as 0)
    yield ``"5"``.
    """

    # Identity attributes exposed alongside ``fingerprint()``.
    component_id = "generator/seeded_example"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the constant fingerprint string of this generator."""
        return "generator-seeded-example"

    async def generate(self, case: Case, ctx: GenerateContext) -> GenerationResult:
        """Build one candidate for ``case`` using ``ctx.seed``.

        Args:
            case: The case being answered; only its ``case_id`` is read.
            ctx: Generation context; only its ``seed`` is read.

        Returns:
            A ``GenerationResult`` whose id embeds the effective seed and
            whose ``final_output`` carries the parity-selected answer.
        """
        # A falsy seed (None or 0) is treated as 0.
        seed = ctx.seed or 0
        if seed % 2:
            answer = "4"
        else:
            answer = "5"
        candidate_id = f"{case.case_id}-candidate-{seed}"
        return GenerationResult(
            candidate_id=candidate_id,
            final_output={"answer": answer},
        )


class PreferCorrectReducer:
    """Reducer example that keeps the candidate with the smallest numeric answer.

    Despite the name, no correctness check is performed: candidates are ranked
    purely by the integer value of their ``answer`` field (via
    ``_answer_value``) and the minimum wins.
    """

    # Identity attributes exposed alongside ``fingerprint()``.
    component_id = "reducer/prefer_correct"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the constant fingerprint string of this reducer."""
        return "reducer-prefer-correct"

    async def reduce(
        self, candidates: list[GenerationResult], ctx: ReduceContext
    ) -> ReducedCandidate:
        """Collapse ``candidates`` into one reduced candidate.

        Args:
            candidates: Generation results to choose from; must not be empty.
            ctx: Reduce context; only its ``case_id`` is read.

        Returns:
            A ``ReducedCandidate`` carrying the winner's ``final_output`` and
            the ids of every input candidate.

        Raises:
            ValueError: If ``candidates`` is empty, or if an answer cannot be
                converted to ``int``.
            TypeError: If a candidate's ``final_output`` is not a mapping with
                an ``"answer"`` key (raised by ``_answer_value``).
        """
        if not candidates:
            # Fail with an explicit message instead of an opaque index error.
            raise ValueError("PreferCorrectReducer requires at least one candidate.")

        # min() returns the first minimal item, matching a stable sort's head,
        # without sorting the whole list.
        winner = min(candidates, key=_answer_value)
        return ReducedCandidate(
            candidate_id=f"{ctx.case_id}-reduced",
            source_candidate_ids=[candidate.candidate_id for candidate in candidates],
            final_output=winner.final_output,
            metadata={"strategy": "prefer_numeric_minimum"},
        )


def _answer_value(candidate: GenerationResult) -> int:
    """Return the integer value of a candidate's ``answer`` field.

    Args:
        candidate: Object whose ``final_output`` is expected to be a mapping
            containing an ``"answer"`` key.

    Returns:
        The answer converted through ``str`` and then ``int``.

    Raises:
        TypeError: If ``final_output`` is not a mapping or lacks ``"answer"``.
        ValueError: If the stringified answer is not a valid integer literal.
    """
    payload = candidate.final_output
    if not isinstance(payload, Mapping) or "answer" not in payload:
        raise TypeError("Expected mapping final_output with an 'answer' key.")
    raw_answer = payload["answer"]
    return int(str(raw_answer))


def run_example() -> dict[str, object]:
    """Run a single-case experiment that reduces with ``PreferCorrectReducer``.

    Two samples are requested per case and the seeds 7 and 8 are supplied, so
    ``SeededGenerator`` can emit both of its answers.

    Returns:
        A dict holding the run's ``run_id``, the ``value`` of its status, and
        the metric ids of the scores recorded for the first case.
    """

    generation = GenerationConfig(
        generator=SeededGenerator(),
        candidate_policy={"num_samples": 2},
        reducer=PreferCorrectReducer(),
    )
    evaluation = EvaluationConfig(
        metrics=["builtin/exact_match"], parsers=["builtin/json_identity"]
    )
    storage = StorageConfig(store="memory")

    only_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    sample = Dataset(dataset_id="sample", cases=[only_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample],
        seeds=[7, 8],
    )
    outcome = experiment.run()

    summary: dict[str, object] = {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
    }
    summary["score_ids"] = [entry.metric_id for entry in outcome.cases[0].scores]
    return summary


if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    summary = run_example()
    print(summary)
from __future__ import annotations

from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.contexts import ScoreContext
from themis.core.models import Case, Dataset, ParsedOutput, Score


class ExactAnswerMetric:
    """Example pure metric with the minimum required contract.

    Scores 1.0 when the parsed value equals the case's expected output and
    0.0 otherwise.
    """

    # Identity attributes, plus the family this metric declares itself under.
    component_id = "metric/exact_answer"
    version = "1.0"
    metric_family = "pure"

    def fingerprint(self) -> str:
        """Return the constant fingerprint string of this metric."""
        return "metric-exact-answer"

    def score(self, parsed: ParsedOutput, case: Case, ctx: ScoreContext) -> Score:
        """Compare ``parsed.value`` with ``case.expected_output``.

        Args:
            parsed: Parsed output; only its ``value`` is read.
            case: The case being scored; only ``expected_output`` is read.
            ctx: Score context; deliberately discarded.

        Returns:
            A ``Score`` tagged with this metric's ``component_id`` whose
            details record the comparison result.
        """
        del ctx
        is_match = parsed.value == case.expected_output
        if is_match:
            points = 1.0
        else:
            points = 0.0
        return Score(
            metric_id=self.component_id,
            value=points,
            details={"matched": is_match},
        )


def run_example() -> dict[str, object]:
    """Run a single-case experiment that scores with ``ExactAnswerMetric``.

    Returns:
        A dict holding the run's ``run_id``, the ``value`` of its status, and
        the metric ids of the scores recorded for the first case.
    """

    generation = GenerationConfig(
        generator="builtin/demo_generator", reducer="builtin/majority_vote"
    )
    evaluation = EvaluationConfig(
        metrics=[ExactAnswerMetric()], parsers=["builtin/json_identity"]
    )
    storage = StorageConfig(store="memory")

    only_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    sample = Dataset(dataset_id="sample", cases=[only_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample],
    )
    outcome = experiment.run()

    summary: dict[str, object] = {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
    }
    summary["score_ids"] = [entry.metric_id for entry in outcome.cases[0].scores]
    return summary


if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    summary = run_example()
    print(summary)

Choose the smallest protocol that fits the job. If a component starts owning orchestration, it is probably solving the wrong problem.

Variants

| Variant | Best when | Tradeoff | Related APIs / commands |
| --- | --- | --- | --- |
| Simple deterministic scoring | Parsed output alone is enough to decide correctness | Less flexible than workflow-backed judging for subjective tasks | `PureMetric` |
| Workflow-backed evaluation | A judge model or richer workflow should score the output | Higher latency and judge-model dependencies | `LLMMetric`, `SelectionMetric` |
| Artifact-aware generation | Generation should emit trace or conversation artifacts for later inspection | More generator responsibility and more stored artifacts | `Generator`, `GenerationResult.trace`, `GenerationResult.conversation` |

Expected result

Your component should compile and run as a first-class Themis component with a stable identity.

Troubleshooting