# Author custom components
Goal: plug your own runtime behavior into Themis without changing the orchestration core.
When to use this:
Use this guide when builtin components are close but not sufficient and the gap belongs to a runtime extension point.
## Procedure
Start with the smallest protocol that solves your need:
- `Generator` for candidate production
- `Parser` for reduced-output normalization
- `CandidateReducer` for selection or synthesis after fan-out
- `PureMetric`, `LLMMetric`, `SelectionMetric`, or `TraceMetric` for scoring
Review the runnable examples:
from __future__ import annotations
from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.models import Case, Dataset, GenerationResult
class CustomGenerator:
    """Minimal generator example that satisfies the Generator protocol.

    Every call produces the same fixed answer payload; only the candidate
    id varies with the case being processed.
    """

    component_id = "generator/custom_example"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the fixed fingerprint string for this generator."""
        return "custom-example-generator"

    async def generate(self, case: Case, ctx: object) -> GenerationResult:
        """Build a single candidate for ``case``.

        Args:
            case: The case being generated for; only ``case_id`` is read.
            ctx: Generation context. It is not used by this example.

        Returns:
            A ``GenerationResult`` whose id is derived from ``case.case_id``
            and whose ``final_output`` is always ``{"answer": "4"}``.
        """
        # The context is intentionally dropped: this example needs no runtime state.
        del ctx
        candidate_id = f"{case.case_id}-candidate"
        payload = {"answer": "4"}
        return GenerationResult(candidate_id=candidate_id, final_output=payload)
def run_example() -> dict[str, object]:
    """Run a one-case experiment driven by a ``CustomGenerator`` instance.

    Returns:
        A dict with the run's ``run_id`` and the value of its status.
    """
    generation = GenerationConfig(generator=CustomGenerator())
    evaluation = EvaluationConfig(
        metrics=["builtin/exact_match"],
        parsers=["builtin/json_identity"],
    )
    storage = StorageConfig(store="memory")

    # A single arithmetic case whose expected output matches the generator's payload.
    sample_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample_dataset],
    )
    outcome = experiment.run()
    return {"run_id": outcome.run_id, "status": outcome.status.value}


if __name__ == "__main__":
    print(run_example())
from __future__ import annotations
from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.contexts import ParseContext
from themis.core.models import Case, Dataset, ParsedOutput, ReducedCandidate
class AnswerStringParser:
    """Parser example that turns a JSON-like answer payload into a string."""

    component_id = "parser/answer_string"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the fixed fingerprint string for this parser."""
        return "parser-answer-string"

    def parse(self, candidate: ReducedCandidate, ctx: ParseContext) -> ParsedOutput:
        """Normalize ``candidate.final_output`` to a string value.

        Args:
            candidate: The reduced candidate whose ``final_output`` is parsed.
            ctx: Parse context. It is not used by this example.

        Returns:
            A ``ParsedOutput`` with ``format="string"``. When the output is a
            dict containing an ``"answer"`` key, the value is that entry
            stringified; otherwise it is the whole output stringified.
        """
        del ctx
        raw = candidate.final_output
        has_answer = isinstance(raw, dict) and "answer" in raw
        # Unwrap the answer entry when present; fall back to the raw payload.
        text = str(raw["answer"]) if has_answer else str(raw)
        return ParsedOutput(value=text, format="string")
def run_example() -> dict[str, object]:
    """Run a one-case experiment that parses with ``AnswerStringParser``.

    Returns:
        A dict with the run's ``run_id``, the value of its status, and the
        metric ids of the scores recorded for the first case.
    """
    generation = GenerationConfig(
        generator="builtin/demo_generator",
        reducer="builtin/majority_vote",
    )
    evaluation = EvaluationConfig(
        metrics=["builtin/exact_match"],
        parsers=[AnswerStringParser()],
    )
    storage = StorageConfig(store="memory")

    # The expected output is a plain string because the parser emits strings.
    sample_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output="4",
    )
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample_dataset],
    )
    outcome = experiment.run()

    first_case_scores = outcome.cases[0].scores
    return {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
        "score_ids": [entry.metric_id for entry in first_case_scores],
    }


if __name__ == "__main__":
    print(run_example())
from __future__ import annotations
from collections.abc import Mapping
from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.contexts import GenerateContext, ReduceContext
from themis.core.models import Case, Dataset, GenerationResult, ReducedCandidate
class SeededGenerator:
    """Generator example whose answer depends on the seed's parity."""

    component_id = "generator/seeded_example"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the fixed fingerprint string for this generator."""
        return "generator-seeded-example"

    async def generate(self, case: Case, ctx: GenerateContext) -> GenerationResult:
        """Build one candidate whose answer is chosen from ``ctx.seed``.

        Args:
            case: The case being generated for; only ``case_id`` is read.
            ctx: Generation context; only ``seed`` is read. A falsy seed
                (such as ``None``) is treated as ``0``.

        Returns:
            A ``GenerationResult`` answering ``"4"`` for odd seeds and
            ``"5"`` for even seeds, with the seed embedded in its id.
        """
        seed = ctx.seed or 0
        if seed % 2:
            answer = "4"
        else:
            answer = "5"
        candidate_id = f"{case.case_id}-candidate-{seed}"
        return GenerationResult(
            candidate_id=candidate_id,
            final_output={"answer": answer},
        )
class PreferCorrectReducer:
    """Reducer example that picks the numerically smaller answer."""

    component_id = "reducer/prefer_correct"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the fixed fingerprint string for this reducer."""
        return "reducer-prefer-correct"

    async def reduce(
        self, candidates: list[GenerationResult], ctx: ReduceContext
    ) -> ReducedCandidate:
        """Collapse ``candidates`` into the one with the smallest answer.

        Args:
            candidates: Generation results to choose from. Each is ranked by
                ``_answer_value``; ties keep the earliest candidate.
            ctx: Reduce context; only ``case_id`` is read, to name the result.

        Returns:
            A ``ReducedCandidate`` carrying the winner's ``final_output`` and
            the ids of every candidate that was considered.

        Raises:
            ValueError: If ``candidates`` is empty.
            TypeError: Propagated from ``_answer_value`` when a candidate's
                ``final_output`` is not a mapping with an ``"answer"`` key.
        """
        if not candidates:
            # Fail with a clear message instead of an opaque indexing error.
            raise ValueError("PreferCorrectReducer requires at least one candidate.")

        # min() is a single pass and, like a stable sort followed by [0],
        # returns the first candidate among equal minima.
        winner = min(candidates, key=_answer_value)
        return ReducedCandidate(
            candidate_id=f"{ctx.case_id}-reduced",
            source_candidate_ids=[candidate.candidate_id for candidate in candidates],
            final_output=winner.final_output,
            metadata={"strategy": "prefer_numeric_minimum"},
        )
def _answer_value(candidate: GenerationResult) -> int:
    """Return the integer value of a candidate's ``"answer"`` entry.

    Args:
        candidate: Object whose ``final_output`` should be a mapping that
            contains an ``"answer"`` key.

    Returns:
        The answer converted to ``int`` via its string form.

    Raises:
        TypeError: If ``final_output`` is not a mapping or lacks ``"answer"``.
    """
    payload = candidate.final_output
    if not isinstance(payload, Mapping) or "answer" not in payload:
        raise TypeError("Expected mapping final_output with an 'answer' key.")
    answer_text = str(payload["answer"])
    return int(answer_text)
def run_example() -> dict[str, object]:
    """Run a one-case experiment that reduces with ``PreferCorrectReducer``.

    Returns:
        A dict with the run's ``run_id``, the value of its status, and the
        metric ids of the scores recorded for the first case.
    """
    generation = GenerationConfig(
        generator=SeededGenerator(),
        candidate_policy={"num_samples": 2},
        reducer=PreferCorrectReducer(),
    )
    evaluation = EvaluationConfig(
        metrics=["builtin/exact_match"],
        parsers=["builtin/json_identity"],
    )
    storage = StorageConfig(store="memory")

    sample_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])

    # One odd and one even seed, so SeededGenerator emits both of its answers.
    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample_dataset],
        seeds=[7, 8],
    )
    outcome = experiment.run()

    first_case_scores = outcome.cases[0].scores
    return {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
        "score_ids": [entry.metric_id for entry in first_case_scores],
    }


if __name__ == "__main__":
    print(run_example())
from __future__ import annotations
from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.contexts import ScoreContext
from themis.core.models import Case, Dataset, ParsedOutput, Score
class ExactAnswerMetric:
    """Example pure metric implementing only the minimum required contract."""

    component_id = "metric/exact_answer"
    version = "1.0"
    metric_family = "pure"

    def fingerprint(self) -> str:
        """Return the fixed fingerprint string for this metric."""
        return "metric-exact-answer"

    def score(self, parsed: ParsedOutput, case: Case, ctx: ScoreContext) -> Score:
        """Score a parsed output by equality with the case's expected output.

        Args:
            parsed: Parsed output; only ``value`` is read.
            case: The case being scored; only ``expected_output`` is read.
            ctx: Score context. It is not used by this example.

        Returns:
            A ``Score`` tagged with this metric's ``component_id``, valued
            ``1.0`` on a match and ``0.0`` otherwise, with the comparison
            result stored under ``details["matched"]``.
        """
        del ctx
        matched = parsed.value == case.expected_output
        if matched:
            points = 1.0
        else:
            points = 0.0
        return Score(
            metric_id=self.component_id,
            value=points,
            details={"matched": matched},
        )
def run_example() -> dict[str, object]:
    """Run a one-case experiment scored by an ``ExactAnswerMetric`` instance.

    Returns:
        A dict with the run's ``run_id``, the value of its status, and the
        metric ids of the scores recorded for the first case.
    """
    generation = GenerationConfig(
        generator="builtin/demo_generator",
        reducer="builtin/majority_vote",
    )
    evaluation = EvaluationConfig(
        metrics=[ExactAnswerMetric()],
        parsers=["builtin/json_identity"],
    )
    storage = StorageConfig(store="memory")

    sample_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    sample_dataset = Dataset(dataset_id="sample", cases=[sample_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[sample_dataset],
    )
    outcome = experiment.run()

    first_case_scores = outcome.cases[0].scores
    return {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
        "score_ids": [entry.metric_id for entry in first_case_scores],
    }


if __name__ == "__main__":
    print(run_example())
Choose the smallest protocol that fits the job. If a component starts owning orchestration, it is probably solving the wrong problem.
## Variants
| Variant | Best when | Tradeoff | Related APIs / commands |
|---|---|---|---|
| Simple deterministic scoring | Parsed output alone is enough to decide correctness | Less flexible than workflow-backed judging for subjective tasks | PureMetric |
| Workflow-backed evaluation | A judge model or richer workflow should score the output | Higher latency and judge-model dependencies | LLMMetric, SelectionMetric |
| Artifact-aware generation | Generation should emit trace or conversation artifacts for later inspection | More generator responsibility and more stored artifacts | Generator, GenerationResult.trace, GenerationResult.conversation |
## Expected result
Your component should compile and run as a first-class Themis component with a stable identity.