Skip to content

Use reduction strategies

Goal: choose a selector or reducer for multi-candidate generation.

When to use this:

Use this guide when num_samples is greater than one and you need candidate selection or reduction before parsing or scoring.

Procedure

Use builtin/majority_vote when multiple candidates can converge on the same output and a simple majority is sufficient.

Use builtin/best_of_n when a judge-backed comparison should select the best candidate before any optional reduction step.

Use a custom reducer when the selection rule is domain-specific, as in the example below.

from __future__ import annotations

from collections.abc import Mapping

from themis import Experiment
from themis.core.config import EvaluationConfig, GenerationConfig, StorageConfig
from themis.core.contexts import GenerateContext, ReduceContext
from themis.core.models import Case, Dataset, GenerationResult, ReducedCandidate


class SeededGenerator:
    """Example generator whose answer depends on the parity of the seed.

    Odd seeds yield the answer ``"4"``; even seeds, and a missing or zero
    seed, yield ``"5"``.
    """

    component_id = "generator/seeded_example"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the fixed fingerprint string for this generator."""
        return "generator-seeded-example"

    async def generate(self, case: Case, ctx: GenerateContext) -> GenerationResult:
        """Build one candidate for ``case`` from the seed carried by ``ctx``."""
        # A falsy seed (None or 0) is treated as seed 0.
        seed = ctx.seed or 0
        if seed % 2:
            answer = "4"
        else:
            answer = "5"
        return GenerationResult(
            candidate_id=f"{case.case_id}-candidate-{seed}",
            final_output={"answer": answer},
        )


class PreferCorrectReducer:
    """Example reducer that keeps the candidate with the smallest numeric answer."""

    component_id = "reducer/prefer_correct"
    version = "1.0"

    def fingerprint(self) -> str:
        """Return the fixed fingerprint string for this reducer."""
        return "reducer-prefer-correct"

    async def reduce(
        self, candidates: list[GenerationResult], ctx: ReduceContext
    ) -> ReducedCandidate:
        """Collapse ``candidates`` into a single reduced candidate.

        The output of the candidate ranked first by ``_answer_value`` is
        reused, and the ids of all input candidates are recorded as sources.
        """
        # Sort a copy so the caller's list is left untouched; the sort is
        # stable, so ties keep their input order.
        ranked = list(candidates)
        ranked.sort(key=_answer_value)
        winner = ranked[0]

        reduced_id = f"{ctx.case_id}-reduced"
        source_ids = [item.candidate_id for item in candidates]
        return ReducedCandidate(
            candidate_id=reduced_id,
            source_candidate_ids=source_ids,
            final_output=winner.final_output,
            metadata={"strategy": "prefer_numeric_minimum"},
        )


def _answer_value(candidate: GenerationResult) -> int:
    """Return the candidate's ``"answer"`` entry converted to an integer.

    Raises:
        TypeError: If ``final_output`` is not a mapping or has no
            ``"answer"`` key.
    """
    payload = candidate.final_output
    if not isinstance(payload, Mapping) or "answer" not in payload:
        raise TypeError("Expected mapping final_output with an 'answer' key.")
    # Go through str() so that "4" and 4 are parsed the same way.
    return int(str(payload["answer"]))


def run_example() -> dict[str, object]:
    """Run a one-case experiment that uses the custom reducer.

    The experiment requests two samples per case, reduces them with
    ``PreferCorrectReducer``, and scores the result with the exact-match
    metric.

    Returns:
        A summary with the run id, the run status value, and the metric ids
        of the scores recorded for the first case.
    """
    generation = GenerationConfig(
        generator=SeededGenerator(),
        candidate_policy={"num_samples": 2},
        reducer=PreferCorrectReducer(),
    )
    evaluation = EvaluationConfig(
        metrics=["builtin/exact_match"], parsers=["builtin/json_identity"]
    )
    storage = StorageConfig(store="memory")
    arithmetic_case = Case(
        case_id="case-1",
        input={"question": "2+2"},
        expected_output={"answer": "4"},
    )
    dataset = Dataset(dataset_id="sample", cases=[arithmetic_case])

    experiment = Experiment(
        generation=generation,
        evaluation=evaluation,
        storage=storage,
        datasets=[dataset],
        seeds=[7, 8],
    )
    outcome = experiment.run()

    summary: dict[str, object] = {
        "run_id": outcome.run_id,
        "status": outcome.status.value,
    }
    summary["score_ids"] = [entry.metric_id for entry in outcome.cases[0].scores]
    return summary


if __name__ == "__main__":
    # Script entry point: run the example and print its summary dict.
    summary = run_example()
    print(summary)

Variants

| Variant | Best when | Tradeoff | Related APIs / commands |
| --- | --- | --- | --- |
| Deterministic output voting | Multiple candidates often converge on the same normalized answer | Less useful when outputs vary semantically but not textually | builtin/majority_vote |
| Judged selection | A judge should pick the strongest candidate before reduction | Requires judge-backed selection logic | builtin/best_of_n |
| Domain-specific selection | Selection or synthesis logic belongs entirely to your own task | Requires custom reduction code | Custom reducer, CandidateReducer |

Choose the reducer based on the decision you want to encode: consensus, judged preference, or a domain-specific selection rule.

Expected result

The run should produce either a selected candidate set or a reduced candidate that downstream parsing and scoring can consume.

Troubleshooting