Skip to content

Specs

Benchmark Authoring

DatasetQuerySpec

Bases: SpecBase

Declarative slice query and sampling controls for dataset providers.

Source code in themis/benchmark/query.py
class DatasetQuerySpec(SpecBase):
    """Declarative slice query and sampling controls for dataset providers."""

    # Sampling strategy; defaults to taking every item.
    kind: SamplingKind = Field(default=SamplingKind.ALL)
    count: int | None = Field(default=None, gt=0)
    seed: int | None = Field(default=None)
    strata_field: str | None = Field(default=None)
    item_ids: list[str] = Field(default_factory=list)
    metadata_filters: dict[str, str] = Field(default_factory=dict)
    projected_fields: list[str] = Field(default_factory=list)

    @field_validator("kind", mode="before")
    @classmethod
    def _coerce_kind(cls, value: SamplingKind | str) -> SamplingKind:
        # Accept raw strings (e.g. from serialized configs) and map them
        # onto the SamplingKind enum before field validation runs.
        return SamplingKind(value) if isinstance(value, str) else value

    @classmethod
    def all(cls) -> "DatasetQuerySpec":
        """Build a query that selects every dataset item."""
        return cls(kind=SamplingKind.ALL)

    @classmethod
    def subset(
        cls,
        count: int,
        *,
        seed: int | None = None,
    ) -> "DatasetQuerySpec":
        """Build a query sampling `count` items, optionally seeded."""
        return cls(kind=SamplingKind.SUBSET, count=count, seed=seed)

    @classmethod
    def stratified(
        cls,
        count: int,
        *,
        strata_field: str,
        seed: int | None = None,
    ) -> "DatasetQuerySpec":
        """Build a stratified query of `count` items grouped by `strata_field`."""
        return cls(
            kind=SamplingKind.STRATIFIED,
            count=count,
            seed=seed,
            strata_field=strata_field,
        )

    @model_validator(mode="after")
    def _validate_semantic(self) -> "DatasetQuerySpec":
        # Cross-field rules that cannot be expressed per-field.
        count_based = self.kind in {SamplingKind.SUBSET, SamplingKind.STRATIFIED}
        if count_based and self.count is None:
            raise ValueError(
                f"DatasetQuerySpec kind='{self.kind.value}' requires a positive count."
            )
        if self.kind == SamplingKind.STRATIFIED and not self.strata_field:
            raise ValueError(
                "DatasetQuerySpec kind='stratified' requires strata_field."
            )
        if count_based and self.item_ids:
            raise ValueError(
                "DatasetQuerySpec item_ids is mutually exclusive with count-based "
                "sampling kinds."
            )
        return self

BenchmarkSpec

Bases: SpecBase

Top-level benchmark configuration compiled into an execution plan.

Source code in themis/benchmark/specs.py
class BenchmarkSpec(SpecBase):
    """Top-level benchmark configuration compiled into an execution plan."""

    benchmark_id: str = Field(..., min_length=1)
    models: list[ModelSpec] = Field(..., min_length=1)
    slices: list[SliceSpec] = Field(..., min_length=1)
    prompt_variants: list[PromptVariantSpec] = Field(..., min_length=1)
    inference_grid: InferenceGridSpec = Field(...)
    num_samples: int = Field(default=1, ge=1)

    @model_validator(mode="after")
    def _validate_semantic(self) -> "BenchmarkSpec":
        # Slice ids must be unique within one benchmark.
        if len({s.slice_id for s in self.slices}) != len(self.slices):
            raise ValueError(
                f"BenchmarkSpec '{self.benchmark_id}' has duplicate slice_id."
            )
        # Prompt variant ids must be unique as well.
        variant_ids = [variant.id for variant in self.prompt_variants]
        known_variants = set(variant_ids)
        if len(known_variants) != len(variant_ids):
            raise ValueError(
                f"BenchmarkSpec '{self.benchmark_id}' has duplicate prompt variant id."
            )
        # Slices may only reference prompt variants declared on this benchmark.
        for candidate in self.slices:
            unknown = sorted(set(candidate.prompt_variant_ids) - known_variants)
            if unknown:
                joined = ", ".join(unknown)
                raise ValueError(
                    f"SliceSpec '{candidate.slice_id}' references unknown prompt "
                    f"variant id(s): {joined}."
                )
        return self

SliceSpec

Bases: SpecBase

One benchmark slice with dataset identity, queries, prompts, and scoring.

Source code in themis/benchmark/specs.py
class SliceSpec(SpecBase):
    """One benchmark slice with dataset identity, queries, prompts, and scoring."""

    slice_id: str = Field(..., min_length=1)
    dataset: DatasetSpec = Field(...)
    dataset_query: DatasetQuerySpec = Field(default_factory=DatasetQuerySpec)
    dimensions: dict[str, str] = Field(default_factory=dict)
    prompt_variant_ids: list[str] = Field(default_factory=list)
    prompt_families: list[str] = Field(default_factory=list)
    generation: GenerationSpec | None = Field(default=None)
    parses: list[ParseSpec] = Field(default_factory=list)
    scores: list[ScoreSpec] = Field(default_factory=list)

    @model_validator(mode="after")
    def _validate_semantic(self) -> "SliceSpec":
        # A slice without a generation, parse, or score stage does nothing.
        has_stage = self.generation is not None or self.parses or self.scores
        if not has_stage:
            raise ValueError(
                f"SliceSpec '{self.slice_id}' must define at least one stage."
            )
        # Parse and score names must each be unique within the slice.
        known_parses = {parse.name for parse in self.parses}
        if len(known_parses) != len(self.parses):
            raise ValueError(f"SliceSpec '{self.slice_id}' has duplicate parse name.")
        if len({score.name for score in self.scores}) != len(self.scores):
            raise ValueError(f"SliceSpec '{self.slice_id}' has duplicate score name.")
        # Scores that consume parsed output must name a declared parse.
        for score in self.scores:
            if score.parse is not None and score.parse not in known_parses:
                raise ValueError(
                    f"SliceSpec '{self.slice_id}' references unknown parse "
                    f"'{score.parse}' in score '{score.name}'."
                )
        return self

PromptVariantSpec

Bases: SpecBase

Structured prompt variant scoped to one family or benchmark workflow.

Source code in themis/benchmark/specs.py
class PromptVariantSpec(SpecBase):
    """Structured prompt variant scoped to one family or benchmark workflow."""

    # Unique identifier; referenced by SliceSpec.prompt_variant_ids and
    # checked for uniqueness in BenchmarkSpec validation.
    id: str = Field(..., min_length=1)
    # Optional prompt family grouping for this variant.
    family: str | None = Field(default=None)
    # Ordered chat messages forming the prompt template; at least one required.
    messages: list[PromptMessage] = Field(..., min_length=1)
    variables: JSONDict = Field(
        default_factory=dict,
        description="Static prompt-scoped variables exposed to prompt rendering.",
    )

ParseSpec

Bases: SpecBase

Named parse pipeline backed by one extractor chain.

Source code in themis/benchmark/specs.py
class ParseSpec(SpecBase):
    """Named parse pipeline backed by one extractor chain."""

    name: str = Field(..., min_length=1)
    extractors: list[ExtractorRefSpec] = Field(default_factory=list)

    @field_validator("extractors", mode="before")
    @classmethod
    def _coerce_extractors(cls, value: object, info: ValidationInfo) -> object:
        # Shorthand support: a bare extractor id string is promoted to a
        # full ExtractorRefSpec. Non-list payloads pass through untouched so
        # regular field validation can reject them.
        del info
        if not isinstance(value, list):
            return value
        return [
            ExtractorRefSpec(id=entry) if isinstance(entry, str) else entry
            for entry in value
        ]

ScoreSpec

Bases: SpecBase

Named scoring pass over raw or parsed candidate outputs.

Source code in themis/benchmark/specs.py
class ScoreSpec(SpecBase):
    """Named scoring pass over raw or parsed candidate outputs."""

    name: str = Field(..., min_length=1)
    # Name of the parse stage whose output this score consumes; None means
    # the score operates on raw candidate output.
    parse: str | None = Field(default=None)
    metrics: list[str] = Field(default_factory=list)

    @model_validator(mode="after")
    def _validate_semantic(self) -> "ScoreSpec":
        # A scoring pass without metrics would be a no-op.
        if self.metrics:
            return self
        raise ValueError("ScoreSpec must define at least one metric.")

Project and Runtime Support

ProjectSpec

Bases: SpecBase

Shared project-level identity, storage defaults, and execution policy.

Keep this stable across related experiment runs so resume behavior and run manifests refer to the same storage and backend context.

Source code in themis/specs/experiment.py
class ProjectSpec(SpecBase):
    """Shared project-level identity, storage defaults, and execution policy.

    Keep this stable across related experiment runs so resume behavior and run
    manifests refer to the same storage and backend context.
    """

    project_name: str = Field(..., description="Human readable project name.")
    researcher_id: str = Field(
        ..., description="Stable owner identifier for experiment lineage."
    )
    global_seed: int = Field(
        ..., description="Default deterministic seed shared across experiments."
    )
    # Discriminated union over SQLite/Postgres backends (see StorageConfig).
    storage: StorageConfig = Field(..., description="Shared storage defaults.")
    execution_policy: ExecutionPolicySpec = Field(
        ..., description="Shared retry and circuit-breaker policy."
    )
    # Defaults to a local backend when not configured explicitly.
    execution_backend: ExecutionBackendConfig = Field(
        default_factory=LocalExecutionBackendSpec,
        description="Execution backend used for local, worker-pool, or batch orchestration.",
    )
    metadata: dict[str, str] = Field(
        default_factory=dict, description="User-defined project metadata."
    )

StorageConfig module-attribute

# Discriminated union of supported storage specs; the `backend` field value
# selects the concrete spec class during validation.
StorageConfig = Annotated[
    SqliteBlobStorageSpec | PostgresBlobStorageSpec,
    Field(discriminator="backend"),
]

StorageSpec module-attribute

# NOTE(review): presumably a backward-compatibility alias defaulting to the
# SQLite storage spec — confirm against callers before removing.
StorageSpec = SqliteBlobStorageSpec

SqliteBlobStorageSpec

Bases: _StorageSpecBase

SQLite event/projection store plus local filesystem blob persistence.

Source code in themis/specs/experiment.py
class SqliteBlobStorageSpec(_StorageSpecBase):
    """SQLite event/projection store plus local filesystem blob persistence."""

    # Literal discriminator used by the StorageConfig tagged union.
    backend: Literal[StorageBackend.SQLITE_BLOB] = Field(
        default=StorageBackend.SQLITE_BLOB
    )
    root_dir: str = Field(
        ..., description="Storage root for event, projection, and blob data."
    )

PostgresBlobStorageSpec

Bases: _StorageSpecBase

Postgres event/projection store plus local filesystem blob persistence.

Source code in themis/specs/experiment.py
class PostgresBlobStorageSpec(_StorageSpecBase):
    """Postgres event/projection store plus local filesystem blob persistence."""

    # Literal discriminator used by the StorageConfig tagged union.
    backend: Literal[StorageBackend.POSTGRES_BLOB] = Field(
        default=StorageBackend.POSTGRES_BLOB
    )
    database_url: str = Field(
        ..., description="Postgres connection URL for events and projections."
    )
    # Blobs stay on the local filesystem even with a Postgres event store.
    blob_root_dir: str = Field(
        ..., description="Local blob root for content-addressed artifact storage."
    )

ExecutionPolicySpec

Bases: SpecBase

Retry, backoff, circuit-breaker, and concurrency controls for orchestration.

These settings live above provider SDK behavior. Engines are still responsible for classifying provider failures into stable retryable codes.

Source code in themis/specs/experiment.py
class ExecutionPolicySpec(SpecBase):
    """Retry, backoff, circuit-breaker, and concurrency controls for orchestration.

    These settings live above provider SDK behavior. Engines are still
    responsible for classifying provider failures into stable retryable codes.
    """

    # Maximum retry attempts per work item; 0 disables retries.
    max_retries: int = Field(default=3, ge=0)
    # Multiplier applied between successive retry delays.
    retry_backoff_factor: float = Field(default=1.5, gt=0.0)
    # Consecutive-failure count that trips the circuit breaker.
    circuit_breaker_threshold: int = Field(default=5, ge=1)
    # Concurrency cap for in-flight work items.
    max_in_flight_work_items: int = Field(default=32, ge=1)
    retryable_error_codes: list[str] = Field(
        default_factory=list,
        description="Stable error-code values treated as retryable for persisted work items.",
    )

InferenceGridSpec

Bases: SpecBase

Typed inference sweep over base params and scalar override grids.

Use this for temperature, top-p, or provider-extra sweeps while keeping unchanged parameter combinations resumable across runs.

Source code in themis/specs/experiment.py
class InferenceGridSpec(SpecBase):
    """Typed inference sweep over base params and scalar override grids.

    Use this for temperature, top-p, or provider-extra sweeps while keeping
    unchanged parameter combinations resumable across runs.
    """

    params: list[InferenceParamsSpec] = Field(..., min_length=1)
    overrides: dict[str, list[str | int | float | bool]] = Field(default_factory=dict)

    def expand(self) -> list[InferenceParamsSpec]:
        """Expand the base inference params over all configured overrides."""
        # Fast path: with no overrides the grid is just the base params.
        if not self.overrides:
            return list(self.params)

        # Sorted keys keep the expansion order deterministic across runs.
        keys = sorted(self.overrides)
        value_lists = [self.overrides[key] for key in keys]
        known_fields = InferenceParamsSpec.model_fields

        result: list[InferenceParamsSpec] = []
        for base_spec in self.params:
            template = base_spec.model_dump()
            # Cartesian product over every override axis.
            for combo in itertools.product(*value_lists):
                candidate = dict(template)
                extra_args = dict(candidate.get("extras", {}))
                for key, value in zip(keys, combo):
                    # Declared model fields are set directly; unknown keys
                    # route through provider-specific extras.
                    if key in known_fields:
                        candidate[key] = value
                    else:
                        extra_args[key] = value
                if extra_args:
                    candidate["extras"] = extra_args
                result.append(InferenceParamsSpec.model_validate(candidate))
        return result

expand

expand() -> list[InferenceParamsSpec]

Expand the base inference params over all configured overrides.

Source code in themis/specs/experiment.py
def expand(self) -> list[InferenceParamsSpec]:
    """Expand the base inference params over all configured overrides."""
    # Fast path: with no overrides the expansion is just the base params.
    if not self.overrides:
        return list(self.params)

    expanded: list[InferenceParamsSpec] = []
    # Sorted keys keep the expansion order deterministic across runs.
    override_keys = sorted(self.overrides)
    override_values = [self.overrides[key] for key in override_keys]

    for base in self.params:
        base_payload = base.model_dump()
        # Cartesian product over every override axis.
        for combination in itertools.product(*override_values):
            payload = dict(base_payload)
            extras = dict(payload.get("extras", {}))
            for key, value in zip(override_keys, combination):
                # Declared model fields are set directly; unknown keys fall
                # through to provider-specific extras.
                if key in InferenceParamsSpec.model_fields:
                    payload[key] = value
                else:
                    extras[key] = value
            if extras:
                payload["extras"] = extras
            expanded.append(InferenceParamsSpec.model_validate(payload))
    return expanded

InferenceParamsSpec

Bases: SpecBase

Sampling and response-shape settings forwarded to inference engines.

Source code in themis/specs/experiment.py
class InferenceParamsSpec(SpecBase):
    """Sampling and response-shape settings forwarded to inference engines."""

    temperature: float = Field(
        default=0.0, ge=0.0, description="Sampling randomness. 0.0 is deterministic."
    )
    top_p: float | None = Field(
        default=None, ge=0.0, le=1.0, description="Nucleus sampling threshold."
    )
    top_k: int | None = Field(default=None, ge=0, description="Top-k token threshold.")
    # NOTE(review): the description reads "string length" but this field caps
    # generated tokens — confirm the intended wording.
    max_tokens: int = Field(
        default=1024, gt=0, description="Max string length generated."
    )
    stop_sequences: list[str] = Field(
        default_factory=list, description="Sequences that end generation."
    )
    logprobs: int | None = Field(
        default=None, ge=0, description="Request token logprobs if available."
    )
    response_format: ResponseFormat | None = Field(default=None)
    seed: int | None = Field(
        default=None, description="Optional deterministic PRNG seed."
    )
    extras: JSONDict = Field(
        default_factory=dict, description="Provider-specific sampling args."
    )

    @field_validator("response_format", mode="before")
    @classmethod
    def _coerce_response_format(
        cls, value: ResponseFormat | str | None
    ) -> ResponseFormat | str | None:
        # Map raw strings onto the enum; enum instances and None pass through.
        return ResponseFormat(value) if isinstance(value, str) else value

PromptMessage

Bases: BaseModel

One structured chat message in a prompt template.

Source code in themis/specs/experiment.py
class PromptMessage(BaseModel):
    """One structured chat message in a prompt template."""

    # Immutable value object: frozen, rejects unknown keys, strict types.
    model_config = ConfigDict(frozen=True, extra="forbid", strict=True)

    role: PromptRole
    content: str

    @field_validator("role", mode="before")
    @classmethod
    def _coerce_role(cls, value: PromptRole | str) -> PromptRole | str:
        # Coerce raw strings onto the PromptRole enum ahead of strict
        # field validation; enum instances pass through untouched.
        if not isinstance(value, str):
            return value
        return PromptRole(value)

ModelSpec

Bases: SpecBase

Configures one inference-engine target and its provider-specific extras.

Source code in themis/specs/foundational.py
class ModelSpec(SpecBase):
    """Configures one inference-engine target and its provider-specific extras."""

    model_id: str = Field(
        ..., description="The unique name/ID of the model (e.g., 'gpt-4')."
    )
    provider: str = Field(
        ..., description="The provider adapter to route to (e.g., 'openai')."
    )
    extras: JSONDict = Field(
        default_factory=dict, description="Provider-specific initialization arguments."
    )

DatasetSpec

Bases: SpecBase

Declarative dataset source description passed to a dataset loader.

Dataset identity is part of deterministic planning. Use `revision` when the upstream dataset source supports version pinning.

Source code in themis/specs/foundational.py
class DatasetSpec(SpecBase):
    """Declarative dataset source description passed to a dataset loader.

    Dataset identity is part of deterministic planning. Use `revision` when the
    upstream dataset source supports version pinning.
    """

    source: DatasetSource = Field(
        default=DatasetSource.HUGGINGFACE,
        description="Dataset adapter type (huggingface, local, memory).",
    )
    dataset_id: str | None = Field(
        default=None, description="Remote ID or local file path."
    )
    data_dir: str | None = Field(
        default=None, description="Local directory containing the data."
    )
    split: str = Field(default="test", description="Dataset split to evaluate.")
    revision: str | None = Field(
        default=None, description="Git commit or tag for version pinning."
    )
    transforms: list[TransformSpec] = Field(
        default_factory=list, description="Optional dataset normalization transforms."
    )

    @field_validator("source", mode="before")
    @classmethod
    def _coerce_source(cls, value: DatasetSource | str) -> DatasetSource | str:
        # Accept raw strings from config payloads and map them onto the enum.
        if not isinstance(value, str):
            return value
        return DatasetSource(value)

    @model_validator(mode="after")
    def _validate_semantic(self) -> DatasetSpec:
        # Remote sources need an identifier; local sources need some path.
        if self.source == DatasetSource.HUGGINGFACE and not self.dataset_id:
            raise ValueError("DatasetSpec source='huggingface' requires a dataset_id.")
        has_local_path = bool(self.data_dir) or bool(self.dataset_id)
        if self.source == DatasetSource.LOCAL and not has_local_path:
            raise ValueError(
                "DatasetSpec source='local' requires data_dir or dataset_id path."
            )
        return self

GenerationSpec

Bases: SpecBase

Marker that a task participates in generation-stage execution.

Source code in themis/specs/foundational.py
class GenerationSpec(SpecBase):
    """Marker that a task participates in generation-stage execution."""

    # Intentionally empty: mere presence on a SliceSpec counts as a stage
    # (SliceSpec validation checks `generation is not None`).

JudgeInferenceSpec

Bases: SpecBase

Optional judge-model configuration used by judge-backed metrics.

Separate metrics can carry separate judge specs, which is how one candidate can be scored by multiple judge prompts or judge models in the same run.

Source code in themis/specs/foundational.py
class JudgeInferenceSpec(SpecBase):
    """Optional judge-model configuration used by judge-backed metrics.

    Separate metrics can carry separate judge specs, which is how one candidate
    can be scored by multiple judge prompts or judge models in the same run.
    """

    model: ModelSpec = Field(
        ..., description="The model configuration to power the judge."
    )
    # Falls back to judge-specific default sampling params when omitted.
    params: InferenceParamsSpec = Field(
        default_factory=_default_judge_params,
        description="Sampling parameters for the judge model.",
    )
    extras: JSONDict = Field(
        default_factory=dict, description="Metric-specific overrides or config."
    )