Skip to content

Themis

Extractors

pittawat2542/themis

Extractors¶

Built-in parsing helpers are registered automatically by PluginRegistry.

builtin ¶

Built-in extractor implementations auto-registered by PluginRegistry.

BoxedTextExtractor ¶

Extract the final LaTeX-style boxed answer from raw text.

Source code in themis/extractors/builtin.py

class BoxedTextExtractor:
    """Extractor that returns the last LaTeX-style boxed answer in raw text."""

    def extract(
        self,
        trial: TrialSpec,
        candidate: CandidateRecord,
        config: Mapping[str, JSONValueType] | None = None,
    ) -> ExtractionRecord:
        """Return the final boxed segment found in the candidate output.

        Args:
            trial: Accepted for interface compatibility; not used.
            candidate: Record whose raw text is searched.
            config: Optional settings, copied and passed through to the
                resulting record.

        Returns:
            A success record carrying the boxed text, or a failure record
            when no boxed segment is found.
        """
        del trial
        settings = dict(config or {})
        answer = _last_boxed_text(_raw_text(candidate))
        # Only ``None`` means "nothing found"; an empty boxed segment is
        # still reported as a successful extraction.
        if answer is not None:
            return _success("boxed_text", candidate, settings, answer)
        return _failure(
            "boxed_text",
            candidate,
            settings,
            "No boxed answer found in the inference output.",
        )

extract ¶

extract(
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord

Extract the final boxed segment from candidate output.

Source code in themis/extractors/builtin.py

def extract(
    self,
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord:
    """Return the final boxed segment found in the candidate output.

    Args:
        trial: Accepted for interface compatibility; not used.
        candidate: Record whose raw text is searched.
        config: Optional settings, copied and passed through to the
            resulting record.

    Returns:
        A success record carrying the boxed text, or a failure record
        when no boxed segment is found.
    """
    del trial
    settings = dict(config or {})
    answer = _last_boxed_text(_raw_text(candidate))
    # Only ``None`` means "nothing found"; an empty boxed segment is
    # still reported as a successful extraction.
    if answer is not None:
        return _success("boxed_text", candidate, settings, answer)
    return _failure(
        "boxed_text",
        candidate,
        settings,
        "No boxed answer found in the inference output.",
    )

ChoiceLetterExtractor ¶

Extract an uppercase multiple-choice letter from candidate raw text.

Source code in themis/extractors/builtin.py

class ChoiceLetterExtractor:
    """Extractor that picks a multiple-choice letter out of candidate raw text."""

    def extract(
        self,
        trial: TrialSpec,
        candidate: CandidateRecord,
        config: Mapping[str, JSONValueType] | None = None,
    ) -> ExtractionRecord:
        """Find a multiple-choice answer letter in the candidate output.

        Args:
            trial: Accepted for interface compatibility; not used.
            candidate: Record whose raw text is searched.
            config: Optional settings. A non-empty list under ``"choices"``
                replaces the default letters ``A``-``E``; entries are
                stringified and upper-cased.

        Returns:
            A success record carrying the upper-cased letter, or a failure
            record when no letter is found.
        """
        del trial
        settings = dict(config or {})
        requested = settings.get("choices")
        if not isinstance(requested, list) or not requested:
            letters = ["A", "B", "C", "D", "E"]
        else:
            letters = [str(item).upper() for item in requested]
        # The escaped choices are concatenated into a regex character class.
        choices_pattern = "".join(map(re.escape, letters))

        # Prefer the last boxed segment; otherwise search the full raw text.
        boxed = _last_boxed_text(_raw_text(candidate))
        haystack = boxed or _raw_text(candidate)

        # First try a letter introduced by "option"/"answer"/"choice", then
        # fall back to any standalone letter. Match objects are always
        # truthy, so ``or`` only runs the fallback when the first search
        # found nothing.
        found = re.search(
            rf"\b(?:option|answer|choice)\s*[:\-]?\s*([{choices_pattern}])\b",
            haystack,
            flags=re.IGNORECASE,
        ) or re.search(rf"\b([{choices_pattern}])\b", haystack, flags=re.IGNORECASE)

        if found is not None:
            return _success(
                "choice_letter", candidate, settings, found.group(1).upper()
            )
        return _failure(
            "choice_letter",
            candidate,
            settings,
            "No choice letter found in the inference output.",
        )

extract ¶

extract(
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord

Extract a multiple-choice answer letter from candidate output.

Source code in themis/extractors/builtin.py

def extract(
    self,
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord:
    """Find a multiple-choice answer letter in the candidate output.

    Args:
        trial: Accepted for interface compatibility; not used.
        candidate: Record whose raw text is searched.
        config: Optional settings. A non-empty list under ``"choices"``
            replaces the default letters ``A``-``E``; entries are
            stringified and upper-cased.

    Returns:
        A success record carrying the upper-cased letter, or a failure
        record when no letter is found.
    """
    del trial
    settings = dict(config or {})
    requested = settings.get("choices")
    if not isinstance(requested, list) or not requested:
        letters = ["A", "B", "C", "D", "E"]
    else:
        letters = [str(item).upper() for item in requested]
    # The escaped choices are concatenated into a regex character class.
    choices_pattern = "".join(map(re.escape, letters))

    # Prefer the last boxed segment; otherwise search the full raw text.
    boxed = _last_boxed_text(_raw_text(candidate))
    haystack = boxed or _raw_text(candidate)

    # First try a letter introduced by "option"/"answer"/"choice", then
    # fall back to any standalone letter. Match objects are always
    # truthy, so ``or`` only runs the fallback when the first search
    # found nothing.
    found = re.search(
        rf"\b(?:option|answer|choice)\s*[:\-]?\s*([{choices_pattern}])\b",
        haystack,
        flags=re.IGNORECASE,
    ) or re.search(rf"\b([{choices_pattern}])\b", haystack, flags=re.IGNORECASE)

    if found is not None:
        return _success(
            "choice_letter", candidate, settings, found.group(1).upper()
        )
    return _failure(
        "choice_letter",
        candidate,
        settings,
        "No choice letter found in the inference output.",
    )

FirstNumberExtractor ¶

Extract the first integer or floating-point token from raw text.

Source code in themis/extractors/builtin.py

class FirstNumberExtractor:
    """Extractor that returns the first integer or decimal token in raw text."""

    def extract(
        self,
        trial: TrialSpec,
        candidate: CandidateRecord,
        config: Mapping[str, JSONValueType] | None = None,
    ) -> ExtractionRecord:
        """Return the first numeric token of the candidate output as a number.

        Args:
            trial: Accepted for interface compatibility; not used.
            candidate: Record whose raw text is searched.
            config: Optional settings, copied and passed through to the
                resulting record.

        Returns:
            A success record carrying an ``int`` (no decimal point in the
            token) or a ``float``, or a failure record when the text holds
            no numeric token.
        """
        del trial
        settings = dict(config or {})
        # Optional sign, digits, and an optional fractional part.
        found = re.search(r"[-+]?\d+(?:\.\d+)?", _raw_text(candidate))
        if found is None:
            return _failure(
                "first_number",
                candidate,
                settings,
                "No numeric token found in the inference output.",
            )
        token = found.group(0)
        value: int | float
        if "." in token:
            value = float(token)
        else:
            value = int(token)
        return _success("first_number", candidate, settings, value)

extract ¶

extract(
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord

Extract the first numeric token from candidate output.

Source code in themis/extractors/builtin.py

def extract(
    self,
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord:
    """Return the first numeric token of the candidate output as a number.

    Args:
        trial: Accepted for interface compatibility; not used.
        candidate: Record whose raw text is searched.
        config: Optional settings, copied and passed through to the
            resulting record.

    Returns:
        A success record carrying an ``int`` (no decimal point in the
        token) or a ``float``, or a failure record when the text holds
        no numeric token.
    """
    del trial
    settings = dict(config or {})
    # Optional sign, digits, and an optional fractional part.
    found = re.search(r"[-+]?\d+(?:\.\d+)?", _raw_text(candidate))
    if found is None:
        return _failure(
            "first_number",
            candidate,
            settings,
            "No numeric token found in the inference output.",
        )
    token = found.group(0)
    value: int | float
    if "." in token:
        value = float(token)
    else:
        value = int(token)
    return _success("first_number", candidate, settings, value)

JsonSchemaExtractor ¶

Parse candidate raw text as JSON and validate it against a schema.

Source code in themis/extractors/builtin.py

class JsonSchemaExtractor:
    """Extractor that decodes raw text as JSON and checks it against a schema."""

    def extract(
        self,
        trial: TrialSpec,
        candidate: CandidateRecord,
        config: Mapping[str, JSONValueType] | None = None,
    ) -> ExtractionRecord:
        """Decode the candidate output as JSON and validate it.

        Args:
            trial: Accepted for interface compatibility; not used.
            candidate: Record whose raw text is decoded.
            config: Settings; must contain a mapping under ``"schema"``.

        Returns:
            A success record carrying the decoded JSON value, or a failure
            record when the schema is missing, the text is not valid JSON,
            the optional ``jsonschema`` dependency cannot be imported, the
            schema itself is invalid, or the value fails validation.
        """
        del trial
        settings = dict(config or {})

        def reject(message: str) -> ExtractionRecord:
            # Every failure path shares the same extractor id and settings.
            return _failure("json_schema", candidate, settings, message)

        schema = settings.get("schema")
        if not isinstance(schema, Mapping):
            return reject("Json schema extractor requires a 'schema' object.")

        try:
            payload = json.loads(_raw_text(candidate))
        except json.JSONDecodeError as err:
            return reject(f"Response was not valid JSON: {err.msg}.")

        # ``jsonschema`` is loaded lazily via ``import_optional``; a
        # ThemisError from that call is reported as an extraction failure.
        try:
            jsonschema = import_optional("jsonschema", extra="extractors")
        except ThemisError as err:
            return reject(err.message)

        try:
            validator_type = jsonschema.validators.validator_for(schema)
            # Check the schema first so a malformed schema is reported
            # separately from a non-conforming payload.
            validator_type.check_schema(schema)
            validator_type(schema).validate(payload)
        except jsonschema.exceptions.SchemaError as err:
            return reject(f"Invalid JSON schema: {err.message}")
        except jsonschema.exceptions.ValidationError as err:
            return reject(err.message)
        else:
            return _success("json_schema", candidate, settings, payload)

extract ¶

extract(
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord

Parse candidate output as JSON and validate it against a schema.

Source code in themis/extractors/builtin.py

def extract(
    self,
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord:
    """Decode the candidate output as JSON and validate it.

    Args:
        trial: Accepted for interface compatibility; not used.
        candidate: Record whose raw text is decoded.
        config: Settings; must contain a mapping under ``"schema"``.

    Returns:
        A success record carrying the decoded JSON value, or a failure
        record when the schema is missing, the text is not valid JSON,
        the optional ``jsonschema`` dependency cannot be imported, the
        schema itself is invalid, or the value fails validation.
    """
    del trial
    settings = dict(config or {})

    def reject(message: str) -> ExtractionRecord:
        # Every failure path shares the same extractor id and settings.
        return _failure("json_schema", candidate, settings, message)

    schema = settings.get("schema")
    if not isinstance(schema, Mapping):
        return reject("Json schema extractor requires a 'schema' object.")

    try:
        payload = json.loads(_raw_text(candidate))
    except json.JSONDecodeError as err:
        return reject(f"Response was not valid JSON: {err.msg}.")

    # ``jsonschema`` is loaded lazily via ``import_optional``; a
    # ThemisError from that call is reported as an extraction failure.
    try:
        jsonschema = import_optional("jsonschema", extra="extractors")
    except ThemisError as err:
        return reject(err.message)

    try:
        validator_type = jsonschema.validators.validator_for(schema)
        # Check the schema first so a malformed schema is reported
        # separately from a non-conforming payload.
        validator_type.check_schema(schema)
        validator_type(schema).validate(payload)
    except jsonschema.exceptions.SchemaError as err:
        return reject(f"Invalid JSON schema: {err.message}")
    except jsonschema.exceptions.ValidationError as err:
        return reject(err.message)
    else:
        return _success("json_schema", candidate, settings, payload)

NormalizedTextExtractor ¶

Normalize free-form text for robust exact-match style scoring.

Source code in themis/extractors/builtin.py

class NormalizedTextExtractor:
    """Extractor that normalizes free-form text for exact-match style scoring."""

    def extract(
        self,
        trial: TrialSpec,
        candidate: CandidateRecord,
        config: Mapping[str, JSONValueType] | None = None,
    ) -> ExtractionRecord:
        """Normalize the boxed answer, or the whole raw text as a fallback.

        Args:
            trial: Accepted for interface compatibility; not used.
            candidate: Record whose raw text is normalized.
            config: Optional settings, copied and passed through to the
                resulting record.

        Returns:
            A success record carrying the normalized text; this extractor
            has no failure path of its own.
        """
        del trial
        settings = dict(config or {})
        boxed = _last_boxed_text(_raw_text(candidate))
        # A missing *or empty* boxed segment falls back to the full text.
        source = boxed if boxed else _raw_text(candidate)
        normalized = _normalize_text(source)
        return _success("normalized_text", candidate, settings, normalized)

extract ¶

extract(
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord

Normalize either the boxed answer or the full raw text.

Source code in themis/extractors/builtin.py

def extract(
    self,
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord:
    """Normalize the boxed answer, or the whole raw text as a fallback.

    Args:
        trial: Accepted for interface compatibility; not used.
        candidate: Record whose raw text is normalized.
        config: Optional settings, copied and passed through to the
            resulting record.

    Returns:
        A success record carrying the normalized text; this extractor
        has no failure path of its own.
    """
    del trial
    settings = dict(config or {})
    boxed = _last_boxed_text(_raw_text(candidate))
    # A missing *or empty* boxed segment falls back to the full text.
    source = boxed if boxed else _raw_text(candidate)
    normalized = _normalize_text(source)
    return _success("normalized_text", candidate, settings, normalized)

RegexExtractor ¶

Extract a regex match or capture group from candidate raw text.

Source code in themis/extractors/builtin.py

class RegexExtractor:
    """Extract a regex match or capture group from candidate raw text."""

    def extract(
        self,
        trial: TrialSpec,
        candidate: CandidateRecord,
        config: Mapping[str, JSONValueType] | None = None,
    ) -> ExtractionRecord:
        """Extract a configured regex match from candidate output.

        Args:
            trial: Accepted for interface compatibility; not used.
            candidate: Record whose raw text is searched.
            config: Settings. ``"pattern"`` (required) is a non-empty regex
                string. ``"group"`` (optional, default ``0``) selects what
                to return: an integer index, a numeric string such as
                ``"1"``, or the name of a named group such as ``"answer"``.

        Returns:
            A success record carrying the selected group's value, or a
            failure record when the pattern is missing, does not compile,
            does not match, or the requested group does not exist.
        """
        del trial
        cfg = dict(config or {})
        pattern = cfg.get("pattern")
        if not isinstance(pattern, str) or not pattern:
            return _failure(
                "regex",
                candidate,
                cfg,
                "Regex extractor requires a non-empty 'pattern'.",
            )

        group = cfg.get("group", 0)
        text = _raw_text(candidate)
        try:
            match = re.search(pattern, text)
        except re.error as exc:
            # The pattern comes from configuration; a malformed one is
            # reported as an extraction failure instead of raising.
            return _failure(
                "regex", candidate, cfg, f"Invalid regex pattern: {exc}"
            )
        if match is None:
            return _failure(
                "regex", candidate, cfg, "Pattern did not match the inference output."
            )

        if not isinstance(group, (int, str)):
            return _failure(
                "regex",
                candidate,
                cfg,
                "Configured capture group must be an integer or string.",
            )

        group_key: int | str = group
        if isinstance(group, str):
            try:
                # Numeric strings keep selecting groups by index.
                group_key = int(group)
            except ValueError:
                # Anything else is looked up as a named group.
                group_key = group

        try:
            value = match.group(group_key)
        except IndexError:
            # Raised for an out-of-range index and for an unknown name.
            return _failure(
                "regex",
                candidate,
                cfg,
                "Configured capture group was not present in the regex match.",
            )
        return _success("regex", candidate, cfg, value)

extract ¶

extract(
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord

Extract a configured regex match from candidate output.

Source code in themis/extractors/builtin.py

def extract(
    self,
    trial: TrialSpec,
    candidate: CandidateRecord,
    config: Mapping[str, JSONValueType] | None = None,
) -> ExtractionRecord:
    """Extract a configured regex match from candidate output.

    Args:
        trial: Accepted for interface compatibility; not used.
        candidate: Record whose raw text is searched.
        config: Settings. ``"pattern"`` (required) is a non-empty regex
            string. ``"group"`` (optional, default ``0``) selects what
            to return: an integer index, a numeric string such as
            ``"1"``, or the name of a named group such as ``"answer"``.

    Returns:
        A success record carrying the selected group's value, or a
        failure record when the pattern is missing, does not compile,
        does not match, or the requested group does not exist.
    """
    del trial
    cfg = dict(config or {})
    pattern = cfg.get("pattern")
    if not isinstance(pattern, str) or not pattern:
        return _failure(
            "regex",
            candidate,
            cfg,
            "Regex extractor requires a non-empty 'pattern'.",
        )

    group = cfg.get("group", 0)
    text = _raw_text(candidate)
    try:
        match = re.search(pattern, text)
    except re.error as exc:
        # The pattern comes from configuration; a malformed one is
        # reported as an extraction failure instead of raising.
        return _failure(
            "regex", candidate, cfg, f"Invalid regex pattern: {exc}"
        )
    if match is None:
        return _failure(
            "regex", candidate, cfg, "Pattern did not match the inference output."
        )

    if not isinstance(group, (int, str)):
        return _failure(
            "regex",
            candidate,
            cfg,
            "Configured capture group must be an integer or string.",
        )

    group_key: int | str = group
    if isinstance(group, str):
        try:
            # Numeric strings keep selecting groups by index.
            group_key = int(group)
        except ValueError:
            # Anything else is looked up as a named group.
            group_key = group

    try:
        value = match.group(group_key)
    except IndexError:
        # Raised for an out-of-range index and for an unknown name.
        return _failure(
            "regex",
            candidate,
            cfg,
            "Configured capture group was not present in the regex match.",
        )
    return _success("regex", candidate, cfg, value)