cjm-capability-whisper

An OpenAI Whisper speech-to-text capability for the cjm-substrate runtime that provides local transcription with configurable model selection and parameter control.

Install

pip install cjm_capability_whisper

Project Structure

nbs/
└── capability.ipynb # Capability implementation for OpenAI Whisper transcription

Total: 1 notebook across 1 directory

Module Dependencies

graph LR
    capability[capability<br/>Whisper Capability]

No cross-module dependencies detected.

CLI Reference

No CLI commands found in this project.

Module Overview

Detailed documentation for each module in the project:

Whisper Capability (`capability.ipynb`)

Capability implementation for OpenAI Whisper transcription

Import

from cjm_capability_whisper.capability import (
    WHISPER_AVAILABLE,
    WhisperCapabilityConfig,
    WhisperLocalCapability
)

Functions

@patch
def _apply_config(
    self:WhisperLocalCapability,
    config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
    """
    CR-4: apply config values + derive config-dependent state (device,
    model_dir). No heavy-resource work. Called by initialize (first-time) and by
    the substrate's reconfigure delta path. Model release on a model/device/
    model_dir/compile_model change is handled declaratively via RELOAD_TRIGGER
    -> _release_model (fired by the substrate BEFORE this re-applies config).
    """

@patch
def _release_model(self:WhisperLocalCapability) -> None
    """
    CR-4: release the loaded model + free CUDA cache. RELOAD_TRIGGER target for
    model/device/model_dir/compile_model; on_disable / cleanup delegate here.
    Idempotent via cjm-substrate-torch-utils' release_model (no-op when already None).
    """

@patch
def _load_model(self:WhisperLocalCapability) -> None
    "Load the Whisper model (lazy loading; only loads when `self.model` is None)."

@patch
def _prepare_audio(
    self:WhisperLocalCapability,
    audio: Union[str, Path] # Path to a decodable audio file
) -> str: # The audio file path
    """
    Validate the audio input and return it as a path string.
    
    The caller (orchestration / proxy) guarantees a model-ready audio file;
    in-memory preparation is no longer a capability responsibility.
    """

@patch
def is_available(self:WhisperLocalCapability) -> bool: # True if Whisper and its dependencies are available
    "Check if Whisper is available."

@patch
def prefetch(self:WhisperLocalCapability) -> None
    """
    CR-4 (SG-19): eagerly load the model so the first execute() doesn't pay
    the download/load cost. Idempotent via _load_model's None-guard.
    """

@patch
def on_disable(self:WhisperLocalCapability) -> None
    """
    CR-2: release the GPU model when the operator disables the capability (the
    worker stays alive); lazy reload on the next execute after re-enable.
    """

@patch
def cleanup(self:WhisperLocalCapability) -> None
    "Release resources on unload."

Classes

@dataclass
class WhisperCapabilityConfig:
    "Configuration for Whisper transcription capability."
    
    model: str = field(...)
    device: str = field(...)
    language: Optional[str] = field(...)
    task: str = field(...)
    temperature: float = field(...)
    temperature_increment_on_fallback: Optional[float] = field(...)
    beam_size: int = field(...)
    best_of: int = field(...)
    patience: float = field(...)
    length_penalty: Optional[float] = field(...)
    suppress_tokens: str = field(...)
    initial_prompt: Optional[str] = field(...)
    condition_on_previous_text: bool = field(...)
    fp16: bool = field(...)
    compression_ratio_threshold: float = field(...)
    logprob_threshold: float = field(...)
    no_speech_threshold: float = field(...)
    word_timestamps: bool = field(...)
    prepend_punctuations: str = field(...)
    append_punctuations: str = field(...)
    threads: int = field(...)
    model_dir: Optional[str] = field(...)
    compile_model: bool = field(...)

class WhisperLocalCapability:
    """
    OpenAI Whisper transcription capability (stage 8: pure-compute tool capability).
    
    Native-surface model (PILLAR 1c): this tool is PURE COMPUTE — `transcribe`
    loads the model, runs inference, and builds the typed `TranscriptionResult`.
    The cache-check + persistence bookends + the per-call `force` control live in
    the generic transcription adapter (cjm-transcription-adapter-interface); the
    result DTO lives in cjm-capability-primitives; identity is derived from the
    installed distribution. No `get_plugin_metadata`, no `self.storage`.
    """
    
    def __init__(self):
        "Initialize the Whisper capability with default configuration."
        self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
        self.config: WhisperCapabilityConfig = None
    
    def name(self) -> str: # Capability name identifier
        """
        Capability identity, derived from the installed distribution (PILLAR 1c).
        
        Runtime-derived: in the worker / in-env introspection `__package__`
        resolves; the manifest records the same value independently (the
        dual-mode generator reads it from the distribution).
        """
        from importlib.metadata import metadata, packages_distributions
        dist = (packages_distributions().get(__package__) or [__package__.replace("_", "-")])[0]
        return metadata(dist)["Name"]
    
    @property
    def version(self) -> str: # Capability version string
        "Get the capability version string."
        from cjm_capability_whisper import __version__
        return __version__
    
    def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
        "Return current configuration state."
    
    def get_config_schema(self) -> Dict[str, Any]: # JSON Schema for configuration
        "Return JSON Schema for UI generation."
        return dataclass_to_jsonschema(WhisperCapabilityConfig)
    
    @staticmethod
    def get_config_dataclass() -> WhisperCapabilityConfig: # Configuration dataclass
        "Return dataclass describing the capability's configuration options."
        return WhisperCapabilityConfig
    
    def initialize(
        self,
        config: Optional[Any] = None # Configuration dataclass, dict, or None
    ) -> None
        """
        First-time setup. CR-4: the manual model/device diff-and-reload is replaced
        by declarative RELOAD_TRIGGER metadata; the substrate's reconfigure path fires
        _release_model then re-applies config via _apply_config.
        """
    
    def transcribe(
        self,
        audio: Union[str, Path], # Path to MODEL-READY audio (converted upstream)
        **kwargs # Provenance (source_start_time/source_end_time) stamped into metadata
    ) -> TranscriptionResult: # Typed transcription output
        """
        Transcribe model-ready audio using Whisper — PURE COMPUTE.
        
        Stage 8 / PILLAR 1c: the cache-check + persistence bookends moved to the
        generic transcription adapter; this method loads the model, runs
        inference, and builds the typed result. Model params come from
        `self.config` (the CR-15 per-call override path is gone — the tool runs
        its effective config, no metadata lie); `source_start_time` /
        `source_end_time` ride the provenance kwarg channel into metadata.
        """

Variables

WHISPER_AVAILABLE