cjm-capability-whisper
An OpenAI Whisper speech-to-text capability for the cjm-substrate runtime that provides local transcription with configurable model selection and parameter control.
Install
pip install cjm_capability_whisper
Project Structure
nbs/
└── capability.ipynb # Capability implementation for OpenAI Whisper transcription
Total: 1 notebook across 1 directory
Module Dependencies
graph LR
capability[capability<br/>Whisper Capability]
No cross-module dependencies detected.
CLI Reference
No CLI commands found in this project.
Module Overview
Detailed documentation for each module in the project:
Whisper Capability (capability.ipynb)
Capability implementation for OpenAI Whisper transcription
Import
from cjm_capability_whisper.capability import (
WHISPER_AVAILABLE,
WhisperCapabilityConfig,
WhisperLocalCapability
)
Functions
@patch
def _apply_config(
self:WhisperLocalCapability,
config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
"""
CR-4: apply config values + derive config-dependent state (device,
model_dir). No heavy-resource work. Called by initialize (first-time) and by
the substrate's reconfigure delta path. Model release on a model/device/
model_dir/compile_model change is handled declaratively via RELOAD_TRIGGER
-> _release_model (fired by the substrate BEFORE this re-applies config).
"""
@patch
def _release_model(self:WhisperLocalCapability) -> None
"""
CR-4: release the loaded model + free CUDA cache. RELOAD_TRIGGER target for
model/device/model_dir/compile_model; on_disable / cleanup delegate here.
Idempotent via cjm-substrate-torch-utils' release_model (no-op when already None).
"""
@patch
def _load_model(self:WhisperLocalCapability) -> None:
"""Load the Whisper model (lazy loading)."""
if self.model is None
"Load the Whisper model (lazy loading)."
@patch
def _prepare_audio(
self:WhisperLocalCapability,
audio: Union[str, Path] # Path to a decodable audio file
) -> str: # The audio file path
"""
Validate the audio input and return it as a path string.
The caller (orchestration / proxy) guarantees a model-ready audio file;
in-memory preparation is no longer a capability responsibility.
"""
@patch
def is_available(self:WhisperLocalCapability) -> bool: # True if Whisper and its dependencies are available
"Check if Whisper is available."
@patch
def prefetch(self:WhisperLocalCapability) -> None
"""
CR-4 (SG-19): eagerly load the model so the first execute() doesn't pay
the download/load cost. Idempotent via _load_model's None-guard.
"""
@patch
def on_disable(self:WhisperLocalCapability) -> None
"""
CR-2: release the GPU model when the operator disables the capability (the
worker stays alive); lazy reload on the next execute after re-enable.
"""
@patch
def cleanup(self:WhisperLocalCapability) -> None
"Release resources on unload."
Classes
@dataclass
class WhisperCapabilityConfig:
"Configuration for Whisper transcription capability."
model: str = field(...)
device: str = field(...)
language: Optional[str] = field(...)
task: str = field(...)
temperature: float = field(...)
temperature_increment_on_fallback: Optional[float] = field(...)
beam_size: int = field(...)
best_of: int = field(...)
patience: float = field(...)
length_penalty: Optional[float] = field(...)
suppress_tokens: str = field(...)
initial_prompt: Optional[str] = field(...)
condition_on_previous_text: bool = field(...)
fp16: bool = field(...)
compression_ratio_threshold: float = field(...)
logprob_threshold: float = field(...)
no_speech_threshold: float = field(...)
word_timestamps: bool = field(...)
prepend_punctuations: str = field(...)
append_punctuations: str = field(...)
threads: int = field(...)
model_dir: Optional[str] = field(...)
compile_model: bool = field(...)
class WhisperLocalCapability:
def __init__(self):
"""Initialize the Whisper capability with default configuration."""
self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
self.config: WhisperCapabilityConfig = None
"""
OpenAI Whisper transcription capability (stage 8: pure-compute tool capability).
Native-surface model (PILLAR 1c): this tool is PURE COMPUTE — `transcribe`
loads the model, runs inference, and builds the typed `TranscriptionResult`.
The cache-check + persistence bookends + the per-call `force` control live in
the generic transcription adapter (cjm-transcription-adapter-interface); the
result DTO lives in cjm-capability-primitives; identity is derived from the
installed distribution. No `get_plugin_metadata`, no `self.storage`.
"""
def __init__(self):
"""Initialize the Whisper capability with default configuration."""
self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
self.config: WhisperCapabilityConfig = None
"Initialize the Whisper capability with default configuration."
def name(self) -> str: # Capability name identifier
"""Capability identity, derived from the installed distribution (PILLAR 1c).
Runtime-derived: in the worker / in-env introspection `__package__`
resolves; the manifest records the same value independently (the
dual-mode generator reads it from the distribution)."""
from importlib.metadata import metadata, packages_distributions
dist = (packages_distributions().get(__package__) or [__package__.replace("_", "-")])[0]
return metadata(dist)["Name"]
@property
def version(self) -> str: # Capability version string
"Capability identity, derived from the installed distribution (PILLAR 1c).
Runtime-derived: in the worker / in-env introspection `__package__`
resolves; the manifest records the same value independently (the
dual-mode generator reads it from the distribution)."
def version(self) -> str: # Capability version string
"""Get the capability version string."""
from cjm_capability_whisper import __version__
return __version__
def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
"Get the capability version string."
def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
"""Return current configuration state."""
if not self.config
"Return current configuration state."
def get_config_schema(self) -> Dict[str, Any]: # JSON Schema for configuration
"""Return JSON Schema for UI generation."""
return dataclass_to_jsonschema(WhisperCapabilityConfig)
@staticmethod
def get_config_dataclass() -> WhisperCapabilityConfig: # Configuration dataclass
"Return JSON Schema for UI generation."
def get_config_dataclass() -> WhisperCapabilityConfig: # Configuration dataclass
"""Return dataclass describing the capability's configuration options."""
return WhisperCapabilityConfig
def initialize(
self,
config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
"Return dataclass describing the capability's configuration options."
def initialize(
self,
config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
"First-time setup. CR-4: the manual model/device diff-and-reload is replaced
by declarative RELOAD_TRIGGER metadata; the substrate's reconfigure path fires
_release_model then re-applies config via _apply_config."
def transcribe(
self,
audio: Union[str, Path], # Path to MODEL-READY audio (converted upstream)
**kwargs # Provenance (source_start_time/source_end_time) stamped into metadata
) -> TranscriptionResult: # Typed transcription output
"Transcribe model-ready audio using Whisper — PURE COMPUTE.
Stage 8 / PILLAR 1c: the cache-check + persistence bookends moved to the
generic transcription adapter; this method loads the model, runs
inference, and builds the typed result. Model params come from
`self.config` (the CR-15 per-call override path is gone — the tool runs
its effective config, no metadata lie); `source_start_time` /
`source_end_time` ride the provenance kwarg channel into metadata."Variables
WHISPER_AVAILABLE