cjm-transcription-plugin-gemini
Google Gemini API plugin for the cjm-transcription-plugin-system library - provides speech-to-text transcription with configurable model selection and parameter control.
Install
pip install cjm_transcription_plugin_gemini
Project Structure
nbs/
├── meta.ipynb # Metadata introspection for the Gemini plugin used by cjm-ctl to generate the registration manifest.
└── plugin.ipynb # Plugin implementation for Google Gemini API transcription
Total: 2 notebooks across 1 directory
Module Dependencies
graph LR
meta["meta<br/>Metadata"]
plugin["plugin<br/>Gemini Plugin"]
plugin --> meta
1 cross-module dependency detected
CLI Reference
No CLI commands found in this project.
Module Overview
Detailed documentation for each module in the project:
Metadata (meta.ipynb)
Metadata introspection for the Gemini plugin used by cjm-ctl to generate the registration manifest.
Import
from cjm_transcription_plugin_gemini.meta import (
get_plugin_metadata
)
Functions
def get_plugin_metadata() -> Dict[str, Any]: # Plugin metadata for manifest generation
"""Return metadata required to register this plugin with the PluginManager."""
# Fallback base path (current behavior for backward compatibility)
base_path = os.path.dirname(os.path.dirname(sys.executable))
# Use CJM config if available, else fallback to env-relative paths
cjm_data_dir = os.environ.get("CJM_DATA_DIR")
# Plugin data directory
plugin_name = "cjm-transcription-plugin-gemini"
if cjm_data_dir
"Return metadata required to register this plugin with the PluginManager."
Gemini Plugin (plugin.ipynb)
Plugin implementation for Google Gemini API transcription
Import
from cjm_transcription_plugin_gemini.plugin import (
GeminiPluginConfig,
GeminiPlugin
)
Functions
@patch
def _refresh_available_models(
self:GeminiPlugin
) -> List[str]: # Live audio-capable model names (empty if no client / API unreachable)
"""
Fetch + filter audio-capable models from the LIVE Gemini API (CR-11).
Returns [] when there's no client or the API can't be reached — there is no
static fallback list (a stale model list is worse than none; a baked default
rots as Google retires/adds models). Side effect: repopulates
self.model_token_limits (model name -> output_token_limit) for the live set.
"""
@patch
def _update_max_tokens_for_model(
self:GeminiPlugin,
model_name: str # Model name to update tokens for
) -> None
"Clamp max_output_tokens to the model's live output_token_limit when known."
@patch
def _report_usage_from(
self:GeminiPlugin,
obj # A Gemini response or stream chunk carrying .usage_metadata
) -> None
"""
SG-54: extract token usage from a Gemini response's usage_metadata and
report it to the substrate (unit-agnostic). Gemini's unit is tokens; the
plugin picks the unit-name keys. Defensive no-op if usage_metadata absent.
"""
@patch
def update_config(
self:GeminiPlugin,
config: Union[Dict[str, Any], GeminiPluginConfig] # New configuration values
) -> None
"Update plugin configuration, adjusting max_tokens if model changes."
@patch
def _prepare_audio(
self:GeminiPlugin,
audio: Union[str, Path] # Path to a decodable audio file
) -> Tuple[Path, bool]: # Tuple of (processed audio path, whether temp file was created)
"""
Prepare audio file for upload.
The caller provides a decodable audio file path; in-memory preparation is no
longer a plugin responsibility. Optional downsampling (a relocation candidate
that belongs in an upstream ffmpeg pipeline step) is retained for now.
"""
@patch
def _upload_audio_file(
self:GeminiPlugin,
audio_path: Path # Path to audio file to upload
) -> Any: # Uploaded file object
"Upload audio file to Gemini API."
@patch
def _delete_uploaded_file(
self:GeminiPlugin,
file_name: str # Name of file to delete
) -> None
"Delete an uploaded file from Gemini API."
@patch
def get_available_models(
self:GeminiPlugin
) -> List[str]: # Live audio-capable model names (empty if no API key / API unreachable)
"""
Get the live list of audio-capable models (CR-11).
Lazily ensures the client from the injected API key, then refreshes from the
API. Returns [] when no key is available — no static fallback.
"""
@patch
def get_model_info(
self:GeminiPlugin,
model_name: Optional[str] = None # Model name to get info for, defaults to current model
) -> Dict[str, Any]: # Dict with model information
"Get information about a specific model including its token limit."
@patch
def supports_streaming(
self:GeminiPlugin
) -> bool: # True if streaming is supported
"Check if this plugin supports streaming transcription."
@patch
def execute_stream(
self:GeminiPlugin,
audio: Union[str, Path], # Audio data object or path to audio file
**kwargs # Additional arguments to override config
) -> Generator[str, None, TranscriptionResult]: # Yields text chunks, returns final result
"Stream transcription results chunk by chunk."
Classes
@dataclass
class GeminiPluginConfig:
"""
Configuration for Gemini transcription plugin.
CR-11/CR-12 notes:
- `model` has NO static enum and NO default. Its valid domain is the live
Gemini model list, surfaced at runtime via `get_config_options()` — a
baked-in enum/default goes stale (models are retired/added server-side),
and if the live list can't be fetched the API is unusable anyway, so a
default selection is worthless. `model` is None until the operator picks
one from the live list.
- The API key is NOT a config field. It is a secret declared in
`GeminiPlugin.WORKER_ENV` and resolved from the SecretStore into the
worker env at spawn (see CR-12).
"""
model: Optional[str] = field(...)
prompt: str = field(...)
temperature: float = field(...)
top_p: float = field(...)
max_output_tokens: int = field(...)
seed: Optional[int] = field(...)
response_mime_type: str = field(...)
downsample_audio: bool = field(...)
downsample_rate: int = field(...)
downsample_channels: int = field(...)
safety_settings: str = field(...)
model_filter: List[str] = field(...)
use_file_upload: bool = field(...)
use_streaming: bool = field(...)
delete_uploaded_files: bool = field(...)
class GeminiPlugin:
def __init__(self):
"""Initialize the Gemini plugin with default configuration."""
self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
self.config: GeminiPluginConfig = None
"Google Gemini API transcription plugin."
def __init__(self):
"""Initialize the Gemini plugin with default configuration."""
self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
self.config: GeminiPluginConfig = None
"Initialize the Gemini plugin with default configuration."
def name(self) -> str: # Plugin name identifier
"""Return the plugin name identifier."""
return "gemini"
@property
def version(self) -> str: # Plugin version string
"Return the plugin name identifier."
def version(self) -> str: # Plugin version string
"""Return the plugin version string."""
return "1.0.0"
@property
def supported_formats(self) -> List[str]: # List of supported audio formats
"Return the plugin version string."
def supported_formats(self) -> List[str]: # List of supported audio formats
"""Return list of supported audio file formats."""
return ["wav", "mp3", "aiff", "aac", "ogg", "flac"]
def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
"Return list of supported audio file formats."
def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
"""Return current configuration state."""
if not self.config
"Return current configuration state."
def get_config_schema(self) -> Dict[str, Any]: # JSON Schema for configuration
"""Return JSON Schema for UI generation."""
return dataclass_to_jsonschema(GeminiPluginConfig)
@staticmethod
def get_config_dataclass() -> GeminiPluginConfig: # Configuration dataclass
"Return JSON Schema for UI generation."
def get_config_dataclass() -> GeminiPluginConfig: # Configuration dataclass
"""Return dataclass describing the plugin's configuration options."""
return GeminiPluginConfig
def initialize(
self,
config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
"Return dataclass describing the plugin's configuration options."
def initialize(
self,
config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
"First-time setup (CR-4): apply config + set up storage.
Does NOT eagerly create the API client or fetch the model list — the
client is created lazily (see `_ensure_client`) once the GEMINI_API_KEY
is present in the worker env. This lets the plugin LOAD without a key
(so a config UI can collect one post-load), and means the live model
list (`get_config_options`) only populates once the key is set + the
worker has respawned with it injected."
def execute(
self,
audio: Union[str, Path], # Audio data object or path to audio file
**kwargs # Additional arguments to override config
) -> TranscriptionResult: # Transcription result object
"Transcribe audio using Gemini."
def is_available(self) -> bool: # True if the Gemini API is available
"""Check if Gemini API is available."""
return GEMINI_AVAILABLE
def get_config_options(self) -> Dict[str, "FieldOptions"]
"Check if Gemini API is available."
def get_config_options(self) -> Dict[str, "FieldOptions"]:
"""CR-11: live config option domains, keyed by config field name.
Surfaces the live Gemini model list for `model` (with per-model output
token limits as option metadata + a derived `max_output_tokens` ceiling
constraint for the currently-selected model). Returns {} when no client
is available (no API key yet) or the API can't be reached — there is no
useful static fallback (a stale model list is worse than none), so the
UI shows 'set an API key first' rather than a frozen enum.
"""
self._ensure_client()
if self.client is None
"CR-11: live config option domains, keyed by config field name.
Surfaces the live Gemini model list for `model` (with per-model output
token limits as option metadata + a derived `max_output_tokens` ceiling
constraint for the currently-selected model). Returns {} when no client
is available (no API key yet) or the API can't be reached — there is no
useful static fallback (a stale model list is worse than none), so the
UI shows 'set an API key first' rather than a frozen enum."
def prefetch(self) -> None:
"""CR-4: eagerly create the client + warm the live model list.
Optional pre-warm so the first get_config_options / execute is fast.
No-op (client stays None) when no API key is injected yet.
"""
self._ensure_client()
if self.client is not None
"CR-4: eagerly create the client + warm the live model list.
Optional pre-warm so the first get_config_options / execute is fast.
No-op (client stays None) when no API key is injected yet."
def on_disable(self) -> None:
"""CR-2: release the API client when the operator disables the plugin.
Lightweight for an API plugin (no GPU/model memory), but drops the client
so a re-enable / next execute lazily re-creates it from the current env.
"""
self.client = None
def cleanup(
self
) -> None
"CR-2: release the API client when the operator disables the plugin.
Lightweight for an API plugin (no GPU/model memory), but drops the client
so a re-enable / next execute lazily re-creates it from the current env."
def cleanup(
self
) -> None
"Clean up resources."
Variables
_AUDIO_MIME_MAP = {6 items}