Gemini Plugin

Plugin implementation for Google Gemini API transcription

GeminiPluginConfig


def GeminiPluginConfig(
    model:Optional=None,
    prompt:str='Generate a transcription of the audio, only extract speech and ignore background audio.',
    temperature:float=0.0, top_p:float=0.95, max_output_tokens:int=65536, seed:Optional=None,
    response_mime_type:str='text/plain', downsample_audio:bool=False, downsample_rate:int=16000,
    downsample_channels:int=1, safety_settings:str='OFF', model_filter:List=<factory>, use_file_upload:bool=False,
    use_streaming:bool=False, delete_uploaded_files:bool=True
)->None:

Configuration for Gemini transcription plugin.

CR-11/CR-12 notes:

- model has NO static enum and NO default. Its valid domain is the live Gemini model list, surfaced at runtime via get_config_options(). A baked-in enum/default goes stale (models are retired/added server-side), and if the live list can’t be fetched the API is unusable anyway, so a default selection is worthless. model is None until the operator picks one from the live list.
- The API key is NOT a config field. It is a secret declared in GeminiPlugin.WORKER_ENV and resolved from the SecretStore into the worker env at spawn (see CR-12).


GeminiPlugin


def GeminiPlugin(
    
):

Google Gemini API transcription plugin.


GeminiPlugin.update_config


def update_config(
    config:Union, # New configuration values
)->None:

Update plugin configuration, adjusting max_output_tokens if the model changes.


GeminiPlugin.get_model_info


def get_model_info(
    model_name:Optional=None, # Model name to get info for, defaults to current model
)->Dict: # Dict with model information

Get information about a specific model including its token limit.


GeminiPlugin.get_available_models


def get_available_models(
    
)->List: # Live audio-capable model names (empty if no API key / API unreachable)

Get the live list of audio-capable models (CR-11).

Lazily ensures the client from the injected API key, then refreshes from the API. Returns [] when no key is available — no static fallback.


GeminiPlugin.execute_stream


def execute_stream(
    audio:Union, # Audio data object or path to audio file
    kwargs:VAR_KEYWORD
)->Generator: # Yields text chunks, returns final result

Stream transcription results chunk by chunk.


GeminiPlugin.supports_streaming


def supports_streaming(
    
)->bool: # True if streaming is supported

Check if this plugin supports streaming transcription.

Testing the Plugin

# Smoke-test the plugin: instantiate it and report its identifying metadata.
plugin = GeminiPlugin()

# Availability is queried first, then the static descriptors are echoed.
is_ready = plugin.is_available()
print(f"Gemini available: {is_ready}")
print(f"Plugin name: {plugin.name}")
print(f"Plugin version: {plugin.version}")
print(f"Supported formats: {plugin.supported_formats}")
config_cls = plugin.config_class
print(f"Config class: {config_cls.__name__}")
Gemini available: True
Plugin name: gemini
Plugin version: 1.0.0
Supported formats: ['wav', 'mp3', 'aiff', 'aac', 'ogg', 'flac']
Config class: GeminiPluginConfig
# Test configuration dataclass: list every field with its description and
# effective default value.
from dataclasses import MISSING, fields

print("Configuration fields:")
for f in fields(GeminiPluginConfig):
    desc = f.metadata.get(SCHEMA_DESC, 'No description')
    # A dataclass field carries either a plain default or a default_factory;
    # the unused slot holds the dataclasses.MISSING sentinel. Each slot must be
    # compared against MISSING (not against the other slot), otherwise
    # factory-backed fields report the sentinel instead of the built value.
    if f.default is not MISSING:
        default = f.default
    elif f.default_factory is not MISSING:
        default = f.default_factory()
    else:
        # Required field: there is no default to show.
        default = MISSING
    print(f"  {f.name}: {desc} (default: {default!r})")
Configuration fields:
  model: Gemini model for transcription. Domain is runtime-derived from the live API model list (get_config_options); no static default. (default: None)
  prompt: Prompt for transcription (default: 'Generate a transcription of the audio, only extract speech and ignore background audio.')
  temperature: Sampling temperature (default: 0.0)
  top_p: Top-p sampling parameter (default: 0.95)
  max_output_tokens: Maximum number of output tokens (per-model ceiling surfaced as a live constraint via get_config_options) (default: 65536)
  seed: Random seed for reproducibility (default: None)
  response_mime_type: Response MIME type (default: 'text/plain')
  downsample_audio: Downsample audio before uploading (requires ffmpeg) (default: False)
  downsample_rate: Target sample rate for downsampling (default: 16000)
  downsample_channels: Number of audio channels (1=mono, 2=stereo) (default: 1)
  safety_settings: Safety filter threshold (default: 'OFF')
  model_filter: Keywords to exclude from the live model list (e.g., ['tts', 'image']) (default: <dataclasses._MISSING_TYPE object at 0x7ff537f5b560>)
  use_file_upload: Upload audio files to Gemini API instead of embedding in request (default: False)
  use_streaming: Use streaming response for transcription (default: False)
  delete_uploaded_files: Delete uploaded files after transcription (default: True)
# Inspect the JSON schema the plugin exposes for UI generation.
import json

schema = plugin.get_config_schema()
print("JSON Schema for GeminiPluginConfig:")
print(f"  Name: {schema['name']}")
properties = schema['properties']
print(f"  Properties count: {len(properties)}")
# The model field is expected to have no baked-in enum.
has_static_enum = 'enum' in properties['model']
print(f"  Model has static enum? {has_static_enum} "
      f"(False by design — the model domain is runtime-derived via get_config_options)")
print("\nSample properties:")
# Only the first three properties are dumped to keep the sample short.
first_three = dict(list(properties.items())[:3])
print(json.dumps(first_three, indent=2))
JSON Schema for GeminiPluginConfig:
  Name: GeminiPluginConfig
  Properties count: 17
  Model field enum: ['gemini-2.5-flash', 'gemini-2.5-flash-preview-05-20', 'gemini-2.5-pro']...

Sample properties:
{
  "model": {
    "type": "string",
    "title": "Model",
    "description": "Gemini model to use for transcription",
    "enum": [
      "gemini-2.5-flash",
      "gemini-2.5-flash-preview-05-20",
      "gemini-2.5-pro",
      "gemini-2.5-pro-preview-05-06",
      "gemini-2.0-flash",
      "gemini-2.0-flash-exp",
      "gemini-1.5-flash",
      "gemini-1.5-flash-latest",
      "gemini-1.5-pro",
      "gemini-1.5-pro-latest"
    ],
    "default": "gemini-2.5-flash"
  },
  "api_key": {
    "type": [
      "string",
      "null"
    ],
    "title": "API Key",
    "description": "Google API key (defaults to GEMINI_API_KEY env var)",
    "default": null
  },
  "prompt": {
    "type": "string",
    "title": "Prompt",
    "description": "Prompt for transcription",
    "default": "Generate a transcription of the audio, only extract speech and ignore background audio."
  }
}
# Initialization check; only meaningful when an API key is present.
if not os.environ.get("GEMINI_API_KEY"):
    print("Set GEMINI_API_KEY environment variable to test initialization")
else:
    plugin.initialize({"model": "gemini-2.5-flash"})
    print(f"Initialized with model: {plugin.config.model}")

    # Query the model list and show the first few entries.
    models = plugin.get_available_models()
    print(f"\nFound {len(models)} available models")
    print("Top 5 models:")
    for listed_model in models[:5]:
        print(f"  - {listed_model}")
Initialized with model: gemini-2.5-flash

Found 25 available models
Top 5 models:
  - nano-banana-pro-preview
  - gemma-3n-e4b-it
  - gemma-3n-e2b-it
  - gemma-3-4b-it
  - gemma-3-27b-it

Testing Dynamic Token Limits

Test that max_output_tokens is dynamically updated based on the selected model’s output_token_limit.

# Dynamic token limits: show the per-model limits and the active configuration.
if not os.environ.get("GEMINI_API_KEY"):
    print("Set GEMINI_API_KEY environment variable to test token limits")
else:
    # Fresh plugin instance for this check.
    plugin = GeminiPlugin()
    plugin.initialize({"model": "gemini-2.5-flash"})
    plugin.get_available_models()  # CR-11: lazily fetch live models -> populates model_token_limits

    print("Token limits for different models:")
    print("-" * 50)

    # Show the first five limits that were discovered.
    for model_name, token_limit in list(plugin.model_token_limits.items())[:5]:
        print(f"{model_name}: {token_limit:,} tokens")

    print("\nCurrent configuration:")
    active_config = plugin.config
    print(f"Model: {active_config.model}")
    print(f"Max output tokens: {active_config.max_output_tokens:,}")

    # With no argument, get_model_info describes the current model.
    model_info = plugin.get_model_info()
    print(f"\nModel info for {model_info['name']}:")
    print(f"  Output token limit: {model_info['output_token_limit']:,}")
    print(f"  Current max_output_tokens: {model_info['current_max_output_tokens']:,}")
Token limits for different models:
--------------------------------------------------
gemini-2.5-flash: 65,536 tokens
gemini-2.5-pro: 65,536 tokens
gemini-2.0-flash: 8,192 tokens
gemini-2.0-flash-001: 8,192 tokens
gemini-2.0-flash-lite-001: 8,192 tokens

Current configuration:
Model: gemini-2.5-flash
Max output tokens: 65,536

Model info for gemini-2.5-flash:
  Output token limit: 65,536
  Current max_output_tokens: 65,536
# Model switching: update_config should bring max_output_tokens in line with
# the newly selected model.
if not os.environ.get("GEMINI_API_KEY"):
    print("Set GEMINI_API_KEY environment variable to test model switching")
else:
    print("Testing model switching and token limit updates:")
    print("-" * 50)
    plugin.get_available_models()  # CR-11: ensure live model list is loaded

    test_models = ["gemini-2.5-flash", "gemini-1.5-pro", "gemini-2.0-flash"]

    for model_name in test_models:
        # Skip candidates that have no known token limit.
        if model_name not in plugin.model_token_limits:
            continue

        # Switch the active model through the public config API.
        plugin.update_config({"model": model_name})

        print(f"\nSwitched to model: {model_name}")
        print(f"  Token limit: {plugin.model_token_limits[model_name]:,}")
        print(f"  Config max_output_tokens: {plugin.config.max_output_tokens:,}")
Testing model switching and token limit updates:
--------------------------------------------------

Switched to model: gemini-2.5-flash
  Token limit: 65,536
  Config max_output_tokens: 65,536

Switched to model: gemini-2.0-flash
  Token limit: 8,192
  Config max_output_tokens: 8,192
# Runtime override: run a transcription with a model other than the configured
# one and report what the plugin recorded.
if not os.environ.get("GEMINI_API_KEY"):
    print("Set GEMINI_API_KEY environment variable to test runtime override")
else:
    print("Testing runtime model override:")
    print("-" * 50)

    # AudioData was retired; execute() takes a decodable audio file path.
    from pathlib import Path as _P
    plugin.get_available_models()  # CR-11: load the live model list first
    test_audio = str(_P('..', 'test_files', 'short_test_audio.mp3').resolve())

    # Report the state before the override.
    print(f"Current model: {plugin.config.model}")
    print(f"Current max_output_tokens: {plugin.config.max_output_tokens:,}")

    # Pick whichever of the two candidate models is not currently configured.
    if plugin.config.model != "gemini-2.0-flash":
        override_model = "gemini-2.0-flash"
    else:
        override_model = "gemini-2.5-flash"

    if override_model in plugin.model_token_limits:
        print(f"\nExecuting with override model: {override_model}")
        print(f"Expected token limit: {plugin.model_token_limits[override_model]:,}")

        try:
            result = plugin.execute(
                test_audio,
                model=override_model,
                prompt="This is a test audio signal.",
            )

            print("\nTranscription metadata:")
            print(f"  Model used: {result.metadata['model']}")
            print(f"  Max output tokens: {result.metadata['max_output_tokens']:,}")

            # Show the configuration as it stands after the call.
            print("\nConfig after execution:")
            print(f"  Model: {plugin.config.model}")
            print(f"  Max output tokens: {plugin.config.max_output_tokens:,}")

        except Exception as e:
            # Demo cell: report the failure instead of aborting the notebook.
            print(f"Execution error: {e}")
Set GEMINI_API_KEY environment variable to test runtime override