Voxtral VLLM Plugin

Plugin implementation for Mistral Voxtral transcription through vLLM server

source

VLLMServer


def VLLMServer(
    model:str='mistralai/Voxtral-Mini-3B-2507', # Model name to serve
    port:int=8000, # Port for the server
    host:str='0.0.0.0', # Host address to bind to
    gpu_memory_utilization:float=0.85, # Fraction of GPU memory to use
    log_level:str='INFO', # Logging level (DEBUG, INFO, WARNING, ERROR)
    capture_logs:bool=True, # Whether to capture and display server logs
    **kwargs # Additional keyword arguments ("kwargs:VAR_KEYWORD" is the doc generator's rendering of **kwargs)
):

vLLM server manager for Voxtral models.


source

VoxtralVLLMPlugin


def VoxtralVLLMPlugin(
    
):

Mistral Voxtral transcription plugin via vLLM server.


source

VoxtralVLLMPluginConfig


def VoxtralVLLMPluginConfig(
    model_id:str='mistralai/Voxtral-Mini-3B-2507', device:str='cuda', server_mode:str='managed',
    server_url:str='http://localhost:8000', server_port:int=8000, gpu_memory_utilization:float=0.85,
    max_model_len:int=32768, language:Optional[str]='en', temperature:float=0.0, streaming:bool=False,
    server_startup_timeout:int=120, auto_start_server:bool=True, capture_server_logs:bool=True, dtype:str='auto',
    tensor_parallel_size:int=1
)->None:

Configuration for Voxtral VLLM transcription plugin.


source

VoxtralVLLMPlugin.execute_stream


def execute_stream(
    audio:Union, # Audio data or path to audio file (the Union's member types were lost in doc generation — see source for the exact accepted types)
    **kwargs # Additional keyword arguments ("kwargs:VAR_KEYWORD" is the doc generator's rendering of **kwargs)
)->Generator: # Yields text chunks, returns final result

Stream transcription results chunk by chunk.


source

VoxtralVLLMPlugin.supports_streaming


def supports_streaming(
    
)->bool: # True if streaming is supported

Check if this plugin supports streaming transcription.

Testing the Plugin

# Test basic functionality
plugin = VoxtralVLLMPlugin()

# Check availability
print(f"Voxtral VLLM available: {plugin.is_available()}")
print(f"Plugin name: {plugin.name}")
print(f"Plugin version: {plugin.version}")
print(f"Supported formats: {plugin.supported_formats}")
print(f"Supports streaming: {plugin.supports_streaming()}")
Voxtral VLLM available: True
Plugin name: voxtral_vllm
Plugin version: 1.0.0
Supported formats: ['wav', 'mp3', 'flac', 'm4a', 'ogg', 'webm', 'mp4', 'avi', 'mov']
Supports streaming: True
# Test configuration dataclass
from dataclasses import fields
# NOTE: SCHEMA_ENUM (the dataclass-metadata key holding the enum choices) must
# also be in scope — import it from the cjm_plugin_system schema utilities for
# this example to run as-is.

print("Available models:")
model_field = next(f for f in fields(VoxtralVLLMPluginConfig) if f.name == "model_id")
for model in model_field.metadata.get(SCHEMA_ENUM, []):
    print(f"  - {model}")

server_field = next(f for f in fields(VoxtralVLLMPluginConfig) if f.name == "server_mode")
print(f"\nServer modes: {server_field.metadata.get(SCHEMA_ENUM)}")
Available models:
  - mistralai/Voxtral-Mini-3B-2507
  - mistralai/Voxtral-Small-24B-2507

Server modes: ['managed', 'external']
# Test get_config_schema for UI generation
import json

schema = plugin.get_config_schema()
print("JSON Schema for VoxtralVLLMPluginConfig:")
print(f"  Name: {schema['name']}")
print(f"  Properties count: {len(schema['properties'])}")
print(f"  Model field enum: {schema['properties']['model_id'].get('enum', [])}")
print(f"\nSample properties:")
print(json.dumps({k: v for k, v in list(schema['properties'].items())[:3]}, indent=2))
JSON Schema for VoxtralVLLMPluginConfig:
  Name: VoxtralVLLMPluginConfig
  Properties count: 15
  Model field enum: ['mistralai/Voxtral-Mini-3B-2507', 'mistralai/Voxtral-Small-24B-2507']

Sample properties:
{
  "model_id": {
    "type": "string",
    "title": "Model ID",
    "description": "Voxtral model to use. Mini is faster, Small is more accurate.",
    "enum": [
      "mistralai/Voxtral-Mini-3B-2507",
      "mistralai/Voxtral-Small-24B-2507"
    ],
    "default": "mistralai/Voxtral-Mini-3B-2507"
  },
  "device": {
    "type": "string",
    "title": "Device",
    "description": "Device for inference (will use CUDA if available)",
    "enum": [
      "cuda"
    ],
    "default": "cuda"
  },
  "server_mode": {
    "type": "string",
    "title": "Server Mode",
    "description": "'managed': plugin manages server lifecycle, 'external': connect to existing server",
    "enum": [
      "managed",
      "external"
    ],
    "default": "managed"
  }
}
# Test configuration validation
from dataclasses import asdict
from cjm_plugin_system.utils.validation import extract_defaults
# NOTE: dict_to_config (used below) must also be imported from cjm_plugin_system
# for this example to run as-is.

plugin = VoxtralVLLMPlugin()

test_configs = [
    ({"model_id": "mistralai/Voxtral-Mini-3B-2507"}, "Valid config"),
    ({"model_id": "invalid_model"}, "Invalid model"),
    ({"server_port": 9000}, "Valid port change"),
    ({"temperature": 2.5}, "Temperature out of range"),
]

# Get defaults for merging
defaults = extract_defaults(VoxtralVLLMPluginConfig)

for config_update, description in test_configs:
    try:
        merged = {**defaults, **config_update}
        test_config = dict_to_config(VoxtralVLLMPluginConfig, merged, validate=True)
        print(f"{description}: Valid=True")
    except ValueError as e:
        print(f"{description}: Valid=False")
        print(f"  Error: {str(e)[:100]}")
Valid config: Valid=True
Invalid model: Valid=False
  Error: model_id: 'invalid_model' is not one of ['mistralai/Voxtral-Mini-3B-2507', 'mistralai/Voxtral-Small-
Valid port change: Valid=True
Temperature out of range: Valid=False
  Error: temperature: 2.5 is greater than maximum 2.0
# Test initialization with external server mode
plugin.initialize({
    "model_id": "mistralai/Voxtral-Mini-3B-2507",
    "server_mode": "external",
    "server_url": "http://localhost:8000"
})
print(f"Current config mode: {plugin.get_current_config().get('server_mode')}")
print(f"Current model: {plugin.get_current_config().get('model_id')}")
Current config mode: external
Current model: mistralai/Voxtral-Mini-3B-2507