Forced Alignment Plugin Interface

Domain-specific plugin interface for word-level audio-text alignment

ForcedAlignmentPlugin


def ForcedAlignmentPlugin(
    args:VAR_POSITIONAL, kwargs:VAR_KEYWORD
):

Abstract base class for all forced alignment plugins.

Extends PluginInterface with forced-alignment-specific requirements:

- supported_formats: List of audio file extensions this plugin can handle
- execute: Accepts an audio file path and transcript text, returns ForcedAlignResult

Input contract: plugins receive a path to a decodable audio file. Producing a model-ready file (format / sample-rate / channel normalization) is the caller’s responsibility — e.g. an upstream ffmpeg step in the orchestration pipeline — not the plugin’s. This keeps the interface library dependency-light.

How It Works

The Host submits an audio file path plus the transcript text; the Worker reads the file from disk and aligns the transcript to the audio:

Host / Orchestration                      Worker Process (Isolated Env)
+-------------------------+              +-----------------------------+
| # caller ensures the    |              |  ForcedAlignmentPlugin      |
| # audio file is model-  |   HTTP/JSON  |    .execute(                |
| # ready (e.g. ffmpeg)   | -------------+       audio="/tmp/seg.wav", |
| plugin.execute(         |  (path str)  |       text="Hello world"    |
|   audio="/tmp/seg.wav", |              |    )                        |
|   text="Hello world")   |              |  # reads file, aligns words |
+-------------------------+              +-----------------------------+

Audio preparation (format conversion, resampling, channel downmix) is an upstream pipeline concern, not the plugin’s.

Example Implementation

A minimal forced alignment plugin that demonstrates the interface:

from typing import Any, Dict, Optional
from cjm_transcription_plugin_system.forced_alignment_core import ForcedAlignItem

class ExampleForcedAlignmentPlugin(ForcedAlignmentPlugin):
    """Minimal reference implementation of a forced alignment plugin.

    The alignment is a mock: the audio path is accepted but never opened,
    and every whitespace-separated word of the transcript is given an equal
    share of an assumed 10-second clip.
    """

    def __init__(self):
        # Stays empty until initialize() is called.
        self._config: Dict[str, Any] = {}

    @property
    def name(self) -> str:
        """Identifier of this plugin."""
        return "example-forced-alignment"

    @property
    def version(self) -> str:
        """Version string of this plugin."""
        return "1.0.0"

    @property
    def supported_formats(self) -> List[str]:
        """Audio file extensions (without a leading dot) this plugin accepts."""
        return ["wav", "mp3", "flac"]

    def initialize(self, config: Optional[Dict[str, Any]] = None) -> None:
        """Store the plugin configuration, filling in defaults.

        Args:
            config: Optional settings. Keys that are omitted fall back to the
                defaults (currently only ``"language": "English"``), so a
                partial config behaves the same as an empty one for the
                missing keys.
        """
        # Merge into a fresh dict: defaults apply per key, and later changes
        # to the caller's dict cannot leak into the plugin state.
        self._config = {"language": "English", **(config or {})}

    def execute(
        self,
        audio: Union[str, Path],
        text: str,
        **kwargs
    ) -> ForcedAlignResult:
        """Mock alignment that assigns equal time to each word.

        Args:
            audio: Path to the audio file. Unused by this mock.
            text: Transcript; split on whitespace to obtain the words.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            A ForcedAlignResult with one item per word, plus metadata naming
            the mock model and the configured language.
        """
        words = text.split()
        duration = 10.0  # Assume 10 seconds of audio
        # max(..., 1) avoids dividing by zero for an empty transcript.
        time_per_word = duration / max(len(words), 1)

        items = [
            ForcedAlignItem(
                # Drop punctuation hugging the word so items carry bare text.
                text=word.strip('.,!?;:'),
                start_time=round(i * time_per_word, 2),
                end_time=round((i + 1) * time_per_word, 2),
            )
            for i, word in enumerate(words)
        ]

        return ForcedAlignResult(
            items=items,
            metadata={"model": "mock", "language": self._config.get("language")}
        )

    def get_config_schema(self) -> Dict[str, Any]:
        """Return a schema dict describing the configurable options."""
        return {
            "type": "object",
            "properties": {
                "language": {"type": "string", "default": "English"}
            }
        }

    def get_current_config(self) -> Dict[str, Any]:
        """Return a copy of the active configuration.

        A copy is returned so callers cannot change the plugin's state by
        mutating the result.
        """
        return dict(self._config)

    def cleanup(self) -> None:
        """Release resources; the mock holds none, so this is a no-op."""
        pass

# Smoke-test the example plugin: configure it, inspect it, run one alignment
plugin = ExampleForcedAlignmentPlugin()
plugin.initialize({"language": "English"})

# Identity and capability attributes exposed by the plugin
print(f"Plugin: {plugin.name} v{plugin.version}")
print(f"Supported formats: {plugin.supported_formats}")
print(f"Entry point group: {plugin.entry_point_group}")

# Align a five-word transcript; the mock never opens the audio path
result = plugin.execute("/tmp/audio.wav", text="Hello world how are you")
aligned_items = result.items
print(f"\nResult: {len(aligned_items)} items")
for item in aligned_items:
    print(f"  {item}")
print(f"Metadata: {result.metadata}")

# One item per whitespace-separated word, in transcript order
assert len(aligned_items) == 5
assert aligned_items[0].text == "Hello"
assert result.metadata["language"] == "English"

plugin.cleanup()

Plugin: example-forced-alignment v1.0.0
Supported formats: ['wav', 'mp3', 'flac']
Entry point group: transcription.forced_alignment_plugins

Result: 5 items
  ForcedAlignItem(text='Hello', start_time=0.0, end_time=2.0)
  ForcedAlignItem(text='world', start_time=2.0, end_time=4.0)
  ForcedAlignItem(text='how', start_time=4.0, end_time=6.0)
  ForcedAlignItem(text='are', start_time=6.0, end_time=8.0)
  ForcedAlignItem(text='you', start_time=8.0, end_time=10.0)
Metadata: {'model': 'mock', 'language': 'English'}