Forced Alignment Plugin Interface

Domain-specific plugin interface for word-level audio-text alignment

source

ForcedAlignmentPlugin


class ForcedAlignmentPlugin(
    *args, **kwargs
):

Abstract base class for all forced alignment plugins.

Extends PluginInterface with forced-alignment-specific requirements:

- supported_formats: List of audio file extensions this plugin can handle
- execute: Accepts audio path and transcript text, returns ForcedAlignResult

NOTE: When running via RemotePluginProxy, AudioData objects are automatically serialized to temp files via FileBackedDTO, so the Worker receives a file path.

How It Works

Host Process                              Worker Process (Isolated Env)
+---------------------+                  +-----------------------------+
| audio = AudioData(  |                  |                             |
|   samples=np.array, |                  |  ForcedAlignmentPlugin      |
|   sample_rate=16000 |                  |    .execute(                |
| )                   |                  |       audio="/tmp/xyz.wav", |
|                     |                  |       text="Hello world"    |
| plugin.execute(     |   HTTP/JSON      |    )                        |
|   audio=audio,      | ---------------->|                             |
|   text="Hello world"|  (path string)   |  # audio is now a PATH      |
| )                   |                  |  # text passed as-is        |
|                     |                  |  # Plugin aligns words      |
| # Proxy detects     |                  |  # Returns ForcedAlignResult|
| # FileBackedDTO,    |                  |                             |
| # calls to_temp_file|                  |                             |
+---------------------+                  +-----------------------------+

The RemotePluginProxy automatically:

1. Detects that AudioData implements FileBackedDTO
2. Calls audio.to_temp_file() to save the audio to disk
3. Sends the file path string to the Worker
4. The Worker's execute() receives a path (not AudioData) plus the text string

Example Implementation

A minimal forced alignment plugin that demonstrates the interface:

from typing import Any, Dict, Optional
from cjm_transcription_plugin_system.forced_alignment_core import ForcedAlignItem

class ExampleForcedAlignmentPlugin(ForcedAlignmentPlugin):
    """Example implementation showing how to create a forced alignment plugin.

    The alignment is a mock: the audio argument is never read, and the
    transcript's words are spread evenly over an assumed 10-second clip.
    """

    def __init__(self):
        # Populated by initialize(); empty until then.
        self._config: Dict[str, Any] = {}

    @property
    def name(self) -> str:
        """Name of this plugin."""
        return "example-forced-alignment"

    @property
    def version(self) -> str:
        """Version string of this plugin."""
        return "1.0.0"

    @property
    def supported_formats(self) -> List[str]:
        """Audio file extensions this plugin can handle."""
        return ["wav", "mp3", "flac"]

    def initialize(self, config: Optional[Dict[str, Any]] = None) -> None:
        """Store the plugin configuration, filling in defaults.

        Args:
            config: Optional settings. Keys that are omitted fall back to the
                defaults advertised by get_config_schema(); None or an empty
                dict yields the defaults alone.
        """
        defaults: Dict[str, Any] = {"language": "English"}
        # Merge into a fresh dict: a partial config keeps the defaults for the
        # keys it omits, and later changes to the caller's dict cannot leak
        # into the plugin's state.
        self._config = {**defaults, **(config or {})}

    def execute(
        self,
        audio: Union[AudioData, str, Path],
        text: str,
        **kwargs
    ) -> ForcedAlignResult:
        """Mock alignment that assigns equal time to each word.

        Args:
            audio: Audio to align against. Ignored by this mock.
            text: Transcript; split on whitespace into words.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A ForcedAlignResult with one item per word (surrounding
            punctuation removed) and metadata naming the mock model and the
            configured language.
        """
        # Trim punctuation first and drop tokens that were punctuation only,
        # so no item is emitted with empty text.
        trimmed = (token.strip('.,!?;:') for token in text.split())
        words = [word for word in trimmed if word]
        duration = 10.0  # Assume 10 seconds of audio
        # max(..., 1) guards the division when the transcript has no words.
        time_per_word = duration / max(len(words), 1)

        items = [
            ForcedAlignItem(
                text=word,
                start_time=round(i * time_per_word, 2),
                end_time=round((i + 1) * time_per_word, 2),
            )
            for i, word in enumerate(words)
        ]

        return ForcedAlignResult(
            items=items,
            metadata={"model": "mock", "language": self._config.get("language")}
        )

    def get_config_schema(self) -> Dict[str, Any]:
        """Return a JSON-Schema-style description of the accepted configuration."""
        return {
            "type": "object",
            "properties": {
                "language": {"type": "string", "default": "English"}
            }
        }

    def get_current_config(self) -> Dict[str, Any]:
        """Return a copy of the active configuration."""
        # Shallow copy so callers cannot mutate the plugin's internal state.
        return dict(self._config)

    def cleanup(self) -> None:
        """Release resources; the mock holds none."""
        pass
# Smoke test for the example plugin: configure it, inspect it, run one alignment
plugin = ExampleForcedAlignmentPlugin()
plugin.initialize({"language": "English"})

# Identity and capabilities exposed through the plugin interface
print(f"Plugin: {plugin.name} v{plugin.version}")
print(f"Supported formats: {plugin.supported_formats}")
print(f"Entry point group: {plugin.entry_point_group}")

# Align a five-word transcript; the mock never opens the audio path
result = plugin.execute("/tmp/audio.wav", text="Hello world how are you")
print(f"\nResult: {len(result.items)} items")
for item in result.items:
    print(f"  {item}")
print(f"Metadata: {result.metadata}")

# Expect one item per word, in transcript order, with the configured
# language echoed back in the metadata
item_count = len(result.items)
first_item = result.items[0]
reported_language = result.metadata["language"]
assert item_count == 5
assert first_item.text == "Hello"
assert reported_language == "English"

plugin.cleanup()
Plugin: example-forced-alignment v1.0.0
Supported formats: ['wav', 'mp3', 'flac']
Entry point group: transcription.forced_alignment_plugins

Result: 5 items
  ForcedAlignItem(text='Hello', start_time=0.0, end_time=2.0)
  ForcedAlignItem(text='world', start_time=2.0, end_time=4.0)
  ForcedAlignItem(text='how', start_time=4.0, end_time=6.0)
  ForcedAlignItem(text='are', start_time=6.0, end_time=8.0)
  ForcedAlignItem(text='you', start_time=8.0, end_time=10.0)
Metadata: {'model': 'mock', 'language': 'English'}