cjm-transcription-plugin-system
A flexible plugin system for audio transcription intended to make it easy to add support for multiple backends.
Install
pip install cjm_transcription_plugin_system
Project Structure
nbs/
├── core.ipynb # DTOs for audio transcription with FileBackedDTO support for zero-copy transfer
├── forced_alignment_core.ipynb # Data structures for word-level forced alignment results
├── forced_alignment_interface.ipynb # Domain-specific plugin interface for word-level audio-text alignment
├── forced_alignment_storage.ipynb # Standardized SQLite storage for forced alignment results with content hashing
├── plugin_interface.ipynb # Domain-specific plugin interface for audio transcription
└── storage.ipynb # Standardized SQLite storage for transcription results with content hashing
Total: 6 notebooks
Module Dependencies
graph LR
core[core<br/>Core Data Structures]
forced_alignment_core[forced_alignment_core<br/>Forced Alignment Core]
forced_alignment_interface[forced_alignment_interface<br/>Forced Alignment Plugin Interface]
forced_alignment_storage[forced_alignment_storage<br/>Forced Alignment Storage]
plugin_interface[plugin_interface<br/>Transcription Plugin Interface]
storage[storage<br/>Transcription Storage]
forced_alignment_interface --> forced_alignment_core
forced_alignment_interface --> core
plugin_interface --> core
3 cross-module dependencies detected
CLI Reference
No CLI commands found in this project.
Module Overview
Detailed documentation for each module in the project:
Core Data Structures (core.ipynb)
DTOs for audio transcription with FileBackedDTO support for zero-copy transfer
Import
from cjm_transcription_plugin_system.core import (
AudioData,
TranscriptionResult
)
Classes
@dataclass
class AudioData:
"""
Container for raw audio data.
Implements FileBackedDTO for zero-copy transfer between Host and Worker processes.
"""
samples: np.ndarray # Audio sample data as numpy array
sample_rate: int # Sample rate in Hz (e.g., 16000, 44100)
def to_temp_file(self) -> str: # Absolute path to temporary WAV file
"""Save audio to a temp file for zero-copy transfer to Worker process."""
# Create temp file (delete=False so Worker can read it)
tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
# Ensure float32 format
audio = self.samples
if audio.dtype != np.float32
"Save audio to a temp file for zero-copy transfer to Worker process."
def to_dict(self) -> Dict[str, Any]: # Serialized representation
"""Convert to dictionary for smaller payloads."""
return {
"samples": self.samples.tolist(),
"Convert to dictionary for smaller payloads."
def from_file(
cls,
filepath: str # Path to audio file
) -> "AudioData": # AudioData instance
"Load audio from a file."
@dataclass
class TranscriptionResult:
"Standardized output for all transcription plugins."
text: str # The transcribed text
confidence: Optional[float] # Overall confidence (0.0 to 1.0)
segments: Optional[List[Dict[str, Any]]] # Timestamped segments
metadata: Dict[str, Any] = field(...) # Additional metadata
Forced Alignment Core (forced_alignment_core.ipynb)
Data structures for word-level forced alignment results
Import
from cjm_transcription_plugin_system.forced_alignment_core import (
ForcedAlignItem,
ForcedAlignResult
)
Classes
@dataclass
class ForcedAlignItem:
"A single word-level alignment result."
text: str # The aligned word (punctuation typically stripped by model)
start_time: float # Start time in seconds
end_time: float # End time in seconds
@dataclass
class ForcedAlignResult:
"Standardized output for all forced alignment plugins."
items: List[ForcedAlignItem] # Word-level alignments
metadata: Dict[str, Any] = field(...) # Plugin-specific metadata
Forced Alignment Plugin Interface (forced_alignment_interface.ipynb)
Domain-specific plugin interface for word-level audio-text alignment
Import
from cjm_transcription_plugin_system.forced_alignment_interface import (
ForcedAlignmentPlugin
)
Classes
class ForcedAlignmentPlugin(PluginInterface):
"""
Abstract base class for all forced alignment plugins.
Extends PluginInterface with forced-alignment-specific requirements:
- `supported_formats`: List of audio file extensions this plugin can handle
- `execute`: Accepts audio (AudioData or file path) and transcript text, returns ForcedAlignResult
NOTE: When running via RemotePluginProxy, AudioData objects are automatically
serialized to temp files via FileBackedDTO, so the Worker receives a file path.
"""
def supported_formats(self) -> List[str]: # e.g., ['wav', 'mp3', 'flac']
"""List of supported audio file extensions (without the dot)."""
...
@abstractmethod
def execute(
self,
audio: Union[AudioData, str, Path], # Audio data or file path
text: str, # Transcript text to align against
**kwargs
) -> ForcedAlignResult: # Word-level alignment result
"List of supported audio file extensions (without the dot)."
def execute(
self,
audio: Union[AudioData, str, Path], # Audio data or file path
text: str, # Transcript text to align against
**kwargs
) -> ForcedAlignResult: # Word-level alignment result
"Perform forced alignment of text against audio.
When called via Proxy, AudioData is auto-converted to a file path string
before reaching this method in the Worker process."
Forced Alignment Storage (forced_alignment_storage.ipynb)
Standardized SQLite storage for forced alignment results with content hashing
Import
from cjm_transcription_plugin_system.forced_alignment_storage import (
ForcedAlignmentRow,
ForcedAlignmentStorage
)
Classes
@dataclass
class ForcedAlignmentRow:
"A single row from the forced_alignments table."
job_id: str # Unique job identifier
audio_path: str # Path to the source audio file
audio_hash: str # Hash of source audio in "algo:hexdigest" format
text: str # Input transcript text that was aligned
text_hash: str # Hash of input text in "algo:hexdigest" format
items: Optional[List[Dict[str, Any]]] # Serialized ForcedAlignItems
metadata: Optional[Dict[str, Any]] # Plugin metadata
created_at: Optional[float] # Unix timestamp
class ForcedAlignmentStorage:
def __init__(
self,
db_path: str # Absolute path to the SQLite database file
)
"Standardized SQLite storage for forced alignment results."
def __init__(
self,
db_path: str # Absolute path to the SQLite database file
)
"Initialize storage and create table if needed."
def save(
self,
job_id: str, # Unique job identifier
audio_path: str, # Path to the source audio file
audio_hash: str, # Hash of source audio in "algo:hexdigest" format
text: str, # Input transcript text
text_hash: str, # Hash of input text in "algo:hexdigest" format
items: Optional[List[Dict[str, Any]]] = None, # Serialized ForcedAlignItems
metadata: Optional[Dict[str, Any]] = None # Plugin metadata
) -> None
"Save a forced alignment result to the database."
def get_by_job_id(
self,
job_id: str # Job identifier to look up
) -> Optional[ForcedAlignmentRow]: # Row or None if not found
"Retrieve a forced alignment result by job ID."
def list_jobs(
self,
limit: int = 100 # Maximum number of rows to return
) -> List[ForcedAlignmentRow]: # List of forced alignment rows
"List forced alignment jobs ordered by creation time (newest first)."
def verify_audio(
self,
job_id: str # Job identifier to verify
) -> Optional[bool]: # True if audio matches, False if tampered, None if job not found
"Verify the source audio file still matches its stored hash."
def verify_text(
self,
job_id: str # Job identifier to verify
) -> Optional[bool]: # True if text matches, False if tampered, None if job not found
"Verify the input text still matches its stored hash."
Transcription Plugin Interface (plugin_interface.ipynb)
Domain-specific plugin interface for audio transcription
Import
from cjm_transcription_plugin_system.plugin_interface import (
TranscriptionPlugin
)
Classes
class TranscriptionPlugin(PluginInterface):
"""
Abstract base class for all transcription plugins.
Extends PluginInterface with transcription-specific requirements:
- `supported_formats`: List of audio file extensions this plugin can handle
- `execute`: Accepts audio path (str) or AudioData, returns TranscriptionResult
NOTE: When running via RemotePluginProxy, AudioData objects are automatically
serialized to temp files via FileBackedDTO, so the Worker receives a file path.
"""
def supported_formats(self) -> List[str]: # e.g., ['wav', 'mp3', 'flac']
"""List of supported audio file extensions (without the dot)."""
...
@abstractmethod
def execute(
self,
audio: Union[AudioData, str, Path], # Audio data or file path
**kwargs
) -> TranscriptionResult: # Transcription result with text, confidence, segments
"List of supported audio file extensions (without the dot)."
def execute(
self,
audio: Union[AudioData, str, Path], # Audio data or file path
**kwargs
) -> TranscriptionResult: # Transcription result with text, confidence, segments
"Transcribe audio to text.
When called via Proxy, AudioData is auto-converted to a file path string
before reaching this method in the Worker process."
Transcription Storage (storage.ipynb)
Standardized SQLite storage for transcription results with content hashing
Import
from cjm_transcription_plugin_system.storage import (
TranscriptionRow,
TranscriptionStorage
)
Classes
@dataclass
class TranscriptionRow:
"A single row from the transcriptions table."
job_id: str # Unique job identifier
audio_path: str # Path to the source audio file
audio_hash: str # Hash of source audio in "algo:hexdigest" format
text: str # Transcribed text output
text_hash: str # Hash of transcribed text in "algo:hexdigest" format
segments: Optional[List[Dict[str, Any]]] # Timestamped segments
metadata: Optional[Dict[str, Any]] # Plugin metadata
created_at: Optional[float] # Unix timestamp
class TranscriptionStorage:
def __init__(
self,
db_path: str # Absolute path to the SQLite database file
)
"Standardized SQLite storage for transcription results."
def __init__(
self,
db_path: str # Absolute path to the SQLite database file
)
"Initialize storage and create table if needed."
def save(
self,
job_id: str, # Unique job identifier
audio_path: str, # Path to the source audio file
audio_hash: str, # Hash of source audio in "algo:hexdigest" format
text: str, # Transcribed text output
text_hash: str, # Hash of transcribed text in "algo:hexdigest" format
segments: Optional[List[Dict[str, Any]]] = None, # Timestamped segments
metadata: Optional[Dict[str, Any]] = None # Plugin metadata
) -> None
"Save a transcription result to the database."
def get_by_job_id(
self,
job_id: str # Job identifier to look up
) -> Optional[TranscriptionRow]: # Row or None if not found
"Retrieve a transcription result by job ID."
def list_jobs(
self,
limit: int = 100 # Maximum number of rows to return
) -> List[TranscriptionRow]: # List of transcription rows
"List transcription jobs ordered by creation time (newest first)."
def verify_audio(
self,
job_id: str # Job identifier to verify
) -> Optional[bool]: # True if audio matches, False if tampered, None if job not found
"Verify the source audio file still matches its stored hash."
def verify_text(
self,
job_id: str # Job identifier to verify
) -> Optional[bool]: # True if text matches, False if tampered, None if job not found
"Verify the transcription text still matches its stored hash."