Forced Alignment Storage

Standardized SQLite storage for forced alignment results with content hashing

ForcedAlignmentRow

A dataclass representing a single row in the standardized forced alignments table. This provides a type-safe way to work with stored forced alignment results.


source

ForcedAlignmentRow


def ForcedAlignmentRow(
    job_id:str, audio_path:str, audio_hash:str, text:str, text_hash:str, items:Optional=None, metadata:Optional=None,
    created_at:Optional=None
)->None:

A single row from the forced_alignments table.

# Build a ForcedAlignmentRow from an explicit field mapping and read it back
row_fields = {
    "job_id": "fa_job_abc123",
    "audio_path": "/tmp/test.mp3",
    "audio_hash": "sha256:" + "a" * 64,
    "text": "Hello world",
    "text_hash": "sha256:" + "b" * 64,
    "items": [{"text": "Hello", "start_time": 0.0, "end_time": 0.5}],
    "metadata": {"model": "qwen3-forced-aligner"},
}
# created_at is deliberately omitted so the field keeps its default
row = ForcedAlignmentRow(**row_fields)

# Only the first 20 characters of each hash are shown to keep the output short
print(f"Row: job_id={row.job_id}, text={row.text}")
print(f"Audio hash: {row.audio_hash[:20]}...")
print(f"Text hash: {row.text_hash[:20]}...")
Row: job_id=fa_job_abc123, text=Hello world
Audio hash: sha256:aaaaaaaaaaaaa...
Text hash: sha256:bbbbbbbbbbbbb...

ForcedAlignmentStorage

Standardized SQLite storage that all forced alignment plugins should use. Defines the canonical schema for the forced_alignments table with content hash columns for traceability.

Schema:

CREATE TABLE IF NOT EXISTS forced_alignments (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id TEXT UNIQUE NOT NULL,
    audio_path TEXT NOT NULL,
    audio_hash TEXT NOT NULL,
    text TEXT NOT NULL,
    text_hash TEXT NOT NULL,
    items JSON,
    metadata JSON,
    created_at REAL NOT NULL
);

The audio_hash and text_hash columns use the self-describing "algo:hexdigest" format (e.g., "sha256:a3f2b8..."), enabling downstream consumers to verify content integrity.


source

ForcedAlignmentStorage


def ForcedAlignmentStorage(
    db_path:str, # Absolute path to the SQLite database file
):

Standardized SQLite storage for forced alignment results.

Testing

import tempfile
import os

# Create storage with temp database.
# NamedTemporaryFile is used here only to reserve a unique path (delete=False
# keeps the file on disk). Close the handle immediately so it is not leaked
# and so the file can be unlinked during cleanup on platforms that refuse to
# delete files that are still open. The .name attribute stays valid after close.
tmp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
tmp_db.close()
storage = ForcedAlignmentStorage(tmp_db.name)

print(f"Storage initialized at: {tmp_db.name}")
Storage initialized at: /tmp/tmpwfon0jh4.db
# Persist one alignment result together with its content hashes
test_text = "November the 10th, Wednesday, 9 p.m."
text_hash = hash_bytes(test_text.encode())
audio_hash = "sha256:" + "e3b0c44298" * 6 + "e3b0"  # Simulated audio hash

# (word, start_time, end_time) triples, expanded below into item dicts
word_timings = [
    ("November", 1.04, 1.6),
    ("the", 1.6, 1.68),
    ("10th", 1.76, 2.08),
    ("Wednesday", 2.48, 3.04),
    ("9", 3.84, 4.16),
    ("pm", 4.16, 4.64),
]
test_items = [
    {"text": word, "start_time": start, "end_time": end}
    for word, start, end in word_timings
]

test_metadata = {
    "model_id": "Qwen/Qwen3-ForcedAligner-0.6B",
    "language": "English",
    "word_count": 6,
}

storage.save(
    job_id="fa_test_001",
    audio_path="/tmp/test_audio.mp3",
    audio_hash=audio_hash,
    text=test_text,
    text_hash=text_hash,
    items=test_items,
    metadata=test_metadata,
)

print(f"Saved fa_test_001")
print(f"Text hash: {text_hash}")
Saved fa_test_001
Text hash: sha256:bb6827d460a3f8dff6d8b5704e237e54141853a0c58d1d7aefa49417ec6e49ad
# Fetch the saved row by job ID and check that every field round-trips
row = storage.get_by_job_id("fa_test_001")
assert row is not None

# Scalar columns must come back exactly as they were saved
expected_fields = {
    "job_id": "fa_test_001",
    "text": test_text,
    "text_hash": text_hash,
    "audio_hash": audio_hash,
}
for field_name, expected_value in expected_fields.items():
    assert getattr(row, field_name) == expected_value

# items and metadata must come back as usable Python structures
assert row.items is not None
assert len(row.items) == 6
first_item = row.items[0]
assert first_item["text"] == "November"
assert first_item["start_time"] == 1.04
assert row.metadata["model_id"] == "Qwen/Qwen3-ForcedAligner-0.6B"
assert row.created_at is not None

print(f"Retrieved: {row.job_id}")
print(f"Text: {row.text}")
print(f"Items: {len(row.items)} words")
print(f"First item: {row.items[0]}")
print(f"Created at: {row.created_at}")
Retrieved: fa_test_001
Text: November the 10th, Wednesday, 9 p.m.
Items: 6 words
First item: {'text': 'November', 'start_time': 1.04, 'end_time': 1.6}
Created at: 1773970556.1129797
# Looking up a job ID that was never saved must yield None
unknown_job_id = "nonexistent"
missing = storage.get_by_job_id(unknown_job_id)
assert missing is None
print("get_by_job_id returns None for missing job: OK")
get_by_job_id returns None for missing job: OK
# Add a second row (no metadata this time), then exercise list_jobs
second_text = "Second alignment test."
storage.save(
    job_id="fa_test_002",
    audio_path="/tmp/test_audio_2.mp3",
    audio_hash="sha256:" + "f" * 64,
    text=second_text,
    text_hash=hash_bytes(second_text.encode()),
    items=[{"text": "Second", "start_time": 0.0, "end_time": 0.5}],
)

jobs = storage.list_jobs()
assert len(jobs) == 2
# Newest first: the row saved last must be listed before the earlier one
listed_ids = [job.job_id for job in jobs]
assert listed_ids == ["fa_test_002", "fa_test_001"]

print(f"list_jobs returned {len(jobs)} rows (newest first): {[j.job_id for j in jobs]}")
list_jobs returned 2 rows (newest first): ['fa_test_002', 'fa_test_001']
# Test text verification.
# verify_text has three outcomes in this example (True, False, None), so the
# checks use identity comparisons to tell them apart unambiguously.
assert storage.verify_text("fa_test_001") is True
print("verify_text with unchanged text: True")

# Tamper with text directly in DB.
# sqlite3's connection context manager only commits (or rolls back) the
# transaction; it does not close the connection. Close it explicitly so the
# handle is released before the database file is removed during cleanup.
con = sqlite3.connect(tmp_db.name)
try:
    with con:
        con.execute("UPDATE forced_alignments SET text = 'TAMPERED' WHERE job_id = 'fa_test_001'")
finally:
    con.close()

# The stored text no longer matches its recorded hash
assert storage.verify_text("fa_test_001") is False
print("verify_text after tampering: False")

# Missing job returns None
assert storage.verify_text("nonexistent") is None
print("verify_text for missing job: None")
verify_text with unchanged text: True
verify_text after tampering: False
verify_text for missing job: None
# Remove the temporary database file created for these examples
db_file = tmp_db.name
os.remove(db_file)
print("Cleanup complete")
Cleanup complete