# Transcription Storage


<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

## TranscriptionRow

`TranscriptionRow` is a dataclass representing a single row in the
standardized `transcriptions` table. It provides a type-safe way to work
with stored transcription results.

------------------------------------------------------------------------

<a
href="https://github.com/cj-mills/cjm-transcription-plugin-system/blob/main/cjm_transcription_plugin_system/storage.py#L19"
target="_blank" style="float:right; font-size:smaller">source</a>

### TranscriptionRow

``` python

def TranscriptionRow(
    job_id:str, audio_path:str, audio_hash:str, text:str, text_hash:str, segments:Optional=None,
    metadata:Optional=None, created_at:Optional=None
)->None:

```

*A single row from the transcriptions table.*

``` python
# Test TranscriptionRow creation
row = TranscriptionRow(
    job_id="job_abc123",
    audio_path="/tmp/test.mp3",
    audio_hash="sha256:" + "a" * 64,
    text="Hello world",
    text_hash="sha256:" + "b" * 64,
    segments=[{"start": 0.0, "end": 1.0, "text": "Hello world"}],
    metadata={"model": "whisper-large-v3"}
)

print(f"Row: job_id={row.job_id}, text={row.text[:20]}...")
print(f"Audio hash: {row.audio_hash[:20]}...")
print(f"Text hash: {row.text_hash[:20]}...")
```

    Row: job_id=job_abc123, text=Hello world...
    Audio hash: sha256:aaaaaaaaaaaaa...
    Text hash: sha256:bbbbbbbbbbbbb...

## TranscriptionStorage

`TranscriptionStorage` is the standardized SQLite storage that all
transcription plugins should use. It defines the canonical schema for the
`transcriptions` table, including content hash columns for traceability.

**Schema:**

``` sql
CREATE TABLE IF NOT EXISTS transcriptions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id TEXT UNIQUE NOT NULL,
    audio_path TEXT NOT NULL,
    audio_hash TEXT NOT NULL,
    text TEXT NOT NULL,
    text_hash TEXT NOT NULL,
    segments JSON,
    metadata JSON,
    created_at REAL NOT NULL
);
```

The `audio_hash` and `text_hash` columns use the self-describing
`"algo:hexdigest"` format (e.g., `"sha256:a3f2b8..."`), enabling
downstream consumers to verify content integrity.

------------------------------------------------------------------------

<a
href="https://github.com/cj-mills/cjm-transcription-plugin-system/blob/main/cjm_transcription_plugin_system/storage.py#L31"
target="_blank" style="float:right; font-size:smaller">source</a>

### TranscriptionStorage

``` python

def TranscriptionStorage(
    db_path:str, # Absolute path to the SQLite database file
):

```

*Standardized SQLite storage for transcription results.*

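## Testing

The examples below run in order and share state (`storage`, `tmp_db`,
`test_text`, and the hashes). They assume `sqlite3`, `hash_bytes`, and
`TranscriptionStorage` are already in scope; those imports are not shown
here.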

``` python
import tempfile
import os

# Create storage with temp database
tmp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
storage = TranscriptionStorage(tmp_db.name)

print(f"Storage initialized at: {tmp_db.name}")
```

    Storage initialized at: /tmp/tmp0clqja4v.db

``` python
# Save a transcription result with hashes
test_text = "Laying Plans Sun Tzu said, The art of war is of vital importance to the state."
text_hash = hash_bytes(test_text.encode())
audio_hash = "sha256:" + "e3b0c44298" * 6 + "e3b0"  # Simulated audio hash

storage.save(
    job_id="job_test_001",
    audio_path="/tmp/test_audio.mp3",
    audio_hash=audio_hash,
    text=test_text,
    text_hash=text_hash,
    segments=[{"start": 0.0, "end": 5.0, "text": test_text}],
    metadata={"model": "whisper-large-v3", "language": "en"}
)

print(f"Saved job_test_001")
print(f"Text hash: {text_hash}")
```

    Saved job_test_001
    Text hash: sha256:83efd1674de9fcf20e5c2edacf9246f7f34ad04bf07ddcb2b4e2765269e1edd1

``` python
# Retrieve by job ID
row = storage.get_by_job_id("job_test_001")
assert row is not None
assert row.job_id == "job_test_001"
assert row.text == test_text
assert row.text_hash == text_hash
assert row.audio_hash == audio_hash
assert row.segments is not None
assert row.metadata["model"] == "whisper-large-v3"
assert row.created_at is not None

print(f"Retrieved: {row.job_id}")
print(f"Text: {row.text[:40]}...")
print(f"Audio hash: {row.audio_hash[:30]}...")
print(f"Text hash: {row.text_hash[:30]}...")
print(f"Created at: {row.created_at}")
```

    Retrieved: job_test_001
    Text: Laying Plans Sun Tzu said, The art of wa...
    Audio hash: sha256:e3b0c44298e3b0c44298e3b...
    Text hash: sha256:83efd1674de9fcf20e5c2ed...
    Created at: 1770425259.7641876

``` python
# Missing job returns None
missing = storage.get_by_job_id("nonexistent")
assert missing is None
print("get_by_job_id returns None for missing job: OK")
```

    get_by_job_id returns None for missing job: OK

``` python
# Save another and test list_jobs
storage.save(
    job_id="job_test_002",
    audio_path="/tmp/test_audio_2.mp3",
    audio_hash="sha256:" + "f" * 64,
    text="Second transcription.",
    text_hash=hash_bytes(b"Second transcription.")
)

jobs = storage.list_jobs()
assert len(jobs) == 2
# Newest first
assert jobs[0].job_id == "job_test_002"
assert jobs[1].job_id == "job_test_001"

print(f"list_jobs returned {len(jobs)} rows (newest first): {[j.job_id for j in jobs]}")
```

    list_jobs returned 2 rows (newest first): ['job_test_002', 'job_test_001']

``` python
# Test text verification
assert storage.verify_text("job_test_001") == True
print("verify_text with unchanged text: True")

# Tamper with text directly in DB
with sqlite3.connect(tmp_db.name) as con:
    con.execute("UPDATE transcriptions SET text = 'TAMPERED' WHERE job_id = 'job_test_001'")

assert storage.verify_text("job_test_001") == False
print("verify_text after tampering: False")

# Missing job returns None
assert storage.verify_text("nonexistent") is None
print("verify_text for missing job: None")
```

    verify_text with unchanged text: True
    verify_text after tampering: False
    verify_text for missing job: None

``` python
# Cleanup
os.unlink(tmp_db.name)
print("Cleanup complete")
```

    Cleanup complete