Segment Slice Specifications

Typed slice dataclasses for specifying referenced content regions in SourceRef

SliceSpec Protocol

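All slice types implement to_slice_string(), which produces the string stored in SourceRef.segment_slice. Most slices use the "type:params" format (e.g., "char:0-500", "timestamp:10.5-30.0"); FullContent is the exception and serializes as "full_<content_type>" (e.g., "full_text", "full_audio").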


source

SliceSpec


def SliceSpec(
    args:VAR_POSITIONAL, kwargs:VAR_KEYWORD
):

Protocol for typed segment slice specifications.

Slice Types

Typed dataclasses for each content modality. Each serializes to a string format that can be stored in SourceRef.segment_slice and parsed back via parse_slice().


source

CharSlice


def CharSlice(
    start:int, end:int
)->None:

Character-range slice for text content.


source

TimestampSlice


def TimestampSlice(
    start:float, end:float
)->None:

Temporal slice for audio/video content.


source

FrameSlice


def FrameSlice(
    start:int, end:int
)->None:

Frame-range slice for video content.


source

PageSlice


def PageSlice(
    page:int, bbox:Optional=None
)->None:

Page slice for document content (PDFs, EPUBs).


source

LineSlice


def LineSlice(
    start:int, end:int
)->None:

Line-range slice for code or structured text.


source

FullContent


def FullContent(
    content_type:str='text'
)->None:

Reference to complete content (no slicing).

# Each slice type must serialize to its expected string form.
serialization_cases = [
    (CharSlice(0, 500), "char:0-500"),
    (TimestampSlice(10.5, 30.0), "timestamp:10.5-30.0"),
    (FrameSlice(0, 120), "frame:0-120"),
    (PageSlice(3), "page:3"),
    (PageSlice(3, bbox="10,20,300,400"), "page:3:bbox:10,20,300,400"),
    (LineSlice(10, 25), "line:10-25"),
    (FullContent("text"), "full_text"),
    (FullContent("audio"), "full_audio"),
]
for spec, expected in serialization_cases:
    assert spec.to_slice_string() == expected

print("All slice type serialization tests passed")
All slice type serialization tests passed
# Every slice type must pass an isinstance check against SliceSpec.
slice_types = (CharSlice, TimestampSlice, FrameSlice, PageSlice, LineSlice, FullContent)
for slice_type in slice_types:
    # Allocate via __new__ so the check runs on an instance without calling the constructor.
    bare = slice_type.__new__(slice_type)
    assert isinstance(bare, SliceSpec), f"{slice_type.__name__} does not implement SliceSpec"

print("All slice types implement SliceSpec protocol")
All slice types implement SliceSpec protocol

parse_slice

Parses a slice string (as stored in SourceRef.segment_slice) back into a typed SliceSpec dataclass. Dispatches based on the string prefix.


source

parse_slice


def parse_slice(
    s:str, # Slice string to parse (e.g., "char:0-500", "timestamp:10.5-30.0")
)->SliceSpec: # Parsed slice specification

Parse a slice string into a typed SliceSpec.

# Round-trip check: serialize each spec, parse the string back, compare with the source spec.
round_trip_specs = (
    CharSlice(0, 500),
    TimestampSlice(10.5, 30.0),
    FrameSlice(0, 120),
    PageSlice(3),
    PageSlice(3, bbox="10,20,300,400"),
    LineSlice(10, 25),
    FullContent("text"),
    FullContent("audio"),
)

for spec in round_trip_specs:
    encoded = spec.to_slice_string()
    decoded = parse_slice(encoded)
    assert decoded == spec, f"Round-trip failed for {encoded}: {decoded} != {spec}"
    # Report which concrete type each serialized string was parsed into.
    print(f"  {encoded:40s} -> {type(decoded).__name__}")

print("\nAll parse_slice round-trip tests passed")
  char:0-500                               -> CharSlice
  timestamp:10.5-30.0                      -> TimestampSlice
  frame:0-120                              -> FrameSlice
  page:3                                   -> PageSlice
  page:3:bbox:10,20,300,400                -> PageSlice
  line:10-25                               -> LineSlice
  full_text                                -> FullContent
  full_audio                               -> FullContent

All parse_slice round-trip tests passed
# An unrecognized slice string is expected to raise ValueError.
try:
    parse_slice("unknown:data")
except ValueError as err:
    print(f"Correctly raised ValueError: {err}")
else:
    # Reached only when no exception was raised at all.
    assert False, "Should have raised ValueError"
Correctly raised ValueError: Unknown slice format: 'unknown:data'
# Integration test: use slice types with SourceRef
from cjm_graph_plugin_system.core import SourceRef

# Serialize a typed slice and store the resulting string on a SourceRef
text_slice = CharSlice(0, 500)
transcript_hash = SourceRef.compute_hash(b"first 500 chars of transcript")
ref = SourceRef(
    plugin_name="cjm-transcription-plugin-voxtral-hf",
    table_name="transcriptions",
    row_id="job_abc123",
    content_hash=transcript_hash,
    segment_slice=text_slice.to_slice_string(),
)

print(f"SourceRef segment_slice: {ref.segment_slice}")
assert ref.segment_slice == "char:0-500"

# Recover the typed slice from the stored string and check its bounds
recovered = parse_slice(ref.segment_slice)
assert isinstance(recovered, CharSlice)
assert recovered.start == 0
assert recovered.end == 500

print("SourceRef integration test passed")
SourceRef segment_slice: char:0-500
SourceRef integration test passed