SliceSpec Protocol
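All slice types implement to_slice_string(), which produces the string stored in SourceRef.segment_slice. Range and page slices use the format "type:params" (e.g., "char:0-500", "timestamp:10.5-30.0", "page:3"); FullContent is the exception and serializes as "full_<content_type>" (e.g., "full_text", "full_audio").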
source
SliceSpec
def SliceSpec(
args:VAR_POSITIONAL, kwargs:VAR_KEYWORD
):
Protocol for typed segment slice specifications.
Slice Types
Typed dataclasses for each content modality. Each serializes to a string format that can be stored in SourceRef.segment_slice and parsed back via parse_slice().
source
CharSlice
def CharSlice(
start:int , end:int
)-> None :
Character-range slice for text content.
source
TimestampSlice
def TimestampSlice(
start:float , end:float
)-> None :
Temporal slice for audio/video content.
source
FrameSlice
def FrameSlice(
start:int , end:int
)-> None :
Frame-range slice for video content.
source
PageSlice
def PageSlice(
page:int , bbox:Optional= None
)-> None :
Page slice for document content (PDFs, EPUBs).
source
LineSlice
def LineSlice(
start:int , end:int
)-> None :
Line-range slice for code or structured text.
source
FullContent
def FullContent(
content_type:str = 'text'
)-> None :
Reference to complete content (no slicing).
# Each slice instance is paired with the exact string its to_slice_string() must produce.
serialization_cases = [
    (CharSlice(0, 500), "char:0-500"),
    (TimestampSlice(10.5, 30.0), "timestamp:10.5-30.0"),
    (FrameSlice(0, 120), "frame:0-120"),
    (PageSlice(3), "page:3"),
    (PageSlice(3, bbox="10,20,300,400"), "page:3:bbox:10,20,300,400"),
    (LineSlice(10, 25), "line:10-25"),
    (FullContent("text"), "full_text"),
    (FullContent("audio"), "full_audio"),
]
for slice_obj, expected_string in serialization_cases:
    assert slice_obj.to_slice_string() == expected_string
print("All slice type serialization tests passed")
All slice type serialization tests passed
# Every slice class must pass an isinstance() check against the SliceSpec protocol.
slice_classes = (CharSlice, TimestampSlice, FrameSlice, PageSlice, LineSlice, FullContent)
for cls in slice_classes:
    # __new__ creates a bare instance without running __init__, so no
    # constructor arguments have to be supplied for the check.
    instance = cls.__new__(cls)
    assert isinstance(instance, SliceSpec), f" {cls.__name__} does not implement SliceSpec"
print("All slice types implement SliceSpec protocol")
All slice types implement SliceSpec protocol
parse_slice
Parses a slice string (as stored in SourceRef.segment_slice) back into a typed SliceSpec dataclass. Dispatches based on the string prefix and raises ValueError when the format is not recognized.
source
parse_slice
def parse_slice(
s:str , # Slice string to parse (e.g., "char:0-500", "timestamp:10.5-30.0")
)-> SliceSpec: # Parsed slice specification
Parse a slice string into a typed SliceSpec.
# Round-trip check: serialize each slice, parse the string back, and require
# the parsed object to compare equal to the one we started with.
test_cases = [
    CharSlice(0, 500),
    TimestampSlice(10.5, 30.0),
    FrameSlice(0, 120),
    PageSlice(3),
    PageSlice(3, bbox="10,20,300,400"),
    LineSlice(10, 25),
    FullContent("text"),
    FullContent("audio"),
]
for original in test_cases:
    s = original.to_slice_string()
    parsed = parse_slice(s)
    assert parsed == original, f"Round-trip failed for {s} : {parsed} != {original} "
    # Report the serialized form next to the class parse_slice returned for it
    parsed_type_name = type(parsed).__name__
    print(f" {s:40s} -> {parsed_type_name} ")
print(" \n All parse_slice round-trip tests passed")
char:0-500 -> CharSlice
timestamp:10.5-30.0 -> TimestampSlice
frame:0-120 -> FrameSlice
page:3 -> PageSlice
page:3:bbox:10,20,300,400 -> PageSlice
line:10-25 -> LineSlice
full_text -> FullContent
full_audio -> FullContent
All parse_slice round-trip tests passed
# Test parse_slice with unknown format: an unrecognized prefix must raise ValueError.
try:
    parse_slice("unknown:data")
except ValueError as e:
    print(f"Correctly raised ValueError: {e} ")
else:
    # Raise explicitly instead of using `assert False`: assert statements are
    # removed when Python runs with -O, which would let a missing ValueError
    # go unnoticed. AssertionError keeps the failure type unchanged.
    raise AssertionError("Should have raised ValueError")
Correctly raised ValueError: Unknown slice format: 'unknown:data'
# Integration test: use slice types with SourceRef
from cjm_graph_plugin_system.core import SourceRef
# Serialize a typed slice into the string form that SourceRef.segment_slice stores.
slice_spec = CharSlice(0, 500)
transcript_hash = SourceRef.compute_hash(b"first 500 chars of transcript")
ref = SourceRef(
    plugin_name="cjm-transcription-plugin-voxtral-hf",
    table_name="transcriptions",
    row_id="job_abc123",
    content_hash=transcript_hash,
    segment_slice=slice_spec.to_slice_string(),
)
print(f"SourceRef segment_slice: {ref.segment_slice} ")
assert ref.segment_slice == "char:0-500"

# Recover the typed slice from the stored string and verify its type and bounds.
parsed = parse_slice(ref.segment_slice)
assert isinstance(parsed, CharSlice)
assert (parsed.start, parsed.end) == (0, 500)
print("SourceRef integration test passed")
SourceRef segment_slice: char:0-500
SourceRef integration test passed