# Smoke tests for count_words: empty string, a single word, and a
# four-word sentence.
assert count_words("") == 0
assert count_words("hello") == 1
assert count_words("The art of war") == 4
print("count_words tests passed")

# utils
Text processing utilities for segmentation: word counting, position mapping, and statistics
Word Operations
count_words
def count_words(
text:str, # Text to count words in
)->int: # Word count
Count the number of whitespace-delimited words in text.
Position Mapping
word_index_to_char_position
def word_index_to_char_position(
text:str, # Full text
word_index:int, # Word index (0-based, split happens before this word)
)->int: # Character position for split
Convert a word index to the character position where a split should occur.
Segment Statistics
calculate_segment_stats
def calculate_segment_stats(
segments:List, # List of segments to analyze
)->Dict: # Statistics dictionary with total_words, total_segments
Calculate aggregate statistics for a list of segments.
Source Boundaries
get_source_boundaries
def get_source_boundaries(
segments:List, # Ordered list of segments
)->Set: # Indices where source_id changes from the previous segment
Find indices where source_id changes between adjacent segments.
A boundary at index N means segments[N].source_id differs from segments[N-1].source_id. Both source_ids must be non-None for a boundary to exist.
get_source_count
def get_source_count(
segments:List, # Ordered list of segments
)->int: # Number of unique non-None source_ids
Count the number of unique audio sources (distinct non-None source_ids) in the segment list.
get_source_position
def get_source_position(
segments:List, # Ordered list of segments
focused_index:int, # Index of the focused segment
)->Optional: # 1-based position in ordered unique sources, or None
Get the source position (1-based) of the focused segment.
Returns which source group the focused segment belongs to, numbered by order of first appearance, or None if focused_index is out of bounds.
Tests
# Tests for word_index_to_char_position: word index 0 is expected to map to
# character position 0, and an index far past the last word is expected to
# map to len(text).
text = "The art of war is vital"
assert word_index_to_char_position(text, 0) == 0
assert word_index_to_char_position(text, 100) == len(text)
print("word_index_to_char_position tests passed")

# TextSegment is used to build the fixtures for the segment-level tests below.
from cjm_transcript_segmentation.models import TextSegment
# Three segments holding 4 + 4 + 3 = 11 whitespace-delimited words in total.
test_segments = [
    TextSegment(index=0, text="The art of war"),
    TextSegment(index=1, text="is of vital importance"),
    TextSegment(index=2, text="to the state"),
]

stats = calculate_segment_stats(test_segments)
assert stats["total_segments"] == 3
assert stats["total_words"] == 11
print("calculate_segment_stats tests passed")
# Output: calculate_segment_stats tests passed
# Test get_source_boundaries

# Every segment shares one source, so no index marks a change.
segs_single = [
    TextSegment(index=pos, text=letter, source_id="src1")
    for pos, letter in enumerate("abc")
]
assert get_source_boundaries(segs_single) == set()

# Sources run src1, src1, src2, src2, src3: changes occur at indices 2 and 4.
segs_multi = [
    TextSegment(index=pos, text=letter, source_id=source)
    for pos, (letter, source) in enumerate(
        zip("abcde", ("src1", "src1", "src2", "src2", "src3"))
    )
]
assert get_source_boundaries(segs_multi) == {2, 4}

# A None source_id sits between src1 and src2; no boundary is expected on
# either side of it.
segs_none = [
    TextSegment(index=pos, text=letter, source_id=source)
    for pos, (letter, source) in enumerate(zip("abc", ("src1", None, "src2")))
]
assert get_source_boundaries(segs_none) == set()

# Empty input has no boundaries.
assert get_source_boundaries([]) == set()
print("get_source_boundaries tests passed")
# Test get_source_count
# Expected unique-source counts, in order: one source, three sources, two
# sources plus a None source_id (not counted), and an empty list.
assert [
    get_source_count(segs)
    for segs in (segs_single, segs_multi, segs_none, [])
] == [1, 3, 2, 0]
print("get_source_count tests passed")
# Test get_source_position
# In segs_multi, indices 0-1 are src1 (position 1), index 2 is src2
# (position 2), and index 4 is src3 (position 3).
assert [
    get_source_position(segs_multi, focused) for focused in (0, 1, 2, 4)
] == [1, 1, 2, 3]
# An out-of-bounds index and an empty segment list both yield None.
assert get_source_position(segs_multi, 99) is None
assert get_source_position([], 0) is None
print("get_source_position tests passed")