utils

Text processing utilities for segmentation: word counting, position mapping, segment statistics, and source-boundary helpers.

Word Operations

count_words


def count_words(
    text:str, # Text to count words in
)->int: # Word count

Count the number of whitespace-delimited words in text.

Position Mapping

word_index_to_char_position


def word_index_to_char_position(
    text:str, # Full text
    word_index:int, # Word index (0-based, split happens before this word)
)->int: # Character position for split

Convert a word index to the character position where a split should occur.

Segment Statistics

calculate_segment_stats


def calculate_segment_stats(
    segments:List, # List of segments to analyze
)->Dict: # Statistics dictionary with total_words, total_segments

Calculate aggregate statistics for a list of segments.

Source Boundaries

get_source_boundaries


def get_source_boundaries(
    segments:List, # Ordered list of segments
)->Set: # Indices where source_id changes from the previous segment

Find indices where source_id changes between adjacent segments.

A boundary at index N means `segments[N].source_id` differs from `segments[N-1].source_id`; both values must be non-None for a boundary to exist.

get_source_count


def get_source_count(
    segments:List, # Ordered list of segments
)->int: # Number of unique non-None source_ids

Count the number of unique audio sources in the segment list.

get_source_position


def get_source_position(
    segments:List, # Ordered list of segments
    focused_index:int, # Index of the focused segment
)->Optional: # 1-based position in ordered unique sources, or None

Get the source position (1-based) of the focused segment.

Returns which source group the focused segment belongs to, based on order of first appearance. Returns None when `focused_index` is out of range (including when the segment list is empty).

Tests

# count_words: empty string, a single word, and a multi-word phrase.
assert all(
    count_words(sample) == expected
    for sample, expected in (
        ("", 0),
        ("hello", 1),
        ("The art of war", 4),
    )
)
print("count_words tests passed")

text = "The art of war is vital"

# Word index 0 is expected to map to character 0; an index far past the
# last word is expected to map to the end of the text.
assert all(
    word_index_to_char_position(text, word_idx) == char_pos
    for word_idx, char_pos in ((0, 0), (100, len(text)))
)
print("word_index_to_char_position tests passed")

from cjm_transcript_segmentation.models import TextSegment

# Three consecutively indexed segments holding 4 + 4 + 3 = 11 words in total.
test_segments = [
    TextSegment(index=position, text=phrase)
    for position, phrase in enumerate(
        ("The art of war", "is of vital importance", "to the state")
    )
]

stats = calculate_segment_stats(test_segments)
assert stats["total_segments"] == 3
assert stats["total_words"] == 11
print("calculate_segment_stats tests passed")

calculate_segment_stats tests passed

# get_source_boundaries checks. The three fixtures defined here are also used
# by the get_source_count and get_source_position checks further down.

# Every segment comes from the same source: no boundaries expected.
segs_single = [
    TextSegment(index=i, text=label, source_id="src1")
    for i, label in enumerate("abc")
]
assert get_source_boundaries(segs_single) == set()

# Source changes at index 2 (src1 -> src2) and at index 4 (src2 -> src3).
segs_multi = [
    TextSegment(index=i, text=label, source_id=src)
    for i, (label, src) in enumerate(
        zip("abcde", ("src1", "src1", "src2", "src2", "src3"))
    )
]
assert get_source_boundaries(segs_multi) == {2, 4}

# A None source_id on either side of a pair means no boundary is reported.
segs_none = [
    TextSegment(index=i, text=label, source_id=src)
    for i, (label, src) in enumerate(zip("abc", ("src1", None, "src2")))
]
assert get_source_boundaries(segs_none) == set()

# Empty input yields an empty set.
assert get_source_boundaries([]) == set()
print("get_source_boundaries tests passed")

# get_source_count: unique non-None source_ids per fixture
# (segs_none holds src1, None, src2, so it is expected to count 2).
assert all(
    get_source_count(segs) == expected
    for segs, expected in (
        (segs_single, 1),
        (segs_multi, 3),
        (segs_none, 2),
        ([], 0),
    )
)
print("get_source_count tests passed")

# get_source_position: 1-based position of the focused segment's source,
# ordered by first appearance in segs_multi (src1, src2, src3).
assert all(
    get_source_position(segs_multi, focused) == position
    for focused, position in (
        (0, 1),  # src1
        (1, 1),  # src1
        (2, 2),  # src2
        (4, 3),  # src3
    )
)
assert get_source_position(segs_multi, 99) is None  # out of bounds
assert get_source_position([], 0) is None
print("get_source_position tests passed")