segmentation

Segmentation service for text decomposition via NLTK plugin

SegmentationService

This service wraps the NLTK text processing plugin to provide sentence splitting functionality. It converts raw text into TextSegment objects for further refinement in the UI.


SegmentationService


def SegmentationService(
    plugin_manager:PluginManager, # Plugin manager for accessing text plugin
    plugin_name:str='cjm-text-plugin-nltk', # Name of the text processing plugin
):

Service for text segmentation via NLTK plugin.

Segment Manipulation Helpers

These functions support the UI operations for splitting, merging, and reordering segments.


split_segment_at_position


def split_segment_at_position(
    segment:TextSegment, # Segment to split
    char_position:int, # Character position to split at (relative to segment text)
)->tuple: # Two new segments

Split a segment into two at the given character position.


merge_text_segments


def merge_text_segments(
    first:TextSegment, # First segment (earlier in sequence)
    second:TextSegment, # Second segment (later in sequence)
    separator:str=' ', # Text separator between segments
)->TextSegment: # Merged segment

Merge two adjacent segments into one.


reindex_segments


def reindex_segments(
    segments:List, # List of segments to reindex
)->List: # Segments with corrected indices

Reindex segments to have sequential indices starting from 0.

Source Block Reconstruction


reconstruct_source_blocks


def reconstruct_source_blocks(
    segment_dicts:List, # Serialized working segments
)->List: # Reconstructed source blocks with combined text

Reconstruct source blocks by grouping segments by source_id and combining text.

Tests

The following cells demonstrate the segmentation service and helper functions.

# Test split_segment_at_position
original_seg = TextSegment(
    index=0,
    text="The art of war is of vital importance to the state.",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=0,
    end_char=51,
)

# Split at position 18 (after "The art of war is ")
left, right = split_segment_at_position(original_seg, 18)
print(f"Original: '{original_seg.text}'")
print(f"First:    '{left.text}' (chars {left.start_char}-{left.end_char})")
print(f"Second:   '{right.text}' (chars {right.start_char}-{right.end_char})")
Original: 'The art of war is of vital importance to the state.'
First:    'The art of war is' (chars 0-18)
Second:   'of vital importance to the state.' (chars 18-51)
# Test merge_text_segments
part_a = TextSegment(
    index=0,
    text="The art of war",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=0,
    end_char=14,
)
part_b = TextSegment(
    index=1,
    text="is of vital importance to the state.",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=15,
    end_char=51,
)

# Merge the adjacent pair and show the combined text and char span
combined = merge_text_segments(part_a, part_b)
print(f"Segment 1: '{part_a.text}'")
print(f"Segment 2: '{part_b.text}'")
print(f"Merged:    '{combined.text}'")
print(f"Char range: {combined.start_char} - {combined.end_char}")
Segment 1: 'The art of war'
Segment 2: 'is of vital importance to the state.'
Merged:    'The art of war is of vital importance to the state.'
Char range: 0 - 51
# Test reindex_segments
# Deliberately out-of-order, non-sequential indices
segments = [
    TextSegment(index=idx, text=label)
    for idx, label in [(5, "First"), (10, "Second"), (3, "Third")]
]

print("Before reindex:")
for item in segments:
    print(f"  index={item.index}: '{item.text}'")

# Indices are rewritten in place; list order is preserved
reindex_segments(segments)

print("\nAfter reindex:")
for item in segments:
    print(f"  index={item.index}: '{item.text}'")
Before reindex:
  index=5: 'First'
  index=10: 'Second'
  index=3: 'Third'

After reindex:
  index=0: 'First'
  index=1: 'Second'
  index=2: 'Third'
# Test reconstruct_source_blocks
working_segments = [
    {"text": "First sentence.", "source_id": "job_1", "source_provider_id": "provider_a"},
    {"text": "Second sentence.", "source_id": "job_1", "source_provider_id": "provider_a"},
    {"text": "Third sentence.", "source_id": "job_2", "source_provider_id": "provider_b"},
]

# Segments sharing a source_id collapse into one block with joined text
rebuilt = reconstruct_source_blocks(working_segments)
assert len(rebuilt) == 2
assert rebuilt[0].id == "job_1"
assert rebuilt[0].provider_id == "provider_a"
assert rebuilt[0].text == "First sentence. Second sentence."
assert rebuilt[1].id == "job_2"
assert rebuilt[1].text == "Third sentence."

# Empty input
assert reconstruct_source_blocks([]) == []

# Missing source_id defaults to "unknown"
orphan = reconstruct_source_blocks([{"text": "orphan"}])
assert orphan[0].id == "unknown"

print("reconstruct_source_blocks tests passed")
reconstruct_source_blocks tests passed

SegmentationService with NLTK Plugin

These tests require the NLTK plugin to be installed and discoverable.

# Test SegmentationService with NLTK plugin
from pathlib import Path
from cjm_plugin_system.core.manager import PluginManager

# Resolve project root from the notebook location (nbs/services/ -> project root)
project_root = Path.cwd().parents[1]
manifests_dir = project_root / ".cjm" / "manifests"

# Plugin manager scoped to this project's manifest directory
manager = PluginManager(search_paths=[manifests_dir])
manager.discover_manifests()

print(f"Discovered {len(manager.discovered)} plugins from {manifests_dir}")

# Report whether the NLTK plugin manifest was discovered
nltk_meta = manager.get_discovered_meta("cjm-text-plugin-nltk")
if not nltk_meta:
    print("NLTK plugin not found - install via plugins.yaml")
else:
    print(f"Found plugin: {nltk_meta.name} v{nltk_meta.version}")
[PluginManager] Discovered manifest: cjm-text-plugin-nltk from /mnt/SN850X_8TB_EXT4/Projects/GitHub/cj-mills/cjm-transcript-segmentation/.cjm/manifests/cjm-text-plugin-nltk.json
Discovered 1 plugins from /mnt/SN850X_8TB_EXT4/Projects/GitHub/cj-mills/cjm-transcript-segmentation/.cjm/manifests
Found plugin: cjm-text-plugin-nltk v0.0.2
# Initialize and test SegmentationService
if nltk_meta:
    # Load the plugin
    manager.load_plugin(nltk_meta, {"language": "english"})
    
    seg_service = SegmentationService(manager)
    print(f"Plugin available: {seg_service.is_available()}")
    
    # Test sentence splitting (use await directly - Jupyter supports top-level await)
    test_text = (
        "The art of war is of vital importance to the state. "
        "It is a matter of life and death, a road either to safety or to ruin. "
        "Hence it is a subject of inquiry which can on no account be neglected."
    )
    
    segments = await seg_service.split_sentences_async(
        text=test_text,
        source_id="test_job",
        source_provider_id="test"
    )
    
    print(f"\nSplit into {len(segments)} segments:")
    for seg in segments:
        print(f"  [{seg.index}] chars {seg.start_char}-{seg.end_char}: '{seg.text[:40]}...'")
[PluginManager] Launching worker for cjm-text-plugin-nltk...
[cjm-text-plugin-nltk] Starting worker on port 33111...
[cjm-text-plugin-nltk] Logs: /home/innom-dt/.cjm/logs/cjm-text-plugin-nltk.log
[PluginManager] HTTP Request: GET http://127.0.0.1:33111/health "HTTP/1.1 200 OK"
[cjm-text-plugin-nltk] Worker ready.
[PluginManager] HTTP Request: POST http://127.0.0.1:33111/initialize "HTTP/1.1 200 OK"
[PluginManager] Loaded plugin: cjm-text-plugin-nltk
[PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"
Plugin available: True

Split into 3 segments:
  [0] chars 0-51: 'The art of war is of vital importance to...'
  [1] chars 52-121: 'It is a matter of life and death, a road...'
  [2] chars 122-192: 'Hence it is a subject of inquiry which c...'
# Test split_combined_sources_async with multiple source blocks
from cjm_source_provider.models import SourceBlock

if nltk_meta and seg_service.is_available():
    # Create test source blocks
    blocks = [
        SourceBlock(
            id="job_1",
            provider_id="provider_a",
            text="Sun Tzu said the art of war is vital. It determines victory or defeat."
        ),
        SourceBlock(
            id="job_2",
            provider_id="provider_b",
            text="Know your enemy. Know yourself. A hundred battles, a hundred victories."
        )
    ]
    
    # Use await directly (Jupyter supports top-level await)
    all_segments = await seg_service.split_combined_sources_async(blocks)
    
    print(f"Combined {len(blocks)} blocks into {len(all_segments)} segments:")
    for seg in all_segments:
        print(f"  [{seg.index}] source={seg.source_id}: '{seg.text[:35]}...'")
[PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"
[PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"
Combined 2 blocks into 5 segments:
  [0] source=job_1: 'Sun Tzu said the art of war is vita...'
  [1] source=job_1: 'It determines victory or defeat....'
  [2] source=job_2: 'Know your enemy....'
  [3] source=job_2: 'Know yourself....'
  [4] source=job_2: 'A hundred battles, a hundred victor...'
# Cleanup
if nltk_meta:
    # Shut down every plugin worker started during these tests
    manager.unload_all()
    print("Plugins unloaded")
[PluginManager] HTTP Request: POST http://127.0.0.1:33111/cleanup "HTTP/1.1 200 OK"
[PluginManager] Unloaded plugin: cjm-text-plugin-nltk
Plugins unloaded