# segmentation


<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

## SegmentationService

This service wraps the NLTK text processing plugin to provide sentence
splitting. It converts raw text into `TextSegment` objects that can then
be refined in the UI.

------------------------------------------------------------------------

### SegmentationService

``` python

def SegmentationService(
    plugin_manager:PluginManager, # Plugin manager for accessing text plugin
    plugin_name:str='cjm-text-plugin-nltk', # Name of the text processing plugin
):

```

*Service for text segmentation via NLTK plugin.*

## Segment Manipulation Helpers

These functions support the UI operations for splitting and merging
segments, and for reindexing them so that segment indices stay
sequential.

------------------------------------------------------------------------

### split_segment_at_position

``` python

def split_segment_at_position(
    segment:TextSegment, # Segment to split
    char_position:int, # Character position to split at (relative to segment text)
)->tuple: # Two new segments

```

*Split a segment into two at the given character position.*

------------------------------------------------------------------------

### merge_text_segments

``` python

def merge_text_segments(
    first:TextSegment, # First segment (earlier in sequence)
    second:TextSegment, # Second segment (later in sequence)
    separator:str=' ', # Text separator between segments
)->TextSegment: # Merged segment

```

*Merge two adjacent segments into one.*

------------------------------------------------------------------------

### reindex_segments

``` python

def reindex_segments(
    segments:List, # List of segments to reindex
)->List: # Segments with corrected indices

```

*Reindex segments to have sequential indices starting from 0.*

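## Source Block Reconstruction

This helper rebuilds source blocks from serialized working segments, so
that each block carries the combined text of its segments.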

------------------------------------------------------------------------

### reconstruct_source_blocks

``` python

def reconstruct_source_blocks(
    segment_dicts:List, # Serialized working segments
)->List: # Reconstructed source blocks with combined text

```

*Reconstruct source blocks by grouping segments by `source_id` and
combining their text.*

## Tests

The following cells demonstrate the segmentation service and helper
functions.

``` python
# Test split_segment_at_position
segment = TextSegment(
    index=0,
    text="The art of war is of vital importance to the state.",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=0,
    end_char=51
)

# Split at position 18 (after "The art of war is ")
first, second = split_segment_at_position(segment, 18)
print(f"Original: '{segment.text}'")
print(f"First:    '{first.text}' (chars {first.start_char}-{first.end_char})")
print(f"Second:   '{second.text}' (chars {second.start_char}-{second.end_char})")
```

    Original: 'The art of war is of vital importance to the state.'
    First:    'The art of war is' (chars 0-18)
    Second:   'of vital importance to the state.' (chars 18-51)

``` python
# Test merge_text_segments
seg1 = TextSegment(
    index=0,
    text="The art of war",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=0,
    end_char=14,
)

seg2 = TextSegment(
    index=1,
    text="is of vital importance to the state.",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=15,
    end_char=51,
)

merged = merge_text_segments(seg1, seg2)
print(f"Segment 1: '{seg1.text}'")
print(f"Segment 2: '{seg2.text}'")
print(f"Merged:    '{merged.text}'")
print(f"Char range: {merged.start_char} - {merged.end_char}")
```

    Segment 1: 'The art of war'
    Segment 2: 'is of vital importance to the state.'
    Merged:    'The art of war is of vital importance to the state.'
    Char range: 0 - 51

``` python
# Test reindex_segments
segments = [
    TextSegment(index=5, text="First"),
    TextSegment(index=10, text="Second"),
    TextSegment(index=3, text="Third")
]

print("Before reindex:")
for s in segments:
    print(f"  index={s.index}: '{s.text}'")

reindex_segments(segments)

print("\nAfter reindex:")
for s in segments:
    print(f"  index={s.index}: '{s.text}'")
```

    Before reindex:
      index=5: 'First'
      index=10: 'Second'
      index=3: 'Third'

    After reindex:
      index=0: 'First'
      index=1: 'Second'
      index=2: 'Third'

``` python
# Test reconstruct_source_blocks
seg_dicts = [
    {"text": "First sentence.", "source_id": "job_1", "source_provider_id": "provider_a"},
    {"text": "Second sentence.", "source_id": "job_1", "source_provider_id": "provider_a"},
    {"text": "Third sentence.", "source_id": "job_2", "source_provider_id": "provider_b"},
]

blocks = reconstruct_source_blocks(seg_dicts)
assert len(blocks) == 2
assert blocks[0].id == "job_1"
assert blocks[0].provider_id == "provider_a"
assert blocks[0].text == "First sentence. Second sentence."
assert blocks[1].id == "job_2"
assert blocks[1].text == "Third sentence."

# Empty input
assert reconstruct_source_blocks([]) == []

# Missing source_id defaults to "unknown"
blocks = reconstruct_source_blocks([{"text": "orphan"}])
assert blocks[0].id == "unknown"

print("reconstruct_source_blocks tests passed")
```

    reconstruct_source_blocks tests passed

### SegmentationService with NLTK Plugin

These tests require the NLTK plugin (`cjm-text-plugin-nltk`) to be
installed and its manifest to be discoverable under `.cjm/manifests` in
the project root.

``` python
# Test SegmentationService with NLTK plugin
from pathlib import Path
from cjm_plugin_system.core.manager import PluginManager

# Calculate project root from notebook location (nbs/services/ -> project root)
project_root = Path.cwd().parent.parent
manifests_dir = project_root / ".cjm" / "manifests"

# Create plugin manager with explicit search path
manager = PluginManager(search_paths=[manifests_dir])
manager.discover_manifests()

print(f"Discovered {len(manager.discovered)} plugins from {manifests_dir}")

# Check if NLTK plugin is available
nltk_meta = manager.get_discovered_meta("cjm-text-plugin-nltk")
if nltk_meta:
    print(f"Found plugin: {nltk_meta.name} v{nltk_meta.version}")
else:
    print("NLTK plugin not found - install via plugins.yaml")
```

    [PluginManager] Discovered manifest: cjm-text-plugin-nltk from /mnt/SN850X_8TB_EXT4/Projects/GitHub/cj-mills/cjm-transcript-segmentation/.cjm/manifests/cjm-text-plugin-nltk.json

    Discovered 1 plugins from /mnt/SN850X_8TB_EXT4/Projects/GitHub/cj-mills/cjm-transcript-segmentation/.cjm/manifests
    Found plugin: cjm-text-plugin-nltk v0.0.2

``` python
# Initialize and test SegmentationService
if nltk_meta:
    # Load the plugin
    manager.load_plugin(nltk_meta, {"language": "english"})
    
    seg_service = SegmentationService(manager)
    print(f"Plugin available: {seg_service.is_available()}")
    
    # Test sentence splitting (use await directly - Jupyter supports top-level await)
    test_text = (
        "The art of war is of vital importance to the state. "
        "It is a matter of life and death, a road either to safety or to ruin. "
        "Hence it is a subject of inquiry which can on no account be neglected."
    )
    
    segments = await seg_service.split_sentences_async(
        text=test_text,
        source_id="test_job",
        source_provider_id="test"
    )
    
    print(f"\nSplit into {len(segments)} segments:")
    for seg in segments:
        print(f"  [{seg.index}] chars {seg.start_char}-{seg.end_char}: '{seg.text[:40]}...'")
```

    [PluginManager] Launching worker for cjm-text-plugin-nltk...

    [cjm-text-plugin-nltk] Starting worker on port 33111...
    [cjm-text-plugin-nltk] Logs: /home/innom-dt/.cjm/logs/cjm-text-plugin-nltk.log

    [PluginManager] HTTP Request: GET http://127.0.0.1:33111/health "HTTP/1.1 200 OK"

    [cjm-text-plugin-nltk] Worker ready.

    [PluginManager] HTTP Request: POST http://127.0.0.1:33111/initialize "HTTP/1.1 200 OK"
    [PluginManager] Loaded plugin: cjm-text-plugin-nltk
    [PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"

    Plugin available: True

    Split into 3 segments:
      [0] chars 0-51: 'The art of war is of vital importance to...'
      [1] chars 52-121: 'It is a matter of life and death, a road...'
      [2] chars 122-192: 'Hence it is a subject of inquiry which c...'

``` python
# Test split_combined_sources_async with multiple source blocks
from cjm_source_provider.models import SourceBlock

if nltk_meta and seg_service.is_available():
    # Create test source blocks
    blocks = [
        SourceBlock(
            id="job_1",
            provider_id="provider_a",
            text="Sun Tzu said the art of war is vital. It determines victory or defeat."
        ),
        SourceBlock(
            id="job_2",
            provider_id="provider_b",
            text="Know your enemy. Know yourself. A hundred battles, a hundred victories."
        )
    ]
    
    # Use await directly (Jupyter supports top-level await)
    all_segments = await seg_service.split_combined_sources_async(blocks)
    
    print(f"Combined {len(blocks)} blocks into {len(all_segments)} segments:")
    for seg in all_segments:
        print(f"  [{seg.index}] source={seg.source_id}: '{seg.text[:35]}...'")
```

    [PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"
    [PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"

    Combined 2 blocks into 5 segments:
      [0] source=job_1: 'Sun Tzu said the art of war is vita...'
      [1] source=job_1: 'It determines victory or defeat....'
      [2] source=job_2: 'Know your enemy....'
      [3] source=job_2: 'Know yourself....'
      [4] source=job_2: 'A hundred battles, a hundred victor...'

``` python
# Cleanup
if nltk_meta:
    manager.unload_all()
    print("Plugins unloaded")
```

    [PluginManager] HTTP Request: POST http://127.0.0.1:33111/cleanup "HTTP/1.1 200 OK"
    [PluginManager] Unloaded plugin: cjm-text-plugin-nltk

    Plugins unloaded
