# NLTK Plugin


<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

------------------------------------------------------------------------

<a
href="https://github.com/cj-mills/cjm-text-plugin-nltk/blob/main/cjm_text_plugin_nltk/plugin.py#L30"
target="_blank" style="float:right; font-size:smaller">source</a>

### NLTKPluginConfig

``` python

def NLTKPluginConfig(
    tokenizer:str='punkt', language:str='english'
)->None:

```

*Configuration for NLTK text processing plugin.*

------------------------------------------------------------------------

<a
href="https://github.com/cj-mills/cjm-text-plugin-nltk/blob/main/cjm_text_plugin_nltk/plugin.py#L50"
target="_blank" style="float:right; font-size:smaller">source</a>

### NLTKPlugin

``` python

def NLTKPlugin(
    
):

```

*NLTK-based text processing plugin with character-level span tracking.*

## Testing the Plugin

``` python
# Test basic functionality: create the plugin and print its name, version, and config class
plugin = NLTKPlugin()

print(f"Plugin name: {plugin.name}")
print(f"Plugin version: {plugin.version}")
print(f"Config class: {plugin.config_class.__name__}")

# Test configuration dataclass: list the choices stored in the `language`
# field's metadata under the SCHEMA_ENUM key.
# NOTE(review): NLTKPlugin, NLTKPluginConfig and SCHEMA_ENUM are not imported in
# this snippet -- presumably they come from an earlier notebook cell; confirm
# before copying the example into a standalone script.
from dataclasses import fields

print("Available languages:")
lang_field = next(f for f in fields(NLTKPluginConfig) if f.name == "language")
# The [] default means a field without SCHEMA_ENUM metadata simply prints nothing
for lang in lang_field.metadata.get(SCHEMA_ENUM, []):
    print(f"  - {lang}")

# Test initialization with an explicit language, then read back the active config
plugin.initialize({"language": "english"})

current_config = plugin.get_current_config()
print(f"Current config: {current_config}")

# Test get_config_schema for UI generation: pretty-print the returned schema as JSON
import json

schema = plugin.get_config_schema()
print("JSON Schema for NLTKPluginConfig:")
print(json.dumps(schema, indent=2))
```

    Plugin name: nltk_text
    Plugin version: 1.0.0
    Config class: NLTKPluginConfig
    Available languages:
      - english
      - german
      - french
      - spanish
      - italian
      - portuguese
      - dutch
    Current config: {'tokenizer': 'punkt', 'language': 'english'}
    JSON Schema for NLTKPluginConfig:
    {
      "name": "NLTKPluginConfig",
      "title": "NLTKPluginConfig",
      "description": "Configuration for NLTK text processing plugin.",
      "type": "object",
      "properties": {
        "tokenizer": {
          "type": "string",
          "title": "Tokenizer",
          "description": "NLTK tokenizer to use for sentence splitting",
          "enum": [
            "punkt"
          ],
          "default": "punkt"
        },
        "language": {
          "type": "string",
          "title": "Language",
          "description": "Language for tokenization (affects sentence boundary detection)",
          "enum": [
            "english",
            "german",
            "french",
            "spanish",
            "italian",
            "portuguese",
            "dutch"
          ],
          "default": "english"
        }
      }
    }

``` python
# Call split_sentences() directly on a short four-sentence sample
text = "Hello world. How are you? I am fine! This is a test."
result = plugin.split_sentences(text)

# Echo the input, the number of spans returned, and the result metadata
print(f"Input: '{text}'")
print(f"Spans found: {len(result.spans)}")
print(f"Metadata: {result.metadata}")

for idx, sentence in enumerate(result.spans):
    start, end = sentence.start_char, sentence.end_char
    print(f"  {idx}: '{sentence.text}' [{start}:{end}]")
    # The character offsets must slice the identical text back out of the input
    recovered = text[start:end]
    assert recovered == sentence.text, f"Mismatch at span {idx}"
```

    Input: 'Hello world. How are you? I am fine! This is a test.'
    Spans found: 4
    Metadata: {'processor': 'nltk_text', 'tokenizer': 'punkt', 'language': 'english', 'nltk_data_dir': None}
      0: 'Hello world.' [0:12]
      1: 'How are you?' [13:25]
      2: 'I am fine!' [26:36]
      3: 'This is a test.' [37:52]

``` python
# Test execute() dispatcher (as Worker would call it).
# The action is selected by name and the text is passed as a keyword argument.
json_result = plugin.execute(action="split_sentences", text=text)

# The result is indexed like a dict ('spans', 'metadata') rather than via
# attributes as with the split_sentences() result used earlier in this file.
# Plain string literal: there are no placeholders, so no f-prefix is needed.
print("JSON result from execute():")
print(f"  spans: {len(json_result['spans'])} items")
print(f"  metadata: {json_result['metadata']}")

for span_dict in json_result['spans']:
    # !r prints the repr of the text so the quotes around each sentence are visible
    print(f"    - {span_dict['text']!r} [{span_dict['start_char']}:{span_dict['end_char']}]")
```

    JSON result from execute():
      spans: 4 items
      metadata: {'processor': 'nltk_text', 'tokenizer': 'punkt', 'language': 'english', 'nltk_data_dir': None}
        - 'Hello world.' [0:12]
        - 'How are you?' [13:25]
        - 'I am fine!' [26:36]
        - 'This is a test.' [37:52]

``` python
# Exercise sentence splitting on text whose paragraphs are separated by blank lines
multi_text = """First paragraph. It has two sentences.

Second paragraph starts here. And continues here!

Third paragraph: What about questions? They work too."""

result = plugin.split_sentences(multi_text)
print(f"Multi-paragraph text - {len(result.spans)} sentences found:")
for idx, sentence in enumerate(result.spans):
    # Long sentences are cut to their first 50 characters plus an ellipsis
    if len(sentence.text) > 50:
        preview = sentence.text[:50] + "..."
    else:
        preview = sentence.text
    print(f"  {idx}: [{sentence.start_char:3d}:{sentence.end_char:3d}] {preview!r}")
```

    Multi-paragraph text - 6 sentences found:
      0: [  0: 16] 'First paragraph.'
      1: [ 17: 38] 'It has two sentences.'
      2: [ 40: 69] 'Second paragraph starts here.'
      3: [ 70: 89] 'And continues here!'
      4: [ 91:129] 'Third paragraph: What about questions?'
      5: [130:144] 'They work too.'

``` python
# Cleanup: call the plugin's cleanup() hook now that the examples are finished
plugin.cleanup()
```
