NLTK Plugin

Plugin implementation for NLTK-based text processing with character-level span tracking

source

NLTKPluginConfig


def NLTKPluginConfig(
    tokenizer:str='punkt', language:str='english'
)->None:

Configuration for NLTK text processing plugin.


source

NLTKPlugin


def NLTKPlugin(
    
):

NLTK-based text processing plugin with character-level span tracking.

Testing the Plugin

# Smoke test: plugin identity, config metadata, initialization, and schema export
import json
from dataclasses import fields

plugin = NLTKPlugin()

# Identity attributes exposed on the plugin instance
print(f"Plugin name: {plugin.name}")
print(f"Plugin version: {plugin.version}")
config_cls = plugin.config_class
print(f"Config class: {config_cls.__name__}")

# List the language choices stored under SCHEMA_ENUM in the field's metadata
print("Available languages:")
language_field = next(
    candidate for candidate in fields(NLTKPluginConfig) if candidate.name == "language"
)
supported_languages = language_field.metadata.get(SCHEMA_ENUM, [])
for language_name in supported_languages:
    print(f"  - {language_name}")

# Initialize with an explicit language, then report the resulting configuration
plugin.initialize({"language": "english"})
print(f"Current config: {plugin.get_current_config()}")

# Fetch the config schema (intended for UI generation) and pretty-print it as JSON
config_schema = plugin.get_config_schema()
print("JSON Schema for NLTKPluginConfig:")
print(json.dumps(config_schema, indent=2))
Plugin name: nltk_text
Plugin version: 1.0.0
Config class: NLTKPluginConfig
Available languages:
  - english
  - german
  - french
  - spanish
  - italian
  - portuguese
  - dutch
Current config: {'tokenizer': 'punkt', 'language': 'english'}
JSON Schema for NLTKPluginConfig:
{
  "name": "NLTKPluginConfig",
  "title": "NLTKPluginConfig",
  "description": "Configuration for NLTK text processing plugin.",
  "type": "object",
  "properties": {
    "tokenizer": {
      "type": "string",
      "title": "Tokenizer",
      "description": "NLTK tokenizer to use for sentence splitting",
      "enum": [
        "punkt"
      ],
      "default": "punkt"
    },
    "language": {
      "type": "string",
      "title": "Language",
      "description": "Language for tokenization (affects sentence boundary detection)",
      "enum": [
        "english",
        "german",
        "french",
        "spanish",
        "italian",
        "portuguese",
        "dutch"
      ],
      "default": "english"
    }
  }
}
# Call split_sentences directly and check every span against the source text
text = "Hello world. How are you? I am fine! This is a test."
sentence_result = plugin.split_sentences(text)

print(f"Input: '{text}'")
print(f"Spans found: {len(sentence_result.spans)}")
print(f"Metadata: {sentence_result.metadata}")

for idx, sentence_span in enumerate(sentence_result.spans):
    # Read the offsets once; they are used for both the report and the check
    start, end = sentence_span.start_char, sentence_span.end_char
    print(f"  {idx}: '{sentence_span.text}' [{start}:{end}]")
    # Slicing the original text with the span offsets must reproduce the span text
    assert text[start:end] == sentence_span.text, f"Mismatch at span {idx}"
Input: 'Hello world. How are you? I am fine! This is a test.'
Spans found: 4
Metadata: {'processor': 'nltk_text', 'tokenizer': 'punkt', 'language': 'english', 'nltk_data_dir': None}
  0: 'Hello world.' [0:12]
  1: 'How are you?' [13:25]
  2: 'I am fine!' [26:36]
  3: 'This is a test.' [37:52]
# Exercise the execute() dispatcher the way a Worker would call it: the action
# is chosen by name, and the result is indexed by key below ('spans', 'metadata').
json_result = plugin.execute(action="split_sentences", text=text)

# Plain string literal: nothing is interpolated here, so no f-prefix is needed
print("JSON result from execute():")
print(f"  spans: {len(json_result['spans'])} items")
print(f"  metadata: {json_result['metadata']}")

# Each span is read by key; !r prints the sentence text in quoted form
for span_dict in json_result['spans']:
    print(f"    - {span_dict['text']!r} [{span_dict['start_char']}:{span_dict['end_char']}]")
JSON result from execute():
  spans: 4 items
  metadata: {'processor': 'nltk_text', 'tokenizer': 'punkt', 'language': 'english', 'nltk_data_dir': None}
    - 'Hello world.' [0:12]
    - 'How are you?' [13:25]
    - 'I am fine!' [26:36]
    - 'This is a test.' [37:52]
# Run the splitter over text whose paragraphs are separated by blank lines
multi_text = """First paragraph. It has two sentences.

Second paragraph starts here. And continues here!

Third paragraph: What about questions? They work too."""

paragraph_result = plugin.split_sentences(multi_text)
print(f"Multi-paragraph text - {len(paragraph_result.spans)} sentences found:")
for idx, sentence_span in enumerate(paragraph_result.spans):
    # Truncate long sentences to their first 50 characters plus an ellipsis
    preview = sentence_span.text
    if len(preview) > 50:
        preview = preview[:50] + "..."
    print(f"  {idx}: [{sentence_span.start_char:3d}:{sentence_span.end_char:3d}] {preview!r}")
Multi-paragraph text - 6 sentences found:
  0: [  0: 16] 'First paragraph.'
  1: [ 17: 38] 'It has two sentences.'
  2: [ 40: 69] 'Second paragraph starts here.'
  3: [ 70: 89] 'And continues here!'
  4: [ 91:129] 'Third paragraph: What about questions?'
  5: [130:144] 'They work too.'
# Cleanup: invoke the plugin's cleanup() hook now that every test above has run
plugin.cleanup()