NLTK-based text processing plugin with character-level span tracking.
Testing the Plugin
# Test basic functionality: plugin identity and metadata.
plugin = NLTKPlugin()
print(f"Plugin name: {plugin.name}")
print(f"Plugin version: {plugin.version}")
print(f"Config class: {plugin.config_class.__name__}")

# Test configuration dataclass: enumerate the allowed languages declared
# in the field metadata under the SCHEMA_ENUM key.
from dataclasses import fields

print("Available languages:")
lang_field = next(f for f in fields(NLTKPluginConfig) if f.name == "language")
for lang in lang_field.metadata.get(SCHEMA_ENUM, []):
    print(f" - {lang}")

# Test initialization with an explicit language override.
plugin.initialize({"language": "english"})
current_config = plugin.get_current_config()
print(f"Current config: {current_config}")

# Test get_config_schema for UI generation (JSON-Schema-shaped dict).
import json

schema = plugin.get_config_schema()
print("JSON Schema for NLTKPluginConfig:")
print(json.dumps(schema, indent=2))
Plugin name: nltk_text
Plugin version: 1.0.0
Config class: NLTKPluginConfig
Available languages:
- english
- german
- french
- spanish
- italian
- portuguese
- dutch
Current config: {'tokenizer': 'punkt', 'language': 'english'}
JSON Schema for NLTKPluginConfig:
{
"name": "NLTKPluginConfig",
"title": "NLTKPluginConfig",
"description": "Configuration for NLTK text processing plugin.",
"type": "object",
"properties": {
"tokenizer": {
"type": "string",
"title": "Tokenizer",
"description": "NLTK tokenizer to use for sentence splitting",
"enum": [
"punkt"
],
"default": "punkt"
},
"language": {
"type": "string",
"title": "Language",
"description": "Language for tokenization (affects sentence boundary detection)",
"enum": [
"english",
"german",
"french",
"spanish",
"italian",
"portuguese",
"dutch"
],
"default": "english"
}
}
}
# Test split_sentences directly: each returned span must map back to the
# exact substring of the input via its character offsets.
text = "Hello world. How are you? I am fine! This is a test."
result = plugin.split_sentences(text)
print(f"Input: '{text}'")
print(f"Spans found: {len(result.spans)}")
print(f"Metadata: {result.metadata}")
for i, span in enumerate(result.spans):
    print(f" {i}: '{span.text}' [{span.start_char}:{span.end_char}]")
    # Verify mapping back to original
    assert text[span.start_char:span.end_char] == span.text, f"Mismatch at span {i}"
Input: 'Hello world. How are you? I am fine! This is a test.'
Spans found: 4
Metadata: {'processor': 'nltk_text', 'tokenizer': 'punkt', 'language': 'english', 'nltk_data_dir': None}
0: 'Hello world.' [0:12]
1: 'How are you?' [13:25]
2: 'I am fine!' [26:36]
3: 'This is a test.' [37:52]
# Test execute() dispatcher (as Worker would call it): the action string
# routes to split_sentences and the result is a JSON-serializable dict.
json_result = plugin.execute(action="split_sentences", text=text)
print("JSON result from execute():")
print(f" spans: {len(json_result['spans'])} items")
print(f" metadata: {json_result['metadata']}")
for span_dict in json_result['spans']:
    print(f" - {span_dict['text']!r} [{span_dict['start_char']}:{span_dict['end_char']}]")
JSON result from execute():
spans: 4 items
metadata: {'processor': 'nltk_text', 'tokenizer': 'punkt', 'language': 'english', 'nltk_data_dir': None}
- 'Hello world.' [0:12]
- 'How are you?' [13:25]
- 'I am fine!' [26:36]
- 'This is a test.' [37:52]
# Test with multi-paragraph text. NOTE(review): the export collapsed the
# blank lines inside this literal; the recorded offsets (38->40, 89->91,
# two chars each) show two "\n\n" paragraph separators, restored here.
multi_text = """First paragraph. It has two sentences.

Second paragraph starts here. And continues here!

Third paragraph: What about questions? They work too."""
result = plugin.split_sentences(multi_text)
print(f"Multi-paragraph text - {len(result.spans)} sentences found:")
for i, span in enumerate(result.spans):
    # Show first 50 chars of each span
    preview = span.text[:50] + "..." if len(span.text) > 50 else span.text
    print(f" {i}: [{span.start_char:3d}:{span.end_char:3d}] {preview!r}")
Multi-paragraph text - 6 sentences found:
0: [ 0: 16] 'First paragraph.'
1: [ 17: 38] 'It has two sentences.'
2: [ 40: 69] 'Second paragraph starts here.'
3: [ 70: 89] 'And continues here!'
4: [ 91:129] 'Third paragraph: What about questions?'
5: [130:144] 'They work too.'