cjm-text-plugin-nltk
A local, NLTK-based text processing worker for the cjm-plugin-system that provides sentence splitting and tokenization with character-level span tracking.
Install
pip install cjm_text_plugin_nltk
Project Structure
nbs/
├── meta.ipynb # Metadata introspection for the NLTK text plugin used by cjm-ctl to generate the registration manifest.
└── plugin.ipynb # Plugin implementation for NLTK-based text processing with character-level span tracking
Total: 2 notebooks
Module Dependencies
graph LR
meta[meta<br/>Metadata]
plugin[plugin<br/>NLTK Plugin]
plugin --> meta
1 cross-module dependency detected
CLI Reference
No CLI commands found in this project.
Module Overview
Detailed documentation for each module in the project:
Metadata (meta.ipynb)
Metadata introspection for the NLTK text plugin used by cjm-ctl to generate the registration manifest.
Import
from cjm_text_plugin_nltk.meta import (
get_plugin_metadata
)
Functions
def get_plugin_metadata() -> Dict[str, Any]: # Plugin metadata for manifest generation
"""Return metadata required to register this plugin with the PluginManager."""
# Fallback base path (current behavior for backward compatibility)
base_path = os.path.dirname(os.path.dirname(sys.executable))
# Use CJM config if available, else fallback to env-relative paths
cjm_data_dir = os.environ.get("CJM_DATA_DIR")
# Plugin data directory
plugin_name = "cjm-text-plugin-nltk"
if cjm_data_dir
"Return metadata required to register this plugin with the PluginManager."
NLTK Plugin (plugin.ipynb)
Plugin implementation for NLTK-based text processing with character-level span tracking
Import
from cjm_text_plugin_nltk.plugin import (
NLTKPluginConfig,
NLTKPlugin
)
Classes
@dataclass
class NLTKPluginConfig:
"Configuration for NLTK text processing plugin."
tokenizer: str = field(...)
language: str = field(...)
class NLTKPlugin:
def __init__(self):
"""Initialize the NLTK plugin."""
self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
self.config: NLTKPluginConfig = None
"NLTK-based text processing plugin with character-level span tracking."
def __init__(self):
"""Initialize the NLTK plugin."""
self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
self.config: NLTKPluginConfig = None
"Initialize the NLTK plugin."
def name(self) -> str: # Plugin name identifier
"""Get the plugin name identifier."""
return "nltk_text"
@property
def version(self) -> str: # Plugin version string
"Get the plugin name identifier."
def version(self) -> str: # Plugin version string
"""Get the plugin version string."""
return "1.0.0"
def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
"Get the plugin version string."
def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
"""Return current configuration state."""
if not self.config
"Return current configuration state."
def get_config_schema(self) -> Dict[str, Any]: # JSON Schema for configuration
"""Return JSON Schema for UI generation."""
return dataclass_to_jsonschema(NLTKPluginConfig)
@staticmethod
def get_config_dataclass() -> NLTKPluginConfig: # Configuration dataclass
"Return JSON Schema for UI generation."
def get_config_dataclass() -> NLTKPluginConfig: # Configuration dataclass
"""Return dataclass describing the plugin's configuration options."""
return NLTKPluginConfig
def _ensure_nltk_data(self) -> None
"Return dataclass describing the plugin's configuration options."
def initialize(
self,
config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
"Initialize or re-configure the plugin (idempotent)."
def execute(
self,
action: str = "split_sentences", # Operation: 'split_sentences'
**kwargs
) -> Dict[str, Any]: # JSON-serializable result
"Execute a text processing operation."
def split_sentences(
self,
text: str, # Input text to split into sentences
**kwargs
) -> TextProcessResult: # Result with TextSpan objects containing character indices
"Split text into sentence spans with accurate character positions."
def cleanup(self) -> None
"Clean up resources."