# Tests for count_tokens: number of whitespace-delimited tokens in a string.
assert count_tokens("") == 0
assert count_tokens("hello") == 1
assert count_tokens("hello world") == 2
# Leading/trailing/internal runs of whitespace must not produce empty tokens.
assert count_tokens(" spaced out ") == 2
print("count_tokens tests passed!")  # fixed: captured stdout had been fused onto this line
# NOTE(review): the lines below are docstring fragments from function
# definitions lost during extraction, plus one stray line of captured print
# output ("count_tokens tests passed!"). Preserved here as comments:
# - "Count the number of whitespace-delimited tokens in text."  (count_tokens)
# - "Split text into a list of whitespace-delimited tokens."
# - "Convert a token index to the character position where a split should occur."
#   (token_index_to_char_position)
# Tests for token_index_to_char_position: map a token index to the character
# offset at which a split should occur.
# Basic cases
assert token_index_to_char_position("hello world", 0) == 0
assert token_index_to_char_position("hello world", 1) == 6
assert token_index_to_char_position("hello world", 2) == 11  # past end -> len(text)
# Multi-word
assert token_index_to_char_position("the quick brown fox", 2) == 10
# Multiple spaces — fixed: the double space had been collapsed during
# extraction; with a single space the expected value would be 6 (see above),
# so the original input must have been "hello  world".
assert token_index_to_char_position("hello  world", 1) == 7
# Negative index clamps to the start of the string
assert token_index_to_char_position("hello", -1) == 0
print("token_index_to_char_position tests passed!")  # fixed: fused stdout removed
# NOTE(review): two near-duplicate docstring fragments for tokenize, whose
# definition was lost during extraction. Preserved here as comments:
# - "Converts a raw text string or pre-tokenized list into Token objects."
# - "Convert text or a pre-tokenized list into Token objects."
# Tests for tokenize: build Token objects (with .text, .index, .metadata)
# from either a raw string or a pre-tokenized list.
# From string
tokens = tokenize("hello world")
assert len(tokens) == 2
assert tokens[0].text == "hello" and tokens[0].index == 0
assert tokens[1].text == "world" and tokens[1].index == 1
assert tokens[0].metadata is None
# From pre-tokenized list
tokens = tokenize(["pre", "tokenized", "list"])
assert len(tokens) == 3
assert tokens[2].text == "list" and tokens[2].index == 2
# With metadata (one entry per token, aligned by position)
tokens = tokenize("hello world", metadata=["noun", "noun"])
assert tokens[0].metadata == "noun"
assert tokens[1].metadata == "noun"
# Empty inputs yield an empty token list
assert tokenize("") == []
assert tokenize([]) == []
# Metadata length mismatch must raise ValueError.
# Fixed: the try/except body indentation had been lost during extraction.
try:
    tokenize("hello world", metadata=["noun"])
    assert False, "Should have raised ValueError"
except ValueError:
    pass
print("tokenize tests passed!")  # fixed: fused stdout removed