#!/usr/bin/env python3
"""
Simple test script to verify tokenizer functionality.
This tests the core functions without launching the Gradio interface.
"""
import sys
import json
# Test imports
try:
    from transformers import AutoTokenizer
    print("✓ transformers imported successfully")
except ImportError as e:
    print(f"✗ Failed to import transformers: {e}")
    sys.exit(1)

try:
    import gradio as gr
    print("✓ gradio imported successfully")
except ImportError as e:
    print(f"✗ Failed to import gradio: {e}")
    sys.exit(1)

# Test basic tokenization
def test_basic_tokenization():
    """Test basic tokenization with a small model."""
    print("\n--- Testing Basic Tokenization ---")
    try:
        # Use GPT-2 as it's small and commonly available
        model_id = "openai-community/gpt2"
        text = "Hello, world! This is a test."
        print(f"Loading tokenizer: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("✓ Tokenizer loaded successfully")

        # Test encoding
        encoded = tokenizer.encode(text)
        print(f"✓ Text encoded: {encoded[:10]}...")  # Show first 10 tokens

        # Test decoding
        decoded = tokenizer.decode(encoded)
        print(f"✓ Text decoded: {decoded}")

        # Verify round-trip
        assert decoded == text, "Round-trip tokenization failed"
        print("✓ Round-trip tokenization successful")

        # Test token conversion
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        print(f"✓ Tokens: {tokens[:5]}...")  # Show first 5 tokens
        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        return False

def test_special_tokens():
    """Test special token handling."""
    print("\n--- Testing Special Tokens ---")
    try:
        model_id = "openai-community/gpt2"
        text = "Test text"
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # With special tokens
        encoded_with = tokenizer.encode(text, add_special_tokens=True)
        # Without special tokens
        encoded_without = tokenizer.encode(text, add_special_tokens=False)
        print(f"✓ With special tokens: {len(encoded_with)} tokens")
        print(f"✓ Without special tokens: {len(encoded_without)} tokens")

        # Decode with and without special tokens
        decoded_with = tokenizer.decode(encoded_with, skip_special_tokens=False)
        decoded_without = tokenizer.decode(encoded_with, skip_special_tokens=True)
        print(f"✓ Decoded with special: {decoded_with}")
        print(f"✓ Decoded without special: {decoded_without}")
        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        return False

def test_app_functions():
    """Test the main app functions."""
    print("\n--- Testing App Functions ---")
    try:
        # Import app functions
        from app import tokenize_text, decode_tokens, analyze_vocabulary

        # Test tokenize_text
        print("Testing tokenize_text function...")
        result = tokenize_text(
            text="Hello world",
            model_id="openai-community/gpt2",
            add_special_tokens=True,
            show_special_tokens=True,
            custom_model_id=None
        )
        assert len(result) == 5, "tokenize_text should return 5 values"
        print("✓ tokenize_text function works")

        # Test decode_tokens
        print("Testing decode_tokens function...")
        decode_result = decode_tokens(
            token_ids_str="[15496, 11, 995]",  # "Hello, world" in GPT-2
            model_id="openai-community/gpt2",
            skip_special_tokens=False,
            custom_model_id=None
        )
        assert "Decoded Text:" in decode_result, "decode_tokens should return decoded text"
        print("✓ decode_tokens function works")

        # Test analyze_vocabulary
        print("Testing analyze_vocabulary function...")
        vocab_result = analyze_vocabulary(
            model_id="openai-community/gpt2",
            custom_model_id=None
        )
        assert "Vocabulary Size:" in vocab_result, "analyze_vocabulary should return vocabulary info"
        print("✓ analyze_vocabulary function works")
        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def main():
    """Run all tests."""
    print("=" * 50)
    print("Tokenizer Playground Test Suite")
    print("=" * 50)

    tests = [
        test_basic_tokenization,
        test_special_tokens,
        test_app_functions,
    ]
    results = []
    for test in tests:
        results.append(test())

    print("\n" + "=" * 50)
    print("Test Summary")
    print("=" * 50)
    passed = sum(results)
    total = len(results)
    print(f"Passed: {passed}/{total}")

    if passed == total:
        print("✅ All tests passed!")
        return 0
    else:
        print("❌ Some tests failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())