# NOTE(review): the following metadata was file-viewer chrome ("Spaces /
# Running / File size: 5,090 Bytes / af99c46 / line-number gutter") scraped
# along with the source; it is not part of the script and is kept only as a
# comment so the file parses.
#!/usr/bin/env python3
"""
Simple test script to verify tokenizer functionality.
This tests the core functions without launching the Gradio interface.
"""
import sys
import json
# Test imports: verify both required third-party packages are installed.
# Exit early with a clear message if either is missing, because every test
# below depends on them.
# NOTE(review): the "✓"/"✗" markers restore glyphs that were mojibake'd to
# "β" in the scraped copy — confirm against the original file.
try:
    from transformers import AutoTokenizer
    print("✓ transformers imported successfully")
except ImportError as e:
    print(f"✗ Failed to import transformers: {e}")
    sys.exit(1)

try:
    import gradio as gr
    print("✓ gradio imported successfully")
except ImportError as e:
    print(f"✗ Failed to import gradio: {e}")
    sys.exit(1)
# Test basic tokenization
def test_basic_tokenization():
    """Test the encode/decode round trip with a small, widely available model.

    Returns:
        bool: True when every tokenizer operation succeeds, False otherwise
        (failures are printed rather than raised so the suite keeps running).
    """
    print("\n--- Testing Basic Tokenization ---")
    try:
        # Use GPT-2 as it's small and commonly available.
        model_id = "openai-community/gpt2"
        text = "Hello, world! This is a test."

        print(f"Loading tokenizer: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("✓ Tokenizer loaded successfully")

        # Encode text -> token ids.
        encoded = tokenizer.encode(text)
        print(f"✓ Text encoded: {encoded[:10]}...")  # Show first 10 tokens

        # Decode token ids -> text.
        decoded = tokenizer.decode(encoded)
        print(f"✓ Text decoded: {decoded}")

        # GPT-2's byte-level BPE is lossless for this ASCII input, so the
        # round trip must reproduce the original string exactly.
        assert decoded == text, "Round-trip tokenization failed"
        print("✓ Round-trip tokenization successful")

        # Map ids back to their token strings for display.
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        print(f"✓ Tokens: {tokens[:5]}...")  # Show first 5 tokens
        return True
    except Exception as e:
        # Broad catch is deliberate: any failure marks this test as failed
        # without aborting the rest of the suite.
        print(f"✗ Test failed: {e}")
        return False
def test_special_tokens():
    """Test special-token handling on encode and decode.

    Compares encoding with/without ``add_special_tokens`` and decoding
    with/without ``skip_special_tokens``.

    Returns:
        bool: True when all operations succeed, False otherwise.
    """
    print("\n--- Testing Special Tokens ---")
    try:
        model_id = "openai-community/gpt2"
        text = "Test text"
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # With special tokens added by the tokenizer.
        encoded_with = tokenizer.encode(text, add_special_tokens=True)
        # Without special tokens.
        encoded_without = tokenizer.encode(text, add_special_tokens=False)
        print(f"✓ With special tokens: {len(encoded_with)} tokens")
        print(f"✓ Without special tokens: {len(encoded_without)} tokens")

        # Decode the same ids both keeping and skipping special tokens.
        # NOTE(review): both decodes intentionally use `encoded_with` so the
        # comparison isolates the effect of skip_special_tokens.
        decoded_with = tokenizer.decode(encoded_with, skip_special_tokens=False)
        decoded_without = tokenizer.decode(encoded_with, skip_special_tokens=True)
        print(f"✓ Decoded with special: {decoded_with}")
        print(f"✓ Decoded without special: {decoded_without}")
        return True
    except Exception as e:
        # Report and continue; the suite aggregates pass/fail booleans.
        print(f"✗ Test failed: {e}")
        return False
def test_app_functions():
    """Smoke-test the three public functions exposed by the app module.

    Exercises ``tokenize_text``, ``decode_tokens`` and ``analyze_vocabulary``
    with fixed GPT-2 inputs and checks the shape/markers of their results.

    Returns:
        bool: True when all three functions behave as expected.
    """
    print("\n--- Testing App Functions ---")
    try:
        # Import lazily so an import error in `app` fails only this test.
        from app import tokenize_text, decode_tokens, analyze_vocabulary

        # tokenize_text: expected to return a 5-tuple of display values.
        print("Testing tokenize_text function...")
        result = tokenize_text(
            text="Hello world",
            model_id="openai-community/gpt2",
            add_special_tokens=True,
            show_special_tokens=True,
            custom_model_id=None,
        )
        assert len(result) == 5, "tokenize_text should return 5 values"
        print("✓ tokenize_text function works")

        # decode_tokens: accepts a stringified id list and returns a report.
        print("Testing decode_tokens function...")
        decode_result = decode_tokens(
            token_ids_str="[15496, 11, 995]",  # "Hello, world" in GPT-2
            model_id="openai-community/gpt2",
            skip_special_tokens=False,
            custom_model_id=None,
        )
        assert "Decoded Text:" in decode_result, "decode_tokens should return decoded text"
        print("✓ decode_tokens function works")

        # analyze_vocabulary: returns a vocabulary summary string.
        print("Testing analyze_vocabulary function...")
        vocab_result = analyze_vocabulary(
            model_id="openai-community/gpt2",
            custom_model_id=None,
        )
        assert "Vocabulary Size:" in vocab_result, "analyze_vocabulary should return vocabulary info"
        print("✓ analyze_vocabulary function works")
        return True
    except Exception as e:
        # Print the full traceback here (unlike the other tests) because app
        # failures are the ones most likely to need debugging context.
        print(f"✗ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run every test and print a pass/fail summary.

    Returns:
        int: 0 when all tests pass, 1 otherwise (used as the process exit
        code by the ``__main__`` guard below).
    """
    print("=" * 50)
    print("Tokenizer Playground Test Suite")
    print("=" * 50)

    tests = [
        test_basic_tokenization,
        test_special_tokens,
        test_app_functions,
    ]
    # Each test returns a bool; run them all even if earlier ones fail.
    results = [test() for test in tests]

    print("\n" + "=" * 50)
    print("Test Summary")
    print("=" * 50)
    passed = sum(results)  # True counts as 1
    total = len(results)
    print(f"Passed: {passed}/{total}")

    if passed == total:
        # Fixed: the original print's string literal was split across two
        # lines (unterminated string — a syntax error in the scraped copy).
        print("✓ All tests passed!")
        return 0
    else:
        print("✗ Some tests failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())