#!/usr/bin/env python3
"""
Simple test script to verify tokenizer functionality.
This tests the core functions without launching the Gradio interface.
"""
import sys
import json

# Test imports
try:
    from transformers import AutoTokenizer
    print("✓ transformers imported successfully")
except ImportError as e:
    print(f"✗ Failed to import transformers: {e}")
    sys.exit(1)

try:
    import gradio as gr
    print("✓ gradio imported successfully")
except ImportError as e:
    print(f"✗ Failed to import gradio: {e}")
    sys.exit(1)
# Test basic tokenization
def test_basic_tokenization():
    """Test basic tokenization with a small model."""
    print("\n--- Testing Basic Tokenization ---")
    try:
        # Use GPT-2 as it's small and commonly available
        model_id = "openai-community/gpt2"
        text = "Hello, world! This is a test."

        print(f"Loading tokenizer: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("✓ Tokenizer loaded successfully")

        # Test encoding
        encoded = tokenizer.encode(text)
        print(f"✓ Text encoded: {encoded[:10]}...")  # Show first 10 tokens

        # Test decoding
        decoded = tokenizer.decode(encoded)
        print(f"✓ Text decoded: {decoded}")

        # Verify round-trip
        assert decoded == text, "Round-trip tokenization failed"
        print("✓ Round-trip tokenization successful")

        # Test token conversion
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        print(f"✓ Tokens: {tokens[:5]}...")  # Show first 5 tokens

        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        return False


def test_special_tokens():
    """Test special token handling."""
    print("\n--- Testing Special Tokens ---")
    try:
        model_id = "openai-community/gpt2"
        text = "Test text"

        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # With special tokens
        encoded_with = tokenizer.encode(text, add_special_tokens=True)
        # Without special tokens
        encoded_without = tokenizer.encode(text, add_special_tokens=False)

        print(f"✓ With special tokens: {len(encoded_with)} tokens")
        print(f"✓ Without special tokens: {len(encoded_without)} tokens")

        # Decode with and without special tokens
        decoded_with = tokenizer.decode(encoded_with, skip_special_tokens=False)
        decoded_without = tokenizer.decode(encoded_with, skip_special_tokens=True)

        print(f"✓ Decoded with special: {decoded_with}")
        print(f"✓ Decoded without special: {decoded_without}")

        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        return False


def test_app_functions():
    """Test the main app functions."""
    print("\n--- Testing App Functions ---")
    try:
        # Import app functions
        from app import tokenize_text, decode_tokens, analyze_vocabulary

        # Test tokenize_text
        print("Testing tokenize_text function...")
        result = tokenize_text(
            text="Hello world",
            model_id="openai-community/gpt2",
            add_special_tokens=True,
            show_special_tokens=True,
            custom_model_id=None
        )
        assert len(result) == 5, "tokenize_text should return 5 values"
        print("✓ tokenize_text function works")

        # Test decode_tokens
        print("Testing decode_tokens function...")
        decode_result = decode_tokens(
            token_ids_str="[15496, 11, 995]",  # "Hello, world" in GPT-2
            model_id="openai-community/gpt2",
            skip_special_tokens=False,
            custom_model_id=None
        )
        assert "Decoded Text:" in decode_result, "decode_tokens should return decoded text"
        print("✓ decode_tokens function works")

        # Test analyze_vocabulary
        print("Testing analyze_vocabulary function...")
        vocab_result = analyze_vocabulary(
            model_id="openai-community/gpt2",
            custom_model_id=None
        )
        assert "Vocabulary Size:" in vocab_result, "analyze_vocabulary should return vocabulary info"
        print("✓ analyze_vocabulary function works")

        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run all tests."""
    print("=" * 50)
    print("Tokenizer Playground Test Suite")
    print("=" * 50)

    tests = [
        test_basic_tokenization,
        test_special_tokens,
        test_app_functions
    ]

    results = []
    for test in tests:
        results.append(test())

    print("\n" + "=" * 50)
    print("Test Summary")
    print("=" * 50)

    passed = sum(results)
    total = len(results)
    print(f"Passed: {passed}/{total}")

    if passed == total:
        print("✓ All tests passed!")
        return 0
    else:
        print("✗ Some tests failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())