#!/usr/bin/env python3
"""
Simple test script to verify tokenizer functionality.
It exercises the core app functions without launching the Gradio interface.
"""

import sys

# Test imports
try:
    from transformers import AutoTokenizer
    print("βœ“ transformers imported successfully")
except ImportError as e:
    print(f"βœ— Failed to import transformers: {e}")
    sys.exit(1)

try:
    import gradio as gr
    print("βœ“ gradio imported successfully")
except ImportError as e:
    print(f"βœ— Failed to import gradio: {e}")
    sys.exit(1)

# Test basic tokenization
def test_basic_tokenization():
    """Test basic tokenization with a small model."""
    print("\n--- Testing Basic Tokenization ---")
    try:
        # Use GPT-2 as it's small and commonly available
        model_id = "openai-community/gpt2"
        text = "Hello, world! This is a test."

        print(f"Loading tokenizer: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("βœ“ Tokenizer loaded successfully")

        # Test encoding
        encoded = tokenizer.encode(text)
        print(f"βœ“ Text encoded: {encoded[:10]}...")  # Show first 10 tokens

        # Test decoding
        decoded = tokenizer.decode(encoded)
        print(f"βœ“ Text decoded: {decoded}")

        # Verify round-trip
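        # GPT-2 uses a lossless byte-level BPE with no text normalization, so
        # decode(encode(text)) should reproduce the input exactly; tokenizers
        # that normalize their input (e.g. lowercasing) would fail this check.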
        assert decoded == text, "Round-trip tokenization failed"
        print("βœ“ Round-trip tokenization successful")

        # Test token conversion
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        print(f"βœ“ Tokens: {tokens[:5]}...")  # Show first 5 tokens

        return True
    except Exception as e:
        print(f"βœ— Test failed: {e}")
        return False

def test_special_tokens():
    """Test special token handling."""
    print("\n--- Testing Special Tokens ---")
    try:
        model_id = "openai-community/gpt2"
        text = "Test text"

        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # With special tokens
        encoded_with = tokenizer.encode(text, add_special_tokens=True)
        # Without special tokens
        encoded_without = tokenizer.encode(text, add_special_tokens=False)
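        # Note: GPT-2's tokenizer does not add BOS/EOS tokens when encoding, so
        # the two counts printed below are expected to match for this model;
        # the difference shows up with models such as BERT, which add [CLS]/[SEP].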

        print(f"βœ“ With special tokens: {len(encoded_with)} tokens")
        print(f"βœ“ Without special tokens: {len(encoded_without)} tokens")

        # Decode with and without special tokens
        decoded_with = tokenizer.decode(encoded_with, skip_special_tokens=False)
        decoded_without = tokenizer.decode(encoded_with, skip_special_tokens=True)

        print(f"βœ“ Decoded with special: {decoded_with}")
        print(f"βœ“ Decoded without special: {decoded_without}")

        return True
    except Exception as e:
        print(f"βœ— Test failed: {e}")
        return False

def test_app_functions():
    """Test the main app functions."""
    print("\n--- Testing App Functions ---")
    try:
        # Import app functions
        from app import tokenize_text, decode_tokens, analyze_vocabulary

        # Test tokenize_text
        print("Testing tokenize_text function...")
        result = tokenize_text(
            text="Hello world",
            model_id="openai-community/gpt2",
            add_special_tokens=True,
            show_special_tokens=True,
            custom_model_id=None
        )
        assert len(result) == 5, "tokenize_text should return 5 values"
        print("βœ“ tokenize_text function works")

        # Test decode_tokens
        print("Testing decode_tokens function...")
        decode_result = decode_tokens(
            token_ids_str="[15496, 11, 995]",  # "Hello, world" in GPT-2
            model_id="openai-community/gpt2",
            skip_special_tokens=False,
            custom_model_id=None
        )
        assert "Decoded Text:" in decode_result, "decode_tokens should return decoded text"
        print("βœ“ decode_tokens function works")

        # Test analyze_vocabulary
        print("Testing analyze_vocabulary function...")
        vocab_result = analyze_vocabulary(
            model_id="openai-community/gpt2",
            custom_model_id=None
        )
        assert "Vocabulary Size:" in vocab_result, "analyze_vocabulary should return vocabulary info"
        print("βœ“ analyze_vocabulary function works")

        return True
    except Exception as e:
        print(f"βœ— Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def main():
    """Run all tests."""
    print("=" * 50)
    print("Tokenizer Playground Test Suite")
    print("=" * 50)

    tests = [
        test_basic_tokenization,
        test_special_tokens,
        test_app_functions
    ]

    results = []
    for test in tests:
        results.append(test())

    print("\n" + "=" * 50)
    print("Test Summary")
    print("=" * 50)
    passed = sum(results)
    total = len(results)
    print(f"Passed: {passed}/{total}")

    if passed == total:
        print("βœ… All tests passed!")
        return 0
    else:
        print("❌ Some tests failed")
        return 1

if __name__ == "__main__":
    sys.exit(main())