#!/usr/bin/env python3
"""
Simple test script to verify tokenizer functionality.
It exercises the core app functions without launching the Gradio interface.
"""

import sys

# Test imports
try:
    from transformers import AutoTokenizer
    print("βœ“ transformers imported successfully")
except ImportError as e:
    print(f"βœ— Failed to import transformers: {e}")
    sys.exit(1)

try:
    import gradio as gr
    print("βœ“ gradio imported successfully")
except ImportError as e:
    print(f"βœ— Failed to import gradio: {e}")
    sys.exit(1)

# Test basic tokenization
def test_basic_tokenization():
    """Test basic tokenization with a small model."""
    print("\n--- Testing Basic Tokenization ---")
    try:
        # Use GPT-2 as it's small and commonly available
        model_id = "openai-community/gpt2"
        text = "Hello, world! This is a test."

        print(f"Loading tokenizer: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("βœ“ Tokenizer loaded successfully")

        # Test encoding
        encoded = tokenizer.encode(text)
        print(f"βœ“ Text encoded: {encoded[:10]}...")  # Show first 10 tokens

        # Test decoding
        decoded = tokenizer.decode(encoded)
        print(f"βœ“ Text decoded: {decoded}")

        # Verify round-trip
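        # GPT-2 uses a lossless byte-level BPE with no text normalization, so
        # decode(encode(text)) should reproduce the input exactly; tokenizers
        # that normalize their input (e.g. lowercasing) would fail this check.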
        assert decoded == text, "Round-trip tokenization failed"
        print("βœ“ Round-trip tokenization successful")

        # Test token conversion
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        print(f"βœ“ Tokens: {tokens[:5]}...")  # Show first 5 tokens

        return True
    except Exception as e:
        print(f"βœ— Test failed: {e}")
        return False

def test_special_tokens():
    """Test special token handling."""
    print("\n--- Testing Special Tokens ---")
    try:
        model_id = "openai-community/gpt2"
        text = "Test text"

        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # With special tokens
        encoded_with = tokenizer.encode(text, add_special_tokens=True)
        # Without special tokens
        encoded_without = tokenizer.encode(text, add_special_tokens=False)
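        # Note: GPT-2's tokenizer does not add BOS/EOS tokens when encoding, so
        # the two counts printed below are expected to match for this model;
        # the difference shows up with models such as BERT, which add [CLS]/[SEP].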

        print(f"βœ“ With special tokens: {len(encoded_with)} tokens")
        print(f"βœ“ Without special tokens: {len(encoded_without)} tokens")

        # Decode with and without special tokens
        decoded_with = tokenizer.decode(encoded_with, skip_special_tokens=False)
        decoded_without = tokenizer.decode(encoded_with, skip_special_tokens=True)

        print(f"βœ“ Decoded with special: {decoded_with}")
        print(f"βœ“ Decoded without special: {decoded_without}")

        return True
    except Exception as e:
        print(f"βœ— Test failed: {e}")
        return False

def test_app_functions():
    """Test the main app functions."""
    print("\n--- Testing App Functions ---")
    try:
        # Import app functions
        from app import tokenize_text, decode_tokens, analyze_vocabulary

        # Test tokenize_text
        print("Testing tokenize_text function...")
        result = tokenize_text(
            text="Hello world",
            model_id="openai-community/gpt2",
            add_special_tokens=True,
            show_special_tokens=True,
            custom_model_id=None
        )
        assert len(result) == 5, "tokenize_text should return 5 values"
        print("βœ“ tokenize_text function works")

        # Test decode_tokens
        print("Testing decode_tokens function...")
        decode_result = decode_tokens(
            token_ids_str="[15496, 11, 995]",  # "Hello, world" in GPT-2
            model_id="openai-community/gpt2",
            skip_special_tokens=False,
            custom_model_id=None
        )
        assert "Decoded Text:" in decode_result, "decode_tokens should return decoded text"
        print("βœ“ decode_tokens function works")

        # Test analyze_vocabulary
        print("Testing analyze_vocabulary function...")
        vocab_result = analyze_vocabulary(
            model_id="openai-community/gpt2",
            custom_model_id=None
        )
        assert "Vocabulary Size:" in vocab_result, "analyze_vocabulary should return vocabulary info"
        print("βœ“ analyze_vocabulary function works")

        return True
    except Exception as e:
        print(f"βœ— Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def main():
    """Run all tests."""
    print("=" * 50)
    print("Tokenizer Playground Test Suite")
    print("=" * 50)

    tests = [
        test_basic_tokenization,
        test_special_tokens,
        test_app_functions
    ]

    results = []
    for test in tests:
        results.append(test())

    print("\n" + "=" * 50)
    print("Test Summary")
    print("=" * 50)
    passed = sum(results)
    total = len(results)
    print(f"Passed: {passed}/{total}")

    if passed == total:
        print("βœ… All tests passed!")
        return 0
    else:
        print("❌ Some tests failed")
        return 1

if __name__ == "__main__":
    sys.exit(main())