#!/usr/bin/env python3
"""
Simple test script to verify tokenizer functionality.
This tests the core functions without launching the Gradio interface.
"""
import sys
import json
# Test imports
try:
    from transformers import AutoTokenizer
    print("✓ transformers imported successfully")
except ImportError as e:
    print(f"✗ Failed to import transformers: {e}")
    sys.exit(1)

try:
    import gradio as gr
    print("✓ gradio imported successfully")
except ImportError as e:
    print(f"✗ Failed to import gradio: {e}")
    sys.exit(1)

# Test basic tokenization
def test_basic_tokenization():
    """Test basic tokenization with a small model."""
    print("\n--- Testing Basic Tokenization ---")
    try:
        # Use GPT-2 as it's small and commonly available
        model_id = "openai-community/gpt2"
        text = "Hello, world! This is a test."
        print(f"Loading tokenizer: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("✓ Tokenizer loaded successfully")

        # Test encoding
        encoded = tokenizer.encode(text)
        print(f"✓ Text encoded: {encoded[:10]}...")  # Show first 10 tokens

        # Test decoding
        decoded = tokenizer.decode(encoded)
        print(f"✓ Text decoded: {decoded}")

        # Verify round-trip
        assert decoded == text, "Round-trip tokenization failed"
        print("✓ Round-trip tokenization successful")

        # Test token conversion
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        print(f"✓ Tokens: {tokens[:5]}...")  # Show first 5 tokens
        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        return False

def test_special_tokens():
    """Test special token handling."""
    print("\n--- Testing Special Tokens ---")
    try:
        model_id = "openai-community/gpt2"
        text = "Test text"
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # With special tokens
        encoded_with = tokenizer.encode(text, add_special_tokens=True)
        # Without special tokens
        encoded_without = tokenizer.encode(text, add_special_tokens=False)
        print(f"✓ With special tokens: {len(encoded_with)} tokens")
        print(f"✓ Without special tokens: {len(encoded_without)} tokens")

        # Decode with and without special tokens
        decoded_with = tokenizer.decode(encoded_with, skip_special_tokens=False)
        decoded_without = tokenizer.decode(encoded_with, skip_special_tokens=True)
        print(f"✓ Decoded with special: {decoded_with}")
        print(f"✓ Decoded without special: {decoded_without}")
        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        return False

def test_app_functions():
    """Test the main app functions."""
    print("\n--- Testing App Functions ---")
    try:
        # Import app functions
        from app import tokenize_text, decode_tokens, analyze_vocabulary

        # Test tokenize_text
        print("Testing tokenize_text function...")
        result = tokenize_text(
            text="Hello world",
            model_id="openai-community/gpt2",
            add_special_tokens=True,
            show_special_tokens=True,
            custom_model_id=None
        )
        assert len(result) == 5, "tokenize_text should return 5 values"
        print("✓ tokenize_text function works")

        # Test decode_tokens
        print("Testing decode_tokens function...")
        decode_result = decode_tokens(
            token_ids_str="[15496, 11, 995]",  # "Hello, world" in GPT-2
            model_id="openai-community/gpt2",
            skip_special_tokens=False,
            custom_model_id=None
        )
        assert "Decoded Text:" in decode_result, "decode_tokens should return decoded text"
        print("✓ decode_tokens function works")

        # Test analyze_vocabulary
        print("Testing analyze_vocabulary function...")
        vocab_result = analyze_vocabulary(
            model_id="openai-community/gpt2",
            custom_model_id=None
        )
        assert "Vocabulary Size:" in vocab_result, "analyze_vocabulary should return vocabulary info"
        print("✓ analyze_vocabulary function works")
        return True
    except Exception as e:
        print(f"✗ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def main():
    """Run all tests."""
    print("=" * 50)
    print("Tokenizer Playground Test Suite")
    print("=" * 50)

    tests = [
        test_basic_tokenization,
        test_special_tokens,
        test_app_functions,
    ]
    results = []
    for test in tests:
        results.append(test())

    print("\n" + "=" * 50)
    print("Test Summary")
    print("=" * 50)
    passed = sum(results)
    total = len(results)
    print(f"Passed: {passed}/{total}")

    if passed == total:
        print("✅ All tests passed!")
        return 0
    else:
        print("❌ Some tests failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())