ELIA / function_vectors /translate_prompts.py
aaron0eidt's picture
Deploy static demo
5b6c556
import os
import sys
from pathlib import Path
import requests
import json
import time
from tqdm import tqdm
# Add root project dir to path
sys.path.append(str(Path(__file__).parent.parent))
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES, FUNCTION_TYPES
# API configuration for Qwen.
#
# SECURITY: the API key is taken from the QWEN_API_KEY environment variable
# when it is set. The literal below is kept only as a backward-compatible
# fallback; a key committed to source control should be treated as leaked,
# so rotate it and remove the fallback once QWEN_API_KEY is exported wherever
# this script runs.
QWEN_API_CONFIG = {
    "api_key": os.environ.get("QWEN_API_KEY", "6e3def45d61b0b20547a1fcbab6464d8"),
    "api_endpoint": "https://chat-ai.academiccloud.de/v1",
    "model": "qwen2.5-vl-72b-instruct",
    "rate_limit_per_minute": 2,
}
# --- Translation Logic ---
def translate_text(text, target_language="German"):
    """Translate a single English string using the Qwen chat-completions API.

    Args:
        text: The English text to translate.
        target_language: Name of the language to translate into.

    Returns:
        * The translated string on success (one pair of surrounding double
          quotes is removed if the model added them).
        * ``"RATE_LIMIT_HOURLY:<seconds>"`` when the API answers 429 and its
          ``RateLimit-Reset`` header holds a positive integer.
        * ``"RATE_LIMIT_EXCEEDED"`` for any other 429 response.
        * ``None`` for request failures, other HTTP statuses, or a response
          body that does not have the expected shape.
    """
    headers = {
        "Authorization": f"Bearer {QWEN_API_CONFIG['api_key']}",
        "Content-Type": "application/json"
    }
    prompt = (
        f"Translate the following English text to {target_language}. "
        "Respond with ONLY the translated text, without any introductory phrases, "
        "explanations, or quotation marks. The original text is:\n\n"
        f"'{text}'"
    )
    data = {
        "model": QWEN_API_CONFIG["model"],
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 150,
        "temperature": 0.1,
    }

    try:
        response = requests.post(
            f"{QWEN_API_CONFIG['api_endpoint']}/chat/completions",
            headers=headers,
            json=data,
            timeout=60
        )
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

    if response.status_code == 429:
        # Handle rate limiting. A missing, non-numeric, or non-positive reset
        # header must NOT be reported as "wait 0 seconds", otherwise callers
        # retry immediately and hit the limit again.
        try:
            wait_seconds = int(response.headers.get('RateLimit-Reset'))
        except (TypeError, ValueError):
            wait_seconds = 0
        if wait_seconds > 0:
            print(f"Hourly rate limit reached. Waiting {wait_seconds} seconds for reset...")
            return f"RATE_LIMIT_HOURLY:{wait_seconds}"
        print("Rate limit exceeded. Waiting 60 seconds...")
        return "RATE_LIMIT_EXCEEDED"

    if response.status_code != 200:
        print(f"API Error: Status {response.status_code}, Response: {response.text}")
        return None

    try:
        result = response.json()
        translated_text = result["choices"][0]["message"]["content"].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # Body was not JSON or lacked choices[0].message.content as a string.
        print(f"API Error: unexpected response format ({e})")
        return None

    # Clean up quotes from the model's response.
    if translated_text.startswith('"') and translated_text.endswith('"'):
        translated_text = translated_text[1:-1]
    return translated_text
def translate_batch_texts(texts, target_language="German"):
    """Translate a batch of English strings in a single Qwen API call.

    The texts are sent as a numbered list and the reply is split back into
    one translation per non-empty line.

    Args:
        texts: List of English strings to translate.
        target_language: Name of the language to translate into. It is used
            throughout the prompt, so non-German targets are instructed
            consistently.

    Returns:
        * A list with exactly ``len(texts)`` entries on a 200 response. If the
          model returned too few lines, the missing tail is filled with
          ``"TRANSLATION_ERROR: <original text>"`` placeholders; extra lines
          are discarded. An empty ``texts`` returns ``[]`` without calling
          the API.
        * ``"RATE_LIMIT_HOURLY:<seconds>"`` when the API answers 429 and its
          ``RateLimit-Reset`` header holds a positive integer.
        * ``"RATE_LIMIT_EXCEEDED"`` for any other 429 response.
        * ``None`` for request failures, other HTTP statuses, or a response
          body that does not have the expected shape.
    """
    if not texts:
        return []

    headers = {
        "Authorization": f"Bearer {QWEN_API_CONFIG['api_key']}",
        "Content-Type": "application/json"
    }
    # A stronger prompt to ensure full translation.
    batch_prompt = (
        f"Translate the following English texts to {target_language}. "
        "For each text, translate ALL words and phrases, including any words in quotation marks, "
        f"into natural {target_language}. "
        "Do NOT leave any English words in the translation. "
        f"Respond with ONLY the {target_language} translations, one per line, in the same order.\n\n"
    )
    batch_prompt += "".join(f"{i}. {text}\n" for i, text in enumerate(texts, 1))
    batch_prompt += f"\nProvide the {target_language} translations in the same order, one per line:"
    data = {
        "model": QWEN_API_CONFIG["model"],
        "messages": [{"role": "user", "content": batch_prompt}],
        "max_tokens": 300,  # Increased for batch processing
        "temperature": 0.1,
    }

    try:
        response = requests.post(
            f"{QWEN_API_CONFIG['api_endpoint']}/chat/completions",
            headers=headers,
            json=data,
            timeout=60
        )
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

    if response.status_code == 429:
        # Handle rate limiting. A missing, non-numeric, or non-positive reset
        # header must NOT be reported as "wait 0 seconds", otherwise callers
        # retry immediately and hit the limit again.
        try:
            wait_seconds = int(response.headers.get('RateLimit-Reset'))
        except (TypeError, ValueError):
            wait_seconds = 0
        if wait_seconds > 0:
            print(f"Hourly rate limit reached. Waiting {wait_seconds} seconds for reset...")
            return f"RATE_LIMIT_HOURLY:{wait_seconds}"
        print("Rate limit exceeded. Waiting 60 seconds...")
        return "RATE_LIMIT_EXCEEDED"

    if response.status_code != 200:
        print(f"API Error: Status {response.status_code}, Response: {response.text}")
        return None

    try:
        result = response.json()
        translated_text = result["choices"][0]["message"]["content"].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # Body was not JSON or lacked choices[0].message.content as a string.
        print(f"API Error: unexpected response format ({e})")
        return None

    cleaned_translations = []
    for raw_line in translated_text.split('\n'):
        line = raw_line.strip()
        # Remove numbering if the model adds it. Only strip when everything
        # before the first period is digits ("3. Text"), so a sentence that
        # merely starts with a digit and contains a period later is kept whole.
        prefix, dot, remainder = line.partition('.')
        if dot and prefix.isdigit():
            line = remainder.strip()
        # Clean up quotes.
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1]
        if line:
            cleaned_translations.append(line)

    # Make sure we have the right number of translations.
    if len(cleaned_translations) >= len(texts):
        return cleaned_translations[:len(texts)]

    print(f"Warning: Expected {len(texts)} translations, got {len(cleaned_translations)}")
    # Pad with error messages if some translations failed.
    while len(cleaned_translations) < len(texts):
        cleaned_translations.append(f"TRANSLATION_ERROR: {texts[len(cleaned_translations)]}")
    return cleaned_translations
def update_multilingual_categories_file(new_categories):
    """Rewrite data/multilingual_function_categories.py with the given categories.

    The generated module contains a header docstring, the current
    ``FUNCTION_TYPES`` mapping, and ``FUNCTION_CATEGORIES`` set to
    ``new_categories`` serialized with ``json.dumps``.

    The content is written to a sibling ``.tmp`` file first and then renamed
    over the target, so an interrupted run cannot leave a half-written data
    module behind.

    Args:
        new_categories: JSON-serializable mapping to store as
            ``FUNCTION_CATEGORIES``.
    """
    file_path = Path(__file__).parent / "data" / "multilingual_function_categories.py"

    parts = [
        "# -*- coding: utf-8 -*-\n",
        '"""\nThis file contains the multilingual prompts for function vector analysis.\n',
        'It is automatically updated by the translate_prompts.py script.\n"""\n\n',
    ]

    # Format the FUNCTION_TYPES dictionary. json.dumps quotes the names and
    # escapes any embedded quotes or backslashes so the output stays valid.
    parts.append("FUNCTION_TYPES = {\n")
    for ft, cats in FUNCTION_TYPES.items():
        parts.append(f' {json.dumps(str(ft), ensure_ascii=False)}: [\n')
        for cat in cats:
            parts.append(f' {json.dumps(str(cat), ensure_ascii=False)},\n')
        parts.append(" ],\n")
    parts.append("}\n\n")

    # Add the function categories.
    parts.append(f"FUNCTION_CATEGORIES = {json.dumps(new_categories, indent=4, ensure_ascii=False)}\n")

    # Write to a temporary sibling and rename it into place.
    tmp_path = file_path.with_name(file_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    tmp_path.replace(file_path)

    print(f"\n✅ Progress saved to '{file_path}'")
def _translate_batch_with_retry(batch_prompts, max_retries=8):
    """Call translate_batch_texts, waiting and retrying while it reports a rate limit.

    Returns the first result that is not a rate-limit marker (a list of
    translations or None). If the API is still rate limited after
    ``max_retries`` waits, the last marker string is returned.
    """
    backoff_seconds = 60
    result = translate_batch_texts(batch_prompts)
    for _ in range(max_retries):
        if not isinstance(result, str):
            return result
        if result.startswith("RATE_LIMIT_HOURLY:"):
            try:
                wait_seconds = int(result.split(":", 1)[1])
            except ValueError:
                wait_seconds = 0
        elif result == "RATE_LIMIT_EXCEEDED":
            wait_seconds = 0
        else:
            return result

        if wait_seconds > 0:
            print(f"Waiting {wait_seconds} seconds for hourly rate limit reset...")
        else:
            # No usable reset time: back off, growing the wait by 1.5x each time.
            wait_seconds = backoff_seconds
            backoff_seconds = int(backoff_seconds * 1.5)
            print(f"Waiting for {wait_seconds} seconds due to rate limit...")
        time.sleep(wait_seconds)
        result = translate_batch_texts(batch_prompts)
    return result


def main():
    """Translate every English prompt that lacks a German version and save the result.

    Existing ``'de'`` lists are treated as already-done prefixes of the
    matching ``'en'`` lists, so the run resumes where a previous one stopped.
    Prompts are translated in batches of 6 with a 30 second pause before each
    batch. Progress is written to disk every 5 batches, whenever the run has
    to stop early, and once more at the end.
    """
    import copy  # Local import: only main() needs it.

    print("🚀 Starting batch translation of prompts to German...")

    # Deep copy so that filling in 'de' lists never mutates the imported data.
    translated_categories = copy.deepcopy(FUNCTION_CATEGORIES)

    # Count how many prompts need to be translated.
    total_prompts = sum(len(prompts.get('en', [])) for prompts in FUNCTION_CATEGORIES.values())

    with tqdm(total=total_prompts, desc="Translating Prompts") as pbar:
        # One pass: count what is already translated and queue what is missing.
        already_translated_count = 0
        pending_prompts = []
        pending_categories = []
        for category_key, data in FUNCTION_CATEGORIES.items():
            english_prompts = data.get('en', [])
            german_prompts = translated_categories[category_key].setdefault('de', [])
            already_translated_count += len(german_prompts)
            for english_prompt in english_prompts[len(german_prompts):]:
                pending_prompts.append(english_prompt)
                pending_categories.append(category_key)
        pbar.update(already_translated_count)

        # Process the prompts in batches.
        batch_size = 6
        save_every_batches = 5
        batch_starts = range(0, len(pending_prompts), batch_size)
        for batch_number, start in enumerate(batch_starts, 1):
            batch_prompts = pending_prompts[start:start + batch_size]
            batch_categories = pending_categories[start:start + batch_size]

            # Wait between batches to avoid hitting the rate limit.
            time.sleep(30)
            translated_batch = _translate_batch_with_retry(batch_prompts)

            if not (translated_batch and isinstance(translated_batch, list)):
                print("❌ Failed to translate batch. Stopping.")
                # Save any progress we made before stopping.
                update_multilingual_categories_file(translated_categories)
                return

            # Store translations in order, but stop at the first placeholder:
            # persisting "TRANSLATION_ERROR: ..." as a German prompt would make
            # the next run believe that prompt is already translated.
            stored = 0
            for category_key, translation in zip(batch_categories, translated_batch):
                if translation.startswith("TRANSLATION_ERROR:"):
                    break
                translated_categories[category_key]['de'].append(translation)
                stored += 1
            pbar.update(stored)

            if stored < len(batch_prompts):
                print("❌ Batch came back incomplete. Stopping; re-run to resume.")
                update_multilingual_categories_file(translated_categories)
                return

            # Save progress every few batches, independent of the running total.
            if batch_number % save_every_batches == 0:
                update_multilingual_categories_file(translated_categories)

    # Final save at the end.
    update_multilingual_categories_file(translated_categories)
    print("\n✅ All prompts translated and file updated successfully.")
# Run the translation job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()