Spaces:

aaron0eidt
/

ELIA

Sleeping

File size: 11,432 Bytes

5b6c556

import os
import sys
from pathlib import Path
import requests
import json
import time
from tqdm import tqdm

# Add root project dir to path
sys.path.append(str(Path(__file__).parent.parent))
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES, FUNCTION_TYPES

# API configuration for Qwen.
QWEN_API_CONFIG = {
    "api_key": "6e3def45d61b0b20547a1fcbab6464d8",
    "api_endpoint": "https://chat-ai.academiccloud.de/v1",
    "model": "qwen2.5-vl-72b-instruct",
    "rate_limit_per_minute": 2,
}

# --- Translation Logic ---

def translate_text(text, target_language="German"):
    # Translates a single string using the Qwen API.
    headers = {
        "Authorization": f"Bearer {QWEN_API_CONFIG['api_key']}",
        "Content-Type": "application/json"
    }
    
    prompt = f"Translate the following English text to {target_language}. Respond with ONLY the translated text, without any introductory phrases, explanations, or quotation marks. The original text is:\n\n'{text}'"

    data = {
        "model": QWEN_API_CONFIG["model"],
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 150,
        "temperature": 0.1,
    }
    
    try:
        response = requests.post(
            f"{QWEN_API_CONFIG['api_endpoint']}/chat/completions",
            headers=headers,
            json=data,
            timeout=60
        )
        
        if response.status_code == 200:
            result = response.json()
            translated_text = result["choices"][0]["message"]["content"].strip()
            # Clean up quotes from the model's response.
            if translated_text.startswith('"') and translated_text.endswith('"'):
                translated_text = translated_text[1:-1]
            return translated_text
        elif response.status_code == 429:
            # Handle rate limiting.
            reset_time = response.headers.get('RateLimit-Reset', '0')
            try:
                wait_seconds = int(reset_time)
                print(f"Hourly rate limit reached. Waiting {wait_seconds} seconds for reset...")
                return f"RATE_LIMIT_HOURLY:{wait_seconds}"
            except ValueError:
                print("Rate limit exceeded. Waiting 60 seconds...")
                return "RATE_LIMIT_EXCEEDED"
        else:
            print(f"API Error: Status {response.status_code}, Response: {response.text}")
            return None
            
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def translate_batch_texts(texts, target_language="German"):
    # Translates a batch of strings in one API call.
    headers = {
        "Authorization": f"Bearer {QWEN_API_CONFIG['api_key']}",
        "Content-Type": "application/json"
    }
    # A stronger prompt to ensure full translation.
    batch_prompt = (
        f"Translate the following English texts to {target_language}. "
        "For each text, translate ALL words and phrases, including any words in quotation marks, into natural German. "
        "Do NOT leave any English words in the translation. Respond with ONLY the German translations, one per line, in the same order.\n\n"
    )
    for i, text in enumerate(texts, 1):
        batch_prompt += f"{i}. {text}\n"
    batch_prompt += "\nProvide the German translations in the same order, one per line:"
    data = {
        "model": QWEN_API_CONFIG["model"],
        "messages": [{"role": "user", "content": batch_prompt}],
        "max_tokens": 300,  # Increased for batch processing
        "temperature": 0.1,
    }
    try:
        response = requests.post(
            f"{QWEN_API_CONFIG['api_endpoint']}/chat/completions",
            headers=headers,
            json=data,
            timeout=60
        )
        if response.status_code == 200:
            result = response.json()
            translated_text = result["choices"][0]["message"]["content"].strip()
            # Split the response into individual lines.
            lines = [line.strip() for line in translated_text.split('\n') if line.strip()]
            cleaned_translations = []
            for line in lines:
                # Remove numbering if the model adds it.
                if line and line[0].isdigit() and '.' in line:
                    line = line.split('.', 1)[1].strip()
                # Clean up quotes.
                if line.startswith('"') and line.endswith('"'):
                    line = line[1:-1]
                if line:
                    cleaned_translations.append(line)
            # Make sure we have the right number of translations.
            if len(cleaned_translations) >= len(texts):
                return cleaned_translations[:len(texts)]
            else:
                print(f"Warning: Expected {len(texts)} translations, got {len(cleaned_translations)}")
                # Pad with error messages if some translations failed.
                while len(cleaned_translations) < len(texts):
                    cleaned_translations.append(f"TRANSLATION_ERROR: {texts[len(cleaned_translations)]}")
                return cleaned_translations
        elif response.status_code == 429:
            # Handle rate limiting.
            reset_time = response.headers.get('RateLimit-Reset', '0')
            try:
                wait_seconds = int(reset_time)
                print(f"Hourly rate limit reached. Waiting {wait_seconds} seconds for reset...")
                return f"RATE_LIMIT_HOURLY:{wait_seconds}"
            except ValueError:
                print("Rate limit exceeded. Waiting 60 seconds...")
                return "RATE_LIMIT_EXCEEDED"
        else:
            print(f"API Error: Status {response.status_code}, Response: {response.text}")
            return None
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def update_multilingual_categories_file(new_categories):
    # Updates the multilingual_function_categories.py file.
    file_path = Path(__file__).parent / "data" / "multilingual_function_categories.py"
    
    # Create the new file content.
    file_content = "# -*- coding: utf-8 -*-\n"
    file_content += '"""\nThis file contains the multilingual prompts for function vector analysis.\n'
    file_content += 'It is automatically updated by the translate_prompts.py script.\n"""\n\n'
    
    # Format the FUNCTION_TYPES dictionary.
    ft_content = "FUNCTION_TYPES = {\n"
    for ft, cats in FUNCTION_TYPES.items():
        ft_content += f'    "{ft}": [\n'
        for cat in cats:
            ft_content += f'        "{cat}",\n'
        ft_content += "    ],\n"
    ft_content += "}\n\n"

    file_content += ft_content

    # Add the function categories.
    file_content += f"FUNCTION_CATEGORIES = {json.dumps(new_categories, indent=4, ensure_ascii=False)}\n"
    
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(file_content)
    print(f"\n✅ Progress saved to '{file_path}'")


def main():
    # Translates all prompts and updates the file.
    print("🚀 Starting batch translation of prompts to German...")

    # Load existing categories to resume from where we left off.
    translated_categories = FUNCTION_CATEGORIES.copy()
    
    # Count how many prompts need to be translated.
    total_prompts = sum(len(prompts.get('en', [])) for prompts in FUNCTION_CATEGORIES.values())
    
    # Set up a progress bar.
    with tqdm(total=total_prompts, desc="Translating Prompts") as pbar:
        # Check how many are already translated.
        already_translated_count = 0
        for category_key, data in FUNCTION_CATEGORIES.items():
            if 'de' not in translated_categories.get(category_key, {}):
                if category_key not in translated_categories:
                    translated_categories[category_key] = {}
                translated_categories[category_key]['de'] = []

            if 'de' in translated_categories[category_key]:
                 already_translated_count += len(translated_categories[category_key]['de'])
        pbar.update(already_translated_count)

        # Get a list of all prompts that still need to be translated.
        all_prompts_to_translate = []
        prompt_mapping = []
        
        for category_key, data in FUNCTION_CATEGORIES.items():
            english_prompts = data.get('en', [])
            
            # Make sure the 'de' key exists.
            if 'de' not in translated_categories[category_key]:
                translated_categories[category_key]['de'] = []
            
            german_prompts = translated_categories[category_key]['de']

            # Skip if this category is already done.
            if len(german_prompts) == len(english_prompts):
                continue

            # Add prompts that are missing a translation.
            for i in range(len(german_prompts), len(english_prompts)):
                all_prompts_to_translate.append(english_prompts[i])
                prompt_mapping.append((category_key, i))

        # Process the prompts in batches.
        batch_size = 6
        for i in range(0, len(all_prompts_to_translate), batch_size):
            batch_prompts = all_prompts_to_translate[i:i + batch_size]
            batch_mapping = prompt_mapping[i:i + batch_size]
            
            # Wait between batches to avoid hitting the rate limit.
            time.sleep(30)
            
            translated_batch = translate_batch_texts(batch_prompts)
            
            # Handle rate limit responses.
            if translated_batch and isinstance(translated_batch, str) and translated_batch.startswith("RATE_LIMIT_HOURLY:"):
                wait_seconds = int(translated_batch.split(":")[1])
                print(f"Waiting {wait_seconds} seconds for hourly rate limit reset...")
                time.sleep(wait_seconds)
                # Retry the batch.
                translated_batch = translate_batch_texts(batch_prompts)
            
            retry_wait = 60
            while translated_batch == "RATE_LIMIT_EXCEEDED":
                # Wait and retry if we hit the rate limit.
                print(f"Waiting for {retry_wait} seconds due to rate limit...")
                time.sleep(retry_wait)
                translated_batch = translate_batch_texts(batch_prompts)
                retry_wait *= 1.5
            
            if translated_batch and isinstance(translated_batch, list):
                # Add the new translations to our data.
                for j, (category_key, prompt_idx) in enumerate(batch_mapping):
                    if j < len(translated_batch):
                        translated_categories[category_key]['de'].append(translated_batch[j])
                
                # Save progress every so often.
                if (pbar.n + len(batch_prompts)) % 30 == 0:
                    update_multilingual_categories_file(translated_categories)
                
                pbar.update(len(batch_prompts))
            else:
                print(f"❌ Failed to translate batch. Stopping.")
                # Save any progress we made before stopping.
                update_multilingual_categories_file(translated_categories)
                return

    # Final save at the end.
    update_multilingual_categories_file(translated_categories)
    print("\n✅ All prompts translated and file updated successfully.")

if __name__ == "__main__":
    main()