Spaces:

aaron0eidt
/

ELIA

Sleeping

File size: 3,605 Bytes

5b6c556

import os
import sys
from pathlib import Path
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Add root project dir to path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES

def generate_german_vectors():
    """Generate and save one averaged activation vector per category for German prompts.

    Loads the tokenizer and causal LM from ``./models/OLMo-2-1124-7B``
    (relative to the current working directory), runs every prompt stored
    under the ``'de'`` key of each ``FUNCTION_CATEGORIES`` entry through the
    model, takes the last entry of ``hidden_states`` at the final token, and
    averages those vectors per category. The result is written to
    ``data/vectors/de_category_vectors.npz`` next to this file, one array per
    category key.

    Progress and errors are reported via ``print``; the function returns
    ``None`` in every case (including load or save failures).
    """
    print("🚀 Starting German function vector generation...")

    # Load the model and tokenizer.
    print("🔧 Loading OLMo-2-7B model and tokenizer... (this may take a moment)")
    # Defined outside the try block so the failure hint below can report the
    # exact path that was attempted.
    model_path = "./models/OLMo-2-1124-7B"
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
            output_hidden_states=True,
        )
        # device_map="auto" decides where the weights live; ask the model
        # instead of guessing, so inputs always land on a matching device.
        device = model.device
        print(f"✅ Model loaded successfully on device: {device}")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        print(f"Please ensure the model exists at '{model_path}'")
        return

    def get_activation_for_prompt(prompt):
        """Return the final-position vector of the last hidden_states entry as float64."""
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        # The tokenizer pads on the left, so the last real token always sits
        # at the final sequence position (for a single unpadded prompt this is
        # the same index as attention_mask.sum() - 1).
        activation = outputs.hidden_states[-1][0, -1, :].cpu().numpy()
        # Upcast so the per-category mean is not accumulated in float16.
        return activation.astype(np.float64)

    # Generate vectors for German prompts.
    print("\n🇩🇪 Generating vectors for German prompts...")
    german_category_vectors = {}

    for category_key, data in tqdm(FUNCTION_CATEGORIES.items(), desc="Processing Categories"):
        german_prompts = data.get('de', [])

        if not german_prompts:
            print(f"⚠️ Warning: No German prompts found for category '{category_key}'. Skipping.")
            continue

        # One activation per prompt, averaged into a single vector per category.
        activations = [get_activation_for_prompt(p) for p in german_prompts]
        german_category_vectors[category_key] = np.mean(activations, axis=0)

    # Save the generated vectors.
    if not german_category_vectors:
        print("❌ No vectors were generated. Aborting save.")
        return

    output_dir = Path(__file__).parent / "data" / "vectors"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "de_category_vectors.npz"

    try:
        # Each category key becomes the name of one array inside the archive.
        np.savez_compressed(output_path, **german_category_vectors)
        print("\n✅ Successfully generated and saved German function vectors to:")
        print(f"   {output_path}")
    except Exception as e:
        print(f"❌ Error saving vectors: {e}")

# Run the generation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    generate_german_vectors()