import sys
from pathlib import Path

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Add the project root to the path so the data package can be imported.
sys.path.append(str(Path(__file__).resolve().parent.parent))
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES


def generate_german_vectors():
    """Generate and save function vectors for all German prompts."""
    print("šŸš€ Starting German function vector generation...")

    # Load the model and tokenizer.
    print("šŸ”§ Loading OLMo-2-7B model and tokenizer... (this may take a moment)")
    model_path = "./models/OLMo-2-1124-7B"
    try:
        device = (
            "mps" if torch.backends.mps.is_available()
            else "cuda" if torch.cuda.is_available()
            else "cpu"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
            output_hidden_states=True,
        )
        print(f"āœ… Model loaded successfully on device: {device}")
    except Exception as e:
        print(f"āŒ Error loading model: {e}")
        print(f"Please ensure the model exists at '{model_path}'")
        return

    def get_activation_for_prompt(prompt):
        """Return the last-layer activation at the final token of the prompt."""
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # device_map="auto" decides the actual placement, so follow model.device.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        last_token_pos = inputs["attention_mask"].sum(dim=1) - 1
        last_hidden_state = outputs.hidden_states[-1]
        activation = last_hidden_state[0, last_token_pos[0], :].cpu().numpy()
        return activation.astype(np.float64)

    # Generate vectors for German prompts.
    print("\nšŸ‡©šŸ‡Ŗ Generating vectors for German prompts...")
    german_category_vectors = {}

    # Loop over all categories and average the prompt activations per category.
    for category_key, data in tqdm(FUNCTION_CATEGORIES.items(), desc="Processing Categories"):
        german_prompts = data.get("de", [])
        if not german_prompts:
            print(f"āš ļø Warning: No German prompts found for category '{category_key}'. Skipping.")
            continue

        # Get activations for all German prompts in the category.
        activations = [get_activation_for_prompt(p) for p in german_prompts]
        if activations:
            # Average the activations to get one vector per category.
            german_category_vectors[category_key] = np.mean(activations, axis=0)

    # Save the generated vectors.
    if not german_category_vectors:
        print("āŒ No vectors were generated. Aborting save.")
        return

    output_dir = Path(__file__).parent / "data" / "vectors"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "de_category_vectors.npz"
    try:
        np.savez_compressed(output_path, **german_category_vectors)
        print("\nāœ… Successfully generated and saved German function vectors to:")
        print(f"   {output_path}")
    except Exception as e:
        print(f"āŒ Error saving vectors: {e}")


if __name__ == "__main__":
    generate_german_vectors()
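
# Example (not executed by this script): reading the saved vectors back.
# A minimal sketch, assuming the script above has been run and produced
# data/vectors/de_category_vectors.npz relative to this file; the archive
# holds one float64 vector per category key.
#
#   import numpy as np
#   vectors = np.load("data/vectors/de_category_vectors.npz")
#   for category in vectors.files:
#       print(category, vectors[category].shape)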