# Page residue from the web file viewer (Spaces: Sleeping). File size: 3,427 bytes; revision 5b6c556.
import os
import sys
from pathlib import Path
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
# Adjust path to import from the new 'data' directory
sys.path.append(str(Path(__file__).resolve().parent.parent))
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES
def generate_all_vectors():
    """Generate and save per-category function vectors for English and German.

    Loads the tokenizer and causal LM from ``./models/OLMo-2-1124-7B``. For each
    language in ``["en", "de"]`` and each entry of ``FUNCTION_CATEGORIES``, every
    prompt is run through the model, the final-layer hidden state of the last
    prompt token is taken, and those activations are averaged per category.
    The per-category means are written to
    ``<directory of this file>/data/vectors/<lang>_category_vectors.npz``.

    Returns:
        None. Model-loading and file-saving failures are reported on stdout;
        a loading failure aborts the whole run, a saving failure only affects
        the language being written.
    """
    print("🚀 Starting function vector generation for both English and German...")

    # Load the model and tokenizer.
    print("🧠 Loading OLMo-2-7B model and tokenizer...")
    try:
        model_path = "./models/OLMo-2-1124-7B"
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",
            output_hidden_states=True,
        )
        # device_map="auto" decides where the weights live, so ask the model
        # instead of guessing; a hand-picked device can disagree with the
        # actual placement and make the forward pass fail on a device mismatch.
        device = model.device
        print(f"✅ Model loaded successfully on device: {device}")
    except Exception as e:
        # Top-level script boundary: report and stop, nothing else can proceed.
        print(f"❌ Error loading model: {e}")
        return

    def get_activation_for_prompt(prompt):
        """Return the final-layer hidden state of the prompt's last token.

        The result is a 1-D NumPy array converted to float64.
        """
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]
        # The tokenizer pads on the left, so the last real token of a sequence
        # is always at index -1. For an unpadded single prompt this is the same
        # position as attention_mask.sum() - 1, and it stays correct if padding
        # is ever applied.
        activation = last_hidden_state[0, -1, :].cpu().numpy()
        return activation.astype(np.float64)

    # Generate and save vectors for both languages.
    output_dir = Path(__file__).parent / "data" / "vectors"
    output_dir.mkdir(parents=True, exist_ok=True)

    for lang in ["en", "de"]:
        print(f"\n🌍 Generating vectors for {lang.upper()} prompts...")
        category_vectors = {}
        for category_key, data in tqdm(
            FUNCTION_CATEGORIES.items(),
            desc=f"Processing {lang.upper()} Categories",
        ):
            prompts = data.get(lang, [])
            if not prompts:
                print(f"⚠️ Warning: No {lang.upper()} prompts for '{category_key}'. Skipping.")
                continue
            activations = [get_activation_for_prompt(p) for p in prompts]
            # One vector per category: the element-wise mean over its prompts.
            category_vectors[category_key] = np.mean(activations, axis=0)

        if not category_vectors:
            print(f"❌ No vectors were generated for {lang.upper()}. Aborting save.")
            continue

        output_path = output_dir / f"{lang}_category_vectors.npz"
        try:
            # Each category key becomes the name of one array inside the archive.
            np.savez_compressed(output_path, **category_vectors)
            print(f"✅ Successfully saved {lang.upper()} vectors to: {output_path}")
        except Exception as e:
            print(f"❌ Error saving {lang.upper()} vectors: {e}")
# Run the full generation only when executed as a script, not on import.
if __name__ == "__main__":
    generate_all_vectors()