Sneha7 committed on
Commit c8e38f6 · verified · 1 Parent(s): 55bd1b0

Update grpo_train.py

Files changed (1)
grpo_train.py +17 -20
grpo_train.py CHANGED
@@ -12,25 +12,19 @@ def grpo_step(
     eps_clip: float = 0.2,
     group_size: int = 4,
 ):
-    """
-    GRPO step with:
-    - Sampling from gen_model (CPU)
-    - Policy/Ref both from policy_model on GPU (ref = frozen logits this step)
-    """
     device = policy_model.device

-    # 1) Tokenize on GPU for policy, but copy to CPU for gen_model
+    # 1) Tokenize
     inputs = tokenizer(prompt, return_tensors="pt")
     inputs_gpu = {k: v.to(device) for k, v in inputs.items()}
-    input_ids_gpu = inputs_gpu["input_ids"]  # [1, L]
+    input_ids_gpu = inputs_gpu["input_ids"]
     attn_gpu = inputs_gpu.get("attention_mask", None)

-    # Group repeat for GPU tensors
     input_ids_gpu = input_ids_gpu.repeat_interleave(group_size, dim=0)
     if attn_gpu is not None:
         attn_gpu = attn_gpu.repeat_interleave(group_size, dim=0)

-    # For CPU gen_model, keep a CPU copy
+    # CPU copy for gen_model
     input_ids_cpu = input_ids_gpu.cpu()
     attn_cpu = attn_gpu.cpu() if attn_gpu is not None else None

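A note on the grouping lines kept above: `repeat_interleave(group_size, dim=0)` tiles the single tokenized prompt into `group_size` identical rows, one per completion sampled in the GRPO group. A minimal shape-only sketch (the token values here are made up):

import torch

group_size = 4
input_ids = torch.tensor([[101, 7592, 2088, 102]])        # one prompt, shape [1, L]
grouped = input_ids.repeat_interleave(group_size, dim=0)  # shape [group_size, L], identical rows
print(grouped.shape)                                      # torch.Size([4, 4])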
@@ -38,7 +32,7 @@ def grpo_step(
     if attn_cpu is not None:
         gen_inputs["attention_mask"] = attn_cpu

-    # 2) Generate on CPU (slower but fits memory)
+    # 2) Generate on CPU
     with torch.no_grad():
         gen_output = gen_model.generate(
             **gen_inputs,
@@ -52,25 +46,25 @@ def grpo_step(
             output_scores=False,
         )

-    sequences_cpu = gen_output.sequences  # [G, L+T] on CPU
-    sequences = sequences_cpu.to(device)  # send batch to GPU once
+    sequences_cpu = gen_output.sequences
+    sequences = sequences_cpu.to(device)

     texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences_cpu]
     rewards = torch.tensor(
         [reward_fn(text) for text in texts],
         device=device,
         dtype=torch.float32,
-    ).clamp_(-2.0, 2.0)
+    ).clamp(-2.0, 2.0)

     # 3) Group-normalized advantages
     group_mean = rewards.mean()
     group_std = rewards.std(unbiased=False) + 1e-8
     advantages = (rewards - group_mean) / group_std
-    advantages = torch.clamp(advantages, -5.0, 5.0)
+    advantages = advantages.clamp(-5.0, 5.0)

     orig_len = inputs["input_ids"].shape[1]

-    # 4) Compute "ref" logprobs as frozen snapshot of current policy
+    # 4) Ref logprobs (no grad)
     with torch.no_grad():
         ref_out = policy_model(sequences)
         ref_logits = ref_out.logits[:, :-1, :]
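The advantage block in this hunk is the standard GRPO group baseline: rewards are standardized within the group of `group_size` completions, so each sample is credited only for beating its siblings, and the out-of-place `clamp` bounds outliers. The same arithmetic in isolation (the reward values are invented for illustration):

import torch

rewards = torch.tensor([0.1, 1.0, -0.5, 0.4])    # hypothetical clamped rewards for one group
group_mean = rewards.mean()
group_std = rewards.std(unbiased=False) + 1e-8   # population std, as in the diff
advantages = (rewards - group_mean) / group_std  # zero mean, unit variance within the group
advantages = advantages.clamp(-5.0, 5.0)         # bound extreme advantages
print(advantages)                                # sums to ~0 across the group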
@@ -78,7 +72,7 @@ def grpo_step(
         ref_lp_all = ref_logprobs.gather(-1, sequences[:, 1:].unsqueeze(-1)).squeeze(-1)
         ref_lp_gen = ref_lp_all[:, orig_len - 1 :]

-    # 5) Current policy logprobs (trainable)
+    # 5) Current policy logprobs (with grad)
     out = policy_model(sequences)
     logits = out.logits[:, :-1, :]
     logprobs = F.log_softmax(logits, dim=-1)
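Both the no-grad "ref" pass and the trainable pass above use the same shift-and-gather indexing: the logits at position t score token t+1, so the logits are truncated with `[:, :-1, :]` and gathered against `sequences[:, 1:]`. A self-contained sketch of just that indexing, with random logits standing in for a model:

import torch
import torch.nn.functional as F

B, T, V = 2, 6, 11                                   # batch, sequence length, vocab size (arbitrary)
sequences = torch.randint(0, V, (B, T))
logits = torch.randn(B, T, V)                        # stand-in for model(sequences).logits

logprobs = F.log_softmax(logits[:, :-1, :], dim=-1)  # position t predicts token t+1
token_lp = logprobs.gather(-1, sequences[:, 1:].unsqueeze(-1)).squeeze(-1)
print(token_lp.shape)                                # torch.Size([2, 5]): one log-prob per next token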
@@ -97,11 +91,14 @@ def grpo_step(
         "loss": 0.0,
     }

-    # 6) Ratios, KL, loss
-    log_ratio = (lp_gen - ref_lp_gen).mean(dim=1).clamp_(-10.0, 10.0)
-    ratio = torch.exp(log_ratio).clamp_(0.0, 10.0)
+    # 6) Ratios, KL, loss (no in-place ops)
+    log_ratio = (lp_gen - ref_lp_gen).mean(dim=1)
+    log_ratio = log_ratio.clamp(-10.0, 10.0)
+    ratio = torch.exp(log_ratio)
+    ratio = ratio.clamp(0.0, 10.0)

-    kl_per_sample = (lp_gen - ref_lp_gen).mean(dim=1).clamp_(-10.0, 10.0)
+    kl_per_sample = (lp_gen - ref_lp_gen).mean(dim=1)
+    kl_per_sample = kl_per_sample.clamp(-10.0, 10.0)
     kl_scalar = kl_per_sample.abs().mean()

     surr1 = ratio * advantages
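The hunk ends at `surr1`; the remainder of the objective is outside this diff. For orientation only, a typical PPO/GRPO-style clipped surrogate with a KL penalty looks roughly like the sketch below. The `kl_coef` weight, the `min`/negation convention, and the function name are assumptions, not code from grpo_train.py:

import torch

def clipped_surrogate_loss(ratio, advantages, kl_scalar, eps_clip=0.2, kl_coef=0.1):
    """Illustrative sketch only; not the objective defined in grpo_train.py."""
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()  # maximize the clipped surrogate
    return policy_loss + kl_coef * kl_scalar       # penalize drift from the reference pass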
 
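The common thread in this commit is swapping in-place `clamp_()` for out-of-place `clamp()` on tensors that sit in the autograd graph, such as `ratio`, `log_ratio`, and `kl_per_sample`. In-place ops can overwrite values that autograd saved for the backward pass, which typically surfaces as a RuntimeError at `backward()` time; a minimal reproduction of that failure mode, mirroring the old `torch.exp(...).clamp_(...)` pattern:

import torch

x = torch.randn(4, requires_grad=True)

# In-place clamp_ overwrites exp's output, which autograd saved for the backward pass.
ratio = torch.exp(x).clamp_(0.0, 10.0)
try:
    ratio.sum().backward()
except RuntimeError as err:
    print("in-place clamp_ fails at backward:", err)

# Out-of-place clamp creates a new tensor instead, so backward succeeds.
x2 = torch.randn(4, requires_grad=True)
ratio2 = torch.exp(x2).clamp(0.0, 10.0)
ratio2.sum().backward()
print("out-of-place clamp: grad computed?", x2.grad is not None)  # True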