Update RL utils and train-sa using new KL and Beta computation+capping

Browse files

Files changed (3) hide show

__init__.py +13 -13
rl_utils.py +60 -24
train_ppokl_withsa.py +36 -8

__init__.py CHANGED Viewed

@@ -1,14 +1,14 @@
-# __init__.py
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-from .configuration_chemq3mtp import ChemQ3MTPConfig
-from .modeling_chemq3mtp import ChemQ3MTPForCausalLM
-from .FastChemTokenizerHF import FastChemTokenizerSelfies
-# Register the model
-AutoConfig.register("chemq3_mtp", ChemQ3MTPConfig)
-AutoModelForCausalLM.register(ChemQ3MTPConfig, ChemQ3MTPForCausalLM)
-# Register the tokenizer
-AutoTokenizer.register(ChemQ3MTPConfig, FastChemTokenizerSelfies)
 __all__ = ["ChemQ3MTPConfig", "ChemQ3MTPForCausalLM", "FastChemTokenizerSelfies"]

+# __init__.py
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from .configuration_chemq3mtp import ChemQ3MTPConfig
+from .modeling_chemq3mtp import ChemQ3MTPForCausalLM
+from .FastChemTokenizerHF import FastChemTokenizerSelfies
+# Register the config class under the "chemq3_mtp" model type and map it to the causal-LM model class
+AutoConfig.register("chemq3_mtp", ChemQ3MTPConfig)
+AutoModelForCausalLM.register(ChemQ3MTPConfig, ChemQ3MTPForCausalLM)
+# Register the SELFIES tokenizer for the same config class (passed as the second positional argument)
+AutoTokenizer.register(ChemQ3MTPConfig, FastChemTokenizerSelfies)
 __all__ = ["ChemQ3MTPConfig", "ChemQ3MTPForCausalLM", "FastChemTokenizerSelfies"]

rl_utils.py CHANGED Viewed

@@ -290,31 +290,67 @@ def selfies_to_lipinski_reward(selfies_str: str) -> float:
 # ========================
 class AdaptiveKLController:
-    def __init__(self, init_kl_coef: float = 0.1, target_kl: float = 0.01,
-                 kl_horizon: int = 200, increase_rate: float = 2.0,
-                 decrease_rate: float = 0.7):
-        self.kl_coef = float(init_kl_coef)
-        self.target_kl = float(target_kl)
-        self.kl_horizon = int(kl_horizon)
-        self.inc = float(increase_rate)
-        self.dec = float(decrease_rate)
-        self.buffer: List[float] = []
-    def update(self, kl: float) -> float:
-        self.buffer.append(float(kl))
-        if len(self.buffer) >= self.kl_horizon:
-            avg_kl = sum(self.buffer) / len(self.buffer)
-            self.buffer.clear()
-            if avg_kl > self.target_kl * 1.5:
-                self.kl_coef *= self.inc
-                print(f"KL too high ({avg_kl:.6f}), increasing β to {self.kl_coef:.6f}")
-            elif avg_kl < self.target_kl * 0.5:
-                self.kl_coef *= self.dec
-                print(f"KL too low ({avg_kl:.6f}), decreasing β to {self.kl_coef:.6f}")
-        return self.kl_coef
-    def reset(self):
-        self.buffer.clear()
 class EnhancedEntropyController:

 # ========================
 class AdaptiveKLController:
     """Adapt the KL penalty coefficient (beta) toward a target KL.

     The measured KL is smoothed with an exponential moving average (EMA).
     The relative error of that average against ``target_kl`` (clipped to
     [-1, 1]) scales beta multiplicatively.  The per-update multiplier is
     bounded to ``[1 / max_inc_factor, max_inc_factor]`` and beta itself is
     capped at ``max_kl_coef``, so one update can neither blow beta up nor
     drive it to zero or below.

     Attributes:
         value: Current KL coefficient (beta); also returned by ``__call__``.
         ema_kl: EMA of the observed KL, ``None`` until the first update.
         kl_penalty_cap: Stored for callers that clip the KL penalty; the
             controller itself does not read it.
     """

     def __init__(
         self,
         init_kl_coef: float = 0.2,
         target_kl: float = 6.0,
         horizon: int = 10000,
         max_kl_coef: float = 10.0,
         max_inc_factor: float = 2.0,
         ema_alpha: float = 0.9,
         kl_penalty_cap: float = 10.0,
     ):
         """Store the controller hyper-parameters.

         Args:
             init_kl_coef: Initial beta.
             target_kl: KL value the controller steers toward; must be > 0
                 because the error is normalised by it.
             horizon: Scale for the update size (``n_steps / horizon``);
                 must be > 0.
             max_kl_coef: Upper bound on beta.
             max_inc_factor: Largest per-update growth factor; its inverse
                 is the largest per-update shrink factor.  Must be >= 1.
             ema_alpha: Weight kept on the previous EMA value.
             kl_penalty_cap: Cap exposed to callers for penalty clipping.

         Raises:
             ValueError: If ``target_kl`` or ``horizon`` is not positive, or
                 ``max_inc_factor`` is below 1.
         """
         if target_kl <= 0:
             raise ValueError(f"target_kl must be > 0, got {target_kl}")
         if horizon <= 0:
             raise ValueError(f"horizon must be > 0, got {horizon}")
         if max_inc_factor < 1.0:
             raise ValueError(f"max_inc_factor must be >= 1, got {max_inc_factor}")
         self.value = float(init_kl_coef)
         self.target = target_kl
         self.horizon = horizon
         self.max_kl_coef = max_kl_coef
         self.max_inc_factor = max_inc_factor
         self.ema_alpha = ema_alpha
         self.kl_penalty_cap = kl_penalty_cap
         # Exponential moving average of KL
         self.ema_kl = None

     def update(self, current_kl: float, n_steps: int = 1) -> float:
         """Fold one KL measurement into the EMA and adjust beta.

         Args:
             current_kl: Latest measured KL (scalar).
             n_steps: Number of steps this measurement covers, NOT a
                 cumulative global step counter.  If ``n_steps / horizon``
                 exceeds 1 anyway, the multiplier clamp still keeps beta
                 positive.

         Returns:
             The updated coefficient (same value ``__call__`` returns).
         """
         kl = float(current_kl)
         if not np.isfinite(kl):
             # A NaN/inf sample would poison the EMA for every later update,
             # so the measurement is ignored and beta is left untouched.
             return self.value
         # update EMA
         if self.ema_kl is None:
             self.ema_kl = kl
         else:
             self.ema_kl = self.ema_alpha * self.ema_kl + (1 - self.ema_alpha) * kl
         proportional_error = float(
             np.clip((self.ema_kl - self.target) / self.target, -1.0, 1.0)
         )
         mult = 1.0 + proportional_error * (n_steps / self.horizon)
         # Bound the multiplier on BOTH sides: without the lower bound a large
         # n_steps / horizon with KL below target yields mult <= 0, which
         # zeroes beta for good or flips its sign.
         mult = min(max(mult, 1.0 / self.max_inc_factor), self.max_inc_factor)
         # update beta; float() keeps NumPy scalar types out of self.value
         self.value = float(min(self.value * mult, self.max_kl_coef))
         return self.value

     def __call__(self) -> float:
         """Return the current KL coefficient (beta)."""
         return self.value
def compute_kl_penalty(kl_vals: torch.Tensor, kl_coef: float, kl_penalty_cap: float):
    """Compute the KL penalty ``kl_coef * mean(kl_vals)`` with an upper clamp.

    Args:
        kl_vals: Tensor of KL values; it is reduced with ``.mean()``.
        kl_coef: KL coefficient (beta).  Coerced to a Python float so that a
            NumPy scalar (e.g. the result of ``np.clip``) cannot take over
            the multiplication.
        kl_penalty_cap: Upper bound applied to the penalty.  There is no
            lower bound, so a negative raw penalty passes through unchanged.

    Returns:
        Tuple ``(clipped_penalty, raw_penalty, kl_mean)`` of scalar tensors.
    """
    kl_mean = kl_vals.mean()
    # Tensor on the left + plain float on the right: the product is always
    # computed by torch, keeping the tensor's dtype and autograd history.
    raw_penalty = kl_mean * float(kl_coef)
    clipped_penalty = torch.clamp(raw_penalty, max=kl_penalty_cap)
    return clipped_penalty, raw_penalty, kl_mean
 class EnhancedEntropyController:

train_ppokl_withsa.py CHANGED Viewed

@@ -12,7 +12,7 @@ import numpy as np
 from tqdm import tqdm
 from FastChemTokenizerHF import FastChemTokenizerSelfies
 from ChemQ3MTP import ChemQ3MTPForCausalLM
-from ChemQ3MTP.rl_utils import CurriculumManager, AdaptiveKLController, batch_compute_rewards, compute_ppo_loss, compute_kl_divergence, compute_entropy_bonus
 def main():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -30,8 +30,15 @@ def main():
     print("\n🎯 Phase 2: RL Fine-tuning with PPO + Curriculum Learning")
     model.set_mtp_training(False)
-    # Initialize KL controller
-    kl_controller = AdaptiveKLController(init_kl_coef=0.1, target_kl=0.01, kl_horizon=100)
     model.kl_controller = kl_controller  # Set on model for consistency
     optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
@@ -45,13 +52,14 @@ def main():
     input_ids = dummy_input.input_ids.to(device)
     # Training config
-    total_steps = 10000
     checkpoint_steps = {total_steps // 4, total_steps // 2, 3 * total_steps // 4, total_steps}
     checkpoint_dir = "./ppo_checkpoints_test"
     os.makedirs(checkpoint_dir, exist_ok=True)
     # --- RL Training Loop with tqdm ---
     for step in tqdm(range(total_steps), desc="RL Training"):
         max_new_tokens = curriculum.get_max_new_tokens()
         # === PPO Rollout ===
@@ -83,7 +91,7 @@ def main():
         # === Compute rewards using rl_utils ===
         rewards_dict = batch_compute_rewards(
             selfies_list=selfies_list,
-            reward_mode="sa",  # SA-only mode
         )
         rewards = rewards_dict["total_rewards"].to(device)
@@ -96,10 +104,28 @@ def main():
             baseline=baseline
         )
         # === Compute KL divergence and update controller ===
         kl_div = compute_kl_divergence(old_action_probs, new_action_probs)
-        beta = kl_controller.update(kl_div.mean().item())
-        kl_penalty = beta * kl_div.mean()
         # === Compute entropy bonus with adaptive weighting ===
         entropy_per_example = compute_entropy_bonus(new_action_probs)
@@ -184,7 +210,9 @@ def main():
                 f"Lipinski={lipinski_score:.3f} | "
                 f"Reward={rewards.mean().item():.3f} | "
                 f"Entropy={entropy.item():.3f} | "
-                f"EntropyW={adaptive_entropy_weight:.4f}"
             )
             if avg_sa_reward is not None:
                 log_line += f" | SA={avg_sa_reward:.3f}"

 from tqdm import tqdm
 from FastChemTokenizerHF import FastChemTokenizerSelfies
 from ChemQ3MTP import ChemQ3MTPForCausalLM
+from ChemQ3MTP.rl_utils import CurriculumManager, AdaptiveKLController, batch_compute_rewards, compute_ppo_loss, compute_kl_divergence, compute_entropy_bonus, compute_kl_penalty
 def main():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("\n🎯 Phase 2: RL Fine-tuning with PPO + Curriculum Learning")
     model.set_mtp_training(False)
+    # Initialize KL controller - Using correct parameter name based on class definition
+    kl_controller = AdaptiveKLController(
+        init_kl_coef=0.1,
+        target_kl=0.01,
+        horizon=100,        # <-- use horizon instead of kl_horizon
+        max_kl_coef=100.0,  # optional
+        ema_alpha=0.9,      # optional
+        kl_penalty_cap=10.0 # optional
+    )
     model.kl_controller = kl_controller  # Set on model for consistency
     optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
     input_ids = dummy_input.input_ids.to(device)
     # Training config
+    total_steps = 2500
     checkpoint_steps = {total_steps // 4, total_steps // 2, 3 * total_steps // 4, total_steps}
     checkpoint_dir = "./ppo_checkpoints_test"
     os.makedirs(checkpoint_dir, exist_ok=True)
     # --- RL Training Loop with tqdm ---
     for step in tqdm(range(total_steps), desc="RL Training"):
+        global_step = step  # Define global_step for KL controller
         max_new_tokens = curriculum.get_max_new_tokens()
         # === PPO Rollout ===
         # === Compute rewards using rl_utils ===
         rewards_dict = batch_compute_rewards(
             selfies_list=selfies_list,
+            reward_mode="chemq3",  # Bioaware-only mode
         )
         rewards = rewards_dict["total_rewards"].to(device)
             baseline=baseline
         )
+        # === Compute KL divergence and update controller ===
+        # Compute KL divergence per batch
         # === Compute KL divergence and update controller ===
         kl_div = compute_kl_divergence(old_action_probs, new_action_probs)
+        kl_mean = kl_div.mean().item()
+        # Update KL controller using EMA-smoothed KL
+        kl_controller.update(kl_mean, n_steps=global_step)
+        beta = kl_controller()  # get current coefficient
+        # Compute clipped KL penalty
+        kl_penalty, raw_kl_penalty, kl_mean_tensor = compute_kl_penalty(
+            kl_div, beta, kl_controller.kl_penalty_cap
+        )
+        # --- Logging (safe, interpretable values) ---
+        logs = {}
+        logs["kl_mean"] = kl_mean_tensor.item()
+        logs["kl_beta"] = beta
+        logs["kl_penalty_raw"] = raw_kl_penalty.item()
+        logs["kl_penalty_clipped"] = kl_penalty.item()
         # === Compute entropy bonus with adaptive weighting ===
         entropy_per_example = compute_entropy_bonus(new_action_probs)
                 f"Lipinski={lipinski_score:.3f} | "
                 f"Reward={rewards.mean().item():.3f} | "
                 f"Entropy={entropy.item():.3f} | "
+                f"EntropyW={adaptive_entropy_weight:.4f} | "
+                f"KL_Beta={beta:.4f} | "
+                f"KL_Mean={kl_mean:.4f}"
             )
             if avg_sa_reward is not None:
                 log_line += f" | SA={avg_sa_reward:.3f}"