Shrey Goel committed
Commit d04a061 · 0 Parent(s)

adding code
.DS_Store ADDED
Binary file (6.15 kB).
 
.gitignore ADDED
@@ -0,0 +1,24 @@
+ # .gitignore
+
+ /checkpoints/
+ /data/
+ /results/
+ /build/
+ /src/scripts/
+ /src/benchmarks
+
+ /src/lm/dplm
+ /src/lm/evodiff
+ /src/lm/dplm_playground.ipynb
+ /src/lm/evoflow_playground.ipynb
+ /src/utils/ubuntu_font
+
+ /src/sampling/old_guidance.py
+
+ /MeMDLM_v2.egg-info/
+ *.pth
+ *.ckpt
+ *.err
+ *.out
+ *.csv
+ __pycache__/
README.md ADDED
@@ -0,0 +1 @@
+ # MeMDLM_v2
__init__.py ADDED
File without changes
setup.py ADDED
@@ -0,0 +1,10 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name='MeMDLM_v2',
+     version='1.0',
+     packages=find_packages(),
+     install_requires=[],
+     author='Shrey Goel',
+     author_email='[email protected]'
+ )
src/configs/guidance.yaml ADDED
@@ -0,0 +1,74 @@
+
+
+ seed: 42
+ base_dir: /scratch/sgoel/MeMDLM_v2
+
+
+ lm:
+   pretrained_esm: facebook/esm2_t33_650M_UR50D
+   pretrained_evoflow: fredzzp/EvoFlow-650M-context-3070
+   pretrained_dplm: airkingbd/dplm_650m
+   ft_evoflow: ft_eflow-3070-650M_steps=50k_layers=3_lr=0.00004_wd=.01_polynom_pwr=1_betas=.9-.98_bsz=8_gclip=1.0
+   ft_dplm: ft_dplm-650M_steps=5k_layers=3_lr=0.00004_wd=.01_polynom_pwr=1_betas=.9-.98_bsz=32_gclip=1.0
+
+ model:
+   d_model: 1280
+   num_heads: 2
+   dropout: 0.5
+   num_layers: 4
+   label_pad_value: -100
+
+ optim:
+   type: adamw
+   lr: 3e-5
+   lr_end: 1e-5
+   weight_decay: 0.01
+   beta1: 0.9
+   beta2: 0.98
+   power: 1
+
+
+ training:
+   mode: test # train / test
+   n_layers: 4
+   max_steps: 3000
+   warmup_steps: 150
+   log_every_n_steps: 10
+   num_sanity_val_steps: 2
+   val_check_interval: 250
+   enable_progress_bar: true
+   grad_clip_val: 1.0
+   devices: [0] # list of GPU IDs from 0-7
+
+ guidance:
+   n_steps: 128
+   alpha: 3
+   gamma: 0.3
+   saliency_eps: 1e-4
+   saliency_t: 2.0
+   sampling_t: 0.7
+   boltzmann_t: 0.3
+   top_p: 0.2
+   steps: 128
+   prior: lm_probs # lm_probs / boltzmann
+
+ data:
+   batch_size: 32
+   max_seq_len: 1024
+   train: ${base_dir}/data/classifier/train.csv
+   test: ${base_dir}/data/classifier/test.csv
+   val: ${base_dir}/data/classifier/val.csv
+
+
+ wandb:
+   project: memdlm_guidance
+   group: programmablebio
+   name: new_data_cleaned_steps3k_lr3e-5_bsz32_heads2_drpt0.5_layers4
+   id: ${.name}_${seed}
+
+
+ checkpointing:
+   save_every_n_steps: 250
+   save_dir: ${base_dir}/checkpoints/${wandb.name}
+   resume_ckpt_path: ${checkpointing.save_dir}/last.ckpt
+   best_ckpt_path: ${checkpointing.save_dir}/best_model.ckpt
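Editor's note: the ${...} references above are OmegaConf interpolations; the training scripts in this commit load the file with OmegaConf.load, so values such as data.train and checkpointing.save_dir resolve against base_dir and wandb.name at access time. A minimal sketch of that resolution (the relative path below is assumed, not part of the commit):

    from omegaconf import OmegaConf

    config = OmegaConf.load("src/configs/guidance.yaml")   # hypothetical local path
    print(config.data.train)              # /scratch/sgoel/MeMDLM_v2/data/classifier/train.csv
    print(config.checkpointing.save_dir)  # /scratch/sgoel/MeMDLM_v2/checkpoints/new_data_cleaned_steps3k_lr3e-5_bsz32_heads2_drpt0.5_layers4
    print(OmegaConf.to_yaml(config, resolve=True))  # fully resolved copy of the config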
src/configs/lm.yaml ADDED
@@ -0,0 +1,66 @@
+
+
+ seed: 42
+ base_dir: /scratch/pranamlab/sgoel/MeMDLM_v2
+
+
+ lm:
+   pretrained_esm: facebook/esm2_t33_650M_UR50D
+   pretrained_evoflow: fredzzp/EvoFlow-650M-context-3070
+   pretrained_dplm: airkingbd/dplm_650m
+   pretrained_progen: hugohrban/progen2-base
+   num_diffusion_timesteps: 500
+   weight_type: linear # constant / linear
+
+
+ optim:
+   type: adamw
+   scheduler: polynomial
+   lr: 0.00004
+   lr_end: 1e-5
+   warmup_init_lr: 1e-07
+   weight_decay: 0.01
+   beta1: 0.9
+   beta2: 0.98
+   power: 1
+
+
+ training:
+   mode: train # train / test / resume_from_checkpoint
+   n_layers: 3
+   max_steps: 5000
+   warmup_steps: 25
+   log_every_n_steps: 10
+   num_sanity_val_steps: 2
+   val_check_interval: 250
+   enable_progress_bar: true
+   grad_clip_val: 1.0
+   devices: [0,1,2] # list of GPU IDs
+
+ sampling:
+   n_steps: 128
+
+
+ data:
+   batch_size: 8
+   max_seq_len: 1024
+   train: ${base_dir}/data/new/train.csv
+   test: ${base_dir}/data/new/test.csv
+   val: ${base_dir}/data/new/val.csv
+
+
+ wandb:
+   project: memdlm
+   group: programmablebio
+   name: ft_eflow-3070-650M_steps=5k_layers=3_lr=0.00004_wd=.01_polynom_pwr=1_betas=.9-.98_bsz=8_gclip=1.0_ml=1024
+   # name: ft_progen-base-764M_steps=50k_layers=2_lr=0.00004_wd=.1_cosine-to-frac_betas=.9-.999_bsz=8_gclip=0.8
+   # name: ft_dplm-650M_steps=5k_layers=3_lr=0.00004_wd=.01_polynom_pwr=1_betas=.9-.98_bsz=32_gclip=1.0
+   # name: ft_esm-650M_steps=3k_layers=3_lr=0.00004_wd=.01_polynom_pwr=1_betas=.9-.98_bsz=32_gclip=1.0
+   id: ${.name}_${seed}
+
+
+ checkpointing:
+   save_every_n_steps: 250
+   save_dir: ${base_dir}/checkpoints/${wandb.name}
+   resume_ckpt_path: ${checkpointing.save_dir}/last.ckpt
+   best_ckpt_path: ${checkpointing.save_dir}/best_model.ckpt
src/guidance/dataloader.py ADDED
@@ -0,0 +1,108 @@
+ import torch
+ import pandas as pd
+ import lightning.pytorch as pl
+
+ from transformers import AutoModel, AutoTokenizer
+ from torch.utils.data import Dataset, DataLoader
+
+
+ class MembraneDataset(Dataset):
+     def __init__(self, config, data_path):
+         self.config = config
+         self.data = pd.read_csv(data_path)
+         self.tokenizer = AutoTokenizer.from_pretrained(self.config.lm.pretrained_esm)
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         sequence = self.data.iloc[idx]["Sequence"]
+
+         tokens = self.tokenizer(
+             sequence.upper(),
+             return_tensors='pt',
+             padding='max_length',
+             truncation=True,
+             max_length=self.config.data.max_seq_len,
+         )
+
+         labels = self.get_labels(sequence)
+
+         return {
+             "input_ids": tokens['input_ids'],
+             "attention_mask": tokens['attention_mask'],
+             "labels": labels
+         }
+
+     def get_labels(self, sequence):
+         max_len = self.config.data.max_seq_len
+
+         # Create per-residue labels
+         labels = torch.tensor([1 if residue.islower() else 0 for residue in sequence], dtype=torch.float)
+
+         if len(labels) < max_len:  # Padding if sequence shorter than tokenizer truncation length
+             padded_labels = torch.cat(
+                 [labels, torch.full(size=(max_len - len(labels),), fill_value=self.config.model.label_pad_value)]
+             )
+         else:  # Truncation otherwise
+             padded_labels = labels[:max_len]
+         return padded_labels
+
+
+ def collate_fn(batch):
+     input_ids = torch.stack([item['input_ids'].squeeze(0) for item in batch])
+     masks = torch.stack([item['attention_mask'].squeeze(0) for item in batch])
+     labels = torch.stack([item['labels'] for item in batch])
+
+     return {
+         'input_ids': input_ids,
+         'attention_mask': masks,
+         'labels': labels
+     }
+
+
+ class MembraneDataModule(pl.LightningDataModule):
+     def __init__(self, config, train_dataset, val_dataset, test_dataset, collate_fn=collate_fn):
+         super().__init__()
+         self.train_dataset = train_dataset
+         self.val_dataset = val_dataset
+         self.test_dataset = test_dataset
+         self.collate_fn = collate_fn
+         self.batch_size = config.data.batch_size
+
+     def train_dataloader(self):
+         return DataLoader(self.train_dataset,
+                           batch_size=self.batch_size,
+                           collate_fn=self.collate_fn,
+                           num_workers=8,
+                           pin_memory=True)
+
+     def val_dataloader(self):
+         return DataLoader(self.val_dataset,
+                           batch_size=self.batch_size,
+                           collate_fn=self.collate_fn,
+                           num_workers=8,
+                           pin_memory=True)
+
+     def test_dataloader(self):
+         return DataLoader(self.test_dataset,
+                           batch_size=self.batch_size,
+                           collate_fn=self.collate_fn,
+                           num_workers=8,
+                           pin_memory=True)
+
+
+ def get_datasets(config):
+     """Helper method to grab datasets to quickly init data module in main.py"""
+     esm_model = AutoModel.from_pretrained(config.lm.pretrained_esm)
+     tokenizer = AutoTokenizer.from_pretrained(config.lm.pretrained_esm)
+
+     train_dataset = MembraneDataset(config, config.data.train)
+     val_dataset = MembraneDataset(config, config.data.val)
+     test_dataset = MembraneDataset(config, config.data.test)
+
+     return {
+         "train": train_dataset,
+         "val": val_dataset,
+         "test": test_dataset
+     }
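Editor's note: the labeling convention in MembraneDataset.get_labels marks lowercase residues as 1 and uppercase as 0, then pads to max_seq_len with label_pad_value so padded positions can be excluded from the loss. A small illustration of that convention (toy sequence and length, not from the dataset):

    import torch

    sequence = "MKTlliAVL"                                    # toy mixed-case sequence
    labels = torch.tensor([1.0 if r.islower() else 0.0 for r in sequence])
    # labels -> tensor([0., 0., 0., 1., 1., 1., 0., 0., 0.])
    max_len, pad_value = 12, -100.0                           # stand-ins for config values
    padded = torch.cat([labels, torch.full((max_len - len(labels),), pad_value)])
    # padded.shape -> torch.Size([12]); the last 3 entries are -100 and ignored by the classifier loss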
src/guidance/main.py ADDED
@@ -0,0 +1,73 @@
+ #!/usr/bin/env python3
+
+ import os
+ import wandb
+ import lightning.pytorch as pl
+
+ from omegaconf import OmegaConf
+ from lightning.pytorch.strategies import DDPStrategy
+ from lightning.pytorch.loggers import WandbLogger
+ from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
+
+ from src.utils.model_utils import _print
+ from src.guidance.solubility_module import SolubilityClassifier
+ from src.guidance.dataloader import MembraneDataModule, get_datasets
+
+
+ config = OmegaConf.load("/scratch/sgoel/MeMDLM_v2/src/configs/guidance.yaml")
+ wandb.login(key='2b76a2fa2c1cdfddc5f443602c17b011fefb0a8f')
+
+ # data
+ datasets = get_datasets(config)
+ data_module = MembraneDataModule(
+     config=config,
+     train_dataset=datasets['train'],
+     val_dataset=datasets['val'],
+     test_dataset=datasets['test'],
+ )
+
+ # wandb logging
+ #wandb.init(project=config.wandb.project, name=config.wandb.name)
+ wandb_logger = WandbLogger(**config.wandb)
+
+ # lightning checkpoints
+ lr_monitor = LearningRateMonitor(logging_interval="step")
+ checkpoint_callback = ModelCheckpoint(
+     monitor="val/loss",
+     save_top_k=1,
+     mode="min",
+     dirpath=config.checkpointing.save_dir,
+     filename="best_model",
+ )
+
+ # lightning trainer
+ trainer = pl.Trainer(
+     max_steps=config.training.max_steps,
+     accelerator="cuda",
+     devices=1,  #config.training.devices if config.training.mode=='train' else [0],
+     #strategy=DDPStrategy(find_unused_parameters=True),
+     callbacks=[checkpoint_callback, lr_monitor],
+     logger=wandb_logger,
+     log_every_n_steps=config.training.log_every_n_steps
+ )
+
+ # Folder to save checkpoints
+ ckpt_dir = config.checkpointing.save_dir
+ os.makedirs(ckpt_dir, exist_ok=True)
+
+ # instantiate model
+ model = SolubilityClassifier(config)
+
+ # train or evaluate the model
+ if config.training.mode == "train":
+     trainer.fit(model, datamodule=data_module)
+
+ elif config.training.mode == "test":
+     ckpt_path = os.path.join(ckpt_dir, "best_model.ckpt")
+     state_dict = model.get_state_dict(ckpt_path)
+     model.load_state_dict(state_dict)
+     trainer.test(model, datamodule=data_module, ckpt_path=ckpt_path)
+ else:
+     raise ValueError(f"{config.training.mode} is invalid. Must be 'train' or 'test'")
+
+ wandb.finish()
src/guidance/solubility_module.py ADDED
@@ -0,0 +1,155 @@
+ import gc
+ import torch
+ import torch.nn as nn
+ import lightning.pytorch as pl
+
+ from omegaconf import OmegaConf
+ from transformers import AutoModel
+ from torchmetrics.classification import BinaryAUROC, BinaryAccuracy
+
+ from src.utils.model_utils import _print
+ from src.guidance.utils import CosineWarmup
+
+
+ config = OmegaConf.load("/scratch/sgoel/MeMDLM_v2/src/configs/guidance.yaml")
+
+ class SolubilityClassifier(pl.LightningModule):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.loss_fn = nn.BCEWithLogitsLoss(reduction='none')
+         self.auroc = BinaryAUROC()
+         self.accuracy = BinaryAccuracy()
+
+         self.esm_model = AutoModel.from_pretrained(self.config.lm.pretrained_esm)
+         for p in self.esm_model.parameters():
+             p.requires_grad = False
+
+         encoder_layer = nn.TransformerEncoderLayer(
+             d_model=config.model.d_model,
+             nhead=config.model.num_heads,
+             dropout=config.model.dropout,
+             batch_first=True
+         )
+         self.encoder = nn.TransformerEncoder(encoder_layer, config.model.num_layers)
+         self.layer_norm = nn.LayerNorm(config.model.d_model)
+         self.dropout = nn.Dropout(config.model.dropout)
+         self.mlp = nn.Sequential(
+             nn.Linear(config.model.d_model, config.model.d_model // 2),
+             nn.ReLU(),
+             nn.Dropout(config.model.dropout),
+             nn.Linear(config.model.d_model // 2, 1),
+         )
+
+     # -------# Classifier step #-------- #
+     def forward(self, batch):
+         if 'input_ids' in batch:
+             esm_embeds = self.get_esm_embeddings(batch['input_ids'], batch['attention_mask'])
+         elif 'embeds' in batch:
+             esm_embeds = batch['embeds']
+         encodings = self.encoder(esm_embeds, src_key_padding_mask=(batch['attention_mask'] == 0))
+         encodings = self.dropout(self.layer_norm(encodings))
+         logits = self.mlp(encodings).squeeze(-1)
+         return logits
+
+
+     # -------# Training / Evaluation #-------- #
+     def training_step(self, batch, batch_idx):
+         train_loss, _ = self.compute_loss(batch)
+         self.log(name="train/loss", value=train_loss.item(), on_step=True, on_epoch=False, logger=True, sync_dist=True)
+         self.save_ckpt()
+         return train_loss
+
+     def validation_step(self, batch, batch_idx):
+         val_loss, _ = self.compute_loss(batch)
+         self.log(name="val/loss", value=val_loss.item(), on_step=False, on_epoch=True, logger=True, sync_dist=True)
+         return val_loss
+
+     def test_step(self, batch):
+         test_loss, preds = self.compute_loss(batch)
+         auroc, accuracy = self.get_metrics(batch, preds)
+         self.log(name="test/loss", value=test_loss.item(), on_step=False, on_epoch=True, logger=True, sync_dist=True)
+         self.log(name="test/AUROC", value=auroc.item(), on_step=False, on_epoch=True, logger=True, sync_dist=True)
+         self.log(name="test/accuracy", value=accuracy.item(), on_step=False, on_epoch=True, logger=True, sync_dist=True)
+         return test_loss
+
+     def on_test_epoch_end(self):
+         self.auroc.reset()
+         self.accuracy.reset()
+
+     def optimizer_step(self, *args, **kwargs):
+         super().optimizer_step(*args, **kwargs)
+         gc.collect()
+         torch.cuda.empty_cache()
+
+     def configure_optimizers(self):
+         path = self.config.training
+         optimizer = torch.optim.AdamW(self.parameters(), lr=self.config.optim.lr)
+         lr_scheduler = CosineWarmup(
+             optimizer,
+             warmup_steps=path.warmup_steps,
+             total_steps=path.max_steps,
+         )
+         scheduler_dict = {
+             "scheduler": lr_scheduler,
+             "interval": 'step',
+             'frequency': 1,
+             'monitor': 'val/loss',
+             'name': 'learning_rate'
+         }
+         return [optimizer], [scheduler_dict]
+
+     def save_ckpt(self):
+         curr_step = self.global_step
+         save_every = self.config.training.val_check_interval
+         if curr_step % save_every == 0 and curr_step > 0:  # Save every 250 steps
+             ckpt_path = f"{self.config.checkpointing.save_dir}/step={curr_step}.ckpt"
+             self.trainer.save_checkpoint(ckpt_path)
+
+     # -------# Loss and Test Set Metrics #-------- #
+     @torch.no_grad()
+     def get_esm_embeddings(self, input_ids, attention_mask):
+         outputs = self.esm_model(input_ids=input_ids, attention_mask=attention_mask)
+         embeddings = outputs.last_hidden_state
+         return embeddings
+
+     def compute_loss(self, batch):
+         """Helper method to handle loss calculation"""
+         labels = batch['labels']
+         preds = self.forward(batch)
+         loss = self.loss_fn(preds, labels)
+         loss_mask = (labels != self.config.model.label_pad_value)  # only calculate loss over non-pad tokens
+         loss = (loss * loss_mask).sum() / loss_mask.sum()
+         return loss, preds
+
+     def get_metrics(self, batch, preds):
+         """Helper method to compute metrics"""
+         labels = batch['labels']
+
+         valid_mask = (labels != self.config.model.label_pad_value)
+         labels = labels[valid_mask]
+         preds = preds[valid_mask]
+
+         _print(f"labels {labels.shape}")
+         _print(f"preds {preds.shape}")
+
+         auroc = self.auroc.forward(preds, labels)
+         accuracy = self.accuracy.forward(preds, labels)
+         return auroc, accuracy
+
+     # -------# Helper Functions #-------- #
+     def get_state_dict(self, ckpt_path):
+         """Helper method to load and process a trained model's state dict from saved checkpoint"""
+         def remove_model_prefix(state_dict):
+             # str.replace returns a new string, so build a new dict with the stripped prefix
+             return {
+                 (k[len("model."):] if k.startswith("model.") else k): v
+                 for k, v in state_dict.items()
+             }
+
+         checkpoint = torch.load(ckpt_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
+         state_dict = checkpoint.get("state_dict", checkpoint)
+
+         if any(k.startswith("model.") for k in state_dict.keys()):
+             state_dict = remove_model_prefix(state_dict)
+
+         return state_dict
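Editor's note: compute_loss above is a per-token BCEWithLogitsLoss (reduction='none') averaged only over positions whose label is not label_pad_value. A toy illustration of that masked reduction (values invented):

    import torch
    import torch.nn as nn

    loss_fn = nn.BCEWithLogitsLoss(reduction='none')
    logits = torch.tensor([[2.0, -1.0, 0.5, 0.0]])
    labels = torch.tensor([[1.0, 0.0, -100.0, -100.0]])      # -100 marks padded positions

    per_token = loss_fn(logits, labels)                      # loss at every position, pads included
    mask = (labels != -100)
    loss = (per_token * mask).sum() / mask.sum()             # averaged over the 2 valid positions only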
src/guidance/utils.py ADDED
@@ -0,0 +1,21 @@
+
+
+ import numpy as np
+ from torch.optim.lr_scheduler import _LRScheduler
+
+ class CosineWarmup(_LRScheduler):
+     def __init__(self, optimizer, warmup_steps, total_steps, eta_ratio=0.1, last_epoch=-1):
+         self.warmup_steps = warmup_steps
+         self.total_steps = total_steps
+         self.eta_ratio = eta_ratio  # The ratio of minimum to maximum learning rate
+         super(CosineWarmup, self).__init__(optimizer, last_epoch)
+
+     def get_lr(self):
+         if self.last_epoch < self.warmup_steps:
+             return [base_lr * self.last_epoch / self.warmup_steps for base_lr in self.base_lrs]
+
+         progress = (self.last_epoch - self.warmup_steps) / (self.total_steps - self.warmup_steps)
+         cosine_decay = 0.5 * (1 + np.cos(np.pi * progress))
+         decayed_lr = (1 - self.eta_ratio) * cosine_decay + self.eta_ratio
+
+         return [decayed_lr * base_lr for base_lr in self.base_lrs]
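Editor's note: a short usage sketch for this scheduler (the optimizer, module, and step counts below are placeholders). The learning rate ramps linearly over warmup_steps, then follows a cosine decay toward eta_ratio of the base rate.

    import torch
    from src.guidance.utils import CosineWarmup

    model = torch.nn.Linear(8, 1)                            # stand-in module
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    scheduler = CosineWarmup(optimizer, warmup_steps=150, total_steps=3000)

    for step in range(3000):
        optimizer.step()                                     # backward pass omitted in this sketch
        scheduler.step()                                     # warmup, then cosine decay to eta_ratio * lr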
src/lm/memdlm/dataloader.py ADDED
@@ -0,0 +1,88 @@
+ import torch
+ import pandas as pd
+ import lightning.pytorch as pl
+
+ from transformers import AutoModel, AutoTokenizer
+ from torch.utils.data import Dataset, DataLoader
+
+
+ class MembraneDataset(Dataset):
+     def __init__(self, config, data_path):
+         self.config = config
+         self.data = pd.read_csv(data_path)
+         self.tokenizer = AutoTokenizer.from_pretrained(config.lm.pretrained_evoflow)
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         sequence = self.data.iloc[idx]["Sequence"]
+
+         tokens = self.tokenizer(
+             sequence.upper(),
+             return_tensors='pt',
+             padding='max_length',
+             truncation=True,
+             max_length=self.config.data.max_seq_len
+         )
+
+         #return {"input_ids": tokens['input_ids'], "attention_mask": tokens['attention_mask']}
+
+         return {
+             "input_ids": tokens['input_ids'].squeeze(0),
+             "attention_mask": tokens['attention_mask'].squeeze(0)
+         }
+
+
+ def collate_fn(batch):
+     input_ids = torch.stack([item['input_ids'] for item in batch])  #.squeeze()
+     masks = torch.stack([item['attention_mask'] for item in batch])  #.squeeze()
+
+     return {'input_ids': input_ids, 'attention_mask': masks}
+
+
+ class MembraneDataModule(pl.LightningDataModule):
+     def __init__(self, config, train_dataset, val_dataset, test_dataset, collate_fn=collate_fn):
+         super().__init__()
+         self.train_dataset = train_dataset
+         self.val_dataset = val_dataset
+         self.test_dataset = test_dataset
+         self.collate_fn = collate_fn
+         self.batch_size = config.data.batch_size
+         self.tokenizer = AutoTokenizer.from_pretrained(config.lm.pretrained_evoflow)
+
+     def train_dataloader(self):
+         return DataLoader(self.train_dataset,
+                           batch_size=self.batch_size,
+                           collate_fn=self.collate_fn,
+                           num_workers=8,
+                           pin_memory=True)
+
+     def val_dataloader(self):
+         return DataLoader(self.val_dataset,
+                           batch_size=self.batch_size,
+                           collate_fn=self.collate_fn,
+                           num_workers=8,
+                           shuffle=False,
+                           pin_memory=True)
+
+     def test_dataloader(self):
+         return DataLoader(self.test_dataset,
+                           batch_size=self.batch_size,
+                           collate_fn=self.collate_fn,
+                           num_workers=8,
+                           shuffle=False,
+                           pin_memory=True)
+
+
+ def get_datasets(config):
+     """Helper method to grab datasets to quickly init data module in main.py"""
+     train_dataset = MembraneDataset(config, config.data.train)
+     test_dataset = MembraneDataset(config, config.data.test)
+     val_dataset = MembraneDataset(config, config.data.val)
+
+     return {
+         "train": train_dataset,
+         "val": val_dataset,
+         "test": test_dataset
+     }
src/lm/memdlm/diffusion_module.py ADDED
@@ -0,0 +1,189 @@
+ import os
+ import gc
+ import torch
+
+ import torch.nn.functional as F
+ import lightning as pl
+
+ from typing import Optional
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+ from src.utils.model_utils import _print
+ from src.utils.optimizer_utils import get_optimizer, get_scheduler
+
+
+ class MembraneDiffusion(pl.LightningModule):
+     def __init__(self, config):
+         """
+         Args:
+             config (OmegaConf): config.yaml file with all training parameters
+         """
+         super().__init__()
+         self.config = config
+         self.save_hyperparameters(logger=True)
+
+         self.model = AutoModelForMaskedLM.from_pretrained(config.lm.pretrained_evoflow, trust_remote_code=True)
+         self.tokenizer = AutoTokenizer.from_pretrained(config.lm.pretrained_evoflow)
+
+         self.mask_id = self.tokenizer.mask_token_id
+         self.pad_id = self.tokenizer.pad_token_id
+
+     def forward(self, input_ids, attention_mask, guidance: Optional[bool] = False):
+         """
+         Forward pass through language model.
+
+         Args:
+             - input_ids (torch.Tensor): [B, L], token ids
+             - attention_mask (torch.Tensor): [B, L], pad/non-pad binary mask
+         Returns:
+             - logits (torch.Tensor): [B, L, V], unnormalized model outputs
+         """
+         return self.model(input_ids=input_ids, attention_mask=attention_mask).logits
+
+     # -------# Diffusion #-------- #
+     def step(self, batch):
+         labels = batch['input_ids']
+
+         # Forward diffusion
+         t1 = self.sample_t(labels)  # Sample timestep
+         xt, _ = self.noise_x0(labels, t1, maskable_mask=self.is_maskable(labels))  # Noise sequence
+         logits = self.forward(input_ids=xt, attention_mask=batch['attention_mask'])  # Model logits
+
+         # Loss computation
+         weight = self.get_weight(t1, weight_type=self.config.lm.weight_type)  # RDM uses a weighted cross entropy loss
+         loss_out = self.compute_loss(logits, labels, weight)  # Compute loss and ppl
+
+         self.cleanup()
+         return loss_out['loss'], loss_out['ppl']
+
+     def sample_t(self, labels, rdm_coupling=False):
+         """
+         Sample diffusion timesteps. Non-coupling RDM only uses one timestep (t1).
+         """
+         timesteps = torch.randint(
+             1,
+             self.config.lm.num_diffusion_timesteps + 1,
+             (2 if rdm_coupling else 1) * (labels.size(0),),
+             device=labels.device
+         )
+
+         if rdm_coupling:
+             return timesteps.chunk(2)
+         return timesteps
+
+     def noise_x0(self, x0, t1, maskable_mask):
+         """
+         Apply noise to the initial sequence x0.
+         """
+         u = torch.rand_like(x0, dtype=torch.float)
+         t1_mask = (u < (t1 / self.config.lm.num_diffusion_timesteps)[:, None]) & maskable_mask
+         x_t1 = x0.masked_fill(t1_mask, self.mask_id)
+         return x_t1, t1_mask
+
+     def get_weight(self, t, weight_type):
+         """
+         Compute the weighting factor for the RDM-derived loss (weighted cross-entropy).
+         """
+         num_timesteps = self.config.lm.num_diffusion_timesteps
+         weight = {
+             "linear": (num_timesteps - (t - 1)),  # num_timesteps * (1 - (t-1)/num_timesteps)
+             "constant": num_timesteps * torch.ones_like(t),
+         }[weight_type][:, None].float() / num_timesteps
+         return weight.squeeze()
+
+     def compute_loss(self, logits, labels, weight):
+         """
+         Compute the cross entropy loss per sample.
+         First, compute the per-token loss (with no reduction), then reduce over the sequence length for each sample.
+         Finally, average over the batch.
+
+         Args:
+             logits (torch.Tensor): [B, L, vocab_size], unnormalized model outputs
+             labels (torch.Tensor): [B, L], target labels (padding positions carry the tokenizer pad id and are ignored)
+             weight (torch.Tensor): [B, 1], per-sample weight for loss calculation
+         Returns:
+             dict with 'loss' (batch-averaged weighted loss) and 'ppl' (batch-averaged perplexity)
+         """
+
+         loss_token = F.cross_entropy(
+             logits.view(-1, logits.size(-1)),
+             labels.view(-1),
+             reduction='none',
+             ignore_index=self.pad_id,
+         )
+
+         loss_token = loss_token.view(labels.size(0), labels.size(1))  # Reshape to [B, L]
+         valid_mask = (labels != self.pad_id)
+
+         sample_loss = (loss_token * valid_mask.float()).sum(dim=1) / valid_mask.float().sum(dim=1).clamp(min=1)
+         sample_loss *= weight  # RDM weighting
+         ppl = torch.exp(sample_loss)
+
+         return {'ppl': ppl.mean(), 'loss': sample_loss.mean()}
+
+
+     # -------# Training / Evaluation #-------- #
+     def training_step(self, batch):
+         loss, ppl = self.step(batch)
+         self.log("train/loss", loss.item(), on_step=True, on_epoch=False, prog_bar=True)
+         self.log("train/ppl", ppl.item(), on_step=True, on_epoch=False, prog_bar=False)
+         return loss
+
+     def validation_step(self, batch):
+         loss, ppl = self.step(batch)
+         self.cleanup()
+         self.log("val/loss", loss.item(), on_step=False, on_epoch=True, prog_bar=True, sync_dist=True)
+         self.log("val/ppl", ppl.item(), on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
+         return loss
+
+     def test_step(self, batch):
+         loss, ppl = self.step(batch)
+         self.cleanup()
+         self.log('test/loss', loss.item(), on_step=False, on_epoch=True, prog_bar=True, sync_dist=True)
+         self.log("test/ppl", ppl.item(), on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
+         return loss
+
+
+     # -------# Helper methods #-------- #
+     def is_maskable(self, input_ids: torch.Tensor):
+         return (
+             (input_ids != self.tokenizer.pad_token_id)
+             & (input_ids != self.tokenizer.cls_token_id)
+             & (input_ids != self.tokenizer.eos_token_id)
+         )
+
+     def configure_optimizers(self):
+         """
+         Choosing which optimizer and lr scheduler to use.
+         """
+         optimizer = get_optimizer(self.config, self.model.parameters())
+         lr_scheduler, extra_kwargs = get_scheduler(self.config, optimizer)  # Polynomial scheduler
+         return {
+             "optimizer": optimizer,
+             "lr_scheduler": {"scheduler": lr_scheduler, **extra_kwargs},
+         }
+
+     def validate_config(self):
+         assert os.path.isdir(self.config.checkpointing.save_dir), "invalid checkpointing path"
+         assert self.config.training.mode in ["train", "test", "resume_from_checkpoint"], "invalid mode"
+
+     def get_state_dict(self, ckpt_path):
+         def remove_model_prefix(state_dict):
+             # str.replace returns a new string, so build a new dict with the stripped prefix
+             return {
+                 (k[len("model."):] if k.startswith("model.") else k): v
+                 for k, v in state_dict.items()
+             }
+
+         checkpoint = torch.load(ckpt_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
+         state_dict = checkpoint.get("state_dict", checkpoint)
+
+         if any(k.startswith("model.") for k in state_dict.keys()):
+             state_dict = remove_model_prefix(state_dict)
+
+         return state_dict
+
+     def cleanup(self):
+         torch.cuda.empty_cache()
+         gc.collect()
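Editor's note: as a quick sanity check on the schedule used by noise_x0 and get_weight above, a timestep t drawn from [1, T] masks each maskable token with probability t/T, and the "linear" loss weight is (T - (t - 1)) / T, so heavily-masked samples are down-weighted. A small numeric sketch with the config default T = 500:

    T = 500                                                  # config.lm.num_diffusion_timesteps
    for t in (1, 125, 250, 500):
        mask_prob = t / T
        linear_weight = (T - (t - 1)) / T
        print(f"t={t:3d}  mask_prob={mask_prob:.3f}  weight={linear_weight:.3f}")
    # t=  1  mask_prob=0.002  weight=1.000
    # t=125  mask_prob=0.250  weight=0.752
    # t=250  mask_prob=0.500  weight=0.502
    # t=500  mask_prob=1.000  weight=0.002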
src/lm/memdlm/loss.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # Ignore file
+
+ class RDMCrossEntropyLoss(nn.CrossEntropyLoss):
+     def __init__(self, ignore_index):
+         super().__init__(ignore_index=ignore_index)  # initialize the parent module before setting attributes
+         self.ignore_index = ignore_index
+
+     def forward(self,
+                 scores: torch.Tensor,
+                 target: torch.Tensor,
+                 label_mask,
+                 weights,
+                 ) -> torch.Tensor:
+         """
+         Computes the RDM-derived loss (weighted cross-entropy).
+         """
+
+         sample_size = target.ne(self.ignore_index).float().sum()
+
+         lprobs = F.log_softmax(scores, dim=-1)
+
+         loss = lprobs * weights
+         fullseq_loss = loss.sum() / sample_size
+
+         # use coord masked loss for model training,
+         # ignoring those positions with missing coords (as nan)
+         label_mask = label_mask.float()
+         sample_size = label_mask.sum()  # sample size should be set to valid coordinates
+         loss = (loss * label_mask).sum() / sample_size
+
+         ppl = torch.exp(loss)
+
+         logging_output = {
+             'ppl': ppl.data,
+             'fullseq_loss': fullseq_loss.data,
+             'weight_diff_loss': loss.data
+         }
+
+         return logging_output
src/lm/memdlm/main.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import gc
+ import sys
+ import torch
+ import wandb
+ import torch.nn as nn
+ import lightning.pytorch as pl
+
+ from omegaconf import OmegaConf
+ from lightning.pytorch.strategies import DDPStrategy
+ from lightning.pytorch.loggers import WandbLogger
+ from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
+
+ from src.lm.memdlm.diffusion_module import MembraneDiffusion
+ from src.lm.memdlm.dataloader import MembraneDataModule, get_datasets
+ from src.utils.model_utils import apply_rdm_freezing
+
+ wandb.login(key='2b76a2fa2c1cdfddc5f443602c17b011fefb0a8f')
+
+
+ # Load yaml config
+ config = OmegaConf.load("/scratch/pranamlab/sgoel/MeMDLM_v2/src/configs/lm.yaml")
+
+ # Get datasets
+ datasets = get_datasets(config)
+ data_module = MembraneDataModule(
+     config=config,
+     train_dataset=datasets['train'],
+     val_dataset=datasets['val'],
+     test_dataset=datasets['test'],
+ )
+
+ # Initialize WandB for logging
+ wandb.init(project=config.wandb.project, name=config.wandb.name)
+ wandb_logger = WandbLogger(**config.wandb)
+
+ # PL checkpoints
+ lr_monitor = LearningRateMonitor(logging_interval="step")
+ checkpoint_callback = ModelCheckpoint(
+     monitor="val/loss",
+     save_top_k=1,
+     mode="min",
+     dirpath=config.checkpointing.save_dir,
+     filename="best_model",
+     every_n_train_steps=config.checkpointing.save_every_n_steps
+ )
+
+ # PL trainer
+ trainer = pl.Trainer(
+     max_steps=config.training.max_steps,
+     max_epochs=None,  # Ensure training is based on num steps
+     accelerator="cuda" if torch.cuda.is_available() else "cpu",
+     devices=config.training.devices if config.training.mode == 'train' else [0],
+     strategy=DDPStrategy(find_unused_parameters=True),
+     callbacks=[checkpoint_callback, lr_monitor],
+     logger=wandb_logger,
+     log_every_n_steps=config.training.log_every_n_steps
+ )
+
+
+ # Folder to save checkpoints
+ ckpt_dir = config.checkpointing.save_dir
+ os.makedirs(ckpt_dir, exist_ok=True)
+
+ # PL Model for training
+ diffusion = MembraneDiffusion(config)
+ diffusion.validate_config()
+
+ # Start/resume training or evaluate the model
+ model_type = "evoflow"
+ if config.training.mode == "train":
+     apply_rdm_freezing(diffusion.model, config.training.n_layers, model_type)
+     trainer.fit(diffusion, datamodule=data_module)
+
+ elif config.training.mode == "test":
+     state_dict = diffusion.get_state_dict(config.checkpointing.best_ckpt_path)
+     diffusion.load_state_dict(state_dict)
+     trainer.test(diffusion, datamodule=data_module, ckpt_path=config.checkpointing.best_ckpt_path)
+
+ elif config.training.mode == "resume_from_checkpoint":
+     resume_ckpt = config.checkpointing.resume_ckpt_path  # the resume path is defined under `checkpointing` in lm.yaml
+     state_dict = diffusion.get_state_dict(resume_ckpt)
+     diffusion.load_state_dict(state_dict)
+     apply_rdm_freezing(diffusion.model, config.training.n_layers, model_type)
+     trainer.fit(diffusion, datamodule=data_module, ckpt_path=resume_ckpt)
+
+ wandb.finish()
src/sampling/guided_generator.py ADDED
@@ -0,0 +1,90 @@
+ #!/usr/bin/env python3
+
+ import sys
+ import os
+ import torch
+ import pandas as pd
+ from tqdm import tqdm
+ from datetime import datetime
+ from omegaconf import OmegaConf
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ from src.lm.memdlm.diffusion_module import MembraneFlow
+ from src.utils.model_utils import _print
+ from src.sampling.guided_sampler import GuidedSampler
+ from src.utils.generate_utils import (
+     mask_for_scaffold,
+     calc_blosum_score,
+     calc_ppl
+ )
+
+ config = OmegaConf.load("/home/a03-sgoel/MeMDLM_v2/src/configs/guidance.yaml")
+
+ os.chdir(f'/home/a03-sgoel/MeMDLM_v2/results/infilling/guided/{config.lm.ft_evoflow}/test_set/')
+ todays_date = datetime.today().strftime('%Y-%m-%d')
+ csv_save_path = f'./{todays_date}_boltzmann-soft_new_clf_data_cleaned/'
+ os.makedirs(csv_save_path, exist_ok=True)
+
+
+ def main():
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+     tokenizer = AutoTokenizer.from_pretrained(config.lm.pretrained_esm)
+     esm_model = AutoModelForMaskedLM.from_pretrained(config.lm.pretrained_esm).eval().to(device)
+
+     diffusion = MembraneFlow(config).to(device)
+     state_dict = diffusion.get_state_dict(f"/home/a03-sgoel/MeMDLM_v2/checkpoints/{config.lm.ft_evoflow}/best_model.ckpt")
+     diffusion.load_state_dict(state_dict)
+     diffusion.eval().to(device)
+
+     sampler = GuidedSampler(config, esm_model, tokenizer, diffusion, device)
+
+     df = pd.read_csv('/home/a03-sgoel/MeMDLM_v2/data/classifier/test.csv')
+     sequences = df['Sequence'].tolist()
+
+     gen_seqs, ppls, blosums = [], [], []
+
+
+     for seq in tqdm(sequences, desc='Infilling Sequences'):
+         masked_seq = mask_for_scaffold(seq, generate_type='uppercase', mask_token='<mask>')
+         tokens = tokenizer(masked_seq, return_tensors='pt')
+         input_ids, attn_masks = tokens['input_ids'].to(device), tokens['attention_mask'].to(device)
+
+         soluble_idxs = [i for i in range(len(seq)) if seq[i].isupper()]
+         infilled_tokens = sampler.optimize_sequence(
+             input_ids=input_ids,
+             attn_masks=attn_masks,
+             soluble_indices=soluble_idxs,
+         )
+         infilled_seq = tokenizer.decode(infilled_tokens).replace(" ", "")[5:-5]
+
+         bl = calc_blosum_score(seq.upper(), infilled_seq, soluble_idxs)
+         try:
+             ppl = calc_ppl(esm_model, tokenizer, infilled_seq, [i for i in range(len(seq))], model_type='esm')
+         except Exception:
+             ppl = float('inf')
+
+         gen_seqs.append(infilled_seq)
+         ppls.append(ppl)
+         blosums.append(bl)
+
+         _print(seq)
+         _print(infilled_seq)
+         _print(ppl)
+         _print(bl)
+         _print('\n')
+
+
+     df['MeMDLM Sequence'] = gen_seqs
+     df['MeMDLM PPL'] = ppls
+     df['MeMDLM BLOSUM'] = blosums
+
+     _print(df)
+     df.to_csv(f'./{csv_save_path}/t=0.7_new-data-cleaned_infilled_seqs.csv', index=False)
+
+
+
+ if __name__ == "__main__":
+     main()
src/sampling/guided_sampler.py ADDED
@@ -0,0 +1,291 @@
+ import os
+ import math
+ import torch
+ import torch.nn.functional as F
+
+ from src.utils.model_utils import _print
+ from src.guidance.solubility_module import SolubilityClassifier
+ from src.sampling.unconditional_sampler import UnconditionalSampler
+
+
+ class GuidedSampler:
+     def __init__(self, config, esm_model, tokenizer, diffusion, device):
+         self.config = config
+         self.device = device
+
+         self.esm = esm_model
+         self.memdlm = diffusion
+         self.tokenizer = tokenizer
+         self.uncond_generator = UnconditionalSampler(self.tokenizer, self.memdlm)
+
+         ckpt_path = os.path.join(f"/home/a03-sgoel/MeMDLM_v2/checkpoints/{config.wandb.name}/best_model.ckpt")
+         self.classifier_model = SolubilityClassifier(config)
+         state_dict = self.classifier_model.get_state_dict(ckpt_path)
+         self.classifier_model.load_state_dict(state_dict)
+         self.classifier_model.eval().to(self.device)
+
+         self.top_p = self.config.guidance.top_p
+         self.alpha = self.config.guidance.alpha
+         self.gamma = self.config.guidance.gamma
+         self.saliency_eps = self.config.guidance.saliency_eps
+         self.saliency_t = self.config.guidance.saliency_t
+         self.sampling_t = self.config.guidance.sampling_t
+         self.boltzmann_t = self.config.guidance.boltzmann_t
+
+
+     def embed_sequence(self, input_ids, attention_masks):
+         with torch.no_grad():
+             outs = self.esm(
+                 input_ids=input_ids,
+                 attention_mask=attention_masks,
+                 output_hidden_states=True,
+                 output_attentions=True
+             )
+         embeds = outs.hidden_states[-1]
+         attn_matrix = outs.attentions
+         return embeds, attn_matrix
+
+
+     def sample_from_categorical(self, logits, temperature, noise_scale=1.0):
+         gumbel_noise = -torch.log(-torch.log(torch.rand_like(logits) + 1e-8) + 1e-8)
+         logits = (logits / temperature) + (noise_scale * gumbel_noise)
+
+         log_probs = F.log_softmax(logits, dim=-1)
+         _, tokens = log_probs.max(dim=-1)
+
+         return tokens, log_probs
+
+
+     def denoise_sequence(self, input_ids, attn_masks):
+         """
+         Compute the current and prior sequences' log prob distribution.
+         """
+         has_masks = (input_ids == self.tokenizer.mask_token_id).any()
+
+         # Denoise the sequence if needed
+         if has_masks:
+             xt_prior, logits_prior = self.uncond_generator.sample_unconditional(
+                 xt=input_ids,
+                 num_steps=self.config.guidance.n_steps,
+                 tau=self.sampling_t,
+                 return_logits=True
+             )
+         else:
+             xt_prior = input_ids
+             logits_prior = self.memdlm(input_ids=input_ids, attention_mask=attn_masks)
+
+         # Take the final sampling step
+         _, logits = self.uncond_generator.sample_unconditional(
+             xt=xt_prior,
+             num_steps=1,  # Only need 1 sampling step
+             tau=self.sampling_t,
+             return_logits=True
+         )
+
+         # Get final sequence log probs (always needed)
+         x0, logp_lm = self.sample_from_categorical(logits, temperature=self.sampling_t)
+
+         return x0.squeeze(), logp_lm.squeeze(), logits_prior
+
+
+     def get_prior(self, logits_prior, solubility_logits):
+         if self.config.guidance.prior == "boltzmann":
+             hydrophilic = ["D","E","K","R","N","Q","H","S","T","Y"]
+             hydrophobic = ["L","I","V","F","W","M","A","C","G","P"]
+             amino_acids = hydrophilic + hydrophobic
+
+             tokens = list(self.tokenizer.get_vocab().keys())
+             other = [tok for tok in tokens if tok not in amino_acids]
+
+             hydrophilic_idxs = [self.tokenizer.convert_tokens_to_ids(aa) for aa in hydrophilic]
+             hydrophobic_idxs = [self.tokenizer.convert_tokens_to_ids(aa) for aa in hydrophobic]
+             other_idxs = [self.tokenizer.convert_tokens_to_ids(tok) for tok in other]
+
+             bias = torch.zeros(len(tokens), device=self.device)
+             bias[hydrophilic_idxs] = 1.0
+             bias[hydrophobic_idxs] = -1.0
+             bias[other_idxs] = 0.0
+
+             sol_scores = torch.sigmoid(solubility_logits)
+             token_bias = sol_scores.unsqueeze(-1) * bias
+
+             lm_probs = F.softmax(logits_prior / self.sampling_t, dim=-1)
+             boltz_weight = torch.exp(token_bias / self.boltzmann_t)
+
+             p_prior = lm_probs * boltz_weight
+             p_prior = p_prior / p_prior.sum(dim=-1, keepdim=True)
+             logp_prior = torch.log(p_prior)
+
+         elif self.config.guidance.prior == "lm_probs":
+             _, logp_prior = self.sample_from_categorical(logits_prior, temperature=self.sampling_t)
+
+         return logp_prior.squeeze()
+
+
+     def compute_saliency_map(self, embeds, solubility_logits):
+         """
+         Compute a saliency map as in LaMBO-2 (https://arxiv.org/abs/2305.20009) Eq. 5
+         """
+         # Gradient tracking is already enabled for the embeddings
+         solubility_logits.sum().backward(retain_graph=True)  # Clf gradients wrt hidden states
+         grads = embeds.grad.abs().sum(dim=-1)  # Aggregate across hidden dim. Abs value for magnitude only.
+         saliency = grads.pow(1.0 / self.saliency_t).clamp(min=self.saliency_eps).to(self.device)
+         saliency = (saliency - saliency.min()) / (saliency.max() - saliency.min() + 1e-6)
+         return saliency.squeeze()
+
+
+     def determine_edit_positions(self, saliency_map, soluble_indices, solubility_logits):
+         """
+         Fix the insoluble residues and additional TM residues to
+         maintain membrane-like protein structure.
+         """
+         seq_len = saliency_map.shape[0]
+
+         # Initialize a mask to store the editable token positions
+         edit_mask = torch.ones(seq_len, dtype=torch.bool, device=self.device)
+
+         # Check for any provided soluble residues, otherwise use classifier preds
+         if soluble_indices is not None and len(soluble_indices) > 0:
+             edit_mask[soluble_indices] = False
+         else:
+             solubility_preds = F.sigmoid(solubility_logits)
+             edit_mask[solubility_preds > 0.5] = False
+
+         # Find additional TM residues
+         num_conserved = max(1, int(0.1 * edit_mask.sum()))
+         _, topk_idxs = torch.topk(saliency_map, num_conserved)
+         edit_mask[topk_idxs] = False
+
+         edit_idxs = edit_mask.nonzero(as_tuple=True)[0]
+         return edit_idxs
+
+
+     def create_neighborhood(self, edit_pos, attn_matrix, top_p):
+         """
+         Select a dynamic "neighborhood" of tokens for the edit position via top-p sampling.
+         Attention scores find relevant tokens, avoiding blind updates of the individual token.
+         """
+         # Get the attention scores for the current edit position
+         row = attn_matrix[edit_pos].clone().squeeze()
+         row = row.index_fill(
+             dim=0,
+             index=torch.tensor([0, edit_pos, row.size(0)-1], device=row.device),
+             value=float('-inf')
+         )
+
+         # Top-p (nucleus) sampling of tokens via normed attention scores
+         temp = 1.0 / math.log(row.size(0))  # scale temp with seq len to balance
+         attn_probs = F.softmax(row / temp, dim=0)
+         sorted_probs, sorted_idxs = torch.sort(attn_probs, descending=True)
+         cum_probs = sorted_probs.cumsum(dim=0)
+         cutoff = (cum_probs <= top_p).nonzero(as_tuple=True)[0]
+
+         # Ensure neighborhoods will always have 1 token
+         final_idx = cutoff[-1].item() + 1 if cutoff.numel() > 0 else 1
+         neighborhood = sorted_idxs[:final_idx]
+         return neighborhood
+
+
+     def compute_saliency_weight(self, edit_pos, attn_mat, saliency_map, neighborhood):
+         """
+         Blend the saliency of the neighborhood's tokens and the token at the edit position.
+         """
+         neighborhood_attns = attn_mat[edit_pos, neighborhood]
+         neighborhood_attns /= neighborhood_attns.sum()
+
+         neighborhood_saliencies = saliency_map[neighborhood]
+
+         neighborhood_weight = torch.sum(neighborhood_attns * neighborhood_saliencies)
+         ctxt_aware_saliency = saliency_map[edit_pos] + (self.gamma * neighborhood_weight)
+
+         return ctxt_aware_saliency
+
+
+     def compute_guidance_dist(self, logp_lm, logp_prior, saliency_weight):
+         """
+         Define a guidance distribution between a prior and the current LM probs.
+         Compute the log probs of the "new" (optimized) token.
+         """
+         w = torch.sigmoid(saliency_weight * self.alpha)  # Between [0, 1] to ensure valid probs
+         p_lm = torch.exp(logp_lm)
+         p_prior = torch.exp(logp_prior)
+         mixed_probs = (1 - w) * p_lm + w * p_prior
+         guidance_dist = torch.log(mixed_probs + 1e-12)
+         return guidance_dist
+
+
+     def check_scaffold(self, seq1, seq2, idxs):
+         changed = (seq1[idxs] != seq2[idxs])
+         if changed.any():
+             _print('soluble residues changed')
+         else:
+             _print('no soluble residue changes')
+
+
+     def optimize_sequence(self, input_ids, attn_masks, soluble_indices):
+         _print(f'soluble idx: {soluble_indices}')
+
+         # Initialize token ids, logits, and log probs of sequence
+         x0, logp_lm, logits_prior = self.denoise_sequence(input_ids, attn_masks)
+         _print(f'og tokens: {x0}')
+         _print(f'og tokens: {x0.shape}')
+         _print(f'og log probs: {logp_lm.shape}')
+
+         # Embeddings and attention matrix of current sequence
+         embeds, attn_mats = self.embed_sequence(x0.unsqueeze(0), attn_masks)
+         embeds = embeds.detach().clone().requires_grad_(True)  # enable grad tracking for saliency map
+         attn_matrix = attn_mats[-1].mean(dim=1)[0].squeeze(0)
+
+         # Precompute logits of the classifier to avoid repeated calls
+         batch = {"embeds": embeds, "attention_mask": attn_masks}
+         solubility_logits = self.classifier_model(batch)
+
+         # Create a saliency map to determine optimal edit positions
+         saliency_map = self.compute_saliency_map(embeds, solubility_logits)
+         _print(f'saliency map: {saliency_map}')
+         edit_positions = self.determine_edit_positions(saliency_map, soluble_indices, solubility_logits)
+         _print(f'edit positions: {edit_positions}')
+
+         # Compute the log probs of the prior dist
+         logp_prior = self.get_prior(logits_prior, solubility_logits)
+         _print(f'prior log probs: {logp_prior.shape}')
+
+         # Optimize the insoluble residues
+         for edit_pos in edit_positions.tolist():
+             neighborhood = self.create_neighborhood(
+                 edit_pos,
+                 attn_matrix,
+                 self.top_p
+             )
+             _print(f'neighborhood: {neighborhood}')
+
+             ctxt_aware_saliency = self.compute_saliency_weight(
+                 edit_pos,
+                 attn_matrix,
+                 saliency_map,
+                 neighborhood
+             )
+             _print(f'ctx aware saliency: {ctxt_aware_saliency}')
+
+             logp_lm_prime = self.compute_guidance_dist(
+                 logp_lm[edit_pos],
+                 logp_prior[edit_pos],
+                 ctxt_aware_saliency
+             )
+             logp_lm[edit_pos] = logp_lm_prime
+
+             tot = torch.exp(logp_lm_prime).sum()
+             one = torch.tensor(1.0, dtype=tot.dtype, device=tot.device)
+             assert torch.isclose(tot, one, atol=1e-4), f"Invalid prob distribution. Sum = {tot:5f}"
+
+         # Sample new tokens
+         x0_prime = torch.distributions.Categorical(logits=logp_lm).sample()
+
+         # Check if any soluble residues have been changed
+         self.check_scaffold(x0, x0_prime, soluble_indices)
+
+         # Preserve the initial sequence scaffold by copying over the soluble tokens
+         x0_prime[soluble_indices] = x0[soluble_indices]
+         self.check_scaffold(x0, x0_prime, soluble_indices)
+
+         return x0_prime
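Editor's note: the guidance step in compute_guidance_dist blends the language-model distribution with the prior through a gate w = sigmoid(alpha * saliency); positions with high context-aware saliency lean on the prior, while low-saliency positions keep the LM's prediction. A toy illustration over a 3-token vocabulary (numbers invented):

    import torch

    alpha = 3.0                                              # config.guidance.alpha
    saliency = torch.tensor(0.8)                             # context-aware saliency at one position
    w = torch.sigmoid(alpha * saliency)                      # ~0.92, so the prior dominates here

    p_lm = torch.tensor([0.7, 0.2, 0.1])
    p_prior = torch.tensor([0.1, 0.3, 0.6])
    mixed = (1 - w) * p_lm + w * p_prior                     # still a valid distribution (sums to 1)
    guidance_logp = torch.log(mixed + 1e-12)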
src/sampling/unconditional_generator.py ADDED
@@ -0,0 +1,114 @@
+ #!/usr/bin/env python3
+
+ import sys
+ import os
+
+ import random
+ import torch
+ import pandas as pd
+ import numpy as np
+
+ from tqdm import tqdm
+ from collections import Counter
+ from omegaconf import OmegaConf
+ from datetime import datetime
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ from MeMDLM_v2.src.lm.diffusion_module import MembraneFlow
+ from src.sampling.unconditional_sampler import UnconditionalSampler
+ from src.utils.generate_utils import mask_for_de_novo, calc_ppl
+ from src.utils.model_utils import _print
+
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ os.chdir('/home/a03-sgoel/MeMDLM_v2')
+ config = OmegaConf.load("./src/configs/lm.yaml")
+
+ date = datetime.now().strftime("%Y-%m-%d")
+
+
+
+ def generate_sequence(prior: str, tokenizer, generator, device):
+     input_ids = tokenizer(prior, return_tensors="pt").to(device)['input_ids']
+     ids = generator.sample_unconditional(
+         xt=input_ids,
+         num_steps=config.sampling.n_steps,
+         return_logits=False,
+         banned_token_ids=None
+         #banned_token_ids=[tokenizer.convert_tokens_to_ids("P"), tokenizer.convert_tokens_to_ids("C")]
+     )
+     generated_sequence = tokenizer.decode(ids[0].squeeze())[5:-5].replace(" ", "")  # strip bos/eos tokens & spaces between residues
+     return generated_sequence
+
+
+ def main():
+     csv_save_path = f'./results/denovo/unconditional/{config.wandb.name}/{date}_tau=3.0_test-set_distribution'
+
+     os.makedirs(csv_save_path, exist_ok=True)
+
+
+     tokenizer = AutoTokenizer.from_pretrained(config.lm.pretrained_evoflow)
+
+     flow = MembraneFlow(config).to(device)
+     state_dict = flow.get_state_dict(f"./checkpoints/{config.wandb.name}/best_model.ckpt")
+     flow.load_state_dict(state_dict)
+     flow.eval()
+
+     esm_pth = config.lm.pretrained_esm
+     esm_model = AutoModelForMaskedLM.from_pretrained(esm_pth).to(device)
+     esm_model.eval()
+
+     generator = UnconditionalSampler(tokenizer, flow)
+
+     # # Get random sequence lengths to generate
+     # seq_lengths = [random.randint(50, 250) for _ in range(5000)]
+
+     # # Determine lengths from positive controls
+     # df = pd.read_csv(f'./results/denovo/unconditional/{config.wandb.name}/perin_pos_ctrl/raw_seqs.csv')
+     # seq_lengths = [len(seq) for seq in df['Sequence'].tolist() for _ in range(500)]  # generate each length 500 times
+     # _print(seq_lengths)
+
+     # Determine lengths from test set distribution
+     df = pd.read_csv("./data/test.csv")
+     seq_lengths = [len(seq) for seq in df['Sequence'].tolist()]
+     length_counts = Counter(seq_lengths)  # {L1: freq, L2: freq, ...}
+     total = sum(length_counts.values())  # total number of sequences
+     lengths = np.array(list(length_counts.keys()))  # unique lengths
+     probs = np.array([length_counts[l] / total for l in lengths])  # empirical frequency of each length
+     seq_lengths = np.random.choice(lengths, size=len(seq_lengths), p=probs)
+
+     generation_results = []
+     for seq_len in tqdm(seq_lengths, desc=f"Generating sequences: "):
+         seq_res = []
+
+         masked_seq = mask_for_de_novo(seq_len)  # Sequence of all <mask> tokens
+         gen_seq = ""
+         attempts = 0
+
+         while len(gen_seq) != seq_len and attempts < 3:
+             gen_seq = generate_sequence(masked_seq, tokenizer, generator, device)
+             attempts += 1
+
+         if len(gen_seq) != seq_len:
+             esm_ppl, flow_ppl = None, None
+         else:
+             esm_ppl = calc_ppl(esm_model, tokenizer, gen_seq, [i for i in range(len(gen_seq))], model_type='esm')
+             flow_ppl = calc_ppl(flow, tokenizer, gen_seq, [i for i in range(len(gen_seq))], model_type='flow')
+
+         _print(f'gen seq: {gen_seq}')
+         _print(f'esm ppl: {esm_ppl}')
+         _print(f'flow ppl: {flow_ppl}')
+
+         seq_res.append(gen_seq)
+         seq_res.append(esm_ppl)
+         seq_res.append(flow_ppl)
+
+         generation_results.append(seq_res)
+
+     df = pd.DataFrame(generation_results, columns=['Generated Sequence', 'ESM PPL', 'Flow PPL'])
+     df.to_csv(csv_save_path + "/seqs_with_ppl.csv", index=False)
+
+
+ if __name__ == "__main__":
+     main()
src/sampling/unconditional_sampler.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import torch
3
+ import random
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from src.utils.model_utils import _print
7
+
8
+ class UnconditionalSampler:
9
+ def __init__(self, tokenizer, model):
10
+ self.model = model
11
+ self.tokenizer = tokenizer
12
+
13
+ self.device = self.model.device
14
+ self.mask_id = self.tokenizer.mask_token_id
15
+ self.seed_everything(seed=42)
16
+
17
+ @torch.inference_mode()
18
+ def sample_unconditional(self, xt, num_steps, tau=0.7, kappa_fn=lambda t: t, eta=1, alpha=1., banned_token_ids=None, return_logits=None):
19
+ """
20
+ Stochastic remasking sampling method for iterative refinement of sequences.
21
+
22
+ Args:
23
+ xt (Tensor): Initial token tensor.
24
+ num_steps (int): Number of refinement steps.
25
+ tau (float): Temperature parameter for softmax sampling.
26
+ kappa_fn (callable): Function controlling the unmasking schedule.
27
+ eta (float): Scaling factor for score adjustments.
28
+ alpha (float): Weighting for confidence-based scoring.
29
+
30
+ Returns:
31
+ Tensor: Final sampled sequence tensor.
32
+ """
33
+
34
+ dt = 1 / num_steps
35
+ fix_mask = xt != self.mask_id # tokens to retain
36
+ attention_mask = torch.ones_like(xt).to(self.device)
37
+
38
+ for i in range(1, num_steps + 1):
39
+ kappa_t = kappa_fn(i * dt)
40
+ logits = self.model(input_ids=xt, attention_mask=attention_mask)
41
+ last_mask = xt == self.mask_id # tokens currently masked
42
+ unmask_t = ~last_mask & ~fix_mask # unmasked and not fixed tokens - candidates for masking
43
+
44
+ x0, logp = self.stochastic_sample_from_categorical(logits, tau, banned_token_ids=banned_token_ids) # tokens, logprobs
45
+
46
+ # Confidence-based scoring
47
+ entropy = torch.distributions.Categorical(logits=logits).entropy()
48
+ score = alpha * logp + (1 - alpha) * -entropy # alpha = 1 --> score = logp
49
+ score = score.masked_fill(fix_mask, float('inf'))
50
+
51
+ score[unmask_t] = score[unmask_t] * eta
52
+
53
+ num_to_mask = ((~fix_mask).sum(1, keepdim=True).float() * (1 - kappa_t)).long()
54
+ lowest_k_mask = self.topk_lowest_masking(score, num_to_mask)
55
+
56
+ xt[lowest_k_mask] = self.mask_id
57
+ mask_2_x0 = last_mask & ~lowest_k_mask
58
+ xt[mask_2_x0] = x0[mask_2_x0]
59
+
60
+ # print(f"Step {i}/{num_steps} | eta: {eta}, alpha: {alpha}, Stochastic remask: \n", xt[0])
61
+
62
+ xt[xt == self.mask_id] = x0[xt == self.mask_id]
63
+ return xt, logits if return_logits else xt
64
+
65
+ def stochastic_sample_from_categorical(self, logits, temperature, noise_scale=1.0, banned_token_ids=None):
66
+ """
67
+ Sample from a categorical distribution with optional temperature scaling and Gumbel noise.
68
+ """
69
+ logits = logits.double()
70
+
71
+ if banned_token_ids is not None:
72
+ banned_token_mask = torch.zeros_like(logits, device=logits.device).bool()
73
+ for token_id in banned_token_ids:
74
+ banned_token_mask[..., token_id] = True
75
+ logits = logits.masked_fill(banned_token_mask, float('-inf'))
76
+
77
+ if temperature != 0:
78
+ gumbel_noise = -torch.log(-torch.log(torch.rand_like(logits) + 1e-8) + 1e-8)
79
+ logits = logits / temperature + noise_scale * gumbel_noise
80
+ scores, tokens = logits.log_softmax(dim=-1).max(dim=-1)
81
+
82
+ return tokens, scores
83
+
84
+ def topk_lowest_masking(self, scores, cutoff_len):
85
+ """
86
+ scores: [b, n]
87
+ cutoff_len: [b, 1]
88
+ returns:
89
+ mask: [b, n], with 1 if the token is in top-k lowest scores, 0 otherwise
90
+ """
91
+ sorted_index = scores.sort(-1)[0]
92
+ cutoff = sorted_index.gather(dim=-1, index=cutoff_len)
93
+ return scores < cutoff
94
+
95
+ def seed_everything(self, seed):
96
+ """
97
+ Set the seed for reproducibility across various libraries.
98
+ """
99
+ if seed is None:
100
+ return
101
+ random.seed(seed)
102
+ np.random.seed(seed)
103
+ torch.manual_seed(seed)
104
+ if torch.cuda.is_available():
105
+ torch.cuda.manual_seed(seed)
106
+ torch.cuda.manual_seed_all(seed) # if using multi-GPU
107
+ torch.backends.cudnn.deterministic = True
108
+ torch.backends.cudnn.benchmark = False
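A minimal usage sketch for the sampler above (illustrative, not part of the commit). The sampler expects a model whose forward(input_ids, attention_mask) returns raw logits and that exposes a .device attribute, as the repo's diffusion modules do; the LogitsAdapter below is a hypothetical stand-in that gives an off-the-shelf ESM-2 masked LM that interface.

    import torch
    import torch.nn as nn
    from transformers import AutoTokenizer, AutoModelForMaskedLM
    from src.sampling.unconditional_sampler import UnconditionalSampler

    class LogitsAdapter(nn.Module):
        """Hypothetical wrapper: exposes .device and a forward() that returns raw logits."""
        def __init__(self, lm):
            super().__init__()
            self.lm = lm

        @property
        def device(self):
            return next(self.lm.parameters()).device

        def forward(self, input_ids, attention_mask=None):
            return self.lm(input_ids=input_ids, attention_mask=attention_mask).logits

    name = "facebook/esm2_t33_650M_UR50D"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = LogitsAdapter(AutoModelForMaskedLM.from_pretrained(name)).eval()

    sampler = UnconditionalSampler(tokenizer, model)
    xt = tokenizer("<mask>" * 50, return_tensors="pt")["input_ids"].to(model.device)  # de novo prompt
    tokens, _ = sampler.sample_unconditional(xt, num_steps=128, tau=0.7)
    print(tokenizer.decode(tokens.squeeze()).replace(" ", ""))  # still contains <cls>/<eos>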
src/utils/generate_utils.py ADDED
@@ -0,0 +1,151 @@
1
+ import torch
2
+ import math
3
+ import sys
4
+
5
+ import torch.nn.functional as F
6
+ import pandas as pd
7
+ import numpy as np
8
+
9
+ from omegaconf import OmegaConf
10
+ from transformers import AutoModelForMaskedLM, AutoModel, AutoTokenizer
11
+
12
+ from src.lm.memdlm.diffusion_module import MembraneFlow
13
+ from src.lm.dplm.diffusion_module import DPLM
14
+ from src.utils.model_utils import get_latents, _print
15
+ from src.sampling.unconditional_sampler import UnconditionalSampler
16
+ from src.lm.dplm.unconditional_sampler import UnconditionalSampler as DPLMUnconditionalSampler
17
+
18
+ config = OmegaConf.load("/home/a03-sgoel/MeMDLM_v2/src/configs/lm.yaml")
19
+
20
+ # -------# Masking #-------- #
21
+ def mask_for_de_novo(sequence_length):
22
+ return "<mask>" * sequence_length
23
+
24
+ def mask_for_scaffold(sequence, generate_type, mask_token):
25
+ if generate_type == "uppercase":
26
+ sequence = ''.join([mask_token if residue.isupper() else residue.upper() for residue in sequence])
27
+ elif generate_type == "lowercase":
28
+ sequence = ''.join([mask_token if residue.islower() else residue for residue in sequence])
29
+ return sequence
30
+
31
+
32
+ # -------# Generation #-------- #
33
+ def memflow_infill_uncond(masked_seq, tokenizer, model: MembraneFlow):
34
+ generator = UnconditionalSampler(tokenizer, model) # initialize the generator object
35
+ xt = tokenizer(masked_seq, return_tensors='pt')['input_ids'].to(model.device)
36
+ denoised_tokens = generator.sample_unconditional(xt, config.sampling.n_steps)[0].squeeze()
37
+ generated_sequence = tokenizer.decode(denoised_tokens).replace(" ", "")[5:-5]
38
+ return generated_sequence
39
+
40
+
41
+ def evodiff_infill(motif_seq, tokenizer, model, device, batch_size=1):
42
+ """
43
+ Following the given evodiff example
44
+ https://github.com/microsoft/evodiff/blob/main/examples/evodiff.ipynb
45
+ """
46
+ # Manual masking of infilling sequence
47
+ motif_seq = ''.join(["#" if aa.islower() else aa for aa in motif_seq]) # Mask token is "#" in evodiff tokenizer
48
+ tkns = tokenizer.tokenize([motif_seq])
49
+ sample = torch.as_tensor(tkns).to(device)
50
+
51
+ # Create input motif + scaffold
52
+ loc = torch.arange(0, len(motif_seq)).to(device)[sample==tokenizer.mask_id].cpu().numpy()
53
+ np.random.shuffle(loc)
54
+
55
+ sample = sample.to(device).unsqueeze(0)
56
+ # og_sample = sample.clone()
57
+
58
+ with torch.no_grad():
59
+ for i in loc:
60
+ timestep = torch.tensor([0] * batch_size).to(device) # placeholder but not called in model
61
+ timestep = timestep.to(device)
62
+ prediction = model(sample, timestep)
63
+ p = prediction[:, i, :len(tokenizer.all_aas) - 6] # only canonical
64
+ p = F.softmax(p, dim=1) # softmax over logits
65
+ p_sample = torch.multinomial(p, num_samples=1) # sample from categorical distribution
66
+ sample[:, i] = p_sample.squeeze()
67
+ output = [tokenizer.untokenize(s) for s in sample]
68
+ return output[0] #if batch_size==1 else output, og_sample, loc
69
+
70
+
71
+ def dplm_infill(masked_seq, tokenizer, model: DPLM, device):
72
+ generator = DPLMUnconditionalSampler(tokenizer, model)
73
+ xt = tokenizer(masked_seq, return_tensors='pt')['input_ids'].to(model.device)
74
+ denoised_tokens = generator.sample_unconditional(xt, config.sampling.n_steps)[0].squeeze()
75
+ generated_sequence = tokenizer.decode(denoised_tokens).replace(" ", "")[5:-5]
76
+ return generated_sequence
77
+
78
+
79
+ # -------# Metrics #-------- #
80
+ def calc_progen_ppl(model, tokenizer, target, device, fp16=True):
81
+ """Compute causal LM cross-entropy loss for a given sequence."""
82
+ with torch.no_grad():
83
+ with torch.cuda.amp.autocast(enabled=fp16):
84
+ logits = model(
85
+ input_ids = target,
86
+ attention_mask = torch.ones_like(target)
87
+ ).logits
88
+ # Shift
89
+ logits = logits[:-1, ...]
90
+ target = target[1:]
91
+ loss = torch.nn.functional.cross_entropy(
92
+ input=logits,
93
+ target=target,
94
+ reduction='mean'
95
+ )
96
+ return torch.exp(loss).item()
97
+
98
+
99
+ def calc_ppl(model, tokenizer, generated_sequence, mask_token_indices, model_type):
100
+ total_loss = 0.0
101
+ tensor_input = tokenizer.encode(generated_sequence, return_tensors='pt').to(model.device)
102
+ attn_mask = torch.ones_like(tensor_input).to(model.device)
103
+
104
+ for i in mask_token_indices:
105
+ masked_input = tensor_input.clone()
106
+ masked_input[0, i] = tokenizer.mask_token_id
107
+
108
+ labels = torch.full(tensor_input.shape, -100).to(model.device)
109
+ labels[0, i] = tensor_input[0, i]
110
+
111
+ with torch.no_grad():
112
+ if model_type == 'esm':
113
+ loss = model(masked_input, labels=labels).loss.item()
114
+ elif model_type == 'flow':
115
+ logits = model.forward(masked_input, attention_mask=attn_mask)
116
+ loss = F.cross_entropy(
117
+ logits.view(-1, logits.size(-1)),
118
+ labels.view(-1),
119
+ reduction='none',
120
+ ignore_index=-100,
121
+ )[i].item()
122
+
123
+ total_loss += loss
124
+
125
+ avg_loss = total_loss / max(len(mask_token_indices), 1)  # average over the positions actually scored
126
+ perplexity = math.exp(avg_loss)
127
+
128
+ return perplexity
129
+
130
+
131
+ def calc_blosum_score(og_seq, gen_seq, indices):
132
+ import blosum as bl
133
+ mat = bl.BLOSUM(62)
134
+ tot_score = 0
135
+ for i in indices:
136
+ og_res, gen_res = og_seq[i], gen_seq[i]
137
+ try:
138
+ val = mat[og_res][gen_res]
139
+ tot_score += val
140
+ except KeyError:
141
+ # -4 is the lowest BLOSUM62 score, indicating biological implausibility
142
+ tot_score += -4
143
+ return tot_score / len(indices) if indices else 0
144
+
145
+
146
+ def calc_cos_sim(original_sequence, generated_sequence, tokenizer, esm_model, device):
147
+ og_embeddings = get_latents(esm_model, tokenizer, original_sequence.upper(), device)
148
+ new_embeddings = get_latents(esm_model, tokenizer, generated_sequence, device)
149
+ cosine_sim = torch.nn.functional.cosine_similarity(og_embeddings, new_embeddings, dim=-1)
150
+ cosine_sim = torch.mean(cosine_sim).item()
151
+ return cosine_sim
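A short illustration (not part of the commit) of the scaffold-masking convention and the BLOSUM metric defined above: lowercase residues mark the region to redesign, uppercase residues are the fixed scaffold. The motif string is made up, and calc_blosum_score needs the blosum package installed.

    from src.utils.generate_utils import mask_for_scaffold, calc_blosum_score

    motif = "MKTAYIAKqrqisfvkSHFSRQLEERLGLIEVQ"  # hypothetical scaffold with a lowercase infill region

    # "lowercase" masks the lowercase span for infilling; "uppercase" would mask the capitals instead.
    masked = mask_for_scaffold(motif, "lowercase", mask_token="<mask>")
    print(masked)

    # Score only the redesigned positions of a generated sequence against the original.
    generated = motif.upper()  # stand-in for a real generation of the same length
    infill_idx = [i for i, aa in enumerate(motif) if aa.islower()]
    print(calc_blosum_score(motif.upper(), generated, infill_idx))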
src/utils/model_utils.py ADDED
@@ -0,0 +1,139 @@
1
+ import sys
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
+ def _print(s):
7
+ print(s)
8
+ sys.stdout.flush()
9
+
10
+
11
+ def get_latents(model, tokenizer, sequence, device):
12
+ tokens = tokenizer(sequence, return_tensors="pt").to(device)
13
+ with torch.no_grad():
14
+ outputs = model(**tokens, output_hidden_states=True)  # make sure hidden states are returned
15
+ embeds = outputs.hidden_states[-1].squeeze(0) # Get last hidden states
16
+ return embeds
17
+
18
+
19
+
20
+ # General model freezing
21
+ def freeze_model(model: nn.Module):
22
+ # Disable parameter updates for all layers
23
+ for param in model.parameters():
24
+ param.requires_grad = False
25
+
26
+
27
+
28
+ # For ProGen2 architecture
29
+ def apply_gptj_freezing(model, N_layers):
30
+ def unfreeze_n_layers(model, N_layers):
31
+ # Count number of encoder layers
32
+ model_layers = len(model.transformer.h)
33
+ for i, h in enumerate(model.transformer.h):
34
+ if i >= model_layers - N_layers:
35
+ for module in h.attn.modules():
36
+ for param in module.parameters():
37
+ param.requires_grad = True
38
+
39
+ def check_frozen_model(model, N_layers: int):
40
+ """
41
+ Verify that only the last N_layers of model.transformer.h are unfrozen.
42
+ Source: https://github.com/enijkamp/progen2/blob/main/progen/modeling_progen.py
43
+ """
44
+ model_layers = len(model.transformer.h)
45
+ frozen_layers = 0
46
+ unfrozen_layers = 0
47
+ for i, h in enumerate(model.transformer.h):
48
+ if i >= model_layers - N_layers: # should be unfrozen
49
+ if any(param.requires_grad for param in h.parameters()):
50
+ unfrozen_layers += 1
51
+ else:
52
+ print(f"Layer {i} has all parameters frozen, but it should be unfrozen.")
53
+ else: # should be frozen
54
+ if any(param.requires_grad for param in h.parameters()):
55
+ print(f"Layer {i} is not frozen, but it should be frozen.")
56
+ else:
57
+ frozen_layers += 1
58
+
59
+ assert frozen_layers == model_layers - N_layers and unfrozen_layers == N_layers, \
60
+ f"frozen layers: {frozen_layers}, unfrozen layers: {unfrozen_layers}"
61
+
62
+ print(f"frozen layers: {frozen_layers}, unfrozen layers: {unfrozen_layers}")
63
+
64
+ freeze_model(model)
65
+ unfreeze_n_layers(model, N_layers)
66
+ check_frozen_model(model, N_layers)
67
+
68
+
69
+
70
+
71
+
72
+ # For RDM-based architectures
73
+ def apply_rdm_freezing(model: nn.Module, N_layers: int, model_type: str):
74
+ """
75
+ Freeze all layers except last N for esm-like architectures
76
+
77
+ Args:
78
+ model (nn.Module): model to freeze
79
+ N_layers (int): num encoder layers to unfreeze
80
+ model_type (str): one of {"esm", "evoflow", "dplm"}
81
+ """
82
+
83
+ # choose encoder layers based on the model type
84
+ if model_type == "dplm":
85
+ encoder_layers = model.net.esm.encoder.layer
86
+ elif model_type in ("esm", "evoflow"):
87
+ encoder_layers = model.esm.encoder.layer
88
+ else:
89
+ raise ValueError(f"Unknown model_type: {model_type}")
90
+
91
+ def unfreeze_n_layers(layers, N_layers: int):
92
+ model_layers = len(layers)
93
+ for i, layer in enumerate(layers):
94
+ if i >= model_layers - N_layers:
95
+ for module in layer.attention.self.key.modules():
96
+ for param in module.parameters():
97
+ param.requires_grad = True
98
+ for module in layer.attention.self.query.modules():
99
+ for param in module.parameters():
100
+ param.requires_grad = True
101
+ for module in layer.attention.self.value.modules():
102
+ for param in module.parameters():
103
+ param.requires_grad = True
104
+
105
+ def check_model(layers, N_layers: int):
106
+ model_layers = len(layers)
107
+ frozen_layers = 0
108
+ unfrozen_layers = 0
109
+
110
+ for i, layer in enumerate(layers):
111
+ if i >= model_layers - N_layers:
112
+ layer_frozen = True
113
+ for module in layer.attention.self.key.modules():
114
+ if any(param.requires_grad for param in module.parameters()):
115
+ layer_frozen = False
116
+ for module in layer.attention.self.query.modules():
117
+ if any(param.requires_grad for param in module.parameters()):
118
+ layer_frozen = False
119
+ for module in layer.attention.self.value.modules():
120
+ if any(param.requires_grad for param in module.parameters()):
121
+ layer_frozen = False
122
+
123
+ if layer_frozen:
124
+ print(f"layer {i} has all parameters frozen, but it should be unfrozen.")
125
+ else:
126
+ unfrozen_layers += 1
127
+ else:
128
+ if any(param.requires_grad for param in layer.parameters()):
129
+ print(f"layer {i} is not frozen, but it should")
130
+ else:
131
+ frozen_layers += 1
132
+
133
+ assert (frozen_layers == model_layers - N_layers) and (unfrozen_layers == N_layers), \
134
+ f"frozen layers: {frozen_layers}, unfrozen layers: {unfrozen_layers}"
135
+
136
+
137
+ freeze_model(model)
138
+ unfreeze_n_layers(encoder_layers, N_layers)
139
+ check_model(encoder_layers, N_layers)
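A quick sanity-check sketch (not part of the commit) for the freezing helpers above: it unfreezes the Q/K/V projections of the last 3 encoder layers of the ESM-2 backbone and prints the resulting trainable-parameter fraction.

    from transformers import AutoModelForMaskedLM
    from src.utils.model_utils import apply_rdm_freezing

    model = AutoModelForMaskedLM.from_pretrained("facebook/esm2_t33_650M_UR50D")
    apply_rdm_freezing(model, N_layers=3, model_type="esm")  # asserts the expected freeze pattern

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")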
src/utils/optimizer_utils.py ADDED
@@ -0,0 +1,300 @@
1
+ import torch
2
+ import math
3
+
4
+ from torch.optim import Optimizer
5
+ from torch.optim.lr_scheduler import LambdaLR
6
+ from torch.optim.adamw import adamw
7
+
8
+ try:
9
+ import deepspeed
10
+ from deepspeed.ops.adam import FusedAdam
11
+ from deepspeed.ops.adam import DeepSpeedCPUAdam
12
+ except ImportError:  # DeepSpeed is optional; only needed for FusedAdam
13
+ pass
14
+
15
+
16
+ def get_optimizer(cfg, params):
17
+ if cfg.optim.type == 'adam':
18
+ return torch.optim.Adam(
19
+ params=params,
20
+ lr=cfg.optim.lr,
21
+ weight_decay=cfg.optim.weight_decay,
22
+ betas=(cfg.optim.beta1, cfg.optim.beta2)
23
+ )
24
+ elif cfg.optim.type == 'adamw':
25
+ return AdamW(
26
+ params=params,
27
+ lr=cfg.optim.lr,
28
+ weight_decay=cfg.optim.weight_decay,
29
+ betas=(cfg.optim.beta1, cfg.optim.beta2)
30
+ )
31
+ elif cfg.optim.type == 'fusedadam':
32
+ return FusedAdam(
33
+ params=params,
34
+ lr=cfg.optim.lr,
35
+ weight_decay=cfg.optim.weight_decay,
36
+ betas=(cfg.optim.beta1, cfg.optim.beta2),
37
+ )
38
+ else:
39
+ raise NotImplementedError('Optimizer not supported: %s' % cfg.optim.type)
40
+
41
+
42
+ class AdamW(torch.optim.AdamW):
43
+ @torch.no_grad()
44
+ def step(self, closure=None):
45
+ """Performs a single optimization step.
46
+
47
+ Args:
48
+ closure (callable, optional): A closure that reevaluates the model
49
+ and returns the loss.
50
+ """
51
+ self._cuda_graph_capture_health_check()
52
+
53
+ loss = None
54
+ if closure is not None:
55
+ with torch.enable_grad():
56
+ loss = closure()
57
+
58
+ for group in self.param_groups:
59
+ params_with_grad = []
60
+ grads = []
61
+ exp_avgs = []
62
+ exp_avg_sqs = []
63
+ max_exp_avg_sqs = []
64
+ state_steps = []
65
+ amsgrad = group['amsgrad']
66
+ beta1, beta2 = group['betas']
67
+
68
+ for p in group['params']:
69
+ if p.grad is None:
70
+ continue
71
+ params_with_grad.append(p)
72
+ if p.grad.is_sparse:
73
+ raise RuntimeError('AdamW does not support sparse gradients')
74
+ grads.append(p.grad)
75
+
76
+ state = self.state[p]
77
+
78
+ # State initialization
79
+ if len(state) == 0:
80
+ state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \
81
+ if self.defaults['capturable'] else torch.tensor(0.)
82
+ # Exponential moving average of gradient values
83
+ state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
84
+ # Exponential moving average of squared gradient values
85
+ state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
86
+ if amsgrad:
87
+ # Maintains max of all exp. moving avg. of sq. grad. values
88
+ state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
89
+
90
+ exp_avgs.append(state['exp_avg'])
91
+ exp_avg_sqs.append(state['exp_avg_sq'])
92
+
93
+ if amsgrad:
94
+ max_exp_avg_sqs.append(state['max_exp_avg_sq'])
95
+
96
+ state_steps.append(state['step'].cpu())
97
+
98
+ adamw(params_with_grad,
99
+ grads,
100
+ exp_avgs,
101
+ exp_avg_sqs,
102
+ max_exp_avg_sqs,
103
+ state_steps,
104
+ amsgrad=amsgrad,
105
+ beta1=beta1,
106
+ beta2=beta2,
107
+ lr=group['lr'],
108
+ weight_decay=group['weight_decay'],
109
+ eps=group['eps'],
110
+ maximize=group['maximize'],
111
+ foreach=group['foreach'],
112
+ capturable=group['capturable'])
113
+
114
+ return loss
115
+
116
+ def get_scheduler(cfg, optimizer):
117
+ if cfg.optim.scheduler is None:
118
+ return BlackHole()
119
+ elif cfg.optim.scheduler == 'plateau':
120
+ return (
121
+ torch.optim.lr_scheduler.ReduceLROnPlateau(
122
+ optimizer,
123
+ mode=cfg.mode,
124
+ factor=cfg.factor,
125
+ patience=cfg.patience,
126
+ min_lr=cfg.min_lr,
127
+ ),
128
+ {'monitor': "val/loss", 'interval': 'epoch'}
129
+ )
130
+ elif cfg.optim.scheduler == 'noam':
131
+ return (
132
+ NoamScheduler(
133
+ optimizer,
134
+ lr=cfg.lr,
135
+ warmup_steps=cfg.warmup_steps,
136
+ model_size=cfg.model_size,
137
+ warmup_init_lr=cfg.get('warmup_init_lr')
138
+ ),
139
+ {'frequency': 1, 'interval': 'step'}
140
+ )
141
+ elif cfg.optim.scheduler == 'polynomial':
142
+ return (
143
+ PolyNomialLRScheduler(
144
+ optimizer,
145
+ total_steps=cfg.training.max_steps,
146
+ warmup_steps=cfg.training.warmup_steps,
147
+ lr=cfg.optim.lr,
148
+ lr_end=cfg.optim.lr_end,
149
+ warmup_init_lr=cfg.optim.warmup_init_lr,
150
+ power=cfg.optim.power
151
+ ),
152
+ {'frequency': 1, 'interval': 'step'}
153
+ )
154
+ elif cfg.optim.scheduler == 'multistep':
155
+ return torch.optim.lr_scheduler.MultiStepLR(
156
+ optimizer,
157
+ milestones=cfg.milestones,
158
+ gamma=cfg.gamma,
159
+ )
160
+ elif cfg.optim.scheduler == 'exp':
161
+ return torch.optim.lr_scheduler.ExponentialLR(
162
+ optimizer,
163
+ gamma=cfg.gamma,
164
+ )
165
+ elif cfg.optim.scheduler == 'progen_ft':
166
+ sched = CosineToFrac(
167
+ optimizer=optimizer,
168
+ total_steps=cfg.training.max_steps,
169
+ final_frac=0.2, # decay to lr/5
170
+ )
171
+ return (sched, {'frequency': 1, 'interval': 'step'})
172
+ elif cfg.optim.scheduler is None:  # unreachable: None is already handled by the first branch above
173
+ return BlackHole()
174
+ else:
175
+ raise NotImplementedError('Scheduler not supported: %s' % cfg.optim.scheduler)
176
+
177
+
178
+ class BlackHole(object):
179
+ def __setattr__(self, name, value):
180
+ pass
181
+
182
+ def __call__(self, *args, **kwargs):
183
+ return self
184
+
185
+ def __getattr__(self, name):
186
+ return self
187
+
188
+
189
+ # -------# DPLM Scheduler #-------- #
190
+ def polynomial_lr_schedule(step, total_steps, warmup_steps, warmup_init_lr, lr, lr_end, power):
191
+ if step < warmup_steps:
192
+ return warmup_init_lr + (lr - warmup_init_lr) * step / warmup_steps
193
+ elif step > total_steps:
194
+ return lr_end
195
+ else:
196
+ return lr_end + (lr - lr_end) * (1 - (step - warmup_steps) / (total_steps - warmup_steps)) ** power
197
+
198
+ class PolyNomialLRScheduler(LambdaLR):
199
+ def __init__(
200
+ self,
201
+ optimizer: Optimizer,
202
+ total_steps: int = 1000,
203
+ warmup_steps: int = 0,
204
+ lr: float = 0.00004, # 5e-04,
205
+ lr_end: float = 1e-5, #1e-07,
206
+ warmup_init_lr: float = 1e-07, # 1e-07,
207
+ power: float = 1.0,
208
+ ) -> None:
209
+
210
+ self.warmup_init_lr = warmup_init_lr
211
+ self.warmup_steps = warmup_steps
212
+
213
+ def lr_lambda(step):
214
+ return polynomial_lr_schedule(
215
+ step, total_steps, warmup_steps, warmup_init_lr, lr, lr_end, power
216
+ ) / lr
217
+
218
+ super().__init__(optimizer, lr_lambda=lr_lambda)
219
+
220
+
221
+ # -------# ProGen2 Fine-Tuning Scheduler #-------- #
222
+ def cosine_frac_scheduler(step, total_steps, final_frac):
223
+ s = min(max(step, 0), total_steps)
224
+ cos = 0.5 * (1.0 + math.cos(math.pi * s / total_steps)) # 1 --> 0
225
+ return final_frac + (1.0 - final_frac) * cos # multiplier goes from 1.0 down to final_frac
226
+
227
+ class CosineToFrac(LambdaLR):
228
+ """
229
+ Cosine decay of the LR multiplier from 1.0 -> final_frac over total_steps (no warmup).
230
+ For ProGen fine-tuning, final_frac=0.2 implements decay to lr/5.
231
+ """
232
+ def __init__(self, optimizer, total_steps, final_frac=0.2):
233
+ self.total_steps = max(int(total_steps), 1)
234
+ self.final_frac = float(final_frac)
235
+
236
+ def lr_lambda(step):
237
+ return cosine_frac_scheduler(
238
+ step=step,
239
+ total_steps=self.total_steps,
240
+ final_frac=self.final_frac
241
+ )
242
+
243
+ super().__init__(optimizer, lr_lambda=lr_lambda)
244
+
245
+
246
+
247
+ def inverse_sqrt_lr_schedule(step, warmup_steps, warmup_init_lr, lr_step, decay_step):
248
+ if step == 0:
249
+ step = 1
250
+ if step < warmup_steps:
251
+ return warmup_init_lr + lr_step * step
252
+ else:
253
+ return decay_step * step ** -0.5
254
+
255
+
256
+ class InverseSqrtLRScheduler(LambdaLR):
257
+ def __init__(
258
+ self,
259
+ optimizer: Optimizer,
260
+ warmup_steps: int = 0,
261
+ lr: float = 5e-04,
262
+ warmup_init_lr: float = 1e-07,
263
+ ) -> None:
264
+
265
+ self.warmup_init_lr = warmup_init_lr
266
+ self.warmup_steps = warmup_steps
267
+ self.lr_step = (lr - warmup_init_lr) / warmup_steps
268
+ self.decay_step = lr * warmup_steps ** 0.5
269
+
270
+ def lr_lambda(step):
271
+ return inverse_sqrt_lr_schedule(
272
+ step, warmup_steps, warmup_init_lr, self.lr_step, self.decay_step
273
+ ) / lr
274
+
275
+ super().__init__(optimizer, lr_lambda=lr_lambda)
276
+
277
+
278
+ def noam_lr_schedule(step, warmup_steps, factor, model_size):
279
+ if step == 0:
280
+ step = 1
281
+ return factor * (model_size ** (-0.5) * min(step ** (-0.5), step * warmup_steps ** (-1.5)))
282
+
283
+
284
+ class NoamScheduler(LambdaLR):
285
+ def __init__(
286
+ self,
287
+ optimizer: Optimizer,
288
+ lr,
289
+ warmup_init_lr,
290
+ model_size: int = 128,
291
+ warmup_steps: int = 0,
292
+ factor: int = 2,
293
+ ) -> None:
294
+
295
+ # dummy_lr = self.base_lrs[0]
296
+ def lr_lambda(step):
297
+ return noam_lr_schedule(step, warmup_steps, factor, model_size) / lr
298
+
299
+ super().__init__(optimizer, lr_lambda=lr_lambda)
300
+
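To make the warmup-then-decay shape concrete, a small sketch (not part of the commit) that drives PolyNomialLRScheduler on a toy parameter group with illustrative hyperparameters and prints the learning rate at a few milestones.

    import torch
    from src.utils.optimizer_utils import PolyNomialLRScheduler

    param = torch.nn.Parameter(torch.zeros(1))
    opt = torch.optim.AdamW([param], lr=3e-5)
    sched = PolyNomialLRScheduler(
        opt,
        total_steps=3000,
        warmup_steps=150,
        lr=3e-5,
        lr_end=1e-5,
        warmup_init_lr=1e-7,
        power=1.0,
    )

    for step in range(3001):
        opt.step()
        sched.step()
        if step in (0, 149, 1500, 3000):
            # ramps up over warmup, peaks at lr, then decays linearly to lr_end
            print(step, f"{sched.get_last_lr()[0]:.2e}")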