zyc4975matholic committed on Sep 30

Commit

303c2e0

1 Parent(s): 63d28a7

Include DNA training code

Files changed (34) hide show

tr2d2-dna/README.md +49 -0
tr2d2-dna/configs_gosai/callbacks/checkpoint_every_n_steps.yaml +8 -0
tr2d2-dna/configs_gosai/callbacks/checkpoint_monitor.yaml +10 -0
tr2d2-dna/configs_gosai/callbacks/learning_rate_monitor.yaml +3 -0
tr2d2-dna/configs_gosai/config_gosai.yaml +109 -0
tr2d2-dna/configs_gosai/lr_scheduler/constant_warmup.yaml +2 -0
tr2d2-dna/configs_gosai/lr_scheduler/cosine_decay_warmup.yaml +7 -0
tr2d2-dna/configs_gosai/model/dnaconv.yaml +12 -0
tr2d2-dna/configs_gosai/noise/ar.yaml +2 -0
tr2d2-dna/configs_gosai/noise/cosine.yaml +1 -0
tr2d2-dna/configs_gosai/noise/geometric.yaml +3 -0
tr2d2-dna/configs_gosai/noise/linear.yaml +3 -0
tr2d2-dna/configs_gosai/noise/loglinear.yaml +3 -0
tr2d2-dna/configs_gosai/noise/polynomial.yaml +5 -0
tr2d2-dna/configs_gosai/strategy/ddp.yaml +2 -0
tr2d2-dna/configs_gosai/strategy/fsdp.yaml +3 -0
tr2d2-dna/dataloader_gosai.py +211 -0
tr2d2-dna/diffusion.py +1604 -0
tr2d2-dna/diffusion_gosai_cfg.py +729 -0
tr2d2-dna/env.sh +20 -0
tr2d2-dna/eval_runs_batch.py +347 -0
tr2d2-dna/eval_utils.py +29 -0
tr2d2-dna/finetune.py +149 -0
tr2d2-dna/finetune_dna.py +113 -0
tr2d2-dna/finetune_utils.py +147 -0
tr2d2-dna/mcts.py +581 -0
tr2d2-dna/models/__init__.py +2 -0
tr2d2-dna/models/dnaconv.py +121 -0
tr2d2-dna/models/ema.py +97 -0
tr2d2-dna/noise_schedule.py +151 -0
tr2d2-dna/oracle.py +344 -0
tr2d2-dna/run_batch_eval.sh +30 -0
tr2d2-dna/train.sh +51 -0
tr2d2-dna/utils.py +175 -0

tr2d2-dna/README.md ADDED Viewed

	@@ -0,0 +1,49 @@

+# TR2-D2 For Enhancer DNA Design
+This part of the code fine-tunes DNA sequence models to optimize DNA enhancer activity with TR2-D2.
+The codebase is built upon [MDLM (Sahoo et al., 2023)](https://github.com/kuleshov-group/mdlm), [DRAKES (Wang et al., 2024)](https://github.com/ChenyuWang-Monica/DRAKES), [SEPO (Zekri et al., 2025)](https://github.com/ozekri/SEPO/tree/main), and [MDNS (Zhu et al., 2025)](https://arxiv.org/abs/2508.10684).
+## Environment Installation
+```
+conda create -n tr2d2-dna python=3.9.18
+conda activate tr2d2-dna
+bash env.sh
+```
+## Model Pretrained Weights Download
+All data and model weights can be downloaded from the link below, which is provided by the [DRAKES](https://arxiv.org/abs/2410.13643) author. Save the downloaded file in `$BASE_PATH`.
+https://www.dropbox.com/scl/fi/zi6egfppp0o78gr0tmbb1/DRAKES_data.zip?rlkey=yf7w0pm64tlypwsewqc01wmfq&st=xe8dzn8k&dl=0
+To download from a terminal, use
+```
+curl -L -o dna.zip "https://www.dropbox.com/scl/fi/zi6egfppp0o78gr0tmbb1/DRAKES_data.zip?rlkey=yf7w0pm64tlypwsewqc01wmfq&st=xe8dzn8k&dl=0"
+unzip dna.zip
+```
+## Finetune with TR2-D2
+After downloading the pretrained checkpoints, fill in the `base_path` in `dataloader_gosai.py`, `oracle.py`, and `train.sh`. Fill in `HOME_LOC` and `SAVE_PATH` in `train.sh` as well.
+Reproduce the DNA experiments with $\alpha = 0.1$ using
+```
+sbatch train.sh
+```
+## Evaluate saved checkpoints
+The checkpoints will be saved to `SAVE_PATH`.
+Fill in `RUNS_DIR` in `run_batch_eval.sh` to be the same as `SAVE_PATH`. The checkpoints can be evaluated with
+```
+sbatch run_batch_eval.sh
+```

tr2d2-dna/configs_gosai/callbacks/checkpoint_every_n_steps.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+checkpoint_every_n_steps:
+  _target_: lightning.pytorch.callbacks.ModelCheckpoint
+  save_top_k: -1 # -1 keeps every saved checkpoint rather than only the k "best" ones; this callback is being used to save every n train steps
+  save_last: True # save model as ${save_dir}/checkpoints/last.ckpt
+  dirpath: ${checkpointing.save_dir}/checkpoints
+  verbose: True
+  auto_insert_metric_name: False
+  every_n_train_steps: 500

tr2d2-dna/configs_gosai/callbacks/checkpoint_monitor.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+checkpoint_monitor:
+  _target_: lightning.pytorch.callbacks.ModelCheckpoint
+  monitor: val/nll # name of the logged metric which determines when model is improving
+  mode: min # can be "max" or "min"
+  save_top_k: 1 # save k best models (determined by above metric)
+  save_last: False # True = additionally always save model from last epoch
+  dirpath: ${checkpointing.save_dir}/checkpoints
+  filename: best
+  auto_insert_metric_name: False
+  verbose: True

tr2d2-dna/configs_gosai/callbacks/learning_rate_monitor.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+learning_rate_monitor:
+  _target_: lightning.pytorch.callbacks.LearningRateMonitor
+  logging_interval: step

tr2d2-dna/configs_gosai/config_gosai.yaml ADDED Viewed

	@@ -0,0 +1,109 @@

+defaults:
+  - _self_
+  - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
+  - /model: dnaconv
+  - /strategy: ddp
+  - /noise: loglinear
+  - /lr_scheduler: constant_warmup
+mode: train
+diffusion: absorbing_state
+backbone: cnn
+parameterization: subs
+time_conditioning: False
+T: 0  # 0 (continuous time) / 1000
+subs_masking: False
+debug_mode: False
+seed: 1
+data:
+  streaming: False
+loader:
+  global_batch_size: 512
+  eval_global_batch_size: ${.global_batch_size}
+  # Note: batch_size and eval_batch_size are **per device** (global batch size divided by devices * num_nodes)
+  batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+  eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+  num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
+  pin_memory: True
+sampling:
+  predictor: ddpm
+  steps: 128
+  noise_removal: True
+  num_sample_batches: 2  # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
+  num_sample_log: 2
+  semi_ar: False
+  stride_length: 1
+  num_strides: 1
+training:
+  ema: 0.9999
+  antithetic_sampling: True
+  importance_sampling: False
+  sampling_eps: 1e-3
+  change_of_variables: False
+eval:
+  checkpoint_path: ''  # Used to evaluate a checkpoint after training.
+  disable_ema: False
+  compute_generative_perplexity: True # False
+  perplexity_batch_size: 8
+  compute_perplexity_on_sanity: False
+  gen_ppl_eval_model_name_or_path: gpt2-large  # gpt2-large, meta-llama/Llama-2-7b-hf
+  generate_samples: True
+  subset_size: 5000
+optim:
+  weight_decay: 0
+  lr: 3e-4
+  beta1: 0.9
+  beta2: 0.999
+  eps: 1e-8
+trainer:
+  _target_: lightning.Trainer
+  accelerator: cuda
+  num_nodes: 1
+  devices: ${device_count:}
+  accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
+  gradient_clip_val: 1.0
+  precision: 'bf16'
+  num_sanity_val_steps: 2
+  max_steps: 131500 # 100 epochs
+  log_every_n_steps: 10
+  limit_train_batches: 1.0   # train on full dataset, can be used to toggle quick run
+  limit_val_batches: 1.0     # validate on full dataset, can be used to toggle quick run
+  val_check_interval: 1000
+wandb:
+  project: gosai-dna
+  notes: null
+  group: null
+  job_type: null
+  name: null
+  id: ${uuid:}
+  tags:
+    - ${noise.type}
+hydra:
+  run:
+    dir: ${now:%Y.%m.%d}/${now:%H%M%S}
+  job:
+    chdir: true
+checkpointing:
+  # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
+  save_dir: ${cwd:}
+  # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
+  resume_from_ckpt: true
+  resume_ckpt_path: ${.save_dir}/checkpoints/last.ckpt
+finetuning:
+  gumbel_softmax_temp: 1.0
+  truncate_steps: 3
+mcts:
+  sampling: 0 # 0: gumbel noise, >0 top-k sampling

tr2d2-dna/configs_gosai/lr_scheduler/constant_warmup.yaml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ _target_: transformers.get_constant_schedule_with_warmup
2	+ num_warmup_steps: 2500

tr2d2-dna/configs_gosai/lr_scheduler/cosine_decay_warmup.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+_target_: utils.CosineDecayWarmupLRScheduler
+t_in_epochs: False
+t_initial: ${eval:${trainer.max_steps}-${.warmup_t}}
+warmup_prefix: True
+warmup_lr_init: 1e-6
+warmup_t: ${eval:0.1*${trainer.max_steps}}
+lr_min: 1e-6

tr2d2-dna/configs_gosai/model/dnaconv.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+name: dnaconv
+type: cnn
+length: 200 # for gosai
+hidden_dim: 128
+num_cnn_stacks: 4
+dropout: 0.0
+clean_data: False
+cls_free_guidance: False
+cls_free_threshold: 2.52
+cls_free_prob: 0.3
+cls_free_weight: 0.3 # weight in sampling

tr2d2-dna/configs_gosai/noise/ar.yaml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ type: ar
2	+ scale: 6.0

tr2d2-dna/configs_gosai/noise/cosine.yaml ADDED Viewed

	@@ -0,0 +1 @@


1	+ type: cosine

tr2d2-dna/configs_gosai/noise/geometric.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+type: geometric
+sigma_min: 1e-4
+sigma_max: 20

tr2d2-dna/configs_gosai/noise/linear.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+type: linear
+sigma_min: 1e-3
+sigma_max: 7.0

tr2d2-dna/configs_gosai/noise/loglinear.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+type: loglinear
+sigma_min: 1e-4
+sigma_max: 20

tr2d2-dna/configs_gosai/noise/polynomial.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+type: polynomial
+a: -3
+b: 5
+c: -4
+eps: 1e-3

tr2d2-dna/configs_gosai/strategy/ddp.yaml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ _target_: lightning.pytorch.strategies.DDPStrategy
2	+ find_unused_parameters: false # TODO(yair): this seems hacky, I think if things are correct we shouldn't need this

tr2d2-dna/configs_gosai/strategy/fsdp.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+# TODO(yair): Currently not compatible with grad clipping
+_target_: lightning.pytorch.strategies.FSDPStrategy
+sharding_strategy: SHARD_GRAD_OP

tr2d2-dna/dataloader_gosai.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import torch
+import pandas as pd
+import typing
+import math
+import utils
+import numpy as np
+import os
+# Root directory of the downloaded data / checkpoints; GosaiDataset joins it with a relative CSV path.
+base_path = "" # Fill in directory of the pretrained checkpoints, e.g., "...../data_and_model/"
+LOGGER = utils.get_logger(__name__)
+# Nucleotide character -> integer token id.
+DNA_ALPHABET = {'A': 0, 'C': 1, 'G': 2, 'T': 3} #, 'M': 4}
+# Inverse mapping: integer token id -> nucleotide character.
+INDEX_TO_DNA = {v: k for k, v in DNA_ALPHABET.items()}
+# Array form of INDEX_TO_DNA (position i holds the character for id i), used to decode id arrays by indexing.
+lookup_array = np.array([INDEX_TO_DNA[i] for i in range(len(INDEX_TO_DNA))])
+def dna_detokenize(seq):
+  return ''.join([list(DNA_ALPHABET.keys())[int(i)] for i in seq])
+def batch_dna_detokenize(batch_seq):
+    """
+    batch_seq: numpy array of shape [batch_size, seq_len]
+    return: list of strings
+    """
+    detokenized_batch = lookup_array[batch_seq]
+    detokenized_batch = [''.join(seq) for seq in detokenized_batch]
+    return detokenized_batch
+def dna_tokenize(seq):
+  return [DNA_ALPHABET[c] for c in seq]
+def batch_dna_tokenize(batch_seq):
+    """
+    batch_seq: list of strings
+    return: numpy array of shape [batch_size, seq_len]
+    """
+    tokenized_batch = np.array([[DNA_ALPHABET[c] for c in seq] for seq in batch_seq])
+    return tokenized_batch
+class GosaiDataset(torch.utils.data.Dataset):
+    def __init__(self):
+        data_df = pd.read_csv(os.path.join(base_path, f'mdlm/gosai_data/processed_data/gosai_all.csv'))
+        self.seqs = torch.tensor(data_df['seq'].apply(lambda x: [DNA_ALPHABET[c] for c in x]).tolist())
+        self.clss = torch.tensor(data_df[['hepg2', 'k562', 'sknsh']].to_numpy())
+        LOGGER.info(f'Loaded data: seqs shape: {self.seqs.shape}, clss shape: {self.clss.shape}')
+    def __len__(self):
+        return len(self.seqs)
+    def __getitem__(self, idx):
+        return {'seqs': self.seqs[idx], 'clss': self.clss[idx], 'attention_mask': torch.ones(len(self.seqs[idx]))}
+def get_datasets_gosai():
+  return GosaiDataset()
+def get_dataloaders_gosai(config, skip_valid=False, valid_seed=None):
+  num_gpus = torch.cuda.device_count()
+  if config.loader.global_batch_size % (
+    num_gpus * config.trainer.accumulate_grad_batches) != 0:
+    raise ValueError(
+      f'Train Batch Size {config.training.batch_size}'
+      f'not divisible by {num_gpus} gpus with accumulation '
+      f'{config.trainer.accumulate_grad_batches}.')
+  if config.loader.eval_global_batch_size % num_gpus != 0:
+    raise ValueError(
+      f'Eval Batch Size for {config.eval.batch_size} '
+      f'not divisible by {num_gpus}.')
+  train_set = GosaiDataset()
+  # randomly sample a subset of the train_set as valid_set
+  valid_set = torch.utils.data.Subset(train_set, np.random.choice(len(train_set), 40000, replace=False))
+  test_set = torch.utils.data.Subset(train_set, np.random.choice(len(train_set), 40000, replace=False))
+  train_loader = torch.utils.data.DataLoader(
+    train_set,
+    batch_size=config.loader.batch_size,
+    num_workers=config.loader.num_workers,
+    pin_memory=config.loader.pin_memory,
+    shuffle=not config.data.streaming,
+    persistent_workers=True)
+  if skip_valid:
+    valid_loader = None
+    test_loader = None
+  else:
+    if valid_seed is None:
+      shuffle_valid = False
+      generator = None
+    else:
+      shuffle_valid = True
+      generator = torch.Generator().manual_seed(valid_seed)
+    valid_loader = torch.utils.data.DataLoader(
+      valid_set,
+      batch_size=config.loader.eval_batch_size,
+      num_workers=config.loader.num_workers,
+      pin_memory=config.loader.pin_memory,
+      shuffle=shuffle_valid,
+      generator=generator)
+    test_loader = torch.utils.data.DataLoader(
+      test_set,
+      batch_size=config.loader.eval_batch_size,
+      num_workers=config.loader.num_workers,
+      pin_memory=config.loader.pin_memory,
+      shuffle=shuffle_valid,
+      generator=generator)
+  return train_loader, valid_loader, test_loader
+# Samplers adapted from: https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/fault_tolerant_sampler.py
+class RandomFaultTolerantSampler(torch.utils.data.RandomSampler):
+  def __init__(self, *args, generator=None, **kwargs):
+    # TD [2022-07-17]: We don't force the seed to be zero. We generate random seed,
+    # which should be reproducible if pl.seed_everything was called beforehand.
+    # This means that changing the seed of the experiment will also change the
+    # sampling order.
+    if generator is None:
+      seed = int(torch.empty((), dtype=torch.int64).random_().item())
+      generator = torch.Generator().manual_seed(seed)
+    kwargs.pop('shuffle', None)
+    super().__init__(*args, generator=generator, **kwargs)
+    self.counter = 0
+    self.restarting = False
+  def state_dict(self):
+    return {'random_state': self.generator.get_state(),
+            'counter': self.counter}
+  def load_state_dict(self, state_dict):
+    self.generator.set_state(state_dict.get('random_state'))
+    self.counter = state_dict['counter']
+    # self.start_counter = self.counter
+    self.restarting = True
+  # TD [2022-08-28] Setting the len will cause PL to think there are only a few batches left per
+  # epoch, and subsequent epoch will have very few batches.
+  def __iter__(self) -> typing.Iterator[int]:
+    n = len(self.data_source)
+    self.state = self.generator.get_state()
+    indices = torch.randperm(n, generator=self.generator).tolist()
+    if not self.restarting:
+      self.counter = 0
+    else:
+      indices = indices[self.counter:]
+      self.restarting = False
+    for index in indices:
+      self.counter += 1
+      yield index
+    self.counter = 0
+class FaultTolerantDistributedSampler(torch.utils.data.DistributedSampler):
+  def __init__(self, *args, **kwargs):
+    super().__init__(*args, **kwargs)
+    self.counter = 0
+    self.restarting = False
+  def state_dict(self):
+    return {'epoch': self.epoch, 'counter': self.counter}
+  def load_state_dict(self, state_dict):
+    self.epoch = state_dict['epoch']
+    self.counter = state_dict['counter']
+    self.restarting = True
+  # TD [2022-08-28] Setting the len will cause PL to think there are only a few batches left per
+  # epoch, and subsequent epoch will have very few batches.
+  def __iter__(self):
+    if self.shuffle:
+      # deterministically shuffle based on epoch and seed
+      g = torch.Generator()
+      g.manual_seed(self.seed + self.epoch)
+      indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
+    else:
+      indices = list(range(len(self.dataset)))  # type: ignore[arg-type]
+    if not self.drop_last:
+      # add extra samples to make it evenly divisible
+      padding_size = self.total_size - len(indices)
+      if padding_size <= len(indices):
+        indices += indices[:padding_size]
+      else:
+        indices += (indices * math.ceil(
+          padding_size / len(indices)))[:padding_size]
+    else:
+      # remove tail of data to make it evenly divisible.
+      indices = indices[:self.total_size]
+    assert len(indices) == self.total_size
+    # subsample
+    indices = indices[self.rank:self.total_size:self.num_replicas]
+    assert len(indices) == self.num_samples
+    if not self.restarting:
+      self.counter = 0
+    else:
+      indices = indices[self.counter:]
+      self.restarting = False
+    for index in indices:
+      self.counter += 1
+      yield index
+    self.counter = 0

tr2d2-dna/diffusion.py ADDED Viewed

	@@ -0,0 +1,1604 @@

+import itertools
+import math
+from dataclasses import dataclass
+import hydra.utils
+import lightning as L
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchmetrics
+from torch import Tensor
+import dataloader_gosai
+import models
+import noise_schedule
+import utils
+import oracle
+from scipy.stats import wasserstein_distance, pearsonr
+from finetune_utils import to_one_hot
+# Natural log of 2; BPD.compute divides by it to express its result in bits.
+LOG2 = math.log(2)
+LOGGER = utils.get_logger(__name__)
+def _sample_categorical(categorical_probs):
+    gumbel_norm = (
+        1e-10
+        - (torch.rand_like(categorical_probs) + 1e-10).log())
+    return (categorical_probs / gumbel_norm).argmax(dim=-1).to(dtype=torch.long)
+def _sample_categorical_gradient(categorical_probs, temp = 1.0):
+    gumbel_norm = (
+        1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log())
+    output = torch.nn.functional.softmax((torch.log(categorical_probs)-torch.log(gumbel_norm))/temp, 2)
+    return output
+def _unsqueeze(x, reference):
+    return x.view(
+        * x.shape,
+        * ((1,) * (len(reference.shape) - len(x.shape))))
+def sample_batched_categorical(categorical_probs, batch_size):
+    """
+    Samples `batch_size` sequences from categorical probabilities by adding
+    Gumbel noise to the log-probabilities and taking the argmax per position
+    Args:
+        categorical_probs (torch.Tensor): 3-D tensor whose last two dims are
+                                          (sequence_length, vocab_size); its leading dim must
+                                          broadcast against `batch_size` (e.g. be 1)
+        batch_size (int): number of sequences to sample
+    Returns:
+        torch.Tensor: long tensor with last dim sequence_length, where each row is a
+                      sequence of sampled category indices.
+    """
+    _, sequence_length, vocab_size = categorical_probs.shape
+    # draw Gumbel noise for `batch_size` sequences (sampled on CPU, then moved to the input's device)
+    gumbel_noise = (-torch.log(-torch.log(torch.rand(batch_size, sequence_length, vocab_size) + 1e-10) + 1e-10)).to(categorical_probs.device)
+    noisy_scores = torch.log(categorical_probs) + gumbel_noise  # add Gumbel noise to log probabilities
+    # select the highest score (most likely category after Gumbel noise)
+    sampled_sequences = noisy_scores.argmax(dim=-1).to(dtype=torch.long)  # shape: (batch_size, sequence_length) when the leading dim broadcasts
+    return sampled_sequences
+def sample_batched_top_k(categorical_probs, batch_size, k):
+    """
+    Generates `m` sequences sampled from the top-k probabilities of each token
+    using Gumbel noise to ensure randomness and reduce bias towards the most likely options.
+    Args:
+        categorical_probs (torch.Tensor): A tensor of shape (sequence_length, vocab_length)
+                                          representing categorical probabilities.
+        m (int): Number of sequences to sample.
+        k (int): Number of top probabilities to consider for sampling.
+    Returns:
+        torch.Tensor: A tensor of shape (m, sequence_length), where each row is a
+                      sampled sequence of category indices.
+    """
+    _, sequence_length, vocab_length = categorical_probs.shape
+    # Add Gumbel noise to the log probabilities
+    gumbel_noise = -torch.log(-torch.log(torch.rand(batch_size, sequence_length, vocab_length) + 1e-10) + 1e-10).to(categorical_probs.device)
+    noisy_scores = torch.log(categorical_probs[None, :, :]) + gumbel_noise  # Shape: (m, sequence_length, vocab_length)
+    # Get the top-k categories based on noisy scores
+    top_k_scores, top_k_indices = torch.topk(noisy_scores, k, dim=-1)  # Shape: (m, sequence_length, k)
+    # Convert top-k scores back to probabilities and normalize
+    top_k_probs = torch.softmax(top_k_scores, dim=-1).to(categorical_probs.device)  # Shape: (m, sequence_length, k)
+    # Sample randomly from the top-k probabilities
+    sampled_indices_in_top_k = torch.multinomial(top_k_probs.reshape(-1, k), num_samples=1).squeeze(-1).to(categorical_probs.device)
+    sampled_indices_in_top_k = sampled_indices_in_top_k.view(batch_size, sequence_length).to(categorical_probs.device)  # Shape: (batch_size, sequence_length)
+    # Map sampled indices back to the original vocabulary indices
+    sampled_sequences = torch.gather(top_k_indices, -1, sampled_indices_in_top_k.unsqueeze(-1)).squeeze(-1).to(categorical_probs.device).to(dtype=torch.long)
+    return sampled_sequences
+@dataclass
+class Loss:
+    """Bundle of the values produced by one loss computation."""
+    # Returned by `_compute_loss` as the loss; `training_step` calls `.item()` on it.
+    loss: torch.FloatTensor
+    # Passed as the first argument to the metric collections' `update` in `_compute_loss`.
+    nlls: torch.FloatTensor
+    # Passed as the second argument to the metric collections' `update` in `_compute_loss`.
+    token_mask: torch.FloatTensor
+class NLL(torchmetrics.aggregation.MeanMetric):
+    """MeanMetric registered under the key 'nll'; adds no behavior of its own."""
+    pass
+class BPD(NLL):
+  def compute(self) -> Tensor:
+    """Computes the bits per dimension.
+    Returns:
+      bpd
+    """
+    return self.mean_value / self.weight / LOG2
+class Perplexity(NLL):
+    def compute(self) -> Tensor:
+        """Computes the Perplexity.
+        Returns:
+        Perplexity
+        """
+        return torch.exp(self.mean_value / self.weight)
+class Diffusion(L.LightningModule):
+    def __init__(
+        self,
+        config,
+        eval=False):
+        """Set up the backbone, noise schedule, metrics and optional EMA from `config`.
+        Args:
+            config: configuration object; the fields read are visible in the body below.
+            eval: when truthy, also precompute evaluation reference data through `oracle` helpers.
+        Raises:
+            ValueError: if `config.backbone` is not 'cnn'.
+        """
+        super().__init__()
+        self.save_hyperparameters()
+        self.config = config
+        # Base vocabulary of 4 tokens (presumably the DNA bases A/C/G/T - confirm against the dataloader).
+        self.vocab_size = 4
+        self.sampler = self.config.sampling.predictor
+        self.antithetic_sampling = self.config.training.antithetic_sampling
+        self.importance_sampling = self.config.training.importance_sampling
+        self.change_of_variables = self.config.training.change_of_variables
+        # add mask token: it takes the next free id (4), growing the vocabulary to 5
+        self.mask_index = self.vocab_size
+        self.vocab_size += 1
+        self.parameterization = self.config.parameterization
+        # dna backbone model
+        if self.config.backbone == 'cnn':
+            self.backbone = models.dnaconv.CNNModel(
+                self.config.model, alphabet_size=self.vocab_size, num_cls=3) # num_cls is not used since classifier is always set to False
+        else:
+            raise ValueError(f'Unknown backbone: {self.config.backbone}')
+        self.T = self.config.T
+        self.subs_masking = self.config.subs_masking
+        self.softplus = torch.nn.Softplus()
+        # metrics are automatically reset at end of epoch
+        metrics = torchmetrics.MetricCollection({
+            'nll': NLL(),
+            'bpd': BPD(),
+            'ppl': Perplexity(),
+        })
+        metrics.set_dtype(torch.float64)
+        # One independent copy of the collection per stage, distinguished by its prefix.
+        self.train_metrics = metrics.clone(prefix='train/')
+        self.valid_metrics = metrics.clone(prefix='val/')
+        self.test_metrics = metrics.clone(prefix='test/')
+        # generative perplexity
+        self.gen_ppl_metric = Perplexity()
+        self.noise = noise_schedule.get_noise(self.config,
+                                            dtype=self.dtype)
+        # ema: tracks the backbone and noise-schedule parameters when the decay is positive
+        if self.config.training.ema > 0:
+            self.ema = models.ema.ExponentialMovingAverage(
+                itertools.chain(self.backbone.parameters(),
+                                self.noise.parameters()),
+                decay=self.config.training.ema)
+        else:
+            self.ema = None
+        self.lr = self.config.optim.lr
+        self.sampling_eps = self.config.training.sampling_eps
+        self.time_conditioning = self.config.time_conditioning
+        # Large negative constant used as a stand-in for log(0) in `_subs_parameterization`.
+        self.neg_infinity = -1000000.0
+        # Filled in by `on_load_checkpoint`; read by `on_train_start` when resuming.
+        self.fast_forward_epochs = None
+        self.fast_forward_batches = None
+        self._validate_configuration()
+        # subset of data for evaluation
+        if eval:
+            self.eval_sets_sp = oracle.subset_for_eval(n=config.eval.subset_size)
+            self.eval_sets_sp_clss = oracle.subset_eval_groundtruth(self.eval_sets_sp)
+            self.eval_sets_sp_preds = oracle.subset_eval_preds(self.eval_sets_sp)
+            self.eval_sets_sp_kmers = oracle.subset_eval_kmers(self.eval_sets_sp)
+            # The PCA is fitted on a separately drawn subset of 40000 examples.
+            self.emb_pca = oracle.cal_emb_pca(oracle.subset_for_eval(n=40000), n_components=50)
+            self.eval_sets_sp_embs_pca = oracle.subset_eval_embs_pca(self.eval_sets_sp, self.emb_pca)
+    def _validate_configuration(self):
+        assert not (self.change_of_variables and self.importance_sampling)
+        assert self.parameterization == 'subs'
+    def on_load_checkpoint(self, checkpoint):
+        if self.ema:
+            self.ema.load_state_dict(checkpoint['ema'])
+        # Copied from:
+        # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
+        self.fast_forward_epochs = checkpoint['loops']['fit_loop']['epoch_progress']['current']['completed']
+        self.fast_forward_batches = checkpoint['loops'][
+            'fit_loop']['epoch_loop.batch_progress'][
+                'current']['completed']
+    def on_save_checkpoint(self, checkpoint):
+        if self.ema:
+            checkpoint['ema'] = self.ema.state_dict()
+        # Copied from:
+        # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/tasks/seq.py
+        # ['epoch_loop.batch_progress']['total']['completed'] is 1 iteration
+        # behind, so we're using the optimizer's progress.
+        checkpoint['loops']['fit_loop'][
+            'epoch_loop.batch_progress']['total'][
+                'completed'] = checkpoint['loops']['fit_loop'][
+                'epoch_loop.automatic_optimization.optim_progress'][
+                    'optimizer']['step']['total'][
+                    'completed'] * self.trainer.accumulate_grad_batches
+        checkpoint['loops']['fit_loop'][
+            'epoch_loop.batch_progress']['current'][
+                'completed'] = checkpoint['loops']['fit_loop'][
+                'epoch_loop.automatic_optimization.optim_progress'][
+                    'optimizer']['step']['current'][
+                    'completed'] * self.trainer.accumulate_grad_batches
+        # _batches_that_stepped tracks the number of global steps, not the number
+        # of local steps, so we don't multiply with self.trainer.accumulate_grad_batches here.
+        checkpoint['loops']['fit_loop'][
+            'epoch_loop.state_dict'][
+                '_batches_that_stepped'] = checkpoint['loops']['fit_loop'][
+                'epoch_loop.automatic_optimization.optim_progress'][
+                    'optimizer']['step']['total']['completed']
+        if 'sampler' not in checkpoint.keys():
+            checkpoint['sampler'] = {}
+        if hasattr(self.trainer.train_dataloader.sampler, 'state_dict'):
+            sampler_state_dict = self.trainer.train_dataloader.sampler.state_dict()
+            checkpoint['sampler']['random_state'] = sampler_state_dict.get('random_state', None)
+        else:
+            checkpoint['sampler']['random_state'] = None
+    def on_train_start(self):
+        if self.ema:
+            self.ema.move_shadow_params_to_device(self.device)
+        distributed = (
+            self.trainer._accelerator_connector.use_distributed_sampler
+            and self.trainer._accelerator_connector.is_distributed)
+        print('distributed:', distributed)
+        if distributed:
+            sampler_cls = dataloader_gosai.FaultTolerantDistributedSampler
+        else:
+            sampler_cls = dataloader_gosai.RandomFaultTolerantSampler
+        updated_dls = []
+        for dl in self.trainer.fit_loop._combined_loader.flattened:
+            if hasattr(dl.sampler, 'shuffle'):
+                dl_sampler = sampler_cls(dl.dataset, shuffle=dl.sampler.shuffle)
+            else:
+                dl_sampler = sampler_cls(dl.dataset)
+                if (distributed and self.fast_forward_epochs is not None
+                    and self.fast_forward_batches is not None):
+                    dl_sampler.load_state_dict({
+                        'epoch': self.fast_forward_epochs,
+                        'counter': (self.fast_forward_batches
+                                    * self.config.loader.batch_size)})
+                updated_dls.append(
+                    torch.utils.data.DataLoader(
+                        dl.dataset,
+                        batch_size=self.config.loader.batch_size,
+                        num_workers=self.config.loader.num_workers,
+                        pin_memory=self.config.loader.pin_memory,
+                        sampler=dl_sampler,
+                        shuffle=False,
+                        persistent_workers=True))
+        self.trainer.fit_loop._combined_loader.flattened = updated_dls
+    def optimizer_step(self, *args, **kwargs):
+        super().optimizer_step(*args, **kwargs)
+        if self.ema:
+            self.ema.update(itertools.chain(
+                self.backbone.parameters(),
+                self.noise.parameters()))
+    # subs parameterization from MDLM
+    def _subs_parameterization(self, logits, xt):
+        """Convert raw logits into normalized log-probabilities with the mask token suppressed
+        and already-unmasked positions pinned to their own token.
+        Note: the first statement modifies the caller's `logits` tensor in place.
+        Args:
+            logits: tensor indexed as [:, :, vocab] (so at least 3-D).
+            xt: token ids, or a tensor with more than 2 dims whose last dim equals
+                `vocab_size` (one-hot / probabilities), which is reduced by argmax.
+        Returns:
+            Tensor of log-probabilities with the same shape as `logits`.
+        """
+        # Push the mask token's logit down so it gets ~zero probability after normalization.
+        logits[:, :, self.mask_index] += self.neg_infinity
+        # Log-softmax over the vocabulary axis (this creates a new tensor).
+        logits = logits - torch.logsumexp(logits, dim=-1, keepdim=True)
+        if xt.ndim > 2 and xt.shape[-1] == self.vocab_size:
+            # this is for finetuning setting when the input is one-hot encoded or probs
+            xt = xt.argmax(dim=-1)
+        unmasked_indices = (xt != self.mask_index)
+        # Unmasked positions: every token gets neg_infinity, then the observed token gets log-prob 0.
+        logits[unmasked_indices] = self.neg_infinity
+        logits[unmasked_indices, xt[unmasked_indices]] = 0
+        return logits
+    def _process_sigma(self, sigma):
+        if sigma is None:
+            assert self.parameterization == 'ar'
+            return sigma
+        if sigma.ndim > 1:
+            sigma = sigma.squeeze(-1)
+        if not self.time_conditioning:
+            sigma = torch.zeros_like(sigma)
+        assert sigma.ndim == 1, sigma.shape
+        return sigma
+    def forward(self, x, sigma):
+        """Returns log score."""
+        sigma = self._process_sigma(sigma)
+        x = x.to(dtype=torch.long)
+        with torch.cuda.amp.autocast(dtype=torch.float32):
+            logits = self.backbone(x, sigma)
+        if self.parameterization == 'subs':
+            return self._subs_parameterization(logits=logits, xt=x)
+        return logits
+    # might need changing to match wdce loss
+    def _compute_loss(self, batch, prefix):
+        if 'attention_mask' in batch:
+            attention_mask = batch['attention_mask']
+        else:
+            attention_mask = None
+        losses = self._loss(batch['seqs'], attention_mask)
+        loss = losses.loss
+        if prefix == 'train':
+            self.train_metrics.update(losses.nlls, losses.token_mask)
+            metrics = self.train_metrics
+        elif prefix == 'val':
+            self.valid_metrics.update(losses.nlls, losses.token_mask)
+            metrics = self.valid_metrics
+        elif prefix == 'test':
+            self.test_metrics.update(losses.nlls, losses.token_mask)
+            metrics = self.test_metrics
+        else:
+            raise ValueError(f'Invalid prefix: {prefix}')
+        self.log_dict(metrics, on_step=False, on_epoch=True, sync_dist=True)
+        return loss
+    def on_train_epoch_start(self):
+        self.backbone.train()
+        self.noise.train()
+    def training_step(self, batch, batch_idx):
+        loss = self._compute_loss(batch, prefix='train')
+        self.log(name='trainer/loss',
+                value=loss.item(),
+                on_step=True,
+                on_epoch=False,
+                sync_dist=True)
+        return loss
+    def on_validation_epoch_start(self):
+        if self.ema:
+            self.ema.store(itertools.chain(
+                    self.backbone.parameters(),
+                    self.noise.parameters()))
+            self.ema.copy_to(itertools.chain(
+                    self.backbone.parameters(),
+                    self.noise.parameters()))
+        self.backbone.eval()
+        self.noise.eval()
+        assert self.valid_metrics.nll.mean_value == 0
+        assert self.valid_metrics.nll.weight == 0
+    def validation_step(self, batch, batch_idx):
+        return self._compute_loss(batch, prefix='val')
+    def on_validation_epoch_end(self):
+        if ((self.config.eval.compute_perplexity_on_sanity
+                or not self.trainer.sanity_checking)
+                and self.config.eval.generate_samples
+                and not self.parameterization == 'ar'):
+            all_samples, all_detoeknized_samples = [], []
+            for _ in range(self.config.sampling.num_sample_batches):
+                samples = self._sample().detach().cpu().numpy()
+                detokenized_samples = dataloader_gosai.batch_dna_detokenize(samples)
+                all_samples.append(samples)
+                all_detoeknized_samples.extend(detokenized_samples)
+            all_samples = np.concatenate(all_samples, axis=0)
+            ws_distance_dict = self.cal_wasserstein_distance(all_detoeknized_samples)
+            pearsonr_list = self.cal_kmer_pearsonr(all_detoeknized_samples)
+            ws_embpca_list = self.cal_ws_distance_embpca(all_detoeknized_samples)
+        current_step = self.trainer.global_step
+        LOGGER.info(f'Current step: {current_step}')
+        LOGGER.info(f'Wasserstein distance: {ws_distance_dict}')
+        LOGGER.info(f'3mer Pearsonr: {pearsonr_list}')
+        LOGGER.info(f'Wasserstein distance embpca: {ws_embpca_list}')
+        self.log('val/3mer_pearsonr', pearsonr_list, on_step=False, on_epoch=True, sync_dist=True)
+        self.log('val/ws_embpca', ws_embpca_list, on_step=False, on_epoch=True, sync_dist=True)
+        for key in ws_distance_dict:
+            for cell_type in ws_distance_dict[key]:
+                metric_values = ws_distance_dict[key][cell_type]
+                if metric_values:  # Check if the list is not empty
+                    # Assuming metric_values contains [train_metric, valid_metric, test_metric]
+                    self.log(f'val/{key}_{cell_type}', metric_values[0], on_step=False, on_epoch=True, sync_dist=True)
+        if self.ema:
+            self.ema.restore(itertools.chain(self.backbone.parameters(),
+                                self.noise.parameters()))
+    ### VALIDATION METRICS ###
+    def cal_wasserstein_distance(self, seqs):
+        generated_preds = oracle.cal_gosai_pred_new(seqs)
+        ws_distance_dict = {'truth': {'hepg2': [], 'k562': [], 'sknsh': []},
+                            'preds': {'hepg2': [], 'k562': [], 'sknsh': []}}
+        ws_distance_dict['truth']['hepg2'].append(wasserstein_distance(generated_preds[:, 0], self.eval_sets_sp_clss[:, 0]))
+        ws_distance_dict['truth']['k562'].append(wasserstein_distance(generated_preds[:, 1], self.eval_sets_sp_clss[:, 1]))
+        ws_distance_dict['truth']['sknsh'].append(wasserstein_distance(generated_preds[:, 2], self.eval_sets_sp_clss[:, 2]))
+        ws_distance_dict['preds']['hepg2'].append(wasserstein_distance(generated_preds[:, 0], self.eval_sets_sp_preds[:, 0]))
+        ws_distance_dict['preds']['k562'].append(wasserstein_distance(generated_preds[:, 1], self.eval_sets_sp_preds[:, 1]))
+        ws_distance_dict['preds']['sknsh'].append(wasserstein_distance(generated_preds[:, 2], self.eval_sets_sp_preds[:, 2]))
+        return ws_distance_dict
+    def cal_ws_distance_embpca(self, seqs):
+        generated_embs = oracle.cal_gosai_emb(seqs)
+        generated_embs_pca = self.emb_pca.transform(generated_embs.reshape(generated_embs.shape[0], -1))
+        return oracle.get_wasserstein_dist(generated_embs_pca, self.eval_sets_sp_embs_pca)
+    def compare_kmer(self, kmer1, kmer2, n_sp1, n_sp2):
+        kmer_set = set(kmer1.keys()) | set(kmer2.keys())
+        counts = np.zeros((len(kmer_set), 2))
+        for i, kmer in enumerate(kmer_set):
+            if kmer in kmer1:
+                counts[i][1] = kmer1[kmer] * n_sp2 / n_sp1
+            if kmer in kmer2:
+                counts[i][0] = kmer2[kmer]
+        return pearsonr(counts[:, 0], counts[:, 1])[0]
+    def cal_kmer_pearsonr(self, seqs):
+        generated_kmer = oracle.count_kmers(seqs)
+        return self.compare_kmer(self.eval_sets_sp_kmers, generated_kmer, self.config.eval.subset_size, len(seqs))
+    def configure_optimizers(self):
+        optimizer = torch.optim.AdamW(
+            itertools.chain(self.backbone.parameters(),
+                            self.noise.parameters()),
+            lr=self.config.optim.lr,
+            betas=(self.config.optim.beta1, self.config.optim.beta2),
+            eps=self.config.optim.eps,
+            weight_decay=self.config.optim.weight_decay)
+        scheduler = hydra.utils.instantiate(self.config.lr_scheduler, optimizer=optimizer)
+        scheduler_dict = {
+            'scheduler': scheduler,
+            'interval': 'step',
+            'monitor': 'val/loss',
+            'name': 'trainer/lr',
+        }
+        return [optimizer], [scheduler_dict]
+    def q_xt(self, x, move_chance):
+        """Computes the noisy sample xt.
+        Args:
+        x: int torch.Tensor with shape (batch_size,
+            diffusion_model_input_length), input.
+        move_chance: float torch.Tensor with shape (batch_size, 1).
+        """
+        move_indices = torch.rand(* x.shape, device=x.device) < move_chance
+        xt = torch.where(move_indices, self.mask_index, x)
+        return xt
+    def _sample_prior(self, *batch_dims):
+        """
+            Returns array of fully masked sequences with same shape as input
+        """
+        return self.mask_index * torch.ones(* batch_dims, dtype=torch.int64)
+    def _ddpm_caching_update(self, x, t, dt, p_x0=None):
+        assert self.config.noise.type == 'loglinear'
+        sigma_t, _ = self.noise(t)
+        if t.ndim > 1:
+            t = t.squeeze(-1)
+        assert t.ndim == 1
+        move_chance_t = t[:, None, None]
+        move_chance_s = (t - dt)[:, None, None]
+        assert move_chance_t.ndim == 3, move_chance_t.shape
+        if p_x0 is None:
+            p_x0 = self.forward(x, sigma_t).exp()
+        assert move_chance_t.ndim == p_x0.ndim
+        q_xs = p_x0 * (move_chance_t - move_chance_s)
+        q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+        _x = _sample_categorical(q_xs)
+        copy_flag = (x != self.mask_index).to(x.dtype)
+        return p_x0, copy_flag * x + (1 - copy_flag) * _x
+    def _ddpm_update(self, x, t, dt, return_process=False):
+        sigma_t, _ = self.noise(t)
+        sigma_s, _ = self.noise(t - dt)
+        if sigma_t.ndim > 1:
+            sigma_t = sigma_t.squeeze(-1)
+        if sigma_s.ndim > 1:
+            sigma_s = sigma_s.squeeze(-1)
+        assert sigma_t.ndim == 1, sigma_t.shape
+        assert sigma_s.ndim == 1, sigma_s.shape
+        move_chance_t = 1 - torch.exp(-sigma_t) # t
+        move_chance_s = 1 - torch.exp(-sigma_s)
+        move_chance_t = move_chance_t[:, None, None]
+        move_chance_s = move_chance_s[:, None, None]
+        unet_conditioning = sigma_t
+        log_p_x0 = self.forward(x, unet_conditioning)
+        assert move_chance_t.ndim == log_p_x0.ndim
+        q_xs = log_p_x0.exp() * (move_chance_t - move_chance_s)
+        q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+        _x = _sample_categorical(q_xs)
+        copy_flag = (x != self.mask_index).to(x.dtype)
+        if return_process:
+            return copy_flag * x + (1 - copy_flag) * _x, x, unet_conditioning, move_chance_t, copy_flag
+        else:
+            return copy_flag * x + (1 - copy_flag) * _x
+    def _ar_sampler(self, bsz):
+        # precompute token buffer
+        num_pred_tokens = self.config.model.length - 1
+        x = torch.zeros(
+            (bsz, num_pred_tokens + 1),
+            dtype=torch.long,
+            device=self.device)
+        x[:, 0] = self.tokenizer.bos_token_id
+        # precompute noise
+        noise = (torch.distributions.Gumbel(0, 1)
+                .sample((bsz, num_pred_tokens, self.vocab_size))
+                .to(self.device))
+        for i in range(num_pred_tokens):
+            next_logits = self.forward(x[:, :i + 1], None)[:, -1]
+            y = (next_logits + noise[:, i]).argmax(-1)
+            x[:, i + 1] = y
+        return x
+    @torch.no_grad()
+    def _sample(self, num_steps=None, eps=1e-5, eval_sp_size=None):
+        """Generate samples from the model."""
+        if eval_sp_size is None:
+            batch_size_per_gpu = self.config.loader.eval_batch_size
+        else:
+            batch_size_per_gpu = eval_sp_size
+        if self.parameterization == 'ar':
+            return self._ar_sampler(batch_size_per_gpu)
+        # Lightning auto-casting is not working in this method for some reason
+        if num_steps is None:
+            num_steps = self.config.sampling.steps
+        x = self._sample_prior(
+            batch_size_per_gpu,
+            self.config.model.length).to(self.device)
+        timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device)
+        dt = (1 - eps) / num_steps
+        p_x0_cache = None
+        for i in range(num_steps):
+            t = timesteps[i] * torch.ones(x.shape[0], 1, device=self.device)
+            if self.sampler == 'ddpm':
+                x = self._ddpm_update(x, t, dt)
+            elif self.sampler == 'ddpm_cache':
+                p_x0_cache, x_next = self._ddpm_caching_update(x, t, dt, p_x0=p_x0_cache)
+                if (not torch.allclose(x_next, x) or self.time_conditioning):
+                    p_x0_cache = None
+                x = x_next
+            else:
+                x = self._analytic_update(x, t, dt)
+        if self.config.sampling.noise_removal:
+            t = timesteps[-1] * torch.ones(x.shape[0], 1,
+                                        device=self.device)
+            if self.sampler == 'analytic':
+                x = self._denoiser_update(x, t)
+            else:
+                unet_conditioning = self.noise(t)[0]
+                logits = self.forward(x, unet_conditioning)
+                x = logits[:, :, :-1].argmax(dim=-1)
+        return x
+    ### FOR THE EXPANSION AND ROLLOUT STEP ###
+    def sample_finetuned_with_rnd(self, args, reward_model,pretrained, eps=1e-5):
+        num_steps = args.total_num_steps
+        x_rollout = self._sample_prior(
+            args.batch_size,
+            args.seq_length).to(self.device)
+        log_rnd = torch.zeros(args.batch_size, device=self.device)
+        timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device)
+        dt = (1 - eps) / num_steps
+        for i in range(num_steps):
+            t = timesteps[i] * torch.ones(x_rollout.shape[0], 1, device=self.device)
+            log_p, x_next, log_policy_step, log_pretrained_step = self.mcts_reverse_step(x_rollout, t=t, dt=dt, pretrained=pretrained)
+            log_rnd += log_pretrained_step - log_policy_step
+            x_rollout = x_next
+        # if mask token remains, fully unmask
+        mask_positions = (x_rollout == self.mask_index)        # (B, L) bool
+        # does **any** mask remain in any sequence
+        any_mask_global = mask_positions.any().item()  # true if mask remains
+        if any_mask_global:
+            log_p, x_next = self.single_noise_removal(x_rollout, t=t, dt=dt)
+            x_rollout = x_next
+        x_final = x_rollout
+        x_one_hot = to_one_hot(x_final)
+        x_one_hot_reward = torch.transpose(x_one_hot, 1, 2)
+        reward_preds = reward_model(x_one_hot_reward).squeeze(-1) # (num_children, 4)
+        rewards = reward_preds[:, 0] # (num_children, 1)
+        log_rnd = log_rnd + rewards / args.alpha
+        mean_reward = rewards.mean()
+        return x_final, log_rnd, rewards
+    def sample_finetuned(self, args, reward_model, eps=1e-5):
+        num_steps = args.total_num_steps
+        x_rollout = self._sample_prior(
+            args.batch_size,
+            args.seq_length).to(self.device)
+        timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device)
+        dt = (1 - eps) / num_steps
+        for i in range(num_steps):
+            t = timesteps[i] * torch.ones(x_rollout.shape[0], 1, device=self.device)
+            log_p, x_next = self.single_reverse_step(x_rollout, t=t, dt=dt)
+            x_rollout = x_next
+        # if mask token remains, fully unmask
+        mask_positions = (x_rollout == self.mask_index)        # (B, L) bool
+        # does **any** mask remain in any sequence
+        any_mask_global = mask_positions.any().item()  # true if mask remains
+        if any_mask_global:
+            log_p, x_next = self.single_noise_removal(x_rollout, t=t, dt=dt)
+            x_rollout = x_next
+        x_final = x_rollout
+        x_one_hot = to_one_hot(x_final)
+        x_one_hot_reward = torch.transpose(x_one_hot, 1, 2)
+        reward_preds = reward_model(x_one_hot_reward).squeeze(-1) # (num_children, 4)
+        rewards = reward_preds[:, 0] # (num_children, 1)
+        mean_reward = rewards.mean()
+        return x_final, mean_reward
+    def compute_log_policy(self, token_array, x_next, t, dt):
+        sigma_t, _ = self.noise(t)
+        if token_array.ndim == 1:
+            token_array = token_array.unsqueeze(0)
+        if x_next.ndim == 1:
+            x_next = x_next.unsqueeze(0)
+        if t.ndim > 1:
+            t = t.squeeze(-1)
+        assert t.ndim == 1
+        change_prob_t = t[:, None, None]
+        change_prob_s = (t - dt)[:, None, None]
+        assert change_prob_t.ndim == 3, change_prob_t.shape
+        log_p = self.forward(token_array, sigma=sigma_t)
+        p_x0 = log_p.exp()
+        assert change_prob_t.ndim == p_x0.ndim
+        q_xs = p_x0 * (change_prob_t - change_prob_s)
+        # zero-masking probability
+        q_xs[:, :, self.mask_index] = change_prob_s[:, :, 0]
+        copy_flag = (token_array != self.mask_index)
+        assert copy_flag.dtype == torch.bool, "copy_flag must be bool"
+        changed_mask = (~copy_flag)
+        # compute the per-sequence log-probability under the pretrained model
+        log_policy_token = log_p.gather(-1, x_next.unsqueeze(-1)).squeeze(-1)
+        unmasked_this_step = (changed_mask & (x_next != self.mask_index)).to(log_policy_token.dtype)
+        log_policy_step = (log_policy_token * unmasked_this_step).sum(dim=-1)
+        # returns:
+        # log_policy_step (B, ) log probability x_next tokens under policy
+        if log_policy_step.ndim == 1:
+            log_policy_step = log_policy_step.squeeze(0)
+        return log_policy_step
+    def single_reverse_step(self, token_array, t, dt, p_x0=None):
+        assert self.config.noise.type == 'loglinear'
+        sigma_t, _ = self.noise(t)
+        if t.ndim > 1:
+            t = t.squeeze(-1)
+        assert t.ndim == 1
+        change_prob_t = t[:, None, None]
+        change_prob_s = (t - dt)[:, None, None]
+        assert change_prob_t.ndim == 3, change_prob_t.shape
+        if p_x0 is None:
+            log_p = self.forward(token_array, sigma=sigma_t)
+            p_x0 = log_p.exp()
+        assert change_prob_t.ndim == p_x0.ndim
+        q_xs = p_x0 * (change_prob_t - change_prob_s)
+        # zero-masking probability
+        q_xs[:, :, self.mask_index] = change_prob_s[:, :, 0]
+        x_changed = _sample_categorical(q_xs)
+        copy_flag = (token_array != self.mask_index)
+        int_copy_flag = copy_flag.to(token_array.dtype)
+        x_next = int_copy_flag * token_array + (1 - int_copy_flag) * x_changed
+        # returns:
+        # log_p (B, L, D) log probabilties of each token under the policy model
+        # x_next (B, L) next sequences
+        return log_p, x_next
+    def single_noise_removal(self, token_array, t, dt, p_x0=None):
+        assert self.config.noise.type == 'loglinear'
+        sigma_t, _ = self.noise(t)
+        if t.ndim > 1:
+            t = t.squeeze(-1)
+        assert t.ndim == 1
+        change_prob_t = t[:, None, None]
+        change_prob_s = (t - dt)[:, None, None]
+        assert change_prob_t.ndim == 3, change_prob_t.shape
+        if p_x0 is None:
+            log_p = self.forward(token_array, sigma=sigma_t)
+            p_x0 = log_p.exp()
+        assert change_prob_t.ndim == p_x0.ndim
+        # changed for noise removal
+        p_x0 = p_x0.clone()
+        p_x0[:, :, self.mask_index] = 0.0 # prevent remaining a mask
+        p_x0 = p_x0 / p_x0.sum(dim=-1, keepdim=True).clamp_min(1e-12)  # renorm over non-MASK
+        q_xs = p_x0 * (change_prob_t - change_prob_s)
+        x_changed = _sample_categorical(q_xs)
+        copy_flag = (token_array != self.mask_index)
+        int_copy_flag = copy_flag.to(token_array.dtype)
+        x_next = int_copy_flag * token_array + (1 - int_copy_flag) * x_changed
+        # returns:
+        # log_p (B, L, D) log probabilties of each token under the policy model
+        # x_next (B, L) next sequences
+        return log_p, x_next
+    def mcts_reverse_step(self, token_array, t, dt, pretrained, p_x0=None):
+        assert self.config.noise.type == 'loglinear'
+        sigma_t, _ = self.noise(t)
+        if t.ndim > 1:
+            t = t.squeeze(-1)
+        assert t.ndim == 1
+        change_prob_t = t[:, None, None]
+        change_prob_s = (t - dt)[:, None, None]
+        assert change_prob_t.ndim == 3, change_prob_t.shape
+        if p_x0 is None:
+            log_p = self.forward(token_array, sigma=sigma_t)
+            p_x0 = log_p.exp()
+        assert change_prob_t.ndim == p_x0.ndim
+        q_xs = p_x0 * (change_prob_t - change_prob_s)
+        # zero-masking probability
+        q_xs[:, :, self.mask_index] = change_prob_s[:, :, 0]
+        x_changed = _sample_categorical(q_xs)
+        copy_flag = (token_array != self.mask_index)
+        int_copy_flag = copy_flag.to(token_array.dtype)
+        x_next = int_copy_flag * token_array + (1 - int_copy_flag) * x_changed
+        # compute the log-probability under pretrained model at each step
+        with torch.no_grad():
+            # pretrained should output log-probs over vocab at each position given the *parent* (masked) input
+            log_pre = pretrained.forward(token_array, sigma=sigma_t)
+            # log-prob of the *sampled token* at each position
+            log_pre_token = log_pre.gather(-1, x_next.unsqueeze(-1)).squeeze(-1)  # [B*batch,L]
+            # sum only over the sites actually sampled this step (i.e., where parent was mask)
+            assert copy_flag.dtype == torch.bool, "copy_flag must be bool"
+            changed_mask = (~copy_flag)
+            # mask of tokens that were unmasked in this step
+            unmasked_this_step = (changed_mask & (x_next != self.mask_index)).to(log_pre_token.dtype)
+            log_pretrained_step = (log_pre_token * unmasked_this_step).sum(dim=-1)
+        # compute the per-sequence log-probability under the pretrained model
+        log_policy_token = log_p.gather(-1, x_next.unsqueeze(-1)).squeeze(-1)      # [B*batch,L]
+        log_policy_step = (log_policy_token * unmasked_this_step).sum(dim=-1)
+        # returns:
+        # log_p (B, L, D) log probabilties of each token under the policy model
+        # x_next (B, L) next sequences
+        # log_policy_step (B, ) log probability of all unmasked tokens under policy
+        # log_pretrained_step (B, ) log probabiltiy of all unmasked tokens under pretrained model
+        return log_p, x_next, log_policy_step, log_pretrained_step
+    def mcts_noise_removal(self, token_array, t, dt, pretrained, p_x0=None):
+        assert self.config.noise.type == 'loglinear'
+        sigma_t, _ = self.noise(t)
+        if t.ndim > 1:
+            t = t.squeeze(-1)
+        assert t.ndim == 1
+        change_prob_t = t[:, None, None]
+        change_prob_s = (t - dt)[:, None, None]
+        assert change_prob_t.ndim == 3, change_prob_t.shape
+        if p_x0 is None:
+            log_p = self.forward(token_array, sigma=sigma_t)
+            p_x0 = log_p.exp()
+        assert change_prob_t.ndim == p_x0.ndim
+        # changed for noise removal
+        p_x0 = p_x0.clone()
+        p_x0[:, :, self.mask_index] = 0.0 # prevent remaining a mask
+        p_x0 = p_x0 / p_x0.sum(dim=-1, keepdim=True).clamp_min(1e-12)  # renorm over non-MASK
+        q_xs = p_x0 * (change_prob_t - change_prob_s)
+        x_changed = _sample_categorical(q_xs)
+        copy_flag = (token_array != self.mask_index)
+        int_copy_flag = copy_flag.to(token_array.dtype)
+        x_next = int_copy_flag * token_array + (1 - int_copy_flag) * x_changed
+        # compute the log-probability under pretrained model at each step
+        with torch.no_grad():
+            # pretrained should output log-probs over vocab at each position given the *parent* (masked) input
+            log_pre = pretrained.forward(token_array, sigma=sigma_t)
+            # log-prob of the *sampled token* at each position
+            log_pre_token = log_pre.gather(-1, x_next.unsqueeze(-1)).squeeze(-1)  # [B*batch,L]
+            # sum only over the sites actually sampled this step (i.e., where parent was mask)
+            assert copy_flag.dtype == torch.bool, "copy_flag must be bool"
+            changed_mask = (~copy_flag)
+            # mask of tokens that were unmasked in this step
+            unmasked_this_step = (changed_mask & (x_next != self.mask_index)).to(log_pre_token.dtype)
+            log_pretrained_step = (log_pre_token * unmasked_this_step).sum(dim=-1)
+        # compute the per-sequence log-probability under the pretrained model
+        log_policy_token = log_p.gather(-1, x_next.unsqueeze(-1)).squeeze(-1)      # [B*batch,L]
+        log_policy_step = (log_policy_token * unmasked_this_step).sum(dim=-1)
+        # returns:
+        # log_p (B, L, D) log probabilties of each token under the policy model
+        # x_next (B, L) next sequences
+        # log_policy_step (B, ) log probability of all unmasked tokens under policy
+        # log_pretrained_step (B, ) log probabiltiy of all unmasked tokens under pretrained model
+        return log_p, x_next, log_policy_step, log_pretrained_step
+    # first step in expansion
+    def batch_mcts_reverse_step(self, token_array, t, dt, batch_size, pretrained, p_x0=None):
+        assert self.config.noise.type == 'loglinear'
+        sigma_t, _ = self.noise(t)
+        if t.ndim > 1:
+            t = t.squeeze(-1)
+        assert t.ndim == 1
+        change_prob_t = t[:, None, None]
+        change_prob_s = (t - dt)[:, None, None]
+        assert change_prob_t.ndim == 3, change_prob_t.shape
+        if token_array.dim() == 1:
+            token_array = token_array.unsqueeze(0)
+        # expand to match (num_children, L)
+        if p_x0 is None:
+            log_p = self.forward(token_array, sigma=sigma_t)
+            p_x0 = log_p.exp()
+        assert change_prob_t.ndim == p_x0.ndim
+        q_xs = p_x0 * (change_prob_t - change_prob_s)
+        # zero-masking probability
+        q_xs[:, :, self.mask_index] = change_prob_s[:, :, 0]
+        # repeat the parent token along the first dimension which will be unmasked into distinct sequences
+        token_array_expanded = token_array.repeat(batch_size, 1)
+        if self.config.mcts.sampling == 0:
+            x_changed = sample_batched_categorical(q_xs.to(self.device), batch_size)
+        else:
+            x_changed = sample_batched_top_k(q_xs.to(self.device), batch_size, self.config.mcts.sampling)
+        copy_flag = (token_array_expanded != self.mask_index)
+        int_copy_flag = copy_flag.to(token_array.dtype)
+        x_children = int_copy_flag * token_array_expanded + (1 - int_copy_flag) * x_changed
+        # compute the log-probability under pretrained model at each step
+        with torch.no_grad():
+            # pretrained should output log-probs over vocab at each position given the *parent* (masked) input
+            log_pre = pretrained.forward(token_array, sigma=sigma_t)
+            # expand to match the shape of x_children
+            log_pre = log_pre.repeat(batch_size, 1, 1)
+            # log-prob of the *sampled token* at each position
+            log_pre_token = log_pre.gather(-1, x_children.unsqueeze(-1)).squeeze(-1)  # [B*batch,L]
+            # sum only over the sites actually sampled this step (i.e., where parent was mask)
+            assert copy_flag.dtype == torch.bool, "copy_flag must be bool"
+            changed_mask = (~copy_flag)
+            # mask of tokens that were unmasked in this step
+            unmasked_this_step = (changed_mask & (x_children != self.mask_index)).to(log_pre_token.dtype)
+            log_pretrained_step = (log_pre_token * unmasked_this_step).sum(dim=-1)
+        # compute the per-child log-probability under the pretrained model
+        log_p = log_p.repeat(batch_size, 1, 1)
+        log_policy_token = log_p.gather(-1, x_children.unsqueeze(-1)).squeeze(-1)  # (B, L) probability of each chosen token
+        #print(log_policy_token)
+        log_policy_step = (log_policy_token * unmasked_this_step).sum(dim=-1)
+        # returns:
+        # log_p (B, L, D) log probabilties of each token under the policy model
+        # x_children (B, L) child sequences
+        # log_policy_step (B, ) log probability of all unmasked tokens under policy
+        # log_pretrained_step (B, ) log probabiltiy of all unmasked tokens under pretrained model
+        return log_p, x_children, log_policy_step, log_pretrained_step
+    ### SPECIFIC TO DRAKES? ###
+    def _ddpm_update_finetune_gradient(self, x, t, dt, copy_flag_temp, return_process=False):
+        """One reverse step on one-hot inputs, using the relaxed categorical sampler.
+        Args:
+            x: token ids (2-D) or a tensor whose last dimension is
+                `self.vocab_size`; anything else is one-hot encoded to float32.
+            t: current time tensor passed to `self.noise`.
+            dt: step size.
+            copy_flag_temp: temperature for the soft copy flag; None uses
+                `1 - x[..., mask_index]` directly.
+            return_process: when True also return the intermediates.
+        Returns:
+            The blended next state, or the tuple
+            (x_next, x, sigma_t, move_chance_t, soft_copy_flag) when
+            `return_process` is True.
+        NOTE(review): `self.forward` casts its input to torch.long, which
+        would discard gradients with respect to a float one-hot `x` - confirm
+        this path still behaves as intended.
+        """
+        if x.ndim == 2 or x.shape[-1] != self.vocab_size:
+            x = F.one_hot(x, num_classes=self.vocab_size).to(torch.float32)
+        sigma_t, _ = self.noise(t)
+        sigma_s, _ = self.noise(t - dt)
+        if sigma_t.ndim > 1:
+            sigma_t = sigma_t.squeeze(-1)
+        if sigma_s.ndim > 1:
+            sigma_s = sigma_s.squeeze(-1)
+        assert sigma_t.ndim == 1, sigma_t.shape
+        assert sigma_s.ndim == 1, sigma_s.shape
+        move_chance_t = 1 - torch.exp(-sigma_t) # (1-eps)*t
+        move_chance_s = 1 - torch.exp(-sigma_s)
+        move_chance_t = move_chance_t[:, None, None]
+        move_chance_s = move_chance_s[:, None, None]
+        unet_conditioning = sigma_t
+        log_p_x0 = self.forward(x, unet_conditioning)
+        assert move_chance_t.ndim == log_p_x0.ndim
+        q_xs = log_p_x0.exp() * (move_chance_t - move_chance_s)
+        # the mask slot carries the probability of staying masked after this step
+        q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+        # presumably a differentiable (Gumbel-softmax style) sample - verify against the helper
+        _x = _sample_categorical_gradient(q_xs, temp=self.config.finetuning.gumbel_softmax_temp)
+        if copy_flag_temp is not None:
+            # soft version of "this position is already unmasked"
+            copy_flag_prob = 1 - x[:, :, self.mask_index].unsqueeze(-1)
+            soft_copy_flag = torch.nn.functional.sigmoid(copy_flag_prob/copy_flag_temp)
+        else:
+            soft_copy_flag = 1 - x[:, :, self.mask_index].unsqueeze(-1)
+        if return_process:
+            return soft_copy_flag * x + (1 - soft_copy_flag) * _x, x, unet_conditioning, move_chance_t, soft_copy_flag
+        else:
+            return soft_copy_flag * x + (1 - soft_copy_flag) * _x
+    def _sample_finetune_gradient(self, num_steps=None, eps=1e-5, eval_sp_size=None, copy_flag_temp=None):
+        """Generate samples from the model, keeping gradients for the last steps only.
+        Requires the 'subs' parameterization and the 'ddpm' sampler. All but
+        the last `config.finetuning.truncate_steps` steps use `_ddpm_update`
+        with the result detached; the remaining steps use
+        `_ddpm_update_finetune_gradient`.
+        Args:
+            num_steps: number of reverse steps; defaults to `config.sampling.steps`.
+            eps: final time of the reverse process.
+            eval_sp_size: batch size; defaults to `config.loader.eval_batch_size`.
+            copy_flag_temp: forwarded to `_ddpm_update_finetune_gradient`.
+        Returns:
+            (x_hard, last_x_list, condt_list, move_chance_t_list,
+            copy_flag_list): the final sample with the last vocabulary slot
+            dropped, plus the per-step intermediates.
+        NOTE(review): the final indexing `x[:, :, :-1]` needs a 3-D `x`, i.e.
+        at least one step must take the gradient branch - confirm
+        `truncate_steps >= 1` is guaranteed.
+        """
+        assert self.parameterization == 'subs' and self.sampler == 'ddpm'
+        if eval_sp_size is None:
+            batch_size_per_gpu = self.config.loader.eval_batch_size
+        else:
+            batch_size_per_gpu = eval_sp_size
+        if num_steps is None:
+            num_steps = self.config.sampling.steps
+        x = self._sample_prior(
+            batch_size_per_gpu,
+            self.config.model.length).to(self.device)
+        timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device)
+        dt = (1 - eps) / num_steps
+        p_x0_cache = None
+        # per-step intermediates, one entry per reverse step
+        last_x_list = []
+        condt_list = []
+        move_chance_t_list = []
+        copy_flag_list = []
+        for i in range(num_steps):
+            t = timesteps[i] * torch.ones(x.shape[0], 1, device=self.device)
+            if self.sampler == 'ddpm':
+                    if i < num_steps - self.config.finetuning.truncate_steps:
+                        # early steps: discrete update, cut off from the autograd graph
+                        x, last_x, condt, move_chance_t, copy_flag = self._ddpm_update(x, t, dt, return_process=True)
+                        x = x.detach()
+                        copy_flag = copy_flag.unsqueeze(-1)
+                        last_x = F.one_hot(last_x, num_classes=self.vocab_size).to(torch.float32).detach()
+                    else:
+                        # last truncate_steps steps: relaxed update
+                        x, last_x, condt, move_chance_t, copy_flag = self._ddpm_update_finetune_gradient(x, t, dt, copy_flag_temp, return_process=True)
+            last_x_list.append(last_x)
+            condt_list.append(condt)
+            move_chance_t_list.append(move_chance_t)
+            copy_flag_list.append(copy_flag)
+        x_argmax = x[:, :, :-1].argmax(dim=-1)
+        x_argmax = torch.nn.functional.one_hot(x_argmax, num_classes=self.vocab_size-1).to(torch.float32)
+        # the forward value is the hard one-hot; the detached difference carries no gradient,
+        # so gradients flow only through the soft x[:, :, :-1]
+        return x[:, :, :-1] + (x_argmax - x[:, :, :-1]).detach(), last_x_list, condt_list, move_chance_t_list, copy_flag_list
+    @torch.no_grad()
+    def _ddpm_update_finetune_controlled_SMC(self, x, t, dt, reward_model, alpha = 1.0):
+        sigma_t, _ = self.noise(t)
+        sigma_s, _ = self.noise(t - dt)
+        if sigma_t.ndim > 1:
+            sigma_t = sigma_t.squeeze(-1)
+        if sigma_s.ndim > 1:
+            sigma_s = sigma_s.squeeze(-1)
+        assert sigma_t.ndim == 1, sigma_t.shape
+        assert sigma_s.ndim == 1, sigma_s.shape
+        move_chance_t = 1 - torch.exp(-sigma_t)
+        move_chance_s = 1 - torch.exp(-sigma_s)
+        move_chance_t = move_chance_t[:, None, None]
+        move_chance_s = move_chance_s[:, None, None]
+        unet_conditioning = sigma_t
+        log_p_x0 = self.forward(x, unet_conditioning)
+        assert move_chance_t.ndim == log_p_x0.ndim
+        q_xs = log_p_x0.exp() * (move_chance_t - move_chance_s)
+        q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+        copy_flag = (x != self.mask_index).to(x.dtype)
+        sample = copy_flag * x + (1 - copy_flag) * _sample_categorical(q_xs)
+        '''
+        Calcualte exp(v_{t-1}(x_{t-1})/alpha)
+        '''
+        expected_x0 = self.forward(sample, sigma_s) # Calcualte E[x_0|x_{t-1}]
+        expected_x0_arg = torch.argmax(expected_x0,dim=2)
+        expected_x0_onehot = torch.nn.functional.one_hot(expected_x0_arg)
+        reward_num = reward_model(expected_x0_onehot.float().transpose(1, 2)).detach()[:, 0][:, 0]
+        '''
+        Calcualte exp(v_{t}(x_{t})/alpha)
+        '''
+        expected_x0 = self.forward(x, sigma_s) # Calcualte E[x_0|x_t]
+        expected_x0_arg = torch.argmax(expected_x0,dim=2)
+        expected_x0_onehot = torch.nn.functional.one_hot(expected_x0_arg)
+        reward_den = reward_model(expected_x0_onehot.float().transpose(1, 2)).detach()[:, 0][:, 0]
+        ratio = torch.exp(1.0/alpha * (reward_num - reward_den)) # Now calculate exp( (v_{t-1}(x_{t-1) -v_{t}(x_{t}) /alpha)
+        ratio = ratio.detach().cpu().numpy()
+        final_sample_indices = np.random.choice(reward_num.shape[0], reward_num.shape[0], p =  ratio/ratio.sum() )
+        return sample[final_sample_indices]
+    def _ddpm_update_finetune_controlled_CG(self, x, t, dt, reward_model,  guidance_scale):
+        sigma_t, _ = self.noise(t)
+        sigma_s, _ = self.noise(t - dt)
+        if sigma_t.ndim > 1:
+            sigma_t = sigma_t.squeeze(-1)
+        if sigma_s.ndim > 1:
+            sigma_s = sigma_s.squeeze(-1)
+        assert sigma_t.ndim == 1, sigma_t.shape
+        assert sigma_s.ndim == 1, sigma_s.shape
+        move_chance_t = 1 - torch.exp(-sigma_t)
+        move_chance_s = 1 - torch.exp(-sigma_s)
+        move_chance_t = move_chance_t[:, None, None]
+        move_chance_s = move_chance_s[:, None, None]
+        unet_conditioning = sigma_t
+        log_p_x0 = self.forward(x, unet_conditioning)
+        assert move_chance_t.ndim == log_p_x0.ndim
+        q_xs = log_p_x0.exp() * (move_chance_t - move_chance_s)
+        x_onehot = F.one_hot(x, num_classes=5).float()
+        x_grad = self.compute_gradient_CG(x_onehot, x, reward_model, sigma_s )
+        guidance = guidance_scale * (x_grad - x_grad[:, :, self.mask_index][:, :, None])
+        q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+        q_xs = q_xs * guidance.exp()
+        _x = _sample_categorical(q_xs)
+        copy_flag = (x != self.mask_index).to(x.dtype)
+        return copy_flag * x + (1 - copy_flag) * _x
+    def compute_gradient_CG(self, x_onehot, x, reward_model, sigma_s):
+        x_onehot.requires_grad_(True)
+        expected_x0 = self.forward(x_onehot, sigma_s) # Calcualte E[x_0|x_t]
+        scores = reward_model(expected_x0.transpose(1, 2)[:,0:4,:])[:, 0]
+        scores = scores.mean()
+        scores.backward()
+        x_grad = x_onehot.grad.clone()
+        return x_grad
+    def _ddpm_update_finetune_controlled_TDS(self, x, t, dt, reward_model, alpha = 1.0, guidance_scale=1000):
+        # SMC with the twisted proposal
+        sigma_t, _ = self.noise(t)
+        sigma_s, _ = self.noise(t - dt)
+        if sigma_t.ndim > 1:
+            sigma_t = sigma_t.squeeze(-1)
+        if sigma_s.ndim > 1:
+            sigma_s = sigma_s.squeeze(-1)
+        assert sigma_t.ndim == 1, sigma_t.shape
+        assert sigma_s.ndim == 1, sigma_s.shape
+        move_chance_t = 1 - torch.exp(-sigma_t)
+        move_chance_s = 1 - torch.exp(-sigma_s)
+        move_chance_t = move_chance_t[:, None, None]
+        move_chance_s = move_chance_s[:, None, None]
+        unet_conditioning = sigma_t
+        log_p_x0 = self.forward(x, unet_conditioning)
+        assert move_chance_t.ndim == log_p_x0.ndim
+        q_xs = log_p_x0.exp() * (move_chance_t
+                                - move_chance_s)
+        x_onehot = F.one_hot(x, num_classes=5).float()
+        x_grad = self.compute_gradient_CG(x_onehot, x, reward_model, sigma_s )
+        guidance = guidance_scale * (x_grad - x_grad[:, :, self.mask_index][:, :, None])
+        q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+        # print(q_xs.sum(-1))
+        q_xs = q_xs * guidance.exp()
+        _x = _sample_categorical(q_xs)
+        copy_flag = (x != self.mask_index).to(x.dtype)
+        sample = copy_flag * x + (1 - copy_flag) * _x
+        prob_multiplier = (1 - copy_flag) * torch.gather(guidance.exp(), 2, _x.unsqueeze(-1)).squeeze(-1) + copy_flag * torch.ones_like(_x)
+        '''
+        Calcualte exp(v_{t-1}(x_{t-1})/alpha)
+        '''
+        expected_x0 = self.forward(sample, sigma_s) # Calcualte E[x_0|x_{t-1}]
+        expected_x0_arg = torch.argmax(expected_x0,dim=2)
+        expected_x0_onehot = torch.nn.functional.one_hot(expected_x0_arg)
+        reward_num = reward_model(expected_x0_onehot.float().transpose(1, 2)).detach()[:, 0][:, 0]
+        '''
+        Calcualte exp(v_{t}(x_{t})/alpha)
+        '''
+        expected_x0 = self.forward(x, sigma_s) # Calcualte E[x_0|x_t]
+        expected_x0_arg = torch.argmax(expected_x0,dim=2)
+        expected_x0_onehot = torch.nn.functional.one_hot(expected_x0_arg)
+        reward_den = reward_model(expected_x0_onehot.float().transpose(1, 2)).detach()[:, 0][:, 0]
+        # set the nan values to 1
+        prob_multiplier[torch.isnan(prob_multiplier)] = 1
+        ratio = torch.exp(1.0/alpha * (reward_num - reward_den)) / prob_multiplier.prod(dim=-1)
+        ratio = ratio.detach().cpu().numpy()
+        final_sample_indices = np.random.choice(reward_num.shape[0], reward_num.shape[0], p =  ratio/ratio.sum() )
+        return sample[final_sample_indices]
+    @torch.no_grad()
+    def controlled_sample_SMC(self, reward_model, alpha, num_steps=None, eps=1e-5, eval_sp_size=None):
+        """Generate samples from the model."""
+        if eval_sp_size is None:
+            batch_size_per_gpu = self.config.loader.eval_batch_size
+        else:
+            batch_size_per_gpu = eval_sp_size
+        if self.parameterization == 'ar':
+            return self._ar_sampler(batch_size_per_gpu)
+        # Lightning auto-casting is not working in this method for some reason
+        if num_steps is None:
+            num_steps = self.config.sampling.steps
+        x = self._sample_prior(
+            batch_size_per_gpu,
+            self.config.model.length).to(self.device)
+        timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device)
+        dt = (1 - eps) / num_steps
+        p_x0_cache = None
+        for i in range(num_steps):
+            t = timesteps[i] * torch.ones(
+                    x.shape[0], 1, device=self.device)
+            if self.sampler == 'ddpm':
+                    x  = self._ddpm_update_finetune_controlled_SMC(x, t, dt, reward_model, alpha)
+            else:
+                    x = self._analytic_update(x, t, dt)
+        if self.config.sampling.noise_removal:
+            t = timesteps[-1] * torch.ones(x.shape[0], 1, device=self.device)
+            if self.sampler == 'analytic':
+                    x = self._denoiser_update(x, t)
+            else:
+                    unet_conditioning = self.noise(t)[0]
+                    logits = self.forward(x, unet_conditioning)
+                    x = logits[:, :, :-1].argmax(dim=-1)
+        return x
+    def controlled_sample_CG(self, reward_model, guidance_scale, num_steps=None, eps=1e-5, eval_sp_size=None):
+        """Generate samples from the model."""
+        if eval_sp_size is None:
+            batch_size_per_gpu = self.config.loader.eval_batch_size
+        else:
+            batch_size_per_gpu = eval_sp_size
+        if self.parameterization == 'ar':
+            return self._ar_sampler(batch_size_per_gpu)
+        # Lightning auto-casting is not working in this method for some reason
+        if num_steps is None:
+            num_steps = self.config.sampling.steps
+        x = self._sample_prior(
+            batch_size_per_gpu,
+            self.config.model.length).to(self.device)
+        timesteps = torch.linspace(
+            1, eps, num_steps + 1, device=self.device)
+        dt = (1 - eps) / num_steps
+        p_x0_cache = None
+        for i in range(num_steps):
+            t = timesteps[i] * torch.ones(
+                x.shape[0], 1, device=self.device)
+            if self.sampler == 'ddpm':
+                x  = self._ddpm_update_finetune_controlled_CG(x, t, dt, reward_model, guidance_scale)
+            else:
+                x = self._analytic_update(x, t, dt)
+        if self.config.sampling.noise_removal:
+            t = timesteps[-1] * torch.ones(x.shape[0], 1,
+                                        device=self.device)
+            if self.sampler == 'analytic':
+                x = self._denoiser_update(x, t)
+            else:
+                unet_conditioning = self.noise(t)[0]
+                logits = self.forward(x, unet_conditioning)
+                x = logits[:, :, :-1].argmax(dim=-1)
+        return x
+    def controlled_sample_TDS(self, reward_model, alpha, guidance_scale, num_steps=None, eps=1e-5, eval_sp_size=None):
+        """Generate samples from the model."""
+        if eval_sp_size is None:
+            batch_size_per_gpu = self.config.loader.eval_batch_size
+        else:
+            batch_size_per_gpu = eval_sp_size
+        if self.parameterization == 'ar':
+            return self._ar_sampler(batch_size_per_gpu)
+        if num_steps is None:
+            num_steps = self.config.sampling.steps
+        x = self._sample_prior(
+            batch_size_per_gpu,
+            self.config.model.length).to(self.device)
+        timesteps = torch.linspace(
+            1, eps, num_steps + 1, device=self.device)
+        dt = (1 - eps) / num_steps
+        p_x0_cache = None
+        for i in range(num_steps):
+            t = timesteps[i] * torch.ones(
+                x.shape[0], 1, device=self.device)
+            if self.sampler == 'ddpm':
+                x  = self._ddpm_update_finetune_controlled_TDS(x, t, dt, reward_model,alpha, guidance_scale)
+            else:
+                x = self._analytic_update(x, t, dt)
+        if self.config.sampling.noise_removal:
+            t = timesteps[-1] * torch.ones(x.shape[0], 1,
+                                        device=self.device)
+            if self.sampler == 'analytic':
+                x = self._denoiser_update(x, t)
+            else:
+                unet_conditioning = self.noise(t)[0]
+                logits = self.forward(x, unet_conditioning)
+                x = logits[:, :, :-1].argmax(dim=-1)
+        return x
+    @torch.no_grad()
+    def get_likelihood(self, x0, num_steps=None, eps=1e-5, n_samples=1):
+        """Compute the likelihood of a sequence under the model.
+        x0: int torch.Tensor with shape (batch_size,
+            diffusion_model_input_length)
+        """
+        if num_steps is None:
+            num_steps = self.config.sampling.steps
+        timesteps = torch.linspace(
+            1, eps, num_steps + 1, device=self.device) # t=0 is clean data
+        dt = (1 - eps) / num_steps
+        log_p_sample_list = []
+        for _ in range(n_samples):
+            log_p_at_time_list = []
+            for i in range(num_steps):
+                t = timesteps[i] * torch.ones(
+                x0.shape[0], 1, device=self.device)
+                sigma_t, _ = self.noise(t)
+                sigma_s, _ = self.noise(t - dt)
+                if sigma_t.ndim > 1:
+                    sigma_t = sigma_t.squeeze(-1)
+                if sigma_s.ndim > 1:
+                    sigma_s = sigma_s.squeeze(-1)
+                assert sigma_t.ndim == 1, sigma_t.shape
+                assert sigma_s.ndim == 1, sigma_s.shape
+                move_chance_t = 1 - torch.exp(-sigma_t) # (1-eps)*t
+                move_chance_s = 1 - torch.exp(-sigma_s)
+                move_chance_t = move_chance_t[:, None] # [bsz, 1]
+                move_chance_s = move_chance_s[:, None]
+                unet_conditioning = sigma_t # [bsz]
+                multiplier = (move_chance_t - move_chance_s)/move_chance_t # [bsz, 1]
+                xt = self.q_xt(x0, move_chance_t) # [bsz, seq_len]
+                # log prob, already apply subs parametrization (unmasked token remains unchanged)
+                model_output = self.forward(xt, unet_conditioning) # [bsz, seq_len, vocab_size]
+                # take the log prob of the token that corresponds to x0
+                log_p_x0 = model_output.gather(-1, x0[..., None]).squeeze(-1) # [bsz, seq_len]
+                log_p_x0 = log_p_x0 * multiplier
+                log_p_at_time_list.append(log_p_x0)
+            log_p_x0 = torch.stack(log_p_at_time_list, dim=0).sum(dim=0) # [bsz, seq_len]
+            log_p_sample_list.append(log_p_x0.sum(dim=-1))
+        log_p_sample = torch.stack(log_p_sample_list, dim=0).mean(dim=0)
+        return log_p_sample
+    def get_score(self, x, sigma):
+        """Returns the exponentiated score table built from the denoiser output.
+        For the 'subs' parameterization the model's log-probabilities are
+        converted into the log-ratio table described in the comments below,
+        with separate rules for masked and unmasked input positions. For any
+        other parameterization the raw model output is exponentiated as is.
+        Args:
+          x: int torch.Tensor of tokens; indexed (batch, position) below.
+          sigma: noise level; log_k derived from it must be 1-D after its
+            last axis is squeezed.
+        Returns:
+          float torch.Tensor with the same shape as the model output.
+        """
+        model_output = self.forward(x, sigma)
+        if self.parameterization == 'subs':
+            # score(x, t) = p_t(y) / p_t(x)
+            # => log score(x, t) = log p_t(y) - log p_t(x)
+            # case 1: x = masked
+            #   (i) y = unmasked
+            #     log score(x, t) = log p_\theta(x)|_y + log k
+            #     where k = exp(- sigma) / (1 - exp(- sigma))
+            #   (ii) y = masked
+            #     log score(x, t) = 0
+            # case 2: x = unmasked
+            #   (i) y != masked, y != x
+            #     log score(x_i, t) = - inf
+            #   (ii) y = x
+            #     log score(x_i, t) = 0
+            #   (iii) y = masked token
+            #     log score(x_i, t) = - log k
+            #     where k = exp(- sigma) / (1 - exp(- sigma))
+            log_k = - torch.log(torch.expm1(sigma)).squeeze(-1)
+            assert log_k.ndim == 1
+            # Case 1 table: shift every entry by log k, then zero the mask column.
+            masked_score = model_output + log_k[:, None, None]
+            masked_score[:, :, self.mask_index] = 0
+            # Case 2 table: neg_infinity everywhere, 0 at the current token,
+            # and -log k in the mask column.
+            unmasked_score = self.neg_infinity * torch.ones_like(
+                model_output)
+            unmasked_score = torch.scatter(
+                unmasked_score,
+                -1,
+                x[..., None],
+                torch.zeros_like(unmasked_score[..., :1]))
+            unmasked_score[:, :, self.mask_index] = - (
+                log_k[:, None] * torch.ones_like(x))
+            # Select per position between the two tables with a 0/1 mask.
+            masked_indices = (x == self.mask_index).to(
+                model_output.dtype)[:, :, None]
+            model_output = (
+                masked_score * masked_indices
+                + unmasked_score * (1 - masked_indices))
+        return model_output.exp()
+    def _staggered_score(self, score, dsigma):
+        score = score.clone()
+        extra_const = (1 - dsigma.exp()) * score.sum(dim=-1)
+        score *= dsigma.exp()[:, None]
+        score[..., self.mask_index] += extra_const
+        return score
+    def _analytic_update(self, x, t, step_size):
+        curr_sigma, _ = self.noise(t)
+        next_sigma, _ = self.noise(t - step_size)
+        dsigma = curr_sigma - next_sigma
+        score = self.get_score(x, curr_sigma)
+        stag_score = self._staggered_score(score, dsigma)
+        probs = stag_score * self._transp_transition(x, dsigma)
+        return _sample_categorical(probs)
+    def _denoiser_update(self, x, t):
+        sigma, _ = self.noise(t)
+        score = self.get_score(x, sigma)
+        stag_score = self._staggered_score(score, sigma)
+        probs = stag_score * self._transp_transition(x, sigma)
+        probs[..., self.mask_index] = 0
+        samples = _sample_categorical(probs)
+        return samples
+    def _transp_transition(self, i, sigma):
+        sigma = _unsqueeze(sigma, reference=i[..., None])
+        edge = torch.exp(-sigma) * F.one_hot(
+            i, num_classes=self.vocab_size)
+        edge += torch.where(i == self.mask_index,
+                            1 - torch.exp(-sigma).squeeze(-1),
+                            0)[..., None]
+        return edge
+    def _sample_t(self, n, device):
+        _eps_t = torch.rand(n, device=device)
+        if self.antithetic_sampling:
+            # for variance reduction
+            offset = torch.arange(n, device=device) / n
+            _eps_t = (_eps_t / n + offset) % 1
+        t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
+        if self.importance_sampling:
+            return self.noise.importance_sampling_transformation(t)
+        return t
+    def _maybe_sub_sample(self, x0, attention_mask):
+        seqlen = x0.shape[1]
+        if seqlen > self.config.model.length:
+            raise NotImplementedError('Sub-sampling not implemented')
+        elif self.parameterization == 'ar':
+            input_tokens = x0[:, :-1]
+            output_tokens = x0[:, 1:]
+            new_attention_mask = attention_mask[:, 1:]
+        else:
+            input_tokens = x0
+            output_tokens = None
+            new_attention_mask = attention_mask
+        return input_tokens, output_tokens, new_attention_mask
+    def _reconstruction_loss(self, x0):
+        t0 = torch.zeros(x0.shape[0], dtype=self.dtype,
+                        device=self.device)
+        assert self.config.noise.type == 'loglinear'
+        # The above assert is for d3pm parameterization
+        unet_conditioning = self.noise(t0)[0][:, None]
+        model_output_t0 = self.forward(x0, unet_conditioning)
+        return - torch.gather(input=model_output_t0,
+                            dim=-1,
+                            index=x0[:, :, None]).squeeze(-1)
+    def _forward_pass_diffusion(self, x0):
+        """Computes the per-token diffusion training loss for clean tokens x0.
+        A time is sampled per sequence, x0 is corrupted with self.q_xt at the
+        matching move chance, the model is evaluated on the corrupted tokens
+        and the loss for the active parameterization is returned.
+        Args:
+          x0: int torch.Tensor of clean tokens; indexed (batch, position).
+        Returns:
+          Per-token loss tensor. For the continuous-time path it has the
+          shape of x0; the 'sedd' and discrete-time paths return whatever
+          _score_entropy / _d3pm_loss produce.
+        """
+        t = self._sample_t(x0.shape[0], x0.device)
+        if self.T > 0:
+            # else ts are between 0 and 1
+            # Snap t onto a grid of T discrete steps.
+            t = (t * self.T).to(torch.int)
+            t = t / self.T
+            # t \in {1/T, 2/T, ..., 1}
+            t += (1 / self.T)
+        if self.change_of_variables: # False
+            # Condition on t itself and interpolate log(1 - exp(-sigma))
+            # linearly between its values at sigma_min and sigma_max.
+            unet_conditioning = t[:, None]
+            f_T = torch.log1p(- torch.exp(- self.noise.sigma_max))
+            f_0 = torch.log1p(- torch.exp(- self.noise.sigma_min))
+            move_chance = torch.exp(f_0 + t * (f_T - f_0))
+            move_chance = move_chance[:, None]
+        else:
+            sigma, dsigma = self.noise(t) # total noise, rate noise
+            unet_conditioning = sigma[:, None]
+            move_chance = 1 - torch.exp(-sigma[:, None])
+        xt = self.q_xt(x0, move_chance) # q(xt|x0)
+        model_output = self.forward(xt, unet_conditioning)
+        utils.print_nans(model_output, 'model_output')
+        # NOTE(review): sigma and dsigma are only bound in the else-branch
+        # above, so 'sedd' combined with change_of_variables would fail here.
+        if self.parameterization == 'sedd':
+            return dsigma[:, None] * self._score_entropy(
+                model_output, sigma[:, None], xt, x0)
+        if self.T > 0:
+            diffusion_loss = self._d3pm_loss(
+                    model_output=model_output, xt=xt, x0=x0, t=t)
+            # NOTE(review): reconstruction_loss is unbound for any
+            # parameterization other than 'd3pm' or 'subs'.
+            if self.parameterization == 'd3pm':
+                    reconstruction_loss = self._reconstruction_loss(x0)
+            elif self.parameterization == 'subs':
+                    reconstruction_loss = 0
+            return reconstruction_loss + diffusion_loss
+        # SUBS parameterization, continuous time.
+        # Model log-probability of the true token at every position.
+        log_p_theta = torch.gather(
+            input=model_output,
+            dim=-1,
+            index=x0[:, :, None]).squeeze(-1)
+        if self.change_of_variables or self.importance_sampling:
+            # Constant weight: log(1 - exp(-sigma_min)).
+            return log_p_theta * torch.log1p(
+                - torch.exp(- self.noise.sigma_min))
+        # Time-dependent weight dsigma / (exp(sigma) - 1).
+        return - log_p_theta * (
+            dsigma / torch.expm1(sigma))[:, None]
+    def _loss(self, x0, attention_mask):
+        (input_tokens, output_tokens, attention_mask) = self._maybe_sub_sample(
+                x0, attention_mask)
+        if self.parameterization == 'ar':
+            logprobs = self.backbone(input_tokens, None)
+            loss = - logprobs.gather(
+                -1, output_tokens[:, :, None])[:, :, 0]
+        else:
+            loss = self._forward_pass_diffusion(input_tokens)
+        nlls = loss * attention_mask
+        count = attention_mask.sum()
+        batch_nll = nlls.sum()
+        token_nll = batch_nll / count
+        return Loss(loss=token_nll,
+                    nlls=nlls,
+                    token_mask=attention_mask)
+    def _score_entropy(self, log_score, sigma, xt, x0):
+        """Computes the SEDD loss.
+        Only positions where xt holds the mask token contribute; every other
+        position is left at zero.
+        Args:
+        log_score: float torch.Tensor with shape (batch_size,
+            diffusion_model_input_length, vocab_size),
+            log score, output of the denoising network.
+        xt: int torch.Tensor with shape (batch_size,
+            diffusion_model_input_length), noised input.
+        x0: int torch.Tensor with shape (batch_size,
+            diffusion_model_input_length), clean input.
+        sigma: float torch.Tensor with shape (batch_size, 1).
+        Returns:
+        loss with shape (batch_size, diffusion_model_input_length)
+        """
+        # NOTE(review): the terms below appear to cover the case y = x0 with xt = mask.
+        # NOTE(review): the const term looks like the y = mask, xt = x0 case, where
+        # x0 is known and the score needs no estimate — confirm against the SEDD paper.
+        masked_indices = xt == self.mask_index
+        expsig_minus_1 = torch.expm1(sigma).expand_as(xt)
+        # 1 / (exp(sigma) - 1), taken at the masked positions only.
+        q_ratio = 1 / expsig_minus_1[masked_indices]
+        words_that_were_masked = x0[masked_indices]
+        # Log score of the true token, weighted by q_ratio.
+        neg_term = q_ratio * torch.gather(
+            log_score[masked_indices],
+            -1,
+            words_that_were_masked[..., None]).squeeze(-1)
+        score = log_score[masked_indices].exp()
+        # Sum of scores over every vocabulary entry except the mask token.
+        if self.mask_index == self.vocab_size - 1:
+            pos_term = score[:, :-1].sum(dim=-1)
+        else:
+            pos_term = score[:, : self.mask_index].sum(
+                dim=-1) + score[:, self.mask_index + 1:].sum(dim=-1)
+        const = q_ratio * (q_ratio.log() - 1)
+        entropy = torch.zeros(* xt.shape, device=xt.device)
+        entropy[masked_indices] += pos_term - neg_term + const
+        return entropy

tr2d2-dna/diffusion_gosai_cfg.py ADDED Viewed

	@@ -0,0 +1,729 @@

+import itertools
+import math
+from dataclasses import dataclass
+import hydra.utils
+import lightning as L
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchmetrics
+from torch import Tensor
+import dataloader_gosai
+import models
+import noise_schedule
+import utils
+import oracle
+# Natural log of 2; BPD.compute divides by it.
+LOG2 = math.log(2)
+# Module-level logger obtained from the project's utils helper.
+LOGGER = utils.get_logger(__name__)
+def _sample_categorical(categorical_probs):
+  gumbel_norm = (
+    1e-10
+    - (torch.rand_like(categorical_probs) + 1e-10).log())
+  return (categorical_probs / gumbel_norm).argmax(dim=-1)
+def _unsqueeze(x, reference):
+  return x.view(
+    * x.shape,
+    * ((1,) * (len(reference.shape) - len(x.shape))))
+@dataclass
+class Loss:
+  """Container for the outputs of a loss computation."""
+  # Read as `.loss` by _compute_loss and returned as the step loss.
+  loss: torch.FloatTensor
+  # First argument of the metric update in _compute_loss.
+  nlls: torch.FloatTensor
+  # Second argument of the metric update in _compute_loss.
+  token_mask: torch.FloatTensor
+class NLL(torchmetrics.aggregation.MeanMetric):
+  """Mean metric registered under the 'nll' key; adds nothing to MeanMetric."""
+  pass
+class BPD(NLL):
+  def compute(self) -> Tensor:
+    """Computes the bits per dimension.
+    Returns:
+      bpd
+    """
+    return self.mean_value / self.weight / LOG2
+class Perplexity(NLL):
+  def compute(self) -> Tensor:
+    """Computes the Perplexity.
+    Returns:
+     Perplexity
+    """
+    return torch.exp(self.mean_value / self.weight)
+class Diffusion(L.LightningModule):
+  def __init__(
+    self,
+    config,
+    eval=True):
+    """Builds the backbone, noise schedule, metrics and optional EMA.
+    Args:
+      config: configuration object; the fields read here include
+        sampling, training, model, optim, eval, backbone, T,
+        parameterization, subs_masking and time_conditioning.
+      eval: when true, evaluation reference data is loaded through the
+        oracle module. NOTE(review): this name shadows the builtin eval
+        inside the method; it is kept because callers may pass it by keyword.
+    Raises:
+      ValueError: if config.backbone is not 'cnn'.
+    """
+    super().__init__()
+    self.save_hyperparameters()
+    self.config = config
+    # Four regular tokens; the mask token is appended below as index 4.
+    self.vocab_size = 4
+    self.sampler = self.config.sampling.predictor
+    self.antithetic_sampling = self.config.training.antithetic_sampling
+    self.importance_sampling = self.config.training.importance_sampling
+    self.change_of_variables = self.config.training.change_of_variables
+    self.mask_index = self.vocab_size
+    self.vocab_size += 1
+    self.parameterization = self.config.parameterization
+    if self.config.backbone == 'cnn':
+      self.backbone = models.dnaconv.CNNModel(
+        self.config.model, alphabet_size=self.vocab_size, num_cls=2)
+    else:
+      raise ValueError(
+        f'Unknown backbone: {self.config.backbone}')
+    self.T = self.config.T
+    self.subs_masking = self.config.subs_masking
+    self.softplus = torch.nn.Softplus()
+    # metrics are automatically reset at end of epoch
+    metrics = torchmetrics.MetricCollection({
+      'nll': NLL(),
+      'bpd': BPD(),
+      'ppl': Perplexity(),
+    })
+    metrics.set_dtype(torch.float64)
+    self.train_metrics = metrics.clone(prefix='train/')
+    self.valid_metrics = metrics.clone(prefix='val/')
+    self.test_metrics = metrics.clone(prefix='test/')
+    # generative perplexity
+    self.gen_ppl_metric = Perplexity()
+    self.noise = noise_schedule.get_noise(self.config,
+                                          dtype=self.dtype)
+    # The EMA tracks backbone and noise-schedule parameters together.
+    if self.config.training.ema > 0:
+      self.ema = models.ema.ExponentialMovingAverage(
+        itertools.chain(self.backbone.parameters(),
+                        self.noise.parameters()),
+        decay=self.config.training.ema)
+    else:
+      self.ema = None
+    self.lr = self.config.optim.lr
+    self.sampling_eps = self.config.training.sampling_eps
+    self.time_conditioning = self.config.time_conditioning
+    # Finite stand-in for -inf in log space.
+    self.neg_infinity = -1000000.0
+    # Filled by on_load_checkpoint and used by on_train_start when resuming.
+    self.fast_forward_epochs = None
+    self.fast_forward_batches = None
+    self._validate_configuration()
+    # subset of data for evaluation
+    if eval:
+      self.eval_sets_sp = oracle.subset_for_eval(n=config.eval.subset_size)
+      self.eval_sets_sp_clss = oracle.subset_eval_groundtruth(self.eval_sets_sp)
+      self.eval_sets_sp_preds = oracle.subset_eval_preds(self.eval_sets_sp)
+      self.eval_sets_sp_kmers = oracle.subset_eval_kmers(self.eval_sets_sp)
+      # The PCA is fitted on a separate 40000-element subset, then applied to eval_sets_sp.
+      self.emb_pca = oracle.cal_emb_pca(oracle.subset_for_eval(n=40000), n_components=50)
+      self.eval_sets_sp_embs_pca = oracle.subset_eval_embs_pca(self.eval_sets_sp, self.emb_pca)
+  def _validate_configuration(self):
+    assert not (self.change_of_variables
+                and self.importance_sampling)
+    assert self.parameterization == 'subs'
+  def on_load_checkpoint(self, checkpoint):
+    if self.ema:
+      self.ema.load_state_dict(checkpoint['ema'])
+    # Copied from:
+    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
+    self.fast_forward_epochs = checkpoint['loops'][
+      'fit_loop']['epoch_progress']['current']['completed']
+    self.fast_forward_batches = checkpoint['loops'][
+      'fit_loop']['epoch_loop.batch_progress'][
+        'current']['completed']
+  def on_save_checkpoint(self, checkpoint):
+    if self.ema:
+      checkpoint['ema'] = self.ema.state_dict()
+    # Copied from:
+    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/tasks/seq.py
+    # ['epoch_loop.batch_progress']['total']['completed'] is 1 iteration
+    # behind, so we're using the optimizer's progress.
+    checkpoint['loops']['fit_loop'][
+      'epoch_loop.batch_progress']['total'][
+        'completed'] = checkpoint['loops']['fit_loop'][
+          'epoch_loop.automatic_optimization.optim_progress'][
+            'optimizer']['step']['total'][
+              'completed'] * self.trainer.accumulate_grad_batches
+    checkpoint['loops']['fit_loop'][
+      'epoch_loop.batch_progress']['current'][
+        'completed'] = checkpoint['loops']['fit_loop'][
+          'epoch_loop.automatic_optimization.optim_progress'][
+            'optimizer']['step']['current'][
+              'completed'] * self.trainer.accumulate_grad_batches
+    # _batches_that_stepped tracks the number of global steps, not the number
+    # of local steps, so we don't multiply with self.trainer.accumulate_grad_batches here.
+    checkpoint['loops']['fit_loop'][
+      'epoch_loop.state_dict'][
+        '_batches_that_stepped'] = checkpoint['loops']['fit_loop'][
+          'epoch_loop.automatic_optimization.optim_progress'][
+            'optimizer']['step']['total']['completed']
+    if 'sampler' not in checkpoint.keys():
+      checkpoint['sampler'] = {}
+    if hasattr(self.trainer.train_dataloader.sampler,
+               'state_dict'):
+      sampler_state_dict = self.trainer.\
+        train_dataloader.sampler.state_dict()
+      checkpoint['sampler'][
+        'random_state'] = sampler_state_dict.get(
+          'random_state', None)
+    else:
+      checkpoint['sampler']['random_state'] = None
+  def on_train_start(self):
+    """Moves EMA weights to the device and swaps in fault-tolerant samplers.
+    Every dataloader of the fit loop is rebuilt around a sampler from
+    dataloader_gosai. In the distributed case, when checkpoint progress
+    was recorded by on_load_checkpoint, the sampler is fast-forwarded to it.
+    """
+    if self.ema:
+      self.ema.move_shadow_params_to_device(self.device)
+    # Adapted from:
+    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
+    distributed = (
+      self.trainer._accelerator_connector.use_distributed_sampler
+      and self.trainer._accelerator_connector.is_distributed)
+    print('distributed:', distributed)
+    if distributed:
+      sampler_cls = dataloader_gosai.FaultTolerantDistributedSampler
+    else:
+      sampler_cls = dataloader_gosai.RandomFaultTolerantSampler
+    updated_dls = []
+    for dl in self.trainer.fit_loop._combined_loader.flattened:
+      # Carry over the shuffle setting when the existing sampler exposes one.
+      if hasattr(dl.sampler, 'shuffle'):
+        dl_sampler = sampler_cls(
+          dl.dataset, shuffle=dl.sampler.shuffle)
+      else:
+        dl_sampler = sampler_cls(dl.dataset)
+      # NOTE(review): the saved position is only restored in the
+      # distributed case — confirm this is intended.
+      if (distributed
+          and self.fast_forward_epochs is not None
+          and self.fast_forward_batches is not None):
+        # The counter is in samples: completed batches times batch size.
+        dl_sampler.load_state_dict({
+          'epoch': self.fast_forward_epochs,
+          'counter': (self.fast_forward_batches
+                      * self.config.loader.batch_size)})
+      # shuffle=False because ordering is delegated to the sampler.
+      updated_dls.append(
+        torch.utils.data.DataLoader(
+          dl.dataset,
+          batch_size=self.config.loader.batch_size,
+          num_workers=self.config.loader.num_workers,
+          pin_memory=self.config.loader.pin_memory,
+          sampler=dl_sampler,
+          shuffle=False,
+          persistent_workers=True))
+    self.trainer.fit_loop._combined_loader.flattened = updated_dls
+  def optimizer_step(self, *args, **kwargs):
+    super().optimizer_step(*args, **kwargs)
+    if self.ema:
+      self.ema.update(itertools.chain(
+        self.backbone.parameters(),
+        self.noise.parameters()))
+  def _subs_parameterization(self, logits, xt):
+    logits[:, :, self.mask_index] += self.neg_infinity
+    logits = logits - torch.logsumexp(logits, dim=-1,
+                                      keepdim=True)
+    unmasked_indices = (xt != self.mask_index)
+    logits[unmasked_indices] = self.neg_infinity
+    logits[unmasked_indices, xt[unmasked_indices]] = 0
+    return logits
+  def _process_sigma(self, sigma):
+    if sigma is None:
+      assert self.parameterization == 'ar'
+      return sigma
+    if sigma.ndim > 1:
+      sigma = sigma.squeeze(-1)
+    if not self.time_conditioning:
+      sigma = torch.zeros_like(sigma)
+    assert sigma.ndim == 1, sigma.shape
+    return sigma
+  def forward(self, x, sigma, binary_clss=None):
+    """Returns log score."""
+    sigma = self._process_sigma(sigma)
+    with torch.cuda.amp.autocast(dtype=torch.float32):
+      logits = self.backbone(x, sigma, cls=binary_clss)
+    if self.parameterization == 'subs':
+      return self._subs_parameterization(logits=logits, xt=x)
+    return logits
+  def _compute_loss(self, batch, prefix):
+    if 'attention_mask' in batch:
+      attention_mask = batch['attention_mask']
+    else:
+      attention_mask = None
+    # classifier-free guidance
+    assert self.config.model.cls_free_guidance == True
+    binary_clss = (batch['clss'][:,0] > self.config.model.cls_free_threshold).long()
+    random_list = np.random.binomial(1, self.config.model.cls_free_prob, binary_clss.shape[0])
+    binary_clss[random_list==1] = 2
+    losses = self._loss(batch['seqs'], attention_mask, binary_clss)
+    loss = losses.loss
+    if prefix == 'train':
+      self.train_metrics.update(losses.nlls, losses.token_mask)
+      metrics = self.train_metrics
+    elif prefix == 'val':
+      self.valid_metrics.update(losses.nlls, losses.token_mask)
+      metrics = self.valid_metrics
+    elif prefix == 'test':
+      self.test_metrics.update(losses.nlls, losses.token_mask)
+      metrics = self.test_metrics
+    else:
+      raise ValueError(f'Invalid prefix: {prefix}')
+    self.log_dict(metrics,
+                  on_step=False,
+                  on_epoch=True,
+                  sync_dist=True)
+    return loss
+  def on_train_epoch_start(self):
+    self.backbone.train()
+    self.noise.train()
+  def training_step(self, batch, batch_idx):
+    loss = self._compute_loss(batch, prefix='train')
+    self.log(name='trainer/loss',
+             value=loss.item(),
+             on_step=True,
+             on_epoch=False,
+             sync_dist=True)
+    return loss
+  def on_validation_epoch_start(self):
+    if self.ema:
+      self.ema.store(itertools.chain(
+        self.backbone.parameters(),
+        self.noise.parameters()))
+      self.ema.copy_to(itertools.chain(
+        self.backbone.parameters(),
+        self.noise.parameters()))
+    self.backbone.eval()
+    self.noise.eval()
+    assert self.valid_metrics.nll.mean_value == 0
+    assert self.valid_metrics.nll.weight == 0
+  def validation_step(self, batch, batch_idx):
+    return self._compute_loss(batch, prefix='val')
+  def on_validation_epoch_end(self):
+    if ((self.config.eval.compute_perplexity_on_sanity
+         or not self.trainer.sanity_checking)
+         and self.config.eval.generate_samples
+         and not self.parameterization == 'ar'):
+      all_samples, all_detoeknized_samples = [], []
+      for _ in range(
+        self.config.sampling.num_sample_batches):
+        samples = self._sample(cls=1).detach().cpu().numpy()
+        detokenized_samples = dataloader_gosai.batch_dna_detokenize(samples)
+        all_samples.append(samples)
+        all_detoeknized_samples.extend(detokenized_samples)
+      all_samples = np.concatenate(all_samples, axis=0)
+      generated_preds = oracle.cal_gosai_pred(all_detoeknized_samples, mode='eval')[:,0]
+      avg_generated_preds = np.mean(generated_preds, axis=0)
+      current_step = self.trainer.global_step
+      LOGGER.info(f'Current step: {current_step}')
+      LOGGER.info(f'Generated preds: {avg_generated_preds}')
+      self.log('val/gosai_preds_avg', avg_generated_preds, on_step=False, on_epoch=True, sync_dist=True)
+    if self.ema:
+      self.ema.restore(
+        itertools.chain(self.backbone.parameters(),
+                        self.noise.parameters()))
+  def configure_optimizers(self):
+    # TODO(yair): Lightning currently giving this warning when using `fp16`:
+    #  "Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
+    #  Not clear if this is a problem or not.
+    #  See: https://github.com/Lightning-AI/pytorch-lightning/issues/5558
+    optimizer = torch.optim.AdamW(
+      itertools.chain(self.backbone.parameters(),
+                      self.noise.parameters()),
+      lr=self.config.optim.lr,
+      betas=(self.config.optim.beta1,
+             self.config.optim.beta2),
+      eps=self.config.optim.eps,
+      weight_decay=self.config.optim.weight_decay)
+    scheduler = hydra.utils.instantiate(
+      self.config.lr_scheduler, optimizer=optimizer)
+    scheduler_dict = {
+      'scheduler': scheduler,
+      'interval': 'step',
+      'monitor': 'val/loss',
+      'name': 'trainer/lr',
+    }
+    return [optimizer], [scheduler_dict]
+  def q_xt(self, x, move_chance):
+    """Computes the noisy sample xt.
+    Args:
+      x: int torch.Tensor with shape (batch_size,
+          diffusion_model_input_length), input.
+      move_chance: float torch.Tensor with shape (batch_size, 1).
+    """
+    move_indices = torch.rand(
+      * x.shape, device=x.device) < move_chance
+    xt = torch.where(move_indices, self.mask_index, x)
+    return xt
+  def _sample_prior(self, *batch_dims):
+    return self.mask_index * torch.ones(
+      * batch_dims, dtype=torch.int64)
+  def _ddpm_caching_update(self, x, t, dt, p_x0=None):
+    """Take one reverse step from t to t - dt, optionally reusing `p_x0`.
+
+    Args:
+      x: current token tensor; positions equal to `self.mask_index`
+        are the ones that may be resampled.
+      t: current time, 1-D or with a trailing singleton dim.
+      dt: step size subtracted from `t`.
+      p_x0: optional precomputed `exp(forward(x, sigma_t))`; computed
+        here when None.
+
+    Returns:
+      `(p_x0, x_next)` so the caller can pass `p_x0` back in.
+    """
+    # `t` itself is used as the masking probability below, which is why
+    # only the 'loglinear' schedule is accepted.
+    assert self.config.noise.type == 'loglinear'
+    sigma_t, _ = self.noise(t)
+    if t.ndim > 1:
+      t = t.squeeze(-1)
+    assert t.ndim == 1
+    move_chance_t = t[:, None, None]
+    move_chance_s = (t - dt)[:, None, None]
+    assert move_chance_t.ndim == 3, move_chance_t.shape
+    if p_x0 is None:
+      p_x0 = self.forward(x, sigma_t).exp()
+    assert move_chance_t.ndim == p_x0.ndim
+    # Weight of each token is p_x0 * (t - s); the mask entry is then
+    # overwritten with s, the weight of staying masked.
+    q_xs = p_x0 * (move_chance_t - move_chance_s)
+    q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+    _x = _sample_categorical(q_xs)
+    # Already-unmasked positions are copied through unchanged.
+    copy_flag = (x != self.mask_index).to(x.dtype)
+    return p_x0, copy_flag * x + (1 - copy_flag) * _x
+  def _ddpm_update(self, x, t, dt, cls, w):
+    """Take one classifier-free-guided reverse step from t to t - dt.
+
+    Args:
+      x: current token tensor; only positions equal to
+        `self.mask_index` are replaced.
+      t: current time passed to the noise schedule.
+      dt: step size.
+      cls: class label broadcast to the whole batch for the
+        conditional forward pass.
+      w: guidance weight; the combined log-probabilities are
+        `(1 + w) * cond - w * uncond`.
+
+    Returns:
+      The updated token tensor.
+    """
+    sigma_t, _ = self.noise(t)
+    sigma_s, _ = self.noise(t - dt)
+    if sigma_t.ndim > 1:
+      sigma_t = sigma_t.squeeze(-1)
+    if sigma_s.ndim > 1:
+      sigma_s = sigma_s.squeeze(-1)
+    assert sigma_t.ndim == 1, sigma_t.shape
+    assert sigma_s.ndim == 1, sigma_s.shape
+    # Masking probabilities at the current (t) and next (s) time.
+    move_chance_t = 1 - torch.exp(-sigma_t)
+    move_chance_s = 1 - torch.exp(-sigma_s)
+    move_chance_t = move_chance_t[:, None, None]
+    move_chance_s = move_chance_s[:, None, None]
+    unet_conditioning = sigma_t
+    # Label 2 is used for the unconditional pass.
+    uncond = (2 * torch.ones(x.shape[0], device=x.device)).long()
+    cond = (cls * torch.ones(x.shape[0], device=x.device)).long()
+    log_p_x0_uncond = self.forward(x, unet_conditioning, uncond)
+    log_p_x0_cond = self.forward(x, unet_conditioning, cond)
+    log_p_x0 = (1+w) * log_p_x0_cond - w * log_p_x0_uncond
+    assert move_chance_t.ndim == log_p_x0.ndim
+    q_xs = log_p_x0.exp() * (move_chance_t
+                             - move_chance_s)
+    # Overwrite the mask entry with the weight of staying masked.
+    q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
+    _x = _sample_categorical(q_xs)
+    # Already-unmasked positions are copied through unchanged.
+    copy_flag = (x != self.mask_index).to(x.dtype)
+    return copy_flag * x + (1 - copy_flag) * _x
+  def _ar_sampler(self, bsz):
+    # precompute token buffer
+    num_pred_tokens = self.config.model.length - 1
+    x = torch.zeros(
+      (bsz, num_pred_tokens + 1),
+      dtype=torch.long,
+      device=self.device)
+    x[:, 0] = self.tokenizer.bos_token_id
+    # precompute noise
+    noise = (torch.distributions.Gumbel(0, 1)
+             .sample((bsz, num_pred_tokens, self.vocab_size))
+             .to(self.device))
+    for i in range(num_pred_tokens):
+      next_logits = self.forward(x[:, :i + 1], None)[:, -1]
+      y = (next_logits + noise[:, i]).argmax(-1)
+      x[:, i + 1] = y
+    return x
+  @torch.no_grad()
+  def _sample(self, num_steps=None, eps=1e-5, eval_sp_size=None, cls=1, w=None):
+    """Generate samples from the model.
+
+    Args:
+      num_steps: number of reverse steps; defaults to
+        `config.sampling.steps`.
+      eps: final time; the time grid runs from 1 down to `eps`.
+      eval_sp_size: batch size; defaults to
+        `config.loader.eval_batch_size`.
+      cls: class label used for the conditional forward passes.
+      w: guidance weight; defaults to `config.model.cls_free_weight`.
+
+    Returns:
+      Token tensor produced by the sampler.
+
+    Raises:
+      NotImplementedError: if `self.sampler` is not 'ddpm' (for
+        non-autoregressive parameterizations).
+    """
+    if w is None:
+      w = self.config.model.cls_free_weight
+    if eval_sp_size is None:
+      batch_size_per_gpu = self.config.loader.eval_batch_size
+    else:
+      batch_size_per_gpu = eval_sp_size
+    # Autoregressive models use their own sampler and ignore cls / w.
+    if self.parameterization == 'ar':
+      return self._ar_sampler(batch_size_per_gpu)
+    if num_steps is None:
+      num_steps = self.config.sampling.steps
+    # Start from a fully masked batch.
+    x = self._sample_prior(
+      batch_size_per_gpu,
+      self.config.model.length).to(self.device)
+    timesteps = torch.linspace(
+      1, eps, num_steps + 1, device=self.device)
+    dt = (1 - eps) / num_steps
+    # NOTE(review): `p_x0_cache` is assigned but never read in this method.
+    p_x0_cache = None
+    for i in range(num_steps):
+      t = timesteps[i] * torch.ones(
+        x.shape[0], 1, device=self.device)
+      if self.sampler == 'ddpm':
+        x = self._ddpm_update(x, t, dt, cls, w)
+      else:
+        raise NotImplementedError
+    if self.config.sampling.noise_removal:
+      # Final pass at the last time point: take the argmax of the
+      # guided output over all but the last vocabulary entry.
+      t = timesteps[-1] * torch.ones(x.shape[0], 1,
+                                     device=self.device)
+      unet_conditioning = self.noise(t)[0]
+      uncond = (2 * torch.ones(x.shape[0], device=x.device)).long()
+      cond = (cls * torch.ones(x.shape[0], device=x.device)).long()
+      log_p_x0_uncond = self.forward(x, unet_conditioning, uncond)
+      log_p_x0_cond = self.forward(x, unet_conditioning, cond)
+      logits = (1+w) * log_p_x0_cond - w * log_p_x0_uncond
+      x = logits[:, :, :-1].argmax(dim=-1)
+    return x
+  def get_score(self, x, sigma):
+    """Return the score (exponentiated model output) for tokens `x`.
+
+    For the 'subs' parameterization the model output is first
+    converted into a log score following the case analysis below;
+    for other parameterizations the raw output is exponentiated.
+
+    Args:
+      x: int token tensor.
+      sigma: noise level passed to `self.forward`; after
+        `squeeze(-1)` it must be 1-D (asserted on the 'subs' path).
+
+    Returns:
+      Tensor with the same shape as the model output.
+    """
+    model_output = self.forward(x, sigma)
+    if self.parameterization == 'subs':
+      # score(x, t) = p_t(y) / p_t(x)
+      # => log score(x, t) = log p_t(y) - log p_t(x)
+      # case 1: x = masked
+      #   (i) y = unmasked
+      #     log score(x, t) = log p_\theta(x)|_y + log k
+      #     where k = exp(- sigma) / (1 - exp(- sigma))
+      #   (ii) y = masked
+      #     log score(x, t) = 0
+      # case 2: x = unmasked
+      #   (i) y != masked, y != x
+      #     log score(x_i, t) = - inf
+      #   (ii) y = x
+      #     log score(x_i, t) = 0
+      #   (iii) y = masked token
+      #     log score(x_i, t) = - log k
+      #     where k = exp(- sigma) / (1 - exp(- sigma))
+      log_k = - torch.log(torch.expm1(sigma)).squeeze(-1)
+      assert log_k.ndim == 1
+      # Case 1: log score for positions that are currently masked.
+      masked_score = model_output + log_k[:, None, None]
+      masked_score[:, :, self.mask_index] = 0
+      # Case 2: log score for positions that are already unmasked.
+      unmasked_score = self.neg_infinity * torch.ones_like(
+        model_output)
+      unmasked_score = torch.scatter(
+        unmasked_score,
+        -1,
+        x[..., None],
+        torch.zeros_like(unmasked_score[..., :1]))
+      unmasked_score[:, :, self.mask_index] = - (
+        log_k[:, None] * torch.ones_like(x))
+      # Select case 1 or case 2 per position.
+      masked_indices = (x == self.mask_index).to(
+        model_output.dtype)[:, :, None]
+      model_output = (
+        masked_score * masked_indices
+        + unmasked_score * (1 - masked_indices))
+    return model_output.exp()
+  def _staggered_score(self, score, dsigma):
+    score = score.clone()
+    extra_const = (1 - dsigma.exp()) * score.sum(dim=-1)
+    score *= dsigma.exp()[:, None]
+    score[..., self.mask_index] += extra_const
+    return score
+  def _analytic_update(self, x, t, step_size):
+    curr_sigma, _ = self.noise(t)
+    next_sigma, _ = self.noise(t - step_size)
+    dsigma = curr_sigma - next_sigma
+    score = self.get_score(x, curr_sigma)
+    stag_score = self._staggered_score(score, dsigma)
+    probs = stag_score * self._transp_transition(x, dsigma)
+    return _sample_categorical(probs)
+  def _denoiser_update(self, x, t):
+    sigma, _ = self.noise(t)
+    score = self.get_score(x, sigma)
+    stag_score = self._staggered_score(score, sigma)
+    probs = stag_score * self._transp_transition(x, sigma)
+    probs[..., self.mask_index] = 0
+    samples = _sample_categorical(probs)
+    return samples
+  def _transp_transition(self, i, sigma):
+    sigma = _unsqueeze(sigma, reference=i[..., None])
+    edge = torch.exp(-sigma) * F.one_hot(
+      i, num_classes=self.vocab_size)
+    edge += torch.where(i == self.mask_index,
+                        1 - torch.exp(-sigma).squeeze(-1),
+                        0)[..., None]
+    return edge
+  def _sample_t(self, n, device):
+    _eps_t = torch.rand(n, device=device)
+    if self.antithetic_sampling:
+      # for variance reduction
+      offset = torch.arange(n, device=device) / n
+      _eps_t = (_eps_t / n + offset) % 1
+    t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
+    if self.importance_sampling:
+      return self.noise.importance_sampling_transformation(t)
+    return t
+  def _maybe_sub_sample(self, x0, attention_mask):
+    seqlen = x0.shape[1]
+    if seqlen > self.config.model.length:
+      raise NotImplementedError('Sub-sampling not implemented')
+    elif self.parameterization == 'ar':
+      input_tokens = x0[:, :-1]
+      output_tokens = x0[:, 1:]
+      new_attention_mask = attention_mask[:, 1:]
+    else:
+      input_tokens = x0
+      output_tokens = None
+      new_attention_mask = attention_mask
+    return input_tokens, output_tokens, new_attention_mask
+  def _reconstruction_loss(self, x0):
+    t0 = torch.zeros(x0.shape[0], dtype=self.dtype,
+                     device=self.device)
+    assert self.config.noise.type == 'loglinear'
+    # The above assert is for d3pm parameterization
+    unet_conditioning = self.noise(t0)[0][:, None]
+    model_output_t0 = self.forward(x0, unet_conditioning)
+    return - torch.gather(input=model_output_t0,
+                          dim=-1,
+                          index=x0[:, :, None]).squeeze(-1)
+  def _forward_pass_diffusion(self, x0, binary_clss=None):
+    """Noise `x0`, run the model and return the per-token diffusion loss.
+
+    Args:
+      x0: clean int token tensor; its first dim is the batch size.
+      binary_clss: optional class labels forwarded to `self.forward`.
+
+    Returns:
+      The loss tensor for the active parameterization ('sedd',
+      discrete-time d3pm/subs when `self.T > 0`, or continuous-time
+      subs otherwise).
+    """
+    t = self._sample_t(x0.shape[0], x0.device)
+    if self.T > 0:
+      # Discretise t onto a grid with T steps.
+      # else ts are between 0 and 1
+      t = (t * self.T).to(torch.int)
+      t = t / self.T
+      # t \in {1/T, 2/T, ..., 1}
+      t += (1 / self.T)
+    if self.change_of_variables: # False
+      # The model is conditioned on t and the masking probability is
+      # interpolated in log space between the sigma_min / sigma_max ends.
+      unet_conditioning = t[:, None]
+      f_T = torch.log1p(- torch.exp(- self.noise.sigma_max))
+      f_0 = torch.log1p(- torch.exp(- self.noise.sigma_min))
+      move_chance = torch.exp(f_0 + t * (f_T - f_0))
+      move_chance = move_chance[:, None]
+    else:
+      sigma, dsigma = self.noise(t) # total noise, rate noise
+      unet_conditioning = sigma[:, None]
+      move_chance = 1 - torch.exp(-sigma[:, None])
+    xt = self.q_xt(x0, move_chance) # q(xt|x0)
+    model_output = self.forward(xt, unet_conditioning, binary_clss=binary_clss)
+    utils.print_nans(model_output, 'model_output')
+    # NOTE(review): `sigma` / `dsigma` are only bound in the else-branch
+    # above, so this path presumably assumes change_of_variables is off.
+    if self.parameterization == 'sedd':
+      return dsigma[:, None] * self._score_entropy(
+        model_output, sigma[:, None], xt, x0)
+    if self.T > 0:
+      diffusion_loss = self._d3pm_loss(
+        model_output=model_output, xt=xt, x0=x0, t=t)
+      # NOTE(review): `reconstruction_loss` stays unbound if the
+      # parameterization is neither 'd3pm' nor 'subs' - confirm that
+      # no other value can reach this branch.
+      if self.parameterization == 'd3pm':
+        reconstruction_loss = self._reconstruction_loss(x0)
+      elif self.parameterization == 'subs':
+        reconstruction_loss = 0
+      return reconstruction_loss + diffusion_loss
+    # SUBS parameterization, continuous time.
+    # Model output gathered at the true token of every position.
+    log_p_theta = torch.gather(
+      input=model_output,
+      dim=-1,
+      index=x0[:, :, None]).squeeze(-1)
+    if self.change_of_variables or self.importance_sampling:
+      return log_p_theta * torch.log1p(
+        - torch.exp(- self.noise.sigma_min))
+    # Weight the negated output by dsigma / (exp(sigma) - 1).
+    return - log_p_theta * (
+      dsigma / torch.expm1(sigma))[:, None]
+  def _loss(self, x0, attention_mask, binary_clss):
+    (input_tokens, output_tokens,
+     attention_mask) = self._maybe_sub_sample(
+       x0, attention_mask)
+    if self.parameterization == 'ar':
+      logprobs = self.backbone(input_tokens, None, cls=binary_clss)
+      loss = - logprobs.gather(
+        -1, output_tokens[:, :, None])[:, :, 0]
+    else:
+      loss = self._forward_pass_diffusion(input_tokens, binary_clss=binary_clss)
+    nlls = loss * attention_mask
+    count = attention_mask.sum()
+    batch_nll = nlls.sum()
+    token_nll = batch_nll / count
+    return Loss(loss=token_nll,
+                nlls=nlls,
+                token_mask=attention_mask)
+  def _score_entropy(self, log_score, sigma, xt, x0):
+    """Computes the SEDD loss.
+    Args:
+      log_score: float torch.Tensor with shape (batch_size,
+          diffusion_model_input_length, vocab_size),
+          log score, output of the denoising network.
+      sigma: float torch.Tensor with shape (batch_size, 1).
+      xt: int torch.Tensor with shape (batch_size,
+          diffusion_model_input_length), input.
+      x0: int torch.Tensor with shape (batch_size,
+          diffusion_model_input_length), input.
+    Returns:
+      loss with shape (batch_size, diffusion_model_input_length)
+    """
+    # Only positions where xt holds the mask token contribute; every
+    # other position keeps a loss of zero.
+    masked_indices = xt == self.mask_index
+    expsig_minus_1 = torch.expm1(sigma).expand_as(xt)
+    # 1 / (exp(sigma) - 1) at the masked positions.
+    q_ratio = 1 / expsig_minus_1[masked_indices]
+    words_that_were_masked = x0[masked_indices]
+    # Negative term: q_ratio times the log score of the true token.
+    neg_term = q_ratio * torch.gather(
+      log_score[masked_indices],
+      -1,
+      words_that_were_masked[..., None]).squeeze(-1)
+    score = log_score[masked_indices].exp()
+    # Positive term: sum of the scores over all non-mask tokens.
+    if self.mask_index == self.vocab_size - 1:
+      pos_term = score[:, :-1].sum(dim=-1)
+    else:
+      pos_term = score[:, : self.mask_index].sum(
+        dim=-1) + score[:, self.mask_index + 1:].sum(dim=-1)
+    # Term that depends only on sigma, not on the model output.
+    const = q_ratio * (q_ratio.log() - 1)
+    entropy = torch.zeros(* xt.shape, device=xt.device)
+    entropy[masked_indices] += pos_term - neg_term + const
+    return entropy

tr2d2-dna/env.sh ADDED Viewed

	@@ -0,0 +1,20 @@

+conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia
+pip install packaging
+pip install ninja
+pip install transformers
+pip install datasets
+pip install omegaconf
+conda install ipykernel
+python -m ipykernel install --user --name tr2d2 --display-name "Python (tr2d2)"
+pip install hydra-core --upgrade
+pip install hydra-submitit-launcher
+# for mdlm
+pip install causal-conv1d
+pip install lightning
+pip install timm
+pip install rich
+pip install scipy
+pip install wandb
+pip install gReLU

tr2d2-dna/eval_runs_batch.py ADDED Viewed

	@@ -0,0 +1,347 @@

+#!/usr/bin/env python3
+"""
+Batch evaluation script for multiple runs with checkpoints.
+This script:
+1. Scans a folder containing different runs
+2. For each run, finds checkpoints and selects the one with largest epoch number
+3. Evaluates that checkpoint and saves results indexed by run folder name
+"""
+import os
+import re
+import glob
+import argparse
+from pathlib import Path
+from diffusion import Diffusion
+import dataloader_gosai
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import oracle
+from scipy.stats import pearsonr
+import torch
+from tqdm import tqdm
+from eval_utils import get_eval_matrics
+from hydra import initialize, compose
+from hydra.core.global_hydra import GlobalHydra
+from dataclasses import dataclass
+from datetime import datetime
+import json
+@dataclass
+class Args:
+    """Evaluation settings passed to the sampling and evaluation code."""
+    # Total number of diffusion steps.
+    total_num_steps: int
+    # Number of samples generated per sampling call.
+    batch_size: int
+    # Number of repeated evaluation rounds that are aggregated.
+    num_seeds: int
+    # Total number of samples requested per round.
+    total_samples: int
+    # Sequence length.
+    seq_length: int
+def find_latest_checkpoint(run_dir):
+    """
+    Find the checkpoint with the largest epoch/step number in a run directory.
+    Args:
+        run_dir (str): Path to the run directory
+    Returns:
+        str or None: Path to the latest checkpoint, or None if no checkpoints found
+    """
+    ckpt_pattern = os.path.join(run_dir, "model_*.ckpt")
+    ckpt_files = glob.glob(ckpt_pattern)
+    if not ckpt_files:
+        return None
+    # Extract step numbers from checkpoint filenames
+    step_numbers = []
+    for ckpt_file in ckpt_files:
+        filename = os.path.basename(ckpt_file)
+        match = re.search(r'model_(\d+)\.ckpt', filename)
+        if match:
+            step_numbers.append((int(match.group(1)), ckpt_file))
+    if not step_numbers:
+        return None
+    # Return checkpoint with largest step number
+    step_numbers.sort(key=lambda x: x[0], reverse=True)
+    return step_numbers[0][1]
+def evaluate_checkpoint(checkpoint_path, args, cfg, pretrained_model, gosai_oracle,
+                       cal_atac_pred_new_mdl, highexp_kmers_999, n_highexp_kmers_999, device):
+    """
+    Evaluate a single checkpoint.
+    Args:
+        checkpoint_path (str): Path to the checkpoint file
+        args: Evaluation arguments
+        cfg: Configuration object
+        pretrained_model: Pretrained reference model
+        gosai_oracle: GOSAI oracle model
+        cal_atac_pred_new_mdl: ATAC prediction model
+        highexp_kmers_999: High expression k-mers
+        n_highexp_kmers_999: Number of high expression k-mers
+        device: Device to run evaluation on
+    Returns:
+        tuple: (eval_metrics_agg, total_rewards_agg) containing aggregated results
+    """
+    # Load the policy model from checkpoint
+    policy_model = Diffusion(cfg).to(device)
+    policy_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
+    policy_model.eval()
+    total_rewards_all = []
+    eval_metrics_all = []
+    print(f"Evaluating checkpoint: {os.path.basename(checkpoint_path)}")
+    for i in range(args.num_seeds):
+        iter_times = args.total_samples // args.batch_size
+        total_samples = []
+        total_rewards = []
+        range_bar = tqdm(range(iter_times), desc=f"Seed {i+1}", leave=False)
+        for j in range_bar:
+            x_eval, mean_reward_eval = policy_model.sample_finetuned(args, gosai_oracle)
+            total_samples.append(x_eval)
+            total_rewards.append(mean_reward_eval.item() * args.batch_size)
+        total_samples = torch.concat(total_samples)
+        eval_metrics = get_eval_matrics(samples=total_samples, ref_model=pretrained_model,
+                                        gosai_oracle=gosai_oracle, cal_atac_pred_new_mdl=cal_atac_pred_new_mdl,
+                                        highexp_kmers_999=highexp_kmers_999, n_highexp_kmers_999=n_highexp_kmers_999)
+        eval_metrics_all.append(eval_metrics)
+        total_rewards_all.append(np.sum(total_rewards) / args.total_samples)
+    # Aggregate results
+    eval_metrics_agg = {k: (np.mean([eval_metrics[k] for eval_metrics in eval_metrics_all]),
+                            np.std([eval_metrics[k] for eval_metrics in eval_metrics_all]))
+                       for k in eval_metrics_all[0].keys()}
+    total_rewards_agg = (np.mean(total_rewards_all), np.std(total_rewards_all))
+    return eval_metrics_agg, total_rewards_agg
+def save_results(results, output_file):
+    """
+    Save evaluation results to a text file.
+    Args:
+        results (dict): Dictionary containing results for each run
+        output_file (str): Path to output file
+    """
+    with open(output_file, 'w') as f:
+        f.write("="*80 + "\n")
+        f.write("BATCH EVALUATION RESULTS\n")
+        f.write("="*80 + "\n")
+        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+        f.write(f"Total runs evaluated: {len(results)}\n\n")
+        for run_name, result in results.items():
+            if result is None:
+                f.write(f"RUN: {run_name}\n")
+                f.write("-" * 60 + "\n")
+                f.write("Status: No checkpoints found or evaluation failed\n\n")
+                continue
+            eval_metrics_agg, total_rewards_agg, checkpoint_path = result
+            f.write(f"RUN: {run_name}\n")
+            f.write("-" * 60 + "\n")
+            f.write(f"Checkpoint: {os.path.basename(checkpoint_path)}\n")
+            f.write(f"Full path: {checkpoint_path}\n\n")
+            f.write("📊 EVALUATION METRICS:\n")
+            for metric_name in eval_metrics_agg.keys():
+                mean_val = eval_metrics_agg[metric_name][0]
+                std_val = eval_metrics_agg[metric_name][1]
+                f.write(f"  {metric_name:<20}: {mean_val:8.4f} ± {std_val:6.4f}\n")
+            f.write(f"\n🎯 TOTAL REWARDS:\n")
+            f.write(f"  {'Mean':<20}: {total_rewards_agg[0]:8.4f}\n")
+            f.write(f"  {'Std':<20}: {total_rewards_agg[1]:8.4f}\n")
+            f.write("\n")
+    print(f"Results saved to: {output_file}")
+def append_single_result(run_name, result, output_file, is_first_run=False):
+    """
+    Append a single successful run result to the output file.
+    Args:
+        run_name (str): Name of the run
+        result: Result tuple (eval_metrics_agg, total_rewards_agg, checkpoint_path)
+        output_file (str): Path to output file
+        is_first_run (bool): Whether this is the first successful run (write header)
+    """
+    mode = 'w' if is_first_run else 'a'
+    with open(output_file, mode) as f:
+        if is_first_run:
+            f.write("="*80 + "\n")
+            f.write("BATCH EVALUATION RESULTS\n")
+            f.write("="*80 + "\n")
+            f.write(f"Started on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write("Results are saved incrementally as each run completes.\n")
+            f.write("Only successful evaluations are included.\n\n")
+        eval_metrics_agg, total_rewards_agg, checkpoint_path = result
+        f.write(f"RUN: {run_name}\n")
+        f.write("-" * 60 + "\n")
+        f.write(f"Checkpoint: {os.path.basename(checkpoint_path)}\n")
+        f.write(f"Full path: {checkpoint_path}\n")
+        f.write(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+        f.write("📊 EVALUATION METRICS:\n")
+        for metric_name in eval_metrics_agg.keys():
+            mean_val = eval_metrics_agg[metric_name][0]
+            std_val = eval_metrics_agg[metric_name][1]
+            f.write(f"  {metric_name:<20}: {mean_val:8.4f} ± {std_val:6.4f}\n")
+        f.write(f"\n🎯 TOTAL REWARDS:\n")
+        f.write(f"  {'Mean':<20}: {total_rewards_agg[0]:8.4f}\n")
+        f.write(f"  {'Std':<20}: {total_rewards_agg[1]:8.4f}\n")
+        f.write("\n" + "="*80 + "\n\n")  # Add separator line and extra spacing
+def main():
+    parser = argparse.ArgumentParser(description="Batch evaluation of multiple runs")
+    parser.add_argument("--runs_dir", type=str, required=True,
+                       help="Directory containing run folders with checkpoints")
+    parser.add_argument("--output_file", type=str, default="batch_eval_results.txt",
+                       help="Output file to save results")
+    parser.add_argument("--device", type=str, default="cuda:0",
+                       help="Device to run evaluation on")
+    parser.add_argument("--total_num_steps", type=int, default=128,
+                       help="Total number of diffusion steps")
+    parser.add_argument("--batch_size", type=int, default=128,
+                       help="Batch size for evaluation")
+    parser.add_argument("--num_seeds", type=int, default=3,
+                       help="Number of random seeds for evaluation")
+    parser.add_argument("--total_samples", type=int, default=640,
+                       help="Total number of samples to generate")
+    parser.add_argument("--seq_length", type=int, default=200,
+                       help="Sequence length")
+    parser.add_argument("--pretrained_path", type=str,
+                       default=None,
+                       help="Path to pretrained model checkpoint")
+    args = parser.parse_args()
+    # Setup evaluation arguments
+    eval_args = Args(
+        total_num_steps=args.total_num_steps,
+        batch_size=args.batch_size,
+        num_seeds=args.num_seeds,
+        total_samples=args.total_samples,
+        seq_length=args.seq_length
+    )
+    device = args.device
+    # Initialize Hydra configuration
+    if GlobalHydra().is_initialized():
+        GlobalHydra.instance().clear()
+    initialize(config_path="configs_gosai", job_name="batch_eval")
+    cfg = compose(config_name="config_gosai.yaml")
+    print("Loading pretrained model and oracles...")
+    # Load pretrained model
+    pretrained_model = Diffusion.load_from_checkpoint(args.pretrained_path, config=cfg, map_location=device)
+    pretrained_model.eval()
+    # Load oracles
+    _, _, highexp_kmers_999, n_highexp_kmers_999, _, _, _ = oracle.cal_highexp_kmers(return_clss=True)
+    cal_atac_pred_new_mdl = oracle.get_cal_atac_orale(device=device)
+    cal_atac_pred_new_mdl.eval()
+    gosai_oracle = oracle.get_gosai_oracle(mode='eval', device=device)
+    gosai_oracle.eval()
+    print("Scanning for runs...")
+    # Find all run directories
+    runs_dir = Path(args.runs_dir)
+    if not runs_dir.exists():
+        print(f"Error: Directory {args.runs_dir} does not exist")
+        return
+    run_dirs = [d for d in runs_dir.iterdir() if d.is_dir()]
+    run_dirs.sort()  # Sort for consistent ordering
+    print(f"Found {len(run_dirs)} run directories")
+    results = {}
+    successful_runs = 0
+    failed_runs = 0
+    # Process each run
+    for i, run_dir in enumerate(tqdm(run_dirs, desc="Processing runs")):
+        run_name = run_dir.name
+        print(f"\nProcessing run {i+1}/{len(run_dirs)}: {run_name}")
+        # Find latest checkpoint
+        latest_ckpt = find_latest_checkpoint(str(run_dir))
+        if latest_ckpt is None:
+            print(f"  No checkpoints found in {run_name} - skipping")
+            failed_runs += 1
+            continue  # Skip this run entirely, don't save anything to file
+        print(f"  Found latest checkpoint: {os.path.basename(latest_ckpt)}")
+        try:
+            # Evaluate checkpoint
+            eval_metrics_agg, total_rewards_agg = evaluate_checkpoint(
+                latest_ckpt, eval_args, cfg, pretrained_model, gosai_oracle,
+                cal_atac_pred_new_mdl, highexp_kmers_999, n_highexp_kmers_999, device
+            )
+            result = (eval_metrics_agg, total_rewards_agg, latest_ckpt)
+            results[run_name] = result
+            successful_runs += 1
+            print(f"  ✓ Evaluation completed successfully")
+            # Save result incrementally (only for successful evaluations)
+            is_first_run = (len(results) == 1)  # First successful run
+            append_single_result(run_name, result, args.output_file, is_first_run=is_first_run)
+            print(f"  Result saved to {args.output_file}")
+        except Exception as e:
+            print(f"  ✗ Evaluation failed: {str(e)}")
+            failed_runs += 1
+            # Don't save failed evaluations to file either
+    # Add final summary to the file (only if there were successful runs)
+    if successful_runs > 0:
+        with open(args.output_file, 'a') as f:
+            f.write("="*80 + "\n")
+            f.write("FINAL SUMMARY\n")
+            f.write("="*80 + "\n")
+            f.write(f"Completed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"Total runs processed: {len(run_dirs)}\n")
+            f.write(f"Successful evaluations: {successful_runs}\n")
+            f.write(f"Failed/skipped runs: {failed_runs}\n")
+    else:
+        print(f"No successful evaluations - output file {args.output_file} not created")
+    # Print summary
+    print(f"\nFinal Summary:")
+    print(f"  Total runs processed: {len(run_dirs)}")
+    print(f"  Successful evaluations: {successful_runs}")
+    print(f"  Failed/skipped runs: {failed_runs}")
+    if successful_runs > 0:
+        print(f"  Results saved to: {args.output_file}")
+    else:
+        print(f"  No output file created (no successful evaluations)")
+if __name__ == "__main__":
+    main()

tr2d2-dna/eval_utils.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import numpy as np
+import torch
+from scipy.stats import pearsonr
+import dataloader_gosai
+import oracle
+def compare_kmer(kmer1, kmer2, n_sp1, n_sp2):
+    kmer_set = set(kmer1.keys()) | set(kmer2.keys())
+    counts = np.zeros((len(kmer_set), 2))
+    for i, kmer in enumerate(kmer_set):
+        if kmer in kmer1: counts[i][1] = kmer1[kmer] * n_sp2 / n_sp1
+        if kmer in kmer2: counts[i][0] = kmer2[kmer]
+    return pearsonr(counts[:, 0], counts[:, 1])[0]
+def get_eval_matrics(samples, ref_model, gosai_oracle, cal_atac_pred_new_mdl, highexp_kmers_999, n_highexp_kmers_999):
+    """samples: [B, 200]"""
+    info = {}
+    detokenized_samples = dataloader_gosai.batch_dna_detokenize(samples.detach().cpu().numpy()) # [B], strings with length 200
+    ref_log_lik = ref_model.get_likelihood(samples, num_steps=128, n_samples=1) # [B]
+    info['[log-lik-med]'] = torch.median(ref_log_lik).item()
+    preds = oracle.cal_gosai_pred_new(detokenized_samples, gosai_oracle, mode='eval')[:, 0]
+    info['[pred-activity-med]'] = np.median(preds).item()
+    atac = oracle.cal_atac_pred_new(detokenized_samples, cal_atac_pred_new_mdl)[:, 1]
+    info['[atac-acc%]'] = (atac > 0.5).sum().item() / len(samples) * 100
+    kmer = oracle.count_kmers(detokenized_samples)
+    info['[3-mer-corr]'] = compare_kmer(highexp_kmers_999, kmer, n_highexp_kmers_999, len(detokenized_samples)).item()
+    return info

tr2d2-dna/finetune.py ADDED Viewed

	@@ -0,0 +1,149 @@

+# direct reward backpropagation
+from diffusion import Diffusion
+from hydra import initialize, compose
+from hydra.core.global_hydra import GlobalHydra
+import numpy as np
+import oracle
+from scipy.stats import pearsonr
+import torch
+import torch.nn.functional as F
+import argparse
+import wandb
+import os
+import datetime
+from utils import str2bool, set_seed
+from finetune_dna import finetune
+from mcts import MCTS
+argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+argparser.add_argument('--base_path', type=str, default="")
+argparser.add_argument('--learning_rate', type=float, default=1e-4)
+argparser.add_argument('--num_epochs', type=int, default=100)
+argparser.add_argument('--num_accum_steps', type=int, default=4)
+argparser.add_argument('--truncate_steps', type=int, default=50)
+argparser.add_argument("--truncate_kl", type=str2bool, default=False)
+argparser.add_argument('--gumbel_temp', type=float, default=1.0)
+argparser.add_argument('--gradnorm_clip', type=float, default=1.0)
+argparser.add_argument('--batch_size', type=int, default=32)
+argparser.add_argument('--name', type=str, default='debug')
+argparser.add_argument('--total_num_steps', type=int, default=128)
+argparser.add_argument('--copy_flag_temp', type=float, default=None)
+argparser.add_argument('--save_every_n_epochs', type=int, default=10)
+argparser.add_argument('--eval_every_n_epochs', type=int, default=200)
+argparser.add_argument('--alpha', type=float, default=0.001)
+argparser.add_argument('--alpha_schedule_warmup', type=int, default=0)
+argparser.add_argument("--seed", type=int, default=0)
+# new
+argparser.add_argument('--run_name', type=str, default='drakes')
+argparser.add_argument("--device", default="cuda:0", type=str)
+argparser.add_argument("--save_path_dir", default=None, type=str)
+argparser.add_argument("--no_mcts", action='store_true', default=False)
+argparser.add_argument("--centering", action='store_true', default=False)
+argparser.add_argument("--reward_clip", action='store_true', default=False)
+argparser.add_argument("--reward_clip_value", type=float, default=15.0)
+argparser.add_argument("--select_topk", action='store_true', default=False)
+argparser.add_argument('--select_topk_value', type=int, default=10)
+argparser.add_argument("--restart_ckpt_path", type=str, default=None)
+# mcts
+argparser.add_argument('--num_sequences', type=int, default=10)
+argparser.add_argument('--num_children', type=int, default=50)
+argparser.add_argument('--num_iter', type=int, default=30) # iterations of mcts
+argparser.add_argument('--seq_length', type=int, default=200)
+argparser.add_argument('--time_conditioning', action='store_true', default=False)
+argparser.add_argument('--mcts_sampling', type=int, default=0) # for batched categorical sampling: '0' means gumbel noise
+argparser.add_argument('--buffer_size', type=int, default=100)
+argparser.add_argument('--wdce_num_replicates', type=int, default=16)
+argparser.add_argument('--noise_removal', action='store_true', default=False)
+argparser.add_argument('--grad_clip', action='store_true', default=False)
+argparser.add_argument('--resample_every_n_step', type=int, default=10)
+argparser.add_argument('--exploration', type=float, default=0.1)
+argparser.add_argument('--reset_tree', action='store_true', default=False)
+# eval
+args = argparser.parse_args()
+print(args)
+# pretrained model path
+CKPT_PATH = os.path.join(args.base_path, 'mdlm/outputs_gosai/pretrained.ckpt')
+log_base_dir = os.path.join(args.save_path_dir, 'mdlm/reward_bp_results_final')
+# reinitialize Hydra
+GlobalHydra.instance().clear()
+# Initialize Hydra and compose the configuration
+initialize(config_path="configs_gosai", job_name="load_model")
+cfg = compose(config_name="config_gosai.yaml")
+cfg.eval.checkpoint_path = CKPT_PATH
+curr_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+if args.no_mcts:
+    run_name = f'MDNS_buffer{args.buffer_size}_alpha{args.alpha}_resample{args.resample_every_n_step}_centering{args.centering}_{curr_time}'
+else:
+    run_name = f'MCTS_buffer{args.buffer_size}_alpha{args.alpha}_resample{args.resample_every_n_step}_num_iter{args.num_iter}_centering{args.centering}_select_topk{args.select_topk}_select_topk_value{args.select_topk_value}_{curr_time}'
+args.save_path = os.path.join(args.save_path_dir, run_name)
+os.makedirs(args.save_path, exist_ok=True)
+# wandb init
+wandb.init(project='search-rl', name=run_name, config=args, dir=args.save_path)
+log_path = os.path.join(args.save_path, 'log.txt')
+set_seed(args.seed, use_cuda=True)
+# Initialize the model
+if args.restart_ckpt_path is not None:
+    # Resume from saved ckpt
+    restart_ckpt_path = os.path.join(args.base_path, args.restart_ckpt_path)
+    restart_epoch = restart_ckpt_path.split('_')[-1].split('.')[0]
+    args.restart_epoch = restart_epoch
+    policy_model = Diffusion(cfg).to(args.device)
+    policy_model.load_state_dict(torch.load(restart_ckpt_path, map_location=args.device))
+else:
+    # Start from pretrained model
+    policy_model = Diffusion.load_from_checkpoint(cfg.eval.checkpoint_path, config=cfg, map_location=args.device)
+pretrained = Diffusion.load_from_checkpoint(cfg.eval.checkpoint_path, config=cfg, map_location=args.device)
+reward_model = oracle.get_gosai_oracle(mode='train', device=args.device)
+#reward_model_eval = oracle.get_gosai_oracle(mode='eval').to(args.device)
+reward_model.eval()
+pretrained.eval()
+#reward_model_eval.eval()
+# define mcts
+mcts = MCTS(args, cfg, policy_model, pretrained, reward_model)
+_, _, highexp_kmers_999, n_highexp_kmers_999, _, _, _ = oracle.cal_highexp_kmers(return_clss=True)
+cal_atac_pred_new_mdl = oracle.get_cal_atac_orale(device=args.device)
+cal_atac_pred_new_mdl.eval()
+gosai_oracle = oracle.get_gosai_oracle(mode='eval', device=args.device)
+gosai_oracle.eval()
+print("args.device:", args.device)
+print("policy_model device:", policy_model.device)
+print("pretrained device:", pretrained.device)
+print("reward_model device:", reward_model.device)
+print("mcts device:", mcts.device)
+print("gosai_oracle device:", gosai_oracle.device)
+print("cal_atac_pred_new_mdl device:", cal_atac_pred_new_mdl.device)
+eval_model_dict = {
+    "gosai_oracle": gosai_oracle,
+    "highexp_kmers_999": highexp_kmers_999,
+    "n_highexp_kmers_999": n_highexp_kmers_999,
+    "cal_atac_pred_new_mdl": cal_atac_pred_new_mdl,
+    "gosai_oracle": gosai_oracle
+}
+finetune(args = args, cfg = cfg, policy_model = policy_model,
+        reward_model = reward_model, mcts = mcts,
+        pretrained_model = pretrained,
+        eval_model_dict = eval_model_dict)

tr2d2-dna/finetune_dna.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# direct reward backpropagation
+from hydra import initialize, compose
+from hydra.core.global_hydra import GlobalHydra
+import numpy as np
+import oracle
+from scipy.stats import pearsonr
+import torch
+import torch.nn.functional as F
+import argparse
+import wandb
+import os
+import datetime
+from utils import str2bool, set_seed
+# imports
+from finetune_utils import loss_wdce
+from tqdm import tqdm
+def finetune(args, cfg, policy_model, reward_model, mcts = None, pretrained_model = None, eps=1e-5):
+    """
+    Finetuning with WDCE loss
+    """
+    dt = (1 - eps) / args.total_num_steps
+    if args.no_mcts:
+        assert pretrained_model is not None, "pretrained model is required for no mcts"
+    else:
+        assert mcts is not None, "mcts is required for mcts"
+    # set model to train mode
+    policy_model.train()
+    torch.set_grad_enabled(True)
+    optim = torch.optim.AdamW(policy_model.parameters(), lr=args.learning_rate)
+    # record metrics
+    batch_losses = []
+    batch_rewards = []
+    # initialize the final seqs and log_rnd of the trajectories that generated those seqs
+    x_saved, log_rnd_saved, final_rewards_saved = None, None, None
+    # finetuning loop
+    pbar = tqdm(range(args.num_epochs))
+    for epoch in pbar:
+        # store metrics
+        rewards = []
+        losses = []
+        policy_model.train()
+        with torch.no_grad():
+            if x_saved is None or epoch % args.resample_every_n_step == 0:
+                # compute final sequences and trajectory log_rnd
+                if args.no_mcts:
+                    x_final, log_rnd, final_rewards = policy_model.sample_finetuned_with_rnd(args, reward_model, pretrained_model)
+                else:
+                    x_final, log_rnd, final_rewards = mcts.forward(args.reset_tree)
+                # save for next iteration
+                x_saved, log_rnd_saved, final_rewards_saved = x_final, log_rnd, final_rewards
+            else:
+                x_final, log_rnd, final_rewards = x_saved, log_rnd_saved, final_rewards_saved
+        # compute wdce loss
+        loss = loss_wdce(policy_model, log_rnd, x_final, num_replicates=args.wdce_num_replicates)
+        # gradient descent
+        loss.backward()
+        # optimizer
+        if args.grad_clip:
+            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), args.gradnorm_clip)
+        optim.step()
+        optim.zero_grad()
+        pbar.set_postfix(loss=loss.item())
+        losses.append(loss.item())
+        # sample a eval batch with updated policy to evaluate rewards
+        x_eval, mean_reward_eval = policy_model.sample_finetuned(args, reward_model)
+        batch_losses.append(loss.cpu().detach().numpy())
+        batch_rewards.append(mean_reward_eval.cpu().detach().item())
+        losses.append(loss.cpu().detach().numpy())
+        rewards = np.array(mean_reward_eval.detach().cpu().numpy())
+        losses = np.array(losses)
+        mean_reward_search = final_rewards.mean().item()
+        min_reward_search = final_rewards.min().item()
+        max_reward_search = final_rewards.max().item()
+        median_reward_search = final_rewards.median().item()
+        #reward_losses = np.array(reward_losses)
+        print("epoch %d"%epoch, "mean reward %f"%mean_reward_eval, "mean loss %f"%np.mean(losses))
+        wandb.log({"epoch": epoch, "mean_reward": mean_reward_eval, "mean_loss": np.mean(losses),
+                   "mean_reward_search": mean_reward_search, "min_reward_search": min_reward_search,
+                   "max_reward_search": max_reward_search, "median_reward_search": median_reward_search})
+        if (epoch+1) % args.save_every_n_epochs == 0:
+            model_path = os.path.join(args.save_path, f'model_{epoch}.ckpt')
+            torch.save(policy_model.state_dict(), model_path)
+            print(f"model saved at epoch {epoch}")
+    wandb.finish()
+    return batch_losses

tr2d2-dna/finetune_utils.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import random
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+from utils import sample_categorical_logits
+import numpy as np
+from tqdm import tqdm
+import torch.distributed as dist
+import torch.nn.functional as F
+def compute_ess(log_rnd, normalize=True):
+    """
+    log_rnd: [B]
+    Compute effective sample size:
+        If normalize: divide ESS by batch size, so range is [0, 1];
+        otherwise, range is [0, B]
+    """
+    weights = log_rnd.detach().softmax(dim=-1)
+    ess = 1 / (weights ** 2).sum().item()
+    return ess / log_rnd.shape[0] if normalize else ess
+def to_one_hot(x_idx, num_classes=4):
+    oh = F.one_hot(x_idx.long(), num_classes=num_classes)
+    return oh.float()
+def rnd(model, reward_model, batch_size, scale=1, device='cuda:0'):
+    r"""
+    Run random order sampling and compute the RND $\log\frac{dP^*}{dP^u}$ along the trajectory
+    reward_model: r(X)
+    return:
+    - x: the final samples, [B, D]
+    - log_rnd: the log RND along this trajectory, [B]
+    """
+    if hasattr(model, 'module'):
+        model = model.module
+    x = torch.full((batch_size, model.length), model.vocab_size-1).to(device=device, dtype=torch.int64)
+    batch_arange = torch.arange(batch_size, device=device)
+    jump_pos = torch.rand(x.shape, device=device).argsort(dim=-1)
+    # jump_times, jump_pos = torch.rand(x.shape, device=device).sort(dim=-1)
+    # jump_times: Unif[0,1] in increasing order
+    # jump_pos: random permutation of range(D)
+    log_rnd = torch.zeros(batch_size, device=device) # [B]
+    for d in range(model.length-1, -1, -1):
+        # jump at time jump_times[:, d] at position jump_pos[:, d]
+        logits = model(x)[:, :, :-1] # [B, D, N-1]
+        update = sample_categorical_logits(
+            logits[batch_arange, jump_pos[:, d]]) # [B]
+        if torch.is_grad_enabled(): # avoid issues with in-place operations
+            x = x.clone()
+        x[batch_arange, jump_pos[:, d]] = update
+        log_rnd += -np.log(model.vocab_size-1) - logits[batch_arange, jump_pos[:, d], update]
+    log_rnd += scale * reward_model(x) # [B]
+    return x, log_rnd
+@torch.no_grad()
+def sampling(model, batch_size, rounds=1, device='cuda:0'):
+    """Any order autoregressive sampling"""
+    if hasattr(model, 'module'):
+        model = model.module
+    batch_arange = torch.arange(batch_size, device=device)
+    all_samples = []
+    for _ in tqdm(range(rounds), leave=False):
+        x = torch.full((batch_size, model.length), model.vocab_size-1).to(device=device, dtype=torch.int64)
+        jump_pos = torch.rand(x.shape, device=device).argsort(dim=-1)
+        # jump_times, jump_pos = torch.rand(x.shape, device=device).sort(dim=-1)
+        # jump_times: Unif[0,1] in increasing order
+        # jump_pos: random permutation of range(D)
+        for d in tqdm(range(model.length-1, -1, -1), leave=False):
+            # jump at time jump_times[:, d] at position jump_pos[:, d]
+            logits = model.logits(x)[:, :, :-1] # [B, D, N-1], not log-softmaxed but fine
+            update = sample_categorical_logits(
+                logits[batch_arange, jump_pos[:, d]]) # [B]
+            x[batch_arange, jump_pos[:, d]] = update
+        all_samples.append(x)
+    return torch.cat(all_samples) # (rounds * B, L)
+def loss_ce(log_rnd):
+    """Cross entropy loss KL(P^*||P^u)"""
+    weights = log_rnd.detach().softmax(dim=-1)
+    return (log_rnd * weights).sum()
+def loss_lv(log_rnd):
+    r"""Log variance loss Var_{P^\bar{u}}\log\frac{dP^*}{dP^u}"""
+    return log_rnd.var()
+def loss_re_rf(log_rnd, const=0):
+    r"""Relative entropy loss KL(P^u||P^*) with REINFORCE trick"""
+    return (-log_rnd * (-log_rnd.detach() + const)).mean()
+def loss_wdce(policy_model, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering = False):
+    r"""
+    Weighted denoising cross entropy loss
+    X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
+    log_rnd: [B]; x: [B, L] (no mask)
+    num_replicates: R, number of replicates of each row in x
+    weight_func: w(lambda) for each sample, 1/lambda by default
+    """
+    mask_index = policy_model.mask_index
+    if hasattr(policy_model, 'module'):
+        policy_model = policy_model.module
+    batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
+    batch_weights = log_rnd.detach_().softmax(dim=-1)
+    if centering:
+        batch_weights = batch_weights - batch_weights.mean(dim=-1, keepdim=True)
+    batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0) # [B*R]
+    lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
+    lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
+    masked_index = torch.rand(*batch.shape, device=batch.device) < lamda[..., None] # [B*R, D]
+    perturbed_batch = torch.where(masked_index, mask_index, batch)
+    # add time conditioning
+    t = lamda
+    sigma_t = -torch.log1p(-(1 - eps) * t)
+    # compute logits
+    logits = policy_model(perturbed_batch, sigma_t)
+    losses = torch.zeros(*batch.shape, device=batch.device, dtype=logits.dtype) # [B*R, D]
+    losses[masked_index] = torch.gather(input=logits[masked_index], dim=-1,
+                                        index=batch[masked_index][..., None]).squeeze(-1)
+    return - (losses.sum(dim=-1) * lamda_weights * batch_weights).mean()
+def loss_dce(model, x, weight_func=lambda l: 1/l):
+    r"""
+    Denoising cross entropy loss, x [B, D] are ground truth samples
+    weight_func: w(lambda) for each sample, 1/lambda by default
+    """
+    lamda = torch.rand(x.shape[0], device=x.device) # [B]
+    lamda_weights = weight_func(lamda).clamp(max=1e5) # [B]
+    masked_index = torch.rand(*x.shape, device=x.device) < lamda[..., None] # [B, D]
+    perturbed_batch = torch.where(masked_index, model.vocab_size-1, x)
+    logits = model(perturbed_batch)
+    losses = torch.zeros(*x.shape, device=x.device, dtype=logits.dtype) # [B, D]
+    losses[masked_index] = torch.gather(input=logits[masked_index], dim=-1,
+                                        index=x[masked_index][..., None]).squeeze(-1)
+    return - (losses.sum(dim=-1) * lamda_weights).mean()

tr2d2-dna/mcts.py ADDED Viewed

	@@ -0,0 +1,581 @@

+import numpy as np
+import torch
+import torch.nn.functional as F
+import numpy as np
+import random as rd
+from finetune_utils import to_one_hot
+from utils import StepTimer
+import noise_schedule
+### BEGINNING OF NODE CLASS ###
+class Node:
+    """
+        Node class: partially unmasked sequence
+        - parentNode: Node object at previous time step
+        - childNodes: set of M Node objects generated from sampling M distinct unmasking schemes
+        - totalReward: vector of cumulative rewards for all K objectives
+        - visits: number of times the node has been visited by an interation
+        - path: array of partially unmasked SMILES strings leading to the node from the completely masked root node
+        - timestep: the time step where the sequence was sampled
+    """
+    def __init__(self, args, tokens=None, log_rnd=None, log_policy_step=None, log_pretrained_step=None, parentNode=None, childNodes=None, totalReward=None, timestep=None):
+        self.args = args
+        self.parentNode = parentNode
+        self.childNodes = [] if childNodes is None else childNodes
+        self.log_rnd = log_rnd # stores the log_rnd up to that step
+        #self.log_p0 = 0 # stores the log probabiltiy of the unmasking step from the previous iteration
+        self.log_policy_step = log_policy_step # stores the log probability of the unmasking step under the current policy
+        self.log_pretrained_step = log_pretrained_step
+        # initialize total rewards to the reward of the roll out unmasked sequence
+        self.totalReward = totalReward # potential reward of the node based on generated children
+        # set initial visits to 1
+        self.visits = 1
+        #self.path = path
+        # set timestep (value between 0 and num_steps)
+        self.timestep = timestep
+        # set the sampling probabiltiy equal to the probability from the reverse posterior
+        #self.sampleProb = sampleProb # stores the probability of the sampling step under the current policy
+        # dict with 'seqs' as token array and 'attention_mask'
+        self.tokens = tokens
+    def selectNode(self, rootNode):
+        """
+            Selects a node to move to among the children nodes based on select score
+        """
+        # extract the status of the current node
+        nodeStatus = self.getExpandStatus()
+        # if the node is a legal non-leaf node
+        if (nodeStatus == 3):
+            # initialize array that will store select score vectors of each child node
+            selectScores = []
+            selectable_children = [] # children nodes that can be selected
+            for childNode in self.childNodes:
+                childStatus = childNode.getExpandStatus()
+                # only append child if it is legal leaf node (expandable) or legal non-leaf node
+                if childStatus == 2 or childStatus == 3:
+                    selectScore = childNode.calcSelectScore()
+                    if torch.is_tensor(selectScore) and selectScore.numel() == 1:
+                        selectScore = selectScore.item()
+                    selectable_children.append(childNode)
+                    selectScores.append(float(selectScore))
+            # no selectable children
+            if len(selectable_children) == 0:
+                return rootNode, 3
+            selectScores = np.asarray(selectScores, dtype=np.float64)
+            temp = 1.0
+            # compute softmax probabiltiies
+            m = np.max(selectScores)
+            exps = np.exp((selectScores - m) / temp)
+            tot = exps.sum()
+            if not np.isfinite(tot) or tot <= 0.0:
+                probs = np.full(len(selectable_children), 1.0 / len(selectable_children))
+            else:
+                probs = exps / tot
+            # choose child index from categorical distribution
+            idx = np.random.choice(len(selectable_children), p=probs)
+            selected = selectable_children[idx]
+            # return selected child node and status
+            return selected, selected.getExpandStatus()
+        elif (nodeStatus == 2):
+            return self, nodeStatus
+        # if node is not valid non-leaf node
+        return rootNode, 3
+    def selectNodeTopK(self, rootNode, k = 3, temp  = 1.0):
+        """
+        Pick from the top-k by select score.
+        Returns: (selected_node, selected_status)
+        """
+        nodeStatus = self.getExpandStatus()
+        # If expandable leaf, return it directly
+        if nodeStatus == 2:
+            return self, nodeStatus
+        if nodeStatus == 3:
+            selectable_children = []
+            selectScores = []
+            # collect candidates
+            for ch in self.childNodes:
+                s = ch.getExpandStatus()
+                if s in (2, 3):
+                    sc = ch.calcSelectScore()
+                    if torch.is_tensor(sc):
+                        sc = sc.item() if sc.numel() == 1 else float(sc.mean().item())
+                    sc = float(sc) if np.isfinite(sc) else -np.inf  # push bad scores to -inf
+                    selectable_children.append(ch)
+                    selectScores.append(sc)
+            if not selectable_children:
+                return rootNode, 3
+            scores = np.asarray(selectScores, dtype=np.float64)
+            # top-k indices (largest scores)
+            k_eff = min(k, len(scores))
+            topk_idx = np.argpartition(-scores, kth=k_eff-1)[:k_eff]
+            # sort the top-k by score desc for stability
+            topk_idx = topk_idx[np.argsort(-scores[topk_idx])]
+            # slice down to top-k pool
+            pool_scores = scores[topk_idx]
+            pool_children = [selectable_children[i] for i in topk_idx]
+            # softmax over the top-k
+            m = np.max(pool_scores)
+            z = (pool_scores - m) / max(1e-8, temp)
+            exps = np.exp(np.clip(z, -60, 60))
+            tot = exps.sum()
+            if not np.isfinite(tot) or tot <= 0.0:
+                idx_local = 0  # best
+            else:
+                probs = exps / tot
+                idx_local = int(np.random.choice(len(pool_children), p=probs))
+            selected = pool_children[idx_local]
+            return selected, selected.getExpandStatus()
+        return rootNode, 3
+    def addChildNode(self, tokens, log_rnd, log_policy_step, log_pretrained_step, totalReward):
+        """"
+            Adds a child node:
+            log_rnd: log_rnd of the path up to the added child node
+            log_policy_step: scalar value of the log-prob of sampling the step under the policy
+            log_pretrained_step: scalar value of the log-prob of sampling the step under the pretrained model
+        """
+        child = Node(args=self.args,
+                     tokens=tokens,
+                     log_rnd = log_rnd,
+                     log_policy_step=log_policy_step,
+                     log_pretrained_step=log_pretrained_step,
+                     parentNode=self,
+                     childNodes=[],
+                     totalReward=totalReward,
+                     timestep=self.timestep+1)
+        self.childNodes.append(child)
+        return child
+    def update_logrnd(self, log_policy_step, log_rnd):
+        self.log_policy_step = log_policy_step
+        self.log_rnd = log_rnd
+    def updateNode(self, rewards):
+        """
+            Updates the cumulative rewards vector with the reward vector at a descendent leaf node.
+            Increments the number of visits to the node.
+        """
+        self.visits += 1
+        self.totalReward += rewards # singleton tensor
+    def calcSelectScore(self):
+        """
+            Calculates the select score for the node from the cumulative rewards vector and number of visits.
+            - c: determines the degree of exploration
+            - minSelectScore: determines the
+        """
+        # K-dimensional vector of normalized rewards for each objective
+        normRewards = self.totalReward / self.visits
+        # scales the cumulative reward by the sampling probability
+        return normRewards + (self.args.exploration * self.log_policy_step * np.sqrt(self.parentNode.visits) / self.visits)
+    def getExpandStatus(self):
+        """
+            Returns an integer indicating whether the node is a:
+            1. terminal node (sequence is fully unmasked)
+            2. legal leaf node (partially unmasked sequence that can be expanded)
+            3. legal non-leaf node (already expanded sequence with M child nodes)
+        """
+        if self.timestep == self.args.total_num_steps:
+            return 1
+        elif (self.timestep < self.args.total_num_steps) and (len(self.childNodes) == 0):
+            return 2
+        return 3
+### END OF NODE CLASS ###
+### BEGINNING OF MCTS CLASS ###
+class MCTS:
+    def __init__(self, args, config, policy_model, pretrained, rewardFunc, rootNode=None):
+        self.timer = StepTimer(policy_model.device)
+        # debugging
+        self.buf_stats = {"insert":0, "replace":0, "reject_worse":0,
+                          "reject_dup":0, "reject_nonfinite":0}
+        self._seen_hashes = set()
+        self.device = policy_model.device
+        print(f"MCTS device: {self.device}")
+        self.args = args
+        self.config = config
+        self.noise = noise_schedule.get_noise(config)
+        self.time_conditioning = args.time_conditioning
+        self.mask_index = policy_model.mask_index
+        masked_seq = torch.ones((self.args.seq_length), device = self.device) * self.mask_index
+        masked_tokens = {'seqs': masked_seq.to(dtype=torch.long), 'attention_mask': torch.ones_like(masked_seq).to(self.device)}
+        if rootNode is None:
+            self.rootNode = Node(self.args, tokens = masked_tokens,
+                                 log_rnd=torch.zeros((), device=self.device),
+                                 log_policy_step=torch.zeros((), device=self.device),
+                                 log_pretrained_step=torch.zeros((), device=self.device),
+                                 totalReward=torch.zeros((), device=self.device), timestep=0)
+        else:
+            self.rootNode = rootNode  # stores the root node of the tree
+        # dictionary:
+        # "seq": final unmasked sequence
+        # "traj": list of (N_steps, L)
+        # "reward": reward of the trajectory
+        self.buffer = [] # List[Dict[str, Any]]
+        self.buffer_size = args.buffer_size
+        self.num_steps = args.total_num_steps
+        self.num_sequences = args.num_sequences
+        # pretrained model
+        self.pretrained = pretrained
+        # the policy model that we want to finetune
+        self.policy_model = policy_model
+        #self.tokenizer = policy_model.tokenizer
+        self.device = policy_model.device
+        self.sequence_length = args.seq_length
+        self.num_iter = args.num_iter
+        self.num_children = args.num_children
+        # score functions
+        self.rewardFunc = rewardFunc
+        self.iter_num = 0
+        self.reward_log = []
+        self.logrnd_log = []
+        self.policy_model.eval()
+        self.pretrained.eval()
+        self.rewardFunc.eval()
+    def _hash_tokens(self, t):
+        # t: (L,) torch.long
+        return tuple(t.detach().cpu().tolist())
+    def reset(self, resetTree):
+        self.iter_num = 0
+        self.buffer = []
+        self._seen_hashes = set()  # Clear the hash set too!
+        self.reward_log = []
+        self.logrnd_log = []
+        # add option to continue with the same tree
+        if resetTree:
+            masked_seq = torch.ones((self.args.seq_length), device = self.device) * self.mask_index
+            masked_tokens = {'seqs': masked_seq.to(dtype=torch.long), 'attention_mask': torch.ones_like(masked_seq).to(self.device)}
+            self.rootNode = Node(self.args, tokens = masked_tokens,
+                                 log_rnd=torch.zeros((), device=self.device),
+                                 log_policy_step=torch.zeros((), device=self.device),
+                                 log_pretrained_step=torch.zeros((), device=self.device),
+                                 totalReward=torch.zeros((), device=self.device), timestep=0)
+    def forward(self, resetTree=False):
+        self.reset(resetTree)
+        while (self.iter_num < self.num_iter):
+            self.iter_num += 1
+            # traverse the tree form the root node until a leaf node
+            with self.timer.section("select"):
+                leafNode, _ = self.select(self.rootNode)
+            # expand leaf node into num_children partially unmasked sequences at the next timestep
+            with self.timer.section("expand"):
+                self.expand(leafNode)
+        final_x, log_rnd, final_rewards = self.consolidateBuffer()
+        rows = self.timer.summary()
+        print("\n=== Timing summary (by total time) ===")
+        for name, cnt, total, mean, p50, p95 in rows:
+            print(f"{name:30s}  n={cnt:5d}  total={total:8.3f}s  mean={mean*1e3:7.2f}ms  "
+                f"p50={p50*1e3:7.2f}ms  p95={p95*1e3:7.2f}ms")
+        # return final_seqs (B, L), log_rnd (B, ), and final rewards (B, )
+        return final_x, log_rnd, final_rewards
+    def updateBuffer(self, x_final, log_rnd, final_reward):
+        B = x_final.shape[0]
+        for i in range(B):
+            # Finite check
+            if not torch.isfinite(final_reward[i]) or not torch.isfinite(log_rnd[i]):
+                self.buf_stats["reject_nonfinite"] += 1
+                continue
+            h = self._hash_tokens(x_final[i])
+            if h in self._seen_hashes:
+                self.buf_stats["reject_dup"] += 1
+                continue
+            item = {"x_final": x_final[i].clone(),
+                    "log_rnd": log_rnd[i].clone(),
+                    "final_reward": final_reward[i].clone()}
+            if len(self.buffer) < self.buffer_size:
+                self.buffer.append(item)
+                self._seen_hashes.add(h)
+                self.buf_stats["insert"] += 1
+            else:
+                # replace if strictly better, or tie-break with log_rnd
+                min_idx, min_item = min(
+                    enumerate(self.buffer),
+                    key=lambda kv: (kv[1]["final_reward"].item(), kv[1]["log_rnd"].item())
+                )
+                cand_key = (final_reward[i].item(), log_rnd[i].item())
+                min_key  = (min_item["final_reward"].item(), min_item["log_rnd"].item())
+                if cand_key > min_key:  # allow ties via 2nd key
+                    # update hash set
+                    old_h = self._hash_tokens(self.buffer[min_idx]["x_final"])
+                    if old_h in self._seen_hashes:
+                        self._seen_hashes.remove(old_h)
+                    self.buffer[min_idx] = item
+                    self._seen_hashes.add(h)
+                    self.buf_stats["replace"] += 1
+                else:
+                    self.buf_stats["reject_worse"] += 1
+    def print_buffer_stats(self):
+        print("[BUFFER] ",
+              " ".join(f"{k}={v}" for k,v in self.buf_stats.items()),
+              f" size={len(self.buffer)}/{self.buffer_size}")
+        if len(self.buffer):
+            vals = torch.stack([b["final_reward"] for b in self.buffer]).float()
+            print(f"[BUFFER] reward min/mean/max: {vals.min():.4f} {vals.mean():.4f} {vals.max():.4f}")
+    def consolidateBuffer(self):
+        """
+        returns x_final, log_rnd, and final_rewards in tensors
+        """
+        x_final = []
+        log_rnd = []
+        final_rewards = []
+        for item in self.buffer:
+            x_final.append(item["x_final"])
+            log_rnd.append(item["log_rnd"])
+            final_rewards.append(item["final_reward"])
+        x_final = torch.stack(x_final, dim=0) # (B, L)
+        log_rnd = torch.stack(log_rnd, dim=0).to(dtype=torch.float32) # (B)
+        final_rewards = torch.stack(final_rewards, dim=0).to(dtype=torch.float32) # (B)
+        return x_final, log_rnd, final_rewards
+    def isPathEnd(self, path, maxDepth):
+        """
+            Checks if the node is completely unmasked (ie. end of path)
+            or if the path is at the max depth
+        """
+        if (path[-1] != self.mask_index).all():
+            return True
+        elif len(path) >= maxDepth:
+            return True
+        return False
+    def select(self, currNode, eps=1e-5):
+        """
+            Traverse the tree from the root node until reaching a legal leaf node.
+            For every visited node that has a parent, the log-policy of the
+            parent -> child step is recomputed with the policy model and the
+            running sum of (log_pretrained_step - log_policy_step) is written
+            back to the node through update_logrnd.
+            Returns (node, nodeStatus) as soon as nodeStatus == 2.
+        """
+        #iter = 1
+        # running sum of per-step log ratios along the traversed path
+        updated_log_rnd = torch.zeros((), device=self.device)
+        while True:
+            if self.args.select_topk:
+                currNode, nodeStatus = currNode.selectNodeTopK(self.rootNode, k=self.args.select_topk_value, temp=1.0)
+            else:
+                currNode, nodeStatus = currNode.selectNode(self.rootNode)
+            if currNode.parentNode is not None:
+                # compute new log_policy
+                child_tokens = currNode.tokens['seqs'].to(self.device)
+                # NOTE(review): attn_mask is moved to the device but not used below
+                attn_mask = currNode.tokens['attention_mask'].to(self.device)
+                parent = currNode.parentNode
+                parent_tokens = parent.tokens['seqs'].to(self.device)
+                # the log-policy is always evaluated at t = 1 with a fixed step size
+                t = torch.ones(1, device = self.device)
+                dt = (1 - eps) / self.num_steps
+                with torch.no_grad():
+                    with self.timer.section("select.compute_log_policy"):
+                        updated_log_policy_step = self.policy_model.compute_log_policy(parent_tokens,
+                                                                                   child_tokens,
+                                                                                   t=t, dt=dt)
+                # NOTE(review): updated_log_rnd is mutated in place and the same tensor object
+                # is passed to update_logrnd; if the node keeps a reference instead of a copy,
+                # every node on the path would share one value -- confirm against Node.update_logrnd
+                updated_log_rnd += (currNode.log_pretrained_step - updated_log_policy_step)
+                currNode.update_logrnd(updated_log_policy_step, updated_log_rnd) # update log_rnd
+            # node is terminal node or legal leaf node, return for expansion
+            if nodeStatus == 2:
+                return currNode, nodeStatus
+            elif nodeStatus == 1:
+                # restart the traversal from the root
+                # NOTE(review): updated_log_rnd is not reset here, so the sum keeps the
+                # contribution of the abandoned path -- confirm this is intended
+                currNode = self.rootNode
+    def expand(self, parentNode, eps=1e-5):
+        """
+            Sample unmasking steps from the pre-trained MDLM
+            adds num_children partially unmasked sequences to the children of the parentNode
+        """
+        num_children = self.num_children
+        # initialize child rewards that will be added to total rewards
+        allChildReward = torch.zeros((), device=self.device)
+        # compute number of rollout steps
+        # if parentNode.timestep = self.num_steps then num_rollout_steps = 1
+        num_rollout_steps = self.num_steps - parentNode.timestep
+        # array of rollout timesteps from the timestep of parent node to 0
+        rollout_t = torch.linspace(1, eps, self.num_steps + 1, device=self.device)
+        dt = (1 - eps) / self.num_steps
+        # initialize x and attn_mask
+        x = parentNode.tokens['seqs'].to(self.device)
+        attn_mask = parentNode.tokens['attention_mask'].to(self.device)
+        parent_log_rnd = parentNode.log_rnd # stores the log_rnd up to parent node
+        t = rollout_t[parentNode.timestep] * torch.ones(1, 1, device = self.device)
+        # generate (n_children, seq_length) array of sampled children nodes
+        # sample M child sequences and compute their log probabilities
+        with torch.no_grad():
+            with self.timer.section("expand.batch_mcts_reverse_step"):
+                child_log_p, x_children, child_log_policy_step, child_log_pretrained_step = \
+                    self.policy_model.batch_mcts_reverse_step(token_array=x,
+                                                            t=t, dt=dt,
+                                                            batch_size=num_children,
+                                                            pretrained=self.pretrained)
+        # compute weight of the step (num_children, 1)
+        child_log_rnd = (parent_log_rnd + (child_log_pretrained_step - child_log_policy_step)).to(self.device)
+        x_rollout = x_children
+        traj_log_rnd = child_log_rnd # initialize log_rnd for entire rolled out trajectory
+        # rollout under the policy and compute the log ratio at each step
+        with self.timer.section("expand.rollout_total"):
+            for i in range(1, num_rollout_steps):
+                t = rollout_t[parentNode.timestep + i] * torch.ones(num_children, 1, device = self.device)
+                with torch.no_grad():
+                    log_p, x_next, log_policy_step, log_pretrained_step = \
+                        self.policy_model.mcts_reverse_step(x_rollout,
+                                                            t=t, dt=dt,
+                                                            pretrained=self.pretrained)
+                # add the rollout step
+                traj_log_rnd += log_pretrained_step - log_policy_step
+                x_rollout = x_next
+        # if mask token remains, fully unmask
+        mask_positions = (x_rollout == self.mask_index)        # (B, L) bool
+        # does **any** mask remain in any sequence
+        any_mask_global = mask_positions.any().item()  # true if mask remains
+        if any_mask_global:
+            with torch.no_grad():
+                with self.timer.section("expand.noise_removal"):
+                    log_p, x_next, log_policy_step, log_pretrained_step = \
+                        self.policy_model.mcts_noise_removal(x_rollout,
+                                                            t=t, dt=dt,
+                                                            pretrained=self.pretrained)
+            traj_log_rnd += log_pretrained_step - log_policy_step
+            x_rollout = x_next
+        x_final = x_rollout # final sequences (B, L)
+        # edit? how is the reward model defined?
+        #childSequences = self.tokenizer.batch_decode(x_rollout)
+        #if self.args.data == "peptide":
+            #validSequences = []
+        # get final rewards
+        x_one_hot = to_one_hot(x_final)
+        x_one_hot_reward = torch.transpose(x_one_hot, 1, 2)
+        reward_preds = self.rewardFunc(x_one_hot_reward).squeeze(-1) # (num_children, 4)
+        rewards_value = reward_preds[:, 0] # (num_children, 1)
+        if self.args.reward_clip:
+            rewards = torch.clamp(rewards_value, max=self.args.reward_clip_value)
+        else:
+            rewards = rewards_value
+        traj_log_rnd += rewards / self.args.alpha
+        self.reward_log.append(rewards.detach().cpu().numpy())
+        self.logrnd_log.append(traj_log_rnd.detach().cpu().numpy())
+        # update buffer
+        with self.timer.section("expand.update_buffer"):
+            self.updateBuffer(x_final, traj_log_rnd, rewards)
+        for i in range(num_children):
+            # add to all child reward vector for backprop
+            allChildReward += rewards[i]
+            # create node for sequence and add to the children node of parent
+            childTokens = {'seqs': x_children[i].to(dtype=torch.long), 'attention_mask': attn_mask}
+            parentNode.addChildNode(tokens=childTokens,
+                                    log_rnd=child_log_rnd[i],
+                                    log_policy_step=child_log_policy_step[i],
+                                    log_pretrained_step=child_log_pretrained_step[i],
+                                    totalReward=rewards[i])
+        # backpropogate all child rewards
+        with self.timer.section("expand.backprop"):
+            self.backprop(parentNode, allChildReward)
+    def backprop(self, node, allChildReward):
+        # backpropogate rewards through the path leading to the leaf node from the root
+        while node:
+            node.updateNode(allChildReward)
+            node = node.parentNode

tr2d2-dna/models/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from . import ema
2	+ from . import dnaconv

tr2d2-dna/models/dnaconv.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import torch
+import torch.nn as nn
+import numpy as np
+import copy
+import torch.nn.functional as F
+class GaussianFourierProjection(nn.Module):
+    """
+    Gaussian random features for encoding time steps.
+    """
+    def __init__(self, embed_dim, scale=30.):
+        super().__init__()
+        # Randomly sample weights during initialization. These weights are fixed
+        # during optimization and are not trainable.
+        self.W = nn.Parameter(torch.randn(embed_dim // 2) * scale, requires_grad=False)
+    def forward(self, x):
+        x_proj = x[:, None] * self.W[None, :] * 2 * np.pi
+        return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+class Dense(nn.Module):
+    """
+    A fully connected layer that reshapes outputs to feature maps.
+    """
+    def __init__(self, input_dim, output_dim):
+        super().__init__()
+        self.dense = nn.Linear(input_dim, output_dim)
+    def forward(self, x):
+        return self.dense(x)[...]
+# from https://github.com/HannesStark/dirichlet-flow-matching
+class CNNModel(nn.Module):
+    def __init__(self, args, alphabet_size, num_cls, classifier=False):
+        super().__init__()
+        self.alphabet_size = alphabet_size
+        self.args = args
+        self.classifier = classifier
+        self.num_cls = num_cls
+        if self.args.clean_data:
+            self.linear = nn.Embedding(self.alphabet_size, embedding_dim=args.hidden_dim)
+        else:
+            inp_size = self.alphabet_size #+ 1
+            self.linear = nn.Conv1d(inp_size, args.hidden_dim, kernel_size=9, padding=4)
+            self.time_embedder = nn.Sequential(GaussianFourierProjection(embed_dim= args.hidden_dim),nn.Linear(args.hidden_dim, args.hidden_dim))
+        self.num_layers = 5 * args.num_cnn_stacks
+        self.convs = [nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, padding=4),
+                                     nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, padding=4),
+                                     nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, dilation=4, padding=16),
+                                     nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, dilation=16, padding=64),
+                                     nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, dilation=64, padding=256)]
+        self.convs = nn.ModuleList([copy.deepcopy(layer) for layer in self.convs for i in range(args.num_cnn_stacks)])
+        self.time_layers = nn.ModuleList([Dense(args.hidden_dim, args.hidden_dim) for _ in range(self.num_layers)])
+        self.norms = nn.ModuleList([nn.LayerNorm(args.hidden_dim) for _ in range(self.num_layers)])
+        self.final_conv = nn.Sequential(nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=1),
+                                   nn.ReLU(),
+                                   nn.Conv1d(args.hidden_dim, args.hidden_dim if classifier else self.alphabet_size, kernel_size=1))
+        self.dropout = nn.Dropout(args.dropout)
+        if classifier:
+            self.cls_head = nn.Sequential(nn.Linear(args.hidden_dim, args.hidden_dim),
+                                   nn.ReLU(),
+                                   nn.Linear(args.hidden_dim, self.num_cls))
+        if self.args.cls_free_guidance and not self.classifier:
+            self.cls_embedder = nn.Embedding(num_embeddings=self.num_cls + 1, embedding_dim=args.hidden_dim)
+            self.cls_layers = nn.ModuleList([Dense(args.hidden_dim, args.hidden_dim) for _ in range(self.num_layers)])
+    def forward(self, seq, t, cls = None, return_embedding=False):
+        # adapt it to support both seq indices input and one-hot input
+        if not (seq.ndim > 2 and seq.shape[-1] == self.alphabet_size):
+            seq = F.one_hot(seq, num_classes=self.alphabet_size).float()
+        if self.args.clean_data:
+            feat = self.linear(seq)
+            feat = feat.permute(0, 2, 1)
+        else:
+            time_emb = F.relu(self.time_embedder(t))
+            feat = seq.permute(0, 2, 1)
+            feat = F.relu(self.linear(feat))
+        if self.args.cls_free_guidance and not self.classifier:
+            cls_emb = self.cls_embedder(cls)
+        for i in range(self.num_layers):
+            h = self.dropout(feat.clone())
+            if not self.args.clean_data:
+                h = h + self.time_layers[i](time_emb)[:, :, None]
+            if self.args.cls_free_guidance and not self.classifier:
+                h = h + self.cls_layers[i](cls_emb)[:, :, None]
+            h = self.norms[i]((h).permute(0, 2, 1))
+            h = F.relu(self.convs[i](h.permute(0, 2, 1)))
+            if h.shape == feat.shape:
+                feat = h + feat
+            else:
+                feat = h
+        feat = self.final_conv(feat)
+        feat = feat.permute(0, 2, 1)
+        if self.classifier:
+            feat = feat.mean(dim=1)
+            if return_embedding:
+                embedding = self.cls_head[:1](feat)
+                return self.cls_head[1:](embedding), embedding
+            else:
+                return self.cls_head(feat)
+        return feat

tr2d2-dna/models/ema.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import torch
+class ExponentialMovingAverage:
+  """
+  Maintains (exponential) moving average of a set of parameters.
+  """
+  def __init__(self, parameters, decay, use_num_updates=True):
+    """
+    Args:
+        parameters: Iterable of `torch.nn.Parameter`; usually the result of
+            `model.parameters()`.
+        decay: The exponential decay.
+        use_num_updates: Whether to use number of updates when computing
+            averages.
+    """
+    if decay < 0.0 or decay > 1.0:
+      raise ValueError('Decay must be between 0 and 1')
+    self.decay = decay
+    self.num_updates = 0 if use_num_updates else None
+    self.shadow_params = [p.clone().detach()
+                          for p in parameters if p.requires_grad]
+    self.collected_params = []
+  def move_shadow_params_to_device(self, device):
+    self.shadow_params = [i.to(device) for i in self.shadow_params]
+  def update(self, parameters):
+    """
+    Update currently maintained parameters.
+    Call this every time the parameters are updated, such as the result of
+    the `optimizer.step()` call.
+    Args:
+        parameters: Iterable of `torch.nn.Parameter`; usually the same set of
+            parameters used to initialize this object.
+    """
+    decay = self.decay
+    if self.num_updates is not None:
+      self.num_updates += 1
+      decay = min(decay, (1 + self.num_updates) /
+                  (10 + self.num_updates))
+    one_minus_decay = 1.0 - decay
+    with torch.no_grad():
+      parameters = [p for p in parameters if p.requires_grad]
+      for s_param, param in zip(self.shadow_params, parameters):
+        s_param.sub_(one_minus_decay * (s_param - param))
+  def copy_to(self, parameters):
+    """
+    Copy current parameters into given collection of parameters.
+    Args:
+        parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            updated with the stored moving averages.
+    """
+    parameters = [p for p in parameters if p.requires_grad]
+    for s_param, param in zip(self.shadow_params, parameters):
+      if param.requires_grad:
+        param.data.copy_(s_param.data)
+  def store(self, parameters):
+    """
+    Save the current parameters for restoring later.
+    Args:
+        parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            temporarily stored.
+    """
+    self.collected_params = [param.clone() for param in parameters]
+  def restore(self, parameters):
+    """
+    Restore the parameters stored with the `store` method.
+    Useful to validate the model with EMA parameters without affecting the
+    original optimization process. Store the parameters before the
+    `copy_to` method. After validation (or model saving), use this to
+    restore the former parameters.
+    Args:
+        parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            updated with the stored parameters.
+    """
+    for c_param, param in zip(self.collected_params, parameters):
+      param.data.copy_(c_param.data)
+  def state_dict(self):
+    return dict(decay=self.decay,
+                num_updates=self.num_updates,
+                shadow_params=self.shadow_params)
+  def load_state_dict(self, state_dict):
+    self.decay = state_dict['decay']
+    self.num_updates = state_dict['num_updates']
+    self.shadow_params = state_dict['shadow_params']

tr2d2-dna/noise_schedule.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import abc
+import torch
+import torch.nn as nn
+# Flags required to enable jit fusion kernels
+# These calls run at import time, so importing this module changes the
+# process-wide TorchScript settings. They go through the private torch._C
+# bindings.
+torch._C._jit_set_profiling_mode(False)
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_override_can_fuse_on_cpu(True)
+torch._C._jit_override_can_fuse_on_gpu(True)
+def get_noise(config, dtype=torch.float32):
+  if config.noise.type == 'geometric':
+    return GeometricNoise(config.noise.sigma_min,
+                          config.noise.sigma_max)
+  elif config.noise.type == 'loglinear':
+    return LogLinearNoise()
+  elif config.noise.type == 'cosine':
+    return CosineNoise()
+  elif config.noise.type == 'cosinesqr':
+    return CosineSqrNoise()
+  elif config.noise.type == 'linear':
+    return Linear(config.noise.sigma_min,
+                  config.noise.sigma_max,
+                  dtype)
+  else:
+    raise ValueError(f'{config.noise.type} is not a valid noise')
+def binary_discretization(z):
+  z_hard = torch.sign(z)
+  z_soft = z / torch.norm(z, dim=-1, keepdim=True)
+  return z_soft + (z_hard - z_soft).detach()
+class Noise(abc.ABC, nn.Module):
+  """
+  Baseline forward method to get the total + rate of noise at a timestep
+  """
+  def forward(self, t):
+    # Assume time goes from 0 to 1
+    return self.total_noise(t), self.rate_noise(t)
+  @abc.abstractmethod
+  def rate_noise(self, t):
+    """
+    Rate of change of noise ie g(t)
+    """
+    pass
+  @abc.abstractmethod
+  def total_noise(self, t):
+    """
+    Total noise ie \int_0^t g(t) dt + g(0)
+    """
+    pass
+class CosineNoise(Noise):
+  def __init__(self, eps=1e-3):
+    super().__init__()
+    self.eps = eps
+  def rate_noise(self, t):
+    cos = (1 - self.eps) * torch.cos(t * torch.pi / 2)
+    sin = (1 - self.eps) * torch.sin(t * torch.pi / 2)
+    scale = torch.pi / 2
+    return scale * sin / (cos + self.eps)
+  def total_noise(self, t):
+    cos = torch.cos(t * torch.pi / 2)
+    return - torch.log(self.eps + (1 - self.eps) * cos)
+class CosineSqrNoise(Noise):
+  def __init__(self, eps=1e-3):
+    super().__init__()
+    self.eps = eps
+  def rate_noise(self, t):
+    cos = (1 - self.eps) * (
+      torch.cos(t * torch.pi / 2) ** 2)
+    sin = (1 - self.eps) * torch.sin(t * torch.pi)
+    scale = torch.pi / 2
+    return scale * sin / (cos + self.eps)
+  def total_noise(self, t):
+    cos = torch.cos(t * torch.pi / 2) ** 2
+    return - torch.log(self.eps + (1 - self.eps) * cos)
+class Linear(Noise):
+  def __init__(self, sigma_min=0, sigma_max=10, dtype=torch.float32):
+    super().__init__()
+    self.sigma_min = torch.tensor(sigma_min, dtype=dtype)
+    self.sigma_max = torch.tensor(sigma_max, dtype=dtype)
+  def rate_noise(self, t):
+    return self.sigma_max - self.sigma_min
+  def total_noise(self, t):
+    return self.sigma_min + t * (self.sigma_max - self.sigma_min)
+  def importance_sampling_transformation(self, t):
+    f_T = torch.log1p(- torch.exp(- self.sigma_max))
+    f_0 = torch.log1p(- torch.exp(- self.sigma_min))
+    sigma_t = - torch.log1p(- torch.exp(t * f_T + (1 - t) * f_0))
+    return (sigma_t - self.sigma_min) / (
+      self.sigma_max - self.sigma_min)
+class GeometricNoise(Noise):
+  def __init__(self, sigma_min=1e-3, sigma_max=1):
+    super().__init__()
+    self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
+  def rate_noise(self, t):
+    return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (
+      self.sigmas[1].log() - self.sigmas[0].log())
+  def total_noise(self, t):
+    return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
+class LogLinearNoise(Noise):
+  """Log Linear noise schedule.
+  Built such that 1 - 1/e^(n(t)) interpolates between 0 and
+  ~1 when t varies from 0 to 1. Total noise is
+  -log(1 - (1 - eps) * t), so the sigma will be
+  (1 - eps) * t.
+  """
+  def __init__(self, eps=1e-3):
+    super().__init__()
+    self.eps = eps
+    self.sigma_max = self.total_noise(torch.tensor(1.0))
+    self.sigma_min = self.eps + self.total_noise(torch.tensor(0.0))
+  def rate_noise(self, t):
+    return (1 - self.eps) / (1 - (1 - self.eps) * t)
+  def total_noise(self, t):
+    return -torch.log1p(-(1 - self.eps) * t)
+  def importance_sampling_transformation(self, t):
+    f_T = torch.log1p(- torch.exp(- self.sigma_max))
+    f_0 = torch.log1p(- torch.exp(- self.sigma_min))
+    sigma_t = - torch.log1p(- torch.exp(t * f_T + (1 - t) * f_0))
+    t = - torch.expm1(- sigma_t) / (1 - self.eps)
+    return t

tr2d2-dna/oracle.py ADDED Viewed

	@@ -0,0 +1,344 @@

+import torch
+import tempfile
+import grelu
+import pandas as pd
+import os
+from grelu.lightning import LightningModel
+import grelu.data.preprocess
+import grelu.data.dataset
+import dataloader_gosai
+import numpy as np
+from typing import Callable, Union, List
+from scipy.linalg import sqrtm
+from scipy.stats import pearsonr
+import torch.nn.functional as F
+import io
+# Root directory that every checkpoint path in this module is joined onto
+# (via os.path.join(base_path, ...)); it is empty by default and has to be
+# set before any checkpoint is loaded.
+base_path = "" # Fill in directory of the pretrained checkpoints, e.g., "...../data_and_model/"
+def get_cal_atac_orale(device=None):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+    ckpt_path = os.path.join(base_path, 'mdlm/gosai_data/binary_atac_cell_lines.ckpt')
+    ckpt = torch.load(ckpt_path, map_location="cpu")
+    hp = ckpt.get("hyper_parameters", {})
+    ckpt.setdefault("data_params", hp.get("data_params", {}))
+    ckpt.setdefault("performance", {})
+    if not ckpt["performance"]:
+        ckpt["performance"] = {
+            "best_step": ckpt.get("global_step", 0),
+            "best_metric": None,
+        }
+    # Load model from in-memory checkpoint (no file I/O needed)
+    buffer = io.BytesIO()
+    torch.save(ckpt, buffer)
+    buffer.seek(0)  # Reset buffer position to the beginning
+    model_load = LightningModel.load_from_checkpoint(buffer, map_location="cpu")
+    model_load.to(device)
+    model_load.train_params['logger'] = None
+    return model_load
+def get_gosai_oracle(mode='train', device=None):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+    if mode == 'train':
+        ckpt_path = os.path.join(base_path, "mdlm/outputs_gosai/lightning_logs/reward_oracle_ft.ckpt")
+        ckpt = torch.load(ckpt_path, map_location="cpu")
+        hp = ckpt.get("hyper_parameters", {})
+        ckpt.setdefault("data_params", hp.get("data_params", {}))
+        ckpt.setdefault("performance", {})
+        if not ckpt["performance"]:
+            ckpt["performance"] = {
+                "best_step": ckpt.get("global_step", 0),
+                "best_metric": None,
+            }
+        # Load model from in-memory checkpoint (no file I/O needed)
+        buffer = io.BytesIO()
+        torch.save(ckpt, buffer)
+        buffer.seek(0)  # Reset buffer position to the beginning
+        model_load = LightningModel.load_from_checkpoint(buffer, map_location="cpu")
+        model_load.to(device)
+    elif mode == 'eval':
+        ckpt_path = os.path.join(base_path, "mdlm/outputs_gosai/lightning_logs/reward_oracle_eval.ckpt")
+        ckpt = torch.load(ckpt_path, map_location="cpu")
+        hp = ckpt.get("hyper_parameters", {})
+        ckpt.setdefault("data_params", hp.get("data_params", {}))  # safe default
+        ckpt.setdefault("performance", {})                         # safe default
+        # Optional: add minimal hints if code later reads fields
+        if not ckpt["performance"]:
+            ckpt["performance"] = {
+                "best_step": ckpt.get("global_step", 0),
+                "best_metric": None,
+            }
+        # Load model from in-memory checkpoint (no file I/O needed)
+        buffer = io.BytesIO()
+        torch.save(ckpt, buffer)
+        buffer.seek(0)  # Reset buffer position to the beginning
+        model_load = LightningModel.load_from_checkpoint(buffer, map_location="cpu")
+        model_load.to(device)
+    else:
+        raise ValueError
+    model_load.train_params['logger'] = None
+    return model_load
+def cal_gosai_pred(seqs, model=None, mode='eval'):
+    """
+    seqs: list of sequences (detokenized ACGT...)
+    """
+    if model is None:
+        model = get_gosai_oracle(mode=mode)
+    df_seqs = pd.DataFrame(seqs, columns=['seq'])
+    pred_dataset = grelu.data.dataset.DFSeqDataset(df_seqs)
+    preds = model.predict_on_dataset(pred_dataset, devices=[0])
+    return preds.squeeze() # numpy array with shape [n_seqs, 3]
+def cal_gosai_pred_new(seqs, model=None, mode='eval'):
+    """
+    seqs: list of sequences (detokenized ACGT...)
+    """
+    if model is None:
+        model = get_gosai_oracle(mode=mode)
+    model.eval()
+    tokens = dataloader_gosai.batch_dna_tokenize(seqs)
+    tokens = torch.tensor(tokens).long().to(model.device)
+    onehot_tokens = F.one_hot(tokens, num_classes=4).float()
+    preds = model(onehot_tokens.float().transpose(1, 2)).detach().cpu().numpy()
+    return preds.squeeze()
+def cal_atac_pred(seqs, model=None):
+    """
+    seqs: list of sequences (detokenized ACGT...)
+    """
+    if model is None:
+        model = LightningModel.load_from_checkpoint(os.path.join(base_path, 'mdlm/gosai_data/binary_atac_cell_lines.ckpt'), map_location='cuda')
+    df_seqs = pd.DataFrame(seqs, columns=['seq'])
+    pred_dataset = grelu.data.dataset.DFSeqDataset(df_seqs)
+    preds = model.predict_on_dataset(pred_dataset, devices=[0])
+    return preds.squeeze() # numpy array with shape [n_seqs, 7]
+def cal_atac_pred_new(seqs, model=None):
+    """
+    seqs: list of sequences (detokenized ACGT...)
+    """
+    if model is None:
+        model = LightningModel.load_from_checkpoint(os.path.join(base_path, 'mdlm/gosai_data/binary_atac_cell_lines.ckpt'), map_location='cuda')
+    model.eval()
+    tokens = dataloader_gosai.batch_dna_tokenize(seqs)
+    tokens = torch.tensor(tokens).long().to(model.device)
+    onehot_tokens = F.one_hot(tokens, num_classes=4).float()
+    preds = model(onehot_tokens.float().transpose(1, 2)).detach().cpu().numpy()
+    return preds.squeeze() # numpy array with shape [n_seqs, 7]
+def count_kmers(seqs, k=3):
+    counts = {}
+    for seq in seqs:
+        for i in range(len(seq) - k + 1):
+            subseq = seq[i : i + k]
+            try:
+                counts[subseq] += 1
+            except KeyError:
+                counts[subseq] = 1
+    return counts
+def subset_for_eval(n=5000, seed=0):
+    train_set = dataloader_gosai.get_datasets_gosai()
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    train_set_sp = torch.utils.data.Subset(train_set, np.random.choice(len(train_set), n, replace=False))
+    return train_set_sp
+def subset_eval_groundtruth(sets_sp):
+    train_set_sp = sets_sp
+    train_set_sp_clss = train_set_sp.dataset.clss[train_set_sp.indices]
+    return train_set_sp_clss
+def subset_eval_preds(sets_sp, oracle_model=None):
+    train_set_sp = sets_sp
+    train_preds = cal_gosai_pred(
+        dataloader_gosai.batch_dna_detokenize(train_set_sp.dataset.seqs[train_set_sp.indices].numpy()), oracle_model)
+    return train_preds
+def subset_eval_kmers(sets_sp, k=3):
+    train_set_sp = sets_sp
+    train_seqs = dataloader_gosai.batch_dna_detokenize(train_set_sp.dataset.seqs[train_set_sp.indices].numpy())
+    train_kmers = count_kmers(train_seqs, k)
+    return train_kmers
+def subset_eval_embs(sets_sp, oracle_model=None):
+    train_set_sp = sets_sp
+    train_sp_emb = cal_gosai_emb(
+        dataloader_gosai.batch_dna_detokenize(train_set_sp.dataset.seqs[train_set_sp.indices].numpy()), oracle_model)
+    return train_sp_emb
+def cal_emb_pca(sets_sp, n_components=50, oracle_model=None):
+    train_set_sp = sets_sp
+    train_sp_emb = cal_gosai_emb(
+        dataloader_gosai.batch_dna_detokenize(train_set_sp.dataset.seqs[train_set_sp.indices].numpy()), oracle_model)
+    from sklearn.decomposition import PCA
+    pca = PCA(n_components=n_components)
+    pca.fit(train_sp_emb.reshape(train_sp_emb.shape[0], -1))
+    return pca
+def subset_eval_embs_pca(sets_sp, pca, oracle_model=None):
+    train_sp_emb = subset_eval_embs(sets_sp, oracle_model)
+    train_sp_emb_pca = pca.transform(train_sp_emb.reshape(train_sp_emb.shape[0], -1))
+    return train_sp_emb_pca
+# https://github.com/HannesStark/dirichlet-flow-matching/blob/main/utils/flow_utils.py
+def get_wasserstein_dist(embeds1, embeds2):
+    if np.isnan(embeds2).any() or np.isnan(embeds1).any() or len(embeds1) == 0 or len(embeds2) == 0:
+        return float('nan')
+    mu1, sigma1 = embeds1.mean(axis=0), np.cov(embeds1, rowvar=False)
+    mu2, sigma2 = embeds2.mean(axis=0), np.cov(embeds2, rowvar=False)
+    ssdiff = np.sum((mu1 - mu2) ** 2.0)
+    covmean = sqrtm(sigma1.dot(sigma2))
+    if np.iscomplexobj(covmean):
+        covmean = covmean.real
+    dist = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
+    return dist
+def embed_on_dataset(
+    model,
+    dataset: Callable,
+    devices: Union[str, int, List[int]] = "cpu",
+    num_workers: int = 1,
+    batch_size: int = 256,
+):
+    """
+    Return embeddings for a dataset of sequences
+    Args:
+        dataset: Dataset object that yields one-hot encoded sequences
+        devices: Device IDs to use
+        num_workers: Number of workers for data loader
+        batch_size: Batch size for data loader
+    Returns:
+        Numpy array of shape (B, T, L) containing embeddings.
+    """
+    torch.set_float32_matmul_precision("medium")
+    # Make dataloader
+    dataloader = model.make_predict_loader(
+        dataset, num_workers=num_workers, batch_size=batch_size
+    )
+    # Get device
+    orig_device = model.device
+    device = model.parse_devices(devices)[1]
+    if isinstance(device, list):
+        device = device[0]
+    model.to(device)
+    # Get embeddings
+    preds = []
+    model.model = model.model.eval()
+    for batch in iter(dataloader):
+        batch = batch.to(device)
+        preds.append(model.model.embedding(batch).detach().cpu())
+    # Return to original device
+    model.to(orig_device)
+    return torch.vstack(preds).numpy()
+def cal_gosai_emb(seqs, model=None):
+    """
+    seqs: list of sequences (detokenized ACGT...)
+    """
+    if model is None:
+        model = get_gosai_oracle()
+    df_seqs = pd.DataFrame(seqs, columns=['seq'])
+    pred_dataset = grelu.data.dataset.DFSeqDataset(df_seqs)
+    embs = embed_on_dataset(model, pred_dataset, devices=[0])
+    return embs # numpy array with shape [n_seqs, 3072, 2]
+def cal_highexp_kmers(k=3, return_clss=False):
+    train_set = dataloader_gosai.get_datasets_gosai()
+    exp_threshold = np.quantile(train_set.clss[:, 0].numpy(), 0.99) # 4.56
+    highexp_indices = [i for i, data in enumerate(train_set) if data['clss'][0] > exp_threshold]
+    highexp_set_sp = torch.utils.data.Subset(train_set, highexp_indices)
+    highexp_seqs = dataloader_gosai.batch_dna_detokenize(highexp_set_sp.dataset.seqs[highexp_set_sp.indices].numpy())
+    highexp_kmers_99 = count_kmers(highexp_seqs, k=k)
+    n_highexp_kmers_99 = len(highexp_indices)
+    exp_threshold = np.quantile(train_set.clss[:, 0].numpy(), 0.999) # 6.27
+    highexp_indices = [i for i, data in enumerate(train_set) if data['clss'][0] > exp_threshold]
+    highexp_set_sp = torch.utils.data.Subset(train_set, highexp_indices)
+    highexp_seqs = dataloader_gosai.batch_dna_detokenize(highexp_set_sp.dataset.seqs[highexp_set_sp.indices].numpy())
+    highexp_kmers_999 = count_kmers(highexp_seqs, k=k)
+    n_highexp_kmers_999 = len(highexp_indices)
+    if return_clss:
+        highexp_set_sp_clss_999 = highexp_set_sp.dataset.clss[highexp_set_sp.indices]
+        highexp_preds_999 = cal_gosai_pred_new(
+                dataloader_gosai.batch_dna_detokenize(highexp_set_sp.dataset.seqs[highexp_set_sp.indices].numpy()))
+        return highexp_kmers_99, n_highexp_kmers_99, highexp_kmers_999, n_highexp_kmers_999, highexp_set_sp_clss_999, highexp_preds_999, highexp_seqs
+    return highexp_kmers_99, n_highexp_kmers_99, highexp_kmers_999, n_highexp_kmers_999
+def cal_kmer_corr(model, highexp_kmers, n_highexp_kmers, n_sample=128):
+    model.eval()
+    all_detoeknized_samples = []
+    for _ in range(10):
+        samples = model._sample(eval_sp_size=n_sample).detach().cpu().numpy()
+        detokenized_samples = dataloader_gosai.batch_dna_detokenize(samples)
+        all_detoeknized_samples.extend(detokenized_samples)
+    generated_kmer = count_kmers(all_detoeknized_samples)
+    kmer_set = set(highexp_kmers.keys()) | set(generated_kmer.keys())
+    counts = np.zeros((len(kmer_set), 2))
+    for i, kmer in enumerate(kmer_set):
+        if kmer in highexp_kmers:
+            counts[i][1] = highexp_kmers[kmer] * len(generated_kmer) / n_highexp_kmers
+        if kmer in generated_kmer:
+            counts[i][0] = generated_kmer[kmer]
+    corr = pearsonr(counts[:, 0], counts[:, 1])[0]
+    return corr
+def cal_avg_likelihood(model, old_model, n_sample=128):
+    model.eval()
+    old_model.eval()
+    all_raw_samples = []
+    for _ in range(10):
+        samples = model._sample(eval_sp_size=n_sample)
+        all_raw_samples.append(samples)
+    all_raw_samples = torch.concat(all_raw_samples)
+    avg_likelihood = old_model._forward_pass_diffusion(all_raw_samples).sum(-1).mean().item()
+    return avg_likelihood

tr2d2-dna/run_batch_eval.sh ADDED Viewed

	@@ -0,0 +1,30 @@

+#!/bin/bash
+#SBATCH --job-name=dna
+#SBATCH --partition=coe-gpu
+#SBATCH --gres=gpu:H200:1
+#SBATCH --time=16:00:00
+#SBATCH --mem-per-gpu=60G
+#SBATCH --cpus-per-task=2
+#SBATCH --wait-all-nodes=1
+#SBATCH --output=../outputs/%j.%x/.log
+# Set the path to your runs directory
+RUNS_DIR="" # Fill in directory of which to eval the checkpoints
+# Set output file name with timestamp
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+OUTPUT_FILE="batch_eval_results_${TIMESTAMP}.txt"
+# Run the batch evaluation
+python eval_runs_batch.py \
+    --runs_dir "$RUNS_DIR" \
+    --output_file "$OUTPUT_FILE" \
+    --device "cuda:0" \
+    --total_num_steps 128 \
+    --batch_size 128 \
+    --num_seeds 3 \
+    --total_samples 640 \
+    --seq_length 200
+echo "Batch evaluation completed. Results saved to: $OUTPUT_FILE"

tr2d2-dna/train.sh ADDED Viewed

	@@ -0,0 +1,51 @@

+#!/bin/bash
+#SBATCH --job-name=dna_mdns
+#SBATCH --partition=coe-gpu
+#SBATCH --gres=gpu:H100:1
+#SBATCH --time=16:00:00
+# max 16 GPU hours, i.e., time <= 16h / num of GPUs
+#SBATCH --mem-per-gpu=60G
+# maximum GPU RAM, 141G for H200, 94G for H100
+# in the current setting, 40G is enough for num_replicates=2 and 80G is enough for num_replicates=4
+#SBATCH --cpus-per-task=2
+#SBATCH --wait-all-nodes=1
+#SBATCH --output=../outputs/%j.%x/.log
+HOME_LOC= "" # Fill in directory of the repo
+SAVE_PATH = "" # Fill in directory to save the checkpoints
+BASE_PATH = "" # Fill in directory of the pretrained checkpoints, e.g., "...../data_and_model/"
+SCRIPT_LOC=$HOME_LOC/tr2d2/dna
+LOG_LOC=$HOME_LOC/tr2d2/dna/logs
+DATE=$(date +%m_%d)
+mkdir -p "$LOG_LOC"
+# set 3 have skip connection
+# ===================================================================
+python $SCRIPT_LOC/finetune.py \
+    --base_path $BASE_PATH \
+    --device "cuda:0" \
+    --noise_removal \
+    --save_path_dir $SAVE_PATH \
+    --wdce_num_replicates 16 \
+    --buffer_size 160 \
+    --batch_size 160 \
+    --seq_length 200 \
+    --num_children 32 \
+    --total_num_steps 128 \
+    --num_iter 5 \
+    --resample_every_n_step 5 \
+    --eval_every_n_epochs 10 \
+    --num_epochs 60000 \
+    --exploration 0.1 \
+    --save_every_n_epoch 2000 \
+    --alpha 0.1 \
+    --centering \
+    --grad_clip \
+    --reward_clip \
+    --reward_clip_value 15.0 \
+    --reset_tree

tr2d2-dna/utils.py ADDED Viewed

	@@ -0,0 +1,175 @@

+"""Console logger utilities.
+Copied from https://github.com/HazyResearch/transformers/blob/master/src/utils/utils.py
+Copied from https://docs.python.org/3/howto/logging-cookbook.html#using-a-context-manager-for-selective-logging
+"""
+import logging
+import fsspec
+import lightning
+import torch
+from timm.scheduler import CosineLRScheduler
+import argparse
+import numpy as np
+import random
+import os
+import time, torch
+from collections import defaultdict
+from contextlib import contextmanager
+class StepTimer:
+    def __init__(self, device=None):
+        self.times = defaultdict(list)
+        self.device = device
+        self._use_cuda_sync = (
+            isinstance(device, torch.device) and device.type == "cuda"
+        ) or (isinstance(device, str) and "cuda" in device)
+    @contextmanager
+    def section(self, name):
+        if self._use_cuda_sync:
+            torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        try:
+            yield
+        finally:
+            if self._use_cuda_sync:
+                torch.cuda.synchronize()
+            dt = time.perf_counter() - t0
+            self.times[name].append(dt)
+    def summary(self, top_k=None):
+        # returns (name, count, total, mean, p50, p95)
+        import numpy as np
+        rows = []
+        for k, v in self.times.items():
+            a = np.array(v, dtype=float)
+            rows.append((k, len(a), a.sum(), a.mean(), np.median(a), np.percentile(a, 95)))
+        rows.sort(key=lambda r: r[2], reverse=True)  # by total time
+        return rows[:top_k] if top_k else rows
+def sample_categorical_logits(logits, dtype=torch.float64):
+  # do not require logits to be log-softmaxed
+  gumbel_noise = -(1e-10 - (torch.rand_like(logits, dtype=dtype) + 1e-10).log()).log()
+  return (logits + gumbel_noise).argmax(dim=-1)
+def fsspec_exists(filename):
+  """Check if a file exists using fsspec."""
+  fs, _ = fsspec.core.url_to_fs(filename)
+  return fs.exists(filename)
+def fsspec_listdir(dirname):
+  """Listdir in manner compatible with fsspec."""
+  fs, _ = fsspec.core.url_to_fs(dirname)
+  return fs.ls(dirname)
+def fsspec_mkdirs(dirname, exist_ok=True):
+  """Mkdirs in manner compatible with fsspec."""
+  fs, _ = fsspec.core.url_to_fs(dirname)
+  fs.makedirs(dirname, exist_ok=exist_ok)
+def print_nans(tensor, name):
+  if torch.isnan(tensor).any():
+    print(name, tensor)
+class CosineDecayWarmupLRScheduler(
+  CosineLRScheduler,
+  torch.optim.lr_scheduler._LRScheduler):
+  """Wrap timm.scheduler.CosineLRScheduler
+  Enables calling scheduler.step() without passing in epoch: the wrapper
+  keeps its own counter in `_last_epoch` and forwards it to the parent.
+  Supports resuming as well.
+  Constructor arguments are passed through unchanged to the parent class.
+  Adapted from:
+    https://github.com/HazyResearch/hyena-dna/blob/main/src/utils/optim/schedulers.py
+  """
+  def __init__(self, *args, **kwargs):
+    super().__init__(*args, **kwargs)
+    # Counter of the most recent step; -1 means no step has been taken yet.
+    self._last_epoch = -1
+    # Run one step at position 0 immediately after construction.
+    self.step(epoch=0)
+  def step(self, epoch=None):
+    """Advance the schedule.
+    Args:
+      epoch: explicit position to jump to; when None the internal counter
+        is incremented by one instead.
+    """
+    if epoch is None:
+      self._last_epoch += 1
+    else:
+      self._last_epoch = epoch
+    # We call either step or step_update, depending on
+    # whether we're using the scheduler every epoch or every
+    # step.
+    # Otherwise, lightning will always call step (i.e.,
+    # meant for each epoch), and if we set scheduler
+    # interval to "step", then the learning rate update will
+    # be wrong.
+    # NOTE(review): `t_in_epochs` is not set in this class; presumably it
+    # comes from the timm parent class -- confirm.
+    if self.t_in_epochs:
+      super().step(epoch=self._last_epoch)
+    else:
+      super().step_update(num_updates=self._last_epoch)
+class LoggingContext:
+  """Context manager for selective logging."""
+  def __init__(self, logger, level=None, handler=None, close=True):
+    self.logger = logger
+    self.level = level
+    self.handler = handler
+    self.close = close
+  def __enter__(self):
+    if self.level is not None:
+      self.old_level = self.logger.level
+      self.logger.setLevel(self.level)
+    if self.handler:
+      self.logger.addHandler(self.handler)
+  def __exit__(self, et, ev, tb):
+    if self.level is not None:
+      self.logger.setLevel(self.old_level)
+    if self.handler:
+      self.logger.removeHandler(self.handler)
+    if self.handler and self.close:
+      self.handler.close()
+def get_logger(name=__name__, level=logging.INFO) -> logging.Logger:
+  """Initializes multi-GPU-friendly python logger."""
+  logger = logging.getLogger(name)
+  logger.setLevel(level)
+  # this ensures all logging levels get marked with the rank zero decorator
+  # otherwise logs would get multiplied for each GPU process in multi-GPU setup
+  for level in ('debug', 'info', 'warning', 'error',
+                'exception', 'fatal', 'critical'):
+    setattr(logger,
+            level,
+            lightning.pytorch.utilities.rank_zero_only(
+              getattr(logger, level)))
+  return logger
+def str2bool(v):
+  if isinstance(v, bool):
+    return v
+  if v.lower() in ('yes', 'true', 't', 'y', '1'):
+    return True
+  elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+    return False
+  else:
+    raise argparse.ArgumentTypeError('Boolean value expected.')
+def set_seed(seed, use_cuda):
+  os.environ['PYTHONHASHSEED'] = str(seed)
+  np.random.seed(seed)
+  random.seed(seed)
+  torch.manual_seed(seed)
+  # torch.backends.cudnn.deterministic = True
+  if use_cuda:
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+  print(f'=> Seed of the run set to {seed}')