PepTune / src /diffusion.py

Sophia Tang

initial commit

40e7e76 8 days ago

39.6 kB

	## Adapted from MDLM: https://github.com/kuleshov-group/mdlm

	import numpy as np
	import sys
	import itertools
	import time
	import torch
	from torch import Tensor
	import math
	import torch.nn.functional as F
	import numpy as np
	import random as rd
	import lightning as L
	import torchmetrics
	from dataclasses import dataclass
	import gc
	import pickle
	import utils.utils as utils
	from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
	import noise_schedule
	from torch.optim.lr_scheduler import _LRScheduler
	import roformer as roformer
	from utils.app import PeptideAnalyzer

	@dataclass
	class Loss:
	loss: torch.FloatTensor
	nlls: torch.FloatTensor
	attn_mask: torch.FloatTensor


	class NLL(torchmetrics.aggregation.MeanMetric):
	pass


	class BPD(NLL):
	def compute(self) -> Tensor:
	"""Computes the bits per dimension.

	Returns:
	bpd
	"""
	return self.mean_value / self.weight / math.log(2)


	class Perplexity(NLL):
	def compute(self) -> Tensor:
	"""Computes the Perplexity.

	Returns:
	Perplexity
	"""
	return torch.exp(self.mean_value / self.weight)


	class Diffusion(L.LightningModule):
	def __init__(self, config, tokenizer):

	super().__init__()
	self.config = config
	#self.save_hyperparameters()

	# PeptideCLM tokenizer
	self.tokenizer = tokenizer
	self.vocab_size = self.tokenizer.vocab_size
	self.mask_token_id = self.tokenizer.mask_token_id
	self.sampler = self.config.sampling.predictor
	self.analyzer = PeptideAnalyzer()

	# backbone LM PeptideCLM model
	if self.config.backbone == 'roformer':
	self.backbone = roformer.Roformer(self.config, self.tokenizer)
	self.backbone.unfreeze_all_layers()
	elif self.config.backbone == 'finetune_roformer':
	self.backbone = roformer.Roformer(self.config, self.tokenizer)
	self.backbone.freeze_model()
	self.backbone.unfreeze_n_layers(n=8)
	else:
	Exception('invalid backbone config')

	self.neg_infinity = -1000000.0
	self.T = config.T
	# noise schedule for non-peptide bond tokens (default to log-linear)
	self.noise = noise_schedule.get_noise(config)
	# noise schedule for peptide bonds (log-polynomial)
	self.bond_noise = noise_schedule.LogPolyNoise()
	self.time_conditioning = self.config.time_conditioning
	self.fast_forward_epochs = None
	self.fast_forward_batches = None

	self.gen_ppl_eval_model_name_or_path = self.config.eval.gen_ppl_eval_model_name_or_path
	self.gen_ppl_metric = Perplexity()

	self.lr = self.config.optim.lr
	self.sampling_eps = self.config.training.sampling_eps

	metrics = torchmetrics.MetricCollection({
	'nll': NLL(),
	'bpd': BPD(),
	'ppl': Perplexity(),
	})
	metrics.set_dtype(torch.float64)
	self.train_metrics = metrics.clone(prefix='trainer/')
	self.valid_metrics = metrics.clone(prefix='val/')
	self.test_metrics = metrics.clone(prefix='test/')


	"""LOSS FOR INVALID PEPTIDES"""

	@torch.no_grad()
	def conditional_gumbel(self, logits, D, k):
	"""
	Outputs k samples of Q = StandardGumbel(), such that argmax(logits
	+ Q) is given by D (one-hot vector).

	Input:
	- logits: Tensor of shape (batch_size, seq_len, vocab_size)
	- D: One-hot tensor of shape (batch_size, seq_len, vocab_size)
	- k: Number of Gumbel samples

	Output:
	- Adjusted logits with shape (k, batch_size, seq_len, vocab_size)
	"""

	# iid. exponential samples of shape (k, batch_size, seq_len, vocab_size)
	E = torch.distributions.exponential.Exponential(rate=torch.ones_like(logits)).sample([k])

	# E of the chosen class, shape (k, batch_size, seq_len, 1)
	Ei = (D * E).sum(dim=-1, keepdim=True)

	# Partition function (normalization constant), shape (batch_size, seq_len, 1)
	Z = logits.exp().sum(dim=-1, keepdim=True)

	# Adjusted logits for Gumbel distribution
	adjusted = (
	D * (-torch.log(Ei) + torch.log(Z)) +
	(1 - D) * -torch.log(E / logits.exp() + Ei / Z)
	)

	# Adjusted logits shape: (k, batch_size, seq_len, vocab_size)
	return adjusted - logits

	def replace_gradient(self, value, surrogate):
	"""
	Returns `value` but backpropagates gradients through `surrogate`.
	"""
	return surrogate + (value - surrogate).detach()

	def gumbel_rao(self, logits, k, temp=1.0, I=None):
	"""
	Returns a categorical sample from logits (over axis=-1) as a
	one-hot vector, with gumbel-rao gradient.

	Input:
	- logits: Tensor of shape (batch_size, seq_len, vocab_size)
	- k: Number of Gumbel samples for Rao-Blackwellization
	- temp: Temperature for softmax
	- I: Optional, precomputed categorical sample tensor of shape (batch_size, seq_len)

	Output:
	- One-hot tensor of shape (batch_size, seq_len, vocab_size)
	with Gumbel-Rao gradient.
	"""
	assert logits.shape[-1] == self.tokenizer.vocab_size
	vocab_size = logits.shape[-1]

	if I is None:
	# Sample indices for each token in the batch
	I = torch.distributions.categorical.Categorical(logits=logits).sample() # (batch_size, seq_len)

	# Convert indices to one-hot encodings, shape (batch_size, seq_len, vocab_size)
	D = torch.nn.functional.one_hot(I, num_classes=vocab_size).float()

	# Generate k different adjusted logits that all evaluate to the same sequence
	adjusted = logits + self.conditional_gumbel(logits, D, k=k) # (k, batch_size, seq_len, vocab_size)

	# Compute the surrogate by averaging softmax across k samples
	surrogate = torch.nn.functional.softmax(adjusted / temp, dim=-1).mean(dim=0) # (batch_size, seq_len, vocab_size)

	# Return one-hot representation with surrogate gradient
	return self.replace_gradient(D, surrogate)

	def compute_invalid_loss(self, logits, k=None, temp=None):
	"""
	Penalizes logits that produce invalid sequences using the `is_peptide` function,
	scaling penalties inversely with token probabilities.

	Args:
	logits: Tensor of shape [batch_size, seq_len, vocab_size].
	k: Number of samples for Gumbel-Rao.
	temp: Temperature for softmax.

	Returns:
	loss: A scalar tensor representing the total loss for invalid sequences.
	"""

	#samples = self.gumbel_rao(logits, k=k, temp=temp) # (batch_size, seq_len, vocab_size)

	# Convert logits to sequences using the tokenizer
	batch_token_ids = logits.argmax(dim=-1).to(self.device) # (batch_size, seq_len)
	sampled_sequences = self.tokenizer.batch_decode(batch_token_ids)

	# Check validity of each sampled sequence (not differentiable)
	penalties = torch.tensor(
	[1 if not self.analyzer.is_peptide(seq) else 0 for seq in sampled_sequences],
	dtype=torch.float32,
	device=self.device
	)
	#print(penalties)

	# Compute probabilities for each token (batch_size, seq_length)
	sampled_probs = torch.softmax(logits, dim=-1).gather(dim=-1, index=batch_token_ids.unsqueeze(-1)).squeeze(-1).to(self.device)

	# scale penalties by softmax probability of sampled tokens
	scaled_penalty = penalties[:, None] * sampled_probs # (batch_size, seq_length)

	return scaled_penalty.to(self.device)

	"""DIFFUSION LOSS"""

	def sample_t(self, n, device):
	"""
	Sample random time steps for batch training
	"""
	# sample values uniformly at random from [0, 1)
	eps_t = torch.rand(n, device=device)
	# antithetic sampling: reduce variance by pairing each sample with complementary sample
	if self.config.training.antithetic_sampling:
	# compute interval between sampled time steps
	offset = torch.arange(n, device=device) / n
	# ensure that each eps value is evenly spaced between [0, 1)
	eps_t = ((eps_t / n) + offset) % 1

	# ensures values are not exactly 0 or 1
	t = (1 - self.config.training.sampling_eps) * eps_t + self.config.training.sampling_eps

	return t

	def q_xt(self, x, mask_prob):
	"""Computes the noisy sample xt.

	Args:
	x: int torch.Tensor with shape (batch_size,
	diffusion_model_input_length), input.
	move_chance: float torch.Tensor with shape (batch_size, 1).
	"""

	actual_seq_length = (x != 0).sum(dim=-1, keepdim=True)
	#print(actual_seq_length)

	max_mask_length = (actual_seq_length * 0.75).long()

	mask_indices = torch.rand(*x.shape, device=x.device) < mask_prob

	restricted_move_indices = torch.zeros_like(mask_indices, dtype=torch.bool)

	for i in range(x.shape[0]):
	true_positions = torch.where(mask_indices[i])[0]
	if len(true_positions) > max_mask_length[i]:
	selected_positions = true_positions[:max_mask_length[i].item()]
	restricted_move_indices[i, selected_positions] = True
	else:
	restricted_move_indices[i] = mask_indices[i]

	xt = torch.where(restricted_move_indices, self.tokenizer.mask_token_id, x)

	return xt


	def sample_prior(self, *batch_dims):
	"""
	Returns array of fully masked sequences with same shape as input
	"""
	return self.mask_token_id * torch.ones(* batch_dims, dtype=torch.int64)


	"""COMPUTING LOSS"""

	def compute_diffusion_loss(self, model_output, xt, x0, t):
	"""
	Computes diffusion loss term in ELBO
	(evaluates how accurately the model predicts the token probabilities at each time step)

	Inputs:
	- model_output: [sequence length, vocab size, vocab size] array of logits for each token at each sequence position
	- zt: corrupted version of original input x0 at timestep t
	- x0: original input sequence
	- t: timestep
	"""
	# compute interval between each timestep
	dt = 1 / self.T

	# compute vectorized alpha scaling terms for the logits at timestep s and t
	alpha_t = 1 - t + torch.zeros_like(x0)
	# s = t - dt
	alpha_s = 1 - (t - dt) + torch.zeros_like(x0)

	# gather vector of log-probabilities for each token in x0
	# log<x_theta, x>
	log_x_theta_at_x0 = torch.gather(model_output, -1, x0[:, :, None]) # shape (B, L, vocab_size)
	# gather log-probabillities for assigning a masked token at each position in the sequence at time t
	# log<x_theta, m>
	log_x_theta_at_m = model_output[:, :, self.mask_token_id]
	# obtain non-log probability of assigning a masked token
	# <xt, m>
	x_theta_at_m = log_x_theta_at_m.exp()

	# first term of diffusion loss
	term_1_coef = dt / t
	term_1_log_numerator = torch.log((alpha_t * x_theta_at_m) / t + 1)
	term_1_log_denom = log_x_theta_at_x0

	# second term of diffusion loss
	term_2_coef = 1 - (dt / t)
	term_2_log_numerator = term_1_log_numerator
	term_2_log_denom = torch.log((alpha_s * x_theta_at_m) / (t - dt) + 1)

	L_vb_masked = (term_1_coef * (term_1_log_numerator - term_1_log_denom) +
	term_2_coef * (term_2_log_numerator - term_2_log_denom))

	# multiply by <zt, m> term
	L_vb = L_vb_masked * (xt == self.mask_token_id)

	# scale by T and return
	return self.T * L_vb

	def _forward_pass_diffusion(self, x0, attn_mask, bond_mask=None, mask=None):
	"""
	Training reverse diffusion model x_theta to reconstruct samples x0

	bond_mask: (batch, seq_length)
	"""
	# randomly sample time steps to start the denoising process for each x0 in batch
	t = self.sample_t(x0.shape[0], self.device)

	# if we are training the intermediate transition blocks
	if self.T > 0:
	# scale by total timesteps T and cast to integer
	t = (t * self.T).to(torch.int)
	# scale down by T to get a multiple of 1/T
	t = t / self.T
	# add 1/T to ensure no 0 values
	t += (1 / self.T)

	# get noise and rate of noise at timestep t
	# sigma = -log(1-t); dsigma = 1 / (1-t)
	sigma, dsigma = self.noise(t)
	time_conditioning = sigma[:, None]

	# Get masking probabilities for all tokens for each batch
	# log-linear: 1 - alpha = t
	base_mask_prob = 1 - torch.exp(-sigma[:, None]) # (batch_size, L)

	if self.config.noise.state_dependent and (bond_mask is not None):
	# log-polynomial masking schedule: alpha = 1 - t^w
	# bond_sigma = -log(1-t^w) for w = 3 (default)
	# bond_dsigma = -wt^(w-1) / (1-t^w)
	bond_sigma, bond_dsigma = self.bond_noise(t) # scalar
	# expand dimensions for broadcasting to (B, L)
	bond_sigma = bond_sigma[:, None]
	bond_dsigma = bond_dsigma[:, None]
	sigma = sigma[:, None]
	dsigma = dsigma[:, None]

	# compute masking probability for peptide bonds 1 - bond_alpha = t^w
	bond_mask_prob = 1 - torch.exp(-bond_sigma).to(self.device)
	# piece together (B, L) tensor with modified masking prob at peptide-bond locations
	mask_prob = torch.where(bond_mask == 1, bond_mask_prob, base_mask_prob).to(self.device)
	#print(mask_prob)
	dsigma = torch.where(bond_mask == 1, bond_dsigma, dsigma).to(self.device)
	sigma = torch.where(bond_mask == 1, bond_sigma, sigma).to(self.device)
	else:
	mask_prob = base_mask_prob.to(self.device)

	# get masked samples at different timesteps
	if mask is None:
	zt = self.q_xt(x0, mask_prob).to(self.device)
	else:
	zt = x0.where(mask==1, torch.full_like(x0, self.mask_token_id)).to(self.device)

	model_output = self.forward(zt, attn_mask=attn_mask.to(self.device), sigma=time_conditioning).to(self.device)

	# debugging
	assert not torch.isnan(model_output).any()
	assert model_output.is_cuda
	utils.print_nans(model_output, 'model_output')

	# compute invalid loss
	invalid_loss = self.compute_invalid_loss(logits=model_output).to(self.device) # (B, L)
	#print(invalid_loss)

	if self.T > 0:
	# compute diffusion loss
	diffusion_loss = self.compute_diffusion_loss(model_output, zt, x0, t)
	return diffusion_loss

	# compute loss for the final that converts from z0 to x0
	# -log(p_theta)
	# get (batch_size, L) array of log-probabilities
	log_p_theta = torch.gather(input=model_output, dim=-1, index=x0[:, :, None]).squeeze(-1).to(self.device) # (B, L)

	if self.config.noise.state_dependent and (bond_mask is not None):
	return (-log_p_theta * (dsigma / torch.expm1(sigma)) + invalid_loss).to(self.device)
	else:
	return ((-log_p_theta * (dsigma / torch.expm1(sigma))[:, None]) + invalid_loss).to(self.device)

	def _loss(self, x0, attn_mask, bond_mask=None, mask=None):
	loss = self._forward_pass_diffusion(x0, attn_mask, bond_mask, mask)

	# negative log loss
	nlls = loss * attn_mask

	# count number of tokens
	num_tokens = attn_mask.sum()

	# compute batch loss
	batch_nll = nlls.sum()
	# compute per token loss
	token_nll = batch_nll / num_tokens
	# return losses
	return Loss(loss = token_nll.to(self.device), nlls = nlls.to(self.device), attn_mask = attn_mask.to(self.device))

	def _compute_loss(self, batch, prefix, bond_mask=None):

	attn_mask = batch['attention_mask'].to(self.device)

	if 'mask' in batch:
	mask = batch['mask'].to(self.device)
	else:
	mask = None

	if 'bond_mask' in batch:
	bond_mask = batch['bond_mask'].to(self.device)
	else:
	bond_mask = None

	losses = self._loss(batch['input_ids'].to(self.device), attn_mask, bond_mask, mask)
	loss = losses.loss

	if prefix == 'train':
	self.train_metrics.update(
	losses.nlls.to(self.device),
	losses.attn_mask.to(self.device)
	)
	metrics = self.train_metrics
	elif prefix == 'val':
	self.valid_metrics.update(
	losses.nlls.to(self.device),
	losses.attn_mask.to(self.device)
	)
	metrics = self.valid_metrics
	elif prefix == 'test':
	self.test_metrics.update(losses.nlls, losses.attn_mask)
	metrics = self.test_metrics
	else:
	raise ValueError(f'Invalid prefix: {prefix}')

	self.log_dict(metrics,
	on_step=False,
	on_epoch=True,
	sync_dist=True)

	return loss


	"""SAMPLING"""

	def generate_from_masked(self, num_samples=None, seq_length=None, sample_steps=128, eps=1e-5):
	# get number of timesteps
	if sample_steps is None:
	sample_steps = self.config.sampling.steps

	if seq_length is None:
	seq_length = self.config.sampling.seq_length

	# sample fully masked sequences
	z = self.sample_prior(num_samples, seq_length).to(self.device)

	# create vector of sample_steps timesteps
	timesteps = torch.linspace(1, eps, sample_steps + 1, device=self.device)

	# compute interval between timesteps
	dt = (1 - eps) / sample_steps

	for i in range(sample_steps):
	t = timesteps[i] * torch.ones(z.shape[0], 1, device=self.device)

	z = self.single_reverse_step(z, t, dt)

	return z


	"""SAMPLING STEP"""

	def single_reverse_step(self, zt, t, dt, attn_mask=None):
	"""
	Take a single reverse diffusion step for the expansion step of the MCTS algorithm
	"""
	# get sigma values that determine masking prob
	sigma_t, _ = self.noise(t)
	sigma_s, _ = self.noise(t - dt)

	# reshape sigmas
	if sigma_t.ndim > 1:
	sigma_t = sigma_t.squeeze(-1)
	if sigma_s.ndim > 1:
	sigma_s = sigma_s.squeeze(-1)
	assert sigma_t.ndim == 1, sigma_t.shape
	assert sigma_s.ndim == 1, sigma_s.shape

	# compute masking probabilities for each timestep
	change_prob_t = 1 - torch.exp(-sigma_t)
	change_prob_s = 1 - torch.exp(-sigma_s)

	# expand dimensions
	change_prob_t = change_prob_t[:, None, None]
	change_prob_s = change_prob_s[:, None, None]

	# get prodiction model that outputs token probabilities
	log_p_x0 = self.forward(zt, attn_mask=attn_mask, sigma=sigma_t)

	# check dimensions match
	assert change_prob_t.ndim == log_p_x0.ndim

	# compute reverse diffusion probability of being unmasked at timestep s
	# (sigma_s - sigma_t)*x_theta
	q_zs = log_p_x0.exp() * (change_prob_t - change_prob_s)

	# compute reverse diffusion probability of remaining masked at timestep s
	# (1 - sigma_s)*m
	q_zs[:, :, self.mask_token_id] = change_prob_s[:, :, 0]

	# sample sequence at timestep s from categorical distribution of q_zs
	z_changed = sample_categorical(q_zs)

	copy_flag = (zt != self.mask_token_id).to(zt.dtype)
	return (copy_flag * zt) + ((1 - copy_flag) * z_changed)

	def cached_reverse_step(self, x, t, dt, p_x0=None, attn_mask=None):
	assert self.config.noise.type == 'loglinear'
	sigma_t, _ = self.noise(t)

	if t.ndim > 1:
	t = t.squeeze(-1)
	assert t.ndim == 1

	change_prob_t = t[:, None, None]
	change_prob_s = (t - dt)[:, None, None]

	assert change_prob_t.ndim == 3, change_prob_t.shape

	if p_x0 is None:
	p_x0 = self.forward(x, attn_mask=attn_mask, sigma=sigma_t).exp()

	assert change_prob_t.ndim == p_x0.ndim

	q_xs = p_x0 * (change_prob_t - change_prob_s)

	q_xs[:, :, self.mask_token_id] = change_prob_s[:, :, 0]

	x_changed = sample_categorical(q_xs)

	copy_flag = (x != self.mask_token_id).to(x.dtype)

	return p_x0, copy_flag * x + (1 - copy_flag) * x_changed

	# first step in expansion
	def batch_cached_reverse_step(self, token_array, t, dt, batch_size, p_x0=None, attn_mask=None):

	assert self.config.noise.type == 'loglinear'
	sigma_t, _ = self.noise(t)

	if t.ndim > 1:
	t = t.squeeze(-1)
	assert t.ndim == 1

	change_prob_t = t[:, None, None]
	change_prob_s = (t - dt)[:, None, None]

	assert change_prob_t.ndim == 3, change_prob_t.shape

	if token_array.dim() == 1:
	token_array = token_array.unsqueeze(0)
	#token_array = token_array.repeat(batch_size, 1)

	attn_mask = torch.ones_like(token_array)

	if p_x0 is None:
	p_x0 = self.forward(token_array, attn_mask=attn_mask, sigma=sigma_t).exp()

	assert change_prob_t.ndim == p_x0.ndim

	q_xs = p_x0 * (change_prob_t - change_prob_s)

	# zero-masking probability
	q_xs[:, :, self.mask_token_id] = change_prob_s[:, :, 0]

	# repeat the parent token along the first dimension which will be unmasked into distinct sequences
	token_array = token_array.repeat(batch_size, 1)

	if self.config.mcts.sampling == 0:
	x_changed = sample_batched_categorical(q_xs.to(self.device), batch_size)
	else:
	x_changed = sample_batched_top_k(q_xs.to(self.device), batch_size, self.config.mcts.sampling)

	copy_flag = (token_array != self.mask_token_id).to(token_array.dtype)

	return p_x0, copy_flag * token_array + (1 - copy_flag) * x_changed

	def _process_sigma(self, sigma):
	if sigma.ndim > 1:
	sigma = sigma.squeeze(-1)
	if not self.time_conditioning:
	sigma = torch.zeros_like(sigma)
	assert sigma.ndim == 1, sigma.shape
	return sigma

	def forward(self, zt, attn_mask, sigma):
	"""
	Predicts the token log-probabilities from zt at time t with noise schedule sigma
	"""
	sigma = self._process_sigma(sigma)

	with torch.amp.autocast("cuda", enabled=True, dtype=torch.float32, cache_enabled=True):
	logits = self.backbone(zt, attn_mask).to(self.device)

	return self.subs_parameterization(logits, zt)

	def subs_parameterization(self, logits, zt):
	"""
	Updates reverse diffusion logits based on SUBS parameterization:
	- zero masking probabilities: -infinity probability of being masked during reverse diffusion
	- carry-over unmasking: unmasked input tokens remain unchanged during reverse diffusion

	Args:
	logits: vector of token probabilities for unmasking masked tokens
	zt: partially unmasked sequence at current timestep
	"""
	logits[:, :, self.mask_token_id] += self.neg_infinity # [sequence index, current token, next token]


	logits = (logits - torch.logsumexp(logits, dim=-1, keepdim=True)).to(self.device)


	unmasked_indices = (zt != self.mask_token_id).to(self.device) # shape: [200, seq_length]
	batch_idx, seq_idx = torch.where(unmasked_indices) # Get explicit indices
	batch_idx = batch_idx.to(self.device)
	seq_idx = seq_idx.to(self.device)
	tokens = zt[batch_idx, seq_idx].to(self.device) # Get the tokens at those positions

	assert logits.is_contiguous(), "logits tensor is not contiguous"
	assert unmasked_indices.shape == zt.shape, "same shape"
	assert not torch.isnan(logits).any(), "NaN values found in logits"
	assert tokens.max() < logits.shape[-1], "token indices out of bounds"
	assert batch_idx.max() < logits.shape[0], "batch index out of bounds"
	assert seq_idx.max() < logits.shape[1], "seq index out of bounds"
	assert batch_idx.device == seq_idx.device == logits.device == tokens.device, "device inconsistent"

	logits[batch_idx, seq_idx] = self.neg_infinity # Set everything to -inf first
	logits[batch_idx, seq_idx, tokens] = 0 # Set only the specific token positions to 0
	# return logits with SUBS parameterization
	return logits.to(self.device)

	"""SAMPLING"""
	@torch.no_grad()
	def _sample(self, num_steps=None, eps=1e-5, x_input=None):
	"""
	Generate samples
	"""
	batch_size_per_gpu = self.config.eval.perplexity_batch_size

	if num_steps is None:
	num_steps = self.config.sampling.steps

	if x_input is not None:
	x = x_input['input_ids'].to(self.device)
	attn_mask = x_input['attention_mask'].to(self.device)
	else:
	x = self.sample_prior(batch_size_per_gpu, self.config.model.length).to(self.device)
	attn_mask = torch.ones_like(x).to(self.device)


	timesteps = torch.linspace(1, eps, num_steps+1, device=self.device)
	dt = (1 - eps) / num_steps
	p_x0_cache = None
	generation_history = [] # used to track which tokens are unmasked

	for i in range(num_steps):
	t = timesteps[i] * torch.ones(x.shape[0], 1, device = self.device)
	if self.sampler == 'ddpm':
	x = self.single_reverse_step(x, t, dt).to(self.device)

	elif self.sampler == 'ddpm_cache':
	p_x0_cache, x_next = self.cached_reverse_step(x, t, dt, p_x0=p_x0_cache, attn_mask=attn_mask)
	if (not torch.allclose(x_next, x) or self.time_conditioning):
	# Disable caching
	p_x0_cache = None
	x = x_next.to(self.device)
	#print(self.tokenizer.decode(x.squeeze()))
	else:
	x = self._analytic_update(x, t, dt, attn_mask).to(self.device)

	if self.config.sampling.noise_removal:
	t = timesteps[-1] * torch.ones(x.shape[0], 1, device=self.device)
	if self.sampler == 'analytic':
	x = self._denoiser_update(x, t).to(self.device)
	else:
	time_conditioning = self.noise(t)[0].to(self.device)
	x = self.forward(x, attn_mask=attn_mask, sigma=time_conditioning).argmax(dim=-1).to(self.device)
	#print(self.tokenizer.decode(x.squeeze()))
	return x.to(self.device)


	def restore_model_and_sample(self, num_steps, eps=1e-5):
	"""Generate samples from the model."""
	self.backbone.eval()
	self.noise.eval()
	samples = self._sample(num_steps=num_steps, eps=eps)
	self.backbone.train()
	self.noise.train()
	return samples

	def get_score(self, zt, sigma, attn_mask=None):

	# score(x, t) = p_t(y) / p_t(x)
	# => log score(x, t) = log p_t(y) - log p_t(x)

	# case 1: x = masked
	# (i) y = unmasked
	# log score(x, t) = log p_\theta(x)\|_y + log k
	# where k = exp(- sigma) / (1 - exp(- sigma))
	# (ii) y = masked
	# log score(x, t) = 0

	# case 2: x = unmasked
	# (i) y != masked, y != x
	# log score(x_i, t) = - inf
	# (ii) y = x
	# log score(x_i, t) = 0
	# (iii) y = masked token
	# log score(x_i, t) = - log k
	# where k = exp(- sigma) / (1 - exp(- sigma))

	model_output = self.forward(zt, attn_mask=attn_mask, sigma=sigma)

	log_k = -torch.log(torch.expm1(sigma)).squeeze(-1)
	assert log_k.ndim == 1

	masked_score = model_output + log_k[:, None, None]
	masked_score[:, :, self.mask_token_id] = 0

	unmasked_score = self.neg_infinity * torch.ones_like(model_output)
	unmasked_score = torch.scatter(
	unmasked_score, -1,
	zt[..., None],
	torch.zeros_like(unmasked_score[..., :1]))

	unmasked_score[:, :, self.mask_token_id] = - (log_k[:, None] * torch.ones_like(zt))

	masked_indices = (zt == self.mask_token_id).to(model_output.dtype)[:, :, None]

	model_output = (masked_score * masked_indices + unmasked_score * (1 - masked_indices))

	return model_output.exp()

	def _staggered_score(self, score, dsigma):
	score = score.clone()
	extra_const = (1 - dsigma.exp()) * score.sum(dim=-1)
	score *= dsigma.exp()[:, None]
	score[..., self.mask_token_id] += extra_const
	return score

	def _analytic_update(self, x, t, step_size, attn_mask=None):
	curr_sigma, _ = self.noise(t)
	next_sigma, _ = self.noise(t - step_size)
	dsigma = curr_sigma - next_sigma
	score = self.get_score(x, attn_mask, curr_sigma)
	stag_score = self._staggered_score(score, dsigma)
	probs = stag_score * self._transp_transition(x, dsigma)
	return sample_categorical(probs)

	def _denoiser_update(self, x, t):
	sigma, _ = self.noise(t)
	score = self.get_score(x, sigma)
	stag_score = self._staggered_score(score, sigma)
	probs = stag_score * self._transp_transition(x, sigma)
	probs[..., self.mask_token_id] = 0
	samples = sample_categorical(probs)
	return samples

	def _transp_transition(self, i, sigma):
	sigma = unsqueeze(sigma, reference=i[..., None])
	edge = torch.exp(-sigma) * F.one_hot(
	i, num_classes=self.vocab_size)
	edge += torch.where(i == self.mask_token_id,
	1 - torch.exp(-sigma).squeeze(-1),
	0)[..., None]
	return edge


	def on_train_epoch_start(self):
	torch.cuda.empty_cache()
	self.backbone.train()
	self.noise.train()


	def training_step(self, batch, batch_idx):
	# Initialize throughput calculation
	start_time = time.time()

	if self.config.vocab == 'old_smiles' or self.config.vocab == 'new_smiles':
	loss = self._compute_loss(batch, prefix='train', bond_mask=batch['bond_mask'])
	else:
	loss = self._compute_loss(batch, prefix='train')

	self.log(name='trainer/loss',
	value=loss.item(),
	on_step=True,
	on_epoch=False,
	sync_dist=True)

	# Calculate throughput
	elapsed_time = time.time() - start_time
	total_tokens = batch['input_ids'].numel()
	throughput = total_tokens / elapsed_time

	self.log(name='trainer/throughput',
	value=throughput,
	on_step=True,
	on_epoch=False,
	sync_dist=True)

	return loss


	def on_load_checkpoint(self, checkpoint):
	self.fast_forward_epochs = checkpoint['loops']['fit_loop']['epoch_progress']['current']['completed']
	self.fast_forward_batches = checkpoint['loops']['fit_loop']['epoch_loop.batch_progress']['current']['completed']

	"""VALIDATION"""
	def on_validation_epoch_start(self):
	gc.collect()
	torch.cuda.empty_cache()
	self.backbone.eval()
	self.noise.eval()
	assert self.valid_metrics.nll.mean_value == 0
	assert self.valid_metrics.nll.weight == 0

	def validation_step(self, batch, batch_idx):
	if self.config.vocab == 'old_smiles' or self.config.vocab == 'new_smiles':
	loss = self._compute_loss(batch, prefix='val', bond_mask=batch['bond_mask'])
	else:
	loss = self._compute_loss(batch, prefix='val')

	self.log(name='trainer/val_loss',
	value=loss.item(),
	on_step=True,
	on_epoch=False,
	prog_bar=True,
	sync_dist=True)
	return loss

	def on_validation_epoch_end(self):
	gc.collect()
	torch.cuda.empty_cache()

	"""OPTIMIZATION"""

	def optimizer_step(self, args, *kwargs):
	super().optimizer_step(args, *kwargs)

	gc.collect()
	torch.cuda.empty_cache()

	def configure_optimizers(self):
	optimizer = torch.optim.AdamW(
	itertools.chain(self.backbone.parameters(),self.noise.parameters()),
	lr=self.config.optim.lr,
	betas=(self.config.optim.beta1, self.config.optim.beta2),
	eps=self.config.optim.eps,
	weight_decay=self.config.optim.weight_decay
	)

	self.total_steps = self.config.trainer.max_steps
	scheduler = CosineWarmup(optimizer,
	warmup_steps=self.config.lr_scheduler.num_warmup_steps,
	total_steps=self.total_steps)

	scheduler_dict = {
	'scheduler': scheduler,
	'interval': 'step',
	'frequency': 1,
	'monitor': 'val/loss',
	'name': 'trainer/lr'
	}

	return [optimizer], [scheduler_dict]

	@torch.no_grad()
	def compute_masked_perplexity(self, generated_ids, input_ids):
	"""
	Computes masked perplexity between array of generated token ids and masked ids that are converted to logits
	"""

	total_nll = 0
	total_tokens = 0

	input_ids = torch.tensor(input_ids).to(self.device)
	#print(input_ids)

	for sequence in generated_ids:
	# tokenize the sequence

	gt_ids = torch.tensor(sequence).to(self.device)
	#print(gt_ids)

	sys.stdout.flush()

	# forward pass thorugh backbone peptideclm model
	attn_mask = torch.ones_like(input_ids).to(self.device)

	# compute logits using backbone

	outputs = self.backbone.forward(input_ids=input_ids, attn_mask=attn_mask)


	# get logits for each position in sequence across all tokens in vocab
	#logits = outputs[-1] # (batch_size, seq_length, vocab_size)

	logits = outputs.view(-1, outputs.size(-1))
	gt_ids = gt_ids.view(-1)

	#print(logits.shape)
	#print(gt_ids.shape)

	# compute loss
	# shift_logits = logits[:, :-1, :].contiguous() # remove eos
	# shift_labels = input_ids[:, 1:].contiguous()
	# print(masked)

	loss = F.cross_entropy(logits,
	gt_ids.where(input_ids==self.mask_token_id, torch.full_like(gt_ids, -100)).view(-1),
	reduction='sum')

	total_nll += loss.item()
	# count all non-padding tokens
	total_tokens += input_ids.ne(self.tokenizer.pad_token_id).sum().item() # count in bos and eos

	# compute pseudo-perplexity
	# print(total_nll, ",;,", total_tokens)
	pseudo_perplexity = torch.exp(torch.tensor(total_nll / total_tokens))
	self.gen_ppl_metric.update(pseudo_perplexity)

	return pseudo_perplexity.item()


	def sample_categorical(categorical_probs):
	gumbel_norm = (
	1e-10
	- (torch.rand_like(categorical_probs) + 1e-10).log())
	return (categorical_probs / gumbel_norm).argmax(dim=-1)

	def sample_batched_categorical(categorical_probs, batch_size):
	_, sequence_length, vocab_size = categorical_probs.shape

	# add Gumbel noise and sample m sequences
	gumbel_noise = (-torch.log(-torch.log(torch.rand(batch_size, sequence_length, vocab_size) + 1e-10) + 1e-10)).to(categorical_probs.device)
	noisy_scores = torch.log(categorical_probs) + gumbel_noise # add Gumbel noise to log probabilities

	# select the highest score (most likely category after Gumbel noise)
	sampled_sequences = noisy_scores.argmax(dim=-1) # shape: (m, sequence_length)

	return sampled_sequences

	def sample_batched_top_k(categorical_probs, batch_size, k):
	_, sequence_length, vocab_length = categorical_probs.shape

	# Add Gumbel noise to the log probabilities
	gumbel_noise = -torch.log(-torch.log(torch.rand(batch_size, sequence_length, vocab_length) + 1e-10) + 1e-10).to(categorical_probs.device)
	noisy_scores = torch.log(categorical_probs[None, :, :]) + gumbel_noise # Shape: (m, sequence_length, vocab_length)

	# Get the top-k categories based on noisy scores
	top_k_scores, top_k_indices = torch.topk(noisy_scores, k, dim=-1) # Shape: (m, sequence_length, k)

	# Convert top-k scores back to probabilities and normalize
	top_k_probs = torch.softmax(top_k_scores, dim=-1).to(categorical_probs.device) # Shape: (m, sequence_length, k)

	# Sample randomly from the top-k probabilities
	sampled_indices_in_top_k = torch.multinomial(top_k_probs.reshape(-1, k), num_samples=1).squeeze(-1).to(categorical_probs.device)
	sampled_indices_in_top_k = sampled_indices_in_top_k.view(batch_size, sequence_length).to(categorical_probs.device) # Shape: (batch_size, sequence_length)

	# Map sampled indices back to the original vocabulary indices
	sampled_sequences = torch.gather(top_k_indices, -1, sampled_indices_in_top_k.unsqueeze(-1)).squeeze(-1).to(categorical_probs.device)

	return sampled_sequences

	def unsqueeze(x, reference):
	return x.view(* x.shape, * ((1,) * (len(reference.shape) - len(x.shape))))

	class CosineWarmup(_LRScheduler):
	def __init__(self, optimizer, warmup_steps, total_steps, eta_ratio=0.1, last_epoch=-1):
	self.warmup_steps = warmup_steps
	self.total_steps = total_steps
	self.eta_ratio = eta_ratio # The ratio of minimum to maximum learning rate
	super(CosineWarmup, self).__init__(optimizer, last_epoch)

	def get_lr(self):
	if self.last_epoch < self.warmup_steps:
	return [base_lr * self.last_epoch / self.warmup_steps for base_lr in self.base_lrs]

	progress = (self.last_epoch - self.warmup_steps) / (self.total_steps - self.warmup_steps)
	cosine_decay = 0.5 * (1 + np.cos(np.pi * progress))
	decayed_lr = (1 - self.eta_ratio) * cosine_decay + self.eta_ratio

	return [decayed_lr * base_lr for base_lr in self.base_lrs]