#

Denoising Diffusion Implicit Models (DDIM) Sampling

This implements DDIM sampling from the paper Denoising Diffusion Implicit Models

16from typing import Optional, List
17
18import numpy as np
19import torch
20
21from labml import monit
22from labml_nn.diffusion.stable_diffusion.latent_diffusion import LatentDiffusion
23from labml_nn.diffusion.stable_diffusion.sampler import DiffusionSampler

#

DDIM Sampler

This extends the DiffusionSampler base class.

DDIM samples images by repeatedly removing noise by sampling step by step using,

$$x_{τ_{i - 1}} = \sqrt{α_{τ_{i - 1}}} \Bigg( \frac{x_{τ_{i}} - \sqrt{1 - α_{τ_{i}}} \, ϵ_{θ} (x_{τ_{i}})}{\sqrt{α_{τ_{i}}}} \Bigg) + \sqrt{1 - α_{τ_{i - 1}} - σ_{τ_{i}}^{2}} \cdot ϵ_{θ} (x_{τ_{i}}) + σ_{τ_{i}} ϵ_{τ_{i}}$$

where $ϵ_{τ_{i}}$ is random noise, $τ$ is a subsequence of $[1, 2, \dots, T]$ of length $S$, and $σ_{τ_{i}} = η \sqrt{\frac{1 - α_{τ_{i - 1}}}{1 - α_{τ_{i}}}} \sqrt{1 - \frac{α_{τ_{i}}}{α_{τ_{i - 1}}}}$

Note that $α_{t}$ in the DDIM paper refers to $\bar{α}_{t}$ from DDPM.

class DDIMSampler(DiffusionSampler):
    r"""
    ## DDIM Sampler

    This extends the `DiffusionSampler` base class.

    DDIM samples images by repeatedly removing noise, step by step, using

    $$x_{τ_{i - 1}} = \sqrt{α_{τ_{i - 1}}} \Bigg(
        \frac{x_{τ_i} - \sqrt{1 - α_{τ_i}} \, ϵ_θ(x_{τ_i})}{\sqrt{α_{τ_i}}} \Bigg)
        + \sqrt{1 - α_{τ_{i - 1}} - σ_{τ_i}^2} \cdot ϵ_θ(x_{τ_i})
        + σ_{τ_i} ϵ_{τ_i}$$

    where $ϵ_{τ_i}$ is random noise, $τ$ is a subsequence of $[1, 2, \dots, T]$ of length $S$, and

    $$σ_{τ_i} = η \sqrt{\frac{1 - α_{τ_{i - 1}}}{1 - α_{τ_i}}} \sqrt{1 - \frac{α_{τ_i}}{α_{τ_{i - 1}}}}$$

    Note that $α_t$ in the DDIM paper refers to $\bar{α}_t$ from DDPM.
    """

    model: LatentDiffusion

    def __init__(self, model: LatentDiffusion, n_steps: int, ddim_discretize: str = "uniform", ddim_eta: float = 0.):
        r"""
        :param model: is the model to predict noise $ϵ_{cond}(x_t, c)$
        :param n_steps: is the number of DDIM sampling steps, $S$
        :param ddim_discretize: specifies how to extract $τ$ from $[1, 2, \dots, T]$.
            It can be either `uniform` or `quad`.
        :param ddim_eta: is $η$ used to calculate $σ_{τ_i}$. $η = 0$ makes the
            sampling process deterministic.
        :raises ValueError: if `n_steps` is not positive, or exceeds $T$ with `uniform` discretization
        :raises NotImplementedError: if `ddim_discretize` is neither `uniform` nor `quad`
        """
        super().__init__(model)

        # Number of steps of the underlying diffusion process, $T$
        self.n_steps = model.n_steps

        # Fail early with a clear message instead of a `ZeroDivisionError` or an
        # empty schedule further down
        if n_steps < 1:
            raise ValueError(f"n_steps must be a positive integer, got {n_steps}")

        if ddim_discretize == 'uniform':
            # Calculate $τ$ to be uniformly distributed across $[1, 2, \dots, T]$
            c = self.n_steps // n_steps
            # A stride of zero would otherwise surface as an opaque `range()` error
            if c == 0:
                raise ValueError(f"n_steps ({n_steps}) cannot exceed the number of model time steps "
                                 f"({self.n_steps}) with 'uniform' discretization")
            # NOTE(review): when `c == 1` the largest value here equals $T$; confirm
            # `model.alpha_bar` has an entry at that index.
            self.time_steps = np.arange(0, self.n_steps, c) + 1
        elif ddim_discretize == 'quad':
            # Calculate $τ$ to be quadratically distributed across $[1, 2, \dots, T]$
            self.time_steps = ((np.linspace(0, np.sqrt(self.n_steps * .8), n_steps)) ** 2).astype(int) + 1
        else:
            raise NotImplementedError(ddim_discretize)

        with torch.no_grad():
            # Get $\bar{α}_t$
            alpha_bar = self.model.alpha_bar

            # $α_{τ_i}$
            self.ddim_alpha = alpha_bar[self.time_steps].clone().to(torch.float32)
            # $\sqrt{α_{τ_i}}$
            self.ddim_alpha_sqrt = torch.sqrt(self.ddim_alpha)
            # $α_{τ_{i - 1}}$; the first entry pairs $τ_1$ with `alpha_bar[0]`.
            # Cast to float32 so it matches `ddim_alpha`.
            self.ddim_alpha_prev = torch.cat([alpha_bar[0:1],
                                              alpha_bar[self.time_steps[:-1]]]).to(torch.float32)

            # $$σ_{τ_i} = η \sqrt{\frac{1 - α_{τ_{i - 1}}}{1 - α_{τ_i}}}
            #            \sqrt{1 - \frac{α_{τ_i}}{α_{τ_{i - 1}}}}$$
            self.ddim_sigma = (ddim_eta *
                               ((1 - self.ddim_alpha_prev) / (1 - self.ddim_alpha) *
                                (1 - self.ddim_alpha / self.ddim_alpha_prev)) ** .5)

            # $\sqrt{1 - α_{τ_i}}$
            self.ddim_sqrt_one_minus_alpha = (1. - self.ddim_alpha) ** .5

    @torch.no_grad()
    def sample(self,
               shape: List[int],
               cond: torch.Tensor,
               repeat_noise: bool = False,
               temperature: float = 1.,
               x_last: Optional[torch.Tensor] = None,
               uncond_scale: float = 1.,
               uncond_cond: Optional[torch.Tensor] = None,
               skip_steps: int = 0,
               ):
        r"""
        ### Sampling Loop

        :param shape: is the shape of the generated images in the
            form `[batch_size, channels, height, width]`
        :param cond: is the conditional embeddings $c$
        :param repeat_noise: specifies whether the noise should be the same for all samples in the batch
        :param temperature: is the noise temperature (random noise gets multiplied by this)
        :param x_last: is $x_{τ_S}$. If not provided random noise will be used.
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $ϵ_θ(x_t, c) = s ϵ_{cond}(x_t, c) + (1 - s) ϵ_{cond}(x_t, c_u)$
        :param uncond_cond: is the conditional embedding for empty prompt $c_u$
        :param skip_steps: is the number of time steps to skip $i'$. We start sampling from $S - i'$.
            And `x_last` is then $x_{τ_{S - i'}}$.
        :return: the sample after the last step, $x_0$
        """
        # NOTE(review): the guidance combination itself happens in `get_eps`, which is
        # defined outside this class; confirm the $(1 - s)$ weight against it.

        # Get device and batch size
        device = self.model.device
        bs = shape[0]

        # Get $x_{τ_S}$
        x = x_last if x_last is not None else torch.randn(shape, device=device)

        # Time steps to sample at $τ_{S - i'}, τ_{S - i' - 1}, \dots, τ_1$
        time_steps = np.flip(self.time_steps)[skip_steps:]

        for i, step in monit.enum('Sample', time_steps):
            # Index $i$ in the list $[τ_1, τ_2, \dots, τ_S]$
            index = len(time_steps) - i - 1
            # Time step $τ_i$, repeated for every sample in the batch
            ts = x.new_full((bs,), step, dtype=torch.long)

            # Sample $x_{τ_{i - 1}}$; the predicted $x_0$ and the noise estimate are not needed here
            x, _, _ = self.p_sample(x, cond, ts, step, index=index,
                                    repeat_noise=repeat_noise,
                                    temperature=temperature,
                                    uncond_scale=uncond_scale,
                                    uncond_cond=uncond_cond)

        # Return $x_0$
        return x

    @torch.no_grad()
    def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int, index: int, *,
                 repeat_noise: bool = False,
                 temperature: float = 1.,
                 uncond_scale: float = 1.,
                 uncond_cond: Optional[torch.Tensor] = None):
        r"""
        ### Sample $x_{τ_{i - 1}}$

        :param x: is $x_{τ_i}$ of shape `[batch_size, channels, height, width]`
        :param c: is the conditional embeddings $c$ of shape `[batch_size, emb_size]`
        :param t: is $τ_i$ of shape `[batch_size]`
        :param step: is the step $τ_i$ as an integer (not used by this implementation)
        :param index: is index $i$ in the list $[τ_1, τ_2, \dots, τ_S]$
        :param repeat_noise: specifies whether the noise should be the same for all samples in the batch
        :param temperature: is the noise temperature (random noise gets multiplied by this)
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $ϵ_θ(x_t, c) = s ϵ_{cond}(x_t, c) + (1 - s) ϵ_{cond}(x_t, c_u)$
        :param uncond_cond: is the conditional embedding for empty prompt $c_u$
        :return: a tuple `(x_prev, pred_x0, e_t)` of $x_{τ_{i - 1}}$, the predicted $x_0$
            and $ϵ_θ(x_{τ_i})$
        """
        # Get $ϵ_θ(x_{τ_i})$
        e_t = self.get_eps(x, t, c,
                           uncond_scale=uncond_scale,
                           uncond_cond=uncond_cond)

        # Calculate $x_{τ_{i - 1}}$ and predicted $x_0$
        x_prev, pred_x0 = self.get_x_prev_and_pred_x0(e_t, index, x,
                                                      temperature=temperature,
                                                      repeat_noise=repeat_noise)

        return x_prev, pred_x0, e_t

    def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor, *,
                               temperature: float,
                               repeat_noise: bool):
        r"""
        ### Sample $x_{τ_{i - 1}}$ given $ϵ_θ(x_{τ_i})$

        :param e_t: is the predicted noise $ϵ_θ(x_{τ_i})$
        :param index: is index $i$ in the list $[τ_1, τ_2, \dots, τ_S]$
        :param x: is $x_{τ_i}$
        :param temperature: is the noise temperature (random noise gets multiplied by this)
        :param repeat_noise: specifies whether the noise should be the same for all samples in the batch
        :return: a tuple `(x_prev, pred_x0)` of $x_{τ_{i - 1}}$ and the predicted $x_0$
        """
        # $α_{τ_i}$
        alpha = self.ddim_alpha[index]
        # $α_{τ_{i - 1}}$
        alpha_prev = self.ddim_alpha_prev[index]
        # $σ_{τ_i}$
        sigma = self.ddim_sigma[index]
        # $\sqrt{1 - α_{τ_i}}$
        sqrt_one_minus_alpha = self.ddim_sqrt_one_minus_alpha[index]

        # Current prediction for $x_0$,
        # $$\frac{x_{τ_i} - \sqrt{1 - α_{τ_i}} \, ϵ_θ(x_{τ_i})}{\sqrt{α_{τ_i}}}$$
        pred_x0 = (x - sqrt_one_minus_alpha * e_t) / (alpha ** 0.5)

        # Direction pointing to $x_t$,
        # $$\sqrt{1 - α_{τ_{i - 1}} - σ_{τ_i}^2} \cdot ϵ_θ(x_{τ_i})$$
        dir_xt = (1. - alpha_prev - sigma ** 2).sqrt() * e_t

        if sigma == 0.:
            # No noise is added when $σ_{τ_i} = 0$ (e.g. $η = 0$)
            noise = 0.
        elif repeat_noise:
            # Same noise for all samples in the batch (broadcast over the batch dimension)
            noise = torch.randn((1, *x.shape[1:]), device=x.device)
        else:
            # Different noise for each sample
            noise = torch.randn(x.shape, device=x.device)

        # Multiply noise by the temperature
        noise = noise * temperature

        # $$x_{τ_{i - 1}} = \sqrt{α_{τ_{i - 1}}} \Bigg(
        #     \frac{x_{τ_i} - \sqrt{1 - α_{τ_i}} \, ϵ_θ(x_{τ_i})}{\sqrt{α_{τ_i}}} \Bigg)
        #     + \sqrt{1 - α_{τ_{i - 1}} - σ_{τ_i}^2} \cdot ϵ_θ(x_{τ_i})
        #     + σ_{τ_i} ϵ_{τ_i}$$
        x_prev = (alpha_prev ** 0.5) * pred_x0 + dir_xt + sigma * noise

        return x_prev, pred_x0

    @torch.no_grad()
    def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
        r"""
        ### Sample from $q_{σ,τ}(x_{τ_i} | x_0)$

        $$q_{σ,τ}(x_{τ_i} | x_0) = \mathcal{N} \Big(x_{τ_i}; \sqrt{α_{τ_i}} x_0, (1 - α_{τ_i}) \mathbf{I} \Big)$$

        :param x0: is $x_0$ of shape `[batch_size, channels, height, width]`
        :param index: is the time step $τ_i$ index $i$
        :param noise: is the noise, $ϵ$. Random noise is drawn if it is not specified.
        :return: the noised sample $x_{τ_i}$
        """
        # Random noise, if noise is not specified
        if noise is None:
            noise = torch.randn_like(x0)

        # $\sqrt{α_{τ_i}} x_0 + \sqrt{1 - α_{τ_i}} \, ϵ$
        return self.ddim_alpha_sqrt[index] * x0 + self.ddim_sqrt_one_minus_alpha[index] * noise

    @torch.no_grad()
    def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
              orig: Optional[torch.Tensor] = None,
              mask: Optional[torch.Tensor] = None, orig_noise: Optional[torch.Tensor] = None,
              uncond_scale: float = 1.,
              uncond_cond: Optional[torch.Tensor] = None,
              ):
        r"""
        ### Painting Loop

        :param x: is $x_{τ_{S'}}$ of shape `[batch_size, channels, height, width]`
        :param cond: is the conditional embeddings $c$
        :param t_start: is the sampling step to start from, $S'$
        :param orig: is the original image in latent space which we are inpainting.
            If this is not provided, it'll be an image to image transformation.
        :param mask: is the mask to keep the original image. Required when `orig` is given.
        :param orig_noise: is fixed noise to be added to the original image.
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $ϵ_θ(x_t, c) = s ϵ_{cond}(x_t, c) + (1 - s) ϵ_{cond}(x_t, c_u)$
        :param uncond_cond: is the conditional embedding for empty prompt $c_u$
        :return: the sample after the last step
        :raises ValueError: if `orig` is given without `mask`
        """
        # Without this check the failure would only appear as a `TypeError` at the
        # blending step, after a full model evaluation
        if orig is not None and mask is None:
            raise ValueError("`mask` must be provided when `orig` is given")

        # Get batch size
        bs = x.shape[0]

        # Time steps to sample at $τ_{S'}, τ_{S' - 1}, \dots, τ_1$
        time_steps = np.flip(self.time_steps[:t_start])

        for i, step in monit.enum('Paint', time_steps):
            # Index $i$ in the list $[τ_1, τ_2, \dots, τ_S]$
            index = len(time_steps) - i - 1
            # Time step $τ_i$, repeated for every sample in the batch
            ts = x.new_full((bs,), step, dtype=torch.long)

            # Sample $x_{τ_{i - 1}}$
            x, _, _ = self.p_sample(x, cond, ts, step, index=index,
                                    uncond_scale=uncond_scale,
                                    uncond_cond=uncond_cond)

            # Replace the masked area with the original image
            if orig is not None:
                # Sample from $q_{σ,τ}(x_{τ_i} | x_0)$ for the original image in latent space
                orig_t = self.q_sample(orig, index, noise=orig_noise)
                # Blend the noised original and the current sample using the mask
                x = orig_t * mask + x * (1 - mask)

        return x

Denoising Diffusion Implicit Models (DDIM) Sampling

DDIM Sampler

Sampling Loop

Sample xτi−1​​

Sample xτi−1​​ given ϵθ​(xτi​​)

Sample from qσ,τ​(xτi​​∣x0​)

Painting Loop

Sample $x_{τ_{i - 1}}$

Sample $x_{τ_{i - 1}}$ given $ϵ_{θ} (x_{τ_{i}})$

Sample from $q_{σ, τ} (x_{τ_{i}} ∣ x_{0})$