Generate images using Stable Diffusion with a prompt

import argparse
from pathlib import Path

import torch

from labml import lab, monit
from labml_nn.diffusion.stable_diffusion.latent_diffusion import LatentDiffusion
from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler
from labml_nn.diffusion.stable_diffusion.sampler.ddpm import DDPMSampler
from labml_nn.diffusion.stable_diffusion.util import load_model, save_images, set_seed

Text to image class

class Txt2Img:
    model: LatentDiffusion
  • checkpoint_path is the path of the checkpoint
  • sampler_name is the name of the sampler
  • n_steps is the number of sampling steps
  • ddim_eta is the DDIM sampling constant η (η = 0 makes DDIM sampling deterministic)
    def __init__(self, *,
                 checkpoint_path: Path,
                 sampler_name: str,
                 n_steps: int = 50,
                 ddim_eta: float = 0.0,
                 ):

Load the model from the checkpoint

        self.model = load_model(checkpoint_path)

Get device

        self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

Move the model to device

        self.model.to(self.device)

Initialize sampler

        if sampler_name == 'ddim':
            self.sampler = DDIMSampler(self.model,
                                       n_steps=n_steps,
                                       ddim_eta=ddim_eta)
        elif sampler_name == 'ddpm':
            self.sampler = DDPMSampler(self.model)
        else:
            raise ValueError(f'Unknown sampler: {sampler_name}')
  • dest_path is the path to store the generated images
  • batch_size is the number of images to generate in a batch
  • prompt is the prompt to generate images with
  • h is the height of the image
  • w is the width of the image
  • uncond_scale is the unconditional guidance scale. This is used for classifier-free guidance: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty)); see the sketch after the sampling call below
    @torch.no_grad()
    def __call__(self, *,
                 dest_path: str,
                 batch_size: int = 3,
                 prompt: str,
                 h: int = 512, w: int = 512,
                 uncond_scale: float = 7.5,
                 ):

Number of channels in the latent space

        c = 4

Image to latent space resolution reduction

        f = 8
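With the defaults h = w = 512 and f = 8, the sampler therefore works on a [batch_size, 4, 64, 64] latent, which has 64 times fewer spatial positions than the pixel image.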

Make a batch of prompts

        prompts = batch_size * [prompt]

AMP auto casting

        with torch.cuda.amp.autocast():

If the unconditional guidance scale is not 1, get the embeddings for empty prompts (no conditioning).

            if uncond_scale != 1.0:
                un_cond = self.model.get_text_conditioning(batch_size * [""])
            else:
                un_cond = None

Get the prompt embeddings

            cond = self.model.get_text_conditioning(prompts)

Sample in the latent space. x will be of shape [batch_size, c, h // f, w // f]

            x = self.sampler.sample(cond=cond,
                                    shape=[batch_size, c, h // f, w // f],
                                    uncond_scale=uncond_scale,
                                    uncond_cond=un_cond)
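For intuition, here is a minimal sketch of the classifier-free guidance step the samplers apply at each denoising step. guided_eps and its arguments are illustrative names, not the actual DDIMSampler or DDPMSampler internals; the sketch only implements the formula quoted in the CLI help below.

def guided_eps(eps_model, x_t, t, cond, un_cond, uncond_scale):
    # Noise prediction without conditioning, eps(x, empty)
    e_uncond = eps_model(x_t, t, un_cond)
    # Noise prediction with the prompt conditioning, eps(x, cond)
    e_cond = eps_model(x_t, t, cond)
    # eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
    return e_uncond + uncond_scale * (e_cond - e_uncond)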

Decode the image from the autoencoder

            images = self.model.autoencoder_decode(x)

Save images

        save_images(images, dest_path, 'txt_')

CLI

def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--prompt",
        type=str,
        nargs="?",
        default="a painting of a virus monster playing guitar",
        help="the prompt to render"
    )

    parser.add_argument("--batch_size", type=int, default=4, help="batch size")

    parser.add_argument(
        '--sampler',
        dest='sampler_name',
        choices=['ddim', 'ddpm'],
        default='ddim',
        help='Set the sampler.',
    )

    parser.add_argument("--flash", action='store_true', help="whether to use flash attention")

    parser.add_argument("--steps", type=int, default=50, help="number of sampling steps")

    parser.add_argument("--scale", type=float, default=7.5,
                        help="unconditional guidance scale: "
                             "eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))")

    opt = parser.parse_args()

    set_seed(42)

Set flash attention

    from labml_nn.diffusion.stable_diffusion.model.unet_attention import CrossAttention
    CrossAttention.use_flash_attention = opt.flash

    txt2img = Txt2Img(checkpoint_path=lab.get_data_path() / 'stable-diffusion' / 'sd-v1-4.ckpt',
                      sampler_name=opt.sampler_name,
                      n_steps=opt.steps)
148
    with monit.section('Generate'):
        txt2img(dest_path='outputs',
                batch_size=opt.batch_size,
                prompt=opt.prompt,
                uncond_scale=opt.scale)

if __name__ == "__main__":
    main()
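As a usage sketch, assuming this script is saved as text_to_image.py (the filename here is an assumption) and the sd-v1-4.ckpt checkpoint is already at the path main() expects:

python text_to_image.py --prompt "a painting of a virus monster playing guitar" --sampler ddim --steps 50 --scale 7.5

The generated images are written to the outputs directory with a txt_ filename prefix, as set by the save_images call above.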