core

Some utility functions for working with 🤗 diffusers

Set the device and data type

import torch
from cjm_pytorch_utils.core import get_torch_device

device = get_torch_device()
dtype = torch.float16 if device == 'cuda' else torch.float32
device, dtype
('cpu', torch.float32)

source

pil_to_latent

 pil_to_latent (image:PIL.Image,
                vae:diffusers.models.autoencoder_kl.AutoencoderKL)

This function converts an image to latents using a VAE model.

Returns: latents (torch.Tensor): The latents generated from the image.

Type Details
image Image The image to be converted to latents.
vae AutoencoderKL The VAE model used to convert the image to latents.
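
The exact implementation is behind the source link above, but a minimal sketch of how such a conversion typically works is shown below. The helper name encode_image is hypothetical, and the sketch assumes the standard Stable Diffusion latent scaling factor exposed as vae.config.scaling_factor (0.18215):

import torch
from PIL import Image
from torchvision.transforms.functional import to_tensor

def encode_image(image: Image.Image, vae) -> torch.Tensor:
    # Rescale pixel values from [0, 1] to [-1, 1] and add a batch dimension
    img_tensor = to_tensor(image)[None].to(device=vae.device, dtype=vae.dtype) * 2 - 1
    # Sample latents from the VAE's posterior and apply the scaling factor
    with torch.no_grad():
        latents = vae.encode(img_tensor).latent_dist.sample()
    return latents * vae.config.scaling_factor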

Select a model

model_name = "stabilityai/stable-diffusion-2-1"

Load autoencoder

from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(device=device, dtype=dtype)

Open sample image

from PIL import Image

img_path = '../images/cat.jpg'
src_img = Image.open(img_path).convert('RGB')
src_img

Encode image

img_latents = pil_to_latent(src_img, vae)
img_latents.shape
torch.Size([1, 4, 64, 96])
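
The VAE reduces spatial resolution by a factor of 8, so the 4-channel, 64×96 latent grid here corresponds to a 512×768-pixel (height × width) source image.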

source

latent_to_pil

 latent_to_pil (latents:torch.Tensor,
                vae:diffusers.models.autoencoder_kl.AutoencoderKL)

This function converts latents to an image using a VAE model.

Returns: image (PIL.Image): The image generated from the latents.

Type Details
latents torch.Tensor The latents to be converted to an image.
vae AutoencoderKL The VAE model used to convert the latents to an image.
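
A matching sketch for the reverse direction, again assuming the standard scaling factor; decode_latents is a hypothetical name:

import torch
from torchvision.transforms.functional import to_pil_image

def decode_latents(latents: torch.Tensor, vae):
    # Undo the scaling factor applied during encoding, then decode
    with torch.no_grad():
        image = vae.decode(latents / vae.config.scaling_factor).sample
    # Map pixel values from [-1, 1] back to [0, 1] before converting to PIL
    image = (image / 2 + 0.5).clamp(0, 1)
    return to_pil_image(image[0].float().cpu())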

Decode latents

decoded_img = latent_to_pil(img_latents, vae)
decoded_img


source

text_to_emb

 text_to_emb (prompt:str,
              tokenizer:transformers.models.clip.tokenization_clip.CLIPTokenizer,
              text_encoder:transformers.models.clip.modeling_clip.CLIPTextModel,
              negative_prompt:str='', maxlen:int=None)

Encodes the provided text prompts using the specified text encoder.

Returns: torch.Tensor: The encoded text.

Type Default Details
prompt str The text prompt to be encoded.
tokenizer CLIPTokenizer The tokenizer to be used.
text_encoder CLIPTextModel The text encoder to be used.
negative_prompt str '' The negative text prompt to be encoded.
maxlen int None The maximum length of the encoded text. Default is None.
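
A typical implementation tokenizes the negative prompt and prompt together and runs them through the text encoder in one batch, producing the pair of embeddings that classifier-free guidance needs. A minimal sketch under that assumption (encode_text is a hypothetical name):

import torch

def encode_text(prompt, tokenizer, text_encoder, negative_prompt='', maxlen=None):
    maxlen = maxlen or tokenizer.model_max_length
    # Tokenize both prompts, padding/truncating to a fixed length
    inputs = tokenizer([negative_prompt, prompt], padding='max_length',
                       max_length=maxlen, truncation=True, return_tensors='pt')
    # Return the encoder's last hidden state for both prompts
    with torch.no_grad():
        return text_encoder(inputs.input_ids.to(text_encoder.device))[0]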

Load tokenizer

from transformers import CLIPTokenizer

# Load the tokenizer for the specified model
tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")

Load text encoder

from transformers import CLIPTextModel

# Load the text encoder for the specified model
text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(device=device, dtype=dtype)

Define sample prompt

prompt = "A cat sitting on the floor."

Encode sample prompt

text_emb = text_to_emb(prompt, tokenizer, text_encoder)
text_emb.shape
torch.Size([2, 77, 1024])
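
The leading dimension of 2 holds the negative-prompt and prompt embeddings, the pair consumed by classifier-free guidance, and 77 is the CLIP tokenizer's maximum sequence length.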

source

prepare_noise_scheduler

 prepare_noise_scheduler (noise_scheduler, max_steps:int=50,
                          noise_strength:float=1.0)

Prepare the noise scheduler by setting the timesteps and adjusting the noise strength.

Returns: noise_scheduler (object): The modified noise scheduler object.

Type Default Details
noise_scheduler The noise scheduler object to be modified
max_steps int 50 The maximum number of steps
noise_strength float 1.0 The strength of the noise
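
A sketch of the usual preparation, assuming the common img2img convention of resampling the schedule to max_steps and then dropping the earliest (noisiest) steps in proportion to the noise strength (prepare_scheduler is a hypothetical name):

def prepare_scheduler(noise_scheduler, max_steps=50, noise_strength=1.0):
    # Resample the full 1000-step schedule down to max_steps timesteps
    noise_scheduler.set_timesteps(max_steps)
    # With strength < 1.0, skip the earliest timesteps so denoising
    # starts from a partially noised image rather than pure noise
    init_timestep = min(int(max_steps * noise_strength), max_steps)
    t_start = max(max_steps - init_timestep, 0)
    noise_scheduler.timesteps = noise_scheduler.timesteps[t_start:]
    return noise_scheduler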

Load noise scheduler

from diffusers import DEISMultistepScheduler

noise_scheduler = DEISMultistepScheduler.from_pretrained(model_name, subfolder='scheduler')

print(f"Number of timesteps: {len(noise_scheduler.timesteps)}")
noise_scheduler.timesteps[:10]
Number of timesteps: 1000
tensor([999., 998., 997., 996., 995., 994., 993., 992., 991., 990.])

Update noise scheduler

noise_scheduler = prepare_noise_scheduler(noise_scheduler, 25, 1.0)

print(f"Number of timesteps: {len(noise_scheduler.timesteps)}")
noise_scheduler.timesteps[:10]
Number of timesteps: 25
tensor([999, 959, 919, 879, 839, 799, 759, 719, 679, 639])

source

prepare_depth_mask

 prepare_depth_mask (depth_map, divisor=8)

Prepare the depth mask by resizing and normalizing the depth map.

Returns: depth_mask (torch.Tensor): The normalized and resized depth mask

Type Default Details
depth_map The depth map image
divisor int 8 The divisor value used to resize the depth map
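
A sketch of the typical steps, assuming a single-channel depth map that is resized to the latent resolution and min-max normalized into [-1, 1] (make_depth_mask is a hypothetical name):

import numpy as np
import torch
from PIL import Image

def make_depth_mask(depth_map: Image.Image, divisor=8) -> torch.Tensor:
    # Resize the depth map to the latent resolution (1/divisor per side)
    w, h = depth_map.size
    depth_map = depth_map.resize((w // divisor, h // divisor))
    # Min-max normalize the depth values into the range [-1, 1]
    mask = torch.from_numpy(np.array(depth_map)).float()
    mask = (mask - mask.min()) / (mask.max() - mask.min()) * 2 - 1
    # Add batch and channel dimensions: (1, 1, H/divisor, W/divisor)
    return mask[None, None]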

Load depth map

depth_map_path = '../images/depth-cat.png'
depth_map = Image.open(depth_map_path)
depth_map

Prepare depth mask

depth_mask = prepare_depth_mask(depth_map).to(device=device, dtype=dtype)
depth_mask.shape, depth_mask.min(), depth_mask.max()
(torch.Size([1, 1, 64, 96]),
 tensor(-1., device='cuda:0', dtype=torch.float16),
 tensor(1., device='cuda:0', dtype=torch.float16))

source

denoise_depth2img

 denoise_depth2img (latents:torch.Tensor, depth_mask:torch.Tensor,
                    text_emb:torch.Tensor,
                    unet:diffusers.models.unet_2d_condition.UNet2DConditionModel,
                    noise_scheduler, guidance_scale:float=8.0)

Denoise image latents conditioned on a depth mask and an encoded text prompt.

Returns: latents (torch.Tensor): The denoised latents.

Type Default Details
latents Tensor The initial image latents
depth_mask Tensor The image depth mask
text_emb Tensor The embedded text prompt and negative prompt
unet UNet2DConditionModel The Unet denoiser
noise_scheduler The noise scheduler
guidance_scale float 8.0 The guidance scale
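
The denoising loop presumably follows the standard depth-conditioned classifier-free-guidance recipe: the depth mask is concatenated to the latents as an extra input channel, the UNet predicts noise for the negative prompt and prompt in one batch, and the two predictions are blended by the guidance scale. A sketch under those assumptions (denoise is a hypothetical name):

import torch

def denoise(latents, depth_mask, text_emb, unet, noise_scheduler, guidance_scale=8.0):
    for t in noise_scheduler.timesteps:
        # Duplicate the latents for the negative-prompt and prompt passes
        model_input = torch.cat([latents] * 2)
        model_input = noise_scheduler.scale_model_input(model_input, t)
        # Concatenate the depth mask as an extra input channel
        model_input = torch.cat([model_input, torch.cat([depth_mask] * 2)], dim=1)
        with torch.no_grad():
            noise_pred = unet(model_input, t, encoder_hidden_states=text_emb).sample
        # Classifier-free guidance: move away from the negative-prompt prediction
        noise_pred_neg, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_neg + guidance_scale * (noise_pred_text - noise_pred_neg)
        # Advance the scheduler one step toward the clean latents
        latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
    return latents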
model_name = "stabilityai/stable-diffusion-2-depth"
unet = UNet2DConditionModel.from_pretrained(model_name, subfolder="unet").to(device=device, dtype=dtype)
vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(device=device, dtype=dtype)

tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(device=device, dtype=dtype)

noise_scheduler = DEISMultistepScheduler.from_pretrained(model_name, subfolder='scheduler')
noise_scheduler = prepare_noise_scheduler(noise_scheduler, 25, 0.9)
img_latents = pil_to_latent(src_img, vae).to(device=device, dtype=dtype)

prompt = "A Monet oil painting of a cat"
negative_prompt = "bad, deformed, ugly, bad anotomy"
text_emb = text_to_emb(prompt=prompt, 
                       tokenizer=tokenizer, 
                       text_encoder=text_encoder, 
                       negative_prompt=negative_prompt)

# Generate latent noise
noise = torch.randn(img_latents.shape, device=unet.device, dtype=unet.dtype)

# Add noise to the image latents at the first timestep
latents = noise_scheduler.add_noise(img_latents, 
                                    noise, 
                                    noise_scheduler.timesteps[[0]]).to(unet.device)

denoised_latents = denoise_depth2img(latents=latents,
                                     depth_mask=depth_mask,
                                     text_emb=text_emb,
                                     unet=unet,
                                     noise_scheduler=noise_scheduler,
                                     guidance_scale=8.0)

latent_to_pil(denoised_latents, vae)