[SD-XL] Add inpainting by patrickvonplaten · Pull Request #4098 · huggingface/diffusers

SD-XL inpainting

This PR solves: #4080 and is ready for a review.

Inpainting works well for both the vanilla case and the "Ensemble of Expert Denoisers case".

You can try the following to see for yourself:

Vanilla inpainting:

# Vanilla SD-XL inpainting: load the base pipeline, fetch a sample
# image/mask pair, and inpaint the masked region from a text prompt.
import torch
from diffusers import StableDiffusionXLInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-0.9",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)
pipe.to("cuda")

img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"

init_image = load_image(img_url).convert("RGB")
mask_image = load_image(mask_url).convert("RGB")

prompt = "A red cat sitting on a bench"
# strength=0.80 keeps 20% of the original image structure in the masked area.
image = pipe(
    prompt=prompt,
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=50,
    strength=0.80,
).images[0]

Ensemble of Expert Denoisers

which should give slightly better quality:

# "Ensemble of Expert Denoisers": the base model handles the high-noise
# portion of the schedule and hands off latents to the refiner, which
# finishes the low-noise portion. Text encoder 2 and the VAE are shared
# between the two pipelines to save memory.
from diffusers import StableDiffusionXLInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-0.9",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)
pipe.to("cuda")

refiner = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-0.9",
    text_encoder_2=pipe.text_encoder_2,
    vae=pipe.vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
refiner.to("cuda")

img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"

init_image = load_image(img_url).convert("RGB")
mask_image = load_image(mask_url).convert("RGB")

prompt = "A red cat sitting on a bench"
num_inference_steps = 75
# Fraction of the schedule at which the base model stops and the refiner
# takes over (0.7 = base denoises the first 70% of the noise).
high_noise_frac = 0.7

# Base pass: output latents (not decoded images) for the refiner.
image = pipe(
    prompt=prompt,
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=num_inference_steps,
    strength=0.80,
    denoising_start=high_noise_frac,
    output_type="latent",
).images
# Refiner pass: resume denoising from the hand-off point.
image = refiner(
    prompt=prompt,
    image=image,
    mask_image=mask_image,
    num_inference_steps=num_inference_steps,
    denoising_start=high_noise_frac,
).images[0]
