add ChronoEdit by zhangjiewu · Pull Request #12593 · huggingface/diffusers (original) (raw)
add ChronoEdit
This PR adds ChronoEdit, a state-of-the-art image editing model that reframes image editing as a video generation task to achieve physically consistent edits.
HF Model: https://huggingface.co/nvidia/ChronoEdit-14B-Diffusers
Gradio Demo: https://huggingface.co/spaces/nvidia/ChronoEdit
Paper: https://arxiv.org/abs/2510.04290
Code: https://github.com/nv-tlabs/ChronoEdit
Website: https://research.nvidia.com/labs/toronto-ai/chronoedit/
cc: @sayakpaul @yiyixuxu @asomoza
Usage
Full model
"""ChronoEdit usage example: edit a single image with the full model."""

import numpy as np
import torch
from PIL import Image
from transformers import CLIPVisionModel

from diffusers import AutoencoderKLWan, ChronoEditPipeline, ChronoEditTransformer3DModel
from diffusers.utils import export_to_video, load_image


model_id = "nvidia/ChronoEdit-14B-Diffusers"

# The image encoder and VAE are loaded in float32, the transformer in bfloat16.
image_encoder = CLIPVisionModel.from_pretrained(
    model_id,
    subfolder="image_encoder",
    torch_dtype=torch.float32,
)
vae = AutoencoderKLWan.from_pretrained(
    model_id,
    subfolder="vae",
    torch_dtype=torch.float32,
)
transformer = ChronoEditTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
pipe = ChronoEditPipeline.from_pretrained(
    model_id,
    image_encoder=image_encoder,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

image = load_image("https://huggingface.co/spaces/nvidia/ChronoEdit/resolve/main/examples/3.png")

# Pick a working resolution of roughly `pixel_budget` pixels that keeps the
# input's aspect ratio; each side is floored to a multiple of `size_multiple`.
pixel_budget = 720 * 1280
aspect_ratio = image.height / image.width
size_multiple = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
snap_to_multiple = lambda side: round(side) // size_multiple * size_multiple  # noqa: E731
height = snap_to_multiple(np.sqrt(pixel_budget * aspect_ratio))
width = snap_to_multiple(np.sqrt(pixel_budget / aspect_ratio))
print("width", width, "height", height)
image = image.resize((width, height))

# The two adjacent literals are concatenated into a single prompt string.
prompt = (
    "The user wants to transform the image by adding a small, cute mouse sitting inside the floral teacup, enjoying a spa bath. The mouse should appear relaxed and cheerful, with a tiny white bath towel draped over its head like a turban. It should be positioned comfortably in the cup’s liquid, with gentle steam rising around it to blend with the cozy atmosphere. "
    "The mouse’s pose should be natural—perhaps sitting upright with paws resting lightly on the rim or submerged in the tea. The teacup’s floral design, gold trim, and warm lighting must remain unchanged to preserve the original aesthetic. The steam should softly swirl around the mouse, enhancing the spa-like, whimsical mood."
)

# Temporal reasoning is switched off for this run.
result = pipe(
    image=image,
    prompt=prompt,
    height=height,
    width=width,
    num_frames=5,
    num_inference_steps=50,
    guidance_scale=5.0,
    enable_temporal_reasoning=False,
    num_temporal_reasoning_steps=0,
)
output = result.frames[0]

# Save every frame as a short clip, then the last frame as the edited image.
export_to_video(output, "output.mp4", fps=4)
last_frame = output[-1]
# NOTE(review): presumably frames are float arrays in [0, 1] -- confirm against the pipeline's output type.
last_frame_uint8 = (last_frame * 255).clip(0, 255).astype("uint8")
Image.fromarray(last_frame_uint8).save("output.png")
Full model with temporal reasoning
# Temporal-reasoning variant of the edit call. This snippet does not define
# `pipe`, `image`, `prompt`, `height` or `width`; they must already exist.
output = pipe(
    image=image,
    prompt=prompt,
    height=height,
    width=width,
    num_frames=29,
    num_inference_steps=50,
    guidance_scale=5.0,
    enable_temporal_reasoning=True,
    num_temporal_reasoning_steps=50,
).frames[0]
With 8-step distillation LoRA
"""ChronoEdit usage example: edit a single image in 8 steps with the distillation LoRA."""

import numpy as np
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import CLIPVisionModel

# `UniPCMultistepScheduler` and `hf_hub_download` are both called below, so
# they must be imported here or the script stops with a NameError.
from diffusers import (
    AutoencoderKLWan,
    ChronoEditPipeline,
    ChronoEditTransformer3DModel,
    UniPCMultistepScheduler,
)
from diffusers.utils import export_to_video, load_image


model_id = "nvidia/ChronoEdit-14B-Diffusers"

# The image encoder and VAE are loaded in float32, the transformer in bfloat16.
image_encoder = CLIPVisionModel.from_pretrained(
    model_id, subfolder="image_encoder", torch_dtype=torch.float32
)
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
transformer = ChronoEditTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = ChronoEditPipeline.from_pretrained(
    model_id,
    image_encoder=image_encoder,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.bfloat16,
)

# Fetch the LoRA weights from the model repository, load them and fuse them
# into the pipeline at full scale.
lora_path = hf_hub_download(repo_id=model_id, filename="lora/chronoedit_distill_lora.safetensors")
pipe.load_lora_weights(lora_path)
pipe.fuse_lora(lora_scale=1.0)

# Replace the scheduler with UniPC, built from the existing scheduler config
# with flow_shift overridden to 2.0.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
pipe.to("cuda")

image = load_image("https://huggingface.co/spaces/nvidia/ChronoEdit/resolve/main/examples/3.png")

# Pick a working resolution of roughly `max_area` pixels that keeps the
# input's aspect ratio; each side is floored to a multiple of `mod_value`.
max_area = 720 * 1280
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
print("width", width, "height", height)
image = image.resize((width, height))

# The two adjacent literals are concatenated into a single prompt string.
prompt = (
    "The user wants to transform the image by adding a small, cute mouse sitting inside the floral teacup, enjoying a spa bath. The mouse should appear relaxed and cheerful, with a tiny white bath towel draped over its head like a turban. It should be positioned comfortably in the cup’s liquid, with gentle steam rising around it to blend with the cozy atmosphere. "
    "The mouse’s pose should be natural—perhaps sitting upright with paws resting lightly on the rim or submerged in the tea. The teacup’s floral design, gold trim, and warm lighting must remain unchanged to preserve the original aesthetic. The steam should softly swirl around the mouse, enhancing the spa-like, whimsical mood."
)

# This run uses 8 inference steps and guidance_scale=1.0, with temporal
# reasoning switched off.
output = pipe(
    image=image,
    prompt=prompt,
    height=height,
    width=width,
    num_frames=5,
    num_inference_steps=8,
    guidance_scale=1.0,
    enable_temporal_reasoning=False,
    num_temporal_reasoning_steps=0,
).frames[0]

# Save every frame as a short clip, then the last frame as the edited image.
export_to_video(output, "output.mp4", fps=4)
# NOTE(review): presumably frames are float arrays in [0, 1] -- confirm against the pipeline's output type.
Image.fromarray((output[-1] * 255).clip(0, 255).astype("uint8")).save("output.png")