HunyuanImage21 by yiyixuxu · Pull Request #12333 · huggingface/diffusers (original) (raw)
Click to expand
from diffusers import HunyuanImagePipeline, HunyuanImageRefinerPipeline import torch from diffusers.utils import load_image import gc
# Runtime settings shared by every test in this script.
device = "cuda:0"
dtype = torch.bfloat16
output_prefix = "test_hyimage_output"

# Hub repositories of the three checkpoints exercised below.
repo = "YiYiXu/HunyuanImage-2.1-Diffusers"
distilled_repo = "YiYiXu/HunyuanImage-2.1-Distilled-Diffusers"
refined_repo = "YiYiXu/HunyuanImage-2.1-Refiner-Diffusers"
test_prompts = [ "A cute, cartoon-style anthropomorphic penguin plush toy with fluffy fur, standing in a painting studio, wearing a red knitted scarf and a red beret with the word “Tencent” on it, holding a paintbrush with a focused expression as it paints an oil painting of the Mona Lisa, rendered in a photorealistic photographic style.", # "宏伟教堂的内部,穹顶下方的中央矗立着一尊小巧的维纳斯雕像,微微侧对镜头。雕像没有双手,布满裂纹,表面若干古老的水泥片剥落,露出内部真人质感的牛奶肌肤。雕像穿着薄薄的白色婚纱,在雕像的身后,一只浮空水泥断手轻轻提起长长的婚纱拖尾;在雕像的头顶上方,另一只浮空水泥断手正为她戴上一个由白色花朵组成的花环,雕像本身是没有双手的。教堂穹顶上布满彩色玻璃窗,一束阳光从上往下照射到雕像上,形成丁达尔效应,光斑点点洒在雕像的脸庞和胸前。充满神性的光辉,背景微微虚化,物体的边缘模糊柔和。拉斐尔前派的梦幻朦胧美学风格。", # "A hyper-realistic photograph of a crystal ball diorama sitting atop fluffy forest moss and surrounded by scattered sunlight. Inside, detailed diorama features a Tencent meeting room, an animated chat bubble sculpture, and several joyful penguins—one wearing a graduation cap, others playing soccer and waving tiny banners. The base of the crystal sphere boldly presents "Tencent" in large, crisp, white 3D letters. Background is softly blurred and bokeh-rich, emphasizing the cute, vibrant details of the sphere.", # "A close-up portrait of an elderly Italian man with deeply wrinkled skin, expressive hazel eyes, and a neatly trimmed white mustache. His olive-toned complexion shows the marks of sun and age, and he wears a flat cap slightly tilted to the side. He smiles faintly, revealing warmth and wisdom, while holding a small espresso cup in one hand. The softly blurred background shows a rustic stone wall with climbing ivy, captured in a realistic photography style.", # "An open vintage suitcase on a neutral, softly lit background. The suitcase is made of deep brown, worn leather with visible scuffs and creases, and its interior is lined with dark, plush fabric. 
Inside the suitcase is a meticulously crafted miniature landscape of China, featuring the Great Wall of China winding across model mountains, the pagoda roofs of the Forbidden City, and a representation of the terracotta army, all interwoven with vibrant green rice paddies. On the side of the suitcase, a text "China" is labeled. The entire diorama is bathed in warm, ethereal light, with a dreamy lens bloom and soft, glowing highlights. Photorealistic style, ultra-detailed textures, cinematic lighting." ]
# test1: test hunyuanimage-v2.1
# Load the base checkpoint and render every prompt at 2048x2048 with 50 steps.
pipe = HunyuanImagePipeline.from_pretrained(repo, torch_dtype=dtype)
pipe = pipe.to(device)

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    # The first 10 characters of the prompt distinguish the output files.
    out.save(f"{output_prefix}_v2.1_{prompt[:10]}.png")

# test2: test hunyuanimage-v2.1-distilled
# Drop the previous pipeline and release cached GPU memory before loading
# the next checkpoint.
del pipe
gc.collect()
torch.cuda.empty_cache()

pipe = HunyuanImagePipeline.from_pretrained(distilled_repo, torch_dtype=dtype)
pipe = pipe.to(device)

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=8,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_v2.1-distilled_{prompt[:10]}.png")

# test3: test hunyuanimage-v2.1-refined
# Drop the previous pipeline and release cached GPU memory before loading
# the refiner checkpoint.
del pipe
gc.collect()
torch.cuda.empty_cache()

pipe = HunyuanImageRefinerPipeline.from_pretrained(refined_repo, torch_dtype=dtype)
pipe = pipe.to(device)

for prompt in test_prompts:
    # Refine the image written by test1; the file name must match the
    # f"{output_prefix}_v2.1_{prompt[:10]}.png" pattern used when saving it.
    image = load_image(f"{output_prefix}_v2.1_{prompt[:10]}.png")
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        image=image,
        num_inference_steps=4,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_v2.1-refined_{prompt[:10]}.png")

# This PR adds guider support to the HunyuanImage pipeline (requested by the
# Hunyuan team). This is the first pipeline to use guiders and sets the
# pattern for future pipelines. I've attached a test script that covers the
# main usage patterns.
""" This script demonstrates how guiders work with both non-distilled and distilled models in HunyuanImage pipeline, covering main use cases:
- Default behavior with "pretrained" guiders (the one you load from hub, similar to schedulers)
- Update guider: change guider.guidance_scale
- Disable guider: disable()
- Change distilled_guidance_scale
- Use traditional CFG with distilled models """
from diffusers import HunyuanImagePipeline from diffusers.guiders import AdaptiveProjectedMixGuidance import torch import gc
# Runtime settings shared by every test in this script.
device = "cuda:1"
dtype = torch.bfloat16
output_prefix = "yiyi_test_hyimage_one_more_output_1022"

# Hub repositories of the two checkpoints exercised below.
repo = "YiYiXu/HunyuanImage-2.1-Diffusers"
distilled_repo = "YiYiXu/HunyuanImage-2.1-Distilled-Diffusers"
test_prompts = [ "A cute, cartoon-style anthropomorphic penguin plush toy with fluffy fur, standing in a painting studio, wearing a red knitted scarf and a red beret with the word “Tencent” on it, holding a paintbrush with a focused expression as it paints an oil painting of the Mona Lisa, rendered in a photorealistic photographic style.", # "宏伟教堂的内部,穹顶下方的中央矗立着一尊小巧的维纳斯雕像,微微侧对镜头。雕像没有双手,布满裂纹,表面若干古老的水泥片剥落,露出内部真人质感的牛奶肌肤。雕像穿着薄薄的白色婚纱,在雕像的身后,一只浮空水泥断手轻轻提起长长的婚纱拖尾;在雕像的头顶上方,另一只浮空水泥断手正为她戴上一个由白色花朵组成的花环,雕像本身是没有双手的。教堂穹顶上布满彩色玻璃窗,一束阳光从上往下照射到雕像上,形成丁达尔效应,光斑点点洒在雕像的脸庞和胸前。充满神性的光辉,背景微微虚化,物体的边缘模糊柔和。拉斐尔前派的梦幻朦胧美学风格。", # "A hyper-realistic photograph of a crystal ball diorama sitting atop fluffy forest moss and surrounded by scattered sunlight. Inside, detailed diorama features a Tencent meeting room, an animated chat bubble sculpture, and several joyful penguins—one wearing a graduation cap, others playing soccer and waving tiny banners. The base of the crystal sphere boldly presents "Tencent" in large, crisp, white 3D letters. Background is softly blurred and bokeh-rich, emphasizing the cute, vibrant details of the sphere.", # "A close-up portrait of an elderly Italian man with deeply wrinkled skin, expressive hazel eyes, and a neatly trimmed white mustache. His olive-toned complexion shows the marks of sun and age, and he wears a flat cap slightly tilted to the side. He smiles faintly, revealing warmth and wisdom, while holding a small espresso cup in one hand. The softly blurred background shows a rustic stone wall with climbing ivy, captured in a realistic photography style.", # "An open vintage suitcase on a neutral, softly lit background. The suitcase is made of deep brown, worn leather with visible scuffs and creases, and its interior is lined with dark, plush fabric. 
Inside the suitcase is a meticulously crafted miniature landscape of China, featuring the Great Wall of China winding across model mountains, the pagoda roofs of the Forbidden City, and a representation of the terracotta army, all interwoven with vibrant green rice paddies. On the side of the suitcase, a text "China" is labeled. The entire diorama is bathed in warm, ethereal light, with a dreamy lens bloom and soft, glowing highlights. Photorealistic style, ultra-detailed textures, cinematic lighting." ]
# =============================================================================
# Test Suite 1: Non-Distilled Model (HunyuanImage-v2.1)
# =============================================================================
# The non-distilled checkpoint has a guider config like this:
# {"enabled": true, "guidance_scale": 3.5, ...}
# Traditional CFG is enabled by default with guidance_scale=3.5.
# (see https://huggingface.co/YiYiXu/HunyuanImage-2.1-Diffusers/blob/main/guider/guider_config.json)
pipe = HunyuanImagePipeline.from_pretrained(repo, torch_dtype=dtype)
pipe = pipe.to(device)

# -----------------------------------------------------------------------------
# Test 1.1: Default Behavior
# -----------------------------------------------------------------------------
# Load model and use checkpoint's default guider configuration.
# No runtime parameters needed - guidance_scale=3.5 from guider config is used.
print("\n" + "=" * 80)
print("Test 1.1: Default behavior with checkpoint-configured guider")
print("=" * 80)

# HunyuanImage has two guiders:
# - pipe.guider: Used for normal prompts
# - pipe.ocr_guider: Used when text rendering is detected in prompt
# Both are loaded from checkpoint config
print(f"Loaded guider config: {pipe.guider}")
print(f"Loaded ocr_guider config: {pipe.ocr_guider}")
print(f" guidance_scale: {pipe.guider.guidance_scale}")
print(f" enabled: {pipe.guider._enabled}")

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_1.1_{prompt[:10]}.png")

print("✓ Generated with guidance_scale=3.5 from guider config\n")
# -----------------------------------------------------------------------------
# Test 1.2: update guider: change guider.guidance_scale to 1.0
# -----------------------------------------------------------------------------
print("=" * 80)
print("Test 1.2: change guider.guidance_scale to 1.0")
print("=" * 80)

# Replace both guiders with copies created via .new() that carry the new scale.
pipe.guider = pipe.guider.new(guidance_scale=1.0)
pipe.ocr_guider = pipe.ocr_guider.new(guidance_scale=1.0)
print(f"Changed guider.guidance_scale to 1.0: {pipe.guider.guidance_scale}")

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_1.2_{prompt[:10]}.png")

print("✓ Generated with guider.guidance_scale=1.0")
# -----------------------------------------------------------------------------
# Test 1.3: Update Guider
# -----------------------------------------------------------------------------
# update guider: change guider.guidance_scale to 9.0
print("=" * 80)
print("Test 1.3: update guider: change guider.guidance_scale to 9.0")
print("=" * 80)

# Each guider is replaced by its own updated copy. Assigning the OCR copy to
# pipe.guider would overwrite the normal-prompt guider and leave
# pipe.ocr_guider at its previous scale.
pipe.guider = pipe.guider.new(guidance_scale=9.0)
pipe.ocr_guider = pipe.ocr_guider.new(guidance_scale=9.0)
print(f"Changed guider.guidance_scale to 9.0: {pipe.guider.guidance_scale}")

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_1.3_{prompt[:10]}.png")

# Guider modification persists after generation
assert pipe.guider.guidance_scale == 9.0
print("✓ Generated with guidance_scale=9.0")
# -----------------------------------------------------------------------------
# Test 1.5: Disable Guider
# -----------------------------------------------------------------------------
# Disable guider to turn off CFG entirely (equivalent to true_cfg_scale=1.0).
# Useful when you want to see generation without guidance but do not want to
# modify the guider config.
print("=" * 80)
print("Test 1.5: disable guider (no guidance)")
print("=" * 80)

pipe.guider.disable()
pipe.ocr_guider.disable()
print(f"✓ Guider disabled: {pipe.guider._enabled}")

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_1.5_{prompt[:10]}.png")

print("✓ Generated without guidance (guider disabled)\n")
# =============================================================================
# Test Suite 2: Distilled Model (HunyuanImage-v2.1-Distilled)
# =============================================================================
# The distilled checkpoint does not have a guider.

# Drop the non-distilled pipeline and release cached GPU memory first.
del pipe
gc.collect()
torch.cuda.empty_cache()

pipe = HunyuanImagePipeline.from_pretrained(distilled_repo, torch_dtype=dtype)
pipe = pipe.to(device)

print(f"\nLoaded distilled model guider: {pipe.guider}")

# -----------------------------------------------------------------------------
# Test 2.1: Default Behavior (Distilled Model)
# -----------------------------------------------------------------------------
# Use checkpoint's default distilled_guidance_scale=3.25.
# No traditional CFG applied (guider disabled).
print("\n" + "=" * 80)
print("Test 2.1: Default behavior with distilled model")
print("=" * 80)

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=8,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_2.1_{prompt[:10]}.png")

# NOTE(review): the message below says "from guider config" although this
# checkpoint is described as having no guider - confirm the intended wording.
print("✓ Generated with distilled_guidance_scale=3.25 from guider config\n")
# -----------------------------------------------------------------------------
# Test 2.2: change distilled_guidance_scale to 5.0
# -----------------------------------------------------------------------------
print("=" * 80)
print("Test 2.2: change distilled_guidance_scale to 5.0")
print("=" * 80)

for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    # distilled_guidance_scale is passed per call rather than set on a guider.
    out = pipe(
        prompt,
        num_inference_steps=8,
        distilled_guidance_scale=5.0,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_2.2_{prompt[:10]}.png")

# Verify guider unchanged
assert pipe.guider is None
print("✓ Generated with distilled_guidance_scale=5.0")
# -----------------------------------------------------------------------------
# Test 2.3: Traditional CFG with Distilled Model
# -----------------------------------------------------------------------------
# Enable traditional CFG on top of distilled guidance.
# This combines both: distilled guidance (baked in model) + traditional CFG (or other guidance).
print("=" * 80)
print("Test 2.3: Enable traditional CFG with distilled model")
print("=" * 80)

# Enable traditional guidance
pipe.guider = AdaptiveProjectedMixGuidance(guidance_scale=5.0)
print(f"✓ Guider enabled: {pipe.guider}")
print(" Now using BOTH distilled guidance AND traditional guidance")

# Use both distilled guidance (distilled_guidance_scale=1.0) and traditional CFG (guidance_scale=5.0)
for prompt in test_prompts:
    # A fresh generator with the same fixed seed is created for every prompt.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        negative_prompt="bad quality, ugly, deformed, distorted, blurry, low resolution,",
        distilled_guidance_scale=1.0,
        num_inference_steps=8,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]
    out.save(f"{output_prefix}_2.3_{prompt[:10]}.png")

print("✓ Generated with distilled_guidance_scale=1.0 + guidance_scale=5.0")
# NOTE(review): the next message says distilled guidance is disabled, while
# the message printed before the loop says BOTH kinds of guidance are in use -
# confirm which description matches distilled_guidance_scale=1.0.
print(" (distilled guidance disabled, traditional CFG applied)\n")

print("\n" + "=" * 80)
print("All tests completed successfully!")
print("=" * 80)