HunyuanImage21 by yiyixuxu · Pull Request #12333 · huggingface/diffusers (original) (raw)

Click to expand

from diffusers import HunyuanImagePipeline, HunyuanImageRefinerPipeline import torch from diffusers.utils import load_image import gc

# Runtime configuration shared by all three tests below.
device = "cuda:0"
dtype = torch.bfloat16
output_prefix = "test_hyimage_output"

# Checkpoint repositories for the base, distilled, and refiner pipelines.
repo = "YiYiXu/HunyuanImage-2.1-Diffusers"
distilled_repo = "YiYiXu/HunyuanImage-2.1-Distilled-Diffusers"
refined_repo = "YiYiXu/HunyuanImage-2.1-Refiner-Diffusers"

# Prompts exercised by every test loop. Only the first entry is active; the
# others are kept commented out. Two of them contain nested double quotes
# ("Tencent", "China") and would need escaping before being re-enabled.
test_prompts = [
    "A cute, cartoon-style anthropomorphic penguin plush toy with fluffy fur, standing in a painting studio, wearing a red knitted scarf and a red beret with the word “Tencent” on it, holding a paintbrush with a focused expression as it paints an oil painting of the Mona Lisa, rendered in a photorealistic photographic style.",
    # "宏伟教堂的内部，穹顶下方的中央矗立着一尊小巧的维纳斯雕像，微微侧对镜头。雕像没有双手，布满裂纹，表面若干古老的水泥片剥落，露出内部真人质感的牛奶肌肤。雕像穿着薄薄的白色婚纱，在雕像的身后，一只浮空水泥断手轻轻提起长长的婚纱拖尾；在雕像的头顶上方，另一只浮空水泥断手正为她戴上一个由白色花朵组成的花环，雕像本身是没有双手的。教堂穹顶上布满彩色玻璃窗，一束阳光从上往下照射到雕像上，形成丁达尔效应，光斑点点洒在雕像的脸庞和胸前。充满神性的光辉，背景微微虚化，物体的边缘模糊柔和。拉斐尔前派的梦幻朦胧美学风格。",
    # "A hyper-realistic photograph of a crystal ball diorama sitting atop fluffy forest moss and surrounded by scattered sunlight. Inside, detailed diorama features a Tencent meeting room, an animated chat bubble sculpture, and several joyful penguins—one wearing a graduation cap, others playing soccer and waving tiny banners. The base of the crystal sphere boldly presents "Tencent" in large, crisp, white 3D letters. Background is softly blurred and bokeh-rich, emphasizing the cute, vibrant details of the sphere.",
    # "A close-up portrait of an elderly Italian man with deeply wrinkled skin, expressive hazel eyes, and a neatly trimmed white mustache. His olive-toned complexion shows the marks of sun and age, and he wears a flat cap slightly tilted to the side. He smiles faintly, revealing warmth and wisdom, while holding a small espresso cup in one hand. The softly blurred background shows a rustic stone wall with climbing ivy, captured in a realistic photography style.",
    # "An open vintage suitcase on a neutral, softly lit background. The suitcase is made of deep brown, worn leather with visible scuffs and creases, and its interior is lined with dark, plush fabric. Inside the suitcase is a meticulously crafted miniature landscape of China, featuring the Great Wall of China winding across model mountains, the pagoda roofs of the Forbidden City, and a representation of the terracotta army, all interwoven with vibrant green rice paddies. On the side of the suitcase, a text "China" is labeled. The entire diorama is bathed in warm, ethereal light, with a dreamy lens bloom and soft, glowing highlights. Photorealistic style, ultra-detailed textures, cinematic lighting."
]

# test1: test hunyuanimage-v2.1
pipe = HunyuanImagePipeline.from_pretrained(repo, torch_dtype=dtype)
pipe = pipe.to(device)

for prompt in test_prompts:
    # Create a freshly seeded generator for every prompt so each image
    # starts from the same seed.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    # The first 10 characters of the prompt distinguish the output files;
    # test3 reads these files back by the same name.
    out.save(f"{output_prefix}_v2.1_{prompt[:10]}.png")

# test2: test hunyuanimage-v2.1-distilled

# Drop the previous pipeline and release cached CUDA memory before loading
# the next checkpoint.
del pipe
gc.collect()
torch.cuda.empty_cache()

pipe = HunyuanImagePipeline.from_pretrained(distilled_repo, torch_dtype=dtype)
pipe = pipe.to(device)

for prompt in test_prompts:
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=8,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_v2.1-distilled_{prompt[:10]}.png")

# test3: test hunyuanimage-v2.1-refined

# Drop the previous pipeline and release cached CUDA memory before loading
# the refiner checkpoint.
del pipe
gc.collect()
torch.cuda.empty_cache()

pipe = HunyuanImageRefinerPipeline.from_pretrained(refined_repo, torch_dtype=dtype)
pipe = pipe.to(device)

for prompt in test_prompts:
    # Refine the image test1 saved for this prompt. The file name must match
    # test1's `out.save(...)` pattern exactly, underscores included.
    image = load_image(f"{output_prefix}_v2.1_{prompt[:10]}.png")
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        image=image,
        num_inference_steps=4,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_v2.1-refined_{prompt[:10]}.png")

This PR adds guider support to the HunyuanImage pipeline (requested by the Hunyuan team). This is the first pipeline to use guiders and sets the pattern for future pipelines. I've attached a test script that covers the main usage patterns.

"""
This script demonstrates how guiders work with both non-distilled and distilled models in the HunyuanImage pipeline, covering the main use cases:

1. Default behavior with "pretrained" guiders (the ones you load from the Hub, similar to schedulers)
2. Update guider: change guider.guidance_scale
3. Disable guider: disable()
4. Change distilled_guidance_scale
5. Use traditional CFG with distilled models
"""

from diffusers import HunyuanImagePipeline from diffusers.guiders import AdaptiveProjectedMixGuidance import torch import gc

# Runtime configuration for the guider test suites.
device = "cuda:1"
dtype = torch.bfloat16
output_prefix = "yiyi_test_hyimage_one_more_output_1022"

# Checkpoint repositories for the non-distilled and distilled models.
repo = "YiYiXu/HunyuanImage-2.1-Diffusers"
distilled_repo = "YiYiXu/HunyuanImage-2.1-Distilled-Diffusers"

# =============================================================================
# Test Suite 1: Non-Distilled Model (HunyuanImage-v2.1)
# =============================================================================
# The non-distilled checkpoint has guider config like this:
#   {"enabled": true, "guidance_scale": 3.5, ...}
# Traditional CFG is enabled by default with guidance_scale=3.5.
# (see https://huggingface.co/YiYiXu/HunyuanImage-2.1-Diffusers/blob/main/guider/guider_config.json)

pipe = HunyuanImagePipeline.from_pretrained(repo, torch_dtype=dtype)
pipe = pipe.to(device)

# -----------------------------------------------------------------------------
# Test 1.1: Default Behavior
# -----------------------------------------------------------------------------
# Load model and use checkpoint's default guider configuration.
# No runtime parameters needed - guidance_scale=3.5 from guider config is used.

print("\n" + "=" * 80)
print("Test 1.1: Default behavior with checkpoint-configured guider")
print("=" * 80)

# HunyuanImage has two guiders:
# - pipe.guider: Used for normal prompts
# - pipe.ocr_guider: Used when text rendering is detected in prompt
# Both are loaded from checkpoint config
print(f"Loaded guider config: {pipe.guider}")
print(f"Loaded ocr_guider config: {pipe.ocr_guider}")
print(f" guidance_scale: {pipe.guider.guidance_scale}")
print(f" enabled: {pipe.guider._enabled}")

for prompt in test_prompts:
    # Create a freshly seeded generator for every prompt so each image
    # starts from the same seed.
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_1.1_{prompt[:10]}.png")

print("✓ Generated with guidance_scale=3.5 from guider config\n")

# -----------------------------------------------------------------------------
# Test 1.2: update guider: change guider.guidance_scale to 1.0
# -----------------------------------------------------------------------------

print("=" * 80)
print("Test 1.2: change guider.guidance_scale to 1.0")
print("=" * 80)

# Replace both guiders with the objects returned by `.new(...)`, so the
# normal-prompt and OCR paths both use the new scale.
pipe.guider = pipe.guider.new(guidance_scale=1.0)
pipe.ocr_guider = pipe.ocr_guider.new(guidance_scale=1.0)
print(f"Changed guider.guidance_scale to 1.0: {pipe.guider.guidance_scale}")

for prompt in test_prompts:
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_1.2_{prompt[:10]}.png")

print("✓ Generated with guider.guidance_scale=1.0")

# -----------------------------------------------------------------------------
# Test 1.3: Update Guider
# -----------------------------------------------------------------------------
# update guider: change guider.guidance_scale to 9.0

print("=" * 80)
print("Test 1.3: update guider: change guider.guidance_scale to 9.0")
print("=" * 80)

# Update each guider from its own current instance. Assigning the result of
# `pipe.ocr_guider.new(...)` to `pipe.guider` would overwrite the regular
# guider and leave `pipe.ocr_guider` at its previous scale.
pipe.guider = pipe.guider.new(guidance_scale=9.0)
pipe.ocr_guider = pipe.ocr_guider.new(guidance_scale=9.0)
print(f"Changed guider.guidance_scale to 9.0: {pipe.guider.guidance_scale}")

for prompt in test_prompts:
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_1.3_{prompt[:10]}.png")

# Guider modification persists after generation
assert pipe.guider.guidance_scale == 9.0
print("✓ Generated with guidance_scale=9.0")

# -----------------------------------------------------------------------------
# Test 1.5: Disable Guider
# -----------------------------------------------------------------------------
# Disable guider to turn off CFG entirely (equivalent to true_cfg_scale=1.0).
# Useful when you want to see generation without guidance but do not want to
# modify the guider config.

print("=" * 80)
print("Test 1.5: disable guider (no guidance)")
print("=" * 80)

# Disable both guiders so neither the normal-prompt nor the OCR path applies
# guidance.
pipe.guider.disable()
pipe.ocr_guider.disable()
print(f"✓ Guider disabled: {pipe.guider._enabled}")

for prompt in test_prompts:
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=50,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_1.5_{prompt[:10]}.png")

print("✓ Generated without guidance (guider disabled)\n")

# =============================================================================
# Test Suite 2: Distilled Model (HunyuanImage-v2.1-Distilled)
# =============================================================================
# The distilled checkpoint does not have guider

# Drop the non-distilled pipeline and release cached CUDA memory before
# loading the distilled checkpoint.
del pipe
gc.collect()
torch.cuda.empty_cache()

pipe = HunyuanImagePipeline.from_pretrained(distilled_repo, torch_dtype=dtype)
pipe = pipe.to(device)

print(f"\nLoaded distilled model guider: {pipe.guider}")

# -----------------------------------------------------------------------------
# Test 2.1: Default Behavior (Distilled Model)
# -----------------------------------------------------------------------------
# Use checkpoint's default distilled_guidance_scale=3.25.
# No traditional CFG applied (guider disabled).

print("\n" + "=" * 80)
print("Test 2.1: Default behavior with distilled model")
print("=" * 80)

for prompt in test_prompts:
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        num_inference_steps=8,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_2.1_{prompt[:10]}.png")

print("✓ Generated with distilled_guidance_scale=3.25 from guider config\n")

# -----------------------------------------------------------------------------
# Test 2.2: change distilled_guidance_scale to 5.0
# -----------------------------------------------------------------------------

print("=" * 80)
print("Test 2.2: change distilled_guidance_scale to 5.0")
print("=" * 80)

for prompt in test_prompts:
    generator = torch.Generator(device=device).manual_seed(649151)
    # distilled_guidance_scale is passed per call rather than set on a guider.
    out = pipe(
        prompt,
        num_inference_steps=8,
        distilled_guidance_scale=5.0,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_2.2_{prompt[:10]}.png")

# Verify guider unchanged
assert pipe.guider is None
print("✓ Generated with distilled_guidance_scale=5.0")

# -----------------------------------------------------------------------------
# Test 2.3: Traditional CFG with Distilled Model
# -----------------------------------------------------------------------------
# Enable traditional CFG on top of distilled guidance.
# This combines both: distilled guidance (baked in model) + traditional CFG (or other guidance).

print("=" * 80)
print("Test 2.3: Enable traditional CFG with distilled model")
print("=" * 80)

# Enable traditional guidance
pipe.guider = AdaptiveProjectedMixGuidance(guidance_scale=5.0)
print(f"✓ Guider enabled: {pipe.guider}")
print(" Now using BOTH distilled guidance AND traditional guidance")

# Use both distilled guidance (distilled_guidance_scale=1.0) and traditional CFG (guidance_scale=5.0)
for prompt in test_prompts:
    generator = torch.Generator(device=device).manual_seed(649151)
    out = pipe(
        prompt,
        negative_prompt="bad quality, ugly, deformed, distorted, blurry, low resolution,",
        distilled_guidance_scale=1.0,
        num_inference_steps=8,
        height=2048,
        width=2048,
        generator=generator,
    ).images[0]

    out.save(f"{output_prefix}_2.3_{prompt[:10]}.png")

print("✓ Generated with distilled_guidance_scale=1.0 + guidance_scale=5.0")
print(" (distilled guidance disabled, traditional CFG applied)\n")

# Final banner, printed after the last test's output.
print("\n" + "=" * 80)
print("All tests completed successfully!")
print("=" * 80)