LTX Video

This pipeline supports LoRA and can run on Apple silicon (MPS).

LTX Video is the first DiT-based video generation model capable of generating high-quality videos in real time. It produces 24 FPS videos at 768x512 resolution faster than they can be watched. Trained on a large-scale dataset of diverse videos, the model generates high-resolution videos with realistic and varied content. We provide models for both text-to-video and image + text-to-video use cases.

Make sure to check out the Schedulers guide to learn how to explore the tradeoff between scheduler speed and quality, and see the reuse components across pipelines section to learn how to efficiently load the same components into multiple pipelines.
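Since all LTX pipelines share the same component set (scheduler, VAE, text encoder, tokenizer, transformer, as the class signatures later on this page show), a minimal sketch of reusing already-loaded components in a second pipeline could look like the following; treat it as an illustration rather than the only way to share components:

```python
import torch
from diffusers import LTXPipeline, LTXImageToVideoPipeline

# Load the text-to-video pipeline once.
pipe_t2v = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)

# Reuse its components in an image-to-video pipeline instead of loading them a second time.
pipe_i2v = LTXImageToVideoPipeline(
    scheduler=pipe_t2v.scheduler,
    vae=pipe_t2v.vae,
    text_encoder=pipe_t2v.text_encoder,
    tokenizer=pipe_t2v.tokenizer,
    transformer=pipe_t2v.transformer,
)
```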

Available models:

| Model name | Recommended dtype |
|---|---|
| LTX Video 0.9.0 | torch.bfloat16 |
| LTX Video 0.9.1 | torch.bfloat16 |
| LTX Video 0.9.5 | torch.bfloat16 |

Note: The recommended dtype is for the transformer component. The VAE and text encoders can be torch.float32, torch.bfloat16, or torch.float16, but the recommended dtype is torch.bfloat16, as used in the original repository.
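For illustration, a minimal loading sketch with the recommended dtypes; the subfolder names ("transformer", "vae") are assumed to follow the standard layout of the Lightricks/LTX-Video repository:

```python
import torch
from diffusers import AutoencoderKLLTXVideo, LTXPipeline, LTXVideoTransformer3DModel

# Transformer in the recommended torch.bfloat16.
transformer = LTXVideoTransformer3DModel.from_pretrained(
    "Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16
)
# The VAE could also be loaded in torch.float16 or torch.float32 if preferred.
vae = AutoencoderKLLTXVideo.from_pretrained(
    "Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.bfloat16
)
pipe = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video", transformer=transformer, vae=vae, torch_dtype=torch.bfloat16
)
```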

Loading Single Files

Loading the original LTX Video checkpoints is also possible with ~ModelMixin.from_single_file. We recommend using from_single_file for the Lightricks series of models, as they plan to release multiple models in the single-file format in the future.

```python
import torch
from diffusers import AutoencoderKLLTXVideo, LTXImageToVideoPipeline, LTXVideoTransformer3DModel

single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
transformer = LTXVideoTransformer3DModel.from_single_file(
    single_file_url, torch_dtype=torch.bfloat16
)
vae = AutoencoderKLLTXVideo.from_single_file(single_file_url, torch_dtype=torch.bfloat16)
pipe = LTXImageToVideoPipeline.from_pretrained(
    "Lightricks/LTX-Video", transformer=transformer, vae=vae, torch_dtype=torch.bfloat16
)
```

Alternatively, the pipeline can be used to load the weights with ~FromSingleFileMixin.from_single_file.

```python
import torch
from diffusers import LTXImageToVideoPipeline
from transformers import T5EncoderModel, T5Tokenizer

single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
text_encoder = T5EncoderModel.from_pretrained(
    "Lightricks/LTX-Video", subfolder="text_encoder", torch_dtype=torch.bfloat16
)
tokenizer = T5Tokenizer.from_pretrained(
    "Lightricks/LTX-Video", subfolder="tokenizer", torch_dtype=torch.bfloat16
)
pipe = LTXImageToVideoPipeline.from_single_file(
    single_file_url, text_encoder=text_encoder, tokenizer=tokenizer, torch_dtype=torch.bfloat16
)
```

Loading LTX GGUF checkpoints is also supported:

```python
import torch
from diffusers.utils import export_to_video
from diffusers import LTXPipeline, LTXVideoTransformer3DModel, GGUFQuantizationConfig

ckpt_path = (
    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf"
)
transformer = LTXVideoTransformer3DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipe = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()

prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=704,
    height=480,
    num_frames=161,
    num_inference_steps=50,
).frames[0]
export_to_video(video, "output_gguf_ltx.mp4", fps=24)
```

Make sure to read the documentation on GGUF to learn more about our GGUF support.

Loading and running inference with the LTX Video 0.9.1 weights:

```python
import torch
from diffusers import LTXPipeline
from diffusers.utils import export_to_video

pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.1-diffusers", torch_dtype=torch.bfloat16)
pipe.to("cuda")

prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    decode_timestep=0.03,
    decode_noise_scale=0.025,
    num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```

Refer to this section to learn more about optimizing memory consumption.
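As a rough sketch of common memory-saving options (CPU offloading already appears in the GGUF example above; VAE tiling is an assumption and may not be available in every diffusers version):

```python
import torch
from diffusers import LTXPipeline

pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)

# Offload submodules to the CPU between forward passes instead of keeping
# the whole pipeline resident on the GPU.
pipe.enable_model_cpu_offload()

# Decode the video in tiles to reduce peak VAE memory (assumed to be supported
# by AutoencoderKLLTXVideo; verify with your installed diffusers version).
pipe.vae.enable_tiling()
```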

Quantization

Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.

Refer to the Quantization overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized LTXPipeline for inference with bitsandbytes.

```python
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, LTXVideoTransformer3DModel, LTXPipeline
from diffusers.utils import export_to_video
from transformers import BitsAndBytesConfig as BitsAndBytesConfig, T5EncoderModel

quant_config = BitsAndBytesConfig(load_in_8bit=True)
text_encoder_8bit = T5EncoderModel.from_pretrained(
    "Lightricks/LTX-Video",
    subfolder="text_encoder",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
)

quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer_8bit = LTXVideoTransformer3DModel.from_pretrained(
    "Lightricks/LTX-Video",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
)

pipeline = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    text_encoder=text_encoder_8bit,
    transformer=transformer_8bit,
    torch_dtype=torch.float16,
    device_map="balanced",
)

prompt = "A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting."
video = pipeline(prompt=prompt, num_frames=161, num_inference_steps=50).frames[0]
export_to_video(video, "ship.mp4", fps=24)
```

LTXPipeline

class diffusers.LTXPipeline

( scheduler: FlowMatchEulerDiscreteScheduler vae: AutoencoderKLLTXVideo text_encoder: T5EncoderModel tokenizer: T5TokenizerFast transformer: LTXVideoTransformer3DModel )

Pipeline for text-to-video generation.

Reference: https://github.com/Lightricks/LTX-Video

__call__

( prompt: typing.Union[str, typing.List[str]] = None negative_prompt: typing.Union[str, typing.List[str], NoneType] = None height: int = 512 width: int = 704 num_frames: int = 161 frame_rate: int = 25 num_inference_steps: int = 50 timesteps: typing.List[int] = None guidance_scale: float = 3 num_videos_per_prompt: typing.Optional[int] = 1 generator: typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None latents: typing.Optional[torch.Tensor] = None prompt_embeds: typing.Optional[torch.Tensor] = None prompt_attention_mask: typing.Optional[torch.Tensor] = None negative_prompt_embeds: typing.Optional[torch.Tensor] = None negative_prompt_attention_mask: typing.Optional[torch.Tensor] = None decode_timestep: typing.Union[float, typing.List[float]] = 0.0 decode_noise_scale: typing.Union[float, typing.List[float], NoneType] = None output_type: typing.Optional[str] = 'pil' return_dict: bool = True attention_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None callback_on_step_end: typing.Optional[typing.Callable[[int, int, typing.Dict], NoneType]] = None callback_on_step_end_tensor_inputs: typing.List[str] = ['latents'] max_sequence_length: int = 128 ) → ~pipelines.ltx.LTXPipelineOutput or tuple

Returns

~pipelines.ltx.LTXPipelineOutput or tuple

If return_dict is True, ~pipelines.ltx.LTXPipelineOutput is returned, otherwise a tuple is returned where the first element is a list with the generated images.

Function invoked when calling the pipeline for generation.

Examples:

```python
>>> import torch
>>> from diffusers import LTXPipeline
>>> from diffusers.utils import export_to_video

>>> pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
>>> pipe.to("cuda")

>>> prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
>>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

>>> video = pipe(
...     prompt=prompt,
...     negative_prompt=negative_prompt,
...     width=704,
...     height=480,
...     num_frames=161,
...     num_inference_steps=50,
... ).frames[0]
>>> export_to_video(video, "output.mp4", fps=24)
```

encode_prompt

( prompt: typing.Union[str, typing.List[str]] negative_prompt: typing.Union[str, typing.List[str], NoneType] = None do_classifier_free_guidance: bool = True num_videos_per_prompt: int = 1 prompt_embeds: typing.Optional[torch.Tensor] = None negative_prompt_embeds: typing.Optional[torch.Tensor] = None prompt_attention_mask: typing.Optional[torch.Tensor] = None negative_prompt_attention_mask: typing.Optional[torch.Tensor] = None max_sequence_length: int = 128 device: typing.Optional[torch.device] = None dtype: typing.Optional[torch.dtype] = None )

Encodes the prompt into text encoder hidden states.
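For illustration only, a sketch of pre-computing embeddings with encode_prompt and passing them back to the pipeline; the return order (embeddings and attention masks for the positive and negative prompts) is an assumption and should be checked against your diffusers version:

```python
import torch
from diffusers import LTXPipeline

pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Assumed return order: positive embeddings/mask, then negative embeddings/mask.
(
    prompt_embeds,
    prompt_attention_mask,
    negative_prompt_embeds,
    negative_prompt_attention_mask,
) = pipe.encode_prompt(
    prompt="A red fox running through a snowy forest",
    negative_prompt="worst quality, blurry, jittery, distorted",
    do_classifier_free_guidance=True,
    device="cuda",
)

# Reuse the precomputed embeddings instead of passing raw text prompts.
video = pipe(
    prompt_embeds=prompt_embeds,
    prompt_attention_mask=prompt_attention_mask,
    negative_prompt_embeds=negative_prompt_embeds,
    negative_prompt_attention_mask=negative_prompt_attention_mask,
    num_frames=161,
    num_inference_steps=50,
).frames[0]
```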

LTXImageToVideoPipeline

class diffusers.LTXImageToVideoPipeline

( scheduler: FlowMatchEulerDiscreteScheduler vae: AutoencoderKLLTXVideo text_encoder: T5EncoderModel tokenizer: T5TokenizerFast transformer: LTXVideoTransformer3DModel )

Pipeline for image-to-video generation.

Reference: https://github.com/Lightricks/LTX-Video

__call__

( image: typing.Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, typing.List[PIL.Image.Image], typing.List[numpy.ndarray], typing.List[torch.Tensor]] = None prompt: typing.Union[str, typing.List[str]] = None negative_prompt: typing.Union[str, typing.List[str], NoneType] = None height: int = 512 width: int = 704 num_frames: int = 161 frame_rate: int = 25 num_inference_steps: int = 50 timesteps: typing.List[int] = None guidance_scale: float = 3 num_videos_per_prompt: typing.Optional[int] = 1 generator: typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None latents: typing.Optional[torch.Tensor] = None prompt_embeds: typing.Optional[torch.Tensor] = None prompt_attention_mask: typing.Optional[torch.Tensor] = None negative_prompt_embeds: typing.Optional[torch.Tensor] = None negative_prompt_attention_mask: typing.Optional[torch.Tensor] = None decode_timestep: typing.Union[float, typing.List[float]] = 0.0 decode_noise_scale: typing.Union[float, typing.List[float], NoneType] = None output_type: typing.Optional[str] = 'pil' return_dict: bool = True attention_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None callback_on_step_end: typing.Optional[typing.Callable[[int, int, typing.Dict], NoneType]] = None callback_on_step_end_tensor_inputs: typing.List[str] = ['latents'] max_sequence_length: int = 128 ) → ~pipelines.ltx.LTXPipelineOutput or tuple

Returns

~pipelines.ltx.LTXPipelineOutput or tuple

If return_dict is True, ~pipelines.ltx.LTXPipelineOutput is returned, otherwise a tuple is returned where the first element is a list with the generated images.

Function invoked when calling the pipeline for generation.

Examples:

```python
>>> import torch
>>> from diffusers import LTXImageToVideoPipeline
>>> from diffusers.utils import export_to_video, load_image

>>> pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
>>> pipe.to("cuda")

>>> image = load_image(
...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
... )
>>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
>>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

>>> video = pipe(
...     image=image,
...     prompt=prompt,
...     negative_prompt=negative_prompt,
...     width=704,
...     height=480,
...     num_frames=161,
...     num_inference_steps=50,
... ).frames[0]
>>> export_to_video(video, "output.mp4", fps=24)
```

encode_prompt

( prompt: typing.Union[str, typing.List[str]] negative_prompt: typing.Union[str, typing.List[str], NoneType] = None do_classifier_free_guidance: bool = True num_videos_per_prompt: int = 1 prompt_embeds: typing.Optional[torch.Tensor] = None negative_prompt_embeds: typing.Optional[torch.Tensor] = None prompt_attention_mask: typing.Optional[torch.Tensor] = None negative_prompt_attention_mask: typing.Optional[torch.Tensor] = None max_sequence_length: int = 128 device: typing.Optional[torch.device] = None dtype: typing.Optional[torch.dtype] = None )

Encodes the prompt into text encoder hidden states.

LTXConditionPipeline

class diffusers.LTXConditionPipeline

( scheduler: FlowMatchEulerDiscreteScheduler vae: AutoencoderKLLTXVideo text_encoder: T5EncoderModel tokenizer: T5TokenizerFast transformer: LTXVideoTransformer3DModel )

Pipeline for text/image/video-to-video generation.

Reference: https://github.com/Lightricks/LTX-Video

__call__

( conditions: typing.Union[diffusers.pipelines.ltx.pipeline_ltx_condition.LTXVideoCondition, typing.List[diffusers.pipelines.ltx.pipeline_ltx_condition.LTXVideoCondition]] = None image: typing.Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, typing.List[PIL.Image.Image], typing.List[numpy.ndarray], typing.List[torch.Tensor], typing.List[typing.Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, typing.List[PIL.Image.Image], typing.List[numpy.ndarray], typing.List[torch.Tensor]]]] = None video: typing.List[typing.Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, typing.List[PIL.Image.Image], typing.List[numpy.ndarray], typing.List[torch.Tensor]]] = None frame_index: typing.Union[int, typing.List[int]] = 0 strength: typing.Union[float, typing.List[float]] = 1.0 prompt: typing.Union[str, typing.List[str]] = None negative_prompt: typing.Union[str, typing.List[str], NoneType] = None height: int = 512 width: int = 704 num_frames: int = 161 frame_rate: int = 25 num_inference_steps: int = 50 timesteps: typing.List[int] = None guidance_scale: float = 3 image_cond_noise_scale: float = 0.15 num_videos_per_prompt: typing.Optional[int] = 1 generator: typing.Union[torch._C.Generator, typing.List[torch._C.Generator], NoneType] = None latents: typing.Optional[torch.Tensor] = None prompt_embeds: typing.Optional[torch.Tensor] = None prompt_attention_mask: typing.Optional[torch.Tensor] = None negative_prompt_embeds: typing.Optional[torch.Tensor] = None negative_prompt_attention_mask: typing.Optional[torch.Tensor] = None decode_timestep: typing.Union[float, typing.List[float]] = 0.0 decode_noise_scale: typing.Union[float, typing.List[float], NoneType] = None output_type: typing.Optional[str] = 'pil' return_dict: bool = True attention_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None callback_on_step_end: typing.Optional[typing.Callable[[int, int, typing.Dict], NoneType]] = None callback_on_step_end_tensor_inputs: typing.List[str] = ['latents'] max_sequence_length: int = 256 ) → ~pipelines.ltx.LTXPipelineOutput or tuple

Returns

~pipelines.ltx.LTXPipelineOutput or tuple

If return_dict is True, ~pipelines.ltx.LTXPipelineOutput is returned, otherwise a tuple is returned where the first element is a list with the generated images.

Function invoked when calling the pipeline for generation.

Examples:

```python
>>> import torch
>>> from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXConditionPipeline, LTXVideoCondition
>>> from diffusers.utils import export_to_video, load_video, load_image

>>> pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16)
>>> pipe.to("cuda")

>>> video = load_video(
...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
... )
>>> image = load_image(
...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
... )

>>> condition1 = LTXVideoCondition(
...     image=image,
...     frame_index=0,
... )
>>> condition2 = LTXVideoCondition(
...     video=video,
...     frame_index=80,
... )

>>> prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day. And then the camera switch to a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
>>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

>>> generator = torch.Generator("cuda").manual_seed(0)

>>> video = pipe(
...     conditions=[condition1, condition2],
...     prompt=prompt,
...     negative_prompt=negative_prompt,
...     width=768,
...     height=512,
...     num_frames=161,
...     num_inference_steps=40,
...     generator=generator,
... ).frames[0]

>>> export_to_video(video, "output.mp4", fps=24)
```

add_noise_to_image_conditioning_latents

( t: float init_latents: Tensor latents: Tensor noise_scale: float conditioning_mask: Tensor generator eps = 1e-06 )

Add timestep-dependent noise to the hard-conditioning latents. This helps with motion continuity, especially when conditioned on a single frame.

encode_prompt

( prompt: typing.Union[str, typing.List[str]] negative_prompt: typing.Union[str, typing.List[str], NoneType] = None do_classifier_free_guidance: bool = True num_videos_per_prompt: int = 1 prompt_embeds: typing.Optional[torch.Tensor] = None negative_prompt_embeds: typing.Optional[torch.Tensor] = None prompt_attention_mask: typing.Optional[torch.Tensor] = None negative_prompt_attention_mask: typing.Optional[torch.Tensor] = None max_sequence_length: int = 256 device: typing.Optional[torch.device] = None dtype: typing.Optional[torch.dtype] = None )

Encodes the prompt into text encoder hidden states.

trim_conditioning_sequence

( start_frame: int sequence_num_frames: int target_num_frames: int ) → int

Returns

int

The updated sequence length.

Trim a conditioning sequence to the allowed number of frames.

LTXPipelineOutput

class diffusers.pipelines.ltx.pipeline_output.LTXPipelineOutput

( frames: Tensor )

Output class for LTX pipelines.
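A brief sketch of how the output is typically consumed, mirroring the .frames[0] access used in the examples above (the prompt here is only illustrative):

```python
import torch
from diffusers import LTXPipeline
from diffusers.utils import export_to_video

pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# With return_dict=True (the default) the call returns an LTXPipelineOutput;
# `frames` holds the generated frames for each prompt in the batch.
output = pipe(prompt="A sailboat drifting across a calm lake at sunrise", num_frames=161)
export_to_video(output.frames[0], "output.mp4", fps=24)
```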
