【Diffusers】Making a Video with AnimateDiff + IP-Adapter

Creating an Image for IP-Adapter

First, I generated an image of a woman using a model called yabalMixTrue25D.


I made it without much thought, but I was surprised at how flawlessly the hands were rendered.

from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler, AutoencoderKL

# Load the base model with the "lpw_stable_diffusion" community pipeline
# (weighted prompts, no 77-token limit) and the fixed ft-MSE VAE
pipe = DiffusionPipeline.from_pretrained(
    "model/yabalMixTrue25Dv5_ema",
    custom_pipeline="lpw_stable_diffusion",
    vae=AutoencoderKL.from_single_file("vae/vae-ft-mse-840000-ema-pruned.safetensors"),
    safety_checker=None
)
# DPM++ SDE sampler with Karras sigmas
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True
)
pipe.to("cuda")

prompt = "(masterpiece:1.2), (absurdres:1.2), (best quality:1.2), (looking at viewer), shiny skin, pantyhose, cowboy shot, skirt, outdoors, day, countryside, hand on hip, (white|black theme), medium breasts, happy face"
neg_prompt = "(worst quality), (low quality), (bad quality), (bad anatomy), (hat:1.2), (cap:1.2), greyscale"

image = pipe.text2img(
    prompt=prompt,
    negative_prompt=neg_prompt,
    width=512,
    height=768,
    max_embeddings_multiples=3  # allow up to 3 chunks of 77 CLIP tokens
).images[0]

image.save("1girl.png")
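
Incidentally, the reason for the lpw_stable_diffusion community pipeline is that it handles (token:weight) emphasis syntax and prompts longer than CLIP's 77-token window; max_embeddings_multiples=3 allows up to three such windows. If you want to see how many tokens a prompt actually consumes, a quick check against the pipeline's own tokenizer (a minimal sketch, reusing pipe and prompt from above) looks like this:

# Minimal sketch: count the CLIP tokens in the prompt
# (one CLIP window is 77 tokens, including the BOS/EOS tokens)
n_tokens = len(pipe.tokenizer(prompt).input_ids)
print(f"prompt uses {n_tokens} tokens")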

Running AnimateDiff

For some reason, the results were better when I used LCM-LoRA.

Since LCM-LoRA lets the model sample in just a handful of steps, generation time drops as well, killing two birds with one stone.

import torch
from diffusers import MotionAdapter, AnimateDiffPipeline, AutoencoderKL, LCMScheduler
from diffusers.utils import export_to_gif, load_image

lcm_weight = 1.0        # LCM-LoRA adapter weight
ipadapter_scale = 0.6   # how strongly the reference image steers generation
seed = 10000000

# AnimateDiff motion module
adapter = MotionAdapter.from_pretrained("animatediff-motion-adapter-v1-5-2")

model_id = "model/yabalMixTrue25Dv5_ema"
pipe = AnimateDiffPipeline.from_pretrained(
    model_id,
    motion_adapter=adapter,
    vae=AutoencoderKL.from_single_file("vae/vae-ft-mse-840000-ema-pruned.safetensors")
)
# LCMScheduler is required when sampling with LCM-LoRA
pipe.scheduler = LCMScheduler.from_config(
    pipe.scheduler.config,
    beta_schedule="linear"
)
pipe.load_lora_weights("lora/lcm-lora-sdv1-5", adapter_name="lcm")
pipe.set_adapters(["lcm"], adapter_weights=[lcm_weight])

# IP-Adapter Plus for SD 1.5: conditions generation on a reference image
pipe.load_ip_adapter(
    "IP-Adapter",
    subfolder="models",
    weight_name="ip-adapter-plus_sd15.safetensors"
)

pipe.set_ip_adapter_scale(ipadapter_scale)

ip_image = load_image("1girl.png")

# enable memory savings
pipe.enable_vae_slicing()
pipe.enable_model_cpu_offload()

# With LCM, a few steps and guidance_scale=1.0 are enough; note that at
# guidance_scale=1.0 CFG is disabled, so the negative prompt has no effect
output = pipe(
    prompt="a girl, walking, best quality, extremely detailed",
    negative_prompt="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
    ip_adapter_image=ip_image,
    num_frames=16,
    guidance_scale=1.0,
    num_inference_steps=5,
    generator=torch.manual_seed(seed),
    width=512,
    height=768
)
frames = output.frames[0]
export_to_gif(frames, f"adapter{ipadapter_scale}_lcm{lcm_weight}.gif")
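
A 16-frame GIF can get fairly large; if you would rather have an mp4, diffusers also ships export_to_video (a minimal sketch, assuming a diffusers version that includes it and opencv-python installed):

from diffusers.utils import export_to_video

# Same frames as above, written out as an mp4 instead of a gif
export_to_video(frames, f"adapter{ipadapter_scale}_lcm{lcm_weight}.mp4", fps=8)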

Results


She doesn't look like the same person, but I feel the video has roughly inherited her features.
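
If you want the subject to look more like the reference image, the knob to turn is the IP-Adapter scale. A simple sweep with a fixed seed (a sketch reusing the pipeline and variables from above; higher scales follow the reference more closely but tend to suppress motion) makes the trade-off easy to compare:

# Sketch: sweep the IP-Adapter scale and export one gif per setting
for scale in [0.4, 0.6, 0.8, 1.0]:
    pipe.set_ip_adapter_scale(scale)
    out = pipe(
        prompt="a girl, walking, best quality, extremely detailed",
        ip_adapter_image=ip_image,
        num_frames=16,
        guidance_scale=1.0,
        num_inference_steps=5,
        generator=torch.manual_seed(seed),  # fixed seed for a fair comparison
        width=512,
        height=768
    )
    export_to_gif(out.frames[0], f"adapter{scale}_lcm{lcm_weight}.gif")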


