【Diffusers】Making a Video with AnimateDiff + Multi-ControlNet + IP-Adapter

Goal

The goal is to make an anime-style video of this woman walking.




The photo of the woman was generated in this earlier post:
touch-sp.hatenablog.com

Walking video

For the walking motion, I used this video from Pixabay.


Results

Without LCM-LoRA

The face does seem to resemble the reference, more or less.




The clothing on the right arm, however, came out a bit strange.

Too bad!!

Method

All settings are written to a YAML file, which the Python script then reads.

YAML file

pretrained_model_path: "model/yabalMixTrue25Dv5_ema"
vae: 
  single_file: True 
  model_path: "vae/vae-ft-mse-840000-ema-pruned.safetensors"

motion_module_path: "animatediff-motion-adapter-v1-5-2"

controlnet:
  - model_path: "controlnet/control_v11p_sd15_openpose"
    image_path: "controlnet-image/openpose.gif"
    conditioning_scale: 0.5
  - model_path: "controlnet/control_v11f1e_sd15_tile"
    image_path: "controlnet-image/tile.gif"
    conditioning_scale: 0.4

prompt: "a girl, walking, anime style, best quality, extremely detailed"
negative_prompt: "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"

seed: 222
steps: 30
guidance_scale: 7.5
width: 512
height: 768
clip_skip: None

ip_adapter:
  enable: True
  folder: "IP-Adapter"
  subfolder: "models"
  weight_name: "ip-adapter-plus-face_sd15.safetensors"
  image_path: "face.png"
  scale: 0.6

lcm_lora:
  enable: False
  model_path: "lora/lcm-lora-sdv1-5"
  weight: 1.0
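
One note on the clip_skip: None line: YAML has no None keyword, so yaml.safe_load returns the string "None" rather than a real null. The script below therefore only passes clip_skip on when the value is an integer. A quick way to confirm this (assuming PyYAML is installed):

import yaml

print(yaml.safe_load("clip_skip: None"))  # {'clip_skip': 'None'} -- a string, not Python None
print(yaml.safe_load("clip_skip: null"))  # {'clip_skip': None}   -- an actual null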

Python script

import torch
from diffusers import DiffusionPipeline, AutoencoderKL, ControlNetModel, MotionAdapter
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
from PIL import Image
import os
import argparse
import yaml
import datetime
import shutil

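# Split an animated GIF into a list of PIL Image frames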
def gif2images(gif_filename):
    gif = Image.open(gif_filename)
    frames = []
    for i in range(gif.n_frames):
        gif.seek(i)
        frames.append(gif.copy())
    return frames

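# Read the path to the YAML config file from the command line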
parser = argparse.ArgumentParser()
parser.add_argument(
    '--config',
    type=str,
    required=True,
    help="path to yaml file"
)
args = parser.parse_args()

with open(args.config, "r") as f:
    config_dict = yaml.load(f, Loader=yaml.SafeLoader)

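# Create a timestamped output directory and save a copy of the config there for reproducibility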
time_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
os.makedirs(os.path.join("outputs", time_str), exist_ok=False)
shutil.copyfile(args.config, os.path.join("outputs",time_str, "config.yaml"))

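# Load the AnimateDiff motion adapter (motion module)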
adapter = MotionAdapter.from_pretrained(
    config_dict["motion_module_path"]
)

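# Build a MultiControlNetModel from every ControlNet entry in the config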
controlnet_list = config_dict["controlnet"]

controlnet = MultiControlNetModel(
    [ 
        ControlNetModel.from_pretrained(
            x["model_path"],
            torch_dtype=torch.float16
        ) 
        for x in controlnet_list
    ]
)

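# Load the conditioning frames for each ControlNet and cap the clip length at 32 frames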
controlimage = [gif2images(x["image_path"]) for x in controlnet_list]
n_frames = min(32, min(len(x) for x in controlimage))
controlimage = [x[0:n_frames] for x in controlimage]

controlnet_conditioning_scale = [x["conditioning_scale"] for x in controlnet_list]

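# Load the VAE either from a single safetensors file or from a diffusers directory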
if config_dict["vae"]["single_file"]:
    vae = AutoencoderKL.from_single_file(
        config_dict["vae"]["model_path"],
        torch_dtype=torch.float16
    )
else:
    vae = AutoencoderKL.from_pretrained(
        config_dict["vae"]["model_path"],
        torch_dtype=torch.float16
    )

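# Assemble the pipeline using the AnimateDiff + ControlNet custom pipeline file placed locally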
model_id = config_dict["pretrained_model_path"]
pipe = DiffusionPipeline.from_pretrained(
    model_id,
    motion_adapter=adapter,
    controlnet=controlnet,
    vae=vae,
    custom_pipeline="custom-pipeline/pipeline_animatediff_controlnet.py",
    torch_dtype=torch.float16
).to("cuda")

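# Optionally load the IP-Adapter face weights and set their influence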
use_ipadapter = config_dict["ip_adapter"]["enable"]
if use_ipadapter:
    pipe.load_ip_adapter(
        config_dict["ip_adapter"]["folder"],
        subfolder=config_dict["ip_adapter"]["subfolder"],
        weight_name=config_dict["ip_adapter"]["weight_name"],
        torch_dtype=torch.float16
    )
    pipe.set_ip_adapter_scale(config_dict["ip_adapter"]["scale"])
    
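# LCM-LoRA enabled: switch to LCMScheduler and load the LoRA; otherwise use DDIM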
use_lcmlora = config_dict["lcm_lora"]["enable"]
if use_lcmlora:
    from diffusers import LCMScheduler
    pipe.scheduler = LCMScheduler.from_config(
        pipe.scheduler.config,
        beta_schedule="linear"
    )
    pipe.load_lora_weights(config_dict["lcm_lora"]["model_path"], adapter_name="lcm")
    pipe.set_adapters(["lcm"], adapter_weights=[config_dict["lcm_lora"]["weight"]])
else:
    from diffusers import DDIMScheduler
    pipe.scheduler = DDIMScheduler.from_config(
        pipe.scheduler.config,
        beta_schedule="linear",
    )

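# Decode the latents in slices to reduce VRAM usage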
pipe.enable_vae_slicing()

prompt = config_dict["prompt"]
negative_prompt = config_dict["negative_prompt"]
seed = config_dict["seed"]
steps = config_dict["steps"]
guidance_scale = 1.0 if use_lcmlora else config_dict["guidance_scale"]  # CFG is effectively disabled when LCM-LoRA is used
width = config_dict["width"]
height = config_dict["height"]
clip_skip = config_dict["clip_skip"] if isinstance(config_dict["clip_skip"], int) else None  # "None" in the YAML arrives as a string, so only pass an integer through

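# Run inference, passing the IP-Adapter reference image only when it is enabled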
if use_ipadapter:
    ip_image = Image.open(config_dict["ip_adapter"]["image_path"])
    result = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        ip_adapter_image=ip_image,
        num_frames=n_frames,
        width=width,
        height=height,
        conditioning_frames=controlimage,
        num_inference_steps=steps,
        guidance_scale=guidance_scale,
        generator=torch.manual_seed(seed),
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        clip_skip=clip_skip
    ).frames[0]
else:
    result = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_frames=n_frames,
        width=width,
        height=height,
        conditioning_frames=controlimage,
        num_inference_steps=steps,
        guidance_scale=guidance_scale,
        generator=torch.manual_seed(seed),
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        clip_skip=clip_skip
    ).frames[0]

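# Save the generated frames as an animated GIF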
from diffusers.utils import export_to_gif
export_to_gif(result, os.path.join("outputs", time_str, "result.gif"))
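
To run it, assuming the script above is saved as main.py (the name is arbitrary) and the configuration as config.yaml:

python main.py --config config.yaml

The generated result.gif and a copy of the config end up in outputs/<timestamp>/.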

With LCM-LoRA

When lcm_lora.enable is set to True, the script switches to the LCM scheduler and forces guidance_scale to 1.0, so only 5 inference steps are needed. Compared with the YAML file above, steps drops from 30 to 5 and the tile ControlNet's conditioning_scale from 0.4 to 0.30; everything else is unchanged.

YAML file

pretrained_model_path: "model/yabalMixTrue25Dv5_ema"
vae: 
  single_file: True 
  model_path: "vae/vae-ft-mse-840000-ema-pruned.safetensors"

motion_module_path: "animatediff-motion-adapter-v1-5-2"

controlnet:
  - model_path: "controlnet/control_v11p_sd15_openpose"
    image_path: "controlnet-image/openpose.gif"
    conditioning_scale: 0.5
  - model_path: "controlnet/control_v11f1e_sd15_tile"
    image_path: "controlnet-image/tile.gif"
    conditioning_scale: 0.30

prompt: "a girl, walking, anime style, best quality, extremely detailed"
negative_prompt: "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"

seed: 222
steps: 5
guidance_scale: 7.5
width: 512
height: 768
clip_skip: None

ip_adapter:
  enable: True
  folder: "IP-Adapter"
  subfolder: "models"
  weight_name: "ip-adapter-plus-face_sd15.safetensors"
  image_path: "face.png"
  scale: 0.6

lcm_lora:
  enable: True
  model_path: "lora/lcm-lora-sdv1-5"
  weight: 1.0




