OpenMMLab の MMagic で AnimateDiff が使えるようになったので試してみました

環境

Windows 11
CUDA 11.7
Python 3.11.5

Python環境構築

pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 --index-url https://download.pytorch.org/whl/cu117
pip install mmcv==2.0.1 -f https://download.openmmlab.com/mmcv/dist/cu117/torch2.0.0/index.html
pip install openmim==0.3.9
pip install mmagic==1.1.0
pip install accelerate==0.23.0
pip install albumentations==1.3.1
pip install xformers==0.0.21

上記のインストールが失敗する場合は、以下を実行してから再度インストールしてみて下さい。

pip install -U setuptools wheel

モデルのダウンロード

以下のようなファイル構造になるようにします。

models
├─DreamBooth_LoRA
│      toonyou_beta6.safetensors
│
├─Motion_Module
│      mm_sd_v15_v2.ckpt
│
└─StableDiffusion
    └─stable-diffusion-v1-5

Configファイルの準備

「animatediff_config.py」というファイル名にしています。

# AnimateDiff inference config, loaded by the run script via
# Config.fromfile("animatediff_config.py").
# All paths are relative, so they resolve against the working directory.

# NOTE: despite the `_url` suffix, this is a local directory path.
stable_diffusion_v15_url = "./models/StableDiffusion/stable-diffusion-v1-5"
models_path = './models/'

# Scheduler settings; the same dict is used below for both `scheduler`
# and `test_scheduler`.
diffusion_scheduler = {
    'type': 'DDIMScheduler',
    'beta_end': 0.012,
    'beta_schedule': 'linear',
    'beta_start': 0.00085,
    'num_train_timesteps': 1000,
    'prediction_type': 'epsilon',
    'set_alpha_to_one': True,
    'clip_sample': False,
    'thresholding': False,
    'steps_offset': 1,
}

model = {
    'type': 'AnimateDiff',
    # VAE, UNet, text encoder and tokenizer all point at the same
    # Stable Diffusion v1.5 directory (different subfolders).
    'vae': {
        'type': 'AutoencoderKL',
        'from_pretrained': stable_diffusion_v15_url,
        'subfolder': 'vae',
    },
    'unet': {
        'type': 'UNet3DConditionMotionModel',
        'unet_use_cross_frame_attention': False,
        'unet_use_temporal_attention': False,
        'use_motion_module': True,
        'motion_module_resolutions': [1, 2, 4, 8],
        'motion_module_mid_block': True,
        'motion_module_decoder_only': False,
        'motion_module_type': 'Vanilla',
        'motion_module_kwargs': {
            'num_attention_heads': 8,
            'num_transformer_block': 1,
            'attention_block_types': ['Temporal_Self', 'Temporal_Self'],
            'temporal_position_encoding': True,
            'temporal_position_encoding_max_len': 32,
            'temporal_attention_dim_div': 1,
        },
        'subfolder': 'unet',
        'from_pretrained': stable_diffusion_v15_url,
    },
    'text_encoder': {
        'type': 'ClipWrapper',
        'clip_type': 'huggingface',
        'pretrained_model_name_or_path': stable_diffusion_v15_url,
        'subfolder': 'text_encoder',
    },
    'tokenizer': stable_diffusion_v15_url,
    'scheduler': diffusion_scheduler,
    'test_scheduler': diffusion_scheduler,
    'data_preprocessor': {'type': 'DataPreprocessor'},
    # Motion module checkpoint, located under `models_path`.
    'motion_module_cfg': {
        'path': f'{models_path}Motion_Module/mm_sd_v15_v2.ckpt',
    },
    # DreamBooth/LoRA weights; the run script also uses `type` to name
    # the output directory.
    'dream_booth_lora_cfg': {
        'type': 'toonyou',
        'path': f'{models_path}DreamBooth_LoRA/toonyou_beta6.safetensors',
    },
}

実行

from mmengine import Config
from mmagic.registry import MODELS
from mmagic.utils import register_all_modules
import os
import datetime
from mmagic.models.editors.animatediff import save_videos_grid

# Generation inputs: the three lists are zipped together, so entry i of
# each list describes clip i (same prompt/negative prompt, different seed).
prompts = ["a girl, dancing, blue denim, white plain t-shirt, best quality, extremely detailed"] * 5
n_prompt = ["longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"] * 5
random_seeds = [1000000000, 1500000000, 2000000000, 2500000000, 3000000000]

# Must run before MODELS.build so the `type` names in the config resolve.
register_all_modules()

# Build the model described in the config file and move it to the GPU.
cfg = Config.fromfile("animatediff_config.py")
animatediff = MODELS.build(cfg.model).cuda()

# Output directory: samples/<lora type>-<timestamp>.
time_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
savedir = f"samples/{cfg.model['dream_booth_lora_cfg']['type']}-{time_str}"
# exist_ok=True: the timestamp has one-second resolution, so a quick re-run
# could otherwise fail with FileExistsError.
os.makedirs(savedir, exist_ok=True)

# The loop variable is named `negative_prompt` (not `n_prompt`) so that it
# does not rebind the module-level `n_prompt` list being iterated.
for prompt_idx, (prompt, negative_prompt, random_seed) in enumerate(
        zip(prompts, n_prompt, random_seeds)):
    output_dict = animatediff.infer(
        prompt=prompt,
        negative_prompt=negative_prompt,
        video_length=16,
        height=512, width=512,
        seed=random_seed,
        num_inference_steps=40,
        guidance_scale=8.5
        )
    sample = output_dict['samples']
    # One GIF per prompt/seed combination, named by its index.
    save_videos_grid(sample, f"{savedir}/{prompt_idx}.gif")