Trying out "AIdeaLab VideoJP", AIdeaLab's AI that generates videos from Japanese prompts

Introduction

I used the model introduced in the following article.
note.com

PC used

OS		Windows 11
Processor	Intel(R) Core(TM) i7-12700H
Installed RAM	32.0 GB
GPU		RTX 3080 Laptop (VRAM 16GB)
CUDA 11.8
Python 3.12

Setting up the Python environment

pip install torch==2.5.1+cu118 torchvision==0.20.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install diffusers[torch]
pip install transformers imageio imageio-ffmpeg

The resulting package versions:

diffusers==0.32.2
imageio==2.36.1
imageio-ffmpeg==0.5.1
torch==2.5.1+cu118
torchvision==0.20.1+cu118
transformers==4.48.0

Python script

from diffusers.utils import export_to_video
import tqdm
from torchvision.transforms import ToPILImage
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from diffusers import CogVideoXTransformer3DModel, AutoencoderKLCogVideoX
from decorator import gpu_monitor, time_monitor
import gc

def flush():
    # Free GPU memory between pipeline stages (text encoder -> transformer -> VAE).
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

@gpu_monitor(interval=0.5)
@time_monitor
def main():
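    # Prompt (rough English translation): "Tulips, rapeseed blossoms, and flowers of
    # every color fill an endlessly stretching field, coloring it like a patchwork
    # quilt. Soft morning light shines through the petals, bringing out pale gradations.
    # Flowers swaying in the wind are captured in slow motion, petals dancing gracefully,
    # shot with cinematic direction. A distant mountain range, blue sky, and drifting
    # white clouds in the background add a sense of depth."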
    prompt="チューリップや菜の花、色とりどりの花が果てしなく続く畑を埋め尽くし、まるでパッチワークのようにカラフルに彩る。朝の柔らかな光が花びらを透かし、淡いグラデーションが映える。風に揺れる花々をスローモーションで捉え、花びらが優雅に舞う姿を映画のような演出で撮影。背景には遠くに連なる山並みや青い空、浮かぶ白い雲が立体感を引き立てる。"
    frames = 48
    width = 256
    height = 256
    device="cuda"
    shape=(1, frames//4, 16, height//8, width//8)  # latent shape: [batch, frames//4, 16 channels, height//8, width//8]
    sample_N=25
    torch_dtype=torch.bfloat16
    eps=1
    cfg=2.5

    tokenizer = AutoTokenizer.from_pretrained(
        "llm-jp/llm-jp-3-1.8b"
    )

    text_encoder = AutoModelForCausalLM.from_pretrained(
        "llm-jp/llm-jp-3-1.8b",
        torch_dtype=torch_dtype
    ).to(device)

    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=512,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    prompt_embeds = text_encoder(
        text_input_ids.to(device),
        output_hidden_states=True,
        attention_mask=text_inputs.attention_mask.to(device)
    ).hidden_states[-1]

    null_text_inputs = tokenizer(
        "",
        padding="max_length",
        max_length=512,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    null_text_input_ids = null_text_inputs.input_ids
    null_prompt_embeds = text_encoder(
        null_text_input_ids.to(device),
        output_hidden_states=True,
        attention_mask=null_text_inputs.attention_mask.to(device)
    ).hidden_states[-1]

    print("text_encoder:")
    print(f"torch.cuda.max_memory_allocated: {torch.cuda.max_memory_allocated()/ 1024**3:.2f} GB")

    del text_encoder
    flush()

    transformer = CogVideoXTransformer3DModel.from_pretrained(
        "aidealab/AIdeaLab-VideoJP",
        torch_dtype=torch_dtype
    ).to(device)

    # Euler discrete sampler with classifier-free guidance (CFG)
    z0 = torch.randn(shape, device=device)
    latents = z0.detach().clone().to(torch_dtype)

    dt = 1.0 / sample_N
    with torch.no_grad():
        for i in tqdm.tqdm(range(sample_N)):
            num_t = i / sample_N
            t = torch.ones(shape[0], device=device) * num_t
            pseudo_t = (1000 - eps) * (1 - t) + eps
            positive_conditional = transformer(
                hidden_states=latents,
                timestep=pseudo_t,
                encoder_hidden_states=prompt_embeds,
                image_rotary_emb=None
            )
            null_conditional = transformer(
                hidden_states=latents,
                timestep=pseudo_t,
                encoder_hidden_states=null_prompt_embeds,
                image_rotary_emb=None
            )
            # Classifier-free guidance: extrapolate from the unconditional prediction toward the conditional one
            pred = null_conditional.sample + cfg * (positive_conditional.sample - null_conditional.sample)
            latents = latents.detach().clone() + dt * pred.detach().clone()

    print("transformer:")
    print(f"torch.cuda.max_memory_allocated: {torch.cuda.max_memory_allocated()/ 1024**3:.2f} GB")

    del transformer
    flush()

    vae = AutoencoderKLCogVideoX.from_pretrained(
        "THUDM/CogVideoX-2b",
        subfolder="vae",
        torch_dtype=torch_dtype
    ).to(device)

    # Slicing and tiling reduce peak VRAM usage during VAE decoding.
    vae.enable_slicing()
    vae.enable_tiling()

    with torch.no_grad():
        latents = latents / vae.config.scaling_factor
        latents = latents.permute(0, 2, 1, 3, 4)  # [B, F, C, H, W] -> [B, C, F, H, W] for the VAE
        x = vae.decode(latents).sample
        x = x / 2 + 0.5
        x = x.clamp(0,1)
        x = x.permute(0, 2, 1, 3, 4).to(torch.float32)  # [B, C, F, H, W] -> [B, F, C, H, W]
        x = [ToPILImage()(frame) for frame in x[0]]

    export_to_video(x, "output.mp4", fps=12)

    print("vae:")
    print(f"torch.cuda.max_memory_allocated: {torch.cuda.max_memory_allocated()/ 1024**3:.2f} GB")

if __name__ == "__main__":
    main()
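Running the script produces output.mp4, a 4-second clip (48 frames at 12 fps) at a resolution of 256x256.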

Results

text_encoder:
torch.cuda.max_memory_allocated: 6.23 GB

transformer:
torch.cuda.max_memory_allocated: 8.98 GB

vae:
torch.cuda.max_memory_allocated: 8.62 GB

time: 36.94 sec
GPU 0 - Used memory: 10.42/16.00 GB

The generated video is posted on my Google Blogger site.
support-touchsp.blogspot.com

Other notes

The benchmark measurements were taken with the script described here.
touch-sp.hatenablog.com
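
The gpu_monitor and time_monitor decorators imported at the top of the script come from that local decorator module, which is not reproduced in this post. As a rough, hypothetical stand-in (assuming the pynvml bindings, installable as nvidia-ml-py, are available), a minimal decorator.py along the following lines prints comparable timing and GPU-memory figures; the actual module in the linked article may differ.

# decorator.py -- hypothetical minimal stand-in for the monitoring decorators
# used above; assumes the pynvml bindings (pip install nvidia-ml-py) are installed.
import functools
import threading
import time

import pynvml


def time_monitor(func):
    # Print how long the wrapped function took to run.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            print(f"time: {time.perf_counter() - start:.2f} sec")
    return wrapper


def gpu_monitor(interval=0.5):
    # Sample GPU 0 memory usage every `interval` seconds and print the peak.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            total = pynvml.nvmlDeviceGetMemoryInfo(handle).total
            peak = 0
            stop = threading.Event()

            def poll():
                nonlocal peak
                while not stop.is_set():
                    peak = max(peak, pynvml.nvmlDeviceGetMemoryInfo(handle).used)
                    time.sleep(interval)

            thread = threading.Thread(target=poll, daemon=True)
            thread.start()
            try:
                return func(*args, **kwargs)
            finally:
                stop.set()
                thread.join()
                print(f"GPU 0 - Used memory: {peak / 1024**3:.2f}/{total / 1024**3:.2f} GB")
                pynvml.nvmlShutdown()
        return wrapper
    return decorator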