はじめに
こちらを使わせて頂きました。note.com
使用したPC
OS Windows 11 プロセッサ Intel(R) Core(TM) i7-12700H 実装 RAM 32.0 GB GPU RTX 3080 Laptop (VRAM 16GB)
CUDA 11.8 Python 3.12
Python環境構築
pip install torch==2.5.1+cu118 torchvision==0.20.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install diffusers[torch]
pip install transformers imageio imageio-ffmpeg
diffusers==0.32.2 imageio==2.36.1 imageio-ffmpeg==0.5.1 torch==2.5.1+cu118 torchvision==0.20.1+cu118 transformers==4.48.0
Pythonスクリプト
import gc

import torch
import tqdm
from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from diffusers.utils import export_to_video
from torchvision.transforms import ToPILImage
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE(review): `decorator` is presumably the author's own benchmark helper
# module (not shown here) rather than a published package -- confirm.
from decorator import gpu_monitor, time_monitor


def flush():
    """Collect garbage, empty the CUDA cache and reset the CUDA peak-memory stats."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


def _report_peak_memory(stage):
    """Print the stage name followed by the peak CUDA allocation in GB."""
    print(f"{stage}:")
    print(f"torch.cuda.max_memory_allocated: {torch.cuda.max_memory_allocated()/ 1024**3:.2f} GB")


def _embed_text(tokenizer, text_encoder, text, device):
    """Tokenize *text* (padded/truncated to 512 tokens) and return the last hidden state."""
    tokens = tokenizer(
        text,
        padding="max_length",
        max_length=512,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    outputs = text_encoder(
        tokens.input_ids.to(device),
        output_hidden_states=True,
        attention_mask=tokens.attention_mask.to(device),
    )
    return outputs.hidden_states[-1]


@gpu_monitor(interval=0.5)
@time_monitor
def main():
    """Generate a short video from a Japanese prompt and save it as ``output.mp4``.

    The three models are loaded one after another (text encoder, video
    transformer, VAE). After each stage the peak CUDA allocation is printed;
    after the first two the model is deleted and ``flush()`` is called before
    the next model is loaded.
    """
    prompt = "チューリップや菜の花、色とりどりの花が果てしなく続く畑を埋め尽くし、まるでパッチワークのようにカラフルに彩る。朝の柔らかな光が花びらを透かし、淡いグラデーションが映える。風に揺れる花々をスローモーションで捉え、花びらが優雅に舞う姿を映画のような演出で撮影。背景には遠くに連なる山並みや青い空、浮かぶ白い雲が立体感を引き立てる。"
    frames = 48
    width = 256
    height = 256
    device = "cuda"
    # Latent layout: (batch, frames // 4, 16, height // 8, width // 8).
    latent_shape = (1, frames // 4, 16, height // 8, width // 8)
    num_steps = 25
    dtype = torch.bfloat16
    eps = 1
    guidance_scale = 2.5

    # ---- Stage 1: text encoder -------------------------------------------
    tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-3-1.8b")
    text_encoder = AutoModelForCausalLM.from_pretrained(
        "llm-jp/llm-jp-3-1.8b",
        torch_dtype=dtype,
    ).to(device)

    # NOTE(review): unlike the sampling and decoding stages below, these two
    # forward passes are not wrapped in torch.no_grad(). If autograd state is
    # retained here it may inflate the memory figures -- confirm before
    # changing, because doing so would alter the reported benchmark numbers.
    prompt_embeds = _embed_text(tokenizer, text_encoder, prompt, device)
    # The empty prompt provides the unconditional branch for guidance.
    null_prompt_embeds = _embed_text(tokenizer, text_encoder, "", device)

    _report_peak_memory("text_encoder")
    del text_encoder
    flush()

    # ---- Stage 2: transformer --------------------------------------------
    transformer = CogVideoXTransformer3DModel.from_pretrained(
        "aidealab/AIdeaLab-VideoJP",
        torch_dtype=dtype,
    ).to(device)

    # Euler discrete sampler with classifier-free guidance.
    noise = torch.randn(latent_shape, device=device)
    latents = noise.detach().clone().to(dtype)
    step_size = 1.0 / num_steps
    with torch.no_grad():
        for step in tqdm.tqdm(range(num_steps)):
            num_t = step / num_steps
            t = torch.ones(latent_shape[0], device=device) * num_t
            # Maps t in [0, 1) to a timestep running from 1000 down towards eps.
            pseudo_t = (1000 - eps) * (1 - t) + eps

            # Conditional prediction first, then the unconditional one.
            positive, negative = [
                transformer(
                    hidden_states=latents,
                    timestep=pseudo_t,
                    encoder_hidden_states=embeds,
                    image_rotary_emb=None,
                ).sample
                for embeds in (prompt_embeds, null_prompt_embeds)
            ]

            pred = negative + guidance_scale * (positive - negative)
            latents = latents.detach().clone() + step_size * pred.detach().clone()

    _report_peak_memory("transformer")
    del transformer
    flush()

    # ---- Stage 3: VAE decode ---------------------------------------------
    vae = AutoencoderKLCogVideoX.from_pretrained(
        "THUDM/CogVideoX-2b",
        subfolder="vae",
        torch_dtype=dtype,
    ).to(device)
    vae.enable_slicing()
    vae.enable_tiling()

    with torch.no_grad():
        latents = latents / vae.config.scaling_factor
        # Swap axes 1 and 2: (1, frames // 4, 16, ...) -> (1, 16, frames // 4, ...).
        latents = latents.permute(0, 2, 1, 3, 4)
        video = vae.decode(latents).sample
        # Rescale with x / 2 + 0.5 and clamp to [0, 1].
        video = video / 2 + 0.5
        video = video.clamp(0, 1)
        # Swap axes 1 and 2 back; presumably [B, F, C, H, W], so that
        # iterating video[0] yields one image tensor per frame -- confirm.
        video = video.permute(0, 2, 1, 3, 4).to(torch.float32)

    pil_frames = [ToPILImage()(frame) for frame in video[0]]
    export_to_video(pil_frames, "output.mp4", fps=12)

    _report_peak_memory("vae")


if __name__ == "__main__":
    main()
結果
text_encoder:
torch.cuda.max_memory_allocated: 6.23 GB
transformer:
torch.cuda.max_memory_allocated: 8.98 GB
vae:
torch.cuda.max_memory_allocated: 8.62 GB
time: 36.94 sec
GPU 0 - Used memory: 10.42/16.00 GB
作成された動画はGoogle Bloggerに載せています。
support-touchsp.blogspot.com
その他
ベンチマークはこちらで記述したスクリプトで行いました。touch-sp.hatenablog.com