Introduction
This approach has the following benefits:
- From the second run onward, "ip_adapter_image_embeds" no longer needs to be computed, so generation is faster.
- From the second run onward, the "image_encoder" no longer needs to be loaded, so VRAM usage is reduced.
Setting up the Python environment
pip install torch==2.2.0+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install git+https://github.com/huggingface/diffusers
pip install accelerate transformers peft
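To confirm the environment is usable before running the scripts below, a quick sanity check such as the following can help (a minimal sketch; it only prints the installed versions and whether CUDA is visible):

import torch
import diffusers

# Quick sanity check: installed versions and CUDA availability.
print("torch:", torch.__version__)          # expect 2.2.0+cu118
print("diffusers:", diffusers.__version__)  # installed from GitHub main
print("CUDA available:", torch.cuda.is_available())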
Main topic
This post updates what was done in the following article using the new feature.
touch-sp.hatenablog.com
Running the following script saves "image_embeds.ipadpt".
import torch
from diffusers import AutoPipelineForText2Image, DDIMScheduler
from diffusers.utils import load_image
from pathlib import Path

pipeline = AutoPipelineForText2Image.from_pretrained(
    "model/modernDisneyXL_v3",
    torch_dtype=torch.float16,
    variant="fp16"
)
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)

# Load two IP-Adapters (style + face). The image encoder is only needed
# on this first run, when the image embeddings are computed.
pipeline.load_ip_adapter(
    "IP-Adapter",
    subfolder="sdxl_models",
    weight_name=[
        "ip-adapter-plus_sdxl_vit-h.safetensors",
        "ip-adapter-plus-face_sdxl_vit-h.safetensors"
    ],
    image_encoder_folder="models/image_encoder"
)
pipeline.to("cuda")

face_image = load_image("face2.png")
style_images = [load_image(x.as_posix()) for x in Path("style_ziggy").glob("*.png")]

# Precompute the IP-Adapter image embeddings and save them for reuse.
image_embeds = pipeline.prepare_ip_adapter_image_embeds(
    ip_adapter_image=[style_images, face_image],
    ip_adapter_image_embeds=None,
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True
)

torch.save(image_embeds, "image_embeds.ipadpt")
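If you want to check what was actually saved, the file can be loaded and inspected. A minimal sketch (my understanding is that the saved object is a list with one tensor per loaded IP-Adapter; with classifier-free guidance enabled, negative and positive embeddings are held together in each tensor):

import torch

# Load the cached embeddings and print their shapes.
# (Assumption: the file holds a list with one tensor per IP-Adapter.)
image_embeds = torch.load("image_embeds.ipadpt")
for i, emb in enumerate(image_embeds):
    print(f"adapter {i}: shape={tuple(emb.shape)}, dtype={emb.dtype}")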
From the second run onward, this is all you need.
import torch
from diffusers import AutoPipelineForText2Image, DDIMScheduler

pipeline = AutoPipelineForText2Image.from_pretrained(
    "model/modernDisneyXL_v3",
    torch_dtype=torch.float16,
    variant="fp16"
)
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)

# image_encoder_folder=None: the image encoder is no longer needed,
# because the precomputed embeddings are loaded from disk instead.
pipeline.load_ip_adapter(
    "IP-Adapter",
    subfolder="sdxl_models",
    weight_name=[
        "ip-adapter-plus_sdxl_vit-h.safetensors",
        "ip-adapter-plus-face_sdxl_vit-h.safetensors"
    ],
    image_encoder_folder=None
)
pipeline.set_ip_adapter_scale([0.7, 0.8])
pipeline.to("cuda")

# Reuse the embeddings computed by the first script.
image_embeds_fromfile = torch.load("image_embeds.ipadpt")

generator = torch.Generator(device="cpu").manual_seed(2024)
image = pipeline(
    prompt="a woman",
    ip_adapter_image_embeds=image_embeds_fromfile,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=50,
    num_images_per_prompt=1,
    generator=generator,
).images[0]
image.save("result_from_image_embeds.png")
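If the embeddings were saved on one machine and reloaded on another, it may be safer to load them to CPU first and then move each tensor to the pipeline's device explicitly. A minimal sketch (load_cached_embeds is a hypothetical helper, not part of diffusers):

import torch

def load_cached_embeds(path, device="cuda"):
    # Hypothetical helper: load to CPU first, then move each tensor
    # to the target device (the file holds a list of tensors).
    embeds = torch.load(path, map_location="cpu")
    return [e.to(device) for e in embeds]

image_embeds_fromfile = load_cached_embeds("image_embeds.ipadpt", device="cuda")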