「IP-Adapter-FaceID-Plus」と「IP-Adapter-FaceID-PlusV2」がDiffusersから使えるようになりました。

結果

左側の写真から右側の写真を作成しました。

PC環境

Windows 11
CUDA 11.8
Python 3.11

Python環境構築

pip install torch==2.2.2+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install git+https://github.com/huggingface/diffusers
pip install accelerate transformers peft
pip install onnxruntime-gpu insightface

Pythonスクリプト

import cv2
import numpy as np
import torch
from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler
from diffusers.utils import load_image
from insightface.app import FaceAnalysis
from transformers import CLIPVisionModelWithProjection

# Local directory of the base checkpoint in Diffusers format
# (presumably an SDXL model, given the *_sdxl adapter weights loaded below).
model_path = "model/fudukiMix_v20"
# downloaded from https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
image_encoder_path = "CLIP-ViT-H-14-laion2B-s32B-b79K"

# The CLIP vision encoder is loaded separately and handed to the pipeline,
# because the adapter is loaded with image_encoder_folder=None below.
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    image_encoder_path,
    torch_dtype=torch.float16,
)
pipeline = AutoPipelineForText2Image.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    variant="fp16",
    image_encoder=image_encoder,
)
# Swap the default scheduler for SDE DPM-Solver++ with Karras sigmas, reusing
# the existing scheduler's config so model-specific settings are preserved.
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
    pipeline.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
)
# Load the FaceID-PlusV2 adapter weights from the local "IP-Adapter-FaceID"
# folder; no image encoder is loaded from there (image_encoder_folder=None).
pipeline.load_ip_adapter(
    "IP-Adapter-FaceID",
    subfolder="sdxl",
    weight_name=["ip-adapter-faceid-plusv2_sdxl.bin"],
    image_encoder_folder=None,
)
pipeline.to("cuda")

# Reference photo whose face identity is injected into the generated image.
pil_image = load_image("woman/face2.png")

# InsightFace detector/recognizer; prefers the CUDA provider, falls back to CPU.
app = FaceAnalysis(name="buffalo_l", providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
app.prepare(ctx_id=0, det_size=(640, 640))

# The PIL image is RGB while InsightFace follows the OpenCV (BGR) convention,
# so swap the channel order before running detection.
bgr_image = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
faces = app.get(bgr_image)

# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(faces) == 0:
    raise RuntimeError("cannot detect face")

# Only the first detected face is used as the identity reference.
face_embedding = torch.from_numpy(faces[0].normed_embedding)

# Add three leading singleton axes to the embedding (assumes normed_embedding
# is a 1-D vector, giving shape (1, 1, 1, dim) -- TODO confirm).
ref_images_embeds = torch.stack([face_embedding.unsqueeze(0)], dim=0).unsqueeze(0)
# All-zero embedding serves as the unconditional half for classifier-free guidance.
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
# Negative first, positive second, concatenated along the leading axis.
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")

# Encode the reference photo with the pipeline's image encoder. The call
# returns a list (presumably one entry per loaded adapter); only the first
# entry is needed because a single adapter is loaded.
all_clip_embeds = pipeline.prepare_ip_adapter_image_embeds(
    ip_adapter_image=[pil_image],
    ip_adapter_image_embeds=None,
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)
clip_embeds = all_clip_embeds[0]

# The FaceID-Plus projection layer reads the CLIP embeddings from an attribute
# rather than from the pipeline call, so attach them to the layer directly.
faceid_projection = pipeline.unet.encoder_hid_proj.image_projection_layers[0]
faceid_projection.clip_embeds = clip_embeds.to(dtype=torch.float16)
faceid_projection.shortcut = True  # True if Plus v2

# Text conditioning for the generation.
prompt = "japanese woman, close-up, natural lighting, wavy hair, from side, white sweater, dyanmic posing, see-through curtain, bright room"
negative_prompt = "cleavage, illustration, 3d, 2d, painting, cartoons, sketch, watercolor, monotone, kimono, crossed eyes, strabismus"

# Run the pipeline with the face ID embeddings as the IP-Adapter input and
# keep the first (only) generated image.
output = pipeline(
    prompt=prompt,
    ip_adapter_image_embeds=[id_embeds],
    negative_prompt=negative_prompt,
    num_inference_steps=40,
    guidance_scale=7.5,
    num_images_per_prompt=1,
)
result = output.images[0]

result.save("result.png")




このエントリーをはてなブックマークに追加