結果
左側の写真から右側の写真を作成しました。
PC環境
Windows 11 CUDA 11.8 Python 3.11
Python環境構築
# PyTorch 2.2.2 built against CUDA 11.8, taken from the PyTorch wheel index.
pip install torch==2.2.2+cu118 --index-url https://download.pytorch.org/whl/cu118
# diffusers installed directly from its GitHub repository.
pip install git+https://github.com/huggingface/diffusers
pip install accelerate transformers peft
# GPU build of ONNX Runtime plus insightface (used for face detection/embedding).
pip install onnxruntime-gpu insightface
Pythonスクリプト
"""Generate a portrait with SDXL and IP-Adapter FaceID Plus v2.

A face-identity embedding (insightface) and CLIP image embeddings are both
extracted from one reference photo and fed to a diffusers text-to-image
pipeline, which writes the generated image to ``result.png``.
"""

import cv2
import numpy as np
import torch
from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler
from diffusers.utils import load_image
from insightface.app import FaceAnalysis
from transformers import CLIPVisionModelWithProjection

model_path = "model/fudukiMix_v20"
# downloaded from https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
image_encoder_path = "CLIP-ViT-H-14-laion2B-s32B-b79K"

# --- Build the pipeline ----------------------------------------------------
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    image_encoder_path,
    torch_dtype=torch.float16,
)

pipeline = AutoPipelineForText2Image.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    variant="fp16",
    image_encoder=image_encoder,
)

# Swap in a DPM-Solver++ SDE scheduler with Karras sigmas, reusing the
# configuration of the scheduler that shipped with the model.
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
    pipeline.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
)

# image_encoder_folder=None: the CLIP image encoder was already passed to
# from_pretrained above, so none is requested from the adapter folder here.
pipeline.load_ip_adapter(
    "IP-Adapter-FaceID",
    subfolder="sdxl",
    weight_name=["ip-adapter-faceid-plusv2_sdxl.bin"],
    image_encoder_folder=None,
)
pipeline.to("cuda")

# --- Extract the face-identity embedding from the reference photo ----------
pil_image = load_image("woman/face2.png")

app = FaceAnalysis(
    name="buffalo_l",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
app.prepare(ctx_id=0, det_size=(640, 640))

# The source is a PIL image, so this is an RGB -> BGR channel swap.
# NOTE(review): assumes load_image returns RGB and insightface expects
# OpenCV-style BGR input - confirm against both libraries' documentation.
face_bgr = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
faces = app.get(face_bgr)
if not faces:
    # Explicit raise instead of `assert`: asserts are stripped under `-O`,
    # which would turn this into an opaque IndexError on faces[0].
    raise RuntimeError("cannot detect face")

# Only the first detected face is used. Three singleton dimensions are put
# in front of the embedding before the negative/positive pair is stacked.
face_embedding = torch.from_numpy(faces[0].normed_embedding)
ref_images_embeds = torch.stack([face_embedding.unsqueeze(0)], dim=0).unsqueeze(0)

# All-zero embedding for the unconditional branch, concatenated first.
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(
    dtype=torch.float16, device="cuda"
)

# --- CLIP image embeddings consumed by the Plus v2 projection layer --------
clip_embeds = pipeline.prepare_ip_adapter_image_embeds(
    ip_adapter_image=[pil_image],
    ip_adapter_image_embeds=None,
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)[0]

face_projection = pipeline.unet.encoder_hid_proj.image_projection_layers[0]
face_projection.clip_embeds = clip_embeds.to(dtype=torch.float16)
face_projection.shortcut = True  # True if Plus v2

# --- Generate and save -----------------------------------------------------
# NOTE(review): "dyanmic" in the prompt looks like a typo for "dynamic". It is
# left untouched because editing the prompt changes the generated image.
result = pipeline(
    prompt="japanese woman, close-up, natural lighting, wavy hair, from side, white sweater, dyanmic posing, see-through curtain, bright room",
    ip_adapter_image_embeds=[id_embeds],
    negative_prompt="cleavage, illustration, 3d, 2d, painting, cartoons, sketch, watercolor, monotone, kimono, crossed eyes, strabismus",
    num_inference_steps=40,
    guidance_scale=7.5,
    num_images_per_prompt=1,
).images[0]
result.save("result.png")