はじめに
ここで紹介する方法は一応無料でできます。ただし、「InsightFace」というモデルを使用しており、そちらのトレーニングデータは非営利の研究目的でのみ利用可能とのことですので注意して下さい。以前同様のことをLoRA学習で行いました。
touch-sp.hatenablog.com
今回紹介する方法の方が圧倒的に簡便です。
準備した画像
とりあえず3枚の顔写真を用意しました。結果
たった3枚の顔写真からこのような感じで同一人物(?)の画像が作れました。少しだけアニメ要素を入れています。
顔の向きが用意した画像と同じなのでそのまま顔を使っているといえばその通りかもしれません。
使用したPythonスクリプト
make_embeddings.py
import argparse
from pathlib import Path

import torch
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image


def cref_embeddings(pipeline, folder):
    """Build character-reference (face) embeddings from the PNGs in *folder*.

    Saves two files in the working directory:
      - ``plusface.ipadpt``: IP-Adapter "plus-face" image embeddings.
      - ``faceid.ipadpt``: IP-Adapter FaceID embeddings from InsightFace.

    Raises:
        RuntimeError: if InsightFace detects no face in any input image.
    """
    # Heavy optional deps are imported lazily so --sref-only runs don't need them.
    from insightface.app import FaceAnalysis
    import cv2
    import numpy as np

    # sorted() makes the embedding order deterministic across runs/filesystems.
    face_images = [load_image(p.as_posix()) for p in sorted(Path(folder).glob("*.png"))]

    # --- embeddings of ip-adapter plus face ---
    pipeline.load_ip_adapter(
        "IP-Adapter",
        subfolder="sdxl_models",
        weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors",
        image_encoder_folder="models/image_encoder"
    )
    image_embeds_plusface = pipeline.prepare_ip_adapter_image_embeds(
        ip_adapter_image=[face_images],
        ip_adapter_image_embeds=None,
        device="cuda",
        num_images_per_prompt=1,
        do_classifier_free_guidance=True
    )
    torch.save(image_embeds_plusface, "plusface.ipadpt")

    # --- embeddings of ip-adapter faceid (InsightFace) ---
    ref_images_embeds = []
    ref_unc_images_embeds = []
    app = FaceAnalysis(
        name="buffalo_l",
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
    )
    app.prepare(ctx_id=0, det_size=(640, 640))
    for im in face_images:
        # PIL delivers RGB; InsightFace/OpenCV expect BGR. COLOR_RGB2BGR names
        # the actual direction (numerically the same channel swap as BGR2RGB).
        image = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)
        faces = app.get(image)
        if faces:
            image_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
            ref_images_embeds.append(image_embeds)
            # Zero embedding serves as the unconditional half for CFG.
            ref_unc_images_embeds.append(torch.zeros_like(image_embeds))
    print(f"InsightFace: {len(face_images)} face images, {len(ref_images_embeds)} faces detected")
    if not ref_images_embeds:
        # raise (not assert): assert is stripped when running under `python -O`.
        raise RuntimeError("face detection for faceid failed.")

    ref_images_embeds = torch.stack(ref_images_embeds, dim=0)
    ref_unc_images_embeds = torch.stack(ref_unc_images_embeds, dim=0)
    # [uncond, cond] concatenation matches do_classifier_free_guidance=True layout.
    image_embeds_faceid = [
        torch.cat([ref_unc_images_embeds, ref_images_embeds], dim=0).to(
            device="cuda", dtype=torch.float16
        )
    ]
    torch.save(image_embeds_faceid, "faceid.ipadpt")


def sref_embeddings(pipeline, folder):
    """Build style-reference embeddings from the PNGs in *folder*.

    Saves ``style.ipadpt`` (IP-Adapter "plus" image embeddings) in the
    working directory.
    """
    style_images = [load_image(p.as_posix()) for p in sorted(Path(folder).glob("*.png"))]

    # --- embeddings of ip-adapter style ---
    pipeline.load_ip_adapter(
        "IP-Adapter",
        subfolder="sdxl_models",
        weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
        image_encoder_folder="models/image_encoder"
    )
    image_embeds_style = pipeline.prepare_ip_adapter_image_embeds(
        ip_adapter_image=[style_images],
        ip_adapter_image_embeds=None,
        device="cuda",
        num_images_per_prompt=1,
        do_classifier_free_guidance=True
    )
    torch.save(image_embeds_style, "style.ipadpt")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cref", type=str, help="folder of character reference")
    parser.add_argument("--sref", type=str, help="folder of style reference")
    args = parser.parse_args()

    pipeline = AutoPipelineForText2Image.from_pretrained(
        "model/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        variant="fp16"
    ).to("cuda")

    if args.cref:
        cref_embeddings(pipeline, args.cref)
    if args.sref:
        sref_embeddings(pipeline, args.sref)
run.py
"""Sweep IP-Adapter scales for SDXL text2image with saved face/style embeddings.

Loads embeddings produced by make_embeddings.py and generates one image per
scale combination, saved under results_with_style/ or results_without_style/.
"""
import argparse
import itertools
from pathlib import Path

import torch
from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model",
    type=str,
    required=True,
    help="sdxl model",
)
parser.add_argument(
    "--plusface",
    type=str,
    required=True,
    help="embeddings for plusface"
)
parser.add_argument(
    "--plusface_scale",
    type=float,
    help="scale of ip-adapter plusface"
)
parser.add_argument(
    "--faceid",
    type=str,
    required=True,
    help="embeddings for faceid"
)
parser.add_argument(
    "--faceid_scale",
    type=float,
    help="scale of ip-adapter faceid"
)
parser.add_argument(
    "--style",
    type=str,
    help="embeddings for style",
)
parser.add_argument(
    "--style_scale",
    type=float,
    help="scale of style"
)
parser.add_argument(
    "--prompt",
    type=str,
    default="a woman",
    help="prompt"
)
parser.add_argument(
    "--negative_prompt",
    type=str,
    default="monochrome, lowres, bad anatomy, worst quality, low quality",
    help="negative prompt"
)
args = parser.parse_args()

pipeline = AutoPipelineForText2Image.from_pretrained(
    args.model,
    torch_dtype=torch.float16,
    variant="fp16"
).to("cuda")
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
    pipeline.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True
)

prompt = args.prompt
negative_prompt = args.negative_prompt


def _scale_list(value, default):
    """Return [value] when a scale was given on the CLI, else the sweep *default*.

    Uses `is not None` so an explicit scale of 0.0 is honored (a plain
    truthiness test would silently discard it).
    """
    return [value] if value is not None else default


def _generate(scales, image_embeds):
    """Generate one 1024x1024 image with the given per-adapter *scales*."""
    pipeline.set_ip_adapter_scale(scales)
    return pipeline(
        prompt=prompt,
        ip_adapter_image_embeds=image_embeds,
        negative_prompt=negative_prompt,
        num_inference_steps=50,
        num_images_per_prompt=1,
        guidance_scale=7.5,
        width=1024,
        height=1024,
        # Fixed CPU seed so images across the scale sweep are comparable.
        generator=torch.Generator(device="cpu").manual_seed(0)
    ).images[0]


plusface_scale_list = _scale_list(args.plusface_scale, [0.1, 0.3, 0.5])
faceid_scale_list = _scale_list(args.faceid_scale, [0.3, 0.5, 0.7, 0.9])

if args.style:
    save_folder = "results_with_style"
    Path(save_folder).mkdir(exist_ok=True)
    style_scale_list = _scale_list(args.style_scale, [0.3, 0.5])

    # text2image with 3 image embeddings (style + plus-face + faceid)
    pipeline.load_ip_adapter(
        ["IP-Adapter", "IP-Adapter", "IP-Adapter-FaceID"],
        subfolder=["sdxl_models", "sdxl_models", None],
        weight_name=[
            "ip-adapter-plus_sdxl_vit-h.safetensors",
            "ip-adapter-plus-face_sdxl_vit-h.safetensors",
            "ip-adapter-faceid_sdxl.bin"
        ],
        # Embeddings are precomputed, so no image encoder is needed here.
        image_encoder_folder=None
    )
    image_embeds_plusface = torch.load(args.plusface)
    image_embeds_faceid = torch.load(args.faceid)
    image_embeds_style = torch.load(args.style)
    # Order must match the adapter order passed to load_ip_adapter above.
    image_embeds = [image_embeds_style[0], image_embeds_plusface[0], image_embeds_faceid[0]]

    for style_scale, plusface_scale, faceid_scale in itertools.product(
            style_scale_list, plusface_scale_list, faceid_scale_list):
        image = _generate([style_scale, plusface_scale, faceid_scale], image_embeds)
        save_fname = f"plusface{plusface_scale}_faceid{faceid_scale}_with_style{style_scale}.png"
        image.save(Path(save_folder, save_fname).as_posix())
else:
    save_folder = "results_without_style"
    Path(save_folder).mkdir(exist_ok=True)

    # text2image with 2 image embeddings (plus-face + faceid)
    pipeline.load_ip_adapter(
        ["IP-Adapter", "IP-Adapter-FaceID"],
        subfolder=["sdxl_models", None],
        weight_name=[
            "ip-adapter-plus-face_sdxl_vit-h.safetensors",
            "ip-adapter-faceid_sdxl.bin"
        ],
        image_encoder_folder=None
    )
    image_embeds_plusface = torch.load(args.plusface)
    image_embeds_faceid = torch.load(args.faceid)
    image_embeds = [image_embeds_plusface[0], image_embeds_faceid[0]]

    for plusface_scale, faceid_scale in itertools.product(
            plusface_scale_list, faceid_scale_list):
        image = _generate([plusface_scale, faceid_scale], image_embeds)
        save_fname = f"plusface{plusface_scale}_faceid{faceid_scale}.png"
        image.save(Path(save_folder, save_fname).as_posix())
PC環境
Windows 11 CUDA 11.8 Python 3.11
Python環境構築
pip install torch==2.2.1+cu118 --index-url https://download.pytorch.org/whl/cu118 pip install git+https://github.com/fabiorigano/diffusers@faceidcore pip install accelerate transformers peft opencv-python pip install onnxruntime-gpu insightface