顔を統一してAIインフルエンサーを作るっていうのが流行っているみたいです。それってIP-Adapter使えばできるよね。

はじめに

ここで紹介する方法は一応無料でできます。

ただし、「InsightFace」というモデルを使用しており、そちらのトレーニングデータは非営利の研究目的でのみ利用可能とのことですので注意して下さい。


以前同様のことをLoRA学習で行いました。
touch-sp.hatenablog.com
今回紹介する方法の方が圧倒的に簡便です。

準備した画像

とりあえず3枚の顔写真を用意しました。

結果

たった3枚の顔写真からこのような感じで同一人物(?)の画像が作れました。
少しだけアニメ要素を入れています。

顔の向きが用意した画像と同じなのでそのまま顔を使っているといえばその通りかもしれません。

使用したPythonスクリプト

make_embeddings.py

import torch
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image
from pathlib import Path
import argparse

def cref_embeddings(pipeline, folder):
    """Precompute and save the character-reference (face) embeddings.

    Every ``*.png`` file directly inside *folder* is loaded as a reference
    face. Two files are written to the current working directory:

    * ``plusface.ipadpt`` - the value returned by
      ``pipeline.prepare_ip_adapter_image_embeds`` after the
      "ip-adapter-plus-face" SDXL weights have been loaded.
    * ``faceid.ipadpt`` - a one-element list whose tensor holds all-zero
      (unconditional) embeddings followed by the InsightFace
      ``normed_embedding`` of the first face detected in each image,
      converted to float16 on CUDA.

    Args:
        pipeline: diffusers pipeline; ``load_ip_adapter`` is called on it.
        folder: directory containing the PNG face images.

    Raises:
        FileNotFoundError: if *folder* contains no ``*.png`` file.
        RuntimeError: if InsightFace detects no face in any of the images.
    """

    from insightface.app import FaceAnalysis
    import cv2
    import numpy as np

    # Sorted so the order of the stacked embeddings does not depend on the
    # directory listing order of the file system.
    face_images = [load_image(x.as_posix()) for x in sorted(Path(folder).glob("*.png"))]
    if not face_images:
        # Fail early with a clear message instead of handing an empty batch
        # to the image encoder.
        raise FileNotFoundError(f"no *.png images found in '{folder}'")

    # embeddings of ip-adapter plus face
    pipeline.load_ip_adapter(
        "IP-Adapter",
        subfolder="sdxl_models",
        weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors",
        image_encoder_folder="models/image_encoder"
    )
    image_embeds_plusface = pipeline.prepare_ip_adapter_image_embeds(
        ip_adapter_image=[face_images],
        ip_adapter_image_embeds=None,
        device="cuda",
        num_images_per_prompt=1,
        do_classifier_free_guidance=True
    )
    torch.save(image_embeds_plusface, "plusface.ipadpt")

    # embeddings of ip-adapter faceid
    ref_images_embeds = []
    app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
    app.prepare(ctx_id=0, det_size=(640, 640))
    for im in face_images:
        # Swap the R and B channels before detection. OpenCV defines
        # COLOR_RGB2BGR and COLOR_BGR2RGB as the same operation; the RGB2BGR
        # name is used because the input is presumably an RGB PIL image
        # (TODO confirm against diffusers' load_image).
        bgr = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR)
        faces = app.get(bgr)
        if len(faces) == 0:
            continue
        # Only the first detected face of each image is used.
        face_embedding = torch.from_numpy(faces[0].normed_embedding)
        ref_images_embeds.append(face_embedding.unsqueeze(0))

    print(f"InsightFace: {len(face_images)} face images, {len(ref_images_embeds)} faces detected")
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if not ref_images_embeds:
        raise RuntimeError("face detection for faceid failed.")

    cond_embeds = torch.stack(ref_images_embeds, dim=0)
    # The unconditional half is all zeros with the same shape and dtype.
    uncond_embeds = torch.zeros_like(cond_embeds)
    image_embeds_faceid = [torch.cat([uncond_embeds, cond_embeds], dim=0).to(device="cuda", dtype=torch.float16)]

    torch.save(image_embeds_faceid, "faceid.ipadpt")

def sref_embeddings(pipeline, folder):
    """Precompute and save the style-reference embeddings.

    Every ``*.png`` file directly inside *folder* is loaded as a style
    reference. After the "ip-adapter-plus" SDXL weights are loaded, the value
    returned by ``pipeline.prepare_ip_adapter_image_embeds`` is written to
    ``style.ipadpt`` in the current working directory.

    Args:
        pipeline: diffusers pipeline; ``load_ip_adapter`` is called on it.
        folder: directory containing the PNG style images.

    Raises:
        FileNotFoundError: if *folder* contains no ``*.png`` file.
    """

    # Sorted so the embedding order does not depend on the directory listing
    # order of the file system.
    style_images = [load_image(x.as_posix()) for x in sorted(Path(folder).glob("*.png"))]
    if not style_images:
        # Fail early with a clear message instead of handing an empty batch
        # to the image encoder.
        raise FileNotFoundError(f"no *.png images found in '{folder}'")

    # embeddings of ip-adapter style
    pipeline.load_ip_adapter(
        "IP-Adapter",
        subfolder="sdxl_models",
        weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
        image_encoder_folder="models/image_encoder"
    )
    image_embeds_style = pipeline.prepare_ip_adapter_image_embeds(
        ip_adapter_image=[style_images],
        ip_adapter_image_embeds=None,
        device="cuda",
        num_images_per_prompt=1,
        do_classifier_free_guidance=True
    )
    torch.save(image_embeds_style, "style.ipadpt")

if __name__ == "__main__":
    # Command-line entry point: build the SDXL pipeline once, then write the
    # character and/or style embeddings for the folders that were given.
    parser = argparse.ArgumentParser()
    parser.add_argument("--cref", type=str, help="folder of character reference")
    parser.add_argument("--sref", type=str, help="folder of style reference")
    parser.add_argument(
        "--model",
        type=str,
        default="model/stable-diffusion-xl-base-1.0",
        help="sdxl model (default: %(default)s)"
    )
    args = parser.parse_args()

    # Without any reference folder there is nothing to compute; stop before
    # the pipeline is loaded onto the GPU for no purpose.
    if not (args.cref or args.sref):
        parser.error("at least one of --cref / --sref is required")

    pipeline = AutoPipelineForText2Image.from_pretrained(
        args.model,
        torch_dtype=torch.float16,
        variant="fp16"
    ).to("cuda")

    if args.cref:
        cref_embeddings(pipeline, args.cref)
    if args.sref:
        sref_embeddings(pipeline, args.sref)

run.py

import torch
from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler
import argparse
import itertools
from pathlib import Path

# Command-line interface of run.py.
# --model, --plusface and --faceid are mandatory. The *_scale options and
# --style default to None when omitted.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model",
    type=str,
    required=True,
    help="sdxl model",
)
parser.add_argument(
    "--plusface",
    type=str,
    required=True,
    help="embeddings for plusface"
)
parser.add_argument(
    "--plusface_scale",
    type=float,
    help="scale of ip-adapter plusface"
)
parser.add_argument(
    "--faceid",
    type=str,
    required=True,
    help="embeddings for faceid"
)
parser.add_argument(
    "--faceid_scale",
    type=float,
    help="scale of ip-adapter faceid"
)
parser.add_argument(
    "--style",
    type=str,
    help="embeddings for style",
)
parser.add_argument(
    "--style_scale",
    type=float,
    help="scale of style"
)
parser.add_argument(
    "--prompt",
    type=str,
    default="a woman",
    help="prompt"
)
parser.add_argument(
    "--negative_prompt",
    type=str,
    default="monochrome, lowres, bad anatomy, worst quality, low quality",
    help="negative prompt"
)
args = parser.parse_args()

# Load the SDXL text-to-image pipeline (fp16 variant) and move it to the GPU.
pipeline = AutoPipelineForText2Image.from_pretrained(
    args.model,
    torch_dtype=torch.float16,
    variant="fp16"
).to("cuda")
# Replace the scheduler with DPMSolverMultistepScheduler built from the
# current scheduler's config, overriding the algorithm type and enabling
# Karras sigmas.
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
    pipeline.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True
)

prompt = args.prompt
negative_prompt = args.negative_prompt

# Compare against None, not truthiness: an explicit scale of 0.0 is falsy
# and would otherwise be silently replaced by the default sweep.
plusface_scale_list = [args.plusface_scale] if args.plusface_scale is not None else [0.1, 0.3, 0.5]
faceid_scale_list = [args.faceid_scale] if args.faceid_scale is not None else [0.3, 0.5, 0.7, 0.9]

# Generate one image per combination of adapter scales and save it.
#
# Adapters that are always active, in the order the pipeline receives them.
# Each entry: (repository, subfolder, weight file, embeddings file, scales).
adapters = [
    ("IP-Adapter", "sdxl_models", "ip-adapter-plus-face_sdxl_vit-h.safetensors",
     args.plusface, plusface_scale_list),
    ("IP-Adapter-FaceID", None, "ip-adapter-faceid_sdxl.bin",
     args.faceid, faceid_scale_list),
]

if args.style:
    # text2image with 3 image embeddings: the style adapter goes first
    save_folder = "results_with_style"
    # Compare against None so an explicit --style_scale 0 is honoured
    # instead of being replaced by the default sweep.
    style_scale_list = [args.style_scale] if args.style_scale is not None else [0.3, 0.5]
    adapters.insert(
        0,
        ("IP-Adapter", "sdxl_models", "ip-adapter-plus_sdxl_vit-h.safetensors",
         args.style, style_scale_list),
    )
else:
    # text2image with 2 image embeddings
    save_folder = "results_without_style"

Path(save_folder).mkdir(exist_ok=True)

# The embeddings are read from files below, so no image encoder folder is
# passed here.
pipeline.load_ip_adapter(
    [adapter[0] for adapter in adapters],
    subfolder=[adapter[1] for adapter in adapters],
    weight_name=[adapter[2] for adapter in adapters],
    image_encoder_folder=None
)

# SECURITY NOTE: torch.load unpickles its input. Only pass embedding files
# you created yourself, never files from an untrusted source.
# Each file holds a list; its first element is the tensor for one adapter.
image_embeds = [torch.load(adapter[3])[0] for adapter in adapters]

for scales in itertools.product(*(adapter[4] for adapter in adapters)):
    pipeline.set_ip_adapter_scale(list(scales))
    # A fresh CPU generator seeded with 0 is created for every call,
    # presumably so the scale combinations can be compared on the same noise.
    image = pipeline(
        prompt=prompt,
        ip_adapter_image_embeds=image_embeds,
        negative_prompt=negative_prompt,
        num_inference_steps=50,
        num_images_per_prompt=1,
        guidance_scale=7.5,
        width=1024,
        height=1024,
        generator=torch.Generator(device="cpu").manual_seed(0)
    ).images[0]

    if args.style:
        style_scale, plusface_scale, faceid_scale = scales
        save_fname = f"plusface{plusface_scale}_faceid{faceid_scale}_with_style{style_scale}.png"
    else:
        plusface_scale, faceid_scale = scales
        save_fname = f"plusface{plusface_scale}_faceid{faceid_scale}.png"
    image.save(Path(save_folder, save_fname).as_posix())

PC環境

Windows 11
CUDA 11.8
Python 3.11

Python環境構築

pip install torch==2.2.1+cu118 --index-url https://download.pytorch.org/whl/cu118
pip install git+https://github.com/fabiorigano/diffusers@faceidcore
pip install accelerate transformers peft opencv-python
pip install onnxruntime-gpu insightface