【SDXL 1.0】 A Depth ControlNet has been released by Diffusers, so I tried it out right away

huggingface.co

Introduction

Following the "Canny" ControlNet, a "Depth" ControlNet has now been released.

For the Canny version, see this post:
touch-sp.hatenablog.com

Source image

I used an image I had created here. It is saved as "girl.png".


Creating the depth images

I created depth images using four different methods.

For controlnet_aux, see this article.

Method 1

This is exactly as shown on the official Hugging Face page.

import torch
import numpy as np
from PIL import Image
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
from diffusers.utils import load_image

depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")

def get_depth_map(image):
    # Run the DPT depth estimator on the input image
    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
    with torch.no_grad(), torch.autocast("cuda"):
        depth_map = depth_estimator(image).predicted_depth

    # Upsample the predicted depth to the SDXL resolution
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1),
        size=(1024, 1024),
        mode="bicubic",
        align_corners=False,
    )
    # Normalize to [0, 1] and replicate to 3 channels
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
    image = torch.cat([depth_map] * 3, dim=1)

    # Convert to a PIL image
    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
    return image

image = load_image("girl.png")
depth_image = get_depth_map(image)

depth_image.save("method1.png")

Method 2

from diffusers.utils import load_image
from controlnet_aux.processor import Processor

image = load_image("girl.png")

processor = Processor("depth_midas")
depth_image = processor(image, to_pil=True)

depth_image.save("method2.png")

Method 3

from diffusers.utils import load_image
from controlnet_aux.processor import Processor

image = load_image("girl.png")

processor = Processor("depth_leres")
depth_image = processor(image, to_pil=True)

depth_image.save("method3.png")

Method 4

from diffusers.utils import load_image
from controlnet_aux.processor import Processor

image = load_image("girl.png")

processor = Processor("depth_leres++")
depth_image = processor(image, to_pil=True)

depth_image.save("method4.png")



From left to right: Method 1 → 2 → 3 → 4.



Which method gives the cleanest result?

Running the pipeline

from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
import torch

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")

fname_list = ["method1", "method2", "method3", "method4"] 

prompt = "anime style, ultra detailed, super fine illustration, pretty girl, fountain background"
negative_prompt = "worst quality, low quality"
controlnet_conditioning_scale = 0.5  # recommended for good generalization

for fname in fname_list:
    depth_image = load_image(f"{fname}.png").resize((1024, 1024))
    seed = 30000
    generator = torch.manual_seed(seed)

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt, 
        image=depth_image, 
        num_inference_steps=30, 
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        generator=generator).images[0]

    image.save(f"{fname}_result.png")



From top to bottom: Method 1 → 2 → 3 → 4.



The results were too close to call.

SDXL 1.0 derivative models with the Depth ControlNet

From here on, I use Method 2 (depth_midas).

niji-diffusion-xl-base-1.0 v2.3

from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
import torch
from controlnet_aux.processor import Processor

controlnet = ControlNetModel.from_pretrained(
    "controlnet/controlnet-depth-sdxl-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16).to("cuda")

vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")

pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "model/nijiDiffusion_v2.3",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16).to("cuda")

image = load_image("girl.png")
processor = Processor("depth_midas")
depth_image = processor(image, to_pil=True).resize((1024, 1024))

prompt = "anime style, ultra detailed, super fine illustration, pretty girl, fountain background"
negative_prompt = "worst quality, low quality"
controlnet_conditioning_scale = 0.5

seed = 30000
generator = torch.manual_seed(seed)

image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt, 
    image=depth_image, 
    num_inference_steps=30, 
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=generator).images[0]

image.save("depth_result.png")


Osorubeshi alpha XL v0.4

prompt = "anime style, ultra detailed, super fine illustration, pretty 1girl, fountain background"
negative_prompt = "worst quality, low quality"


Mysterious - SDXL Version v3.15

prompt = "mysterious, anime style, ultra detailed, super fine illustration, pretty girl, fountain background"
negative_prompt = "worst quality, low quality"


Copax TimeLessXL - SDXL1.0 Colorful V2

prompt = "colorful, ultra detailed, super fine illustration, pretty girl, fountain background"
negative_prompt = "worst quality, low quality"




