Introduction
Following "Canny", a "Depth" ControlNet has now been released. For "Canny", see this post: touch-sp.hatenablog.com
Source Image
I used an image created here, saved as "girl.png".

Creating the Depth Images
I created depth images with four different methods. For "controlnet_aux", see this article.

Method 1
This follows the official Hugging Face page.

import torch
import numpy as np
from PIL import Image
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
from diffusers.utils import load_image

# Depth estimator and preprocessor (MiDaS DPT-Hybrid)
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")

def get_depth_map(image):
    # Preprocess and run depth estimation on the GPU
    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
    with torch.no_grad(), torch.autocast("cuda"):
        depth_map = depth_estimator(image).predicted_depth

    # Resize to 1024x1024 and normalize to [0, 1]
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1),
        size=(1024, 1024),
        mode="bicubic",
        align_corners=False,
    )
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_map = (depth_map - depth_min) / (depth_max - depth_min)

    # Replicate to 3 channels and convert to a PIL image
    image = torch.cat([depth_map] * 3, dim=1)
    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
    return image

image = load_image("girl.png")
depth_image = get_depth_map(image)
depth_image.save("method1.png")
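Depending on your transformers version, DPTFeatureExtractor may emit a deprecation warning; DPTImageProcessor is its drop-in replacement and, as far as I know, is called the same way. A minimal sketch of that substitution:

# Newer transformers versions expose DPTImageProcessor in place of the
# deprecated DPTFeatureExtractor; the call signature stays the same.
from transformers import DPTImageProcessor

feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
# used exactly like DPTFeatureExtractor in get_depth_map above:
# feature_extractor(images=image, return_tensors="pt").pixel_values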
Method 2
from diffusers.utils import load_image
from controlnet_aux.processor import Processor

image = load_image("girl.png")

# MiDaS depth estimation via controlnet_aux
processor = Processor("depth_midas")
depth_image = processor(image, to_pil=True)
depth_image.save("method2.png")
Method 3
from diffusers.utils import load_image
from controlnet_aux.processor import Processor

image = load_image("girl.png")

# LeReS depth estimation via controlnet_aux
processor = Processor("depth_leres")
depth_image = processor(image, to_pil=True)
depth_image.save("method3.png")
Method 4
from diffusers.utils import load_image
from controlnet_aux.processor import Processor

image = load_image("girl.png")

# LeReS++ depth estimation via controlnet_aux
processor = Processor("depth_leres++")
depth_image = processor(image, to_pil=True)
depth_image.save("method4.png")
From left to right: method 1 → 2 → 3 → 4.
Which method will give the cleanest result?
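To view the four depth maps side by side as above, the saved PNGs can simply be tiled horizontally. A minimal sketch, assuming the files method1.png through method4.png from the previous steps exist (the output filename compare_depth.png is arbitrary):

from PIL import Image

# Load the four depth maps produced above and resize them to a common size
files = ["method1.png", "method2.png", "method3.png", "method4.png"]
images = [Image.open(f).resize((512, 512)) for f in files]

# Paste them side by side, left to right: method 1 -> 4
canvas = Image.new("RGB", (512 * len(images), 512))
for i, img in enumerate(images):
    canvas.paste(img, (512 * i, 0))
canvas.save("compare_depth.png")  # output filename is arbitrary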
Generation
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
import torch

# Depth ControlNet for SDXL
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")

# fp16-safe SDXL VAE
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")

pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")

fname_list = ["method1", "method2", "method3", "method4"]

prompt = "anime style, ultra detailed, super fine illustration, pretty girl, fountain background"
negative_prompt = "worst quality, low quality"
controlnet_conditioning_scale = 0.5  # recommended for good generalization

# Generate one image per depth map, with a fixed seed for comparability
for fname in fname_list:
    depth_image = load_image(f"{fname}.png").resize((1024, 1024))

    seed = 30000
    generator = torch.manual_seed(seed)

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=depth_image,
        num_inference_steps=30,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        generator=generator).images[0]

    image.save(f"{fname}_result.png")
From top to bottom: method 1 → 2 → 3 → 4.
The results were too close to call.
SDXL 1.0 Derivative Models with the Depth ControlNet
From here on, "Method 2" (depth_midas) is used.

niji-diffusion-xl-base-1.0 v2.3
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
import torch
from controlnet_aux.processor import Processor

# Local copy of the depth ControlNet checkpoint
controlnet = ControlNetModel.from_pretrained(
    "controlnet/controlnet-depth-sdxl-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16).to("cuda")

vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")

# Local diffusers-format copy of niji-diffusion-xl-base-1.0 v2.3
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "model/nijiDiffusion_v2.3",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16).to("cuda")

image = load_image("girl.png")

# Method 2: MiDaS depth via controlnet_aux
processor = Processor("depth_midas")
depth_image = processor(image, to_pil=True).resize((1024, 1024))

prompt = "anime style, ultra detailed, super fine illustration, pretty girl, fountain background"
negative_prompt = "worst quality, low quality"
controlnet_conditioning_scale = 0.5

seed = 30000
generator = torch.manual_seed(seed)

image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=depth_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=generator).images[0]

image.save("depth_result.png")
Osorubeshi alpha XL v0.4
prompt = "anime style, ultra detailed, super fine illustration, pretty 1girl, fountain background" negative_prompt = "worst quality, low quality"
Mysterious - SDXL Version v3.15
prompt = "mysterious, anime style, ultra detailed, super fine illustration, pretty girl, fountain background" negative_prompt = "worst quality, low quality"
Copax TimeLessXL - SDXL1.0 Colorful V2
prompt = "colorful, ultra detailed, super fine illustration, pretty girl, fountain background" negative_prompt = "worst quality, low quality"