Introduction
This is a deeper dive into ControlNet's canny2image. In the previous post, we found that the settings for Canny edge detection have a big impact on the result. touch-sp.hatenablog.com
This time, the topic is how to resize when the source image is larger than the generated image (768x768 here).
We will look at how the generated image changes depending on when the resizing is done.
Method

The source image is from the free stock photo site Pakutaso.
This is the image used.
Its size is 1856x1856.
For resizing, I adopted the method used in lllyasviel/ControlNet.
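As a quick sanity check, here is a minimal sketch (not part of the original code) of the resize rule used in the scripts below: the short side is scaled to the target resolution and both sides are then rounded to a multiple of 64, so a 1856x1856 source with resolution=768 becomes exactly 768x768.

import numpy as np

# Sketch of the size calculation performed by resize_image in the scripts below:
# scale the short side to `resolution`, then round both sides to a multiple of 64.
def resized_shape(h, w, resolution=768):
    k = resolution / min(h, w)
    H = int(np.round(h * k / 64.0)) * 64
    W = int(np.round(w * k / 64.0)) * 64
    return H, W

print(resized_shape(1856, 1856))  # -> (768, 768)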
Comparison Results
Resize the source image before edge detection (Method 1)

Resize after edge detection (Method 2)

Pass the image to the pipeline without resizing, specifying 768x768 in the pipeline call (Method 3)

Pass the image to the pipeline without resizing, generating a 1856x1856 image (Method 4)
This takes longer and consumes more VRAM, so I don't recommend it.
Thoughts
Methods 2 and 3 do almost the same thing; the generated images probably differ because the resizing is done in slightly different ways. Method 1 gave the best result. In other words, when the source image is large, it is better to resize it to the target generation size before running edge detection.
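To make that difference concrete, here is a minimal sketch (not part of the original scripts; it assumes the pipeline downscales the 1856x1856 conditioning image internally with a PIL-style resize, which may use a different filter than cv2) comparing the two resize paths on the same edge map:

import cv2
import numpy as np
from PIL import Image

low_threshold, high_threshold = 25, 100

original = np.array(Image.open('sample.jpg'))  # 1856x1856 source
canny = cv2.Canny(original, low_threshold, high_threshold)

# Method 2 path: cv2.resize with INTER_AREA (the downscaling branch of resize_image)
canny_cv2 = cv2.resize(canny, (768, 768), interpolation=cv2.INTER_AREA)

# Method 3 path (assumed): a PIL-style resize to the target resolution
canny_pil = np.array(Image.fromarray(canny).resize((768, 768), Image.LANCZOS))

# The two downscaled edge maps are not bit-identical, which would explain the different outputs
print("fraction of differing pixels:", (canny_cv2 != canny_pil).mean())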
Python Scripts
Here are the Python scripts for each method.
Method 1
import cv2
from PIL import Image
import numpy as np
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, AutoencoderKL, EulerAncestralDiscreteScheduler

resolution = 768
low_threshold = 25
high_threshold = 100

def resize_image(input_image, resolution):
    H, W = input_image.shape[0:2]
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
    return img

original_image = np.array(Image.open('sample.jpg'))
img = resize_image(original_image, resolution=resolution)

canny = cv2.Canny(img, threshold1=low_threshold, threshold2=high_threshold)
canny = canny[:, :, None]
canny = np.concatenate([canny, canny, canny], axis=2)
control_image = Image.fromarray(canny)
control_image.save('result1_canny.png')

vae = AutoencoderKL.from_pretrained('vae/any4_vae').to('cuda')
controlnet = ControlNetModel.from_pretrained('basemodel/sd-controlnet-canny')
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "model/anything-v4.0",
    controlnet=controlnet,
    vae=vae,
    safety_checker=None).to('cuda')
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.enable_xformers_memory_efficient_attention()

generator = torch.manual_seed(20000)

image = pipe(
    prompt="best quality, extremely detailed",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    image=control_image,
    num_inference_steps=30,
    generator=generator,
    guidance_scale=9.0,
).images[0]

image.save('method1_result.png')
Method 2
import cv2
from PIL import Image
import numpy as np
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, AutoencoderKL, EulerAncestralDiscreteScheduler

resolution = 768
low_threshold = 25
high_threshold = 100

def resize_image(input_image, resolution):
    H, W = input_image.shape[0:2]
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
    return img

original_image = np.array(Image.open('sample.jpg'))

canny = cv2.Canny(original_image, threshold1=low_threshold, threshold2=high_threshold)
canny = resize_image(canny, resolution=resolution)
canny = canny[:, :, None]
canny = np.concatenate([canny, canny, canny], axis=2)
control_image = Image.fromarray(canny)
control_image.save('result2_canny.png')

vae = AutoencoderKL.from_pretrained('vae/any4_vae').to('cuda')
controlnet = ControlNetModel.from_pretrained('basemodel/sd-controlnet-canny')
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "model/anything-v4.0",
    controlnet=controlnet,
    vae=vae,
    safety_checker=None).to('cuda')
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.enable_xformers_memory_efficient_attention()

generator = torch.manual_seed(20000)

image = pipe(
    prompt="best quality, extremely detailed",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    image=control_image,
    num_inference_steps=30,
    generator=generator,
    guidance_scale=9.0,
).images[0]

image.save('method2_result.png')
Method 3
import cv2
from PIL import Image
import numpy as np
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, AutoencoderKL, EulerAncestralDiscreteScheduler

resolution = 768
low_threshold = 25
high_threshold = 100

original_image = np.array(Image.open('sample.jpg'))

canny = cv2.Canny(original_image, threshold1=low_threshold, threshold2=high_threshold)
canny = canny[:, :, None]
canny = np.concatenate([canny, canny, canny], axis=2)
control_image = Image.fromarray(canny)
control_image.save('result3_canny.png')

vae = AutoencoderKL.from_pretrained('vae/any4_vae').to('cuda')
controlnet = ControlNetModel.from_pretrained('basemodel/sd-controlnet-canny')
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "model/anything-v4.0",
    controlnet=controlnet,
    vae=vae,
    safety_checker=None).to('cuda')
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.enable_xformers_memory_efficient_attention()

generator = torch.manual_seed(20000)

image = pipe(
    prompt="best quality, extremely detailed",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    image=control_image,
    num_inference_steps=30,
    generator=generator,
    guidance_scale=9.0,
    width=resolution,
    height=resolution
).images[0]

image.save('method3_result.png')
Method 4
To save VRAM, vae.enable_tiling is used.

import cv2
from PIL import Image
import numpy as np
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, AutoencoderKL, EulerAncestralDiscreteScheduler

low_threshold = 25
high_threshold = 100

original_image = np.array(Image.open('sample.jpg'))

canny = cv2.Canny(original_image, threshold1=low_threshold, threshold2=high_threshold)
canny = canny[:, :, None]
canny = np.concatenate([canny, canny, canny], axis=2)
control_image = Image.fromarray(canny)
control_image.save('result4_canny.png')

vae = AutoencoderKL.from_pretrained('vae/any4_vae').to('cuda')
controlnet = ControlNetModel.from_pretrained('basemodel/sd-controlnet-canny')
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "model/anything-v4.0",
    controlnet=controlnet,
    vae=vae,
    safety_checker=None).to('cuda')
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.enable_xformers_memory_efficient_attention()
pipe.vae.enable_tiling()

generator = torch.manual_seed(20000)

image = pipe(
    prompt="best quality, extremely detailed",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    image=control_image,
    num_inference_steps=30,
    generator=generator,
    guidance_scale=9.0,
).images[0]

image.save('method4_result.png')
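For reference, vae.enable_tiling() makes the VAE encode and decode the image in tiles rather than all at once, which keeps peak VRAM usage manageable at 1856x1856 in exchange for some extra processing time.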