最終更新日:2023年7月28日
はじめに
以前Kandinsky 2.1に関する記事を書きました。touch-sp.hatenablog.com
今回は新しくなったKandinsky 2.2をControlNetと組み合わせて使ってみます。
「Text-to-Image」と「Image-to-Image」の両方で使えるようなので比較してみます。
実行
こちらの画像を使わせて頂きました。ロボット風に変換します。まずは上記画像のDepth Imageを作成して「controlnet_hint.png」として保存しておきます。
import numpy as np from transformers import pipeline from diffusers.utils import load_image img = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png" ).resize((768, 768)) depth_estimator = pipeline("depth-estimation") hint = depth_estimator(img)["depth"] hint.save("controlnet_hint.png")
このような画像が保存されます。
Text2Image with a Depth Image
import torch import numpy as np from diffusers import DiffusionPipeline from diffusers.utils import load_image pipe_prior = DiffusionPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 ) pipe_prior.to("cuda") pipe = DiffusionPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipe.to("cuda") prompt = "A robot, 4k photo" negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, username, watermark, signature" image_emb, zero_image_emb = pipe_prior( prompt=prompt, negative_prompt=negative_prior_prompt ).to_tuple() hint_image = np.array(load_image("controlnet_hint.png")) hint_image = torch.from_numpy(hint_image).float() / 255.0 hint_image = hint_image.permute(2, 0, 1).unsqueeze(0).half().to("cuda") images = pipe( image_embeds=image_emb, negative_image_embeds=zero_image_emb, hint=hint_image, num_inference_steps=50, height=768, width=768, ).images images[0].save("text2image_result.png")
Image2Image with a Depth Image
import torch import numpy as np from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline from diffusers.utils import load_image img = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png" ).resize((768, 768)) pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 ) pipe_prior.to("cuda") pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 ) pipe.to("cuda") prompt = "A robot, 4k photo" negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, username, watermark, signature" img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85) negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=0.85) hint_image = np.array(load_image("controlnet_hint.png")) hint_image = torch.from_numpy(hint_image).float() / 255.0 hint_image = hint_image.permute(2, 0, 1).unsqueeze(0).half().to("cuda") images = pipe( image=img, strength=0.5, image_embeds=img_emb.image_embeds, negative_image_embeds=negative_emb.image_embeds, hint=hint_image, num_inference_steps=50, height=768, width=768, ).images images[0].save("image2image_result.png")