はじめに
タイトルにあるように、Diffusers で SDXL に ControlNet と LoRA を併用できるようになりました。「DreamShaper XL1.0」という SDXL 派生モデルに、ControlNet と「Japanese Girl - SDXL」という LoRA を組み合わせて使ってみました。「Japanese Girl - SDXL」は日本人女性を出力するための LoRA です。
元画像
ぱくたそからこちらの画像を使わせて頂きました。
結果
きれいに日本人女性が出力されています。
Pythonスクリプト
"""SDXL (DreamShaper XL1.0) + depth ControlNet + "Japanese Girl - SDXL" LoRA, then the SDXL refiner."""
import torch
from controlnet_aux.processor import Processor
from diffusers import (
    ControlNetModel,
    DiffusionPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionXLControlNetPipeline,
)
from diffusers.utils import load_image

# Loading options shared by every model below (fp16 safetensors weights).
fp16_options = {
    "variant": "fp16",
    "use_safetensors": True,
    "torch_dtype": torch.float16,
}

# Depth ControlNet for SDXL, loaded from a local directory.
depth_controlnet = ControlNetModel.from_pretrained(
    "controlnet/controlnet-depth-sdxl-1.0",
    **fp16_options,
)

# Base pipeline: DreamShaper XL1.0 driven by the depth ControlNet.
base_pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "model/dreamshaperXL10_alpha2Xl10_ema",
    controlnet=depth_controlnet,
    **fp16_options,
)
base_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    base_pipe.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
)
# Attach the LoRA weights, then enable model CPU offload.
base_pipe.load_lora_weights("lora/japanese_girl_v1.1.safetensors")
base_pipe.enable_model_cpu_offload()

# The refiner reuses the base pipeline's second text encoder and VAE.
refiner = DiffusionPipeline.from_pretrained(
    "model/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base_pipe.text_encoder_2,
    vae=base_pipe.vae,
    **fp16_options,
)
refiner.enable_model_cpu_offload()

# Build the ControlNet conditioning image: a depth map of the source photo,
# resized to 1024x1024.
source_image = load_image("girl.jpg")
depth_estimator = Processor("depth_midas")
depth_image = depth_estimator(source_image, to_pil=True).resize((1024, 1024))

prompt = "jpn-girl, pretty girl, 25yo, 8k, detailed, fountain background"
negative_prompt = "worst quality, low quality"

controlnet_conditioning_scale = 0.5
# Alternative values noted by the author: lora_scale = 0.8, seed = 40000.
lora_scale = 0.4
seed = 30000
generator = torch.manual_seed(seed)

# Stage 1: the base pipeline returns latents instead of a decoded image.
latents = base_pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=depth_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=generator,
    cross_attention_kwargs={"scale": lora_scale},
    output_type="latent",
).images[0]

# Stage 2: refine the latents. `[None, :]` puts back the leading dimension
# that was dropped by taking `.images[0]` above.
refined_image = refiner(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=latents[None, :],
).images[0]

refined_image.save(f"seed{seed}_lorascale{lora_scale}.png")
補足1
ちょっと待ってください。初めから日本人女性の出力が得意なモデル(例えば「fuduki_mix」など)を使えば、そもそも LoRA は必要ないのではないでしょうか?そこで実際に「fuduki_mix」を使って、LoRA なしでやってみました。
結果
Pythonスクリプト
"""SDXL (fuduki_mix) + depth ControlNet without any LoRA, then the SDXL refiner."""
import torch
from controlnet_aux.processor import Processor
from diffusers import (
    ControlNetModel,
    DiffusionPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionXLControlNetPipeline,
)
from diffusers.utils import load_image

# Loading options shared by every model below (fp16 safetensors weights).
fp16_options = {
    "variant": "fp16",
    "use_safetensors": True,
    "torch_dtype": torch.float16,
}

# Depth ControlNet for SDXL, loaded from a local directory.
depth_controlnet = ControlNetModel.from_pretrained(
    "controlnet/controlnet-depth-sdxl-1.0",
    **fp16_options,
)

# Base pipeline: fuduki_mix driven by the depth ControlNet (no LoRA here).
base_pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "model/fudukiMix_v10_ema",
    controlnet=depth_controlnet,
    **fp16_options,
)
base_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    base_pipe.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
)
base_pipe.enable_model_cpu_offload()

# The refiner reuses the base pipeline's second text encoder and VAE.
refiner = DiffusionPipeline.from_pretrained(
    "model/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base_pipe.text_encoder_2,
    vae=base_pipe.vae,
    **fp16_options,
)
refiner.enable_model_cpu_offload()

# Build the ControlNet conditioning image: a depth map of the source photo,
# resized to 1024x1024.
source_image = load_image("girl.jpg")
depth_estimator = Processor("depth_midas")
depth_image = depth_estimator(source_image, to_pil=True).resize((1024, 1024))

prompt = "japanese pretty girl, 25yo, 8k, detailed, fountain background"
negative_prompt = "worst quality, low quality"

controlnet_conditioning_scale = 0.5
seed = 30000
generator = torch.manual_seed(seed)

# Stage 1: the base pipeline returns latents instead of a decoded image.
latents = base_pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=depth_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=generator,
    output_type="latent",
).images[0]

# Stage 2: refine the latents. `[None, :]` puts back the leading dimension
# that was dropped by taking `.images[0]` above.
refined_image = refiner(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=latents[None, :],
).images[0]

refined_image.save(f"seed{seed}.png")
補足2
今度は「fuduki_mix」に「TeethXL」という LoRA を使ってみましょう。「TeethXL」は、歯を見せて笑う顔を出力するための LoRA です。
結果
大成功ですね。
Pythonスクリプト
"""SDXL (fuduki_mix) + depth ControlNet + "TeethXL" LoRA, then the SDXL refiner."""
import torch
from controlnet_aux.processor import Processor
from diffusers import (
    ControlNetModel,
    DiffusionPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionXLControlNetPipeline,
)
from diffusers.utils import load_image

# Loading options shared by every model below (fp16 safetensors weights).
fp16_options = {
    "variant": "fp16",
    "use_safetensors": True,
    "torch_dtype": torch.float16,
}

# Depth ControlNet for SDXL, loaded from a local directory.
depth_controlnet = ControlNetModel.from_pretrained(
    "controlnet/controlnet-depth-sdxl-1.0",
    **fp16_options,
)

# Base pipeline: fuduki_mix driven by the depth ControlNet.
base_pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "model/fudukiMix_v10_ema",
    controlnet=depth_controlnet,
    **fp16_options,
)
base_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    base_pipe.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
)
# Attach the TeethXL LoRA weights, then enable model CPU offload.
base_pipe.load_lora_weights("lora/TeethXL.safetensors")
base_pipe.enable_model_cpu_offload()

# The refiner reuses the base pipeline's second text encoder and VAE.
refiner = DiffusionPipeline.from_pretrained(
    "model/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base_pipe.text_encoder_2,
    vae=base_pipe.vae,
    **fp16_options,
)
refiner.enable_model_cpu_offload()

# Build the ControlNet conditioning image: a depth map of the source photo,
# resized to 1024x1024.
source_image = load_image("girl.jpg")
depth_estimator = Processor("depth_midas")
depth_image = depth_estimator(source_image, to_pil=True).resize((1024, 1024))

prompt = "jpn-girl, pretty girl, 25yo, 8k, detailed, teeth, fountain background"
negative_prompt = "worst quality, low quality"

controlnet_conditioning_scale = 0.5
lora_scale = 0.4
seed = 30000
generator = torch.manual_seed(seed)

# Stage 1: the base pipeline returns latents instead of a decoded image.
latents = base_pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=depth_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=generator,
    cross_attention_kwargs={"scale": lora_scale},
    output_type="latent",
).images[0]

# Stage 2: refine the latents. `[None, :]` puts back the leading dimension
# that was dropped by taking `.images[0]` above.
refined_image = refiner(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=latents[None, :],
).images[0]

refined_image.save(f"seed{seed}_lorascale{lora_scale}.png")