結果
左がボールを強調した生成画像、真ん中がノーマルの生成画像、
右が猫を強調した生成画像
なんとなく効果があるような気がします。
今回とは関係ないですがこのレベルの画像が簡単に生成できるSDXL 1.0 (Stable Diffusion XL 1.0) には驚かされるばかりです。
Pythonスクリプト
ノーマル
from diffusers import DiffusionPipeline import torch pipeline = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True ).to("cuda") refiner = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-refiner-1.0", text_encoder_2=pipeline.text_encoder_2, vae=pipeline.vae, torch_dtype=torch.float16, variant="fp16", use_safetensors=True ).to("cuda") prompt = "photo, 8k, a cat playing with a ball in the forest" negative_prompt = "worst quality, low quality" seed = 10000 generator = torch.manual_seed(seed) image = pipeline( prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=30, generator=generator, output_type="latent").images[0] image = refiner( prompt=prompt, negative_prompt=negative_prompt, image=image[None, :]).images[0] image.save("normal.png")
ボールを強調
from diffusers import DiffusionPipeline from compel import Compel, ReturnedEmbeddingsType import torch pipeline = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True ).to("cuda") refiner = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-refiner-1.0", text_encoder_2=pipeline.text_encoder_2, vae=pipeline.vae, torch_dtype=torch.float16, variant="fp16", use_safetensors=True ).to("cuda") compel = Compel( tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2], text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2], returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, requires_pooled=[False, True]) prompt_for_compel = "photo, 8k, a cat-- playing with a ball++ in the forest" conditioning, pooled = compel(prompt_for_compel) prompt_for_refiner = "photo, 8k, a cat playing with a ball in the forest" negative_prompt = "worst quality, low quality" seed = 10000 generator = torch.manual_seed(seed) image = pipeline( prompt_embeds=conditioning, pooled_prompt_embeds=pooled, negative_prompt=negative_prompt, num_inference_steps=30, generator=generator, output_type="latent").images[0] image = refiner( prompt=prompt_for_refiner, negative_prompt=negative_prompt, image=image[None, :]).images[0] image.save("ball.png")
猫を強調
from diffusers import DiffusionPipeline from compel import Compel, ReturnedEmbeddingsType import torch pipeline = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True ).to("cuda") refiner = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-refiner-1.0", text_encoder_2=pipeline.text_encoder_2, vae=pipeline.vae, torch_dtype=torch.float16, variant="fp16", use_safetensors=True ).to("cuda") compel = Compel( tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2], text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2], returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, requires_pooled=[False, True]) prompt_for_compel = "photo, 8k, a cat++ playing with a ball-- in the forest" conditioning, pooled = compel(prompt_for_compel) prompt_for_refiner = "photo, 8k, a cat playing with a ball in the forest" negative_prompt = "worst quality, low quality" seed = 10000 generator = torch.manual_seed(seed) image = pipeline( prompt_embeds=conditioning, pooled_prompt_embeds=pooled, negative_prompt=negative_prompt, num_inference_steps=30, generator=generator, output_type="latent").images[0] image = refiner( prompt=prompt_for_refiner, negative_prompt=negative_prompt, image=image[None, :]).images[0] image.save("cat.png")
補足
negative_promptにも使用する場合、Refinerモデルに使用する場合にはやや複雑です。from diffusers import DiffusionPipeline, StableDiffusionXLPipeline, DPMSolverMultistepScheduler import torch from compel import Compel, ReturnedEmbeddingsType model_id = "hadukiMix_v10.safetensors" pipe = StableDiffusionXLPipeline.from_single_file( f"safetensors/{model_id}", extract_ema=True, variant="fp16", torch_dtype=torch.float16 ) pipe.scheduler = DPMSolverMultistepScheduler.from_config( pipe.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True ) pipe.enable_model_cpu_offload() refiner = DiffusionPipeline.from_pretrained( "model/stable-diffusion-xl-refiner-1.0", text_encoder_2=pipe.text_encoder_2, vae=pipe.vae, variant="fp16", torch_dtype=torch.float16, use_safetensors=True ) refiner.enable_model_cpu_offload() base_compel = Compel( tokenizer=[pipe.tokenizer, pipe.tokenizer_2], text_encoder=[pipe.text_encoder, pipe.text_encoder_2], returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, requires_pooled=[False, True] ) refiner_compel = Compel( tokenizer=[refiner.tokenizer_2], text_encoder=[refiner.text_encoder_2], returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, requires_pooled=[True] ) prompt = "beautiful japanese woman with smile, 45yo+++, 8k, RAW photo, (best quality)+, masterpiece, photo-realistic, focus, professional lighting" negative_prompt = "(worst quality)++, (low quality)++" conditioning, pooled = base_compel(prompt) negative_conditioning, negatice_pooled = base_compel(negative_prompt) conditioning_refiner, pooled_refiner = refiner_compel(prompt) negative_conditioning_refiner, negatice_pooled_refiner = refiner_compel(negative_prompt) seed = 10000 generator = torch.manual_seed(seed) image = pipe( prompt_embeds=conditioning, pooled_prompt_embeds=pooled, negative_prompt_embeds=negative_conditioning, negative_pooled_prompt_embeds=negatice_pooled, generator=generator, num_inference_steps=30, width=1152, 
height=896, output_type="latent" ).images[0] image = refiner( prompt_embeds=conditioning_refiner, pooled_prompt_embeds=pooled_refiner, negative_prompt_embeds=negative_conditioning_refiner, negative_pooled_prompt_embeds=negatice_pooled_refiner, image=image[None, :], ).images[0] image.save("result.png")
Refinerでの使用方法が間違っていれば以下のようなエラーが出ます。
RuntimeError: mat1 and mat2 shapes cannot be multiplied (154x2048 and 1280x768)