【Compel】SDXL 1.0 (Stable Diffusion XL 1.0) でプロンプト内のワードに重みづけをしてみた

結果

左がボールを強調した生成画像
真ん中がノーマルの生成画像
右が猫を強調した生成画像

なんとなく効果があるような気がします。

今回とは関係ないですがこのレベルの画像が簡単に生成できるSDXL 1.0 (Stable Diffusion XL 1.0) には驚かされるばかりです。

Pythonスクリプト

ノーマル

from diffusers import DiffusionPipeline
import torch

# Baseline run without prompt weighting: the SDXL base pipeline produces
# latents, and the refiner pipeline turns them into the saved image.

# Loading options shared by both checkpoints (fp16 variant, safetensors).
half_precision = dict(
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    **half_precision,
)
pipeline = pipeline.to("cuda")

# The refiner is handed the base pipeline's second text encoder and VAE
# rather than loading its own.
refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipeline.text_encoder_2,
    vae=pipeline.vae,
    **half_precision,
)
refiner = refiner.to("cuda")

prompt = "photo, 8k, a cat playing with a ball in the forest"
negative_prompt = "worst quality, low quality"
# The same prompt pair is passed to both stages.
text_inputs = dict(prompt=prompt, negative_prompt=negative_prompt)

# Seed the generator with a fixed value before sampling.
seed = 10000
generator = torch.manual_seed(seed)

# Stage 1: ask the base pipeline for latents (output_type="latent").
latents = pipeline(
    num_inference_steps=30,
    generator=generator,
    output_type="latent",
    **text_inputs,
).images[0]

# Stage 2: pass the latents to the refiner with a leading axis added.
image = refiner(image=latents[None, :], **text_inputs).images[0]

image.save("normal.png")

ボールを強調

from diffusers import DiffusionPipeline
from compel import Compel, ReturnedEmbeddingsType
import torch

# Weighted run that emphasises the ball: Compel turns the weighted prompt
# into embeddings for the base pipeline; the refiner gets the plain prompt.

# Loading options shared by both checkpoints (fp16 variant, safetensors).
half_precision = dict(
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    **half_precision,
)
pipeline = pipeline.to("cuda")

# The refiner is handed the base pipeline's second text encoder and VAE
# rather than loading its own.
refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipeline.text_encoder_2,
    vae=pipeline.vae,
    **half_precision,
)
refiner = refiner.to("cuda")

# Compel is given both tokenizer/text-encoder pairs of the base pipeline;
# a pooled embedding is requested from the second encoder only.
tokenizers = [pipeline.tokenizer, pipeline.tokenizer_2]
text_encoders = [pipeline.text_encoder, pipeline.text_encoder_2]
compel = Compel(
    tokenizer=tokenizers,
    text_encoder=text_encoders,
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)

# Compel weighting syntax: "--" after "cat", "++" after "ball".
prompt_for_compel = "photo, 8k, a cat-- playing with a ball++ in the forest"
conditioning, pooled = compel(prompt_for_compel)

# The refiner stage uses the same sentence without weighting marks.
prompt_for_refiner = "photo, 8k, a cat playing with a ball in the forest"
negative_prompt = "worst quality, low quality"

# Seed the generator with a fixed value before sampling.
seed = 10000
generator = torch.manual_seed(seed)

# Stage 1: base pipeline driven by the Compel embeddings, returning latents.
latents = pipeline(
    prompt_embeds=conditioning,
    pooled_prompt_embeds=pooled,
    negative_prompt=negative_prompt,
    num_inference_steps=30,
    generator=generator,
    output_type="latent",
).images[0]

# Stage 2: pass the latents to the refiner with a leading axis added.
image = refiner(
    prompt=prompt_for_refiner,
    negative_prompt=negative_prompt,
    image=latents[None, :],
).images[0]

image.save("ball.png")

猫を強調

from diffusers import DiffusionPipeline
from compel import Compel, ReturnedEmbeddingsType
import torch

# Weighted run that emphasises the cat: Compel turns the weighted prompt
# into embeddings for the base pipeline; the refiner gets the plain prompt.

# Loading options shared by both checkpoints (fp16 variant, safetensors).
half_precision = dict(
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)

pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    **half_precision,
)
pipeline = pipeline.to("cuda")

# The refiner is handed the base pipeline's second text encoder and VAE
# rather than loading its own.
refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipeline.text_encoder_2,
    vae=pipeline.vae,
    **half_precision,
)
refiner = refiner.to("cuda")

# Compel is given both tokenizer/text-encoder pairs of the base pipeline;
# a pooled embedding is requested from the second encoder only.
tokenizers = [pipeline.tokenizer, pipeline.tokenizer_2]
text_encoders = [pipeline.text_encoder, pipeline.text_encoder_2]
compel = Compel(
    tokenizer=tokenizers,
    text_encoder=text_encoders,
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)

# Compel weighting syntax: "++" after "cat", "--" after "ball".
prompt_for_compel = "photo, 8k, a cat++ playing with a ball-- in the forest"
conditioning, pooled = compel(prompt_for_compel)

# The refiner stage uses the same sentence without weighting marks.
prompt_for_refiner = "photo, 8k, a cat playing with a ball in the forest"
negative_prompt = "worst quality, low quality"

# Seed the generator with a fixed value before sampling.
seed = 10000
generator = torch.manual_seed(seed)

# Stage 1: base pipeline driven by the Compel embeddings, returning latents.
latents = pipeline(
    prompt_embeds=conditioning,
    pooled_prompt_embeds=pooled,
    negative_prompt=negative_prompt,
    num_inference_steps=30,
    generator=generator,
    output_type="latent",
).images[0]

# Stage 2: pass the latents to the refiner with a leading axis added.
image = refiner(
    prompt=prompt_for_refiner,
    negative_prompt=negative_prompt,
    image=latents[None, :],
).images[0]

image.save("cat.png")

補足

negative_promptにも重みづけを使用する場合や、Refinerモデルにも重みづけを使用する場合は、スクリプトがやや複雑になります。

from diffusers import DiffusionPipeline, StableDiffusionXLPipeline, DPMSolverMultistepScheduler
import torch
from compel import Compel, ReturnedEmbeddingsType

# Prompt weighting applied to BOTH the positive and the negative prompt, and
# to BOTH stages (base model and refiner). Each stage gets its own Compel
# instance because the two pipelines are given different text encoders.

# Base model: a single-file SDXL checkpoint stored under ./safetensors/.
model_id = "hadukiMix_v10.safetensors"
pipe = StableDiffusionXLPipeline.from_single_file(
    f"safetensors/{model_id}",
    extract_ema=True,
    variant="fp16",
    torch_dtype=torch.float16
    )

# Replace the default scheduler with DPM-Solver multistep (SDE variant,
# Karras sigmas), reusing the existing scheduler's configuration.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True
    )

# Use diffusers' model CPU offload instead of calling .to("cuda").
pipe.enable_model_cpu_offload()

# Refiner loaded from a local directory; it is handed the base pipeline's
# second text encoder and VAE rather than loading its own.
refiner = DiffusionPipeline.from_pretrained(
    "model/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipe.text_encoder_2,
    vae=pipe.vae,
    variant="fp16",
    torch_dtype=torch.float16,
    use_safetensors=True
    )

refiner.enable_model_cpu_offload()

# Compel for the base stage: both tokenizer/text-encoder pairs, with a pooled
# embedding requested from the second encoder only.
base_compel = Compel(
    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, 
    requires_pooled=[False, True]
    )

# Compel for the refiner stage: only the second tokenizer/text encoder is
# passed, with its pooled embedding requested.
refiner_compel = Compel(
    tokenizer=[refiner.tokenizer_2],
    text_encoder=[refiner.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[True]
    )

# Both prompts carry Compel weighting marks ("+" suffixes).
prompt = "beautiful japanese woman with smile, 45yo+++, 8k, RAW photo, (best quality)+, masterpiece, photo-realistic, focus, professional lighting"
negative_prompt = "(worst quality)++, (low quality)++"

# Embeddings for the base stage.
conditioning, pooled = base_compel(prompt)
negative_conditioning, negative_pooled = base_compel(negative_prompt)

# Embeddings for the refiner stage, built with the refiner's own Compel.
conditioning_refiner, pooled_refiner = refiner_compel(prompt)
negative_conditioning_refiner, negative_pooled_refiner = refiner_compel(negative_prompt)

# Seed the generator with a fixed value before sampling.
seed = 10000
generator = torch.manual_seed(seed)

# Stage 1: base pipeline, returning latents (output_type="latent").
image = pipe(
    prompt_embeds=conditioning,
    pooled_prompt_embeds=pooled,
    negative_prompt_embeds=negative_conditioning,
    negative_pooled_prompt_embeds=negative_pooled,
    generator=generator,
    num_inference_steps=30,
    width=1152,
    height=896,
    output_type="latent"
    ).images[0]

# Stage 2: refiner, fed the latents with a leading axis added and the
# refiner-specific embeddings.
image = refiner(
    prompt_embeds=conditioning_refiner,
    pooled_prompt_embeds=pooled_refiner,
    negative_prompt_embeds=negative_conditioning_refiner,
    negative_pooled_prompt_embeds=negative_pooled_refiner,
    image=image[None, :],
    ).images[0]

image.save("result.png")

Refinerでの使用方法が間違っていれば以下のようなエラーが出ます。

RuntimeError: mat1 and mat2 shapes cannot be multiplied (154x2048 and 1280x768)




このエントリーをはてなブックマークに追加