anzu_flux を Diffusers から使用する

結果

プロンプト

A photorealistic portrait of a young Japanese woman with long black hair and natural makeup, wearing a casual white blouse, sitting in a modern Tokyo cafe with soft window light


時間

time: 122.74sec

RTX 4090で計測しています(モデルの読み込みから画像の保存までを含む時間です)。

VRAMは13GB程度使用していました。

方法

huggingface.co
こちらから「anzu_flux_Mix_beta01_.safetensors」をダウンロードしました。

以下のスクリプトで qfloat8 に量子化して保存しました。

from pathlib import Path
import json
import torch
from diffusers import FluxTransformer2DModel
from optimum.quanto import freeze, qfloat8, quantize, quantization_map

# Load the single-file checkpoint as a Diffusers FLUX transformer in bfloat16.
weight_dtype = torch.bfloat16
flux_transformer = FluxTransformer2DModel.from_single_file(
    "anzu_flux_Mix_beta01_.safetensors", torch_dtype=weight_dtype
)

# Quantize the weights to qfloat8, then freeze the quantized model.
quantize(flux_transformer, weights=qfloat8)
freeze(flux_transformer)

# Save the quantized transformer into its output directory.
output_dir = "anzu_flux_qfloat8"
flux_transformer.save_pretrained(output_dir)

# Write the quantization map as JSON next to the saved weights.
qmap_path = Path(output_dir) / "quanto_qmap.json"
layer_qmap = quantization_map(flux_transformer)
with qmap_path.open("w", encoding="utf8") as qmap_file:
    json.dump(layer_qmap, qmap_file, indent=4)



以下が実行スクリプトです。

import time
import torch
from diffusers import FluxTransformer2DModel, FluxPipeline
from transformers import T5EncoderModel
from optimum.quanto import QuantizedTransformersModel, QuantizedDiffusersModel

# Start the timer before any model is loaded, so the reported time covers
# loading, inference and saving the image.
start = time.perf_counter()

compute_dtype = torch.bfloat16


# Quanto wrapper class whose base class is the Diffusers FLUX transformer.
class QuantizedFluxTransformer2DModel(QuantizedDiffusersModel):
    base_class = FluxTransformer2DModel


# Load the quantized transformer saved in "anzu_flux_qfloat8" and cast it.
flux_transformer = QuantizedFluxTransformer2DModel.from_pretrained("anzu_flux_qfloat8")
flux_transformer = flux_transformer.to(dtype=compute_dtype)


# Quanto wrapper class for the T5 text encoder.
class QuantizedT5EncoderModelForCausalLM(QuantizedTransformersModel):
    auto_class = T5EncoderModel
    # Expose the private `_from_config` under the public name `from_config`.
    # NOTE(review): presumably the quanto loader calls `auto_class.from_config`
    # -- confirm against optimum-quanto.
    T5EncoderModel.from_config = T5EncoderModel._from_config


# Load the quantized T5 encoder saved in "t5encodermodel_qfloat8" and cast it.
t5_encoder = QuantizedT5EncoderModelForCausalLM.from_pretrained("t5encodermodel_qfloat8")
t5_encoder = t5_encoder.to(dtype=compute_dtype)

# Build the FLUX.1-dev pipeline with the two quantized components swapped in.
flux_pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=flux_transformer,
    text_encoder_2=t5_encoder,
    torch_dtype=compute_dtype,
)
flux_pipeline.enable_model_cpu_offload()

# Fixed seed for the generator passed to the pipeline.
seeded_generator = torch.Generator().manual_seed(123)
prompt_text = "A photorealistic portrait of a young Japanese woman with long black hair and natural makeup, wearing a casual white blouse, sitting in a modern Tokyo cafe with soft window light"

pipeline_output = flux_pipeline(
    prompt=prompt_text,
    width=1360,
    height=768,
    num_inference_steps=50,
    generator=seeded_generator,
    guidance_scale=3.5,
)
# Keep only the first generated image and write it to disk.
first_image = pipeline_output.images[0]
first_image.save("woman.jpg")

end = time.perf_counter()
print(f"time: {(end - start):.2f}sec")

「t5encodermodel_qfloat8」はこちらで保存したものをそのまま使用しました。
touch-sp.hatenablog.com



このエントリーをはてなブックマークに追加