Results
Prompt
A photorealistic portrait of a young Japanese woman with long black hair and natural makeup, wearing a casual white blouse, sitting in a modern Tokyo cafe with soft window light
Time
time: 122.74sec
Measured on an RTX 4090.
VRAM usage was around 13 GB.
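As a side note, peak usage can also be read from PyTorch's allocator statistics. The sketch below is my own addition, not part of the original measurement, which may have been taken from nvidia-smi instead.

import torch

# Call after generation has finished; reports the peak memory allocated by
# PyTorch's CUDA allocator (nvidia-smi would also include CUDA context overhead).
peak_gib = torch.cuda.max_memory_allocated() / 1024**3
print(f"peak VRAM (allocator): {peak_gib:.2f} GiB")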
Method
huggingface.co
I downloaded "anzu_flux_Mix_beta01_.safetensors" from the page linked above.
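If you prefer to script the download, something like the following should work with huggingface_hub. The repo_id is a placeholder, since the post only links to the repository rather than naming it here.

from huggingface_hub import hf_hub_download

# "<repo_id>" is a placeholder for the Hugging Face repository linked above.
checkpoint_path = hf_hub_download(
    repo_id="<repo_id>",
    filename="anzu_flux_Mix_beta01_.safetensors",
)
print(checkpoint_path)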
I converted and saved it with the following script.
from pathlib import Path
import json

import torch
from diffusers import FluxTransformer2DModel
from optimum.quanto import freeze, qfloat8, quantize, quantization_map

dtype = torch.bfloat16

# Load the single-file checkpoint as a FLUX transformer in bfloat16
transformer = FluxTransformer2DModel.from_single_file(
    "anzu_flux_Mix_beta01_.safetensors",
    torch_dtype=dtype
)

# Quantize the weights to qfloat8 and freeze them
quantize(transformer, weights=qfloat8)
freeze(transformer)

# Save the quantized weights together with their quantization map
save_directory = "anzu_flux_qfloat8"
transformer.save_pretrained(save_directory)

qmap_name = Path(save_directory, "quanto_qmap.json")
qmap = quantization_map(transformer)
with open(qmap_name, "w", encoding="utf8") as f:
    json.dump(qmap, f, indent=4)
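As far as I understand, the manually written quanto_qmap.json is what allows the quantized layers to be reconstructed when the model is reloaded in the next script.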
Below is the inference script.
import time

import torch
from diffusers import FluxTransformer2DModel, FluxPipeline
from transformers import T5EncoderModel
from optimum.quanto import QuantizedTransformersModel, QuantizedDiffusersModel

start = time.perf_counter()

dtype = torch.bfloat16

# Reload the qfloat8-quantized transformer saved by the conversion script
class QuantizedFluxTransformer2DModel(QuantizedDiffusersModel):
    base_class = FluxTransformer2DModel

transformer = QuantizedFluxTransformer2DModel.from_pretrained(
    "anzu_flux_qfloat8"
).to(dtype=dtype)

# Reload the qfloat8-quantized T5 text encoder
class QuantizedT5EncoderModelForCausalLM(QuantizedTransformersModel):
    auto_class = T5EncoderModel
    auto_class.from_config = auto_class._from_config

text_encoder_2 = QuantizedT5EncoderModelForCausalLM.from_pretrained(
    "t5encodermodel_qfloat8",
).to(dtype=dtype)

# Build the FLUX pipeline, swapping in the quantized components
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer,
    text_encoder_2=text_encoder_2,
    torch_dtype=dtype
)
pipe.enable_model_cpu_offload()

generator = torch.Generator().manual_seed(123)

image = pipe(
    prompt="A photorealistic portrait of a young Japanese woman with long black hair and natural makeup, wearing a casual white blouse, sitting in a modern Tokyo cafe with soft window light",
    width=1360,
    height=768,
    num_inference_steps=50,
    generator=generator,
    guidance_scale=3.5,
).images[0]
image.save("woman.jpg")

end = time.perf_counter()
print(f"time: {(end - start):.2f}sec")
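The enable_model_cpu_offload() call keeps only the component currently in use on the GPU and holds the rest in CPU memory, which is presumably why the run stays at around 13 GB of VRAM at the cost of some extra transfer time.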
For "t5encodermodel_qfloat8" I reused, unchanged, the one I had saved in the post below; a sketch of that step follows the link.
touch-sp.hatenablog.com
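For reference, the T5 encoder in that post was quantized with the same qfloat8 recipe as the transformer above. The following is only my reconstruction of that step under that assumption, not the exact script from the linked article.

import json
from pathlib import Path

import torch
from transformers import T5EncoderModel
from optimum.quanto import freeze, qfloat8, quantize, quantization_map

dtype = torch.bfloat16

# Load the T5 text encoder shipped with FLUX.1-dev
text_encoder_2 = T5EncoderModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="text_encoder_2",
    torch_dtype=dtype
)

# Quantize to qfloat8, freeze, and save alongside its quantization map
quantize(text_encoder_2, weights=qfloat8)
freeze(text_encoder_2)

save_directory = "t5encodermodel_qfloat8"
text_encoder_2.save_pretrained(save_directory)

qmap_name = Path(save_directory, "quanto_qmap.json")
qmap = quantization_map(text_encoder_2)
with open(qmap_name, "w", encoding="utf8") as f:
    json.dump(qmap, f, indent=4)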