Sharing quantizing code for both text encoder and the transformer
#84
opened by ebybucuresteanu
Just wanted to share some code with you on how to quantize the text encoder and the transformer.
Did this on an RTX 3060 12 GB and it works great. You'll need bitsandbytes and accelerate installed on top of diffusers and transformers.
Just change the paths to wherever you want.
Generates images in ~24 s.
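For reference, the scripts below assume fairly recent library versions (the Z-Image classes and 4-bit serialization are recent additions). A quick, optional way to check what you have installed:

# Quick environment check (optional): print the versions of the packages the scripts rely on.
import importlib.metadata as md

for pkg in ("torch", "diffusers", "transformers", "accelerate", "bitsandbytes"):
    try:
        print(f"{pkg}: {md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")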
import torch
import os
import shutil
from transformers import AutoModel, BitsAndBytesConfig as TransformersBitsAndBytesConfig
# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
save_path = r"D:\temp\z_turbo_te_bnb"
print("--- 🛠️ Step 1: Text Encoder Quantization ---")
# Clean up old run
if os.path.exists(save_path):
    print(f"⚠️ Cleaning existing folder: {save_path}")
    shutil.rmtree(save_path)
# 1. Define Quantization Config (From your example)
print("1. Configuring Quantization (NF4)...")
quantization_config = TransformersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# 2. Load Text Encoder
print("2. Loading and Quantizing Text Encoder...")
text_encoder = AutoModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# 3. Save to Disk
print(f"3. Saving quantized model to {save_path}...")
text_encoder.save_pretrained(save_path)
print("✅ Text Encoder Saved Successfully.")
import torch
import os
import shutil
from diffusers import ZImageTransformer2DModel, BitsAndBytesConfig as DiffusersBitsAndBytesConfig
# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
save_path = r"D:\temp\z_turbo_tr_bnb"
print("--- 🛠️ Step 2: Transformer Quantization ---")
# Clean up old run
if os.path.exists(save_path):
    print(f"⚠️ Cleaning existing folder: {save_path}")
    shutil.rmtree(save_path)
# 1. Define Quantization Config (From your example)
print("1. Configuring Quantization (NF4 with skips)...")
quantization_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    # This specific skip is crucial for Z-Image
    llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
)
# 2. Load Transformer
print("2. Loading and Quantizing Transformer...")
# Note: Using ZImageTransformer2DModel explicitly to ensure compatibility
transformer = ZImageTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# 3. Save to Disk
print(f"3. Saving quantized model to {save_path}...")
transformer.save_pretrained(save_path)
print("✅ Transformer Saved Successfully.")
And to run it:
import torch
import os
import sys
from diffusers import ZImagePipeline, ZImageTransformer2DModel
from transformers import AutoModel
# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
te_path = r"D:\temp\z_turbo_te_bnb"
tr_path = r"D:\temp\z_turbo_tr_bnb"
# Verify files exist
if not os.path.exists(te_path) or not os.path.exists(tr_path):
    print("❌ Error: Quantized models not found in D:\\temp.")
    print("   Please run Step 1 and Step 2 scripts first.")
    sys.exit(1)
print("--- ⚡ Z-Image-Turbo (Saved Quants Load) ---")
if torch.cuda.is_available():
    device = "cuda:0"
    print(f"INFO: CUDA available: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("ERROR: CUDA not available.")
# 1. Load the Saved Quants
print("INFO: Loading Transformer from disk...")
# We load the saved folder directly. device_map="auto" handles the 4-bit placement.
transformer = ZImageTransformer2DModel.from_pretrained(
    tr_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("INFO: Loading Text Encoder from disk...")
text_encoder = AutoModel.from_pretrained(
    te_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# 2. Build Pipeline
print("INFO: Building pipeline...")
pipe = ZImagePipeline.from_pretrained(
    model_id,
    transformer=transformer,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16,
)
# 3. Memory Settings (From your example)
# Since you used device_map="auto" above, the models are already on GPU.
# We skip CPU offload to match your "THIS WORKS" example which prefers GPU speed.
# If you crash, uncomment the offload line below.
# pipe.enable_model_cpu_offload()
pipe.to(device)
# 4. Generate
prompt = (
    "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, "
    "red floral forehead pattern. Elaborate high bun, golden phoenix headdress, "
    "red flowers, beads. Holds round folding fan with lady, trees, bird. "
    "Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. "
    "Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), "
    "blurred colorful distant lights."
)
print("INFO: Generating image...")
image = pipe(
    prompt=prompt,
    height=1024,
    width=1024,
    num_inference_steps=9,
    guidance_scale=0.0,
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]
output_filename = "z_image_example_output.png"
image.save(output_filename)
print(f"✅ Success! Saved to {output_filename}")
ebybucuresteanu changed the discussion title from "Quantizing code nf4 for both text encoder and the transformer" to "Sharing quantizing code for both text encoder and the transformer".