Sharing quantizing code for both text encoder and the transformer

#84
by ebybucuresteanu

Just wanted to share some code on how to quantize the text encoder and the transformer.
I did this on an RTX 3060 (12 GB) and it works great.
Just change the paths to wherever you want.
It generates images in ~24 s.
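Note: this assumes bitsandbytes and accelerate are installed, plus diffusers/transformers builds recent enough to include ZImagePipeline; the 4-bit configs and device_map="auto" below depend on them.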

import torch
import os
import shutil
from transformers import AutoModel, BitsAndBytesConfig as TransformersBitsAndBytesConfig

# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
save_path = r"D:\temp\z_turbo_te_bnb"

print("--- 🛠️ Step 1: Text Encoder Quantization ---")

# Clean up old run
if os.path.exists(save_path):
    print(f"⚠️ Cleaning existing folder: {save_path}")
    shutil.rmtree(save_path)

# 1. Define Quantization Config (NF4)
print("1. Configuring Quantization (NF4)...")
quantization_config = TransformersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
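
# NF4 ("normal float 4") packs weights into 4 bits; double quantization also
# compresses the quantization constants for a little extra VRAM saving, while
# matmuls still run in bfloat16 (weights are dequantized on the fly).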

# 2. Load Text Encoder
print("2. Loading and Quantizing Text Encoder...")
text_encoder = AutoModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# 3. Save to Disk
print(f"3. Saving quantized model to {save_path}...")
text_encoder.save_pretrained(save_path)

print("✅ Text Encoder Saved Successfully.")
The transformer is quantized the same way:

import torch
import os
import shutil
from diffusers import ZImageTransformer2DModel, BitsAndBytesConfig as DiffusersBitsAndBytesConfig

# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
save_path = r"D:\temp\z_turbo_tr_bnb"

print("--- 🛠️ Step 2: Transformer Quantization ---")

# Clean up old run
if os.path.exists(save_path):
    print(f"⚠️ Cleaning existing folder: {save_path}")
    shutil.rmtree(save_path)

# 1. Define Quantization Config (NF4, with skipped modules)
print("1. Configuring Quantization (NF4 with skips)...")
quantization_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    # This specific skip is crucial for Z-Image
    llm_int8_skip_modules=["transformer_blocks.0.img_mod"], 
)

# 2. Load Transformer
print("2. Loading and Quantizing Transformer...")
# Note: Using ZImageTransformer2DModel explicitly to ensure compatibility
transformer = ZImageTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# 3. Save to Disk
print(f"3. Saving quantized model to {save_path}...")
transformer.save_pretrained(save_path)

print("✅ Transformer Saved Successfully.")

And to run it:

import torch
import os
import sys
from diffusers import ZImagePipeline, ZImageTransformer2DModel
from transformers import AutoModel

# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
te_path = r"D:\temp\z_turbo_te_bnb"
tr_path = r"D:\temp\z_turbo_tr_bnb"

# Verify files exist
if not os.path.exists(te_path) or not os.path.exists(tr_path):
    print("❌ Error: Quantized models not found in D:\\temp.")
    print("   Please run Step 1 and Step 2 scripts first.")
    sys.exit(1)

print("--- ⚡ Z-Image-Turbo (Saved Quants Load) ---")

if torch.cuda.is_available():
    device = "cuda:0"
    print(f"INFO: CUDA available: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("ERROR: CUDA not available.")

# 1. Load the Saved Quants
print("INFO: Loading Transformer from disk...")
# We load the saved folder directly. device_map="auto" handles the 4-bit placement.
transformer = ZImageTransformer2DModel.from_pretrained(
    tr_path,
    torch_dtype=torch.bfloat16,
    device_map="auto" 
)

print("INFO: Loading Text Encoder from disk...")
text_encoder = AutoModel.from_pretrained(
    te_path,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# 2. Build Pipeline
print("INFO: Building pipeline...")
pipe = ZImagePipeline.from_pretrained(
    model_id,
    transformer=transformer,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16,
)

# 3. Memory Settings
# device_map="auto" above already placed the models on the GPU, so we skip
# CPU offload and keep everything resident for speed. If you run out of VRAM,
# uncomment the line below and remove the pipe.to(device) call.
# pipe.enable_model_cpu_offload()
pipe.to(device)
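
# Optional: report how much VRAM is in use once all components are placed.
print(f"INFO: VRAM allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GiB")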

# 4. Generate
prompt = (
    "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, "
    "red floral forehead pattern. Elaborate high bun, golden phoenix headdress, "
    "red flowers, beads. Holds round folding fan with lady, trees, bird. "
    "Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. "
    "Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), "
    "blurred colorful distant lights."
)

print("INFO: Generating image...")
image = pipe(
    prompt=prompt,
    height=1024,
    width=1024,
    num_inference_steps=9, 
    guidance_scale=0.0, 
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]

output_filename = "z_image_example_output.png"
image.save(output_filename)
print(f"✅ Success! Saved to {output_filename}")