Sharing quantizing code for both text encoder and the transformer
#84
opened by ebybucuresteanu
Just wanted to share some code with you on how to quantize the text encoder and the transformer.
Did this on an RTX 3060 12 GB and it works great. You'll need bitsandbytes and accelerate installed on top of diffusers and transformers.
Just change the paths to wherever you want.
Generates images in ~24 s.
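For reference, the scripts below assume fairly recent library versions (the Z-Image classes and 4-bit serialization are recent additions). A quick, optional way to check what you have installed:

# Quick environment check (optional): print the versions of the packages the scripts rely on.
import importlib.metadata as md

for pkg in ("torch", "diffusers", "transformers", "accelerate", "bitsandbytes"):
    try:
        print(f"{pkg}: {md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")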
import torch
import os
import shutil
from transformers import AutoModel, BitsAndBytesConfig as TransformersBitsAndBytesConfig
# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
save_path = r"D:\temp\z_turbo_te_bnb"
print("--- 🛠️ Step 1: Text Encoder Quantization ---")
# Clean up old run
if os.path.exists(save_path):
    print(f"⚠️ Cleaning existing folder: {save_path}")
    shutil.rmtree(save_path)
# 1. Define Quantization Config (From your example)
print("1. Configuring Quantization (NF4)...")
quantization_config = TransformersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# 2. Load Text Encoder
print("2. Loading and Quantizing Text Encoder...")
text_encoder = AutoModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# 3. Save to Disk
print(f"3. Saving quantized model to {save_path}...")
text_encoder.save_pretrained(save_path)
print("✅ Text Encoder Saved Successfully.")
import torch
import os
import shutil
from diffusers import ZImageTransformer2DModel, BitsAndBytesConfig as DiffusersBitsAndBytesConfig
# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
save_path = r"D:\temp\z_turbo_tr_bnb"
print("--- 🛠️ Step 2: Transformer Quantization ---")
# Clean up old run
if os.path.exists(save_path):
    print(f"⚠️ Cleaning existing folder: {save_path}")
    shutil.rmtree(save_path)
# 1. Define Quantization Config (From your example)
print("1. Configuring Quantization (NF4 with skips)...")
quantization_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    # This specific skip is crucial for Z-Image
    llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
)
# 2. Load Transformer
print("2. Loading and Quantizing Transformer...")
# Note: Using ZImageTransformer2DModel explicitly to ensure compatibility
transformer = ZImageTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# 3. Save to Disk
print(f"3. Saving quantized model to {save_path}...")
transformer.save_pretrained(save_path)
print("✅ Transformer Saved Successfully.")
And to run it:
import torch
import os
import sys
from diffusers import ZImagePipeline, ZImageTransformer2DModel
from transformers import AutoModel
# --- Configuration ---
model_id = "Tongyi-MAI/Z-Image-Turbo"
te_path = r"D:\temp\z_turbo_te_bnb"
tr_path = r"D:\temp\z_turbo_tr_bnb"
# Verify files exist
if not os.path.exists(te_path) or not os.path.exists(tr_path):
    print("❌ Error: Quantized models not found in D:\\temp.")
    print("   Please run Step 1 and Step 2 scripts first.")
    sys.exit(1)
print("--- ⚡ Z-Image-Turbo (Saved Quants Load) ---")
if torch.cuda.is_available():
    device = "cuda:0"
    print(f"INFO: CUDA available: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("ERROR: CUDA not available.")
# 1. Load the Saved Quants
print("INFO: Loading Transformer from disk...")
# We load the saved folder directly. device_map="auto" handles the 4-bit placement.
transformer = ZImageTransformer2DModel.from_pretrained(
    tr_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("INFO: Loading Text Encoder from disk...")
text_encoder = AutoModel.from_pretrained(
    te_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# 2. Build Pipeline
print("INFO: Building pipeline...")
pipe = ZImagePipeline.from_pretrained(
    model_id,
    transformer=transformer,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16,
)
# 3. Memory Settings (From your example)
# Since you used device_map="auto" above, the models are already on GPU.
# We skip CPU offload to match your "THIS WORKS" example which prefers GPU speed.
# If you crash, uncomment the offload line below.
# pipe.enable_model_cpu_offload()
pipe.to(device)
# 4. Generate
prompt = (
    "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, "
    "red floral forehead pattern. Elaborate high bun, golden phoenix headdress, "
    "red flowers, beads. Holds round folding fan with lady, trees, bird. "
    "Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. "
    "Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), "
    "blurred colorful distant lights."
)
print("INFO: Generating image...")
image = pipe(
    prompt=prompt,
    height=1024,
    width=1024,
    num_inference_steps=9,
    guidance_scale=0.0,
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]
output_filename = "z_image_example_output.png"
image.save(output_filename)
print(f"✅ Success! Saved to {output_filename}")
ebybucuresteanu changed the discussion title from "Quantizing code nf4 for both text encoder and the transformer" to "Sharing quantizing code for both text encoder and the transformer".