15. Advanced Image Generation

15. Advanced Image Generation

Overview

This lesson covers image generation techniques that build on and extend Stable Diffusion. We explore practical techniques including SDXL, ControlNet, IP-Adapter, and Latent Consistency Models.


1. SDXL (Stable Diffusion XL)

1.1 Architecture Improvements

┌──────────────────────────────────────────────────────────────────┐
│                    SDXL vs SD 1.5 Comparison                     │
├──────────────────────────────────────────────────────────────────┤
│                                                                  │
│  SD 1.5:                                                         │
│  - UNet: 860M params                                            │
│  - Text Encoder: CLIP ViT-L/14 (77 tokens)                      │
│  - Resolution: 512×512                                          │
│  - VAE: 8× downscale                                            │
│                                                                  │
│  SDXL:                                                           │
│  - UNet: 2.6B params (3x increase)                              │
│  - Text Encoder: CLIP ViT-L + OpenCLIP ViT-bigG (dual)          │
│  - Resolution: 1024×1024                                        │
│  - VAE: Improved VAE-FT                                         │
│  - Refiner model (optional)                                      │
│                                                                  │
│  Key Improvements:                                               │
│  - Richer text understanding (dual encoder)                     │
│  - High resolution generation (4x pixels)                       │
│  - Micro-conditioning (size, aspect ratio)                      │
│                                                                  │
└──────────────────────────────────────────────────────────────────┘

1.2 Using SDXL

from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
import torch

def sdxl_generation():
    """Generate one 1024x1024 image with the SDXL base model.

    Loads the fp16 safetensors variant of
    ``stabilityai/stable-diffusion-xl-base-1.0``, enables model CPU offload
    and VAE slicing, then runs a 30-step text-to-image generation with a
    positive and a negative prompt.

    Returns:
        The first image of the pipeline output.
    """

    # Load base model. The pipeline is deliberately NOT moved to "cuda":
    # enable_model_cpu_offload() handles device placement itself, and
    # putting the whole pipeline on the GPU first gives up most of the
    # memory savings the offload is supposed to provide.
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True,
    )

    # Memory optimization
    pipe.enable_model_cpu_offload()
    pipe.enable_vae_slicing()

    # Generation
    prompt = "A majestic lion in a savanna at sunset, photorealistic, 8k"
    negative_prompt = "blurry, low quality, distorted"

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=30,
        guidance_scale=7.5,
        height=1024,
        width=1024,
    ).images[0]

    return image


def sdxl_with_refiner():
    """Run the two-stage SDXL workflow: base model first, refiner second.

    The base pipeline handles the first 80% of a 40-step schedule and hands
    its latents to the refiner, which runs the remaining 20%.

    Returns:
        The first image produced by the refiner pipeline.
    """
    dtype = torch.float16
    total_steps = 40
    # Fraction of the schedule at which the base stops and the refiner starts.
    handoff = 0.8

    # Base
    base_pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=dtype
    )
    base_pipe = base_pipe.to("cuda")

    # Refiner
    refiner_pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=dtype
    )
    refiner_pipe = refiner_pipe.to("cuda")

    text = "A cyberpunk city at night, neon lights, rain"

    # Stage 1: the base stops at the hand-off point and returns latents
    # rather than decoded images.
    stage_one = base_pipe(
        prompt=text,
        num_inference_steps=total_steps,
        denoising_end=handoff,
        output_type="latent",
    )
    latents = stage_one.images

    # Stage 2: the refiner picks up from the same point in the schedule.
    stage_two = refiner_pipe(
        prompt=text,
        image=latents,
        num_inference_steps=total_steps,
        denoising_start=handoff,
    )
    return stage_two.images[0]

1.3 Micro-Conditioning

def sdxl_micro_conditioning():
    """Generate the same prompt at several aspect ratios with SDXL.

    Each call passes the requested resolution as the ``original_size`` and
    ``target_size`` micro-conditioning hints, both in (height, width)
    order, with a crop offset of (0, 0).

    Returns:
        A list with one image per resolution, in the order listed below.
    """
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
    ).to("cuda")

    subject = "A portrait of a woman"

    # (width, height) pairs
    resolutions = (
        (1024, 1024),  # 1:1
        (1152, 896),   # 9:7 (close to 4:3)
        (896, 1152),   # 7:9 (close to 3:4)
        (1216, 832),   # ~3:2
        (832, 1216),   # ~2:3
    )

    def render(w, h):
        # The same (height, width) pair is used for both size hints.
        size_hint = (h, w)
        result = pipe(
            prompt=subject,
            height=h,
            width=w,
            original_size=size_hint,
            target_size=size_hint,
            crops_coords_top_left=(0, 0),
        )
        return result.images[0]

    return [render(w, h) for w, h in resolutions]

2. ControlNet

2.1 Concept

ControlNet: Adding Conditional Control

Inject additional control signals without modifying the original Diffusion model

Supported conditions:
- Canny Edge (edges)
- Depth Map (depth)
- Pose (pose)
- Segmentation (segmentation)
- Normal Map (normals)
- Scribble (scribble)
- Line Art

How it works:
1. Condition image → Condition encoder
2. Encoded condition → Inject into UNet (zero convolution)
3. Freeze original model weights, train only ControlNet

2.2 Implementation and Usage

from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler
)
from controlnet_aux import CannyDetector, OpenposeDetector
import cv2
import numpy as np

class ControlNetGenerator:
    """ControlNet-based image generation on top of a Stable Diffusion model.

    Attributes:
        base_model: Hub id of the Stable Diffusion checkpoint used for every
            pipeline this object builds.
        controlnets: Cache of loaded ``ControlNetModel`` objects keyed by
            control type, filled lazily by ``load_controlnet``.
        detectors: Annotators keyed by control type. 'canny' and 'openpose'
            are created eagerly; 'depth' is added on first use by
            ``extract_depth``.
    """

    # Hub repositories of the ControlNet checkpoints, keyed by control type.
    _CONTROLNET_MODELS = {
        'canny': "lllyasviel/sd-controlnet-canny",
        'depth': "lllyasviel/sd-controlnet-depth",
        'openpose': "lllyasviel/sd-controlnet-openpose",
        'scribble': "lllyasviel/sd-controlnet-scribble",
        'seg': "lllyasviel/sd-controlnet-seg",
    }

    # Control types for which this class can derive a condition image.
    _EXTRACTABLE = ('canny', 'depth', 'openpose')

    def __init__(self, base_model: str = "runwayml/stable-diffusion-v1-5"):
        self.base_model = base_model
        self.controlnets = {}
        self.detectors = {
            'canny': CannyDetector(),
            # OpenposeDetector cannot be built with a bare constructor call:
            # it needs its pose-estimation weights, which from_pretrained
            # fetches from the annotator repository.
            'openpose': OpenposeDetector.from_pretrained("lllyasviel/Annotators"),
        }

    def load_controlnet(self, control_type: str):
        """Return the ControlNet for ``control_type``, loading it on first use.

        Args:
            control_type: One of 'canny', 'depth', 'openpose', 'scribble',
                'seg'.

        Returns:
            The cached ``ControlNetModel`` (fp16).

        Raises:
            KeyError: If ``control_type`` is neither cached nor a known type.
        """
        if control_type not in self.controlnets:
            if control_type not in self._CONTROLNET_MODELS:
                raise KeyError(
                    f"Unknown control type {control_type!r}; "
                    f"expected one of {sorted(self._CONTROLNET_MODELS)}"
                )
            self.controlnets[control_type] = ControlNetModel.from_pretrained(
                self._CONTROLNET_MODELS[control_type],
                torch_dtype=torch.float16
            )

        return self.controlnets[control_type]

    @staticmethod
    def _to_pil(image):
        """Convert a numpy array to a PIL image; return anything else as-is."""
        if isinstance(image, np.ndarray):
            from PIL import Image
            return Image.fromarray(image)
        return image

    @staticmethod
    def _canny_edges(image, low_threshold: int = 100, high_threshold: int = 200):
        """Return a 3-channel Canny edge map of ``image``."""
        edges = cv2.Canny(image, low_threshold, high_threshold)
        # Replicate the single edge channel three times.
        return np.stack([edges] * 3, axis=-1)

    def _build_pipeline(self, controlnet, use_unipc: bool = False):
        """Build an fp16 ControlNet pipeline on CUDA for ``self.base_model``.

        ``controlnet`` may be a single model or a list of models. When
        ``use_unipc`` is true the scheduler is replaced by UniPC.
        """
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            self.base_model,
            controlnet=controlnet,
            torch_dtype=torch.float16
        ).to("cuda")
        if use_unipc:
            pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
        return pipe

    def extract_depth(self, image):
        """Estimate a depth map for ``image`` with the MiDaS annotator.

        The annotator is created on first use and kept in ``self.detectors``
        under the key 'depth'.
        """
        if 'depth' not in self.detectors:
            from controlnet_aux import MidasDetector
            self.detectors['depth'] = MidasDetector.from_pretrained("lllyasviel/Annotators")
        return self.detectors['depth'](image)

    def _condition_image(self, control_type: str, image):
        """Derive the condition image for one supported control type."""
        if control_type == 'canny':
            return self._canny_edges(image)
        if control_type == 'depth':
            return self.extract_depth(image)
        if control_type == 'openpose':
            return self.detectors['openpose'](image)
        raise ValueError(f"No condition extractor for control type {control_type!r}")

    def generate_with_canny(
        self,
        image: np.ndarray,
        prompt: str,
        low_threshold: int = 100,
        high_threshold: int = 200
    ):
        """Generate an image whose composition follows the Canny edges of ``image``.

        Args:
            image: Source image as an 8-bit array.
            prompt: Text prompt.
            low_threshold: Lower Canny hysteresis threshold.
            high_threshold: Upper Canny hysteresis threshold.

        Returns:
            ``(generated_image, canny_image)`` where ``canny_image`` is the
            3-channel edge map as a numpy array.
        """

        # Extract Canny edges
        canny_image = self._canny_edges(image, low_threshold, high_threshold)

        # Load ControlNet
        controlnet = self.load_controlnet('canny')

        # Pipeline
        pipe = self._build_pipeline(controlnet, use_unipc=True)

        # Generate. The edge map is handed over as a PIL image, the form the
        # diffusers ControlNet examples use; a raw 0-255 array is not
        # guaranteed to be rescaled by the pipeline's preprocessing.
        output = pipe(
            prompt=prompt,
            image=self._to_pil(canny_image),
            num_inference_steps=20,
            guidance_scale=7.5,
            controlnet_conditioning_scale=1.0,  # Control strength
        ).images[0]

        return output, canny_image

    def generate_with_pose(self, image: np.ndarray, prompt: str):
        """Generate an image that follows the pose detected in ``image``.

        NOTE(review): the OpenPose annotator presumably expects RGB input;
        images read with cv2.imread are BGR - confirm and convert if needed.

        Returns:
            ``(generated_image, pose_image)`` where ``pose_image`` is the
            annotator output.
        """

        # Extract OpenPose
        pose_image = self.detectors['openpose'](image)

        controlnet = self.load_controlnet('openpose')

        pipe = self._build_pipeline(controlnet)

        output = pipe(
            prompt=prompt,
            image=self._to_pil(pose_image),
            num_inference_steps=20,
        ).images[0]

        return output, pose_image

    def multi_controlnet(
        self,
        image: np.ndarray,
        prompt: str,
        control_types: "list | tuple" = ('canny', 'depth'),
        conditioning_scales: "list | tuple" = (1.0, 0.5)
    ):
        """Generate an image guided by several ControlNets at once.

        Args:
            image: Source image from which every condition image is derived.
            prompt: Text prompt.
            control_types: Control types to combine; each must be one of
                'canny', 'depth', 'openpose'.
            conditioning_scales: Strength for each ControlNet, in the same
                order as ``control_types``.

        Returns:
            The generated image.

        Raises:
            ValueError: If a control type has no extractor here, or if
                ``conditioning_scales`` and ``control_types`` differ in
                length.
        """
        control_types = list(control_types)
        conditioning_scales = list(conditioning_scales)

        # Validate before any model is downloaded or loaded.
        unsupported = [ct for ct in control_types if ct not in self._EXTRACTABLE]
        if unsupported:
            raise ValueError(
                f"Cannot derive condition images for {unsupported}; "
                f"supported types are {list(self._EXTRACTABLE)}"
            )
        if len(conditioning_scales) != len(control_types):
            raise ValueError(
                f"Got {len(conditioning_scales)} conditioning scales for "
                f"{len(control_types)} control types; they must match"
            )

        # Load multiple ControlNets
        controlnets = [self.load_controlnet(ct) for ct in control_types]

        # Extract condition images, one per ControlNet, in the same order
        control_images = [
            self._to_pil(self._condition_image(ct, image))
            for ct in control_types
        ]

        # Multi ControlNet pipeline
        pipe = self._build_pipeline(controlnets)

        output = pipe(
            prompt=prompt,
            image=control_images,
            controlnet_conditioning_scale=conditioning_scales,  # Strength for each
        ).images[0]

        return output


# Usage example
generator = ControlNetGenerator()

# Keep composition from reference image while changing style
reference_image = cv2.imread("reference.jpg")
if reference_image is None:
    # cv2.imread does not raise on a missing or unreadable file; it returns
    # None, which would otherwise only fail later inside cv2.Canny.
    raise FileNotFoundError("Could not read image file 'reference.jpg'")
result, canny = generator.generate_with_canny(
    reference_image,
    "A beautiful anime girl, studio ghibli style"
)

3. IP-Adapter (Image Prompt Adapter)

3.1 Concept

IP-Adapter: Using Images as Prompts

Guide style and content with reference images, used instead of or alongside a text prompt

┌────────────────────────────────────────────────────────────┐
│                    IP-Adapter Structure                     │
├────────────────────────────────────────────────────────────┤
│                                                            │
│  Reference Image → CLIP Image Encoder → Image Features     │
│                         ↓                                  │
│                  Projection Layer (trainable)              │
│                         ↓                                  │
│              Inject into Cross-Attention                   │
│                         ↓                                  │
│  Text Prompt + Image Features → UNet → Generated Image    │
│                                                            │
│  Use cases:                                                │
│  - Style transfer (style reference)                        │
│  - Face similarity preservation (face reference)           │
│  - Composition/color reference (composition)               │
│                                                            │
└────────────────────────────────────────────────────────────┘

3.2 Usage

from diffusers import StableDiffusionPipeline
from transformers import CLIPVisionModelWithProjection
import torch

def use_ip_adapter():
    """Generate a portrait steered by a reference image through IP-Adapter.

    Loads Stable Diffusion 1.5, attaches the ``ip-adapter_sd15.bin`` weights
    with an adapter scale of 0.6, and conditions a 30-step generation on
    ``style_reference.jpg`` together with a text prompt.

    Returns:
        The first generated image.
    """
    from PIL import Image

    # Base pipeline
    sd_pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    )
    sd_pipe = sd_pipe.to("cuda")

    # Attach the adapter weights and set how strongly the reference image
    # influences the result.
    sd_pipe.load_ip_adapter(
        "h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin"
    )
    sd_pipe.set_ip_adapter_scale(0.6)

    # Reference image
    reference = Image.open("style_reference.jpg")

    # Generate
    generated = sd_pipe(
        prompt="A portrait of a woman",
        ip_adapter_image=reference,
        num_inference_steps=30,
    )
    return generated.images[0]


def ip_adapter_face():
    """Render one reference face in several styles with the face IP-Adapter.

    Loads Stable Diffusion 1.5 with the ``ip-adapter-full-face_sd15.bin``
    weights (adapter scale 0.7) and runs one 30-step generation per prompt,
    each conditioned on ``face_reference.jpg``.

    Returns:
        A list with one generated image per prompt, in prompt order.
    """
    # Imported locally: no module-level import brings ``Image`` into scope.
    from PIL import Image

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    # Face-specific IP-Adapter
    pipe.load_ip_adapter(
        "h94/IP-Adapter",
        subfolder="models",
        weight_name="ip-adapter-full-face_sd15.bin"
    )

    pipe.set_ip_adapter_scale(0.7)

    # Reference face
    face_image = Image.open("face_reference.jpg")

    # Generate in various styles
    prompts = [
        "A person in a business suit, professional photo",
        "A person as a superhero, comic book style",
        "A person in ancient Rome, oil painting"
    ]

    return [
        pipe(
            prompt=prompt,
            ip_adapter_image=face_image,
            num_inference_steps=30,
        ).images[0]
        for prompt in prompts
    ]


def ip_adapter_plus():
    """Generate a landscape conditioned on two style images via IP-Adapter Plus.

    Loads Stable Diffusion 1.5 with the ``ip-adapter-plus_sd15.bin`` weights
    and runs a 30-step generation conditioned on ``style1.jpg`` and
    ``style2.jpg``.

    Returns:
        The first generated image.
    """
    # Imported locally: no module-level import brings ``Image`` into scope.
    from PIL import Image

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    # Plus version (finer control)
    pipe.load_ip_adapter(
        "h94/IP-Adapter",
        subfolder="models",
        weight_name="ip-adapter-plus_sd15.bin"
    )

    # Multiple image references
    style_images = [
        Image.open("style1.jpg"),
        Image.open("style2.jpg")
    ]

    # The pipeline expects one entry per loaded IP-Adapter. Only one adapter
    # is loaded, so both style images go into a single nested entry; a flat
    # two-element list would be read as inputs for two adapters.
    output = pipe(
        prompt="A landscape",
        ip_adapter_image=[style_images],
        num_inference_steps=30,
    ).images[0]

    return output

4. Latent Consistency Models (LCM)

4.1 Concept

LCM: Ultra-fast Image Generation

Traditional Diffusion: Requires 20-50 steps
LCM: High-quality generation in 2-4 steps

How it works:
1. Distill original Diffusion model with consistency objective
2. Map any noise level directly to clean image
3. Generate with single or few steps

Advantages:
- Real-time generation possible (< 1 second)
- Interactive applications
- Can run on low-power devices

4.2 Usage

from diffusers import (
    DiffusionPipeline,
    LCMScheduler,
    AutoPipelineForText2Image
)

def lcm_generation():
    """Generate an image in 4 steps with SDXL plus the LCM-LoRA weights.

    Loads the fp16 SDXL base checkpoint, applies the
    ``latent-consistency/lcm-lora-sdxl`` LoRA, swaps the scheduler for
    ``LCMScheduler`` and samples with a guidance scale of 1.5.

    Returns:
        The first generated image.
    """
    # LCM-LoRA is applied on top of an existing base model.
    sdxl = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        variant="fp16",
    )
    sdxl = sdxl.to("cuda")

    sdxl.load_lora_weights("latent-consistency/lcm-lora-sdxl")

    # Rebuild the scheduler from the current scheduler's configuration.
    sdxl.scheduler = LCMScheduler.from_config(sdxl.scheduler.config)

    # Few steps and low guidance.
    sampling = {
        "prompt": "A beautiful sunset over mountains",
        "num_inference_steps": 4,
        "guidance_scale": 1.5,
    }
    return sdxl(**sampling).images[0]


def lcm_real_time():
    """Time 4-step LCM generations for a handful of prompts.

    Loads ``SimianLuo/LCM_Dreamshaper_v7`` in fp16 on CUDA with
    ``LCMScheduler`` and, for each prompt, prints the elapsed time of one
    512x512 generation as ``'<prompt>': <seconds>s``. The generated images
    are discarded and nothing is returned.
    """
    import time

    pipe = DiffusionPipeline.from_pretrained(
        "SimianLuo/LCM_Dreamshaper_v7",
        torch_dtype=torch.float16
    ).to("cuda")

    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

    prompts = [
        "A red apple",
        "A blue car",
        "A green forest",
        "A yellow sun"
    ]

    for prompt in prompts:
        # perf_counter() is a monotonic, high-resolution clock; time.time()
        # can jump if the system clock is adjusted during the measurement.
        start = time.perf_counter()
        pipe(
            prompt=prompt,
            num_inference_steps=4,
            guidance_scale=1.0,
            height=512,
            width=512
        )
        elapsed = time.perf_counter() - start

        print(f"'{prompt}': {elapsed:.2f}s")


def turbo_generation():
    """Generate an image with SDXL-Turbo in a single inference step.

    Loads the fp16 variant of ``stabilityai/sdxl-turbo`` on CUDA and samples
    once with ``guidance_scale=0.0``.

    Returns:
        The first generated image.
    """
    turbo = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
    )
    turbo = turbo.to("cuda")

    # One step, guidance switched off.
    result = turbo(
        prompt="A cinematic shot of a cat wearing a hat",
        num_inference_steps=1,
        guidance_scale=0.0,
    )
    return result.images[0]

5. Advanced Techniques

5.1 Inpainting & Outpainting

from diffusers import StableDiffusionInpaintPipeline

def inpainting_example():
    """Repaint the masked region of an image from a text prompt.

    Loads ``runwayml/stable-diffusion-inpainting`` in fp16 on CUDA and runs
    a 30-step inpainting of ``original.jpg`` using ``mask.png``.

    Returns:
        The first generated image.
    """
    # Imported locally: no module-level import brings ``Image`` into scope.
    from PIL import Image

    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16
    ).to("cuda")

    # Original image and mask
    image = Image.open("original.jpg")
    mask = Image.open("mask.png")  # White = region to edit

    result = pipe(
        prompt="A cat sitting on the couch",
        image=image,
        mask_image=mask,
        num_inference_steps=30,
    ).images[0]

    return result


def outpainting_example():
    """Extend an image beyond its borders with the inpainting pipeline.

    ``original.jpg`` is pasted at the center of a gray 1024x1024 canvas. The
    mask is white everywhere except over the pasted image, so only the
    surrounding area is generated.

    Returns:
        The first generated image.
    """
    # Imported locally: no module-level import brings ``Image`` into scope.
    from PIL import Image

    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16
    ).to("cuda")

    # Place original image on canvas
    original = Image.open("original.jpg")
    canvas_size = (1024, 1024)
    canvas = Image.new("RGB", canvas_size, (128, 128, 128))

    # Center placement
    left = (canvas_size[0] - original.width) // 2
    top = (canvas_size[1] - original.height) // 2
    canvas.paste(original, (left, top))

    # Mask: white outside original region. The black rectangle is given as
    # one (left, upper, right, lower) box, the second argument of paste();
    # a third positional argument would be taken as a transparency mask.
    mask = Image.new("L", canvas_size, 255)
    mask.paste(0, (left, top, left + original.width, top + original.height))

    # Extend
    result = pipe(
        prompt="A beautiful landscape extending the scene",
        image=canvas,
        mask_image=mask,
    ).images[0]

    return result

5.2 Image-to-Image Translation

from diffusers import StableDiffusionImg2ImgPipeline

def style_transfer():
    """Restyle a photo as an impressionist oil painting via img2img.

    Loads Stable Diffusion 1.5 as an image-to-image pipeline in fp16 on
    CUDA, resizes ``photo.jpg`` to 512x512 and runs a 30-step generation
    with ``strength=0.75``.

    Returns:
        The first generated image.
    """
    # Imported locally: no module-level import brings ``Image`` into scope.
    from PIL import Image

    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    # Input image. Converted to RGB so grayscale, palette or RGBA files reach
    # the pipeline as three channels.
    init_image = Image.open("photo.jpg").convert("RGB").resize((512, 512))

    # Style transformation
    result = pipe(
        prompt="oil painting, impressionist style, vibrant colors",
        image=init_image,
        strength=0.75,  # 0~1, higher = more change
        num_inference_steps=30,
    ).images[0]

    return result

5.3 Text Embedding Manipulation

def prompt_weighting():
    """Generate images from prompts that use Compel's weighting syntax.

    Each prompt is turned into a conditioning tensor by Compel and passed to
    Stable Diffusion 1.5 as ``prompt_embeds`` for a 30-step generation.

    Returns:
        A list with one generated image per prompt, in prompt order.
    """
    from compel import Compel

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)

    # Weight syntax: each '+' multiplies the weight by 1.1 and each '-' by
    # 0.9. An explicit weight is a number written after the parenthesized
    # term, e.g. "(red)1.5"; the colon form "(red:1.5)" belongs to other
    # front-ends and is not Compel syntax.
    prompts = [
        "a (beautiful)++ sunset",           # ++ = 1.21x
        "a (beautiful)+++ sunset",          # +++ = 1.33x
        "a (ugly)-- sunset",                # -- = 0.81x
        "a (red)1.5 and (blue)0.5 sunset"   # Explicit weights
    ]

    images = []
    for prompt in prompts:
        conditioning = compel.build_conditioning_tensor(prompt)

        image = pipe(
            prompt_embeds=conditioning,
            num_inference_steps=30,
        ).images[0]
        images.append(image)

    # Return the results instead of discarding them.
    return images


def prompt_blending():
    """Generate an image from the average of two prompt embeddings.

    Both prompts are encoded with Compel, their conditioning tensors are
    averaged element-wise (a 50:50 blend), and the result is passed to
    Stable Diffusion 1.5 as ``prompt_embeds`` for a 30-step generation.

    Returns:
        The first generated image.
    """
    from compel import Compel

    sd_pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    )
    sd_pipe = sd_pipe.to("cuda")

    encoder = Compel(
        tokenizer=sd_pipe.tokenizer,
        text_encoder=sd_pipe.text_encoder,
    )

    # Encode the two prompts in order.
    sources = ("a photo of a cat", "a photo of a dog")
    cat_cond, dog_cond = [encoder.build_conditioning_tensor(text) for text in sources]

    # Equal-weight mix of the two embeddings.
    mixed = (cat_cond + dog_cond) / 2

    outcome = sd_pipe(prompt_embeds=mixed, num_inference_steps=30)
    return outcome.images[0]

6. Optimization Techniques

6.1 Memory Optimization

def optimize_memory():
    """Build an SDXL pipeline with the common memory optimizations enabled.

    Enables model CPU offload, VAE slicing, VAE tiling and attention
    slicing, then tries to switch to xFormers attention. If xFormers cannot
    be enabled, a warning is emitted and the pipeline is returned with the
    other optimizations still active.

    Returns:
        The configured ``StableDiffusionXLPipeline``.
    """
    import warnings

    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16
    )

    # 1. CPU Offload
    pipe.enable_model_cpu_offload()

    # 2. Sequential CPU Offload is the slower alternative that saves more
    #    memory: pipe.enable_sequential_cpu_offload()

    # 3. VAE Slicing (for large images)
    pipe.enable_vae_slicing()

    # 4. VAE Tiling (for very large images)
    pipe.enable_vae_tiling()

    # 5. Attention Slicing
    pipe.enable_attention_slicing(slice_size="auto")

    # 6. xFormers is an optional dependency, so a missing install should not
    #    make the whole helper fail.
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except (ModuleNotFoundError, ValueError) as exc:
        warnings.warn(
            f"xFormers attention not enabled ({exc}); "
            "continuing with attention slicing only"
        )

    return pipe


def batch_generation():
    """Generate several images in a single batched pipeline call.

    Loads Stable Diffusion 1.5 in fp16 on CUDA and passes all four prompts
    to one 30-step call instead of looping over them.

    Returns:
        The pipeline output's ``images`` for the batch.
    """
    sd_pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    )
    sd_pipe = sd_pipe.to("cuda")

    batch = ["A red apple", "A blue car", "A green tree", "A yellow sun"]

    # One call for the whole list of prompts.
    outputs = sd_pipe(prompt=batch, num_inference_steps=30)
    return outputs.images

References

Papers

  • Podell et al. (2023). "SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis"
  • Zhang et al. (2023). "Adding Conditional Control to Text-to-Image Diffusion Models" (ControlNet)
  • Ye et al. (2023). "IP-Adapter: Text Compatible Image Prompt Adapter"
  • Luo et al. (2023). "Latent Consistency Models"

Models

to navigate between lessons