15. Advanced Image Generation

Overview

This lesson covers recent image generation techniques that build on Stable Diffusion. It walks through practical methods such as SDXL, ControlNet, IP-Adapter, and Latent Consistency Models.
1. SDXL (Stable Diffusion XL)

1.1 Architecture Improvements
SDXL vs SD 1.5 at a glance:

SD 1.5:
- UNet: 860M params
- Text encoder: CLIP ViT-L/14 (77 tokens)
- Native resolution: 512×512
- VAE: 8× spatial downscale (512×512 image → 64×64 latents)

SDXL:
- UNet: 2.6B params (~3× larger)
- Text encoders: CLIP ViT-L + OpenCLIP ViT-bigG (dual)
- Native resolution: 1024×1024
- VAE: improved fine-tuned VAE
- Optional Refiner model

Key improvements:
- Richer text understanding (dual encoders)
- Higher-resolution generation (4× the pixels)
- Micro-conditioning (original size, crop coordinates, aspect ratio)
1.2 Using SDXL
from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
import torch

def sdxl_generation():
    """Generate an image with SDXL."""
    # Load the base model
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True
    )

    # Memory optimizations. Note: enable_model_cpu_offload() manages device
    # placement itself, so do not also call .to("cuda").
    pipe.enable_model_cpu_offload()
    pipe.enable_vae_slicing()

    # Generate
    prompt = "A majestic lion in a savanna at sunset, photorealistic, 8k"
    negative_prompt = "blurry, low quality, distorted"
    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=30,
        guidance_scale=7.5,
        height=1024,
        width=1024,
    ).images[0]
    return image
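Because SDXL carries two text encoders, the diffusers pipeline also accepts a separate prompt for each via the prompt_2 and negative_prompt_2 arguments (when omitted, prompt is reused for both encoders). A minimal sketch, reusing the pipe from above:

# Sketch: separate prompts for SDXL's two text encoders. `prompt` feeds the
# CLIP ViT-L encoder and `prompt_2` the OpenCLIP ViT-bigG encoder; a common
# pattern is the subject in one and style keywords in the other.
image = pipe(
    prompt="A majestic lion in a savanna at sunset",
    prompt_2="photorealistic, 8k, golden hour lighting",
    num_inference_steps=30,
).images[0]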
def sdxl_with_refiner():
    """SDXL base + refiner pipeline."""
    # Base
    base = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16
    ).to("cuda")

    # Refiner
    refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-refiner-1.0",
        torch_dtype=torch.float16
    ).to("cuda")

    prompt = "A cyberpunk city at night, neon lights, rain"

    # Stage 1: base handles the first 80% of denoising and hands off latents
    high_noise_frac = 0.8
    base_output = base(
        prompt=prompt,
        num_inference_steps=40,
        denoising_end=high_noise_frac,
        output_type="latent"
    ).images

    # Stage 2: refiner finishes the last 20% of denoising
    refined_image = refiner(
        prompt=prompt,
        image=base_output,
        num_inference_steps=40,
        denoising_start=high_noise_frac
    ).images[0]
    return refined_image
1.3 Micro-Conditioning
def sdxl_micro_conditioning():
    """Use SDXL micro-conditioning."""
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16
    ).to("cuda")

    prompt = "A portrait of a woman"

    # Generate at various aspect ratios
    aspect_ratios = [
        (1024, 1024),  # 1:1
        (1152, 896),   # 4:3
        (896, 1152),   # 3:4
        (1216, 832),   # ~3:2
        (832, 1216),   # ~2:3
    ]

    images = []
    for width, height in aspect_ratios:
        # Micro-conditioning: hints about the source resolution and crop
        image = pipe(
            prompt=prompt,
            height=height,
            width=width,
            original_size=(height, width),    # original image size seen at training
            target_size=(height, width),      # desired output size
            crops_coords_top_left=(0, 0),     # crop coordinates
        ).images[0]
        images.append(image)
    return images
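Per the SDXL paper, original_size conditions the model on the (possibly upscaled) source resolution of its training images, so setting it well below target_size deliberately reproduces a low-resolution look. A quick sketch reusing the pipe above:

# Sketch: original_size much smaller than target_size mimics training images
# that were upscaled from low resolution, giving a softer, "low-res" result;
# matching the two sizes yields the sharpest output.
lowres_look = pipe(
    prompt="A portrait of a woman",
    height=1024,
    width=1024,
    original_size=(256, 256),
    target_size=(1024, 1024),
).images[0]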
2. ControlNet

2.1 Concept
ControlNet adds conditional control: extra control signals are injected without modifying the original diffusion model's weights.

Supported conditions:
- Canny edge (outlines)
- Depth map
- Pose
- Segmentation
- Normal map
- Scribble
- Line art

How it works (see the zero-convolution sketch below):
1. Condition image → condition encoder (a trainable copy of the UNet encoder)
2. Encoded condition → injected into the UNet via zero convolutions
3. The original model's weights stay frozen; only the ControlNet is trained
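The zero convolution in step 2 is simply a 1×1 convolution whose weights and bias start at zero, so at the beginning of training the control branch contributes nothing and the frozen base model is untouched. A minimal PyTorch sketch of the idea (illustrative shapes, not the real UNet's):

import torch
import torch.nn as nn

class ZeroConv2d(nn.Conv2d):
    """1x1 convolution initialized to zero, as at ControlNet injection points."""
    def __init__(self, channels: int):
        super().__init__(channels, channels, kernel_size=1)
        nn.init.zeros_(self.weight)
        nn.init.zeros_(self.bias)

# At initialization the injected control signal is exactly zero, so the
# frozen UNet behaves as if ControlNet were absent; training then grows
# the control contribution smoothly from zero.
control_features = torch.randn(1, 320, 64, 64)
unet_features = torch.randn(1, 320, 64, 64)
injected = unet_features + ZeroConv2d(320)(control_features)
assert torch.equal(injected, unet_features)  # no-op at init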
2.2 Implementation and Usage
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler
)
from controlnet_aux import CannyDetector, OpenposeDetector
from PIL import Image
import cv2
import numpy as np
import torch

class ControlNetGenerator:
    """ControlNet-based image generation."""

    def __init__(self, base_model: str = "runwayml/stable-diffusion-v1-5"):
        self.base_model = base_model
        self.controlnets = {}
        self.detectors = {
            'canny': CannyDetector(),
            # OpenposeDetector needs pretrained annotator weights
            'openpose': OpenposeDetector.from_pretrained("lllyasviel/Annotators"),
        }

    def load_controlnet(self, control_type: str):
        """Load (and cache) a ControlNet."""
        controlnet_models = {
            'canny': "lllyasviel/sd-controlnet-canny",
            'depth': "lllyasviel/sd-controlnet-depth",
            'openpose': "lllyasviel/sd-controlnet-openpose",
            'scribble': "lllyasviel/sd-controlnet-scribble",
            'seg': "lllyasviel/sd-controlnet-seg",
        }
        if control_type not in self.controlnets:
            self.controlnets[control_type] = ControlNetModel.from_pretrained(
                controlnet_models[control_type],
                torch_dtype=torch.float16
            )
        return self.controlnets[control_type]

    def generate_with_canny(
        self,
        image: np.ndarray,
        prompt: str,
        low_threshold: int = 100,
        high_threshold: int = 200
    ):
        """Control with Canny edges."""
        # Extract Canny edges and convert to a 3-channel PIL image
        canny = cv2.Canny(image, low_threshold, high_threshold)
        canny_image = Image.fromarray(np.stack([canny] * 3, axis=-1))

        # Load the ControlNet
        controlnet = self.load_controlnet('canny')

        # Build the pipeline
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            self.base_model,
            controlnet=controlnet,
            torch_dtype=torch.float16
        ).to("cuda")
        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

        # Generate
        output = pipe(
            prompt=prompt,
            image=canny_image,
            num_inference_steps=20,
            guidance_scale=7.5,
            controlnet_conditioning_scale=1.0,  # control strength
        ).images[0]
        return output, canny_image

    def generate_with_pose(self, image: np.ndarray, prompt: str):
        """Control with a human pose."""
        # Extract the pose with OpenPose
        pose_image = self.detectors['openpose'](image)

        controlnet = self.load_controlnet('openpose')
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            self.base_model,
            controlnet=controlnet,
            torch_dtype=torch.float16
        ).to("cuda")

        output = pipe(
            prompt=prompt,
            image=pose_image,
            num_inference_steps=20,
        ).images[0]
        return output, pose_image

    def multi_controlnet(
        self,
        image: np.ndarray,
        prompt: str,
        control_types: list = ['canny', 'depth']
    ):
        """Combine multiple ControlNets."""
        # Load all requested ControlNets
        controlnets = [self.load_controlnet(ct) for ct in control_types]

        # Extract each condition image
        control_images = []
        for ct in control_types:
            if ct == 'canny':
                canny = cv2.Canny(image, 100, 200)
                control_images.append(Image.fromarray(np.stack([canny] * 3, axis=-1)))
            elif ct == 'depth':
                # Depth extraction (e.g. with MiDaS); extract_depth is assumed
                # and left unimplemented here
                depth = self.extract_depth(image)
                control_images.append(depth)

        # Multi-ControlNet pipeline (a list of ControlNets is accepted)
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            self.base_model,
            controlnet=controlnets,
            torch_dtype=torch.float16
        ).to("cuda")

        output = pipe(
            prompt=prompt,
            image=control_images,
            controlnet_conditioning_scale=[1.0, 0.5],  # per-ControlNet strength
        ).images[0]
        return output
# Usage example
generator = ControlNetGenerator()

# Keep the composition of a reference image while changing the style
reference_image = cv2.imread("reference.jpg")
result, canny = generator.generate_with_canny(
    reference_image,
    "A beautiful anime girl, studio ghibli style"
)
3. IP-Adapter (Image Prompt Adapter)

3.1 Concept
IP-Adapter uses an image as a prompt: style and content can be specified with an image, instead of or alongside text.

IP-Adapter structure:

Reference image → CLIP image encoder → image features
→ projection layer (trained)
→ injected into cross-attention
→ text prompt + image features → UNet → generated image

Uses:
- Style control (style reference)
- Preserving facial identity (face reference)
- Composition/color reference
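Concretely, the injection above is a "decoupled cross-attention": the adapter adds new key/value projections for the image tokens next to the frozen text ones, and the two attention outputs are summed, weighted by the adapter scale. A simplified single-head sketch (illustrative dimensions, not the real implementation):

import torch
import torch.nn as nn
import torch.nn.functional as F

class DecoupledCrossAttention(nn.Module):
    """Single-head sketch of IP-Adapter's decoupled cross-attention."""
    def __init__(self, dim: int):
        super().__init__()
        self.to_q = nn.Linear(dim, dim)
        # K/V for text tokens: belong to the frozen base model
        self.to_k_text = nn.Linear(dim, dim)
        self.to_v_text = nn.Linear(dim, dim)
        # K/V for image tokens: the new, trainable adapter weights
        self.to_k_img = nn.Linear(dim, dim)
        self.to_v_img = nn.Linear(dim, dim)

    def forward(self, x, text_tokens, image_tokens, scale: float = 0.6):
        q = self.to_q(x)
        text_out = F.scaled_dot_product_attention(
            q, self.to_k_text(text_tokens), self.to_v_text(text_tokens))
        image_out = F.scaled_dot_product_attention(
            q, self.to_k_img(image_tokens), self.to_v_img(image_tokens))
        # The IP-Adapter scale weights how much the image prompt contributes
        return text_out + scale * image_out

# Toy shapes: batch 1, 64 latent tokens, 77 text tokens, 4 image tokens
attn = DecoupledCrossAttention(dim=768)
out = attn(torch.randn(1, 64, 768), torch.randn(1, 77, 768), torch.randn(1, 4, 768))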
3.2 Usage
from diffusers import StableDiffusionPipeline
from PIL import Image
import torch

def use_ip_adapter():
    """Basic IP-Adapter usage."""
    # Base pipeline
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    # Load the IP-Adapter
    pipe.load_ip_adapter(
        "h94/IP-Adapter",
        subfolder="models",
        weight_name="ip-adapter_sd15.bin"
    )

    # Scale in [0, 1]; higher means stronger influence of the reference image
    pipe.set_ip_adapter_scale(0.6)

    # Reference image
    style_image = Image.open("style_reference.jpg")

    # Generate
    output = pipe(
        prompt="A portrait of a woman",
        ip_adapter_image=style_image,
        num_inference_steps=30,
    ).images[0]
    return output
def ip_adapter_face():
    """IP-Adapter Face: preserve facial identity."""
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    # Face-specific IP-Adapter weights
    pipe.load_ip_adapter(
        "h94/IP-Adapter",
        subfolder="models",
        weight_name="ip-adapter-full-face_sd15.bin"
    )
    pipe.set_ip_adapter_scale(0.7)

    # Reference face
    face_image = Image.open("face_reference.jpg")

    # Generate the same face in different styles
    prompts = [
        "A person in a business suit, professional photo",
        "A person as a superhero, comic book style",
        "A person in ancient Rome, oil painting"
    ]

    results = []
    for prompt in prompts:
        output = pipe(
            prompt=prompt,
            ip_adapter_image=face_image,
            num_inference_steps=30,
        ).images[0]
        results.append(output)
    return results
def ip_adapter_plus():
    """IP-Adapter Plus: stronger image conditioning."""
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    # Plus variant (finer-grained control)
    pipe.load_ip_adapter(
        "h94/IP-Adapter",
        subfolder="models",
        weight_name="ip-adapter-plus_sd15.bin"
    )

    # Multiple reference images for one adapter (recent diffusers versions
    # accept a list here)
    style_images = [
        Image.open("style1.jpg"),
        Image.open("style2.jpg")
    ]

    output = pipe(
        prompt="A landscape",
        ip_adapter_image=style_images,
        num_inference_steps=30,
    ).images[0]
    return output
4. Latent Consistency Models (LCM)

4.1 Concept
LCM enables ultra-fast image generation.

Conventional diffusion needs 20-50 steps; an LCM produces high-quality images in 2-4 steps.

How it works (see the sampling sketch below):
1. Distill the original diffusion model with a consistency objective
2. The distilled model maps any noise level directly to a clean image
3. Generation takes a single step or only a few steps

Advantages:
- Real-time generation (< 1 s)
- Interactive applications
- Feasible on low-power devices
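To make step 2 concrete, here is a minimal sketch of multi-step consistency sampling under simplified assumptions; consistency_fn is a hypothetical stand-in for the trained LCM, which maps a noisy latent at noise level t straight to a clean-latent estimate:

import torch

def lcm_sampling_sketch(consistency_fn, noise_levels, shape, device="cuda"):
    """Hypothetical few-step LCM sampler (simplified re-noising schedule)."""
    # Start from pure noise at the highest noise level
    x = torch.randn(shape, device=device)
    for i, t in enumerate(noise_levels):
        # One network call jumps directly to a clean-latent estimate
        x0_est = consistency_fn(x, t)
        if i < len(noise_levels) - 1:
            # Re-noise the estimate to the next, lower noise level and repeat;
            # each round trip refines the estimate
            x = x0_est + noise_levels[i + 1] * torch.randn_like(x0_est)
        else:
            x = x0_est
    return x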
4.2 Usage
from diffusers import (
    DiffusionPipeline,
    LCMScheduler,
    AutoPipelineForText2Image
)
import torch

def lcm_generation():
    """Fast generation with LCM-LoRA."""
    # LCM-LoRA applies on top of an existing model
    pipe = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        variant="fp16"
    ).to("cuda")

    # Load the LCM-LoRA weights
    pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")

    # Swap in the LCM scheduler
    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

    # Fast generation: just 4 steps
    image = pipe(
        prompt="A beautiful sunset over mountains",
        num_inference_steps=4,   # very few steps
        guidance_scale=1.5,      # LCM works best with low guidance
    ).images[0]
    return image
def lcm_real_time():
    """Near-real-time generation demo."""
    import time

    pipe = DiffusionPipeline.from_pretrained(
        "SimianLuo/LCM_Dreamshaper_v7",
        torch_dtype=torch.float16
    ).to("cuda")
    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

    prompts = [
        "A red apple",
        "A blue car",
        "A green forest",
        "A yellow sun"
    ]

    for prompt in prompts:
        start = time.time()
        image = pipe(
            prompt=prompt,
            num_inference_steps=4,
            guidance_scale=1.0,
            height=512,
            width=512
        ).images[0]
        elapsed = time.time() - start
        print(f"'{prompt}': {elapsed:.2f}s")
def turbo_generation():
    """SDXL-Turbo: generation in 1-4 steps."""
    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo",
        torch_dtype=torch.float16,
        variant="fp16"
    ).to("cuda")

    # A single step is enough
    image = pipe(
        prompt="A cinematic shot of a cat wearing a hat",
        num_inference_steps=1,
        guidance_scale=0.0,  # Turbo is trained without guidance, so none is needed
    ).images[0]
    return image
5. Advanced Techniques

5.1 Inpainting & Outpainting
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import torch

def inpainting_example():
    """Edit a masked region (inpainting)."""
    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16
    ).to("cuda")

    # Source image and mask
    image = Image.open("original.jpg")
    mask = Image.open("mask.png")  # white = region to repaint

    result = pipe(
        prompt="A cat sitting on the couch",
        image=image,
        mask_image=mask,
        num_inference_steps=30,
    ).images[0]
    return result

def outpainting_example():
    """Extend an image beyond its borders (outpainting)."""
    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16
    ).to("cuda")

    # Place the original on a larger canvas
    original = Image.open("original.jpg")
    canvas_size = (1024, 1024)
    canvas = Image.new("RGB", canvas_size, (128, 128, 128))

    # Center it
    offset = ((canvas_size[0] - original.width) // 2,
              (canvas_size[1] - original.height) // 2)
    canvas.paste(original, offset)

    # Mask: white everywhere except the original region (black = keep)
    mask = Image.new("L", canvas_size, 255)
    mask.paste(0, (offset[0], offset[1],
                   offset[0] + original.width, offset[1] + original.height))

    # Outpaint the surroundings
    result = pipe(
        prompt="A beautiful landscape extending the scene",
        image=canvas,
        mask_image=mask,
    ).images[0]
    return result
5.2 Image-to-Image Translation
from diffusers import StableDiffusionImg2ImgPipeline
from PIL import Image

def style_transfer():
    """Style transfer via img2img."""
    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    # Input image
    init_image = Image.open("photo.jpg").resize((512, 512))

    # Apply the style
    result = pipe(
        prompt="oil painting, impressionist style, vibrant colors",
        image=init_image,
        strength=0.75,  # 0-1; higher departs further from the input
        num_inference_steps=30,
    ).images[0]
    return result
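strength controls how much of the diffusion trajectory is actually run: the pipeline executes roughly strength * num_inference_steps denoising steps, so low values stay faithful to the input and high values repaint it. A quick sweep, reusing pipe and init_image from style_transfer above:

# Sketch: sweep strength to see the fidelity/creativity trade-off
for strength in (0.3, 0.5, 0.75, 0.9):
    out = pipe(
        prompt="oil painting, impressionist style, vibrant colors",
        image=init_image,
        strength=strength,
        num_inference_steps=30,
    ).images[0]
    out.save(f"styled_{strength}.png")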
5.3 Text Embedding Manipulation
from compel import Compel

def prompt_weighting():
    """Adjust prompt token weights with compel."""
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")
    compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)

    # Weighting syntax: each + multiplies the weight by 1.1, each - by 0.9
    prompts = [
        "a (beautiful)++ sunset",           # ++ = 1.1^2 = 1.21x
        "a (beautiful)+++ sunset",          # +++ = 1.1^3 = 1.33x
        "a (ugly)-- sunset",                # -- = 0.9^2 = 0.81x
        "a (red)1.5 and (blue)0.5 sunset",  # explicit weights (compel syntax)
    ]

    for prompt in prompts:
        conditioning = compel.build_conditioning_tensor(prompt)
        image = pipe(
            prompt_embeds=conditioning,
            num_inference_steps=30,
        ).images[0]
def prompt_blending():
    """Blend two prompts in embedding space."""
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")
    compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)

    # Build conditioning tensors for both prompts
    prompt1 = "a photo of a cat"
    prompt2 = "a photo of a dog"
    cond1 = compel.build_conditioning_tensor(prompt1)
    cond2 = compel.build_conditioning_tensor(prompt2)

    # 50:50 blend
    blended = (cond1 + cond2) / 2

    image = pipe(
        prompt_embeds=blended,
        num_inference_steps=30,
    ).images[0]
    return image
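compel also offers a built-in blend syntax that performs this interpolation inside a single prompt string; a minimal sketch, assuming the pipe and compel objects from above:

# Sketch: compel's blend syntax interpolates the prompts in embedding space
blended = compel('("a photo of a cat", "a photo of a dog").blend(0.5, 0.5)')
image = pipe(prompt_embeds=blended, num_inference_steps=30).images[0]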
6. Optimization Techniques

6.1 Memory Optimization
def optimize_memory():
    """Memory optimization techniques."""
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16
    )

    # 1. Model CPU offload (moves whole sub-models to the GPU only when needed)
    pipe.enable_model_cpu_offload()

    # 2. Sequential CPU offload (slower, but saves the most memory)
    # pipe.enable_sequential_cpu_offload()

    # 3. VAE slicing (decodes large batches one image at a time)
    pipe.enable_vae_slicing()

    # 4. VAE tiling (for very large images)
    pipe.enable_vae_tiling()

    # 5. Attention slicing
    pipe.enable_attention_slicing(slice_size="auto")

    # 6. xFormers memory-efficient attention (requires the xformers package)
    pipe.enable_xformers_memory_efficient_attention()

    return pipe
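To verify what each toggle actually buys you, PyTorch's CUDA memory statistics can report peak VRAM per run; a small sketch assuming a pipe returned by optimize_memory():

import torch

def measure_peak_vram(pipe, prompt="A red apple"):
    """Report peak GPU memory for one generation."""
    torch.cuda.reset_peak_memory_stats()
    pipe(prompt=prompt, num_inference_steps=10).images[0]
    peak_gb = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Peak VRAM: {peak_gb:.2f} GB")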
def batch_generation():
    """Batched generation."""
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    prompts = [
        "A red apple",
        "A blue car",
        "A green tree",
        "A yellow sun",
    ]

    # Passing a list of prompts batches them through the UNet,
    # which is more efficient than generating one by one
    images = pipe(
        prompt=prompts,
        num_inference_steps=30,
    ).images
    return images
References

Papers
- Podell et al. (2023). "SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis"
- Zhang et al. (2023). "Adding Conditional Control to Text-to-Image Diffusion Models" (ControlNet)
- Ye et al. (2023). "IP-Adapter: Text Compatible Image Prompt Adapter"
- Luo et al. (2023). "Latent Consistency Models"