32. Diffusion Models
์ด์ : Variational Autoencoder (VAE) | ๋ค์: ํ์ฐ ๋ชจ๋ธ(DDPM)
32. Diffusion Models¶
ํ์ต ๋ชฉํ¶
- Diffusion Process ์ด๋ก ์ดํด (Forward/Reverse)
- DDPM (Denoising Diffusion Probabilistic Models) ์๋ฆฌ
- Score-based Generative Models ๊ฐ๋
- U-Net ์ํคํ ์ฒ for Diffusion
- Stable Diffusion ํต์ฌ ์๋ฆฌ
- Classifier-free Guidance
- ๊ฐ๋จํ DDPM PyTorch ๊ตฌํ
1. Diffusion Process ๊ฐ์¶
Core Idea¶
Gradually add noise to the data (Forward Process):
x_0 → x_1 → x_2 → ... → x_T (pure noise)
Gradually remove the noise to generate data (Reverse Process):
x_T → x_{T-1} → ... → x_0 (clean image)
Key point: the Reverse Process is learned by a neural network.
Visual Intuition¶
Forward (adding noise):
[clean image] ──t=0──▶ [slightly noisy] ──t=500──▶ [noisier] ──t=1000──▶ [pure noise]
Reverse (removing noise):
[pure noise] ──t=1000──▶ [slightly clearer] ──t=500──▶ [clearer] ──t=0──▶ [clean image]
2. DDPM (Denoising Diffusion Probabilistic Models)¶
Forward Process (q)¶
# Forward process: q(x_t | x_{t-1})
# x_t = sqrt(1 - beta_t) * x_{t-1} + sqrt(beta_t) * epsilon
# Closed form (jumping from x_0 to x_t in one step):
# x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * epsilon
# alpha_t = 1 - beta_t
# alpha_bar_t = alpha_1 * alpha_2 * ... * alpha_t
์ํ์ ์ ์¶
import torch
def linear_beta_schedule(timesteps, beta_start=1e-4, beta_end=0.02):
    """Linearly increasing variance schedule.

    Returns `timesteps` beta values evenly spaced between `beta_start`
    and `beta_end` (the DDPM paper's default range).
    """
    betas = torch.linspace(beta_start, beta_end, timesteps)
    return betas
def cosine_beta_schedule(timesteps, s=0.008):
    """Cosine variance schedule (Nichol & Dhariwal, 2021).

    Chooses betas so that the cumulative product of alphas follows a
    squared-cosine curve, which degrades the signal more gently than
    the linear schedule. `s` is a small offset that keeps beta_1 small.
    """
    grid = torch.linspace(0, timesteps, timesteps + 1)
    # Target alpha_bar curve, normalized so it starts at 1.
    alpha_bar = torch.cos(((grid / timesteps) + s) / (1 + s) * torch.pi * 0.5) ** 2
    alpha_bar = alpha_bar / alpha_bar[0]
    # beta_t = 1 - alpha_bar_t / alpha_bar_{t-1}, clipped for stability.
    betas = 1 - (alpha_bar[1:] / alpha_bar[:-1])
    return torch.clip(betas, 0.0001, 0.9999)
def get_index_from_list(vals, t, x_shape):
    """Pick the schedule value for each sample's timestep `t`.

    Returns a tensor of shape (batch, 1, 1, ..., 1) — one trailing
    singleton dim per non-batch dim of `x_shape` — so the result
    broadcasts cleanly against a batch of images on `t`'s device.
    """
    n = t.shape[0]
    gathered = vals.gather(-1, t.cpu())
    trailing = (1,) * (len(x_shape) - 1)
    return gathered.reshape(n, *trailing).to(t.device)
Forward Diffusion ๊ตฌํ¶
class DiffusionSchedule:
    """Precomputed beta/alpha tensors for a DDPM with `timesteps` steps.

    Exposes everything both the forward process (q_sample) and the
    reverse-process samplers need, all computed once up front.
    """

    def __init__(self, timesteps=1000, beta_schedule='linear'):
        self.timesteps = timesteps
        if beta_schedule == 'linear':
            betas = linear_beta_schedule(timesteps)
        else:
            betas = cosine_beta_schedule(timesteps)
        self.betas = betas
        self.alphas = 1.0 - betas
        # alpha_bar_t = prod_{s <= t} alpha_s
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        # alpha_bar_{t-1}, with alpha_bar for the step before t=0 defined as 1.
        self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1], (1, 0), value=1.0)
        # Cached square roots used by q_sample and the samplers.
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas = torch.sqrt(1.0 / self.alphas)
        # Variance of the true posterior q(x_{t-1} | x_t, x_0).
        self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)

    def q_sample(self, x_0, t, noise=None):
        """Diffuse x_0 straight to timestep t (closed-form forward process).

        x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
        """
        if noise is None:
            noise = torch.randn_like(x_0)
        coef_signal = get_index_from_list(self.sqrt_alphas_cumprod, t, x_0.shape)
        coef_noise = get_index_from_list(self.sqrt_one_minus_alphas_cumprod, t, x_0.shape)
        return coef_signal * x_0 + coef_noise * noise
3. Noise-Prediction Network¶
Goal¶
The model predicts the noise epsilon that was added to x_t:
epsilon_theta(x_t, t) ≈ epsilon
Loss function:
L = E[||epsilon - epsilon_theta(x_t, t)||^2]
๊ฐ๋จํ U-Net ๊ตฌ์กฐ¶
import math

import torch.nn as nn
import torch.nn.functional as F
class SinusoidalPositionEmbeddings(nn.Module):
    """Sinusoidal timestep embeddings (Transformer-style positional encoding).

    Maps a batch of scalar timesteps of shape (batch,) to embeddings of
    shape (batch, dim): the first dim//2 channels are sines and the rest
    cosines, at geometrically spaced frequencies from 1 down to 1/10000.

    Bug fix: the original used `math.log` but `math` was never imported
    anywhere in this file, so the first forward call raised NameError.
    """

    def __init__(self, dim):
        super().__init__()
        # Output embedding width; should be even (dim // 2 sin + dim // 2 cos).
        self.dim = dim

    def forward(self, time):
        device = time.device
        half_dim = self.dim // 2
        # Log-spaced frequency decay rate; assumes half_dim > 1.
        scale = math.log(10000) / (half_dim - 1)
        freqs = torch.exp(torch.arange(half_dim, device=device) * -scale)
        # Outer product: (batch, 1) * (1, half_dim) -> (batch, half_dim).
        args = time[:, None] * freqs[None, :]
        return torch.cat((args.sin(), args.cos()), dim=-1)
class Block(nn.Module):
    """Two-conv block with additive timestep conditioning.

    Downsamples the feature map by 2 with a strided conv, or — when
    `up=True` — upsamples by 2 with a transposed conv and expects the
    input to carry the concatenated skip connection (2 * in_ch channels).
    """

    def __init__(self, in_ch, out_ch, time_emb_dim, up=False):
        super().__init__()
        # Projects the time embedding onto this block's channel width.
        self.time_mlp = nn.Linear(time_emb_dim, out_ch)
        if up:
            # Skip concatenation doubles the incoming channels.
            self.conv1 = nn.Conv2d(2 * in_ch, out_ch, 3, padding=1)
            self.transform = nn.ConvTranspose2d(out_ch, out_ch, 4, 2, 1)
        else:
            self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1)
            self.transform = nn.Conv2d(out_ch, out_ch, 4, 2, 1)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
        self.bnorm1 = nn.BatchNorm2d(out_ch)
        self.bnorm2 = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU()

    def forward(self, x, t):
        # First conv (note: this file applies ReLU before BatchNorm).
        out = self.bnorm1(self.relu(self.conv1(x)))
        # Inject the timestep embedding as a per-channel bias.
        emb = self.relu(self.time_mlp(t))
        emb = emb[(...,) + (None,) * 2]
        out = out + emb
        # Second conv.
        out = self.bnorm2(self.relu(self.conv2(out)))
        # Resolution change (down- or upsample by 2).
        return self.transform(out)
class SimpleUNet(nn.Module):
    """Minimal U-Net noise predictor for DDPM: forward(x_t, t) -> epsilon.

    Bug fixes relative to the original:
    - The channel widths were inconsistent: the second up block
      Block(128, 64, up=True) expects 256 input channels but received
      cat(x=128, skip=256)=384, crashing at runtime. The paths now
      mirror each other (down 64→128→256→512, up 512→256→128→64) so
      cat(x, skip) always has exactly 2 * in_ch channels.
    - Odd spatial sizes (e.g. MNIST 28 → 14 → 7 → 3) made the upsampled
      tensor smaller than its skip connection; x is resized to the skip's
      spatial size before concatenation.
    """

    def __init__(self, in_channels=3, out_channels=3, time_dim=256):
        super().__init__()
        # Timestep embedding: sinusoidal features refined by a small MLP.
        self.time_mlp = nn.Sequential(
            SinusoidalPositionEmbeddings(time_dim),
            nn.Linear(time_dim, time_dim),
            nn.ReLU()
        )
        # Initial projection to the base width.
        self.conv0 = nn.Conv2d(in_channels, 64, 3, padding=1)
        # Downsampling path (each Block halves the spatial size).
        self.downs = nn.ModuleList([
            Block(64, 128, time_dim),
            Block(128, 256, time_dim),
            Block(256, 512, time_dim),
        ])
        # Upsampling path; widths mirror the down path.
        self.ups = nn.ModuleList([
            Block(512, 256, time_dim, up=True),
            Block(256, 128, time_dim, up=True),
            Block(128, 64, time_dim, up=True),
        ])
        # 1x1 projection back to image channels.
        self.output = nn.Conv2d(64, out_channels, 1)

    def forward(self, x, timestep):
        t = self.time_mlp(timestep)
        x = self.conv0(x)
        # Downsampling, keeping each block's output as a skip connection.
        residuals = []
        for down in self.downs:
            x = down(x, t)
            residuals.append(x)
        # Upsampling with skip connections (last skip first).
        for up in self.ups:
            residual = residuals.pop()
            # Resize if strided down/up-sampling of odd sizes diverged.
            if x.shape[-2:] != residual.shape[-2:]:
                x = F.interpolate(x, size=residual.shape[-2:])
            x = torch.cat((x, residual), dim=1)
            x = up(x, t)
        return self.output(x)
4. Training¶
Training Algorithm (DDPM)¶
1. x_0 ~ q(x_0): sample from the data
2. t ~ Uniform(1, T): random timestep
3. epsilon ~ N(0, I): random noise
4. x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * epsilon
5. Loss = ||epsilon - epsilon_theta(x_t, t)||^2
6. Backpropagate and update
Training Code¶
def train_diffusion(model, schedule, dataloader, epochs=100, lr=1e-4):
    """Standard DDPM training loop.

    For each batch: draw one uniformly random timestep per sample,
    diffuse the clean images with `schedule.q_sample`, and regress the
    model's output onto the injected noise with an MSE loss.
    Prints the average loss once per epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    for epoch in range(epochs):
        running = 0.0
        for images, _ in dataloader:
            images = images.to(device)
            n = images.size(0)
            # One random timestep per sample.
            t = torch.randint(0, schedule.timesteps, (n,), device=device).long()
            # Forward-diffuse the clean images.
            eps = torch.randn_like(images)
            noisy = schedule.q_sample(images, t, eps)
            # The network tries to recover the injected noise.
            loss = criterion(model(noisy, t), eps)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running += loss.item()
        avg_loss = running / len(dataloader)
        print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")
5. ์ํ๋ง (Reverse Process)¶
DDPM ์ํ๋ง¶
@torch.no_grad()
def sample_ddpm(model, schedule, shape, device):
    """Generate samples by running the learned reverse process.

    Starts from pure Gaussian noise x_T and iteratively applies the
    DDPM posterior-mean update, adding fresh noise at every step
    except the final one (t = 0).
    """
    model.eval()
    # Start from pure noise.
    img = torch.randn(shape, device=device)
    batch = shape[0]
    for step in reversed(range(schedule.timesteps)):
        t = torch.full((batch,), step, device=device, dtype=torch.long)
        beta = get_index_from_list(schedule.betas, t, img.shape)
        sqrt_one_minus_acp = get_index_from_list(
            schedule.sqrt_one_minus_alphas_cumprod, t, img.shape
        )
        recip_sqrt_alpha = get_index_from_list(
            schedule.sqrt_recip_alphas, t, img.shape
        )
        # Predicted noise at this step.
        eps = model(img, t)
        # Posterior mean: (x_t - beta_t / sqrt(1 - alpha_bar_t) * eps) / sqrt(alpha_t)
        mean = recip_sqrt_alpha * (img - beta * eps / sqrt_one_minus_acp)
        if step == 0:
            # Final step is deterministic: no noise added.
            img = mean
        else:
            var = get_index_from_list(schedule.posterior_variance, t, img.shape)
            img = mean + torch.sqrt(var) * torch.randn_like(img)
    return img
DDIM ์ํ๋ง (๋ ๋น ๋ฆ)¶
@torch.no_grad()
def sample_ddim(model, schedule, shape, device, num_inference_steps=50, eta=0.0):
    """DDIM sampling: fast generation using a strided subset of timesteps.

    eta=0 gives a fully deterministic sampler; eta=1 matches DDPM-level
    stochasticity.

    Bug fix: at the final step the original set alpha_cumprod_prev to the
    Python float 1.0, and `torch.sqrt(1.0)` raises TypeError (it requires
    a Tensor). The terminal value is now a tensor of ones.
    """
    model.eval()
    # Evenly strided timesteps, visited from high noise to low.
    step_size = schedule.timesteps // num_inference_steps
    timesteps = list(reversed(range(0, schedule.timesteps, step_size)))
    x = torch.randn(shape, device=device)
    for i, t in enumerate(timesteps):
        t_tensor = torch.full((shape[0],), t, device=device, dtype=torch.long)
        alpha_cumprod_t = schedule.alphas_cumprod[t]
        if i < len(timesteps) - 1:
            alpha_cumprod_prev = schedule.alphas_cumprod[timesteps[i + 1]]
        else:
            # alpha_bar before t=0 is 1 by definition; must be a tensor.
            alpha_cumprod_prev = torch.ones_like(alpha_cumprod_t)
        # Predicted noise.
        noise_pred = model(x, t_tensor)
        # Predicted clean sample x_0.
        pred_x0 = (x - torch.sqrt(1 - alpha_cumprod_t) * noise_pred) / torch.sqrt(alpha_cumprod_t)
        # Stochasticity level (sigma_t from the DDIM paper, Eq. 16).
        sigma = eta * torch.sqrt((1 - alpha_cumprod_prev) / (1 - alpha_cumprod_t)) * \
                torch.sqrt(1 - alpha_cumprod_t / alpha_cumprod_prev)
        # "Direction pointing to x_t" term.
        pred_dir = torch.sqrt(1 - alpha_cumprod_prev - sigma ** 2) * noise_pred
        # Fresh noise only when eta > 0.
        noise = torch.randn_like(x) if eta > 0 else 0
        x = torch.sqrt(alpha_cumprod_prev) * pred_x0 + pred_dir + sigma * noise
    return x
6. Score-based Models¶
Score Function¶
Score = gradient of the log probability:
s(x) = ∇_x log p(x)
Score of the noised data:
s_theta(x_t, t) ≈ ∇_{x_t} log p(x_t)
Relation to DDPM¶
# Relation between DDPM's noise prediction and the score:
# epsilon_theta(x_t, t) = -sqrt(1 - alpha_bar_t) * s_theta(x_t, t)
# A score prediction can therefore be converted into a noise prediction.
7. Stable Diffusion¶
Latent Diffusion¶
Run diffusion in a latent space instead of pixel space:
1. Encoder: image → latent representation z
2. Diffusion: add/remove noise on z
3. Decoder: latent representation → image
Advantages:
- Computational efficiency (diffusion at a lower resolution)
- Enables high-resolution image generation
์ํคํ ์ฒ¶
โโโโโโโโโโโโโโโโ
โ Text Prompt โ
โโโโโโโโฌโโโโโโโโ
โ CLIP Text Encoder
โผ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ Cross-Attention โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ z_T โโโถ U-Net โโโถ z_{T-1} โโโถ ... โโโถ z_0 โ
โ (time embedding) โ
โ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ VAE Decoder
โผ
โโโโโโโโโโโโโโโโ
โ Image โ
โโโโโโโโโโโโโโโโ
์กฐ๊ฑด๋ถ ์์ฑ (Cross-Attention)¶
class CrossAttention(nn.Module):
    """Multi-head cross-attention: image features attend to text embeddings.

    Bug fix: the original called einops' `rearrange`, which is never
    imported anywhere in this file (NameError at runtime). Head
    splitting/merging is now done with plain view/transpose/reshape.
    """

    def __init__(self, query_dim, context_dim, heads=8, dim_head=64):
        super().__init__()
        inner_dim = dim_head * heads
        # Standard 1/sqrt(d) attention scaling.
        self.scale = dim_head ** -0.5
        self.heads = heads
        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_out = nn.Linear(inner_dim, query_dim)

    def _split_heads(self, x):
        # (b, n, h*d) -> (b, h, n, d)
        b, n, _ = x.shape
        return x.view(b, n, self.heads, -1).transpose(1, 2)

    def forward(self, x, context):
        """x: image features (batch, hw, query_dim);
        context: text embeddings (batch, seq_len, context_dim).
        Returns attended features of shape (batch, hw, query_dim)."""
        q = self._split_heads(self.to_q(x))
        k = self._split_heads(self.to_k(context))
        v = self._split_heads(self.to_v(context))
        # Scaled dot-product attention, independently per head.
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        out = attn @ v  # (b, h, n, d)
        b, h, n, d = out.shape
        out = out.transpose(1, 2).reshape(b, n, h * d)
        return self.to_out(out)
8. Classifier-free Guidance¶
Idea¶
Mix conditional and unconditional predictions:
epsilon_guided = epsilon_uncond + w * (epsilon_cond - epsilon_uncond)
w > 1: condition applied more strongly (sharper but less diverse)
w = 1: ordinary conditional generation
w < 1: condition weakened
Implementation¶
def classifier_free_guidance_sample(model, schedule, shape, condition, w=7.5, device='cuda'):
    """Sampling loop with classifier-free guidance.

    Mixes conditional and unconditional noise predictions:
    eps = eps_uncond + w * (eps_cond - eps_uncond), where w is the
    guidance scale (larger w follows the condition more strongly).

    NOTE(review): `sampling_step` is not defined anywhere in this file —
    this function is illustrative pseudocode until a concrete DDPM/DDIM
    step helper is supplied.
    """
    model.eval()
    x = torch.randn(shape, device=device)
    for i in reversed(range(schedule.timesteps)):
        t = torch.full((shape[0],), i, device=device, dtype=torch.long)
        # Conditional prediction.
        noise_cond = model(x, t, condition)
        # Unconditional prediction (condition = None or a null embedding).
        noise_uncond = model(x, t, None)
        # Guidance: extrapolate from the unconditional toward the conditional.
        noise_pred = noise_uncond + w * (noise_cond - noise_uncond)
        # Sampling step (DDPM or DDIM).
        x = sampling_step(x, noise_pred, t, schedule)
    return x
ํ์ต ์ ์กฐ๊ฑด ๋๋กญ์์¶
def train_with_cfg(model, dataloader, drop_prob=0.1):
    """Training sketch for classifier-free guidance (condition dropout).

    With probability `drop_prob` per sample, the condition is replaced by
    a null condition so the same network learns both the conditional
    eps(x, t, c) and the unconditional eps(x, t) predictors.

    NOTE(review): `conditions[mask] = None` is illustrative only —
    assigning None into a tensor raises; real code substitutes a learned
    null embedding (or an empty-prompt encoding) for the masked rows.
    """
    for images, conditions in dataloader:
        # Drop the condition for a random subset of the batch.
        mask = torch.rand(images.size(0)) < drop_prob
        conditions[mask] = None  # or a null/empty embedding
        # ... standard noise-prediction training follows ...
9. ๊ฐ๋จํ DDPM ์ ์ฒด ์์ ¶
# End-to-end DDPM example on MNIST: train a SimpleUNet noise predictor,
# then sample 16 digits and save them as a grid image.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# --- Configuration ---
image_size = 28
channels = 1
timesteps = 1000
batch_size = 64
epochs = 50
lr = 1e-3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Data: MNIST rescaled to [-1, 1] to match the Gaussian noise range ---
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda t: (t * 2) - 1)  # [0, 1] -> [-1, 1]
])
train_data = datasets.MNIST('data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# --- Noise schedule ---
schedule = DiffusionSchedule(timesteps=timesteps, beta_schedule='linear')

# --- Model (simple version) ---
model = SimpleUNet(in_channels=channels, out_channels=channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# --- Training ---
for epoch in range(epochs):
    total_loss = 0
    for images, _ in train_loader:
        images = images.to(device)
        batch_size = images.size(0)
        # One uniformly random timestep per sample.
        t = torch.randint(0, timesteps, (batch_size,), device=device).long()
        # Add noise (forward process).
        noise = torch.randn_like(images)
        x_t = schedule.q_sample(images, t, noise)
        # Predict the injected noise.
        noise_pred = model(x_t, t)
        # MSE between true and predicted noise.
        loss = F.mse_loss(noise_pred, noise)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

# --- Sampling ---
model.eval()
with torch.no_grad():
    samples = sample_ddpm(model, schedule, (16, channels, image_size, image_size), device)
    samples = (samples + 1) / 2  # [-1, 1] -> [0, 1]

# --- Visualization: 4x4 grid of generated digits ---
fig, axes = plt.subplots(4, 4, figsize=(8, 8))
for i, ax in enumerate(axes.flat):
    ax.imshow(samples[i, 0].cpu(), cmap='gray')
    ax.axis('off')
plt.savefig('diffusion_samples.png')
print("์ํ ์ ์ฅ: diffusion_samples.png")
์ ๋ฆฌ¶
ํต์ฌ ๊ฐ๋ ¶
- Forward Process: ์ ์ง์ ๋ ธ์ด์ฆ ์ถ๊ฐ q(x_t|x_0)
- Reverse Process: ์ ์ง์ ๋ ธ์ด์ฆ ์ ๊ฑฐ p(x_{t-1}|x_t)
- DDPM: ๋ ธ์ด์ฆ ์์ธก์ผ๋ก ์ญ๊ณผ์ ํ์ต
- DDIM: ๊ฒฐ์ ๋ก ์ ์ํ๋ง์ผ๋ก ๋น ๋ฅธ ์์ฑ
- Latent Diffusion: ์ ์ฌ ๊ณต๊ฐ์์ ํจ์จ์ ์์ฑ
- CFG: ์กฐ๊ฑด ๊ฐ๋ ์กฐ์
ํต์ฌ ์์¶
Forward: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * epsilon
Loss: L = E[||epsilon - epsilon_theta(x_t, t)||^2]
Reverse: x_{t-1} = (1/sqrt(alpha_t)) * (x_t - (beta_t/sqrt(1-alpha_bar_t)) * epsilon_theta) + sigma_t * z
Diffusion vs GAN vs VAE¶
| ํน์ฑ | Diffusion | GAN | VAE |
|---|---|---|---|
| ํ์ต ์์ ์ฑ | ๋งค์ฐ ๋์ | ๋ฎ์ | ๋์ |
| ์ด๋ฏธ์ง ํ์ง | ์ต๊ณ | ์ข์ | ํ๋ฆผ |
| ์ํ๋ง ์๋ | ๋๋ฆผ | ๋น ๋ฆ | ๋น ๋ฆ |
| Mode Coverage | ์ข์ | Mode Collapse | ์ข์ |
| ๋ฐ๋ ์ถ์ | ๊ฐ๋ฅ | ๋ถ๊ฐ | ๊ฐ๋ฅ |
๋ค์ ๋จ๊ณ¶
17_Attention_Deep_Dive.md์์ Attention ๋ฉ์ปค๋์ฆ์ ์ฌ์ธต์ ์ผ๋ก ํ์ตํฉ๋๋ค.