14. Unified Vision Models
14. Unified Vision Models¶
개요¶
Unified Vision Models는 다양한 비전 태스크(분류, 검출, 세그멘테이션 등)를 단일 모델로 처리하는 패러다임입니다. 태스크별 모델 대신 범용 비전 모델을 목표로 합니다.
1. ν¨λ¬λ€μ μ ν¶
1.1 μ ν΅μ μ κ·Ό vs ν΅ν© μ κ·Ό¶
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β λΉμ λͺ¨λΈ ν¨λ¬λ€μ β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€
β β
β μ ν΅μ (Task-Specific): β
β ββββββββββββββββ ββββββββββββββββ ββββββββββββββββ β
β β ResNet β β Faster R-CNN β β DeepLab β β
β β (λΆλ₯) β β (κ²μΆ) β β (μΈκ·Έλ©ν
μ΄μ
)β β
β ββββββββββββββββ ββββββββββββββββ ββββββββββββββββ β
β β
β ν΅ν© (Task-Agnostic): β
β βββββββββββββββββββββββββββββββββββββββββββββββββ β
β β Unified Vision Model β β
β β "λΆλ₯ν΄μ€" β λΆλ₯ κ²°κ³Ό β β
β β "κ°μ²΄ μ°Ύμμ€" β λ°μ΄λ© λ°μ€ β β
β β "μΈκ·Έλ©ν
μ΄μ
ν΄μ€" β λ§μ€ν¬ β β
β βββββββββββββββββββββββββββββββββββββββββββββββββ β
β β
β μ₯μ : μ§μ 곡μ , μ μ§λ³΄μ μ©μ΄, Zero-shot μ μ΄ β
β β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
1.2 μ£Όμ λͺ¨λΈ λΉκ΅¶
| λͺ¨λΈ | κ°λ° | νΉμ§ | μ§μ νμ€ν¬ |
|---|---|---|---|
| Florence | Microsoft | λκ·λͺ¨ Image-Text | λΆλ₯, κ²μΆ, μΊ‘μ λ, VQA |
| PaLI | Google | 다국어 VLM | 캡션, VQA, OCR |
| Unified-IO | Allen AI | λͺ¨λ λͺ¨λ¬λ¦¬ν° | μ΄λ―Έμ§, μ€λμ€, ν μ€νΈ |
| OFA | Alibaba | Seq2Seq ν΅ν© | λ€μν λΉμ -μΈμ΄ |
| GPT-4V | OpenAI | μμ© λ©ν°λͺ¨λ¬ | λ²μ© λΉμ μ΄ν΄ |
2. Florence: Foundation Model for Vision¶
2.1 μν€ν μ²¶
Florence μν€ν
μ²:
μ΄λ―Έμ§ μΈμ½λ: CoSwin Transformer (Hierarchical)
ν
μ€νΈ μΈμ½λ: UniCL (Unified Contrastive Learning)
νμ΅:
1. Image-Text Contrastive (CLIP μ€νμΌ)
2. Image-Text Matching
3. Masked Language Modeling
νΉμ§:
- 9μ΅ Image-Text μμΌλ‘ νμ΅
- λ€μν granularity (μ΄λ―Έμ§ β μμ β ν½μ
)
- Dynamic Headλ‘ νμ€ν¬ μ μ
2.2 ꡬν μμ¶
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel
class FlorenceStyleModel(nn.Module):
    """
    Florence-style unified vision model (simplified).

    Core idea: one shared CLIP backbone (image + text encoders) feeds
    several task-specific heads (classification, detection,
    segmentation, captioning).
    """
    def __init__(
        self,
        clip_model_name: str = "openai/clip-vit-large-patch14",
        num_classes: int = 1000,
        num_detection_classes: int = 80
    ):
        super().__init__()
        # CLIP backbone (image + text encoders)
        self.clip = CLIPModel.from_pretrained(clip_model_name)
        self.processor = CLIPProcessor.from_pretrained(clip_model_name)
        hidden_size = self.clip.config.vision_config.hidden_size
        # Task-specific heads sharing the same backbone features
        self.classification_head = nn.Linear(hidden_size, num_classes)
        self.detection_head = DetectionHead(hidden_size, num_detection_classes)
        self.segmentation_head = SegmentationHead(hidden_size)
        self.caption_head = CaptionHead(hidden_size, self.clip.config.text_config)
    def forward(
        self,
        images: torch.Tensor,
        task: str = "classification",
        text_prompts: list = None
    ):
        """
        Dispatch the image batch to the head selected by `task`.

        Args:
            images: (B, 3, H, W) image batch.
            task: "classification", "detection", "segmentation", "caption".
            text_prompts: optional text prompts (zero-shot classification).

        Raises:
            ValueError: if `task` is not one of the supported tasks.
        """
        # Image features from the CLIP vision tower
        vision_outputs = self.clip.vision_model(images)
        image_features = vision_outputs.last_hidden_state  # (B, num_patches+1, hidden)
        pooled_features = vision_outputs.pooler_output  # (B, hidden)
        if task == "classification":
            if text_prompts:
                # Zero-shot classification (CLIP style)
                return self._zero_shot_classify(pooled_features, text_prompts)
            return self.classification_head(pooled_features)
        if task == "detection":
            return self.detection_head(image_features)
        if task == "segmentation":
            return self.segmentation_head(image_features)
        if task == "caption":
            return self.caption_head(pooled_features)
        # Previously an unknown task fell through and silently returned
        # None; fail loudly instead so callers catch typos immediately.
        raise ValueError(f"Unknown task: {task!r}")
    def _zero_shot_classify(
        self,
        image_features: torch.Tensor,
        text_prompts: list
    ) -> torch.Tensor:
        """Zero-shot classification: cosine similarity against prompts."""
        # Encode the text prompts with the CLIP text tower
        text_inputs = self.processor(
            text=text_prompts,
            return_tensors="pt",
            padding=True
        ).to(image_features.device)
        text_features = self.clip.get_text_features(**text_inputs)
        # L2-normalize both sides so the dot product is cosine similarity
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        # (B, num_prompts) similarity matrix
        similarity = image_features @ text_features.T
        return similarity
class DetectionHead(nn.Module):
    """DETR-style object detection head.

    Learned object queries cross-attend to image features through a
    transformer decoder; each query predicts a class distribution
    (including a "no object" slot) and a normalized (cx, cy, w, h) box.
    """

    def __init__(self, hidden_size: int, num_classes: int, num_queries: int = 100):
        super().__init__()
        self.num_queries = num_queries
        # One learned query per potential detection slot.
        self.query_embed = nn.Embedding(num_queries, hidden_size)
        # Decoder through which queries attend to the image features.
        layer = nn.TransformerDecoderLayer(hidden_size, 8, batch_first=True)
        self.decoder = nn.TransformerDecoder(layer, num_layers=6)
        # Per-query prediction heads.
        self.class_head = nn.Linear(hidden_size, num_classes + 1)  # +1: "no object"
        self.bbox_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 4),  # (cx, cy, w, h)
        )

    def forward(self, image_features: torch.Tensor):
        """Return {'class_logits': (B, Q, C+1), 'bbox_pred': (B, Q, 4)}."""
        batch = image_features.size(0)
        # Tile the learned queries across the batch.
        tiled_queries = self.query_embed.weight.unsqueeze(0).repeat(batch, 1, 1)
        decoded = self.decoder(tiled_queries, image_features)
        # Boxes squashed into [0, 1] via sigmoid.
        return {
            'class_logits': self.class_head(decoded),
            'bbox_pred': self.bbox_head(decoded).sigmoid(),
        }
class SegmentationHead(nn.Module):
    """Semantic segmentation head.

    Reshapes ViT patch tokens (CLS token dropped) back into a square
    spatial grid and upsamples 8x with transposed convolutions to
    per-pixel class logits.
    """
    def __init__(self, hidden_size: int, num_classes: int = 150):
        super().__init__()
        # FPN-style decoder: three stride-2 upsampling stages (8x total),
        # then a 1x1 conv to class logits.
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(hidden_size, 256, 4, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, num_classes, 1)
        )
    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        """
        Args:
            image_features: (B, N, C) ViT tokens; token 0 is assumed to be
                CLS and the remaining N-1 tokens a square patch grid.

        Returns:
            (B, num_classes, 8*H, 8*W) logits with H = W = sqrt(N-1).

        Raises:
            ValueError: if N-1 is not a perfect square.
        """
        B, N, C = image_features.shape
        H = W = int((N - 1) ** 0.5)  # -1 for CLS token
        if H * W != N - 1:
            # Previously a non-square token count fell through to an
            # opaque reshape error; fail with an explicit message.
            raise ValueError(
                f"Expected CLS token plus a square patch grid; got {N} tokens"
            )
        # reshape (not view): the sliced/transposed tensor may be
        # non-contiguous.
        features = image_features[:, 1:, :].transpose(1, 2).reshape(B, C, H, W)
        return self.decoder(features)
class CaptionHead(nn.Module):
    """Image captioning head (stub).

    A transformer decoder cross-attends to image features and a language
    modeling head projects decoder states onto the text vocabulary. The
    forward pass (teacher forcing at train time, autoregressive decoding
    at generation time) is intentionally left unimplemented in this
    simplified example.
    """

    def __init__(self, hidden_size: int, text_config):
        super().__init__()
        self.vocab_size = text_config.vocab_size
        # Decoder that cross-attends to the image features.
        layer = nn.TransformerDecoderLayer(hidden_size, 8, batch_first=True)
        self.decoder = nn.TransformerDecoder(layer, num_layers=6)
        # Projection from decoder states to vocabulary logits.
        self.lm_head = nn.Linear(hidden_size, self.vocab_size)

    def forward(
        self,
        image_features: torch.Tensor,
        target_ids: torch.Tensor = None
    ):
        # Autoregressive at generation time, teacher forcing during
        # training -- implementation omitted.
        pass
3. PaLI (Pathways Language and Image model)¶
3.1 μν€ν μ²¶
PaLI ꡬ쑰:
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β PaLI β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€
β β
β Image Encoder: ViT-e (4B params, 22B μ΄λ―Έμ§λ‘ νμ΅) β
β β β
β Visual Tokens: [IMG1] [IMG2] ... [IMGn] β
β β β
β Text Encoder-Decoder: mT5 (λ€κ΅μ΄) β
β β β
β Output: ν
μ€νΈ (λ€κ΅μ΄ μ§μ) β
β β
β μ
λ ₯ νμ: β
β "<image> μ΄ μ΄λ―Έμ§λ₯Ό μ€λͺ
ν΄μ£ΌμΈμ" β "κ³ μμ΄κ°..." β
β "<image> What is in the image?" β "A cat..." β
β β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
3.2 νμ€ν¬ ν΅ν©¶
class PaLITaskFormats:
    """Per-task input prompt templates for PaLI-style processing."""

    TASK_FORMATS = {
        # Classification
        "classification": "What is in this image?",
        "fine_grained": "What species of bird is this?",
        # Captioning
        "caption_en": "Generate a caption for this image.",
        # Mis-encoded, line-broken Korean literal repaired
        # ("Write a description of this image." in Korean).
        "caption_ko": "이 이미지에 대한 설명을 작성하세요.",
        # VQA
        "vqa": "Question: {question} Answer:",
        # OCR
        "ocr": "What text is in this image?",
        # Detection (expressed as text)
        "detection": "Detect all objects in this image.",
        # Output: "cat [100, 200, 300, 400]; dog [50, 60, 150, 200]"
        # Referring segmentation
        "referring": "Segment the {object}.",
    }

    @staticmethod
    def format_input(task: str, **kwargs) -> str:
        """Fill the prompt template for `task`; unknown tasks yield ''."""
        template = PaLITaskFormats.TASK_FORMATS.get(task, "")
        return template.format(**kwargs)
# μ¬μ© μμ
def process_with_pali(model, image, task, **kwargs):
    """Run one PaLI-style task: build the prompt, generate, parse.

    `kwargs` fill the placeholders of the task's prompt template
    (e.g. question=... for VQA).
    """
    # Task-specific prompt from the shared template table.
    prompt = PaLITaskFormats.format_input(task, **kwargs)
    # Visual tokens + text tokens -> seq2seq generation.
    model_inputs = model.prepare_inputs(image, prompt)
    generated = model.generate(**model_inputs)
    # Only detection output needs structured parsing; every other task
    # (captioning, VQA, ...) returns the generated text as-is.
    if task == "detection":
        return parse_detection_output(generated)
    return generated
4. Unified-IO¶
4.1 μ§μ ν ν΅ν©: λͺ¨λ λͺ¨λ¬λ¦¬ν°¶
Unified-IO: λ¨μΌ λͺ¨λΈλ‘ λͺ¨λ I/O μ²λ¦¬
μ
λ ₯/μΆλ ₯ νμ:
- μ΄λ―Έμ§ β VQ-VAE ν ν°
- ν
μ€νΈ β μλΈμλ ν ν°
- λ°μ΄λ© λ°μ€ β μ’ν ν ν° (μ΄μ°ν)
- λ§μ€ν¬ β VQ-VAE ν ν°
- μ€λμ€ β μ€ννΈλ‘κ·Έλ¨ VQ-VAE
λͺ¨λ κ²μ ν ν° μνμ€λ‘ λ³ν β Seq2Seq Transformer
4.2 ꡬν κ°λ ¶
class UnifiedIOTokenizer:
    """Unified-IO style tokenization into one discrete token space.

    Token id layout:
        [0, vocab_size)                      text subword ids
        [vocab_size, vocab_size + 8)         special structure tokens
        [vocab_size + 8, ...)                VQ-VAE image codes
    Normalized coordinates are discretized into `num_bins` bins.
    """
    def __init__(self, vocab_size: int = 50000, image_vocab_size: int = 16384):
        self.vocab_size = vocab_size
        self.image_vocab_size = image_vocab_size
        # Special structure tokens, placed right after the text vocab.
        self.SPECIAL_TOKENS = {
            '<image>': vocab_size,
            '</image>': vocab_size + 1,
            '<box>': vocab_size + 2,
            '</box>': vocab_size + 3,
            '<mask>': vocab_size + 4,
            '</mask>': vocab_size + 5,
            '<audio>': vocab_size + 6,
            '</audio>': vocab_size + 7,
        }
        # Number of discretization bins for normalized coordinates.
        self.num_bins = 1000
    def tokenize_image(self, image: torch.Tensor) -> torch.Tensor:
        """Tokenize an image into discrete VQ-VAE codes.

        NOTE(review): relies on `self.vqvae`, which is never assigned in
        __init__ -- a VQ-VAE encoder must be attached first, otherwise
        this raises AttributeError.
        """
        # VQ-VAE encoder yields a (H', W') grid of discrete codes.
        codes = self.vqvae.encode(image)
        # Flatten and offset past the text vocab and special tokens.
        tokens = codes.flatten() + self.vocab_size + len(self.SPECIAL_TOKENS)
        return tokens
    def tokenize_bbox(self, bbox: torch.Tensor) -> torch.Tensor:
        """
        Convert a bounding box into discrete tokens.

        Args:
            bbox: (x1, y1, x2, y2), normalized to [0, 1].

        Returns:
            1-D LongTensor: <box> b1 b2 b3 b4 </box>.
        """
        # Discretize each coordinate into a bin. Clamp so that an exact
        # 1.0 coordinate maps to the last bin instead of the out-of-range
        # index num_bins (previous off-by-one).
        bins = (bbox * self.num_bins).long().clamp_(0, self.num_bins - 1)
        # Special tokens around the four bin tokens.
        tokens = torch.tensor([
            self.SPECIAL_TOKENS['<box>'],
            bins[0], bins[1], bins[2], bins[3],
            self.SPECIAL_TOKENS['</box>']
        ])
        return tokens
    def decode_bbox(self, tokens: torch.Tensor) -> torch.Tensor:
        """Recover a normalized bounding box from a token sequence.

        Inverse of `tokenize_bbox` up to bin quantization error.

        Raises:
            ValueError: if the sequence contains no <box> marker.
        """
        token_list = tokens.tolist()
        # Locate the <box> marker; the four bin tokens follow it.
        try:
            start = token_list.index(self.SPECIAL_TOKENS['<box>'])
        except ValueError:
            raise ValueError("token sequence contains no <box> marker")
        bins = torch.tensor(token_list[start + 1:start + 5], dtype=torch.float32)
        # Undo the discretization.
        return bins / self.num_bins
class UnifiedIOModel(nn.Module):
    """Unified-IO style seq2seq model over a unified token space.

    NOTE(review): `TransformerEncoder`/`TransformerDecoder` are assumed
    to be project-defined classes taking a single `config` (they are not
    the torch.nn classes, whose constructors differ) -- confirm.
    """
    def __init__(self, config):
        super().__init__()
        # One embedding table per token family (text / image / coordinate).
        self.embeddings = nn.ModuleDict({
            'text': nn.Embedding(config.text_vocab_size, config.hidden_size),
            'image': nn.Embedding(config.image_vocab_size, config.hidden_size),
            'coord': nn.Embedding(config.num_bins, config.hidden_size),
        })
        # Encoder-decoder transformer backbone.
        self.encoder = TransformerEncoder(config)
        self.decoder = TransformerDecoder(config)
        # Single LM head over the combined vocabulary of all modalities.
        self.lm_head = nn.Linear(config.hidden_size, config.total_vocab_size)
    def forward(self, input_tokens, output_tokens=None):
        """
        Seq2seq forward pass.

        Args:
            input_tokens: mixed-modality input token sequence.
            output_tokens: target output tokens (teacher forcing); when
                omitted, only the encoder output is returned.

        Returns:
            LM logits when `output_tokens` is given, else encoder output.
        """
        # Per-token-type embedding lookup.
        # NOTE(review): `_get_embeddings` is a stub returning None, so
        # this forward cannot run until it is implemented.
        embeddings = self._get_embeddings(input_tokens)
        # Encode the mixed-modality sequence.
        encoder_output = self.encoder(embeddings)
        # Decode only when target tokens are provided (training path).
        if output_tokens is not None:
            decoder_input = self._get_embeddings(output_tokens)
            decoder_output = self.decoder(decoder_input, encoder_output)
            logits = self.lm_head(decoder_output)
            return logits
        return encoder_output
    def _get_embeddings(self, tokens):
        """Select the embedding table per token type (stub).

        Intended behavior: route each token id to the text/image/coord
        table based on its id range. Not implemented.
        """
        pass
# λ€μν νμ€ν¬ μμ
def unified_io_examples():
    """Illustrative input/output pairs for Unified-IO style tasks.

    Every task -- including image generation, which runs the pipeline in
    the reverse direction -- is expressed as a plain token-sequence
    transduction.
    """
    examples = {}
    # Image captioning
    examples["caption"] = {
        "input": "<image> {image_tokens} </image> Describe this image.",
        "output": "A cat sitting on a windowsill.",
    }
    # Object detection (boxes serialized as coordinate tokens)
    examples["detection"] = {
        "input": "<image> {image_tokens} </image> Detect all objects.",
        "output": "cat <box> 100 200 300 400 </box> dog <box> 50 60 150 200 </box>",
    }
    # Referring segmentation
    examples["segmentation"] = {
        "input": "<image> {image_tokens} </image> Segment the cat.",
        "output": "<mask> {mask_tokens} </mask>",
    }
    # Image generation (the reverse direction)
    examples["generation"] = {
        "input": "Generate an image of a sunset over mountains.",
        "output": "<image> {image_tokens} </image>",
    }
    # Visual question answering
    examples["vqa"] = {
        "input": "<image> {image_tokens} </image> How many cats are there?",
        "output": "2",
    }
    return examples
5. μ€μ νμ©¶
5.1 Florence-2 μ¬μ© (HuggingFace)¶
from transformers import AutoProcessor, AutoModelForCausalLM
def use_florence2():
    """Demonstrate Florence-2 on its built-in task prompts.

    Downloads the model and processor from the Hugging Face hub
    (requires network access and `trust_remote_code`) and runs every
    supported task prompt against one example image.
    """
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-large",
        trust_remote_code=True
    )
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-large",
        trust_remote_code=True
    )
    from PIL import Image
    import requests
    url = "https://example.com/image.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    # Task prompt -> human-readable description.
    # (Mis-encoded, line-broken Korean labels repaired.)
    tasks = {
        "<CAPTION>": "짧은 캡션",
        "<DETAILED_CAPTION>": "상세 캡션",
        "<MORE_DETAILED_CAPTION>": "매우 상세한 캡션",
        "<OD>": "객체 검출",
        "<DENSE_REGION_CAPTION>": "영역별 캡션",
        "<REGION_PROPOSAL>": "영역 제안",
        "<CAPTION_TO_PHRASE_GROUNDING>": "텍스트→영역 그라운딩",
        "<REFERRING_EXPRESSION_SEGMENTATION>": "참조 표현 세그멘테이션",
        "<OCR>": "OCR",
        "<OCR_WITH_REGION>": "영역별 OCR",
    }
    for task_prompt, description in tasks.items():
        inputs = processor(text=task_prompt, images=image, return_tensors="pt")
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3
        )
        # Keep special tokens: the task-specific parser expects them.
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed = processor.post_process_generation(generated_text, task=task_prompt, image_size=image.size)
        print(f"\n{description} ({task_prompt}):")
        print(parsed)
# Run the demo only when executed as a script: importing this module
# must not trigger model downloads or network access.
if __name__ == "__main__":
    use_florence2()
5.2 컀μ€ν νμ€ν¬ νμ΅¶
from transformers import Trainer, TrainingArguments
from datasets import Dataset
def finetune_unified_vision():
    """Fine-tune a unified vision model on a mixed multi-task dataset."""

    def build_multitask_dataset():
        """Fold classification, captioning, and VQA samples into one dataset."""
        records = []
        # Classification samples.
        for img_path, label in classification_data:
            records.append({
                'image': img_path,
                'task': '<CLASSIFICATION>',
                'input_text': '<CLASSIFICATION>',
                'output_text': label
            })
        # Captioning samples.
        for img_path, caption in caption_data:
            records.append({
                'image': img_path,
                'task': '<CAPTION>',
                'input_text': '<CAPTION>',
                'output_text': caption
            })
        # VQA samples.
        for img_path, question, answer in vqa_data:
            records.append({
                'image': img_path,
                'task': '<VQA>',
                'input_text': f'<VQA> {question}',
                'output_text': answer
            })
        return Dataset.from_list(records)

    dataset = build_multitask_dataset()
    # Training configuration.
    training_args = TrainingArguments(
        output_dir="./unified-vision-finetuned",
        per_device_train_batch_size=8,
        num_train_epochs=3,
        learning_rate=1e-5,
        # Task-sampling strategy: drop the ragged final batch.
        dataloader_drop_last=True,
    )
    # NOTE(review): `model`, `classification_data`, `caption_data`, and
    # `vqa_data` are not defined in this snippet -- supplied elsewhere.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )
    trainer.train()
6. λ―Έλ λ°©ν₯¶
6.1 World Models¶
λ€μ λ¨κ³: World Models
λΉμ λͺ¨λΈ + 물리 μ΄ν΄ + νλ μμΈ‘
μμ:
- μ΄λ―Έμ§μμ 물리 λ²μΉ μ΄ν΄
- "곡μ λμ§λ©΄ μ΄λλ‘ κ°κΉ?"
- λΉλμ€μ λ€μ νλ μ μμΈ‘
- λ‘λ΄ μ‘°μ κ³ν
6.2 ν΅ν©μ νκ³μ νΈλ μ΄λμ€ν¶
μ₯μ :
β νμ€ν¬ κ° μ§μ 곡μ
β λ¨μΌ λͺ¨λΈ μ μ§λ³΄μ
β Zero-shot μ μ΄
β μλ‘μ΄ νμ€ν¬ μ μ μ©μ΄
λ¨μ :
β κ°λ³ νμ€ν¬ μ΅κ³ μ±λ₯ λ―Έλ¬
β νμ΅ λ³΅μ‘μ±
β νμ€ν¬ κ° κ°μ
β ν° λͺ¨λΈ ν¬κΈ°
νΈλ μ΄λμ€ν:
- λ²μ©μ± vs μ λ¬Έμ±
- νΈμμ± vs μ΅μ μ±λ₯
μ°Έκ³ μλ£¶
λ Όλ¬Έ¶
- Yuan et al. (2021). "Florence: A New Foundation Model for Computer Vision"
- Chen et al. (2022). "PaLI: A Jointly-Scaled Multilingual Language-Image Model"
- Lu et al. (2022). "Unified-IO: A Unified Model for Vision, Language, and Multi-Modal Tasks"