17. GPT-4V, GPT-4o, Gemini & Claude 3
17. GPT-4V, GPT-4o, Gemini & Claude 3¶
๊ฐ์¶
GPT-4V(ision), GPT-4o, Gemini, Claude 3๋ ํ์ฌ ๊ฐ์ฅ ๊ฐ๋ ฅํ ์์ฉ ๋ฉํฐ๋ชจ๋ฌ AI์ ๋๋ค. ์ด ๋ ์จ์์๋ ์ด๋ค์ ๊ธฐ๋ฅ, API ์ฌ์ฉ๋ฒ, ๊ทธ๋ฆฌ๊ณ ์ค์ ์์ฉ ์ฌ๋ก๋ฅผ ๋ค๋ฃน๋๋ค.
2024๋ ์ ๋ฐ์ดํธ: - GPT-4o (2024.05): GPT-4์ "omni" ๋ฒ์ , ๋ค์ดํฐ๋ธ ๋ฉํฐ๋ชจ๋ฌ - Gemini 1.5 Pro: 2M ํ ํฐ ์ปจํ ์คํธ, ๋น๋์ค/์ค๋์ค ๋ค์ดํฐ๋ธ - Claude 3 Family (2024.03): Haiku, Sonnet, Opus ๋ผ์ธ์ - Claude 3.5 Sonnet (2024.06): ๋น์ ๊ธฐ๋ฅ ๊ฐํ
1. GPT-4V (GPT-4 with Vision)¶
1.1 ๊ธฐ๋ฅ ๊ฐ์¶
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ GPT-4V ์ฃผ์ ๊ธฐ๋ฅ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ ๐ผ๏ธ ์ด๋ฏธ์ง ์ดํด โ
โ - ์์ธ ์ค๋ช
๋ฐ ๋ถ์ โ
โ - ๋ค์ค ์ด๋ฏธ์ง ๋น๊ต โ
โ - ์ฐจํธ/๊ทธ๋ํ ํด์ โ
โ โ
โ ๐ ํ
์คํธ ์ธ์ (OCR) โ
โ - ์๊ธ์จ ์ธ์ โ
โ - ๋ค๊ตญ์ด ํ
์คํธ โ
โ - ๋ฌธ์ ๊ตฌ์กฐ ์ดํด โ
โ โ
โ ๐ ์ธ๋ถ ๋ถ์ โ
โ - ๊ฐ์ฒด ์๋ณ ๋ฐ ์นด์ดํ
โ
โ - ๊ณต๊ฐ ๊ด๊ณ ์ดํด โ
โ - ์์ฑ ์ถ๋ก โ
โ โ
โ ๐ก ์ถ๋ก ๋ฐ ์ฐฝ์ โ
โ - ์ด๋ฏธ์ง ๊ธฐ๋ฐ ์ถ๋ก โ
โ - ์ฝ๋ ์์ฑ (UI ์คํฌ๋ฆฐ์ท โ ์ฝ๋) โ
โ - ์ฐฝ์์ ๊ธ์ฐ๊ธฐ โ
โ โ
โ โ ๏ธ ์ ํ ์ฌํญ โ
โ - ์๋ฃ ์ง๋จ ๋ถ๊ฐ โ
โ - ์ผ๊ตด ์ธ์/์ ์ ํ์ธ ๋ถ๊ฐ โ
โ - ์ค์๊ฐ ๋น๋์ค ๋ฏธ์ง์ (์ด๋ฏธ์ง๋ง) โ
โ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.2 API ์ฌ์ฉ๋ฒ¶
from openai import OpenAI
import base64
from pathlib import Path
client = OpenAI()
def encode_image(image_path: str) -> str:
    """Read an image file and return its contents as a base64 string."""
    raw = Path(image_path).read_bytes()
    return base64.b64encode(raw).decode()
def gpt4v_basic(image_path: str, prompt: str) -> str:
    """Analyze a single local image with GPT-4V and return the text reply."""
    data_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    user_content = [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": data_url}},
    ]
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[{"role": "user", "content": user_content}],
        max_tokens=1024,
    )
    return response.choices[0].message.content
def gpt4v_multi_image(image_paths: list, prompt: str) -> str:
    """Analyze several images together in one GPT-4V request."""
    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(p)}"},
        }
        for p in image_paths
    ]
    content = [{"type": "text", "text": prompt}, *image_parts]
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[{"role": "user", "content": content}],
        max_tokens=2048,
    )
    return response.choices[0].message.content
def gpt4v_with_detail(image_path: str, prompt: str, detail: str = "high") -> str:
    """Analyze an image at a specific detail level.

    detail:
        - "low": fast and cheap, low-resolution pass
        - "high": detailed analysis, consumes more tokens
        - "auto": let the API choose
    """
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encode_image(image_path)}",
            "detail": detail,
        },
    }
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}, image_part],
            }
        ],
        max_tokens=1024,
    )
    return response.choices[0].message.content
def gpt4v_url_image(image_url: str, prompt: str) -> str:
    """Analyze an image referenced by URL (no local file / base64 needed)."""
    parts = [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[{"role": "user", "content": parts}],
        max_tokens=1024,
    )
    return response.choices[0].message.content
1.3 ์ค์ ์์ฉ¶
class GPT4VApplications:
    """Practical GPT-4V applications built on the chat completions API."""

    def __init__(self):
        self.client = OpenAI()

    def analyze_ui_screenshot(self, screenshot_path: str) -> dict:
        """Analyze a UI screenshot and generate HTML/CSS that recreates it.

        Returns the parsed JSON on success, or {"raw_response": ...} when
        the model's reply is not valid JSON.
        """
        import json

        prompt = """Analyze this UI screenshot and:
1. List all UI components visible
2. Describe the layout structure
3. Generate HTML/CSS code to recreate this UI

Format your response as JSON with keys:
- components: list of UI elements
- layout: description of layout
- html_code: HTML implementation
- css_code: CSS styles
"""
        response = self._call_api(screenshot_path, prompt)
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # A bare `except:` previously swallowed everything, including
            # KeyboardInterrupt; only a parse failure should fall back.
            return {"raw_response": response}

    def extract_data_from_chart(self, chart_path: str) -> str:
        """Extract chart type, labels, data points and insights.

        Note: returns the model's raw text — the previous `-> dict`
        annotation was wrong, no parsing is performed here.
        """
        prompt = """Analyze this chart and extract:
1. Chart type (bar, line, pie, etc.)
2. Title and axis labels
3. All data points with their values
4. Key insights or trends

Return as structured JSON.
"""
        return self._call_api(chart_path, prompt)

    def compare_images(self, image_paths: list) -> str:
        """Compare several images: similarities, differences, quality."""
        prompt = """Compare these images and describe:
1. Similarities
2. Differences
3. Which image is better quality and why
4. Any notable features in each
"""
        return gpt4v_multi_image(image_paths, prompt)

    def ocr_with_structure(self, document_path: str) -> str:
        """Structure-preserving OCR: headings, tables and lists as markdown.

        Returns the model's raw markdown text (annotation fixed from dict).
        """
        prompt = """Extract all text from this document and preserve:
1. Headings and hierarchy
2. Tables (as markdown)
3. Lists (numbered and bulleted)
4. Key-value pairs

Return as structured markdown.
"""
        return self._call_api(document_path, prompt)

    def generate_alt_text(self, image_path: str) -> str:
        """Generate alt text for web accessibility (screen readers)."""
        prompt = """Generate an appropriate alt text for this image.
The alt text should be:
1. Concise (under 125 characters)
2. Descriptive of the main content
3. Useful for screen reader users

Just return the alt text, nothing else.
"""
        return self._call_api(image_path, prompt)

    def _call_api(self, image_path: str, prompt: str) -> str:
        """Send one image + prompt to GPT-4V and return the text reply."""
        response = self.client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{encode_image(image_path)}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
            max_tokens=2048,
        )
        return response.choices[0].message.content
2. GPT-4o (Omni)¶
2.1 GPT-4o ๊ฐ์¶
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ GPT-4o vs GPT-4V ๋น๊ต โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ GPT-4V (๊ธฐ์กด): โ
โ - ํ
์คํธ + ์ด๋ฏธ์ง ์
๋ ฅ โ
โ - ๋ณ๋์ ๋น์ ์ธ์ฝ๋ โ
โ - ๋น๊ต์ ๋๋ฆฐ ์๋ต โ
โ โ
โ GPT-4o (2024.05): โ
โ - ํ
์คํธ + ์ด๋ฏธ์ง + ์ค๋์ค ๋ค์ดํฐ๋ธ โ
โ - ๋จ์ผ ๋ชจ๋ธ์์ ๋ชจ๋ ๋ชจ๋ฌ๋ฆฌํฐ ์ฒ๋ฆฌ โ
โ - 2๋ฐฐ ๋น ๋ฅธ ์๋ต, 50% ์ ๋ ดํ ๊ฐ๊ฒฉ โ
โ - ์ค์๊ฐ ์์ฑ ๋ํ ๊ฐ๋ฅ โ
โ โ
โ ์ฃผ์ ๊ฐ์ ์ : โ
โ โ
์๋: ํ๊ท 320ms ์๋ต (GPT-4V ๋๋น 2๋ฐฐ) โ
โ โ
๋น์ฉ: ์
๋ ฅ $5/1M, ์ถ๋ ฅ $15/1M โ
โ โ
๋น์ : ํฅ์๋ OCR, ์ฐจํธ ํด์ โ
โ โ
์ค๋์ค: ์ค์๊ฐ ์์ฑ ์
์ถ๋ ฅ โ
โ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
2.2 GPT-4o API ์ฌ์ฉ๋ฒ¶
from openai import OpenAI
import base64
client = OpenAI()
def gpt4o_vision(image_path: str, prompt: str) -> str:
    """Analyze an image with GPT-4o (natively multimodal model)."""
    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode()

    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded}",
            "detail": "high",
        },
    }
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}, image_part],
            }
        ],
        max_tokens=1024,
    )
    return response.choices[0].message.content
def gpt4o_audio(audio_path: str, prompt: str) -> str:
    """Analyze a WAV audio file with the GPT-4o audio-preview model."""
    with open(audio_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode()

    audio_part = {
        "type": "input_audio",
        "input_audio": {"data": encoded, "format": "wav"},
    }
    response = client.chat.completions.create(
        model="gpt-4o-audio-preview",
        modalities=["text"],
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}, audio_part],
            }
        ],
    )
    return response.choices[0].message.content
# GPT-4o-mini: ์ ๋น์ฉ ๋ฒ์
def gpt4o_mini_vision(image_path: str, prompt: str) -> str:
    """GPT-4o-mini: fast, low-cost vision analysis."""
    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode()

    user_content = [
        {"type": "text", "text": prompt},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        },
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": user_content}],
        max_tokens=512,
    )
    return response.choices[0].message.content
3. Google Gemini¶
3.1 Gemini ๋ชจ๋ธ ๋ผ์ธ์ ¶
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ Gemini ๋ชจ๋ธ ๋น๊ต โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ Gemini 1.5 Flash: โ
โ - ๋น ๋ฅธ ์๋ต, ์ ๋น์ฉ โ
โ - 1M ํ ํฐ ์ปจํ
์คํธ โ
โ - ์ค์๊ฐ ์์ฉ์ ์ ํฉ โ
โ โ
โ Gemini 1.5 Pro: โ
โ - ์ต๊ณ ์ฑ๋ฅ โ
โ - 2M ํ ํฐ ์ปจํ
์คํธ โ
โ - ๋ณต์กํ ์ถ๋ก , ์ฝ๋ ์์ฑ โ
โ โ
โ Gemini 1.0 Ultra: โ
โ - ๊ฐ์ฅ ํฐ ๋ชจ๋ธ โ
โ - ๋ณต์กํ ๋ฉํฐ๋ชจ๋ฌ ํ์คํฌ โ
โ โ
โ ํน๋ณ ๊ธฐ๋ฅ: โ
โ - ๋ค์ดํฐ๋ธ ๋ฉํฐ๋ชจ๋ฌ (ํ
์คํธ, ์ด๋ฏธ์ง, ์ค๋์ค, ๋น๋์ค) โ
โ - ์ด์ฅ๋ฌธ ์ปจํ
์คํธ (1์๊ฐ ๋น๋์ค ๋ถ์ ๊ฐ๋ฅ) โ
โ - Code execution ๋ด์ฅ โ
โ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
3.2 Gemini API ์ฌ์ฉ๋ฒ¶
import google.generativeai as genai
from PIL import Image
# API ํค ์ค์
genai.configure(api_key="YOUR_API_KEY")
def gemini_basic(image_path: str, prompt: str) -> str:
    """Run a single-image prompt through Gemini 1.5 Pro."""
    model = genai.GenerativeModel('gemini-1.5-pro')
    img = Image.open(image_path)
    return model.generate_content([prompt, img]).text
def gemini_multi_image(image_paths: list, prompt: str) -> str:
    """Analyze multiple images in a single Gemini request."""
    model = genai.GenerativeModel('gemini-1.5-pro')
    parts = [prompt] + [Image.open(p) for p in image_paths]
    return model.generate_content(parts).text
def gemini_video_analysis(video_path: str, prompt: str) -> str:
    """Analyze a video with Gemini (video is a Gemini-native modality).

    Uploads the file via the Files API, polls until server-side
    processing finishes, then runs the prompt against it.
    """
    import time

    model = genai.GenerativeModel('gemini-1.5-pro')

    video_file = genai.upload_file(video_path)
    while video_file.state.name == "PROCESSING":
        time.sleep(10)
        video_file = genai.get_file(video_file.name)

    if video_file.state.name == "FAILED":
        raise ValueError("Video processing failed")

    return model.generate_content([prompt, video_file]).text
def gemini_long_context(documents: list, query: str) -> str:
    """Answer a query over many documents using Gemini's long (1M+) context.

    PDFs are uploaded through the Files API, images are passed inline,
    and anything else is read in as UTF-8 text.
    """
    model = genai.GenerativeModel('gemini-1.5-pro')

    content = [query]
    for doc in documents:
        if doc.endswith('.pdf'):
            content.append(genai.upload_file(doc))
        elif doc.endswith(('.jpg', '.png')):
            content.append(Image.open(doc))
        else:
            # Explicit encoding: the platform-default encoding broke on
            # non-ASCII documents (e.g. cp1252 on Windows).
            with open(doc, 'r', encoding='utf-8') as f:
                content.append(f.read())

    response = model.generate_content(content)
    return response.text
def gemini_with_code_execution(prompt: str) -> dict:
    """Run a prompt with Gemini's built-in code-execution tool enabled.

    Returns the full text reply plus a list of {code, output} entries for
    every code-execution part found in the response.
    """
    model = genai.GenerativeModel(
        'gemini-1.5-pro',
        tools='code_execution',
    )
    response = model.generate_content(prompt)

    executions = []
    for part in response.parts:
        if hasattr(part, 'code_execution_result'):
            executions.append({
                'code': part.text,
                'output': part.code_execution_result.output,
            })

    return {'text': response.text, 'code_execution': executions}
3.3 Gemini ํนํ ์์ฉ¶
class GeminiApplications:
    """Gemini-specific applications: long video, mixed modalities, PDFs."""

    def __init__(self):
        self.model = genai.GenerativeModel('gemini-1.5-pro')

    def analyze_long_video(self, video_path: str, questions: list) -> dict:
        """Answer a list of questions about a (possibly hour-long) video."""
        video_file = self._upload_and_wait(video_path)

        answers = {}
        for question in questions:
            prompt = f"""Analyze this video and answer: {question}
Provide timestamps when relevant.
"""
            reply = self.model.generate_content([prompt, video_file])
            answers[question] = reply.text
        return answers

    def multimodal_reasoning(
        self,
        images: list,
        audio_path: str = None,
        text: str = None,
    ) -> str:
        """Reason over any mix of text, images and audio in one request."""
        parts = []
        if text:
            parts.append(text)
        parts.extend(Image.open(p) for p in images)
        if audio_path:
            parts.append(self._upload_and_wait(audio_path))
        return self.model.generate_content(parts).text

    def research_assistant(
        self,
        pdf_paths: list,
        research_question: str,
    ) -> dict:
        """Analyze a set of academic PDFs and answer a research question."""
        files = [self._upload_and_wait(path) for path in pdf_paths]

        prompt = f"""You are a research assistant. Analyze these academic papers
and answer the following research question:

{research_question}

Structure your response as:
1. Summary of relevant findings from each paper
2. Synthesis of the findings
3. Gaps or contradictions
4. Suggested future directions
"""
        reply = self.model.generate_content([prompt] + files)
        return {
            'answer': reply.text,
            'sources': pdf_paths,
        }

    def _upload_and_wait(self, file_path: str):
        """Upload a file and poll until Gemini finishes processing it."""
        import time

        handle = genai.upload_file(file_path)
        while handle.state.name == "PROCESSING":
            time.sleep(5)
            handle = genai.get_file(handle.name)
        return handle
4. Anthropic Claude 3¶
4.1 Claude 3 ๋ชจ๋ธ ๋ผ์ธ์ ¶
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ Claude 3 Family (2024.03) โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ Claude 3 Haiku: โ
โ - ๊ฐ์ฅ ๋น ๋ฅด๊ณ ์ ๋ ด โ
โ - ์ค์๊ฐ ์์ฉ, ๋๋ ์ฒ๋ฆฌ โ
โ - ๋น์ ์ง์ โ
โ โ
โ Claude 3 Sonnet: โ
โ - ์๋์ ์ฑ๋ฅ์ ๊ท ํ โ
โ - ๋๋ถ๋ถ์ ๋น์ฆ๋์ค ์ฉ๋์ ์ ํฉ โ
โ - ๋น์ ์ง์ โ
โ โ
โ Claude 3 Opus: โ
โ - ์ต๊ณ ์ฑ๋ฅ โ
โ - ๋ณต์กํ ์ถ๋ก , ๋ถ์ ํ์คํฌ โ
โ - ๋น์ ์ง์ โ
โ โ
โ Claude 3.5 Sonnet (2024.06): โ
โ - Opus ์์ค ์ฑ๋ฅ, Sonnet ๊ฐ๊ฒฉ โ
โ - ํฅ์๋ ๋น์ , ์ฝ๋ฉ ๋ฅ๋ ฅ โ
โ - 200K ํ ํฐ ์ปจํ
์คํธ โ
โ โ
โ ํน์ง: โ
โ โ
200K ์ปจํ
์คํธ ์๋์ฐ (์ ๋ชจ๋ธ) โ
โ โ
๋ฉํฐ๋ชจ๋ฌ: ์ด๋ฏธ์ง ์ดํด โ
โ โ
์์ ์ฑ: Constitutional AI ์ ์ฉ โ
โ โ
๋๊ตฌ ์ฌ์ฉ: Function Calling ์ง์ โ
โ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
4.2 Claude API ์ฌ์ฉ๋ฒ¶
import anthropic
import base64
client = anthropic.Anthropic()
def claude_vision(image_path: str, prompt: str, model: str = "claude-sonnet-4-20250514") -> str:
    """Analyze an image with Claude vision.

    The media type is derived from the file extension case-insensitively:
    the previous `endswith` checks missed ".PNG", ".GIF", etc. and silently
    fell back to JPEG. Unknown extensions still default to image/jpeg.
    """
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")

    extension = image_path.rsplit(".", 1)[-1].lower()
    media_type = {
        "png": "image/png",
        "gif": "image/gif",
        "webp": "image/webp",
    }.get(extension, "image/jpeg")

    message = client.messages.create(
        model=model,
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ],
    )
    return message.content[0].text
def claude_multi_image(image_paths: list, prompt: str) -> str:
    """Analyze several images with Claude in one request.

    Media types are detected per file, case-insensitively, covering the
    same formats as claude_vision — the original only distinguished PNG
    from JPEG and mislabeled GIF/WebP uploads. Unknown extensions default
    to image/jpeg.
    """
    media_types = {
        "png": "image/png",
        "gif": "image/gif",
        "webp": "image/webp",
    }

    content = []
    for path in image_paths:
        with open(path, "rb") as f:
            image_data = base64.standard_b64encode(f.read()).decode("utf-8")
        extension = path.rsplit(".", 1)[-1].lower()
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": media_types.get(extension, "image/jpeg"),
                "data": image_data,
            },
        })
    content.append({"type": "text", "text": prompt})

    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2048,
        messages=[{"role": "user", "content": content}],
    )
    return message.content[0].text
def claude_with_tools(prompt: str, image_path: str = None) -> dict:
    """Call Claude with a tool definition (function calling).

    Optionally attaches a JPEG image ahead of the text prompt. Returns
    the raw content blocks and the stop reason so the caller can detect
    a tool_use request.
    """
    weather_tool = {
        "name": "get_weather",
        "description": "Get current weather for a location",
        "input_schema": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City name",
                }
            },
            "required": ["location"],
        },
    }

    content = [{"type": "text", "text": prompt}]
    if image_path:
        with open(image_path, "rb") as f:
            encoded = base64.standard_b64encode(f.read()).decode("utf-8")
        # Image goes first so the text prompt can refer to it.
        content.insert(0, {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": encoded,
            },
        })

    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        tools=[weather_tool],
        messages=[{"role": "user", "content": content}],
    )
    return {
        "content": message.content,
        "stop_reason": message.stop_reason,
    }
4.3 Claude ํนํ ๊ธฐ๋ฅ¶
class ClaudeApplications:
    """Claude-specific applications: long documents, code review, structured output.

    NOTE(review): the original prompt strings were garbled by text
    extraction (multi-line f-strings collapsed onto single lines); they
    are reconstructed here with the visible text preserved — confirm
    against the original source.
    """

    def __init__(self):
        self.client = anthropic.Anthropic()

    def long_document_analysis(self, document_text: str, query: str) -> str:
        """Analyze a long document (within the 200K-token context) and answer a question."""
        message = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": f"""๋ค์ ๋ฌธ์๋ฅผ ๋ถ์ํ๊ณ ์ง๋ฌธ์ ๋ตํ์ธ์.

๋ฌธ์:
{document_text}

์ง๋ฌธ: {query}
""",
                }
            ],
        )
        return message.content[0].text

    def code_review(self, code: str, language: str = "python") -> str:
        """Review a code snippet: bugs, performance, style, security."""
        message = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2048,
            messages=[
                {
                    "role": "user",
                    "content": f"""๋ค์ {language} ์ฝ๋๋ฅผ ๋ฆฌ๋ทฐํด์ฃผ์ธ์.

```{language}
{code}
```

๋ค์์ ํฌํจํด์ฃผ์ธ์:
1. ์ ์ฌ์ ๋ฒ๊ทธ
2. ์ฑ๋ฅ ๊ฐ์ ์ฌํญ
3. ์ฝ๋ ์คํ์ผ ์ ์
4. ๋ณด์ ๋ฌธ์ """,
                }
            ],
        )
        return message.content[0].text

    def structured_output(self, image_path: str, schema: dict) -> dict:
        """Analyze an image and return JSON matching the given schema.

        Raises json.JSONDecodeError if the model's reply is not pure JSON.
        """
        import json

        with open(image_path, "rb") as f:
            image_data = base64.standard_b64encode(f.read()).decode("utf-8")

        message = self.client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2048,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": image_data,
                            },
                        },
                        {
                            "type": "text",
                            "text": f"""์ด ์ด๋ฏธ์ง๋ฅผ ๋ถ์ํ๊ณ ๋ค์ JSON ์คํค๋ง์ ๋ง์ถฐ ๊ฒฐ๊ณผ๋ฅผ ๋ฐํํ์ธ์:

{json.dumps(schema, indent=2, ensure_ascii=False)}

JSON๋ง ๋ฐํํ์ธ์.""",
                        },
                    ],
                }
            ],
        )
        return json.loads(message.content[0].text)
---
## 5. ๋น๊ต ๋ฐ ์ ํ ๊ฐ์ด๋
### 5.1 ๋ฉํฐ๋ชจ๋ฌ ๋ชจ๋ธ ๋น๊ต
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ 2024 ๋ฉํฐ๋ชจ๋ฌ ๋ชจ๋ธ ๋น๊ต โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค โ โ โ ๊ธฐ๋ฅ GPT-4o Gemini 1.5 Pro Claude 3.5 Sonnet โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ โ ์ด๋ฏธ์ง ์ดํด โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ ๋น๋์ค ๋ถ์ โ โ โ โ โ โ (๋ค์ดํฐ๋ธ) โ โ โ ์ค๋์ค ๋ถ์ โ โ โ โ โ โ โ โ โ โ โ โ โ ์ปจํ ์คํธ 128K 2M 200K โ โ ์ฝ๋ ์คํ โ โ โ โ โ โ (๋ด์ฅ) โ โ โ ์๋ โ โ โ โ โ โ โ โ โ โ (Flash) โ โ โ โ โ โ โ ๊ฐ๊ฒฉ ์ค๊ฐ ๋ฎ์ ์ค๊ฐ โ โ ์ฝ๋ฉ ๋ฅ๋ ฅ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ ์ถ๋ก ๋ฅ๋ ฅ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ โ ์ถ์ฒ ์ฌ์ฉ ์ฌ๋ก: โ โ - GPT-4o: ์ค์๊ฐ ๋ฉํฐ๋ชจ๋ฌ, ์์ฑ ๋ํ, ๋น ๋ฅธ ์๋ต ํ์ ์ โ โ - Gemini: ๋น๋์ค ๋ถ์, ์ด์ฅ๋ฌธ ๋ฌธ์, ๋ฉํฐ๋ชจ๋ฌ ๋ณตํฉ ํ์คํฌ โ โ - Claude: ๋ณต์กํ ์ถ๋ก , ์ฝ๋ ๋ฆฌ๋ทฐ, ๊ธด ๋ฌธ์ ๋ถ์, ์์ ์ฑ ์ค์ ์ โ โ โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
### 5.2 ์ฌ์ฉ ์ฌ๋ก๋ณ ์ ํ
```python
def select_model(use_case: str) -> str:
    """Recommend a model for a given use case (2024 update).

    Unknown use cases fall back to "gpt-4o" as a safe general default.
    """
    recommendations = {
        # Cases where GPT-4o shines: realtime / voice / fast vision
        "ui_to_code": "gpt-4o",
        "realtime_chat": "gpt-4o",
        "voice_assistant": "gpt-4o-audio-preview",
        "quick_vision": "gpt-4o",
        # Cases where Gemini shines: video, very long context, mixed media
        "video_analysis": "gemini-1.5-pro",
        "very_long_document": "gemini-1.5-pro",  # 2M-token context
        "audio_transcription": "gemini-1.5-pro",
        "multimodal_app": "gemini-1.5-pro",
        # Cases where Claude shines: reasoning, code, safety
        "complex_reasoning": "claude-sonnet-4-20250514",
        "code_review": "claude-sonnet-4-20250514",
        "long_document": "claude-sonnet-4-20250514",  # 200K-token context
        "safety_critical": "claude-sonnet-4-20250514",
        # Cost optimization
        "high_volume": "gemini-1.5-flash",
        "quick_caption": "gpt-4o-mini",
        "simple_classification": "claude-3-haiku-20240307",
    }
    return recommendations.get(use_case, "gpt-4o")
6. ๋น์ฉ ์ต์ ํ¶
6.1 ๋น์ฉ ๊ณ์ฐ¶
class CostEstimator:
    """Estimate multimodal API costs in USD.

    Fixes vs. the original: the PRICING table is expressed per 1M tokens,
    but the cost formulas divided token counts by 1,000 — overestimating
    every cost by a factor of 1000. The Gemini estimator also charged raw
    character counts at per-token prices; characters are now converted to
    approximate tokens (~4 characters per token) first.
    """

    # 2024 pricing (USD per 1M tokens)
    PRICING = {
        "gpt-4-vision-preview": {
            "input": 10.0,   # per 1M tokens
            "output": 30.0,  # per 1M tokens
            "image_low": 85,    # tokens
            "image_high": 765,  # tokens (base) + tiles
        },
        "gpt-4o": {
            "input": 5.0,    # per 1M tokens
            "output": 15.0,  # per 1M tokens
            "image_low": 85,
            "image_high": 765,
        },
        "gpt-4o-mini": {
            "input": 0.15,   # per 1M tokens
            "output": 0.60,  # per 1M tokens
            "image_low": 85,
            "image_high": 765,
        },
        "gemini-1.5-pro": {
            "input": 1.25,   # per 1M tokens
            "output": 5.0,
            "image": 258,  # tokens per image
            "video": 263,  # tokens per second
            "audio": 32,   # tokens per second
        },
        "gemini-1.5-flash": {
            "input": 0.075,
            "output": 0.30,
        },
        "claude-3-opus": {
            "input": 15.0,   # per 1M tokens
            "output": 75.0,
        },
        "claude-sonnet-4-20250514": {
            "input": 3.0,    # per 1M tokens
            "output": 15.0,
        },
        "claude-3-haiku": {
            "input": 0.25,   # per 1M tokens
            "output": 1.25,
        },
    }

    # Rough conversion used when only character counts are known.
    _CHARS_PER_TOKEN = 4

    def estimate_gpt4v_cost(
        self,
        num_images: int,
        avg_prompt_tokens: int,
        avg_response_tokens: int,
        detail: str = "high",
    ) -> float:
        """Estimate GPT-4V cost in USD for a batch of image requests.

        detail "low" uses the flat low-res token count per image; any
        other value uses the high-detail base token count.
        """
        pricing = self.PRICING["gpt-4-vision-preview"]

        per_image = pricing["image_low"] if detail == "low" else pricing["image_high"]
        image_tokens = num_images * per_image

        total_input = avg_prompt_tokens + image_tokens
        total_output = avg_response_tokens

        # Prices are per 1M tokens.
        return (total_input / 1_000_000 * pricing["input"]
                + total_output / 1_000_000 * pricing["output"])

    def estimate_gemini_cost(
        self,
        num_images: int = 0,
        video_seconds: int = 0,
        audio_seconds: int = 0,
        text_chars: int = 0,
        output_chars: int = 0,
        model: str = "gemini-1.5-pro",
    ) -> float:
        """Estimate Gemini cost in USD.

        Character counts are converted to approximate tokens before
        applying the per-1M-token prices. Media (image/video/audio) token
        counts are only defined for gemini-1.5-pro in PRICING.
        """
        pricing = self.PRICING[model]

        input_tokens = text_chars / self._CHARS_PER_TOKEN
        output_tokens = output_chars / self._CHARS_PER_TOKEN

        if model == "gemini-1.5-pro":
            # Media is already expressed in tokens — no conversion needed.
            input_tokens += (num_images * pricing["image"]
                             + video_seconds * pricing["video"]
                             + audio_seconds * pricing["audio"])

        return (input_tokens / 1_000_000 * pricing["input"]
                + output_tokens / 1_000_000 * pricing["output"])
# Example: compare the estimated cost of analyzing 100 images
# with GPT-4V versus Gemini 1.5 Pro.
estimator = CostEstimator()

gpt4v_cost = estimator.estimate_gpt4v_cost(
    num_images=100,
    avg_prompt_tokens=100,
    avg_response_tokens=500,
    detail="high",
)

gemini_cost = estimator.estimate_gemini_cost(
    num_images=100,
    text_chars=500,
    output_chars=2000,
    model="gemini-1.5-pro",
)

print(f"GPT-4V cost: ${gpt4v_cost:.2f}")
print(f"Gemini Pro cost: ${gemini_cost:.2f}")