24. API & 평가
24. API & 평가¶
개요¶
상용 LLM API 사용법과 비용 최적화, 그리고 LLM 성능 평가를 위한 벤치마크와 방법론을 다룹니다.
1. 상용 LLM API¶
1.1 주요 제공사 비교¶
API 제공사 비교 (2024):
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β Provider β Model β Input/1M β Output/1M β Context β
ββββββββββββββββΌβββββββββββββββββΌββββββββββββΌββββββββββββΌββββββββββ€
β OpenAI β GPT-4 Turbo β $10 β $30 β 128K β
β β GPT-4o β $5 β $15 β 128K β
β β GPT-3.5 Turbo β $0.50 β $1.50 β 16K β
ββββββββββββββββΌβββββββββββββββββΌββββββββββββΌββββββββββββΌββββββββββ€
β Anthropic β Claude 3 Opus β $15 β $75 β 200K β
β β Claude 3 Sonnetβ $3 β $15 β 200K β
β β Claude 3 Haiku β $0.25 β $1.25 β 200K β
ββββββββββββββββΌβββββββββββββββββΌββββββββββββΌββββββββββββΌββββββββββ€
β Google β Gemini 1.5 Pro β $3.50 β $10.50 β 1M β
β β Gemini 1.5 Flashβ $0.35 β $1.05 β 1M β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
1.2 OpenAI API¶
from typing import Optional

import tiktoken
from openai import OpenAI
class OpenAIClient:
    """Thin wrapper around the OpenAI chat-completions API.

    Adds token counting (tiktoken) and per-model cost estimation on top
    of the raw SDK client.
    """

    # USD per 1M tokens; mirrors the pricing table in this document.
    PRICING = {
        "gpt-4o": {"input": 5.0, "output": 15.0},
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "gpt-3.5-turbo": {"input": 0.5, "output": 1.5},
    }

    def __init__(self, api_key: Optional[str] = None):
        # api_key=None lets the SDK fall back to the OPENAI_API_KEY env var.
        self.client = OpenAI(api_key=api_key)
        # cl100k_base is the tokenizer used by the GPT-3.5/GPT-4 family.
        self.token_encoder = tiktoken.get_encoding("cl100k_base")

    def chat(
        self,
        messages: list,
        model: str = "gpt-4o",
        temperature: float = 0.7,
        max_tokens: int = 1000,
        **kwargs,
    ) -> dict:
        """Run one chat completion; return reply text plus usage metadata."""
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            **kwargs,
        )
        return {
            "content": response.choices[0].message.content,
            "usage": {
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens,
            },
            "model": response.model,
            "finish_reason": response.choices[0].finish_reason,
        }

    def stream_chat(self, messages: list, model: str = "gpt-4o", **kwargs):
        """Yield response text incrementally as it is generated."""
        stream = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            **kwargs,
        )
        for chunk in stream:
            # Guard on choices too: some chunks (e.g. the final usage chunk)
            # carry no choices and would IndexError otherwise.
            if chunk.choices and chunk.choices[0].delta.content:
                yield chunk.choices[0].delta.content

    def count_tokens(self, text: str) -> int:
        """Number of tokens *text* occupies under cl100k_base."""
        return len(self.token_encoder.encode(text))

    def estimate_cost(
        self,
        prompt_tokens: int,
        completion_tokens: int,
        model: str = "gpt-4o",
    ) -> float:
        """Estimated request cost in USD; 0.0 for models without pricing data."""
        pricing = self.PRICING.get(model)
        if pricing is None:
            return 0.0
        return (
            prompt_tokens * pricing["input"] / 1_000_000
            + completion_tokens * pricing["output"] / 1_000_000
        )
# Function calling
def function_calling_example():
    """Demonstrate OpenAI tool/function calling with a weather lookup."""
    client = OpenAI()
    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name",
                    },
                },
                "required": ["location"],
            },
        },
    }
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "What's the weather in Seoul?"}],
        tools=[weather_tool],
        tool_choice="auto",
    )
    # Inspect the tool call the model decided to make, if any.
    message = response.choices[0].message
    if message.tool_calls:
        call = message.tool_calls[0]
        print(f"Function: {call.function.name}")
        print(f"Arguments: {call.function.arguments}")
1.3 Anthropic API¶
from anthropic import Anthropic
class AnthropicClient:
    """Thin wrapper around the Anthropic Claude messages API."""

    def __init__(self, api_key: Optional[str] = None):
        # api_key=None lets the SDK read ANTHROPIC_API_KEY from the environment.
        self.client = Anthropic(api_key=api_key)

    def chat(
        self,
        messages: list,
        model: str = "claude-3-sonnet-20240229",
        max_tokens: int = 1000,
        system: Optional[str] = None,
        **kwargs,
    ) -> dict:
        """Run one exchange; return reply text plus usage metadata."""
        response = self.client.messages.create(
            model=model,
            max_tokens=max_tokens,
            # Claude takes the system prompt as a separate argument, not a message.
            system=system or "You are a helpful assistant.",
            messages=messages,
            **kwargs,
        )
        return {
            "content": response.content[0].text,
            "usage": {
                "input_tokens": response.usage.input_tokens,
                "output_tokens": response.usage.output_tokens,
            },
            "model": response.model,
            "stop_reason": response.stop_reason,
        }

    def stream_chat(self, messages: list, **kwargs):
        """Yield reply text incrementally as it is generated."""
        with self.client.messages.stream(
            messages=messages,
            **kwargs
        ) as stream:
            for text in stream.text_stream:
                yield text

    def vision(
        self,
        image_url: str,
        prompt: str,
        model: str = "claude-3-sonnet-20240229",
    ) -> str:
        """Answer *prompt* about an image given as a remote URL or local path.

        Fix: the media type is now inferred from the file name instead of
        being hard-coded to image/jpeg, which misdeclared PNG/WebP inputs.
        """
        import base64
        import mimetypes

        import httpx

        # Load and base64-encode the image bytes.
        if image_url.startswith("http"):
            image_data = base64.standard_b64encode(
                httpx.get(image_url).content
            ).decode("utf-8")
        else:
            with open(image_url, "rb") as f:
                image_data = base64.standard_b64encode(f.read()).decode("utf-8")
        # Fall back to JPEG (the previous behavior) when the type is unknown.
        media_type = mimetypes.guess_type(image_url)[0] or "image/jpeg"
        response = self.client.messages.create(
            model=model,
            max_tokens=1024,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }],
        )
        return response.content[0].text
1.4 Google Gemini API¶
import google.generativeai as genai
class GeminiClient:
    """Thin wrapper around the Google Gemini API."""

    def __init__(self, api_key: str):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-1.5-pro')

    def chat(
        self,
        messages: list,
        temperature: float = 0.7,
        max_tokens: int = 1000,
    ) -> dict:
        """Send an OpenAI-style message list; return reply text and usage."""
        # Convert every message except the last into Gemini chat history:
        # Gemini uses "user"/"model" roles instead of "user"/"assistant".
        history = [
            {
                "role": "user" if msg["role"] == "user" else "model",
                "parts": [msg["content"]],
            }
            for msg in messages[:-1]
        ]
        session = self.model.start_chat(history=history)
        reply = session.send_message(
            messages[-1]["content"],
            generation_config=genai.GenerationConfig(
                temperature=temperature,
                max_output_tokens=max_tokens,
            ),
        )
        return {
            "content": reply.text,
            "usage": {
                "prompt_tokens": reply.usage_metadata.prompt_token_count,
                "completion_tokens": reply.usage_metadata.candidates_token_count,
            },
        }

    def multimodal(self, image_path: str, prompt: str) -> str:
        """Generate text from a prompt plus a local image file."""
        import PIL.Image

        image = PIL.Image.open(image_path)
        return self.model.generate_content([prompt, image]).text
2. 비용 최적화¶
2.1 비용 모니터링¶
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List
import json
@dataclass
class UsageRecord:
    """One logged API request."""

    timestamp: datetime            # when the request was logged
    model: str                     # model identifier, e.g. "gpt-4o"
    prompt_tokens: int
    completion_tokens: int
    cost: float                    # USD, computed at log time
    request_type: str = "chat"


class CostTracker:
    """In-memory tracker of API token usage and spend."""

    def __init__(self):
        self.records: List[UsageRecord] = []
        # USD per 1M tokens, per model.
        self.pricing = {
            "gpt-4o": {"input": 5.0, "output": 15.0},
            "gpt-4-turbo": {"input": 10.0, "output": 30.0},
            "gpt-3.5-turbo": {"input": 0.5, "output": 1.5},
            "claude-3-opus": {"input": 15.0, "output": 75.0},
            "claude-3-sonnet": {"input": 3.0, "output": 15.0},
            "claude-3-haiku": {"input": 0.25, "output": 1.25},
        }

    def log_request(
        self,
        model: str,
        prompt_tokens: int,
        completion_tokens: int,
        request_type: str = "chat",
    ):
        """Record one request; return its estimated cost in USD."""
        cost = self._calculate_cost(model, prompt_tokens, completion_tokens)
        self.records.append(UsageRecord(
            timestamp=datetime.now(),
            model=model,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            cost=cost,
            request_type=request_type,
        ))
        return cost

    def _calculate_cost(
        self,
        model: str,
        prompt_tokens: int,
        completion_tokens: int,
    ) -> float:
        """USD cost for a request; 0.0 for models without pricing data."""
        pricing = self.pricing.get(model)
        if pricing is None:
            return 0.0
        return (
            prompt_tokens * pricing["input"] / 1_000_000
            + completion_tokens * pricing["output"] / 1_000_000
        )

    def get_summary(self, period: str = "day") -> Dict:
        """Per-model usage summary for the given period.

        period: "day" (today only), "week" (last 7 days), "month"
        (last 30 days); any other value includes all records.
        Fixes the original implementation, which accepted *period*
        but ignored it entirely.
        """
        from collections import defaultdict
        from datetime import timedelta

        now = datetime.now()
        windows = {"week": timedelta(days=7), "month": timedelta(days=30)}
        summary = defaultdict(lambda: {"tokens": 0, "cost": 0, "requests": 0})
        for record in self.records:
            if period == "day":
                if record.timestamp.date() != now.date():
                    continue
            elif period in windows and record.timestamp < now - windows[period]:
                continue
            entry = summary[record.model]
            entry["tokens"] += record.prompt_tokens + record.completion_tokens
            entry["cost"] += record.cost
            entry["requests"] += 1
        return dict(summary)

    def set_budget_alert(self, daily_limit: float):
        """Return a warning string if today's spend exceeds *daily_limit*, else None."""
        today = datetime.now().date()
        today_cost = sum(
            r.cost for r in self.records
            if r.timestamp.date() == today
        )
        if today_cost > daily_limit:
            return f"⚠️ Daily budget exceeded: ${today_cost:.2f} / ${daily_limit:.2f}"
        return None
2.2 최적화 전략¶
class CostOptimizer:
    """Cost-cutting strategies: semantic caching, model routing, compression."""

    def __init__(self):
        # query -> (embedding, response); populated by add_to_cache().
        self.cache = {}
        # Sentence encoder is heavy; loaded lazily on first semantic_cache().
        self.encoder = None

    def semantic_cache(self, query: str, threshold: float = 0.95):
        """Return a cached response for a semantically similar query, else None."""
        from sentence_transformers import SentenceTransformer
        import numpy as np

        if self.encoder is None:
            self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        query_emb = self.encoder.encode(query)
        for cached_emb, response in self.cache.values():
            # Cosine similarity between new query and cached query.
            similarity = np.dot(query_emb, cached_emb) / (
                np.linalg.norm(query_emb) * np.linalg.norm(cached_emb)
            )
            if similarity > threshold:
                return response
        return None

    def add_to_cache(self, query: str, response: str):
        """Store a query/response pair (no-op until the encoder is loaded)."""
        if self.encoder is not None:
            self.cache[query] = (self.encoder.encode(query), response)

    def select_model(
        self,
        task_complexity: str,
        latency_requirement: str = "normal",
    ) -> str:
        """Route to the cheapest adequate model; defaults to gpt-4o."""
        model_map = {
            # (complexity, latency) -> model
            ("simple", "fast"): "gpt-3.5-turbo",
            ("simple", "normal"): "gpt-3.5-turbo",
            ("medium", "fast"): "claude-3-haiku",
            ("medium", "normal"): "gpt-4o",
            ("complex", "fast"): "gpt-4o",
            ("complex", "normal"): "claude-3-opus",
        }
        return model_map.get(
            (task_complexity, latency_requirement),
            "gpt-4o"
        )

    def prompt_compression(self, text: str, target_ratio: float = 0.5) -> str:
        """Naive compression: keep only the first and last sentences.

        target_ratio is currently unused — a placeholder for a real
        compressor such as LLMLingua.  Fixes two defects in the original:
        dead word-count code, and a trailing '.' producing an empty split
        segment so the final sentence was silently dropped.
        """
        sentences = [s for s in text.split('.') if s.strip()]
        if len(sentences) <= 2:
            return text
        return sentences[0] + '.' + sentences[-1]
3. LLM 평가¶
3.1 벤치마크¶
주요 벤치마크:
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β General β
β - MMLU: 57 subjects, multiple choice β
β - HellaSwag: Commonsense reasoning β
β - WinoGrande: Coreference resolution β
β β
β Reasoning β
β - GSM8K: Grade school math β
β - MATH: Competition math β
β - ARC: Science questions β
β β
β Coding β
β - HumanEval: Python code generation β
β - MBPP: Python problems β
β - CodeContests: Competitive programming β
β β
β Chat/Instruction β
β - MT-Bench: Multi-turn conversation β
β - AlpacaEval: Instruction following β
β - Chatbot Arena: Human preference β
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
3.2 자동 평가¶
import re
from typing import List, Dict
class LLMEvaluator:
    """LLM-as-judge automatic evaluation helpers.

    model_client must expose .chat(messages) -> {"content": str, ...}.
    """

    def __init__(self, model_client):
        self.client = model_client

    def evaluate_factuality(
        self,
        question: str,
        answer: str,
        reference: str,
    ) -> Dict:
        """Score factual consistency of *answer* against *reference* (1-5)."""
        prompt = f"""Evaluate if the answer is factually consistent with the reference.
Question: {question}
Answer: {answer}
Reference: {reference}
Score from 1-5 where:
1 = Completely incorrect
3 = Partially correct
5 = Completely correct
Provide your score and brief explanation.
Format: Score: X
Explanation: ..."""
        response = self.client.chat([{"role": "user", "content": prompt}])
        text = response["content"]
        # Extract the single-digit score; fall back to the midpoint (3).
        score_match = re.search(r'Score:\s*(\d)', text)
        score = int(score_match.group(1)) if score_match else 3
        return {
            "score": score,
            "explanation": text,
        }

    def evaluate_helpfulness(
        self,
        instruction: str,
        response: str,
    ) -> Dict:
        """Score a response on relevance/completeness/clarity/accuracy (1-5 each)."""
        prompt = f"""Evaluate how helpful and complete the response is.
Instruction: {instruction}
Response: {response}
Rate on these criteria (1-5 each):
1. Relevance: Does it address the instruction?
2. Completeness: Does it fully answer?
3. Clarity: Is it well-written and clear?
4. Accuracy: Is the information correct?
Format:
Relevance: X
Completeness: X
Clarity: X
Accuracy: X
Overall: X"""
        # Renamed from "response" to avoid shadowing the parameter above.
        result = self.client.chat([{"role": "user", "content": prompt}])
        text = result["content"]
        # Parse one score per criterion; default to the midpoint (3).
        scores = {}
        for criterion in ["Relevance", "Completeness", "Clarity", "Accuracy", "Overall"]:
            match = re.search(rf'{criterion}:\s*(\d)', text)
            scores[criterion.lower()] = int(match.group(1)) if match else 3
        return scores

    def pairwise_comparison(
        self,
        instruction: str,
        response_a: str,
        response_b: str,
    ) -> str:
        """Ask the judge which response is better; return "A", "B", or "TIE"."""
        prompt = f"""Compare these two responses to the instruction.
Instruction: {instruction}
Response A:
{response_a}
Response B:
{response_b}
Which response is better? Consider helpfulness, accuracy, and clarity.
Answer with:
- "A" if Response A is better
- "B" if Response B is better
- "TIE" if they are equally good
Your choice:"""
        reply = self.client.chat([{"role": "user", "content": prompt}])
        text = reply["content"].strip().upper()
        # Match a standalone verdict token.  The original used substring
        # membership ('"B" in text'), which misfired on words containing
        # those letters (e.g. "BETTER" contains "B"), forcing a TIE.
        verdict = re.search(r'\b(TIE|A|B)\b', text)
        return verdict.group(1) if verdict else "TIE"
# MT-Bench μ€νμΌ νκ°
class MTBenchEvaluator:
    """MT-Bench style multi-turn conversation judging."""

    def __init__(self, judge_model):
        self.judge = judge_model

    def evaluate_conversation(
        self,
        conversation: List[Dict],
    ) -> Dict:
        """Score every assistant turn in context; return per-turn scores and mean."""
        turn_scores = [
            self._evaluate_turn(conversation[:idx + 1])
            for idx, turn in enumerate(conversation)
            if turn["role"] == "assistant"
        ]
        average = sum(turn_scores) / len(turn_scores) if turn_scores else 0
        return {
            "turn_scores": turn_scores,
            "average": average,
        }

    def _evaluate_turn(self, context: List[Dict]) -> float:
        """Ask the judge model to rate the last assistant reply (1-10)."""
        transcript = "\n".join(
            f"{msg['role']}: {msg['content']}" for msg in context
        )
        prompt = f"""Rate the assistant's last response on a scale of 1-10.
Conversation:
{transcript}
Consider:
- Helpfulness
- Relevance
- Accuracy
- Depth
Score (1-10):"""
        verdict = self.judge.chat([{"role": "user", "content": prompt}])
        # First number in the reply is taken as the score; 5.0 is the fallback.
        found = re.search(r'\d+', verdict["content"])
        return float(found.group()) if found else 5.0
3.3 인간 평가¶
from dataclasses import dataclass
from typing import Optional
import random
@dataclass
class EvaluationItem:
    """One pairwise comparison shown to an annotator."""

    id: str
    instruction: str
    response_a: str                    # response shown in slot A
    response_b: str                    # response shown in slot B
    model_a: str                       # model that produced response_a
    model_b: str                       # model that produced response_b
    winner: Optional[str] = None       # "A", "B", or "TIE" once judged
    annotator: Optional[str] = None


class HumanEvaluation:
    """Collect pairwise human judgments and derive Elo ratings from them."""

    def __init__(self):
        self.items: List[EvaluationItem] = []
        # Win counts per model name; draws are tallied under "TIE".
        self.results: Dict[str, int] = {}

    def add_comparison(
        self,
        instruction: str,
        responses: Dict[str, str],  # {model_name: response}
    ):
        """Queue a comparison between exactly two models' responses.

        Slot order (A vs B) is randomized to avoid position bias.
        """
        models = list(responses.keys())
        if len(models) != 2:
            raise ValueError("Exactly 2 models required")
        if random.random() > 0.5:
            models = models[::-1]
        first, second = models
        self.items.append(EvaluationItem(
            id=str(len(self.items)),
            instruction=instruction,
            response_a=responses[first],
            response_b=responses[second],
            model_a=first,
            model_b=second,
        ))

    def record_judgment(
        self,
        item_id: str,
        winner: str,  # "A", "B", or "TIE"
        annotator: str,
    ):
        """Store one annotator verdict and tally the winning model."""
        for item in self.items:
            if item.id != item_id:
                continue
            item.winner = winner
            item.annotator = annotator
            if winner == "A":
                victor = item.model_a
            elif winner == "B":
                victor = item.model_b
            else:
                victor = "TIE"
            self.results[victor] = self.results.get(victor, 0) + 1
            break

    def get_elo_ratings(self) -> Dict[str, float]:
        """Compute Elo ratings (base 1500, K=32) over all judged items."""
        ratings = {}
        for item in self.items:
            ratings[item.model_a] = 1500
            ratings[item.model_b] = 1500
        K = 32  # K-factor: maximum rating change per game
        outcome = {"A": (1, 0), "B": (0, 1)}
        for item in self.items:
            if item.winner is None:
                continue
            ra = ratings[item.model_a]
            rb = ratings[item.model_b]
            # Expected scores from the logistic Elo curve.
            expected_a = 1 / (1 + 10 ** ((rb - ra) / 400))
            expected_b = 1 / (1 + 10 ** ((ra - rb) / 400))
            # Actual scores; a tie is worth half a point each.
            score_a, score_b = outcome.get(item.winner, (0.5, 0.5))
            ratings[item.model_a] = ra + K * (score_a - expected_a)
            ratings[item.model_b] = rb + K * (score_b - expected_b)
        return ratings
핵심 정리¶
API 사용 체크리스트¶
☐ API 키 환경 변수로 관리
☐ 토큰 수 사전 계산
☐ 비용 모니터링 설정
☐ Rate limit 처리
☐ 에러 핸들링 및 재시도
☐ 캐싱 전략 구현
평가 방법 선택¶
- 객관식 문제 → 정확도
- 생성 태스크 → LLM-as-Judge
- 채팅/대화 → MT-Bench/Chatbot Arena
- 코딩 → pass@k, HumanEval
- 프로덕션 → A/B 테스트
참고 자료¶
- OpenAI API Documentation
- Anthropic Claude Documentation
- Zheng et al. (2023). "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena"
- Chen et al. (2021). "Evaluating Large Language Models Trained on Code" (HumanEval)