1"""
209. RAG (Retrieval-Augmented Generation) 예제
3
4문서 검색 + LLM 생성 결합
5"""
6
7import numpy as np
8
9print("=" * 60)
10print("RAG 시스템")
11print("=" * 60)
12
13
14# ============================================
15# 1. 간단한 RAG 구현 (NumPy만 사용)
16# ============================================
17print("\n[1] 간단한 RAG (NumPy)")
18print("-" * 40)
19
20class SimpleVectorStore:
21 """간단한 벡터 저장소"""
22 def __init__(self):
23 self.documents = []
24 self.embeddings = None
25
26 def add_documents(self, documents, embeddings):
27 self.documents = documents
28 self.embeddings = np.array(embeddings)
29
30 def search(self, query_embedding, top_k=3):
31 """코사인 유사도로 검색"""
32 query = np.array(query_embedding)
33
34 # 코사인 유사도
35 similarities = np.dot(self.embeddings, query) / (
36 np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query)
37 )
38
39 # 상위 k개
40 top_indices = np.argsort(similarities)[-top_k:][::-1]
41 return [(self.documents[i], similarities[i]) for i in top_indices]
42
43
44# 예시 문서
45documents = [
46 "Python is a high-level programming language known for its readability.",
47 "Machine learning is a subset of artificial intelligence.",
48 "Deep learning uses neural networks with many layers.",
49 "Natural language processing deals with text and speech.",
50 "Computer vision enables machines to interpret images."
51]
52
53# 가상 임베딩 (실제로는 모델 사용)
54np.random.seed(42)
55embeddings = np.random.randn(len(documents), 128)
56
57# 벡터 저장소
58store = SimpleVectorStore()
59store.add_documents(documents, embeddings)
60
61# 검색
62query_embedding = np.random.randn(128)
63results = store.search(query_embedding, top_k=2)
64
65print("검색 결과:")
66for doc, score in results:
67 print(f" [{score:.4f}] {doc[:50]}...")
68
69
70# ============================================
71# 2. Sentence Transformers + RAG
72# ============================================
73print("\n[2] Sentence Transformers RAG")
74print("-" * 40)
75
76try:
77 from sentence_transformers import SentenceTransformer
78
79 # 임베딩 모델
80 model = SentenceTransformer('all-MiniLM-L6-v2')
81
82 # 문서 임베딩
83 doc_embeddings = model.encode(documents)
84 print(f"문서 임베딩 shape: {doc_embeddings.shape}")
85
86 # 쿼리
87 query = "What is machine learning?"
88 query_embedding = model.encode(query)
89
90 # 검색
91 store = SimpleVectorStore()
92 store.add_documents(documents, doc_embeddings)
93 results = store.search(query_embedding, top_k=2)
94
95 print(f"\n쿼리: {query}")
96 print("검색 결과:")
97 for doc, score in results:
98 print(f" [{score:.4f}] {doc}")
99
100except ImportError:
101 print("sentence-transformers 미설치")
102
103
104# ============================================
105# 3. 청킹 (Chunking)
106# ============================================
107print("\n[3] 텍스트 청킹")
108print("-" * 40)
109
110def chunk_text(text, chunk_size=100, overlap=20):
111 """오버랩이 있는 청킹"""
112 chunks = []
113 start = 0
114 while start < len(text):
115 end = start + chunk_size
116 chunk = text[start:end]
117 chunks.append(chunk)
118 start = end - overlap
119 return chunks
120
121long_text = """
122Artificial intelligence (AI) is intelligence demonstrated by machines,
123as opposed to natural intelligence displayed by animals including humans.
124AI research has been defined as the field of study of intelligent agents,
125which refers to any system that perceives its environment and takes actions
126that maximize its chance of achieving its goals. The term "artificial intelligence"
127had previously been used to describe machines that mimic and display "human"
128cognitive skills that are associated with the human mind, such as "learning" and
129"problem-solving". This definition has since been rejected by major AI researchers
130who now describe AI in terms of rationality and acting rationally.
131"""
132
133chunks = chunk_text(long_text, chunk_size=150, overlap=30)
134print(f"원본 길이: {len(long_text)} chars")
135print(f"청크 수: {len(chunks)}")
136for i, chunk in enumerate(chunks[:3]):
137 print(f" 청크 {i+1}: {chunk[:50]}...")
138
139
140# ============================================
141# 4. 완전한 RAG 파이프라인
142# ============================================
143print("\n[4] 완전한 RAG 파이프라인")
144print("-" * 40)
145
146class RAGPipeline:
147 """RAG 파이프라인"""
148
149 def __init__(self, embedding_model=None):
150 self.documents = []
151 self.chunks = []
152 self.embeddings = None
153 self.embedding_model = embedding_model
154
155 def add_documents(self, documents, chunk_size=200, overlap=50):
156 """문서 추가 및 청킹"""
157 self.documents = documents
158
159 # 청킹
160 for doc in documents:
161 doc_chunks = chunk_text(doc, chunk_size, overlap)
162 self.chunks.extend(doc_chunks)
163
164 # 임베딩
165 if self.embedding_model:
166 self.embeddings = self.embedding_model.encode(self.chunks)
167 else:
168 # 가상 임베딩
169 self.embeddings = np.random.randn(len(self.chunks), 128)
170
171 print(f"문서 {len(documents)}개 → 청크 {len(self.chunks)}개")
172
173 def retrieve(self, query, top_k=3):
174 """관련 청크 검색"""
175 if self.embedding_model:
176 query_emb = self.embedding_model.encode(query)
177 else:
178 query_emb = np.random.randn(128)
179
180 # 코사인 유사도
181 similarities = np.dot(self.embeddings, query_emb) / (
182 np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_emb) + 1e-10
183 )
184
185 top_indices = np.argsort(similarities)[-top_k:][::-1]
186 return [self.chunks[i] for i in top_indices]
187
188 def generate(self, query, context):
189 """프롬프트 구성 (실제로는 LLM 호출)"""
190 prompt = f"""Answer based on the context:
191
192Context:
193{context}
194
195Question: {query}
196
197Answer:"""
198 return prompt
199
200 def query(self, question, top_k=3):
201 """RAG 쿼리"""
202 # 검색
203 relevant_chunks = self.retrieve(question, top_k)
204 context = "\n\n".join(relevant_chunks)
205
206 # 프롬프트 생성
207 prompt = self.generate(question, context)
208
209 return {
210 "question": question,
211 "context": context,
212 "prompt": prompt
213 }
214
215
216# RAG 파이프라인 테스트
217rag = RAGPipeline()
218rag.add_documents([long_text])
219
220result = rag.query("What is artificial intelligence?", top_k=2)
221print(f"\n질문: {result['question']}")
222print(f"컨텍스트 길이: {len(result['context'])} chars")
223print(f"프롬프트 미리보기:\n{result['prompt'][:200]}...")
224
225
226# ============================================
227# 5. OpenAI RAG (API 필요)
228# ============================================
229print("\n[5] OpenAI RAG 예제 (코드만)")
230print("-" * 40)
231
232openai_rag_code = '''
233from openai import OpenAI
234from sentence_transformers import SentenceTransformer
235
236class OpenAIRAG:
237 def __init__(self):
238 self.client = OpenAI()
239 self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')
240 self.documents = []
241 self.embeddings = None
242
243 def add_documents(self, documents):
244 self.documents = documents
245 self.embeddings = self.embed_model.encode(documents)
246
247 def search(self, query, top_k=3):
248 query_emb = self.embed_model.encode(query)
249 similarities = cosine_similarity([query_emb], self.embeddings)[0]
250 top_idx = np.argsort(similarities)[-top_k:][::-1]
251 return [self.documents[i] for i in top_idx]
252
253 def query(self, question, top_k=3):
254 # 검색
255 relevant = self.search(question, top_k)
256 context = "\\n\\n".join(relevant)
257
258 # LLM 호출
259 response = self.client.chat.completions.create(
260 model="gpt-3.5-turbo",
261 messages=[
262 {"role": "system", "content": "Answer based on the context."},
263 {"role": "user", "content": f"Context:\\n{context}\\n\\nQuestion: {question}"}
264 ]
265 )
266 return response.choices[0].message.content
267'''
268print(openai_rag_code)
269
270
271# ============================================
272# 정리
273# ============================================
274print("\n" + "=" * 60)
275print("RAG 정리")
276print("=" * 60)
277
278summary = """
279RAG 파이프라인:
280 1. 문서 → 청킹 → 임베딩 → 벡터 DB 저장
281 2. 쿼리 → 임베딩 → 유사 문서 검색
282 3. 쿼리 + 문서 → LLM → 답변
283
284핵심 코드:
285 # 임베딩
286 embeddings = model.encode(documents)
287
288 # 검색
289 similarities = cosine_similarity([query_emb], embeddings)
290 top_docs = documents[top_indices]
291
292 # 생성
293 prompt = f"Context: {context}\\nQuestion: {query}"
294 response = llm.generate(prompt)
295"""
296print(summary)