# 09_rag_system.py
"""
09. RAG (Retrieval-Augmented Generation) example

Combines document retrieval with LLM-based generation.
"""
  6
# NumPy powers the brute-force cosine-similarity search used throughout.
import numpy as np

print("=" * 60)
print("RAG 시스템")
print("=" * 60)


# ============================================
# 1. Simple RAG implementation (NumPy only)
# ============================================
print("\n[1] 간단한 RAG (NumPy)")
print("-" * 40)
 20class SimpleVectorStore:
 21    """간단한 벡터 저장소"""
 22    def __init__(self):
 23        self.documents = []
 24        self.embeddings = None
 25
 26    def add_documents(self, documents, embeddings):
 27        self.documents = documents
 28        self.embeddings = np.array(embeddings)
 29
 30    def search(self, query_embedding, top_k=3):
 31        """코사인 유사도로 검색"""
 32        query = np.array(query_embedding)
 33
 34        # 코사인 유사도
 35        similarities = np.dot(self.embeddings, query) / (
 36            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query)
 37        )
 38
 39        # 상위 k개
 40        top_indices = np.argsort(similarities)[-top_k:][::-1]
 41        return [(self.documents[i], similarities[i]) for i in top_indices]
 42
 43
 44# 예시 문서
 45documents = [
 46    "Python is a high-level programming language known for its readability.",
 47    "Machine learning is a subset of artificial intelligence.",
 48    "Deep learning uses neural networks with many layers.",
 49    "Natural language processing deals with text and speech.",
 50    "Computer vision enables machines to interpret images."
 51]
 52
 53# 가상 임베딩 (실제로는 모델 사용)
 54np.random.seed(42)
 55embeddings = np.random.randn(len(documents), 128)
 56
 57# 벡터 저장소
 58store = SimpleVectorStore()
 59store.add_documents(documents, embeddings)
 60
 61# 검색
 62query_embedding = np.random.randn(128)
 63results = store.search(query_embedding, top_k=2)
 64
 65print("검색 결과:")
 66for doc, score in results:
 67    print(f"  [{score:.4f}] {doc[:50]}...")
 68
 69
 70# ============================================
 71# 2. Sentence Transformers + RAG
 72# ============================================
 73print("\n[2] Sentence Transformers RAG")
 74print("-" * 40)
 75
 76try:
 77    from sentence_transformers import SentenceTransformer
 78
 79    # 임베딩 모델
 80    model = SentenceTransformer('all-MiniLM-L6-v2')
 81
 82    # 문서 임베딩
 83    doc_embeddings = model.encode(documents)
 84    print(f"문서 임베딩 shape: {doc_embeddings.shape}")
 85
 86    # 쿼리
 87    query = "What is machine learning?"
 88    query_embedding = model.encode(query)
 89
 90    # 검색
 91    store = SimpleVectorStore()
 92    store.add_documents(documents, doc_embeddings)
 93    results = store.search(query_embedding, top_k=2)
 94
 95    print(f"\n쿼리: {query}")
 96    print("검색 결과:")
 97    for doc, score in results:
 98        print(f"  [{score:.4f}] {doc}")
 99
100except ImportError:
101    print("sentence-transformers 미설치")
102
103
# ============================================
# 3. Chunking
# ============================================
print("\n[3] 텍스트 청킹")
print("-" * 40)
109
def chunk_text(text, chunk_size=100, overlap=20):
    """Split *text* into character chunks of ``chunk_size`` with ``overlap``.

    Consecutive chunks share ``overlap`` characters so content cut at a
    boundary still appears whole in at least one chunk.

    Args:
        text: String to split.
        chunk_size: Maximum characters per chunk; must be positive.
        overlap: Characters shared between consecutive chunks; must be
            smaller than ``chunk_size`` so the window always advances.

    Returns:
        List of chunk strings (empty list for empty input).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (either would loop forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # The final chunk already covers the tail. Stepping back by
            # `overlap` here would emit a redundant chunk that is entirely
            # contained in the previous one.
            break
        start = end - overlap
    return chunks
120
# Source passage for the chunking demo.
long_text = """
Artificial intelligence (AI) is intelligence demonstrated by machines,
as opposed to natural intelligence displayed by animals including humans.
AI research has been defined as the field of study of intelligent agents,
which refers to any system that perceives its environment and takes actions
that maximize its chance of achieving its goals. The term "artificial intelligence"
had previously been used to describe machines that mimic and display "human"
cognitive skills that are associated with the human mind, such as "learning" and
"problem-solving". This definition has since been rejected by major AI researchers
who now describe AI in terms of rationality and acting rationally.
"""

chunks = chunk_text(long_text, chunk_size=150, overlap=30)
print(f"원본 길이: {len(long_text)} chars")
print(f"청크 수: {len(chunks)}")
for idx, piece in enumerate(chunks[:3], start=1):
    print(f"  청크 {idx}: {piece[:50]}...")
138
139
# ============================================
# 4. Complete RAG pipeline
# ============================================
print("\n[4] 완전한 RAG 파이프라인")
print("-" * 40)
145
class RAGPipeline:
    """Minimal RAG pipeline: chunk -> embed -> retrieve -> build prompt.

    The "generation" step only builds the prompt; a real system would send
    it to an LLM.
    """

    def __init__(self, embedding_model=None):
        # embedding_model: optional encoder exposing ``encode(texts)``
        # (e.g. a SentenceTransformer). When None, random vectors stand in
        # so the pipeline can run offline — retrieval order is then arbitrary.
        self.documents = []
        self.chunks = []
        self.embeddings = None
        self.embedding_model = embedding_model

    def add_documents(self, documents, chunk_size=200, overlap=50):
        """Chunk *documents*, embed every chunk, and store them for retrieval."""
        # Accumulate rather than replace: previously, documents were replaced
        # while chunks kept accumulating, leaving the two out of sync after a
        # second call.
        self.documents.extend(documents)

        # Chunk each document with overlap.
        for doc in documents:
            self.chunks.extend(chunk_text(doc, chunk_size, overlap))

        # Re-embed the full chunk list so embeddings always align with
        # self.chunks, even across multiple calls.
        if self.embedding_model:
            self.embeddings = self.embedding_model.encode(self.chunks)
        else:
            # Placeholder embeddings for offline demos.
            self.embeddings = np.random.randn(len(self.chunks), 128)

        print(f"문서 {len(documents)}개 → 청크 {len(self.chunks)}개")

    def retrieve(self, query, top_k=3):
        """Return the *top_k* stored chunks most similar to *query*."""
        if self.embedding_model:
            query_emb = self.embedding_model.encode(query)
        else:
            # Random 128-dim query to match the placeholder embeddings.
            query_emb = np.random.randn(128)

        # Cosine similarity; the epsilon avoids division by zero for
        # zero-norm vectors.
        similarities = np.dot(self.embeddings, query_emb) / (
            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_emb) + 1e-10
        )

        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [self.chunks[i] for i in top_indices]

    def generate(self, query, context):
        """Build the answer prompt (a real system would call an LLM here)."""
        prompt = f"""Answer based on the context:

Context:
{context}

Question: {query}

Answer:"""
        return prompt

    def query(self, question, top_k=3):
        """Run the full RAG flow: retrieve context, then build the prompt.

        Returns:
            Dict with keys ``question``, ``context``, ``prompt``.
        """
        # Retrieval step.
        relevant_chunks = self.retrieve(question, top_k)
        context = "\n\n".join(relevant_chunks)

        # Prompt-construction step.
        prompt = self.generate(question, context)

        return {
            "question": question,
            "context": context,
            "prompt": prompt
        }
214
215
# Smoke-test the pipeline on the chunking-demo passage.
rag = RAGPipeline()
rag.add_documents([long_text])

result = rag.query("What is artificial intelligence?", top_k=2)
question, context, prompt = result["question"], result["context"], result["prompt"]
print(f"\n질문: {question}")
print(f"컨텍스트 길이: {len(context)} chars")
print(f"프롬프트 미리보기:\n{prompt[:200]}...")
224
225
# ============================================
# 5. OpenAI RAG (requires an API key)
# ============================================
print("\n[5] OpenAI RAG 예제 (코드만)")
print("-" * 40)

# Display-only code snippet; it is printed, never executed.
# NOTE(review): as written the snippet also needs `import numpy as np` and
# sklearn's `cosine_similarity` to actually run — confirm before reuse.
openai_rag_code = '''
from openai import OpenAI
from sentence_transformers import SentenceTransformer

class OpenAIRAG:
    def __init__(self):
        self.client = OpenAI()
        self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents = []
        self.embeddings = None

    def add_documents(self, documents):
        self.documents = documents
        self.embeddings = self.embed_model.encode(documents)

    def search(self, query, top_k=3):
        query_emb = self.embed_model.encode(query)
        similarities = cosine_similarity([query_emb], self.embeddings)[0]
        top_idx = np.argsort(similarities)[-top_k:][::-1]
        return [self.documents[i] for i in top_idx]

    def query(self, question, top_k=3):
        # 검색
        relevant = self.search(question, top_k)
        context = "\\n\\n".join(relevant)

        # LLM 호출
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Answer based on the context."},
                {"role": "user", "content": f"Context:\\n{context}\\n\\nQuestion: {question}"}
            ]
        )
        return response.choices[0].message.content
'''
print(openai_rag_code)
269
270
# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("RAG 정리")
print("=" * 60)

# Cheat-sheet text echoed to the console; the embedded code is illustrative only.
summary = """
RAG 파이프라인:
    1. 문서 → 청킹 → 임베딩 → 벡터 DB 저장
    2. 쿼리 → 임베딩 → 유사 문서 검색
    3. 쿼리 + 문서 → LLM → 답변

핵심 코드:
    # 임베딩
    embeddings = model.encode(documents)

    # 검색
    similarities = cosine_similarity([query_emb], embeddings)
    top_docs = documents[top_indices]

    # 생성
    prompt = f"Context: {context}\\nQuestion: {query}"
    response = llm.generate(prompt)
"""
print(summary)