"""
11. Vector database examples

Vector search using Chroma and FAISS.
"""

import numpy as np

print("=" * 60)
print("๋ฒกํ„ฐ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค")
print("=" * 60)
 13
 14# ============================================
 15# 1. ๊ธฐ๋ณธ ๋ฒกํ„ฐ ๊ฒ€์ƒ‰ (NumPy)
 16# ============================================
 17print("\n[1] NumPy ๋ฒกํ„ฐ ๊ฒ€์ƒ‰")
 18print("-" * 40)
 19
 20def cosine_similarity(query, vectors):
 21    """์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ"""
 22    query_norm = query / np.linalg.norm(query)
 23    vectors_norm = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
 24    return np.dot(vectors_norm, query_norm)
 25
 26# ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ
 27documents = [
 28    "Python is a programming language",
 29    "Machine learning uses algorithms",
 30    "Deep learning is a subset of ML",
 31    "JavaScript is for web development",
 32    "Data science involves statistics"
 33]
 34
 35# ๊ฐ€์ƒ ์ž„๋ฒ ๋”ฉ
 36np.random.seed(42)
 37embeddings = np.random.randn(len(documents), 128)
 38
 39# ๊ฒ€์ƒ‰
 40query_embedding = np.random.randn(128)
 41similarities = cosine_similarity(query_embedding, embeddings)
 42
 43# ์ƒ์œ„ ๊ฒฐ๊ณผ
 44top_k = 3
 45top_indices = np.argsort(similarities)[-top_k:][::-1]
 46
 47print("๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ:")
 48for idx in top_indices:
 49    print(f"  [{similarities[idx]:.4f}] {documents[idx]}")
 50
 51
 52# ============================================
 53# 2. Chroma DB
 54# ============================================
 55print("\n[2] Chroma DB")
 56print("-" * 40)
 57
 58try:
 59    import chromadb
 60
 61    # ํด๋ผ์ด์–ธํŠธ (๋ฉ”๋ชจ๋ฆฌ)
 62    client = chromadb.Client()
 63
 64    # ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ
 65    collection = client.create_collection(
 66        name="demo_collection",
 67        metadata={"description": "Demo collection"}
 68    )
 69
 70    # ๋ฌธ์„œ ์ถ”๊ฐ€
 71    collection.add(
 72        documents=documents,
 73        ids=[f"doc_{i}" for i in range(len(documents))],
 74        metadatas=[{"source": "demo"} for _ in documents]
 75    )
 76
 77    print(f"์ปฌ๋ ‰์…˜ ์ƒ์„ฑ: {collection.name}")
 78    print(f"๋ฌธ์„œ ์ˆ˜: {collection.count()}")
 79
 80    # ๊ฒ€์ƒ‰
 81    results = collection.query(
 82        query_texts=["What is Python?"],
 83        n_results=3
 84    )
 85
 86    print("\nChroma ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ:")
 87    for doc, dist in zip(results['documents'][0], results['distances'][0]):
 88        print(f"  [{dist:.4f}] {doc}")
 89
 90    # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํ•„ํ„ฐ๋ง
 91    filtered = collection.query(
 92        query_texts=["programming"],
 93        n_results=2,
 94        where={"source": "demo"}
 95    )
 96    print(f"\nํ•„ํ„ฐ๋ง ๊ฒฐ๊ณผ: {len(filtered['documents'][0])}๊ฐœ")
 97
 98except ImportError:
 99    print("chromadb ๋ฏธ์„ค์น˜ (pip install chromadb)")
100


# ============================================
# 3. FAISS
# ============================================
print("\n[3] FAISS")
print("-" * 40)

try:
    import faiss

    # Exact (brute-force) index over L2 distance
    dimension = 128
    index = faiss.IndexFlatL2(dimension)

    # FAISS requires float32 vectors
    vectors = np.random.randn(1000, dimension).astype('float32')
    index.add(vectors)

    print(f"์ธ๋ฑ์Šค ์ƒ์„ฑ: {index.ntotal} ๋ฒกํ„ฐ")

    # Search expects a 2-D batch of queries, shape (nq, dimension)
    query = np.random.randn(1, dimension).astype('float32')
    distances, indices = index.search(query, k=5)

    print("๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ (์ƒ์œ„ 5๊ฐœ):")
    print(f"  ์ธ๋ฑ์Šค: {indices[0]}")
    print(f"  ๊ฑฐ๋ฆฌ: {distances[0]}")

    # IVF index (approximate search for larger corpora)
    nlist = 10  # number of clusters
    quantizer = faiss.IndexFlatL2(dimension)
    ivf_index = faiss.IndexIVFFlat(quantizer, dimension, nlist)

    # IVF indexes must be trained (k-means) before vectors can be added
    ivf_index.train(vectors)
    ivf_index.add(vectors)
    ivf_index.nprobe = 3  # clusters probed per query (recall/speed trade-off)

    print(f"\nIVF ์ธ๋ฑ์Šค: {ivf_index.ntotal} ๋ฒกํ„ฐ, {nlist} ํด๋Ÿฌ์Šคํ„ฐ")

    # Persist an index to disk and load it back
    faiss.write_index(index, "demo_index.faiss")
    loaded_index = faiss.read_index("demo_index.faiss")
    print("์ธ๋ฑ์Šค ์ €์žฅ/๋กœ๋“œ ์™„๋ฃŒ")

    # Clean up the demo file
    import os
    os.remove("demo_index.faiss")

except ImportError:
    print("faiss ๋ฏธ์„ค์น˜ (pip install faiss-cpu)")


# ============================================
# 4. Sentence Transformers + Chroma
# ============================================
print("\n[4] Sentence Transformers + Chroma")
print("-" * 40)

try:
    import chromadb
    from chromadb.utils import embedding_functions

    # Sentence-Transformer embedding function (downloads the model on first use)
    embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

    # In-memory client
    client = chromadb.Client()

    # Collection with an explicit embedding function
    collection = client.create_collection(
        name="semantic_search",
        embedding_function=embedding_fn
    )

    # Add documents; embeddings are generated automatically
    collection.add(
        documents=documents,
        ids=[f"doc_{i}" for i in range(len(documents))]
    )

    # Semantic (meaning-based) search
    results = collection.query(
        query_texts=["How to learn programming?"],
        n_results=3
    )

    print("์‹œ๋งจํ‹ฑ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ:")
    for doc, dist in zip(results['documents'][0], results['distances'][0]):
        print(f"  [{dist:.4f}] {doc}")

except ImportError as e:
    print(f"ํ•„์š” ํŒจํ‚ค์ง€ ๋ฏธ์„ค์น˜: {e}")


# ============================================
# 5. LangChain + Chroma
# ============================================
print("\n[5] LangChain + Chroma (์ฝ”๋“œ)")
print("-" * 40)

# Illustrative snippet only — printed, not executed (requires langchain + an
# OpenAI API key). NOTE: fixed `enumerate(texts)` -> `enumerate(documents)`;
# `texts` was undefined in the snippet.
langchain_chroma = '''
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# ์ž„๋ฒ ๋”ฉ
embeddings = OpenAIEmbeddings()

# ๋ฒกํ„ฐ ์Šคํ† ์–ด ์ƒ์„ฑ
vectorstore = Chroma.from_texts(
    texts=documents,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

# ๊ฒ€์ƒ‰
docs = vectorstore.similarity_search("What is Python?", k=3)

# Retriever๋กœ ๋ณ€ํ™˜
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
results = retriever.invoke("programming languages")

# ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์™€ ํ•จ๊ป˜ ์ƒ์„ฑ
from langchain.schema import Document

docs_with_meta = [
    Document(page_content=text, metadata={"source": f"doc_{i}"})
    for i, text in enumerate(documents)
]

vectorstore = Chroma.from_documents(
    documents=docs_with_meta,
    embedding=embeddings
)
'''
print(langchain_chroma)


# ============================================
# 6. Index type comparison
# ============================================
print("\n[6] FAISS ์ธ๋ฑ์Šค ํƒ€์ž… ๋น„๊ต")
print("-" * 40)

# Reference table: accuracy / speed / memory trade-offs per index family
index_comparison = """
| ์ธ๋ฑ์Šค ํƒ€์ž… | ์ •ํ™•๋„ | ์†๋„ | ๋ฉ”๋ชจ๋ฆฌ | ์‚ฌ์šฉ ์‹œ์  |
|------------|--------|------|--------|----------|
| IndexFlatL2| 100%   | ๋А๋ฆผ | ๋†’์Œ   | ์†Œ๊ทœ๋ชจ (<100K) |
| IndexIVF   | 95%+   | ๋น ๋ฆ„ | ์ค‘๊ฐ„   | ์ค‘๊ทœ๋ชจ |
| IndexHNSW  | 98%+   | ๋งค์šฐ๋น ๋ฆ„| ๋†’์Œ | ๋Œ€๊ทœ๋ชจ, ์‹ค์‹œ๊ฐ„ |
| IndexPQ    | 90%+   | ๋น ๋ฆ„ | ๋‚ฎ์Œ   | ๋ฉ”๋ชจ๋ฆฌ ์ œํ•œ |
"""
print(index_comparison)

# Illustrative snippet only (printed, not executed)
faiss_indexes = '''
import faiss

# Flat (์ •ํ™•)
index = faiss.IndexFlatL2(dim)

# IVF (ํด๋Ÿฌ์Šคํ„ฐ๋ง)
quantizer = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist=100)
index.train(vectors)

# HNSW (๊ทธ๋ž˜ํ”„ ๊ธฐ๋ฐ˜)
index = faiss.IndexHNSWFlat(dim, 32)

# PQ (์••์ถ•)
index = faiss.IndexPQ(dim, m=8, nbits=8)
index.train(vectors)
'''
print(faiss_indexes)


# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("๋ฒกํ„ฐ DB ์ •๋ฆฌ")
print("=" * 60)

summary = """
์„ ํƒ ๊ฐ€์ด๋“œ:
    - ๊ฐœ๋ฐœ/ํ”„๋กœํ† ํƒ€์ž…: Chroma
    - ๋Œ€๊ทœ๋ชจ ๋กœ์ปฌ: FAISS
    - ํ”„๋กœ๋•์…˜ ๊ด€๋ฆฌํ˜•: Pinecone

ํ•ต์‹ฌ ์ฝ”๋“œ:
    # Chroma
    collection = client.create_collection("name")
    collection.add(documents=texts, ids=ids)
    results = collection.query(query_texts=["query"], n_results=5)

    # FAISS
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    distances, indices = index.search(query, k=5)

    # LangChain
    vectorstore = Chroma.from_texts(texts, embeddings)
    retriever = vectorstore.as_retriever()
"""
print(summary)