"""
11. Vector database examples

Vector search using Chroma and FAISS.
"""

import numpy as np

print("=" * 60)
print("๋ฒกํ„ฐ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค")
print("=" * 60)
 13
 14# ============================================
 15# 1. ๊ธฐ๋ณธ ๋ฒกํ„ฐ ๊ฒ€์ƒ‰ (NumPy)
 16# ============================================
 17print("\n[1] NumPy ๋ฒกํ„ฐ ๊ฒ€์ƒ‰")
 18print("-" * 40)
 19
 20def cosine_similarity(query, vectors):
 21    """์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ"""
 22    query_norm = query / np.linalg.norm(query)
 23    vectors_norm = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
 24    return np.dot(vectors_norm, query_norm)
 25
 26# ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ
 27documents = [
 28    "Python is a programming language",
 29    "Machine learning uses algorithms",
 30    "Deep learning is a subset of ML",
 31    "JavaScript is for web development",
 32    "Data science involves statistics"
 33]
 34
 35# ๊ฐ€์ƒ ์ž„๋ฒ ๋”ฉ
 36np.random.seed(42)
 37embeddings = np.random.randn(len(documents), 128)
 38
 39# ๊ฒ€์ƒ‰
 40query_embedding = np.random.randn(128)
 41similarities = cosine_similarity(query_embedding, embeddings)
 42
 43# ์ƒ์œ„ ๊ฒฐ๊ณผ
 44top_k = 3
 45top_indices = np.argsort(similarities)[-top_k:][::-1]
 46
 47print("๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ:")
 48for idx in top_indices:
 49    print(f"  [{similarities[idx]:.4f}] {documents[idx]}")
 50
 51
 52# ============================================
 53# 2. Chroma DB
 54# ============================================
 55print("\n[2] Chroma DB")
 56print("-" * 40)
 57
 58try:
 59    import chromadb
 60
 61    # ํด๋ผ์ด์–ธํŠธ (๋ฉ”๋ชจ๋ฆฌ)
 62    client = chromadb.Client()
 63
 64    # ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ
 65    collection = client.create_collection(
 66        name="demo_collection",
 67        metadata={"description": "Demo collection"}
 68    )
 69
 70    # ๋ฌธ์„œ ์ถ”๊ฐ€
 71    collection.add(
 72        documents=documents,
 73        ids=[f"doc_{i}" for i in range(len(documents))],
 74        metadatas=[{"source": "demo"} for _ in documents]
 75    )
 76
 77    print(f"์ปฌ๋ ‰์…˜ ์ƒ์„ฑ: {collection.name}")
 78    print(f"๋ฌธ์„œ ์ˆ˜: {collection.count()}")
 79
 80    # ๊ฒ€์ƒ‰
 81    results = collection.query(
 82        query_texts=["What is Python?"],
 83        n_results=3
 84    )
 85
 86    print("\nChroma ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ:")
 87    for doc, dist in zip(results['documents'][0], results['distances'][0]):
 88        print(f"  [{dist:.4f}] {doc}")
 89
 90    # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํ•„ํ„ฐ๋ง
 91    filtered = collection.query(
 92        query_texts=["programming"],
 93        n_results=2,
 94        where={"source": "demo"}
 95    )
 96    print(f"\nํ•„ํ„ฐ๋ง ๊ฒฐ๊ณผ: {len(filtered['documents'][0])}๊ฐœ")
 97
 98except ImportError:
 99    print("chromadb ๋ฏธ์„ค์น˜ (pip install chromadb)")
100


# ============================================
# 3. FAISS
# ============================================
print("\n[3] FAISS")
print("-" * 40)

try:
    import faiss

    # Exact (brute-force) index over L2 distance
    dimension = 128
    index = faiss.IndexFlatL2(dimension)

    # FAISS requires float32 vectors
    vectors = np.random.randn(1000, dimension).astype('float32')
    index.add(vectors)

    print(f"์ธ๋ฑ์Šค ์ƒ์„ฑ: {index.ntotal} ๋ฒกํ„ฐ")

    # Search expects a 2-D batch of queries, shape (nq, dimension)
    query = np.random.randn(1, dimension).astype('float32')
    distances, indices = index.search(query, k=5)

    print("๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ (์ƒ์œ„ 5๊ฐœ):")
    print(f"  ์ธ๋ฑ์Šค: {indices[0]}")
    print(f"  ๊ฑฐ๋ฆฌ: {distances[0]}")

    # IVF index (approximate search for larger corpora)
    nlist = 10  # number of clusters
    quantizer = faiss.IndexFlatL2(dimension)
    ivf_index = faiss.IndexIVFFlat(quantizer, dimension, nlist)

    # IVF indexes must be trained (k-means) before vectors can be added
    ivf_index.train(vectors)
    ivf_index.add(vectors)
    ivf_index.nprobe = 3  # clusters probed per query (recall/speed trade-off)

    print(f"\nIVF ์ธ๋ฑ์Šค: {ivf_index.ntotal} ๋ฒกํ„ฐ, {nlist} ํด๋Ÿฌ์Šคํ„ฐ")

    # Persist an index to disk and load it back
    faiss.write_index(index, "demo_index.faiss")
    loaded_index = faiss.read_index("demo_index.faiss")
    print("์ธ๋ฑ์Šค ์ €์žฅ/๋กœ๋“œ ์™„๋ฃŒ")

    # Clean up the demo file
    import os
    os.remove("demo_index.faiss")

except ImportError:
    print("faiss ๋ฏธ์„ค์น˜ (pip install faiss-cpu)")


# ============================================
# 4. Sentence Transformers + Chroma
# ============================================
print("\n[4] Sentence Transformers + Chroma")
print("-" * 40)

try:
    import chromadb
    from chromadb.utils import embedding_functions

    # Sentence-Transformer embedding function (downloads the model on first use)
    embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

    # In-memory client
    client = chromadb.Client()

    # Collection with an explicit embedding function
    collection = client.create_collection(
        name="semantic_search",
        embedding_function=embedding_fn
    )

    # Add documents; embeddings are generated automatically
    collection.add(
        documents=documents,
        ids=[f"doc_{i}" for i in range(len(documents))]
    )

    # Semantic (meaning-based) search
    results = collection.query(
        query_texts=["How to learn programming?"],
        n_results=3
    )

    print("์‹œ๋งจํ‹ฑ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ:")
    for doc, dist in zip(results['documents'][0], results['distances'][0]):
        print(f"  [{dist:.4f}] {doc}")

except ImportError as e:
    print(f"ํ•„์š” ํŒจํ‚ค์ง€ ๋ฏธ์„ค์น˜: {e}")


# ============================================
# 5. LangChain + Chroma
# ============================================
print("\n[5] LangChain + Chroma (์ฝ”๋“œ)")
print("-" * 40)

# Illustrative snippet only — printed, not executed (requires langchain + an
# OpenAI API key). NOTE: fixed `enumerate(texts)` -> `enumerate(documents)`;
# `texts` was undefined in the snippet.
langchain_chroma = '''
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# ์ž„๋ฒ ๋”ฉ
embeddings = OpenAIEmbeddings()

# ๋ฒกํ„ฐ ์Šคํ† ์–ด ์ƒ์„ฑ
vectorstore = Chroma.from_texts(
    texts=documents,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

# ๊ฒ€์ƒ‰
docs = vectorstore.similarity_search("What is Python?", k=3)

# Retriever๋กœ ๋ณ€ํ™˜
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
results = retriever.invoke("programming languages")

# ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์™€ ํ•จ๊ป˜ ์ƒ์„ฑ
from langchain.schema import Document

docs_with_meta = [
    Document(page_content=text, metadata={"source": f"doc_{i}"})
    for i, text in enumerate(documents)
]

vectorstore = Chroma.from_documents(
    documents=docs_with_meta,
    embedding=embeddings
)
'''
print(langchain_chroma)


# ============================================
# 6. Index type comparison
# ============================================
print("\n[6] FAISS ์ธ๋ฑ์Šค ํƒ€์ž… ๋น„๊ต")
print("-" * 40)

# Reference table: accuracy / speed / memory trade-offs per index family
index_comparison = """
| ์ธ๋ฑ์Šค ํƒ€์ž… | ์ •ํ™•๋„ | ์†๋„ | ๋ฉ”๋ชจ๋ฆฌ | ์‚ฌ์šฉ ์‹œ์  |
|------------|--------|------|--------|----------|
| IndexFlatL2| 100%   | ๋А๋ฆผ | ๋†’์Œ   | ์†Œ๊ทœ๋ชจ (<100K) |
| IndexIVF   | 95%+   | ๋น ๋ฆ„ | ์ค‘๊ฐ„   | ์ค‘๊ทœ๋ชจ |
| IndexHNSW  | 98%+   | ๋งค์šฐ๋น ๋ฆ„| ๋†’์Œ | ๋Œ€๊ทœ๋ชจ, ์‹ค์‹œ๊ฐ„ |
| IndexPQ    | 90%+   | ๋น ๋ฆ„ | ๋‚ฎ์Œ   | ๋ฉ”๋ชจ๋ฆฌ ์ œํ•œ |
"""
print(index_comparison)

# Illustrative snippet only (printed, not executed)
faiss_indexes = '''
import faiss

# Flat (์ •ํ™•)
index = faiss.IndexFlatL2(dim)

# IVF (ํด๋Ÿฌ์Šคํ„ฐ๋ง)
quantizer = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist=100)
index.train(vectors)

# HNSW (๊ทธ๋ž˜ํ”„ ๊ธฐ๋ฐ˜)
index = faiss.IndexHNSWFlat(dim, 32)

# PQ (์••์ถ•)
index = faiss.IndexPQ(dim, m=8, nbits=8)
index.train(vectors)
'''
print(faiss_indexes)


# ============================================
# Summary
# ============================================
print("\n" + "=" * 60)
print("๋ฒกํ„ฐ DB ์ •๋ฆฌ")
print("=" * 60)

summary = """
์„ ํƒ ๊ฐ€์ด๋“œ:
    - ๊ฐœ๋ฐœ/ํ”„๋กœํ† ํƒ€์ž…: Chroma
    - ๋Œ€๊ทœ๋ชจ ๋กœ์ปฌ: FAISS
    - ํ”„๋กœ๋•์…˜ ๊ด€๋ฆฌํ˜•: Pinecone

ํ•ต์‹ฌ ์ฝ”๋“œ:
    # Chroma
    collection = client.create_collection("name")
    collection.add(documents=texts, ids=ids)
    results = collection.query(query_texts=["query"], n_results=5)

    # FAISS
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    distances, indices = index.search(query, k=5)

    # LangChain
    vectorstore = Chroma.from_texts(texts, embeddings)
    retriever = vectorstore.as_retriever()
"""
print(summary)