1"""
211. ๋ฒกํฐ ๋ฐ์ดํฐ๋ฒ ์ด์ค ์์
3
4Chroma, FAISS๋ฅผ ์ฌ์ฉํ ๋ฒกํฐ ๊ฒ์
5"""
6
7import numpy as np
8
9print("=" * 60)
10print("๋ฒกํฐ ๋ฐ์ดํฐ๋ฒ ์ด์ค")
11print("=" * 60)
12
13
14# ============================================
15# 1. ๊ธฐ๋ณธ ๋ฒกํฐ ๊ฒ์ (NumPy)
16# ============================================
17print("\n[1] NumPy ๋ฒกํฐ ๊ฒ์")
18print("-" * 40)
19
20def cosine_similarity(query, vectors):
21 """์ฝ์ฌ์ธ ์ ์ฌ๋ ๊ณ์ฐ"""
22 query_norm = query / np.linalg.norm(query)
23 vectors_norm = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
24 return np.dot(vectors_norm, query_norm)
25
26# ์ํ ๋ฐ์ดํฐ
27documents = [
28 "Python is a programming language",
29 "Machine learning uses algorithms",
30 "Deep learning is a subset of ML",
31 "JavaScript is for web development",
32 "Data science involves statistics"
33]
34
35# ๊ฐ์ ์๋ฒ ๋ฉ
36np.random.seed(42)
37embeddings = np.random.randn(len(documents), 128)
38
39# ๊ฒ์
40query_embedding = np.random.randn(128)
41similarities = cosine_similarity(query_embedding, embeddings)
42
43# ์์ ๊ฒฐ๊ณผ
44top_k = 3
45top_indices = np.argsort(similarities)[-top_k:][::-1]
46
47print("๊ฒ์ ๊ฒฐ๊ณผ:")
48for idx in top_indices:
49 print(f" [{similarities[idx]:.4f}] {documents[idx]}")
50
51
52# ============================================
53# 2. Chroma DB
54# ============================================
55print("\n[2] Chroma DB")
56print("-" * 40)
57
58try:
59 import chromadb
60
61 # ํด๋ผ์ด์ธํธ (๋ฉ๋ชจ๋ฆฌ)
62 client = chromadb.Client()
63
64 # ์ปฌ๋ ์
์์ฑ
65 collection = client.create_collection(
66 name="demo_collection",
67 metadata={"description": "Demo collection"}
68 )
69
70 # ๋ฌธ์ ์ถ๊ฐ
71 collection.add(
72 documents=documents,
73 ids=[f"doc_{i}" for i in range(len(documents))],
74 metadatas=[{"source": "demo"} for _ in documents]
75 )
76
77 print(f"์ปฌ๋ ์
์์ฑ: {collection.name}")
78 print(f"๋ฌธ์ ์: {collection.count()}")
79
80 # ๊ฒ์
81 results = collection.query(
82 query_texts=["What is Python?"],
83 n_results=3
84 )
85
86 print("\nChroma ๊ฒ์ ๊ฒฐ๊ณผ:")
87 for doc, dist in zip(results['documents'][0], results['distances'][0]):
88 print(f" [{dist:.4f}] {doc}")
89
90 # ๋ฉํ๋ฐ์ดํฐ ํํฐ๋ง
91 filtered = collection.query(
92 query_texts=["programming"],
93 n_results=2,
94 where={"source": "demo"}
95 )
96 print(f"\nํํฐ๋ง ๊ฒฐ๊ณผ: {len(filtered['documents'][0])}๊ฐ")
97
98except ImportError:
99 print("chromadb ๋ฏธ์ค์น (pip install chromadb)")
100
101
102# ============================================
103# 3. FAISS
104# ============================================
105print("\n[3] FAISS")
106print("-" * 40)
107
108try:
109 import faiss
110
111 # ์ธ๋ฑ์ค ์์ฑ
112 dimension = 128
113 index = faiss.IndexFlatL2(dimension) # L2 ๊ฑฐ๋ฆฌ
114
115 # ๋ฒกํฐ ์ถ๊ฐ
116 vectors = np.random.randn(1000, dimension).astype('float32')
117 index.add(vectors)
118
119 print(f"์ธ๋ฑ์ค ์์ฑ: {index.ntotal} ๋ฒกํฐ")
120
121 # ๊ฒ์
122 query = np.random.randn(1, dimension).astype('float32')
123 distances, indices = index.search(query, k=5)
124
125 print(f"๊ฒ์ ๊ฒฐ๊ณผ (์์ 5๊ฐ):")
126 print(f" ์ธ๋ฑ์ค: {indices[0]}")
127 print(f" ๊ฑฐ๋ฆฌ: {distances[0]}")
128
129 # IVF ์ธ๋ฑ์ค (๋๊ท๋ชจ์ฉ)
130 nlist = 10 # ํด๋ฌ์คํฐ ์
131 quantizer = faiss.IndexFlatL2(dimension)
132 ivf_index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
133
134 # ํ์ต ๋ฐ ์ถ๊ฐ
135 ivf_index.train(vectors)
136 ivf_index.add(vectors)
137 ivf_index.nprobe = 3 # ๊ฒ์ํ ํด๋ฌ์คํฐ ์
138
139 print(f"\nIVF ์ธ๋ฑ์ค: {ivf_index.ntotal} ๋ฒกํฐ, {nlist} ํด๋ฌ์คํฐ")
140
141 # ์ ์ฅ/๋ก๋
142 faiss.write_index(index, "demo_index.faiss")
143 loaded_index = faiss.read_index("demo_index.faiss")
144 print(f"์ธ๋ฑ์ค ์ ์ฅ/๋ก๋ ์๋ฃ")
145
146 import os
147 os.remove("demo_index.faiss")
148
149except ImportError:
150 print("faiss ๋ฏธ์ค์น (pip install faiss-cpu)")
151
152
153# ============================================
154# 4. Sentence Transformers + Chroma
155# ============================================
156print("\n[4] Sentence Transformers + Chroma")
157print("-" * 40)
158
159try:
160 import chromadb
161 from chromadb.utils import embedding_functions
162
163 # Sentence Transformer ์๋ฒ ๋ฉ ํจ์
164 embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
165 model_name="all-MiniLM-L6-v2"
166 )
167
168 # ํด๋ผ์ด์ธํธ
169 client = chromadb.Client()
170
171 # ์ปฌ๋ ์
(์๋ฒ ๋ฉ ํจ์ ์ง์ )
172 collection = client.create_collection(
173 name="semantic_search",
174 embedding_function=embedding_fn
175 )
176
177 # ๋ฌธ์ ์ถ๊ฐ (์๋ฒ ๋ฉ ์๋ ์์ฑ)
178 collection.add(
179 documents=documents,
180 ids=[f"doc_{i}" for i in range(len(documents))]
181 )
182
183 # ์๋งจํฑ ๊ฒ์
184 results = collection.query(
185 query_texts=["How to learn programming?"],
186 n_results=3
187 )
188
189 print("์๋งจํฑ ๊ฒ์ ๊ฒฐ๊ณผ:")
190 for doc, dist in zip(results['documents'][0], results['distances'][0]):
191 print(f" [{dist:.4f}] {doc}")
192
193except ImportError as e:
194 print(f"ํ์ ํจํค์ง ๋ฏธ์ค์น: {e}")
195
196
197# ============================================
198# 5. LangChain + Chroma
199# ============================================
200print("\n[5] LangChain + Chroma (์ฝ๋)")
201print("-" * 40)
202
203langchain_chroma = '''
204from langchain_community.vectorstores import Chroma
205from langchain_openai import OpenAIEmbeddings
206
207# ์๋ฒ ๋ฉ
208embeddings = OpenAIEmbeddings()
209
210# ๋ฒกํฐ ์คํ ์ด ์์ฑ
211vectorstore = Chroma.from_texts(
212 texts=documents,
213 embedding=embeddings,
214 persist_directory="./chroma_db"
215)
216
217# ๊ฒ์
218docs = vectorstore.similarity_search("What is Python?", k=3)
219
220# Retriever๋ก ๋ณํ
221retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
222results = retriever.invoke("programming languages")
223
224# ๋ฉํ๋ฐ์ดํฐ์ ํจ๊ป ์์ฑ
225from langchain.schema import Document
226
227docs_with_meta = [
228 Document(page_content=text, metadata={"source": f"doc_{i}"})
229 for i, text in enumerate(texts)
230]
231
232vectorstore = Chroma.from_documents(
233 documents=docs_with_meta,
234 embedding=embeddings
235)
236'''
237print(langchain_chroma)
238
239
240# ============================================
241# 6. ์ธ๋ฑ์ค ํ์
๋น๊ต
242# ============================================
243print("\n[6] FAISS ์ธ๋ฑ์ค ํ์
๋น๊ต")
244print("-" * 40)
245
246index_comparison = """
247| ์ธ๋ฑ์ค ํ์
| ์ ํ๋ | ์๋ | ๋ฉ๋ชจ๋ฆฌ | ์ฌ์ฉ ์์ |
248|------------|--------|------|--------|----------|
249| IndexFlatL2| 100% | ๋๋ฆผ | ๋์ | ์๊ท๋ชจ (<100K) |
250| IndexIVF | 95%+ | ๋น ๋ฆ | ์ค๊ฐ | ์ค๊ท๋ชจ |
251| IndexHNSW | 98%+ | ๋งค์ฐ๋น ๋ฆ| ๋์ | ๋๊ท๋ชจ, ์ค์๊ฐ |
252| IndexPQ | 90%+ | ๋น ๋ฆ | ๋ฎ์ | ๋ฉ๋ชจ๋ฆฌ ์ ํ |
253"""
254print(index_comparison)
255
256faiss_indexes = '''
257import faiss
258
259# Flat (์ ํ)
260index = faiss.IndexFlatL2(dim)
261
262# IVF (ํด๋ฌ์คํฐ๋ง)
263quantizer = faiss.IndexFlatL2(dim)
264index = faiss.IndexIVFFlat(quantizer, dim, nlist=100)
265index.train(vectors)
266
267# HNSW (๊ทธ๋ํ ๊ธฐ๋ฐ)
268index = faiss.IndexHNSWFlat(dim, 32)
269
270# PQ (์์ถ)
271index = faiss.IndexPQ(dim, m=8, nbits=8)
272index.train(vectors)
273'''
274print(faiss_indexes)
275
276
277# ============================================
278# ์ ๋ฆฌ
279# ============================================
280print("\n" + "=" * 60)
281print("๋ฒกํฐ DB ์ ๋ฆฌ")
282print("=" * 60)
283
284summary = """
285์ ํ ๊ฐ์ด๋:
286 - ๊ฐ๋ฐ/ํ๋กํ ํ์
: Chroma
287 - ๋๊ท๋ชจ ๋ก์ปฌ: FAISS
288 - ํ๋ก๋์
๊ด๋ฆฌํ: Pinecone
289
290ํต์ฌ ์ฝ๋:
291 # Chroma
292 collection = client.create_collection("name")
293 collection.add(documents=texts, ids=ids)
294 results = collection.query(query_texts=["query"], n_results=5)
295
296 # FAISS
297 index = faiss.IndexFlatL2(dimension)
298 index.add(vectors)
299 distances, indices = index.search(query, k=5)
300
301 # LangChain
302 vectorstore = Chroma.from_texts(texts, embeddings)
303 retriever = vectorstore.as_retriever()
304"""
305print(summary)