11. Vector Databases

11. Vector Databases

Learning Objectives

  • Vector database concepts
  • Using Chroma, FAISS, Pinecone
  • Indexing and search optimization
  • Practical usage patterns

1. Vector Database Overview

Why Vector DB?

Traditional DB:
    SELECT * FROM docs WHERE text LIKE '%machine learning%'
    → Keyword matching only

Vector DB:
    query_vector = embed("What is AI?")
    SELECT * FROM docs ORDER BY similarity(vector, query_vector)
    → Semantic similarity search

Major Vector DBs

| Name | Type | Features |
|------|------|----------|
| Chroma | Local/Embedded | Simple, for development |
| FAISS | Library | Fast, large-scale |
| Pinecone | Cloud | Managed, scalable |
| Weaviate | Open source | Hybrid search |
| Qdrant | Open source | Strong filtering |
| Milvus | Open source | Large-scale, distributed |

2. Chroma

Installation and Basic Usage

pip install chromadb
import chromadb
from chromadb.utils import embedding_functions

# Create client
client = chromadb.Client()  # In-memory
# client = chromadb.PersistentClient(path="./chroma_db")  # Persistent

# Embedding function used by the collection for documents and query texts
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create the collection, or reopen it if it already exists.
# create_collection raises when the name is taken (e.g. when this cell is
# re-run or a persistent client is used); get_or_create_collection does not.
collection = client.get_or_create_collection(
    name="my_collection",
    embedding_function=embedding_fn
)

Adding Documents

# Add documents (embedded by the collection's embedding function)
collection.add(
    documents=["Document 1 text", "Document 2 text", "Document 3 text"],
    metadatas=[{"source": "a"}, {"source": "b"}, {"source": "a"}],
    ids=["doc1", "doc2", "doc3"]
)

# Provide embeddings directly.
# All vectors in one collection must share the same dimensionality, so these
# 3-dimensional toy vectors go into their own collection instead of being
# mixed with the model-generated embeddings added above.
raw_collection = client.get_or_create_collection(name="raw_vectors")
raw_collection.add(
    embeddings=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
    documents=["Doc 1", "Doc 2"],
    ids=["id1", "id2"]
)
# Query search: pass raw text and ask for the 3 closest documents
results = collection.query(
    query_texts=["What is machine learning?"],
    n_results=3
)

print(results['documents'])  # Document content
print(results['distances'])  # Distances
print(results['metadatas'])  # Metadata

# Metadata filtering: "source" is the metadata key set when the documents were added
results = collection.query(
    query_texts=["query"],
    n_results=5,
    where={"source": "a"}  # Only source "a"
)

# Complex filters: both conditions must hold ($and)
# NOTE: the sample documents added earlier only carry a "source" key, so a
# "year" field has to be present in the metadata for this filter to match.
results = collection.query(
    query_texts=["query"],
    where={"$and": [{"source": "a"}, {"year": {"$gt": 2020}}]}
)

LangChain Integration

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment — confirm before running.
embeddings = OpenAIEmbeddings()

# Create a store from raw texts, persisted under ./chroma_db
# NOTE(review): newer LangChain releases appear to ship the Chroma integration
# in a separate package — confirm against the installed version.
vectorstore = Chroma.from_texts(
    texts=["text1", "text2", "text3"],
    embedding=embeddings,
    persist_directory="./chroma_db"
)

# Search: request the 3 most similar documents (k=3)
docs = vectorstore.similarity_search("query", k=3)

# Use as Retriever
retriever = vectorstore.as_retriever()

3. FAISS

Installation and Basic Usage

pip install faiss-cpu  # CPU version
# pip install faiss-gpu  # GPU version
import faiss
import numpy as np

# Create index
dimension = 384  # Length of every vector stored in the index
index = faiss.IndexFlatL2(dimension)  # L2 distance

# Add vectors: 1000 random float32 vectors stand in for real embeddings
vectors = np.random.random((1000, dimension)).astype('float32')
index.add(vectors)

print(f"Total vectors: {index.ntotal}")

Search

# Search: one random float32 query of shape (1, dimension), 5 nearest neighbours
query = np.random.random((1, dimension)).astype('float32')
distances, indices = index.search(query, k=5)

# Both results are printed as returned by index.search
print(f"Indices: {indices}")
print(f"Distances: {distances}")

Index Types

# Flat (accurate, slow)
index = faiss.IndexFlatL2(dimension)

# IVF (approximate, fast)
# Constructor arguments are passed positionally: FAISS's SWIG-generated Python
# bindings do not accept keyword arguments such as nlist=100.
nlist = 100  # Number of clusters
quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
index.train(vectors)  # Training required
index.add(vectors)
index.nprobe = 10  # Number of clusters to search

# HNSW (very fast)
index = faiss.IndexHNSWFlat(dimension, 32)  # 32 = M parameter
index.add(vectors)

# PQ (memory efficient)
index = faiss.IndexPQ(dimension, 8, 8)  # M=8, nbits=8
index.train(vectors)  # Training required before add, as for IVF
index.add(vectors)

Save/Load

# Save the index to a file
faiss.write_index(index, "index.faiss")

# Load it back from the same path
index = faiss.read_index("index.faiss")

LangChain Integration

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Create
vectorstore = FAISS.from_texts(
    texts=["text1", "text2"],
    embedding=embeddings
)

# Save/Load
vectorstore.save_local("faiss_index")
# load_local unpickles the saved docstore, so current LangChain releases refuse
# to load unless deserialization is explicitly allowed. Pickle can execute
# arbitrary code: only enable this for index folders you created yourself.
vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)

4. Pinecone

Installation and Setup

pip install pinecone  # official package; previously published as pinecone-client
from pinecone import Pinecone, ServerlessSpec

# Create client ("your-api-key" is a placeholder; do not commit a real key)
pc = Pinecone(api_key="your-api-key")

# Create index
# dimension=384 is the same vector length used in the FAISS examples;
# metric="cosine" selects the similarity measure for this index.
# NOTE(review): presumably this fails if "my-index" already exists — confirm
# and guard with an existence check when re-running.
pc.create_index(
    name="my-index",
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

# Connect to index
index = pc.Index("my-index")

Adding Documents

# Upsert (add/update)
# Each record carries an id, the vector values and optional metadata.
# The `...` entries are placeholders for the remaining components: a real
# vector needs as many values as the index dimension (384 above).
index.upsert(
    vectors=[
        {"id": "vec1", "values": [0.1, 0.2, ...], "metadata": {"source": "a"}},
        {"id": "vec2", "values": [0.3, 0.4, ...], "metadata": {"source": "b"}},
    ]
)

# Batch upsert
from itertools import islice

def chunks(iterable, batch_size=100):
    """Yield consecutive lists of at most ``batch_size`` items from ``iterable``.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)

    def take_batch():
        # Pull the next slice off the shared iterator
        return list(islice(source, batch_size))

    # Two-argument iter() keeps calling take_batch until it returns []
    yield from iter(take_batch, [])

# Send the records 100 at a time, one upsert call per batch.
# NOTE(review): `vectors` is not defined in this snippet; presumably it is an
# iterable of upsert records like the dicts shown above — confirm.
for batch in chunks(vectors, batch_size=100):
    index.upsert(vectors=batch)

Search

# Query: `...` is a placeholder for the rest of the query vector
results = index.query(
    vector=[0.1, 0.2, ...],
    top_k=5,
    include_metadata=True
)

# Each match exposes its id, score and (because include_metadata=True) metadata
for match in results['matches']:
    print(f"ID: {match['id']}, Score: {match['score']}")
    print(f"Metadata: {match['metadata']}")

# Metadata filtering: only vectors whose "source" metadata equals "a"
results = index.query(
    vector=[0.1, 0.2, ...],
    top_k=5,
    filter={"source": {"$eq": "a"}}
)

LangChain Integration

from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store
embeddings = OpenAIEmbeddings()

# Build the store from raw texts in the index named "my-index"
# NOTE(review): the index dimension (384 in the setup snippet) presumably has
# to match the embedding model's output size — confirm for OpenAIEmbeddings.
vectorstore = PineconeVectorStore.from_texts(
    texts=["text1", "text2"],
    embedding=embeddings,
    index_name="my-index"
)

# Search: request the 3 most similar documents (k=3)
docs = vectorstore.similarity_search("query", k=3)

5. Indexing Strategies

Index Type Comparison

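| Type | Accuracy | Speed | Memory | When to Use |
|------|----------|-------|--------|-------------|
| Flat | 100% | Slow | High | Small-scale (<100K) |
| IVF | 95%+ | Fast | Medium | Medium-scale |
| HNSW | 98%+ | Very fast | High | Large-scale, real-time |
| PQ | 90%+ | Fast | Low | Memory limited |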

Hybrid Index

# IVF + PQ
# Arguments are passed positionally: FAISS's SWIG-generated Python bindings
# do not accept keyword arguments (nlist=..., m=..., nbits=...).
nlist = 100  # Number of clusters
m = 8        # PQ segments
nbits = 8    # PQ bits
quantizer = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFPQ(quantizer, dimension, nlist, m, nbits)
index.train(vectors)  # Training required before add
index.add(vectors)

6. Using Metadata

Filtering Patterns

# Chroma filter syntax
# All three conditions below are combined with $and:
# an exact match, a numeric comparison and a membership test.
results = collection.query(
    query_texts=["query"],
    where={
        "$and": [
            {"category": "tech"},
            {"year": {"$gte": 2023}},
            {"author": {"$in": ["Alice", "Bob"]}}
        ]
    }
)

# Supported operators
# $eq, $ne: equal, not equal
# $gt, $gte, $lt, $lte: comparison
# $in, $nin: in, not in
# $and, $or: logical operations

Metadata Updates

# Chroma: ids and metadatas are parallel lists (one metadata dict per id)
collection.update(
    ids=["doc1"],
    metadatas=[{"source": "updated"}]
)

# Pinecone: one vector per call, addressed by its id
index.update(
    id="vec1",
    set_metadata={"source": "updated"}
)

7. Practical Patterns

Document Management Class

class VectorStore:
    """Small convenience wrapper around one persistent Chroma collection.

    Opens the ``documents`` collection stored under ``persist_dir`` and offers
    add / search / delete helpers on top of it.
    """

    def __init__(self, persist_dir="./db"):
        """Open the on-disk database at ``persist_dir`` and its collection."""
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction()
        # Reuse the collection when it already exists instead of failing
        self.collection = self.client.get_or_create_collection(
            name="documents",
            embedding_function=self.embedding_fn
        )

    def add_documents(self, texts, metadatas=None, ids=None):
        """Add ``texts`` to the collection and return the ids that were used.

        Args:
            texts: Document strings to store.
            metadatas: Optional metadata dicts, one per text.
            ids: Optional ids, one per text. Random UUID4 strings are
                generated when omitted.

        Returns:
            The list of ids passed to the collection.
        """
        # Imported here because the snippet never imports uuid at module level;
        # without it the id generation below raises NameError.
        import uuid

        if ids is None:
            ids = [str(uuid.uuid4()) for _ in texts]
        self.collection.add(
            documents=texts,
            metadatas=metadatas,
            ids=ids
        )
        return ids

    def search(self, query, k=5, where=None):
        """Return the raw query result for the ``k`` nearest documents.

        Args:
            query: Query text.
            k: Number of results to request.
            where: Optional metadata filter passed through unchanged.
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=k,
            where=where
        )
        return results

    def delete(self, ids):
        """Delete the documents with the given ``ids`` from the collection."""
        self.collection.delete(ids=ids)

Incremental Updates

import hashlib

def get_doc_id(text):
    """Return the hex MD5 digest of ``text`` as a stable, content-derived id."""
    hasher = hashlib.md5()
    hasher.update(text.encode())
    return hasher.hexdigest()

def upsert_documents(texts, collection):
    """Upsert with deduplication

    Adds only the texts whose content-hash id is not stored yet. Repeated
    texts inside the same call are added once.

    Args:
        texts: Iterable of document strings.
        collection: Collection object providing ``get(ids=...)`` (returning a
            mapping with an ``'ids'`` entry) and ``add(documents=..., ids=...)``.

    Returns:
        Number of documents actually added.
    """
    # Materialise once: a generator would otherwise be exhausted by the id
    # computation and the loop below would see no texts at all.
    texts = list(texts)
    if not texts:
        # Nothing to look up; avoid querying the collection with no ids
        return 0

    ids = [get_doc_id(t) for t in texts]

    # Check existing documents
    existing = collection.get(ids=ids)
    seen_ids = set(existing['ids'])

    # Add only new documents
    new_texts = []
    new_ids = []
    for text, doc_id in zip(texts, ids):
        if doc_id in seen_ids:
            continue
        # Remember ids from this batch too, so duplicate texts in the input
        # do not produce duplicate ids in the add() call.
        seen_ids.add(doc_id)
        new_texts.append(text)
        new_ids.append(doc_id)

    if new_texts:
        collection.add(documents=new_texts, ids=new_ids)

    return len(new_texts)

Batch Processing

def batch_add(collection, texts, batch_size=100):
    """Add large number of documents in batches

    Args:
        collection: Collection object providing ``add(documents=..., ids=...)``.
        texts: Sequence of document strings (must support ``len`` and slicing).
        batch_size: Maximum number of documents per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Imported here because the snippet never imports uuid at module level;
    # without it the id generation below raises NameError.
    import uuid

    if batch_size < 1:
        # A negative step would make range() empty and silently add nothing
        raise ValueError("batch_size must be a positive integer")

    total = len(texts)
    for start in range(0, total, batch_size):
        batch = texts[start:start + batch_size]
        ids = [str(uuid.uuid4()) for _ in batch]  # fresh random id per document
        collection.add(documents=batch, ids=ids)
        print(f"Added {min(start + batch_size, total)}/{total}")

8. Performance Optimization

Embedding Caching

import pickle
import os

class CachedEmbeddings:
    """Disk-backed cache in front of an embedding model.

    Each embedding is pickled to ``<cache_dir>/<md5 of text>.pkl`` and reused
    on later calls with the same text.

    NOTE: the cache key is derived from the text only, so use a separate
    ``cache_dir`` for each model.
    SECURITY: ``pickle.load`` can execute arbitrary code from the file it
    reads; keep ``cache_dir`` in a location only trusted code can write to.
    """

    def __init__(self, model, cache_dir="./embed_cache"):
        """Store ``model`` (must provide ``encode(text)``) and create ``cache_dir``."""
        self.model = model
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def embed(self, text):
        """Return the embedding for ``text``, computing it only on a cache miss.

        Args:
            text: String to embed.

        Returns:
            Whatever ``model.encode(text)`` returned, possibly loaded from disk.
        """
        import hashlib  # local import keeps this snippet runnable on its own

        cache_key = hashlib.md5(text.encode()).hexdigest()
        cache_path = os.path.join(self.cache_dir, f"{cache_key}.pkl")

        try:
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            pass  # cache miss: compute below
        except (EOFError, pickle.UnpicklingError):
            pass  # truncated or corrupt entry: recompute and overwrite it

        embedding = self.model.encode(text)

        # Write to a temporary file and rename it into place, so an interrupted
        # write never leaves a half-written cache entry behind.
        tmp_path = f"{cache_path}.{os.getpid()}.tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(embedding, f)
        os.replace(tmp_path, cache_path)

        return embedding

Index Optimization

# FAISS search parameter tuning
# nprobe is the IVF setting introduced earlier (number of clusters to search)
index.nprobe = 20  # Search more clusters (accuracy ↑, speed ↓)

# Parallel search
faiss.omp_set_num_threads(4)  # Set number of threads

Summary

Selection Guide

| Situation | Recommendation |
|-----------|----------------|
| Development/Prototype | Chroma |
| Large-scale local | FAISS |
| Production managed | Pinecone |
| Open source self-hosted | Qdrant, Milvus |

Key Code

# Chroma: create a collection, add documents with ids, query by text
collection = client.create_collection("name")
collection.add(documents=texts, ids=ids)
results = collection.query(query_texts=["query"], n_results=5)

# FAISS: exact L2 index, add vectors, fetch the 5 nearest neighbours
index = faiss.IndexFlatL2(dimension)
index.add(vectors)
distances, indices = index.search(query, k=5)

# LangChain: build a store from texts, then similarity search (k=3)
vectorstore = Chroma.from_texts(texts, embeddings)
docs = vectorstore.similarity_search("query", k=3)

Next Steps

Build a conversational AI system in 12_Practical_Chatbot.md.

to navigate between lessons