06_indexing_btree.py

Download
python 394 lines 12.7 KB
  1"""
  2B+Tree Indexing
  3
  4Demonstrates B+Tree data structure for database indexing:
  5- B+Tree properties: balanced, sorted, multi-way tree
  6- Insert, search, range query operations
  7- Performance comparison: with vs without index
  8- Leaf node chaining for sequential access
  9
 10Theory:
 11- B+Tree is optimized for disk I/O (high branching factor)
 12- All data stored in leaves; internal nodes only store keys
 13- Leaves are linked for efficient range queries
 14- Height is O(log_m N) where m is order, N is number of keys
 15- Guarantees balanced tree (all paths to leaves have same length)
 16- Insert/delete maintain balance through splits and merges
 17
 18Simplified implementation focuses on concepts, not production-ready code.
 19"""
 20
 21import sqlite3
 22import time
 23from typing import Optional, List, Tuple, Any
 24
 25
 26class BPlusTreeNode:
 27    """Node in a B+Tree."""
 28
 29    def __init__(self, is_leaf: bool = False):
 30        self.is_leaf = is_leaf
 31        self.keys: List[int] = []
 32        self.children: List[BPlusTreeNode] = []  # For internal nodes
 33        self.values: List[Any] = []  # For leaf nodes
 34        self.next: Optional[BPlusTreeNode] = None  # Link to next leaf
 35
 36
 37class BPlusTree:
 38    """Simplified B+Tree implementation for demonstration."""
 39
 40    def __init__(self, order: int = 4):
 41        """
 42        Args:
 43            order: Maximum number of children per node (minimum is ceil(order/2))
 44        """
 45        self.order = order
 46        self.root = BPlusTreeNode(is_leaf=True)
 47
 48    def search(self, key: int) -> Optional[Any]:
 49        """Search for a key in the tree."""
 50        node = self.root
 51
 52        # Traverse to leaf
 53        while not node.is_leaf:
 54            # Find appropriate child
 55            i = 0
 56            while i < len(node.keys) and key >= node.keys[i]:
 57                i += 1
 58            node = node.children[i]
 59
 60        # Search in leaf
 61        if key in node.keys:
 62            idx = node.keys.index(key)
 63            return node.values[idx]
 64        return None
 65
 66    def range_query(self, start_key: int, end_key: int) -> List[Tuple[int, Any]]:
 67        """Return all key-value pairs in range [start_key, end_key]."""
 68        results = []
 69        node = self.root
 70
 71        # Find leaf containing start_key
 72        while not node.is_leaf:
 73            i = 0
 74            while i < len(node.keys) and start_key >= node.keys[i]:
 75                i += 1
 76            node = node.children[i]
 77
 78        # Traverse leaves using next pointers
 79        while node is not None:
 80            for i, key in enumerate(node.keys):
 81                if start_key <= key <= end_key:
 82                    results.append((key, node.values[i]))
 83                elif key > end_key:
 84                    return results
 85            node = node.next
 86
 87        return results
 88
 89    def insert(self, key: int, value: Any):
 90        """Insert a key-value pair."""
 91        root = self.root
 92
 93        # If root is full, split it
 94        if len(root.keys) >= self.order:
 95            new_root = BPlusTreeNode(is_leaf=False)
 96            new_root.children.append(self.root)
 97            self._split_child(new_root, 0)
 98            self.root = new_root
 99
100        self._insert_non_full(self.root, key, value)
101
102    def _insert_non_full(self, node: BPlusTreeNode, key: int, value: Any):
103        """Insert into a node that is not full."""
104        if node.is_leaf:
105            # Insert into sorted position
106            i = 0
107            while i < len(node.keys) and node.keys[i] < key:
108                i += 1
109            node.keys.insert(i, key)
110            node.values.insert(i, value)
111        else:
112            # Find child to insert into
113            i = 0
114            while i < len(node.keys) and key >= node.keys[i]:
115                i += 1
116
117            # Split child if full
118            if len(node.children[i].keys) >= self.order:
119                self._split_child(node, i)
120                if key >= node.keys[i]:
121                    i += 1
122
123            self._insert_non_full(node.children[i], key, value)
124
125    def _split_child(self, parent: BPlusTreeNode, index: int):
126        """Split a full child node."""
127        full_child = parent.children[index]
128        mid = len(full_child.keys) // 2
129
130        # Create new sibling
131        new_child = BPlusTreeNode(is_leaf=full_child.is_leaf)
132
133        if full_child.is_leaf:
134            # Copy upper half to new node
135            new_child.keys = full_child.keys[mid:]
136            new_child.values = full_child.values[mid:]
137            full_child.keys = full_child.keys[:mid]
138            full_child.values = full_child.values[:mid]
139
140            # Link leaves
141            new_child.next = full_child.next
142            full_child.next = new_child
143
144            # Promote copy of first key in new node
145            parent.keys.insert(index, new_child.keys[0])
146        else:
147            # Internal node split
148            new_child.keys = full_child.keys[mid+1:]
149            new_child.children = full_child.children[mid+1:]
150            promote_key = full_child.keys[mid]
151            full_child.keys = full_child.keys[:mid]
152            full_child.children = full_child.children[:mid+1]
153
154            parent.keys.insert(index, promote_key)
155
156        parent.children.insert(index + 1, new_child)
157
158    def print_tree(self, node: Optional[BPlusTreeNode] = None, level: int = 0):
159        """Print tree structure for visualization."""
160        if node is None:
161            node = self.root
162
163        indent = "  " * level
164        if node.is_leaf:
165            print(f"{indent}LEAF: {node.keys}")
166        else:
167            print(f"{indent}INTERNAL: {node.keys}")
168            for child in node.children:
169                self.print_tree(child, level + 1)
170
171
172def demonstrate_btree_operations():
173    """Demonstrate B+Tree operations."""
174    print("=" * 60)
175    print("B+TREE OPERATIONS")
176    print("=" * 60)
177    print()
178
179    tree = BPlusTree(order=4)  # Max 4 keys per node
180
181    # Insert keys
182    print("Inserting keys: 10, 20, 5, 6, 12, 30, 7, 17")
183    print("-" * 60)
184    for key in [10, 20, 5, 6, 12, 30, 7, 17]:
185        tree.insert(key, f"value_{key}")
186        print(f"Inserted {key}")
187
188    print("\nTree structure:")
189    print("-" * 60)
190    tree.print_tree()
191
192    # Search
193    print("\n\nSearch operations:")
194    print("-" * 60)
195    for key in [12, 15, 30]:
196        result = tree.search(key)
197        print(f"Search({key}): {result if result else 'Not found'}")
198
199    # Range query
200    print("\n\nRange query [6, 17]:")
201    print("-" * 60)
202    results = tree.range_query(6, 17)
203    for key, value in results:
204        print(f"  {key}: {value}")
205    print()
206
207
208def demonstrate_index_performance():
209    """Compare query performance with and without index."""
210    print("=" * 60)
211    print("INDEX PERFORMANCE COMPARISON")
212    print("=" * 60)
213    print()
214
215    conn = sqlite3.connect(':memory:')
216    cursor = conn.cursor()
217
218    # Create table with many rows
219    print("Creating table with 100,000 rows...")
220    cursor.execute('''
221        CREATE TABLE Products (
222            product_id INTEGER PRIMARY KEY,
223            name TEXT,
224            category TEXT,
225            price REAL
226        )
227    ''')
228
229    # Insert data
230    import random
231    categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Toys']
232    data = [
233        (i, f'Product_{i}', random.choice(categories), random.uniform(10, 1000))
234        for i in range(100000)
235    ]
236    cursor.executemany("INSERT INTO Products VALUES (?, ?, ?, ?)", data)
237    conn.commit()
238
239    print("✓ Inserted 100,000 products\n")
240
241    # Query without index
242    print("1. Query WITHOUT index on category:")
243    print("-" * 60)
244    start = time.time()
245    cursor.execute("SELECT COUNT(*) FROM Products WHERE category = 'Electronics'")
246    count = cursor.fetchone()[0]
247    elapsed = time.time() - start
248    print(f"Found {count} electronics in {elapsed*1000:.2f} ms")
249
250    # Show query plan
251    cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Products WHERE category = 'Electronics'")
252    print("\nQuery plan:")
253    for row in cursor.fetchall():
254        print(f"  {row}")
255
256    # Create index
257    print("\n\n2. Creating index on category...")
258    print("-" * 60)
259    start = time.time()
260    cursor.execute("CREATE INDEX idx_category ON Products(category)")
261    elapsed = time.time() - start
262    print(f"✓ Index created in {elapsed*1000:.2f} ms\n")
263
264    # Query with index
265    print("3. Query WITH index on category:")
266    print("-" * 60)
267    start = time.time()
268    cursor.execute("SELECT COUNT(*) FROM Products WHERE category = 'Electronics'")
269    count = cursor.fetchone()[0]
270    elapsed = time.time() - start
271    print(f"Found {count} electronics in {elapsed*1000:.2f} ms")
272
273    # Show query plan
274    cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Products WHERE category = 'Electronics'")
275    print("\nQuery plan (now uses index):")
276    for row in cursor.fetchall():
277        print(f"  {row}")
278
279    # Range query
280    print("\n\n4. Range query (price between 100 and 200):")
281    print("-" * 60)
282
283    # Without index
284    start = time.time()
285    cursor.execute("SELECT COUNT(*) FROM Products WHERE price BETWEEN 100 AND 200")
286    count = cursor.fetchone()[0]
287    elapsed_no_idx = time.time() - start
288    print(f"Without index: {count} products in {elapsed_no_idx*1000:.2f} ms")
289
290    # With index
291    cursor.execute("CREATE INDEX idx_price ON Products(price)")
292    start = time.time()
293    cursor.execute("SELECT COUNT(*) FROM Products WHERE price BETWEEN 100 AND 200")
294    count = cursor.fetchone()[0]
295    elapsed_with_idx = time.time() - start
296    print(f"With index:    {count} products in {elapsed_with_idx*1000:.2f} ms")
297
298    if elapsed_no_idx > elapsed_with_idx:
299        speedup = elapsed_no_idx / elapsed_with_idx
300        print(f"\n✓ Speedup: {speedup:.1f}x faster with index")
301
302    conn.close()
303    print()
304
305
306def demonstrate_composite_index():
307    """Demonstrate composite (multi-column) index."""
308    print("=" * 60)
309    print("COMPOSITE INDEX")
310    print("=" * 60)
311    print()
312
313    conn = sqlite3.connect(':memory:')
314    cursor = conn.cursor()
315
316    cursor.execute('''
317        CREATE TABLE Orders (
318            order_id INTEGER PRIMARY KEY,
319            customer_id INTEGER,
320            order_date TEXT,
321            total REAL
322        )
323    ''')
324
325    # Insert sample data
326    import random
327    from datetime import date, timedelta
328    start_date = date(2023, 1, 1)
329    data = [
330        (i, random.randint(1, 100), (start_date + timedelta(days=random.randint(0, 365))).isoformat(), random.uniform(10, 500))
331        for i in range(10000)
332    ]
333    cursor.executemany("INSERT INTO Orders VALUES (?, ?, ?, ?)", data)
334
335    print("Inserted 10,000 orders\n")
336
337    # Query on customer_id and order_date
338    print("Query: SELECT * FROM Orders WHERE customer_id = 50 AND order_date >= '2023-06-01'")
339    print("-" * 60)
340
341    # Without index
342    cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Orders WHERE customer_id = 50 AND order_date >= '2023-06-01'")
343    print("Without index:")
344    for row in cursor.fetchall():
345        print(f"  {row}")
346
347    # Create composite index
348    print("\nCreating composite index on (customer_id, order_date)...")
349    cursor.execute("CREATE INDEX idx_cust_date ON Orders(customer_id, order_date)")
350
351    cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Orders WHERE customer_id = 50 AND order_date >= '2023-06-01'")
352    print("\nWith composite index:")
353    for row in cursor.fetchall():
354        print(f"  {row}")
355
356    print("\nNote: Composite index can be used for:")
357    print("  ✓ WHERE customer_id = X")
358    print("  ✓ WHERE customer_id = X AND order_date = Y")
359    print("  ✗ WHERE order_date = Y (only) - cannot use index efficiently")
360    print("    (Leftmost prefix rule)")
361
362    conn.close()
363    print()
364
365
366if __name__ == "__main__":
367    print("""
368╔══════════════════════════════════════════════════════════════╗
369║                B+TREE INDEXING                               ║
370║  Data Structure, Operations, Performance                     ║
371╚══════════════════════════════════════════════════════════════╝
372""")
373
374    demonstrate_btree_operations()
375    demonstrate_index_performance()
376    demonstrate_composite_index()
377
378    print("=" * 60)
379    print("SUMMARY")
380    print("=" * 60)
381    print("B+Tree properties:")
382    print("  - Balanced: all leaves at same depth")
383    print("  - Sorted: supports range queries efficiently")
384    print("  - High branching factor: minimizes disk I/O")
385    print("  - Leaves linked: sequential access")
386    print()
387    print("When to use indexes:")
388    print("  ✓ Columns in WHERE clauses")
389    print("  ✓ Columns in JOIN conditions")
390    print("  ✓ Columns in ORDER BY")
391    print("  ✗ Small tables (overhead not worth it)")
392    print("  ✗ Frequently updated columns (index maintenance cost)")
393    print("=" * 60)