1"""
2B+Tree Indexing
3
4Demonstrates B+Tree data structure for database indexing:
5- B+Tree properties: balanced, sorted, multi-way tree
6- Insert, search, range query operations
7- Performance comparison: with vs without index
8- Leaf node chaining for sequential access
9
10Theory:
11- B+Tree is optimized for disk I/O (high branching factor)
12- All data stored in leaves; internal nodes only store keys
13- Leaves are linked for efficient range queries
14- Height is O(log_m N) where m is order, N is number of keys
15- Guarantees balanced tree (all paths to leaves have same length)
16- Insert/delete maintain balance through splits and merges
17
18Simplified implementation focuses on concepts, not production-ready code.
19"""
20
21import sqlite3
22import time
23from typing import Optional, List, Tuple, Any
24
25
26class BPlusTreeNode:
27 """Node in a B+Tree."""
28
29 def __init__(self, is_leaf: bool = False):
30 self.is_leaf = is_leaf
31 self.keys: List[int] = []
32 self.children: List[BPlusTreeNode] = [] # For internal nodes
33 self.values: List[Any] = [] # For leaf nodes
34 self.next: Optional[BPlusTreeNode] = None # Link to next leaf
35
36
37class BPlusTree:
38 """Simplified B+Tree implementation for demonstration."""
39
40 def __init__(self, order: int = 4):
41 """
42 Args:
43 order: Maximum number of children per node (minimum is ceil(order/2))
44 """
45 self.order = order
46 self.root = BPlusTreeNode(is_leaf=True)
47
48 def search(self, key: int) -> Optional[Any]:
49 """Search for a key in the tree."""
50 node = self.root
51
52 # Traverse to leaf
53 while not node.is_leaf:
54 # Find appropriate child
55 i = 0
56 while i < len(node.keys) and key >= node.keys[i]:
57 i += 1
58 node = node.children[i]
59
60 # Search in leaf
61 if key in node.keys:
62 idx = node.keys.index(key)
63 return node.values[idx]
64 return None
65
66 def range_query(self, start_key: int, end_key: int) -> List[Tuple[int, Any]]:
67 """Return all key-value pairs in range [start_key, end_key]."""
68 results = []
69 node = self.root
70
71 # Find leaf containing start_key
72 while not node.is_leaf:
73 i = 0
74 while i < len(node.keys) and start_key >= node.keys[i]:
75 i += 1
76 node = node.children[i]
77
78 # Traverse leaves using next pointers
79 while node is not None:
80 for i, key in enumerate(node.keys):
81 if start_key <= key <= end_key:
82 results.append((key, node.values[i]))
83 elif key > end_key:
84 return results
85 node = node.next
86
87 return results
88
89 def insert(self, key: int, value: Any):
90 """Insert a key-value pair."""
91 root = self.root
92
93 # If root is full, split it
94 if len(root.keys) >= self.order:
95 new_root = BPlusTreeNode(is_leaf=False)
96 new_root.children.append(self.root)
97 self._split_child(new_root, 0)
98 self.root = new_root
99
100 self._insert_non_full(self.root, key, value)
101
102 def _insert_non_full(self, node: BPlusTreeNode, key: int, value: Any):
103 """Insert into a node that is not full."""
104 if node.is_leaf:
105 # Insert into sorted position
106 i = 0
107 while i < len(node.keys) and node.keys[i] < key:
108 i += 1
109 node.keys.insert(i, key)
110 node.values.insert(i, value)
111 else:
112 # Find child to insert into
113 i = 0
114 while i < len(node.keys) and key >= node.keys[i]:
115 i += 1
116
117 # Split child if full
118 if len(node.children[i].keys) >= self.order:
119 self._split_child(node, i)
120 if key >= node.keys[i]:
121 i += 1
122
123 self._insert_non_full(node.children[i], key, value)
124
125 def _split_child(self, parent: BPlusTreeNode, index: int):
126 """Split a full child node."""
127 full_child = parent.children[index]
128 mid = len(full_child.keys) // 2
129
130 # Create new sibling
131 new_child = BPlusTreeNode(is_leaf=full_child.is_leaf)
132
133 if full_child.is_leaf:
134 # Copy upper half to new node
135 new_child.keys = full_child.keys[mid:]
136 new_child.values = full_child.values[mid:]
137 full_child.keys = full_child.keys[:mid]
138 full_child.values = full_child.values[:mid]
139
140 # Link leaves
141 new_child.next = full_child.next
142 full_child.next = new_child
143
144 # Promote copy of first key in new node
145 parent.keys.insert(index, new_child.keys[0])
146 else:
147 # Internal node split
148 new_child.keys = full_child.keys[mid+1:]
149 new_child.children = full_child.children[mid+1:]
150 promote_key = full_child.keys[mid]
151 full_child.keys = full_child.keys[:mid]
152 full_child.children = full_child.children[:mid+1]
153
154 parent.keys.insert(index, promote_key)
155
156 parent.children.insert(index + 1, new_child)
157
158 def print_tree(self, node: Optional[BPlusTreeNode] = None, level: int = 0):
159 """Print tree structure for visualization."""
160 if node is None:
161 node = self.root
162
163 indent = " " * level
164 if node.is_leaf:
165 print(f"{indent}LEAF: {node.keys}")
166 else:
167 print(f"{indent}INTERNAL: {node.keys}")
168 for child in node.children:
169 self.print_tree(child, level + 1)
170
171
172def demonstrate_btree_operations():
173 """Demonstrate B+Tree operations."""
174 print("=" * 60)
175 print("B+TREE OPERATIONS")
176 print("=" * 60)
177 print()
178
179 tree = BPlusTree(order=4) # Max 4 keys per node
180
181 # Insert keys
182 print("Inserting keys: 10, 20, 5, 6, 12, 30, 7, 17")
183 print("-" * 60)
184 for key in [10, 20, 5, 6, 12, 30, 7, 17]:
185 tree.insert(key, f"value_{key}")
186 print(f"Inserted {key}")
187
188 print("\nTree structure:")
189 print("-" * 60)
190 tree.print_tree()
191
192 # Search
193 print("\n\nSearch operations:")
194 print("-" * 60)
195 for key in [12, 15, 30]:
196 result = tree.search(key)
197 print(f"Search({key}): {result if result else 'Not found'}")
198
199 # Range query
200 print("\n\nRange query [6, 17]:")
201 print("-" * 60)
202 results = tree.range_query(6, 17)
203 for key, value in results:
204 print(f" {key}: {value}")
205 print()
206
207
208def demonstrate_index_performance():
209 """Compare query performance with and without index."""
210 print("=" * 60)
211 print("INDEX PERFORMANCE COMPARISON")
212 print("=" * 60)
213 print()
214
215 conn = sqlite3.connect(':memory:')
216 cursor = conn.cursor()
217
218 # Create table with many rows
219 print("Creating table with 100,000 rows...")
220 cursor.execute('''
221 CREATE TABLE Products (
222 product_id INTEGER PRIMARY KEY,
223 name TEXT,
224 category TEXT,
225 price REAL
226 )
227 ''')
228
229 # Insert data
230 import random
231 categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Toys']
232 data = [
233 (i, f'Product_{i}', random.choice(categories), random.uniform(10, 1000))
234 for i in range(100000)
235 ]
236 cursor.executemany("INSERT INTO Products VALUES (?, ?, ?, ?)", data)
237 conn.commit()
238
239 print("✓ Inserted 100,000 products\n")
240
241 # Query without index
242 print("1. Query WITHOUT index on category:")
243 print("-" * 60)
244 start = time.time()
245 cursor.execute("SELECT COUNT(*) FROM Products WHERE category = 'Electronics'")
246 count = cursor.fetchone()[0]
247 elapsed = time.time() - start
248 print(f"Found {count} electronics in {elapsed*1000:.2f} ms")
249
250 # Show query plan
251 cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Products WHERE category = 'Electronics'")
252 print("\nQuery plan:")
253 for row in cursor.fetchall():
254 print(f" {row}")
255
256 # Create index
257 print("\n\n2. Creating index on category...")
258 print("-" * 60)
259 start = time.time()
260 cursor.execute("CREATE INDEX idx_category ON Products(category)")
261 elapsed = time.time() - start
262 print(f"✓ Index created in {elapsed*1000:.2f} ms\n")
263
264 # Query with index
265 print("3. Query WITH index on category:")
266 print("-" * 60)
267 start = time.time()
268 cursor.execute("SELECT COUNT(*) FROM Products WHERE category = 'Electronics'")
269 count = cursor.fetchone()[0]
270 elapsed = time.time() - start
271 print(f"Found {count} electronics in {elapsed*1000:.2f} ms")
272
273 # Show query plan
274 cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Products WHERE category = 'Electronics'")
275 print("\nQuery plan (now uses index):")
276 for row in cursor.fetchall():
277 print(f" {row}")
278
279 # Range query
280 print("\n\n4. Range query (price between 100 and 200):")
281 print("-" * 60)
282
283 # Without index
284 start = time.time()
285 cursor.execute("SELECT COUNT(*) FROM Products WHERE price BETWEEN 100 AND 200")
286 count = cursor.fetchone()[0]
287 elapsed_no_idx = time.time() - start
288 print(f"Without index: {count} products in {elapsed_no_idx*1000:.2f} ms")
289
290 # With index
291 cursor.execute("CREATE INDEX idx_price ON Products(price)")
292 start = time.time()
293 cursor.execute("SELECT COUNT(*) FROM Products WHERE price BETWEEN 100 AND 200")
294 count = cursor.fetchone()[0]
295 elapsed_with_idx = time.time() - start
296 print(f"With index: {count} products in {elapsed_with_idx*1000:.2f} ms")
297
298 if elapsed_no_idx > elapsed_with_idx:
299 speedup = elapsed_no_idx / elapsed_with_idx
300 print(f"\n✓ Speedup: {speedup:.1f}x faster with index")
301
302 conn.close()
303 print()
304
305
306def demonstrate_composite_index():
307 """Demonstrate composite (multi-column) index."""
308 print("=" * 60)
309 print("COMPOSITE INDEX")
310 print("=" * 60)
311 print()
312
313 conn = sqlite3.connect(':memory:')
314 cursor = conn.cursor()
315
316 cursor.execute('''
317 CREATE TABLE Orders (
318 order_id INTEGER PRIMARY KEY,
319 customer_id INTEGER,
320 order_date TEXT,
321 total REAL
322 )
323 ''')
324
325 # Insert sample data
326 import random
327 from datetime import date, timedelta
328 start_date = date(2023, 1, 1)
329 data = [
330 (i, random.randint(1, 100), (start_date + timedelta(days=random.randint(0, 365))).isoformat(), random.uniform(10, 500))
331 for i in range(10000)
332 ]
333 cursor.executemany("INSERT INTO Orders VALUES (?, ?, ?, ?)", data)
334
335 print("Inserted 10,000 orders\n")
336
337 # Query on customer_id and order_date
338 print("Query: SELECT * FROM Orders WHERE customer_id = 50 AND order_date >= '2023-06-01'")
339 print("-" * 60)
340
341 # Without index
342 cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Orders WHERE customer_id = 50 AND order_date >= '2023-06-01'")
343 print("Without index:")
344 for row in cursor.fetchall():
345 print(f" {row}")
346
347 # Create composite index
348 print("\nCreating composite index on (customer_id, order_date)...")
349 cursor.execute("CREATE INDEX idx_cust_date ON Orders(customer_id, order_date)")
350
351 cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM Orders WHERE customer_id = 50 AND order_date >= '2023-06-01'")
352 print("\nWith composite index:")
353 for row in cursor.fetchall():
354 print(f" {row}")
355
356 print("\nNote: Composite index can be used for:")
357 print(" ✓ WHERE customer_id = X")
358 print(" ✓ WHERE customer_id = X AND order_date = Y")
359 print(" ✗ WHERE order_date = Y (only) - cannot use index efficiently")
360 print(" (Leftmost prefix rule)")
361
362 conn.close()
363 print()
364
365
366if __name__ == "__main__":
367 print("""
368╔══════════════════════════════════════════════════════════════╗
369║ B+TREE INDEXING ║
370║ Data Structure, Operations, Performance ║
371╚══════════════════════════════════════════════════════════════╝
372""")
373
374 demonstrate_btree_operations()
375 demonstrate_index_performance()
376 demonstrate_composite_index()
377
378 print("=" * 60)
379 print("SUMMARY")
380 print("=" * 60)
381 print("B+Tree properties:")
382 print(" - Balanced: all leaves at same depth")
383 print(" - Sorted: supports range queries efficiently")
384 print(" - High branching factor: minimizes disk I/O")
385 print(" - Leaves linked: sequential access")
386 print()
387 print("When to use indexes:")
388 print(" ✓ Columns in WHERE clauses")
389 print(" ✓ Columns in JOIN conditions")
390 print(" ✓ Columns in ORDER BY")
391 print(" ✗ Small tables (overhead not worth it)")
392 print(" ✗ Frequently updated columns (index maintenance cost)")
393 print("=" * 60)