1"""
2Query Optimization and Execution Plans
3
4Demonstrates query optimization concepts:
5- Query execution plans (EXPLAIN QUERY PLAN)
6- Cost estimation for different strategies
7- Join ordering and optimization
8- Query equivalence transformations
9- Index selection
10- Statistics and cardinality estimation
11
12Theory:
13- Query optimizer translates SQL to efficient execution plan
14- Cost-based optimization: estimate cost of different plans, choose cheapest
15- Relational algebra equivalences allow rewriting queries
16- Join ordering affects performance (e.g., smaller result sets first)
17- Selectivity and cardinality estimation guide optimizer decisions
18- EXPLAIN shows the chosen execution plan
19
20Optimization techniques:
21- Predicate pushdown: apply filters early
22- Join reordering: minimize intermediate results
23- Index selection: choose appropriate indexes
24- Common subexpression elimination
25"""
26
27import sqlite3
28import time
29import random
30from typing import List, Tuple
31
32
33def setup_sample_database() -> sqlite3.Connection:
34 """Create sample database for optimization demonstrations."""
35 conn = sqlite3.connect(':memory:')
36 cursor = conn.cursor()
37
38 # Create tables
39 cursor.execute('''
40 CREATE TABLE customers (
41 customer_id INTEGER PRIMARY KEY,
42 name TEXT,
43 city TEXT,
44 country TEXT
45 )
46 ''')
47
48 cursor.execute('''
49 CREATE TABLE orders (
50 order_id INTEGER PRIMARY KEY,
51 customer_id INTEGER,
52 order_date TEXT,
53 total REAL,
54 FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
55 )
56 ''')
57
58 cursor.execute('''
59 CREATE TABLE order_items (
60 item_id INTEGER PRIMARY KEY,
61 order_id INTEGER,
62 product_id INTEGER,
63 quantity INTEGER,
64 price REAL,
65 FOREIGN KEY (order_id) REFERENCES orders(order_id)
66 )
67 ''')
68
69 # Insert sample data
70 cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney']
71 countries = ['USA', 'UK', 'France', 'Japan', 'Australia']
72
73 # Customers
74 for i in range(1, 1001):
75 city = random.choice(cities)
76 country = random.choice(countries)
77 cursor.execute(
78 "INSERT INTO customers VALUES (?, ?, ?, ?)",
79 (i, f'Customer_{i}', city, country)
80 )
81
82 # Orders
83 order_id = 1
84 for customer_id in range(1, 1001):
85 num_orders = random.randint(0, 5)
86 for _ in range(num_orders):
87 cursor.execute(
88 "INSERT INTO orders VALUES (?, ?, ?, ?)",
89 (order_id, customer_id, '2024-01-01', random.uniform(10, 1000))
90 )
91 order_id += 1
92
93 # Order items
94 cursor.execute("SELECT order_id FROM orders")
95 order_ids = [row[0] for row in cursor.fetchall()]
96 item_id = 1
97 for order_id in order_ids:
98 num_items = random.randint(1, 5)
99 for _ in range(num_items):
100 cursor.execute(
101 "INSERT INTO order_items VALUES (?, ?, ?, ?, ?)",
102 (item_id, order_id, random.randint(1, 100), random.randint(1, 10), random.uniform(5, 500))
103 )
104 item_id += 1
105
106 conn.commit()
107 return conn
108
109
110def demonstrate_explain_query_plan():
111 """Demonstrate EXPLAIN QUERY PLAN."""
112 print("=" * 60)
113 print("QUERY EXECUTION PLANS")
114 print("=" * 60)
115 print()
116
117 conn = setup_sample_database()
118 cursor = conn.cursor()
119
120 # Simple query
121 print("1. Simple SELECT with WHERE clause")
122 print("-" * 60)
123 query = "SELECT * FROM customers WHERE city = 'London'"
124 print(f"Query: {query}")
125 print("\nExecution plan:")
126
127 cursor.execute(f"EXPLAIN QUERY PLAN {query}")
128 for row in cursor.fetchall():
129 print(f" {row}")
130
131 print("\nInterpretation: SCAN TABLE (full table scan, no index)")
132
133 # Query with JOIN
134 print("\n\n2. JOIN query")
135 print("-" * 60)
136 query = """
137 SELECT c.name, COUNT(o.order_id) as num_orders
138 FROM customers c
139 LEFT JOIN orders o ON c.customer_id = o.customer_id
140 GROUP BY c.customer_id
141 """
142 print(f"Query: Multi-line JOIN with GROUP BY")
143 print("\nExecution plan:")
144
145 cursor.execute(f"EXPLAIN QUERY PLAN {query}")
146 for row in cursor.fetchall():
147 print(f" {row}")
148
149 conn.close()
150 print()
151
152
153def demonstrate_join_ordering():
154 """Demonstrate impact of join ordering."""
155 print("=" * 60)
156 print("JOIN ORDERING OPTIMIZATION")
157 print("=" * 60)
158 print()
159
160 conn = setup_sample_database()
161 cursor = conn.cursor()
162
163 # Count rows in each table
164 cursor.execute("SELECT COUNT(*) FROM customers")
165 num_customers = cursor.fetchone()[0]
166 cursor.execute("SELECT COUNT(*) FROM orders")
167 num_orders = cursor.fetchone()[0]
168 cursor.execute("SELECT COUNT(*) FROM order_items")
169 num_items = cursor.fetchone()[0]
170
171 print("Table sizes:")
172 print("-" * 60)
173 print(f" customers: {num_customers:5} rows")
174 print(f" orders: {num_orders:5} rows")
175 print(f" order_items: {num_items:5} rows")
176
177 # Three-way join
178 print("\n\n1. Three-way JOIN (customers → orders → order_items)")
179 print("-" * 60)
180 query = """
181 SELECT c.name, SUM(oi.price * oi.quantity) as total
182 FROM customers c
183 JOIN orders o ON c.customer_id = o.customer_id
184 JOIN order_items oi ON o.order_id = oi.order_id
185 WHERE c.country = 'USA'
186 GROUP BY c.customer_id
187 """
188
189 cursor.execute(f"EXPLAIN QUERY PLAN {query}")
190 print("Execution plan:")
191 for row in cursor.fetchall():
192 print(f" {row}")
193
194 start = time.time()
195 cursor.execute(query)
196 results = cursor.fetchall()
197 elapsed = time.time() - start
198 print(f"\nExecuted in {elapsed*1000:.2f} ms, {len(results)} results")
199
200 # Same query, different table order in FROM clause
201 print("\n\n2. Same query, reordered tables (optimizer handles this)")
202 print("-" * 60)
203 query2 = """
204 SELECT c.name, SUM(oi.price * oi.quantity) as total
205 FROM order_items oi
206 JOIN orders o ON oi.order_id = o.order_id
207 JOIN customers c ON o.customer_id = c.customer_id
208 WHERE c.country = 'USA'
209 GROUP BY c.customer_id
210 """
211
212 cursor.execute(f"EXPLAIN QUERY PLAN {query2}")
213 print("Execution plan:")
214 for row in cursor.fetchall():
215 print(f" {row}")
216
217 print("\nNote: SQLite's optimizer reorders joins for efficiency")
218 print(" (predicate pushdown: filters applied early)")
219
220 conn.close()
221 print()
222
223
224def demonstrate_index_selection():
225 """Demonstrate how indexes affect query plans."""
226 print("=" * 60)
227 print("INDEX SELECTION")
228 print("=" * 60)
229 print()
230
231 conn = setup_sample_database()
232 cursor = conn.cursor()
233
234 # Query without index
235 print("1. Query WITHOUT index on country")
236 print("-" * 60)
237 query = "SELECT * FROM customers WHERE country = 'USA'"
238
239 cursor.execute(f"EXPLAIN QUERY PLAN {query}")
240 print("Execution plan:")
241 for row in cursor.fetchall():
242 print(f" {row}")
243
244 start = time.time()
245 cursor.execute(query)
246 results = cursor.fetchall()
247 elapsed_no_idx = time.time() - start
248 print(f"\nExecuted in {elapsed_no_idx*1000:.2f} ms, {len(results)} results")
249
250 # Create index
251 print("\n\n2. Creating index on country")
252 print("-" * 60)
253 cursor.execute("CREATE INDEX idx_country ON customers(country)")
254 print("✓ Index created")
255
256 # Query with index
257 print("\n\n3. Query WITH index on country")
258 print("-" * 60)
259 cursor.execute(f"EXPLAIN QUERY PLAN {query}")
260 print("Execution plan:")
261 for row in cursor.fetchall():
262 print(f" {row}")
263
264 start = time.time()
265 cursor.execute(query)
266 results = cursor.fetchall()
267 elapsed_with_idx = time.time() - start
268 print(f"\nExecuted in {elapsed_with_idx*1000:.2f} ms, {len(results)} results")
269
270 # Covering index
271 print("\n\n4. Covering index (includes all needed columns)")
272 print("-" * 60)
273 cursor.execute("CREATE INDEX idx_country_city ON customers(country, city)")
274 print("✓ Created composite index on (country, city)")
275
276 query_covering = "SELECT country, city FROM customers WHERE country = 'USA'"
277 cursor.execute(f"EXPLAIN QUERY PLAN {query_covering}")
278 print("\nExecution plan:")
279 for row in cursor.fetchall():
280 print(f" {row}")
281
282 print("\nNote: If plan shows 'USING COVERING INDEX', all data comes from index")
283 print(" (no need to access table rows - faster)")
284
285 conn.close()
286 print()
287
288
289def demonstrate_query_transformations():
290 """Demonstrate query equivalence transformations."""
291 print("=" * 60)
292 print("QUERY EQUIVALENCE TRANSFORMATIONS")
293 print("=" * 60)
294 print()
295
296 conn = setup_sample_database()
297 cursor = conn.cursor()
298
299 # Create index for demonstration
300 cursor.execute("CREATE INDEX idx_customer ON orders(customer_id)")
301
302 print("Transformation 1: Subquery → JOIN")
303 print("-" * 60)
304
305 # Original: subquery
306 query1 = """
307 SELECT name FROM customers
308 WHERE customer_id IN (
309 SELECT customer_id FROM orders WHERE total > 500
310 )
311 """
312 print("Original (subquery):")
313 print(query1)
314 cursor.execute(f"EXPLAIN QUERY PLAN {query1}")
315 print("\nPlan:")
316 for row in cursor.fetchall():
317 print(f" {row}")
318
319 # Transformed: JOIN
320 query2 = """
321 SELECT DISTINCT c.name
322 FROM customers c
323 JOIN orders o ON c.customer_id = o.customer_id
324 WHERE o.total > 500
325 """
326 print("\n\nTransformed (JOIN):")
327 print(query2)
328 cursor.execute(f"EXPLAIN QUERY PLAN {query2}")
329 print("\nPlan:")
330 for row in cursor.fetchall():
331 print(f" {row}")
332
333 print("\nNote: Optimizer may automatically transform queries")
334
335 # Predicate pushdown
336 print("\n\nTransformation 2: Predicate Pushdown")
337 print("-" * 60)
338
339 query3 = """
340 SELECT * FROM (
341 SELECT c.*, o.total
342 FROM customers c
343 JOIN orders o ON c.customer_id = o.customer_id
344 ) AS subq
345 WHERE total > 500
346 """
347 print("Query with filter outside subquery:")
348 print(query3)
349
350 cursor.execute(f"EXPLAIN QUERY PLAN {query3}")
351 print("\nPlan:")
352 for row in cursor.fetchall():
353 print(f" {row}")
354
355 print("\nNote: Optimizer pushes 'total > 500' predicate into subquery")
356 print(" (filters applied early to reduce intermediate results)")
357
358 conn.close()
359 print()
360
361
362def demonstrate_statistics():
363 """Demonstrate role of statistics in optimization."""
364 print("=" * 60)
365 print("STATISTICS AND COST ESTIMATION")
366 print("=" * 60)
367 print()
368
369 conn = setup_sample_database()
370 cursor = conn.cursor()
371
372 # Analyze tables to gather statistics
373 print("1. Gathering statistics with ANALYZE")
374 print("-" * 60)
375 cursor.execute("ANALYZE")
376 print("✓ Statistics gathered")
377
378 # Check statistics
379 cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'sqlite_stat%'")
380 stat_tables = cursor.fetchall()
381 print(f"\nStatistics stored in tables: {[t[0] for t in stat_tables]}")
382
383 # View statistics for a table
384 if stat_tables:
385 cursor.execute("SELECT * FROM sqlite_stat1 WHERE tbl = 'customers' LIMIT 5")
386 print("\nSample statistics for customers table:")
387 for row in cursor.fetchall():
388 print(f" {row}")
389
390 print("\n\n2. How optimizer uses statistics")
391 print("-" * 60)
392 cursor.execute("CREATE INDEX idx_city ON customers(city)")
393
394 # Selective query (few results)
395 selective_query = "SELECT * FROM customers WHERE city = 'Tokyo'"
396 cursor.execute(f"SELECT COUNT(*) FROM customers WHERE city = 'Tokyo'")
397 count = cursor.fetchone()[0]
398 print(f"\nSelective query (city='Tokyo'): {count} rows match")
399
400 cursor.execute(f"EXPLAIN QUERY PLAN {selective_query}")
401 print("Plan:")
402 for row in cursor.fetchall():
403 print(f" {row}")
404
405 # Non-selective query (many results)
406 cursor.execute("SELECT COUNT(DISTINCT city) FROM customers")
407 num_cities = cursor.fetchone()[0]
408 print(f"\n\nNon-selective query: Only {num_cities} distinct cities")
409 print("If most rows match, full scan may be cheaper than index")
410
411 print("\n✓ Optimizer uses statistics to estimate:")
412 print(" - Table cardinality (number of rows)")
413 print(" - Index selectivity (how many rows match)")
414 print(" - Join cardinality (size of join result)")
415
416 conn.close()
417 print()
418
419
420def demonstrate_optimization_hints():
421 """Demonstrate when manual optimization is needed."""
422 print("=" * 60)
423 print("MANUAL OPTIMIZATION TECHNIQUES")
424 print("=" * 60)
425 print()
426
427 print("1. Rewrite correlated subquery as JOIN")
428 print("-" * 60)
429 print("Slow (correlated subquery - runs for each row):")
430 print("""
431 SELECT c.name,
432 (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.customer_id)
433 FROM customers c
434 """)
435
436 print("\nFast (JOIN with GROUP BY):")
437 print("""
438 SELECT c.name, COUNT(o.order_id)
439 FROM customers c
440 LEFT JOIN orders o ON c.customer_id = o.customer_id
441 GROUP BY c.customer_id
442 """)
443
444 print("\n\n2. Use LIMIT for pagination")
445 print("-" * 60)
446 print("Bad (fetch all, discard most):")
447 print(" SELECT * FROM large_table ORDER BY id")
448 print("\nGood (only fetch needed rows):")
449 print(" SELECT * FROM large_table ORDER BY id LIMIT 20 OFFSET 100")
450
451 print("\n\n3. Avoid SELECT *")
452 print("-" * 60)
453 print("Bad (fetches unnecessary data):")
454 print(" SELECT * FROM customers WHERE country = 'USA'")
455 print("\nGood (only needed columns):")
456 print(" SELECT customer_id, name FROM customers WHERE country = 'USA'")
457 print(" (Enables covering index optimization)")
458
459 print("\n\n4. Use EXISTS instead of COUNT for existence check")
460 print("-" * 60)
461 print("Slow (counts all matches):")
462 print(" SELECT COUNT(*) FROM orders WHERE customer_id = 123")
463 print("\nFast (stops after finding first match):")
464 print(" SELECT EXISTS(SELECT 1 FROM orders WHERE customer_id = 123)")
465
466 print()
467
468
469if __name__ == "__main__":
470 print("""
471╔══════════════════════════════════════════════════════════════╗
472║ QUERY OPTIMIZATION ║
473║ Execution Plans, Join Ordering, Index Selection ║
474╚══════════════════════════════════════════════════════════════╝
475""")
476
477 demonstrate_explain_query_plan()
478 demonstrate_join_ordering()
479 demonstrate_index_selection()
480 demonstrate_query_transformations()
481 demonstrate_statistics()
482 demonstrate_optimization_hints()
483
484 print("=" * 60)
485 print("SUMMARY: QUERY OPTIMIZATION")
486 print("=" * 60)
487 print("Optimizer's job:")
488 print(" 1. Parse SQL into relational algebra")
489 print(" 2. Apply equivalence transformations")
490 print(" 3. Estimate cost of alternative plans")
491 print(" 4. Choose plan with lowest estimated cost")
492 print()
493 print("Key optimization techniques:")
494 print(" - Predicate pushdown (filter early)")
495 print(" - Join reordering (minimize intermediates)")
496 print(" - Index selection (avoid full scans)")
497 print(" - Subquery flattening (convert to JOIN)")
498 print()
499 print("Tools:")
500 print(" - EXPLAIN QUERY PLAN: see execution plan")
501 print(" - ANALYZE: gather statistics")
502 print(" - CREATE INDEX: speed up queries")
503 print("=" * 60)