10_query_optimizer.py

Download
python 504 lines 15.4 KB
  1"""
  2Query Optimization and Execution Plans
  3
  4Demonstrates query optimization concepts:
  5- Query execution plans (EXPLAIN QUERY PLAN)
  6- Cost estimation for different strategies
  7- Join ordering and optimization
  8- Query equivalence transformations
  9- Index selection
 10- Statistics and cardinality estimation
 11
 12Theory:
 13- Query optimizer translates SQL to efficient execution plan
 14- Cost-based optimization: estimate cost of different plans, choose cheapest
 15- Relational algebra equivalences allow rewriting queries
 16- Join ordering affects performance (e.g., smaller result sets first)
 17- Selectivity and cardinality estimation guide optimizer decisions
 18- EXPLAIN shows the chosen execution plan
 19
 20Optimization techniques:
 21- Predicate pushdown: apply filters early
 22- Join reordering: minimize intermediate results
 23- Index selection: choose appropriate indexes
 24- Common subexpression elimination
 25"""
 26
 27import sqlite3
 28import time
 29import random
 30from typing import List, Tuple
 31
 32
 33def setup_sample_database() -> sqlite3.Connection:
 34    """Create sample database for optimization demonstrations."""
 35    conn = sqlite3.connect(':memory:')
 36    cursor = conn.cursor()
 37
 38    # Create tables
 39    cursor.execute('''
 40        CREATE TABLE customers (
 41            customer_id INTEGER PRIMARY KEY,
 42            name TEXT,
 43            city TEXT,
 44            country TEXT
 45        )
 46    ''')
 47
 48    cursor.execute('''
 49        CREATE TABLE orders (
 50            order_id INTEGER PRIMARY KEY,
 51            customer_id INTEGER,
 52            order_date TEXT,
 53            total REAL,
 54            FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
 55        )
 56    ''')
 57
 58    cursor.execute('''
 59        CREATE TABLE order_items (
 60            item_id INTEGER PRIMARY KEY,
 61            order_id INTEGER,
 62            product_id INTEGER,
 63            quantity INTEGER,
 64            price REAL,
 65            FOREIGN KEY (order_id) REFERENCES orders(order_id)
 66        )
 67    ''')
 68
 69    # Insert sample data
 70    cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney']
 71    countries = ['USA', 'UK', 'France', 'Japan', 'Australia']
 72
 73    # Customers
 74    for i in range(1, 1001):
 75        city = random.choice(cities)
 76        country = random.choice(countries)
 77        cursor.execute(
 78            "INSERT INTO customers VALUES (?, ?, ?, ?)",
 79            (i, f'Customer_{i}', city, country)
 80        )
 81
 82    # Orders
 83    order_id = 1
 84    for customer_id in range(1, 1001):
 85        num_orders = random.randint(0, 5)
 86        for _ in range(num_orders):
 87            cursor.execute(
 88                "INSERT INTO orders VALUES (?, ?, ?, ?)",
 89                (order_id, customer_id, '2024-01-01', random.uniform(10, 1000))
 90            )
 91            order_id += 1
 92
 93    # Order items
 94    cursor.execute("SELECT order_id FROM orders")
 95    order_ids = [row[0] for row in cursor.fetchall()]
 96    item_id = 1
 97    for order_id in order_ids:
 98        num_items = random.randint(1, 5)
 99        for _ in range(num_items):
100            cursor.execute(
101                "INSERT INTO order_items VALUES (?, ?, ?, ?, ?)",
102                (item_id, order_id, random.randint(1, 100), random.randint(1, 10), random.uniform(5, 500))
103            )
104            item_id += 1
105
106    conn.commit()
107    return conn
108
109
110def demonstrate_explain_query_plan():
111    """Demonstrate EXPLAIN QUERY PLAN."""
112    print("=" * 60)
113    print("QUERY EXECUTION PLANS")
114    print("=" * 60)
115    print()
116
117    conn = setup_sample_database()
118    cursor = conn.cursor()
119
120    # Simple query
121    print("1. Simple SELECT with WHERE clause")
122    print("-" * 60)
123    query = "SELECT * FROM customers WHERE city = 'London'"
124    print(f"Query: {query}")
125    print("\nExecution plan:")
126
127    cursor.execute(f"EXPLAIN QUERY PLAN {query}")
128    for row in cursor.fetchall():
129        print(f"  {row}")
130
131    print("\nInterpretation: SCAN TABLE (full table scan, no index)")
132
133    # Query with JOIN
134    print("\n\n2. JOIN query")
135    print("-" * 60)
136    query = """
137        SELECT c.name, COUNT(o.order_id) as num_orders
138        FROM customers c
139        LEFT JOIN orders o ON c.customer_id = o.customer_id
140        GROUP BY c.customer_id
141    """
142    print(f"Query: Multi-line JOIN with GROUP BY")
143    print("\nExecution plan:")
144
145    cursor.execute(f"EXPLAIN QUERY PLAN {query}")
146    for row in cursor.fetchall():
147        print(f"  {row}")
148
149    conn.close()
150    print()
151
152
153def demonstrate_join_ordering():
154    """Demonstrate impact of join ordering."""
155    print("=" * 60)
156    print("JOIN ORDERING OPTIMIZATION")
157    print("=" * 60)
158    print()
159
160    conn = setup_sample_database()
161    cursor = conn.cursor()
162
163    # Count rows in each table
164    cursor.execute("SELECT COUNT(*) FROM customers")
165    num_customers = cursor.fetchone()[0]
166    cursor.execute("SELECT COUNT(*) FROM orders")
167    num_orders = cursor.fetchone()[0]
168    cursor.execute("SELECT COUNT(*) FROM order_items")
169    num_items = cursor.fetchone()[0]
170
171    print("Table sizes:")
172    print("-" * 60)
173    print(f"  customers:    {num_customers:5} rows")
174    print(f"  orders:       {num_orders:5} rows")
175    print(f"  order_items:  {num_items:5} rows")
176
177    # Three-way join
178    print("\n\n1. Three-way JOIN (customers → orders → order_items)")
179    print("-" * 60)
180    query = """
181        SELECT c.name, SUM(oi.price * oi.quantity) as total
182        FROM customers c
183        JOIN orders o ON c.customer_id = o.customer_id
184        JOIN order_items oi ON o.order_id = oi.order_id
185        WHERE c.country = 'USA'
186        GROUP BY c.customer_id
187    """
188
189    cursor.execute(f"EXPLAIN QUERY PLAN {query}")
190    print("Execution plan:")
191    for row in cursor.fetchall():
192        print(f"  {row}")
193
194    start = time.time()
195    cursor.execute(query)
196    results = cursor.fetchall()
197    elapsed = time.time() - start
198    print(f"\nExecuted in {elapsed*1000:.2f} ms, {len(results)} results")
199
200    # Same query, different table order in FROM clause
201    print("\n\n2. Same query, reordered tables (optimizer handles this)")
202    print("-" * 60)
203    query2 = """
204        SELECT c.name, SUM(oi.price * oi.quantity) as total
205        FROM order_items oi
206        JOIN orders o ON oi.order_id = o.order_id
207        JOIN customers c ON o.customer_id = c.customer_id
208        WHERE c.country = 'USA'
209        GROUP BY c.customer_id
210    """
211
212    cursor.execute(f"EXPLAIN QUERY PLAN {query2}")
213    print("Execution plan:")
214    for row in cursor.fetchall():
215        print(f"  {row}")
216
217    print("\nNote: SQLite's optimizer reorders joins for efficiency")
218    print("      (predicate pushdown: filters applied early)")
219
220    conn.close()
221    print()
222
223
224def demonstrate_index_selection():
225    """Demonstrate how indexes affect query plans."""
226    print("=" * 60)
227    print("INDEX SELECTION")
228    print("=" * 60)
229    print()
230
231    conn = setup_sample_database()
232    cursor = conn.cursor()
233
234    # Query without index
235    print("1. Query WITHOUT index on country")
236    print("-" * 60)
237    query = "SELECT * FROM customers WHERE country = 'USA'"
238
239    cursor.execute(f"EXPLAIN QUERY PLAN {query}")
240    print("Execution plan:")
241    for row in cursor.fetchall():
242        print(f"  {row}")
243
244    start = time.time()
245    cursor.execute(query)
246    results = cursor.fetchall()
247    elapsed_no_idx = time.time() - start
248    print(f"\nExecuted in {elapsed_no_idx*1000:.2f} ms, {len(results)} results")
249
250    # Create index
251    print("\n\n2. Creating index on country")
252    print("-" * 60)
253    cursor.execute("CREATE INDEX idx_country ON customers(country)")
254    print("✓ Index created")
255
256    # Query with index
257    print("\n\n3. Query WITH index on country")
258    print("-" * 60)
259    cursor.execute(f"EXPLAIN QUERY PLAN {query}")
260    print("Execution plan:")
261    for row in cursor.fetchall():
262        print(f"  {row}")
263
264    start = time.time()
265    cursor.execute(query)
266    results = cursor.fetchall()
267    elapsed_with_idx = time.time() - start
268    print(f"\nExecuted in {elapsed_with_idx*1000:.2f} ms, {len(results)} results")
269
270    # Covering index
271    print("\n\n4. Covering index (includes all needed columns)")
272    print("-" * 60)
273    cursor.execute("CREATE INDEX idx_country_city ON customers(country, city)")
274    print("✓ Created composite index on (country, city)")
275
276    query_covering = "SELECT country, city FROM customers WHERE country = 'USA'"
277    cursor.execute(f"EXPLAIN QUERY PLAN {query_covering}")
278    print("\nExecution plan:")
279    for row in cursor.fetchall():
280        print(f"  {row}")
281
282    print("\nNote: If plan shows 'USING COVERING INDEX', all data comes from index")
283    print("      (no need to access table rows - faster)")
284
285    conn.close()
286    print()
287
288
289def demonstrate_query_transformations():
290    """Demonstrate query equivalence transformations."""
291    print("=" * 60)
292    print("QUERY EQUIVALENCE TRANSFORMATIONS")
293    print("=" * 60)
294    print()
295
296    conn = setup_sample_database()
297    cursor = conn.cursor()
298
299    # Create index for demonstration
300    cursor.execute("CREATE INDEX idx_customer ON orders(customer_id)")
301
302    print("Transformation 1: Subquery → JOIN")
303    print("-" * 60)
304
305    # Original: subquery
306    query1 = """
307        SELECT name FROM customers
308        WHERE customer_id IN (
309            SELECT customer_id FROM orders WHERE total > 500
310        )
311    """
312    print("Original (subquery):")
313    print(query1)
314    cursor.execute(f"EXPLAIN QUERY PLAN {query1}")
315    print("\nPlan:")
316    for row in cursor.fetchall():
317        print(f"  {row}")
318
319    # Transformed: JOIN
320    query2 = """
321        SELECT DISTINCT c.name
322        FROM customers c
323        JOIN orders o ON c.customer_id = o.customer_id
324        WHERE o.total > 500
325    """
326    print("\n\nTransformed (JOIN):")
327    print(query2)
328    cursor.execute(f"EXPLAIN QUERY PLAN {query2}")
329    print("\nPlan:")
330    for row in cursor.fetchall():
331        print(f"  {row}")
332
333    print("\nNote: Optimizer may automatically transform queries")
334
335    # Predicate pushdown
336    print("\n\nTransformation 2: Predicate Pushdown")
337    print("-" * 60)
338
339    query3 = """
340        SELECT * FROM (
341            SELECT c.*, o.total
342            FROM customers c
343            JOIN orders o ON c.customer_id = o.customer_id
344        ) AS subq
345        WHERE total > 500
346    """
347    print("Query with filter outside subquery:")
348    print(query3)
349
350    cursor.execute(f"EXPLAIN QUERY PLAN {query3}")
351    print("\nPlan:")
352    for row in cursor.fetchall():
353        print(f"  {row}")
354
355    print("\nNote: Optimizer pushes 'total > 500' predicate into subquery")
356    print("      (filters applied early to reduce intermediate results)")
357
358    conn.close()
359    print()
360
361
362def demonstrate_statistics():
363    """Demonstrate role of statistics in optimization."""
364    print("=" * 60)
365    print("STATISTICS AND COST ESTIMATION")
366    print("=" * 60)
367    print()
368
369    conn = setup_sample_database()
370    cursor = conn.cursor()
371
372    # Analyze tables to gather statistics
373    print("1. Gathering statistics with ANALYZE")
374    print("-" * 60)
375    cursor.execute("ANALYZE")
376    print("✓ Statistics gathered")
377
378    # Check statistics
379    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'sqlite_stat%'")
380    stat_tables = cursor.fetchall()
381    print(f"\nStatistics stored in tables: {[t[0] for t in stat_tables]}")
382
383    # View statistics for a table
384    if stat_tables:
385        cursor.execute("SELECT * FROM sqlite_stat1 WHERE tbl = 'customers' LIMIT 5")
386        print("\nSample statistics for customers table:")
387        for row in cursor.fetchall():
388            print(f"  {row}")
389
390    print("\n\n2. How optimizer uses statistics")
391    print("-" * 60)
392    cursor.execute("CREATE INDEX idx_city ON customers(city)")
393
394    # Selective query (few results)
395    selective_query = "SELECT * FROM customers WHERE city = 'Tokyo'"
396    cursor.execute(f"SELECT COUNT(*) FROM customers WHERE city = 'Tokyo'")
397    count = cursor.fetchone()[0]
398    print(f"\nSelective query (city='Tokyo'): {count} rows match")
399
400    cursor.execute(f"EXPLAIN QUERY PLAN {selective_query}")
401    print("Plan:")
402    for row in cursor.fetchall():
403        print(f"  {row}")
404
405    # Non-selective query (many results)
406    cursor.execute("SELECT COUNT(DISTINCT city) FROM customers")
407    num_cities = cursor.fetchone()[0]
408    print(f"\n\nNon-selective query: Only {num_cities} distinct cities")
409    print("If most rows match, full scan may be cheaper than index")
410
411    print("\n✓ Optimizer uses statistics to estimate:")
412    print("  - Table cardinality (number of rows)")
413    print("  - Index selectivity (how many rows match)")
414    print("  - Join cardinality (size of join result)")
415
416    conn.close()
417    print()
418
419
420def demonstrate_optimization_hints():
421    """Demonstrate when manual optimization is needed."""
422    print("=" * 60)
423    print("MANUAL OPTIMIZATION TECHNIQUES")
424    print("=" * 60)
425    print()
426
427    print("1. Rewrite correlated subquery as JOIN")
428    print("-" * 60)
429    print("Slow (correlated subquery - runs for each row):")
430    print("""
431    SELECT c.name,
432           (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.customer_id)
433    FROM customers c
434    """)
435
436    print("\nFast (JOIN with GROUP BY):")
437    print("""
438    SELECT c.name, COUNT(o.order_id)
439    FROM customers c
440    LEFT JOIN orders o ON c.customer_id = o.customer_id
441    GROUP BY c.customer_id
442    """)
443
444    print("\n\n2. Use LIMIT for pagination")
445    print("-" * 60)
446    print("Bad (fetch all, discard most):")
447    print("  SELECT * FROM large_table ORDER BY id")
448    print("\nGood (only fetch needed rows):")
449    print("  SELECT * FROM large_table ORDER BY id LIMIT 20 OFFSET 100")
450
451    print("\n\n3. Avoid SELECT *")
452    print("-" * 60)
453    print("Bad (fetches unnecessary data):")
454    print("  SELECT * FROM customers WHERE country = 'USA'")
455    print("\nGood (only needed columns):")
456    print("  SELECT customer_id, name FROM customers WHERE country = 'USA'")
457    print("  (Enables covering index optimization)")
458
459    print("\n\n4. Use EXISTS instead of COUNT for existence check")
460    print("-" * 60)
461    print("Slow (counts all matches):")
462    print("  SELECT COUNT(*) FROM orders WHERE customer_id = 123")
463    print("\nFast (stops after finding first match):")
464    print("  SELECT EXISTS(SELECT 1 FROM orders WHERE customer_id = 123)")
465
466    print()
467
468
469if __name__ == "__main__":
470    print("""
471╔══════════════════════════════════════════════════════════════╗
472║            QUERY OPTIMIZATION                                ║
473║  Execution Plans, Join Ordering, Index Selection             ║
474╚══════════════════════════════════════════════════════════════╝
475""")
476
477    demonstrate_explain_query_plan()
478    demonstrate_join_ordering()
479    demonstrate_index_selection()
480    demonstrate_query_transformations()
481    demonstrate_statistics()
482    demonstrate_optimization_hints()
483
484    print("=" * 60)
485    print("SUMMARY: QUERY OPTIMIZATION")
486    print("=" * 60)
487    print("Optimizer's job:")
488    print("  1. Parse SQL into relational algebra")
489    print("  2. Apply equivalence transformations")
490    print("  3. Estimate cost of alternative plans")
491    print("  4. Choose plan with lowest estimated cost")
492    print()
493    print("Key optimization techniques:")
494    print("  - Predicate pushdown (filter early)")
495    print("  - Join reordering (minimize intermediates)")
496    print("  - Index selection (avoid full scans)")
497    print("  - Subquery flattening (convert to JOIN)")
498    print()
499    print("Tools:")
500    print("  - EXPLAIN QUERY PLAN: see execution plan")
501    print("  - ANALYZE: gather statistics")
502    print("  - CREATE INDEX: speed up queries")
503    print("=" * 60)