09_nosql_document_store.py

Download
python 500 lines 14.6 KB
  1"""
  2NoSQL Document Store with SQLite JSON
  3
  4Demonstrates document-oriented database concepts using SQLite's JSON functions:
  5- Schema-less storage (flexible documents)
  6- CRUD operations on JSON documents
  7- Nested queries and array operations
  8- Indexing JSON fields
  9- Comparison with relational approach
 10
 11Theory:
 12- Document stores organize data as semi-structured documents (JSON, BSON, XML)
 13- Schema flexibility: different documents can have different structures
 14- Trade-offs: flexibility vs. data integrity
 15- SQLite supports JSON1 extension for JSON operations
 16- Useful for: hierarchical data, variable schemas, rapid development
 17
 18CAP theorem consideration:
 19- Most NoSQL systems sacrifice strong consistency for availability/partition tolerance
 20- SQLite is single-writer, so it maintains strong consistency
 21"""
 22
 23import sqlite3
 24import json
 25from typing import List, Dict, Any
 26from datetime import datetime
 27
 28
 29def demonstrate_document_storage():
 30    """Demonstrate storing and querying JSON documents."""
 31    print("=" * 60)
 32    print("DOCUMENT STORAGE (Schema-less)")
 33    print("=" * 60)
 34    print()
 35
 36    conn = sqlite3.connect(':memory:')
 37    cursor = conn.cursor()
 38
 39    # Create simple document store
 40    cursor.execute('''
 41        CREATE TABLE documents (
 42            id INTEGER PRIMARY KEY AUTOINCREMENT,
 43            collection TEXT NOT NULL,
 44            data TEXT NOT NULL,  -- JSON document
 45            created_at REAL DEFAULT (julianday('now'))
 46        )
 47    ''')
 48
 49    print("1. Inserting documents with different schemas")
 50    print("-" * 60)
 51
 52    # User documents with varying fields
 53    users = [
 54        {
 55            "name": "Alice",
 56            "email": "alice@example.com",
 57            "age": 30,
 58            "address": {
 59                "city": "New York",
 60                "country": "USA"
 61            }
 62        },
 63        {
 64            "name": "Bob",
 65            "email": "bob@example.com",
 66            "phone": "555-1234",  # Different field!
 67            "preferences": {
 68                "theme": "dark",
 69                "notifications": True
 70            }
 71        },
 72        {
 73            "name": "Charlie",
 74            "age": 25,
 75            "skills": ["Python", "SQL", "Docker"],  # Array field!
 76            "address": {
 77                "city": "London",
 78                "country": "UK",
 79                "postal_code": "SW1A 1AA"  # Extra nested field
 80            }
 81        }
 82    ]
 83
 84    for user in users:
 85        cursor.execute(
 86            "INSERT INTO documents (collection, data) VALUES (?, ?)",
 87            ("users", json.dumps(user))
 88        )
 89    print(f"✓ Inserted {len(users)} user documents with different schemas")
 90
 91    # Retrieve documents
 92    print("\n2. Retrieving all user documents")
 93    print("-" * 60)
 94    cursor.execute("SELECT id, data FROM documents WHERE collection = 'users'")
 95    for doc_id, data in cursor.fetchall():
 96        doc = json.loads(data)
 97        print(f"  Doc {doc_id}: {doc.get('name')} - {json.dumps(doc, indent=4)}")
 98
 99    print("\n✓ Schema flexibility: each document can have different fields")
100    print()
101
102
103def demonstrate_json_queries():
104    """Demonstrate JSON query operations."""
105    print("=" * 60)
106    print("JSON QUERY OPERATIONS")
107    print("=" * 60)
108    print()
109
110    conn = sqlite3.connect(':memory:')
111    cursor = conn.cursor()
112
113    cursor.execute('''
114        CREATE TABLE products (
115            id INTEGER PRIMARY KEY AUTOINCREMENT,
116            data TEXT NOT NULL
117        )
118    ''')
119
120    # Insert product documents
121    products = [
122        {
123            "name": "Laptop",
124            "price": 999.99,
125            "specs": {"cpu": "i7", "ram": 16, "storage": 512},
126            "tags": ["electronics", "computers"]
127        },
128        {
129            "name": "Mouse",
130            "price": 29.99,
131            "specs": {"type": "wireless", "dpi": 1600},
132            "tags": ["electronics", "accessories"]
133        },
134        {
135            "name": "Book",
136            "price": 19.99,
137            "author": "John Doe",
138            "tags": ["books", "fiction"]
139        }
140    ]
141
142    for product in products:
143        cursor.execute("INSERT INTO products (data) VALUES (?)", (json.dumps(product),))
144
145    # Extract specific fields
146    print("1. Extract specific JSON fields (json_extract)")
147    print("-" * 60)
148    cursor.execute("""
149        SELECT
150            json_extract(data, '$.name') as name,
151            json_extract(data, '$.price') as price
152        FROM products
153        ORDER BY price DESC
154    """)
155    for row in cursor.fetchall():
156        print(f"  {row[0]:15} ${row[1]:.2f}")
157
158    # Filter by nested field
159    print("\n2. Filter by nested field (specs.ram >= 16)")
160    print("-" * 60)
161    cursor.execute("""
162        SELECT json_extract(data, '$.name') as name
163        FROM products
164        WHERE json_extract(data, '$.specs.ram') >= 16
165    """)
166    for row in cursor.fetchall():
167        print(f"  {row[0]}")
168
169    # Array operations
170    print("\n3. Find products with 'electronics' tag (array search)")
171    print("-" * 60)
172    cursor.execute("""
173        SELECT json_extract(data, '$.name') as name
174        FROM products, json_each(json_extract(data, '$.tags'))
175        WHERE json_each.value = 'electronics'
176    """)
177    for row in cursor.fetchall():
178        print(f"  {row[0]}")
179
180    # Full-text search in JSON
181    print("\n4. Search for 'wireless' anywhere in document")
182    print("-" * 60)
183    cursor.execute("""
184        SELECT json_extract(data, '$.name') as name
185        FROM products
186        WHERE json_extract(data, '$.specs.type') = 'wireless'
187           OR data LIKE '%wireless%'
188    """)
189    for row in cursor.fetchall():
190        print(f"  {row[0]}")
191
192    conn.close()
193    print()
194
195
196def demonstrate_json_indexing():
197    """Demonstrate indexing JSON fields for performance."""
198    print("=" * 60)
199    print("INDEXING JSON FIELDS")
200    print("=" * 60)
201    print()
202
203    conn = sqlite3.connect(':memory:')
204    cursor = conn.cursor()
205
206    cursor.execute('''
207        CREATE TABLE events (
208            id INTEGER PRIMARY KEY AUTOINCREMENT,
209            data TEXT NOT NULL
210        )
211    ''')
212
213    # Insert many events
214    print("Inserting 10,000 event documents...")
215    import random
216    event_types = ["login", "logout", "purchase", "view", "click"]
217    for i in range(10000):
218        event = {
219            "type": random.choice(event_types),
220            "user_id": random.randint(1, 1000),
221            "timestamp": datetime.now().isoformat(),
222            "metadata": {
223                "ip": f"192.168.1.{random.randint(1, 255)}",
224                "session_id": f"sess_{random.randint(1000, 9999)}"
225            }
226        }
227        cursor.execute("INSERT INTO events (data) VALUES (?)", (json.dumps(event),))
228    conn.commit()
229    print("✓ Inserted 10,000 events\n")
230
231    # Query without index
232    print("1. Query without index (count by event type)")
233    print("-" * 60)
234    import time
235    start = time.time()
236    cursor.execute("""
237        SELECT
238            json_extract(data, '$.type') as event_type,
239            COUNT(*) as count
240        FROM events
241        WHERE json_extract(data, '$.type') = 'purchase'
242    """)
243    result = cursor.fetchone()
244    elapsed_no_idx = time.time() - start
245    print(f"  Found {result[1]} purchases in {elapsed_no_idx*1000:.2f} ms")
246
247    # Create index on extracted JSON field
248    print("\n2. Creating index on json_extract(data, '$.type')")
249    print("-" * 60)
250    cursor.execute("""
251        CREATE INDEX idx_event_type
252        ON events(json_extract(data, '$.type'))
253    """)
254    print("✓ Index created")
255
256    # Query with index
257    print("\n3. Query with index (same query)")
258    print("-" * 60)
259    start = time.time()
260    cursor.execute("""
261        SELECT
262            json_extract(data, '$.type') as event_type,
263            COUNT(*) as count
264        FROM events
265        WHERE json_extract(data, '$.type') = 'purchase'
266    """)
267    result = cursor.fetchone()
268    elapsed_with_idx = time.time() - start
269    print(f"  Found {result[1]} purchases in {elapsed_with_idx*1000:.2f} ms")
270
271    if elapsed_no_idx > elapsed_with_idx:
272        speedup = elapsed_no_idx / elapsed_with_idx
273        print(f"\n✓ Speedup: {speedup:.1f}x faster with index")
274
275    # Show query plan
276    print("\n4. Query plan analysis")
277    print("-" * 60)
278    cursor.execute("""
279        EXPLAIN QUERY PLAN
280        SELECT * FROM events
281        WHERE json_extract(data, '$.type') = 'purchase'
282    """)
283    for row in cursor.fetchall():
284        print(f"  {row}")
285
286    conn.close()
287    print()
288
289
290def demonstrate_relational_vs_document():
291    """Compare relational and document approaches."""
292    print("=" * 60)
293    print("RELATIONAL VS DOCUMENT APPROACH")
294    print("=" * 60)
295    print()
296
297    conn = sqlite3.connect(':memory:')
298    cursor = conn.cursor()
299
300    # Relational approach: normalized tables
301    print("RELATIONAL APPROACH (Normalized)")
302    print("-" * 60)
303    cursor.execute('''
304        CREATE TABLE blog_posts (
305            id INTEGER PRIMARY KEY,
306            title TEXT,
307            author TEXT,
308            content TEXT,
309            created_at TEXT
310        )
311    ''')
312    cursor.execute('''
313        CREATE TABLE comments (
314            id INTEGER PRIMARY KEY,
315            post_id INTEGER,
316            author TEXT,
317            text TEXT,
318            created_at TEXT,
319            FOREIGN KEY (post_id) REFERENCES blog_posts(id)
320        )
321    ''')
322
323    cursor.execute("""
324        INSERT INTO blog_posts VALUES
325        (1, 'First Post', 'Alice', 'Hello world!', '2024-01-01')
326    """)
327    cursor.execute("""
328        INSERT INTO comments VALUES
329        (1, 1, 'Bob', 'Great post!', '2024-01-02'),
330        (2, 1, 'Charlie', 'Thanks for sharing', '2024-01-03')
331    """)
332
333    print("Query: Get post with all comments (requires JOIN)")
334    cursor.execute("""
335        SELECT
336            p.title,
337            p.author as post_author,
338            c.author as comment_author,
339            c.text as comment_text
340        FROM blog_posts p
341        LEFT JOIN comments c ON p.id = c.post_id
342        WHERE p.id = 1
343    """)
344    print("Results:")
345    for row in cursor.fetchall():
346        print(f"  {row}")
347
348    # Document approach: embedded documents
349    print("\n\nDOCUMENT APPROACH (Denormalized)")
350    print("-" * 60)
351    cursor.execute('''
352        CREATE TABLE blog_docs (
353            id INTEGER PRIMARY KEY,
354            data TEXT NOT NULL
355        )
356    ''')
357
358    blog_post = {
359        "title": "First Post",
360        "author": "Alice",
361        "content": "Hello world!",
362        "created_at": "2024-01-01",
363        "comments": [
364            {"author": "Bob", "text": "Great post!", "created_at": "2024-01-02"},
365            {"author": "Charlie", "text": "Thanks for sharing", "created_at": "2024-01-03"}
366        ]
367    }
368
369    cursor.execute("INSERT INTO blog_docs VALUES (1, ?)", (json.dumps(blog_post),))
370
371    print("Query: Get post with all comments (single document, no JOIN)")
372    cursor.execute("SELECT data FROM blog_docs WHERE id = 1")
373    doc = json.loads(cursor.fetchone()[0])
374    print(f"Post: {doc['title']} by {doc['author']}")
375    print("Comments:")
376    for comment in doc['comments']:
377        print(f"  - {comment['author']}: {comment['text']}")
378
379    print("\n\nTRADE-OFFS:")
380    print("-" * 60)
381    print("Relational:")
382    print("  ✓ No data duplication")
383    print("  ✓ Easy to update (update in one place)")
384    print("  ✓ Strong schema validation")
385    print("  ✗ Requires JOINs (slower for reads)")
386    print()
387    print("Document:")
388    print("  ✓ Fast reads (no JOINs needed)")
389    print("  ✓ Schema flexibility")
390    print("  ✓ Natural hierarchical data representation")
391    print("  ✗ Data duplication possible")
392    print("  ✗ Updates may require updating multiple documents")
393    print("  ✗ Weaker data integrity")
394
395    conn.close()
396    print()
397
398
399def demonstrate_update_operations():
400    """Demonstrate updating JSON documents."""
401    print("=" * 60)
402    print("UPDATING JSON DOCUMENTS")
403    print("=" * 60)
404    print()
405
406    conn = sqlite3.connect(':memory:')
407    cursor = conn.cursor()
408
409    cursor.execute('''
410        CREATE TABLE user_profiles (
411            id INTEGER PRIMARY KEY,
412            data TEXT NOT NULL
413        )
414    ''')
415
416    user = {
417        "name": "Alice",
418        "email": "alice@example.com",
419        "settings": {
420            "theme": "light",
421            "notifications": True
422        }
423    }
424    cursor.execute("INSERT INTO user_profiles VALUES (1, ?)", (json.dumps(user),))
425
426    print("Original document:")
427    print("-" * 60)
428    cursor.execute("SELECT data FROM user_profiles WHERE id = 1")
429    print(json.dumps(json.loads(cursor.fetchone()[0]), indent=2))
430
431    # Update nested field
432    print("\n1. Update nested field (theme: light → dark)")
433    print("-" * 60)
434    cursor.execute("""
435        UPDATE user_profiles
436        SET data = json_set(data, '$.settings.theme', 'dark')
437        WHERE id = 1
438    """)
439
440    cursor.execute("SELECT data FROM user_profiles WHERE id = 1")
441    print(json.dumps(json.loads(cursor.fetchone()[0]), indent=2))
442
443    # Add new field
444    print("\n2. Add new field (age: 30)")
445    print("-" * 60)
446    cursor.execute("""
447        UPDATE user_profiles
448        SET data = json_set(data, '$.age', 30)
449        WHERE id = 1
450    """)
451
452    cursor.execute("SELECT data FROM user_profiles WHERE id = 1")
453    print(json.dumps(json.loads(cursor.fetchone()[0]), indent=2))
454
455    # Remove field
456    print("\n3. Remove field (email)")
457    print("-" * 60)
458    cursor.execute("""
459        UPDATE user_profiles
460        SET data = json_remove(data, '$.email')
461        WHERE id = 1
462    """)
463
464    cursor.execute("SELECT data FROM user_profiles WHERE id = 1")
465    print(json.dumps(json.loads(cursor.fetchone()[0]), indent=2))
466
467    conn.close()
468    print()
469
470
471if __name__ == "__main__":
472    print("""
473╔══════════════════════════════════════════════════════════════╗
474║          NoSQL DOCUMENT STORE (SQLite JSON)                  ║
475║  Schema-less Storage, JSON Queries, Indexing                 ║
476╚══════════════════════════════════════════════════════════════╝
477""")
478
479    demonstrate_document_storage()
480    demonstrate_json_queries()
481    demonstrate_json_indexing()
482    demonstrate_relational_vs_document()
483    demonstrate_update_operations()
484
485    print("=" * 60)
486    print("SUMMARY: DOCUMENT STORES")
487    print("=" * 60)
488    print("When to use:")
489    print("  ✓ Hierarchical/nested data")
490    print("  ✓ Schema flexibility needed")
491    print("  ✓ Rapid prototyping")
492    print("  ✓ Read-heavy workloads")
493    print()
494    print("When NOT to use:")
495    print("  ✗ Complex relationships (many-to-many)")
496    print("  ✗ Strong ACID guarantees required")
497    print("  ✗ Complex queries across documents")
498    print("  ✗ Frequent updates to shared data")
499    print("=" * 60)