Subqueries and CTE

Subqueries and CTE

1. What is a Subquery?

A subquery is another query contained within a query.

SELECT * FROM users
WHERE id IN (SELECT user_id FROM orders);  -- Subquery
          ↑
    Query in parentheses

2. WHERE Clause Subqueries

Scalar Subquery (Single Value)

-- Products more expensive than average
SELECT * FROM products
WHERE price > (SELECT AVG(price) FROM products);

-- Orders on the latest order date
SELECT * FROM orders
WHERE order_date = (SELECT MAX(order_date) FROM orders);

Multi-Row Subquery

-- Users who have ordered
SELECT * FROM users
WHERE id IN (SELECT DISTINCT user_id FROM orders);

-- Users who purchased electronics
SELECT * FROM users
WHERE id IN (
    SELECT o.user_id FROM orders o
    JOIN order_items oi ON o.id = oi.order_id
    JOIN products p ON oi.product_id = p.id
    WHERE p.category = 'Electronics'
);

NOT IN

-- Users who have never ordered
SELECT * FROM users
WHERE id NOT IN (
    SELECT user_id FROM orders WHERE user_id IS NOT NULL
);
-- Caution: NOT IN with NULL may return empty results

ANY / SOME

-- Furniture more expensive than any electronics item
SELECT * FROM products
WHERE category = 'Furniture'
  AND price > ANY (SELECT price FROM products WHERE category = 'Electronics');
-- = ANY is same as IN

ALL

-- Products more expensive than all electronics items
SELECT * FROM products
WHERE price > ALL (SELECT price FROM products WHERE category = 'Electronics');

3. EXISTS / NOT EXISTS

Checks only for row existence.

-- Users with orders
SELECT * FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o WHERE o.user_id = u.id
);

-- Users without orders
SELECT * FROM users u
WHERE NOT EXISTS (
    SELECT 1 FROM orders o WHERE o.user_id = u.id
);

IN vs EXISTS

-- IN: Load subquery results into memory
SELECT * FROM users
WHERE id IN (SELECT user_id FROM orders);

-- EXISTS: Check existence for each row
SELECT * FROM users u
WHERE EXISTS (SELECT 1 FROM orders o WHERE o.user_id = u.id);

-- Generally:
-- - Use IN if subquery results are small
-- - Use EXISTS if subquery results are large
-- - Prefer NOT EXISTS over NOT IN (avoid NULL issues)

4. FROM Clause Subquery (Inline View)

-- Calculate average price by category, then filter
SELECT *
FROM (
    SELECT category, AVG(price) AS avg_price
    FROM products
    GROUP BY category
) AS category_avg
WHERE avg_price > 100000;

-- Alias required for subquery (AS category_avg)

JOIN After Complex Aggregation

-- Combine user stats with user info
SELECT
    u.name,
    u.email,
    stats.order_count,
    stats.total_amount
FROM users u
JOIN (
    SELECT
        user_id,
        COUNT(*) AS order_count,
        SUM(amount) AS total_amount
    FROM orders
    GROUP BY user_id
) AS stats ON u.id = stats.user_id;

5. SELECT Clause Subquery (Scalar Subquery)

-- Display each product with category average price
SELECT
    name,
    price,
    (SELECT AVG(price) FROM products p2 WHERE p2.category = p.category) AS category_avg
FROM products p;

-- Order count for each user
SELECT
    u.name,
    (SELECT COUNT(*) FROM orders o WHERE o.user_id = u.id) AS order_count
FROM users u;

6. Correlated Subquery

A subquery that references values from the outer query.

-- Products more expensive than their category average
SELECT * FROM products p
WHERE price > (
    SELECT AVG(price) FROM products WHERE category = p.category
);
--                                                    ↑ Reference outer query

-- Most expensive product in each category
SELECT * FROM products p
WHERE price = (
    SELECT MAX(price) FROM products WHERE category = p.category
);

7. CTE (Common Table Expression)

Uses WITH clause to name temporary result sets.

Basic CTE

-- Subquery approach
SELECT * FROM (
    SELECT category, AVG(price) AS avg_price
    FROM products
    GROUP BY category
) AS category_stats
WHERE avg_price > 100000;

-- CTE approach (more readable)
WITH category_stats AS (
    SELECT category, AVG(price) AS avg_price
    FROM products
    GROUP BY category
)
SELECT * FROM category_stats
WHERE avg_price > 100000;

Multiple CTEs

WITH
-- Category statistics
category_stats AS (
    SELECT
        category,
        COUNT(*) AS product_count,
        AVG(price) AS avg_price
    FROM products
    GROUP BY category
),
-- Expensive products (1,000,000+)
expensive_products AS (
    SELECT * FROM products WHERE price >= 1000000
)
SELECT
    cs.category,
    cs.product_count,
    cs.avg_price,
    COUNT(ep.id) AS expensive_count
FROM category_stats cs
LEFT JOIN expensive_products ep ON cs.category = ep.category
GROUP BY cs.category, cs.product_count, cs.avg_price;

CTE with Main Query

WITH monthly_sales AS (
    SELECT
        DATE_TRUNC('month', order_date) AS month,
        SUM(amount) AS total
    FROM orders
    GROUP BY DATE_TRUNC('month', order_date)
)
SELECT
    month,
    total,
    LAG(total) OVER (ORDER BY month) AS prev_month,
    total - LAG(total) OVER (ORDER BY month) AS diff
FROM monthly_sales
ORDER BY month;

8. Recursive CTE (WITH RECURSIVE)

A CTE that references itself.

Organization Chart Traversal

-- Employees table
CREATE TABLE employees (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100),
    manager_id INTEGER REFERENCES employees(id)
);

INSERT INTO employees (name, manager_id) VALUES
('CEO', NULL),
('CTO', 1),
('Dev Manager', 2),
('Developer A', 3),
('Developer B', 3),
('CFO', 1),
('Finance Manager', 6);

-- Query all subordinates from CEO
WITH RECURSIVE org_tree AS (
    -- Base case: CEO
    SELECT id, name, manager_id, 1 AS level, name::TEXT AS path
    FROM employees
    WHERE manager_id IS NULL

    UNION ALL

    -- Recursive case: subordinates
    SELECT
        e.id,
        e.name,
        e.manager_id,
        ot.level + 1,
        ot.path || ' > ' || e.name
    FROM employees e
    JOIN org_tree ot ON e.manager_id = ot.id
)
SELECT
    REPEAT('  ', level - 1) || name AS org_chart,
    level,
    path
FROM org_tree
ORDER BY path;

Result:

     org_chart      β”‚ level β”‚           path
────────────────────┼───────┼──────────────────────────
 CEO                β”‚     1 β”‚ CEO
   CFO              β”‚     2 β”‚ CEO > CFO
     Finance Managerβ”‚     3 β”‚ CEO > CFO > Finance Manager
   CTO              β”‚     2 β”‚ CEO > CTO
     Dev Manager    β”‚     3 β”‚ CEO > CTO > Dev Manager
       Developer A  β”‚     4 β”‚ CEO > CTO > Dev Manager > Developer A
       Developer B  β”‚     4 β”‚ CEO > CTO > Dev Manager > Developer B

Generate Number Sequence

-- 1 to 10
WITH RECURSIVE numbers AS (
    SELECT 1 AS n
    UNION ALL
    SELECT n + 1 FROM numbers WHERE n < 10
)
SELECT * FROM numbers;

Generate Date Range

-- Last 7 days
WITH RECURSIVE date_range AS (
    SELECT CURRENT_DATE - INTERVAL '6 days' AS date
    UNION ALL
    SELECT date + INTERVAL '1 day'
    FROM date_range
    WHERE date < CURRENT_DATE
)
SELECT date::DATE FROM date_range;

9. Practice Examples

Sample Data

-- Create tables
CREATE TABLE departments (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100)
);

CREATE TABLE employees (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100),
    department_id INTEGER REFERENCES departments(id),
    salary NUMERIC(10, 2),
    hire_date DATE
);

-- Insert data
INSERT INTO departments (name) VALUES
('Development'), ('Marketing'), ('HR'), ('Finance');

INSERT INTO employees (name, department_id, salary, hire_date) VALUES
('Kim Dev', 1, 5000000, '2020-03-15'),
('Lee Dev', 1, 4500000, '2021-06-20'),
('Park Marketing', 2, 4000000, '2019-11-10'),
('Choi Marketing', 2, 3800000, '2022-01-05'),
('Jung HR', 3, 3500000, '2020-08-25'),
('Han Finance', 4, 4200000, '2021-03-10'),
('Oh Finance', 4, 3900000, '2022-07-15');

Practice 1: WHERE Subqueries

-- 1. Employees with salary higher than average
SELECT name, salary
FROM employees
WHERE salary > (SELECT AVG(salary) FROM employees);

-- 2. Most recently hired employee
SELECT * FROM employees
WHERE hire_date = (SELECT MAX(hire_date) FROM employees);

-- 3. Employees in Development or Marketing
SELECT * FROM employees
WHERE department_id IN (
    SELECT id FROM departments WHERE name IN ('Development', 'Marketing')
);

Practice 2: Correlated Subqueries

-- 1. Employees with salary higher than their department average
SELECT
    e.name,
    e.salary,
    d.name AS department
FROM employees e
JOIN departments d ON e.department_id = d.id
WHERE e.salary > (
    SELECT AVG(salary)
    FROM employees
    WHERE department_id = e.department_id
);

-- 2. Highest paid employee in each department
SELECT * FROM employees e
WHERE salary = (
    SELECT MAX(salary)
    FROM employees
    WHERE department_id = e.department_id
);

Practice 3: CTE Usage

-- 1. Query employee info with department stats
WITH dept_stats AS (
    SELECT
        department_id,
        AVG(salary) AS avg_salary,
        COUNT(*) AS emp_count
    FROM employees
    GROUP BY department_id
)
SELECT
    e.name,
    e.salary,
    d.name AS department,
    ds.avg_salary AS dept_avg,
    ds.emp_count AS dept_count
FROM employees e
JOIN departments d ON e.department_id = d.id
JOIN dept_stats ds ON e.department_id = ds.department_id;

-- 2. Query with salary ranking
WITH ranked_employees AS (
    SELECT
        *,
        RANK() OVER (ORDER BY salary DESC) AS salary_rank,
        RANK() OVER (PARTITION BY department_id ORDER BY salary DESC) AS dept_rank
    FROM employees
)
SELECT
    name,
    salary,
    salary_rank AS overall_rank,
    dept_rank AS dept_rank
FROM ranked_employees
ORDER BY salary_rank;

Practice 4: Complex Usage

-- Employees with above-average salary in their department and the difference
WITH
dept_avg AS (
    SELECT department_id, AVG(salary) AS avg_salary
    FROM employees
    GROUP BY department_id
),
above_avg AS (
    SELECT
        e.*,
        da.avg_salary,
        e.salary - da.avg_salary AS diff
    FROM employees e
    JOIN dept_avg da ON e.department_id = da.department_id
    WHERE e.salary >= da.avg_salary
)
SELECT
    aa.name,
    d.name AS department,
    aa.salary,
    ROUND(aa.avg_salary, 0) AS dept_avg,
    ROUND(aa.diff, 0) AS above_avg_by
FROM above_avg aa
JOIN departments d ON aa.department_id = d.id
ORDER BY aa.diff DESC;

10. Subquery vs CTE vs JOIN

Situation Recommended
Simple value comparison Subquery
Multiple references CTE
Table connection JOIN
Separate complex logic CTE
Recursive traversal WITH RECURSIVE

Next Steps

Learn about VIEW and INDEX in 09_Views_and_Indexes.md!

to navigate between lessons