19. Full-Text Search

19. Full-Text Search

Learning Objectives

  • Understand PostgreSQL full-text search architecture
  • Use tsvector and tsquery data types effectively
  • Create GIN indexes for fast text search
  • Implement weighted search with ranking
  • Configure multilingual search support
  • Utilize pg_trgm for fuzzy matching

Table of Contents

  1. Full-Text Search Overview
  2. tsvector and tsquery
  3. Search Configuration
  4. GIN and GiST Indexes
  5. Ranking and Weights
  6. Advanced Search Techniques
  7. pg_trgm for Fuzzy Matching
  8. Practice Problems

1. Full-Text Search Overview

┌─────────────────────────────────────────────────────────────────┐
              LIKE vs Full-Text Search                            
├─────────────────────────────────────────────────────────────────┤
                                                                 
  LIKE / ILIKE:                                                  
  ┌──────────────────────────────────────────┐                   
   SELECT * FROM articles                                      
   WHERE body ILIKE '%database%';                              
                                                               
    No index usage (sequential scan)                          
    No linguistic awareness                                   
    No ranking capability                                     
    "databases" won't match "database"     │                   │
│  └──────────────────────────────────────────┘                   │
│                                                                 │
│  Full-Text Search:                                              │
│  ┌──────────────────────────────────────────┐                   │
│  │ SELECT * FROM articles                   │                   │
│  │ WHERE to_tsvector(body)                  │                   │
│  │       @@ to_tsquery('database');                             
                                                               
    GIN index support (fast)                                  
    Stemming (databases  database)                           
    Ranking by relevance                                      
    Boolean operators (AND, OR, NOT)                          
  └──────────────────────────────────────────┘                   
                                                                 
└─────────────────────────────────────────────────────────────────┘

1.2 Architecture Overview

┌─────────────────────────────────────────────────────────────────┐
│              Full-Text Search Pipeline                           │
│                                                                 │
│  Document Text                                                  │
│       │                                                         │
│       ▼                                                         │
│  ┌─────────────────┐                                            │
│  │   Parser         │──▶ Tokenize into words                   │
│  └─────────────────┘                                            │
│       │                                                         │
│       ▼                                                         │
│  ┌─────────────────┐                                            │
│  │  Dictionaries    │──▶ Normalize (stop words, stems)         │
│  └─────────────────┘                                            │
│       │                                                         │
│       ▼                                                         │
│  ┌─────────────────┐                                            │
│  │   tsvector       │──▶ Sorted list of lexemes + positions    │
│  └─────────────────┘                                            │
│       │                                                         │
│       ▼                                                         │
│  ┌─────────────────┐                                            │
│  │  GIN Index       │──▶ Fast lookup by lexeme                 │
│  └─────────────────┘                                            │
│                                                                 │
│  Query Text ──▶ tsquery ──▶ Match against tsvector             │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

2. tsvector and tsquery

2.1 tsvector — Document Representation

-- Basic conversion
SELECT to_tsvector('english', 'The quick brown foxes jumped over the lazy dogs');
-- Result: 'brown':3 'dog':9 'fox':4 'jump':5 'lazi':8 'quick':2

-- Notice: stop words removed ("the", "over"), words stemmed ("foxes" → "fox")

-- tsvector stores lexemes with positions
SELECT to_tsvector('english', 'A fat cat sat on a mat. A fat cat ate a fat rat.');
-- Result: 'ate':10 'cat':3,8 'fat':2,7,11 'mat':6 'rat':12 'sat':4

2.2 tsquery — Query Representation

-- Basic query
SELECT to_tsquery('english', 'cats & dogs');
-- Result: 'cat' & 'dog'

-- Boolean operators
SELECT to_tsquery('english', 'cat | dog');        -- OR
SELECT to_tsquery('english', 'cat & !dog');       -- AND NOT
SELECT to_tsquery('english', 'cat <-> dog');      -- FOLLOWED BY (phrase)
SELECT to_tsquery('english', 'cat <2> dog');      -- within 2 words

-- plainto_tsquery: simpler syntax (implicit AND)
SELECT plainto_tsquery('english', 'fat cats');
-- Result: 'fat' & 'cat'

-- phraseto_tsquery: exact phrase matching
SELECT phraseto_tsquery('english', 'fat cats');
-- Result: 'fat' <-> 'cat'

-- websearch_to_tsquery: web-style syntax (PostgreSQL 11+)
SELECT websearch_to_tsquery('english', '"fat cats" -dogs');
-- Result: 'fat' <-> 'cat' & !'dog'

SELECT websearch_to_tsquery('english', 'cats or dogs');
-- Result: 'cat' | 'dog'

2.3 The Match Operator (@@)

-- tsvector @@ tsquery
SELECT to_tsvector('english', 'The fat cats sat on the mat')
       @@ to_tsquery('english', 'cat & mat');
-- Result: true

SELECT to_tsvector('english', 'The fat cats sat on the mat')
       @@ to_tsquery('english', 'cat & dog');
-- Result: false

-- Practical example
CREATE TABLE articles (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    body TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT NOW()
);

INSERT INTO articles (title, body) VALUES
('PostgreSQL Full-Text Search', 'PostgreSQL provides powerful full-text search capabilities using tsvector and tsquery.'),
('Database Indexing Guide', 'Indexes are crucial for database performance. B-tree, GIN, and GiST are common index types.'),
('Introduction to SQL', 'SQL is a standard language for managing relational databases. SELECT, INSERT, UPDATE, DELETE.'),
('Advanced Query Optimization', 'Query optimization involves analyzing execution plans and choosing efficient access paths.');

-- Search articles
SELECT title, ts_rank(to_tsvector('english', body), query) AS rank
FROM articles, to_tsquery('english', 'database & index') AS query
WHERE to_tsvector('english', body) @@ query
ORDER BY rank DESC;

2.4 Stored tsvector Column

-- Add a generated tsvector column for better performance
ALTER TABLE articles ADD COLUMN search_vector tsvector
    GENERATED ALWAYS AS (
        setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
        setweight(to_tsvector('english', coalesce(body, '')), 'B')
    ) STORED;

-- Now queries use the pre-computed column
SELECT title
FROM articles
WHERE search_vector @@ to_tsquery('english', 'database');

3. Search Configuration

3.1 Text Search Configurations

-- List available configurations
SELECT cfgname FROM pg_ts_config;

-- Common configurations:
-- 'simple'    — no stemming, no stop words
-- 'english'   — English stemming and stop words
-- 'german'    — German stemming
-- 'spanish'   — Spanish stemming

-- Show current default
SHOW default_text_search_config;

-- Set default
SET default_text_search_config = 'english';

-- Compare configurations
SELECT to_tsvector('simple', 'The running dogs are quickly jumping');
-- Result: 'are':4 'dogs':3 'jumping':6 'quickly':5 'running':2 'the':1

SELECT to_tsvector('english', 'The running dogs are quickly jumping');
-- Result: 'dog':3 'jump':6 'quick':5 'run':2

3.2 Dictionaries and Stop Words

-- Show dictionaries for a configuration
SELECT * FROM ts_debug('english', 'The quick brown foxes are jumping');

-- Output shows token → dictionary → lexeme mapping
-- Token     | Dictionary   | Lexemes
-- ----------|-------------|--------
-- The       | english_stem | {stop word}
-- quick     | english_stem | {quick}
-- brown     | english_stem | {brown}
-- foxes     | english_stem | {fox}
-- are       | english_stem | {stop word}
-- jumping   | english_stem | {jump}

3.3 Custom Configuration

-- Create custom configuration based on English
CREATE TEXT SEARCH CONFIGURATION my_english (COPY = english);

-- Add synonym dictionary
CREATE TEXT SEARCH DICTIONARY my_synonyms (
    TEMPLATE = synonym,
    SYNONYMS = my_synonyms  -- references $SHAREDIR/tsearch_data/my_synonyms.syn
);

-- Add to configuration
ALTER TEXT SEARCH CONFIGURATION my_english
    ALTER MAPPING FOR asciiword
    WITH my_synonyms, english_stem;

4. GIN and GiST Indexes

-- Create GIN index on tsvector column
CREATE INDEX idx_articles_search ON articles USING GIN (search_vector);

-- Or on expression (slower to build, no stored column needed)
CREATE INDEX idx_articles_body_gin ON articles
    USING GIN (to_tsvector('english', body));

-- Query uses the index automatically
EXPLAIN ANALYZE
SELECT title FROM articles
WHERE search_vector @@ to_tsquery('english', 'database');
-- Bitmap Index Scan on idx_articles_search

4.2 GIN vs GiST Comparison

┌─────────────────────────────────────────────────────────────────┐
│              GIN vs GiST for Full-Text Search                   │
├────────────────┬──────────────────┬──────────────────────────────┤
│                 GIN               GiST                         │
├────────────────┼──────────────────┼──────────────────────────────┤
│ Build speed     Slower            Faster                       │
│ Search speed    Faster (3x)       Slower                       │
│ Index size      Larger            Smaller                      │
│ Update cost     Higher            Lower                        │
│ Exact results   Yes               May have false positives      │
│ Best for        Static data,      Frequently updated data,     │
│                 read-heavy        write-heavy                  │
└────────────────┴──────────────────┴──────────────────────────────┘
-- GiST index (alternative)
CREATE INDEX idx_articles_search_gist ON articles USING GiST (search_vector);

-- GIN with fastupdate (default on, good for batch inserts)
CREATE INDEX idx_articles_search_gin ON articles
    USING GIN (search_vector) WITH (fastupdate = on);

4.3 Index Maintenance

-- Check index size
SELECT pg_size_pretty(pg_relation_size('idx_articles_search'));

-- Reindex if needed
REINDEX INDEX idx_articles_search;

-- Analyze for query planner
ANALYZE articles;

5. Ranking and Weights

5.1 ts_rank

-- Basic ranking
SELECT
    title,
    ts_rank(search_vector, to_tsquery('english', 'database')) AS rank
FROM articles
WHERE search_vector @@ to_tsquery('english', 'database')
ORDER BY rank DESC;

-- Normalization options (bitmask)
-- 0  = default (document length ignored)
-- 1  = divide by 1 + log(document length)
-- 2  = divide by document length
-- 4  = divide by mean harmonic distance between extents
-- 8  = divide by number of unique words
-- 16 = divide by 1 + log(unique words)
-- 32 = divide by itself + 1

SELECT
    title,
    ts_rank(search_vector, query, 2) AS rank_normalized  -- normalize by length
FROM articles, to_tsquery('english', 'database') AS query
WHERE search_vector @@ query
ORDER BY rank_normalized DESC;

5.2 ts_rank_cd (Cover Density)

-- Cover density ranking considers proximity of matching terms
SELECT
    title,
    ts_rank_cd(search_vector, to_tsquery('english', 'database & index')) AS rank_cd
FROM articles
WHERE search_vector @@ to_tsquery('english', 'database & index')
ORDER BY rank_cd DESC;
-- Weights: A (1.0), B (0.4), C (0.2), D (0.1)
-- Custom weights array: {D, C, B, A}

-- Apply custom weights
SELECT
    title,
    ts_rank('{0.1, 0.2, 0.4, 1.0}', search_vector, query) AS weighted_rank
FROM articles, to_tsquery('english', 'database') AS query
WHERE search_vector @@ query
ORDER BY weighted_rank DESC;

-- Build weighted tsvector
SELECT
    setweight(to_tsvector('english', 'PostgreSQL Guide'), 'A') ||
    setweight(to_tsvector('english', 'A comprehensive database tutorial'), 'B') ||
    setweight(to_tsvector('english', 'Learn SQL queries and optimization'), 'C');

5.4 Highlighting Search Results

-- ts_headline highlights matching terms
SELECT
    title,
    ts_headline('english', body, to_tsquery('english', 'database'),
        'StartSel=<b>, StopSel=</b>, MaxWords=35, MinWords=15'
    ) AS highlighted
FROM articles
WHERE search_vector @@ to_tsquery('english', 'database');

-- Options:
-- StartSel / StopSel — highlight markers
-- MaxWords / MinWords — context window size
-- ShortWord — minimum word length to show
-- MaxFragments — number of fragments (0 = whole document)
-- FragmentDelimiter — separator between fragments

6. Advanced Search Techniques

-- Search across multiple columns with different weights
CREATE TABLE blog_posts (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    summary TEXT,
    body TEXT NOT NULL,
    tags TEXT[],
    search_vector tsvector GENERATED ALWAYS AS (
        setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
        setweight(to_tsvector('english', coalesce(summary, '')), 'B') ||
        setweight(to_tsvector('english', coalesce(body, '')), 'C') ||
        setweight(to_tsvector('english', coalesce(array_to_string(tags, ' '), '')), 'A')
    ) STORED
);

CREATE INDEX idx_blog_search ON blog_posts USING GIN (search_vector);
-- Exact phrase: "full text search"
SELECT title FROM articles
WHERE search_vector @@ phraseto_tsquery('english', 'full text search');

-- Proximity: words within N positions
SELECT title FROM articles
WHERE search_vector @@ to_tsquery('english', 'full <3> search');
-- "full" and "search" within 3 words of each other

6.3 Search with Filters

-- Combine full-text search with regular WHERE clauses
SELECT title, ts_rank(search_vector, query) AS rank
FROM articles, to_tsquery('english', 'database') AS query
WHERE search_vector @@ query
  AND created_at >= NOW() - INTERVAL '30 days'
ORDER BY rank DESC
LIMIT 10;

-- Composite index for combined search
CREATE INDEX idx_articles_date_search ON articles
    USING GIN (search_vector) WHERE created_at >= '2024-01-01';
-- Prefix matching with :*
SELECT title FROM articles
WHERE search_vector @@ to_tsquery('english', 'dat:*');
-- Matches: database, data, date, etc.

-- Practical autocomplete function
CREATE OR REPLACE FUNCTION search_autocomplete(search_term TEXT, max_results INT DEFAULT 10)
RETURNS TABLE(title TEXT, rank REAL) AS $$
BEGIN
    RETURN QUERY
    SELECT a.title, ts_rank(a.search_vector, query) AS rank
    FROM articles a, to_tsquery('english', search_term || ':*') AS query
    WHERE a.search_vector @@ query
    ORDER BY rank DESC
    LIMIT max_results;
END;
$$ LANGUAGE plpgsql;

SELECT * FROM search_autocomplete('post');

6.5 Search Statistics

-- Most common words in a corpus
SELECT word, ndoc, nentry
FROM ts_stat('SELECT search_vector FROM articles')
ORDER BY nentry DESC
LIMIT 20;

-- Word frequency for specific query
SELECT word, ndoc
FROM ts_stat('SELECT search_vector FROM articles')
WHERE word LIKE 'data%'
ORDER BY ndoc DESC;

7. pg_trgm for Fuzzy Matching

7.1 Trigram Basics

-- Enable extension
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Show trigrams
SELECT show_trgm('PostgreSQL');
-- Result: {"  p"," po","gre","osq","pos","ostg","pgr","ql ","res","sql","stg","tgr"}

-- Similarity score (0 to 1)
SELECT similarity('PostgreSQL', 'Postgresql');  -- ~0.75
SELECT similarity('PostgreSQL', 'MySQL');       -- ~0.09

7.2 Trigram Indexes

-- GIN trigram index
CREATE INDEX idx_articles_title_trgm ON articles USING GIN (title gin_trgm_ops);

-- GiST trigram index (supports distance operator)
CREATE INDEX idx_articles_title_trgm_gist ON articles USING GiST (title gist_trgm_ops);

-- Similarity search
SELECT title, similarity(title, 'Postgre') AS sim
FROM articles
WHERE title % 'Postgre'  -- % operator uses similarity threshold
ORDER BY sim DESC;

-- Set similarity threshold (default 0.3)
SET pg_trgm.similarity_threshold = 0.2;

-- Distance operator (GiST only)
SELECT title, title <-> 'Postgre' AS distance
FROM articles
ORDER BY title <-> 'Postgre'
LIMIT 5;

7.3 LIKE/ILIKE with Trigram Index

-- Trigram index accelerates LIKE queries too
CREATE INDEX idx_articles_body_trgm ON articles USING GIN (body gin_trgm_ops);

-- These queries now use the index
SELECT title FROM articles WHERE body LIKE '%database%';
SELECT title FROM articles WHERE body ILIKE '%DATABASE%';

-- Regular expression queries also benefit
SELECT title FROM articles WHERE body ~ 'data(base|set)';

7.4 Combining FTS and pg_trgm

-- Best of both: full-text search for relevance, trigrams for typo tolerance
CREATE OR REPLACE FUNCTION smart_search(search_term TEXT, max_results INT DEFAULT 10)
RETURNS TABLE(id INT, title TEXT, score REAL) AS $$
BEGIN
    -- Try exact full-text search first
    RETURN QUERY
    SELECT a.id, a.title,
           ts_rank(a.search_vector, websearch_to_tsquery('english', search_term)) AS score
    FROM articles a
    WHERE a.search_vector @@ websearch_to_tsquery('english', search_term)
    ORDER BY score DESC
    LIMIT max_results;

    -- If no results, fall back to fuzzy matching
    IF NOT FOUND THEN
        RETURN QUERY
        SELECT a.id, a.title, similarity(a.title, search_term) AS score
        FROM articles a
        WHERE a.title % search_term OR a.body % search_term
        ORDER BY score DESC
        LIMIT max_results;
    END IF;
END;
$$ LANGUAGE plpgsql;

8. Practice Problems

Build a full-text search system for an e-commerce product catalog.

-- Example answer
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    name TEXT NOT NULL,
    description TEXT,
    category TEXT,
    price NUMERIC(10,2),
    search_vector tsvector GENERATED ALWAYS AS (
        setweight(to_tsvector('english', coalesce(name, '')), 'A') ||
        setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
        setweight(to_tsvector('english', coalesce(category, '')), 'C')
    ) STORED
);

CREATE INDEX idx_products_search ON products USING GIN (search_vector);

-- Search function with category filter
CREATE OR REPLACE FUNCTION search_products(
    search_term TEXT,
    category_filter TEXT DEFAULT NULL,
    max_results INT DEFAULT 20
)
RETURNS TABLE(id INT, name TEXT, price NUMERIC, rank REAL) AS $$
BEGIN
    RETURN QUERY
    SELECT p.id, p.name, p.price,
           ts_rank(p.search_vector, websearch_to_tsquery('english', search_term)) AS rank
    FROM products p
    WHERE p.search_vector @@ websearch_to_tsquery('english', search_term)
      AND (category_filter IS NULL OR p.category = category_filter)
    ORDER BY rank DESC
    LIMIT max_results;
END;
$$ LANGUAGE plpgsql;

Exercise 2: Search with Highlighting

Create a search result with highlighted matches.

-- Example answer
SELECT
    id,
    ts_headline('english', name, query,
        'StartSel=<mark>, StopSel=</mark>') AS highlighted_name,
    ts_headline('english', description, query,
        'StartSel=<mark>, StopSel=</mark>, MaxFragments=2, FragmentDelimiter= ... ') AS highlighted_desc,
    ts_rank(search_vector, query) AS relevance
FROM products, websearch_to_tsquery('english', 'wireless bluetooth') AS query
WHERE search_vector @@ query
ORDER BY relevance DESC;

Design a search system that handles both English and simple (no stemming) text.

-- Example answer
CREATE TABLE multilingual_docs (
    id SERIAL PRIMARY KEY,
    content_en TEXT,
    content_raw TEXT,
    search_en tsvector GENERATED ALWAYS AS (
        to_tsvector('english', coalesce(content_en, ''))
    ) STORED,
    search_simple tsvector GENERATED ALWAYS AS (
        to_tsvector('simple', coalesce(content_raw, ''))
    ) STORED
);

CREATE INDEX idx_ml_en ON multilingual_docs USING GIN (search_en);
CREATE INDEX idx_ml_simple ON multilingual_docs USING GIN (search_simple);

-- Search across both
SELECT id, content_en
FROM multilingual_docs
WHERE search_en @@ websearch_to_tsquery('english', 'search term')
   OR search_simple @@ websearch_to_tsquery('simple', 'search term');

Next Steps

References

to navigate between lessons