1-- ============================================================================
2-- PostgreSQL Full-Text Search Examples
3-- ============================================================================
4-- This script demonstrates full-text search capabilities in PostgreSQL,
5-- including tsvector, tsquery, ranking, highlighting, and fuzzy matching.
6--
7-- Prerequisites:
8-- - PostgreSQL 12+ (pg_trgm extension)
9-- - CREATE EXTENSION privilege (for pg_trgm)
10--
11-- Usage:
12-- psql -U postgres -d your_database -f 06_full_text_search.sql
13-- ============================================================================
14
15-- Clean up from previous runs
16DROP TABLE IF EXISTS documents CASCADE;
17DROP TABLE IF EXISTS articles CASCADE;
18
19-- ============================================================================
20-- 1. Basic tsvector and tsquery
21-- ============================================================================
22
23-- Create a sample documents table
24CREATE TABLE documents (
25 id SERIAL PRIMARY KEY,
26 title TEXT NOT NULL,
27 body TEXT NOT NULL,
28 -- Store pre-computed search vector for performance
29 search_vector tsvector
30);
31
32-- Insert sample data
33INSERT INTO documents (title, body) VALUES
34 ('PostgreSQL Tutorial', 'PostgreSQL is a powerful open-source relational database system with advanced features.'),
35 ('Full-Text Search Guide', 'Full-text search allows you to search natural language documents efficiently using inverted indexes.'),
36 ('Database Performance', 'Optimizing database performance requires understanding indexes, query planning, and proper schema design.'),
37 ('Advanced SQL Techniques', 'Learn advanced SQL techniques including window functions, CTEs, and recursive queries.'),
38 ('Python and PostgreSQL', 'Connecting Python applications to PostgreSQL databases using psycopg2 and SQLAlchemy.');
39
40-- Convert text to tsvector (tokenize and normalize)
41SELECT to_tsvector('english', 'PostgreSQL is a powerful database system');
42-- Output: 'databas':5 'postgresql':1 'power':4 'system':6
43
44-- Create tsquery (search pattern)
45SELECT to_tsquery('english', 'database & system');
46-- Output: 'databas' & 'system'
47
48-- Match tsvector against tsquery
49SELECT to_tsvector('english', 'PostgreSQL is a powerful database system')
50 @@ to_tsquery('english', 'database & system') AS matches;
51-- Output: true
52
53-- ============================================================================
54-- 2. Creating a GIN Index for Fast Search
55-- ============================================================================
56
57-- Update search_vector column with combined title and body
58UPDATE documents
59SET search_vector =
60 setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
61 setweight(to_tsvector('english', coalesce(body, '')), 'B');
62
63-- Create GIN (Generalized Inverted Index) for fast full-text search
64CREATE INDEX idx_documents_search ON documents USING GIN(search_vector);
65
66-- Basic search query
67SELECT id, title
68FROM documents
69WHERE search_vector @@ to_tsquery('english', 'database');
70
71-- Search with AND operator
72SELECT id, title
73FROM documents
74WHERE search_vector @@ to_tsquery('english', 'database & performance');
75
76-- Search with OR operator
77SELECT id, title
78FROM documents
79WHERE search_vector @@ to_tsquery('english', 'PostgreSQL | Python');
80
81-- Search with negation
82SELECT id, title
83FROM documents
84WHERE search_vector @@ to_tsquery('english', 'database & !performance');
85
86-- Phrase search (use <-> for word distance)
87SELECT id, title
88FROM documents
89WHERE search_vector @@ to_tsquery('english', 'full <-> text <-> search');
90
91-- ============================================================================
92-- 3. Weighted Search (Priority Ranking)
93-- ============================================================================
94
95-- Assign different weights to different parts of the document
96-- Weight 'A' (highest) for title, 'B' for body
97-- Weights: A=1.0, B=0.4, C=0.2, D=0.1
98
99SELECT
100 id,
101 title,
102 -- Show which parts matched
103 ts_debug('english', title) AS title_tokens
104FROM documents
105WHERE search_vector @@ to_tsquery('english', 'PostgreSQL')
106LIMIT 1;
107
108-- ============================================================================
109-- 4. Relevance Ranking with ts_rank
110-- ============================================================================
111
112-- Rank search results by relevance
113SELECT
114 id,
115 title,
116 ts_rank(search_vector, query) AS rank
117FROM documents, to_tsquery('english', 'database') query
118WHERE search_vector @@ query
119ORDER BY rank DESC;
120
121-- ts_rank_cd (cover density ranking) - considers proximity of terms
122SELECT
123 id,
124 title,
125 ts_rank_cd(search_vector, query, 32) AS rank_cd
126FROM documents, to_tsquery('english', 'database & system') query
127WHERE search_vector @@ query
128ORDER BY rank_cd DESC;
129
130-- Custom ranking with weights
131SELECT
132 id,
133 title,
134 ts_rank(
135 search_vector,
136 query,
137 -- Normalization flags: 1=length, 2=log(length), 4=harmonic distance
138 1 -- Normalize by document length
139 ) AS rank
140FROM documents, to_tsquery('english', 'PostgreSQL | database') query
141WHERE search_vector @@ query
142ORDER BY rank DESC;
143
144-- ============================================================================
145-- 5. Result Highlighting with ts_headline
146-- ============================================================================
147
148-- Highlight matching terms in search results
149SELECT
150 id,
151 title,
152 ts_headline(
153 'english',
154 body,
155 to_tsquery('english', 'database & system'),
156 'StartSel=<b>, StopSel=</b>, MaxWords=50, MinWords=20'
157 ) AS snippet
158FROM documents
159WHERE search_vector @@ to_tsquery('english', 'database & system');
160
161-- Highlight with custom options
162SELECT
163 id,
164 ts_headline(
165 'english',
166 title || ' ' || body,
167 to_tsquery('english', 'PostgreSQL'),
168 'StartSel=**, StopSel=**, MaxFragments=2, FragmentDelimiter=...'
169 ) AS highlighted
170FROM documents
171WHERE search_vector @@ to_tsquery('english', 'PostgreSQL');
172
173-- ============================================================================
174-- 6. Advanced: plainto_tsquery and websearch_to_tsquery
175-- ============================================================================
176
177-- plainto_tsquery: Simple query parser (automatic AND between words)
178SELECT
179 id,
180 title,
181 ts_rank(search_vector, query) AS rank
182FROM documents, plainto_tsquery('english', 'database performance') query
183WHERE search_vector @@ query
184ORDER BY rank DESC;
185
186-- websearch_to_tsquery: Google-like search syntax (PostgreSQL 11+)
187-- Supports: "phrase", OR, -, quotes
188SELECT
189 id,
190 title,
191 ts_rank(search_vector, query) AS rank
192FROM documents, websearch_to_tsquery('english', '"full-text search" OR PostgreSQL -Python') query
193WHERE search_vector @@ query
194ORDER BY rank DESC;
195
196-- ============================================================================
197-- 7. Fuzzy Matching with pg_trgm
198-- ============================================================================
199
200-- Enable pg_trgm extension for trigram-based fuzzy matching
201CREATE EXTENSION IF NOT EXISTS pg_trgm;
202
203-- Create articles table for fuzzy search demo
204CREATE TABLE articles (
205 id SERIAL PRIMARY KEY,
206 title TEXT NOT NULL,
207 content TEXT NOT NULL
208);
209
210INSERT INTO articles (title, content) VALUES
211 ('Introduction to Databases', 'Learn about relational and NoSQL databases.'),
212 ('Database Design Patterns', 'Common patterns for designing scalable database schemas.'),
213 ('PostgreSQL Administration', 'Essential PostgreSQL administration tasks and tools.'),
214 ('Data Warehousing Basics', 'Introduction to data warehousing concepts and star schemas.');
215
216-- Create GIN index for trigram matching
217CREATE INDEX idx_articles_title_trgm ON articles USING GIN(title gin_trgm_ops);
218
219-- Similarity search (0.0 = no match, 1.0 = perfect match)
220SELECT
221 title,
222 similarity(title, 'Databse Design') AS sim
223FROM articles
224WHERE title % 'Databse Design' -- % operator checks if similarity > threshold
225ORDER BY sim DESC;
226
227-- Set similarity threshold (default is 0.3)
228SET pg_trgm.similarity_threshold = 0.2;
229
230SELECT
231 title,
232 similarity(title, 'Postgre') AS sim
233FROM articles
234WHERE title % 'Postgre'
235ORDER BY sim DESC;
236
237-- Word similarity (for substring matching)
238SELECT
239 title,
240 word_similarity('Admin', title) AS word_sim
241FROM articles
242WHERE 'Admin' <% title -- <% operator for word similarity
243ORDER BY word_sim DESC;
244
245-- Combine full-text search with fuzzy matching
246SELECT
247 a.title,
248 ts_rank(to_tsvector('english', a.content), query) AS fts_rank,
249 similarity(a.title, 'Databse') AS fuzzy_sim
250FROM articles a, to_tsquery('english', 'database') query
251WHERE
252 to_tsvector('english', a.content) @@ query
253 OR a.title % 'Databse'
254ORDER BY fts_rank DESC, fuzzy_sim DESC;
255
256-- ============================================================================
257-- 8. Automatic Search Vector Update with Trigger
258-- ============================================================================
259
260-- Create trigger function to auto-update search_vector
261CREATE OR REPLACE FUNCTION documents_search_vector_update() RETURNS trigger AS $$
262BEGIN
263 NEW.search_vector :=
264 setweight(to_tsvector('english', coalesce(NEW.title, '')), 'A') ||
265 setweight(to_tsvector('english', coalesce(NEW.body, '')), 'B');
266 RETURN NEW;
267END;
268$$ LANGUAGE plpgsql;
269
270-- Attach trigger to documents table
271DROP TRIGGER IF EXISTS tsvector_update ON documents;
272CREATE TRIGGER tsvector_update
273 BEFORE INSERT OR UPDATE ON documents
274 FOR EACH ROW EXECUTE FUNCTION documents_search_vector_update();
275
276-- Test trigger
277INSERT INTO documents (title, body) VALUES
278 ('Trigger Test', 'This document tests automatic search vector updates.');
279
280SELECT title, search_vector
281FROM documents
282WHERE title = 'Trigger Test';
283
284-- ============================================================================
285-- 9. Multi-language Full-Text Search
286-- ============================================================================
287
288-- PostgreSQL supports many languages (run \dF to see available configurations)
289-- Examples: 'simple', 'english', 'french', 'german', 'spanish', etc.
290
291-- Spanish text search
292SELECT to_tsvector('spanish', 'Los documentos están escritos en español');
293
294-- Mixed language table
295CREATE TABLE multilang_docs (
296 id SERIAL PRIMARY KEY,
297 lang TEXT NOT NULL,
298 content TEXT NOT NULL,
299 search_vector tsvector
300);
301
302INSERT INTO multilang_docs (lang, content) VALUES
303 ('english', 'The quick brown fox jumps over the lazy dog'),
304 ('french', 'Le renard brun rapide saute par-dessus le chien paresseux'),
305 ('german', 'Der schnelle braune Fuchs springt über den faulen Hund');
306
307-- Update with language-specific search vectors
308UPDATE multilang_docs
309SET search_vector = to_tsvector(lang::regconfig, content);
310
311-- Search in specific language
312SELECT lang, content
313FROM multilang_docs
314WHERE search_vector @@ to_tsquery('french', 'renard');
315
316-- ============================================================================
317-- 10. Performance Tips
318-- ============================================================================
319
320-- Analyze search performance
321EXPLAIN ANALYZE
322SELECT id, title, ts_rank(search_vector, query) AS rank
323FROM documents, to_tsquery('english', 'database & performance') query
324WHERE search_vector @@ query
325ORDER BY rank DESC;
326
327-- Vacuum and analyze to update statistics
328VACUUM ANALYZE documents;
329
330-- Monitor index usage
331SELECT
332 schemaname,
333 tablename,
334 indexname,
335 idx_scan,
336 idx_tup_read,
337 idx_tup_fetch
338FROM pg_stat_user_indexes
339WHERE tablename = 'documents';
340
341-- ============================================================================
342-- Summary
343-- ============================================================================
344-- Key takeaways:
345-- 1. tsvector: Tokenized, normalized text representation
346-- 2. tsquery: Search pattern with operators (&, |, !, <->)
347-- 3. GIN index: Fast full-text search (essential for production)
348-- 4. Weights: Prioritize different document parts (A > B > C > D)
349-- 5. ts_rank: Relevance scoring for result ordering
350-- 6. ts_headline: Highlight matching terms in results
351-- 7. pg_trgm: Fuzzy matching for typo tolerance
352-- 8. Triggers: Auto-update search vectors on INSERT/UPDATE
353-- 9. Multi-language: Support for 20+ languages
354-- 10. Performance: Use GIN indexes, VACUUM ANALYZE, monitor stats
355-- ============================================================================