1#!/usr/bin/env bash
2set -euo pipefail
3
4# Data Extraction Using Regex
5# Demonstrates extracting structured data from text using bash regex patterns
6
7# ============================================================================
8# Color definitions for output
9# ============================================================================
10
# ANSI color sequences, stored unexpanded: the backslash codes only become
# control characters when printed by an escape-interpreting command such as
# `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Freeze all of them in one go so no function can overwrite a color.
readonly RED GREEN YELLOW BLUE CYAN NC
17
18# ============================================================================
19# Extraction Functions
20# ============================================================================
21
22# Extract all URLs from text
# Extract every http:// or https:// URL from a block of text and print them.
# Globals:   CYAN, GREEN, YELLOW, NC (read) - color escape sequences
# Arguments: $1 - text to scan (may span several lines)
# Outputs:   a heading, one line per URL, a summary line and a blank line,
#            all on stdout
# Returns:   0
extract_urls() {
    local text="$1"

    printf '%b\n' "${CYAN}Extracting URLs...${NC}"

    # Scheme followed by a run of characters commonly found in URLs.
    # '&', '#', '~' and '+' are included so query strings and fragments are
    # not cut off at the first such character.
    local url_pattern='https?://[a-zA-Z0-9./?=_%:&#~+-]*'

    local count=0
    local url
    while [[ "$text" =~ $url_pattern ]]; do
        url="${BASH_REMATCH[0]}"
        printf ' %b→%b %s\n' "$GREEN" "$NC" "$url"

        # Drop everything up to and including this match so the next
        # iteration finds the following URL.
        text="${text#*"$url"}"

        # Deliberately not ((count++)): that evaluates to 0 on the first
        # pass, returns status 1, and aborts the script under `set -e`.
        count=$((count + 1))
    done

    if ((count == 0)); then
        printf ' %b(no URLs found)%b\n' "$YELLOW" "$NC"
    else
        printf ' Found %b%s%b URL(s)\n' "$GREEN" "$count" "$NC"
    fi
    echo
}
48
49# Extract all email addresses from text
# Extract every email address from a block of text and print them.
# An address must have a dotted domain ending in at least two letters, so
# host-only forms such as user@localhost are not reported.
# Globals:   CYAN, GREEN, YELLOW, NC (read) - color escape sequences
# Arguments: $1 - text to scan (may span several lines)
# Outputs:   a heading, one line per address, a summary line and a blank
#            line, all on stdout
# Returns:   0
extract_emails() {
    local text="$1"

    printf '%b\n' "${CYAN}Extracting email addresses...${NC}"

    # local-part @ domain . tld(2+ letters)
    local email_pattern='[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    local count=0
    local email
    while [[ "$text" =~ $email_pattern ]]; do
        email="${BASH_REMATCH[0]}"
        printf ' %b→%b %s\n' "$GREEN" "$NC" "$email"

        # Consume the text through this match before searching again.
        text="${text#*"$email"}"

        # Deliberately not ((count++)): that evaluates to 0 on the first
        # pass, returns status 1, and aborts the script under `set -e`.
        count=$((count + 1))
    done

    if ((count == 0)); then
        printf ' %b(no emails found)%b\n' "$YELLOW" "$NC"
    else
        printf ' Found %b%s%b email(s)\n' "$GREEN" "$count" "$NC"
    fi
    echo
}
75
76# Parse structured log line into components
# Split a structured log line into timestamp, level and message and print them.
# Expected shape: [YYYY-MM-DD HH:MM:SS] LEVEL: message
# Example:        [2024-01-15 10:30:45] ERROR: Connection failed
# Globals:   RED, GREEN, YELLOW, CYAN, NC (read) - color escape sequences
# Arguments: $1 - the log line
# Outputs:   the parsed components, or a failure notice, on stdout
# Returns:   0 if the line matched the expected shape, 1 otherwise
parse_log_line() {
    local log_line="$1"

    # Capture groups: 1 = timestamp, 2 = upper-case level, 3 = rest of line.
    local log_pattern='^\[([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})\] ([A-Z]+): (.*)$'

    if [[ ! "$log_line" =~ $log_pattern ]]; then
        # %s keeps the offending line verbatim (no escape interpretation).
        printf '%b✗%b Failed to parse log line: %s\n' "$RED" "$NC" "$log_line"
        return 1
    fi

    local timestamp="${BASH_REMATCH[1]}"
    local level="${BASH_REMATCH[2]}"
    local message="${BASH_REMATCH[3]}"

    # Colors go through %b (escape sequences expanded); log data goes through
    # %s so backslashes in a message (e.g. Windows paths) are printed as-is
    # instead of being turned into tabs or newlines.
    printf '%b✓%b Successfully parsed log line:\n' "$GREEN" "$NC"
    printf ' Timestamp: %b%s%b\n' "$CYAN" "$timestamp" "$NC"
    printf ' Level: %b%s%b\n' "$YELLOW" "$level" "$NC"
    printf ' Message: %s\n' "$message"
    return 0
}
99
100# Parse CSV line handling quoted fields
# Split one CSV line into fields and print them, one per line.
# Handles quoted fields: commas inside double quotes do not split, and a
# doubled quote ("") inside a quoted field yields one literal quote, e.g.
#   field1,"field with, comma","field with ""quotes"""
# Globals:   CYAN, GREEN, NC (read) - color escape sequences
# Arguments: $1 - a single CSV line
# Outputs:   the input line, the field count and each numbered field, then a
#            blank line, on stdout
# Returns:   0
parse_csv_line() {
    local csv_line="$1"

    # Data is printed with %s so backslashes in it are not interpreted.
    printf '%bParsing CSV line:%b %s\n' "$CYAN" "$NC" "$csv_line"

    local -a fields=()
    local field=""
    local in_quotes=false
    local len=${#csv_line}
    local i char

    # Character-by-character scan; in_quotes tracks whether the current
    # position is inside a double-quoted field.
    for ((i = 0; i < len; i++)); do
        char="${csv_line:i:1}"
        case "$char" in
            '"')
                if [[ "$in_quotes" == false ]]; then
                    in_quotes=true
                elif [[ "${csv_line:i+1:1}" == '"' ]]; then
                    # Escaped quote "": emit one quote and skip its twin.
                    field+='"'
                    i=$((i + 1))
                else
                    in_quotes=false
                fi
                ;;
            ',')
                if [[ "$in_quotes" == true ]]; then
                    field+=','
                else
                    # Unquoted comma terminates the current field.
                    fields+=("$field")
                    field=""
                fi
                ;;
            *)
                field+="$char"
                ;;
        esac
    done

    # Whatever is left after the last comma is the final field (possibly empty).
    fields+=("$field")

    printf ' %bExtracted %d field(s):%b\n' "$GREEN" "${#fields[@]}" "$NC"
    local idx=1
    for field in "${fields[@]}"; do
        printf ' [%d] %s\n' "$idx" "$field"
        # Assignment form always returns 0, so it is safe under `set -e`.
        idx=$((idx + 1))
    done
    echo
}
148
149# Extract key-value pairs from configuration-style text
# Extract "key = value" (or "key=value") pairs from configuration-style text
# and print them in a normalized "key = value" form, one per line.
# Each line is scanned independently; lines with no '=' or with nothing but
# whitespace after the '=' are skipped.
# Globals:   CYAN, GREEN, YELLOW, NC (read) - color escape sequences
# Arguments: $1 - text to scan, one candidate pair per line
# Outputs:   a heading, one line per pair (or a "none found" notice) and a
#            blank line, on stdout
# Returns:   0
extract_key_values() {
    local text="$1"

    printf '%b\n' "${CYAN}Extracting key-value pairs...${NC}"

    # Group 1: identifier-style key. Group 2: value, starting at the first
    # non-space character after '=' and running to the end of the line.
    local kv_pattern='([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*([^[:space:]].*)'

    local count=0
    # Declared local so the loop variables do not leak into the caller.
    local line key value
    while IFS= read -r line; do
        if [[ "$line" =~ $kv_pattern ]]; then
            key="${BASH_REMATCH[1]}"
            value="${BASH_REMATCH[2]}"
            # %s prints the value verbatim, backslashes included.
            printf ' %b%s%b = %s\n' "$GREEN" "$key" "$NC" "$value"
            # Deliberately not ((count++)): that returns status 1 when count
            # is 0 and aborts the script under `set -e`.
            count=$((count + 1))
        fi
    done <<< "$text"

    if ((count == 0)); then
        printf ' %b(no key-value pairs found)%b\n' "$YELLOW" "$NC"
    fi
    echo
}
173
174# Extract phone numbers (US format)
# Extract US-style phone numbers from a block of text and print them.
# Recognized shapes: (123) 456-7890, (123)456-7890, 123-456-7890, 1234567890.
# The pattern is not anchored to word boundaries, so a number embedded in a
# longer string (e.g. after a "+1-" prefix) is still reported.
# Globals:   CYAN, GREEN, YELLOW, NC (read) - color escape sequences
# Arguments: $1 - text to scan (may span several lines)
# Outputs:   a heading, one line per number, a summary line and a blank
#            line, all on stdout
# Returns:   0
extract_phone_numbers() {
    local text="$1"

    printf '%b\n' "${CYAN}Extracting phone numbers...${NC}"

    # Three alternatives wrapped in one group, so group 1 is the whole match.
    local phone_pattern='(\([0-9]{3}\) ?[0-9]{3}-[0-9]{4}|[0-9]{3}-[0-9]{3}-[0-9]{4}|[0-9]{10})'

    local count=0
    local phone
    while [[ "$text" =~ $phone_pattern ]]; do
        phone="${BASH_REMATCH[1]}"
        printf ' %b→%b %s\n' "$GREEN" "$NC" "$phone"

        # Consume the text through this match before searching again.
        text="${text#*"$phone"}"

        # Deliberately not ((count++)): that evaluates to 0 on the first
        # pass, returns status 1, and aborts the script under `set -e`.
        count=$((count + 1))
    done

    if ((count == 0)); then
        printf ' %b(no phone numbers found)%b\n' "$YELLOW" "$NC"
    else
        printf ' Found %b%s%b phone number(s)\n' "$GREEN" "$count" "$NC"
    fi
    echo
}
199
200# ============================================================================
201# Demo Section
202# ============================================================================
203
# Demo: show a sample paragraph containing http, https and ftp links, then
# hand it to extract_urls.
demo_url_extraction() {
    local input
    input="Check out https://example.com and http://api.example.org/v1/data
Also visit https://github.com/user/repo for more info.
FTP not supported: ftp://old.server.com"

    echo -e "\n${BLUE}=== URL Extraction Demo ===${NC}\n"

    # Label, the sample itself, then one blank separator line.
    printf '%s\n%s\n\n' "Sample text:" "$input"

    extract_urls "$input"
}
217
# Demo: show a sample paragraph containing several addresses (one of them
# host-only), then hand it to extract_emails.
demo_email_extraction() {
    local input
    input="Contact john.doe@example.com or support@company.co.uk
For sales, reach out to sales@example.com
Admin email: admin@localhost"

    echo -e "${BLUE}=== Email Extraction Demo ===${NC}\n"

    # Label, the sample itself, then one blank separator line.
    printf '%s\n%s\n\n' "Sample text:" "$input"

    extract_emails "$input"
}
231
# Demo: run parse_log_line over three well-formed log lines and one
# deliberately malformed line, printing a blank line after each result.
# Globals:   BLUE, NC (read) - color escape sequences
# Outputs:   demo heading and whatever parse_log_line prints, on stdout
# Returns:   0
demo_log_parsing() {
    echo -e "${BLUE}=== Log Line Parsing Demo ===${NC}\n"

    local -a log_lines=(
        "[2024-01-15 10:30:45] ERROR: Connection timeout"
        "[2024-01-15 10:31:12] INFO: Retrying connection"
        "[2024-01-15 10:31:15] WARN: High memory usage detected"
        "Invalid log format without timestamp"
    )

    # Declared local so the loop variable does not leak into the caller.
    local log
    for log in "${log_lines[@]}"; do
        # The last sample is malformed on purpose, so parse_log_line returns
        # non-zero for it. That failure is part of the demo: absorb the status
        # so `set -e` does not terminate the whole script here.
        parse_log_line "$log" || true
        echo
    done
}
247
# Demo: run parse_csv_line over a plain line, a line with quoted commas and
# a line with doubled-quote escapes.
# Globals:   BLUE, NC (read) - color escape sequences
# Outputs:   demo heading and whatever parse_csv_line prints, on stdout
# Returns:   status of the last parse_csv_line call
demo_csv_parsing() {
    echo -e "${BLUE}=== CSV Parsing Demo ===${NC}\n"

    # Single-quoted so the embedded double quotes reach the parser untouched.
    local -a csv_lines=(
        'John,Doe,30,Engineer'
        '"Smith, Jane",Manager,"San Francisco, CA",45'
        'Bob,"Quote ""test"" here",Developer,35'
    )

    # Declared local so the loop variable does not leak into the caller.
    local csv
    for csv in "${csv_lines[@]}"; do
        parse_csv_line "$csv"
    done
}
261
# Demo: show a small configuration snippet (a comment line plus spaced and
# unspaced assignments), then hand it to extract_key_values.
demo_key_value_extraction() {
    local sample
    sample="# Configuration file
database_host = localhost
database_port = 5432
max_connections=100
timeout = 30
debug_mode = true"

    echo -e "${BLUE}=== Key-Value Extraction Demo ===${NC}\n"

    # Label, the snippet itself, then one blank separator line.
    printf '%s\n%s\n\n' "Sample configuration:" "$sample"

    extract_key_values "$sample"
}
278
# Demo: show a sample paragraph with phone numbers in several notations,
# then hand it to extract_phone_numbers.
demo_phone_extraction() {
    local input
    input="Call us at (555) 123-4567 or 555-987-6543
Mobile: 5551234567
International numbers not supported: +1-555-123-4567"

    echo -e "${BLUE}=== Phone Number Extraction Demo ===${NC}\n"

    # Label, the sample itself, then one blank separator line.
    printf '%s\n%s\n\n' "Sample text:" "$input"

    extract_phone_numbers "$input"
}
292
293# ============================================================================
294# Main Execution
295# ============================================================================
296
# Script entry point: print the title banner, run each demo in a fixed
# order, then print the completion line.
main() {
    # Banner rows, each wrapped in the same color when printed.
    local -a banner=(
        "╔════════════════════════════════════════════╗"
        "║ Data Extraction with Regex Demo ║"
        "╚════════════════════════════════════════════╝"
    )
    local row
    for row in "${banner[@]}"; do
        echo -e "${BLUE}${row}${NC}"
    done

    # Demo functions, executed in the order listed.
    local -a demos=(
        demo_url_extraction
        demo_email_extraction
        demo_log_parsing
        demo_csv_parsing
        demo_key_value_extraction
        demo_phone_extraction
    )
    local step
    for step in "${demos[@]}"; do
        "$step"
    done

    echo -e "${GREEN}=== All Extraction Demos Complete ===${NC}\n"
}
311
312main "$@"