extract.sh

Download
bash 313 lines 8.3 KB
  1#!/usr/bin/env bash
  2set -euo pipefail
  3
  4# Data Extraction Using Regex
  5# Demonstrates extracting structured data from text using bash regex patterns
  6
  7# ============================================================================
  8# Color definitions for output
  9# ============================================================================
 10
 11readonly RED='\033[0;31m'
 12readonly GREEN='\033[0;32m'
 13readonly YELLOW='\033[1;33m'
 14readonly BLUE='\033[0;34m'
 15readonly CYAN='\033[0;36m'
 16readonly NC='\033[0m' # No Color
 17
 18# ============================================================================
 19# Extraction Functions
 20# ============================================================================
 21
 22# Extract all URLs from text
 23extract_urls() {
 24    local text="$1"
 25
 26    echo -e "${CYAN}Extracting URLs...${NC}"
 27
 28    # URL pattern: http(s)://...
 29    local url_pattern='https?://[a-zA-Z0-9./?=_%:-]*'
 30
 31    local count=0
 32    while [[ "$text" =~ $url_pattern ]]; do
 33        local url="${BASH_REMATCH[0]}"
 34        echo -e "  ${GREEN}${NC} $url"
 35
 36        # Remove matched URL from text to find next one
 37        text="${text#*"$url"}"
 38        ((count++))
 39    done
 40
 41    if [[ $count -eq 0 ]]; then
 42        echo -e "  ${YELLOW}(no URLs found)${NC}"
 43    else
 44        echo -e "  Found ${GREEN}$count${NC} URL(s)"
 45    fi
 46    echo
 47}
 48
 49# Extract all email addresses from text
 50extract_emails() {
 51    local text="$1"
 52
 53    echo -e "${CYAN}Extracting email addresses...${NC}"
 54
 55    # Email pattern
 56    local email_pattern='[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
 57
 58    local count=0
 59    while [[ "$text" =~ $email_pattern ]]; do
 60        local email="${BASH_REMATCH[0]}"
 61        echo -e "  ${GREEN}${NC} $email"
 62
 63        # Remove matched email to find next one
 64        text="${text#*"$email"}"
 65        ((count++))
 66    done
 67
 68    if [[ $count -eq 0 ]]; then
 69        echo -e "  ${YELLOW}(no emails found)${NC}"
 70    else
 71        echo -e "  Found ${GREEN}$count${NC} email(s)"
 72    fi
 73    echo
 74}
 75
 76# Parse structured log line into components
 77parse_log_line() {
 78    local log_line="$1"
 79
 80    # Log pattern: [timestamp] level: message
 81    # Example: [2024-01-15 10:30:45] ERROR: Connection failed
 82    local log_pattern='^\[([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})\] ([A-Z]+): (.*)$'
 83
 84    if [[ "$log_line" =~ $log_pattern ]]; then
 85        local timestamp="${BASH_REMATCH[1]}"
 86        local level="${BASH_REMATCH[2]}"
 87        local message="${BASH_REMATCH[3]}"
 88
 89        echo -e "${GREEN}${NC} Successfully parsed log line:"
 90        echo -e "  Timestamp: ${CYAN}$timestamp${NC}"
 91        echo -e "  Level:     ${YELLOW}$level${NC}"
 92        echo -e "  Message:   $message"
 93        return 0
 94    else
 95        echo -e "${RED}${NC} Failed to parse log line: $log_line"
 96        return 1
 97    fi
 98}
 99
#######################################
# Split a single CSV line into fields and print them, numbered from 1.
# Simple parser for demonstration. Handles:
#   field1,"field with, comma","field with ""quotes"""
# Globals:   CYAN, GREEN, NC (read) - colour escape strings
# Arguments: $1 - one CSV line (no embedded newlines are handled)
# Outputs:   the input line, the field count, one line per field, then a
#            blank line
# Returns:   0
#######################################
parse_csv_line() {
    local csv_line="$1"
    local -a fields=()
    local field=""
    local in_quotes=false
    local char
    local i idx

    # The line and its fields are printed through %s so backslashes in the
    # data are shown verbatim rather than expanded as escape sequences.
    printf '%bParsing CSV line:%b %s\n' "$CYAN" "$NC" "$csv_line"

    for ((i = 0; i < ${#csv_line}; i++)); do
        char="${csv_line:i:1}"

        if [[ "$char" == '"' ]]; then
            if [[ "$in_quotes" != true ]]; then
                # Opening quote: start of a quoted section.
                in_quotes=true
            elif [[ "${csv_line:i+1:1}" == '"' ]]; then
                # Doubled quote inside a quoted section is an escaped literal
                # quote; consume the second character as well.
                field+='"'
                i=$((i + 1))
            else
                # Closing quote.
                in_quotes=false
            fi
        elif [[ "$char" == ',' && "$in_quotes" == false ]]; then
            # Unquoted comma terminates the current field.
            fields+=("$field")
            field=""
        else
            field+="$char"
        fi
    done

    # Whatever has accumulated after the final comma is the last field.
    fields+=("$field")

    printf '  %bExtracted %d field(s):%b\n' "$GREEN" "${#fields[@]}" "$NC"
    for idx in "${!fields[@]}"; do
        printf '    [%d] %s\n' "$((idx + 1))" "${fields[idx]}"
    done
    printf '\n'
}
148
#######################################
# Print the key/value pairs found in configuration-style text.
# Accepts both "key = value" and "key=value"; each input line is tested on
# its own and lines without a match are skipped.
# Globals:   CYAN, GREEN, YELLOW, NC (read) - colour escape strings
# Arguments: $1 - multi-line text to scan
# Outputs:   heading line, one "key = value" line per match, a notice when
#            nothing matched, then a blank line
# Returns:   0
#######################################
extract_key_values() {
    local text="$1"

    # Identifier, optional whitespace around '=', then a value that starts
    # with a non-space character and runs to the end of the line.
    local kv_pattern='([a-zA-Z_][a-zA-Z0-9_]*)[[:space:]]*=[[:space:]]*([^[:space:]].*)'

    # `line` is declared local so the read loop does not leak it to callers.
    local line key value
    local count=0

    printf '%bExtracting key-value pairs...%b\n' "$CYAN" "$NC"

    while IFS= read -r line; do
        if [[ "$line" =~ $kv_pattern ]]; then
            key="${BASH_REMATCH[1]}"
            value="${BASH_REMATCH[2]}"

            # %s keeps backslashes in values (e.g. Windows paths) verbatim.
            printf '  %b%s%b = %s\n' "$GREEN" "$key" "$NC" "$value"

            # Plain assignment instead of ((count++)): the post-increment
            # form returns status 1 while count is 0, which aborts the script
            # under `set -e`.
            count=$((count + 1))
        fi
    done <<< "$text"

    if ((count == 0)); then
        printf '  %b(no key-value pairs found)%b\n' "$YELLOW" "$NC"
    fi
    printf '\n'
}
173
#######################################
# Print every US-format phone number found in a block of text.
# Recognised shapes: (123) 456-7890, (123)456-7890, 123-456-7890, 1234567890
# Globals:   CYAN, GREEN, YELLOW, NC (read) - colour escape strings
# Arguments: $1 - text to scan
# Outputs:   heading line, one line per number in order of appearance, then
#            either "Found N phone number(s)" or "(no phone numbers found)",
#            then a blank line
# Returns:   0
#######################################
extract_phone_numbers() {
    local text="$1"

    # The three alternatives are wrapped in one group, so capture 1 holds the
    # whole matched number.
    local phone_pattern='(\([0-9]{3}\) ?[0-9]{3}-[0-9]{4}|[0-9]{3}-[0-9]{3}-[0-9]{4}|[0-9]{10})'
    local phone
    local count=0

    printf '%bExtracting phone numbers...%b\n' "$CYAN" "$NC"

    while [[ "$text" =~ $phone_pattern ]]; do
        phone="${BASH_REMATCH[1]}"

        # NOTE(review): nothing is printed between GREEN and NC; possibly a
        # marker glyph was lost at some point - confirm.
        printf '  %b%b %s\n' "$GREEN" "$NC" "$phone"

        # Remove the text up to and including this match to find the next one.
        text="${text#*"$phone"}"

        # Plain assignment instead of ((count++)): the post-increment form
        # returns status 1 while count is 0, which aborts the script under
        # `set -e`.
        count=$((count + 1))
    done

    if ((count == 0)); then
        printf '  %b(no phone numbers found)%b\n' "$YELLOW" "$NC"
    else
        printf '  Found %b%d%b phone number(s)\n' "$GREEN" "$count" "$NC"
    fi
    printf '\n'
}
199
200# ============================================================================
201# Demo Section
202# ============================================================================
203
# Demo: show a three-line sample and run extract_urls over it.
# Prints a section heading (preceded by a blank line), the sample text, and
# then whatever extract_urls prints for that sample.
demo_url_extraction() {
    local -a sample_lines=(
        'Check out https://example.com and http://api.example.org/v1/data'
        'Also visit https://github.com/user/repo for more info.'
        'FTP not supported: ftp://old.server.com'
    )

    # Join the lines with newlines, then trim the single trailing newline that
    # printf leaves behind.
    local sample_text
    printf -v sample_text '%s\n' "${sample_lines[@]}"
    sample_text="${sample_text%$'\n'}"

    printf '\n%b=== URL Extraction Demo ===%b\n\n' "$BLUE" "$NC"
    printf '%s\n' 'Sample text:' "$sample_text" ''

    extract_urls "$sample_text"
}
217
# Demo: show a three-line sample and run extract_emails over it.
# Prints a section heading, the sample text, and then whatever extract_emails
# prints for that sample.
demo_email_extraction() {
    # Build the sample one line at a time; lines are joined with newlines.
    local sample_text
    sample_text='Contact john.doe@example.com or support@company.co.uk'
    sample_text+=$'\n''For sales, reach out to sales@example.com'
    sample_text+=$'\n''Admin email: admin@localhost'

    printf '%b=== Email Extraction Demo ===%b\n\n' "$BLUE" "$NC"
    printf '%s\n' 'Sample text:' "$sample_text" ''

    extract_emails "$sample_text"
}
231
# Demo: feed three well-formed log lines and one malformed line to
# parse_log_line, printing a blank line after each result.
# Globals:   BLUE, NC (read) - colour escape strings
# Returns:   0, even though one sample is expected to fail parsing
demo_log_parsing() {
    local -a log_lines=(
        "[2024-01-15 10:30:45] ERROR: Connection timeout"
        "[2024-01-15 10:31:12] INFO: Retrying connection"
        "[2024-01-15 10:31:15] WARN: High memory usage detected"
        "Invalid log format without timestamp"
    )
    # Declared local so the loop variable does not leak into the caller.
    local log

    printf '%b=== Log Line Parsing Demo ===%b\n\n' "$BLUE" "$NC"

    for log in "${log_lines[@]}"; do
        # The last sample is malformed on purpose, so parse_log_line returns 1
        # for it. Tolerate that status explicitly; otherwise `set -e` would
        # abort the whole script before the remaining demos run.
        parse_log_line "$log" || true
        printf '\n'
    done
}
247
# Demo: run parse_csv_line over three samples - plain fields, quoted fields
# containing commas, and a quoted field containing doubled ("") quotes.
# Globals:   BLUE, NC (read) - colour escape strings
demo_csv_parsing() {
    local -a csv_lines=(
        'John,Doe,30,Engineer'
        '"Smith, Jane",Manager,"San Francisco, CA",45'
        'Bob,"Quote ""test"" here",Developer,35'
    )
    # Declared local so the loop variable does not leak into the caller.
    local csv

    printf '%b=== CSV Parsing Demo ===%b\n\n' "$BLUE" "$NC"

    for csv in "${csv_lines[@]}"; do
        parse_csv_line "$csv"
    done
}
261
# Demo: show a small configuration sample and run extract_key_values over it.
# Prints a section heading, the sample, and then whatever extract_key_values
# prints for that sample.
demo_key_value_extraction() {
    local -a config_lines=(
        '# Configuration file'
        'database_host = localhost'
        'database_port = 5432'
        'max_connections=100'
        'timeout = 30'
        'debug_mode = true'
    )

    # Join the lines with newlines, then trim the single trailing newline that
    # printf leaves behind.
    local config_text
    printf -v config_text '%s\n' "${config_lines[@]}"
    config_text="${config_text%$'\n'}"

    printf '%b=== Key-Value Extraction Demo ===%b\n\n' "$BLUE" "$NC"
    printf '%s\n' 'Sample configuration:' "$config_text" ''

    extract_key_values "$config_text"
}
278
# Demo: show a three-line sample and run extract_phone_numbers over it.
# Prints a section heading, the sample text, and then whatever
# extract_phone_numbers prints for that sample.
demo_phone_extraction() {
    # Build the sample one line at a time; lines are joined with newlines.
    local sample_text
    sample_text='Call us at (555) 123-4567 or 555-987-6543'
    sample_text+=$'\n''Mobile: 5551234567'
    sample_text+=$'\n''International numbers not supported: +1-555-123-4567'

    printf '%b=== Phone Number Extraction Demo ===%b\n\n' "$BLUE" "$NC"
    printf '%s\n' 'Sample text:' "$sample_text" ''

    extract_phone_numbers "$sample_text"
}
292
293# ============================================================================
294# Main Execution
295# ============================================================================
296
# Entry point: print the banner, run each demo in a fixed order, then print
# the closing line. Command-line arguments are accepted but not used.
main() {
    local -a banner=(
        '╔════════════════════════════════════════════╗'
        '║    Data Extraction with Regex Demo        ║'
        '╚════════════════════════════════════════════╝'
    )
    local -a demos=(
        demo_url_extraction
        demo_email_extraction
        demo_log_parsing
        demo_csv_parsing
        demo_key_value_extraction
        demo_phone_extraction
    )
    local row demo

    for row in "${banner[@]}"; do
        printf '%b%s%b\n' "$BLUE" "$row" "$NC"
    done

    # Each entry is the name of a demo function; call them one after another.
    for demo in "${demos[@]}"; do
        "$demo"
    done

    printf '%b=== All Extraction Demos Complete ===%b\n\n' "$GREEN" "$NC"
}

# Run the demos, forwarding the script's arguments unchanged.
main "$@"