health_check.sh

Download
bash 370 lines 9.8 KB
  1#!/usr/bin/env bash
  2set -euo pipefail
  3
  4# Health Check and Alerting Script
  5# Monitors system resources and services, sends alerts on failures
  6# Designed to run from cron for continuous monitoring
  7
  8# ============================================================================
  9# Configuration
 10# ============================================================================
 11
 12SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 13LOG_FILE="${LOG_FILE:-/var/log/health_check.log}"
 14STATE_FILE="${STATE_FILE:-/tmp/health_check.state}"
 15
 16# Alert thresholds
 17CPU_THRESHOLD="${CPU_THRESHOLD:-85}"
 18MEMORY_THRESHOLD="${MEMORY_THRESHOLD:-90}"
 19DISK_THRESHOLD="${DISK_THRESHOLD:-90}"
 20
 21# Services to check (process names)
 22REQUIRED_PROCESSES="${REQUIRED_PROCESSES:-sshd}"
 23
 24# HTTP endpoints to check (format: name,url,timeout)
 25HTTP_ENDPOINTS="${HTTP_ENDPOINTS:-}"
 26# Example: "API,http://localhost:8080/health,5;Frontend,http://localhost:3000,3"
 27
 28# Webhook URL for alerts (Slack-compatible)
 29WEBHOOK_URL="${WEBHOOK_URL:-}"
 30
 31# Platform detection
 32OS_TYPE=$(uname -s)
 33
 34# ============================================================================
 35# Logging Functions
 36# ============================================================================
 37
 38log() {
 39    local level=$1
 40    shift
 41    local message="$*"
 42    local timestamp
 43    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
 44
 45    echo "[$timestamp] [$level] $message" | tee -a "$LOG_FILE"
 46}
 47
 48log_info() {
 49    log "INFO" "$@"
 50}
 51
 52log_warning() {
 53    log "WARNING" "$@"
 54}
 55
 56log_error() {
 57    log "ERROR" "$@"
 58}
 59
 60log_success() {
 61    log "SUCCESS" "$@"
 62}
 63
 64# ============================================================================
 65# State Management
 66# ============================================================================
 67
 68# Track if we've already alerted for this issue
 69has_alerted() {
 70    local key=$1
 71
 72    if [[ ! -f "$STATE_FILE" ]]; then
 73        return 1
 74    fi
 75
 76    grep -q "^${key}$" "$STATE_FILE"
 77}
 78
 79mark_alerted() {
 80    local key=$1
 81
 82    mkdir -p "$(dirname "$STATE_FILE")"
 83    echo "$key" >> "$STATE_FILE"
 84}
 85
 86clear_alert() {
 87    local key=$1
 88
 89    if [[ -f "$STATE_FILE" ]]; then
 90        grep -v "^${key}$" "$STATE_FILE" > "${STATE_FILE}.tmp" || true
 91        mv "${STATE_FILE}.tmp" "$STATE_FILE"
 92    fi
 93}
 94
 95# ============================================================================
 96# Metric Collection
 97# ============================================================================
 98
 99get_cpu_usage() {
100    if [[ "$OS_TYPE" == "Linux" ]]; then
101        if command -v top &> /dev/null; then
102            top -bn2 -d 0.1 | grep '^%Cpu' | tail -n1 | awk '{print int(100 - $8)}'
103        else
104            echo "0"
105        fi
106    elif [[ "$OS_TYPE" == "Darwin" ]]; then
107        top -l 2 -n 0 -F | grep 'CPU usage' | tail -n1 | awk '{print int($3 + $5)}'
108    else
109        echo "0"
110    fi
111}
112
# Print current memory usage as an integer percentage.
# Globals:   OS_TYPE (read)
# Outputs:   On Linux, (MemTotal - MemAvailable) * 100 / MemTotal from
#            /proc/meminfo, using MemFree + Buffers + Cached when the
#            MemAvailable line is absent; on Darwin,
#            (active + inactive + wired) * 100 / (those + free) from vm_stat;
#            "0" on other platforms or when the total is zero or unreadable.
get_memory_usage() {
    if [[ "$OS_TYPE" == "Linux" ]]; then
        if [[ -r /proc/meminfo ]]; then
            awk '
                $1 == "MemTotal:"     { total = $2 }
                $1 == "MemAvailable:" { avail = $2; have_avail = 1 }
                $1 == "MemFree:"      { free = $2 }
                $1 == "Buffers:"      { buffers = $2 }
                $1 == "Cached:"       { cached = $2 }
                END {
                    if (!have_avail) {
                        avail = free + buffers + cached
                    }
                    # Guard the division: a missing MemTotal would otherwise
                    # be a division by zero.
                    if (total > 0) {
                        print int((total - avail) * 100 / total)
                    } else {
                        print 0
                    }
                }' /proc/meminfo
        else
            echo "0"
        fi
    elif [[ "$OS_TYPE" == "Darwin" ]]; then
        # One vm_stat call so all counters come from the same snapshot.
        # Values carry a trailing "." which awk drops on numeric conversion.
        vm_stat | awk '
            /Pages free/     { free = $3 }
            /Pages active/   { active = $3 }
            /Pages inactive/ { inactive = $3 }
            /Pages wired/    { wired = $4 }
            END {
                used = active + inactive + wired
                total = used + free
                if (total > 0) {
                    print int(used * 100 / total)
                } else {
                    print 0
                }
            }'
    else
        echo "0"
    fi
}
145
# Print the usage of the root filesystem as an integer percentage.
# Outputs:   the capacity column of "df -P /" with the "%" sign dropped
get_disk_usage() {
    # -P selects the POSIX output format: exactly one line per filesystem,
    # so a long device name cannot push the figures onto a third line, and
    # the capacity is always the fifth column.
    df -P / | awk 'NR == 2 { print int($5) }'
}
149
150# ============================================================================
151# Health Checks
152# ============================================================================
153
# Compare CPU usage against CPU_THRESHOLD and alert once per incident.
# Globals:   CPU_THRESHOLD (read)
# Returns:   0 when usage is below the threshold (any recorded alert is
#            cleared); 1 when usage is at or above it. An alert is sent and
#            recorded only if none is recorded yet for key "cpu_high".
check_cpu() {
    local alert_key="cpu_high"
    local usage
    usage=$(get_cpu_usage)

    # Healthy path first.
    if ! [[ $usage -ge $CPU_THRESHOLD ]]; then
        log_info "CPU usage is normal: ${usage}%"
        clear_alert "$alert_key"
        return 0
    fi

    # Already reported: stay quiet until the alert has been cleared.
    if has_alerted "$alert_key"; then
        return 1
    fi

    log_error "CPU usage is high: ${usage}%"
    send_alert "CPU Alert" "CPU usage is ${usage}% (threshold: ${CPU_THRESHOLD}%)" "danger"
    mark_alerted "$alert_key"
    return 1
}
172
# Compare memory usage against MEMORY_THRESHOLD and alert once per incident.
# Globals:   MEMORY_THRESHOLD (read)
# Returns:   0 when usage is below the threshold (any recorded alert is
#            cleared); 1 when usage is at or above it. An alert is sent and
#            recorded only if none is recorded yet for key "memory_high".
check_memory() {
    local alert_key="memory_high"
    local usage
    usage=$(get_memory_usage)

    # Healthy path first.
    if ! [[ $usage -ge $MEMORY_THRESHOLD ]]; then
        log_info "Memory usage is normal: ${usage}%"
        clear_alert "$alert_key"
        return 0
    fi

    # Already reported: stay quiet until the alert has been cleared.
    if has_alerted "$alert_key"; then
        return 1
    fi

    log_error "Memory usage is high: ${usage}%"
    send_alert "Memory Alert" "Memory usage is ${usage}% (threshold: ${MEMORY_THRESHOLD}%)" "danger"
    mark_alerted "$alert_key"
    return 1
}
191
# Compare root disk usage against DISK_THRESHOLD and alert once per incident.
# Globals:   DISK_THRESHOLD (read)
# Returns:   0 when usage is below the threshold (any recorded alert is
#            cleared); 1 when usage is at or above it. An alert is sent and
#            recorded only if none is recorded yet for key "disk_full".
check_disk() {
    local alert_key="disk_full"
    local usage
    usage=$(get_disk_usage)

    # Healthy path first.
    if ! [[ $usage -ge $DISK_THRESHOLD ]]; then
        log_info "Disk usage is normal: ${usage}%"
        clear_alert "$alert_key"
        return 0
    fi

    # Already reported: stay quiet until the alert has been cleared.
    if has_alerted "$alert_key"; then
        return 1
    fi

    log_error "Disk usage is high: ${usage}%"
    send_alert "Disk Alert" "Disk usage is ${usage}% (threshold: ${DISK_THRESHOLD}%)" "warning"
    mark_alerted "$alert_key"
    return 1
}
210
# Verify that a process with exactly the given name is running.
# Arguments: $1 - process name, matched with "pgrep -x"
# Returns:   0 when pgrep finds the process (any recorded alert is cleared);
#            1 otherwise. An alert is sent and recorded only if none is
#            recorded yet for key "process_<name>".
check_process() {
    local process_name=$1
    local alert_key="process_${process_name}"

    # Healthy path first.
    if pgrep -x "$process_name" > /dev/null; then
        log_info "Process is running: $process_name"
        clear_alert "$alert_key"
        return 0
    fi

    # Already reported: stay quiet until the alert has been cleared.
    if has_alerted "$alert_key"; then
        return 1
    fi

    log_error "Process is not running: $process_name"
    send_alert "Process Alert" "Required process '$process_name' is not running" "danger"
    mark_alerted "$alert_key"
    return 1
}
228
# Probe an HTTP endpoint with curl and alert once per incident.
# Arguments: $1 - endpoint name (used in messages and in the alert key)
#            $2 - URL to request
#            $3 - timeout in seconds for curl --max-time; when empty or not
#                 a plain number, 5 seconds is used
# Returns:   0 when curl succeeds (--fail makes HTTP error statuses count as
#            failures) or when curl is not installed; 1 when the request
#            fails. An alert is sent and recorded only if none is recorded
#            yet for key "http_<name>".
check_http_endpoint() {
    local name=$1
    local url=$2
    local timeout=${3:-}
    local alert_key="http_${name}"
    local numeric_re='^[0-9]+([.][0-9]+)?$'

    # An endpoint given as "name,url" reaches this function with an empty
    # timeout; curl rejects an empty --max-time, which would otherwise be
    # reported as an outage of a healthy endpoint.
    if [[ ! "$timeout" =~ $numeric_re ]]; then
        timeout=5
    fi

    if ! command -v curl &> /dev/null; then
        log_warning "curl not available, skipping HTTP check for $name"
        return 0
    fi

    if curl --silent --fail --max-time "$timeout" "$url" > /dev/null 2>&1; then
        log_info "HTTP endpoint is healthy: $name ($url)"
        clear_alert "$alert_key"
        return 0
    fi

    if ! has_alerted "$alert_key"; then
        log_error "HTTP endpoint is unhealthy: $name ($url)"
        send_alert "HTTP Alert" "Endpoint '$name' at $url is not responding" "danger"
        mark_alerted "$alert_key"
    fi
    return 1
}
253
254# ============================================================================
255# Alerting
256# ============================================================================
257
# Escape a string for use inside a JSON string literal (private helper).
# Arguments: $1 - raw text
# Outputs:   the text with backslash, double quote, newline, carriage return
#            and tab replaced by their JSON escape sequences
_json_escape() {
    local s=$1
    # Backslashes first, so the escapes added below are not doubled.
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

# Post an alert to WEBHOOK_URL as a Slack-compatible JSON attachment.
# Globals:   WEBHOOK_URL (read)
# Arguments: $1 - title
#            $2 - message text
#            $3 - attachment color: good, warning or danger (default: warning)
# Returns:   0 when WEBHOOK_URL is empty or curl is unavailable (a warning is
#            logged); otherwise the status of the final log call. A failed
#            delivery is logged, not returned as an error.
send_alert() {
    local title=$1
    local message=$2
    local color=${3:-warning}  # good, warning, danger

    if [[ -z "$WEBHOOK_URL" ]]; then
        log_warning "WEBHOOK_URL not set, skipping alert notification"
        return 0
    fi

    if ! command -v curl &> /dev/null; then
        log_warning "curl not available, cannot send webhook alert"
        return 0
    fi

    # Titles and messages embed process names, endpoint names and URLs taken
    # from configuration; escape them so a quote or backslash cannot produce
    # an invalid payload.
    local json_color json_title json_text json_host
    json_color=$(_json_escape "$color")
    json_title=$(_json_escape "$title")
    json_text=$(_json_escape "$message")
    json_host=$(_json_escape "$(hostname)")

    # Slack-compatible JSON payload
    local payload
    payload=$(cat <<EOF
{
  "attachments": [
    {
      "color": "$json_color",
      "title": "$json_title",
      "text": "$json_text",
      "footer": "Health Check on $json_host",
      "ts": $(date +%s)
    }
  ]
}
EOF
)

    if curl --silent --fail \
            --max-time 10 \
            -X POST \
            -H 'Content-Type: application/json' \
            -d "$payload" \
            "$WEBHOOK_URL" > /dev/null 2>&1; then
        log_info "Alert sent successfully"
    else
        log_error "Failed to send alert via webhook"
    fi
}
300
301# ============================================================================
302# Main Execution
303# ============================================================================
304
# Run every configured health check, log a summary and exit.
# Globals:   REQUIRED_PROCESSES (read; comma-separated process names)
#            HTTP_ENDPOINTS (read; "name,url,timeout" entries separated by ";")
# Exits:     0 when every check passed, 1 when at least one check failed
main() {
    log_info "========== Health Check Started =========="

    local checks_passed=0
    local checks_failed=0
    local check process endpoint name url timeout
    local -a processes=()
    local -a endpoints=()

    # Counters are updated with plain assignments: "((counter++))" returns a
    # non-zero status whenever the old value is 0, which makes "set -e" abort
    # the script at the very first increment.

    # System resource checks
    for check in check_cpu check_memory check_disk; do
        if "$check"; then
            checks_passed=$((checks_passed + 1))
        else
            checks_failed=$((checks_failed + 1))
        fi
    done

    # Process checks
    if [[ -n "$REQUIRED_PROCESSES" ]]; then
        IFS=',' read -ra processes <<< "$REQUIRED_PROCESSES"
        for process in "${processes[@]}"; do
            # Ignore empty entries such as those produced by "a,,b".
            [[ -n "$process" ]] || continue
            if check_process "$process"; then
                checks_passed=$((checks_passed + 1))
            else
                checks_failed=$((checks_failed + 1))
            fi
        done
    fi

    # HTTP endpoint checks
    if [[ -n "$HTTP_ENDPOINTS" ]]; then
        IFS=';' read -ra endpoints <<< "$HTTP_ENDPOINTS"
        for endpoint in "${endpoints[@]}"; do
            # Ignore empty entries such as those produced by a trailing ";".
            [[ -n "$endpoint" ]] || continue
            IFS=',' read -r name url timeout <<< "$endpoint"
            if check_http_endpoint "$name" "$url" "$timeout"; then
                checks_passed=$((checks_passed + 1))
            else
                checks_failed=$((checks_failed + 1))
            fi
        done
    fi

    # Summary
    log_info "========== Health Check Complete =========="
    log_info "Checks passed: $checks_passed"
    log_info "Checks failed: $checks_failed"

    if [[ $checks_failed -eq 0 ]]; then
        log_success "All health checks passed"
        exit 0
    else
        log_error "Some health checks failed"
        exit 1
    fi
}
368
369main "$@"