1#!/usr/bin/env bash
2set -euo pipefail
3
4# Health Check and Alerting Script
5# Monitors system resources and services, sends alerts on failures
6# Designed to run from cron for continuous monitoring
7
8# ============================================================================
9# Configuration
10# ============================================================================
11
# Absolute path of the directory holding this script (not referenced by the
# rest of this file).
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# Every setting below keeps a non-empty value inherited from the environment
# and falls back to the default shown otherwise.

# Where log lines are appended, and where already-sent alert keys are kept.
: "${LOG_FILE:=/var/log/health_check.log}"
: "${STATE_FILE:=/tmp/health_check.state}"

# Alert thresholds (percent); a check fails when usage is >= the threshold.
: "${CPU_THRESHOLD:=85}"
: "${MEMORY_THRESHOLD:=90}"
: "${DISK_THRESHOLD:=90}"

# Services to check: comma-separated process names.
: "${REQUIRED_PROCESSES:=sshd}"

# HTTP endpoints to check: ';'-separated entries of the form name,url,timeout
# Example: "API,http://localhost:8080/health,5;Frontend,http://localhost:3000,3"
: "${HTTP_ENDPOINTS:=}"

# Webhook URL for alerts (Slack-compatible); empty disables notifications.
: "${WEBHOOK_URL:=}"

# Platform detection: kernel name as printed by `uname -s`.
OS_TYPE=$(uname -s)
33
34# ============================================================================
35# Logging Functions
36# ============================================================================
37
# Write one timestamped log line to stdout and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label (e.g. INFO); remaining args form the message
# Outputs:   "[YYYY-MM-DD HH:MM:SS] [LEVEL] message" on stdout and in LOG_FILE
# Returns:   0 even when LOG_FILE cannot be written
log() {
  local level=$1
  shift
  local message="$*"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  local line="[$timestamp] [$level] $message"

  printf '%s\n' "$line"
  # Logging is best-effort: with `set -eo pipefail` a failing append would
  # otherwise abort the whole health check at its first log line. The shell
  # still reports the redirection error on stderr, so the failure is visible.
  printf '%s\n' "$line" >> "$LOG_FILE" || true
}
47
# Emit a message at INFO level; every argument becomes part of the message.
log_info() { log INFO "$@"; }
51
# Emit a message at WARNING level; every argument becomes part of the message.
log_warning() { log WARNING "$@"; }
55
# Emit a message at ERROR level; every argument becomes part of the message.
log_error() { log ERROR "$@"; }
59
# Emit a message at SUCCESS level; every argument becomes part of the message.
log_success() { log SUCCESS "$@"; }
63
64# ============================================================================
65# State Management
66# ============================================================================
67
# Report whether an alert has already been recorded for an issue.
# Globals:   STATE_FILE (read)
# Arguments: $1 - alert key (e.g. cpu_high, process_sshd)
# Returns:   0 if the key is present in STATE_FILE; non-zero otherwise,
#            including when the state file does not exist
has_alerted() {
  local key=$1

  [[ -f "$STATE_FILE" ]] || return 1

  # -F: treat the key literally (keys embed process/endpoint names that may
  #     contain regex metacharacters such as '.').
  # -x: only a whole-line match counts.
  grep -qxF -- "$key" "$STATE_FILE"
}
78
# Record that an alert has been sent by appending its key to STATE_FILE.
# The directory that holds STATE_FILE is created first when it is missing.
# Arguments: $1 - alert key
mark_alerted() {
  local alert_id=$1
  local state_dir
  state_dir=$(dirname "$STATE_FILE")

  mkdir -p "$state_dir"
  echo "$alert_id" >> "$STATE_FILE"
}
85
# Forget a previously recorded alert so that it can fire again later.
# Globals:   STATE_FILE (read, rewritten)
# Arguments: $1 - alert key to remove
# Returns:   0 when the key was removed, was not present, or no state file
#            exists; 0 as well when the state file could not be read (the
#            file is then left untouched and a note is printed on stderr)
clear_alert() {
  local key=$1

  if [[ -f "$STATE_FILE" ]]; then
    local tmp_file="${STATE_FILE}.tmp"
    local rc=0

    # -F/-x: drop only lines that are exactly this key, taken literally, so
    # that e.g. "process_a.b" does not also remove "process_axb".
    grep -vxF -- "$key" "$STATE_FILE" > "$tmp_file" || rc=$?

    # grep exits 1 when nothing is left to print (every line was removed);
    # that is fine. Anything above 1 is a real error: keep the old state
    # rather than replacing it with a truncated file.
    if (( rc > 1 )); then
      printf 'clear_alert: cannot read %s, state left unchanged\n' \
        "$STATE_FILE" >&2
      rm -f -- "$tmp_file"
      return 0
    fi

    mv -- "$tmp_file" "$STATE_FILE"
  fi
}
94
95# ============================================================================
96# Metric Collection
97# ============================================================================
98
# Print the current CPU usage as an integer percentage.
# Globals:   OS_TYPE (read)
# Outputs:   usage percent on stdout; "0" on unsupported platforms, when top
#            is unavailable on Linux, or when no CPU summary line is found
get_cpu_usage() {
  case "$OS_TYPE" in
    Linux)
      if command -v top &> /dev/null; then
        # Two iterations: the last CPU summary line covers the sampling
        # interval. LC_ALL=C forces '.' as the decimal separator.
        # The idle figure is located by its "id" label rather than by field
        # position, because top prints "ni,100.0 id," with no space after
        # the comma when idle reaches 100.0, which shifts the fields.
        LC_ALL=C top -bn2 -d 0.1 | awk '
          /^%?Cpu/ { line = $0 }
          END {
            if (match(line, /[0-9.]+%?[ ]*id/)) {
              print int(100 - (substr(line, RSTART, RLENGTH) + 0))
            } else {
              print 0
            }
          }'
      else
        echo "0"
      fi
      ;;
    Darwin)
      # "CPU usage: 3.27% user, 5.15% sys, 91.56% idle" -> user + sys
      top -l 2 -n 0 -F | grep 'CPU usage' | tail -n1 | awk '{print int($3 + $5)}'
      ;;
    *)
      echo "0"
      ;;
  esac
}
112
# Print the current memory usage as an integer percentage.
# Globals:   OS_TYPE (read)
# Outputs:   usage percent on stdout; "0" on unsupported platforms, when
#            /proc/meminfo is unreadable, or when the total is zero
get_memory_usage() {
  case "$OS_TYPE" in
    Linux)
      if [[ -r /proc/meminfo ]]; then
        # One pass over the file. When MemAvailable is absent (older
        # kernels), approximate it as MemFree + Buffers + Cached instead of
        # failing on an empty operand.
        awk '
          $1 == "MemTotal:"     { total = $2 }
          $1 == "MemAvailable:" { avail = $2; have_avail = 1 }
          $1 == "MemFree:"      { mem_free = $2 }
          $1 == "Buffers:"      { buffers = $2 }
          $1 == "Cached:"       { cached = $2 }
          END {
            if (!have_avail) {
              avail = mem_free + buffers + cached
            }
            if (total > 0) {
              print int((total - avail) * 100 / total)
            } else {
              print 0
            }
          }' /proc/meminfo
      else
        echo "0"
      fi
      ;;
    Darwin)
      # A single vm_stat call so all counters come from the same snapshot.
      # Counts are printed with a trailing '.', which awk ignores when it
      # converts the field to a number.
      vm_stat | awk '
        /^Pages free/     { free_pages = $3 }
        /^Pages active/   { active = $3 }
        /^Pages inactive/ { inactive = $3 }
        /^Pages wired/    { wired = $4 }
        END {
          used = active + inactive + wired
          total = used + free_pages
          if (total > 0) {
            print int(used * 100 / total)
          } else {
            print 0
          }
        }'
      ;;
    *)
      echo "0"
      ;;
  esac
}
145
# Print the usage of the root filesystem as an integer percentage.
# Outputs: the capacity column of `df` for "/", without the '%' sign
get_disk_usage() {
  # -P selects the POSIX output format: exactly one line per filesystem, so
  # a long device name cannot push the figures onto a third line.
  df -P / | awk 'NR==2 {print int($5)}'
}
149
150# ============================================================================
151# Health Checks
152# ============================================================================
153
# Compare the current CPU usage with CPU_THRESHOLD.
# At or above the threshold: log, send an alert and record it, unless the
# alert is already recorded. Below it: log and clear any recorded alert.
# Returns: 0 when usage is below the threshold, 1 otherwise
check_cpu() {
  local alert_key="cpu_high"
  local cpu_pct
  cpu_pct=$(get_cpu_usage)

  # Healthy path first.
  if ! [[ $cpu_pct -ge $CPU_THRESHOLD ]]; then
    log_info "CPU usage is normal: ${cpu_pct}%"
    clear_alert "$alert_key"
    return 0
  fi

  # Over the threshold: notify once until the alert is cleared again.
  has_alerted "$alert_key" || {
    log_error "CPU usage is high: ${cpu_pct}%"
    send_alert "CPU Alert" "CPU usage is ${cpu_pct}% (threshold: ${CPU_THRESHOLD}%)" "danger"
    mark_alerted "$alert_key"
  }
  return 1
}
172
# Compare the current memory usage with MEMORY_THRESHOLD.
# At or above the threshold: log, send an alert and record it, unless the
# alert is already recorded. Below it: log and clear any recorded alert.
# Returns: 0 when usage is below the threshold, 1 otherwise
check_memory() {
  local alert_key="memory_high"
  local mem_pct
  mem_pct=$(get_memory_usage)

  # Healthy path first.
  if ! [[ $mem_pct -ge $MEMORY_THRESHOLD ]]; then
    log_info "Memory usage is normal: ${mem_pct}%"
    clear_alert "$alert_key"
    return 0
  fi

  # Over the threshold: notify once until the alert is cleared again.
  has_alerted "$alert_key" || {
    log_error "Memory usage is high: ${mem_pct}%"
    send_alert "Memory Alert" "Memory usage is ${mem_pct}% (threshold: ${MEMORY_THRESHOLD}%)" "danger"
    mark_alerted "$alert_key"
  }
  return 1
}
191
# Compare the current disk usage with DISK_THRESHOLD.
# At or above the threshold: log, send a "warning" alert and record it,
# unless the alert is already recorded. Below it: log and clear any
# recorded alert.
# Returns: 0 when usage is below the threshold, 1 otherwise
check_disk() {
  local alert_key="disk_full"
  local disk_pct
  disk_pct=$(get_disk_usage)

  # Healthy path first.
  if ! [[ $disk_pct -ge $DISK_THRESHOLD ]]; then
    log_info "Disk usage is normal: ${disk_pct}%"
    clear_alert "$alert_key"
    return 0
  fi

  # Over the threshold: notify once until the alert is cleared again.
  has_alerted "$alert_key" || {
    log_error "Disk usage is high: ${disk_pct}%"
    send_alert "Disk Alert" "Disk usage is ${disk_pct}% (threshold: ${DISK_THRESHOLD}%)" "warning"
    mark_alerted "$alert_key"
  }
  return 1
}
210
# Verify that a required process is running, using an exact-name pgrep match.
# Missing: log, send an alert and record it, unless the alert is already
# recorded. Running: log and clear any recorded alert.
# Arguments: $1 - process name
# Returns:   0 when pgrep finds the process, 1 otherwise
check_process() {
  local proc=$1
  local alert_key="process_${proc}"

  # Failure path first.
  if ! pgrep -x "$proc" > /dev/null; then
    has_alerted "$alert_key" || {
      log_error "Process is not running: $proc"
      send_alert "Process Alert" "Required process '$proc' is not running" "danger"
      mark_alerted "$alert_key"
    }
    return 1
  fi

  log_info "Process is running: $proc"
  clear_alert "$alert_key"
  return 0
}
228
# Probe an HTTP endpoint with curl and alert when it does not answer with a
# successful status (curl --fail) within the timeout.
# Arguments: $1 - endpoint name (used in messages and in the alert key)
#            $2 - URL to request
#            $3 - timeout in seconds; optional. Empty or non-numeric values
#                 fall back to 5 seconds.
# Returns:   0 when the endpoint is healthy or curl is not installed (the
#            check is skipped); 1 when the request fails
check_http_endpoint() {
  local name=$1
  local url=$2
  local timeout=${3:-}
  local alert_key="http_${name}"
  local -r default_timeout=5
  local -r numeric_re='^[0-9]+([.][0-9]+)?$'

  # An entry such as "API,http://host/health" carries no timeout. Passing
  # the empty string to --max-time makes curl fail, which would be reported
  # as an unhealthy endpoint.
  timeout=${timeout//[[:space:]]/}
  if [[ ! $timeout =~ $numeric_re ]]; then
    if [[ -n $timeout ]]; then
      log_warning "Invalid timeout '$timeout' for $name, using ${default_timeout}s"
    fi
    timeout=$default_timeout
  fi

  if command -v curl &> /dev/null; then
    if curl --silent --fail --max-time "$timeout" "$url" > /dev/null 2>&1; then
      log_info "HTTP endpoint is healthy: $name ($url)"
      clear_alert "$alert_key"
      return 0
    else
      # Notify once until the endpoint recovers and the alert is cleared.
      if ! has_alerted "$alert_key"; then
        log_error "HTTP endpoint is unhealthy: $name ($url)"
        send_alert "HTTP Alert" "Endpoint '$name' at $url is not responding" "danger"
        mark_alerted "$alert_key"
      fi
      return 1
    fi
  else
    log_warning "curl not available, skipping HTTP check for $name"
    return 0
  fi
}
253
254# ============================================================================
255# Alerting
256# ============================================================================
257
# Escape a string so it can be placed between double quotes in JSON.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Post an alert to the Slack-compatible webhook in WEBHOOK_URL.
# Globals:   WEBHOOK_URL (read)
# Arguments: $1 - alert title
#            $2 - alert text
#            $3 - attachment color: good, warning (default) or danger
# Returns:   0 when WEBHOOK_URL is empty (notification skipped); otherwise
#            the status of the final log call. A failed delivery is logged,
#            not returned as an error.
send_alert() {
  local title=$1
  local message=$2
  local color=${3:-warning} # good, warning, danger

  if [[ -z "$WEBHOOK_URL" ]]; then
    log_warning "WEBHOOK_URL not set, skipping alert notification"
    return 0
  fi

  # Slack-compatible JSON payload. Every string is escaped: titles and
  # messages embed process names, endpoint names and URLs, and a stray quote
  # or backslash would otherwise produce invalid JSON.
  local payload
  printf -v payload \
    '{"attachments":[{"color":"%s","title":"%s","text":"%s","footer":"%s","ts":%s}]}' \
    "$(_json_escape "$color")" \
    "$(_json_escape "$title")" \
    "$(_json_escape "$message")" \
    "$(_json_escape "Health Check on $(hostname)")" \
    "$(date +%s)"

  if command -v curl &> /dev/null; then
    if curl --silent --fail \
      --max-time 10 \
      -X POST \
      -H 'Content-Type: application/json' \
      -d "$payload" \
      "$WEBHOOK_URL" > /dev/null 2>&1; then
      log_info "Alert sent successfully"
    else
      log_error "Failed to send alert via webhook"
    fi
  else
    log_warning "curl not available, cannot send webhook alert"
  fi
}
300
301# ============================================================================
302# Main Execution
303# ============================================================================
304
# Run every configured health check, log a summary and exit.
# Globals:   REQUIRED_PROCESSES (read) - comma-separated process names
#            HTTP_ENDPOINTS (read)     - ';'-separated name,url,timeout items
# Exits:     0 when every check passed, 1 when at least one failed
main() {
  log_info "========== Health Check Started =========="

  local checks_passed=0
  local checks_failed=0
  local check process endpoint name url timeout
  local -a processes=() endpoints=()

  # Counters use plain assignment on purpose: `((n++))` returns status 1
  # when n is 0, which under `set -e` would end the script at the first
  # check.

  # System resource checks
  for check in check_cpu check_memory check_disk; do
    if "$check"; then
      checks_passed=$((checks_passed + 1))
    else
      checks_failed=$((checks_failed + 1))
    fi
  done

  # Process checks
  if [[ -n "$REQUIRED_PROCESSES" ]]; then
    IFS=',' read -ra processes <<< "$REQUIRED_PROCESSES"
    for process in "${processes[@]}"; do
      # Accept "sshd, nginx": strip blanks around each name, skip empties.
      process="${process#"${process%%[![:space:]]*}"}"
      process="${process%"${process##*[![:space:]]}"}"
      [[ -n "$process" ]] || continue

      if check_process "$process"; then
        checks_passed=$((checks_passed + 1))
      else
        checks_failed=$((checks_failed + 1))
      fi
    done
  fi

  # HTTP endpoint checks
  if [[ -n "$HTTP_ENDPOINTS" ]]; then
    IFS=';' read -ra endpoints <<< "$HTTP_ENDPOINTS"
    for endpoint in "${endpoints[@]}"; do
      [[ -n "$endpoint" ]] || continue

      IFS=',' read -r name url timeout <<< "$endpoint"
      if check_http_endpoint "$name" "$url" "$timeout"; then
        checks_passed=$((checks_passed + 1))
      else
        checks_failed=$((checks_failed + 1))
      fi
    done
  fi

  # Summary
  log_info "========== Health Check Complete =========="
  log_info "Checks passed: $checks_passed"
  log_info "Checks failed: $checks_failed"

  if [[ $checks_failed -eq 0 ]]; then
    log_success "All health checks passed"
    exit 0
  else
    log_error "Some health checks failed"
    exit 1
  fi
}
368
369main "$@"