1#!/usr/bin/env bash
2# =============================================================================
3# disaster_recovery.sh - Disaster Recovery Planning and Simulation Script
4#
5# PURPOSE: Demonstrates DR concepts including backup verification, service
6# health checks, RTO/RPO estimation, and recovery procedure documentation.
7#
8# USAGE:
9# ./disaster_recovery.sh [--check|--simulate|--report]
10#
11# MODES:
12# --check Run all health and backup checks (default)
13# --simulate Simulate a recovery scenario step-by-step
14# --report Generate a full DR readiness report
15#
16# CONCEPTS COVERED:
17# - RTO (Recovery Time Objective): max acceptable downtime
18# - RPO (Recovery Point Objective): max acceptable data loss window
19# - Backup integrity verification
20# - Service dependency mapping
21# =============================================================================
22
# Strict mode: abort on unhandled command failures (-e), on expansion of
# unset variables (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ---------------------------------------------------------------------------
# Color codes for visual severity feedback.
# Stored as literal backslash sequences; they only become real ANSI escape
# characters when printed through `echo -e` or printf.
# ---------------------------------------------------------------------------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
RESET='\033[0m'

# ---------------------------------------------------------------------------
# DR configuration. Every scalar setting can be overridden from the
# environment, e.g.  MAX_BACKUP_AGE_HOURS=6 ./disaster_recovery.sh --report
# The value after ":-" is the default used when the variable is unset/empty.
# ---------------------------------------------------------------------------
BACKUP_DIR="${BACKUP_DIR:-/var/backups}"                      # Where backups are stored
MAX_BACKUP_AGE_HOURS="${MAX_BACKUP_AGE_HOURS:-24}"            # RPO threshold in hours
MIN_BACKUP_SIZE_KB="${MIN_BACKUP_SIZE_KB:-100}"               # Sanity-check minimum size
CRITICAL_SERVICES=("ssh" "cron" "syslog")                     # Services that must be running
DB_BACKUP_PATH="${DB_BACKUP_PATH:-/tmp/dr_sim_db_backup.sql}" # Simulated DB backup path
ESTIMATED_RTO_MINUTES="${ESTIMATED_RTO_MINUTES:-60}"          # Target recovery time

# Running tallies of check outcomes.
PASS=0
WARN=0
FAIL=0
49
50# ---------------------------------------------------------------------------
51# Helper: print a labeled result line
52# ---------------------------------------------------------------------------
result() {
  # Print one color-coded result line and bump the matching global counter.
  # Globals:   GREEN, YELLOW, RED, RESET (read); PASS, WARN, FAIL (written)
  # Arguments: $1 - status: OK | WARN | FAIL
  #            $2 - message text
  # Outputs:   the result line on stdout
  # Returns:   always 0
  local status="$1"
  local message="$2"

  # Counters use VAR=$((VAR + 1)) rather than ((VAR++)): the latter yields
  # exit status 1 when the old value is 0, which aborts the script under
  # `set -e` and makes `cmd && result OK ... || result WARN ...` run both arms.
  case "$status" in
    OK)
      echo -e " [${GREEN}OK${RESET}] $message"
      PASS=$((PASS + 1))
      ;;
    WARN)
      echo -e " [${YELLOW}WARN${RESET}] $message"
      WARN=$((WARN + 1))
      ;;
    FAIL)
      echo -e " [${RED}FAIL${RESET}] $message"
      FAIL=$((FAIL + 1))
      ;;
    *)
      # Unknown statuses are still not counted; note the caller bug on stderr.
      printf 'result: unknown status "%s" ignored\n' "$status" >&2
      ;;
  esac
  return 0
}
62
section() {
  # Emit a blank spacer line, then a bold blue "=== <title> ===" banner.
  # Globals:   BOLD, BLUE, RESET (read)
  # Arguments: $1 - section title
  echo
  local title="$1"
  echo -e "${BOLD}${BLUE}=== ${title} ===${RESET}"
}
67
68# ---------------------------------------------------------------------------
69# CHECK 1: Backup Directory and File Integrity
70# DR concept: Backups are useless if they cannot be restored. Verify that
71# backup files exist, are recent enough to meet RPO, and are non-empty.
72# ---------------------------------------------------------------------------
check_backups() {
  # Check that the newest regular file directly inside BACKUP_DIR is recent
  # enough (age <= MAX_BACKUP_AGE_HOURS) and large enough
  # (size >= MIN_BACKUP_SIZE_KB).
  # Globals:   BACKUP_DIR (read; reassigned to /tmp when the directory does
  #            not exist), MAX_BACKUP_AGE_HOURS, MIN_BACKUP_SIZE_KB (read)
  # Outputs:   section header and result lines via section()/result()
  section "Backup Verification (RPO: ${MAX_BACKUP_AGE_HOURS}h)"

  if [[ ! -d "$BACKUP_DIR" ]]; then
    result WARN "Backup directory $BACKUP_DIR not found — using /tmp for simulation"
    BACKUP_DIR="/tmp"
  fi

  # Select the file with the latest modification time. Names are read
  # NUL-delimited so whitespace or newlines in file names cannot split entries.
  local newest="" candidate
  while IFS= read -r -d '' candidate; do
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest="$candidate"
    fi
  done < <(find "$BACKUP_DIR" -maxdepth 1 -type f -print0 2>/dev/null)

  if [[ -z "$newest" ]]; then
    result WARN "No backup files found in $BACKUP_DIR (simulation mode)"
    return 0
  fi

  local now mtime age_hours size_kb
  now=$(date +%s)
  # GNU stat understands -c%Y; BSD/macOS stat understands -f%m.
  mtime=$(stat -c%Y "$newest" 2>/dev/null || stat -f%m "$newest" 2>/dev/null) || mtime=""

  if [[ "$mtime" =~ ^[0-9]+$ ]]; then
    age_hours=$(( (now - mtime) / 3600 ))
    if (( age_hours <= MAX_BACKUP_AGE_HOURS )); then
      result OK "Most recent backup is ${age_hours}h old (within RPO)"
    else
      result FAIL "Most recent backup is ${age_hours}h old (exceeds ${MAX_BACKUP_AGE_HOURS}h RPO)"
    fi
  else
    # stat gave no usable timestamp (e.g. the file vanished mid-check).
    result WARN "Could not determine modification time of $newest"
  fi

  # A failing du counts as size 0 instead of aborting the whole run.
  size_kb=$(du -k "$newest" 2>/dev/null | cut -f1) || size_kb=""
  size_kb="${size_kb:-0}"
  if [[ $size_kb -ge $MIN_BACKUP_SIZE_KB ]]; then
    result OK "Backup file size ${size_kb} KB meets minimum threshold"
  else
    result WARN "Backup file size ${size_kb} KB is below minimum (${MIN_BACKUP_SIZE_KB} KB)"
  fi
}
100
101# ---------------------------------------------------------------------------
102# CHECK 2: Critical Service Health
103# DR concept: Know which services are essential before a disaster; document
104# their startup order and dependencies (the "service dependency map").
105# ---------------------------------------------------------------------------
check_services() {
  # Report whether each entry of CRITICAL_SERVICES is running.
  # Uses `systemctl is-active` when systemctl is available, otherwise falls
  # back to an exact process-name match with pgrep.
  # Globals:   CRITICAL_SERVICES (read)
  # Outputs:   section header and one result line per service
  section "Critical Service Health Checks"

  # Tool availability cannot change mid-loop, so probe it once.
  local use_systemctl=0
  if command -v systemctl &>/dev/null; then
    use_systemctl=1
  fi

  # Explicit if/else guarantees exactly one result per service; with
  # `cmd && result OK || result WARN` a non-zero return from the OK call
  # would trigger the WARN arm as well.
  local svc
  for svc in "${CRITICAL_SERVICES[@]}"; do
    if (( use_systemctl )); then
      if systemctl is-active --quiet "$svc" 2>/dev/null; then
        result OK "Service '$svc' is active"
      else
        result WARN "Service '$svc' is not active (may not exist in this environment)"
      fi
    else
      if pgrep -x "$svc" &>/dev/null; then
        result OK "Process '$svc' is running"
      else
        result WARN "Process '$svc' not found (may not apply to this OS)"
      fi
    fi
  done
}
122
123# ---------------------------------------------------------------------------
124# CHECK 3: Database Backup Verification
125# DR concept: Database backups require special handling — a file that exists
126# does not mean data is recoverable. Validate the dump format.
127# ---------------------------------------------------------------------------
check_database_backup() {
  # Write a two-line simulated PostgreSQL dump to DB_BACKUP_PATH, then verify
  # that the file exists and that its first line contains "PostgreSQL".
  # Globals:   DB_BACKUP_PATH (read; the file at that path is overwritten)
  # Outputs:   section header and result lines via section()/result()
  section "Database Backup Verification (PostgreSQL simulation)"

  # Simulate creating a pg_dump; real usage: pg_dump -Fc mydb > backup.dump
  # A failed write is recorded instead of aborting under `set -e`, so the
  # FAIL result below can actually be reported.
  local written=1
  {
    printf '%s\n' "-- PostgreSQL DR test dump $(date)"
    printf '%s\n' "-- Tables: users, orders, inventory"
  } > "$DB_BACKUP_PATH" || written=0

  if (( written )) && [[ -f "$DB_BACKUP_PATH" ]]; then
    result OK "Database backup file exists at $DB_BACKUP_PATH"

    # Check that the first line carries a recognizable header.
    local header=""
    IFS= read -r header < "$DB_BACKUP_PATH" || true
    if [[ "$header" == *PostgreSQL* ]]; then
      result OK "Backup file header matches expected format"
    else
      result WARN "Backup header format unexpected — verify with pg_restore --list"
    fi
  else
    result FAIL "Database backup not found — RPO violation risk"
  fi
}
147
148# ---------------------------------------------------------------------------
149# CHECK 4: Network Connectivity
150# DR concept: Recovery often requires network access to pull backups from
151# remote storage or reach a standby site.
152# ---------------------------------------------------------------------------
check_network() {
  # Ping two fixed IP addresses and verify that google.com resolves.
  # Outputs:   section header, one result line per ping target, and one
  #            result line for DNS
  section "Network Connectivity Checks"

  local targets=("8.8.8.8" "1.1.1.1")
  local target
  for target in "${targets[@]}"; do
    # One probe with a wait limit of 2 (the unit of -W depends on the ping
    # implementation). if/else ensures a single result per target.
    if ping -c 1 -W 2 "$target" &>/dev/null; then
      result OK "Reachable: $target"
    else
      result WARN "Unreachable: $target (check firewall / DNS)"
    fi
  done

  # Verify DNS resolution is functional; try `host` first, then `nslookup`.
  if host google.com &>/dev/null || nslookup google.com &>/dev/null; then
    result OK "DNS resolution is working"
  else
    result WARN "DNS resolution failed — recovery from remote storage may be impacted"
  fi
}
170
171# ---------------------------------------------------------------------------
172# CHECK 5: Disk Space
173# DR concept: Running out of disk space during recovery is a common failure
174# mode. Ensure the recovery target volume has sufficient headroom.
175# ---------------------------------------------------------------------------
check_disk_space() {
  # Classify every filesystem listed by df by its usage percentage:
  #   >= 90 -> FAIL,  >= 75 -> WARN,  otherwise OK.
  # Outputs:   section header and one result line per filesystem
  section "Disk Space Monitoring"

  # `df -P` selects the POSIX output format: one line per filesystem with
  # the capacity in field 5 and the mount point from field 6 onward.
  # Reading the remainder of the line into the last variable keeps mount
  # points that contain spaces intact.
  local use mount
  while read -r _ _ _ _ use mount; do
    use="${use%\%}"
    # Skips the header row and any entry without a numeric percentage.
    [[ "$use" =~ ^[0-9]+$ ]] || continue

    if [[ $use -ge 90 ]]; then
      result FAIL "Disk $mount at ${use}% — critical, recovery may fail"
    elif [[ $use -ge 75 ]]; then
      result WARN "Disk $mount at ${use}% — low space, monitor closely"
    else
      result OK "Disk $mount at ${use}% — sufficient free space"
    fi
  done < <(df -P)
}
191
192# ---------------------------------------------------------------------------
193# SIMULATE: Walk through recovery steps interactively
194# DR concept: A DR plan that has never been rehearsed is not a plan.
195# Tabletop exercises and simulated failovers validate procedures.
196# ---------------------------------------------------------------------------
simulate_recovery() {
  # Print a numbered walkthrough of a simulated database-host failure,
  # pausing briefly between steps, followed by the RTO estimate.
  # Globals:   YELLOW, GREEN, BOLD, RESET, ESTIMATED_RTO_MINUTES (read)
  # Outputs:   the walkthrough on stdout
  section "Recovery Simulation (Tabletop Exercise)"
  echo -e "${YELLOW}Simulating disaster scenario: Primary database host failure${RESET}"
  echo ""

  local steps=(
    "DETECT | Monitor alerts fire; on-call engineer paged via PagerDuty"
    "ASSESS | Confirm outage scope — DB host unreachable, app layer affected"
    "DECLARE | Incident declared; DR team assembled; stakeholders notified"
    "FAILOVER| Promote read-replica to primary (pg_promote / Route 53 update)"
    "RESTORE | If no replica: restore latest pg_dump to standby host"
    "VERIFY | Run smoke tests; confirm application connectivity"
    "CUTOVER | Update load balancer; route traffic to recovered instance"
    "MONITOR | Watch error rates and latency for 30 min post-recovery"
    "REVIEW | Schedule post-mortem within 48 h; update runbook"
  )

  # The array index doubles as the step counter. Color codes are passed as
  # %b arguments so the format string itself stays constant.
  local i
  for i in "${!steps[@]}"; do
    printf ' %bStep %d%b: %s\n' "$BOLD" "$((i + 1))" "$RESET" "${steps[i]}"
    sleep 0.2
  done

  echo ""
  echo -e " ${GREEN}Estimated RTO for this scenario: ${ESTIMATED_RTO_MINUTES} minutes${RESET}"
  echo -e " ${YELLOW}Actual RTO depends on backup restore speed (~1 GB/min typical)${RESET}"
}
225
226# ---------------------------------------------------------------------------
227# REPORT: Summarize all checks and DR readiness
228# ---------------------------------------------------------------------------
generate_report() {
  # Reset the tallies, run every check in order, then print the totals and
  # an overall readiness verdict.
  # Globals:   PASS, WARN, FAIL (reset here, then updated by the checks)
  # Exits:     terminates the script with status 1 when any check FAILed
  PASS=0
  WARN=0
  FAIL=0

  local check
  for check in \
    check_backups \
    check_services \
    check_database_backup \
    check_network \
    check_disk_space; do
    "$check"
  done

  section "DR Readiness Summary"
  echo -e " ${GREEN}PASS: $PASS${RESET} ${YELLOW}WARN: $WARN${RESET} ${RED}FAIL: $FAIL${RESET}"
  echo ""

  # Any failure is terminal: report it and stop with a non-zero status.
  if (( FAIL > 0 )); then
    echo -e " ${RED}${BOLD}DR READINESS: NOT READY — $FAIL critical issue(s) must be resolved${RESET}"
    exit 1
  fi

  if (( WARN > 0 )); then
    echo -e " ${YELLOW}${BOLD}DR READINESS: PARTIAL — review $WARN warning(s) before next drill${RESET}"
  else
    echo -e " ${GREEN}${BOLD}DR READINESS: GOOD — all checks passed${RESET}"
  fi
}
250
251# ---------------------------------------------------------------------------
252# Entry point
253# ---------------------------------------------------------------------------
main() {
  # Print a timestamped banner, then dispatch on the first argument.
  # Arguments: $1 - --check (default when missing or empty), --simulate,
  #                 or --report; anything else prints usage and exits 1
  echo -e "${BOLD}Disaster Recovery Script — $(date '+%Y-%m-%d %H:%M:%S')${RESET}"

  local mode="${1:---check}"

  if [[ "$mode" == "--check" ]]; then
    check_backups
    check_services
    check_database_backup
    check_network
    check_disk_space
  elif [[ "$mode" == "--simulate" ]]; then
    simulate_recovery
  elif [[ "$mode" == "--report" ]]; then
    generate_report
  else
    echo "Usage: $0 [--check|--simulate|--report]"
    exit 1
  fi
}

# Run the entry point, forwarding the script's arguments unchanged.
main "$@"