disaster_recovery.sh

  1#!/usr/bin/env bash
  2# =============================================================================
  3# disaster_recovery.sh - Disaster Recovery Planning and Simulation Script
  4#
  5# PURPOSE: Demonstrates DR concepts including backup verification, service
  6#          health checks, RTO/RPO estimation, and recovery procedure documentation.
  7#
  8# USAGE:
  9#   ./disaster_recovery.sh [--check|--simulate|--report]
 10#
 11# MODES:
 12#   --check     Run all health and backup checks (default)
 13#   --simulate  Simulate a recovery scenario step-by-step
 14#   --report    Generate a full DR readiness report
 15#
 16# CONCEPTS COVERED:
 17#   - RTO (Recovery Time Objective): max acceptable downtime
 18#   - RPO (Recovery Point Objective): max acceptable data loss window
 19#   - Backup integrity verification
 20#   - Service dependency mapping
 21# =============================================================================
 22
 23set -euo pipefail
 24
 25# ---------------------------------------------------------------------------
 26# Color codes for visual severity feedback
 27# Using ANSI escape sequences that work on most terminals
 28# ---------------------------------------------------------------------------
 29RED='\033[0;31m'
 30GREEN='\033[0;32m'
 31YELLOW='\033[1;33m'
 32BLUE='\033[0;34m'
 33BOLD='\033[1m'
 34RESET='\033[0m'
 35
 36# ---------------------------------------------------------------------------
 37# DR Configuration — in a real environment these would be in a config file
 38# ---------------------------------------------------------------------------
 39BACKUP_DIR="${BACKUP_DIR:-/var/backups}"          # Where backups are stored
 40MAX_BACKUP_AGE_HOURS=24                            # RPO threshold in hours
 41MIN_BACKUP_SIZE_KB=100                             # Sanity-check minimum size
 42CRITICAL_SERVICES=("ssh" "cron" "syslog")         # Services that must be running
 43DB_BACKUP_PATH="/tmp/dr_sim_db_backup.sql"        # Simulated DB backup path
 44ESTIMATED_RTO_MINUTES=60                           # Target recovery time
 45
 46PASS=0
 47WARN=0
 48FAIL=0
 49
 50# ---------------------------------------------------------------------------
 51# Helper: print a labeled result line
 52# ---------------------------------------------------------------------------
 53result() {
 54    local status="$1"  # OK | WARN | FAIL
 55    local message="$2"
 56    case "$status" in
 57        OK)   echo -e "  [${GREEN}OK${RESET}]   $message"; ((PASS++)) ;;
 58        WARN) echo -e "  [${YELLOW}WARN${RESET}] $message"; ((WARN++)) ;;
 59        FAIL) echo -e "  [${RED}FAIL${RESET}] $message"; ((FAIL++)) ;;
 60    esac
 61}
 62
 63section() {
 64    echo ""
 65    echo -e "${BOLD}${BLUE}=== $1 ===${RESET}"
 66}
 67
 68# ---------------------------------------------------------------------------
 69# CHECK 1: Backup Directory and File Integrity
 70# DR concept: Backups are useless if they cannot be restored. Verify that
 71# backup files exist, are recent enough to meet RPO, and are non-empty.
 72# ---------------------------------------------------------------------------
 73check_backups() {
 74    section "Backup Verification (RPO: ${MAX_BACKUP_AGE_HOURS}h)"
 75
 76    if [[ ! -d "$BACKUP_DIR" ]]; then
 77        result WARN "Backup directory $BACKUP_DIR not found — using /tmp for simulation"
 78        BACKUP_DIR="/tmp"
 79    fi
 80
 81    # Find the most recent file in the backup directory
 82    local newest
 83    newest=$(find "$BACKUP_DIR" -maxdepth 1 -type f -newer /tmp 2>/dev/null | head -1 || true)
 84
 85    if [[ -z "$newest" ]]; then
 86        result WARN "No backup files found in $BACKUP_DIR (simulation mode)"
 87    else
 88        local age_hours=$(( ( $(date +%s) - $(stat -c%Y "$newest" 2>/dev/null || stat -f%m "$newest") ) / 3600 ))
 89        local size_kb=$(du -k "$newest" 2>/dev/null | cut -f1)
 90
 91        [[ $age_hours -le $MAX_BACKUP_AGE_HOURS ]] \
 92            && result OK "Most recent backup is ${age_hours}h old (within RPO)" \
 93            || result FAIL "Most recent backup is ${age_hours}h old (exceeds ${MAX_BACKUP_AGE_HOURS}h RPO)"
 94
 95        [[ ${size_kb:-0} -ge $MIN_BACKUP_SIZE_KB ]] \
 96            && result OK "Backup file size ${size_kb} KB meets minimum threshold" \
 97            || result WARN "Backup file size ${size_kb} KB is below minimum (${MIN_BACKUP_SIZE_KB} KB)"
 98    fi
 99}
100
# ---------------------------------------------------------------------------
# CHECK 2: Critical Service Health
# DR concept: Know which services are essential before a disaster; document
# their startup order and dependencies (the "service dependency map").
#
# Globals:   CRITICAL_SERVICES (read)
# Outputs:   a section banner plus exactly one result line per service
# ---------------------------------------------------------------------------
check_services() {
    section "Critical Service Health Checks"

    # Choose the probe once: systemctl when present, otherwise fall back to
    # matching a process by exact name with pgrep.
    local have_systemctl=0
    if command -v systemctl &>/dev/null; then
        have_systemctl=1
    fi

    # Explicit if/else instead of `cmd && result OK || result WARN`: in that
    # form the WARN branch also runs whenever the OK branch returns non-zero.
    local svc
    for svc in "${CRITICAL_SERVICES[@]}"; do
        if (( have_systemctl )); then
            if systemctl is-active --quiet "$svc" 2>/dev/null; then
                result OK "Service '$svc' is active"
            else
                result WARN "Service '$svc' is not active (may not exist in this environment)"
            fi
        else
            if pgrep -x "$svc" &>/dev/null; then
                result OK "Process '$svc' is running"
            else
                result WARN "Process '$svc' not found (may not apply to this OS)"
            fi
        fi
    done
}
122
# ---------------------------------------------------------------------------
# CHECK 3: Database Backup Verification
# DR concept: Database backups require special handling — a file that exists
# does not mean data is recoverable. Validate the dump format.
#
# Globals:   DB_BACKUP_PATH (read; the file at this path is overwritten with
#            a two-line simulated dump)
# Outputs:   a section banner plus result lines
# ---------------------------------------------------------------------------
check_database_backup() {
    section "Database Backup Verification (PostgreSQL simulation)"

    # Simulate creating a pg_dump; real usage: pg_dump -Fc mydb > backup.dump
    # The write is guarded so that an unwritable path is reported through the
    # checks below instead of aborting the whole script under `set -e`.
    if ! {
        echo "-- PostgreSQL DR test dump $(date)"
        echo "-- Tables: users, orders, inventory"
    } > "$DB_BACKUP_PATH"; then
        result WARN "Could not write simulated dump to $DB_BACKUP_PATH"
    fi

    if [[ -f "$DB_BACKUP_PATH" ]]; then
        result OK "Database backup file exists at $DB_BACKUP_PATH"
        # Check that the file begins with a recognizable header. Only the
        # first line is read; `|| true` covers an empty or unreadable file.
        local first_line=""
        IFS= read -r first_line < "$DB_BACKUP_PATH" || true
        if [[ "$first_line" == *PostgreSQL* ]]; then
            result OK "Backup file header matches expected format"
        else
            result WARN "Backup header format unexpected — verify with pg_restore --list"
        fi
    else
        result FAIL "Database backup not found — RPO violation risk"
    fi
}
147
# ---------------------------------------------------------------------------
# CHECK 4: Network Connectivity
# DR concept: Recovery often requires network access to pull backups from
# remote storage or reach a standby site.
#
# Outputs:   a section banner, one result line per ping target, and one
#            result line for DNS resolution
# ---------------------------------------------------------------------------
check_network() {
    section "Network Connectivity Checks"

    # Explicit if/else instead of `ping && result OK || result WARN`: in that
    # form the WARN branch also runs whenever the OK branch returns non-zero.
    local targets=("8.8.8.8" "1.1.1.1")
    local target
    for target in "${targets[@]}"; do
        if ping -c 1 -W 2 "$target" &>/dev/null; then
            result OK "Reachable: $target"
        else
            result WARN "Unreachable: $target (check firewall / DNS)"
        fi
    done

    # Verify DNS resolution is functional: try `host` first and fall back to
    # `nslookup` (a missing tool simply counts as a failed attempt).
    if host google.com &>/dev/null || nslookup google.com &>/dev/null; then
        result OK "DNS resolution is working"
    else
        result WARN "DNS resolution failed — recovery from remote storage may be impacted"
    fi
}
170
# ---------------------------------------------------------------------------
# CHECK 5: Disk Space
# DR concept: Running out of disk space during recovery is a common failure
# mode. Ensure the recovery target volume has sufficient headroom.
#
# Thresholds: >= 90% used -> FAIL, >= 75% used -> WARN, otherwise OK.
# Outputs:    a section banner plus one result line per mounted filesystem
#             that reports a numeric usage percentage
# ---------------------------------------------------------------------------
check_disk_space() {
    section "Disk Space Monitoring"

    # `df -P` selects the POSIX output format: one line per filesystem with
    # the usage percentage in column 5 and the mount point from column 6 on.
    # Reading the mount point as the last `read` variable keeps mount points
    # that contain spaces intact.
    local capacity mount pct
    while read -r _ _ _ _ capacity mount; do
        pct=${capacity%\%}
        # Skips the header line and entries without a numeric percentage.
        [[ "$pct" =~ ^[0-9]+$ && -n "$mount" ]] || continue

        if   [[ $pct -ge 90 ]]; then result FAIL "Disk $mount at ${pct}% — critical, recovery may fail"
        elif [[ $pct -ge 75 ]]; then result WARN "Disk $mount at ${pct}% — low space, monitor closely"
        else                         result OK   "Disk $mount at ${pct}% — sufficient free space"
        fi
    done < <(df -P)
}
191
# ---------------------------------------------------------------------------
# SIMULATE: Walk through recovery steps one by one
# DR concept: A DR plan that has never been rehearsed is not a plan.
# Tabletop exercises and simulated failovers validate procedures.
#
# Globals:   YELLOW, GREEN, BOLD, RESET, ESTIMATED_RTO_MINUTES (read)
# Outputs:   a section banner, the scenario line, nine numbered steps
#            (with a 0.2 s pause after each), and the RTO estimate
# ---------------------------------------------------------------------------
simulate_recovery() {
    section "Recovery Simulation (Tabletop Exercise)"
    echo -e "${YELLOW}Simulating disaster scenario: Primary database host failure${RESET}"
    echo ""

    local -a steps=(
        "DETECT  | Monitor alerts fire; on-call engineer paged via PagerDuty"
        "ASSESS  | Confirm outage scope — DB host unreachable, app layer affected"
        "DECLARE | Incident declared; DR team assembled; stakeholders notified"
        "FAILOVER| Promote read-replica to primary (pg_promote / Route 53 update)"
        "RESTORE | If no replica: restore latest pg_dump to standby host"
        "VERIFY  | Run smoke tests; confirm application connectivity"
        "CUTOVER | Update load balancer; route traffic to recovered instance"
        "MONITOR | Watch error rates and latency for 30 min post-recovery"
        "REVIEW  | Schedule post-mortem within 48 h; update runbook"
    )

    # Both loop variables are local so nothing leaks into the caller's scope.
    local step
    local step_num=1
    for step in "${steps[@]}"; do
        # The color variables are passed as %b arguments (which expands their
        # backslash escapes) rather than spliced into the format string, so a
        # stray '%' in them cannot be taken for a conversion specifier.
        printf '  %bStep %d%b: %s\n' "$BOLD" "$step_num" "$RESET" "$step"
        step_num=$((step_num + 1))
        sleep 0.2
    done

    echo ""
    echo -e "  ${GREEN}Estimated RTO for this scenario: ${ESTIMATED_RTO_MINUTES} minutes${RESET}"
    echo -e "  ${YELLOW}Actual RTO depends on backup restore speed (~1 GB/min typical)${RESET}"
}
225
# ---------------------------------------------------------------------------
# REPORT: Run every check from a clean tally and print the DR readiness
# verdict.
#
# Globals:   PASS, WARN, FAIL (reset to 0 here, then updated by the checks)
#            GREEN, YELLOW, RED, BOLD, RESET (read)
# Outputs:   the output of all five checks, a summary line, and a verdict
# Exits:     terminates the script with status 1 when any FAIL was recorded
# ---------------------------------------------------------------------------
generate_report() {
    # Start from zero so the summary reflects only this run of the checks.
    PASS=0
    WARN=0
    FAIL=0

    check_backups
    check_services
    check_database_backup
    check_network
    check_disk_space

    section "DR Readiness Summary"
    # The trailing \n produces the blank line that follows the tally.
    echo -e "  ${GREEN}PASS: $PASS${RESET}  ${YELLOW}WARN: $WARN${RESET}  ${RED}FAIL: $FAIL${RESET}\n"

    # Work out the verdict first, print it once, then act on the exit code.
    local verdict
    local exit_code=0
    if (( FAIL > 0 )); then
        verdict="${RED}${BOLD}DR READINESS: NOT READY — $FAIL critical issue(s) must be resolved${RESET}"
        exit_code=1
    elif (( WARN > 0 )); then
        verdict="${YELLOW}${BOLD}DR READINESS: PARTIAL — review $WARN warning(s) before next drill${RESET}"
    else
        verdict="${GREEN}${BOLD}DR READINESS: GOOD — all checks passed${RESET}"
    fi
    echo -e "  ${verdict}"

    if (( exit_code != 0 )); then
        exit "$exit_code"
    fi
}
250
# ---------------------------------------------------------------------------
# Entry point
#
# Prints a timestamped title line, then dispatches on the first argument:
#   --check (default)  run the five checks in order
#   --simulate         walk through the recovery simulation
#   --report           run the checks and print the readiness summary
# Any other value prints the usage line and exits with status 1.
# ---------------------------------------------------------------------------
main() {
    echo -e "${BOLD}Disaster Recovery Script — $(date '+%Y-%m-%d %H:%M:%S')${RESET}"

    # A missing or empty first argument selects --check.
    local mode="${1:---check}"

    if [[ "$mode" == "--check" ]]; then
        check_backups
        check_services
        check_database_backup
        check_network
        check_disk_space
    elif [[ "$mode" == "--simulate" ]]; then
        simulate_recovery
    elif [[ "$mode" == "--report" ]]; then
        generate_report
    else
        echo "Usage: $0 [--check|--simulate|--report]"
        exit 1
    fi
}

main "$@"