performance_diagnostics.sh

  1#!/usr/bin/env bash
  2# =============================================================================
  3# performance_diagnostics.sh - System Performance Bottleneck Diagnosis
  4#
  5# PURPOSE: Identifies CPU, memory, disk I/O, and network bottlenecks using
  6#          standard Linux (and macOS-compatible) tools. Teaches which metrics
  7#          matter and how to interpret them.
  8#
  9# USAGE:
 10#   ./performance_diagnostics.sh [--cpu|--memory|--disk|--network|--all]
 11#
 12# MODES:
 13#   --cpu       Analyze CPU load and top consumers
 14#   --memory    Analyze RAM and swap usage
 15#   --disk      Analyze disk I/O throughput and utilization
 16#   --network   Analyze connections and listening ports
 17#   --all       Run all analyses and print a bottleneck summary (default)
 18#
 19# PREREQUISITES:
#   Linux: ps, df, free, awk, sed, sort, ss or netstat; iostat (sysstat package) is optional
#   macOS: ps, df, vm_stat, sysctl, awk, sed, sort, netstat; iostat ships with the base system
 22#
 23# CROSS-PLATFORM NOTES:
 24#   This script detects the OS and adjusts commands accordingly.
 25#   On macOS, some Linux-specific flags are not available; fallbacks are used.
 26# =============================================================================
 27
 28set -euo pipefail
 29
 30# ---------------------------------------------------------------------------
 31# Color and formatting
 32# ---------------------------------------------------------------------------
 33RED='\033[0;31m'
 34YELLOW='\033[1;33m'
 35GREEN='\033[0;32m'
 36CYAN='\033[0;36m'
 37BOLD='\033[1m'
 38RESET='\033[0m'
 39
 40# Bottleneck severity levels
 41SEV_OK="OK"
 42SEV_WARN="WARN"
 43SEV_CRIT="CRIT"
 44
 45# Accumulated bottleneck findings for the summary report
 46BOTTLENECKS=()
 47
 48# ---------------------------------------------------------------------------
 49# OS detection — commands differ between Linux and macOS (Darwin)
 50# ---------------------------------------------------------------------------
 51OS="$(uname -s)"
 52
 53# ---------------------------------------------------------------------------
 54# Helpers
 55# ---------------------------------------------------------------------------
 56section() {
 57    echo ""
 58    echo -e "${BOLD}${CYAN}>>> $1${RESET}"
 59    echo -e "${CYAN}$(printf '%0.s-' {1..60})${RESET}"
 60}
 61
 62flag_bottleneck() {
 63    local severity="$1"
 64    local message="$2"
 65    BOTTLENECKS+=("[$severity] $message")
 66}
 67
 68sev_color() {
 69    case "$1" in
 70        "$SEV_CRIT") echo -e "${RED}[CRIT]${RESET}" ;;
 71        "$SEV_WARN") echo -e "${YELLOW}[WARN]${RESET}" ;;
 72        *)           echo -e "${GREEN}[ OK ]${RESET}" ;;
 73    esac
 74}
 75
 76# Check if a command exists; return 1 silently if missing
 77has_cmd() { command -v "$1" &>/dev/null; }
 78
 79# ---------------------------------------------------------------------------
 80# SYSTEM OVERVIEW
 81# Metric: uptime, kernel, CPU core count
 82# Why it matters: baseline context before diving into specific metrics
 83# ---------------------------------------------------------------------------
 84system_overview() {
 85    section "System Overview"
 86
 87    echo -e "  Hostname   : $(hostname)"
 88    echo -e "  OS         : $OS $(uname -r)"
 89    echo -e "  Date       : $(date '+%Y-%m-%d %H:%M:%S %Z')"
 90    echo -e "  Uptime     : $(uptime | sed 's/.*up //' | sed 's/,  [0-9]* user.*//')"
 91
 92    # CPU core count (logical processors)
 93    local cores
 94    if [[ "$OS" == "Darwin" ]]; then
 95        cores=$(sysctl -n hw.logicalcpu)
 96    else
 97        cores=$(nproc 2>/dev/null || grep -c ^processor /proc/cpuinfo)
 98    fi
 99    echo -e "  CPU Cores  : $cores logical processor(s)"
100}
101
# ---------------------------------------------------------------------------
# CPU ANALYSIS
# Metrics: load average (1/5/15 min), per-process CPU%
#
# Rule of thumb:
#   load avg / cores < 1.0 → healthy
#   load avg / cores 1.0–2.0 → elevated, monitor
#   load avg / cores > 2.0 → saturated, investigate
#
# Globals:   OS, SEV_OK, SEV_WARN, SEV_CRIT, BOLD, YELLOW, GREEN, RESET (read)
# Arguments: none
# Outputs:   load figures, pressure badge, top 5 CPU consumers and zombie
#            count on stdout; findings are recorded via flag_bottleneck
# ---------------------------------------------------------------------------
analyze_cpu() {
    section "CPU Analysis"

    # Load average: tasks running or waiting to run, averaged over 1/5/15
    # minutes (Linux also counts tasks in uninterruptible sleep).
    local load1 load5 load15 cores
    if [[ "$OS" == "Darwin" ]]; then
        local raw_load
        raw_load=$(sysctl -n vm.loadavg | tr -d '{}')
        read -r load1 load5 load15 _ <<< "$raw_load"
        cores=$(sysctl -n hw.logicalcpu)
    else
        # /proc/loadavg has five fields; "_" swallows the ones not needed.
        read -r load1 load5 load15 _ < /proc/loadavg
        # Same fallback as system_overview when nproc is unavailable.
        cores=$(nproc 2>/dev/null || grep -c '^processor' /proc/cpuinfo)
    fi

    echo -e "  Load Average (1/5/15 min): ${BOLD}$load1  $load5  $load15${RESET}"
    echo -e "  Logical CPU Cores        : $cores"

    # Compare 1-min load to core count to detect saturation. Values are handed
    # to awk with -v (never spliced into the program text), and one awk call
    # yields both the rounded ratio and the severity decided on the unrounded
    # ratio. A zero or empty core count yields ratio 0 rather than a division
    # error.
    local verdict ratio sev
    verdict=$(awk -v load="$load1" -v cores="$cores" \
        -v ok="$SEV_OK" -v warn="$SEV_WARN" -v crit="$SEV_CRIT" '
        BEGIN {
            r = (cores > 0) ? load / cores : 0
            level = ok
            if (r >= 2.0)      level = crit
            else if (r >= 1.0) level = warn
            printf "%.2f %s\n", r, level
        }')
    read -r ratio sev <<< "$verdict"
    echo -e "  Load/Core Ratio (1 min)  : $ratio"

    case "$sev" in
        "$SEV_CRIT") flag_bottleneck "$sev" "CPU saturated: load/core ratio $ratio" ;;
        "$SEV_WARN") flag_bottleneck "$sev" "CPU elevated: load/core ratio $ratio" ;;
    esac
    echo -e "  CPU Pressure             : $(sev_color "$sev")"

    echo ""
    echo -e "  ${BOLD}Top 5 CPU-consuming processes:${RESET}"
    # -A selects all processes. Each "-o name=" requests a column with an empty
    # header; with every header empty ps prints no header row, so it cannot
    # end up in the top-5 list. awk (not head) applies the limit so sort is
    # never cut off mid-write, and fields 3..NF are rejoined so command names
    # containing spaces are shown whole.
    ps -A -o pcpu= -o pid= -o comm= 2>/dev/null \
        | LC_ALL=C sort -rn \
        | awk 'NR <= 5 {
              cmd = $3
              for (i = 4; i <= NF; i++) cmd = cmd " " $i
              printf "    %6s%%  PID %-6s  %s\n", $1, $2, cmd
          }'

    # Detect zombie processes — processes that have exited but whose parent
    # has not yet called wait(). Large numbers indicate a parent bug.
    # grep -c already prints "0" when nothing matches (while exiting 1), so
    # the fallback must not print a second value.
    local zombies
    zombies=$(ps -A -o stat= 2>/dev/null | grep -c '^Z' || true)
    zombies=${zombies:-0}
    echo ""
    if [[ $zombies -gt 0 ]]; then
        echo -e "  ${YELLOW}Zombie processes: $zombies (parent process may not be reaping children)${RESET}"
        flag_bottleneck "$SEV_WARN" "Zombie processes detected: $zombies"
    else
        echo -e "  Zombie processes: ${GREEN}0${RESET}"
    fi
}
161
# ---------------------------------------------------------------------------
# MEMORY ANALYSIS
# Metrics: total/used/free/cached RAM, swap in/out
#
# Key insight: Linux aggressively uses free RAM for disk cache (page cache).
# "Available" memory (not just "free") is the correct metric for headroom.
# High swap usage with low available RAM = memory pressure bottleneck.
#
# Globals:   OS, SEV_OK, SEV_WARN, SEV_CRIT, BOLD, YELLOW, RESET (read)
# Arguments: none
# Outputs:   memory figures and top 5 memory consumers on stdout; findings
#            are recorded via flag_bottleneck
# ---------------------------------------------------------------------------
analyze_memory() {
    section "Memory Analysis"

    if [[ "$OS" == "Darwin" ]]; then
        # macOS uses vm_stat, which reports page counts. The page size is taken
        # from vm_stat's own "(page size of N bytes)" banner instead of being
        # assumed; 4096 is only the fallback when the banner cannot be parsed.
        local vm
        vm=$(vm_stat)
        local parsed page_size free_pages wired active inactive
        parsed=$(awk '
            /page size of/    { for (i = 1; i < NF; i++) if ($i == "of") size = $(i + 1) }
            /^Pages free/     { f = $3 }
            /^Pages wired/    { w = $4 }
            /^Pages active/   { a = $3 }
            /^Pages inactive/ { n = $3 }
            END { printf "%d %d %d %d %d\n", (size + 0 > 0 ? size : 4096), f, w, a, n }
        ' <<< "$vm")
        read -r page_size free_pages wired active inactive <<< "$parsed"

        # "Total" here is the sum of the four page categories parsed above.
        local total_pages=$(( free_pages + wired + active + inactive ))
        local total_mb=$(( total_pages * page_size / 1024 / 1024 ))
        local free_mb=$(( free_pages * page_size / 1024 / 1024 ))
        local used_mb=$(( total_mb - free_mb ))

        echo -e "  Total RAM : ${total_mb} MB"
        echo -e "  Used      : ${used_mb} MB"
        echo -e "  Free      : ${free_mb} MB"
        echo -e "  (macOS caches aggressively; use 'Memory Pressure' in Activity Monitor)"

        # Swap on macOS via sysctl; field 6 is the "used" figure with an "M"
        # suffix. Falls back to 0 when sysctl fails or prints nothing.
        local swap_used
        swap_used=$(sysctl -n vm.swapusage 2>/dev/null \
            | awk '{ sub(/M$/, "", $6); print $6 }' || true)
        swap_used=${swap_used:-0}
        echo -e "  Swap Used : ${swap_used} MB"
        if awk -v used="$swap_used" 'BEGIN { exit !(used + 0 > 1024) }'; then
            flag_bottleneck "$SEV_WARN" "Swap usage elevated: ${swap_used} MB"
        fi
    else
        # Linux: run 'free' once and reuse the report. LC_ALL=C keeps the
        # "Mem:" row label untranslated so the parsing below can find it.
        local mem_report
        mem_report=$(LC_ALL=C free -m 2>/dev/null || true)

        echo -e "  ${BOLD}Memory (MB):${RESET}"
        awk '
            NR==1 {printf "  %-12s %8s %8s %8s %8s\n", "", $1, $2, $3, $6}
            NR==2 {printf "  %-12s %8s %8s %8s %8s\n", "RAM", $2, $3, $4, $7}
            NR==3 {printf "  %-12s %8s %8s %8s\n", "Swap", $2, $3, $4}
        ' <<< "$mem_report"

        # Parse total and available memory to calculate utilization.
        local mem_fields total_mb avail_mb
        mem_fields=$(awk '/^Mem:/ { print $2, $7 }' <<< "$mem_report")
        read -r total_mb avail_mb <<< "$mem_fields"

        echo ""
        # Only do the arithmetic on sane numbers: a missing or unexpected
        # 'free' report would otherwise end in a division by zero that aborts
        # the whole script.
        if [[ "$total_mb" =~ ^[0-9]+$ && "$avail_mb" =~ ^[0-9]+$ ]] && (( total_mb > 0 )); then
            local used_pct=$(( (total_mb - avail_mb) * 100 / total_mb ))
            echo -e "  RAM Utilization : ${used_pct}%"

            local sev="$SEV_OK"
            if [[ $used_pct -ge 90 ]]; then
                sev="$SEV_CRIT"; flag_bottleneck "$sev" "Memory critical: ${used_pct}% used"
            elif [[ $used_pct -ge 75 ]]; then
                sev="$SEV_WARN"; flag_bottleneck "$sev" "Memory elevated: ${used_pct}% used"
            fi
            echo -e "  Memory Pressure : $(sev_color "$sev")"
        else
            echo -e "  ${YELLOW}Could not read total/available memory from 'free -m'; skipping utilization check${RESET}"
        fi
    fi

    echo ""
    echo -e "  ${BOLD}Top 5 memory-consuming processes:${RESET}"
    # Empty "-o name=" headers suppress the ps header row so it cannot show up
    # in the list; awk applies the 5-row limit and rejoins command names that
    # contain spaces.
    ps -A -o pmem= -o pid= -o comm= 2>/dev/null \
        | LC_ALL=C sort -rn \
        | awk 'NR <= 5 {
              cmd = $3
              for (i = 4; i <= NF; i++) cmd = cmd " " $i
              printf "    %6s%%  PID %-6s  %s\n", $1, $2, cmd
          }'
}
235
# ---------------------------------------------------------------------------
# DISK I/O ANALYSIS
# Metrics: disk utilization %, read/write throughput
#
# iostat's %util field: percentage of time the device was busy.
# Values near 100% indicate the disk is saturated (I/O bottleneck).
# High await (average I/O wait time) with high %util = disk is the bottleneck.
#
# Globals:   OS, SEV_WARN, SEV_CRIT, BOLD, RED, YELLOW, RESET (read)
# Arguments: none
# Outputs:   filesystem usage and per-device I/O figures on stdout; findings
#            are recorded via flag_bottleneck
# ---------------------------------------------------------------------------
analyze_disk() {
    section "Disk I/O Analysis"

    # Disk space — always available.
    # The mount point column is located from the header ("Mounted on") because
    # its position differs between platforms; fields from there to the end of
    # the row are rejoined so mount points with spaces stay intact. At most
    # 8 lines (header + 7 filesystems) are shown. Best effort: df can exit
    # non-zero when a single mount is unreadable, which must not abort the run.
    echo -e "  ${BOLD}Filesystem Usage:${RESET}"
    LC_ALL=C df -h 2>/dev/null | awk '
        NR == 1 {
            mcol = 6
            for (i = 1; i <= NF; i++) if ($i == "Mounted") mcol = i
            printf "  %-30s %6s %6s %6s %5s  %s\n", $1, $2, $3, $4, $5, "Mounted on"
            next
        }
        /^\// && shown++ < 7 {
            mount = $mcol
            for (i = mcol + 1; i <= NF; i++) mount = mount " " $i
            printf "  %-30s %6s %6s %6s %5s  %s\n", $1, $2, $3, $4, $5, mount
        }' || true

    # Check for critically full filesystems. `df -P` gives the fixed POSIX
    # column layout (capacity in $5, mount point from $6). The loop reads from
    # a process substitution rather than a pipe so it runs in this shell and
    # the findings recorded by flag_bottleneck survive until the summary.
    local pct mount
    while read -r pct mount; do
        [[ -n "$mount" ]] || continue
        flag_bottleneck "$SEV_CRIT" "Filesystem $mount at ${pct}% — nearly full"
        echo -e "  ${RED}[CRIT]${RESET} Filesystem $mount at ${pct}%"
    done < <(LC_ALL=C df -P 2>/dev/null | awk '
        NR > 1 && /^\// {
            pct = $5
            sub(/%/, "", pct)
            if (pct + 0 >= 90) {
                mount = $6
                for (i = 7; i <= NF; i++) mount = mount " " $i
                print pct, mount
            }
        }')

    echo ""
    echo -e "  ${BOLD}Disk I/O Throughput:${RESET}"

    if has_cmd iostat; then
        if [[ "$OS" == "Darwin" ]]; then
            # macOS iostat -d: line 1 names the disks, line 2 repeats the
            # "KB/t tps MB/s" column triple once per disk, and every following
            # line is one sample. Only the last sample (the 1-second interval)
            # is shown, for up to 6 disks.
            echo -e "  (macOS iostat — KB/t=KB per transfer, tps=transfers/sec, MB/s=throughput)"
            LC_ALL=C iostat -d 1 2 2>/dev/null | awk '
                NR == 1 { n = NF; for (i = 1; i <= NF; i++) name[i] = $i; next }
                NR == 2 { next }
                NF      { last = $0 }
                END {
                    if (last == "") exit
                    split(last, v)
                    for (i = 1; i <= n && i <= 6; i++)
                        printf "  %-12s KB/t=%8s  tps=%8s  MB/s=%8s\n", name[i], v[3*i-2], v[3*i-1], v[3*i]
                }' || true
        else
            # Linux iostat -x: extended stats including %util and await.
            # iostat is run once. Its first report is the since-boot average,
            # so only the rows of the LAST report (the 1-second interval) are
            # kept. Columns are looked up by header name because their
            # positions differ between sysstat releases; when there is no
            # single "await" column, r_await/w_await are shown instead.
            # Output rows: device util await r/s w/s hot, where hot is the
            # numeric %util when >= 80 and "-" otherwise.
            echo -e "  (Linux iostat -x: %util=disk busy%, await=avg I/O wait ms)"
            local io_stats
            io_stats=$(LC_ALL=C iostat -dx 1 2 2>/dev/null | awk '
                /^Device/ {
                    n = 0
                    split("", col)
                    for (i = 1; i <= NF; i++) col[$i] = i
                    next
                }
                /^([svhm]d|nvme)/ {
                    util = ("%util" in col) ? $(col["%util"]) : $NF
                    if ("await" in col)
                        wait = $(col["await"])
                    else if (("r_await" in col) && ("w_await" in col))
                        wait = $(col["r_await"]) "/" $(col["w_await"])
                    else
                        wait = "n/a"
                    rps = ("r/s" in col) ? $(col["r/s"]) : "n/a"
                    wps = ("w/s" in col) ? $(col["w/s"]) : "n/a"
                    hot = (util + 0 >= 80) ? util + 0 : "-"
                    row[++n] = $1 " " util " " wait " " rps " " wps " " hot
                }
                END { for (i = 1; i <= n; i++) print row[i] }' || true)

            # Show at most 6 devices, but flag every busy one.
            local shown=0 dev util await rps wps hot
            while read -r dev util await rps wps hot; do
                [[ -n "$dev" ]] || continue
                if (( shown < 6 )); then
                    printf '  %-12s  %%util=%6s%%  await=%6s ms  r/s=%6s  w/s=%6s\n' \
                        "$dev" "$util" "$await" "$rps" "$wps"
                    shown=$(( shown + 1 ))
                fi
                if [[ "$hot" != "-" ]]; then
                    flag_bottleneck "$SEV_WARN" "Disk $dev I/O utilization ${hot}%"
                fi
            done <<< "$io_stats"
        fi
    else
        echo -e "  ${YELLOW}iostat not found.${RESET} Install the sysstat package (Linux); macOS ships iostat with the base system"
        if [[ -f /proc/diskstats ]]; then
            echo -e "  Showing /proc/diskstats snapshot instead (Linux only):"
            # Fields 6 and 10 of /proc/diskstats are cumulative sector counts
            # (read / written), so they are labelled as such.
            awk 'NF >= 14 && $3 ~ /^([svhm]d|nvme)/ && shown++ < 5 {
                     printf "  %-10s sectors_read=%s sectors_written=%s\n", $3, $6, $10
                 }' /proc/diskstats
        fi
    fi
}
291
# ---------------------------------------------------------------------------
# NETWORK ANALYSIS
# Metrics: active connections by state, listening ports, interface stats
#
# TIME_WAIT: normal; connections waiting for duplicate packets to expire (2*MSL)
# CLOSE_WAIT: may indicate app not closing sockets — potential resource leak
# High connection counts on a single port may indicate a traffic spike or DoS
#
# Globals:   OS, SEV_WARN, BOLD, YELLOW, RESET (read)
# Arguments: none
# Outputs:   connection-state summary, listening TCP ports and interface
#            counters on stdout; findings are recorded via flag_bottleneck
# ---------------------------------------------------------------------------
analyze_network() {
    section "Network Analysis"

    local have_ss=0 have_netstat=0
    if has_cmd ss; then have_ss=1; fi
    if has_cmd netstat; then have_netstat=1; fi

    echo -e "  ${BOLD}Connection State Summary:${RESET}"

    # Collect one TCP state per line, once, and reuse it for both the summary
    # and the TIME_WAIT check. Prefer 'ss' (socket statistics, modern Linux)
    # over netstat. Best effort: a failing tool yields an empty list instead
    # of aborting the whole run.
    local tcp_states=""
    if (( have_ss )); then
        tcp_states=$(ss -tan 2>/dev/null | awk 'NR > 1 { print $1 }' || true)
    elif (( have_netstat )); then
        tcp_states=$(netstat -an 2>/dev/null | awk '/^tcp/ { print $6 }' || true)
    else
        echo -e "  ${YELLOW}Neither ss nor netstat found${RESET}"
    fi
    if [[ -n "$tcp_states" ]]; then
        awk 'NF { states[$1]++ }
             END { for (s in states) printf "  %-15s : %d\n", s, states[s] }' \
            <<< "$tcp_states" | sort
    fi

    echo ""
    echo -e "  ${BOLD}Listening Ports (TCP):${RESET}"
    # At most 10 rows; the limit lives inside awk so no stage is cut off by an
    # early-exiting reader.
    if (( have_ss )); then
        ss -tlnp 2>/dev/null \
            | awk 'NR > 1 && shown++ < 10 { printf "  %-25s %s\n", $4, $6 }' || true
    elif (( have_netstat )); then
        if [[ "$OS" == "Darwin" ]]; then
            # macOS netstat takes a protocol after -p and has no Linux-style
            # -l/-p flags, so list TCP sockets and keep the LISTEN rows.
            netstat -an -p tcp 2>/dev/null \
                | awk '/LISTEN/ && shown++ < 10 { printf "  %-25s\n", $4 }' || true
        else
            netstat -tlnp 2>/dev/null \
                | awk 'NR > 2 && /LISTEN/ && shown++ < 10 { printf "  %-25s %s\n", $4, $7 }' || true
        fi
    fi

    echo ""
    echo -e "  ${BOLD}Network Interface Statistics:${RESET}"
    if [[ "$OS" == "Darwin" ]]; then
        netstat -ib 2>/dev/null \
            | awk '(NR == 1 || /en[0-9]/) && shown++ < 6' || true
    elif [[ -f /proc/net/dev ]]; then
        # Skip the two header lines and loopback; turning ":" into a space
        # makes the interface name field 1, RX bytes field 2, TX bytes field 10.
        awk 'NR > 2 && !/lo:/ && shown++ < 5 {
            gsub(/:/, " ");
            printf "  %-10s  RX: %s bytes  TX: %s bytes\n", $1, $2, $10
        }' /proc/net/dev
    fi

    # Flag large TIME_WAIT counts as a potential issue. ss spells the state
    # "TIME-WAIT", netstat "TIME_WAIT"; awk always prints exactly one number.
    local tw_count
    tw_count=$(awk '$1 == "TIME-WAIT" || $1 == "TIME_WAIT" { n++ } END { print n + 0 }' \
        <<< "$tcp_states")
    if [[ $tw_count -gt 500 ]]; then
        flag_bottleneck "$SEV_WARN" "High TIME_WAIT count: $tw_count (consider tcp_tw_reuse)"
        echo -e "  ${YELLOW}[WARN]${RESET} High TIME_WAIT connections: $tw_count"
    fi
}
343
# ---------------------------------------------------------------------------
# SUMMARY REPORT — consolidate all bottleneck findings
#
# Globals:   BOTTLENECKS, SEV_CRIT, SEV_WARN, RED, YELLOW, GREEN, BOLD,
#            RESET (read)
# Arguments: none
# Outputs:   either an all-clear message, or every recorded finding colored
#            by severity followed by suggested next steps, on stdout
# ---------------------------------------------------------------------------
print_summary() {
    section "Performance Bottleneck Summary"

    if (( ${#BOTTLENECKS[@]} == 0 )); then
        echo -e "  ${GREEN}${BOLD}No significant bottlenecks detected.${RESET}"
        echo -e "  System appears healthy across CPU, memory, disk, and network."
        return 0
    fi

    echo -e "  ${BOLD}Detected ${#BOTTLENECKS[@]} issue(s):${RESET}"
    echo ""

    # The loop variable is local so it does not leak into the caller's scope.
    # Severity is read from the "[SEV]" prefix that flag_bottleneck writes, so
    # a message that merely mentions another severity keeps its own color.
    local finding tint
    for finding in "${BOTTLENECKS[@]}"; do
        case "$finding" in
            "[$SEV_CRIT]"*) tint=$RED ;;
            "[$SEV_WARN]"*) tint=$YELLOW ;;
            *)              tint=$GREEN ;;
        esac
        echo -e "  ${tint}${finding}${RESET}"
    done

    echo ""
    echo -e "  ${BOLD}Next steps:${RESET}"
    echo -e "  - CPU: profile with 'perf top' or 'flamegraph' to find hot functions"
    echo -e "  - Memory: use 'smem' or 'valgrind' to detect leaks"
    echo -e "  - Disk: check application I/O patterns with 'iotop' or 'blktrace'"
    echo -e "  - Network: capture traffic with 'tcpdump' or 'Wireshark'"
}
373
# ---------------------------------------------------------------------------
# Entry point
#
# Globals:   OS, BOLD, RESET (read)
# Arguments: $1 - optional mode: --cpu | --memory | --disk | --network |
#                 --all (default), or -h | --help
# Outputs:   banner, system overview and the selected analyses on stdout;
#            the usage line goes to stderr for an unknown mode
# Returns:   exits 1 on an unknown mode
# ---------------------------------------------------------------------------
main() {
    local mode="${1:---all}"
    local usage="Usage: $0 [--cpu|--memory|--disk|--network|--all]"

    # Validate the mode before producing any output, so a typo yields only the
    # usage line instead of a banner and a full system overview first.
    case "$mode" in
        --cpu | --memory | --disk | --network | --all) ;;
        -h | --help)
            echo "$usage"
            return 0
            ;;
        *)
            echo "$usage" >&2
            exit 1
            ;;
    esac

    echo -e "${BOLD}Performance Diagnostics — $(date '+%Y-%m-%d %H:%M:%S') — $OS${RESET}"
    system_overview

    case "$mode" in
        --cpu)     analyze_cpu ;;
        --memory)  analyze_memory ;;
        --disk)    analyze_disk ;;
        --network) analyze_network ;;
        --all)
            analyze_cpu
            analyze_memory
            analyze_disk
            analyze_network
            print_summary
            ;;
    esac
}
399
400main "$@"