1#!/usr/bin/env bash
2# =============================================================================
3# performance_diagnostics.sh - System Performance Bottleneck Diagnosis
4#
5# PURPOSE: Identifies CPU, memory, disk I/O, and network bottlenecks using
6# standard Linux (and macOS-compatible) tools. Teaches which metrics
7# matter and how to interpret them.
8#
9# USAGE:
10# ./performance_diagnostics.sh [--cpu|--memory|--disk|--network|--all]
11#
12# MODES:
13# --cpu Analyze CPU load and top consumers
14# --memory Analyze RAM and swap usage
15# --disk Analyze disk I/O throughput and utilization
16# --network Analyze connections and listening ports
17# --all Run all analyses and print a bottleneck summary (default)
18#
19# PREREQUISITES:
20# Linux: ps, top, df, iostat (sysstat), ss or netstat, free
# macOS: ps, top, df, vm_stat, netstat, iostat (all ship with the base system; sysstat is Linux-only)
22#
23# CROSS-PLATFORM NOTES:
24# This script detects the OS and adjusts commands accordingly.
25# On macOS, some Linux-specific flags are not available; fallbacks are used.
26# =============================================================================
27
set -euo pipefail

# ---------------------------------------------------------------------------
# Color and formatting
# The values are literal backslash sequences; they only become ANSI escapes
# when printed through `echo -e` or `printf '%b'`.
# ---------------------------------------------------------------------------
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly GREEN='\033[0;32m'
readonly CYAN='\033[0;36m'
readonly BOLD='\033[1m'
readonly RESET='\033[0m'

# Bottleneck severity levels (constants; compared by string equality)
readonly SEV_OK="OK"
readonly SEV_WARN="WARN"
readonly SEV_CRIT="CRIT"

# Accumulated bottleneck findings for the summary report.
# Deliberately NOT readonly: flag_bottleneck appends to it at runtime.
BOTTLENECKS=()

# ---------------------------------------------------------------------------
# OS detection — commands differ between Linux and macOS (Darwin)
# ---------------------------------------------------------------------------
# Assignment and `readonly` are separate statements so that a failing
# `uname` is not masked by the exit status of `readonly`.
OS="$(uname -s)"
readonly OS
52
53# ---------------------------------------------------------------------------
54# Helpers
55# ---------------------------------------------------------------------------
# Print a section banner: a blank line, the coloured title ">>> $1", and a
# 60-character dashed rule underneath.
# Globals:   BOLD, CYAN, RESET (read)
# Arguments: $1 - section title
# Outputs:   three lines on stdout
section() {
  local rule
  # Build 60 spaces, then swap each one for a dash.
  printf -v rule '%60s' ''
  rule=${rule// /-}

  printf '\n'
  printf '%b\n' "${BOLD}${CYAN}>>> $1${RESET}"
  printf '%b\n' "${CYAN}${rule}${RESET}"
}
61
# Record one finding for the final summary.
# Globals:   BOTTLENECKS (appended to)
# Arguments: $1 - severity label, $2 - human-readable description
# The stored entry has the form "[<severity>] <description>".
flag_bottleneck() {
  local entry
  printf -v entry '[%s] %s' "$1" "$2"
  BOTTLENECKS+=("$entry")
}
67
# Print a coloured severity badge for the given level.
# Globals:   SEV_CRIT, SEV_WARN, RED, YELLOW, GREEN, RESET (read)
# Arguments: $1 - severity; anything other than SEV_CRIT / SEV_WARN is
#            rendered as the green "[ OK ]" badge
# Outputs:   one line on stdout
sev_color() {
  if [[ "$1" == "$SEV_CRIT" ]]; then
    printf '%b\n' "${RED}[CRIT]${RESET}"
  elif [[ "$1" == "$SEV_WARN" ]]; then
    printf '%b\n' "${YELLOW}[WARN]${RESET}"
  else
    printf '%b\n' "${GREEN}[ OK ]${RESET}"
  fi
}
75
# Report whether "$1" resolves to something runnable.
# Returns 0 when `command -v` finds it and non-zero otherwise; never prints.
has_cmd() {
  command -v "$1" >/dev/null 2>&1
}
78
79# ---------------------------------------------------------------------------
80# SYSTEM OVERVIEW
81# Metric: uptime, kernel, CPU core count
82# Why it matters: baseline context before diving into specific metrics
83# ---------------------------------------------------------------------------
# Print baseline context: hostname, OS/kernel release, current date, uptime
# and the logical CPU count.
# Globals:   OS (read)
# Outputs:   report lines on stdout
system_overview() {
  section "System Overview"

  echo -e " Hostname : $(hostname)"
  echo -e " OS : $OS $(uname -r)"
  echo -e " Date : $(date '+%Y-%m-%d %H:%M:%S %Z')"

  # Keep only the "up ..." duration from uptime(1). The separator before the
  # user count is ", " on macOS but ",  " (two spaces) on Linux/procps when
  # fewer than ten users are logged in, so match one-or-more spaces; also
  # accept the singular "1 user".
  local up
  up=$(uptime | sed -E -e 's/.*up +//' -e 's/, +[0-9]+ users?.*//')
  echo -e " Uptime : $up"

  # CPU core count (logical processors)
  local cores
  if [[ "$OS" == "Darwin" ]]; then
    cores=$(sysctl -n hw.logicalcpu)
  else
    # nproc is part of coreutils; fall back to counting /proc/cpuinfo entries.
    cores=$(nproc 2>/dev/null || grep -c ^processor /proc/cpuinfo)
  fi
  echo -e " CPU Cores : $cores logical processor(s)"
}
101
102# ---------------------------------------------------------------------------
103# CPU ANALYSIS
104# Metrics: load average (1/5/15 min), per-process CPU%
105#
106# Rule of thumb:
107# load avg / cores < 1.0 → healthy
108# load avg / cores 1.0–2.0 → elevated, monitor
109# load avg / cores > 2.0 → saturated, investigate
110# ---------------------------------------------------------------------------
# Report CPU pressure: load averages, load-per-core ratio, the five busiest
# processes and the number of zombie processes.
# Globals:   OS, SEV_OK, SEV_WARN, SEV_CRIT, BOLD, YELLOW, GREEN, RESET (read)
#            BOTTLENECKS (appended to through flag_bottleneck)
# Outputs:   report lines on stdout
analyze_cpu() {
  section "CPU Analysis"

  # Load average: average number of runnable tasks over 1/5/15 minutes
  # (on Linux it also counts tasks in uninterruptible sleep).
  local load1 load5 load15 cores
  if [[ "$OS" == "Darwin" ]]; then
    # LC_ALL=C keeps the decimal separator a "." so awk can parse the values.
    read -r load1 load5 load15 \
      <<< "$(LC_ALL=C sysctl -n vm.loadavg | tr -d '{}' | awk '{print $1,$2,$3}')"
    cores=$(sysctl -n hw.logicalcpu)
  else
    read -r load1 load5 load15 <<< "$(awk '{print $1,$2,$3}' /proc/loadavg)"
    # Same fallback as system_overview for hosts without nproc.
    cores=$(nproc 2>/dev/null || grep -c ^processor /proc/cpuinfo)
  fi
  # Never divide by zero or by garbage if core detection went wrong.
  [[ "$cores" =~ ^[1-9][0-9]*$ ]] || cores=1

  echo -e " Load Average (1/5/15 min): ${BOLD}$load1 $load5 $load15${RESET}"
  echo -e " Logical CPU Cores : $cores"

  # Compare 1-min load to core count to detect saturation. Values are passed
  # with -v instead of being spliced into the awk program text.
  local ratio
  ratio=$(awk -v load="$load1" -v cores="$cores" 'BEGIN {printf "%.2f", load / cores}')
  echo -e " Load/Core Ratio (1 min) : $ratio"

  local sev="$SEV_OK"
  if awk -v load="$load1" -v cores="$cores" 'BEGIN {exit !(load / cores >= 2.0)}'; then
    sev="$SEV_CRIT"
    flag_bottleneck "$sev" "CPU saturated: load/core ratio $ratio"
  elif awk -v load="$load1" -v cores="$cores" 'BEGIN {exit !(load / cores >= 1.0)}'; then
    sev="$SEV_WARN"
    flag_bottleneck "$sev" "CPU elevated: load/core ratio $ratio"
  fi
  echo -e " CPU Pressure : $(sev_color "$sev")"

  echo ""
  echo -e " ${BOLD}Top 5 CPU-consuming processes:${RESET}"
  # ps output: %cpu pid comm — sorted descending by CPU. awk limits the output
  # itself (instead of `head`) so `sort` never receives SIGPIPE, which would
  # abort the script under `set -o pipefail`.
  ps -A -o pcpu,pid,comm 2>/dev/null \
    | sort -rn \
    | awk 'NR <= 5 {printf " %6s%% PID %-6s %s\n", $1, $2, $3}'

  # Zombies: processes that have exited but whose parent has not yet called
  # wait(). Counted with awk rather than `grep -c ... || echo 0`: grep -c
  # already prints "0" AND exits 1 when nothing matches, so the fallback used
  # to produce the two-line value "0\n0" and break the numeric test below.
  local zombies
  zombies=$(ps -A -o stat 2>/dev/null | awk '/^Z/ {n++} END {print n + 0}') || zombies=0
  echo ""
  if [[ "$zombies" -gt 0 ]]; then
    echo -e " ${YELLOW}Zombie processes: $zombies (parent process may not be reaping children)${RESET}"
    flag_bottleneck "$SEV_WARN" "Zombie processes detected: $zombies"
  else
    echo -e " Zombie processes: ${GREEN}0${RESET}"
  fi
}
161
162# ---------------------------------------------------------------------------
163# MEMORY ANALYSIS
164# Metrics: total/used/free/cached RAM, swap in/out
165#
166# Key insight: Linux aggressively uses free RAM for disk cache (page cache).
167# "Available" memory (not just "free") is the correct metric for headroom.
168# High swap usage with low available RAM = memory pressure bottleneck.
169# ---------------------------------------------------------------------------
# Report RAM and swap usage and list the five largest memory consumers.
# macOS: derived from vm_stat page counts and sysctl.
# Linux: derived from `free -m`, using "available" memory for headroom.
# Globals:   OS, SEV_OK, SEV_WARN, SEV_CRIT, BOLD, RESET (read)
#            BOTTLENECKS (appended to through flag_bottleneck)
# Outputs:   report lines on stdout
analyze_memory() {
  section "Memory Analysis"

  if [[ "$OS" == "Darwin" ]]; then
    local vm page_size
    vm=$(vm_stat)

    # The page size is NOT always 4096 (Apple Silicon uses 16384). vm_stat
    # states it in its first line: "(page size of N bytes)".
    page_size=$(awk '/page size of/ {
      for (i = 1; i < NF; i++) if ($i == "of") { print $(i + 1); exit }
    }' <<< "$vm")
    [[ "$page_size" =~ ^[1-9][0-9]*$ ]] || page_size=4096

    # Page counts end with a "."; adding 0 in awk strips it.
    local free_pages wired active inactive
    read -r free_pages wired active inactive <<< "$(awk '
      /Pages free/     { f = $3 }
      /Pages wired/    { w = $4 }
      /Pages active/   { a = $3 }
      /Pages inactive/ { i = $3 }
      END { printf "%d %d %d %d\n", f + 0, w + 0, a + 0, i + 0 }
    ' <<< "$vm")"

    # Prefer the physical memory size from sysctl; the four page categories
    # above omit compressed/speculative pages and would understate the total.
    local total_bytes total_mb
    total_bytes=$(sysctl -n hw.memsize 2>/dev/null) || total_bytes=""
    if [[ "$total_bytes" =~ ^[1-9][0-9]*$ ]]; then
      total_mb=$(( total_bytes / 1024 / 1024 ))
    else
      total_mb=$(( (free_pages + wired + active + inactive) * page_size / 1024 / 1024 ))
    fi
    local free_mb=$(( free_pages * page_size / 1024 / 1024 ))
    local used_mb=$(( total_mb - free_mb ))

    echo -e " Total RAM : ${total_mb} MB"
    echo -e " Used : ${used_mb} MB"
    echo -e " Free : ${free_mb} MB"
    echo -e " (macOS caches aggressively; use 'Memory Pressure' in Activity Monitor)"

    # Swap on macOS via sysctl: "total = 2048.00M  used = 1234.56M  free = ..."
    # LC_ALL=C keeps "." as the decimal separator.
    local swap_used
    swap_used=$(LC_ALL=C sysctl -n vm.swapusage 2>/dev/null \
      | awk '{v = $6; sub(/M$/, "", v); print v}') || swap_used=""
    [[ "$swap_used" =~ ^[0-9]+([.][0-9]+)?$ ]] || swap_used=0
    echo -e " Swap Used : ${swap_used} MB"
    if awk -v used="$swap_used" 'BEGIN {exit !(used > 1024)}'; then
      flag_bottleneck "$SEV_WARN" "Swap usage elevated: ${swap_used} MB"
    fi
  else
    # Linux: run `free` once. LC_ALL=C keeps the "Mem:" / "Swap:" row labels
    # and the header names in English regardless of the user's locale.
    local mem_report
    mem_report=$(LC_ALL=C free -m)

    echo -e " ${BOLD}Memory (MB):${RESET}"
    awk '
      NR == 1  {printf " %-12s %8s %8s %8s %8s\n", "", $1, $2, $3, $6}
      /^Mem:/  {printf " %-12s %8s %8s %8s %8s\n", "RAM", $2, $3, $4, $7}
      /^Swap:/ {printf " %-12s %8s %8s %8s\n", "Swap", $2, $3, $4}
    ' <<< "$mem_report"

    # Locate columns by header name. Data rows carry a leading label, so a
    # header at position i sits in field i+1. Older procps has no "available"
    # column; approximate it there as free + buffers + cached.
    local total_mb="" avail_mb=""
    read -r total_mb avail_mb <<< "$(awk '
      NR == 1 { for (i = 1; i <= NF; i++) col[$i] = i + 1 }
      /^Mem:/ {
        if (!("total" in col)) exit
        total = $(col["total"])
        if ("available" in col) {
          avail = $(col["available"])
        } else {
          avail = 0
          if ("free" in col)    avail += $(col["free"])
          if ("buffers" in col) avail += $(col["buffers"])
          if ("cached" in col)  avail += $(col["cached"])
        }
        print total, avail
      }
    ' <<< "$mem_report")"

    echo ""
    if [[ "$total_mb" =~ ^[1-9][0-9]*$ && "$avail_mb" =~ ^[0-9]+$ ]]; then
      local used_pct=$(( (total_mb - avail_mb) * 100 / total_mb ))
      echo -e " RAM Utilization : ${used_pct}%"

      local sev="$SEV_OK"
      if [[ $used_pct -ge 90 ]]; then
        sev="$SEV_CRIT"
        flag_bottleneck "$sev" "Memory critical: ${used_pct}% used"
      elif [[ $used_pct -ge 75 ]]; then
        sev="$SEV_WARN"
        flag_bottleneck "$sev" "Memory elevated: ${used_pct}% used"
      fi
      echo -e " Memory Pressure : $(sev_color "$sev")"
    else
      # Unparseable output must not abort the whole diagnostic run.
      echo -e " RAM Utilization : unavailable (could not parse 'free' output)"
    fi
  fi

  echo ""
  echo -e " ${BOLD}Top 5 memory-consuming processes:${RESET}"
  # awk limits the output itself so `sort` never receives SIGPIPE from an
  # early-exiting `head` (fatal under `set -o pipefail`).
  ps -A -o pmem,pid,comm 2>/dev/null \
    | sort -rn \
    | awk 'NR <= 5 {printf " %6s%% PID %-6s %s\n", $1, $2, $3}'
}
235
236# ---------------------------------------------------------------------------
237# DISK I/O ANALYSIS
238# Metrics: disk utilization %, read/write throughput
239#
240# iostat's %util field: percentage of time the device was busy.
241# Values near 100% indicate the disk is saturated (I/O bottleneck).
242# High await (average I/O wait time) with high %util = disk is the bottleneck.
243# ---------------------------------------------------------------------------
# Report filesystem usage and per-device I/O statistics, flagging filesystems
# at >= 90% capacity (CRIT) and devices at >= 80% utilization (WARN).
# Globals:   OS, SEV_WARN, SEV_CRIT, BOLD, RED, YELLOW, RESET (read)
#            BOTTLENECKS (appended to through flag_bottleneck)
# Outputs:   report lines on stdout
# Note:      sampling with iostat blocks for about one second.
analyze_disk() {
  section "Disk I/O Analysis"

  # Disk space — always available. `|| true`: df exits non-zero when a single
  # mount cannot be read, which must not abort the report under set -e.
  echo -e " ${BOLD}Filesystem Usage:${RESET}"
  df -h | awk '(NR == 1 || /^\//) && ++shown <= 8 {
    printf " %-30s %6s %6s %6s %5s %s\n", $1, $2, $3, $4, $5, $6
  }' || true

  # Check for critically full filesystems. `df -P` guarantees the POSIX
  # six-column layout (capacity in column 5, mount point from column 6 on).
  # The loop reads from a process substitution, NOT a pipe: a piped `while`
  # runs in a subshell and every flag_bottleneck call inside it would be lost.
  local pct mount
  while read -r pct mount; do
    [[ -n "$mount" ]] || continue
    flag_bottleneck "$SEV_CRIT" "Filesystem $mount at ${pct}% — nearly full"
    printf ' %b[CRIT]%b Filesystem %s at %s%%\n' "$RED" "$RESET" "$mount" "$pct"
  done < <(df -Pk 2>/dev/null | awk 'NR > 1 && /^\// {
    pct = $5; sub(/%/, "", pct)
    if (pct + 0 >= 90) {
      mnt = $6
      for (i = 7; i <= NF; i++) mnt = mnt " " $i   # mount points may contain spaces
      print pct, mnt
    }
  }')

  echo ""
  echo -e " ${BOLD}Disk I/O Throughput:${RESET}"

  if has_cmd iostat; then
    if [[ "$OS" == "Darwin" ]]; then
      # macOS iostat -d prints device names on line 1, a "KB/t tps MB/s"
      # header per device on line 2, then one row per sample with three values
      # per device and no device column. Use the last sample (the interval
      # one) and pair each triple with its device name.
      echo -e " (macOS iostat — KB/t=KB per transfer, tps=transfers/sec)"
      iostat -d 1 2 2>/dev/null | awk '
        NR == 1 { ndev = NF; for (i = 1; i <= NF; i++) dev[i] = $i; next }
        NR == 2 { next }
        NF > 0  { last = $0 }
        END {
          n = split(last, v, " ")
          for (i = 1; i <= ndev && i <= 6 && 3 * i <= n; i++)
            printf " %-12s KB/t=%8s tps=%8s\n", dev[i], v[3 * i - 2], v[3 * i - 1]
        }' || true
    else
      # Linux iostat -x: extended stats including %util and await.
      # * Run once; the first report is the since-boot average, so only the
      #   last (interval) report is kept.
      # * Column positions differ between sysstat releases, so they are looked
      #   up by header name. Newer releases split await into r_await/w_await;
      #   the combined value is then rebuilt as the I/O-weighted mean.
      # * LC_ALL=C keeps "." as the decimal separator.
      echo -e " (Linux iostat -x: %util=disk busy%, await=avg I/O wait ms)"
      local rows
      rows=$(LC_ALL=C iostat -dx 1 2 2>/dev/null | awk '
        /^Device/ {
          split("", col); n = 0
          for (i = 1; i <= NF; i++) { name = $i; sub(/:$/, "", name); col[name] = i }
          next
        }
        /^([svhm]d|nvme)/ && ("%util" in col) {
          util = $(col["%util"])
          rps = ("r/s" in col) ? $(col["r/s"]) : "n/a"
          wps = ("w/s" in col) ? $(col["w/s"]) : "n/a"
          if ("await" in col) {
            aw = $(col["await"])
          } else if (("r_await" in col) && ("w_await" in col) && rps != "n/a" && wps != "n/a") {
            ios = rps + wps
            aw = (ios > 0) ? sprintf("%.2f", (rps * $(col["r_await"]) + wps * $(col["w_await"])) / ios) : "0.00"
          } else {
            aw = "n/a"
          }
          row[++n] = $1 " " util " " aw " " rps " " wps " " ((util + 0 >= 80) ? util + 0 : "-")
        }
        END { for (i = 1; i <= n; i++) print row[i] }
      ') || rows=""

      local dev util wait_ms rps wps hot shown=0
      while read -r dev util wait_ms rps wps hot; do
        [[ -n "$dev" ]] || continue
        if (( shown < 6 )); then
          # "%%" prints a literal percent sign; a bare "%util" would be
          # parsed as a %u conversion and shift every following argument.
          printf ' %-12s %%util=%6s%% await=%6s ms r/s=%6s w/s=%6s\n' \
            "$dev" "$util" "$wait_ms" "$rps" "$wps"
          shown=$(( shown + 1 ))
        fi
        # Flag disks with high utilization ("-" means below the threshold).
        if [[ "$hot" != "-" ]]; then
          flag_bottleneck "$SEV_WARN" "Disk $dev I/O utilization ${hot}%"
        fi
      done <<< "$rows"
    fi
  else
    echo -e " ${YELLOW}iostat not found.${RESET} Install sysstat (Linux); macOS ships iostat with the base system"
    echo -e " Showing /proc/diskstats snapshot instead (Linux only):"
    if [[ -f /proc/diskstats ]]; then
      # Fields: 3=device, 4=reads completed, 8=writes completed
      # (fields 6 and 10 are sector counts, not I/O counts).
      awk 'NF >= 14 && $3 ~ /^([svhm]d|nvme)/ && ++shown <= 5 {
        printf " %-10s reads=%s writes=%s\n", $3, $4, $8
      }' /proc/diskstats
    fi
  fi
}
291
292# ---------------------------------------------------------------------------
293# NETWORK ANALYSIS
294# Metrics: active connections by state, listening ports, interface stats
295#
296# TIME_WAIT: normal; connections waiting for duplicate packets to expire (2*MSL)
297# CLOSE_WAIT: may indicate app not closing sockets — potential resource leak
298# High connection counts on a single port may indicate a traffic spike or DoS
299# ---------------------------------------------------------------------------
# Report TCP connection states, listening TCP ports and per-interface
# counters; flags an unusually large number of TIME_WAIT sockets (> 500).
# Globals:   OS, SEV_WARN, BOLD, YELLOW, RESET (read)
#            BOTTLENECKS (appended to through flag_bottleneck)
# Outputs:   report lines on stdout
analyze_network() {
  section "Network Analysis"

  echo -e " ${BOLD}Connection State Summary:${RESET}"

  # Take ONE snapshot of the TCP states (one state per line) and reuse it for
  # both the summary and the TIME_WAIT count. Prefer 'ss' (modern Linux) over
  # netstat; `|| ...=""` keeps a failing tool from aborting under set -e.
  local tcp_states=""
  if has_cmd ss; then
    tcp_states=$(ss -tan 2>/dev/null | awk 'NR > 1 {print $1}') || tcp_states=""
  elif has_cmd netstat; then
    tcp_states=$(netstat -an 2>/dev/null | awk '/^tcp/ {print $6}') || tcp_states=""
  else
    echo -e " ${YELLOW}Neither ss nor netstat found${RESET}"
  fi
  if [[ -n "$tcp_states" ]]; then
    awk '{states[$1]++} END {for (s in states) printf " %-15s : %d\n", s, states[s]}' \
      <<< "$tcp_states" | sort
  fi

  echo ""
  echo -e " ${BOLD}Listening Ports (TCP):${RESET}"
  # awk caps the output at 10 lines itself (no `head`, hence no SIGPIPE).
  if has_cmd ss; then
    ss -tlnp 2>/dev/null \
      | awk 'NR > 1 && ++n <= 10 {printf " %-25s %s\n", $4, $6}' || true
  elif has_cmd netstat; then
    if [[ "$OS" == "Darwin" ]]; then
      # BSD netstat has no Linux-style -t/-l/-p flags (-p takes a protocol),
      # so filter the full TCP table for LISTEN rows instead.
      netstat -an -p tcp 2>/dev/null \
        | awk '/LISTEN/ && ++n <= 10 {printf " %-25s %s\n", $4, ""}' || true
    else
      netstat -tlnp 2>/dev/null \
        | awk 'NR > 2 && /LISTEN/ && ++n <= 10 {printf " %-25s %s\n", $4, $7}' || true
    fi
  fi

  echo ""
  echo -e " ${BOLD}Network Interface Statistics:${RESET}"
  if [[ "$OS" == "Darwin" ]]; then
    netstat -ib 2>/dev/null | awk '(NR == 1 || /en[0-9]/) && ++n <= 6' || true
  elif [[ -f /proc/net/dev ]]; then
    # After replacing ":" with a space: $1=interface, $2=RX bytes, $10=TX bytes.
    awk 'NR > 2 && !/lo:/ && ++n <= 5 {
      gsub(/:/, " ")
      printf " %-10s RX: %s bytes TX: %s bytes\n", $1, $2, $10
    }' /proc/net/dev
  fi

  # Flag large TIME_WAIT counts as a potential issue. ss spells the state
  # "TIME-WAIT", netstat "TIME_WAIT". awk always prints exactly one number,
  # unlike `grep -c ... || echo 0`, which yields "0\n0" when nothing matches.
  local tw_count
  tw_count=$(awk '/^TIME[-_]WAIT$/ {n++} END {print n + 0}' <<< "$tcp_states")
  if [[ "$tw_count" -gt 500 ]]; then
    flag_bottleneck "$SEV_WARN" "High TIME_WAIT count: $tw_count (consider tcp_tw_reuse)"
    echo -e " ${YELLOW}[WARN]${RESET} High TIME_WAIT connections: $tw_count"
  fi
}
343
344# ---------------------------------------------------------------------------
345# SUMMARY REPORT — consolidate all bottleneck findings
346# ---------------------------------------------------------------------------
# Print the consolidated findings collected in BOTTLENECKS, coloured by
# severity, followed by suggested next steps; prints an all-clear message
# when nothing was flagged.
# Globals:   BOTTLENECKS, RED, YELLOW, GREEN, BOLD, RESET (read)
# Outputs:   report lines on stdout
print_summary() {
  section "Performance Bottleneck Summary"

  if [[ ${#BOTTLENECKS[@]} -eq 0 ]]; then
    echo -e " ${GREEN}${BOLD}No significant bottlenecks detected.${RESET}"
    echo -e " System appears healthy across CPU, memory, disk, and network."
    return 0
  fi

  echo -e " ${BOLD}Detected ${#BOTTLENECKS[@]} issue(s):${RESET}"
  echo ""

  # Loop variables are local so nothing leaks into the caller's scope.
  local finding tint
  for finding in "${BOTTLENECKS[@]}"; do
    case "$finding" in
      *"[CRIT]"*) tint=$RED ;;
      *"[WARN]"*) tint=$YELLOW ;;
      *) tint=$GREEN ;;
    esac
    # %b expands only the colour codes; the finding itself is printed with %s
    # so a backslash in it (e.g. from a mount point) is shown verbatim.
    printf ' %b%s%b\n' "$tint" "$finding" "$RESET"
  done

  echo ""
  echo -e " ${BOLD}Next steps:${RESET}"
  echo -e " - CPU: profile with 'perf top' or 'flamegraph' to find hot functions"
  echo -e " - Memory: use 'smem' or 'valgrind' to detect leaks"
  echo -e " - Disk: check application I/O patterns with 'iotop' or 'blktrace'"
  echo -e " - Network: capture traffic with 'tcpdump' or 'Wireshark'"
}
373
374# ---------------------------------------------------------------------------
375# Entry point
376# ---------------------------------------------------------------------------
# Entry point: validate the requested mode, print the banner and system
# overview, then run the selected analysis.
# Globals:   BOLD, RESET, OS (read)
# Arguments: $1 - one of --cpu, --memory, --disk, --network, --all
#            (defaults to --all when omitted)
# Outputs:   report on stdout; usage message on stderr for an unknown mode
# Returns:   exits with status 1 on an unknown mode
main() {
  local mode="${1:---all}"

  # Reject bad input BEFORE producing any report output, and send the usage
  # text to stderr so it cannot be mistaken for diagnostic data.
  case "$mode" in
    --cpu | --memory | --disk | --network | --all) ;;
    *)
      echo "Usage: $0 [--cpu|--memory|--disk|--network|--all]" >&2
      exit 1
      ;;
  esac

  echo -e "${BOLD}Performance Diagnostics — $(date '+%Y-%m-%d %H:%M:%S') — $OS${RESET}"
  system_overview

  case "$mode" in
    --cpu) analyze_cpu ;;
    --memory) analyze_memory ;;
    --disk) analyze_disk ;;
    --network) analyze_network ;;
    --all)
      # Only the full run prints the consolidated bottleneck summary.
      analyze_cpu
      analyze_memory
      analyze_disk
      analyze_network
      print_summary
      ;;
  esac
}
399
400main "$@"