Practical Script — System Health Monitor
Build a comprehensive system health check script: CPU, memory, disk, services, and SSL certificate expiry. Generate a colour-coded report and send alerts when thresholds are breached.
February 5, 2026 · 9 min read
Bash · System Admin · Monitoring · Practical · Scripting
A system health check is the quintessential sysadmin script. It pulls together everything: variables, arrays, functions, conditionals, process management, and formatted output. This one generates a colour-coded terminal report and can be run from cron.
syshealth.sh
#!/usr/bin/env bash
# syshealth.sh — System health check with threshold alerting
#
# Every numeric threshold below is a default that can be overridden from the
# environment, e.g.:
#   DISK_WARN=60 DISK_CRIT=80 ./syshealth.sh
set -euo pipefail
# ── Configuration ─────────────────────────────────────────────────────────────
DISK_WARN=${DISK_WARN:-70}          # % disk usage → WARN
DISK_CRIT=${DISK_CRIT:-85}          # % disk usage → CRITICAL
MEM_WARN=${MEM_WARN:-80}            # % memory usage → WARN
CPU_WARN=${CPU_WARN:-80}            # % CPU usage (1-min load / nproc * 100)
SSL_WARN_DAYS=${SSL_WARN_DAYS:-30}  # warn if cert expires within N days
SERVICES=(nginx mysql sshd)         # services to check

# The thresholds are compared inside (( … )); reject anything that is not a
# plain non-negative integer up front instead of failing mid-report.
for _cfg_name in DISK_WARN DISK_CRIT MEM_WARN CPU_WARN SSL_WARN_DAYS; do
  if [[ ! ${!_cfg_name} =~ ^[0-9]+$ ]]; then
    printf 'syshealth: %s must be a non-negative integer (got "%s")\n' \
      "$_cfg_name" "${!_cfg_name}" >&2
    exit 2
  fi
done
unset _cfg_name
# ── Colors ────────────────────────────────────────────────────────────────────
# The values are backslash-escape sequences (not raw ESC bytes); they are
# turned into real escapes by the printf calls that emit them.
if [[ -t 1 ]]; then # only color if stdout is a terminal
  RED='\033[0;31m'; YELLOW='\033[1;33m'
  GREEN='\033[0;32m'; BLUE='\033[0;34m'
  BOLD='\033[1m'; NC='\033[0m'
else
  RED=''; YELLOW=''; GREEN=''; BLUE=''; BOLD=''; NC=''
fi
# ── State tracking ────────────────────────────────────────────────────────────
# Collected "label: detail" strings for critical and warning findings.
declare -a ALERTS=()
declare -a WARNINGS=()
# ── Formatting helpers ────────────────────────────────────────────────────────
# Colour codes are handed to printf as %b arguments, which expands their
# backslash escapes exactly as a format string would.

# _status_line COLOUR GLYPH LABEL DETAIL — print one aligned result row.
function _status_line {
  printf ' %b%s%b %-30s %s\n' "$1" "$2" "$NC" "$3" "$4"
}

# header TEXT… — blank line, then a bold blue section banner.
function header {
  printf '\n%b══ %s ══%b\n' "${BOLD}${BLUE}" "$*" "$NC"
}

# ok LABEL DETAIL — green tick row; records nothing.
function ok {
  _status_line "$GREEN" "✓" "$1" "$2"
}

# warn LABEL DETAIL — yellow row; appends "LABEL: DETAIL" to WARNINGS.
function warn {
  _status_line "$YELLOW" "⚠" "$1" "$2"
  WARNINGS+=("$1: $2")
}

# crit LABEL DETAIL — red row; appends "LABEL: DETAIL" to ALERTS.
function crit {
  _status_line "$RED" "✗" "$1" "$2"
  ALERTS+=("$1: $2")
}
# ── Checks ────────────────────────────────────────────────────────────────────
#######################################
# Report usage of every mounted filesystem whose mount point starts with "/".
# Globals:   DISK_WARN, DISK_CRIT (read); ALERTS/WARNINGS (via warn/crit)
# Arguments: none
# Outputs:   one ok/warn/crit row per filesystem on stdout
#######################################
function check_disk {
  header "Disk Usage"
  local _fs size used _avail usage mount
  # -P asks df for the POSIX layout (one line per filesystem, six columns).
  # `read` puts everything after the fifth column into $mount, so mount
  # points containing spaces stay intact.
  while read -r _fs size used _avail usage mount; do
    # Skips the header row ("Mounted on") and anything that is not a path.
    [[ $mount == /* ]] || continue
    usage=${usage%\%}
    # Some pseudo-filesystems report "-" instead of a percentage.
    [[ $usage =~ ^[0-9]+$ ]] || continue
    if (( usage >= DISK_CRIT )); then
      crit "Disk $mount" "${usage}% full (${used}/${size})"
    elif (( usage >= DISK_WARN )); then
      warn "Disk $mount" "${usage}% full (${used}/${size})"
    else
      ok "Disk $mount" "${usage}% used (${used}/${size})"
    fi
  done < <(df -hP)
}
#######################################
# Report memory usage.
# With `free` available: percentage used, WARN at MEM_WARN or above.
# Otherwise (macOS branch): free memory derived from vm_stat page counts.
# Globals:   MEM_WARN (read); WARNINGS (via warn)
# Arguments: none
# Outputs:   one ok/warn row on stdout
# Returns:   0 even when the statistics cannot be read (a warning is logged)
#######################################
function check_memory {
  header "Memory"
  if command -v free &>/dev/null; then
    local total="" used="" pct
    # LC_ALL=C keeps the "Mem:" row label untranslated; one call feeds both
    # numbers. `|| true`: read returns non-zero when awk matched nothing.
    read -r total used < <(LC_ALL=C free -m | awk '/^Mem:/ {print $2, $3}') || true
    if [[ ! $total =~ ^[0-9]+$ || ! $used =~ ^[0-9]+$ ]] || (( total == 0 )); then
      warn "Memory" "unable to determine memory usage"
      return 0
    fi
    pct=$(( used * 100 / total ))
    if (( pct >= MEM_WARN )); then
      warn "Memory" "${pct}% used (${used}M / ${total}M)"
    else
      ok "Memory" "${pct}% used (${used}M / ${total}M)"
    fi
  else
    # macOS
    local pages_free page_size
    pages_free=$(vm_stat 2>/dev/null | awk '/Pages free:/ {print $3}' | tr -d '.') \
      || pages_free=""
    # Ask the kernel for the page size; fall back to 4096 if it is unavailable.
    page_size=$(sysctl -n hw.pagesize 2>/dev/null) || page_size=""
    [[ $page_size =~ ^[0-9]+$ ]] || page_size=4096
    if [[ ! $pages_free =~ ^[0-9]+$ ]]; then
      warn "Memory" "unable to determine memory usage"
      return 0
    fi
    ok "Memory" "$(( pages_free * page_size / 1024 / 1024 ))M free"
  fi
}
#######################################
# Report CPU load as (1-minute load average / core count) * 100 and list the
# three processes using the most CPU.
# Globals:   CPU_WARN (read); WARNINGS (via warn)
# Arguments: none
# Outputs:   one ok/warn row plus the "Top 3 by CPU" listing on stdout
#######################################
function check_cpu {
  header "CPU"
  local load cores pct ps_out
  # Accepts both "load average: 0.52, 0.48, 0.40" and
  # "load averages: 0.52 0.48 0.40"; LC_ALL=C keeps "." as the decimal mark.
  load=$(LC_ALL=C uptime \
    | awk -F'load averages?: *' '{ split($2, avg, /[, ]+/); print avg[1] }') || load=""
  cores=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null) || cores=""
  if [[ ! $load =~ ^[0-9]+([.][0-9]+)?$ || ! $cores =~ ^[1-9][0-9]*$ ]]; then
    warn "CPU Load" "unable to determine load average or core count"
  else
    pct=$(awk -v avg="$load" -v n="$cores" 'BEGIN { printf "%d", (avg / n) * 100 }')
    if (( pct >= CPU_WARN )); then
      warn "CPU Load" "${pct}% (load=${load}, cores=${cores})"
    else
      ok "CPU Load" "${pct}% (load=${load}, cores=${cores})"
    fi
  fi
  # Top 3 processes by CPU
  echo ""
  echo " Top 3 by CPU:"
  # Try the --sort form first, then the -r form; a ps that supports neither
  # yields an empty listing instead of aborting the script under set -e.
  ps_out=$(ps -eo pid,pcpu,comm --sort=-pcpu 2>/dev/null) \
    || ps_out=$(ps -Ao pid,pcpu,comm -r 2>/dev/null) \
    || ps_out=""
  # NR == 1 is the column header; rows 2-4 are the three busiest processes.
  awk 'NR > 1 && NR <= 4 { printf " PID %-6s CPU %-5s %s\n", $1, $2 "%", $3 }' \
    <<<"$ps_out"
}
#######################################
# Report whether each configured service is running.
# A service counts as running if systemctl reports it active or, failing
# that, a process with exactly that name exists.
# Globals:   SERVICES (read); ALERTS (via crit)
# Arguments: none
# Outputs:   one ok/crit row per service on stdout
#######################################
function check_services {
  header "Services"
  local svc # keep the loop variable out of the global scope
  if (( ${#SERVICES[@]} == 0 )); then
    echo " (no services configured)"
    return 0
  fi
  for svc in "${SERVICES[@]}"; do
    if systemctl is-active --quiet "$svc" 2>/dev/null || pgrep -x "$svc" &>/dev/null; then
      ok "Service: $svc" "running"
    else
      crit "Service: $svc" "NOT RUNNING"
    fi
  done
}
#######################################
# Report TLS certificate expiry for each domain (port 443).
# Globals:   SSL_WARN_DAYS (read); ALERTS/WARNINGS (via warn/crit)
# Arguments: zero or more domain names
# Outputs:   one ok/warn/crit row per domain on stdout, or a note when no
#            domains were given
#######################################
function check_ssl {
  header "SSL Certificates"
  # Plain "$@": with no arguments this is an empty array, so the guard below
  # actually fires (a ":-" default would create one empty-string element).
  local -a domains=("$@")
  local domain cert expiry exp_epoch now secs_left days_left
  if (( ${#domains[@]} == 0 )); then
    echo " (no domains configured)"
    return 0
  fi
  now=$(date +%s)
  for domain in "${domains[@]}"; do
    # s_client's exit status is deliberately ignored; whether x509 can
    # extract an end date from what came back is the real test.
    cert=$(openssl s_client -servername "$domain" -connect "${domain}:443" \
      </dev/null 2>/dev/null) || true
    expiry=$(printf '%s\n' "$cert" | openssl x509 -noout -enddate 2>/dev/null) \
      || expiry=""
    expiry=${expiry#*=} # strip the "notAfter=" prefix
    if [[ -z $expiry ]]; then
      crit "SSL: $domain" "Cannot connect"
      continue
    fi
    # First the -d form, then the -j -f form of date.
    exp_epoch=$(date -d "$expiry" +%s 2>/dev/null \
      || date -j -f "%b %e %T %Y %Z" "$expiry" +%s 2>/dev/null) || exp_epoch=""
    if [[ ! $exp_epoch =~ ^-?[0-9]+$ ]]; then
      crit "SSL: $domain" "Cannot parse expiry date: $expiry"
      continue
    fi
    secs_left=$(( exp_epoch - now ))
    days_left=$(( secs_left / 86400 ))
    # Decide "expired" on seconds: integer division makes days_left 0 for a
    # certificate that is still valid for a few more hours.
    if (( secs_left <= 0 )); then
      crit "SSL: $domain" "EXPIRED!"
    elif (( days_left <= SSL_WARN_DAYS )); then
      warn "SSL: $domain" "Expires in ${days_left} days"
    else
      ok "SSL: $domain" "Valid for ${days_left} days"
    fi
  done
}
#######################################
# Print the closing summary of everything recorded in ALERTS and WARNINGS.
# Globals:   ALERTS, WARNINGS, RED, YELLOW, GREEN, BOLD, NC (read)
# Arguments: none
# Outputs:   summary section on stdout
# Returns:   0 when ALERTS is empty, 1 otherwise
#######################################
function print_summary {
  header "Summary"
  echo ""
  if (( ${#ALERTS[@]} == 0 && ${#WARNINGS[@]} == 0 )); then
    printf ' %bAll checks passed!%b\n' "${GREEN}${BOLD}" "$NC"
  else
    if (( ${#ALERTS[@]} > 0 )); then
      printf ' %bCRITICAL (%d):%b\n' "${RED}${BOLD}" "${#ALERTS[@]}" "$NC"
      # printf re-applies the format once per array element, so no loop
      # variable is needed (and none leaks into the global scope).
      printf ' - %s\n' "${ALERTS[@]}"
    fi
    if (( ${#WARNINGS[@]} > 0 )); then
      printf ' %bWARNINGS (%d):%b\n' "${YELLOW}${BOLD}" "${#WARNINGS[@]}" "$NC"
      printf ' - %s\n' "${WARNINGS[@]}"
    fi
  fi
  echo ""
  # Exit non-zero if critical issues found
  [[ ${#ALERTS[@]} -eq 0 ]]
}
# ── Entry Point ───────────────────────────────────────────────────────────────
# Print the report banner, then run each check in order.
printf '\n'
# hostname/date output is passed as %s data so it can never be interpreted
# as printf directives; the colour codes go through %b to expand their escapes.
printf '%bSystem Health Report — %s — %s%b\n' "$BOLD" "$(hostname)" "$(date)" "$NC"
check_disk
check_memory
check_cpu
check_services
check_ssl example.com # add your domains here
print_summary
Running It
chmod +x syshealth.sh
# Manual run
./syshealth.sh
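# Cron — email on failure only
# (cron mails whatever the job prints, so print the captured report only when the script exits non-zero)
MAILTO="ops@example.com"
*/15 * * * * out=$(/usr/local/bin/syshealth.sh 2>&1) || echo "$out"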
# Cron — always email a report
0 8 * * * /usr/local/bin/syshealth.sh
Making it production-ready
To go further with this script:
- Send alerts to Slack using `curl` and a webhook URL
- Write JSON output for ingestion by a monitoring dashboard
- Add network connectivity checks (`ping`, `nc`)
- Check for zombie processes, OOM events in `dmesg`
- Store historical data in a simple CSV for trend analysis
Quick Check
The script uses `[[ -t 1 ]]` before setting color codes. What does this check?
Exercise
Add a check_failed_logins function to the health script that:
- Reads `/var/log/auth.log` (or `/var/log/secure` on RHEL) for failed SSH login attempts
- Counts attempts in the last hour
- WARNs if > 10 failed attempts, CRITs if > 50
- Lists the top 3 source IPs with the most failures