Sagar.BlogArticle
All posts
All posts
Bash

Practical Script — System Health Monitor

Build a comprehensive system health check script: CPU, memory, disk, services, and SSL certificate expiry. Generate a colour-coded report and send alerts when thresholds are breached.

February 5, 2026 · 9 min read
Bash · System Admin · Monitoring · Practical · Scripting

A system health check is the quintessential sysadmin script. It pulls together everything: variables, arrays, functions, conditionals, process management, and formatted output. This one generates a colour-coded terminal report and can be run from cron.

syshealth.sh
#!/usr/bin/env bash
# syshealth.sh — System health check with threshold alerting
#
# Usage:
#   ./syshealth.sh
#   DISK_WARN=60 DISK_CRIT=75 ./syshealth.sh    # override thresholds per run

set -euo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
# Each numeric threshold keeps its default unless the caller exports a value
# (${VAR:-default} also covers the empty string).
DISK_WARN="${DISK_WARN:-70}"             # % disk usage → WARN
DISK_CRIT="${DISK_CRIT:-85}"             # % disk usage → CRITICAL
MEM_WARN="${MEM_WARN:-80}"               # % memory usage → WARN
CPU_WARN="${CPU_WARN:-80}"               # % CPU usage (1-min load / nproc * 100)
SERVICES=(nginx mysql sshd)              # services to check
SSL_WARN_DAYS="${SSL_WARN_DAYS:-30}"     # warn if cert expires within N days

# ── Colors ────────────────────────────────────────────────────────────────────
# Default to no colour at all; switch the ANSI sequences on only when stdout is
# attached to a terminal, so piped or redirected output stays free of escapes.
RED='' YELLOW='' GREEN='' BLUE='' BOLD='' NC=''
if test -t 1; then
    NC='\033[0m'
    BOLD='\033[1m'
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    BLUE='\033[0;34m'
fi

# ── State tracking ────────────────────────────────────────────────────────────
# Findings collected while the checks run; both lists start out empty.
ALERTS=()      # "label: detail" entries appended by crit
WARNINGS=()    # "label: detail" entries appended by warn

# ── Formatting helpers ────────────────────────────────────────────────────────
# header TITLE...             print a section banner
# ok|warn|crit LABEL [DETAIL] print one aligned result row; warn and crit also
#                             record "LABEL: DETAIL" in WARNINGS / ALERTS
#
# Each row carries a textual status tag, so results stay distinguishable when
# colours are disabled (cron mail, pipes, log files). Colour codes are passed
# through %b and all other data through %s, so nothing a caller supplies is
# ever interpreted as a printf format. DETAIL defaults to empty, which keeps a
# one-argument call from aborting under `set -u`.
function header  { printf '\n%b══ %s ══%b\n' "${BOLD}${BLUE}" "$*" "${NC}"; }
function ok      { printf '  %b[ OK ]%b %-30s %s\n' "${GREEN}" "${NC}" "$1" "${2:-}"; }
function warn    { printf '  %b[WARN]%b %-30s %s\n' "${YELLOW}" "${NC}" "$1" "${2:-}"; WARNINGS+=("$1: ${2:-}"); }
function crit    { printf '  %b[CRIT]%b %-30s %s\n' "${RED}" "${NC}" "$1" "${2:-}"; ALERTS+=("$1: ${2:-}"); }

# ── Checks ────────────────────────────────────────────────────────────────────
# Report usage for every filesystem whose mount point is an absolute path.
# Globals:  DISK_WARN, DISK_CRIT (read); WARNINGS / ALERTS (appended via warn / crit)
# Outputs:  a section header plus one ok/warn/crit row per filesystem
function check_disk {
    header "Disk Usage"
    local _fs size used _avail usage mount
    # `read` assigns the remainder of the line to the last variable, so mount
    # points that contain spaces stay intact.
    while read -r _fs size used _avail usage mount; do
        [[ $mount == /* ]] || continue          # only real mount points
        usage=${usage%\%}
        [[ $usage =~ ^[0-9]+$ ]] || continue    # skip rows without a numeric percentage

        if (( usage >= DISK_CRIT )); then
            crit "Disk $mount" "${usage}% full (${used}/${size})"
        elif (( usage >= DISK_WARN )); then
            warn "Disk $mount" "${usage}% full (${used}/${size})"
        else
            ok "Disk $mount" "${usage}% used (${used}/${size})"
        fi
    # -P (POSIX format) keeps each filesystem on a single line even when the
    # device name is long; tail drops the column-title row.
    done < <(df -hP | tail -n +2)
}

# Report memory usage via `free` when it is available, otherwise via `vm_stat`.
# Globals:  MEM_WARN (read); WARNINGS (appended via warn)
# Outputs:  a section header plus one ok/warn row
# A tool that is missing or prints something unexpected produces a warning row
# rather than aborting the whole report under `set -e`.
function check_memory {
    header "Memory"
    if command -v free &>/dev/null; then
        local total used pct
        # One `free` call supplies both numbers; LC_ALL=C keeps the "Mem:" row
        # label stable. `|| true`: read returns non-zero when awk prints nothing.
        read -r total used < <(LC_ALL=C free -m | awk '/^Mem:/ {print $2, $3}') || true
        if ! [[ ${total:-} =~ ^[0-9]+$ && ${used:-} =~ ^[0-9]+$ ]] || (( total == 0 )); then
            warn "Memory" "could not parse 'free' output"
            return 0
        fi
        pct=$(( used * 100 / total ))
        if (( pct >= MEM_WARN )); then
            warn "Memory" "${pct}% used (${used}M / ${total}M)"
        else
            ok "Memory" "${pct}% used (${used}M / ${total}M)"
        fi
    elif command -v vm_stat &>/dev/null; then
        # macOS
        local stats page_size pages_free
        stats=$(vm_stat 2>/dev/null) || stats=""
        # The page size is taken from vm_stat's own header line ("page size of
        # N bytes") rather than assumed; 4096 is only the fallback.
        page_size=$(awk '/page size of/ {print $8; exit}' <<<"$stats")
        pages_free=$(awk '/^Pages free:/ {gsub(/\./, "", $3); print $3; exit}' <<<"$stats")
        [[ $page_size =~ ^[0-9]+$ ]] || page_size=4096
        if [[ $pages_free =~ ^[0-9]+$ ]]; then
            ok "Memory" "$(( pages_free * page_size / 1024 / 1024 ))M free"
        else
            warn "Memory" "could not parse 'vm_stat' output"
        fi
    else
        warn "Memory" "neither 'free' nor 'vm_stat' is available"
    fi
}

# Report the 1-minute load average as a percentage of available cores, then
# list the three processes using the most CPU.
# Globals:  CPU_WARN (read); WARNINGS (appended via warn)
# Outputs:  a section header, one ok/warn row, and up to three process lines
function check_cpu {
    header "CPU"
    local load cores pct
    # Accept both "load average: a, b, c" and "load averages: a b c"; LC_ALL=C
    # keeps the decimal separator a dot.
    load=$(LC_ALL=C uptime 2>/dev/null \
        | awk -F'load averages?: *' '{split($2, parts, /,? +/); print parts[1]}') || load=""
    cores=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)

    if [[ $load =~ ^[0-9]+([.][0-9]+)?$ && $cores =~ ^[1-9][0-9]*$ ]]; then
        pct=$(awk -v load="$load" -v cores="$cores" 'BEGIN {printf "%d", (load / cores) * 100}')
        if (( pct >= CPU_WARN )); then
            warn "CPU Load" "${pct}% (load=${load}, cores=${nproc_unused:-$cores})"
        else
            ok "CPU Load" "${pct}% (load=${load}, cores=${cores})"
        fi
    else
        warn "CPU Load" "could not determine load average"
    fi

    # Top 3 processes by CPU
    echo ""
    echo "  Top 3 by CPU:"
    # Sorting is done here rather than with `ps --sort`, which not every ps
    # supports. Every stage reads its whole input, so no producer is killed by
    # SIGPIPE (which `pipefail` + `set -e` would turn into a script abort), and
    # a failing ps simply yields an empty list.
    { LC_ALL=C ps -Ao pid,pcpu,comm 2>/dev/null || true; } \
        | awk 'NR > 1' \
        | LC_ALL=C sort -k2,2nr \
        | awk 'NR <= 3 {printf "    PID %-6s CPU %-5s %s\n", $1, $2 "%", $3}'
}

# Verify that every name in SERVICES is running.
# A service counts as running when systemd reports its unit active or, failing
# that, when a process with exactly that name exists.
# Globals:  SERVICES (read); ALERTS (appended via crit)
# Outputs:  a section header plus one ok/crit row per service
function check_services {
    header "Services"
    if (( ${#SERVICES[@]} == 0 )); then
        echo "  (no services configured)"
        return 0
    fi

    local svc    # keep the loop variable out of the global scope
    for svc in "${SERVICES[@]}"; do
        if systemctl is-active --quiet "$svc" 2>/dev/null \
            || pgrep -x "$svc" &>/dev/null; then
            ok "Service: $svc" "running"
        else
            crit "Service: $svc" "NOT RUNNING"
        fi
    done
}

# Check the TLS certificate expiry date of each domain (port 443).
# Arguments: $@ - domain names; none means nothing to check
# Globals:   SSL_WARN_DAYS (read); WARNINGS / ALERTS (appended via warn / crit)
# Outputs:   a section header plus one ok/warn/crit row per domain
function check_ssl {
    header "SSL Certificates"
    # "$#" is tested directly: copying "${@:-}" into an array yields one empty
    # element when no arguments are given, so such a guard would never fire.
    if (( $# == 0 )); then
        echo "  (no domains configured)"
        return 0
    fi

    local domain expiry exp_epoch now days_left
    now=$(date +%s)

    for domain in "$@"; do
        # Treat an empty result like a failed pipeline: either way no
        # certificate end date was obtained.
        expiry=$(openssl s_client -servername "$domain" -connect "${domain}:443" </dev/null 2>/dev/null \
            | openssl x509 -noout -enddate 2>/dev/null \
            | cut -d= -f2) || expiry=""
        if [[ -z $expiry ]]; then
            crit "SSL: $domain" "Cannot connect"
            continue
        fi

        # Try GNU date first, then the BSD form; if neither parses the string,
        # report it instead of letting `set -e` end the whole run.
        if ! exp_epoch=$(date -d "$expiry" +%s 2>/dev/null \
                || date -j -f "%b %e %T %Y %Z" "$expiry" +%s 2>/dev/null); then
            warn "SSL: $domain" "Cannot parse expiry date '${expiry}'"
            continue
        fi

        # Expiry is decided on seconds; the truncated day count would call a
        # certificate with less than 24 hours left "expired".
        days_left=$(( (exp_epoch - now) / 86400 ))
        if (( exp_epoch <= now )); then
            crit "SSL: $domain" "EXPIRED!"
        elif (( days_left <= SSL_WARN_DAYS )); then
            warn "SSL: $domain" "Expires in ${days_left} days"
        else
            ok "SSL: $domain" "Valid for ${days_left} days"
        fi
    done
}

# Print the closing summary of everything recorded in ALERTS and WARNINGS.
# Globals:  ALERTS, WARNINGS (read); RED, YELLOW, GREEN, BOLD, NC (read)
# Outputs:  the summary section on stdout
# Returns:  0 when ALERTS is empty, 1 otherwise
function print_summary {
    header "Summary"
    echo ""

    local -i n_crit=${#ALERTS[@]} n_warn=${#WARNINGS[@]}

    if (( n_crit == 0 && n_warn == 0 )); then
        printf '  %b%s%b\n' "${GREEN}${BOLD}" "All checks passed!" "${NC}"
    fi
    # printf repeats its format for every remaining argument, so each list
    # prints without a loop and without leaving loop variables behind.
    if (( n_crit > 0 )); then
        printf '  %bCRITICAL (%d):%b\n' "${RED}${BOLD}" "$n_crit" "${NC}"
        printf '    - %s\n' "${ALERTS[@]}"
    fi
    if (( n_warn > 0 )); then
        printf '  %bWARNINGS (%d):%b\n' "${YELLOW}${BOLD}" "$n_warn" "${NC}"
        printf '    - %s\n' "${WARNINGS[@]}"
    fi
    echo ""

    # Exit non-zero if critical issues found
    (( n_crit == 0 ))
}

# ── Entry Point ───────────────────────────────────────────────────────────────
echo ""
# Host name and timestamp are passed as %s arguments (never as part of the
# format string) and separated so they no longer run together in the title.
printf '%bSystem Health Report — %s — %s%b\n' "${BOLD}" "$(hostname)" "$(date)" "${NC}"

check_disk
check_memory
check_cpu
check_services
check_ssl example.com   # add your domains here

# print_summary returns non-zero when at least one CRITICAL alert was recorded;
# as the final command its status becomes the script's exit status.
print_summary

Running It

chmod +x syshealth.sh

# Manual run
./syshealth.sh

# Cron — email on failure only
# cron mails whatever a job prints, and the report is written to stdout, so
# discarding stdout would silence the alerts as well. Capture the report and
# print it only when the script exits non-zero (a CRITICAL finding).
MAILTO="ops@example.com"
*/15 * * * * out=$(/usr/local/bin/syshealth.sh 2>&1) || echo "$out"

# Cron — always email a report
0 8 * * * /usr/local/bin/syshealth.sh

Making it production-ready

To go further with this script:

  • Send alerts to Slack using curl and a webhook URL
  • Write JSON output for ingestion by a monitoring dashboard
  • Add network connectivity checks (ping, nc)
  • Check for zombie processes, OOM events in dmesg
  • Store historical data in a simple CSV for trend analysis
Quick Check

The script uses `[[ -t 1 ]]` before setting color codes. What does this check?

Exercise

Add a check_failed_logins function to the health script that:

  1. Reads /var/log/auth.log (or /var/log/secure on RHEL) for failed SSH login attempts
  2. Counts attempts in the last hour
  3. WARNs if > 10 failed attempts, CRITs if > 50
  4. Lists the top 3 source IPs with the most failures