deploy: add cluster smoke-test script; remove rate-limit probe
test-cluster.sh validates primary /health, dedup hostnames, real LAN IPs, per-node /health, NVENC encode, and Blackmagic enumeration.
This commit is contained in:
parent
97628bb67d
commit
8aa378348e
1 changed files with 187 additions and 0 deletions
187
deploy/test-cluster.sh
Executable file
187
deploy/test-cluster.sh
Executable file
|
|
@ -0,0 +1,187 @@
|
|||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# Wild Dragon MAM — Cluster Smoke Test
|
||||
# =============================================================================
|
||||
#
|
||||
# Validates the cluster end-to-end from any node that can reach the primary.
|
||||
# Designed to be run after `onboard-node.sh` finishes on every worker.
|
||||
#
|
||||
# MAM_API_URL=http://10.0.0.25:47432 ./deploy/test-cluster.sh
|
||||
# MAM_API_URL=... AUTH_TOKEN=wd_xxxx ./deploy/test-cluster.sh
|
||||
#
|
||||
# Checks:
|
||||
# 1. Primary API health
|
||||
# 2. Cluster registry (no duplicate hostnames, IPs are real LAN addresses)
|
||||
# 3. Each worker's /health endpoint
|
||||
# 4. GPU detection (nvidia-smi on the local machine, when it is installed)
|
||||
# 5. NVENC encode probe (5s of synthetic h264_nvenc, output discarded; ffmpeg errors logged to /tmp/wd-nvenc.log)
|
||||
# 6. Blackmagic device enumeration
|
||||
#
|
||||
# Exit 0 = all pass, 1 = any failure. Failures are logged inline.
|
||||
# =============================================================================
|
||||
|
||||
# -e is left off: the checks below tally failures in a counter and keep
# going, so a failing command must not abort the script.
set -uo pipefail

# Both settings come from the environment. AUTH_TOKEN may stay empty.
MAM_API_URL="${MAM_API_URL:-}"
AUTH_TOKEN="${AUTH_TOKEN:-}"

# Nothing can be probed without the address of the primary.
[[ -n "$MAM_API_URL" ]] || {
  echo "✗ MAM_API_URL is required" >&2
  exit 1
}
|
||||
|
||||
# ANSI colour sequences, stored with a literal backslash; printf's %b expands them.
RED='\033[0;31m'; YEL='\033[1;33m'; GRN='\033[0;32m'; CYN='\033[0;36m'; BLD='\033[1m'; NC='\033[0m'

# Running tally of check outcomes, read again when the summary is printed.
PASS=0; FAIL=0

# Reporting helpers. Each prints one line built from all of its arguments.
# Only the colour codes go through %b; the message itself is printed with %s
# so backslash sequences inside hostnames, URLs or other API-supplied text
# are shown literally instead of being interpreted by the terminal.
#   pass - green check mark, increments PASS
#   fail - red cross, increments FAIL
#   note - cyan marker, informational only
#   warn - yellow marker, does not affect the exit status
pass() { printf '%b ✓%b %s\n' "$GRN" "$NC" "$*"; PASS=$((PASS+1)); }
fail() { printf '%b ✗%b %s\n' "$RED" "$NC" "$*"; FAIL=$((FAIL+1)); }
note() { printf '%b ▶%b %s\n' "$CYN" "$NC" "$*"; }
warn() { printf '%b !%b %s\n' "$YEL" "$NC" "$*"; }
|
||||
|
||||
#######################################
# Send one HTTP request to the primary API and print the response body.
# Globals:
#   MAM_API_URL      (read) base URL of the primary
#   AUTH_TOKEN       (read) optional bearer token; header omitted when empty
#   MAM_API_TIMEOUT  (read) optional total timeout in seconds, default 15
# Arguments:
#   $1 - HTTP method (GET when empty)
#   $2 - request path, appended to MAM_API_URL
#   $@ - remaining arguments are handed to curl unchanged; because they come
#        after the defaults they can override them (e.g. --max-time)
# Returns:
#   curl's exit status. -f is not passed, so an HTTP error status by itself
#   does not make this fail; callers must inspect the body.
#######################################
api() {
  local method="${1:-GET}"
  local path="${2:?api: request path is required}"
  shift 2
  # Bounded timeouts keep the smoke test from hanging on an unreachable primary.
  local args=(
    -sS -X "$method"
    --connect-timeout 5 --max-time "${MAM_API_TIMEOUT:-15}"
    -H 'Content-Type: application/json'
  )
  if [[ -n "${AUTH_TOKEN:-}" ]]; then
    args+=(-H "Authorization: Bearer $AUTH_TOKEN")
  fi
  curl "${args[@]}" "$@" "${MAM_API_URL}${path}"
}
|
||||
|
||||
# Banner: script title, the primary under test, then a blank separator line.
echo -e "${BLD}${CYN}Wild Dragon — Cluster Smoke Test${NC}\nPrimary: $MAM_API_URL\n"
|
||||
|
||||
# ── 1. Primary API health ───────────────────────────────────────────────
# Succeeds when the given response body contains a "status" member whose
# value is "ok". Whitespace around the colon is accepted so both compact and
# pretty-printed JSON match.
# Arguments: $1 - response body
health_is_ok() {
  local ok_re='"status"[[:space:]]*:[[:space:]]*"ok"'
  [[ "$1" =~ $ok_re ]]
}

echo -e "${BLD}1. Primary API health${NC}"
# The body is captured first instead of piping into `grep -q`: under pipefail
# an early-exiting grep can make the whole pipeline report failure.
HEALTH_BODY=$(api GET /health)
if health_is_ok "$HEALTH_BODY"; then
  pass "primary /health responds"
else
  fail "primary /health did not return ok"
fi
echo ""
|
||||
|
||||
# ── 2. Cluster registry ─────────────────────────────────────────────────
# The helpers below read the registry JSON (a list of node objects) on stdin.
# Their Python is wrapped in shell single quotes, so it must contain neither
# single quotes nor backslash-escaped double quotes: a \" inside an f-string
# expression is a SyntaxError, and with stderr discarded that error would be
# indistinguishable from "nothing to report".

# Prints the number of nodes. Exits non-zero unless stdin is a JSON list.
cluster_node_count() {
  python3 -c '
import sys, json
nodes = json.load(sys.stdin)
if not isinstance(nodes, list):
    sys.exit(1)
print(len(nodes))
' 2>/dev/null
}

# Prints hostnames that occur more than once, sorted and comma-separated.
cluster_duplicate_hostnames() {
  python3 -c '
import sys, json
seen, dups = set(), set()
for n in json.load(sys.stdin):
    h = str(n.get("hostname"))
    if h in seen:
        dups.add(h)
    seen.add(h)
print(",".join(sorted(dups)))
' 2>/dev/null
}

# Prints "hostname=ip" (comma-separated) for every node whose ip_address
# starts with 172.16. through 172.31.
cluster_docker_bridge_ips() {
  python3 -c '
import sys, json, re
docker = re.compile(r"^172\.(1[6-9]|2\d|3[01])\.")
bad = []
for n in json.load(sys.stdin):
    ip = str(n.get("ip_address") or "")
    if docker.match(ip):
        bad.append("%s=%s" % (n.get("hostname"), ip))
print(",".join(bad))
' 2>/dev/null
}

# Prints hostnames (comma-separated) whose stale_seconds is missing or > 120.
cluster_stale_nodes() {
  python3 -c '
import sys, json
stale = []
for n in json.load(sys.stdin):
    age = n.get("stale_seconds")
    # A value of 0 means a fresh heartbeat, so "missing" is tested
    # explicitly rather than through truthiness.
    if age is None or age == "" or float(age) > 120:
        stale.append(str(n.get("hostname")))
print(",".join(stale))
' 2>/dev/null
}

echo -e "${BLD}2. Cluster registry${NC}"
NODES_JSON=$(api GET /api/v1/cluster || echo '[]')
if TOTAL=$(printf '%s' "$NODES_JSON" | cluster_node_count); then
  note "$TOTAL nodes registered"
else
  # An error object or malformed body must not be counted as nodes and then
  # silently pass every check below.
  fail "cluster registry did not return a JSON list of nodes (or python3 is unavailable)"
  NODES_JSON='[]'
  TOTAL=0
fi

if [[ "$TOTAL" -gt 0 ]]; then
  # No duplicate hostnames
  if ! DUP=$(printf '%s' "$NODES_JSON" | cluster_duplicate_hostnames); then
    fail "could not check hostnames: registry entries have an unexpected shape"
  elif [[ -z "$DUP" ]]; then
    pass "no duplicate hostnames"
  else
    fail "duplicate hostnames: $DUP — run migration 007"
  fi

  # No private docker IPs
  if ! BAD_IPS=$(printf '%s' "$NODES_JSON" | cluster_docker_bridge_ips); then
    fail "could not check node IPs: registry entries have an unexpected shape"
  elif [[ -z "$BAD_IPS" ]]; then
    pass "all node IPs are real LAN addresses"
  else
    fail "nodes still reporting docker bridge IPs: $BAD_IPS"
    warn " → set NODE_IP in .env.worker and restart the node-agent"
  fi

  # All nodes recently seen (staleness is reported as a warning, not a failure)
  if ! STALE=$(printf '%s' "$NODES_JSON" | cluster_stale_nodes); then
    warn "could not evaluate heartbeat ages: registry entries have an unexpected shape"
  elif [[ -z "$STALE" ]]; then
    pass "all nodes heartbeated within 2 min"
  else
    warn "stale nodes (>2 min since heartbeat): $STALE"
  fi
fi
echo ""
|
||||
|
||||
# ── 3. Per-node /health probes ──────────────────────────────────────────
# Prints "<id> <hostname> <api_url>" for every registry entry on stdin whose
# role is not "primary". api_url is printed as an empty field when unset.
list_worker_endpoints() {
  python3 -c '
import sys, json
for n in json.load(sys.stdin):
    if n.get("role") == "primary":
        continue
    print(n["id"], n["hostname"], n.get("api_url") or "")
' 2>/dev/null
}

# Reads "<id> <hostname> <api_url>" lines from stdin, requests
# <api_url>/health for each (4 second limit) and records the result through
# pass/fail. Entries without an api_url only produce a warning.
probe_worker_endpoints() {
  local _id host url
  while read -r _id host url; do
    if [[ -z "$url" ]]; then
      warn "$host: no api_url registered"
      continue
    fi
    if curl -sf --max-time 4 "$url/health" >/dev/null 2>&1; then
      pass "$host ($url/health)"
    else
      fail "$host agent unreachable at $url/health"
    fi
  done
}

echo -e "${BLD}3. Worker agent /health endpoints${NC}"
# The loop is fed by process substitution, not a pipe. As the last stage of a
# pipeline it would run in a subshell and its PASS/FAIL increments would be
# discarded, so unreachable workers would never affect the exit status.
probe_worker_endpoints < <(printf '%s\n' "$NODES_JSON" | list_worker_endpoints)
echo ""
|
||||
|
||||
# ── 4. Local GPU + NVENC probe (when run on a GPU node) ─────────────────
# Prints how many GPUs nvidia-smi lists (one name per non-empty line).
# Returns non-zero when nvidia-smi itself exits with an error, so any
# diagnostic text it printed on stdout is never counted as a GPU.
local_gpu_count() {
  local listing
  listing=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null) || return 1
  # grep -c still prints 0 when nothing matches but exits 1; mask that status.
  grep -c . <<<"$listing" || true
}

echo -e "${BLD}4. Local GPU + NVENC${NC}"
if ! command -v nvidia-smi >/dev/null 2>&1; then
  warn "nvidia-smi not present (not a GPU node)"
elif ! GPU_COUNT=$(local_gpu_count); then
  fail "nvidia-smi is installed but exited with an error"
elif [[ "$GPU_COUNT" -eq 0 ]]; then
  warn "nvidia-smi found but reports 0 GPUs"
else
  pass "$GPU_COUNT NVIDIA GPU(s) visible to host"
  if ! command -v ffmpeg >/dev/null 2>&1; then
    warn "ffmpeg not installed locally — skipping NVENC encode test"
  # Encode 5 s of a synthetic test pattern with h264_nvenc into the null
  # muxer; only ffmpeg's stderr is kept, for diagnosis.
  elif ffmpeg -hide_banner -loglevel error \
    -f lavfi -i testsrc=duration=5:size=1280x720:rate=30 \
    -c:v h264_nvenc -preset p1 -b:v 4M \
    -t 5 -f null - 2>/tmp/wd-nvenc.log; then
    pass "NVENC encode test succeeded"
  else
    fail "NVENC encode failed — see /tmp/wd-nvenc.log"
  fi
fi
echo ""
|
||||
|
||||
# ── 5. Blackmagic device enumeration ────────────────────────────────────
# Prints one summary line per device object in the JSON list on stdin.
# %-formatting is used instead of an f-string: the script is wrapped in shell
# single quotes, and backslash-escaped quotes inside an f-string expression
# are a Python SyntaxError.
blackmagic_device_lines() {
  python3 -c '
import sys, json
for d in json.load(sys.stdin):
    print(" %s port=%s model=%s online=%s" % (
        d.get("hostname"), d.get("index"), d.get("model") or "unknown", d.get("online")))
'
}

echo -e "${BLD}5. Blackmagic devices (cluster-wide)${NC}"
BMD_JSON=$(api GET /api/v1/cluster/devices/blackmagic || echo '[]')
# Only a JSON list counts as devices; an error object must not be reported
# as registered ports.
BMD_COUNT=$(printf '%s' "$BMD_JSON" | python3 -c '
import sys, json
devices = json.load(sys.stdin)
print(len(devices) if isinstance(devices, list) else 0)
' 2>/dev/null || echo 0)
if [[ "$BMD_COUNT" -gt 0 ]]; then
  pass "$BMD_COUNT DeckLink port(s) registered"
  printf '%s' "$BMD_JSON" | blackmagic_device_lines
else
  warn "no DeckLink devices reported by any node"
fi
echo ""
|
||||
|
||||
# ── 6. Local Blackmagic device files ────────────────────────────────────
echo -e "${BLD}6. Local /dev/blackmagic${NC}"
if [[ ! -d /dev/blackmagic ]]; then
  warn "no /dev/blackmagic on this machine"
else
  # Show each entry, then report how many there are.
  ls /dev/blackmagic/ | sed 's/^/ /'
  BMD_NODE_COUNT=$(ls /dev/blackmagic/ | wc -l)
  pass "$BMD_NODE_COUNT device node(s) under /dev/blackmagic"
fi
echo ""
|
||||
|
||||
# ── Summary ─────────────────────────────────────────────────────────────
echo -e "${BLD}Summary:${NC} ${GRN}$PASS pass${NC} ${RED}$FAIL fail${NC}"
# The exit status mirrors the tally: 1 when at least one check failed, else 0.
if [[ "$FAIL" -gt 0 ]]; then
  exit 1
fi
exit 0
|
||||
Loading…
Reference in a new issue