#!/usr/bin/env bash
# =============================================================================
# Wild Dragon MAM — Cluster Smoke Test
# =============================================================================
#
# Validates the cluster end-to-end from any node that can reach the primary.
# Designed to be run after `onboard-node.sh` finishes on every worker.
#
#   MAM_API_URL=http://10.0.0.25:47432 ./deploy/test-cluster.sh
#   MAM_API_URL=... AUTH_TOKEN=wd_xxxx ./deploy/test-cluster.sh
#
# Environment:
#   MAM_API_URL  (required) base URL of the primary API
#   AUTH_TOKEN   (optional) sent as "Authorization: Bearer <token>"
#
# Requires: bash, curl, python3 (JSON parsing). nvidia-smi and ffmpeg are
# optional; their checks are skipped with a warning when absent.
#
# Checks (numbered as printed at runtime):
#   1. Primary API health
#   2. Cluster registry (no duplicate hostnames, no docker-bridge IPs,
#      heartbeat freshness)
#   3. Each worker agent's /health endpoint
#   4. Local GPU detection + NVENC encode probe (5s of synthetic h264_nvenc,
#      output discarded; ffmpeg's stderr is kept in a temp log on failure)
#   5. Blackmagic devices registered cluster-wide
#   6. Local /dev/blackmagic device nodes
#
# Exit 0 = all pass, 1 = any failure. Failures are logged inline.
# =============================================================================

# Deliberately no `-e`: a failing check must be recorded and the run continued.
set -uo pipefail

MAM_API_URL="${MAM_API_URL:-}"
AUTH_TOKEN="${AUTH_TOKEN:-}"

if [[ -z "$MAM_API_URL" ]]; then
  echo "✗ MAM_API_URL is required" >&2
  exit 1
fi
# Every API path starts with "/", so drop one trailing slash to avoid "//".
MAM_API_URL="${MAM_API_URL%/}"

for required_tool in curl python3; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "✗ $required_tool is required" >&2
    exit 1
  fi
done

readonly RED='\033[0;31m' YEL='\033[1;33m' GRN='\033[0;32m'
readonly CYN='\033[0;36m' BLD='\033[1m' NC='\033[0m'

PASS=0
FAIL=0
# Registry document fetched by check_registry and reused by check_workers.
NODES_JSON='[]'

# Result reporters. Only the colour prefix goes through %b; the message is
# printed verbatim so data such as hostnames cannot inject escape sequences.
pass() { printf '%b %s\n' "${GRN}  ✓${NC}" "$*"; PASS=$((PASS + 1)); }
fail() { printf '%b %s\n' "${RED}  ✗${NC}" "$*"; FAIL=$((FAIL + 1)); }
note() { printf '%b %s\n' "${CYN}  ▶${NC}" "$*"; }
warn() { printf '%b %s\n' "${YEL}  !${NC}" "$*"; }
heading() { printf '%b\n' "${BLD}$*${NC}"; }

#######################################
# Calls the primary API and writes the response body to stdout.
# Globals:   MAM_API_URL, AUTH_TOKEN (read)
# Arguments: $1 - HTTP method (default GET)
#            $2 - request path, beginning with "/"
#            $@ - extra curl arguments; placed after the defaults so a
#                 caller's --max-time etc. takes precedence
# Returns:   curl's exit status; non-zero on transport errors, timeouts and
#            (because of --fail) HTTP status >= 400
#######################################
api() {
  local method="${1:-GET}"; shift
  local path="$1"; shift
  local args=(-sS --fail --connect-timeout 5 --max-time 15
              -X "$method" -H 'Content-Type: application/json')
  if [[ -n "$AUTH_TOKEN" ]]; then
    args+=(-H "Authorization: Bearer $AUTH_TOKEN")
  fi
  curl "${args[@]}" "$@" "${MAM_API_URL}${path}"
}

#######################################
# Runs a Python snippet with a JSON document on its stdin.
# Arguments: $1 - JSON text
#            $2 - Python source passed to `python3 -c`
# Outputs:   whatever the snippet prints
# Returns:   python3's exit status, so callers can tell "no findings"
#            (empty output, status 0) from "could not evaluate" (non-zero)
#######################################
json_eval() {
  python3 -c "$2" 2>/dev/null <<<"$1"
}

# ── Python snippets ─────────────────────────────────────────────────────
# Kept at column 0 (Python is indentation-sensitive) and free of single
# quotes and f-strings: a backslash-escaped quote inside an f-string
# expression is a SyntaxError, which previously made these checks pass
# vacuously because stderr was discarded.

# Prints the element count; exits non-zero unless the document is a list.
readonly PY_COUNT='
import sys, json
doc = json.load(sys.stdin)
if not isinstance(doc, list):
    sys.exit(1)
print(len(doc))
'

# Prints a comma-separated, sorted list of hostnames that occur more than once.
readonly PY_DUP_HOSTNAMES='
import sys, json
from collections import Counter
counts = Counter(str(n.get("hostname")) for n in json.load(sys.stdin))
print(",".join(sorted(h for h, c in counts.items() if c > 1)))
'

# Prints host=ip for every node whose address is in 172.16.0.0/12, which this
# script treats as a docker bridge address rather than a LAN address.
readonly PY_DOCKER_IPS='
import sys, json, re
docker = re.compile(r"^172\.(1[6-9]|2\d|3[01])\.")
bad = []
for n in json.load(sys.stdin):
    ip = n.get("ip_address") or ""
    if docker.match(ip):
        bad.append("{}={}".format(n.get("hostname"), ip))
print(",".join(bad))
'

# Prints hostnames whose stale_seconds is missing or above 120. The test is
# "is None" rather than truthiness so a node that heartbeated 0s ago is fresh.
readonly PY_STALE_NODES='
import sys, json
stale = []
for n in json.load(sys.stdin):
    age = n.get("stale_seconds")
    if age is None or float(age) > 120:
        stale.append(str(n.get("hostname")))
print(",".join(stale))
'

# Prints "hostname<TAB>api_url" for every node whose role is not "primary".
readonly PY_WORKER_URLS='
import sys, json
for n in json.load(sys.stdin):
    if n.get("role") == "primary":
        continue
    print("{}\t{}".format(n.get("hostname") or "unknown", n.get("api_url") or ""))
'

# Prints one indented summary line per registered Blackmagic device.
readonly PY_BMD_LISTING='
import sys, json
for d in json.load(sys.stdin):
    print("    {} port={} model={} online={}".format(
        d.get("hostname"), d.get("index"), d.get("model") or "unknown", d.get("online")))
'

# ── 1. Primary API health ───────────────────────────────────────────────
check_primary_health() {
  heading "1. Primary API health"
  local body
  # Capture first instead of piping into `grep -q`: under pipefail an early
  # grep exit could fail the pipeline. The pattern tolerates whitespace
  # around the colon.
  if body=$(api GET /health) \
    && grep -Eq '"status"[[:space:]]*:[[:space:]]*"ok"' <<<"$body"; then
    pass "primary /health responds"
  else
    fail "primary /health did not return ok"
  fi
}

# ── 2. Cluster registry ─────────────────────────────────────────────────
# Fetches the registry into NODES_JSON and validates its contents.
check_registry() {
  heading "2. Cluster registry"
  local fetched total finding

  if ! fetched=$(api GET /api/v1/cluster); then
    fail "could not fetch /api/v1/cluster from the primary"
    return
  fi
  if ! total=$(json_eval "$fetched" "$PY_COUNT"); then
    fail "/api/v1/cluster did not return a JSON array"
    return
  fi
  NODES_JSON="$fetched"
  note "$total nodes registered"
  (( total > 0 )) || return 0

  # No duplicate hostnames
  if ! finding=$(json_eval "$NODES_JSON" "$PY_DUP_HOSTNAMES"); then
    fail "could not evaluate hostname uniqueness"
  elif [[ -z "$finding" ]]; then
    pass "no duplicate hostnames"
  else
    fail "duplicate hostnames: $finding — run migration 007"
  fi

  # No private docker IPs
  if ! finding=$(json_eval "$NODES_JSON" "$PY_DOCKER_IPS"); then
    fail "could not evaluate node IP addresses"
  elif [[ -z "$finding" ]]; then
    pass "all node IPs are real LAN addresses"
  else
    fail "nodes still reporting docker bridge IPs: $finding"
    warn "  → set NODE_IP in .env.worker and restart the node-agent"
  fi

  # All nodes recently seen (advisory: stale nodes warn, they do not fail)
  if ! finding=$(json_eval "$NODES_JSON" "$PY_STALE_NODES"); then
    fail "could not evaluate node heartbeats"
  elif [[ -z "$finding" ]]; then
    pass "all nodes heartbeated within 2 min"
  else
    warn "stale nodes (>2 min since heartbeat): $finding"
  fi
}

# ── 3. Per-node /health probes ──────────────────────────────────────────
check_workers() {
  heading "3. Worker agent /health endpoints"
  local host url
  # Process substitution keeps the loop in the current shell; a
  # `python | while read` pipeline would run it in a subshell and silently
  # drop every PASS/FAIL increment made here.
  while IFS=$'\t' read -r host url; do
    if [[ -z "$url" ]]; then
      warn "$host: no api_url registered"
      continue
    fi
    if curl -sf --max-time 4 "$url/health" >/dev/null 2>&1 </dev/null; then
      pass "$host ($url/health)"
    else
      fail "$host agent unreachable at $url/health"
    fi
  done < <(json_eval "$NODES_JSON" "$PY_WORKER_URLS")
}

# ── 4. Local GPU + NVENC probe (when run on a GPU node) ─────────────────
check_local_gpu() {
  heading "4. Local GPU + NVENC"
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    warn "nvidia-smi not present (not a GPU node)"
    return
  fi

  local gpu_names gpu_count
  # Check the exit status: a broken driver can print an error on stdout,
  # which a bare line count would mistake for a GPU.
  if ! gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null); then
    fail "nvidia-smi is installed but exited with an error"
    return
  fi
  gpu_count=$(grep -c . <<<"$gpu_names")
  if (( gpu_count == 0 )); then
    warn "nvidia-smi found but reports 0 GPUs"
    return
  fi
  pass "$gpu_count NVIDIA GPU(s) visible to host"

  if ! command -v ffmpeg >/dev/null 2>&1; then
    warn "ffmpeg not installed locally — skipping NVENC encode test"
    return
  fi

  local nvenc_log
  if ! nvenc_log=$(mktemp "${TMPDIR:-/tmp}/wd-nvenc.XXXXXX"); then
    fail "could not create a temp log file for the NVENC test"
    return
  fi
  if ffmpeg -hide_banner -loglevel error \
    -f lavfi -i testsrc=duration=5:size=1280x720:rate=30 \
    -c:v h264_nvenc -preset p1 -b:v 4M \
    -t 5 -f null - 2>"$nvenc_log" </dev/null; then
    pass "NVENC encode test succeeded"
    rm -f -- "$nvenc_log"
  else
    # Keep the log so the operator can inspect ffmpeg's error output.
    fail "NVENC encode failed — see $nvenc_log"
  fi
}

# ── 5. Blackmagic device enumeration ────────────────────────────────────
check_blackmagic_cluster() {
  heading "5. Blackmagic devices (cluster-wide)"
  local bmd_json bmd_count
  if ! bmd_json=$(api GET /api/v1/cluster/devices/blackmagic); then
    warn "could not query Blackmagic devices from the primary"
    return
  fi
  bmd_count=$(json_eval "$bmd_json" "$PY_COUNT") || bmd_count=0
  if (( bmd_count > 0 )); then
    pass "$bmd_count DeckLink port(s) registered"
    json_eval "$bmd_json" "$PY_BMD_LISTING"
  else
    warn "no DeckLink devices reported by any node"
  fi
}

# ── 6. Local Blackmagic device files ────────────────────────────────────
check_blackmagic_local() {
  heading "6. Local /dev/blackmagic"
  if [[ ! -d /dev/blackmagic ]]; then
    warn "no /dev/blackmagic on this machine"
    return
  fi

  local entries=() entry
  # Glob instead of parsing `ls`; the -e/-L test skips the literal pattern
  # left behind when nothing matches.
  for entry in /dev/blackmagic/*; do
    [[ -e "$entry" || -L "$entry" ]] || continue
    entries+=("${entry##*/}")
  done
  if (( ${#entries[@]} == 0 )); then
    warn "/dev/blackmagic exists but contains no device nodes"
    return
  fi
  printf '    %s\n' "${entries[@]}"
  pass "${#entries[@]} device node(s) under /dev/blackmagic"
}

# Runs every check in order, prints the summary and sets the exit status.
main() {
  printf '%b\n' "${BLD}${CYN}Wild Dragon — Cluster Smoke Test${NC}"
  printf 'Primary: %s\n' "$MAM_API_URL"
  echo ""

  local check
  for check in \
    check_primary_health \
    check_registry \
    check_workers \
    check_local_gpu \
    check_blackmagic_cluster \
    check_blackmagic_local; do
    "$check"
    echo ""
  done

  # ── Summary ───────────────────────────────────────────────────────────
  printf '%b\n' "${BLD}Summary:${NC} ${GRN}$PASS pass${NC}  ${RED}$FAIL fail${NC}"
  if (( FAIL > 0 )); then
    exit 1
  fi
  exit 0
}

main "$@"