feat(cluster): auto-assign node profiles from detected hardware

onboard-node.sh auto-detects GPU (nvidia-smi/lspci) and SDI capture cards
(blackmagic/deltacast) and computes PROFILES (worker [+gpu] [+capture])
automatically; explicit NODE_ROLE/PROFILES still override. Add Node wizard
drops the role picker — node self-configures from hardware.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Zac Gaetano 2026-05-31 18:02:29 -04:00
parent d908c0c056
commit f2d5f5aa16
2 changed files with 76 additions and 34 deletions

View file

@ -16,11 +16,12 @@
# Environment variables:
# MAM_API_URL REQUIRED Primary MAM API base URL
# NODE_TOKEN API bearer token (required if AUTH_ENABLED=true)
# NODE_ROLE Role tag reported to the cluster (default: worker)
# NODE_ROLE Role tag reported to the cluster (default: auto-detect)
# NODE_IP Override the LAN IP reported back (default: auto-detect)
# AGENT_PORT Host port for the node agent (default: 7436)
# INSTALL_DIR Where to clone/find the repo (default: /opt/wild-dragon)
# PROFILES Extra compose profiles, space-sep e.g. "worker capture"
# PROFILES Compose profiles, space-sep (default: auto-detect from hardware)
# Override only to force, e.g. "worker capture"
# BMD_MODEL DeckLink card model name (e.g. "DeckLink Duo 2")
# REPO_URL Override the Forgejo clone URL
# =============================================================================
@ -32,6 +33,9 @@ REPO_URL="${REPO_URL:-https://forge.wilddragon.net/zgaetano/wild-dragon.git}"
INSTALL_DIR="${INSTALL_DIR:-/opt/wild-dragon}"
MAM_API_URL="${MAM_API_URL:-}"
NODE_TOKEN="${NODE_TOKEN:-}"
# Remember whether NODE_ROLE arrived from the caller's environment (a manual
# override) before the default below is applied. Once the default is filled in
# the two cases are indistinguishable, and hardware auto-detection must only
# replace a role the caller did *not* set.
if [[ -n "${NODE_ROLE:-}" ]]; then
  NODE_ROLE_EXPLICIT=1
else
  NODE_ROLE_EXPLICIT=""
fi
: "${NODE_ROLE:=worker}"
NODE_IP="${NODE_IP:-}"
AGENT_PORT="${AGENT_PORT:-7436}"
@ -65,6 +69,37 @@ detect_lan_ip() {
echo "$ip"
}
# ── Auto-detect hardware ─────────────────────────────────────────────────────
# Mirror detect_lan_ip's style: best-effort, guard every probe with `|| true`
# so a missing nvidia-smi/lspci never aborts under `set -euo pipefail`. The
# node self-describes its hardware here so the operator never has to pick a
# role — the right compose profiles are enabled automatically.
# detect_gpu — succeed (exit 0) when this host appears to have a GPU, else
# return 1. Two probes, strongest first:
#   1. nvidia-smi is installed and `nvidia-smi -L` exits successfully.
#   2. lspci lists a line mentioning NVIDIA, or a VGA line mentioning AMD
#      (catches cards that are physically present before a driver is installed).
# Every probe is guarded, so a missing tool simply yields "not detected".
detect_gpu() {
  if command -v nvidia-smi &>/dev/null; then
    if nvidia-smi -L &>/dev/null; then
      return 0
    fi
  fi

  # No lspci means there is nothing left to probe with.
  command -v lspci &>/dev/null || return 1

  # grep output is discarded rather than using -q, so grep drains the whole
  # listing and lspci is never cut short mid-write.
  lspci 2>/dev/null | grep -iE 'nvidia|vga.*amd' &>/dev/null || return 1
  return 0
}
# detect_sdi — succeed (exit 0) when lspci lists a Blackmagic or Deltacast
# device, i.e. an SDI capture card appears to be installed; return 1 otherwise,
# including when lspci itself is unavailable.
detect_sdi() {
  local sdi_vendors='blackmagic|deltacast'

  command -v lspci &>/dev/null || return 1

  if ! lspci 2>/dev/null | grep -iE "$sdi_vendors" &>/dev/null; then
    return 1
  fi
  return 0
}
# ── Preflight ────────────────────────────────────────────────────────────────
echo -e "\n${BLD}${CYN}Wild Dragon MAM — Cluster Node Onboarding${NC}\n"
@ -79,6 +114,36 @@ if [[ -z "$NODE_IP" ]]; then
fi
fi
# ── Auto-assign compose profiles from detected hardware ──────────────────────
# The operator does not choose a role. The worker profile is always enabled;
# gpu and capture are appended only when the matching hardware probe succeeds.
# A non-empty PROFILES from the environment is left untouched, and an
# explicitly supplied NODE_ROLE is never replaced (manual override).
HAS_GPU=false
HAS_SDI=false
if detect_gpu; then HAS_GPU=true; fi
if detect_sdi; then HAS_SDI=true; fi

# Human-readable hardware summary for the log line below.
DETECTED_DESC="CPU"
if [[ "$HAS_GPU" == true ]]; then DETECTED_DESC+=", GPU"; fi
if [[ "$HAS_SDI" == true ]]; then DETECTED_DESC+=", SDI capture card"; fi

if [[ -n "$PROFILES" ]]; then
  # Caller forced the profile set; report what was detected but do not apply it.
  info "Detected: $DETECTED_DESC (profiles overridden by env: $PROFILES)"
else
  AUTO_PROFILES="worker"
  if [[ "$HAS_GPU" == true ]]; then AUTO_PROFILES+=" gpu"; fi
  if [[ "$HAS_SDI" == true ]]; then AUTO_PROFILES+=" capture"; fi
  PROFILES="$AUTO_PROFILES"
  info "Detected: $DETECTED_DESC → profiles: $PROFILES"
fi

# Role tag reported to the cluster, derived from hardware unless pinned.
# Precedence: capture card beats GPU (an SDI ingest node is the more specific
# role); plain worker is the fallback.
if [[ -z "$NODE_ROLE_EXPLICIT" ]]; then
  case "${HAS_SDI}:${HAS_GPU}" in
    true:*) NODE_ROLE="capture" ;;
    *:true) NODE_ROLE="gpu" ;;
    *)      NODE_ROLE="worker" ;;
  esac
fi
info "Primary API : $MAM_API_URL"
info "Role : $NODE_ROLE"
info "Agent port : $AGENT_PORT"

View file

@ -1597,23 +1597,15 @@ function Cluster() {
);
}
// AddNodeModal Approach A onboarding wizard. Collects a node name + role,
// mints a one-time auth token via /auth/tokens, and renders a ready-to-paste
// AddNodeModal — Approach A onboarding wizard. Collects a node name, mints a
// one-time auth token via /auth/tokens, and renders a ready-to-paste
// `curl | bash` command that provisions the machine via deploy/onboard-node.sh.
//
// Role → compose PROFILES mapping (see docker-compose.worker.yml):
//   Worker  → "worker"
//   Capture → "worker capture"
//   GPU     → "worker gpu" (worker-l4 service, profiles: [gpu])
//
// Each entry describes one selectable role in the Add Node wizard:
//   id       – value kept in the `role` state and emitted as NODE_ROLE
//   label    – button caption
//   profiles – space-separated string emitted as PROFILES in the generated command
//   desc     – short hint rendered under the label
const ADD_NODE_ROLES = [
{ id: 'worker', label: 'Worker', profiles: 'worker', desc: 'CPU transcode / general jobs' },
{ id: 'capture', label: 'Capture', profiles: 'worker capture', desc: 'SDI / DeckLink ingest' },
{ id: 'gpu', label: 'GPU', profiles: 'worker gpu', desc: 'NVENC-accelerated transcode' },
];
// No role picker: the new node self-detects its hardware (GPU / DeckLink /
// Deltacast) in onboard-node.sh and auto-enables the matching compose profiles
// (worker always; + gpu / + capture when present). Zero manual choice.
function AddNodeModal({ onClose }) {
const [nodeName, setNodeName] = React.useState('');
const [role, setRole] = React.useState('worker');
const [apiUrl, setApiUrl] = React.useState('');
const [info, setInfo] = React.useState(null); // { scriptUrl, branch }
const [command, setCommand] = React.useState(null); // generated string
@ -1631,8 +1623,6 @@ function AddNodeModal({ onClose }) {
.catch(() => {}); // leave apiUrl empty user must fill it before Generate
}, []);
const roleDef = ADD_NODE_ROLES.find(r => r.id === role) || ADD_NODE_ROLES[0];
const generate = async () => {
setError(null);
if (!nodeName.trim()) { setError('Node name is required.'); return; }
@ -1654,8 +1644,7 @@ function AddNodeModal({ onClose }) {
const scriptUrl = (info && info.scriptUrl)
|| 'https://forge.wilddragon.net/zgaetano/wild-dragon/raw/branch/main/deploy/onboard-node.sh';
const cmd =
`curl -sL ${scriptUrl} | NODE_TOKEN=${token} MAM_API_URL=${apiUrl.trim()} ` +
`NODE_ROLE=${role} PROFILES="${roleDef.profiles}" bash`;
`curl -sL ${scriptUrl} | NODE_TOKEN=${token} MAM_API_URL=${apiUrl.trim()} bash`;
setCommand(cmd);
} catch (e) {
setError(e.message || 'Network error');
@ -1688,21 +1677,6 @@ function AddNodeModal({ onClose }) {
value={nodeName} onChange={e => setNodeName(e.target.value)} />
</div>
<div style={{ marginBottom: 14 }}>
<label style={{ display: 'block', fontSize: 11.5, color: 'var(--text-3)', marginBottom: 5 }}>Role</label>
<div style={{ display: 'flex', gap: 6 }}>
{ADD_NODE_ROLES.map(rd => (
<button key={rd.id}
className={'btn sm' + (role === rd.id ? ' primary' : ' ghost')}
style={{ flex: 1, flexDirection: 'column', alignItems: 'flex-start', gap: 2, padding: '8px 10px' }}
onClick={() => setRole(rd.id)}>
<span style={{ fontWeight: 600 }}>{rd.label}</span>
<span style={{ fontSize: 10, opacity: 0.8 }}>{rd.desc}</span>
</button>
))}
</div>
</div>
<div style={{ marginBottom: 4 }}>
<label style={{ display: 'block', fontSize: 11.5, color: 'var(--text-3)', marginBottom: 5 }}>Primary API URL</label>
<input className="field-input mono" style={{ width: '100%', fontSize: 12 }}
@ -1723,6 +1697,9 @@ function AddNodeModal({ onClose }) {
</div>
</div>
<code className="mono" style={{ display: 'block', background: 'var(--bg-2)', padding: 12, borderRadius: 6, fontSize: 11.5, lineHeight: 1.5, wordBreak: 'break-all', whiteSpace: 'pre-wrap' }}>{command}</code>
<div style={{ fontSize: 11, color: 'var(--text-4)', marginTop: 8, lineHeight: 1.5 }}>
Profiles (worker / capture / GPU) are auto-selected from the new machine's detected hardware no need to choose.
</div>
<ol style={{ margin: '12px 0 0', paddingLeft: 18, fontSize: 12, color: 'var(--text-2)', lineHeight: 1.6 }}>
<li>SSH into the fresh Ubuntu machine.</li>
<li>Paste and run this command.</li>