feat(cluster): auto-assign node profiles from detected hardware
onboard-node.sh auto-detects GPU (nvidia-smi/lspci) and SDI capture cards (blackmagic/deltacast) and computes PROFILES (worker [+gpu] [+capture]) automatically; explicit NODE_ROLE/PROFILES still override. Add Node wizard drops the role picker — node self-configures from hardware. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
d908c0c056
commit
f2d5f5aa16
2 changed files with 76 additions and 34 deletions
|
|
@ -16,11 +16,12 @@
|
|||
# Environment variables:
|
||||
# MAM_API_URL REQUIRED Primary MAM API base URL
|
||||
# NODE_TOKEN API bearer token (required if AUTH_ENABLED=true)
|
||||
# NODE_ROLE Role tag reported to the cluster (default: worker)
|
||||
# NODE_ROLE Role tag reported to the cluster (default: auto-detect)
|
||||
# NODE_IP Override the LAN IP reported back (default: auto-detect)
|
||||
# AGENT_PORT Host port for the node agent (default: 7436)
|
||||
# INSTALL_DIR Where to clone/find the repo (default: /opt/wild-dragon)
|
||||
# PROFILES Extra compose profiles, space-sep e.g. "worker capture"
|
||||
# PROFILES Compose profiles, space-sep (default: auto-detect from hardware)
|
||||
# Override only to force, e.g. "worker capture"
|
||||
# BMD_MODEL DeckLink card model name (e.g. "DeckLink Duo 2")
|
||||
# REPO_URL Override the Forgejo clone URL
|
||||
# =============================================================================
|
||||
|
|
@ -32,6 +33,9 @@ REPO_URL="${REPO_URL:-https://forge.wilddragon.net/zgaetano/wild-dragon.git}"
|
|||
INSTALL_DIR="${INSTALL_DIR:-/opt/wild-dragon}"
|
||||
MAM_API_URL="${MAM_API_URL:-}"
|
||||
NODE_TOKEN="${NODE_TOKEN:-}"
|
||||
# Track whether the caller pinned NODE_ROLE explicitly (manual override) vs.
|
||||
# us defaulting it — so auto-detection only fills in an *unset* role.
|
||||
[[ -n "${NODE_ROLE:-}" ]] && NODE_ROLE_EXPLICIT=1 || NODE_ROLE_EXPLICIT=""
|
||||
NODE_ROLE="${NODE_ROLE:-worker}"
|
||||
NODE_IP="${NODE_IP:-}"
|
||||
AGENT_PORT="${AGENT_PORT:-7436}"
|
||||
|
|
@ -65,6 +69,37 @@ detect_lan_ip() {
|
|||
echo "$ip"
|
||||
}
|
||||
|
||||
# ── Auto-detect hardware ─────────────────────────────────────────────────────
|
||||
# Mirror detect_lan_ip's style: best-effort, guard every probe with `|| true`
|
||||
# so a missing nvidia-smi/lspci never aborts under `set -euo pipefail`. The
|
||||
# node self-describes its hardware here so the operator never has to pick a
|
||||
# role — the right compose profiles are enabled automatically.
|
||||
|
||||
# Report whether this host has a GPU: exit status 0 = yes, 1 = no.
# A working `nvidia-smi -L` is the strong signal. When nvidia-smi is missing
# or fails, fall back to scanning `lspci` output for any NVIDIA device or an
# AMD VGA controller, so a card that is physically present but has no driver
# installed yet still counts.
detect_gpu() {
  if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then
    return 0
  fi

  # Without lspci there is nothing left to probe.
  command -v lspci >/dev/null 2>&1 || return 1

  # grep's output is discarded by redirection (not -q), so it consumes all of
  # lspci's output before the pipeline status is evaluated.
  if lspci 2>/dev/null | grep -iE 'nvidia|vga.*amd' >/dev/null 2>&1; then
    return 0
  fi
  return 1
}
|
||||
|
||||
# Report whether an SDI capture card is installed: exit status 0 = yes, 1 = no.
# The only probe is an `lspci` scan for a Blackmagic (DeckLink) or Deltacast
# device; a host without lspci is reported as having no card.
detect_sdi() {
  command -v lspci >/dev/null 2>&1 || return 1

  # Case-insensitive vendor match; grep's output is discarded by redirection.
  if lspci 2>/dev/null | grep -iE 'blackmagic|deltacast' >/dev/null 2>&1; then
    return 0
  fi
  return 1
}
|
||||
|
||||
# ── Preflight ────────────────────────────────────────────────────────────────
|
||||
echo -e "\n${BLD}${CYN}Wild Dragon MAM — Cluster Node Onboarding${NC}\n"
|
||||
|
||||
|
|
@ -79,6 +114,36 @@ if [[ -z "$NODE_IP" ]]; then
|
|||
fi
|
||||
fi
|
||||
|
||||
# ── Auto-assign compose profiles from detected hardware ──────────────────────
# The "worker" profile is always enabled; "gpu" and "capture" are appended only
# when the matching hardware probe succeeds. A non-empty PROFILES from the
# environment is used untouched, and NODE_ROLE is derived from hardware only
# when the caller did not pin it (NODE_ROLE_EXPLICIT is empty).
HAS_GPU=false
HAS_SDI=false
# Probes run as `if` conditions, so a failing probe never aborts the script.
if detect_gpu; then HAS_GPU=true; fi
if detect_sdi; then HAS_SDI=true; fi

# Human-readable hardware summary used in the log lines below.
DETECTED_DESC="CPU"
if [[ "$HAS_GPU" == true ]]; then
  DETECTED_DESC+=", GPU"
fi
if [[ "$HAS_SDI" == true ]]; then
  DETECTED_DESC+=", SDI capture card"
fi

if [[ -n "$PROFILES" ]]; then
  # Manual override: report what was detected but leave PROFILES alone.
  info "Detected: $DETECTED_DESC (profiles overridden by env: $PROFILES)"
else
  AUTO_PROFILES="worker"
  if [[ "$HAS_GPU" == true ]]; then
    AUTO_PROFILES+=" gpu"
  fi
  if [[ "$HAS_SDI" == true ]]; then
    AUTO_PROFILES+=" capture"
  fi
  PROFILES="$AUTO_PROFILES"
  info "Detected: $DETECTED_DESC → profiles: $PROFILES"
fi

# Role tag when not pinned by the caller: a capture card takes precedence over
# a GPU, and a host with neither is a plain worker.
if [[ -z "$NODE_ROLE_EXPLICIT" ]]; then
  case "$HAS_SDI:$HAS_GPU" in
    true:*)     NODE_ROLE="capture" ;;
    false:true) NODE_ROLE="gpu" ;;
    *)          NODE_ROLE="worker" ;;
  esac
fi
|
||||
|
||||
info "Primary API : $MAM_API_URL"
|
||||
info "Role : $NODE_ROLE"
|
||||
info "Agent port : $AGENT_PORT"
|
||||
|
|
|
|||
|
|
@ -1597,23 +1597,15 @@ function Cluster() {
|
|||
);
|
||||
}
|
||||
|
||||
// AddNodeModal — Approach A onboarding wizard. Collects a node name + role,
|
||||
// mints a one-time auth token via /auth/tokens, and renders a ready-to-paste
|
||||
// AddNodeModal — Approach A onboarding wizard. Collects a node name, mints a
|
||||
// one-time auth token via /auth/tokens, and renders a ready-to-paste
|
||||
// `curl … | bash` command that provisions the machine via deploy/onboard-node.sh.
|
||||
//
|
||||
// Role → compose PROFILES mapping (see docker-compose.worker.yml):
|
||||
// Worker → "worker"
|
||||
// Capture → "worker capture"
|
||||
// GPU → "worker gpu" (worker-l4 service, profiles: [gpu])
|
||||
const ADD_NODE_ROLES = [
|
||||
{ id: 'worker', label: 'Worker', profiles: 'worker', desc: 'CPU transcode / general jobs' },
|
||||
{ id: 'capture', label: 'Capture', profiles: 'worker capture', desc: 'SDI / DeckLink ingest' },
|
||||
{ id: 'gpu', label: 'GPU', profiles: 'worker gpu', desc: 'NVENC-accelerated transcode' },
|
||||
];
|
||||
|
||||
// No role picker: the new node self-detects its hardware (GPU / DeckLink /
|
||||
// Deltacast) in onboard-node.sh and auto-enables the matching compose profiles
|
||||
// (worker always; + gpu / + capture when present). Zero manual choice.
|
||||
function AddNodeModal({ onClose }) {
|
||||
const [nodeName, setNodeName] = React.useState('');
|
||||
const [role, setRole] = React.useState('worker');
|
||||
const [apiUrl, setApiUrl] = React.useState('');
|
||||
const [info, setInfo] = React.useState(null); // { scriptUrl, branch }
|
||||
const [command, setCommand] = React.useState(null); // generated string
|
||||
|
|
@ -1631,8 +1623,6 @@ function AddNodeModal({ onClose }) {
|
|||
.catch(() => {}); // leave apiUrl empty → user must fill it before Generate
|
||||
}, []);
|
||||
|
||||
const roleDef = ADD_NODE_ROLES.find(r => r.id === role) || ADD_NODE_ROLES[0];
|
||||
|
||||
const generate = async () => {
|
||||
setError(null);
|
||||
if (!nodeName.trim()) { setError('Node name is required.'); return; }
|
||||
|
|
@ -1654,8 +1644,7 @@ function AddNodeModal({ onClose }) {
|
|||
const scriptUrl = (info && info.scriptUrl)
|
||||
|| 'https://forge.wilddragon.net/zgaetano/wild-dragon/raw/branch/main/deploy/onboard-node.sh';
|
||||
const cmd =
|
||||
`curl -sL ${scriptUrl} | NODE_TOKEN=${token} MAM_API_URL=${apiUrl.trim()} ` +
|
||||
`NODE_ROLE=${role} PROFILES="${roleDef.profiles}" bash`;
|
||||
`curl -sL ${scriptUrl} | NODE_TOKEN=${token} MAM_API_URL=${apiUrl.trim()} bash`;
|
||||
setCommand(cmd);
|
||||
} catch (e) {
|
||||
setError(e.message || 'Network error');
|
||||
|
|
@ -1688,21 +1677,6 @@ function AddNodeModal({ onClose }) {
|
|||
value={nodeName} onChange={e => setNodeName(e.target.value)} />
|
||||
</div>
|
||||
|
||||
<div style={{ marginBottom: 14 }}>
|
||||
<label style={{ display: 'block', fontSize: 11.5, color: 'var(--text-3)', marginBottom: 5 }}>Role</label>
|
||||
<div style={{ display: 'flex', gap: 6 }}>
|
||||
{ADD_NODE_ROLES.map(rd => (
|
||||
<button key={rd.id}
|
||||
className={'btn sm' + (role === rd.id ? ' primary' : ' ghost')}
|
||||
style={{ flex: 1, flexDirection: 'column', alignItems: 'flex-start', gap: 2, padding: '8px 10px' }}
|
||||
onClick={() => setRole(rd.id)}>
|
||||
<span style={{ fontWeight: 600 }}>{rd.label}</span>
|
||||
<span style={{ fontSize: 10, opacity: 0.8 }}>{rd.desc}</span>
|
||||
</button>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div style={{ marginBottom: 4 }}>
|
||||
<label style={{ display: 'block', fontSize: 11.5, color: 'var(--text-3)', marginBottom: 5 }}>Primary API URL</label>
|
||||
<input className="field-input mono" style={{ width: '100%', fontSize: 12 }}
|
||||
|
|
@ -1723,6 +1697,9 @@ function AddNodeModal({ onClose }) {
|
|||
</div>
|
||||
</div>
|
||||
<code className="mono" style={{ display: 'block', background: 'var(--bg-2)', padding: 12, borderRadius: 6, fontSize: 11.5, lineHeight: 1.5, wordBreak: 'break-all', whiteSpace: 'pre-wrap' }}>{command}</code>
|
||||
<div style={{ fontSize: 11, color: 'var(--text-4)', marginTop: 8, lineHeight: 1.5 }}>
|
||||
Profiles (worker / capture / GPU) are auto-selected from the new machine's detected hardware — no need to choose.
|
||||
</div>
|
||||
<ol style={{ margin: '12px 0 0', paddingLeft: 18, fontSize: 12, color: 'var(--text-2)', lineHeight: 1.6 }}>
|
||||
<li>SSH into the fresh Ubuntu machine.</li>
|
||||
<li>Paste and run this command.</li>
|
||||
|
|
|
|||
Loading…
Reference in a new issue