nvidia-smi bind-mount failed due to Alpine vs Ubuntu glibc incompatibility. Fix: nsenter --mount=/proc/1/ns/mnt -- nvidia-smi runs in the host's mount namespace where glibc and all NVIDIA driver libs are present. Requires pid: host in docker-compose.worker.yml (already has network: host). nsenter is provided by util-linux in Alpine — already in the image. Falls back to direct nvidia-smi call (for glibc-based containers), then to /dev/nvidia* file scan if all attempts fail.
356 lines
13 KiB
JavaScript
356 lines
13 KiB
JavaScript
import { execFileSync } from 'child_process';
import fs from 'fs';
import http from 'http';
import os from 'os';
|
|
|
|
// ── Runtime configuration (environment variables with built-in defaults) ──

// Base URL of the primary MAM API; one trailing slash is stripped so paths
// can be appended with a leading "/".
const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, '');

// Bearer token sent with every heartbeat; empty string means "send no
// Authorization header".
const NODE_TOKEN = process.env.NODE_TOKEN || '';

// Role string reported in heartbeats and by /health.
const NODE_ROLE = process.env.NODE_ROLE || 'worker';

// TCP port this agent's HTTP server listens on.
const AGENT_PORT = Number.parseInt(process.env.AGENT_PORT || '7436', 10);

// Delay between heartbeats, in milliseconds.
const HEARTBEAT_MS = Number.parseInt(process.env.HEARTBEAT_MS || '30000', 10);

// Host directory bind-mounted into capture containers as /live.
const LIVE_DIR = process.env.LIVE_DIR || '/mnt/NVME/MAM/wild-dragon-live';

// Agent version reported in heartbeats, /health and the startup banner.
const VERSION = '1.2.0';
|
|
|
|
// Pick the host's LAN IP. Inside a bridge-mode container,
// os.networkInterfaces() returns the container's docker-bridge IP (172.x),
// not the host's LAN address. Two strategies:
//   1. honour an explicit NODE_IP override (set by onboard-node.sh)
//   2. filter out interfaces that obviously belong to container/virtual
//      bridges and down-rank the docker bridge range so real LAN IPs win
//
// Combine with `network_mode: host` in docker-compose.worker.yml for the
// most reliable result — that lets os.networkInterfaces() see the host's
// real adapters directly.

/**
 * Resolve the IPv4 address this node should advertise.
 *
 * @returns {string|null} NODE_IP when set; otherwise the first external IPv4
 *   address outside 172.16.0.0/12, then the first inside that range, or null
 *   when no candidate interface exists.
 */
function getIp() {
  if (process.env.NODE_IP) return process.env.NODE_IP;

  const SKIP_IFACE = /^(lo|docker\d*|br-|veth|cni|flannel|cali|tun|tap|virbr|kube)/i;
  const isContainerBridge = (ip) => /^172\.(1[6-9]|2\d|3[01])\./.test(ip);
  // Node 18.0–18.3 reported `family` as the number 4 rather than 'IPv4';
  // accept both so those runtimes still find an address.
  const isIPv4 = (family) => family === 'IPv4' || family === 4;

  const lanAddresses = [];
  const bridgeAddresses = [];
  for (const [name, addrs] of Object.entries(os.networkInterfaces())) {
    if (SKIP_IFACE.test(name)) continue;
    for (const addr of addrs || []) {
      if (!isIPv4(addr.family) || addr.internal) continue;
      const bucket = isContainerBridge(addr.address) ? bridgeAddresses : lanAddresses;
      bucket.push(addr.address);
    }
  }

  // Discovery order is preserved within each bucket; LAN addresses win.
  return lanAddresses[0] ?? bridgeAddresses[0] ?? null;
}
|
|
|
|
// ── Docker API helper (talks to local /var/run/docker.sock) ───────────────

/**
 * Issue one request to the Docker Engine API (v1.43) over the local unix socket.
 *
 * @param {string} method - HTTP method, e.g. 'GET', 'POST', 'DELETE'.
 * @param {string} path - API path beginning with '/', appended to '/v1.43'.
 * @param {object|null} [body=null] - Optional payload, sent as JSON.
 * @returns {Promise<{status: number, data: any}>} Resolves with the HTTP status
 *   and the parsed JSON body ({} for an empty body, the raw text when the body
 *   is not valid JSON). Rejects on request or response stream errors.
 */
function dockerApi(method, path, body = null) {
  return new Promise((resolve, reject) => {
    const options = {
      socketPath: '/var/run/docker.sock',
      path: `/v1.43${path}`,
      method,
      headers: { 'Content-Type': 'application/json' },
    };
    const req = http.request(options, (res) => {
      // Decode as a stream so a multi-byte UTF-8 character split across two
      // chunks is reassembled instead of becoming U+FFFD replacement chars.
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk) => { data += chunk; });
      // Without this, a response that errors mid-stream never settles the promise.
      res.on('error', reject);
      res.on('end', () => {
        try {
          resolve({ status: res.statusCode, data: data ? JSON.parse(data) : {} });
        } catch {
          // Non-JSON body (e.g. plain-text error): hand back the raw text.
          resolve({ status: res.statusCode, data });
        }
      });
    });
    req.on('error', reject);
    if (body) req.write(JSON.stringify(body));
    req.end();
  });
}
|
|
|
|
/**
 * Read an entire request body and parse it as JSON.
 *
 * @param {import('http').IncomingMessage} req - Readable request stream.
 * @returns {Promise<any>} The parsed body; an empty body yields {}.
 *   Rejects with Error('Invalid JSON') when parsing fails, or with the
 *   stream's own error.
 */
function readBody(req) {
  return new Promise((resolve, reject) => {
    // Decode as a stream so a multi-byte UTF-8 character that straddles two
    // chunks is not corrupted by per-chunk Buffer-to-string conversion.
    req.setEncoding('utf8');
    let data = '';
    req.on('data', (chunk) => { data += chunk; });
    req.on('end', () => {
      try {
        resolve(JSON.parse(data || '{}'));
      } catch {
        reject(new Error('Invalid JSON'));
      }
    });
    req.on('error', reject);
  });
}
|
|
|
|
/**
 * Send `body` as a JSON response and end it.
 *
 * @param {import('http').ServerResponse} res - Response to write to.
 * @param {number} status - HTTP status code.
 * @param {any} body - Value serialised with JSON.stringify.
 */
function jsonResponse(res, status, body) {
  const headers = { 'Content-Type': 'application/json' };
  res.writeHead(status, headers);
  const serialized = JSON.stringify(body);
  res.end(serialized);
}
|
|
|
|
// ── Sidecar: spawn a capture container on this node ───────────────────────

/**
 * Create and start a capture container through the local Docker API.
 *
 * Responds 201 `{ containerId, capturePort }` on success, 502 when Docker
 * refuses to create or start the container, and 500 on any thrown error.
 * The container runs privileged on the host network with LIVE_DIR mounted
 * at /live; SDI sources additionally get /dev/blackmagic.
 *
 * @param {object} body - Parsed request body. Optional fields: `image`,
 *   `env` (array of "KEY=value" strings), `capturePort`, `sourceType`.
 * @param {import('http').ServerResponse} res - Response to answer on.
 * @returns {Promise<void>}
 */
async function handleSidecarStart(body, res) {
  try {
    const {
      image = 'wild-dragon-capture:latest',
      env = [],
      capturePort = 3001,
      sourceType = 'sdi',
    } = body;

    const binds = [`${LIVE_DIR}:/live`];
    if (sourceType === 'sdi') binds.unshift('/dev/blackmagic:/dev/blackmagic');

    const spec = {
      Image: image,
      Env: [...env, `PORT=${capturePort}`],
      HostConfig: {
        NetworkMode: 'host',
        Privileged: true,
        Binds: binds,
      },
    };

    const createRes = await dockerApi('POST', '/containers/create', spec);
    if (createRes.status !== 201) {
      return jsonResponse(res, 502, { error: 'Failed to create container', details: createRes.data });
    }

    const containerId = createRes.data.Id;
    const removeContainer = () =>
      dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});

    let startRes;
    try {
      startRes = await dockerApi('POST', `/containers/${containerId}/start`);
    } catch (err) {
      // The container already exists; remove it so a failed start request
      // (socket error etc.) does not leave an orphan behind.
      await removeContainer();
      throw err;
    }
    if (startRes.status !== 204) {
      await removeContainer();
      return jsonResponse(res, 502, { error: 'Failed to start container', details: startRes.data });
    }

    jsonResponse(res, 201, { containerId, capturePort });
  } catch (err) {
    jsonResponse(res, 500, { error: err.message });
  }
}
|
|
|
|
/**
 * Stop and force-remove a sidecar container, best-effort.
 *
 * Failures of either Docker call are ignored, so the response is
 * 200 `{ ok: true }` unless something else throws (then 500).
 *
 * @param {string} containerId - Docker container id or name.
 * @param {import('http').ServerResponse} res - Response to answer on.
 * @returns {Promise<void>}
 */
async function handleSidecarStop(containerId, res) {
  const ignoreFailure = () => {};
  try {
    const stopPath = `/containers/${containerId}/stop`;
    await dockerApi('POST', stopPath).catch(ignoreFailure);

    const removePath = `/containers/${containerId}?force=true`;
    await dockerApi('DELETE', removePath).catch(ignoreFailure);

    jsonResponse(res, 200, { ok: true });
  } catch (err) {
    jsonResponse(res, 500, { error: err.message });
  }
}
|
|
|
|
/**
 * Report whether a sidecar container is running and, if so, its live
 * capture status.
 *
 * Responds 200 `{ running: false }` when Docker's inspect call is not 200,
 * otherwise 200 `{ running, startedAt, live }`, where `live` is the JSON
 * from the container's /capture/status endpoint or null when it cannot be
 * fetched. Any other thrown error yields a 500.
 *
 * @param {string} containerId - Docker container id or name.
 * @param {import('http').ServerResponse} res - Response to answer on.
 * @returns {Promise<void>}
 */
async function handleSidecarStatus(containerId, res) {
  try {
    const inspection = await dockerApi('GET', `/containers/${containerId}/json`);
    if (inspection.status !== 200) {
      return jsonResponse(res, 200, { running: false });
    }

    const details = inspection.data;
    const running = details.State?.Running || false;
    const startedAt = details.State?.StartedAt;

    // The capture port is recovered from the container's own PORT env entry
    // so we know where to poll; 3001 is assumed when none is present.
    const envList = details.Config?.Env || [];
    const portEntry = envList.find((entry) => entry.startsWith('PORT='));
    let capturePort = 3001;
    if (portEntry) {
      capturePort = parseInt(portEntry.split('=')[1], 10);
    }

    let live = null;
    if (running) {
      try {
        const statusUrl = `http://127.0.0.1:${capturePort}/capture/status`;
        const probe = await fetch(statusUrl, { signal: AbortSignal.timeout(2000) });
        if (probe.ok) live = await probe.json();
      } catch {
        // Capture service unreachable or slow: leave `live` as null.
      }
    }

    jsonResponse(res, 200, { running, startedAt, live });
  } catch (err) {
    jsonResponse(res, 500, { error: err.message });
  }
}
|
|
|
|
// ── CPU sampling (500ms window) ───────────────────────────────────────────

/**
 * Measure overall CPU utilisation across all cores over a 500 ms window.
 *
 * @returns {Promise<number>} Busy percentage rounded to two decimals,
 *   or 0 when no CPU time elapsed between the two samples.
 */
async function sampleCpu() {
  const before = os.cpus();
  await new Promise((done) => setTimeout(done, 500));
  const after = os.cpus();

  let idleDelta = 0;
  let totalDelta = 0;
  after.forEach((core, coreIndex) => {
    for (const [kind, ticks] of Object.entries(core.times)) {
      // A core missing from the first sample counts from zero.
      const delta = ticks - (before[coreIndex]?.times[kind] ?? 0);
      totalDelta += delta;
      if (kind === 'idle') idleDelta += delta;
    }
  });

  if (!(totalDelta > 0)) return 0;
  const busyFraction = 1 - idleDelta / totalDelta;
  return Math.round(busyFraction * 10000) / 100;
}
|
|
|
|
// ── Hardware detection ────────────────────────────────────────────────────
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped

/**
 * Query NVIDIA GPUs through nvidia-smi.
 *
 * Tries, in order:
 *   1. nvidia-smi inside the host's mount namespace via nsenter, so it uses
 *      the host's glibc and NVIDIA driver libs (not the container's Alpine
 *      libc). Requires `pid: host` in docker-compose so /proc/1/ns/mnt is
 *      reachable.
 *   2. nvidia-smi called directly (works if the container has glibc).
 *
 * @returns {Array<{index: number, name: string, memory_mb: number,
 *   driver: string, device: string, type: string}>|null} One record per GPU
 *   from the first attempt that succeeds, or null when every attempt fails
 *   (the caller then falls back to a /dev scan).
 */
function detectGpusViaSmi() {
  // `execFileSync` comes from the module-level child_process import: this file
  // is an ES module, where `require` is not defined.
  const QUERY = '--query-gpu=index,name,memory.total,driver_version';
  const FMT = '--format=csv,noheader,nounits';

  const attempts = [
    // nsenter into host mount ns (preferred — uses host glibc)
    () => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT],
      { timeout: 8000, encoding: 'utf8' }),
    // Direct call (works if container has compatible glibc)
    () => execFileSync('nvidia-smi', [QUERY, FMT],
      { timeout: 5000, encoding: 'utf8' }),
  ];

  for (const attempt of attempts) {
    try {
      const out = attempt();
      // One CSV row per GPU: index, name, memory.total (MiB), driver_version.
      return out.trim().split('\n').filter(Boolean).map((line) => {
        const [idx, name, memMiB, driver] = line.split(',').map((s) => s.trim());
        return {
          index: parseInt(idx, 10),
          name,
          memory_mb: parseInt(memMiB, 10),
          driver,
          device: `/dev/nvidia${idx}`,
          type: 'nvidia',
        };
      });
    } catch (_) {
      // Best-effort probe: missing binary, non-zero exit or timeout just
      // means "try the next strategy".
    }
  }
  return null; // all attempts failed — caller falls back to /dev scan
}
|
|
|
|
/**
 * Build the node's hardware capability report.
 *
 * GPUs: GPU_COUNT (>= 0) forces that many placeholder entries; otherwise
 * nvidia-smi results are used, falling back to probing /dev/nvidia0..15 and
 * stopping at the first missing device.
 * Blackmagic: BMD_COUNT (>= 0) forces that many dvN entries; otherwise every
 * entry of /dev/blackmagic is listed (none if the directory is unreadable).
 *
 * @returns {{gpus: object[], blackmagic: object[], blackmagic_model?: string}}
 */
function detectHardware() {
  const placeholderGpu = (index) => ({ device: `/dev/nvidia${index}`, type: 'nvidia', index });
  const deviceExists = (path) => {
    try {
      fs.accessSync(path, fs.constants.F_OK);
      return true;
    } catch (_) {
      return false;
    }
  };

  let gpus = [];
  const gpuOverride = parseInt(process.env.GPU_COUNT || '-1', 10);
  if (gpuOverride >= 0) {
    gpus = Array.from({ length: gpuOverride }, (_, index) => placeholderGpu(index));
  } else {
    // Try nvidia-smi first for full GPU details; fall back to /dev file scan
    const smiGpus = detectGpusViaSmi();
    if (smiGpus !== null) {
      gpus = smiGpus;
    } else {
      let index = 0;
      while (index < 16 && deviceExists(`/dev/nvidia${index}`)) {
        gpus.push(placeholderGpu(index));
        index += 1;
      }
    }
  }

  let blackmagic = [];
  const bmdOverride = parseInt(process.env.BMD_COUNT || '-1', 10);
  if (bmdOverride >= 0) {
    blackmagic = Array.from(
      { length: bmdOverride },
      (_, index) => ({ device: `/dev/blackmagic/dv${index}`, index }),
    );
  } else {
    try {
      blackmagic = fs
        .readdirSync('/dev/blackmagic')
        .map((entry, index) => ({ device: `/dev/blackmagic/${entry}`, index }));
    } catch (_) {
      // Directory absent or unreadable: report no cards.
    }
  }

  const capabilities = { gpus, blackmagic };

  // Best-effort model name from BMD_MODEL env (set manually) — used by the UI
  // to render the correct card layout (Duo 2, Quad 2, Mini Recorder, ...).
  if (process.env.BMD_MODEL) capabilities.blackmagic_model = process.env.BMD_MODEL;

  return capabilities;
}
|
|
|
|
// ── Heartbeat ─────────────────────────────────────────────────────────────

/**
 * Collect host metrics and POST them to the primary's
 * /api/v1/cluster/heartbeat endpoint.
 *
 * Logs one "[hb] ..." line per attempt: a summary on success, the HTTP
 * status and the first 120 characters of the body on a non-2xx reply, or
 * the error message on failure. Never rejects: the callers below invoke it
 * without a rejection handler, so metric collection is guarded by the same
 * try/catch as the network call.
 *
 * @returns {Promise<void>}
 */
async function heartbeat() {
  try {
    const cpu_usage = await sampleCpu();
    const totalMem = os.totalmem();
    const freeMem = os.freemem();
    const ip_address = getIp();
    const capabilities = detectHardware();

    const payload = {
      hostname: os.hostname(),
      ip_address,
      role: NODE_ROLE,
      version: VERSION,
      // Fall back to the hostname when no IP could be resolved.
      api_url: `http://${ip_address || os.hostname()}:${AGENT_PORT}`,
      cpu_usage,
      mem_used_mb: Math.round((totalMem - freeMem) / 1048576),
      mem_total_mb: Math.round(totalMem / 1048576),
      capabilities,
    };

    const headers = { 'Content-Type': 'application/json' };
    if (NODE_TOKEN) headers['Authorization'] = `Bearer ${NODE_TOKEN}`;

    const res = await fetch(`${MAM_API_URL}/api/v1/cluster/heartbeat`, {
      method: 'POST',
      headers,
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(8000),
    });
    if (res.ok) {
      const gpuStr = capabilities.gpus.length ? ` gpu=${capabilities.gpus.length}` : '';
      const bmdStr = capabilities.blackmagic.length ? ` bmd=${capabilities.blackmagic.length}` : '';
      process.stdout.write(
        `[hb] ${payload.hostname} ip=${ip_address || '?'} cpu=${cpu_usage}% ` +
        `mem=${payload.mem_used_mb}/${payload.mem_total_mb}MB${gpuStr}${bmdStr}\n`
      );
    } else {
      const txt = await res.text().catch(() => '');
      console.error(`[hb] ${res.status} ${txt.slice(0, 120)}`);
    }
  } catch (err) {
    console.error(`[hb] failed — ${err.message}`);
  }
}
|
|
|
|
// Report once at startup, then again every HEARTBEAT_MS milliseconds.
heartbeat();
setInterval(() => heartbeat(), HEARTBEAT_MS);
|
|
|
|
// ── HTTP server ───────────────────────────────────────────────────────────

// Routes:
//   GET    /health               → agent identity and uptime
//   POST   /sidecar/start        → create + start a capture container
//   DELETE /sidecar/:id          → stop + remove a container
//   GET    /sidecar/:id/status   → container state and live capture status
// Anything else gets an empty 404.
const server = http.createServer((req, res) => {
  let pathname;
  try {
    ({ pathname } = new URL(req.url, 'http://localhost'));
  } catch {
    // A request target such as "//" makes `new URL` throw; answer 400
    // instead of letting the exception take down the whole agent.
    res.writeHead(400);
    return res.end();
  }

  if (req.method === 'GET' && pathname === '/health') {
    jsonResponse(res, 200, {
      ok: true,
      hostname: os.hostname(),
      uptime: Math.round(process.uptime()),
      version: VERSION,
      role: NODE_ROLE,
      ip: getIp(),
    });

  } else if (req.method === 'POST' && pathname === '/sidecar/start') {
    readBody(req)
      .then(body => handleSidecarStart(body, res))
      .catch(() => jsonResponse(res, 400, { error: 'Invalid request body' }));

  } else if (req.method === 'DELETE' && pathname.startsWith('/sidecar/')) {
    const id = pathname.slice('/sidecar/'.length);
    // Only a single path segment is a valid container id.
    if (!id || id.includes('/')) { res.writeHead(404); return res.end(); }
    handleSidecarStop(id, res);

  } else if (req.method === 'GET' && /^\/sidecar\/[^/]+\/status$/.test(pathname)) {
    const id = pathname.slice('/sidecar/'.length, -'/status'.length);
    handleSidecarStatus(id, res);

  } else {
    res.writeHead(404);
    res.end();
  }
});
|
|
|
|
// Start accepting connections and print a one-time startup banner.
server.listen(AGENT_PORT, () => {
  const banner = [
    `wild-dragon-node-agent v${VERSION}`,
    ` Listening :${AGENT_PORT}`,
    ` Host IP ${getIp() || '(unresolved)'}`,
    ` Primary ${MAM_API_URL}`,
    ` Role ${NODE_ROLE}`,
    ` Heartbeat every ${HEARTBEAT_MS / 1000}s`,
  ];
  for (const line of banner) {
    console.log(line);
  }
});
|