diff --git a/services/node-agent/index.js b/services/node-agent/index.js
index 6d438bf..9f9ac41 100644
--- a/services/node-agent/index.js
+++ b/services/node-agent/index.js
@@ -182,37 +182,122 @@ function sampleCpu() {
 
 // ── Hardware detection ────────────────────────────────────────────────────
 // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
-function detectGpusViaSmi() {
-  // Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
-  // injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
-  // This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
-  // Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
-  const { execFileSync } = require('child_process');
+// Cached GPU info from nvidia-smi — populated once at startup via Docker API.
+// null = not yet probed; [] = probed but no GPUs or no runtime.
+let _gpuCache = null;
+
+// Idle timeout for the log fetch so a stalled Docker daemon cannot hang startup.
+const GPU_PROBE_LOG_TIMEOUT_MS = 5000;
+
+/**
+ * Extract stdout from a Docker multiplexed log stream (non-TTY container).
+ * Every frame is an 8-byte header — [stream type, 0, 0, 0, payload size as
+ * uint32 big-endian] — followed by the payload. The header is binary, so it
+ * is parsed from the raw bytes instead of being regex-stripped from text.
+ * @param {Buffer} buf - raw body of GET /containers/{id}/logs
+ * @returns {string} concatenated stdout payloads decoded as UTF-8
+ */
+function demuxDockerStdout(buf) {
+  const payloads = [];
+  let offset = 0;
+  while (offset + 8 <= buf.length) {
+    const size = buf.readUInt32BE(offset + 4);
+    const start = offset + 8;
+    // Stream type 1 = stdout; frames of any other type are skipped.
+    if (buf[offset] === 1) payloads.push(buf.subarray(start, start + size));
+    offset = start + size;
+  }
+  return Buffer.concat(payloads).toString('utf8');
+}
+
+/**
+ * Probe GPUs once by running nvidia-smi in a throwaway container and store the
+ * parsed result in _gpuCache. Probe errors are caught and logged, leaving
+ * _gpuCache as [] so detectHardware() falls back to the /dev scan.
+ * @returns {Promise<void>}
+ */
+async function probeGpusViaSmi() {
+  // Use Docker API (socket) to run nvidia-smi inside a GPU-enabled container.
+  // The NVIDIA Container Runtime injects nvidia-smi + libs into ubuntu:22.04.
+  // This sidesteps the Alpine/glibc incompatibility in the node-agent image.
   const QUERY = '--query-gpu=index,name,memory.total,driver_version';
   const FMT = '--format=csv,noheader,nounits';
 
+  let containerId;
   try {
-    const out = execFileSync('docker', [
-      'run', '--rm', '--gpus', 'all',
-      'ubuntu:22.04',
-      'nvidia-smi', QUERY, FMT,
-    ], { timeout: 15000, encoding: 'utf8' });
-
-    const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
-    if (!lines.length) return null;
-    return lines.map(line => {
-      const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
-      return {
-        index: parseInt(idx, 10),
-        name: name,
-        memory_mb: parseInt(memMiB, 10),
-        driver: driver,
-        device: `/dev/nvidia${idx}`,
-        type: 'nvidia',
-      };
+    // Create container with nvidia runtime and GPU access
+    const createRes = await dockerApi('POST', '/containers/create', {
+      Image: 'ubuntu:22.04',
+      Cmd: ['nvidia-smi', QUERY, FMT],
+      HostConfig: {
+        AutoRemove: false,
+        Runtime: 'nvidia',
+        DeviceRequests: [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }],
+      },
     });
-  } catch (_) {
-    return null; // nvidia runtime not available — caller uses /dev scan
+    if (createRes.status !== 201) {
+      // No container was created, so there is nothing to run or clean up.
+      console.warn(`[gpu] nvidia-smi probe skipped: container create returned HTTP ${createRes.status}`);
+      _gpuCache = [];
+      return;
+    }
+    containerId = createRes.data.Id;
+
+    await dockerApi('POST', `/containers/${containerId}/start`);
+
+    // Wait for it to finish (poll status)
+    for (let i = 0; i < 20; i++) {
+      await new Promise(r => setTimeout(r, 500));
+      const inspect = await dockerApi('GET', `/containers/${containerId}/json`);
+      if (!inspect.data?.State?.Running) break;
+    }
+
+    // Grab stdout logs as raw bytes — decoding happens after the frames are demuxed
+    const logBuf = await new Promise((resolve, reject) => {
+      const options = {
+        socketPath: '/var/run/docker.sock',
+        path: `/v1.43/containers/${containerId}/logs?stdout=1&stderr=0`,
+        method: 'GET',
+        timeout: GPU_PROBE_LOG_TIMEOUT_MS,
+      };
+      const req = http.request(options, res => {
+        const chunks = [];
+        res.on('data', c => chunks.push(c));
+        res.on('end', () => resolve(Buffer.concat(chunks)));
+        res.on('error', reject);
+      });
+      // 'timeout' only signals idleness; the request must be destroyed explicitly.
+      req.on('timeout', () => req.destroy(new Error('timed out reading container logs')));
+      req.on('error', reject);
+      req.end();
+    });
+
+    const text = demuxDockerStdout(logBuf).trim();
+    const lines = text.split('\n').filter(l => /^\d+,/.test(l.trim()));
+
+    if (lines.length) {
+      _gpuCache = lines.map(line => {
+        const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
+        return {
+          index: parseInt(idx, 10),
+          name: name,
+          memory_mb: parseInt(memMiB, 10),
+          driver: driver,
+          device: `/dev/nvidia${idx}`,
+          type: 'nvidia',
+        };
+      });
+      console.log(`[gpu] detected ${_gpuCache.length} GPU(s) via nvidia-smi: ${_gpuCache.map(g => g.name).join(', ')}`);
+    } else {
+      _gpuCache = [];
+    }
+  } catch (err) {
+    console.warn('[gpu] nvidia-smi probe failed:', err.message);
+    _gpuCache = [];
+  } finally {
+    if (containerId) {
+      await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});
+    }
   }
 }
 
@@ -225,10 +310,9 @@ function detectHardware() {
       capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
     }
   } else {
-    // Try nvidia-smi first for full GPU details; fall back to /dev file scan
-    const smiGpus = detectGpusViaSmi();
-    if (smiGpus !== null) {
-      capabilities.gpus = smiGpus;
+    // Use nvidia-smi cache if populated, otherwise fall back to /dev file scan
+    if (_gpuCache !== null && _gpuCache.length > 0) {
+      capabilities.gpus = _gpuCache;
     } else {
       for (let i = 0; i < 16; i++) {
         try {
@@ -304,8 +388,12 @@ async function heartbeat() {
   }
 }
 
-heartbeat();
-setInterval(heartbeat, HEARTBEAT_MS);
+// Probe GPU info once at startup (async, before first heartbeat).
+// _gpuCache is populated here; heartbeats use the cached value.
+probeGpusViaSmi().then(() => {
+  heartbeat();
+  setInterval(heartbeat, HEARTBEAT_MS);
+});
 
 // ── HTTP server ───────────────────────────────────────────────────────────
 const server = http.createServer((req, res) => {