fix(node-agent): probe GPU via Docker API async at startup, cache result

Replaced sync execFileSync('docker') approach (no docker CLI in container)
with async Docker socket HTTP API calls:
- POST /containers/create with nvidia runtime + DeviceRequests
- POST /containers/:id/start
- Poll GET /containers/:id/json until the container is no longer running
- GET /containers/:id/logs, strip the 8-byte stream-frame headers, parse the CSV output

probeGpusViaSmi() runs once at startup before the first heartbeat.
Result cached in _gpuCache; detectHardware() reads cache on every heartbeat.
Falls back to the /dev/nvidia* scan if the probe fails or the NVIDIA runtime is unavailable.
This commit is contained in:
Zac Gaetano 2026-05-26 18:28:03 +00:00
parent 558c18e417
commit a6f045b3d7

View file

@ -182,37 +182,84 @@ function sampleCpu() {
// ── Hardware detection ────────────────────────────────────────────────────
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
/**
 * Parse `nvidia-smi --query-gpu=index,name,memory.total,driver_version
 * --format=csv,noheader,nounits` output into GPU descriptors.
 *
 * Lines that do not start with "<digits>," are ignored, so stray banner or
 * error text in the output does not produce bogus entries.
 *
 * @param {string} text - Raw nvidia-smi CSV output.
 * @returns {Array<{index: number, name: string, memory_mb: number,
 *   driver: string, device: string, type: string}>} One entry per GPU row;
 *   empty when no row matched.
 */
function parseSmiCsv(text) {
  return text
    .split('\n')
    .map(row => row.trim())
    .filter(row => /^\d+,/.test(row))
    .map(row => {
      const [idx, name, memMiB, driver] = row.split(',').map(s => s.trim());
      return {
        index: Number.parseInt(idx, 10),
        name,
        memory_mb: Number.parseInt(memMiB, 10),
        driver,
        device: `/dev/nvidia${idx}`,
        type: 'nvidia',
      };
    });
}

/**
 * Extract the stdout payload from a Docker multiplexed log stream.
 *
 * Each frame is an 8-byte header (byte 0 = stream id, bytes 1-3 = zero,
 * bytes 4-7 = big-endian payload length) followed by the payload. The frames
 * are walked at the byte level: stripping headers with a regex on a decoded
 * string breaks whenever a length byte is 0x0A/0x0D (`.` does not match line
 * terminators) or the payload itself contains a low control character.
 *
 * If the buffer does not begin with a plausible frame header it is returned
 * decoded as-is, on the assumption that it is an unframed (raw) stream.
 *
 * @param {Buffer} buf - Raw body of the container logs response.
 * @returns {string} Concatenated stdout payload decoded as UTF-8.
 */
function demuxDockerStdout(buf) {
  const HEADER_LEN = 8;
  const STDOUT_STREAM = 1;
  const isHeaderAt = off =>
    off + HEADER_LEN <= buf.length &&
    buf[off] <= 2 &&
    buf[off + 1] === 0 &&
    buf[off + 2] === 0 &&
    buf[off + 3] === 0;

  if (buf.length > 0 && !isHeaderAt(0)) return buf.toString('utf8');

  const payloads = [];
  let off = 0;
  while (isHeaderAt(off)) {
    const size = buf.readUInt32BE(off + 4);
    const start = off + HEADER_LEN;
    // subarray clamps to the buffer end, so a truncated final frame is kept.
    if (buf[off] === STDOUT_STREAM) payloads.push(buf.subarray(start, start + size));
    off = start + size;
  }
  return Buffer.concat(payloads).toString('utf8');
}

/**
 * Fetch a container's stdout log over the Docker socket.
 *
 * Goes through http.request directly so the body can be collected as raw
 * bytes and demultiplexed. A socket timeout is set so an unresponsive daemon
 * cannot stall the caller indefinitely.
 *
 * @param {string} containerId - Id of the container to read logs from.
 * @returns {Promise<string>} Resolves with the container's stdout text.
 *   Rejects on socket error, timeout, or a non-200 response.
 */
function fetchContainerStdout(containerId) {
  return new Promise((resolve, reject) => {
    const req = http.request({
      socketPath: '/var/run/docker.sock',
      path: `/v1.43/containers/${containerId}/logs?stdout=1&stderr=0`,
      method: 'GET',
      timeout: 5000,
    }, res => {
      const chunks = [];
      res.on('data', c => chunks.push(c));
      res.on('error', reject);
      res.on('end', () => {
        if (res.statusCode !== 200) {
          reject(new Error(`logs request returned HTTP ${res.statusCode}`));
          return;
        }
        resolve(demuxDockerStdout(Buffer.concat(chunks)));
      });
    });
    // 'timeout' only signals inactivity; the request must be torn down by hand.
    req.on('timeout', () => req.destroy(new Error('logs request timed out')));
    req.on('error', reject);
    req.end();
  });
}

/**
 * Legacy synchronous GPU probe: shells out to `docker run --gpus all
 * ubuntu:22.04 nvidia-smi ...` and parses the CSV output.
 *
 * Kept so existing callers keep working; it needs a `docker` CLI on PATH.
 *
 * @deprecated Prefer probeGpusViaSmi(), which goes through the Docker socket.
 * @returns {Array<object>|null} GPU descriptors, or null when the command
 *   fails or prints no GPU rows (caller falls back to the /dev scan).
 */
function detectGpusViaSmi() {
  const { execFileSync } = require('child_process');
  const QUERY = '--query-gpu=index,name,memory.total,driver_version';
  const FMT = '--format=csv,noheader,nounits';
  try {
    const out = execFileSync('docker', [
      'run', '--rm', '--gpus', 'all',
      'ubuntu:22.04',
      'nvidia-smi', QUERY, FMT,
    ], { timeout: 15000, encoding: 'utf8' });
    const gpus = parseSmiCsv(out);
    return gpus.length ? gpus : null;
  } catch (_) {
    return null; // nvidia runtime (or docker CLI) not available — caller uses /dev scan
  }
}

// Cached GPU info from nvidia-smi — populated once at startup via Docker API.
// null = not yet probed; [] = probed, but no GPUs were reported or the probe
// could not run (no runtime, API error, timeout).
let _gpuCache = null;

/**
 * Probe GPUs by running nvidia-smi in a short-lived GPU-enabled container
 * through the Docker socket API, and store the result in `_gpuCache`.
 *
 * The container uses ubuntu:22.04 with the nvidia runtime, so nvidia-smi is
 * expected to be injected by the NVIDIA Container Runtime rather than shipped
 * in this agent's own image.
 *
 * Never rejects: every failure is logged and leaves `_gpuCache = []`.
 * The temporary container is force-removed on all paths.
 *
 * @returns {Promise<void>}
 */
async function probeGpusViaSmi() {
  const QUERY = '--query-gpu=index,name,memory.total,driver_version';
  const FMT = '--format=csv,noheader,nounits';
  const POLL_ATTEMPTS = 20;
  const POLL_INTERVAL_MS = 500;
  let containerId;
  try {
    // Create container with nvidia runtime and GPU access.
    const createRes = await dockerApi('POST', '/containers/create', {
      Image: 'ubuntu:22.04',
      Cmd: ['nvidia-smi', QUERY, FMT],
      HostConfig: {
        AutoRemove: false, // logs are read after exit; removal happens in finally
        Runtime: 'nvidia',
        DeviceRequests: [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }],
      },
    });
    if (createRes.status !== 201) {
      console.warn(`[gpu] nvidia-smi probe skipped: container create returned HTTP ${createRes.status}`);
      _gpuCache = [];
      return;
    }
    containerId = createRes.data.Id;

    const startRes = await dockerApi('POST', `/containers/${containerId}/start`);
    // Only treat a definite HTTP error as fatal; the shape of dockerApi's
    // result for body-less responses is not visible from here.
    if (typeof startRes?.status === 'number' && startRes.status >= 400) {
      throw new Error(`container start returned HTTP ${startRes.status}`);
    }

    // Wait for the container to exit (bounded polling, ~10 s at most).
    for (let attempt = 0; attempt < POLL_ATTEMPTS; attempt++) {
      await new Promise(r => setTimeout(r, POLL_INTERVAL_MS));
      const inspect = await dockerApi('GET', `/containers/${containerId}/json`);
      if (!inspect.data?.State?.Running) break;
    }

    _gpuCache = parseSmiCsv(await fetchContainerStdout(containerId));
    if (_gpuCache.length) {
      console.log(`[gpu] detected ${_gpuCache.length} GPU(s) via nvidia-smi: ${_gpuCache.map(g => g.name).join(', ')}`);
    }
  } catch (err) {
    console.warn('[gpu] nvidia-smi probe failed:', err.message);
    _gpuCache = [];
  } finally {
    if (containerId) {
      try {
        await dockerApi('DELETE', `/containers/${containerId}?force=true`);
      } catch (_) {
        // Best-effort cleanup: a failed removal must not fail the probe.
      }
    }
  }
}
@ -225,10 +272,9 @@ function detectHardware() {
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
}
} else {
// Try nvidia-smi first for full GPU details; fall back to /dev file scan
const smiGpus = detectGpusViaSmi();
if (smiGpus !== null) {
capabilities.gpus = smiGpus;
// Use nvidia-smi cache if populated, otherwise fall back to /dev file scan
if (_gpuCache !== null && _gpuCache.length > 0) {
capabilities.gpus = _gpuCache;
} else {
for (let i = 0; i < 16; i++) {
try {
@ -304,8 +350,12 @@ async function heartbeat() {
}
}
// Probe GPU info once at startup, then begin heartbeating.
// The first heartbeat is sent only after the probe settles so that it already
// carries the cached GPU details; heartbeat and its interval are started
// exactly once. A rejected probe is logged and does not prevent heartbeats.
probeGpusViaSmi()
  .catch(err => {
    console.warn('[gpu] startup probe rejected:', err?.message ?? err);
  })
  .then(() => {
    heartbeat();
    setInterval(heartbeat, HEARTBEAT_MS);
  });
// ── HTTP server ───────────────────────────────────────────────────────────
const server = http.createServer((req, res) => {