fix(node-agent): probe GPU via Docker API async at startup, cache result
Replaced the synchronous execFileSync('docker') approach (there is no docker CLI in the container)
with asynchronous HTTP calls to the Docker API over the Docker socket:
- POST /containers/create with nvidia runtime + DeviceRequests
- POST /containers/:id/start
- Poll GET /containers/:id/json (inspect) until the container is no longer running (at most 20 polls, 500 ms apart)
- GET /containers/:id/logs, strip the 8-byte frame headers, parse the CSV output
probeGpusViaSmi() runs once at startup, before the first heartbeat.
The result is cached in _gpuCache; detectHardware() reads the cache on every heartbeat.
Falls back to the /dev/nvidia* scan if the probe fails or the NVIDIA runtime is unavailable.
This commit is contained in:
parent
558c18e417
commit
a6f045b3d7
1 changed files with 82 additions and 32 deletions
|
|
@ -182,37 +182,84 @@ function sampleCpu() {
|
|||
|
||||
// ── Hardware detection ────────────────────────────────────────────────────
|
||||
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
||||
function detectGpusViaSmi() {
|
||||
// Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
|
||||
// injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
|
||||
// This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
|
||||
// Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
|
||||
const { execFileSync } = require('child_process');
|
||||
// Cached GPU info from nvidia-smi — populated once at startup via Docker API.
|
||||
// null = not yet probed, or container creation was rejected (non-201 response);
// [] = probe ran but found no GPUs, or the probe threw an error.
|
||||
let _gpuCache = null;
|
||||
|
||||
async function probeGpusViaSmi() {
|
||||
// Use Docker API (socket) to run nvidia-smi inside a GPU-enabled container.
|
||||
// The NVIDIA Container Runtime injects nvidia-smi + libs into ubuntu:22.04.
|
||||
// This sidesteps the Alpine/glibc incompatibility in the node-agent image.
|
||||
const QUERY = '--query-gpu=index,name,memory.total,driver_version';
|
||||
const FMT = '--format=csv,noheader,nounits';
|
||||
|
||||
let containerId;
|
||||
try {
|
||||
const out = execFileSync('docker', [
|
||||
'run', '--rm', '--gpus', 'all',
|
||||
'ubuntu:22.04',
|
||||
'nvidia-smi', QUERY, FMT,
|
||||
], { timeout: 15000, encoding: 'utf8' });
|
||||
|
||||
const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
|
||||
if (!lines.length) return null;
|
||||
return lines.map(line => {
|
||||
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
||||
return {
|
||||
index: parseInt(idx, 10),
|
||||
name: name,
|
||||
memory_mb: parseInt(memMiB, 10),
|
||||
driver: driver,
|
||||
device: `/dev/nvidia${idx}`,
|
||||
type: 'nvidia',
|
||||
};
|
||||
// Create container with nvidia runtime and GPU access
|
||||
const createRes = await dockerApi('POST', '/containers/create', {
|
||||
Image: 'ubuntu:22.04',
|
||||
Cmd: ['nvidia-smi', QUERY, FMT],
|
||||
HostConfig: {
|
||||
AutoRemove: false,
|
||||
Runtime: 'nvidia',
|
||||
DeviceRequests: [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }],
|
||||
},
|
||||
});
|
||||
} catch (_) {
|
||||
return null; // nvidia runtime not available — caller uses /dev scan
|
||||
if (createRes.status !== 201) return;
|
||||
containerId = createRes.data.Id;
|
||||
|
||||
await dockerApi('POST', `/containers/${containerId}/start`);
|
||||
|
||||
// Wait for it to finish: poll the container state every 500 ms, at most 20 times (~10 s)
|
||||
for (let i = 0; i < 20; i++) {
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
const inspect = await dockerApi('GET', `/containers/${containerId}/json`);
|
||||
if (!inspect.data?.State?.Running) break;
|
||||
}
|
||||
|
||||
// Grab stdout logs
|
||||
const logRes = await new Promise((resolve, reject) => {
|
||||
const options = {
|
||||
socketPath: '/var/run/docker.sock',
|
||||
path: `/v1.43/containers/${containerId}/logs?stdout=1&stderr=0`,
|
||||
method: 'GET',
|
||||
};
|
||||
const req = http.request(options, res => {
|
||||
const chunks = [];
|
||||
res.on('data', c => chunks.push(c));
|
||||
res.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
|
||||
});
|
||||
req.on('error', reject);
|
||||
req.end();
|
||||
});
|
||||
|
||||
// Each Docker log frame starts with an 8-byte header — strip those headers
|
||||
const text = logRes.replace(/[\x00-\x07].{7}/g, '').trim();
|
||||
const lines = text.split('\n').filter(l => /^\d+,/.test(l.trim()));
|
||||
|
||||
if (lines.length) {
|
||||
_gpuCache = lines.map(line => {
|
||||
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
||||
return {
|
||||
index: parseInt(idx, 10),
|
||||
name: name,
|
||||
memory_mb: parseInt(memMiB, 10),
|
||||
driver: driver,
|
||||
device: `/dev/nvidia${idx}`,
|
||||
type: 'nvidia',
|
||||
};
|
||||
});
|
||||
console.log(`[gpu] detected ${_gpuCache.length} GPU(s) via nvidia-smi: ${_gpuCache.map(g => g.name).join(', ')}`);
|
||||
} else {
|
||||
_gpuCache = [];
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn('[gpu] nvidia-smi probe failed:', err.message);
|
||||
_gpuCache = [];
|
||||
} finally {
|
||||
if (containerId) {
|
||||
await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -225,10 +272,9 @@ function detectHardware() {
|
|||
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
||||
}
|
||||
} else {
|
||||
// Try nvidia-smi first for full GPU details; fall back to /dev file scan
|
||||
const smiGpus = detectGpusViaSmi();
|
||||
if (smiGpus !== null) {
|
||||
capabilities.gpus = smiGpus;
|
||||
// Use nvidia-smi cache if populated, otherwise fall back to /dev file scan
|
||||
if (_gpuCache !== null && _gpuCache.length > 0) {
|
||||
capabilities.gpus = _gpuCache;
|
||||
} else {
|
||||
for (let i = 0; i < 16; i++) {
|
||||
try {
|
||||
|
|
@ -304,8 +350,12 @@ async function heartbeat() {
|
|||
}
|
||||
}
|
||||
|
||||
heartbeat();
|
||||
setInterval(heartbeat, HEARTBEAT_MS);
|
||||
// Probe GPU info once at startup (async, before first heartbeat).
|
||||
// _gpuCache is populated here; heartbeats use the cached value.
|
||||
probeGpusViaSmi().then(() => {
|
||||
heartbeat();
|
||||
setInterval(heartbeat, HEARTBEAT_MS);
|
||||
});
|
||||
|
||||
// ── HTTP server ───────────────────────────────────────────────────────────
|
||||
const server = http.createServer((req, res) => {
|
||||
|
|
|
|||
Loading…
Reference in a new issue