fix(node-agent): probe GPU via Docker API async at startup, cache result
Replaced the synchronous execFileSync('docker') approach (the container has no docker CLI)
with asynchronous HTTP calls to the Docker API over the Docker socket:
- POST /containers/create with nvidia runtime + DeviceRequests
- POST /containers/:id/start
- Poll GET /containers/:id/json until the container is no longer running (up to 20 polls, 500 ms apart)
- GET /containers/:id/logs (stdout only), strip the 8-byte frame headers, parse the CSV output
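probeGpusViaSmi() runs once at startup, before the first heartbeat.
The result is cached in _gpuCache; detectHardware() reads the cache on every heartbeat.
detectHardware() falls back to the /dev/nvidia* scan when the cache is null or empty, i.e. when the probe fails, the runtime is unavailable, or no GPUs are reported.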
This commit is contained in:
parent
558c18e417
commit
a6f045b3d7
1 changed files with 82 additions and 32 deletions
|
|
@ -182,37 +182,84 @@ function sampleCpu() {
|
||||||
|
|
||||||
// ── Hardware detection ────────────────────────────────────────────────────
|
// ── Hardware detection ────────────────────────────────────────────────────
|
||||||
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
||||||
function detectGpusViaSmi() {
|
// Cached GPU info from nvidia-smi — populated once at startup via Docker API.
|
||||||
// Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
|
// null = not yet probed, or container creation did not return 201; [] = probe ran but reported no GPUs or threw an error.
|
||||||
// injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
|
let _gpuCache = null;
|
||||||
// This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
|
|
||||||
// Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
|
async function probeGpusViaSmi() {
|
||||||
const { execFileSync } = require('child_process');
|
// Use Docker API (socket) to run nvidia-smi inside a GPU-enabled container.
|
||||||
|
// The NVIDIA Container Runtime injects nvidia-smi + libs into ubuntu:22.04.
|
||||||
|
// This sidesteps the Alpine/glibc incompatibility in the node-agent image.
|
||||||
const QUERY = '--query-gpu=index,name,memory.total,driver_version';
|
const QUERY = '--query-gpu=index,name,memory.total,driver_version';
|
||||||
const FMT = '--format=csv,noheader,nounits';
|
const FMT = '--format=csv,noheader,nounits';
|
||||||
|
|
||||||
|
let containerId;
|
||||||
try {
|
try {
|
||||||
const out = execFileSync('docker', [
|
// Create container with nvidia runtime and GPU access
|
||||||
'run', '--rm', '--gpus', 'all',
|
const createRes = await dockerApi('POST', '/containers/create', {
|
||||||
'ubuntu:22.04',
|
Image: 'ubuntu:22.04',
|
||||||
'nvidia-smi', QUERY, FMT,
|
Cmd: ['nvidia-smi', QUERY, FMT],
|
||||||
], { timeout: 15000, encoding: 'utf8' });
|
HostConfig: {
|
||||||
|
AutoRemove: false,
|
||||||
const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
|
Runtime: 'nvidia',
|
||||||
if (!lines.length) return null;
|
DeviceRequests: [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }],
|
||||||
return lines.map(line => {
|
},
|
||||||
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
|
||||||
return {
|
|
||||||
index: parseInt(idx, 10),
|
|
||||||
name: name,
|
|
||||||
memory_mb: parseInt(memMiB, 10),
|
|
||||||
driver: driver,
|
|
||||||
device: `/dev/nvidia${idx}`,
|
|
||||||
type: 'nvidia',
|
|
||||||
};
|
|
||||||
});
|
});
|
||||||
} catch (_) {
|
if (createRes.status !== 201) return;
|
||||||
return null; // nvidia runtime not available — caller uses /dev scan
|
containerId = createRes.data.Id;
|
||||||
|
|
||||||
|
await dockerApi('POST', `/containers/${containerId}/start`);
|
||||||
|
|
||||||
|
// Wait for it to finish (poll status)
|
||||||
|
for (let i = 0; i < 20; i++) {
|
||||||
|
await new Promise(r => setTimeout(r, 500));
|
||||||
|
const inspect = await dockerApi('GET', `/containers/${containerId}/json`);
|
||||||
|
if (!inspect.data?.State?.Running) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grab stdout logs
|
||||||
|
const logRes = await new Promise((resolve, reject) => {
|
||||||
|
const options = {
|
||||||
|
socketPath: '/var/run/docker.sock',
|
||||||
|
path: `/v1.43/containers/${containerId}/logs?stdout=1&stderr=0`,
|
||||||
|
method: 'GET',
|
||||||
|
};
|
||||||
|
const req = http.request(options, res => {
|
||||||
|
const chunks = [];
|
||||||
|
res.on('data', c => chunks.push(c));
|
||||||
|
res.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
|
||||||
|
});
|
||||||
|
req.on('error', reject);
|
||||||
|
req.end();
|
||||||
|
});
|
||||||
|
|
||||||
|
// Docker log frames have an 8-byte header — strip them
|
||||||
|
const text = logRes.replace(/[\x00-\x07].{7}/g, '').trim();
|
||||||
|
const lines = text.split('\n').filter(l => /^\d+,/.test(l.trim()));
|
||||||
|
|
||||||
|
if (lines.length) {
|
||||||
|
_gpuCache = lines.map(line => {
|
||||||
|
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
||||||
|
return {
|
||||||
|
index: parseInt(idx, 10),
|
||||||
|
name: name,
|
||||||
|
memory_mb: parseInt(memMiB, 10),
|
||||||
|
driver: driver,
|
||||||
|
device: `/dev/nvidia${idx}`,
|
||||||
|
type: 'nvidia',
|
||||||
|
};
|
||||||
|
});
|
||||||
|
console.log(`[gpu] detected ${_gpuCache.length} GPU(s) via nvidia-smi: ${_gpuCache.map(g => g.name).join(', ')}`);
|
||||||
|
} else {
|
||||||
|
_gpuCache = [];
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.warn('[gpu] nvidia-smi probe failed:', err.message);
|
||||||
|
_gpuCache = [];
|
||||||
|
} finally {
|
||||||
|
if (containerId) {
|
||||||
|
await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -225,10 +272,9 @@ function detectHardware() {
|
||||||
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Try nvidia-smi first for full GPU details; fall back to /dev file scan
|
// Use nvidia-smi cache if populated, otherwise fall back to /dev file scan
|
||||||
const smiGpus = detectGpusViaSmi();
|
if (_gpuCache !== null && _gpuCache.length > 0) {
|
||||||
if (smiGpus !== null) {
|
capabilities.gpus = _gpuCache;
|
||||||
capabilities.gpus = smiGpus;
|
|
||||||
} else {
|
} else {
|
||||||
for (let i = 0; i < 16; i++) {
|
for (let i = 0; i < 16; i++) {
|
||||||
try {
|
try {
|
||||||
|
|
@ -304,8 +350,12 @@ async function heartbeat() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
heartbeat();
|
// Probe GPU info once at startup (async, before first heartbeat).
|
||||||
setInterval(heartbeat, HEARTBEAT_MS);
|
// _gpuCache is populated here; heartbeats use the cached value.
|
||||||
|
probeGpusViaSmi().then(() => {
|
||||||
|
heartbeat();
|
||||||
|
setInterval(heartbeat, HEARTBEAT_MS);
|
||||||
|
});
|
||||||
|
|
||||||
// ── HTTP server ───────────────────────────────────────────────────────────
|
// ── HTTP server ───────────────────────────────────────────────────────────
|
||||||
const server = http.createServer((req, res) => {
|
const server = http.createServer((req, res) => {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue