fix(node-agent): detect GPUs via docker run --gpus all ubuntu:22.04

nsenter approach failed (requires SYS_ADMIN in container).
nvidia-smi bind-mount failed (Alpine vs Ubuntu glibc incompatibility).

Working solution: spawn 'docker run --rm --gpus all ubuntu:22.04 nvidia-smi'
via the Docker socket. The NVIDIA Container Runtime injects nvidia-smi and
driver libs into any container with --gpus all, regardless of the base image.
ubuntu:22.04 is already cached on GPU nodes.

Result: each GPU is now reported with its name, memory_mb, and driver version, and
shows as BOUND in the cluster UI.
This commit is contained in:
Zac Gaetano 2026-05-26 18:25:44 +00:00
parent 5ff507b81b
commit 558c18e417
2 changed files with 25 additions and 30 deletions

View file

@ -41,7 +41,6 @@ services:
build: ./services/node-agent
restart: unless-stopped
network_mode: host
pid: host # share host PID namespace so nsenter can run host binaries
environment:
MAM_API_URL: ${MAM_API_URL}
NODE_TOKEN: ${NODE_TOKEN:-}

View file

@ -183,41 +183,37 @@ function sampleCpu() {
// ── Hardware detection ────────────────────────────────────────────────────
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
// Detects NVIDIA GPUs by running nvidia-smi and parsing its CSV output into one
// descriptor object per GPU; yields null when detection fails.
// NOTE(review): this span looks like a rendered diff without +/- markers, so lines
// from the older nsenter-based version and the newer docker-run version appear
// interleaved — confirm against the real file before relying on the exact flow.
function detectGpusViaSmi() {
// Run nvidia-smi in the host's mount namespace via nsenter so it uses the
// host's glibc and NVIDIA driver libs (not the container's Alpine libc).
// Requires pid: host in docker-compose so we can access /proc/1/ns/mnt.
// Falls back to direct nvidia-smi call (works if container has glibc).
// Returns null if nvidia-smi isn't available — caller falls back to /dev scan.
// Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
// injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
// This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
// Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
const { execFileSync } = require('child_process');
// nvidia-smi arguments: request index, name, memory.total and driver_version as
// CSV with no header row and no unit suffixes, so each output line is one GPU.
const QUERY = '--query-gpu=index,name,memory.total,driver_version';
const FMT = '--format=csv,noheader,nounits';
const attempts = [
// nsenter into host mount ns (preferred — uses host glibc)
() => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT],
{ timeout: 8000, encoding: 'utf8' }),
// Direct call (works if container has compatible glibc)
() => execFileSync('nvidia-smi', [QUERY, FMT],
{ timeout: 5000, encoding: 'utf8' }),
];
try {
// Invoke the docker CLI synchronously with a 15 s timeout; output is decoded as
// UTF-8 text. execFileSync throws on failure, which the catch below maps to null.
const out = execFileSync('docker', [
'run', '--rm', '--gpus', 'all',
'ubuntu:22.04',
'nvidia-smi', QUERY, FMT,
], { timeout: 15000, encoding: 'utf8' });
for (const attempt of attempts) {
try {
const out = attempt();
// Drop empty lines, then turn every remaining CSV row into a GPU descriptor.
return out.trim().split('\n').filter(Boolean).map(line => {
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
return {
index: parseInt(idx, 10),
name: name,
memory_mb: parseInt(memMiB, 10),
driver: driver,
device: `/dev/nvidia${idx}`,
type: 'nvidia',
};
});
} catch (_) {}
// Keep only rows that begin with a numeric GPU index followed by a comma, which
// discards any non-data lines mixed into the command output.
const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
// No data rows at all: report "nothing detected" as null rather than [].
if (!lines.length) return null;
return lines.map(line => {
// Column order matches QUERY: index, name, memory.total, driver_version.
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
return {
index: parseInt(idx, 10),
name: name,
memory_mb: parseInt(memMiB, 10),
driver: driver,
// assumes the nvidia-smi index equals the N in /dev/nvidiaN — TODO confirm
device: `/dev/nvidia${idx}`,
type: 'nvidia',
};
});
} catch (_) {
return null; // nvidia runtime not available — caller uses /dev scan
}
return null; // all attempts failed — caller falls back to /dev scan
}
function detectHardware() {