fix(node-agent): detect GPUs via docker run --gpus all ubuntu:22.04

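The nsenter approach failed (it requires the SYS_ADMIN capability in the container).
The nvidia-smi bind-mount approach failed (libc incompatibility: the Ubuntu
host's nvidia-smi needs glibc, which the Alpine-based node-agent image lacks).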

Working solution: spawn 'docker run --rm --gpus all ubuntu:22.04 nvidia-smi'
via the Docker socket. The NVIDIA Container Runtime injects nvidia-smi and
driver libs into any container with --gpus all, regardless of the base image.
ubuntu:22.04 is used because it is glibc-based and is already cached on GPU nodes.

Result: each GPU is now reported with its name, memory_mb, and driver version,
and shows as BOUND in the cluster UI.
This commit is contained in:
Zac Gaetano 2026-05-26 18:25:44 +00:00
parent 5ff507b81b
commit 558c18e417
2 changed files with 25 additions and 30 deletions

View file

@ -41,7 +41,6 @@ services:
build: ./services/node-agent build: ./services/node-agent
restart: unless-stopped restart: unless-stopped
network_mode: host network_mode: host
pid: host # share host PID namespace so nsenter can run host binaries
environment: environment:
MAM_API_URL: ${MAM_API_URL} MAM_API_URL: ${MAM_API_URL}
NODE_TOKEN: ${NODE_TOKEN:-} NODE_TOKEN: ${NODE_TOKEN:-}

View file

@ -183,41 +183,37 @@ function sampleCpu() {
// ── Hardware detection ──────────────────────────────────────────────────── // ── Hardware detection ────────────────────────────────────────────────────
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
function detectGpusViaSmi() { function detectGpusViaSmi() {
// Run nvidia-smi in the host's mount namespace via nsenter so it uses the // Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
// host's glibc and NVIDIA driver libs (not the container's Alpine libc). // injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
// Requires pid: host in docker-compose so we can access /proc/1/ns/mnt. // This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
// Falls back to direct nvidia-smi call (works if container has glibc). // Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
// Returns null if nvidia-smi isn't available — caller falls back to /dev scan.
const { execFileSync } = require('child_process'); const { execFileSync } = require('child_process');
const QUERY = '--query-gpu=index,name,memory.total,driver_version'; const QUERY = '--query-gpu=index,name,memory.total,driver_version';
const FMT = '--format=csv,noheader,nounits'; const FMT = '--format=csv,noheader,nounits';
const attempts = [ try {
// nsenter into host mount ns (preferred — uses host glibc) const out = execFileSync('docker', [
() => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT], 'run', '--rm', '--gpus', 'all',
{ timeout: 8000, encoding: 'utf8' }), 'ubuntu:22.04',
// Direct call (works if container has compatible glibc) 'nvidia-smi', QUERY, FMT,
() => execFileSync('nvidia-smi', [QUERY, FMT], ], { timeout: 15000, encoding: 'utf8' });
{ timeout: 5000, encoding: 'utf8' }),
];
for (const attempt of attempts) { const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
try { if (!lines.length) return null;
const out = attempt(); return lines.map(line => {
return out.trim().split('\n').filter(Boolean).map(line => { const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim()); return {
return { index: parseInt(idx, 10),
index: parseInt(idx, 10), name: name,
name: name, memory_mb: parseInt(memMiB, 10),
memory_mb: parseInt(memMiB, 10), driver: driver,
driver: driver, device: `/dev/nvidia${idx}`,
device: `/dev/nvidia${idx}`, type: 'nvidia',
type: 'nvidia', };
}; });
}); } catch (_) {
} catch (_) {} return null; // nvidia runtime not available — caller uses /dev scan
} }
return null; // all attempts failed — caller falls back to /dev scan
} }
function detectHardware() { function detectHardware() {