fix(node-agent): use nsenter to run nvidia-smi in host mount namespace

The nvidia-smi bind-mount approach failed: the host binary is linked against Ubuntu's glibc, which the Alpine-based container (musl libc) cannot load.
Fix: nsenter --mount=/proc/1/ns/mnt -- nvidia-smi runs in the host's mount
namespace where glibc and all NVIDIA driver libs are present.

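Requires pid: host in docker-compose.worker.yml (which already sets network_mode: host).
Note: entering the host mount namespace via nsenter may also require elevated capabilities (e.g. CAP_SYS_ADMIN, see setns(2)) — verify the container is granted them.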
nsenter is provided by util-linux in Alpine — already in the image.

Falls back to direct nvidia-smi call (for glibc-based containers), then
to /dev/nvidia* file scan if all attempts fail.
This commit is contained in:
Zac Gaetano 2026-05-26 18:22:11 +00:00
parent 726343db96
commit 5ff507b81b
2 changed files with 35 additions and 29 deletions

View file

@ -41,6 +41,7 @@ services:
build: ./services/node-agent build: ./services/node-agent
restart: unless-stopped restart: unless-stopped
network_mode: host network_mode: host
pid: host # share host PID namespace so nsenter can run host binaries
environment: environment:
MAM_API_URL: ${MAM_API_URL} MAM_API_URL: ${MAM_API_URL}
NODE_TOKEN: ${NODE_TOKEN:-} NODE_TOKEN: ${NODE_TOKEN:-}
@ -55,12 +56,6 @@ services:
volumes: volumes:
- /var/run/docker.sock:/var/run/docker.sock - /var/run/docker.sock:/var/run/docker.sock
- /dev:/dev:ro - /dev:/dev:ro
# Bind-mount nvidia-smi + driver libs so the agent can query GPU details.
# These paths match Ubuntu installations with driver 595+. The mounts are
# optional — if the binaries don't exist on the host, Docker silently
# ignores them and the agent falls back to /dev/nvidia* file detection.
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
devices: devices:
- /dev/blackmagic:/dev/blackmagic - /dev/blackmagic:/dev/blackmagic

View file

@ -183,30 +183,41 @@ function sampleCpu() {
// ── Hardware detection ──────────────────────────────────────────────────── // ── Hardware detection ────────────────────────────────────────────────────
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
/**
 * Query NVIDIA GPUs through nvidia-smi and return one record per GPU.
 *
 * Two invocations are tried in order:
 *   1. `nsenter --mount=/proc/1/ns/mnt -- nvidia-smi ...` runs nvidia-smi in
 *      the host's mount namespace so it uses the host's libc and NVIDIA
 *      driver libs rather than the container's (requires `pid: host` in
 *      docker-compose so /proc/1/ns/mnt is reachable).
 *   2. `nvidia-smi ...` directly, which works when the container itself can
 *      execute the binary.
 *
 * An attempt counts as failed when the command throws (missing binary,
 * non-zero exit, timeout) or when it prints output that contains no usable
 * CSV row; the next attempt is then tried.
 *
 * @returns {Array<{index: number, name: string, memory_mb: number,
 *   driver: string, device: string, type: string}>|null}
 *   GPU records (an empty array when nvidia-smi ran and printed nothing), or
 *   null when every attempt failed so the caller can fall back to a /dev scan.
 */
function detectGpusViaSmi() {
  const { execFileSync } = require('child_process');
  const queryArg = '--query-gpu=index,name,memory.total,driver_version';
  const formatArg = '--format=csv,noheader,nounits';

  const attempts = [
    // Preferred: run the host's nvidia-smi inside the host mount namespace.
    () => execFileSync(
      'nsenter',
      ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', queryArg, formatArg],
      { timeout: 8000, encoding: 'utf8' },
    ),
    // Fallback: direct call, for containers that can run nvidia-smi themselves.
    () => execFileSync(
      'nvidia-smi',
      [queryArg, formatArg],
      { timeout: 5000, encoding: 'utf8' },
    ),
  ];

  for (const attempt of attempts) {
    let out;
    try {
      out = attempt();
    } catch (_) {
      // Deliberate best-effort: a failing probe is expected on hosts without
      // nsenter / nvidia-smi, so move on to the next strategy.
      continue;
    }
    const gpus = parseNvidiaSmiCsv(out);
    if (gpus !== null) {
      return gpus;
    }
  }
  return null; // all attempts failed — caller falls back to /dev scan
}

/**
 * Parse `nvidia-smi --query-gpu=index,name,memory.total,driver_version
 * --format=csv,noheader,nounits` output into GPU records.
 *
 * Rows are read as: first field = index, last field = driver version,
 * second-to-last = memory total, everything in between = name (so a name
 * containing a comma stays intact). Rows with fewer than four fields or a
 * non-integer index are skipped.
 *
 * @param {string} out Raw stdout of nvidia-smi.
 * @returns {Array<object>|null} Parsed records; [] for blank output; null
 *   when there was output but not a single valid row.
 */
function parseNvidiaSmiCsv(out) {
  const lines = String(out)
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);

  const gpus = [];
  for (const line of lines) {
    const parts = line.split(',');
    if (parts.length < 4) {
      continue; // not a data row (e.g. a warning printed on stdout)
    }
    const index = Number.parseInt(parts[0].trim(), 10);
    if (!Number.isInteger(index)) {
      continue;
    }
    gpus.push({
      index,
      name: parts.slice(1, parts.length - 2).join(',').trim(),
      // Stays NaN when nvidia-smi reports a non-numeric value for memory.
      memory_mb: Number.parseInt(parts[parts.length - 2].trim(), 10),
      driver: parts[parts.length - 1].trim(),
      device: `/dev/nvidia${index}`,
      type: 'nvidia',
    });
  }

  if (lines.length > 0 && gpus.length === 0) {
    return null;
  }
  return gpus;
}
function detectHardware() { function detectHardware() {