From 5ff507b81b7a6b37b0fff42a0e42499fe4b9b84d Mon Sep 17 00:00:00 2001 From: ZGaetano Date: Tue, 26 May 2026 18:22:11 +0000 Subject: [PATCH] fix(node-agent): use nsenter to run nvidia-smi in host mount namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvidia-smi bind-mount failed due to Alpine vs Ubuntu glibc incompatibility. Fix: nsenter --mount=/proc/1/ns/mnt -- nvidia-smi runs in the host's mount namespace where glibc and all NVIDIA driver libs are present. Requires pid: host in docker-compose.worker.yml (already has network: host). nsenter is provided by util-linux in Alpine — already in the image. Falls back to direct nvidia-smi call (for glibc-based containers), then to /dev/nvidia* file scan if all attempts fail. --- docker-compose.worker.yml | 7 +---- services/node-agent/index.js | 57 +++++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/docker-compose.worker.yml b/docker-compose.worker.yml index 107d709..378f996 100644 --- a/docker-compose.worker.yml +++ b/docker-compose.worker.yml @@ -41,6 +41,7 @@ services: build: ./services/node-agent restart: unless-stopped network_mode: host + pid: host # share host PID namespace so nsenter can run host binaries environment: MAM_API_URL: ${MAM_API_URL} NODE_TOKEN: ${NODE_TOKEN:-} @@ -55,12 +56,6 @@ services: volumes: - /var/run/docker.sock:/var/run/docker.sock - /dev:/dev:ro - # Bind-mount nvidia-smi + driver libs so the agent can query GPU details. - # These paths match Ubuntu installations with driver 595+. The mounts are - # optional — if the binaries don't exist on the host, Docker silently - # ignores them and the agent falls back to /dev/nvidia* file detection. - - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro - - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro devices: - /dev/blackmagic:/dev/blackmagic diff --git a/services/node-agent/index.js b/services/node-agent/index.js index 7f03967..6e3b5b0 100644 --- a/services/node-agent/index.js +++ b/services/node-agent/index.js @@ -183,30 +183,41 @@ function sampleCpu() { // ── Hardware detection ──────────────────────────────────────────────────── // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped function detectGpusViaSmi() { - // Try nvidia-smi for rich GPU info (name, memory, driver). - // nvidia-smi binary is bind-mounted from the host via docker-compose.worker.yml. - // Returns [] if not available or if no GPUs are present. - try { - const { execFileSync } = require('child_process'); - const out = execFileSync( - 'nvidia-smi', - ['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'], - { timeout: 5000, encoding: 'utf8' } - ); - return out.trim().split('\n').filter(Boolean).map(line => { - const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim()); - return { - index: parseInt(idx, 10), - name: name, - memory_mb: parseInt(memMiB, 10), - driver: driver, - device: `/dev/nvidia${idx}`, - type: 'nvidia', - }; - }); - } catch (_) { - return null; // nvidia-smi not available or failed — fall back to /dev scan + // Run nvidia-smi in the host's mount namespace via nsenter so it uses the + // host's glibc and NVIDIA driver libs (not the container's Alpine libc). + // Requires pid: host in docker-compose so we can access /proc/1/ns/mnt. + // Falls back to direct nvidia-smi call (works if container has glibc). + // Returns null if nvidia-smi isn't available — caller falls back to /dev scan. + const { execFileSync } = require('child_process'); + const QUERY = '--query-gpu=index,name,memory.total,driver_version'; + const FMT = '--format=csv,noheader,nounits'; + + const attempts = [ + // nsenter into host mount ns (preferred — uses host glibc) + () => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT], + { timeout: 8000, encoding: 'utf8' }), + // Direct call (works if container has compatible glibc) + () => execFileSync('nvidia-smi', [QUERY, FMT], + { timeout: 5000, encoding: 'utf8' }), + ]; + + for (const attempt of attempts) { + try { + const out = attempt(); + return out.trim().split('\n').filter(Boolean).map(line => { + const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim()); + return { + index: parseInt(idx, 10), + name: name, + memory_mb: parseInt(memMiB, 10), + driver: driver, + device: `/dev/nvidia${idx}`, + type: 'nvidia', + }; + }); + } catch (_) {} } + return null; // all attempts failed — caller falls back to /dev scan } function detectHardware() {