node-agent: probe GPUs with a throwaway `docker run --gpus all` container

* docker-compose.worker.yml: drops `pid: host` from the service. The removed
  line's own comment says it shared the host PID namespace so nsenter could
  run host binaries.
* services/node-agent/index.js: detectGpusViaSmi() no longer tries nsenter
  followed by a direct nvidia-smi call. It now runs nvidia-smi inside
  ubuntu:22.04 via `docker run --rm --gpus all` with a 15000 ms timeout,
  keeps only output rows that start with a numeric GPU index, and returns
  null when the command throws or no such row is present, so the caller can
  fall back to the /dev scan.

NOTE(review): this copy of the patch had lost its line breaks and indentation.
The line structure below was rebuilt from the hunk headers (7 -> 6 lines and
41 -> 37 lines). Leading indentation, in-line alignment, and the position of
one blank context line inside detectGpusViaSmi() are best guesses -- check
them against the target tree, or apply with `git apply --ignore-whitespace`.
NOTE(review): the new probe needs a docker CLI and access to a Docker daemon
from wherever node-agent runs; neither is visible in this patch -- confirm.

diff --git a/docker-compose.worker.yml b/docker-compose.worker.yml
index 378f996..26d5508 100644
--- a/docker-compose.worker.yml
+++ b/docker-compose.worker.yml
@@ -41,7 +41,6 @@ services:
     build: ./services/node-agent
     restart: unless-stopped
     network_mode: host
-    pid: host # share host PID namespace so nsenter can run host binaries
     environment:
       MAM_API_URL: ${MAM_API_URL}
       NODE_TOKEN: ${NODE_TOKEN:-}
diff --git a/services/node-agent/index.js b/services/node-agent/index.js
index 6e3b5b0..6d438bf 100644
--- a/services/node-agent/index.js
+++ b/services/node-agent/index.js
@@ -183,41 +183,37 @@ function sampleCpu() {
 // ── Hardware detection ────────────────────────────────────────────────────
 // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
 function detectGpusViaSmi() {
-  // Run nvidia-smi in the host's mount namespace via nsenter so it uses the
-  // host's glibc and NVIDIA driver libs (not the container's Alpine libc).
-  // Requires pid: host in docker-compose so we can access /proc/1/ns/mnt.
-  // Falls back to direct nvidia-smi call (works if container has glibc).
-  // Returns null if nvidia-smi isn't available — caller falls back to /dev scan.
+  // Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
+  // injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
+  // This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
+  // Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
 
   const { execFileSync } = require('child_process');
   const QUERY = '--query-gpu=index,name,memory.total,driver_version';
   const FMT = '--format=csv,noheader,nounits';
-  const attempts = [
-    // nsenter into host mount ns (preferred — uses host glibc)
-    () => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT],
-      { timeout: 8000, encoding: 'utf8' }),
-    // Direct call (works if container has compatible glibc)
-    () => execFileSync('nvidia-smi', [QUERY, FMT],
-      { timeout: 5000, encoding: 'utf8' }),
-  ];
+  try {
+    const out = execFileSync('docker', [
+      'run', '--rm', '--gpus', 'all',
+      'ubuntu:22.04',
+      'nvidia-smi', QUERY, FMT,
+    ], { timeout: 15000, encoding: 'utf8' });
 
-  for (const attempt of attempts) {
-    try {
-      const out = attempt();
-      return out.trim().split('\n').filter(Boolean).map(line => {
-        const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
-        return {
-          index: parseInt(idx, 10),
-          name: name,
-          memory_mb: parseInt(memMiB, 10),
-          driver: driver,
-          device: `/dev/nvidia${idx}`,
-          type: 'nvidia',
-        };
-      });
-    } catch (_) {}
+    const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
+    if (!lines.length) return null;
+    return lines.map(line => {
+      const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
+      return {
+        index: parseInt(idx, 10),
+        name: name,
+        memory_mb: parseInt(memMiB, 10),
+        driver: driver,
+        device: `/dev/nvidia${idx}`,
+        type: 'nvidia',
+      };
+    });
+  } catch (_) {
+    return null; // nvidia runtime not available — caller uses /dev scan
   }
-  return null; // all attempts failed — caller falls back to /dev scan
 }
 
 function detectHardware() {