From 726343db962a8ff1d548e38d909fd44101e4dad7 Mon Sep 17 00:00:00 2001 From: ZGaetano Date: Tue, 26 May 2026 18:19:23 +0000 Subject: [PATCH] fix(node-agent): bind nvidia-smi for full GPU info (name, VRAM, driver) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit index.js: - detectGpusViaSmi(): runs nvidia-smi --query-gpu=index,name,memory.total, driver_version and parses the output into structured GPU objects with name, memory_mb, driver, device — the same fields the cluster UI uses to determine BOUND status - Falls back to /dev/nvidia* file scan if nvidia-smi isn't available docker-compose.worker.yml: - Bind-mount /usr/bin/nvidia-smi and libnvidia-ml.so.1 from host into node-agent container (read-only). These are the minimum binaries needed for nvidia-smi to execute inside the container. - Mounts are best-effort — if a path doesn't exist on the host (e.g. on nodes without NVIDIA hardware), Docker does not skip the mount: it creates an empty directory at that host path, nvidia-smi then fails to execute, and the agent falls back to the /dev/nvidia* scan --- docker-compose.worker.yml | 6 +++++ services/node-agent/index.js | 43 +++++++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/docker-compose.worker.yml b/docker-compose.worker.yml index 26d5508..107d709 100644 --- a/docker-compose.worker.yml +++ b/docker-compose.worker.yml @@ -55,6 +55,12 @@ services: volumes: - /var/run/docker.sock:/var/run/docker.sock - /dev:/dev:ro + # Bind-mount nvidia-smi + driver libs so the agent can query GPU details. + # These paths match Ubuntu installations with driver 595+. NOTE: Docker does + # NOT skip a missing bind source — it creates an empty directory at that host + # path; nvidia-smi then fails and the agent falls back to /dev/nvidia* detection. 
+ - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro + - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro devices: - /dev/blackmagic:/dev/blackmagic diff --git a/services/node-agent/index.js b/services/node-agent/index.js index 57bbb9f..7f03967 100644 --- a/services/node-agent/index.js +++ b/services/node-agent/index.js @@ -182,6 +182,33 @@ function sampleCpu() { // ── Hardware detection ──────────────────────────────────────────────────── // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped +function detectGpusViaSmi() { + // Try nvidia-smi for rich GPU info (name, memory, driver). + // nvidia-smi binary is bind-mounted from the host via docker-compose.worker.yml. + // Returns null if nvidia-smi is unavailable or fails; [] if its output is empty. + try { + const { execFileSync } = require('child_process'); + const out = execFileSync( + 'nvidia-smi', + ['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'], + { timeout: 5000, encoding: 'utf8' } + ); + return out.trim().split('\n').filter(Boolean).map(line => { + const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim()); + return { + index: parseInt(idx, 10), + name: name, + memory_mb: parseInt(memMiB, 10), + driver: driver, + device: `/dev/nvidia${idx}`, + type: 'nvidia', + }; + }); + } catch (_) { + return null; // nvidia-smi not available or failed — fall back to /dev scan + } +} + function detectHardware() { const capabilities = { gpus: [], blackmagic: [] }; @@ -191,11 +218,17 @@ function detectHardware() { capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i }); } } else { - for (let i = 0; i < 16; i++) { - try { - fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK); - capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i }); - } catch (_) { break; } + // Try nvidia-smi first for full GPU details; fall back to /dev file scan + const smiGpus = detectGpusViaSmi(); + if 
(smiGpus !== null) { + capabilities.gpus = smiGpus; + } else { + for (let i = 0; i < 16; i++) { + try { + fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK); + capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i }); + } catch (_) { break; } + } } }