fix(node-agent): bind nvidia-smi for full GPU info (name, VRAM, driver)

index.js:
- detectGpusViaSmi(): runs nvidia-smi --query-gpu=index,name,memory.total,
  driver_version and parses the output into structured GPU objects with
  name, memory_mb, driver, device — the same fields the cluster UI uses
  to determine BOUND status
- Falls back to /dev/nvidia* file scan if nvidia-smi isn't available

docker-compose.worker.yml:
- Bind-mount /usr/bin/nvidia-smi and libnvidia-ml.so.1 from host into
  node-agent container (read-only). These are the minimum binaries needed
  for nvidia-smi to execute inside the container.
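- Mounts are best-effort — note that Docker does NOT skip a short-syntax
  bind mount whose host path is missing (e.g. on nodes without NVIDIA
  hardware): it creates an empty directory at that path. nvidia-smi then
  fails to execute and the agent falls back to the /dev/nvidia* scan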
This commit is contained in:
Zac Gaetano 2026-05-26 18:19:23 +00:00
parent 55ff2e717f
commit 726343db96
2 changed files with 44 additions and 5 deletions

View file

@ -55,6 +55,12 @@ services:
volumes: volumes:
- /var/run/docker.sock:/var/run/docker.sock - /var/run/docker.sock:/var/run/docker.sock
- /dev:/dev:ro - /dev:/dev:ro
# Bind-mount nvidia-smi + driver libs so the agent can query GPU details.
# These paths match Ubuntu installations with driver 595+. Caveat: if a
# host path does not exist, Docker does not skip a short-syntax bind mount —
# it creates an empty directory at that path on the host. nvidia-smi then
# fails to execute in the container and the agent falls back to /dev/nvidia*
# file detection.
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
devices: devices:
- /dev/blackmagic:/dev/blackmagic - /dev/blackmagic:/dev/blackmagic

View file

@ -182,6 +182,33 @@ function sampleCpu() {
// ── Hardware detection ──────────────────────────────────────────────────── // ── Hardware detection ────────────────────────────────────────────────────
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
/**
 * Parse the CSV emitted by
 * `nvidia-smi --query-gpu=index,name,memory.total,driver_version
 *  --format=csv,noheader,nounits` into GPU descriptor objects.
 *
 * Lines that are not well-formed GPU rows (fewer than four fields, or a
 * non-integer index — e.g. a diagnostic message printed on stdout) are
 * skipped instead of being turned into bogus entries.
 *
 * @param {string} out - Raw stdout of the nvidia-smi query.
 * @returns {Array<{index: number, name: string, memory_mb: (number|null),
 *   driver: string, device: string, type: string}>} One entry per GPU row.
 */
function parseSmiCsv(out) {
  const gpus = [];
  for (const line of String(out).split('\n')) {
    const fields = line.split(',').map((s) => s.trim());
    if (fields.length < 4 || !/^\d+$/.test(fields[0])) continue;

    const index = Number.parseInt(fields[0], 10);
    // Memory and driver are the last two columns; anything between the index
    // and those belongs to the name, so a name containing a comma stays intact.
    const memory = Number.parseInt(fields[fields.length - 2], 10);
    gpus.push({
      index,
      name: fields.slice(1, -2).join(', '),
      // A non-numeric memory field (e.g. "[N/A]") is reported as null, not NaN.
      memory_mb: Number.isNaN(memory) ? null : memory,
      driver: fields[fields.length - 1],
      device: `/dev/nvidia${index}`,
      type: 'nvidia',
    });
  }
  return gpus;
}

/**
 * Query nvidia-smi for rich GPU info (name, memory, driver).
 *
 * The nvidia-smi binary is bind-mounted from the host via
 * docker-compose.worker.yml.
 *
 * @param {Function} [execFile] - Optional replacement for
 *   `child_process.execFileSync` (same call signature), mainly for tests.
 *   Defaults to the real `execFileSync`.
 * @returns {Array<object>|null} Parsed GPU descriptors (possibly an empty
 *   array when nvidia-smi ran but listed no GPUs), or `null` when nvidia-smi
 *   is unavailable or failed, so the caller can fall back to the /dev scan.
 */
function detectGpusViaSmi(execFile) {
  try {
    // Resolved inside the try so a missing module or binary degrades to the
    // fallback path instead of throwing.
    const run = execFile ?? require('child_process').execFileSync;
    const out = run(
      'nvidia-smi',
      ['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'],
      { timeout: 5000, encoding: 'utf8' }
    );
    return parseSmiCsv(out);
  } catch (_) {
    return null; // nvidia-smi not available or failed — fall back to /dev scan
  }
}
function detectHardware() { function detectHardware() {
const capabilities = { gpus: [], blackmagic: [] }; const capabilities = { gpus: [], blackmagic: [] };
@ -190,6 +217,11 @@ function detectHardware() {
for (let i = 0; i < gpuOverride; i++) { for (let i = 0; i < gpuOverride; i++) {
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i }); capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
} }
} else {
// Try nvidia-smi first for full GPU details; fall back to /dev file scan
const smiGpus = detectGpusViaSmi();
if (smiGpus !== null) {
capabilities.gpus = smiGpus;
} else { } else {
for (let i = 0; i < 16; i++) { for (let i = 0; i < 16; i++) {
try { try {
@ -198,6 +230,7 @@ function detectHardware() {
} catch (_) { break; } } catch (_) { break; }
} }
} }
}
const bmdOverride = parseInt(process.env.BMD_COUNT || '-1', 10); const bmdOverride = parseInt(process.env.BMD_COUNT || '-1', 10);
if (bmdOverride >= 0) { if (bmdOverride >= 0) {