fix(node-agent): bind nvidia-smi for full GPU info (name, VRAM, driver)

index.js:
- detectGpusViaSmi(): runs nvidia-smi --query-gpu=index,name,memory.total,
  driver_version and parses the output into structured GPU objects with
  name, memory_mb, driver, device — the same fields the cluster UI uses
  to determine BOUND status
- Falls back to /dev/nvidia* file scan if nvidia-smi isn't available

docker-compose.worker.yml:
- Bind-mount /usr/bin/nvidia-smi and libnvidia-ml.so.1 from host into
  node-agent container (read-only). These are the minimum binaries needed
  for nvidia-smi to execute inside the container.
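- Mounts tolerate missing host paths — Docker does not skip them; it creates
  an empty directory in place of a missing path, nvidia-smi then fails to run,
  and the agent falls back to the /dev scan (e.g. on nodes without NVIDIA
  hardware)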
This commit is contained in:
Zac Gaetano 2026-05-26 18:19:23 +00:00
parent 55ff2e717f
commit 726343db96
2 changed files with 44 additions and 5 deletions

View file

@ -55,6 +55,12 @@ services:
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- /dev:/dev:ro
# Bind-mount nvidia-smi + driver libs so the agent can query GPU details.
# These paths match Ubuntu installations with driver 595+.
# NOTE: with this short bind syntax Docker does not skip a missing host path;
# it creates an empty directory there instead. nvidia-smi then fails to run
# in the container and the agent falls back to /dev/nvidia* file detection.
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
devices:
- /dev/blackmagic:/dev/blackmagic

View file

@ -182,6 +182,33 @@ function sampleCpu() {
// ── Hardware detection ────────────────────────────────────────────────────
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
/**
 * Parse the output of
 * `nvidia-smi --query-gpu=index,name,memory.total,driver_version --format=csv,noheader,nounits`
 * into GPU descriptor objects.
 *
 * Each row is split from the outside in: the first field is the index and the
 * last two are memory and driver, so a comma inside the GPU name cannot shift
 * the other columns. Rows that have fewer than four fields or a non-numeric
 * index (blank lines, stray diagnostics) are skipped.
 *
 * @param {string} out - Raw stdout of the nvidia-smi query.
 * @returns {Array<{index: number, name: string, memory_mb: (number|null),
 *   driver: string, device: string, type: string}>} One entry per valid row.
 */
function parseNvidiaSmiCsv(out) {
  const gpus = [];
  for (const rawLine of String(out).split('\n')) {
    const line = rawLine.trim();
    const firstComma = line.indexOf(',');
    const lastComma = line.lastIndexOf(',');
    const memComma = line.lastIndexOf(',', lastComma - 1);
    // Need three distinct separators: index | name | memory | driver.
    if (firstComma === -1 || memComma <= firstComma) continue;

    const index = Number.parseInt(line.slice(0, firstComma).trim(), 10);
    if (Number.isNaN(index)) continue;

    // memory.total can be reported as a non-numeric placeholder (e.g. "[N/A]");
    // expose that as null rather than NaN.
    const memoryMb = Number.parseInt(line.slice(memComma + 1, lastComma).trim(), 10);

    gpus.push({
      index,
      name: line.slice(firstComma + 1, memComma).trim(),
      memory_mb: Number.isNaN(memoryMb) ? null : memoryMb,
      driver: line.slice(lastComma + 1).trim(),
      // NOTE(review): assumes the nvidia-smi index matches the /dev/nvidiaN
      // minor number — confirm on multi-GPU hosts.
      device: `/dev/nvidia${index}`,
      type: 'nvidia',
    });
  }
  return gpus;
}

/**
 * Query nvidia-smi for rich GPU info (name, memory, driver).
 *
 * The nvidia-smi binary is expected to be bind-mounted from the host via
 * docker-compose.worker.yml.
 *
 * @returns {Array<object>|null} Parsed GPU descriptors (possibly an empty
 *   array when nvidia-smi ran but printed no rows), or `null` when nvidia-smi
 *   could not be executed, timed out, or exited with an error. `null` is the
 *   signal for the caller to fall back to the /dev/nvidia* scan.
 */
function detectGpusViaSmi() {
  try {
    // Kept inside the try so that any failure to load or run the tool takes
    // the same fallback path.
    const { execFileSync } = require('child_process');
    const out = execFileSync(
      'nvidia-smi',
      ['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'],
      { timeout: 5000, encoding: 'utf8' }
    );
    return parseNvidiaSmiCsv(out);
  } catch (_) {
    // Deliberate best-effort: nvidia-smi not available or failed — the caller
    // falls back to the /dev scan.
    return null;
  }
}
function detectHardware() {
const capabilities = { gpus: [], blackmagic: [] };
@ -191,11 +218,17 @@ function detectHardware() {
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
}
} else {
for (let i = 0; i < 16; i++) {
try {
fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK);
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
} catch (_) { break; }
// Try nvidia-smi first for full GPU details; fall back to /dev file scan
const smiGpus = detectGpusViaSmi();
if (smiGpus !== null) {
capabilities.gpus = smiGpus;
} else {
for (let i = 0; i < 16; i++) {
try {
fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK);
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
} catch (_) { break; }
}
}
}