fix(node-agent): bind nvidia-smi for full GPU info (name, VRAM, driver)
index.js: - detectGpusViaSmi(): runs nvidia-smi --query-gpu=index,name,memory.total,driver_version and parses the output into structured GPU objects with name, memory_mb, driver, device — the same fields the cluster UI uses to determine BOUND status - Falls back to /dev/nvidia* file scan if nvidia-smi isn't available docker-compose.worker.yml: - Bind-mount /usr/bin/nvidia-smi and libnvidia-ml.so.1 from host into node-agent container (read-only). These are the minimum binaries needed for nvidia-smi to execute inside the container. - Mounts are optional — Docker ignores them silently if paths don't exist (e.g. on nodes without NVIDIA hardware)
This commit is contained in:
parent
55ff2e717f
commit
726343db96
2 changed files with 44 additions and 5 deletions
|
|
@ -55,6 +55,12 @@ services:
|
|||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
- /dev:/dev:ro
|
||||
# Bind-mount nvidia-smi + driver libs so the agent can query GPU details.
|
||||
# These paths match Ubuntu installations with driver 595+. The mounts are
|
||||
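# optional in effect — if a path doesn't exist on the host, Docker does not
|
||||
# skip the mount: it creates an empty directory at that host path. nvidia-smi then fails to execute in the container and the agent falls back to /dev/nvidia* file detection.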
|
||||
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
|
||||
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
|
||||
devices:
|
||||
- /dev/blackmagic:/dev/blackmagic
|
||||
|
||||
|
|
|
|||
|
|
@ -182,6 +182,33 @@ function sampleCpu() {
|
|||
|
||||
// ── Hardware detection ────────────────────────────────────────────────────
|
||||
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
||||
/**
 * Query nvidia-smi for rich GPU info (name, memory, driver).
 *
 * The nvidia-smi binary is bind-mounted from the host via
 * docker-compose.worker.yml.
 *
 * Output is parsed as CSV with the columns index, name, memory.total and
 * driver_version. The index is taken from the first field and memory/driver
 * from the last two, so a comma inside the name does not shift the other
 * columns. Lines that are blank, have fewer than four fields, or do not start
 * with a numeric index are skipped.
 *
 * @param {Function} [execFile] Optional stand-in for
 *   child_process.execFileSync, called as (file, args, options). Mainly useful
 *   for tests; when omitted, the real execFileSync is used.
 * @returns {Array<{index: number, name: string, memory_mb: (number|null),
 *   driver: string, device: string, type: string}>|null}
 *   One entry per parsed GPU line (possibly an empty array). memory_mb is null
 *   when nvidia-smi reports a non-numeric value. Returns null when nvidia-smi
 *   is not available or fails, so the caller can fall back to the /dev scan.
 */
function detectGpusViaSmi(execFile) {
  try {
    const run = typeof execFile === 'function'
      ? execFile
      : require('child_process').execFileSync;
    const out = run(
      'nvidia-smi',
      ['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'],
      { timeout: 5000, encoding: 'utf8' }
    );

    const gpus = [];
    for (const line of String(out).split('\n')) {
      const fields = line.split(',').map(s => s.trim());
      if (fields.length < 4) continue; // blank line or not a CSV record

      const idx = fields[0];
      if (!/^\d+$/.test(idx)) continue; // not a GPU row (e.g. a warning line)

      // Anything between the index and the last two columns belongs to the name.
      const name = fields.slice(1, -2).join(', ');
      const memMiB = parseInt(fields[fields.length - 2], 10);
      const driver = fields[fields.length - 1];

      gpus.push({
        index: parseInt(idx, 10),
        name: name,
        // Non-numeric memory values would otherwise surface as NaN.
        memory_mb: Number.isNaN(memMiB) ? null : memMiB,
        driver: driver,
        device: `/dev/nvidia${idx}`,
        type: 'nvidia',
      });
    }
    return gpus;
  } catch (_) {
    return null; // nvidia-smi not available or failed — fall back to /dev scan
  }
}
|
||||
|
||||
function detectHardware() {
|
||||
const capabilities = { gpus: [], blackmagic: [] };
|
||||
|
||||
|
|
@ -191,11 +218,17 @@ function detectHardware() {
|
|||
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
||||
}
|
||||
} else {
|
||||
for (let i = 0; i < 16; i++) {
|
||||
try {
|
||||
fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK);
|
||||
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
||||
} catch (_) { break; }
|
||||
// Try nvidia-smi first for full GPU details; fall back to /dev file scan
|
||||
const smiGpus = detectGpusViaSmi();
|
||||
if (smiGpus !== null) {
|
||||
capabilities.gpus = smiGpus;
|
||||
} else {
|
||||
for (let i = 0; i < 16; i++) {
|
||||
try {
|
||||
fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK);
|
||||
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
||||
} catch (_) { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue