fix(node-agent): bind nvidia-smi for full GPU info (name, VRAM, driver)
index.js: - detectGpusViaSmi(): runs nvidia-smi --query-gpu=index,name,memory.total,driver_version and parses the output into structured GPU objects with name, memory_mb, driver, device — the same fields the cluster UI uses to determine BOUND status - Falls back to /dev/nvidia* file scan if nvidia-smi isn't available docker-compose.worker.yml: - Bind-mount /usr/bin/nvidia-smi and libnvidia-ml.so.1 from host into node-agent container (read-only). These are the minimum binaries needed for nvidia-smi to execute inside the container. - Mounts are optional for the agent, but Docker does not ignore missing paths: with this short volume syntax it creates them as empty directories on the host, after which nvidia-smi fails to run and the agent falls back to the /dev/nvidia* scan (e.g. on nodes without NVIDIA hardware)
This commit is contained in:
parent
55ff2e717f
commit
726343db96
2 changed files with 44 additions and 5 deletions
|
|
@ -55,6 +55,12 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- /var/run/docker.sock:/var/run/docker.sock
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
- /dev:/dev:ro
|
- /dev:/dev:ro
|
||||||
|
# Bind-mount nvidia-smi + driver libs so the agent can query GPU details.
|
||||||
|
# These paths match Ubuntu installations with driver 595+. The mounts are
|
||||||
|
# optional for the agent, but not ignored by Docker: a missing host path is
|
||||||
|
# created as an empty directory, nvidia-smi fails, and the agent falls back to /dev/nvidia* file detection.
|
||||||
|
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
|
||||||
|
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
|
||||||
devices:
|
devices:
|
||||||
- /dev/blackmagic:/dev/blackmagic
|
- /dev/blackmagic:/dev/blackmagic
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -182,6 +182,33 @@ function sampleCpu() {
|
||||||
|
|
||||||
// ── Hardware detection ────────────────────────────────────────────────────
|
// ── Hardware detection ────────────────────────────────────────────────────
|
||||||
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
||||||
|
/**
 * Query nvidia-smi for rich GPU info (name, memory, driver).
 *
 * The nvidia-smi binary is expected to be bind-mounted from the host via
 * docker-compose.worker.yml.
 *
 * @param {() => string} [runSmi] - Produces the raw CSV output of nvidia-smi.
 *   Defaults to executing the real binary; overridable so the parsing can be
 *   exercised without NVIDIA hardware.
 * @returns {Array<{index: number, name: string, memory_mb: (number|null),
 *   driver: string, device: string, type: string}>|null}
 *   One entry per GPU row that could be parsed (an empty array when the
 *   command ran but reported no GPUs), or `null` when nvidia-smi is not
 *   available or failed, which tells the caller to fall back to the /dev scan.
 */
function detectGpusViaSmi(runSmi = runNvidiaSmi) {
  try {
    return runSmi()
      .trim()
      .split('\n')
      .filter(Boolean)
      .map((row) => parseSmiGpuRow(row))
      // Drop rows that are not GPU records (e.g. diagnostic text on stdout).
      .filter((gpu) => gpu !== null);
  } catch (_) {
    // Deliberate best-effort: any failure means "no nvidia-smi data".
    return null; // nvidia-smi not available or failed — fall back to /dev scan
  }
}

/** Run nvidia-smi and return its CSV output (no header, no units) as a string. */
function runNvidiaSmi() {
  const { execFileSync } = require('child_process');
  return execFileSync(
    'nvidia-smi',
    ['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'],
    { timeout: 5000, encoding: 'utf8' }
  );
}

/** Parse one "index, name, memory.total, driver_version" CSV row; null if malformed. */
function parseSmiGpuRow(row) {
  const fields = row.split(',').map((field) => field.trim());
  if (fields.length < 4) {
    return null;
  }

  const index = Number.parseInt(fields[0], 10);
  if (Number.isNaN(index)) {
    return null;
  }

  // Anchor on the first field and the last two so that a comma inside the
  // GPU name cannot shift the memory and driver columns.
  const driver = fields[fields.length - 1];
  const memMiB = Number.parseInt(fields[fields.length - 2], 10);
  const name = fields.slice(1, -2).join(', ');

  return {
    index,
    name,
    // Non-numeric memory (e.g. "[N/A]") becomes null instead of NaN.
    memory_mb: Number.isNaN(memMiB) ? null : memMiB,
    driver,
    device: `/dev/nvidia${index}`,
    type: 'nvidia',
  };
}
|
||||||
|
|
||||||
function detectHardware() {
|
function detectHardware() {
|
||||||
const capabilities = { gpus: [], blackmagic: [] };
|
const capabilities = { gpus: [], blackmagic: [] };
|
||||||
|
|
||||||
|
|
@ -191,11 +218,17 @@ function detectHardware() {
|
||||||
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (let i = 0; i < 16; i++) {
|
// Try nvidia-smi first for full GPU details; fall back to /dev file scan
|
||||||
try {
|
const smiGpus = detectGpusViaSmi();
|
||||||
fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK);
|
if (smiGpus !== null) {
|
||||||
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
capabilities.gpus = smiGpus;
|
||||||
} catch (_) { break; }
|
} else {
|
||||||
|
for (let i = 0; i < 16; i++) {
|
||||||
|
try {
|
||||||
|
fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK);
|
||||||
|
capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
|
||||||
|
} catch (_) { break; }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue