fix(node-agent): count GPUs via /dev/nvidiaN device nodes
_gpuCache was empty (the probeGpusViaSmi container didn't populate it), so the GPU count fell back to 1 and NVIDIA_VISIBLE_DEVICES ended up as "all" again. Count the /dev/nvidiaN device nodes directly instead (they are visible in the privileged node-agent container; confirmed 3) — the same method the heartbeat uses.
This commit is contained in:
parent
15c282e749
commit
f223bb8c8b
1 changed file with 12 additions and 7 deletions
|
|
@ -32,16 +32,21 @@ const VERSION = '1.4.0';
|
||||||
// 8 ports = 16 sessions hammering one card → it falls below realtime → the
|
// 8 ports = 16 sessions hammering one card → it falls below realtime → the
|
||||||
// framecache ring laps → video freezes/stutters then recovers. Pinning each
|
// framecache ring laps → video freezes/stutters then recovers. Pinning each
|
||||||
// sidecar to GPU (port % CAPTURE_GPU_COUNT) spreads the load across all cards.
|
// sidecar to GPU (port % CAPTURE_GPU_COUNT) spreads the load across all cards.
|
||||||
// GPU count comes from _gpuCache (populated at startup via probeGpusViaSmi,
|
// GPU count for spreading capture encodes. The node-agent image has no
|
||||||
// which runs nvidia-smi in a throwaway GPU container because the node-agent
|
// nvidia-smi, and the startup probeGpusViaSmi cache can be empty, so count the
|
||||||
// image itself has no nvidia-smi). CAPTURE_GPU_COUNT env overrides everything.
|
// /dev/nvidiaN device nodes directly (same approach the heartbeat uses — these
|
||||||
|
// are visible because the node-agent runs privileged with /dev bound).
|
||||||
|
// CAPTURE_GPU_COUNT / GPU_COUNT env override everything.
|
||||||
function detectGpuCount() {
|
function detectGpuCount() {
|
||||||
const envN = parseInt(process.env.CAPTURE_GPU_COUNT || '0', 10) || 0;
|
const envN = parseInt(process.env.CAPTURE_GPU_COUNT || process.env.GPU_COUNT || '0', 10) || 0;
|
||||||
if (envN > 0) return envN;
|
if (envN > 0) return envN;
|
||||||
// _gpuCache is declared later (function hoisting makes the ref fine at call
|
|
||||||
// time). null = not probed yet; [] = no GPUs; otherwise one entry per GPU.
|
|
||||||
if (Array.isArray(_gpuCache) && _gpuCache.length > 0) return _gpuCache.length;
|
if (Array.isArray(_gpuCache) && _gpuCache.length > 0) return _gpuCache.length;
|
||||||
return 1; // not probed yet / no GPUs → single-device fallback ('all')
|
let n = 0;
|
||||||
|
for (let i = 0; i < 16; i++) {
|
||||||
|
try { fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK); n++; }
|
||||||
|
catch (_) { break; }
|
||||||
|
}
|
||||||
|
return n > 0 ? n : 1; // no GPU nodes → single-device fallback ('all')
|
||||||
}
|
}
|
||||||
|
|
||||||
// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit
|
// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue