From f223bb8c8bbb27905f9131e3bb81b0fbb4199ea1 Mon Sep 17 00:00:00 2001 From: ZGaetano Date: Thu, 4 Jun 2026 16:02:22 +0000 Subject: [PATCH] fix(node-agent): count GPUs via /dev/nvidiaN device nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _gpuCache was empty (probeGpusViaSmi container didn't populate it), so the count fell back to 1 → NVIDIA_VISIBLE_DEVICES=all again. Count /dev/nvidiaN nodes directly (visible in the privileged node-agent container, confirmed 3) — same method the heartbeat uses. --- services/node-agent/index.js | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/services/node-agent/index.js b/services/node-agent/index.js index 26e4205..9b7427e 100644 --- a/services/node-agent/index.js +++ b/services/node-agent/index.js @@ -32,16 +32,21 @@ const VERSION = '1.4.0'; // 8 ports = 16 sessions hammering one card → it falls below realtime → the // framecache ring laps → video freezes/stutters then recovers. Pinning each // sidecar to GPU (port % CAPTURE_GPU_COUNT) spreads the load across all cards. -// GPU count comes from _gpuCache (populated at startup via probeGpusViaSmi, -// which runs nvidia-smi in a throwaway GPU container because the node-agent -// image itself has no nvidia-smi). CAPTURE_GPU_COUNT env overrides everything. +// GPU count for spreading capture encodes. The node-agent image has no +// nvidia-smi, and the startup probeGpusViaSmi cache can be empty, so count the +// /dev/nvidiaN device nodes directly (same approach the heartbeat uses — these +// are visible because the node-agent runs privileged with /dev bound). +// CAPTURE_GPU_COUNT / GPU_COUNT env override everything. 
/**
 * Determine how many GPUs capture sidecars should be spread across.
 *
 * Resolution order (first hit wins):
 *   1. CAPTURE_GPU_COUNT, then GPU_COUNT — whichever is the first to parse
 *      (base 10) to a positive integer.
 *   2. The length of _gpuCache, when it is a non-empty array.
 *   3. The number of consecutive device nodes /dev/nvidia0, /dev/nvidia1, …
 *      that exist, probing at most MAX_PROBED_NODES of them and stopping at
 *      the first missing one.
 *   4. 1, the single-device fallback ('all'), when nothing above matched.
 *
 * @returns {number} A positive GPU count.
 */
function detectGpuCount() {
  // Each override is parsed on its own so that a set-but-unusable
  // CAPTURE_GPU_COUNT (e.g. "0" or "auto") cannot hide a valid GPU_COUNT.
  for (const envName of ['CAPTURE_GPU_COUNT', 'GPU_COUNT']) {
    const parsed = parseInt(process.env[envName] || '', 10);
    if (parsed > 0) return parsed;
  }

  if (Array.isArray(_gpuCache) && _gpuCache.length > 0) return _gpuCache.length;

  // Cache empty or not an array: count the device nodes directly.
  const MAX_PROBED_NODES = 16;
  let nodeCount = 0;
  for (let i = 0; i < MAX_PROBED_NODES; i++) {
    try {
      fs.accessSync(`/dev/nvidia${i}`, fs.constants.F_OK);
      nodeCount++;
    } catch (_) {
      // Best effort: any access failure is treated as "no more GPUs".
      break;
    }
  }
  return nodeCount > 0 ? nodeCount : 1; // no GPU nodes → single-device fallback ('all')
}

// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit