From 15fab99d55d4ffd80a722945d710a7fd077dcedf Mon Sep 17 00:00:00 2001
From: ZGaetano
Date: Thu, 4 Jun 2026 16:05:14 +0000
Subject: [PATCH] fix(node-agent): scope DeviceRequests to chosen GPU so VISIBLE_DEVICES sticks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NVIDIA_VISIBLE_DEVICES was set per-port correctly (0/1/2) but ALL encodes
still landed on GPU 0 at 99%. Cause: HostConfig.DeviceRequests granted
Count:-1 (all GPUs), which OVERRIDES NVIDIA_VISIBLE_DEVICES — the container
saw all 3 cards and nvenc defaulted to device 0. Now build DeviceRequests
with DeviceIDs:[chosenGpu] so each sidecar truly sees only its one L4.
---
 services/node-agent/index.js | 38 ++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/services/node-agent/index.js b/services/node-agent/index.js
index 9b7427e..40ff530 100644
--- a/services/node-agent/index.js
+++ b/services/node-agent/index.js
@@ -62,6 +62,17 @@ function pickVisibleDevices(gpuUuid, capturePort) {
   return String(idx);
 }
 
+// Build the Docker DeviceRequests entry matching a NVIDIA_VISIBLE_DEVICES value.
+// 'all' → grant every GPU (Count -1). A specific index/UUID → grant ONLY that
+// device via DeviceIDs, so the container can't see (and nvenc can't fall back
+// to) any other card. A blanket Count:-1 would silently override the env var.
+function gpuDeviceRequest(visibleDevices) {
+  if (!visibleDevices || visibleDevices === 'all') {
+    return { Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] };
+  }
+  return { Driver: 'nvidia', DeviceIDs: [String(visibleDevices)], Capabilities: [['gpu']] };
+}
+
 // Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
 // the host installer — the value is only ever used to pick a script arg, never
 // interpolated into a shell string.
@@ -513,10 +524,10 @@ async function handleSidecarStart(body, res) {
     // numeric index) pins the sidecar to exactly that device; otherwise
     // Round-robin the encode across all GPUs by capture port (or honor an
     // explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
-    const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
-    sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+    var startVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
+    sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
     sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
-    console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+    console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
   }
 
   const hostConfig = {
@@ -525,12 +536,12 @@ async function handleSidecarStart(body, res) {
     Binds: binds,
   };
   if (useGpu) {
-    // Tell Docker to use the NVIDIA container runtime for this container.
-    // Equivalent to `docker run --gpus all` / `--runtime=nvidia`.
     hostConfig.Runtime = 'nvidia';
-    hostConfig.DeviceRequests = [
-      { Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] },
-    ];
+    // CRITICAL: scope DeviceRequests to the SAME single GPU as
+    // NVIDIA_VISIBLE_DEVICES. A blanket Count:-1 (all GPUs) OVERRIDES the env
+    // var, so every sidecar got all 3 cards and nvenc piled onto GPU 0. Pass
+    // the specific DeviceIDs so the container truly sees only its one GPU.
+    hostConfig.DeviceRequests = [gpuDeviceRequest(startVisibleDevices)];
   }
 
   const spec = {
@@ -824,19 +835,22 @@ async function handleSidecarStandby(body, res) {
   }
 
   const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
+  var standbyVisibleDevices = 'all';
   if (useGpu) {
     // Same round-robin GPU spread as the start path (see pickVisibleDevices).
-    const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
-    sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+    standbyVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
+    sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
     sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
-    console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+    console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
   }
   sidecarEnv.push(`FC_URL=${FC_URL}`);
 
   const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds };
   if (useGpu) {
     hostConfig.Runtime = 'nvidia';
-    hostConfig.DeviceRequests = [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }];
+    // Scope to the single chosen GPU (see gpuDeviceRequest) — a blanket
+    // Count:-1 overrides NVIDIA_VISIBLE_DEVICES and re-piles everything on GPU 0.
+    hostConfig.DeviceRequests = [gpuDeviceRequest(standbyVisibleDevices)];
   }
 
   // Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart).