From 15fab99d55d4ffd80a722945d710a7fd077dcedf Mon Sep 17 00:00:00 2001
From: ZGaetano <zgaetano@wilddragon.net>
Date: Thu, 4 Jun 2026 16:05:14 +0000
Subject: [PATCH] fix(node-agent): scope DeviceRequests to chosen GPU so
 VISIBLE_DEVICES sticks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NVIDIA_VISIBLE_DEVICES was set per-port correctly (0/1/2) but ALL encodes still landed on GPU 0 at 99%. Cause: HostConfig.DeviceRequests granted Count:-1 (all GPUs), which OVERRIDES NVIDIA_VISIBLE_DEVICES — the container saw all 3 cards and nvenc defaulted to device 0. Now build DeviceRequests with DeviceIDs:[chosenGpu] so each sidecar truly sees only its one L4.
---
 services/node-agent/index.js | 38 ++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/services/node-agent/index.js b/services/node-agent/index.js
index 9b7427e..40ff530 100644
--- a/services/node-agent/index.js
+++ b/services/node-agent/index.js
@@ -62,6 +62,17 @@ function pickVisibleDevices(gpuUuid, capturePort) {
   return String(idx);
 }
 
+// Build the Docker DeviceRequests entry matching a NVIDIA_VISIBLE_DEVICES value.
+// Falsy or 'all' → grant every GPU (Count: -1). Any other value (index/UUID) is
+// stringified into a one-element DeviceIDs list so ONLY that device is granted
+// and nvenc can't fall back to another card. Count:-1 would override the env var.
+function gpuDeviceRequest(visibleDevices) {
+  if (!visibleDevices || visibleDevices === 'all') {
+    return { Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] };
+  }
+  return { Driver: 'nvidia', DeviceIDs: [String(visibleDevices)], Capabilities: [['gpu']] };
+}
+
 // Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
 // the host installer — the value is only ever used to pick a script arg, never
 // interpolated into a shell string.
@@ -513,10 +524,10 @@ async function handleSidecarStart(body, res) {
       // numeric index) pins the sidecar to exactly that device; otherwise
       // Round-robin the encode across all GPUs by capture port (or honor an
       // explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
-      const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
-      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      var startVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
+      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
       sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
-      console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
     }
 
     const hostConfig = {
@@ -525,12 +536,12 @@ async function handleSidecarStart(body, res) {
       Binds: binds,
     };
     if (useGpu) {
-      // Tell Docker to use the NVIDIA container runtime for this container.
-      // Equivalent to `docker run --gpus all` / `--runtime=nvidia`.
       hostConfig.Runtime = 'nvidia';
-      hostConfig.DeviceRequests = [
-        { Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] },
-      ];
+      // CRITICAL: scope DeviceRequests to the SAME single GPU as
+      // NVIDIA_VISIBLE_DEVICES. A blanket Count:-1 (all GPUs) OVERRIDES the env
+      // var, so every sidecar got all 3 cards and nvenc piled onto GPU 0. Pass
+      // the specific DeviceIDs so the container truly sees only its one GPU.
+      hostConfig.DeviceRequests = [gpuDeviceRequest(startVisibleDevices)];
     }
 
     const spec = {
@@ -824,19 +835,22 @@ async function handleSidecarStandby(body, res) {
     }
 
     const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
+    var standbyVisibleDevices = 'all';
     if (useGpu) {
       // Same round-robin GPU spread as the start path (see pickVisibleDevices).
-      const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
-      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      standbyVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
+      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
       sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
-      console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
     }
     sidecarEnv.push(`FC_URL=${FC_URL}`);
 
     const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds };
     if (useGpu) {
       hostConfig.Runtime = 'nvidia';
-      hostConfig.DeviceRequests = [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }];
+      // Scope to the single chosen GPU (see gpuDeviceRequest) — a blanket
+      // Count:-1 overrides NVIDIA_VISIBLE_DEVICES and re-piles everything on GPU 0.
+      hostConfig.DeviceRequests = [gpuDeviceRequest(standbyVisibleDevices)];
     }
 
     // Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart).