fix(node-agent): scope DeviceRequests to chosen GPU so VISIBLE_DEVICES sticks

NVIDIA_VISIBLE_DEVICES was set per-port correctly (0/1/2) but ALL encodes still landed on GPU 0 at 99%. Cause: HostConfig.DeviceRequests granted Count:-1 (all GPUs), which OVERRIDES NVIDIA_VISIBLE_DEVICES — the container saw all 3 cards and nvenc defaulted to device 0. Now build DeviceRequests with DeviceIDs:[chosenGpu] so each sidecar truly sees only its one L4.
2026-06-04 16:05:14 +00:00 · 2026-06-04 16:05:14 +00:00 · 15fab99d55
commit 15fab99d55
parent f223bb8c8b
1 changed files with 26 additions and 12 deletions
--- a/services/node-agent/index.js
+++ b/services/node-agent/index.js
@ -62,6 +62,17 @@ function pickVisibleDevices(gpuUuid, capturePort) {
  return String(idx);
 }
 // Build the Docker DeviceRequests entry matching a NVIDIA_VISIBLE_DEVICES value.
 //
 // visibleDevices: the value that is (or would be) exported as
 //   NVIDIA_VISIBLE_DEVICES — 'all', a single index or UUID, or a
 //   comma-separated list of them. A number is accepted as an index.
 // Returns: one DeviceRequest object for HostConfig.DeviceRequests.
 //   - missing / empty / 'all' → every GPU (Count -1)
 //   - anything else           → ONLY the listed device(s) via DeviceIDs, so
 //     the container can't see (and nvenc can't fall back to) any other card.
 //     A blanket Count:-1 would silently override the env var.
 function gpuDeviceRequest(visibleDevices) {
   const capabilities = [['gpu']];
   // A numeric index 0 is falsy; it must still pin to GPU 0 instead of being
   // mistaken for "no value" and widened to every GPU.
   const hasValue = visibleDevices === 0 || Boolean(visibleDevices);
   const raw = hasValue ? String(visibleDevices).trim() : '';
   // NVIDIA_VISIBLE_DEVICES may carry a comma-separated list; DeviceIDs wants
   // one array element per device, not the joined string.
   const deviceIds = raw
     .split(',')
     .map((id) => id.trim())
     .filter((id) => id !== '');
   if (raw === 'all' || deviceIds.length === 0) {
     return { Driver: 'nvidia', Count: -1, Capabilities: capabilities };
   }
   return { Driver: 'nvidia', DeviceIDs: deviceIds, Capabilities: capabilities };
 }
 // Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
 // the host installer — the value is only ever used to pick a script arg, never
 // interpolated into a shell string.
@ -513,10 +524,10 @@ async function handleSidecarStart(body, res) {
      // numeric index) pins the sidecar to exactly that device; otherwise
      // Round-robin the encode across all GPUs by capture port (or honor an
      // explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
-      const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
+      var startVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
-      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
      sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
-      console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
    }
    const hostConfig = {
@ -525,12 +536,12 @@ async function handleSidecarStart(body, res) {
      Binds: binds,
    };
    if (useGpu) {
      // Tell Docker to use the NVIDIA container runtime for this container.
      // Equivalent to `docker run --gpus all` / `--runtime=nvidia`.
      hostConfig.Runtime = 'nvidia';
-      hostConfig.DeviceRequests = [
+      // CRITICAL: scope DeviceRequests to the SAME single GPU as
-        { Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] },
+      // NVIDIA_VISIBLE_DEVICES. A blanket Count:-1 (all GPUs) OVERRIDES the env
-      ];
+      // var, so every sidecar got all 3 cards and nvenc piled onto GPU 0. Pass
      // the specific DeviceIDs so the container truly sees only its one GPU.
      hostConfig.DeviceRequests = [gpuDeviceRequest(startVisibleDevices)];
    }
    const spec = {
@ -824,19 +835,22 @@ async function handleSidecarStandby(body, res) {
    }
    const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
    var standbyVisibleDevices = 'all';
    if (useGpu) {
      // Same round-robin GPU spread as the start path (see pickVisibleDevices).
-      const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
+      standbyVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
-      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
      sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
-      console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
+      console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
    }
    sidecarEnv.push(`FC_URL=${FC_URL}`);
    const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds };
    if (useGpu) {
      hostConfig.Runtime = 'nvidia';
-      hostConfig.DeviceRequests = [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }];
+      // Scope to the single chosen GPU (see gpuDeviceRequest) — a blanket
      // Count:-1 overrides NVIDIA_VISIBLE_DEVICES and re-piles everything on GPU 0.
      hostConfig.DeviceRequests = [gpuDeviceRequest(standbyVisibleDevices)];
    }
    // Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart).