fix(node-agent): scope DeviceRequests to chosen GPU so VISIBLE_DEVICES sticks

NVIDIA_VISIBLE_DEVICES was set per-port correctly (0/1/2) but ALL encodes still landed on GPU 0 at 99%. Cause: HostConfig.DeviceRequests granted Count:-1 (all GPUs), which OVERRIDES NVIDIA_VISIBLE_DEVICES — the container saw all 3 cards and nvenc defaulted to device 0. Now build DeviceRequests with DeviceIDs:[chosenGpu] so each sidecar truly sees only its one L4.
This commit is contained in:
Zac Gaetano 2026-06-04 16:05:14 +00:00
parent f223bb8c8b
commit 15fab99d55

View file

@ -62,6 +62,17 @@ function pickVisibleDevices(gpuUuid, capturePort) {
return String(idx); return String(idx);
} }
/**
 * Build the Docker HostConfig.DeviceRequests entry that matches a
 * NVIDIA_VISIBLE_DEVICES value, so the device grant and the env var agree.
 *
 * - Unset / empty / 'all' → grant every GPU (Count: -1).
 * - A specific index or UUID → grant ONLY that device via DeviceIDs, so the
 *   container can't see (and nvenc can't fall back to) any other card. A
 *   blanket Count:-1 would silently override the env var.
 * - A comma-separated list ('0,1') → one DeviceIDs entry per listed device.
 *
 * @param {string|number|null|undefined} visibleDevices - NVIDIA_VISIBLE_DEVICES
 *   value: 'all', a GPU index, a GPU UUID, or a comma-separated list of those.
 * @returns {{Driver: string, Count?: number, DeviceIDs?: string[], Capabilities: string[][]}}
 *   A single DeviceRequests entry; it carries either Count or DeviceIDs, never both.
 */
function gpuDeviceRequest(visibleDevices) {
  const capabilities = [['gpu']];

  // A numeric index 0 is falsy; stringify integers first so GPU 0 is pinned
  // like any other index instead of falling through to the "all GPUs" grant.
  const spec = Number.isInteger(visibleDevices) ? String(visibleDevices) : visibleDevices;

  if (!spec || spec === 'all') {
    return { Driver: 'nvidia', Count: -1, Capabilities: capabilities };
  }

  // DeviceIDs is a list: split a comma-separated value into individual IDs.
  const raw = String(spec);
  const ids = raw
    .split(',')
    .map((id) => id.trim())
    .filter((id) => id !== '');

  // If splitting leaves nothing usable (e.g. ','), pass the raw value through
  // unchanged rather than silently widening the grant to every GPU.
  return {
    Driver: 'nvidia',
    DeviceIDs: ids.length > 0 ? ids : [raw],
    Capabilities: capabilities,
  };
}
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to // Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
// the host installer — the value is only ever used to pick a script arg, never // the host installer — the value is only ever used to pick a script arg, never
// interpolated into a shell string. // interpolated into a shell string.
@ -513,10 +524,10 @@ async function handleSidecarStart(body, res) {
// numeric index) pins the sidecar to exactly that device; otherwise // numeric index) pins the sidecar to exactly that device; otherwise
// Round-robin the encode across all GPUs by capture port (or honor an // Round-robin the encode across all GPUs by capture port (or honor an
// explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0. // explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort); var startVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility'); sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
} }
const hostConfig = { const hostConfig = {
@ -525,12 +536,12 @@ async function handleSidecarStart(body, res) {
Binds: binds, Binds: binds,
}; };
if (useGpu) { if (useGpu) {
// Tell Docker to use the NVIDIA container runtime for this container.
// Equivalent to `docker run --gpus all` / `--runtime=nvidia`.
hostConfig.Runtime = 'nvidia'; hostConfig.Runtime = 'nvidia';
hostConfig.DeviceRequests = [ // CRITICAL: scope DeviceRequests to the SAME single GPU as
{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }, // NVIDIA_VISIBLE_DEVICES. A blanket Count:-1 (all GPUs) OVERRIDES the env
]; // var, so every sidecar got all 3 cards and nvenc piled onto GPU 0. Pass
// the specific DeviceIDs so the container truly sees only its one GPU.
hostConfig.DeviceRequests = [gpuDeviceRequest(startVisibleDevices)];
} }
const spec = { const spec = {
@ -824,19 +835,22 @@ async function handleSidecarStandby(body, res) {
} }
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1']; const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
var standbyVisibleDevices = 'all';
if (useGpu) { if (useGpu) {
// Same round-robin GPU spread as the start path (see pickVisibleDevices). // Same round-robin GPU spread as the start path (see pickVisibleDevices).
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort); standbyVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility'); sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
} }
sidecarEnv.push(`FC_URL=${FC_URL}`); sidecarEnv.push(`FC_URL=${FC_URL}`);
const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds }; const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds };
if (useGpu) { if (useGpu) {
hostConfig.Runtime = 'nvidia'; hostConfig.Runtime = 'nvidia';
hostConfig.DeviceRequests = [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }]; // Scope to the single chosen GPU (see gpuDeviceRequest) — a blanket
// Count:-1 overrides NVIDIA_VISIBLE_DEVICES and re-piles everything on GPU 0.
hostConfig.DeviceRequests = [gpuDeviceRequest(standbyVisibleDevices)];
} }
// Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart). // Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart).