fix(node-agent): scope DeviceRequests to chosen GPU so VISIBLE_DEVICES sticks
NVIDIA_VISIBLE_DEVICES was set per-port correctly (0/1/2) but ALL encodes still landed on GPU 0 at 99%. Cause: HostConfig.DeviceRequests granted Count:-1 (all GPUs), which OVERRIDES NVIDIA_VISIBLE_DEVICES — the container saw all 3 cards and nvenc defaulted to device 0. Now build DeviceRequests with DeviceIDs:[chosenGpu] so each sidecar truly sees only its one L4.
This commit is contained in:
parent
f223bb8c8b
commit
15fab99d55
1 changed file with 26 additions and 12 deletions
|
|
@ -62,6 +62,17 @@ function pickVisibleDevices(gpuUuid, capturePort) {
|
|||
return String(idx);
|
||||
}
|
||||
|
||||
/**
 * Build the Docker DeviceRequests entry matching a NVIDIA_VISIBLE_DEVICES value.
 *
 * - Unset (undefined, null, '') or 'all' → grant every GPU (Count: -1).
 * - A specific index/UUID → grant ONLY that device via DeviceIDs, so the
 *   container can't see (and nvenc can't fall back to) any other card. A
 *   blanket Count: -1 would silently override the env var.
 *
 * @param {string|number|null|undefined} visibleDevices - The value chosen for
 *   NVIDIA_VISIBLE_DEVICES: 'all', a device index, or a device UUID.
 * @returns {{Driver: string, Count?: number, DeviceIDs?: string[], Capabilities: string[][]}}
 *   A single DeviceRequests entry for the container's HostConfig.
 */
function gpuDeviceRequest(visibleDevices) {
  // A numeric index 0 is falsy but names a real GPU, so it must not be treated
  // as "unset" — that would widen the grant to every device.
  const isUnset = !visibleDevices && visibleDevices !== 0;
  if (isUnset || visibleDevices === 'all') {
    return { Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] };
  }
  // DeviceIDs entries are strings; normalise numeric indexes.
  return { Driver: 'nvidia', DeviceIDs: [String(visibleDevices)], Capabilities: [['gpu']] };
}
|
||||
|
||||
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
|
||||
// the host installer — the value is only ever used to pick a script arg, never
|
||||
// interpolated into a shell string.
|
||||
|
|
@ -513,10 +524,10 @@ async function handleSidecarStart(body, res) {
|
|||
// numeric index) pins the sidecar to exactly that device; otherwise
|
||||
// Round-robin the encode across all GPUs by capture port (or honor an
|
||||
// explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
|
||||
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
var startVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
|
||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
|
||||
}
|
||||
|
||||
const hostConfig = {
|
||||
|
|
@ -525,12 +536,12 @@ async function handleSidecarStart(body, res) {
|
|||
Binds: binds,
|
||||
};
|
||||
if (useGpu) {
|
||||
// Tell Docker to use the NVIDIA container runtime for this container.
|
||||
// Equivalent to `docker run --gpus all` / `--runtime=nvidia`.
|
||||
hostConfig.Runtime = 'nvidia';
|
||||
hostConfig.DeviceRequests = [
|
||||
{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] },
|
||||
];
|
||||
// CRITICAL: scope DeviceRequests to the SAME single GPU as
|
||||
// NVIDIA_VISIBLE_DEVICES. A blanket Count:-1 (all GPUs) OVERRIDES the env
|
||||
// var, so every sidecar got all 3 cards and nvenc piled onto GPU 0. Pass
|
||||
// the specific DeviceIDs so the container truly sees only its one GPU.
|
||||
hostConfig.DeviceRequests = [gpuDeviceRequest(startVisibleDevices)];
|
||||
}
|
||||
|
||||
const spec = {
|
||||
|
|
@ -824,19 +835,22 @@ async function handleSidecarStandby(body, res) {
|
|||
}
|
||||
|
||||
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
|
||||
var standbyVisibleDevices = 'all';
|
||||
if (useGpu) {
|
||||
// Same round-robin GPU spread as the start path (see pickVisibleDevices).
|
||||
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
standbyVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
|
||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
|
||||
}
|
||||
sidecarEnv.push(`FC_URL=${FC_URL}`);
|
||||
|
||||
const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds };
|
||||
if (useGpu) {
|
||||
hostConfig.Runtime = 'nvidia';
|
||||
hostConfig.DeviceRequests = [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }];
|
||||
// Scope to the single chosen GPU (see gpuDeviceRequest) — a blanket
|
||||
// Count:-1 overrides NVIDIA_VISIBLE_DEVICES and re-piles everything on GPU 0.
|
||||
hostConfig.DeviceRequests = [gpuDeviceRequest(standbyVisibleDevices)];
|
||||
}
|
||||
|
||||
// Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart).
|
||||
|
|
|
|||
Loading…
Reference in a new issue