fix(node-agent): scope DeviceRequests to chosen GPU so VISIBLE_DEVICES sticks
NVIDIA_VISIBLE_DEVICES was set per-port correctly (0/1/2) but ALL encodes still landed on GPU 0 at 99%. Cause: HostConfig.DeviceRequests granted Count:-1 (all GPUs), which OVERRIDES NVIDIA_VISIBLE_DEVICES — the container saw all 3 cards and nvenc defaulted to device 0. Now build DeviceRequests with DeviceIDs:[chosenGpu] so each sidecar truly sees only its one L4.
This commit is contained in:
parent
f223bb8c8b
commit
15fab99d55
1 changed files with 26 additions and 12 deletions
|
|
@ -62,6 +62,17 @@ function pickVisibleDevices(gpuUuid, capturePort) {
|
||||||
return String(idx);
|
return String(idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the Docker HostConfig.DeviceRequests entry that matches a
 * NVIDIA_VISIBLE_DEVICES value.
 *
 * - Unset (null/undefined/empty) or 'all' → grant every GPU (Count: -1).
 * - A specific index or UUID → grant ONLY that device via DeviceIDs, so the
 *   container cannot see (and nvenc cannot fall back to) any other card.
 *   A blanket Count: -1 would silently override the env var.
 *
 * @param {string|number|null|undefined} visibleDevices - The value chosen for
 *   NVIDIA_VISIBLE_DEVICES ('all', a GPU index, or a GPU UUID).
 * @returns {{Driver: string, Count?: number, DeviceIDs?: string[], Capabilities: string[][]}}
 *   A single DeviceRequests entry for the NVIDIA driver.
 */
function gpuDeviceRequest(visibleDevices) {
  // A numeric 0 is a valid GPU index, not "unset" — a plain falsy check would
  // wrongly widen the request for GPU 0 to every card on the host.
  const isUnset = !visibleDevices && visibleDevices !== 0;
  if (isUnset || visibleDevices === 'all') {
    return { Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] };
  }
  return { Driver: 'nvidia', DeviceIDs: [String(visibleDevices)], Capabilities: [['gpu']] };
}
|
||||||
|
|
||||||
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
|
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
|
||||||
// the host installer — the value is only ever used to pick a script arg, never
|
// the host installer — the value is only ever used to pick a script arg, never
|
||||||
// interpolated into a shell string.
|
// interpolated into a shell string.
|
||||||
|
|
@ -513,10 +524,10 @@ async function handleSidecarStart(body, res) {
|
||||||
// numeric index) pins the sidecar to exactly that device; otherwise
|
// numeric index) pins the sidecar to exactly that device; otherwise
|
||||||
// Round-robin the encode across all GPUs by capture port (or honor an
|
// Round-robin the encode across all GPUs by capture port (or honor an
|
||||||
// explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
|
// explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
|
||||||
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
var startVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
|
||||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||||
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${startVisibleDevices}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const hostConfig = {
|
const hostConfig = {
|
||||||
|
|
@ -525,12 +536,12 @@ async function handleSidecarStart(body, res) {
|
||||||
Binds: binds,
|
Binds: binds,
|
||||||
};
|
};
|
||||||
if (useGpu) {
|
if (useGpu) {
|
||||||
// Tell Docker to use the NVIDIA container runtime for this container.
|
|
||||||
// Equivalent to `docker run --gpus all` / `--runtime=nvidia`.
|
|
||||||
hostConfig.Runtime = 'nvidia';
|
hostConfig.Runtime = 'nvidia';
|
||||||
hostConfig.DeviceRequests = [
|
// CRITICAL: scope DeviceRequests to the SAME single GPU as
|
||||||
{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] },
|
// NVIDIA_VISIBLE_DEVICES. A blanket Count:-1 (all GPUs) OVERRIDES the env
|
||||||
];
|
// var, so every sidecar got all 3 cards and nvenc piled onto GPU 0. Pass
|
||||||
|
// the specific DeviceIDs so the container truly sees only its one GPU.
|
||||||
|
hostConfig.DeviceRequests = [gpuDeviceRequest(startVisibleDevices)];
|
||||||
}
|
}
|
||||||
|
|
||||||
const spec = {
|
const spec = {
|
||||||
|
|
@ -824,19 +835,22 @@ async function handleSidecarStandby(body, res) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
|
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
|
||||||
|
var standbyVisibleDevices = 'all';
|
||||||
if (useGpu) {
|
if (useGpu) {
|
||||||
// Same round-robin GPU spread as the start path (see pickVisibleDevices).
|
// Same round-robin GPU spread as the start path (see pickVisibleDevices).
|
||||||
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
standbyVisibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
|
||||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||||
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${standbyVisibleDevices}`);
|
||||||
}
|
}
|
||||||
sidecarEnv.push(`FC_URL=${FC_URL}`);
|
sidecarEnv.push(`FC_URL=${FC_URL}`);
|
||||||
|
|
||||||
const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds };
|
const hostConfig = { NetworkMode: 'host', Privileged: true, Binds: binds };
|
||||||
if (useGpu) {
|
if (useGpu) {
|
||||||
hostConfig.Runtime = 'nvidia';
|
hostConfig.Runtime = 'nvidia';
|
||||||
hostConfig.DeviceRequests = [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }];
|
// Scope to the single chosen GPU (see gpuDeviceRequest) — a blanket
|
||||||
|
// Count:-1 overrides NVIDIA_VISIBLE_DEVICES and re-piles everything on GPU 0.
|
||||||
|
hostConfig.DeviceRequests = [gpuDeviceRequest(standbyVisibleDevices)];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart).
|
// Warm up the bridge and inject FC_SLOT_ID (same as handleSidecarStart).
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue