fix(node-agent): GPU count from _gpuCache, not direct nvidia-smi
The node-agent image has no nvidia-smi binary, so the direct execFileSync detection always failed → fell back to 1 GPU → NVIDIA_VISIBLE_DEVICES=all (the exact bug we were fixing). Take the count from the existing _gpuCache instead; it is populated at startup by probeGpusViaSmi, which runs nvidia-smi in a throwaway GPU container.
This commit is contained in:
parent
4be12c6f9a
commit
15c282e749
1 changed file with 10 additions and 14 deletions
|
|
@ -2,7 +2,7 @@ import http from 'http';
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import crypto from 'crypto';
|
import crypto from 'crypto';
|
||||||
import { spawn, execFileSync } from 'child_process';
|
import { spawn } from 'child_process';
|
||||||
|
|
||||||
const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, '');
|
const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, '');
|
||||||
const NODE_TOKEN = process.env.NODE_TOKEN || '';
|
const NODE_TOKEN = process.env.NODE_TOKEN || '';
|
||||||
|
|
@ -32,20 +32,16 @@ const VERSION = '1.4.0';
|
||||||
// 8 ports = 16 sessions hammering one card → it falls below realtime → the
|
// 8 ports = 16 sessions hammering one card → it falls below realtime → the
|
||||||
// framecache ring laps → video freezes/stutters then recovers. Pinning each
|
// framecache ring laps → video freezes/stutters then recovers. Pinning each
|
||||||
// sidecar to GPU (port % CAPTURE_GPU_COUNT) spreads the load across all cards.
|
// sidecar to GPU (port % CAPTURE_GPU_COUNT) spreads the load across all cards.
|
||||||
// Auto-detected from nvidia-smi at boot; override with CAPTURE_GPU_COUNT.
|
// GPU count comes from _gpuCache (populated at startup via probeGpusViaSmi,
|
||||||
let CAPTURE_GPU_COUNT = parseInt(process.env.CAPTURE_GPU_COUNT || '0', 10) || 0;
|
// which runs nvidia-smi in a throwaway GPU container because the node-agent
|
||||||
|
// image itself has no nvidia-smi). CAPTURE_GPU_COUNT env overrides everything.
|
||||||
function detectGpuCount() {
|
function detectGpuCount() {
|
||||||
if (CAPTURE_GPU_COUNT > 0) return CAPTURE_GPU_COUNT;
|
const envN = parseInt(process.env.CAPTURE_GPU_COUNT || '0', 10) || 0;
|
||||||
try {
|
if (envN > 0) return envN;
|
||||||
const out = execFileSync('nvidia-smi', ['--query-gpu=index', '--format=csv,noheader'], { timeout: 5000 })
|
// _gpuCache is declared later (it is only read when this function runs, so the ref is fine at call
|
||||||
.toString().trim();
|
// time). null = not probed yet; [] = no GPUs; otherwise one entry per GPU.
|
||||||
const n = out ? out.split('\n').filter(Boolean).length : 0;
|
if (Array.isArray(_gpuCache) && _gpuCache.length > 0) return _gpuCache.length;
|
||||||
CAPTURE_GPU_COUNT = n > 0 ? n : 1;
|
return 1; // not probed yet / no GPUs → single-device fallback ('all')
|
||||||
} catch (_) {
|
|
||||||
CAPTURE_GPU_COUNT = 1; // no nvidia-smi / no GPU — single-device fallback
|
|
||||||
}
|
|
||||||
console.log(`[gpu] capture encodes will spread across ${CAPTURE_GPU_COUNT} GPU(s)`);
|
|
||||||
return CAPTURE_GPU_COUNT;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit
|
// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue