feat(gpu+capture): nvenc HLS preview, source-backend abstraction, GPU affinity+telemetry
#164: HLS preview uses h264_nvenc (forced IDR, GOP = one segment) when the sidecar has the GPU; otherwise it keeps the libx264 fallback. #168: source-backend abstraction in capture-manager (blackmagic implemented as a behavior-preserving refactor; deltacast/aja stubbed pending hardware). #167: per-recorder gpu_uuid (migration 032) plumbed mam-api -> node-agent -> NVIDIA_VISIBLE_DEVICES (defaults to 'all'). #166: node-agent reports encoder utilization and NVENC session count per GPU; the Cluster screen renders per-GPU GPU/ENC utilization, VRAM, and sessions. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
ca1eec0600
commit
08499b93b2
5 changed files with 291 additions and 67 deletions
|
|
@ -133,6 +133,59 @@ const VIDEO_CODECS = {
|
|||
},
|
||||
};
|
||||
|
||||
// nvenc codecs available in the capture image. Used both to validate the master
// codec and (issue #164) as the GPU-availability signal for the HLS preview.
const NVENC_CODECS = new Set(['h264_nvenc', 'hevc_nvenc']);

// ── GPU availability for this sidecar (issue #164) ───────────────────────
// The HLS monitor preview should be GPU-encoded (h264_nvenc) when — and only
// when — the GPU is actually attached to this capture container. A non-GPU
// recorder must keep using libx264, otherwise ffmpeg would fail to open the
// nvenc encoder and break the preview.
//
// Two signals, OR'd for robustness:
//   1) The master video codec is an nvenc codec. recorders.js derives `useGpu`
//      from exactly this (GPU_CODECS = [hevc_nvenc, h264_nvenc]) and node-agent
//      only attaches the NVIDIA runtime when useGpu is set — so an nvenc master
//      codec is a reliable proxy for "this sidecar has the GPU".
//   2) node-agent injects NVIDIA_VISIBLE_DEVICES into the sidecar env whenever
//      useGpu is set. This is the most direct in-process evidence the runtime
//      attached a GPU, and covers the (currently unused) case where the GPU is
//      present but the master codec is a CPU codec.
//
// NOTE(review): if the capture image is built FROM an nvidia/cuda base image,
// that base may already bake NVIDIA_VISIBLE_DEVICES into its ENV, which would
// make signal 2 true without a GPU attached — confirm against the Dockerfile.

/**
 * Decide whether the HLS preview may be encoded with h264_nvenc.
 *
 * @param {string} [masterCodec] - ffmpeg encoder name of the master (hi-res) output.
 * @returns {boolean} true when the master codec is an nvenc codec, or when
 *   NVIDIA_VISIBLE_DEVICES names at least one device; false otherwise.
 */
function gpuAvailableForPreview(masterCodec) {
  if (NVENC_CODECS.has(masterCodec)) return true;

  // Trim before comparing: an unset, empty or whitespace-only value, or a padded
  // "void"/"none", all mean "no GPU exposed" and must keep the libx264 path.
  const visible = (process.env.NVIDIA_VISIBLE_DEVICES ?? '').trim();
  return visible !== '' && visible !== 'void' && visible !== 'none';
}
|
||||
|
||||
// Parse a capture framerate into frames-per-second. Accepts a number, a numeric
// string ("25", "29.97") or a rational string ("30000/1001", "30000:1001").
// Returns null when the value is missing, non-numeric, non-finite or <= 0.
function parsePreviewFrameRate(framerate) {
  const text = String(framerate ?? '').trim();
  const ratio = /^(\d+(?:\.\d+)?)\s*[/:]\s*(\d+(?:\.\d+)?)$/.exec(text);
  const fps = ratio ? Number(ratio[1]) / Number(ratio[2]) : Number.parseFloat(text);
  return Number.isFinite(fps) && fps > 0 ? fps : null;
}

/**
 * Build the HLS preview video-encode args.
 *
 * The GOP/keyframe interval is pinned to one IDR per HLS segment so every
 * segment starts on a keyframe (misaligned keyframes were the root cause of the
 * playout preview black/flashing bug — keep the preview robust).
 *
 * @param {string} masterCodec - master (hi-res) encoder name; passed to
 *   gpuAvailableForPreview() to choose between h264_nvenc and libx264.
 * @param {number|string} [framerate] - capture framerate; a number, numeric
 *   string or rational string. Falls back to 30 (matches the test-card rate)
 *   when unknown or unusable. A rational such as "30000/1001" is evaluated as a
 *   fraction rather than truncated to its numerator.
 * @param {number} [segTime=2] - HLS segment length in seconds. Must match the
 *   `-hls_time` value the caller passes to ffmpeg; the default matches the
 *   `-hls_time 2` used by the preview outputs in this file.
 * @returns {string[]} ffmpeg output args (codec, preset, pixel format, bitrate, GOP).
 */
function buildHlsVideoArgs(masterCodec, framerate, segTime = 2) {
  const fps = parsePreviewFrameRate(framerate) ?? 30;

  const requestedSeconds = Number(segTime);
  const segmentSeconds =
    Number.isFinite(requestedSeconds) && requestedSeconds > 0 ? requestedSeconds : 2;

  // Frames per segment, never below one frame.
  const gop = Math.max(1, Math.round(fps * segmentSeconds));

  if (gpuAvailableForPreview(masterCodec)) {
    // Low-latency NVENC preset (p1 + ll tune). forced-idr + a keyframe every GOP
    // frames keeps segment boundaries on IDR frames so hls.js can sync cleanly.
    return [
      '-c:v', 'h264_nvenc', '-preset', 'p1', '-tune', 'll',
      '-pix_fmt', 'yuv420p', '-b:v', '2M',
      '-g', String(gop), '-forced-idr', '1', '-sc_threshold', '0',
    ];
  }

  // No GPU → keep the original CPU encode (must not break a non-GPU recorder).
  return [
    '-c:v', 'libx264', '-preset', 'veryfast', '-tune', 'zerolatency',
    '-pix_fmt', 'yuv420p', '-b:v', '2M',
    '-g', String(gop), '-sc_threshold', '0',
  ];
}
|
||||
|
||||
const AUDIO_CODECS = {
|
||||
pcm_s16le: { args: ['-c:a', 'pcm_s16le'], bitrateControl: false },
|
||||
pcm_s24le: { args: ['-c:a', 'pcm_s24le'], bitrateControl: false },
|
||||
|
|
@ -182,6 +235,88 @@ const GROWING_VIDEO_ARGS = [
|
|||
];
|
||||
const GROWING_EXT = 'mxf';
|
||||
|
||||
// ── Source-backend abstraction (issue #168) ──────────────────────────────
// The capture input was historically hard-wired to a single `-f decklink -i …`
// construction. To allow other SDI capture cards (Deltacast, AJA) to be added
// later without touching the encode/output/HLS pipeline, the per-backend FFmpeg
// INPUT-arg construction now lives behind this map. Each backend exposes:
//
//   buildInput(ctx) -> { inputArgs, isNetwork }   (may be async)
//
// where `ctx` carries the resolved recorder fields the backend needs (device).
// The rest of capture-manager consumes the returned `inputArgs` unchanged, so
// adding a backend is purely additive.
//
// IMPORTANT: `blackmagic` is an extraction of the previous default DeckLink
// path — for an existing DeckLink recorder the produced ffmpeg input args are
// identical to the pre-refactor code. The `deltacast`/`aja` entries are stubs
// that throw until the hardware/SDK plumbing lands.

// One device row of `ffmpeg -sources decklink`, e.g.
//   " 81:76669a80:00000000 [DeckLink Duo (1)] (none)"
// i.e. leading whitespace, a colon-separated hex id, then the display name in
// square brackets. Capture group 1 is the display name.
// NOTE(review): an earlier comment in this file described a name-only listing
// ("  DeckLink Duo 2") and called the hex-id pattern wrong; the parser below
// follows the bracketed format shown above — confirm against real ffmpeg output.
const DECKLINK_SOURCE_ROW = /^\s+[0-9a-f:]+\s+\[([^\]]+)\]/;

// Extract DeckLink display names, in listing order, from `-sources` output.
function parseDeckLinkSourceNames(listing) {
  const names = [];
  for (const line of String(listing).split('\n')) {
    const row = line.match(DECKLINK_SOURCE_ROW);
    if (row) names.push(row[1]);
  }
  return names;
}

// Run `ffmpeg -sources decklink` without a shell and without blocking the event
// loop (5 s timeout). Resolves with the detected display names; rejects when
// ffmpeg is missing, exits non-zero or times out.
async function probeDeckLinkSourceNames() {
  const [{ execFile }, { promisify }] = await Promise.all([
    import('child_process'),
    import('util'),
  ]);
  const { stdout, stderr } = await promisify(execFile)(
    'ffmpeg',
    ['-hide_banner', '-sources', 'decklink'],
    { encoding: 'utf-8', timeout: 5000 },
  );
  // Scan both streams, as the previous `2>&1` invocation did.
  return parseDeckLinkSourceNames(`${stdout}\n${stderr}`);
}

const sourceBackends = {
  // BlackMagic DeckLink over SDI (the only backend implemented today).
  // device may be an integer index (0-based) or a full device name string.
  // FFmpeg 7.x DeckLink requires the full name (e.g. 'DeckLink Duo 2 (2)'), so
  // an integer index is mapped to a name via `ffmpeg -sources decklink` at
  // runtime; a non-numeric device string is passed through untouched.
  blackmagic: {
    /**
     * @param {{ device: number|string }} ctx - DeckLink index or full device name.
     * @returns {Promise<{ inputArgs: string[], isNetwork: boolean }>}
     */
    async buildInput({ device }) {
      let deckLinkName = String(device);
      if (typeof device === 'number' || /^\d+$/.test(String(device))) {
        const idx = parseInt(device, 10);
        try {
          const names = await probeDeckLinkSourceNames();
          if (names[idx]) {
            deckLinkName = names[idx];
            console.log(`[capture] DeckLink index ${idx} → "${deckLinkName}" (from ${names.length} detected: ${names.join(', ')})`);
          } else {
            // Fallback: cannot determine model name without enumeration.
            // Log a warning — operator should check the detected device list.
            console.warn(`[capture] DeckLink index ${idx} out of range (detected ${names.length} devices: ${names.join(', ')}). Falling back to index-only input — capture may fail.`);
            deckLinkName = `DeckLink (${idx})`;
          }
        } catch (err) {
          console.warn(`[capture] ffmpeg -sources decklink failed: ${err.message}. Using index ${device} directly.`);
          // Pass the numeric index directly; some ffmpeg builds accept it.
          deckLinkName = String(device);
        }
      }
      return {
        inputArgs: ['-f', 'decklink', '-i', deckLinkName],
        isNetwork: false,
      };
    },
  },

  // Stubs — hardware/SDK plumbing not yet implemented. These throw clearly so a
  // misconfigured recorder fails fast instead of silently falling back to the
  // wrong card.
  deltacast: {
    buildInput() {
      throw new Error('deltacast backend not yet implemented — requires hardware');
    },
  },
  aja: {
    buildInput() {
      throw new Error('aja backend not yet implemented — requires hardware');
    },
  },
};
|
||||
|
||||
function buildEncodeArgs({
|
||||
codec, videoBitrate, framerate,
|
||||
audioCodec, audioBitrate, audioChannels,
|
||||
|
|
@ -257,7 +392,7 @@ class CaptureManager {
|
|||
* Returns { inputArgs, isNetwork }
|
||||
* @private
|
||||
*/
|
||||
async _buildInputArgs({ sourceType, device, sourceUrl, listen, listenPort, streamKey }) {
|
||||
async _buildInputArgs({ sourceType, sourceBackend = 'blackmagic', device, sourceUrl, listen, listenPort, streamKey }) {
|
||||
if (sourceType === 'srt') {
|
||||
let url;
|
||||
if (listen) {
|
||||
|
|
@ -321,50 +456,15 @@ class CaptureManager {
|
|||
}
|
||||
}
|
||||
|
||||
// Default: SDI via DeckLink
|
||||
// device may be an integer index (0-based) or a full device name string.
|
||||
// FFmpeg 7.x DeckLink requires the full name (e.g. 'DeckLink Duo 2 (2)').
|
||||
// Map integer index -> name using ffmpeg -sources decklink at runtime.
|
||||
//
|
||||
// ffmpeg -sources decklink output format:
|
||||
// Auto-detected sources for decklink:
|
||||
// DeckLink Duo 2
|
||||
// DeckLink Duo 2 (2)
|
||||
// Lines containing device names start with whitespace; the header line
|
||||
// starts with a non-space character. Previous code used a v4l2-style
|
||||
// hex-address regex that never matched DeckLink output → index 1+ always
|
||||
// fell through to a wrong fallback, producing black output from port 2+.
|
||||
let deckLinkName = String(device);
|
||||
if (typeof device === 'number' || /^\d+$/.test(String(device))) {
|
||||
const idx = parseInt(device, 10);
|
||||
try {
|
||||
const { execSync } = await import('child_process');
|
||||
const out = execSync('ffmpeg -hide_banner -sources decklink 2>&1', { encoding: 'utf-8', timeout: 5000 });
|
||||
const names = [];
|
||||
for (const line of out.split('\n')) {
|
||||
// DeckLink source lines: " 81:76669a80:00000000 [DeckLink Duo (1)] (none)"
|
||||
const m = line.match(/^\s+[0-9a-f:]+\s+\[([^\]]+)\]/);
|
||||
if (m) names.push(m[1]);
|
||||
}
|
||||
if (names[idx]) {
|
||||
deckLinkName = names[idx];
|
||||
console.log(`[capture] DeckLink index ${idx} → "${deckLinkName}" (from ${names.length} detected: ${names.join(', ')})`);
|
||||
} else {
|
||||
// Fallback: cannot determine model name without enumeration.
|
||||
// Log a warning — operator should check the detected device list.
|
||||
console.warn(`[capture] DeckLink index ${idx} out of range (detected ${names.length} devices: ${names.join(', ')}). Falling back to index-only input — capture may fail.`);
|
||||
deckLinkName = `DeckLink (${idx})`;
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn(`[capture] ffmpeg -sources decklink failed: ${err.message}. Using index ${device} directly.`);
|
||||
// Pass the numeric index directly; some ffmpeg builds accept it.
|
||||
deckLinkName = String(device);
|
||||
}
|
||||
// Default: SDI via a pluggable source backend (issue #168). The backend
|
||||
// selection defaults to `blackmagic` (DeckLink) so existing SDI recorders
|
||||
// behave exactly as before. Deltacast/AJA backends throw until their
|
||||
// hardware/SDK plumbing lands.
|
||||
const backend = sourceBackends[sourceBackend];
|
||||
if (!backend) {
|
||||
throw new Error(`Unknown source backend "${sourceBackend}" — expected one of: ${Object.keys(sourceBackends).join(', ')}`);
|
||||
}
|
||||
return {
|
||||
inputArgs: ['-f', 'decklink', '-i', deckLinkName],
|
||||
isNetwork: false,
|
||||
};
|
||||
return await backend.buildInput({ device });
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -380,6 +480,9 @@ class CaptureManager {
|
|||
clipName,
|
||||
device,
|
||||
sourceType = 'sdi',
|
||||
// Source-backend selection for SDI capture (issue #168). Defaults to
|
||||
// `blackmagic` (DeckLink) so existing recorders are unaffected.
|
||||
sourceBackend = 'blackmagic',
|
||||
sourceUrl,
|
||||
listen = false,
|
||||
listenPort,
|
||||
|
|
@ -447,7 +550,7 @@ class CaptureManager {
|
|||
const startedAt = new Date().toISOString();
|
||||
|
||||
const { inputArgs, isNetwork } = await this._buildInputArgs({
|
||||
sourceType, device, sourceUrl, listen, listenPort, streamKey,
|
||||
sourceType, sourceBackend, device, sourceUrl, listen, listenPort, streamKey,
|
||||
});
|
||||
|
||||
const hiresCodecArgs = buildEncodeArgs({
|
||||
|
|
@ -506,10 +609,12 @@ class CaptureManager {
|
|||
'-map', '[vhi]', '-map', '0:a:0?',
|
||||
...hiresCodecArgs,
|
||||
hiresOutput,
|
||||
// Output 1 — low-latency H.264 HLS preview for the UI monitor
|
||||
// Output 1 — low-latency H.264 HLS preview for the UI monitor.
|
||||
// GPU-encoded (h264_nvenc) when the GPU is attached to this sidecar,
|
||||
// otherwise libx264 (issue #164). GOP is pinned to one IDR per HLS
|
||||
// segment so segments start on keyframes (avoids black/flashing).
|
||||
'-map', '[vlo]', '-map', '0:a:0?',
|
||||
'-c:v', 'libx264', '-preset', 'veryfast', '-tune', 'zerolatency',
|
||||
'-pix_fmt', 'yuv420p', '-b:v', '2M', '-g', '60', '-sc_threshold', '0',
|
||||
...buildHlsVideoArgs(videoCodec, framerate),
|
||||
'-c:a', 'aac', '-b:a', '128k', '-ar', '44100',
|
||||
'-f', 'hls', '-hls_time', '2', '-hls_list_size', '15',
|
||||
'-hls_flags', 'delete_segments+append_list+omit_endlist',
|
||||
|
|
@ -541,8 +646,8 @@ class CaptureManager {
|
|||
const hlsArgs = [
|
||||
...inputArgs,
|
||||
'-map', '0:v:0?', '-map', '0:a:0?',
|
||||
'-c:v', 'libx264', '-preset', 'veryfast', '-tune', 'zerolatency',
|
||||
'-pix_fmt', 'yuv420p', '-b:v', '2M', '-g', '60', '-sc_threshold', '0',
|
||||
// GPU-gated preview encode, same as the SDI 2nd-output path (#164).
|
||||
...buildHlsVideoArgs(videoCodec, framerate),
|
||||
'-c:a', 'aac', '-b:a', '128k', '-ar', '44100',
|
||||
'-f', 'hls', '-hls_time', '2', '-hls_list_size', '15',
|
||||
'-hls_flags', 'delete_segments+append_list+omit_endlist',
|
||||
|
|
@ -756,4 +861,4 @@ class CaptureManager {
|
|||
}
|
||||
|
||||
export default new CaptureManager();
|
||||
export { VIDEO_CODECS, AUDIO_CODECS, CONTAINER_FMT, CONTAINER_EXT };
|
||||
export { VIDEO_CODECS, AUDIO_CODECS, CONTAINER_FMT, CONTAINER_EXT, sourceBackends };
|
||||
|
|
|
|||
|
|
@ -0,0 +1,10 @@
|
|||
-- Migration 032: per-recorder GPU affinity (issue #167).
--
-- Adds the nullable column recorders.gpu_uuid so a recorder can be pinned to
-- one specific GPU on its node. The value travels in the node-agent
-- sidecar-start payload and becomes NVIDIA_VISIBLE_DEVICES for the capture
-- container.
--   NULL      -> legacy behavior: NVIDIA_VISIBLE_DEVICES=all (every GPU visible)
--   otherwise -> an nvidia-smi GPU UUID (e.g. "GPU-xxxx") or a numeric index
--                string
-- IF NOT EXISTS keeps the migration safe to re-run.

ALTER TABLE recorders ADD COLUMN IF NOT EXISTS gpu_uuid TEXT DEFAULT NULL;
|
||||
|
|
@ -604,6 +604,11 @@ router.post('/:id/start', requireRecorderEdit, async (req, res, next) => {
|
|||
// module-level GPU_CODECS list); extend it if av1_nvenc or others are added.
|
||||
const useGpu = GPU_CODECS.includes(recorder.recording_codec);
|
||||
|
||||
// Issue #167 — per-recorder GPU affinity. When recorders.gpu_uuid is set the
|
||||
// sidecar is pinned to that single device (NVIDIA_VISIBLE_DEVICES=<uuid>);
|
||||
// null keeps the legacy "all" behavior. Only meaningful when useGpu is true.
|
||||
const gpuUuid = recorder.gpu_uuid || null;
|
||||
|
||||
// Determine whether to spawn locally or via a remote node-agent.
|
||||
const { remote: isRemote, apiUrl: targetNodeApiUrl } = await resolveNodeTarget(recorder.node_id);
|
||||
// For remote sidecars, the capture container runs on the worker host network and cannot
|
||||
|
|
@ -621,7 +626,7 @@ router.post('/:id/start', requireRecorderEdit, async (req, res, next) => {
|
|||
const sidecarRes = await fetch(`${targetNodeApiUrl}/sidecar/start`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ image: 'wild-dragon-capture:latest', env, capturePort, sourceType, useGpu }),
|
||||
body: JSON.stringify({ image: 'wild-dragon-capture:latest', env, capturePort, sourceType, useGpu, gpuUuid }),
|
||||
signal: AbortSignal.timeout(15000),
|
||||
});
|
||||
if (!sidecarRes.ok) {
|
||||
|
|
@ -664,7 +669,8 @@ router.post('/:id/start', requireRecorderEdit, async (req, res, next) => {
|
|||
|
||||
const localEnv = [...env];
|
||||
if (useGpu) {
|
||||
localEnv.push('NVIDIA_VISIBLE_DEVICES=all');
|
||||
// Issue #167 — same per-recorder GPU affinity as the remote sidecar path.
|
||||
localEnv.push(`NVIDIA_VISIBLE_DEVICES=${gpuUuid || 'all'}`);
|
||||
localEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -102,6 +102,11 @@ async function handleSidecarStart(body, res) {
|
|||
// (ProRes, DNxHR, libx264) don't need it and it avoids a hard dep on the
|
||||
// NVIDIA container runtime on nodes that have no GPU.
|
||||
useGpu = false,
|
||||
// Issue #167 — optional per-recorder GPU affinity. When set to a GPU
|
||||
// UUID (e.g. "GPU-xxxx") or a numeric index, the sidecar is pinned to
|
||||
// that single device via NVIDIA_VISIBLE_DEVICES instead of "all". null /
|
||||
// undefined keeps the legacy "all" behavior (expose every GPU).
|
||||
gpuUuid = null,
|
||||
} = body;
|
||||
|
||||
const binds = [`${LIVE_DIR}:/live`];
|
||||
|
|
@ -118,11 +123,14 @@ async function handleSidecarStart(body, res) {
|
|||
// Build the sidecar environment, injecting NVIDIA vars when GPU is requested.
|
||||
const sidecarEnv = [...env, `PORT=${capturePort}`];
|
||||
if (useGpu) {
|
||||
// NVIDIA_VISIBLE_DEVICES=all exposes every GPU on the host.
|
||||
// For a single-GPU node (zampp2 / L4) this is equivalent to pinning GPU 0.
|
||||
// When we later store per-recorder GPU affinity in the DB we can pass a
|
||||
// specific UUID here instead.
|
||||
sidecarEnv.push('NVIDIA_VISIBLE_DEVICES=all');
|
||||
// Issue #167 — per-recorder GPU affinity. A gpuUuid (UUID string or
|
||||
// numeric index) pins the sidecar to exactly that device; otherwise
|
||||
// NVIDIA_VISIBLE_DEVICES=all exposes every GPU on the host (legacy
|
||||
// behavior — for a single-GPU node like zampp2 / L4 this equals GPU 0).
|
||||
const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '')
|
||||
? String(gpuUuid).trim()
|
||||
: 'all';
|
||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||
}
|
||||
|
||||
|
|
@ -397,21 +405,39 @@ function sampleCpu() {
|
|||
}
|
||||
|
||||
|
||||
// -- Live GPU utilization sampling -----------------------------------------
|
||||
// -- Live GPU / NVENC encode telemetry sampling -----------------------------
|
||||
// Spawns a short-lived nvidia container via Docker API on each heartbeat call.
|
||||
// Returns array of { index, util_pct, mem_used_mb, mem_total_mb } per GPU,
|
||||
// or [] if no GPUs / nvidia runtime unavailable.
|
||||
// Returns array of { index, util_pct, enc_util_pct, mem_used_mb, mem_total_mb,
|
||||
// nvenc_sessions } per GPU, or [] if no GPUs / nvidia runtime unavailable.
|
||||
//
|
||||
// Two nvidia-smi queries are run inside one container via `sh -c`, each guarded
|
||||
// with `|| true` so a query unsupported on a given driver/GPU (e.g. older cards
|
||||
// that don't expose utilization.encoder) doesn't abort the whole sample:
|
||||
// 1. --query-gpu → per-GPU gpu/encoder util + memory
|
||||
// 2. --query-compute-apps → pid,used_memory,gpu_uuid for live processes; we
|
||||
// count rows per GPU as an NVENC/compute "session" approximation. Marked
|
||||
// with a SEP line so the two CSV blocks can be told apart in the log.
|
||||
async function sampleGpuUtil() {
|
||||
if (!_gpuCache || _gpuCache.length === 0) return [];
|
||||
|
||||
const QUERY = '--query-gpu=index,utilization.gpu,memory.used,memory.total';
|
||||
const FMT = '--format=csv,noheader,nounits';
|
||||
const GPU_QUERY = '--query-gpu=index,utilization.gpu,utilization.encoder,memory.used,memory.total';
|
||||
const APP_QUERY = '--query-compute-apps=gpu_uuid,pid,used_memory';
|
||||
const FMT = '--format=csv,noheader,nounits';
|
||||
// Map GPU index → uuid so compute-app rows (keyed by uuid) attach to a GPU.
|
||||
const UUID_QUERY = '--query-gpu=index,uuid';
|
||||
const SCRIPT = [
|
||||
`nvidia-smi ${GPU_QUERY} ${FMT} || true`,
|
||||
`echo '---SEP-APPS---'`,
|
||||
`nvidia-smi ${APP_QUERY} ${FMT} 2>/dev/null || true`,
|
||||
`echo '---SEP-UUID---'`,
|
||||
`nvidia-smi ${UUID_QUERY} ${FMT} 2>/dev/null || true`,
|
||||
].join('; ');
|
||||
|
||||
let containerId;
|
||||
try {
|
||||
const createRes = await dockerApi('POST', '/containers/create', {
|
||||
Image: 'ubuntu:22.04',
|
||||
Cmd: ['nvidia-smi', QUERY, FMT],
|
||||
Cmd: ['sh', '-c', SCRIPT],
|
||||
HostConfig: {
|
||||
AutoRemove: false,
|
||||
Runtime: 'nvidia',
|
||||
|
|
@ -445,11 +471,46 @@ async function sampleGpuUtil() {
|
|||
});
|
||||
|
||||
const text = logRes.replace(/[\x00-\x07].{7}/g, '').trim();
|
||||
const lines = text.split('\n').filter(l => /^\d+,/.test(l.trim()));
|
||||
const [gpuBlock = '', appBlock = '', uuidBlock = ''] =
|
||||
text.split(/---SEP-(?:APPS|UUID)---/);
|
||||
|
||||
// uuid → index map (for attributing compute-app rows to a GPU)
|
||||
const uuidToIndex = {};
|
||||
uuidBlock.split('\n').forEach(l => {
|
||||
const m = l.trim().match(/^(\d+)\s*,\s*(GPU-[0-9a-fA-F-]+)/);
|
||||
if (m) uuidToIndex[m[2]] = parseInt(m[1], 10);
|
||||
});
|
||||
|
||||
// NVENC/compute session count per GPU index (best-effort).
|
||||
const sessionsByIndex = {};
|
||||
appBlock.split('\n').forEach(l => {
|
||||
const parts = l.split(',').map(s => s.trim());
|
||||
const uuid = parts[0];
|
||||
if (!uuid || !uuid.startsWith('GPU-')) return;
|
||||
const idx = uuidToIndex[uuid];
|
||||
if (idx == null) return;
|
||||
sessionsByIndex[idx] = (sessionsByIndex[idx] || 0) + 1;
|
||||
});
|
||||
|
||||
const lines = gpuBlock.split('\n').filter(l => /^\s*\d+\s*,/.test(l));
|
||||
|
||||
return lines.map(line => {
|
||||
const [idx, util, memUsed, memTotal] = line.split(',').map(s => parseInt(s.trim(), 10));
|
||||
return { index: idx, util_pct: util, mem_used_mb: memUsed, mem_total_mb: memTotal };
|
||||
// utilization.encoder may report "[N/A]" on cards/drivers that don't
|
||||
// expose it — parseInt yields NaN there, which we coerce to null.
|
||||
const cols = line.split(',').map(s => s.trim());
|
||||
const idx = parseInt(cols[0], 10);
|
||||
const util = parseInt(cols[1], 10);
|
||||
const encUtil = parseInt(cols[2], 10);
|
||||
const memUsed = parseInt(cols[3], 10);
|
||||
const memTotal = parseInt(cols[4], 10);
|
||||
return {
|
||||
index: idx,
|
||||
util_pct: Number.isNaN(util) ? null : util,
|
||||
enc_util_pct: Number.isNaN(encUtil) ? null : encUtil,
|
||||
mem_used_mb: Number.isNaN(memUsed) ? null : memUsed,
|
||||
mem_total_mb: Number.isNaN(memTotal) ? null : memTotal,
|
||||
nvenc_sessions: sessionsByIndex[idx] || 0,
|
||||
};
|
||||
});
|
||||
} catch (err) {
|
||||
console.warn('[gpu-util] sampling failed:', err.message);
|
||||
|
|
@ -630,6 +691,25 @@ async function heartbeat() {
|
|||
const ip_address = getIp();
|
||||
const capabilities = detectHardware();
|
||||
|
||||
// Issue #166 — fold live NVENC/GPU encode telemetry into capabilities.gpus so
|
||||
// the Cluster screen (which reads cluster_nodes.capabilities.gpus) can render
|
||||
// per-GPU util / encoder util / NVENC sessions alongside the static name+VRAM.
|
||||
// gpu_util is also sent verbatim below for any consumer reading metrics.gpus.
|
||||
if (Array.isArray(capabilities.gpus) && gpu_util.length) {
|
||||
capabilities.gpus = capabilities.gpus.map(g => {
|
||||
const live = gpu_util.find(u => u.index === g.index);
|
||||
if (!live) return g;
|
||||
return {
|
||||
...g,
|
||||
util_pct: live.util_pct,
|
||||
enc_util_pct: live.enc_util_pct,
|
||||
mem_used_mb: live.mem_used_mb,
|
||||
mem_total_mb: live.mem_total_mb ?? g.memory_mb ?? null,
|
||||
nvenc_sessions: live.nvenc_sessions,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
const payload = {
|
||||
hostname: os.hostname(),
|
||||
ip_address,
|
||||
|
|
|
|||
|
|
@ -11,6 +11,12 @@ function _normalizeNode(n, x, y) {
|
|||
index: g.index ?? 0,
|
||||
device: g.device || null,
|
||||
bound: !!(g.name && g.memory_mb), // name+memory = nvidia-smi confirmed driver bound
|
||||
// Issue #166 — live NVENC/GPU encode telemetry folded into capabilities.gpus
|
||||
// by the node-agent heartbeat (null until a heartbeat carries it / a GPU node).
|
||||
utilPct: g.util_pct != null ? g.util_pct : null,
|
||||
encUtilPct: g.enc_util_pct != null ? g.enc_util_pct : null,
|
||||
memUsedMb: g.mem_used_mb != null ? g.mem_used_mb : null,
|
||||
nvencSessions: g.nvenc_sessions != null ? g.nvenc_sessions : null,
|
||||
}));
|
||||
|
||||
// Blackmagic DeckLink: capabilities.blackmagic + capabilities.blackmagic_model
|
||||
|
|
@ -1706,6 +1712,23 @@ function Cluster() {
|
|||
</div>
|
||||
)}
|
||||
{g.device && <div style={{ fontSize: 10.5, color: "var(--text-4)", fontFamily: "var(--font-mono)" }}>{g.device}</div>}
|
||||
{/* Issue #166 — live NVENC/GPU encode telemetry (0 until a live encode runs) */}
|
||||
{(g.utilPct != null || g.encUtilPct != null || g.nvencSessions != null) && (
|
||||
<div style={{ display: "flex", flexWrap: "wrap", gap: "4px 10px", marginTop: 4, fontSize: 10.5, fontFamily: "var(--font-mono)" }}>
|
||||
{g.utilPct != null && (
|
||||
<span style={{ color: "var(--text-3)" }}>GPU <strong style={{ color: "var(--text-2)" }}>{g.utilPct}%</strong></span>
|
||||
)}
|
||||
{g.encUtilPct != null && (
|
||||
<span style={{ color: "var(--text-3)" }}>ENC <strong style={{ color: g.encUtilPct > 0 ? "var(--success)" : "var(--text-2)" }}>{g.encUtilPct}%</strong></span>
|
||||
)}
|
||||
{g.memUsedMb != null && g.memMb && (
|
||||
<span style={{ color: "var(--text-3)" }}>VRAM <strong style={{ color: "var(--text-2)" }}>{g.memUsedMb}/{g.memMb} MB</strong></span>
|
||||
)}
|
||||
{g.nvencSessions != null && (
|
||||
<span style={{ color: "var(--text-3)" }}>NVENC <strong style={{ color: g.nvencSessions > 0 ? "var(--success)" : "var(--text-2)" }}>{g.nvencSessions}</strong></span>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
<span style={{
|
||||
fontSize: 10, fontWeight: 600, padding: "2px 6px", borderRadius: 3,
|
||||
|
|
|
|||
Loading…
Reference in a new issue