fix(stability): spread capture encodes across all GPUs + GOP parse + filmstrip retry

VIDEO FREEZE UNDER BURN (transient stall, self-recovers): all 8 capture
sidecars ran NVIDIA_VISIBLE_DEVICES=all with no -gpu selector, so ffmpeg
nvenc put every session (8 HEVC masters + 8 HLS = 16) on physical GPU 0
while the other two L4s sat idle. GPU 0 NVENC hit 86%, encode fell below
realtime, the framecache ring lapped → video froze → caught up → recovered.
Bridge verified smooth at 60fps throughout. FIX: node-agent now round-robins
each sidecar to a GPU by capture port ((port - 7438) % detected-GPU-count) via
NVIDIA_VISIBLE_DEVICES, honoring an explicit gpuUuid when set. Auto-detects
GPU count from nvidia-smi (override CAPTURE_GPU_COUNT). ~3 encoders/GPU now.

GOP PARSE: Number.parseFloat('60000/1001') returns 60000, making GOP 120000
(near open-GOP) instead of ~120. Added parseFps() to handle rational rates;
fixed hevcNvencArgs + buildHlsVideoArgs.

FILMSTRIP: RustFS object store intermittently returns NoSuchKey on GET for
keys that List/Head confirm exist, blanking the strip. Generation/queue/DB
all verified healthy (13/15 assets HAVE filmstrips). FIX: API now serves the
filmstrip JSON through itself with retry-on-NoSuchKey (succeeds within a
couple attempts) instead of handing the browser a signed URL — also closes
the S3 CORS gap. Frontend updated to consume the direct JSON.
This commit is contained in:
Zac Gaetano 2026-06-04 15:56:44 +00:00
parent 32d829f796
commit 4be12c6f9a
4 changed files with 100 additions and 21 deletions

View file

@ -166,13 +166,28 @@ const VIDEO_CODECS = {
// `force_key_frames expr:1` (all-intra) is the ~4× compute path that was
// crippling realtime when applied to every recording; gating it on `growing`
// is the fix for the dropped-frame A/V drift.
/**
 * Parse a framerate given as a rational ("60000/1001" = 59.94), a plain
 * decimal string ("59.94", "60"), or a number.
 *
 * Number.parseFloat("60000/1001") stops at the '/' and returns 60000, which
 * made the GOP 120000 instead of ~120 — effectively open-GOP — so the rational
 * form is handled explicitly. Any value containing '/' is treated strictly as
 * `numerator/denominator`: if it is malformed (missing part, extra part, zero
 * denominator, non-numeric) or does not yield a positive finite rate, the
 * fallback is returned rather than letting parseFloat read only the numerator.
 *
 * @param {string|number|null|undefined} framerate - Rate to parse.
 * @param {number} [fallback=60] - Returned when the rate is missing or invalid.
 * @returns {number} A positive, finite frames-per-second value, or `fallback`.
 */
function parseFps(framerate, fallback = 60) {
  if (framerate == null) return fallback;
  const text = String(framerate).trim();

  if (text.includes('/')) {
    const parts = text.split('/').map((part) => part.trim());
    if (parts.length !== 2) return fallback;
    const [numerator, denominator] = parts;
    // Number('') is 0, so an empty side must be rejected before converting.
    if (numerator === '' || denominator === '') return fallback;
    // A zero denominator gives Infinity/NaN, which the finite check rejects.
    const ratio = Number(numerator) / Number(denominator);
    return Number.isFinite(ratio) && ratio > 0 ? ratio : fallback;
  }

  const value = Number.parseFloat(text);
  return Number.isFinite(value) && value > 0 ? value : fallback;
}
/**
 * Build the hevc_nvenc video arguments for a master recording.
 *
 * @param {string|number|null|undefined} framerate - Capture framerate; may be
 *   a rational such as "60000/1001". Only used for the long-GOP path.
 * @param {boolean} growing - When truthy, emit the all-intra argument set
 *   (`-force_key_frames expr:1`, no B-frames); otherwise emit the long-GOP set.
 * @returns {string[]} ffmpeg arguments, starting with `-c:v hevc_nvenc`.
 */
function hevcNvencArgs(framerate, growing) {
  const base = ['-c:v', 'hevc_nvenc', '-preset', 'p4', '-rc', 'vbr', '-profile:v', 'main10'];
  if (growing) {
    // All-intra is the expensive path, so it is only used when requested.
    return [...base, '-bf', '0', '-forced-idr', '1', '-g', '600', '-force_key_frames', 'expr:1'];
  }
  // Normal long-GOP: ~2s keyframe interval, 2 B-frames. Realtime-friendly.
  // parseFps (not Number.parseFloat) so "60000/1001" yields ~59.94, not 60000.
  const fps = parseFps(framerate, 60);
  const gop = Math.max(2, Math.round(fps * 2));
  return [...base, '-bf', '2', '-g', String(gop)];
}
@ -210,7 +225,7 @@ function gpuAvailableForPreview(masterCodec) {
function buildHlsVideoArgs(masterCodec, framerate) {
// Frames-per-segment for keyframe alignment. The SDI preview runs at the
// capture framerate; default to 30 (matches the test-card rate) when unknown.
const fps = Number.parseFloat(framerate) || 30;
const fps = parseFps(framerate, 30);
const segTime = 2; // matches -hls_time below
const gop = Math.max(1, Math.round(fps * segTime));
if (gpuAvailableForPreview(masterCodec)) {

View file

@ -685,8 +685,33 @@ router.get('/:id/filmstrip', async (req, res, next) => {
if (r.rows.length === 0) return res.status(404).json({ error: 'Asset not found' });
const { filmstrip_s3_key } = r.rows[0];
if (!filmstrip_s3_key) return res.json({ url: null, ready: false });
const url = await getSignedUrlForObject(filmstrip_s3_key);
res.json({ url, ready: true });
// Serve the filmstrip JSON THROUGH the API (with retry) instead of handing
// the browser a signed URL. The RustFS object store intermittently returns
// NoSuchKey on GET for keys that List/Head confirm exist — a single browser
// fetch then blanks the strip. Retrying server-side (where the GET succeeds
// within a couple attempts) makes filmstrips reliable, and avoids the S3
// CORS gap on the signed-URL path.
let lastErr = null;
for (let attempt = 0; attempt < 4; attempt++) {
try {
const obj = await s3Client.send(new GetObjectCommand({ Bucket: getS3Bucket(), Key: filmstrip_s3_key }));
const body = await obj.Body.transformToString();
res.set('Cache-Control', 'public, max-age=86400');
res.set('Content-Type', 'application/json');
return res.send(body);
} catch (e) {
lastErr = e;
const code = e?.name || e?.Code || '';
// Only retry the known transient store inconsistency; fail fast otherwise.
if (!/NoSuchKey|NotFound|404/i.test(code)) break;
await new Promise(r => setTimeout(r, 150 * (attempt + 1)));
}
}
// All retries missed — report not-ready so the UI shows its graceful
// fallback rather than a hard error.
console.warn(`[assets] filmstrip GET failed for ${id} (${filmstrip_s3_key}): ${lastErr?.name || lastErr?.message}`);
return res.json({ url: null, ready: false, transient: true });
} catch (err) { next(err); }
});

View file

@ -2,7 +2,7 @@ import http from 'http';
import os from 'os';
import fs from 'fs';
import crypto from 'crypto';
import { spawn } from 'child_process';
import { spawn, execFileSync } from 'child_process';
const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, '');
const NODE_TOKEN = process.env.NODE_TOKEN || '';
@ -26,6 +26,41 @@ const LIVE_DIR = process.env.LIVE_DIR || '/mnt/NVME/MAM/wild-dragon-live';
const REPO_DIR = process.env.REPO_DIR || '/opt/wild-dragon';
const VERSION = '1.4.0';
// Number of GPUs to spread capture encodes across. Each capture sidecar runs
// ~2 NVENC sessions (master HEVC + HLS preview); with NVIDIA_VISIBLE_DEVICES=all
// and no -gpu selector, ffmpeg's nvenc puts EVERY session on physical GPU 0, so
// 8 ports = 16 sessions hammering one card → it falls below realtime → the
// framecache ring laps → video freezes/stutters then recovers. Pinning each
// sidecar to GPU ((port - SIDECAR_BASE_PORT) % CAPTURE_GPU_COUNT) spreads the
// load across all cards. Auto-detected from nvidia-smi the first time a sidecar
// needs a GPU assignment (the result is cached); override with CAPTURE_GPU_COUNT.
let CAPTURE_GPU_COUNT = parseInt(process.env.CAPTURE_GPU_COUNT || '0', 10) || 0;

/**
 * Return the number of GPUs to spread capture encodes across.
 *
 * A positive CAPTURE_GPU_COUNT (from the environment, or cached by an earlier
 * call) is returned as-is. Otherwise `nvidia-smi` is run synchronously with a
 * 5 s timeout, one output line is counted per GPU, and the result is stored in
 * CAPTURE_GPU_COUNT and logged. If nvidia-smi fails or lists nothing, the count
 * is 1.
 *
 * @returns {number} GPU count, always >= 1.
 */
function detectGpuCount() {
  if (CAPTURE_GPU_COUNT > 0) {
    return CAPTURE_GPU_COUNT;
  }

  let detected = 0;
  try {
    const raw = execFileSync(
      'nvidia-smi',
      ['--query-gpu=index', '--format=csv,noheader'],
      { timeout: 5000 },
    );
    const listing = raw.toString().trim();
    if (listing) {
      detected = listing.split('\n').filter(Boolean).length;
    }
  } catch (_) {
    // no nvidia-smi / no GPU — single-device fallback below
    detected = 0;
  }

  CAPTURE_GPU_COUNT = detected > 0 ? detected : 1;
  console.log(`[gpu] capture encodes will spread across ${CAPTURE_GPU_COUNT} GPU(s)`);
  return CAPTURE_GPU_COUNT;
}
// First capture port; sidecar ports are offset from this before the modulo so
// the base port maps to GPU index 0.
const SIDECAR_BASE_PORT = 7438;

/**
 * Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar.
 *
 * An explicit, non-blank gpuUuid always wins and is returned trimmed.
 * Otherwise the sidecar is assigned round-robin by capture port so consecutive
 * ports land on different GPU indices. With one (or no) detected GPU the value
 * is 'all'. A non-finite capturePort maps to index 0.
 *
 * @param {string|number|null|undefined} gpuUuid - Per-recorder GPU pin.
 * @param {number} capturePort - Sidecar capture port.
 * @returns {string} The trimmed gpuUuid, 'all', or a GPU index as a string.
 */
function pickVisibleDevices(gpuUuid, capturePort) {
  const pinned = gpuUuid == null ? '' : String(gpuUuid).trim();
  if (pinned !== '') {
    return pinned;
  }

  const gpuCount = detectGpuCount();
  if (gpuCount <= 1) {
    return 'all';
  }

  if (!Number.isFinite(capturePort)) {
    return String(0);
  }
  // Double modulo keeps the index non-negative for ports below the base port.
  const offset = (capturePort - SIDECAR_BASE_PORT) % gpuCount;
  const slot = (offset + gpuCount) % gpuCount;
  return String(slot);
}
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
// the host installer — the value is only ever used to pick a script arg, never
// interpolated into a shell string.
@ -475,13 +510,12 @@ async function handleSidecarStart(body, res) {
if (useGpu) {
// Issue #167 — per-recorder GPU affinity. A gpuUuid (UUID string or
// numeric index) pins the sidecar to exactly that device; otherwise
// NVIDIA_VISIBLE_DEVICES=all exposes every GPU on the host (legacy
// behavior — for a single-GPU node like zampp2 / L4 this equals GPU 0).
const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '')
? String(gpuUuid).trim()
: 'all';
// Round-robin the encode across all GPUs by capture port (or honor an
// explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
}
const hostConfig = {
@ -790,10 +824,11 @@ async function handleSidecarStandby(body, res) {
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
if (useGpu) {
const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '')
? String(gpuUuid).trim() : 'all';
// Same round-robin GPU spread as the start path (see pickVisibleDevices).
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
}
sidecarEnv.push(`FC_URL=${FC_URL}`);

View file

@ -102,18 +102,22 @@ function AssetDetail({ asset, onClose }) {
setFilmFrames([]);
setFilmstripLoading(true);
// The API now serves the filmstrip frames JSON directly (with server-side
// retry around the flaky object store) instead of returning a signed URL.
// Response is either the frames array, or { ready:false } when unavailable.
window.ZAMPP_API.fetch('/assets/' + assetId + '/filmstrip')
.then(function(r) {
if (cancelled) return;
if (!r || !r.url) { setFilmstripLoading(false); return; }
// Fetch the JSON array of base64 frames from the signed S3 URL
return fetch(r.url)
.then(function(res) { return res.json(); })
.then(function(frames) {
if (!cancelled && Array.isArray(frames) && frames.length) {
setFilmFrames(frames);
}
});
// New shape: bare array of base64 frames.
if (Array.isArray(r) && r.length) { setFilmFrames(r); return; }
// Legacy/empty shape: { url } (older API) or { ready:false }.
if (r && r.url) {
return fetch(r.url)
.then(function(res) { return res.json(); })
.then(function(frames) {
if (!cancelled && Array.isArray(frames) && frames.length) setFilmFrames(frames);
});
}
})
.catch(function() {})
.finally(function() { if (!cancelled) setFilmstripLoading(false); });