fix(stability): spread capture encodes across all GPUs + GOP parse + filmstrip retry
VIDEO FREEZE UNDER BURN (transient stall, self-recovers): all 8 capture
sidecars ran NVIDIA_VISIBLE_DEVICES=all with no -gpu selector, so ffmpeg
nvenc put every session (8 HEVC masters + 8 HLS = 16) on physical GPU 0
while the other two L4s sat idle. GPU 0 NVENC hit 86%, encode fell below
realtime, the framecache ring lapped → video froze → caught up → recovered.
Bridge verified smooth at 60fps throughout. FIX: node-agent now round-robins
each sidecar to a GPU by capture port ((port - 7438) % detected-GPU-count) via
NVIDIA_VISIBLE_DEVICES, honoring an explicit gpuUuid when set. Auto-detects
GPU count from nvidia-smi (override CAPTURE_GPU_COUNT). ~3 sidecars (~5-6 NVENC sessions) per GPU now.
GOP PARSE: Number.parseFloat('60000/1001') returns 60000, making GOP 120000
(near open-GOP) instead of ~120. Added parseFps() to handle rational rates;
fixed hevcNvencArgs + buildHlsVideoArgs.
FILMSTRIP: RustFS object store intermittently returns NoSuchKey on GET for
keys that List/Head confirm exist, blanking the strip. Generation/queue/DB
all verified healthy (13/15 assets HAVE filmstrips). FIX: API now serves the
filmstrip JSON through itself with retry-on-NoSuchKey (succeeds within a
couple attempts) instead of handing the browser a signed URL — also closes
the S3 CORS gap. Frontend updated to consume the direct JSON.
This commit is contained in:
parent
32d829f796
commit
4be12c6f9a
4 changed files with 100 additions and 21 deletions
|
|
@ -166,13 +166,28 @@ const VIDEO_CODECS = {
|
|||
// `force_key_frames expr:1` (all-intra) is the ~4× compute path that was
|
||||
// crippling realtime when applied to every recording; gating it on `growing`
|
||||
// is the fix for the dropped-frame A/V drift.
|
||||
// Parse a framerate that may be a rational like "60000/1001" (=59.94) OR a plain
|
||||
// "59.94"/"60". Number.parseFloat("60000/1001") returns 60000 (stops at '/'),
|
||||
// which made the GOP 120000 instead of ~120 — effectively open-GOP. Handle the
|
||||
// rational form explicitly.
|
||||
/**
 * Parse a framerate into frames per second.
 *
 * Accepts a rational such as "60000/1001" (= 59.94) or a plain value such as
 * "59.94", "60" or 60. Number.parseFloat stops at '/', so it would turn
 * "60000/1001" into 60000; rationals are therefore split and divided here.
 *
 * Any input containing '/' is treated strictly as a rational. If the quotient
 * is not a finite positive number (zero/garbage denominator, zero or negative
 * numerator, "0/0"), the fallback is returned instead of re-parsing the bare
 * numerator, which would bring back the huge-GOP value.
 *
 * @param {string|number|null|undefined} framerate - Rate to parse.
 * @param {number} [fallback=60] - Value returned when the rate is unusable.
 * @returns {number} A finite, positive frames-per-second value, or `fallback`.
 */
function parseFps(framerate, fallback = 60) {
  if (framerate == null) return fallback;

  const text = String(framerate).trim();

  if (text.includes('/')) {
    const [numerator, denominator] = text.split('/').map(Number);
    const ratio = numerator / denominator;
    // A zero denominator yields Infinity/NaN and a bad token yields NaN;
    // both fail the finite check, so no separate d !== 0 test is needed.
    return Number.isFinite(ratio) && ratio > 0 ? ratio : fallback;
  }

  const plain = Number.parseFloat(text);
  return Number.isFinite(plain) && plain > 0 ? plain : fallback;
}
|
||||
|
||||
/**
 * Build the ffmpeg video-encoder arguments for an hevc_nvenc master.
 *
 * @param {string|number|null|undefined} framerate - Capture framerate, plain
 *   ("60") or rational ("60000/1001"); parsed by parseFps with a default of 60.
 * @param {boolean} growing - When truthy, returns the `force_key_frames expr:1`
 *   (all-intra) argument set with B-frames disabled; `framerate` is not
 *   consulted on that path.
 * @returns {string[]} Argument list to splice into the ffmpeg command line.
 */
function hevcNvencArgs(framerate, growing) {
  const base = ['-c:v', 'hevc_nvenc', '-preset', 'p4', '-rc', 'vbr', '-profile:v', 'main10'];

  if (growing) {
    return [...base, '-bf', '0', '-forced-idr', '1', '-g', '600', '-force_key_frames', 'expr:1'];
  }

  // Normal long-GOP: ~2s keyframe interval, 2 B-frames. Realtime-friendly.
  // The rate goes through parseFps (not Number.parseFloat) so a rational
  // like "60000/1001" gives a GOP of ~120 rather than 120000.
  const fps = parseFps(framerate, 60);
  const gop = Math.max(2, Math.round(fps * 2));

  return [...base, '-bf', '2', '-g', String(gop)];
}
|
||||
|
|
@ -210,7 +225,7 @@ function gpuAvailableForPreview(masterCodec) {
|
|||
function buildHlsVideoArgs(masterCodec, framerate) {
|
||||
// Frames-per-segment for keyframe alignment. The SDI preview runs at the
|
||||
// capture framerate; default to 30 (matches the test-card rate) when unknown.
|
||||
const fps = Number.parseFloat(framerate) || 30;
|
||||
const fps = parseFps(framerate, 30);
|
||||
const segTime = 2; // matches -hls_time below
|
||||
const gop = Math.max(1, Math.round(fps * segTime));
|
||||
if (gpuAvailableForPreview(masterCodec)) {
|
||||
|
|
|
|||
|
|
@ -685,8 +685,33 @@ router.get('/:id/filmstrip', async (req, res, next) => {
|
|||
if (r.rows.length === 0) return res.status(404).json({ error: 'Asset not found' });
|
||||
const { filmstrip_s3_key } = r.rows[0];
|
||||
if (!filmstrip_s3_key) return res.json({ url: null, ready: false });
|
||||
const url = await getSignedUrlForObject(filmstrip_s3_key);
|
||||
res.json({ url, ready: true });
|
||||
|
||||
// Serve the filmstrip JSON THROUGH the API (with retry) instead of handing
|
||||
// the browser a signed URL. The RustFS object store intermittently returns
|
||||
// NoSuchKey on GET for keys that List/Head confirm exist — a single browser
|
||||
// fetch then blanks the strip. Retrying server-side (where the GET succeeds
|
||||
// within a couple attempts) makes filmstrips reliable, and avoids the S3
|
||||
// CORS gap on the signed-URL path.
|
||||
let lastErr = null;
|
||||
for (let attempt = 0; attempt < 4; attempt++) {
|
||||
try {
|
||||
const obj = await s3Client.send(new GetObjectCommand({ Bucket: getS3Bucket(), Key: filmstrip_s3_key }));
|
||||
const body = await obj.Body.transformToString();
|
||||
res.set('Cache-Control', 'public, max-age=86400');
|
||||
res.set('Content-Type', 'application/json');
|
||||
return res.send(body);
|
||||
} catch (e) {
|
||||
lastErr = e;
|
||||
const code = e?.name || e?.Code || '';
|
||||
// Only retry the known transient store inconsistency; fail fast otherwise.
|
||||
if (!/NoSuchKey|NotFound|404/i.test(code)) break;
|
||||
await new Promise(r => setTimeout(r, 150 * (attempt + 1)));
|
||||
}
|
||||
}
|
||||
// All retries missed — report not-ready so the UI shows its graceful
|
||||
// fallback rather than a hard error.
|
||||
console.warn(`[assets] filmstrip GET failed for ${id} (${filmstrip_s3_key}): ${lastErr?.name || lastErr?.message}`);
|
||||
return res.json({ url: null, ready: false, transient: true });
|
||||
} catch (err) { next(err); }
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import http from 'http';
|
|||
import os from 'os';
|
||||
import fs from 'fs';
|
||||
import crypto from 'crypto';
|
||||
import { spawn } from 'child_process';
|
||||
import { spawn, execFileSync } from 'child_process';
|
||||
|
||||
const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, '');
|
||||
const NODE_TOKEN = process.env.NODE_TOKEN || '';
|
||||
|
|
@ -26,6 +26,41 @@ const LIVE_DIR = process.env.LIVE_DIR || '/mnt/NVME/MAM/wild-dragon-live';
|
|||
const REPO_DIR = process.env.REPO_DIR || '/opt/wild-dragon';
|
||||
const VERSION = '1.4.0';
|
||||
|
||||
// Number of GPUs to spread capture encodes across. Each capture sidecar runs
|
||||
// ~2 NVENC sessions (master HEVC + HLS preview); with NVIDIA_VISIBLE_DEVICES=all
|
||||
// and no -gpu selector, ffmpeg's nvenc puts EVERY session on physical GPU 0, so
|
||||
// 8 ports = 16 sessions hammering one card → it falls below realtime → the
|
||||
// framecache ring laps → video freezes/stutters then recovers. Pinning each
|
||||
// sidecar to GPU ((port - SIDECAR_BASE_PORT) % CAPTURE_GPU_COUNT) spreads the load across all cards.
|
||||
// Auto-detected from nvidia-smi at boot; override with CAPTURE_GPU_COUNT.
|
||||
// Cached GPU count. Seeded from the CAPTURE_GPU_COUNT environment variable;
// 0 (unset or unparsable) means "not known yet" and triggers detection on the
// first detectGpuCount() call.
let CAPTURE_GPU_COUNT = Number.parseInt(process.env.CAPTURE_GPU_COUNT || '0', 10) || 0;

/**
 * Return the number of GPUs to spread capture encodes across.
 *
 * A positive cached/overridden CAPTURE_GPU_COUNT is returned as-is. Otherwise
 * `nvidia-smi --query-gpu=index --format=csv,noheader` is run synchronously
 * (5 s timeout) and the result is cached in CAPTURE_GPU_COUNT, so the probe
 * and the log line happen at most once.
 *
 * @returns {number} GPU count, always >= 1 (1 when nvidia-smi is missing,
 *   fails, times out, or lists no devices).
 */
function detectGpuCount() {
  if (CAPTURE_GPU_COUNT > 0) return CAPTURE_GPU_COUNT;

  try {
    const output = execFileSync('nvidia-smi', ['--query-gpu=index', '--format=csv,noheader'], { timeout: 5000 })
      .toString();
    // Count only lines that are a bare device index. Counting every non-empty
    // line would let stray warning text inflate the count and send a sidecar
    // to a GPU index that does not exist.
    const indexLines = output.split('\n').filter((line) => /^\d+$/.test(line.trim()));
    CAPTURE_GPU_COUNT = indexLines.length > 0 ? indexLines.length : 1;
  } catch (_) {
    CAPTURE_GPU_COUNT = 1; // no nvidia-smi / no GPU — single-device fallback
  }

  console.log(`[gpu] capture encodes will spread across ${CAPTURE_GPU_COUNT} GPU(s)`);
  return CAPTURE_GPU_COUNT;
}
|
||||
|
||||
// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit
|
||||
// per-recorder gpuUuid always wins; otherwise round-robin by capture port so
|
||||
// consecutive Deltacast ports land on different physical GPUs. With a single
|
||||
// device visible, the container's nvenc device 0 == the chosen physical GPU.
|
||||
// First capture sidecar port; ports are mapped to GPUs by their offset from it.
const SIDECAR_BASE_PORT = 7438;

/**
 * Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar.
 *
 * - A non-blank `gpuUuid` (UUID string or numeric index) always wins and is
 *   returned trimmed.
 * - With one (or zero) detected GPUs the value is 'all'.
 * - Otherwise the sidecar is assigned round-robin by
 *   (capturePort - SIDECAR_BASE_PORT) mod GPU count, normalised to be
 *   non-negative for ports below the base.
 *
 * @param {string|number|null|undefined} gpuUuid - Explicit device selector.
 * @param {number|string|null|undefined} capturePort - Sidecar port. Numeric
 *   strings are accepted; anything that is not an integer maps to index 0.
 * @returns {string} Value for NVIDIA_VISIBLE_DEVICES.
 */
function pickVisibleDevices(gpuUuid, capturePort) {
  const explicit = gpuUuid == null ? '' : String(gpuUuid).trim();
  if (explicit !== '') return explicit;

  const count = detectGpuCount();
  if (count <= 1) return 'all';

  // A port that arrives as a numeric string (e.g. "7439") must still be spread
  // across GPUs; without coercion every such sidecar would land on index 0.
  const port = typeof capturePort === 'string' && capturePort.trim() !== ''
    ? Number(capturePort)
    : capturePort;
  // Require an integer so a fractional value cannot produce a device index
  // like "0.5".
  if (!Number.isInteger(port)) return '0';

  const offset = (port - SIDECAR_BASE_PORT) % count;
  return String((offset + count) % count);
}
|
||||
|
||||
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
|
||||
// the host installer — the value is only ever used to pick a script arg, never
|
||||
// interpolated into a shell string.
|
||||
|
|
@ -475,13 +510,12 @@ async function handleSidecarStart(body, res) {
|
|||
if (useGpu) {
|
||||
// Issue #167 — per-recorder GPU affinity. A gpuUuid (UUID string or
|
||||
// numeric index) pins the sidecar to exactly that device; otherwise
|
||||
// NVIDIA_VISIBLE_DEVICES=all exposes every GPU on the host (legacy
|
||||
// behavior — for a single-GPU node like zampp2 / L4 this equals GPU 0).
|
||||
const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '')
|
||||
? String(gpuUuid).trim()
|
||||
: 'all';
|
||||
// Round-robin the encode across all GPUs by capture port (or honor an
|
||||
// explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
|
||||
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
}
|
||||
|
||||
const hostConfig = {
|
||||
|
|
@ -790,10 +824,11 @@ async function handleSidecarStandby(body, res) {
|
|||
|
||||
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
|
||||
if (useGpu) {
|
||||
const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '')
|
||||
? String(gpuUuid).trim() : 'all';
|
||||
// Same round-robin GPU spread as the start path (see pickVisibleDevices).
|
||||
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||
}
|
||||
sidecarEnv.push(`FC_URL=${FC_URL}`);
|
||||
|
||||
|
|
|
|||
|
|
@ -102,18 +102,22 @@ function AssetDetail({ asset, onClose }) {
|
|||
setFilmFrames([]);
|
||||
setFilmstripLoading(true);
|
||||
|
||||
// The API now serves the filmstrip frames JSON directly (with server-side
|
||||
// retry around the flaky object store) instead of returning a signed URL.
|
||||
// Response is either the frames array, or { ready:false } when unavailable.
|
||||
window.ZAMPP_API.fetch('/assets/' + assetId + '/filmstrip')
|
||||
.then(function(r) {
|
||||
if (cancelled) return;
|
||||
if (!r || !r.url) { setFilmstripLoading(false); return; }
|
||||
// Fetch the JSON array of base64 frames from the signed S3 URL
|
||||
return fetch(r.url)
|
||||
.then(function(res) { return res.json(); })
|
||||
.then(function(frames) {
|
||||
if (!cancelled && Array.isArray(frames) && frames.length) {
|
||||
setFilmFrames(frames);
|
||||
}
|
||||
});
|
||||
// New shape: bare array of base64 frames.
|
||||
if (Array.isArray(r) && r.length) { setFilmFrames(r); return; }
|
||||
// Legacy/empty shape: { url } (older API) or { ready:false }.
|
||||
if (r && r.url) {
|
||||
return fetch(r.url)
|
||||
.then(function(res) { return res.json(); })
|
||||
.then(function(frames) {
|
||||
if (!cancelled && Array.isArray(frames) && frames.length) setFilmFrames(frames);
|
||||
});
|
||||
}
|
||||
})
|
||||
.catch(function() {})
|
||||
.finally(function() { if (!cancelled) setFilmstripLoading(false); });
|
||||
|
|
|
|||
Loading…
Reference in a new issue