fix(stability): spread capture encodes across all GPUs + GOP parse + filmstrip retry
VIDEO FREEZE UNDER BURN (transient stall, self-recovers): all 8 capture
sidecars ran NVIDIA_VISIBLE_DEVICES=all with no -gpu selector, so ffmpeg
nvenc put every session (8 HEVC masters + 8 HLS = 16) on physical GPU 0
while the other two L4s sat idle. GPU 0 NVENC hit 86%, encode fell below
realtime, the framecache ring lapped → video froze → caught up → recovered.
Bridge verified smooth at 60fps throughout. FIX: node-agent now round-robins
each sidecar to a GPU by capture port ((port - base port 7438) % detected-GPU-count) via
NVIDIA_VISIBLE_DEVICES, honoring an explicit gpuUuid when set. Auto-detects
GPU count from nvidia-smi (override with CAPTURE_GPU_COUNT). ~3 encoders/GPU now.
GOP PARSE: Number.parseFloat('60000/1001') returns 60000, making GOP 120000
(near open-GOP) instead of ~120. Added parseFps() to handle rational rates;
fixed hevcNvencArgs + buildHlsVideoArgs.
FILMSTRIP: RustFS object store intermittently returns NoSuchKey on GET for
keys that List/Head confirm exist, blanking the strip. Generation/queue/DB
all verified healthy (13/15 assets HAVE filmstrips). FIX: API now serves the
filmstrip JSON through itself with retry-on-NoSuchKey (succeeds within a
couple attempts) instead of handing the browser a signed URL — also closes
the S3 CORS gap. Frontend updated to consume the direct JSON.
This commit is contained in:
parent
32d829f796
commit
4be12c6f9a
4 changed files with 100 additions and 21 deletions
|
|
@ -166,13 +166,28 @@ const VIDEO_CODECS = {
|
||||||
// `force_key_frames expr:1` (all-intra) is the ~4× compute path that was
|
// `force_key_frames expr:1` (all-intra) is the ~4× compute path that was
|
||||||
// crippling realtime when applied to every recording; gating it on `growing`
|
// crippling realtime when applied to every recording; gating it on `growing`
|
||||||
// is the fix for the dropped-frame A/V drift.
|
// is the fix for the dropped-frame A/V drift.
|
||||||
|
// Parse a framerate that may be a rational like "60000/1001" (=59.94) OR a plain
|
||||||
|
// "59.94"/"60". Number.parseFloat("60000/1001") returns 60000 (stops at '/'),
|
||||||
|
// which made the GOP 120000 instead of ~120 — effectively open-GOP. Handle the
|
||||||
|
// rational form explicitly.
|
||||||
|
/**
 * Parse a framerate given either as a rational string ("60000/1001" = 59.94)
 * or as a plain number / numeric string ("59.94", "60", 60).
 *
 * Number.parseFloat stops at '/', so it must never see a rational string:
 * "60000/1001" would parse as 60000 and blow the GOP up to ~120000.
 *
 * @param {string|number|null|undefined} framerate - Rate to parse.
 * @param {number} [fallback=60] - Returned when the rate is missing,
 *   malformed, non-finite, or not strictly positive.
 * @returns {number} Frames per second (> 0), or `fallback`.
 */
function parseFps(framerate, fallback = 60) {
  if (framerate == null) return fallback;
  const text = String(framerate).trim();

  if (text.includes('/')) {
    // A rational must be exactly "num/den". Anything else is malformed and
    // must NOT fall through to parseFloat, which would return the bare
    // numerator (e.g. "60000/0" -> 60000).
    const parts = text.split('/');
    if (parts.length !== 2) return fallback;
    const ratio = Number(parts[0]) / Number(parts[1]);
    // Rejects NaN ("0/0", "abc/1"), Infinity ("60/0", "60/") and
    // non-positive rates ("0/1", "-60/1") that would collapse the GOP.
    return Number.isFinite(ratio) && ratio > 0 ? ratio : fallback;
  }

  const plain = Number.parseFloat(text);
  return Number.isFinite(plain) && plain > 0 ? plain : fallback;
}
|
||||||
|
|
||||||
function hevcNvencArgs(framerate, growing) {
|
function hevcNvencArgs(framerate, growing) {
|
||||||
const base = ['-c:v', 'hevc_nvenc', '-preset', 'p4', '-rc', 'vbr', '-profile:v', 'main10'];
|
const base = ['-c:v', 'hevc_nvenc', '-preset', 'p4', '-rc', 'vbr', '-profile:v', 'main10'];
|
||||||
if (growing) {
|
if (growing) {
|
||||||
return [...base, '-bf', '0', '-forced-idr', '1', '-g', '600', '-force_key_frames', 'expr:1'];
|
return [...base, '-bf', '0', '-forced-idr', '1', '-g', '600', '-force_key_frames', 'expr:1'];
|
||||||
}
|
}
|
||||||
// Normal long-GOP: ~2s keyframe interval, 2 B-frames. Realtime-friendly.
|
// Normal long-GOP: ~2s keyframe interval, 2 B-frames. Realtime-friendly.
|
||||||
const fps = Number.parseFloat(framerate) || 60;
|
const fps = parseFps(framerate, 60);
|
||||||
const gop = Math.max(2, Math.round(fps * 2));
|
const gop = Math.max(2, Math.round(fps * 2));
|
||||||
return [...base, '-bf', '2', '-g', String(gop)];
|
return [...base, '-bf', '2', '-g', String(gop)];
|
||||||
}
|
}
|
||||||
|
|
@ -210,7 +225,7 @@ function gpuAvailableForPreview(masterCodec) {
|
||||||
function buildHlsVideoArgs(masterCodec, framerate) {
|
function buildHlsVideoArgs(masterCodec, framerate) {
|
||||||
// Frames-per-segment for keyframe alignment. The SDI preview runs at the
|
// Frames-per-segment for keyframe alignment. The SDI preview runs at the
|
||||||
// capture framerate; default to 30 (matches the test-card rate) when unknown.
|
// capture framerate; default to 30 (matches the test-card rate) when unknown.
|
||||||
const fps = Number.parseFloat(framerate) || 30;
|
const fps = parseFps(framerate, 30);
|
||||||
const segTime = 2; // matches -hls_time below
|
const segTime = 2; // matches -hls_time below
|
||||||
const gop = Math.max(1, Math.round(fps * segTime));
|
const gop = Math.max(1, Math.round(fps * segTime));
|
||||||
if (gpuAvailableForPreview(masterCodec)) {
|
if (gpuAvailableForPreview(masterCodec)) {
|
||||||
|
|
|
||||||
|
|
@ -685,8 +685,33 @@ router.get('/:id/filmstrip', async (req, res, next) => {
|
||||||
if (r.rows.length === 0) return res.status(404).json({ error: 'Asset not found' });
|
if (r.rows.length === 0) return res.status(404).json({ error: 'Asset not found' });
|
||||||
const { filmstrip_s3_key } = r.rows[0];
|
const { filmstrip_s3_key } = r.rows[0];
|
||||||
if (!filmstrip_s3_key) return res.json({ url: null, ready: false });
|
if (!filmstrip_s3_key) return res.json({ url: null, ready: false });
|
||||||
const url = await getSignedUrlForObject(filmstrip_s3_key);
|
|
||||||
res.json({ url, ready: true });
|
// Serve the filmstrip JSON THROUGH the API (with retry) instead of handing
|
||||||
|
// the browser a signed URL. The RustFS object store intermittently returns
|
||||||
|
// NoSuchKey on GET for keys that List/Head confirm exist — a single browser
|
||||||
|
// fetch then blanks the strip. Retrying server-side (where the GET succeeds
|
||||||
|
// within a couple attempts) makes filmstrips reliable, and avoids the S3
|
||||||
|
// CORS gap on the signed-URL path.
|
||||||
|
let lastErr = null;
|
||||||
|
for (let attempt = 0; attempt < 4; attempt++) {
|
||||||
|
try {
|
||||||
|
const obj = await s3Client.send(new GetObjectCommand({ Bucket: getS3Bucket(), Key: filmstrip_s3_key }));
|
||||||
|
const body = await obj.Body.transformToString();
|
||||||
|
res.set('Cache-Control', 'public, max-age=86400');
|
||||||
|
res.set('Content-Type', 'application/json');
|
||||||
|
return res.send(body);
|
||||||
|
} catch (e) {
|
||||||
|
lastErr = e;
|
||||||
|
const code = e?.name || e?.Code || '';
|
||||||
|
// Only retry the known transient store inconsistency; fail fast otherwise.
|
||||||
|
if (!/NoSuchKey|NotFound|404/i.test(code)) break;
|
||||||
|
await new Promise(r => setTimeout(r, 150 * (attempt + 1)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// All retries missed — report not-ready so the UI shows its graceful
|
||||||
|
// fallback rather than a hard error.
|
||||||
|
console.warn(`[assets] filmstrip GET failed for ${id} (${filmstrip_s3_key}): ${lastErr?.name || lastErr?.message}`);
|
||||||
|
return res.json({ url: null, ready: false, transient: true });
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ import http from 'http';
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import crypto from 'crypto';
|
import crypto from 'crypto';
|
||||||
import { spawn } from 'child_process';
|
import { spawn, execFileSync } from 'child_process';
|
||||||
|
|
||||||
const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, '');
|
const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, '');
|
||||||
const NODE_TOKEN = process.env.NODE_TOKEN || '';
|
const NODE_TOKEN = process.env.NODE_TOKEN || '';
|
||||||
|
|
@ -26,6 +26,41 @@ const LIVE_DIR = process.env.LIVE_DIR || '/mnt/NVME/MAM/wild-dragon-live';
|
||||||
const REPO_DIR = process.env.REPO_DIR || '/opt/wild-dragon';
|
const REPO_DIR = process.env.REPO_DIR || '/opt/wild-dragon';
|
||||||
const VERSION = '1.4.0';
|
const VERSION = '1.4.0';
|
||||||
|
|
||||||
|
// Number of GPUs to spread capture encodes across. Each capture sidecar runs
|
||||||
|
// ~2 NVENC sessions (master HEVC + HLS preview); with NVIDIA_VISIBLE_DEVICES=all
|
||||||
|
// and no -gpu selector, ffmpeg's nvenc puts EVERY session on physical GPU 0, so
|
||||||
|
// 8 ports = 16 sessions hammering one card → it falls below realtime → the
|
||||||
|
// framecache ring laps → video freezes/stutters then recovers. Pinning each
|
||||||
|
// sidecar to GPU ((port - SIDECAR_BASE_PORT) % CAPTURE_GPU_COUNT) spreads the load across all cards.
|
||||||
|
// Auto-detected from nvidia-smi at boot; override with CAPTURE_GPU_COUNT.
|
||||||
|
let CAPTURE_GPU_COUNT = parseInt(process.env.CAPTURE_GPU_COUNT || '0', 10) || 0;

/**
 * Resolve how many GPUs capture encodes are spread across.
 *
 * A positive CAPTURE_GPU_COUNT (from the environment, or cached by an earlier
 * call) is returned as-is. Otherwise `nvidia-smi` is queried once and the
 * number of reported GPU rows is cached in CAPTURE_GPU_COUNT. If the query
 * fails or reports nothing, the count falls back to 1 and that fallback is
 * cached as well, so nvidia-smi is run at most once per process.
 *
 * Note: this is a synchronous call that can block for up to the 5 s timeout.
 *
 * @returns {number} GPU count, always >= 1.
 */
function detectGpuCount() {
  if (CAPTURE_GPU_COUNT > 0) return CAPTURE_GPU_COUNT;
  try {
    const output = execFileSync(
      'nvidia-smi',
      ['--query-gpu=index', '--format=csv,noheader'],
      // Capture stderr so a failing nvidia-smi does not spill into the
      // agent's own stderr; it is reported through the warning below instead.
      { timeout: 5000, stdio: ['ignore', 'pipe', 'pipe'] },
    ).toString();
    // One row per GPU; ignore blank / whitespace-only lines.
    const gpuRows = output.split(/\r?\n/).filter((line) => line.trim() !== '');
    CAPTURE_GPU_COUNT = gpuRows.length > 0 ? gpuRows.length : 1;
  } catch (err) {
    // No nvidia-smi / no GPU / query timed out — single-device fallback.
    // Log the reason so a missing GPU spread can be diagnosed.
    CAPTURE_GPU_COUNT = 1;
    console.warn(`[gpu] nvidia-smi GPU detection failed (${err?.code || err?.message || err}); assuming 1 GPU`);
  }
  console.log(`[gpu] capture encodes will spread across ${CAPTURE_GPU_COUNT} GPU(s)`);
  return CAPTURE_GPU_COUNT;
}
|
||||||
|
|
||||||
|
// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit
|
||||||
|
// per-recorder gpuUuid always wins; otherwise round-robin by capture port so
|
||||||
|
// consecutive Deltacast ports land on different physical GPUs. With a single
|
||||||
|
// device visible, the container's nvenc device 0 == the chosen physical GPU.
|
||||||
|
const SIDECAR_BASE_PORT = 7438;

/**
 * Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar.
 *
 * @param {string|number|null|undefined} gpuUuid - Explicit device (UUID or
 *   index). When non-blank it is returned trimmed and always wins.
 * @param {number|string} capturePort - Sidecar capture port. Numeric strings
 *   are accepted so a port that arrives as text still round-robins.
 * @returns {string} The trimmed `gpuUuid`, `'all'` when at most one GPU is
 *   detected, or otherwise the GPU index
 *   `(capturePort - SIDECAR_BASE_PORT) mod gpuCount` as a string
 *   (`'0'` when the port is not a finite number).
 */
function pickVisibleDevices(gpuUuid, capturePort) {
  if (gpuUuid != null && String(gpuUuid).trim() !== '') return String(gpuUuid).trim();

  const count = detectGpuCount();
  if (count <= 1) return 'all';

  // Number.isFinite('7439') is false, so an un-coerced string port would
  // silently send every sidecar to GPU 0. Coerce non-blank strings only;
  // null/undefined/other types keep falling back to index 0.
  const port = typeof capturePort === 'string' && capturePort.trim() !== ''
    ? Number(capturePort)
    : capturePort;
  if (!Number.isFinite(port)) return '0';

  // Truncate so a fractional port can never produce an index like "0.5";
  // the double modulo keeps the index non-negative for ports below the base.
  const offset = Math.trunc(port) - SIDECAR_BASE_PORT;
  const idx = ((offset % count) + count) % count;
  return String(idx);
}
|
||||||
|
|
||||||
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
|
// Capture-driver vendor allowlist. NOTHING outside this set is ever passed to
|
||||||
// the host installer — the value is only ever used to pick a script arg, never
|
// the host installer — the value is only ever used to pick a script arg, never
|
||||||
// interpolated into a shell string.
|
// interpolated into a shell string.
|
||||||
|
|
@ -475,13 +510,12 @@ async function handleSidecarStart(body, res) {
|
||||||
if (useGpu) {
|
if (useGpu) {
|
||||||
// Issue #167 — per-recorder GPU affinity. A gpuUuid (UUID string or
|
// Issue #167 — per-recorder GPU affinity. A gpuUuid (UUID string or
|
||||||
// numeric index) pins the sidecar to exactly that device; otherwise
|
// numeric index) pins the sidecar to exactly that device; otherwise
|
||||||
// NVIDIA_VISIBLE_DEVICES=all exposes every GPU on the host (legacy
|
// Round-robin the encode across all GPUs by capture port (or honor an
|
||||||
// behavior — for a single-GPU node like zampp2 / L4 this equals GPU 0).
|
// explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0.
|
||||||
const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '')
|
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||||
? String(gpuUuid).trim()
|
|
||||||
: 'all';
|
|
||||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||||
|
console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const hostConfig = {
|
const hostConfig = {
|
||||||
|
|
@ -790,10 +824,11 @@ async function handleSidecarStandby(body, res) {
|
||||||
|
|
||||||
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
|
const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1'];
|
||||||
if (useGpu) {
|
if (useGpu) {
|
||||||
const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '')
|
// Same round-robin GPU spread as the start path (see pickVisibleDevices).
|
||||||
? String(gpuUuid).trim() : 'all';
|
const visibleDevices = pickVisibleDevices(gpuUuid, capturePort);
|
||||||
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||||
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility');
|
||||||
|
console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`);
|
||||||
}
|
}
|
||||||
sidecarEnv.push(`FC_URL=${FC_URL}`);
|
sidecarEnv.push(`FC_URL=${FC_URL}`);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -102,18 +102,22 @@ function AssetDetail({ asset, onClose }) {
|
||||||
setFilmFrames([]);
|
setFilmFrames([]);
|
||||||
setFilmstripLoading(true);
|
setFilmstripLoading(true);
|
||||||
|
|
||||||
|
// The API now serves the filmstrip frames JSON directly (with server-side
|
||||||
|
// retry around the flaky object store) instead of returning a signed URL.
|
||||||
|
// Response is either the frames array, or { ready:false } when unavailable.
|
||||||
window.ZAMPP_API.fetch('/assets/' + assetId + '/filmstrip')
|
window.ZAMPP_API.fetch('/assets/' + assetId + '/filmstrip')
|
||||||
.then(function(r) {
|
.then(function(r) {
|
||||||
if (cancelled) return;
|
if (cancelled) return;
|
||||||
if (!r || !r.url) { setFilmstripLoading(false); return; }
|
// New shape: bare array of base64 frames.
|
||||||
// Fetch the JSON array of base64 frames from the signed S3 URL
|
if (Array.isArray(r) && r.length) { setFilmFrames(r); return; }
|
||||||
return fetch(r.url)
|
// Legacy/empty shape: { url } (older API) or { ready:false }.
|
||||||
.then(function(res) { return res.json(); })
|
if (r && r.url) {
|
||||||
.then(function(frames) {
|
return fetch(r.url)
|
||||||
if (!cancelled && Array.isArray(frames) && frames.length) {
|
.then(function(res) { return res.json(); })
|
||||||
setFilmFrames(frames);
|
.then(function(frames) {
|
||||||
}
|
if (!cancelled && Array.isArray(frames) && frames.length) setFilmFrames(frames);
|
||||||
});
|
});
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.catch(function() {})
|
.catch(function() {})
|
||||||
.finally(function() { if (!cancelled) setFilmstripLoading(false); });
|
.finally(function() { if (!cancelled) setFilmstripLoading(false); });
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue