From 4be12c6f9a901e0aa19dac1e7c7c090a5a9ed388 Mon Sep 17 00:00:00 2001 From: ZGaetano Date: Thu, 4 Jun 2026 15:56:44 +0000 Subject: [PATCH] fix(stability): spread capture encodes across all GPUs + GOP parse + filmstrip retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VIDEO FREEZE UNDER BURN (transient stall, self-recovers): all 8 capture sidecars ran NVIDIA_VISIBLE_DEVICES=all with no -gpu selector, so ffmpeg nvenc put every session (8 HEVC masters + 8 HLS = 16) on physical GPU 0 while the other two L4s sat idle. GPU 0 NVENC hit 86%, encode fell below realtime, the framecache ring lapped → video froze → caught up → recovered. Bridge verified smooth at 60fps throughout. FIX: node-agent now round-robins each sidecar to a GPU by capture port ((port - 7438 base port) % detected-GPU-count) via NVIDIA_VISIBLE_DEVICES, honoring an explicit gpuUuid when set. Auto-detects GPU count from nvidia-smi (override CAPTURE_GPU_COUNT). ~3 sidecars (~5-6 NVENC sessions) per GPU now. GOP PARSE: Number.parseFloat('60000/1001') returns 60000, making GOP 120000 (near open-GOP) instead of ~120. Added parseFps() to handle rational rates; fixed hevcNvencArgs + buildHlsVideoArgs. FILMSTRIP: RustFS object store intermittently returns NoSuchKey on GET for keys that List/Head confirm exist, blanking the strip. Generation/queue/DB all verified healthy (13/15 assets HAVE filmstrips). FIX: API now serves the filmstrip JSON through itself with retry-on-NoSuchKey (succeeds within a couple attempts) instead of handing the browser a signed URL — also closes the S3 CORS gap. Frontend updated to consume the direct JSON. 
--- services/capture/src/capture-manager.js | 19 ++++++++- services/mam-api/src/routes/assets.js | 29 +++++++++++++- services/node-agent/index.js | 51 ++++++++++++++++++++---- services/web-ui/public/screens-asset.jsx | 22 +++++----- 4 files changed, 100 insertions(+), 21 deletions(-) diff --git a/services/capture/src/capture-manager.js b/services/capture/src/capture-manager.js index 528d2d1..7db5a6e 100644 --- a/services/capture/src/capture-manager.js +++ b/services/capture/src/capture-manager.js @@ -166,13 +166,28 @@ const VIDEO_CODECS = { // `force_key_frames expr:1` (all-intra) is the ~4× compute path that was // crippling realtime when applied to every recording; gating it on `growing` // is the fix for the dropped-frame A/V drift. +// Parse a framerate that may be a rational like "60000/1001" (=59.94) OR a plain +// "59.94"/"60". Number.parseFloat("60000/1001") returns 60000 (stops at '/'), +// which made the GOP 120000 instead of ~120 — effectively open-GOP. Handle the +// rational form explicitly. +function parseFps(framerate, fallback = 60) { + if (framerate == null) return fallback; + const s = String(framerate).trim(); + if (s.includes('/')) { + const [n, d] = s.split('/').map(Number); + if (Number.isFinite(n) && Number.isFinite(d) && d !== 0) return n / d; + } + const f = Number.parseFloat(s); + return Number.isFinite(f) && f > 0 ? f : fallback; +} + function hevcNvencArgs(framerate, growing) { const base = ['-c:v', 'hevc_nvenc', '-preset', 'p4', '-rc', 'vbr', '-profile:v', 'main10']; if (growing) { return [...base, '-bf', '0', '-forced-idr', '1', '-g', '600', '-force_key_frames', 'expr:1']; } // Normal long-GOP: ~2s keyframe interval, 2 B-frames. Realtime-friendly. 
- const fps = Number.parseFloat(framerate) || 60; + const fps = parseFps(framerate, 60); const gop = Math.max(2, Math.round(fps * 2)); return [...base, '-bf', '2', '-g', String(gop)]; } @@ -210,7 +225,7 @@ function gpuAvailableForPreview(masterCodec) { function buildHlsVideoArgs(masterCodec, framerate) { // Frames-per-segment for keyframe alignment. The SDI preview runs at the // capture framerate; default to 30 (matches the test-card rate) when unknown. - const fps = Number.parseFloat(framerate) || 30; + const fps = parseFps(framerate, 30); const segTime = 2; // matches -hls_time below const gop = Math.max(1, Math.round(fps * segTime)); if (gpuAvailableForPreview(masterCodec)) { diff --git a/services/mam-api/src/routes/assets.js b/services/mam-api/src/routes/assets.js index 167c7bb..6be43d5 100644 --- a/services/mam-api/src/routes/assets.js +++ b/services/mam-api/src/routes/assets.js @@ -685,8 +685,33 @@ router.get('/:id/filmstrip', async (req, res, next) => { if (r.rows.length === 0) return res.status(404).json({ error: 'Asset not found' }); const { filmstrip_s3_key } = r.rows[0]; if (!filmstrip_s3_key) return res.json({ url: null, ready: false }); - const url = await getSignedUrlForObject(filmstrip_s3_key); - res.json({ url, ready: true }); + + // Serve the filmstrip JSON THROUGH the API (with retry) instead of handing + // the browser a signed URL. The RustFS object store intermittently returns + // NoSuchKey on GET for keys that List/Head confirm exist — a single browser + // fetch then blanks the strip. Retrying server-side (where the GET succeeds + // within a couple attempts) makes filmstrips reliable, and avoids the S3 + // CORS gap on the signed-URL path. 
+ let lastErr = null; + for (let attempt = 0; attempt < 4; attempt++) { + try { + const obj = await s3Client.send(new GetObjectCommand({ Bucket: getS3Bucket(), Key: filmstrip_s3_key })); + const body = await obj.Body.transformToString(); + res.set('Cache-Control', 'public, max-age=86400'); + res.set('Content-Type', 'application/json'); + return res.send(body); + } catch (e) { + lastErr = e; + const code = e?.name || e?.Code || ''; + // Only retry the known transient store inconsistency; fail fast otherwise. + if (!/NoSuchKey|NotFound|404/i.test(code)) break; + await new Promise(r => setTimeout(r, 150 * (attempt + 1))); + } + } + // All retries missed — report not-ready so the UI shows its graceful + // fallback rather than a hard error. + console.warn(`[assets] filmstrip GET failed for ${id} (${filmstrip_s3_key}): ${lastErr?.name || lastErr?.message}`); + return res.json({ url: null, ready: false, transient: true }); } catch (err) { next(err); } }); diff --git a/services/node-agent/index.js b/services/node-agent/index.js index 31fa9d9..150cfec 100644 --- a/services/node-agent/index.js +++ b/services/node-agent/index.js @@ -2,7 +2,7 @@ import http from 'http'; import os from 'os'; import fs from 'fs'; import crypto from 'crypto'; -import { spawn } from 'child_process'; +import { spawn, execFileSync } from 'child_process'; const MAM_API_URL = (process.env.MAM_API_URL || 'http://localhost:3000').replace(/\/$/, ''); const NODE_TOKEN = process.env.NODE_TOKEN || ''; @@ -26,6 +26,41 @@ const LIVE_DIR = process.env.LIVE_DIR || '/mnt/NVME/MAM/wild-dragon-live'; const REPO_DIR = process.env.REPO_DIR || '/opt/wild-dragon'; const VERSION = '1.4.0'; +// Number of GPUs to spread capture encodes across. 
Each capture sidecar runs +// ~2 NVENC sessions (master HEVC + HLS preview); with NVIDIA_VISIBLE_DEVICES=all +// and no -gpu selector, ffmpeg's nvenc puts EVERY session on physical GPU 0, so +// 8 ports = 16 sessions hammering one card → it falls below realtime → the +// framecache ring laps → video freezes/stutters then recovers. Pinning each +// sidecar to GPU (port % CAPTURE_GPU_COUNT) spreads the load across all cards. +// Auto-detected from nvidia-smi at boot; override with CAPTURE_GPU_COUNT. +let CAPTURE_GPU_COUNT = parseInt(process.env.CAPTURE_GPU_COUNT || '0', 10) || 0; +function detectGpuCount() { + if (CAPTURE_GPU_COUNT > 0) return CAPTURE_GPU_COUNT; + try { + const out = execFileSync('nvidia-smi', ['--query-gpu=index', '--format=csv,noheader'], { timeout: 5000 }) + .toString().trim(); + const n = out ? out.split('\n').filter(Boolean).length : 0; + CAPTURE_GPU_COUNT = n > 0 ? n : 1; + } catch (_) { + CAPTURE_GPU_COUNT = 1; // no nvidia-smi / no GPU — single-device fallback + } + console.log(`[gpu] capture encodes will spread across ${CAPTURE_GPU_COUNT} GPU(s)`); + return CAPTURE_GPU_COUNT; +} + +// Choose the NVIDIA_VISIBLE_DEVICES value for a capture sidecar. An explicit +// per-recorder gpuUuid always wins; otherwise round-robin by capture port so +// consecutive Deltacast ports land on different physical GPUs. With a single +// device visible, the container's nvenc device 0 == the chosen physical GPU. +const SIDECAR_BASE_PORT = 7438; +function pickVisibleDevices(gpuUuid, capturePort) { + if (gpuUuid != null && String(gpuUuid).trim() !== '') return String(gpuUuid).trim(); + const count = detectGpuCount(); + if (count <= 1) return 'all'; + const idx = Number.isFinite(capturePort) ? ((capturePort - SIDECAR_BASE_PORT) % count + count) % count : 0; + return String(idx); +} + // Capture-driver vendor allowlist. 
NOTHING outside this set is ever passed to // the host installer — the value is only ever used to pick a script arg, never // interpolated into a shell string. @@ -475,13 +510,12 @@ async function handleSidecarStart(body, res) { if (useGpu) { // Issue #167 — per-recorder GPU affinity. A gpuUuid (UUID string or // numeric index) pins the sidecar to exactly that device; otherwise - // NVIDIA_VISIBLE_DEVICES=all exposes every GPU on the host (legacy - // behavior — for a single-GPU node like zampp2 / L4 this equals GPU 0). - const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '') - ? String(gpuUuid).trim() - : 'all'; + // Round-robin the encode across all GPUs by capture port (or honor an + // explicit per-recorder gpuUuid). Prevents all sidecars piling onto GPU 0. + const visibleDevices = pickVisibleDevices(gpuUuid, capturePort); sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility'); + console.log(`[gpu] sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); } const hostConfig = { @@ -790,10 +824,11 @@ async function handleSidecarStandby(body, res) { const sidecarEnv = [...env, `PORT=${capturePort}`, 'STANDBY=1']; if (useGpu) { - const visibleDevices = (gpuUuid != null && String(gpuUuid).trim() !== '') - ? String(gpuUuid).trim() : 'all'; + // Same round-robin GPU spread as the start path (see pickVisibleDevices). 
+ const visibleDevices = pickVisibleDevices(gpuUuid, capturePort); sidecarEnv.push(`NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); sidecarEnv.push('NVIDIA_DRIVER_CAPABILITIES=video,compute,utility'); + console.log(`[gpu] standby sidecar port ${capturePort} → NVIDIA_VISIBLE_DEVICES=${visibleDevices}`); } sidecarEnv.push(`FC_URL=${FC_URL}`); diff --git a/services/web-ui/public/screens-asset.jsx b/services/web-ui/public/screens-asset.jsx index f622c4d..f420811 100644 --- a/services/web-ui/public/screens-asset.jsx +++ b/services/web-ui/public/screens-asset.jsx @@ -102,18 +102,22 @@ function AssetDetail({ asset, onClose }) { setFilmFrames([]); setFilmstripLoading(true); + // The API now serves the filmstrip frames JSON directly (with server-side + // retry around the flaky object store) instead of returning a signed URL. + // Response is either the frames array, or { ready:false } when unavailable. window.ZAMPP_API.fetch('/assets/' + assetId + '/filmstrip') .then(function(r) { if (cancelled) return; - if (!r || !r.url) { setFilmstripLoading(false); return; } - // Fetch the JSON array of base64 frames from the signed S3 URL - return fetch(r.url) - .then(function(res) { return res.json(); }) - .then(function(frames) { - if (!cancelled && Array.isArray(frames) && frames.length) { - setFilmFrames(frames); - } - }); + // New shape: bare array of base64 frames. + if (Array.isArray(r) && r.length) { setFilmFrames(r); return; } + // Legacy/empty shape: { url } (older API) or { ready:false }. + if (r && r.url) { + return fetch(r.url) + .then(function(res) { return res.json(); }) + .then(function(frames) { + if (!cancelled && Array.isArray(frames) && frames.length) setFilmFrames(frames); + }); + } }) .catch(function() {}) .finally(function() { if (!cancelled) setFilmstripLoading(false); });