From bf4632b9119010a42bbf6a864e6c25f78c59e7b8 Mon Sep 17 00:00:00 2001
From: ZGaetano
Date: Thu, 4 Jun 2026 03:05:00 +0000
Subject: [PATCH] feat(mam-api): extract ensureStandbySidecar + add POST
 /recorders/reconcile-standby

Re-provisions the persistent standby sidecar for SDI/deltacast/blackmagic
recorders that lost theirs (manual cleanup, node redeploy, wiped /dev/shm).
Without this the recorder falls back to slow on-demand spawn on /start, which
can collide on the capture port (EADDRINUSE). Idempotent; { force:true }
recreates even when a container_id is already set.

ensureStandbySidecar logs node-agent failures itself and treats a 2xx reply
without a containerId as a failure instead of writing a NULL container_id with
status='standby'.
---
 services/mam-api/src/routes/recorders.js | 147 +++++++++++++++++------
 1 file changed, 109 insertions(+), 38 deletions(-)

diff --git a/services/mam-api/src/routes/recorders.js b/services/mam-api/src/routes/recorders.js
index d44f4cc..0244588 100644
--- a/services/mam-api/src/routes/recorders.js
+++ b/services/mam-api/src/routes/recorders.js
@@ -280,6 +280,71 @@ function buildStandbyEnv(recorder) {
   ];
 }

+// Source types that run a long-lived standby sidecar (idle-preview container
+// kept up 24/7 so `record` is a sub-second HTTP call, not a Docker cold start).
+const STANDBY_SOURCE_TYPES = ['deltacast', 'sdi', 'blackmagic'];
+
+// Provision (or re-provision) the single persistent standby sidecar for one
+// recorder by asking its node's agent to create the idle container. Idempotent
+// at the node-agent layer (one container per capture port). On success the
+// recorder row and the passed-in `recorder` object are updated with the new
+// container_id + status='standby'.
+//
+// Returns { ok, containerId?, reason? }:
+//   - { ok: false, reason } when the recorder is not a standby source, has no
+//     node, the node is not remote/reachable, the node-agent answers non-2xx,
+//     or its 2xx body carries no containerId (both agent failures are logged).
+//   - { ok: true, containerId } once the row has been updated.
+// Errors from fetch (network, 15 s timeout), JSON parsing or the DB update
+// propagate. Non-fatal by contract — callers catch and log/aggregate; a
+// recorder is still usable via the on-demand spawn fallback in /start.
+async function ensureStandbySidecar(recorder) {
+  if (!recorder.node_id || !STANDBY_SOURCE_TYPES.includes(recorder.source_type)) {
+    return { ok: false, reason: 'not a standby source / no node' };
+  }
+  // A failed node lookup is treated the same as "not remote": skip, don't throw.
+  const { remote: isRemote, apiUrl: targetNodeApiUrl } =
+    await resolveNodeTarget(recorder.node_id).catch(() => ({ remote: false }));
+  if (!isRemote || !targetNodeApiUrl) {
+    return { ok: false, reason: 'node not remote/reachable' };
+  }
+  const capturePort = SIDECAR_PORT_BASE + (recorder.device_index || 0);
+  const useGpu = GPU_CODECS.includes(recorder.recording_codec);
+  const standbyRes = await fetch(`${targetNodeApiUrl}/sidecar/standby`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({
+      image: 'wild-dragon-capture:latest',
+      env: buildStandbyEnv(recorder),
+      capturePort,
+      sourceType: recorder.source_type,
+      useGpu,
+      gpuUuid: recorder.gpu_uuid || null,
+    }),
+    signal: AbortSignal.timeout(15000),
+  });
+  if (!standbyRes.ok) {
+    // Keep the warning the old inline create-path code emitted on a non-2xx reply.
+    console.warn(`[recorders] standby spawn returned ${standbyRes.status} for ${recorder.id} — will spawn on start`);
+    return { ok: false, reason: `node-agent returned ${standbyRes.status}` };
+  }
+  const { containerId } = (await standbyRes.json()) ?? {};
+  if (!containerId) {
+    // Never mark the row 'standby' with a NULL container_id: /stop and
+    // reconcile both key off container_id to decide which path to take.
+    console.warn(`[recorders] standby spawn for ${recorder.id} returned no containerId — will spawn on start`);
+    return { ok: false, reason: 'node-agent response missing containerId' };
+  }
+  await pool.query(
+    `UPDATE recorders SET container_id = $1, status = 'standby', updated_at = NOW() WHERE id = $2`,
+    [containerId, recorder.id]
+  );
+  recorder.container_id = containerId;
+  recorder.status = 'standby';
+  console.log(`[recorders] standby sidecar spawned for ${recorder.id}: ${containerId}`);
+  return { ok: true, containerId };
+}
+
 // Issue #162 — after a local-spawn stop, wait for the capture container to
 // finalize its master. The asset row was pre-created at start with
 // status='live' (display_name = current_session_id); the ingest/finalize step
@@ -432,43 +497,8 @@ router.post('/', async (req, res, next) => {
     // Spawn a standby sidecar immediately for SDI/deltacast/blackmagic recorders
     // that have an assigned node, so the container + bridge are ready before the
     // user hits record. Non-fatal — recorder is still usable if this fails.
- const STANDBY_SOURCE_TYPES = ['deltacast', 'sdi', 'blackmagic']; - if (recorder.node_id && STANDBY_SOURCE_TYPES.includes(recorder.source_type)) { - const { remote: isRemote, apiUrl: targetNodeApiUrl } = await resolveNodeTarget(recorder.node_id).catch(() => ({ remote: false })); - if (isRemote && targetNodeApiUrl) { - const capturePort = SIDECAR_PORT_BASE + (recorder.device_index || 0); - const useGpu = GPU_CODECS.includes(recorder.recording_codec); - try { - const standbyRes = await fetch(`${targetNodeApiUrl}/sidecar/standby`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - image: 'wild-dragon-capture:latest', - env: buildStandbyEnv(recorder), - capturePort, - sourceType: recorder.source_type, - useGpu, - gpuUuid: recorder.gpu_uuid || null, - }), - signal: AbortSignal.timeout(15000), - }); - if (standbyRes.ok) { - const { containerId } = await standbyRes.json(); - await pool.query( - `UPDATE recorders SET container_id = $1, status = 'standby', updated_at = NOW() WHERE id = $2`, - [containerId, recorder.id] - ); - recorder.container_id = containerId; - recorder.status = 'standby'; - console.log(`[recorders] standby sidecar spawned for ${recorder.id}: ${containerId}`); - } else { - console.warn(`[recorders] standby spawn returned ${standbyRes.status} for ${recorder.id} — will spawn on start`); - } - } catch (e) { - console.warn(`[recorders] standby spawn failed for ${recorder.id} (non-fatal): ${e.message}`); - } - } - } + await ensureStandbySidecar(recorder).catch(e => + console.warn(`[recorders] standby spawn failed for ${recorder.id} (non-fatal): ${e.message}`)); res.status(201).json(recorder); } catch (err) { @@ -476,6 +490,48 @@ router.post('/', async (req, res, next) => { } }); +// POST /reconcile-standby - (re)provision the persistent standby sidecar for +// every SDI/deltacast recorder that should have one. 
+// Standby sidecars are created together with the recorder and kept running
+// (RestartPolicy=unless-stopped), but nothing recreates one that is removed
+// externally (manual cleanup, node redeploy, a wiped /dev/shm). /start then
+// falls back to the slow on-demand spawn, which can collide on the capture
+// port; this endpoint re-warms them so recorders return to the standby path.
+// Body { force: true } also re-provisions recorders that already claim a
+// container_id (the node-agent is idempotent per capture port, so a stale id
+// is replaced cleanly); without it only recorders with no container_id are.
+// NOTE(review): no status filter — confirm force is safe while recording.
+router.post('/reconcile-standby', requireRecorderEdit, async (req, res, next) => {
+  try {
+    const forceRecreate = Boolean(req.body && req.body.force);
+    const { rows: candidates } = await pool.query(
+      `SELECT * FROM recorders
+        WHERE source_type = ANY($1)
+          AND node_id IS NOT NULL
+        ORDER BY name`,
+      [STANDBY_SOURCE_TYPES]
+    );
+    const results = []; // one entry per candidate, in name order
+    for (const candidate of candidates) {
+      let outcome;
+      if (candidate.container_id && !forceRecreate) {
+        outcome = { ok: true, skipped: 'already has container_id' };
+      } else {
+        try {
+          outcome = await ensureStandbySidecar(candidate);
+        } catch (e) {
+          outcome = { ok: false, reason: e.message }; // one failure must not abort the sweep
+        }
+      }
+      results.push({ id: candidate.id, name: candidate.name, ...outcome });
+    }
+    const provisioned = results.reduce((n, r) => (r.ok && r.containerId ? n + 1 : n), 0); // skipped rows carry no containerId
+    res.json({ provisioned, total: candidates.length, results });
+  } catch (err) {
+    next(err);
+  }
+});
+
 // GET /:id - Get single recorder
 router.get('/:id', async (req, res, next) => {
   try {
@@ -970,7 +1026,6 @@ router.post('/:id/stop', requireRecorderEdit, async (req, res, next) => {
     // /start call immediately.
     //
     // If NOT in standby (legacy on-demand spawn), use the old docker-stop path.
- const STANDBY_SOURCE_TYPES = ['deltacast', 'sdi', 'blackmagic']; const isStandbySource = STANDBY_SOURCE_TYPES.includes(recorder.source_type); if (isStandbySource && recorder.container_id) {