fix(recorders): stop capture containers in the background to prevent API TimeoutError on large file uploads

This commit is contained in:
Zac Gaetano 2026-06-03 00:22:36 +00:00
parent 600af4564e
commit 62b9a90291
2 changed files with 40 additions and 57 deletions

View file

@@ -789,43 +789,22 @@ router.post('/:id/stop', requireRecorderEdit, async (req, res, next) => {
return res.status(502).json({ error: 'Remote node failed to stop sidecar' }); return res.status(502).json({ error: 'Remote node failed to stop sidecar' });
} }
} else { } else {
// Issue #162 — stop WITH a grace period (t=180). Docker sends SIGTERM and // Issue #162 — stop local container in the background so the HTTP stop
// waits up to 180s for ffmpeg to flush and write the MOV/MP4 trailer before // request returns immediately. The container teardown (SIGTERM -> ffmpeg
// it SIGKILLs. Without this the master is truncated/corrupt and the // exit -> S3 upload -> post-stop callback) takes up to 180s for large files,
// pre-created asset can get stuck in 'live'. // which would otherwise timeout the browser/API connection.
const stopRes = await dockerApi( const containerId = recorder.container_id;
'POST', (async () => {
`/containers/${recorder.container_id}/stop?t=180` try {
); const stopRes = await dockerApi('POST', `/containers/${containerId}/stop?t=180`);
if (stopRes.status !== 404) {
// 204 = stopped, 304 = already stopped, 404 = container gone — all acceptable. await waitForFinalize(recorder);
if (stopRes.status !== 204 && stopRes.status !== 304 && stopRes.status !== 404) { await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});
return res.status(500).json({ }
error: 'Failed to stop container', } catch (e) {
details: stopRes.data, console.error('[recorders] failed local background stop:', e.message);
});
}
// Only attempt remove if the container existed (not 404).
if (stopRes.status !== 404) {
// Issue #162 — before removing the container, wait for the master to
// finalize (asset leaves 'live'), mirroring the remote path's reliance on
// the node-agent's clean teardown. This guards against deleting the
// container — and its lingering finalize work — too early.
await waitForFinalize(recorder);
const removeRes = await dockerApi(
'DELETE',
`/containers/${recorder.container_id}`
);
if (removeRes.status !== 204 && removeRes.status !== 404) {
return res.status(500).json({
error: 'Failed to remove container',
details: removeRes.data,
});
} }
} })();
} }
const updateResult = await pool.query( const updateResult = await pool.query(

View file

@@ -387,29 +387,33 @@ async function fetchContainerLogs(containerId) {
async function handleSidecarStop(containerId, res) { async function handleSidecarStop(containerId, res) {
try { try {
console.log(`[sidecar-stop] stopping ${containerId} (grace 180s)...`); console.log(`[sidecar-stop] stopping ${containerId} (grace 180s)...`);
// Grace period must exceed the capture container's shutdown work
// (finalise ffmpeg session + register asset via callback). Default // Run the container teardown and cleanup in the background. The capture
// docker stop is only 10s, which SIGKILLs capture mid-finalise and // process SIGTERM handler flushes ffmpeg and uploads the file to S3
// loses the POST /assets callback -> asset stuck 'live', no jobs. // (taking up to 3 minutes for multi-GB files) before exiting. Returning
await dockerApi('POST', `/containers/${containerId}/stop?t=180`).catch(() => {}); // immediately stops the API request timing out.
// Dump the capture container's shutdown logs into our persistent log (async () => {
// BEFORE removing it, so failed callbacks are diagnosable. try {
const logs = await fetchContainerLogs(containerId); await dockerApi('POST', `/containers/${containerId}/stop?t=180`).catch(() => {});
console.log(`[sidecar-stop] ==== capture logs for ${containerId} ====\n${logs}\n[sidecar-stop] ==== end logs ====`); const logs = await fetchContainerLogs(containerId);
// Container has now exited gracefully (or hit the 180s cap); remove it. console.log(`[sidecar-stop] ==== capture logs for ${containerId} ====\n${logs}\n[sidecar-stop] ==== end logs ====`);
await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {}); await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});
// Deltacast bridge lifecycle: decrement sidecar count; stop bridge when last. // Deltacast bridge lifecycle: decrement sidecar count; stop bridge when last.
if (_containerSourceType.get(containerId) === 'deltacast') { if (_containerSourceType.get(containerId) === 'deltacast') {
_containerSourceType.delete(containerId); _containerSourceType.delete(containerId);
_dcSidecarCount--; _dcSidecarCount--;
if (_dcSidecarCount <= 0) { if (_dcSidecarCount <= 0) {
_dcSidecarCount = 0; _dcSidecarCount = 0;
stopDeltacastBridge(); stopDeltacastBridge();
}
} else {
_containerSourceType.delete(containerId);
}
} catch (err) {
console.error(`[sidecar-stop] background cleanup failed for ${containerId}:`, err.message);
} }
} else { })();
_containerSourceType.delete(containerId);
}
jsonResponse(res, 200, { ok: true }); jsonResponse(res, 200, { ok: true });
} catch (err) { } catch (err) {