fix(playout): immediate failover on new channels + play 502 vs 409

- spawnChannelSidecar: set last_heartbeat_at = NOW() when flipping the
  channel to 'running'. Without this, last_heartbeat_at is NULL, which
  the scheduler treats as epoch 0, so the first scheduler tick computes
  an ageMs (now - 0) far greater than TIMEOUT_MS and triggers failover
  before the sidecar has had a single chance to respond.
- scheduler playoutHealthTick: when last_heartbeat_at is NULL fall
  back to updated_at as the baseline (belt-and-suspenders with the
  spawnChannelSidecar fix). Also include updated_at in the query.
- POST /channels/:id/play: catch callSidecar errors explicitly and
  return 502 Bad Gateway instead of delegating to next(err), which
  the error middleware maps to 409 Conflict.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Zac Gaetano 2026-05-31 12:34:41 -04:00
parent e51cf1aa9c
commit e8f91cf4b4
2 changed files with 21 additions and 4 deletions

View file

@ -338,9 +338,14 @@ async function spawnChannelSidecar(channel) {
}
}
// Set last_heartbeat_at = NOW() so the scheduler health tick treats this
// channel as freshly alive. Without this, last_heartbeat_at starts as NULL;
// a health tick that reads NULL as epoch 0 computes an ageMs far larger than
// TIMEOUT_MS and triggers failover immediately, before the sidecar has had a
// chance to respond.
const { rows } = await pool.query(
`UPDATE playout_channels
SET status = 'running', container_id = $1, container_meta = $2, updated_at = NOW()
SET status = 'running', container_id = $1, container_meta = $2,
last_heartbeat_at = NOW(), updated_at = NOW()
WHERE id = $3 RETURNING *`,
[containerId, JSON.stringify(containerMeta), channel.id]
);
@ -448,7 +453,14 @@ router.post('/channels/:id/play', requireChannelEdit, async (req, res, next) =>
asset_duration_ms: i.asset_duration_ms != null ? Number(i.asset_duration_ms) : null,
})),
};
const out = await callSidecar(req.channel, '/playlist/load', 'POST', payload);
// callSidecar throws on network/timeout errors. Return 502 (not 409) so
// the UI and operators know it's a gateway problem, not a state conflict.
let out;
try {
out = await callSidecar(req.channel, '/playlist/load', 'POST', payload);
} catch (err) {
return res.status(502).json({ error: 'Sidecar unreachable: ' + err.message });
}
res.json(out);
} catch (err) { next(err); }
});

View file

@ -222,7 +222,7 @@ async function playoutHealthTick(client) {
let channels;
try {
({ rows: channels } = await client.query(
`SELECT id, output_type, container_meta, node_id, last_heartbeat_at, restart_count
`SELECT id, output_type, container_meta, node_id, last_heartbeat_at, updated_at, restart_count
FROM playout_channels WHERE status = 'running'`
));
} catch (err) {
@ -244,7 +244,12 @@ async function playoutHealthTick(client) {
'UPDATE playout_channels SET last_heartbeat_at = NOW() WHERE id = $1', [ch.id]
);
} catch (err) {
const lastSeen = ch.last_heartbeat_at ? new Date(ch.last_heartbeat_at).getTime() : 0;
// When last_heartbeat_at is NULL (channel just spawned), fall back to
// updated_at (set to NOW() by spawnChannelSidecar), and to the current time
// if updated_at is missing as well. This prevents a brand-new channel from
// being failed over on the very first tick: measured from epoch 0, its age
// would always exceed TIMEOUT_MS.
const baseline = ch.last_heartbeat_at || ch.updated_at;
const lastSeen = baseline ? new Date(baseline).getTime() : Date.now();
const ageMs = Date.now() - lastSeen;
if (ageMs < TIMEOUT_MS) continue; // not yet 3 misses