fix(playout): immediate failover on new channels + play 502 vs 409
- spawnChannelSidecar: set last_heartbeat_at = NOW() when flipping the channel to 'running'. Without this, last_heartbeat_at is NULL, so the first scheduler tick computes ageMs = now - 0 (the epoch), which far exceeds TIMEOUT_MS, and triggers failover before the sidecar has had a single chance to respond.
- scheduler playoutHealthTick: when last_heartbeat_at is NULL, fall back to updated_at as the baseline (belt-and-suspenders with the spawnChannelSidecar fix). Also include updated_at in the query.
- POST /channels/:id/play: catch callSidecar errors explicitly and return 502 Bad Gateway instead of delegating to next(err), which the error middleware maps to 409 Conflict.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e51cf1aa9c
commit
e8f91cf4b4
2 changed files with 21 additions and 4 deletions
|
|
@ -338,9 +338,14 @@ async function spawnChannelSidecar(channel) {
|
|||
}
|
||||
}
|
||||
|
||||
// Set last_heartbeat_at = NOW() so the scheduler health tick treats this
|
||||
// channel as freshly alive. Without this, last_heartbeat_at starts as NULL
|
||||
// (epoch=0), and the very first tick sees ageMs >> TIMEOUT_MS and triggers
|
||||
// failover immediately — before the sidecar has had a chance to respond.
|
||||
const { rows } = await pool.query(
|
||||
`UPDATE playout_channels
|
||||
SET status = 'running', container_id = $1, container_meta = $2, updated_at = NOW()
|
||||
SET status = 'running', container_id = $1, container_meta = $2,
|
||||
last_heartbeat_at = NOW(), updated_at = NOW()
|
||||
WHERE id = $3 RETURNING *`,
|
||||
[containerId, JSON.stringify(containerMeta), channel.id]
|
||||
);
|
||||
|
|
@ -448,7 +453,14 @@ router.post('/channels/:id/play', requireChannelEdit, async (req, res, next) =>
|
|||
asset_duration_ms: i.asset_duration_ms != null ? Number(i.asset_duration_ms) : null,
|
||||
})),
|
||||
};
|
||||
const out = await callSidecar(req.channel, '/playlist/load', 'POST', payload);
|
||||
// callSidecar throws on network/timeout errors. Return 502 (not 409) so
|
||||
// the UI and operators know it's a gateway problem, not a state conflict.
|
||||
let out;
|
||||
try {
|
||||
out = await callSidecar(req.channel, '/playlist/load', 'POST', payload);
|
||||
} catch (err) {
|
||||
return res.status(502).json({ error: 'Sidecar unreachable: ' + err.message });
|
||||
}
|
||||
res.json(out);
|
||||
} catch (err) { next(err); }
|
||||
});
|
||||
|
|
|
|||
|
|
@ -222,7 +222,7 @@ async function playoutHealthTick(client) {
|
|||
let channels;
|
||||
try {
|
||||
({ rows: channels } = await client.query(
|
||||
`SELECT id, output_type, container_meta, node_id, last_heartbeat_at, restart_count
|
||||
`SELECT id, output_type, container_meta, node_id, last_heartbeat_at, updated_at, restart_count
|
||||
FROM playout_channels WHERE status = 'running'`
|
||||
));
|
||||
} catch (err) {
|
||||
|
|
@ -244,7 +244,12 @@ async function playoutHealthTick(client) {
|
|||
'UPDATE playout_channels SET last_heartbeat_at = NOW() WHERE id = $1', [ch.id]
|
||||
);
|
||||
} catch (err) {
|
||||
const lastSeen = ch.last_heartbeat_at ? new Date(ch.last_heartbeat_at).getTime() : 0;
|
||||
// When last_heartbeat_at is NULL (channel just spawned), fall back to
|
||||
// updated_at (set to NOW() by spawnChannelSidecar). This prevents a
|
||||
// brand-new channel from being failed over on the very first tick because
|
||||
// epoch-0 age always exceeds TIMEOUT_MS.
|
||||
const baseline = ch.last_heartbeat_at || ch.updated_at;
|
||||
const lastSeen = baseline ? new Date(baseline).getTime() : Date.now();
|
||||
const ageMs = Date.now() - lastSeen;
|
||||
if (ageMs < TIMEOUT_MS) continue; // not yet 3 misses
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue