fix(playout): prevent immediate failover on new channels + return 502 (not 409) when play cannot reach the sidecar
- spawnChannelSidecar: set last_heartbeat_at = NOW() when flipping the channel to 'running'. Without this, last_heartbeat_at is NULL, so the first scheduler tick computes ageMs = now - 0 (the Unix epoch), which far exceeds TIMEOUT_MS, and triggers failover before the sidecar has had a single chance to respond.
- scheduler playoutHealthTick: when last_heartbeat_at is NULL, fall back to updated_at as the baseline (belt-and-suspenders with the spawnChannelSidecar fix). Also include updated_at in the query.
- POST /channels/:id/play: catch callSidecar errors explicitly and return 502 Bad Gateway instead of delegating to next(err), which the error middleware maps to 409 Conflict.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e51cf1aa9c
commit
e8f91cf4b4
2 changed files with 21 additions and 4 deletions
|
|
@ -338,9 +338,14 @@ async function spawnChannelSidecar(channel) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set last_heartbeat_at = NOW() so the scheduler health tick treats this
|
||||||
|
// channel as freshly alive. Without this, last_heartbeat_at starts as NULL
|
||||||
|
// (epoch=0), and the very first tick sees ageMs >> TIMEOUT_MS and triggers
|
||||||
|
// failover immediately — before the sidecar has had a chance to respond.
|
||||||
const { rows } = await pool.query(
|
const { rows } = await pool.query(
|
||||||
`UPDATE playout_channels
|
`UPDATE playout_channels
|
||||||
SET status = 'running', container_id = $1, container_meta = $2, updated_at = NOW()
|
SET status = 'running', container_id = $1, container_meta = $2,
|
||||||
|
last_heartbeat_at = NOW(), updated_at = NOW()
|
||||||
WHERE id = $3 RETURNING *`,
|
WHERE id = $3 RETURNING *`,
|
||||||
[containerId, JSON.stringify(containerMeta), channel.id]
|
[containerId, JSON.stringify(containerMeta), channel.id]
|
||||||
);
|
);
|
||||||
|
|
@ -448,7 +453,14 @@ router.post('/channels/:id/play', requireChannelEdit, async (req, res, next) =>
|
||||||
asset_duration_ms: i.asset_duration_ms != null ? Number(i.asset_duration_ms) : null,
|
asset_duration_ms: i.asset_duration_ms != null ? Number(i.asset_duration_ms) : null,
|
||||||
})),
|
})),
|
||||||
};
|
};
|
||||||
const out = await callSidecar(req.channel, '/playlist/load', 'POST', payload);
|
// callSidecar throws on network/timeout errors. Return 502 (not 409) so
|
||||||
|
// the UI and operators know it's a gateway problem, not a state conflict.
|
||||||
|
let out;
|
||||||
|
try {
|
||||||
|
out = await callSidecar(req.channel, '/playlist/load', 'POST', payload);
|
||||||
|
} catch (err) {
|
||||||
|
return res.status(502).json({ error: 'Sidecar unreachable: ' + err.message });
|
||||||
|
}
|
||||||
res.json(out);
|
res.json(out);
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -222,7 +222,7 @@ async function playoutHealthTick(client) {
|
||||||
let channels;
|
let channels;
|
||||||
try {
|
try {
|
||||||
({ rows: channels } = await client.query(
|
({ rows: channels } = await client.query(
|
||||||
`SELECT id, output_type, container_meta, node_id, last_heartbeat_at, restart_count
|
`SELECT id, output_type, container_meta, node_id, last_heartbeat_at, updated_at, restart_count
|
||||||
FROM playout_channels WHERE status = 'running'`
|
FROM playout_channels WHERE status = 'running'`
|
||||||
));
|
));
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
|
@ -244,7 +244,12 @@ async function playoutHealthTick(client) {
|
||||||
'UPDATE playout_channels SET last_heartbeat_at = NOW() WHERE id = $1', [ch.id]
|
'UPDATE playout_channels SET last_heartbeat_at = NOW() WHERE id = $1', [ch.id]
|
||||||
);
|
);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
const lastSeen = ch.last_heartbeat_at ? new Date(ch.last_heartbeat_at).getTime() : 0;
|
// When last_heartbeat_at is NULL (channel just spawned), fall back to
|
||||||
|
// updated_at (set to NOW() by spawnChannelSidecar). This prevents a
|
||||||
|
// brand-new channel from being failed over on the very first tick because
|
||||||
|
// epoch-0 age always exceeds TIMEOUT_MS.
|
||||||
|
const baseline = ch.last_heartbeat_at || ch.updated_at;
|
||||||
|
const lastSeen = baseline ? new Date(baseline).getTime() : Date.now();
|
||||||
const ageMs = Date.now() - lastSeen;
|
const ageMs = Date.now() - lastSeen;
|
||||||
if (ageMs < TIMEOUT_MS) continue; // not yet 3 misses
|
if (ageMs < TIMEOUT_MS) continue; // not yet 3 misses
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue