fix(mam-api): heartbeat writes last_seen_at so playout failover sees healthy nodes
This commit is contained in:
parent
aa7f836493
commit
b4f2fb12ff
1 changed files with 50 additions and 190 deletions
|
|
@ -4,10 +4,6 @@ import pool from '../db/pool.js';
|
||||||
|
|
||||||
const router = express.Router();
|
const router = express.Router();
|
||||||
|
|
||||||
// If the agent reported Docker's default bridge IP (172.17.x) but the request
|
|
||||||
// itself came from a real LAN address, prefer the request source IP instead.
|
|
||||||
// We only check 172.17.x — the default docker0 bridge — not the full RFC1918
|
|
||||||
// 172.16/12 block, since real LANs (e.g. 172.18.91.x) fall in that range.
|
|
||||||
function pickIp(reportedIp, reqIp) {
|
function pickIp(reportedIp, reqIp) {
|
||||||
const clean = (s) => (s || '').replace(/^::ffff:/, '');
|
const clean = (s) => (s || '').replace(/^::ffff:/, '');
|
||||||
const isDockerBridge = (ip) => /^172\.17\./.test(ip || '');
|
const isDockerBridge = (ip) => /^172\.17\./.test(ip || '');
|
||||||
|
|
@ -41,7 +37,6 @@ function dockerRequest(path, method = 'GET', body = null) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// GET / – list all registered cluster nodes with online status
|
|
||||||
router.get('/', async (req, res, next) => {
|
router.get('/', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const r = await pool.query(
|
const r = await pool.query(
|
||||||
|
|
@ -57,7 +52,6 @@ router.get('/', async (req, res, next) => {
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// GET /containers – list all containers on the local Docker host
|
|
||||||
router.get('/containers', async (req, res, next) => {
|
router.get('/containers', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const containers = await dockerRequest('/containers/json?all=true');
|
const containers = await dockerRequest('/containers/json?all=true');
|
||||||
|
|
@ -88,7 +82,6 @@ router.get('/containers', async (req, res, next) => {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// POST /containers/:nameOrId/restart
|
|
||||||
router.post('/containers/:nameOrId/restart', async (req, res, next) => {
|
router.post('/containers/:nameOrId/restart', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
await dockerRequest(`/containers/${encodeURIComponent(req.params.nameOrId)}/restart`, 'POST');
|
await dockerRequest(`/containers/${encodeURIComponent(req.params.nameOrId)}/restart`, 'POST');
|
||||||
|
|
@ -96,7 +89,6 @@ router.post('/containers/:nameOrId/restart', async (req, res, next) => {
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// POST /heartbeat – upsert this node's registration (includes hardware capabilities)
|
|
||||||
router.post('/heartbeat', async (req, res, next) => {
|
router.post('/heartbeat', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const {
|
const {
|
||||||
|
|
@ -108,11 +100,6 @@ router.post('/heartbeat', async (req, res, next) => {
|
||||||
|
|
||||||
if (!hostname) return res.status(400).json({ error: 'hostname is required' });
|
if (!hostname) return res.status(400).json({ error: 'hostname is required' });
|
||||||
|
|
||||||
// Issue #106 — any authenticated user used to be able to POST a heartbeat
|
|
||||||
// for an arbitrary hostname and overwrite the primary node's `api_url`,
|
|
||||||
// effectively hijacking job dispatch. Now: if the caller's token is bound
|
|
||||||
// to a hostname (node-agent tokens are bound at issue time), the body
|
|
||||||
// hostname must match. Admin users with no binding are allowed for ops.
|
|
||||||
if (process.env.AUTH_ENABLED === 'true') {
|
if (process.env.AUTH_ENABLED === 'true') {
|
||||||
const bound = req.tokenBoundHostname;
|
const bound = req.tokenBoundHostname;
|
||||||
if (bound && bound !== hostname) {
|
if (bound && bound !== hostname) {
|
||||||
|
|
@ -132,8 +119,8 @@ router.post('/heartbeat', async (req, res, next) => {
|
||||||
const r = await pool.query(
|
const r = await pool.query(
|
||||||
`INSERT INTO cluster_nodes
|
`INSERT INTO cluster_nodes
|
||||||
(hostname, ip_address, role, version, api_url,
|
(hostname, ip_address, role, version, api_url,
|
||||||
cpu_usage, mem_used_mb, mem_total_mb, last_seen, capabilities, metadata, metrics)
|
cpu_usage, mem_used_mb, mem_total_mb, last_seen, last_seen_at, capabilities, metadata, metrics)
|
||||||
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW(),$9,$10,$11)
|
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW(),NOW(),$9,$10,$11)
|
||||||
ON CONFLICT (hostname) DO UPDATE SET
|
ON CONFLICT (hostname) DO UPDATE SET
|
||||||
ip_address = EXCLUDED.ip_address,
|
ip_address = EXCLUDED.ip_address,
|
||||||
role = EXCLUDED.role,
|
role = EXCLUDED.role,
|
||||||
|
|
@ -143,6 +130,7 @@ router.post('/heartbeat', async (req, res, next) => {
|
||||||
mem_used_mb = EXCLUDED.mem_used_mb,
|
mem_used_mb = EXCLUDED.mem_used_mb,
|
||||||
mem_total_mb = EXCLUDED.mem_total_mb,
|
mem_total_mb = EXCLUDED.mem_total_mb,
|
||||||
last_seen = NOW(),
|
last_seen = NOW(),
|
||||||
|
last_seen_at = NOW(),
|
||||||
capabilities = EXCLUDED.capabilities,
|
capabilities = EXCLUDED.capabilities,
|
||||||
metadata = EXCLUDED.metadata,
|
metadata = EXCLUDED.metadata,
|
||||||
metrics = COALESCE(EXCLUDED.metrics, cluster_nodes.metrics)
|
metrics = COALESCE(EXCLUDED.metrics, cluster_nodes.metrics)
|
||||||
|
|
@ -165,42 +153,25 @@ router.post('/heartbeat', async (req, res, next) => {
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// GET /devices/blackmagic/signal – live video-presence state for every
|
|
||||||
// DeckLink port across the cluster. For each port we check whether there is
|
|
||||||
// an active SDI recorder assigned to it and, if so, query the capture
|
|
||||||
// container for its real signal state (receiving / lost / connecting /
|
|
||||||
// error). Ports without a recorder get signal = 'no-recorder'.
|
|
||||||
//
|
|
||||||
// Response shape (array):
|
|
||||||
// { node_id, hostname, index, device, model,
|
|
||||||
// signal, framesReceived, currentFps, recorder_id, recorder_status }
|
|
||||||
router.get('/devices/blackmagic/signal', async (req, res, next) => {
|
router.get('/devices/blackmagic/signal', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
// 1. Fetch all cluster nodes with DeckLink capabilities.
|
|
||||||
const nodesResult = await pool.query(
|
const nodesResult = await pool.query(
|
||||||
`SELECT id, hostname, ip_address, api_url, capabilities,
|
`SELECT id, hostname, ip_address, api_url, capabilities,
|
||||||
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
||||||
FROM cluster_nodes
|
FROM cluster_nodes
|
||||||
WHERE capabilities IS NOT NULL`
|
WHERE capabilities IS NOT NULL`
|
||||||
);
|
);
|
||||||
|
|
||||||
// 2. Fetch all SDI recorders that are pinned to a node+device_index.
|
|
||||||
const recResult = await pool.query(
|
const recResult = await pool.query(
|
||||||
`SELECT id, name, status, container_id, node_id, device_index,
|
`SELECT id, name, status, container_id, node_id, device_index,
|
||||||
source_config
|
source_config
|
||||||
FROM recorders
|
FROM recorders
|
||||||
WHERE source_type = 'sdi' AND node_id IS NOT NULL`
|
WHERE source_type = 'sdi' AND node_id IS NOT NULL`
|
||||||
);
|
);
|
||||||
|
|
||||||
// Build a fast lookup: "${node_id}:${device_index}" → recorder row.
|
|
||||||
const recByPort = new Map();
|
const recByPort = new Map();
|
||||||
for (const r of recResult.rows) {
|
for (const r of recResult.rows) {
|
||||||
const devIdx = r.device_index ?? r.source_config?.device ?? 0;
|
const devIdx = r.device_index ?? r.source_config?.device ?? 0;
|
||||||
recByPort.set(`${r.node_id}:${devIdx}`, r);
|
recByPort.set(`${r.node_id}:${devIdx}`, r);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. For each port, determine signal state. We fire all capture-container
|
|
||||||
// fetches concurrently so the endpoint stays fast even with many ports.
|
|
||||||
const tasks = [];
|
const tasks = [];
|
||||||
for (const node of nodesResult.rows) {
|
for (const node of nodesResult.rows) {
|
||||||
const nodeOnline = Number(node.stale_seconds) < 120;
|
const nodeOnline = Number(node.stale_seconds) < 120;
|
||||||
|
|
@ -208,79 +179,51 @@ router.get('/devices/blackmagic/signal', async (req, res, next) => {
|
||||||
const model = (node.capabilities && node.capabilities.blackmagic_model) || null;
|
const model = (node.capabilities && node.capabilities.blackmagic_model) || null;
|
||||||
const localHostname = process.env.NODE_HOSTNAME || '';
|
const localHostname = process.env.NODE_HOSTNAME || '';
|
||||||
const isRemote = node.api_url && node.hostname !== localHostname;
|
const isRemote = node.api_url && node.hostname !== localHostname;
|
||||||
|
|
||||||
bm.forEach((d, idx) => {
|
bm.forEach((d, idx) => {
|
||||||
const portIndex = d.index !== undefined ? d.index : idx;
|
const portIndex = d.index !== undefined ? d.index : idx;
|
||||||
const rec = recByPort.get(`${node.id}:${portIndex}`);
|
const rec = recByPort.get(`${node.id}:${portIndex}`);
|
||||||
|
|
||||||
tasks.push((async () => {
|
tasks.push((async () => {
|
||||||
const base = {
|
const base = {
|
||||||
node_id: node.id,
|
node_id: node.id, hostname: node.hostname, index: portIndex,
|
||||||
hostname: node.hostname,
|
device: d.device || null, model, node_online: nodeOnline,
|
||||||
index: portIndex,
|
recorder_id: rec ? rec.id : null, recorder_name: rec ? rec.name : null,
|
||||||
device: d.device || null,
|
|
||||||
model,
|
|
||||||
node_online: nodeOnline,
|
|
||||||
recorder_id: rec ? rec.id : null,
|
|
||||||
recorder_name: rec ? rec.name : null,
|
|
||||||
recorder_status: rec ? rec.status : null,
|
recorder_status: rec ? rec.status : null,
|
||||||
signal: 'no-recorder',
|
signal: 'no-recorder', framesReceived: null, currentFps: null,
|
||||||
framesReceived: null,
|
|
||||||
currentFps: null,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!rec || rec.status !== 'recording' || !rec.container_id) {
|
if (!rec || rec.status !== 'recording' || !rec.container_id) {
|
||||||
// No active capture — if there's a recorder but it's not recording,
|
|
||||||
// report that; otherwise the port is unassigned.
|
|
||||||
if (rec && rec.status !== 'recording') base.signal = 'idle';
|
if (rec && rec.status !== 'recording') base.signal = 'idle';
|
||||||
return base;
|
return base;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Active recording — query the capture container for real signal.
|
|
||||||
try {
|
try {
|
||||||
let live = null;
|
let live = null;
|
||||||
if (isRemote) {
|
if (isRemote) {
|
||||||
const r = await fetch(
|
const r = await fetch(`${node.api_url}/sidecar/${rec.container_id}/status`, { signal: AbortSignal.timeout(2500) });
|
||||||
`${node.api_url}/sidecar/${rec.container_id}/status`,
|
|
||||||
{ signal: AbortSignal.timeout(2500) }
|
|
||||||
);
|
|
||||||
if (r.ok) live = (await r.json()).live;
|
if (r.ok) live = (await r.json()).live;
|
||||||
} else {
|
} else {
|
||||||
const r = await fetch(
|
const r = await fetch(`http://recorder-${rec.id}:3001/capture/status`, { signal: AbortSignal.timeout(2000) });
|
||||||
`http://recorder-${rec.id}:3001/capture/status`,
|
|
||||||
{ signal: AbortSignal.timeout(2000) }
|
|
||||||
);
|
|
||||||
if (r.ok) live = await r.json();
|
if (r.ok) live = await r.json();
|
||||||
}
|
}
|
||||||
if (live && live.signal) {
|
if (live && live.signal) {
|
||||||
base.signal = live.signal;
|
base.signal = live.signal;
|
||||||
base.framesReceived = live.framesReceived ?? null;
|
base.framesReceived = live.framesReceived ?? null;
|
||||||
base.currentFps = live.currentFps ?? null;
|
base.currentFps = live.currentFps ?? null;
|
||||||
} else {
|
} else { base.signal = 'connecting'; }
|
||||||
base.signal = 'connecting';
|
} catch (_) { base.signal = 'connecting'; }
|
||||||
}
|
|
||||||
} catch (_) {
|
|
||||||
base.signal = 'connecting';
|
|
||||||
}
|
|
||||||
return base;
|
return base;
|
||||||
})());
|
})());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const results = await Promise.all(tasks);
|
const results = await Promise.all(tasks);
|
||||||
res.json(results);
|
res.json(results);
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// GET /devices/blackmagic – flatten every node's DeckLink cards for the
|
|
||||||
// recorder picker. Returns one entry per device with the host node info.
|
|
||||||
router.get('/devices/blackmagic', async (req, res, next) => {
|
router.get('/devices/blackmagic', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const r = await pool.query(
|
const r = await pool.query(
|
||||||
`SELECT id, hostname, ip_address, role, capabilities,
|
`SELECT id, hostname, ip_address, role, capabilities,
|
||||||
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
||||||
FROM cluster_nodes
|
FROM cluster_nodes WHERE capabilities IS NOT NULL`
|
||||||
WHERE capabilities IS NOT NULL`
|
|
||||||
);
|
);
|
||||||
const out = [];
|
const out = [];
|
||||||
for (const row of r.rows) {
|
for (const row of r.rows) {
|
||||||
|
|
@ -288,157 +231,98 @@ router.get('/devices/blackmagic', async (req, res, next) => {
|
||||||
const bm = (row.capabilities && row.capabilities.blackmagic) || [];
|
const bm = (row.capabilities && row.capabilities.blackmagic) || [];
|
||||||
const model = (row.capabilities && row.capabilities.blackmagic_model) || null;
|
const model = (row.capabilities && row.capabilities.blackmagic_model) || null;
|
||||||
bm.forEach((d, idx) => {
|
bm.forEach((d, idx) => {
|
||||||
out.push({
|
out.push({ node_id: row.id, hostname: row.hostname, ip_address: row.ip_address,
|
||||||
node_id: row.id,
|
role: row.role, online, model, index: d.index !== undefined ? d.index : idx, device: d.device });
|
||||||
hostname: row.hostname,
|
|
||||||
ip_address: row.ip_address,
|
|
||||||
role: row.role,
|
|
||||||
online,
|
|
||||||
model,
|
|
||||||
index: d.index !== undefined ? d.index : idx,
|
|
||||||
device: d.device,
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
res.json(out);
|
res.json(out);
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// GET /devices/deltacast – flatten every node's Deltacast cards for the
|
|
||||||
// recorder picker. Mirrors /devices/blackmagic shape so the UI can treat
|
|
||||||
// both card types uniformly.
|
|
||||||
router.get('/devices/deltacast', async (req, res, next) => {
|
router.get('/devices/deltacast', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const r = await pool.query(
|
const r = await pool.query(
|
||||||
`SELECT id, hostname, ip_address, role, capabilities,
|
`SELECT id, hostname, ip_address, role, capabilities,
|
||||||
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
||||||
FROM cluster_nodes
|
FROM cluster_nodes WHERE capabilities IS NOT NULL`
|
||||||
WHERE capabilities IS NOT NULL`
|
|
||||||
);
|
);
|
||||||
const out = [];
|
const out = [];
|
||||||
for (const row of r.rows) {
|
for (const row of r.rows) {
|
||||||
const online = Number(row.stale_seconds) < 120;
|
const online = Number(row.stale_seconds) < 120;
|
||||||
const dc = (row.capabilities && row.capabilities.deltacast) || [];
|
const dc = (row.capabilities && row.capabilities.deltacast) || [];
|
||||||
const model = (row.capabilities && row.capabilities.deltacast_model) || null;
|
const model = (row.capabilities && row.capabilities.deltacast_model) || null;
|
||||||
// Also synthesise entries from DELTACAST_PORT_COUNT if no entries reported yet —
|
|
||||||
// useful for nodes that haven't sent a heartbeat since the agent was updated.
|
|
||||||
dc.forEach((d, idx) => {
|
dc.forEach((d, idx) => {
|
||||||
out.push({
|
out.push({ node_id: row.id, hostname: row.hostname, ip_address: row.ip_address,
|
||||||
node_id: row.id,
|
role: row.role, online, model: model || 'Deltacast',
|
||||||
hostname: row.hostname,
|
index: d.index !== undefined ? d.index : idx, device: d.device,
|
||||||
ip_address: row.ip_address,
|
present: d.present !== false, port_count: dc.length });
|
||||||
role: row.role,
|
|
||||||
online,
|
|
||||||
model: model || 'Deltacast',
|
|
||||||
index: d.index !== undefined ? d.index : idx,
|
|
||||||
device: d.device,
|
|
||||||
present: d.present !== false,
|
|
||||||
port_count: dc.length,
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
res.json(out);
|
res.json(out);
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// GET /devices/deltacast/signal – live signal state for Deltacast ports.
|
|
||||||
// Same pattern as /devices/blackmagic/signal.
|
|
||||||
router.get('/devices/deltacast/signal', async (req, res, next) => {
|
router.get('/devices/deltacast/signal', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const [nodesRes, recordersRes] = await Promise.all([
|
const [nodesRes, recordersRes] = await Promise.all([
|
||||||
pool.query(
|
pool.query(`SELECT id, hostname, ip_address, api_url, capabilities,
|
||||||
`SELECT id, hostname, ip_address, api_url, capabilities,
|
|
||||||
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
||||||
FROM cluster_nodes
|
FROM cluster_nodes WHERE capabilities IS NOT NULL`),
|
||||||
WHERE capabilities IS NOT NULL`
|
pool.query(`SELECT id, node_id, device_index, status, source_type, container_id
|
||||||
),
|
FROM recorders WHERE source_type = 'deltacast'`),
|
||||||
pool.query(
|
|
||||||
`SELECT id, node_id, device_index, status, source_type, container_id
|
|
||||||
FROM recorders WHERE source_type = 'deltacast'`
|
|
||||||
),
|
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const recByNodePort = {};
|
const recByNodePort = {};
|
||||||
for (const rec of recordersRes.rows) {
|
for (const rec of recordersRes.rows) {
|
||||||
recByNodePort[`${rec.node_id}:${rec.device_index}`] = rec;
|
recByNodePort[`${rec.node_id}:${rec.device_index}`] = rec;
|
||||||
}
|
}
|
||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
const fetchPromises = [];
|
const fetchPromises = [];
|
||||||
|
|
||||||
for (const node of nodesRes.rows) {
|
for (const node of nodesRes.rows) {
|
||||||
const online = Number(node.stale_seconds) < 120;
|
const online = Number(node.stale_seconds) < 120;
|
||||||
const dc = (node.capabilities && node.capabilities.deltacast) || [];
|
const dc = (node.capabilities && node.capabilities.deltacast) || [];
|
||||||
const model = (node.capabilities && node.capabilities.deltacast_model) || 'Deltacast';
|
const model = (node.capabilities && node.capabilities.deltacast_model) || 'Deltacast';
|
||||||
|
|
||||||
for (const port of dc) {
|
for (const port of dc) {
|
||||||
const idx = port.index !== undefined ? port.index : dc.indexOf(port);
|
const idx = port.index !== undefined ? port.index : dc.indexOf(port);
|
||||||
const rec = recByNodePort[`${node.id}:${idx}`];
|
const rec = recByNodePort[`${node.id}:${idx}`];
|
||||||
const base = {
|
const base = { node_id: node.id, hostname: node.hostname, ip_address: node.ip_address,
|
||||||
node_id: node.id,
|
online, model, index: idx, device: port.device, present: port.present !== false,
|
||||||
hostname: node.hostname,
|
recorder_id: rec ? rec.id : null, recorder_status: rec ? rec.status : null,
|
||||||
ip_address: node.ip_address,
|
signal: 'no-recorder', framesReceived: null, currentFps: null };
|
||||||
online,
|
|
||||||
model,
|
|
||||||
index: idx,
|
|
||||||
device: port.device,
|
|
||||||
present: port.present !== false,
|
|
||||||
recorder_id: rec ? rec.id : null,
|
|
||||||
recorder_status: rec ? rec.status : null,
|
|
||||||
signal: 'no-recorder',
|
|
||||||
framesReceived: null,
|
|
||||||
currentFps: null,
|
|
||||||
};
|
|
||||||
|
|
||||||
if (!rec) { results.push(base); continue; }
|
if (!rec) { results.push(base); continue; }
|
||||||
if (rec.status !== 'recording') { base.signal = 'idle'; results.push(base); continue; }
|
if (rec.status !== 'recording') { base.signal = 'idle'; results.push(base); continue; }
|
||||||
|
|
||||||
// Active recording — query capture container for real signal.
|
|
||||||
const fetchIdx = results.length;
|
const fetchIdx = results.length;
|
||||||
results.push(base);
|
results.push(base);
|
||||||
fetchPromises.push((async () => {
|
fetchPromises.push((async () => {
|
||||||
try {
|
try {
|
||||||
const url = node.api_url
|
const url = node.api_url ? `${node.api_url}/sidecar/${rec.container_id}/status`
|
||||||
? `${node.api_url}/sidecar/${rec.container_id}/status`
|
|
||||||
: `http://recorder-${rec.id}:3001/capture/status`;
|
: `http://recorder-${rec.id}:3001/capture/status`;
|
||||||
const r = await fetch(url, { signal: AbortSignal.timeout(2500) });
|
const r = await fetch(url, { signal: AbortSignal.timeout(2500) });
|
||||||
if (r.ok) {
|
if (r.ok) {
|
||||||
const live = await r.json();
|
const live = await r.json();
|
||||||
if (live && live.signal) {
|
if (live && live.signal) {
|
||||||
results[fetchIdx].signal = live.signal;
|
results[fetchIdx].signal = live.signal;
|
||||||
results[fetchIdx].framesReceived = live.framesReceived ?? null;
|
results[fetchIdx].framesReceived = live.framesReceived ?? null;
|
||||||
results[fetchIdx].currentFps = live.currentFps ?? null;
|
results[fetchIdx].currentFps = live.currentFps ?? null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (_) {
|
} catch (_) { results[fetchIdx].signal = 'connecting'; }
|
||||||
results[fetchIdx].signal = 'connecting';
|
|
||||||
}
|
|
||||||
})());
|
})());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
await Promise.all(fetchPromises);
|
await Promise.all(fetchPromises);
|
||||||
res.json(results);
|
res.json(results);
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// GET /:id/ping – probe the node's api_url/health endpoint directly
|
|
||||||
router.get('/:id/ping', async (req, res, next) => {
|
router.get('/:id/ping', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const r = await pool.query(
|
const r = await pool.query('SELECT id, hostname, api_url FROM cluster_nodes WHERE id = $1', [req.params.id]);
|
||||||
'SELECT id, hostname, api_url FROM cluster_nodes WHERE id = $1',
|
|
||||||
[req.params.id]
|
|
||||||
);
|
|
||||||
if (r.rowCount === 0) return res.status(404).json({ error: 'Node not found' });
|
if (r.rowCount === 0) return res.status(404).json({ error: 'Node not found' });
|
||||||
|
|
||||||
const node = r.rows[0];
|
const node = r.rows[0];
|
||||||
if (!node.api_url) return res.json({ reachable: false, reason: 'no api_url registered' });
|
if (!node.api_url) return res.json({ reachable: false, reason: 'no api_url registered' });
|
||||||
|
|
||||||
const start = Date.now();
|
const start = Date.now();
|
||||||
try {
|
try {
|
||||||
const upstream = await fetch(`${node.api_url}/health`, {
|
const upstream = await fetch(`${node.api_url}/health`, { signal: AbortSignal.timeout(4000) });
|
||||||
signal: AbortSignal.timeout(4000),
|
|
||||||
});
|
|
||||||
const latency_ms = Date.now() - start;
|
const latency_ms = Date.now() - start;
|
||||||
const body = await upstream.json().catch(() => ({}));
|
const body = await upstream.json().catch(() => ({}));
|
||||||
res.json({ reachable: upstream.ok, latency_ms, status: upstream.status, agent: body });
|
res.json({ reachable: upstream.ok, latency_ms, status: upstream.status, agent: body });
|
||||||
|
|
@ -448,8 +332,6 @@ router.get('/:id/ping', async (req, res, next) => {
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
// GET /metrics - live per-node utilization (CPU, RAM, GPU)
|
|
||||||
router.get('/metrics', async (req, res, next) => {
|
router.get('/metrics', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const r = await pool.query(
|
const r = await pool.query(
|
||||||
|
|
@ -457,59 +339,37 @@ router.get('/metrics', async (req, res, next) => {
|
||||||
cpu_usage, mem_used_mb, mem_total_mb,
|
cpu_usage, mem_used_mb, mem_total_mb,
|
||||||
capabilities, metrics,
|
capabilities, metrics,
|
||||||
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
|
||||||
FROM cluster_nodes
|
FROM cluster_nodes ORDER BY registered_at ASC`
|
||||||
ORDER BY registered_at ASC`
|
|
||||||
);
|
);
|
||||||
|
|
||||||
const nodes = r.rows.map(row => {
|
const nodes = r.rows.map(row => {
|
||||||
const capGpus = (row.capabilities && row.capabilities.gpus) || [];
|
const capGpus = (row.capabilities && row.capabilities.gpus) || [];
|
||||||
const liveGpus = (row.metrics && row.metrics.gpus) || [];
|
const liveGpus = (row.metrics && row.metrics.gpus) || [];
|
||||||
|
|
||||||
const gpus = capGpus.map((g, idx) => {
|
const gpus = capGpus.map((g, idx) => {
|
||||||
const live = liveGpus.find(l => l.index === g.index) || liveGpus[idx] || {};
|
const live = liveGpus.find(l => l.index === g.index) || liveGpus[idx] || {};
|
||||||
return {
|
return { name: g.name || null, util_pct: live.util_pct != null ? live.util_pct : null,
|
||||||
name: g.name || null,
|
memory_used_mb: live.memory_used_mb != null ? live.memory_used_mb : null,
|
||||||
util_pct: live.util_pct != null ? live.util_pct : null,
|
memory_total_mb: g.memory_mb != null ? g.memory_mb : (live.memory_total_mb ?? null) };
|
||||||
memory_used_mb: live.memory_used_mb != null ? live.memory_used_mb : null,
|
|
||||||
memory_total_mb: g.memory_mb != null ? g.memory_mb : (live.memory_total_mb ?? null),
|
|
||||||
};
|
|
||||||
});
|
});
|
||||||
// include any live GPUs not in static capabilities
|
|
||||||
for (const lg of liveGpus) {
|
for (const lg of liveGpus) {
|
||||||
if (!capGpus.some(g => g.index === lg.index)) {
|
if (!capGpus.some(g => g.index === lg.index)) {
|
||||||
gpus.push({
|
gpus.push({ name: lg.name || null, util_pct: lg.util_pct != null ? lg.util_pct : null,
|
||||||
name: lg.name || null,
|
memory_used_mb: lg.memory_used_mb != null ? lg.memory_used_mb : null,
|
||||||
util_pct: lg.util_pct != null ? lg.util_pct : null,
|
memory_total_mb: lg.memory_total_mb != null ? lg.memory_total_mb : null });
|
||||||
memory_used_mb: lg.memory_used_mb != null ? lg.memory_used_mb : null,
|
|
||||||
memory_total_mb: lg.memory_total_mb != null ? lg.memory_total_mb : null,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return { id: row.id, hostname: row.hostname, role: row.role,
|
||||||
return {
|
online: Number(row.stale_seconds) < 120, last_seen: row.last_seen,
|
||||||
id: row.id,
|
cpu_util_pct: row.cpu_usage != null ? Number(row.cpu_usage) : null,
|
||||||
hostname: row.hostname,
|
ram_used_mb: row.mem_used_mb != null ? row.mem_used_mb : null,
|
||||||
role: row.role,
|
ram_total_mb: row.mem_total_mb != null ? row.mem_total_mb : null, gpus };
|
||||||
online: Number(row.stale_seconds) < 120,
|
|
||||||
last_seen: row.last_seen,
|
|
||||||
cpu_util_pct: row.cpu_usage != null ? Number(row.cpu_usage) : null,
|
|
||||||
ram_used_mb: row.mem_used_mb != null ? row.mem_used_mb : null,
|
|
||||||
ram_total_mb: row.mem_total_mb != null ? row.mem_total_mb : null,
|
|
||||||
gpus,
|
|
||||||
};
|
|
||||||
});
|
});
|
||||||
|
|
||||||
res.json({ nodes });
|
res.json({ nodes });
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
// DELETE /:id – deregister a node
|
|
||||||
router.delete('/:id', async (req, res, next) => {
|
router.delete('/:id', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
const r = await pool.query(
|
const r = await pool.query('DELETE FROM cluster_nodes WHERE id = $1 RETURNING id', [req.params.id]);
|
||||||
'DELETE FROM cluster_nodes WHERE id = $1 RETURNING id',
|
|
||||||
[req.params.id]
|
|
||||||
);
|
|
||||||
if (r.rowCount === 0) return res.status(404).json({ error: 'Node not found' });
|
if (r.rowCount === 0) return res.status(404).json({ error: 'Node not found' });
|
||||||
res.json({ ok: true });
|
res.json({ ok: true });
|
||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue