From 453103aee6934a9b5365715f0353fbe7c7613d66 Mon Sep 17 00:00:00 2001 From: Zac Gaetano Date: Fri, 29 May 2026 01:04:24 +0000 Subject: [PATCH] fix: use external MAM_API_URL for remote capture sidecars; add cluster metrics endpoint and dashboard resource graphs - recorders.js: when isRemote=true, replace MAM_API_URL in sidecar env with http://: so capture containers on worker host network can reach mam-api (fixes assets stuck in live status after recorder stop) - cluster.js: add GET /api/v1/cluster/metrics endpoint returning per-node cpu/ram/gpu utilization; update heartbeat handler to persist metrics JSONB - web-ui: add Resources panel to dashboard with live CPU/RAM/GPU bars per node, polling /api/v1/cluster/metrics every 5s --- services/mam-api/src/routes/cluster.js | 65 ++++++++++++- services/mam-api/src/routes/recorders.js | 7 ++ services/web-ui/public/index.html | 1 + services/web-ui/public/screens-home.jsx | 12 +++ services/web-ui/public/screens-resources.jsx | 97 ++++++++++++++++++++ services/web-ui/public/styles-fixes.css | 81 ++++++++++++++++ 6 files changed, 259 insertions(+), 4 deletions(-) create mode 100644 services/web-ui/public/screens-resources.jsx diff --git a/services/mam-api/src/routes/cluster.js b/services/mam-api/src/routes/cluster.js index 327055f..174ca49 100644 --- a/services/mam-api/src/routes/cluster.js +++ b/services/mam-api/src/routes/cluster.js @@ -103,7 +103,7 @@ router.post('/heartbeat', async (req, res, next) => { hostname, ip_address, role = 'worker', version, api_url, cpu_usage, mem_used_mb, mem_total_mb, - capabilities, metadata, + capabilities, metadata, metrics, } = req.body; if (!hostname) return res.status(400).json({ error: 'hostname is required' }); @@ -132,8 +132,8 @@ router.post('/heartbeat', async (req, res, next) => { const r = await pool.query( `INSERT INTO cluster_nodes (hostname, ip_address, role, version, api_url, - cpu_usage, mem_used_mb, mem_total_mb, last_seen, capabilities, metadata) - VALUES 
($1,$2,$3,$4,$5,$6,$7,$8,NOW(),$9,$10) + cpu_usage, mem_used_mb, mem_total_mb, last_seen, capabilities, metadata, metrics) + VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW(),$9,$10,$11) ON CONFLICT (hostname) DO UPDATE SET ip_address = EXCLUDED.ip_address, role = EXCLUDED.role, @@ -144,7 +144,8 @@ router.post('/heartbeat', async (req, res, next) => { mem_total_mb = EXCLUDED.mem_total_mb, last_seen = NOW(), capabilities = EXCLUDED.capabilities, - metadata = EXCLUDED.metadata + metadata = EXCLUDED.metadata, + metrics = COALESCE(EXCLUDED.metrics, cluster_nodes.metrics) RETURNING *`, [ hostname, @@ -157,6 +158,7 @@ router.post('/heartbeat', async (req, res, next) => { mem_total_mb != null ? mem_total_mb : null, capabilities != null ? JSON.stringify(capabilities) : '{}', metadata != null ? JSON.stringify(metadata) : null, + metrics != null ? JSON.stringify(metrics) : null, ] ); res.json(r.rows[0]); @@ -446,6 +448,61 @@ router.get('/:id/ping', async (req, res, next) => { } catch (err) { next(err); } }); + +// GET /metrics - live per-node utilization (CPU, RAM, GPU) +router.get('/metrics', async (req, res, next) => { + try { + const r = await pool.query( + `SELECT id, hostname, role, last_seen, + cpu_usage, mem_used_mb, mem_total_mb, + capabilities, metrics, + EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds + FROM cluster_nodes + ORDER BY registered_at ASC` + ); + + const nodes = r.rows.map(row => { + const capGpus = (row.capabilities && row.capabilities.gpus) || []; + const liveGpus = (row.metrics && row.metrics.gpus) || []; + + const gpus = capGpus.map((g, idx) => { + const live = liveGpus.find(l => l.index === g.index) || liveGpus[idx] || {}; + return { + name: g.name || null, + util_pct: live.util_pct != null ? live.util_pct : null, + memory_used_mb: live.memory_used_mb != null ? live.memory_used_mb : null, + memory_total_mb: g.memory_mb != null ? g.memory_mb : (live.memory_total_mb ?? 
null), + }; + }); + // include any live GPUs not in static capabilities + for (const lg of liveGpus) { + if (!capGpus.some(g => g.index === lg.index)) { + gpus.push({ + name: lg.name || null, + util_pct: lg.util_pct != null ? lg.util_pct : null, + memory_used_mb: lg.memory_used_mb != null ? lg.memory_used_mb : null, + memory_total_mb: lg.memory_total_mb != null ? lg.memory_total_mb : null, + }); + } + } + + return { + id: row.id, + hostname: row.hostname, + role: row.role, + online: Number(row.stale_seconds) < 120, + last_seen: row.last_seen, + cpu_util_pct: row.cpu_usage != null ? Number(row.cpu_usage) : null, + ram_used_mb: row.mem_used_mb != null ? row.mem_used_mb : null, + ram_total_mb: row.mem_total_mb != null ? row.mem_total_mb : null, + gpus, + }; + }); + + res.json({ nodes }); + } catch (err) { next(err); } +}); + // DELETE /:id – deregister a node router.delete('/:id', async (req, res, next) => { try { diff --git a/services/mam-api/src/routes/recorders.js b/services/mam-api/src/routes/recorders.js index 2511133..7defc88 100644 --- a/services/mam-api/src/routes/recorders.js +++ b/services/mam-api/src/routes/recorders.js @@ -318,6 +318,7 @@ router.post('/:id/start', async (req, res, next) => { const s3AccessKey = process.env.S3_ACCESS_KEY; const s3SecretKey = process.env.S3_SECRET_KEY; const mamApiUrl = process.env.MAM_API_URL || 'http://mam-api:3000'; + const externalMamApiUrl = `http://${process.env.NODE_IP || '172.18.91.200'}:${process.env.PORT_MAM_API || 47432}`; const dockerNetwork = process.env.DOCKER_NETWORK || 'wild-dragon_wild-dragon'; // Growing-files mode is a global setting (settings table). When on, the @@ -426,6 +427,12 @@ router.post('/:id/start', async (req, res, next) => { // Determine whether to spawn locally or via a remote node-agent. 
const { remote: isRemote, apiUrl: targetNodeApiUrl } = await resolveNodeTarget(recorder.node_id); + // For remote sidecars, the capture container runs on the worker host network and cannot + // resolve the Docker-internal mam-api hostname — replace with the external URL. + if (isRemote) { + const idx = env.findIndex(e => e.startsWith('MAM_API_URL=')); + if (idx !== -1) env[idx] = `MAM_API_URL=${externalMamApiUrl}`; + } let containerId; diff --git a/services/web-ui/public/index.html b/services/web-ui/public/index.html index bc933c4..51c5869 100644 --- a/services/web-ui/public/index.html +++ b/services/web-ui/public/index.html @@ -35,6 +35,7 @@ + diff --git a/services/web-ui/public/screens-home.jsx b/services/web-ui/public/screens-home.jsx index bd39b01..8b14e02 100644 --- a/services/web-ui/public/screens-home.jsx +++ b/services/web-ui/public/screens-home.jsx @@ -571,6 +571,18 @@ function Dashboard({ navigate }) { + {/* ────────── RESOURCES ────────── */} +
+ + {window.ClusterResources && <window.ClusterResources />} +
+ + {/* ────────── RESOURCES ────────── */} +
+ + {window.ClusterResources && React.createElement(window.ClusterResources)} +
+ {/* ────────── STATUS BAR (bottom) ────────── */}