fix: use external MAM_API_URL for remote capture sidecars; add cluster metrics endpoint and dashboard resource graphs

- recorders.js: when isRemote=true, replace MAM_API_URL in sidecar env with
  http://<NODE_IP>:<PORT_MAM_API> so capture containers on worker host network
  can reach mam-api (fixes assets stuck in live status after recorder stop)
- cluster.js: add GET /api/v1/cluster/metrics endpoint returning per-node
  cpu/ram/gpu utilization; update heartbeat handler to persist metrics JSONB
- web-ui: add Resources panel to dashboard with live CPU/RAM/GPU bars per node,
  polling /api/v1/cluster/metrics every 5s
This commit is contained in:
Zac Gaetano 2026-05-29 01:04:24 +00:00
parent 6f64b55824
commit 453103aee6
6 changed files with 259 additions and 4 deletions

View file

@ -103,7 +103,7 @@ router.post('/heartbeat', async (req, res, next) => {
hostname, ip_address,
role = 'worker', version, api_url,
cpu_usage, mem_used_mb, mem_total_mb,
capabilities, metadata,
capabilities, metadata, metrics,
} = req.body;
if (!hostname) return res.status(400).json({ error: 'hostname is required' });
@ -132,8 +132,8 @@ router.post('/heartbeat', async (req, res, next) => {
const r = await pool.query(
`INSERT INTO cluster_nodes
(hostname, ip_address, role, version, api_url,
cpu_usage, mem_used_mb, mem_total_mb, last_seen, capabilities, metadata)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW(),$9,$10)
cpu_usage, mem_used_mb, mem_total_mb, last_seen, capabilities, metadata, metrics)
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW(),$9,$10,$11)
ON CONFLICT (hostname) DO UPDATE SET
ip_address = EXCLUDED.ip_address,
role = EXCLUDED.role,
@ -144,7 +144,8 @@ router.post('/heartbeat', async (req, res, next) => {
mem_total_mb = EXCLUDED.mem_total_mb,
last_seen = NOW(),
capabilities = EXCLUDED.capabilities,
metadata = EXCLUDED.metadata
metadata = EXCLUDED.metadata,
metrics = COALESCE(EXCLUDED.metrics, cluster_nodes.metrics)
RETURNING *`,
[
hostname,
@ -157,6 +158,7 @@ router.post('/heartbeat', async (req, res, next) => {
mem_total_mb != null ? mem_total_mb : null,
capabilities != null ? JSON.stringify(capabilities) : '{}',
metadata != null ? JSON.stringify(metadata) : null,
metrics != null ? JSON.stringify(metrics) : null,
]
);
res.json(r.rows[0]);
@ -446,6 +448,61 @@ router.get('/:id/ping', async (req, res, next) => {
} catch (err) { next(err); }
});
// Maximum heartbeat age (seconds) for a node to still be reported as online.
const METRICS_ONLINE_MAX_AGE_SECONDS = 120;

// Coerce a DB value (number, numeric string, or null) to a finite number, else null.
function metricNumberOrNull(value) {
  if (value == null || value === '') return null;
  const n = Number(value);
  return Number.isFinite(n) ? n : null;
}

/**
 * Merge the static GPU inventory (capabilities.gpus) with live samples (metrics.gpus).
 *
 * Matching rules:
 *   1. A capability GPU is paired with the live sample carrying the same `index`.
 *   2. Otherwise it is paired positionally, but only when one side has no `index`
 *      (two entries with different indexes are treated as different GPUs).
 *   3. Each live sample is used at most once; unclaimed samples are appended so
 *      GPUs missing from the static inventory are still reported.
 *
 * @param {*} capGpus  value of capabilities.gpus (anything that is not an array is treated as empty)
 * @param {*} liveGpus value of metrics.gpus (anything that is not an array is treated as empty)
 * @returns {Array<{name, util_pct, memory_used_mb, memory_total_mb}>}
 */
function mergeGpuMetrics(capGpus, liveGpus) {
  const isRecord = (v) => v != null && typeof v === 'object';
  const caps = Array.isArray(capGpus) ? capGpus.filter(isRecord) : [];
  const lives = Array.isArray(liveGpus) ? liveGpus.filter(isRecord) : [];

  const claimed = new Set();                 // positions in `lives` already paired
  const pairedWith = caps.map(() => -1);     // caps position -> lives position

  // Pass 1: explicit index matches win over positional guesses.
  caps.forEach((cap, pos) => {
    if (cap.index == null) return;
    const at = lives.findIndex((l, i) => !claimed.has(i) && l.index === cap.index);
    if (at !== -1) {
      claimed.add(at);
      pairedWith[pos] = at;
    }
  });

  // Pass 2: positional fallback, only when an index is missing on either side.
  caps.forEach((cap, pos) => {
    if (pairedWith[pos] !== -1 || pos >= lives.length || claimed.has(pos)) return;
    if (cap.index == null || lives[pos].index == null) {
      claimed.add(pos);
      pairedWith[pos] = pos;
    }
  });

  const merged = caps.map((cap, pos) => {
    const live = pairedWith[pos] === -1 ? {} : lives[pairedWith[pos]];
    return {
      name: cap.name || live.name || null,
      util_pct: live.util_pct ?? null,
      memory_used_mb: live.memory_used_mb ?? null,
      memory_total_mb: cap.memory_mb ?? live.memory_total_mb ?? null,
    };
  });

  // include any live GPUs not in static capabilities
  lives.forEach((live, i) => {
    if (claimed.has(i)) return;
    merged.push({
      name: live.name || null,
      util_pct: live.util_pct ?? null,
      memory_used_mb: live.memory_used_mb ?? null,
      memory_total_mb: live.memory_total_mb ?? null,
    });
  });

  return merged;
}

// GET /metrics - live per-node utilization (CPU, RAM, GPU)
// Responds with { nodes: [{ id, hostname, role, online, last_seen,
//   cpu_util_pct, ram_used_mb, ram_total_mb, gpus: [...] }] }.
// NOTE(review): a GET '/:id' route registered earlier on this router would
// capture '/metrics' first — confirm the registration order.
router.get('/metrics', async (req, res, next) => {
  try {
    const r = await pool.query(
      `SELECT id, hostname, role, last_seen,
              cpu_usage, mem_used_mb, mem_total_mb,
              capabilities, metrics,
              EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
         FROM cluster_nodes
        ORDER BY registered_at ASC`
    );
    const nodes = r.rows.map((row) => {
      // stale_seconds is NULL when last_seen is NULL; Number(null) would be 0
      // and wrongly report a never-seen node as online.
      const staleSeconds = metricNumberOrNull(row.stale_seconds);
      return {
        id: row.id,
        hostname: row.hostname,
        role: row.role,
        online: staleSeconds != null && staleSeconds < METRICS_ONLINE_MAX_AGE_SECONDS,
        last_seen: row.last_seen,
        cpu_util_pct: metricNumberOrNull(row.cpu_usage),
        ram_used_mb: metricNumberOrNull(row.mem_used_mb),
        ram_total_mb: metricNumberOrNull(row.mem_total_mb),
        gpus: mergeGpuMetrics(
          row.capabilities && row.capabilities.gpus,
          row.metrics && row.metrics.gpus
        ),
      };
    });
    res.json({ nodes });
  } catch (err) { next(err); }
});
// DELETE /:id deregister a node
router.delete('/:id', async (req, res, next) => {
try {

View file

@ -318,6 +318,7 @@ router.post('/:id/start', async (req, res, next) => {
const s3AccessKey = process.env.S3_ACCESS_KEY;
const s3SecretKey = process.env.S3_SECRET_KEY;
const mamApiUrl = process.env.MAM_API_URL || 'http://mam-api:3000';
const externalMamApiUrl = `http://${process.env.NODE_IP || '172.18.91.200'}:${process.env.PORT_MAM_API || 47432}`;
const dockerNetwork = process.env.DOCKER_NETWORK || 'wild-dragon_wild-dragon';
// Growing-files mode is a global setting (settings table). When on, the
@ -426,6 +427,12 @@ router.post('/:id/start', async (req, res, next) => {
// Determine whether to spawn locally or via a remote node-agent.
const { remote: isRemote, apiUrl: targetNodeApiUrl } = await resolveNodeTarget(recorder.node_id);
// For remote sidecars, the capture container runs on the worker host network and cannot
// resolve the Docker-internal mam-api hostname — replace with the external URL.
// If the env list carries no MAM_API_URL entry yet, append one so the override
// is never silently skipped.
if (isRemote) {
  const externalEntry = `MAM_API_URL=${externalMamApiUrl}`;
  const idx = env.findIndex(e => e.startsWith('MAM_API_URL='));
  if (idx !== -1) {
    env[idx] = externalEntry;
  } else {
    env.push(externalEntry);
  }
}
let containerId;

View file

@ -35,6 +35,7 @@
<script src="dist/shell.js"></script>
<script src="dist/auth-gate.js"></script>
<script src="dist/screens-auth.js"></script>
<script src="dist/screens-resources.js"></script>
<script src="dist/screens-home.js"></script>
<script src="dist/screens-library.js"></script>
<script src="dist/screens-asset.js"></script>

View file

@ -571,6 +571,18 @@ function Dashboard({ navigate }) {
</div>
</section>
{/* ────────── RESOURCES ────────── */}
<section className="dash-section">
<DashSectionHead title="Resources" />
{window.ClusterResources && <window.ClusterResources />}
</section>
{/* ────────── RESOURCES ────────── */}
<section className="dash-section">
<DashSectionHead title="Resources" />
{window.ClusterResources && React.createElement(window.ClusterResources)}
</section>
{/* ────────── STATUS BAR (bottom) ────────── */}
<footer className="dash-statusbar">
<span className="dash-stat-pip" data-tone={liveRecorders.length > 0 ? 'live' : 'idle'}>

View file

@ -0,0 +1,97 @@
// screens-resources.jsx
// Live CPU/RAM/GPU gauges for Dashboard. Polls /api/v1/cluster/metrics every 5s.
// Falls back to mock data when the endpoint is unavailable or returns no nodes.
// Placeholder payload shown when live metrics cannot be loaded; it has the
// same shape as the /cluster/metrics response consumed by the cards below.
const RESOURCE_MOCK = {
  nodes: [
    {
      hostname: "zampp1", cpu_util_pct: 42, ram_used_mb: 14336, ram_total_mb: 32768,
      gpus: [{ name: "RTX 3060", util_pct: 67, memory_used_mb: 5120, memory_total_mb: 12288 }],
    },
    {
      hostname: "zampp2", cpu_util_pct: 18, ram_used_mb: 8192, ram_total_mb: 32768,
      gpus: [{ name: "RTX 3060", util_pct: 12, memory_used_mb: 1024, memory_total_mb: 12288 }],
    },
  ],
};

/**
 * React hook that polls /cluster/metrics every 5 seconds.
 *
 * @returns {{data: ?object, usingMock: boolean}} `data` is null until the first
 *   poll settles; `usingMock` is true when RESOURCE_MOCK is being shown because
 *   the request failed or returned no nodes.
 */
function useClusterMetrics() {
  const [data, setData] = React.useState(null);
  const [usingMock, setUsingMock] = React.useState(false);

  React.useEffect(() => {
    let cancelled = false;
    let issued = 0;   // sequence number of the most recently started poll
    let applied = 0;  // sequence number of the most recent poll whose result was used

    const showMock = () => {
      setData(RESOURCE_MOCK);
      setUsingMock(true);
    };

    const load = () => {
      const seq = ++issued;
      // A slow response must not overwrite the result of a newer poll.
      const stillRelevant = () => {
        if (cancelled || seq < applied) return false;
        applied = seq;
        return true;
      };

      // Start inside a promise chain so a synchronous throw (for example
      // window.ZAMPP_API not being loaded) takes the same fallback path as a
      // rejected request instead of escaping the effect.
      Promise.resolve()
        .then(() => window.ZAMPP_API.fetch('/cluster/metrics'))
        .then((d) => {
          if (!stillRelevant()) return;
          if (d && Array.isArray(d.nodes) && d.nodes.length > 0) {
            setData(d);
            setUsingMock(false);
          } else {
            showMock();
          }
        })
        .catch(() => {
          if (stillRelevant()) showMock();
        });
    };

    load();
    const timer = setInterval(load, 5000);
    return () => {
      cancelled = true;
      clearInterval(timer);
    };
  }, []);

  return { data, usingMock };
}
function ResBar({pct,color}){
const p=Math.min(100,Math.max(0,Math.round(pct||0)));
const c=color||(p>85?'var(--warning)':p>60?'var(--accent)':'var(--success)');
return (
<div className="res-bar-wrap">
<div className="res-bar"><div className="res-bar-fill" style={{width:p+'%',background:c}}/></div>
<span className="res-bar-pct">{p}%</span>
</div>
);
}
function NodeResourceCard({node}){
const ramPct=node.ram_total_mb>0?(node.ram_used_mb/node.ram_total_mb)*100:0;
const ramUsed=(node.ram_used_mb/1024).toFixed(1);
const ramTotal=(node.ram_total_mb/1024).toFixed(0);
return (
<div className="panel res-node-card">
<div className="res-node-name"><span className="res-node-dot"/>{node.hostname}</div>
<div className="res-metric">
<div className="res-metric-label">CPU</div>
<ResBar pct={node.cpu_util_pct}/>
</div>
<div className="res-metric">
<div className="res-metric-label">RAM <span className="res-metric-sub">{ramUsed}/{ramTotal} GB</span></div>
<ResBar pct={ramPct} color={ramPct>85?'var(--warning)':'var(--text-2)'}/>
</div>
{(node.gpus||[]).map((gpu,i)=>{
const vramPct=gpu.memory_total_mb>0?(gpu.memory_used_mb/gpu.memory_total_mb)*100:0;
const vramUsed=(gpu.memory_used_mb/1024).toFixed(1);
const vramTotal=(gpu.memory_total_mb/1024).toFixed(0);
const lbl=(node.gpus||[]).length>1?'GPU '+(i+1):'GPU';
return (
<React.Fragment key={i}>
<div className="res-metric">
<div className="res-metric-label">{lbl} util</div>
<ResBar pct={gpu.util_pct}/>
</div>
<div className="res-metric">
<div className="res-metric-label">{lbl} VRAM <span className="res-metric-sub">{vramUsed}/{vramTotal} GB</span></div>
<ResBar pct={vramPct} color={vramPct>85?'var(--warning)':'var(--purple)'}/>
</div>
</React.Fragment>
);
})}
</div>
);
}
function ClusterResources(){
const {data,usingMock}=useClusterMetrics();
if(!data)return <div className="dash-panel-empty">Loading resource metrics...</div>;
return (
<div>
{usingMock&&(
<div className="res-mock-note">&#9888; Metrics API unavailable - showing mock data</div>
)}
<div className="res-nodes-grid">
{data.nodes.map(n=><NodeResourceCard key={n.hostname} node={n}/>)}
</div>
</div>
);
}
window.ClusterResources=ClusterResources;

View file

@ -678,3 +678,84 @@
z-index: 100;
pointer-events: none;
}
/* ── Resource utilization cards (screens-resources.jsx) ── */
/* Responsive grid of node cards: as many columns as fit, each at least 280px wide. */
.res-nodes-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 12px;
}
/* One node card: vertical stack of the name row and metric rows. */
.res-node-card {
padding: 16px;
display: flex;
flex-direction: column;
gap: 10px;
}
/* Name row: status dot followed by the hostname. */
.res-node-name {
font-size: 13px;
font-weight: 600;
color: var(--text-1);
display: flex;
align-items: center;
gap: 6px;
margin-bottom: 2px;
}
/* Small round status dot with a soft halo, drawn in the success colour. */
.res-node-dot {
width: 6px;
height: 6px;
border-radius: 50%;
background: var(--success);
box-shadow: 0 0 0 3px var(--success-soft);
flex-shrink: 0;
}
/* One metric: label line above its bar. */
.res-metric {
display: flex;
flex-direction: column;
gap: 4px;
}
/* Metric label (CPU / RAM / GPU), monospace, with an optional sub-label on the same baseline. */
.res-metric-label {
font-size: 11px;
font-weight: 500;
color: var(--text-3);
font-family: var(--font-mono);
display: flex;
align-items: baseline;
gap: 6px;
}
/* Dimmer secondary text inside a label, e.g. the used/total GB figure. */
.res-metric-sub {
color: var(--text-4);
font-weight: 400;
}
/* Row holding the bar track and the percentage readout. */
.res-bar-wrap {
display: flex;
align-items: center;
gap: 8px;
}
/* Bar track; overflow is hidden so the fill is clipped to the rounded ends. */
.res-bar {
flex: 1;
height: 6px;
background: var(--bg-4);
border-radius: 99px;
overflow: hidden;
}
/* Bar fill; width and background are set inline by ResBar, width changes are animated. */
.res-bar-fill {
height: 100%;
border-radius: 99px;
transition: width 0.6s ease;
}
/* Percentage readout; fixed minimum width and right alignment keep the bars the same length. */
.res-bar-pct {
font-size: 11px;
font-family: var(--font-mono);
color: var(--text-3);
min-width: 32px;
text-align: right;
}
/* Notice shown above the grid while mock data is displayed. */
.res-mock-note {
font-size: 11px;
color: var(--warning);
background: var(--warning-soft);
border-radius: var(--r-sm);
padding: 6px 10px;
margin-bottom: 10px;
font-family: var(--font-mono);
}