fix: use external MAM_API_URL for remote capture sidecars; add cluster metrics endpoint and dashboard resource graphs
- recorders.js: when isRemote=true, replace MAM_API_URL in the sidecar env with http://<NODE_IP>:<PORT_MAM_API> so capture containers on the worker host network can reach mam-api (fixes assets stuck in live status after recorder stop)
- cluster.js: add GET /api/v1/cluster/metrics endpoint returning per-node CPU/RAM/GPU utilization; update the heartbeat handler to persist the metrics JSONB
- web-ui: add a Resources panel to the dashboard with live CPU/RAM/GPU bars per node, polling /api/v1/cluster/metrics every 5s
This commit is contained in:
parent
6f64b55824
commit
453103aee6
6 changed files with 259 additions and 4 deletions
|
|
@ -103,7 +103,7 @@ router.post('/heartbeat', async (req, res, next) => {
|
|||
hostname, ip_address,
|
||||
role = 'worker', version, api_url,
|
||||
cpu_usage, mem_used_mb, mem_total_mb,
|
||||
capabilities, metadata,
|
||||
capabilities, metadata, metrics,
|
||||
} = req.body;
|
||||
|
||||
if (!hostname) return res.status(400).json({ error: 'hostname is required' });
|
||||
|
|
@ -132,8 +132,8 @@ router.post('/heartbeat', async (req, res, next) => {
|
|||
const r = await pool.query(
|
||||
`INSERT INTO cluster_nodes
|
||||
(hostname, ip_address, role, version, api_url,
|
||||
cpu_usage, mem_used_mb, mem_total_mb, last_seen, capabilities, metadata)
|
||||
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW(),$9,$10)
|
||||
cpu_usage, mem_used_mb, mem_total_mb, last_seen, capabilities, metadata, metrics)
|
||||
VALUES ($1,$2,$3,$4,$5,$6,$7,$8,NOW(),$9,$10,$11)
|
||||
ON CONFLICT (hostname) DO UPDATE SET
|
||||
ip_address = EXCLUDED.ip_address,
|
||||
role = EXCLUDED.role,
|
||||
|
|
@ -144,7 +144,8 @@ router.post('/heartbeat', async (req, res, next) => {
|
|||
mem_total_mb = EXCLUDED.mem_total_mb,
|
||||
last_seen = NOW(),
|
||||
capabilities = EXCLUDED.capabilities,
|
||||
metadata = EXCLUDED.metadata
|
||||
metadata = EXCLUDED.metadata,
|
||||
metrics = COALESCE(EXCLUDED.metrics, cluster_nodes.metrics)
|
||||
RETURNING *`,
|
||||
[
|
||||
hostname,
|
||||
|
|
@ -157,6 +158,7 @@ router.post('/heartbeat', async (req, res, next) => {
|
|||
mem_total_mb != null ? mem_total_mb : null,
|
||||
capabilities != null ? JSON.stringify(capabilities) : '{}',
|
||||
metadata != null ? JSON.stringify(metadata) : null,
|
||||
metrics != null ? JSON.stringify(metrics) : null,
|
||||
]
|
||||
);
|
||||
res.json(r.rows[0]);
|
||||
|
|
@ -446,6 +448,61 @@ router.get('/:id/ping', async (req, res, next) => {
|
|||
} catch (err) { next(err); }
|
||||
});
|
||||
|
||||
|
||||
// GET /metrics - live per-node utilization (CPU, RAM, GPU)
// NOTE(review): assuming this is an Express router, routes match in
// registration order; if a `GET /:id` route is registered earlier on this
// router it would capture `/metrics` — confirm the mount order.

/**
 * Combine a node's static GPU inventory with its most recent live GPU samples.
 *
 * Pairing rules:
 *   1. A capability GPU and a live sample that both carry the same non-null
 *      `index` are paired.
 *   2. Remaining capability GPUs are paired with the live sample at the same
 *      array position, but only when at least one side has no `index`
 *      (two entries with different explicit indexes are different devices).
 *   3. Live samples that were never paired are appended as extra GPUs.
 * Each live sample is used at most once, so a GPU is never reported twice.
 *
 * @param {object|null} capabilities - node capabilities; `gpus` entries may carry `index`, `name`, `memory_mb`
 * @param {object|null} metrics - latest node metrics; `gpus` entries may carry `index`, `name`, `util_pct`, `memory_used_mb`, `memory_total_mb`
 * @returns {Array<{name: (string|null), util_pct: (number|null), memory_used_mb: (number|null), memory_total_mb: (number|null)}>}
 */
function mergeGpuMetrics(capabilities, metrics) {
  // Anything that is not an array (missing key, scalar count, ...) is treated as "no GPUs".
  const capGpus = Array.isArray(capabilities?.gpus)
    ? capabilities.gpus.filter((g) => g != null)
    : [];
  const liveGpus = Array.isArray(metrics?.gpus)
    ? metrics.gpus.filter((g) => g != null)
    : [];

  const claimed = new Set();                 // positions in liveGpus already paired
  const pairing = capGpus.map(() => -1);     // capGpus position -> liveGpus position

  // Pass 1: explicit index matches. A missing index never matches a missing
  // index (undefined === undefined would otherwise pair everything with the
  // first live sample).
  capGpus.forEach((cap, pos) => {
    if (cap.index == null) return;
    const at = liveGpus.findIndex((live, i) => !claimed.has(i) && live.index === cap.index);
    if (at !== -1) {
      pairing[pos] = at;
      claimed.add(at);
    }
  });

  // Pass 2: positional fallback for entries that lack an index on either side.
  capGpus.forEach((cap, pos) => {
    if (pairing[pos] !== -1 || pos >= liveGpus.length || claimed.has(pos)) return;
    if (cap.index != null && liveGpus[pos].index != null) return;
    pairing[pos] = pos;
    claimed.add(pos);
  });

  const gpus = capGpus.map((cap, pos) => {
    const live = pairing[pos] === -1 ? {} : liveGpus[pairing[pos]];
    return {
      name: cap.name || null,
      util_pct: live.util_pct ?? null,
      memory_used_mb: live.memory_used_mb ?? null,
      // Static capability size wins; fall back to what the live sample reports.
      memory_total_mb: cap.memory_mb ?? live.memory_total_mb ?? null,
    };
  });

  // include any live GPUs not in static capabilities
  liveGpus.forEach((live, at) => {
    if (claimed.has(at)) return;
    gpus.push({
      name: live.name || null,
      util_pct: live.util_pct ?? null,
      memory_used_mb: live.memory_used_mb ?? null,
      memory_total_mb: live.memory_total_mb ?? null,
    });
  });

  return gpus;
}

/**
 * Responds with `{ nodes: [...] }`, one entry per row of `cluster_nodes`
 * ordered by `registered_at`. Each entry carries id, hostname, role, an
 * `online` flag, `last_seen`, CPU/RAM figures and the merged GPU list.
 * Query or mapping errors are forwarded to `next`.
 */
router.get('/metrics', async (req, res, next) => {
  // A node counts as online when its last heartbeat is younger than this.
  const onlineWindowSeconds = 120;

  try {
    const r = await pool.query(
      `SELECT id, hostname, role, last_seen,
              cpu_usage, mem_used_mb, mem_total_mb,
              capabilities, metrics,
              EXTRACT(EPOCH FROM (NOW() - last_seen)) AS stale_seconds
       FROM cluster_nodes
       ORDER BY registered_at ASC`
    );

    const nodes = r.rows.map((row) => ({
      id: row.id,
      hostname: row.hostname,
      role: row.role,
      // stale_seconds is NULL when last_seen is NULL; Number(null) is 0, so
      // guard explicitly instead of reporting a never-seen node as online.
      online: row.stale_seconds != null && Number(row.stale_seconds) < onlineWindowSeconds,
      last_seen: row.last_seen,
      cpu_util_pct: row.cpu_usage != null ? Number(row.cpu_usage) : null,
      ram_used_mb: row.mem_used_mb ?? null,
      ram_total_mb: row.mem_total_mb ?? null,
      gpus: mergeGpuMetrics(row.capabilities, row.metrics),
    }));

    res.json({ nodes });
  } catch (err) { next(err); }
});
|
||||
|
||||
// DELETE /:id – deregister a node
|
||||
router.delete('/:id', async (req, res, next) => {
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -318,6 +318,7 @@ router.post('/:id/start', async (req, res, next) => {
|
|||
const s3AccessKey = process.env.S3_ACCESS_KEY;
|
||||
const s3SecretKey = process.env.S3_SECRET_KEY;
|
||||
const mamApiUrl = process.env.MAM_API_URL || 'http://mam-api:3000';
|
||||
const externalMamApiUrl = `http://${process.env.NODE_IP || '172.18.91.200'}:${process.env.PORT_MAM_API || 47432}`;
|
||||
const dockerNetwork = process.env.DOCKER_NETWORK || 'wild-dragon_wild-dragon';
|
||||
|
||||
// Growing-files mode is a global setting (settings table). When on, the
|
||||
|
|
@ -426,6 +427,12 @@ router.post('/:id/start', async (req, res, next) => {
|
|||
|
||||
// Determine whether to spawn locally or via a remote node-agent.
|
||||
const { remote: isRemote, apiUrl: targetNodeApiUrl } = await resolveNodeTarget(recorder.node_id);
|
||||
// For remote sidecars, the capture container runs on the worker host network and cannot
|
||||
// resolve the Docker-internal mam-api hostname — replace with the external URL.
|
||||
if (isRemote) {
|
||||
const idx = env.findIndex(e => e.startsWith('MAM_API_URL='));
|
||||
if (idx !== -1) env[idx] = `MAM_API_URL=${externalMamApiUrl}`;
|
||||
}
|
||||
|
||||
let containerId;
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@
|
|||
<script src="dist/shell.js"></script>
|
||||
<script src="dist/auth-gate.js"></script>
|
||||
<script src="dist/screens-auth.js"></script>
|
||||
<script src="dist/screens-resources.js"></script>
|
||||
<script src="dist/screens-home.js"></script>
|
||||
<script src="dist/screens-library.js"></script>
|
||||
<script src="dist/screens-asset.js"></script>
|
||||
|
|
|
|||
|
|
@ -571,6 +571,18 @@ function Dashboard({ navigate }) {
|
|||
</div>
|
||||
</section>
|
||||
|
||||
{/* ────────── RESOURCES ────────── */}
|
||||
<section className="dash-section">
|
||||
<DashSectionHead title="Resources" />
|
||||
{window.ClusterResources && <window.ClusterResources />}
|
||||
</section>
|
||||
|
||||
{/* ────────── RESOURCES ────────── */}
|
||||
<section className="dash-section">
|
||||
<DashSectionHead title="Resources" />
|
||||
{window.ClusterResources && React.createElement(window.ClusterResources)}
|
||||
</section>
|
||||
|
||||
{/* ────────── STATUS BAR (bottom) ────────── */}
|
||||
<footer className="dash-statusbar">
|
||||
<span className="dash-stat-pip" data-tone={liveRecorders.length > 0 ? 'live' : 'idle'}>
|
||||
|
|
|
|||
97
services/web-ui/public/screens-resources.jsx
Normal file
97
services/web-ui/public/screens-resources.jsx
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
// screens-resources.jsx
|
||||
// Live CPU/RAM/GPU gauges for Dashboard. Polls /api/v1/cluster/metrics every 5s.
|
||||
// Falls back to mock data when the endpoint is unavailable or returns no nodes.
|
||||
|
||||
// Placeholder payload shown when no usable live node list is available.
// Shaped like the `{ nodes: [...] }` object the metrics hook accepts, with the
// per-node fields the resource cards read (hostname, CPU %, RAM MB, GPU list).
const RESOURCE_MOCK={nodes:[
|
||||
{hostname:"zampp1",cpu_util_pct:42,ram_used_mb:14336,ram_total_mb:32768,
|
||||
gpus:[{name:"RTX 3060",util_pct:67,memory_used_mb:5120,memory_total_mb:12288}]},
|
||||
{hostname:"zampp2",cpu_util_pct:18,ram_used_mb:8192,ram_total_mb:32768,
|
||||
gpus:[{name:"RTX 3060",util_pct:12,memory_used_mb:1024,memory_total_mb:12288}]},
|
||||
]};
|
||||
|
||||
/**
 * React hook that polls `/cluster/metrics` through `window.ZAMPP_API.fetch`
 * once on mount and then every 5 seconds.
 *
 * @returns {{data: (object|null), usingMock: boolean}} `data` is null until the
 *   first poll settles; afterwards it is either the live payload (when it
 *   contains a non-empty `nodes` array) or `RESOURCE_MOCK`, in which case
 *   `usingMock` is true.
 */
function useClusterMetrics(){
  const [data,setData]=React.useState(null);
  const [usingMock,setUsingMock]=React.useState(false);

  React.useEffect(()=>{
    let cancelled=false;
    let issued=0;   // ticket number of the most recently started poll
    let applied=0;  // ticket number of the newest poll whose result was applied

    const showMock=()=>{setData(RESOURCE_MOCK);setUsingMock(true);};

    // A poll may update state only if the hook is still mounted and no newer
    // poll has already been applied (slow responses can arrive out of order).
    const claim=(ticket)=>{
      if(cancelled||ticket<applied)return false;
      applied=ticket;
      return true;
    };

    const load=()=>{
      const ticket=++issued;
      // Calling fetch inside the executor turns a synchronous throw (for
      // example when window.ZAMPP_API is not defined) into a rejection, so it
      // takes the same mock fallback as a failed request instead of escaping
      // the effect.
      new Promise((resolve)=>resolve(window.ZAMPP_API.fetch('/cluster/metrics')))
        .then((payload)=>{
          if(!claim(ticket))return;
          if(payload&&Array.isArray(payload.nodes)&&payload.nodes.length>0){
            setData(payload);
            setUsingMock(false);
          }else{
            showMock();
          }
        })
        .catch(()=>{
          if(claim(ticket))showMock();
        });
    };

    load();
    const timer=setInterval(load,5000);
    return ()=>{cancelled=true;clearInterval(timer);};
  },[]);

  return {data,usingMock};
}
|
||||
|
||||
function ResBar({pct,color}){
|
||||
const p=Math.min(100,Math.max(0,Math.round(pct||0)));
|
||||
const c=color||(p>85?'var(--warning)':p>60?'var(--accent)':'var(--success)');
|
||||
return (
|
||||
<div className="res-bar-wrap">
|
||||
<div className="res-bar"><div className="res-bar-fill" style={{width:p+'%',background:c}}/></div>
|
||||
<span className="res-bar-pct">{p}%</span>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function NodeResourceCard({node}){
|
||||
const ramPct=node.ram_total_mb>0?(node.ram_used_mb/node.ram_total_mb)*100:0;
|
||||
const ramUsed=(node.ram_used_mb/1024).toFixed(1);
|
||||
const ramTotal=(node.ram_total_mb/1024).toFixed(0);
|
||||
return (
|
||||
<div className="panel res-node-card">
|
||||
<div className="res-node-name"><span className="res-node-dot"/>{node.hostname}</div>
|
||||
<div className="res-metric">
|
||||
<div className="res-metric-label">CPU</div>
|
||||
<ResBar pct={node.cpu_util_pct}/>
|
||||
</div>
|
||||
<div className="res-metric">
|
||||
<div className="res-metric-label">RAM <span className="res-metric-sub">{ramUsed}/{ramTotal} GB</span></div>
|
||||
<ResBar pct={ramPct} color={ramPct>85?'var(--warning)':'var(--text-2)'}/>
|
||||
</div>
|
||||
{(node.gpus||[]).map((gpu,i)=>{
|
||||
const vramPct=gpu.memory_total_mb>0?(gpu.memory_used_mb/gpu.memory_total_mb)*100:0;
|
||||
const vramUsed=(gpu.memory_used_mb/1024).toFixed(1);
|
||||
const vramTotal=(gpu.memory_total_mb/1024).toFixed(0);
|
||||
const lbl=(node.gpus||[]).length>1?'GPU '+(i+1):'GPU';
|
||||
return (
|
||||
<React.Fragment key={i}>
|
||||
<div className="res-metric">
|
||||
<div className="res-metric-label">{lbl} util</div>
|
||||
<ResBar pct={gpu.util_pct}/>
|
||||
</div>
|
||||
<div className="res-metric">
|
||||
<div className="res-metric-label">{lbl} VRAM <span className="res-metric-sub">{vramUsed}/{vramTotal} GB</span></div>
|
||||
<ResBar pct={vramPct} color={vramPct>85?'var(--warning)':'var(--purple)'}/>
|
||||
</div>
|
||||
</React.Fragment>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ClusterResources(){
|
||||
const {data,usingMock}=useClusterMetrics();
|
||||
if(!data)return <div className="dash-panel-empty">Loading resource metrics...</div>;
|
||||
return (
|
||||
<div>
|
||||
{usingMock&&(
|
||||
<div className="res-mock-note">⚠ Metrics API unavailable - showing mock data</div>
|
||||
)}
|
||||
<div className="res-nodes-grid">
|
||||
{data.nodes.map(n=><NodeResourceCard key={n.hostname} node={n}/>)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
window.ClusterResources=ClusterResources;
|
||||
|
|
@ -678,3 +678,84 @@
|
|||
z-index: 100;
|
||||
pointer-events: none;
|
||||
}
|
||||
|
||||
/* ── Resource utilization cards (screens-resources.jsx) ── */
|
||||
.res-nodes-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
|
||||
gap: 12px;
|
||||
}
|
||||
.res-node-card {
|
||||
padding: 16px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 10px;
|
||||
}
|
||||
.res-node-name {
|
||||
font-size: 13px;
|
||||
font-weight: 600;
|
||||
color: var(--text-1);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
margin-bottom: 2px;
|
||||
}
|
||||
.res-node-dot {
|
||||
width: 6px;
|
||||
height: 6px;
|
||||
border-radius: 50%;
|
||||
background: var(--success);
|
||||
box-shadow: 0 0 0 3px var(--success-soft);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.res-metric {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 4px;
|
||||
}
|
||||
.res-metric-label {
|
||||
font-size: 11px;
|
||||
font-weight: 500;
|
||||
color: var(--text-3);
|
||||
font-family: var(--font-mono);
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
gap: 6px;
|
||||
}
|
||||
.res-metric-sub {
|
||||
color: var(--text-4);
|
||||
font-weight: 400;
|
||||
}
|
||||
.res-bar-wrap {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
}
|
||||
.res-bar {
|
||||
flex: 1;
|
||||
height: 6px;
|
||||
background: var(--bg-4);
|
||||
border-radius: 99px;
|
||||
overflow: hidden;
|
||||
}
|
||||
.res-bar-fill {
|
||||
height: 100%;
|
||||
border-radius: 99px;
|
||||
transition: width 0.6s ease;
|
||||
}
|
||||
.res-bar-pct {
|
||||
font-size: 11px;
|
||||
font-family: var(--font-mono);
|
||||
color: var(--text-3);
|
||||
min-width: 32px;
|
||||
text-align: right;
|
||||
}
|
||||
.res-mock-note {
|
||||
font-size: 11px;
|
||||
color: var(--warning);
|
||||
background: var(--warning-soft);
|
||||
border-radius: var(--r-sm);
|
||||
padding: 6px 10px;
|
||||
margin-bottom: 10px;
|
||||
font-family: var(--font-mono);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue