From 74299629e664590f32995300bb97f087aebaf5fe Mon Sep 17 00:00:00 2001 From: ZGaetano Date: Wed, 20 May 2026 17:25:11 -0400 Subject: [PATCH] feat: detect GPUs via nvidia-smi and populate cluster_nodes capabilities --- services/mam-api/src/index.js | 44 ++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/services/mam-api/src/index.js b/services/mam-api/src/index.js index 9593a59..29024a6 100644 --- a/services/mam-api/src/index.js +++ b/services/mam-api/src/index.js @@ -4,6 +4,7 @@ import cors from 'cors'; import session from 'express-session'; import ConnectPgSimple from 'connect-pg-simple'; import os from 'node:os'; +import { exec } from 'node:child_process'; import pool from './db/pool.js'; import { errorHandler } from './middleware/errors.js'; import { loadS3ConfigFromDb } from './s3/client.js'; @@ -40,7 +41,6 @@ app.use( store: new PgSession({ pool, tableName: 'sessions', - // Prune expired sessions every hour pruneSessionInterval: 3600, }), secret: process.env.SESSION_SECRET || 'change-me-in-production', @@ -49,7 +49,7 @@ app.use( cookie: { secure: process.env.NODE_ENV === 'production', httpOnly: true, - maxAge: 1000 * 60 * 60 * 24, // 24 h + maxAge: 1000 * 60 * 60 * 24, }, }) ); @@ -58,10 +58,7 @@ app.use( app.get('/health', (_req, res) => res.json({ status: 'ok' })); // ── API Routes ──────────────────────────────────────────────────────────────── -// Auth routes are always open (login/logout don't require a session) app.use('/api/v1/auth', authRouter); - -// All other routes are gated by requireAuth (no-op unless AUTH_ENABLED=true) app.use('/api/v1/assets', assetsRouter); app.use('/api/v1/projects', projectsRouter); app.use('/api/v1/bins', binsRouter); @@ -107,7 +104,6 @@ await runMigrations(); await loadS3ConfigFromDb(); // ── Cluster self-heartbeat ──────────────────────────────────────────────────── -// Registers this node in cluster_nodes every 30 s so the Cluster page shows it. function getLocalIp() { // Prefer an explicit override — useful when running inside Docker where // os.networkInterfaces() returns container bridge IPs, not the host LAN IP. @@ -122,20 +118,49 @@ function getLocalIp() { return '127.0.0.1'; } -function selfHeartbeat() { +// Detect NVIDIA GPUs available to this container via nvidia-smi. +// Returns an array like [{ index: 0, name: 'Tesla P4', memory_mb: 7680 }, ...] +// or an empty array if nvidia-smi is unavailable or no GPUs found. +function detectGpus() { + return new Promise(resolve => { + exec( + 'nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader,nounits', + { timeout: 5000 }, + (err, stdout) => { + if (err || !stdout.trim()) return resolve([]); + const gpus = stdout.trim().split('\n').map(line => { + const parts = line.split(',').map(s => s.trim()); + return { + index: parseInt(parts[0], 10), + name: parts[1] || 'Unknown GPU', + memory_mb: parseInt(parts[2], 10) || 0, + }; + }).filter(g => !isNaN(g.index)); + resolve(gpus); + } + ); + }); +} + +async function selfHeartbeat() { const load = os.loadavg()[0]; const total = os.totalmem(); const used = total - os.freemem(); + const gpus = await detectGpus(); + + const capabilities = { gpus, blackmagic: [] }; + pool.query( `INSERT INTO cluster_nodes (hostname, ip_address, role, version, api_url, - cpu_usage, mem_used_mb, mem_total_mb, last_seen) - VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,NOW()) + cpu_usage, mem_used_mb, mem_total_mb, capabilities, last_seen) + VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,$8,NOW()) ON CONFLICT (hostname) DO UPDATE SET ip_address = EXCLUDED.ip_address, cpu_usage = EXCLUDED.cpu_usage, mem_used_mb = EXCLUDED.mem_used_mb, mem_total_mb = EXCLUDED.mem_total_mb, + capabilities = EXCLUDED.capabilities, last_seen = NOW()`, [ os.hostname(), @@ -145,6 +170,7 @@ function selfHeartbeat() { parseFloat(load.toFixed(2)), Math.round(used / 1024 / 1024), Math.round(total / 1024 / 1024), + JSON.stringify(capabilities), ] ).catch(err => console.error('[cluster] heartbeat failed:', err.message)); }