feat: detect GPUs via nvidia-smi and populate cluster_nodes capabilities

This commit is contained in:
Zac Gaetano 2026-05-20 17:25:11 -04:00
parent a4b9b5be82
commit 74299629e6

View file

@ -4,6 +4,7 @@ import cors from 'cors';
import session from 'express-session';
import ConnectPgSimple from 'connect-pg-simple';
import os from 'node:os';
import { exec } from 'node:child_process';
import pool from './db/pool.js';
import { errorHandler } from './middleware/errors.js';
import { loadS3ConfigFromDb } from './s3/client.js';
@ -40,7 +41,6 @@ app.use(
store: new PgSession({
pool,
tableName: 'sessions',
// Prune expired sessions every hour
pruneSessionInterval: 3600,
}),
secret: process.env.SESSION_SECRET || 'change-me-in-production',
@ -49,7 +49,7 @@ app.use(
cookie: {
secure: process.env.NODE_ENV === 'production',
httpOnly: true,
maxAge: 1000 * 60 * 60 * 24, // 24 h
maxAge: 1000 * 60 * 60 * 24,
},
})
);
@ -58,10 +58,7 @@ app.use(
app.get('/health', (_req, res) => res.json({ status: 'ok' }));
// ── API Routes ────────────────────────────────────────────────────────────────
// Auth routes are always open (login/logout don't require a session)
app.use('/api/v1/auth', authRouter);
// All other routes are gated by requireAuth (no-op unless AUTH_ENABLED=true)
app.use('/api/v1/assets', assetsRouter);
app.use('/api/v1/projects', projectsRouter);
app.use('/api/v1/bins', binsRouter);
@ -107,7 +104,6 @@ await runMigrations();
await loadS3ConfigFromDb();
// ── Cluster self-heartbeat ────────────────────────────────────────────────────
// Registers this node in cluster_nodes every 30 s so the Cluster page shows it.
function getLocalIp() {
// Prefer an explicit override — useful when running inside Docker where
// os.networkInterfaces() returns container bridge IPs, not the host LAN IP.
@ -122,20 +118,49 @@ function getLocalIp() {
return '127.0.0.1';
}
function selfHeartbeat() {
/**
 * Detect the NVIDIA GPUs visible to this process by running `nvidia-smi`.
 *
 * The command is asked for `index,name,memory.total` as header-less,
 * unit-less CSV, so each output line looks like `0, Tesla P4, 7680`.
 *
 * This function never rejects: if the command fails (missing binary, non-zero
 * exit, 5 s timeout) or prints nothing, the promise resolves to `[]`.
 *
 * @returns {Promise<Array<{index: number, name: string, memory_mb: number}>>}
 *   One entry per GPU line whose index parses as an integer, e.g.
 *   `[{ index: 0, name: 'Tesla P4', memory_mb: 7680 }]`.
 */
function detectGpus() {
  return new Promise((resolve) => {
    exec(
      'nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader,nounits',
      { timeout: 5000 },
      (err, stdout) => {
        if (err || !stdout.trim()) return resolve([]);

        const gpus = stdout
          .trim()
          .split('\n')
          .map((line) => {
            const parts = line.split(',').map((s) => s.trim());
            // The CSV is unquoted, so a product name that itself contains a
            // comma yields extra fields. The index is always the first field
            // and the memory size the last; everything in between is the name.
            const hasMemoryField = parts.length >= 3;
            const nameParts = hasMemoryField ? parts.slice(1, -1) : parts.slice(1);
            const memoryField = hasMemoryField ? parts[parts.length - 1] : undefined;
            return {
              index: Number.parseInt(parts[0], 10),
              name: nameParts.join(', ') || 'Unknown GPU',
              // Non-numeric values such as "[N/A]" fall back to 0.
              memory_mb: Number.parseInt(memoryField, 10) || 0,
            };
          })
          // Drop blank or otherwise unparseable lines.
          .filter((gpu) => !Number.isNaN(gpu.index));

        resolve(gpus);
      }
    );
  });
}
async function selfHeartbeat() {
const load = os.loadavg()[0];
const total = os.totalmem();
const used = total - os.freemem();
const gpus = await detectGpus();
const capabilities = { gpus, blackmagic: [] };
pool.query(
`INSERT INTO cluster_nodes
(hostname, ip_address, role, version, api_url,
cpu_usage, mem_used_mb, mem_total_mb, last_seen)
VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,NOW())
cpu_usage, mem_used_mb, mem_total_mb, capabilities, last_seen)
VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,$8,NOW())
ON CONFLICT (hostname) DO UPDATE SET
ip_address = EXCLUDED.ip_address,
cpu_usage = EXCLUDED.cpu_usage,
mem_used_mb = EXCLUDED.mem_used_mb,
mem_total_mb = EXCLUDED.mem_total_mb,
capabilities = EXCLUDED.capabilities,
last_seen = NOW()`,
[
os.hostname(),
@ -145,6 +170,7 @@ function selfHeartbeat() {
parseFloat(load.toFixed(2)),
Math.round(used / 1024 / 1024),
Math.round(total / 1024 / 1024),
JSON.stringify(capabilities),
]
).catch(err => console.error('[cluster] heartbeat failed:', err.message));
}