feat: detect GPUs via nvidia-smi and populate cluster_nodes capabilities
This commit is contained in:
parent
a4b9b5be82
commit
74299629e6
1 changed file with 35 additions and 9 deletions
|
|
@ -4,6 +4,7 @@ import cors from 'cors';
|
|||
import session from 'express-session';
|
||||
import ConnectPgSimple from 'connect-pg-simple';
|
||||
import os from 'node:os';
|
||||
import { exec } from 'node:child_process';
|
||||
import pool from './db/pool.js';
|
||||
import { errorHandler } from './middleware/errors.js';
|
||||
import { loadS3ConfigFromDb } from './s3/client.js';
|
||||
|
|
@ -40,7 +41,6 @@ app.use(
|
|||
store: new PgSession({
|
||||
pool,
|
||||
tableName: 'sessions',
|
||||
// Prune expired sessions every hour
|
||||
pruneSessionInterval: 3600,
|
||||
}),
|
||||
secret: process.env.SESSION_SECRET || 'change-me-in-production',
|
||||
|
|
@ -49,7 +49,7 @@ app.use(
|
|||
cookie: {
|
||||
secure: process.env.NODE_ENV === 'production',
|
||||
httpOnly: true,
|
||||
maxAge: 1000 * 60 * 60 * 24, // 24 h
|
||||
maxAge: 1000 * 60 * 60 * 24,
|
||||
},
|
||||
})
|
||||
);
|
||||
|
|
@ -58,10 +58,7 @@ app.use(
|
|||
app.get('/health', (_req, res) => res.json({ status: 'ok' }));
|
||||
|
||||
// ── API Routes ────────────────────────────────────────────────────────────────
|
||||
// Auth routes are always open (login/logout don't require a session)
|
||||
app.use('/api/v1/auth', authRouter);
|
||||
|
||||
// All other routes are gated by requireAuth (no-op unless AUTH_ENABLED=true)
|
||||
app.use('/api/v1/assets', assetsRouter);
|
||||
app.use('/api/v1/projects', projectsRouter);
|
||||
app.use('/api/v1/bins', binsRouter);
|
||||
|
|
@ -107,7 +104,6 @@ await runMigrations();
|
|||
await loadS3ConfigFromDb();
|
||||
|
||||
// ── Cluster self-heartbeat ────────────────────────────────────────────────────
|
||||
// Registers this node in cluster_nodes every 30 s so the Cluster page shows it.
|
||||
function getLocalIp() {
|
||||
// Prefer an explicit override — useful when running inside Docker where
|
||||
// os.networkInterfaces() returns container bridge IPs, not the host LAN IP.
|
||||
|
|
@ -122,20 +118,49 @@ function getLocalIp() {
|
|||
return '127.0.0.1';
|
||||
}
|
||||
|
||||
function selfHeartbeat() {
|
||||
/**
 * Probe for NVIDIA GPUs by shelling out to `nvidia-smi`.
 *
 * The command asks for one CSV row per GPU (index, name, total memory) with
 * no header and no units, e.g. `0, Tesla P4, 7680`.
 *
 * @returns {Promise<Array<{index: number, name: string, memory_mb: number}>>}
 *   Resolves with one record per parsable row. Resolves with an empty array
 *   when the command errors (including hitting the 5 s timeout) or prints
 *   nothing. The promise never rejects.
 */
function detectGpus() {
  const command =
    'nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader,nounits';

  // Turn a single CSV row into a GPU record; missing fields fall back to defaults.
  const toGpuRecord = (row) => {
    const [rawIndex, rawName, rawMemory] = row.split(',').map((field) => field.trim());
    return {
      index: parseInt(rawIndex, 10),
      name: rawName || 'Unknown GPU',
      memory_mb: parseInt(rawMemory, 10) || 0,
    };
  };

  return new Promise((resolve) => {
    exec(command, { timeout: 5000 }, (err, stdout) => {
      // A failed invocation is treated the same as "no output".
      const output = err ? '' : stdout.trim();
      if (!output) {
        resolve([]);
        return;
      }

      const detected = [];
      for (const row of output.split('\n')) {
        const gpu = toGpuRecord(row);
        // Rows whose first column is not a number are not GPU rows; drop them.
        if (!isNaN(gpu.index)) {
          detected.push(gpu);
        }
      }
      resolve(detected);
    });
  });
}
|
||||
|
||||
async function selfHeartbeat() {
|
||||
const load = os.loadavg()[0];
|
||||
const total = os.totalmem();
|
||||
const used = total - os.freemem();
|
||||
const gpus = await detectGpus();
|
||||
|
||||
const capabilities = { gpus, blackmagic: [] };
|
||||
|
||||
pool.query(
|
||||
`INSERT INTO cluster_nodes
|
||||
(hostname, ip_address, role, version, api_url,
|
||||
cpu_usage, mem_used_mb, mem_total_mb, last_seen)
|
||||
VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,NOW())
|
||||
cpu_usage, mem_used_mb, mem_total_mb, capabilities, last_seen)
|
||||
VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,$8,NOW())
|
||||
ON CONFLICT (hostname) DO UPDATE SET
|
||||
ip_address = EXCLUDED.ip_address,
|
||||
cpu_usage = EXCLUDED.cpu_usage,
|
||||
mem_used_mb = EXCLUDED.mem_used_mb,
|
||||
mem_total_mb = EXCLUDED.mem_total_mb,
|
||||
capabilities = EXCLUDED.capabilities,
|
||||
last_seen = NOW()`,
|
||||
[
|
||||
os.hostname(),
|
||||
|
|
@ -145,6 +170,7 @@ function selfHeartbeat() {
|
|||
parseFloat(load.toFixed(2)),
|
||||
Math.round(used / 1024 / 1024),
|
||||
Math.round(total / 1024 / 1024),
|
||||
JSON.stringify(capabilities),
|
||||
]
|
||||
).catch(err => console.error('[cluster] heartbeat failed:', err.message));
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue