feat: detect GPUs via nvidia-smi and populate cluster_nodes capabilities
This commit is contained in:
parent
a4b9b5be82
commit
74299629e6
1 changed files with 35 additions and 9 deletions
|
|
@ -4,6 +4,7 @@ import cors from 'cors';
|
||||||
import session from 'express-session';
|
import session from 'express-session';
|
||||||
import ConnectPgSimple from 'connect-pg-simple';
|
import ConnectPgSimple from 'connect-pg-simple';
|
||||||
import os from 'node:os';
|
import os from 'node:os';
|
||||||
|
import { exec } from 'node:child_process';
|
||||||
import pool from './db/pool.js';
|
import pool from './db/pool.js';
|
||||||
import { errorHandler } from './middleware/errors.js';
|
import { errorHandler } from './middleware/errors.js';
|
||||||
import { loadS3ConfigFromDb } from './s3/client.js';
|
import { loadS3ConfigFromDb } from './s3/client.js';
|
||||||
|
|
@ -40,7 +41,6 @@ app.use(
|
||||||
store: new PgSession({
|
store: new PgSession({
|
||||||
pool,
|
pool,
|
||||||
tableName: 'sessions',
|
tableName: 'sessions',
|
||||||
// Prune expired sessions every hour
|
|
||||||
pruneSessionInterval: 3600,
|
pruneSessionInterval: 3600,
|
||||||
}),
|
}),
|
||||||
secret: process.env.SESSION_SECRET || 'change-me-in-production',
|
secret: process.env.SESSION_SECRET || 'change-me-in-production',
|
||||||
|
|
@ -49,7 +49,7 @@ app.use(
|
||||||
cookie: {
|
cookie: {
|
||||||
secure: process.env.NODE_ENV === 'production',
|
secure: process.env.NODE_ENV === 'production',
|
||||||
httpOnly: true,
|
httpOnly: true,
|
||||||
maxAge: 1000 * 60 * 60 * 24, // 24 h
|
maxAge: 1000 * 60 * 60 * 24,
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
@ -58,10 +58,7 @@ app.use(
|
||||||
app.get('/health', (_req, res) => res.json({ status: 'ok' }));
|
app.get('/health', (_req, res) => res.json({ status: 'ok' }));
|
||||||
|
|
||||||
// ── API Routes ────────────────────────────────────────────────────────────────
|
// ── API Routes ────────────────────────────────────────────────────────────────
|
||||||
// Auth routes are always open (login/logout don't require a session)
|
|
||||||
app.use('/api/v1/auth', authRouter);
|
app.use('/api/v1/auth', authRouter);
|
||||||
|
|
||||||
// All other routes are gated by requireAuth (no-op unless AUTH_ENABLED=true)
|
|
||||||
app.use('/api/v1/assets', assetsRouter);
|
app.use('/api/v1/assets', assetsRouter);
|
||||||
app.use('/api/v1/projects', projectsRouter);
|
app.use('/api/v1/projects', projectsRouter);
|
||||||
app.use('/api/v1/bins', binsRouter);
|
app.use('/api/v1/bins', binsRouter);
|
||||||
|
|
@ -107,7 +104,6 @@ await runMigrations();
|
||||||
await loadS3ConfigFromDb();
|
await loadS3ConfigFromDb();
|
||||||
|
|
||||||
// ── Cluster self-heartbeat ────────────────────────────────────────────────────
|
// ── Cluster self-heartbeat ────────────────────────────────────────────────────
|
||||||
// Registers this node in cluster_nodes every 30 s so the Cluster page shows it.
|
|
||||||
function getLocalIp() {
|
function getLocalIp() {
|
||||||
// Prefer an explicit override — useful when running inside Docker where
|
// Prefer an explicit override — useful when running inside Docker where
|
||||||
// os.networkInterfaces() returns container bridge IPs, not the host LAN IP.
|
// os.networkInterfaces() returns container bridge IPs, not the host LAN IP.
|
||||||
|
|
@ -122,20 +118,49 @@ function getLocalIp() {
|
||||||
return '127.0.0.1';
|
return '127.0.0.1';
|
||||||
}
|
}
|
||||||
|
|
||||||
function selfHeartbeat() {
|
/**
 * Parse the CSV text printed by
 * `nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader,nounits`.
 *
 * Each usable line has the form `<index>, <name>, <memory>`. Lines whose first
 * field is not an integer (blank lines, stray diagnostics) are dropped.
 *
 * @param {string} stdout - Raw standard output of the nvidia-smi query.
 * @returns {{ index: number, name: string, memory_mb: number }[]} One entry per
 *   parsed line; `memory_mb` is 0 when the memory field is missing or not numeric.
 */
function parseGpuCsv(stdout) {
  const gpus = [];
  for (const line of stdout.split(/\r?\n/)) {
    const fields = line.split(',').map((field) => field.trim());
    const index = Number.parseInt(fields[0], 10);
    if (Number.isNaN(index)) continue;

    // The index is always the first field and the memory figure the last one,
    // so everything in between is the name. Joining the middle fields keeps a
    // name that itself contains a comma intact instead of shifting the
    // memory column.
    const hasMemoryField = fields.length >= 3;
    const rawName = hasMemoryField ? fields.slice(1, -1).join(', ') : fields[1];
    const memory = hasMemoryField
      ? Number.parseInt(fields[fields.length - 1], 10)
      : Number.NaN;

    gpus.push({
      index,
      // `||` is intentional: an empty name should fall back as well.
      name: rawName || 'Unknown GPU',
      memory_mb: Number.isNaN(memory) ? 0 : memory,
    });
  }
  return gpus;
}

/**
 * Detect NVIDIA GPUs by running nvidia-smi.
 *
 * Resolves to an array like `[{ index: 0, name: 'Tesla P4', memory_mb: 7680 }, ...]`,
 * or to an empty array if the command fails (for example because nvidia-smi is
 * unavailable or timed out) or reports no GPUs. A failing command resolves to
 * `[]` rather than rejecting, so callers can treat "no GPU" as a normal result.
 *
 * @param {object} [options]
 * @param {number} [options.timeoutMs=5000] - Passed to the runner as its
 *   `timeout` option (milliseconds).
 * @param {Function} [options.run=exec] - Command runner with the
 *   `(command, options, callback(err, stdout))` shape of `child_process.exec`.
 *   Overridable so the parsing path can be exercised without GPU hardware.
 * @returns {Promise<{ index: number, name: string, memory_mb: number }[]>}
 */
function detectGpus({ timeoutMs = 5000, run = exec } = {}) {
  return new Promise((resolve) => {
    run(
      'nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader,nounits',
      { timeout: timeoutMs },
      (err, stdout) => {
        if (err || typeof stdout !== 'string') {
          resolve([]);
          return;
        }
        // Whitespace-only output parses to an empty list.
        resolve(parseGpuCsv(stdout));
      }
    );
  });
}
|
||||||
|
|
||||||
|
async function selfHeartbeat() {
|
||||||
const load = os.loadavg()[0];
|
const load = os.loadavg()[0];
|
||||||
const total = os.totalmem();
|
const total = os.totalmem();
|
||||||
const used = total - os.freemem();
|
const used = total - os.freemem();
|
||||||
|
const gpus = await detectGpus();
|
||||||
|
|
||||||
|
const capabilities = { gpus, blackmagic: [] };
|
||||||
|
|
||||||
pool.query(
|
pool.query(
|
||||||
`INSERT INTO cluster_nodes
|
`INSERT INTO cluster_nodes
|
||||||
(hostname, ip_address, role, version, api_url,
|
(hostname, ip_address, role, version, api_url,
|
||||||
cpu_usage, mem_used_mb, mem_total_mb, last_seen)
|
cpu_usage, mem_used_mb, mem_total_mb, capabilities, last_seen)
|
||||||
VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,NOW())
|
VALUES ($1,$2,'primary',$3,$4,$5,$6,$7,$8,NOW())
|
||||||
ON CONFLICT (hostname) DO UPDATE SET
|
ON CONFLICT (hostname) DO UPDATE SET
|
||||||
ip_address = EXCLUDED.ip_address,
|
ip_address = EXCLUDED.ip_address,
|
||||||
cpu_usage = EXCLUDED.cpu_usage,
|
cpu_usage = EXCLUDED.cpu_usage,
|
||||||
mem_used_mb = EXCLUDED.mem_used_mb,
|
mem_used_mb = EXCLUDED.mem_used_mb,
|
||||||
mem_total_mb = EXCLUDED.mem_total_mb,
|
mem_total_mb = EXCLUDED.mem_total_mb,
|
||||||
|
capabilities = EXCLUDED.capabilities,
|
||||||
last_seen = NOW()`,
|
last_seen = NOW()`,
|
||||||
[
|
[
|
||||||
os.hostname(),
|
os.hostname(),
|
||||||
|
|
@ -145,6 +170,7 @@ function selfHeartbeat() {
|
||||||
parseFloat(load.toFixed(2)),
|
parseFloat(load.toFixed(2)),
|
||||||
Math.round(used / 1024 / 1024),
|
Math.round(used / 1024 / 1024),
|
||||||
Math.round(total / 1024 / 1024),
|
Math.round(total / 1024 / 1024),
|
||||||
|
JSON.stringify(capabilities),
|
||||||
]
|
]
|
||||||
).catch(err => console.error('[cluster] heartbeat failed:', err.message));
|
).catch(err => console.error('[cluster] heartbeat failed:', err.message));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue