fix(node-agent): detect GPUs via docker run --gpus all ubuntu:22.04
The nsenter approach failed (it requires SYS_ADMIN in the container), and the nvidia-smi bind-mount failed (Alpine vs. Ubuntu glibc incompatibility). Working solution: spawn 'docker run --rm --gpus all ubuntu:22.04 nvidia-smi' via the Docker socket. The NVIDIA Container Runtime injects nvidia-smi and the driver libs into any container started with --gpus all, regardless of the base image, and ubuntu:22.04 is already cached on GPU nodes. Result: the GPU is reported with name, memory_mb, and driver_version, and shows as BOUND in the cluster UI.
This commit is contained in:
parent
5ff507b81b
commit
558c18e417
2 changed files with 25 additions and 30 deletions
|
|
@ -41,7 +41,6 @@ services:
|
|||
build: ./services/node-agent
|
||||
restart: unless-stopped
|
||||
network_mode: host
|
||||
pid: host # share host PID namespace so nsenter can run host binaries
|
||||
environment:
|
||||
MAM_API_URL: ${MAM_API_URL}
|
||||
NODE_TOKEN: ${NODE_TOKEN:-}
|
||||
|
|
|
|||
|
|
@ -183,41 +183,37 @@ function sampleCpu() {
|
|||
// ── Hardware detection ────────────────────────────────────────────────────
|
||||
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
||||
function detectGpusViaSmi() {
|
||||
// Run nvidia-smi in the host's mount namespace via nsenter so it uses the
|
||||
// host's glibc and NVIDIA driver libs (not the container's Alpine libc).
|
||||
// Requires pid: host in docker-compose so we can access /proc/1/ns/mnt.
|
||||
// Falls back to direct nvidia-smi call (works if container has glibc).
|
||||
// Returns null if nvidia-smi isn't available — caller falls back to /dev scan.
|
||||
// Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
|
||||
// injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
|
||||
// This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
|
||||
// Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
|
||||
const { execFileSync } = require('child_process');
|
||||
const QUERY = '--query-gpu=index,name,memory.total,driver_version';
|
||||
const FMT = '--format=csv,noheader,nounits';
|
||||
|
||||
const attempts = [
|
||||
// nsenter into host mount ns (preferred — uses host glibc)
|
||||
() => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT],
|
||||
{ timeout: 8000, encoding: 'utf8' }),
|
||||
// Direct call (works if container has compatible glibc)
|
||||
() => execFileSync('nvidia-smi', [QUERY, FMT],
|
||||
{ timeout: 5000, encoding: 'utf8' }),
|
||||
];
|
||||
try {
|
||||
const out = execFileSync('docker', [
|
||||
'run', '--rm', '--gpus', 'all',
|
||||
'ubuntu:22.04',
|
||||
'nvidia-smi', QUERY, FMT,
|
||||
], { timeout: 15000, encoding: 'utf8' });
|
||||
|
||||
for (const attempt of attempts) {
|
||||
try {
|
||||
const out = attempt();
|
||||
return out.trim().split('\n').filter(Boolean).map(line => {
|
||||
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
||||
return {
|
||||
index: parseInt(idx, 10),
|
||||
name: name,
|
||||
memory_mb: parseInt(memMiB, 10),
|
||||
driver: driver,
|
||||
device: `/dev/nvidia${idx}`,
|
||||
type: 'nvidia',
|
||||
};
|
||||
});
|
||||
} catch (_) {}
|
||||
const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
|
||||
if (!lines.length) return null;
|
||||
return lines.map(line => {
|
||||
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
||||
return {
|
||||
index: parseInt(idx, 10),
|
||||
name: name,
|
||||
memory_mb: parseInt(memMiB, 10),
|
||||
driver: driver,
|
||||
device: `/dev/nvidia${idx}`,
|
||||
type: 'nvidia',
|
||||
};
|
||||
});
|
||||
} catch (_) {
|
||||
return null; // nvidia runtime not available — caller uses /dev scan
|
||||
}
|
||||
return null; // all attempts failed — caller falls back to /dev scan
|
||||
}
|
||||
|
||||
function detectHardware() {
|
||||
|
|
|
|||
Loading…
Reference in a new issue