fix(node-agent): use nsenter to run nvidia-smi in host mount namespace
Bind-mounting nvidia-smi into the container failed because the host binary (Ubuntu) is linked against glibc, which is incompatible with the musl libc in the Alpine-based image. Fix: `nsenter --mount=/proc/1/ns/mnt -- nvidia-smi` runs nvidia-smi in the host's mount namespace, where glibc and all NVIDIA driver libs are present. Requires `pid: host` in docker-compose.worker.yml (which already has `network_mode: host`). nsenter is provided by util-linux in Alpine — already in the image. The agent falls back to a direct nvidia-smi call (for glibc-based containers), then to a /dev/nvidia* file scan if all attempts fail.
This commit is contained in:
parent
726343db96
commit
5ff507b81b
2 changed files with 35 additions and 29 deletions
|
|
@ -41,6 +41,7 @@ services:
|
|||
build: ./services/node-agent
|
||||
restart: unless-stopped
|
||||
network_mode: host
|
||||
pid: host # share host PID namespace so nsenter can run host binaries
|
||||
environment:
|
||||
MAM_API_URL: ${MAM_API_URL}
|
||||
NODE_TOKEN: ${NODE_TOKEN:-}
|
||||
|
|
@ -55,12 +56,6 @@ services:
|
|||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
- /dev:/dev:ro
|
||||
# Bind-mount nvidia-smi + driver libs so the agent can query GPU details.
|
||||
# These paths match Ubuntu installations with driver 595+. The mounts are
|
||||
# optional — if the binaries don't exist on the host, Docker silently
|
||||
# ignores them and the agent falls back to /dev/nvidia* file detection.
|
||||
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
|
||||
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
|
||||
devices:
|
||||
- /dev/blackmagic:/dev/blackmagic
|
||||
|
||||
|
|
|
|||
|
|
@ -183,16 +183,27 @@ function sampleCpu() {
|
|||
// ── Hardware detection ────────────────────────────────────────────────────
|
||||
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
||||
function detectGpusViaSmi() {
|
||||
// Try nvidia-smi for rich GPU info (name, memory, driver).
|
||||
// nvidia-smi binary is bind-mounted from the host via docker-compose.worker.yml.
|
||||
// Returns [] if not available or if no GPUs are present.
|
||||
try {
|
||||
// Run nvidia-smi in the host's mount namespace via nsenter so it uses the
|
||||
// host's glibc and NVIDIA driver libs (not the container's Alpine libc).
|
||||
// Requires pid: host in docker-compose so we can access /proc/1/ns/mnt.
|
||||
// Falls back to direct nvidia-smi call (works if container has glibc).
|
||||
// Returns null if nvidia-smi isn't available — caller falls back to /dev scan.
|
||||
const { execFileSync } = require('child_process');
|
||||
const out = execFileSync(
|
||||
'nvidia-smi',
|
||||
['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'],
|
||||
{ timeout: 5000, encoding: 'utf8' }
|
||||
);
|
||||
const QUERY = '--query-gpu=index,name,memory.total,driver_version';
|
||||
const FMT = '--format=csv,noheader,nounits';
|
||||
|
||||
const attempts = [
|
||||
// nsenter into host mount ns (preferred — uses host glibc)
|
||||
() => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT],
|
||||
{ timeout: 8000, encoding: 'utf8' }),
|
||||
// Direct call (works if container has compatible glibc)
|
||||
() => execFileSync('nvidia-smi', [QUERY, FMT],
|
||||
{ timeout: 5000, encoding: 'utf8' }),
|
||||
];
|
||||
|
||||
for (const attempt of attempts) {
|
||||
try {
|
||||
const out = attempt();
|
||||
return out.trim().split('\n').filter(Boolean).map(line => {
|
||||
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
||||
return {
|
||||
|
|
@ -204,9 +215,9 @@ function detectGpusViaSmi() {
|
|||
type: 'nvidia',
|
||||
};
|
||||
});
|
||||
} catch (_) {
|
||||
return null; // nvidia-smi not available or failed — fall back to /dev scan
|
||||
} catch (_) {}
|
||||
}
|
||||
return null; // all attempts failed — caller falls back to /dev scan
|
||||
}
|
||||
|
||||
function detectHardware() {
|
||||
|
|
|
|||
Loading…
Reference in a new issue