fix(node-agent): use nsenter to run nvidia-smi in host mount namespace
Bind-mounting nvidia-smi into the container failed because of a libc incompatibility: the host binary is linked against Ubuntu's glibc, while the Alpine-based image ships musl. Fix: `nsenter --mount=/proc/1/ns/mnt -- nvidia-smi` runs nvidia-smi in the host's mount namespace, where glibc and all NVIDIA driver libs are present. Requires `pid: host` in docker-compose.worker.yml (the service already has `network_mode: host`) so that /proc/1 refers to the host's init process. nsenter is provided by util-linux in Alpine — already in the image. Falls back to a direct nvidia-smi call (for glibc-based containers), then to a /dev/nvidia* file scan if all attempts fail.
This commit is contained in:
parent
726343db96
commit
5ff507b81b
2 changed files with 35 additions and 29 deletions
|
|
@ -41,6 +41,7 @@ services:
|
||||||
build: ./services/node-agent
|
build: ./services/node-agent
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
network_mode: host
|
network_mode: host
|
||||||
|
pid: host # share host PID namespace so /proc/1/ns/mnt is the host's mount namespace (used by nsenter to run host nvidia-smi)
|
||||||
environment:
|
environment:
|
||||||
MAM_API_URL: ${MAM_API_URL}
|
MAM_API_URL: ${MAM_API_URL}
|
||||||
NODE_TOKEN: ${NODE_TOKEN:-}
|
NODE_TOKEN: ${NODE_TOKEN:-}
|
||||||
|
|
@ -55,12 +56,6 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- /var/run/docker.sock:/var/run/docker.sock
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
- /dev:/dev:ro
|
- /dev:/dev:ro
|
||||||
# Bind-mount nvidia-smi + driver libs so the agent can query GPU details.
|
|
||||||
# These paths match Ubuntu installations with driver 595+. The mounts are
|
|
||||||
# optional — if the binaries don't exist on the host, Docker silently
|
|
||||||
# ignores them and the agent falls back to /dev/nvidia* file detection.
|
|
||||||
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
|
|
||||||
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
|
|
||||||
devices:
|
devices:
|
||||||
- /dev/blackmagic:/dev/blackmagic
|
- /dev/blackmagic:/dev/blackmagic
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -183,30 +183,41 @@ function sampleCpu() {
|
||||||
// ── Hardware detection ────────────────────────────────────────────────────
|
// ── Hardware detection ────────────────────────────────────────────────────
|
||||||
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
// GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
|
||||||
function detectGpusViaSmi() {
|
function detectGpusViaSmi() {
|
||||||
// Try nvidia-smi for rich GPU info (name, memory, driver).
|
// Run nvidia-smi in the host's mount namespace via nsenter so it uses the
|
||||||
// nvidia-smi binary is bind-mounted from the host via docker-compose.worker.yml.
|
// host's glibc and NVIDIA driver libs (not the container's Alpine libc).
|
||||||
// Returns [] if not available or if no GPUs are present.
|
// Requires pid: host in docker-compose so we can access /proc/1/ns/mnt.
|
||||||
try {
|
// Falls back to direct nvidia-smi call (works if container has glibc).
|
||||||
const { execFileSync } = require('child_process');
|
// Returns null if nvidia-smi isn't available — caller falls back to /dev scan.
|
||||||
const out = execFileSync(
|
const { execFileSync } = require('child_process');
|
||||||
'nvidia-smi',
|
const QUERY = '--query-gpu=index,name,memory.total,driver_version';
|
||||||
['--query-gpu=index,name,memory.total,driver_version', '--format=csv,noheader,nounits'],
|
const FMT = '--format=csv,noheader,nounits';
|
||||||
{ timeout: 5000, encoding: 'utf8' }
|
|
||||||
);
|
const attempts = [
|
||||||
return out.trim().split('\n').filter(Boolean).map(line => {
|
// nsenter into host mount ns (preferred — uses host glibc)
|
||||||
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
() => execFileSync('nsenter', ['--mount=/proc/1/ns/mnt', '--', 'nvidia-smi', QUERY, FMT],
|
||||||
return {
|
{ timeout: 8000, encoding: 'utf8' }),
|
||||||
index: parseInt(idx, 10),
|
// Direct call (works if container has compatible glibc)
|
||||||
name: name,
|
() => execFileSync('nvidia-smi', [QUERY, FMT],
|
||||||
memory_mb: parseInt(memMiB, 10),
|
{ timeout: 5000, encoding: 'utf8' }),
|
||||||
driver: driver,
|
];
|
||||||
device: `/dev/nvidia${idx}`,
|
|
||||||
type: 'nvidia',
|
for (const attempt of attempts) {
|
||||||
};
|
try {
|
||||||
});
|
const out = attempt();
|
||||||
} catch (_) {
|
return out.trim().split('\n').filter(Boolean).map(line => {
|
||||||
return null; // nvidia-smi not available or failed — fall back to /dev scan
|
const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
|
||||||
|
return {
|
||||||
|
index: parseInt(idx, 10),
|
||||||
|
name: name,
|
||||||
|
memory_mb: parseInt(memMiB, 10),
|
||||||
|
driver: driver,
|
||||||
|
device: `/dev/nvidia${idx}`,
|
||||||
|
type: 'nvidia',
|
||||||
|
};
|
||||||
|
});
|
||||||
|
} catch (_) {}
|
||||||
}
|
}
|
||||||
|
return null; // all attempts failed — caller falls back to /dev scan
|
||||||
}
|
}
|
||||||
|
|
||||||
function detectHardware() {
|
function detectHardware() {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue