fix(node-agent): probe GPU via Docker API async at startup, cache result

Replaced the sync execFileSync('docker') approach (there is no docker CLI in the container) with async Docker socket HTTP API calls:
- POST /containers/create with the nvidia runtime + DeviceRequests
- POST /containers/:id/start
- Poll inspect until the container is no longer running
- GET /containers/:id/logs, strip the 8-byte frame headers, parse the CSV

probeGpusViaSmi() runs once at startup, before the first heartbeat. The result is cached in _gpuCache; detectHardware() reads the cache on every heartbeat. Falls back to the /dev/nvidia* scan if the probe fails or the runtime is unavailable.
2026-05-26 18:28:03 +00:00 · 2026-05-26 18:28:03 +00:00 · a6f045b3d7
commit a6f045b3d7
parent 558c18e417
1 changed files with 82 additions and 32 deletions
--- a/services/node-agent/index.js
+++ b/services/node-agent/index.js
@ -182,37 +182,84 @@ function sampleCpu() {
 // ── Hardware detection ────────────────────────────────────────────────────
 // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
-function detectGpusViaSmi() {
+// Cached GPU info from nvidia-smi — populated once at startup via Docker API.
-  // Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
+// null = not yet probed; [] = probed but no GPUs or no runtime.
-  // injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
+let _gpuCache = null;
-  // This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
+
-  // Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
+async function probeGpusViaSmi() {
-  const { execFileSync } = require('child_process');
+  // Use Docker API (socket) to run nvidia-smi inside a GPU-enabled container.
  // The NVIDIA Container Runtime injects nvidia-smi + libs into ubuntu:22.04.
  // This sidesteps the Alpine/glibc incompatibility in the node-agent image.
  const QUERY = '--query-gpu=index,name,memory.total,driver_version';
  const FMT   = '--format=csv,noheader,nounits';
  let containerId;
  try {
-    const out = execFileSync('docker', [
+    // Create container with nvidia runtime and GPU access
-      'run', '--rm', '--gpus', 'all',
+    const createRes = await dockerApi('POST', '/containers/create', {
-      'ubuntu:22.04',
+      Image: 'ubuntu:22.04',
-      'nvidia-smi', QUERY, FMT,
+      Cmd:   ['nvidia-smi', QUERY, FMT],
-    ], { timeout: 15000, encoding: 'utf8' });
+      HostConfig: {
-
+        AutoRemove: false,
-    const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
+        Runtime:    'nvidia',
-    if (!lines.length) return null;
+        DeviceRequests: [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }],
-    return lines.map(line => {
+      },
      const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
      return {
        index:     parseInt(idx, 10),
        name:      name,
        memory_mb: parseInt(memMiB, 10),
        driver:    driver,
        device:    `/dev/nvidia${idx}`,
        type:      'nvidia',
      };
    });
-  } catch (_) {
+    if (createRes.status !== 201) return;
-    return null; // nvidia runtime not available — caller uses /dev scan
+    containerId = createRes.data.Id;
    await dockerApi('POST', `/containers/${containerId}/start`);
    // Wait for it to finish (poll status)
    for (let i = 0; i < 20; i++) {
      await new Promise(r => setTimeout(r, 500));
      const inspect = await dockerApi('GET', `/containers/${containerId}/json`);
      if (!inspect.data?.State?.Running) break;
    }
    // Grab stdout logs
    const logRes = await new Promise((resolve, reject) => {
      const options = {
        socketPath: '/var/run/docker.sock',
        path:       `/v1.43/containers/${containerId}/logs?stdout=1&stderr=0`,
        method:     'GET',
      };
      const req = http.request(options, res => {
        const chunks = [];
        res.on('data', c => chunks.push(c));
        res.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
      });
      req.on('error', reject);
      req.end();
    });
    // Docker log frames have an 8-byte header — strip them
    const text = logRes.replace(/[\x00-\x07].{7}/g, '').trim();
    const lines = text.split('\n').filter(l => /^\d+,/.test(l.trim()));
    if (lines.length) {
      _gpuCache = lines.map(line => {
        const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
        return {
          index:     parseInt(idx, 10),
          name:      name,
          memory_mb: parseInt(memMiB, 10),
          driver:    driver,
          device:    `/dev/nvidia${idx}`,
          type:      'nvidia',
        };
      });
      console.log(`[gpu] detected ${_gpuCache.length} GPU(s) via nvidia-smi: ${_gpuCache.map(g => g.name).join(', ')}`);
    } else {
      _gpuCache = [];
    }
  } catch (err) {
    console.warn('[gpu] nvidia-smi probe failed:', err.message);
    _gpuCache = [];
  } finally {
    if (containerId) {
      await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});
    }
  }
 }
@ -225,10 +272,9 @@ function detectHardware() {
      capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
    }
  } else {
-    // Try nvidia-smi first for full GPU details; fall back to /dev file scan
+    // Use nvidia-smi cache if populated, otherwise fall back to /dev file scan
-    const smiGpus = detectGpusViaSmi();
+    if (_gpuCache !== null && _gpuCache.length > 0) {
-    if (smiGpus !== null) {
+      capabilities.gpus = _gpuCache;
      capabilities.gpus = smiGpus;
    } else {
      for (let i = 0; i < 16; i++) {
        try {
@ -304,8 +350,12 @@ async function heartbeat() {
  }
 }
-heartbeat();
+// Probe GPU info once at startup (async, before first heartbeat).
-setInterval(heartbeat, HEARTBEAT_MS);
+// _gpuCache is populated here; heartbeats use the cached value.
 probeGpusViaSmi().then(() => {
  heartbeat();
  setInterval(heartbeat, HEARTBEAT_MS);
 });
 // ── HTTP server ───────────────────────────────────────────────────────────
 const server = http.createServer((req, res) => {