fix(node-agent): probe GPU via Docker API async at startup, cache result

Replaced the synchronous execFileSync('docker') approach (there is no docker CLI in the container) with async calls to the Docker socket HTTP API:
- POST /containers/create with the nvidia runtime + DeviceRequests
- POST /containers/:id/start
- Poll inspect until the container is no longer running
- GET /containers/:id/logs, strip the 8-byte frame headers, parse the CSV

probeGpusViaSmi() runs once at startup, before the first heartbeat. The result is cached in _gpuCache; detectHardware() reads the cache on every heartbeat. Falls back to the /dev/nvidia* scan if the probe fails or the runtime is unavailable.
2026-05-26 18:28:03 +00:00 · 2026-05-26 18:28:03 +00:00 · a6f045b3d7
commit a6f045b3d7
parent 558c18e417
1 changed files with 82 additions and 32 deletions
--- a/services/node-agent/index.js
+++ b/services/node-agent/index.js
@ -182,37 +182,84 @@ function sampleCpu() {

 // ── Hardware detection ────────────────────────────────────────────────────
 // GPU_COUNT / BMD_COUNT env vars override filesystem detection when /dev isn't mapped
-function detectGpusViaSmi() {
-  // Spawn a temporary container with --gpus all so the NVIDIA Container Runtime
-  // injects nvidia-smi + driver libs into ubuntu:22.04 (already cached on GPU hosts).
-  // This sidesteps the Alpine/glibc incompatibility in the node-agent image itself.
-  // Returns null if nvidia runtime isn't available — caller falls back to /dev scan.
-  const { execFileSync } = require('child_process');
+// Cached GPU info from nvidia-smi — populated once at startup via Docker API.
+// null = not probed yet, or container create did not return 201; [] = probe ran but parsed no GPU rows, or the probe threw.
+let _gpuCache = null;
+
+async function probeGpusViaSmi() {
+  // Use Docker API (socket) to run nvidia-smi inside a GPU-enabled container.
+  // The NVIDIA Container Runtime injects nvidia-smi + libs into ubuntu:22.04.
+  // This sidesteps the Alpine/glibc incompatibility in the node-agent image.
  const QUERY = '--query-gpu=index,name,memory.total,driver_version';
  const FMT   = '--format=csv,noheader,nounits';

+  let containerId;
  try {
-    const out = execFileSync('docker', [
-      'run', '--rm', '--gpus', 'all',
-      'ubuntu:22.04',
-      'nvidia-smi', QUERY, FMT,
-    ], { timeout: 15000, encoding: 'utf8' });
-
-    const lines = out.trim().split('\n').filter(l => /^\d+,/.test(l.trim()));
-    if (!lines.length) return null;
-    return lines.map(line => {
-      const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
-      return {
-        index:     parseInt(idx, 10),
-        name:      name,
-        memory_mb: parseInt(memMiB, 10),
-        driver:    driver,
-        device:    `/dev/nvidia${idx}`,
-        type:      'nvidia',
-      };
+    // Create container with nvidia runtime and GPU access
+    const createRes = await dockerApi('POST', '/containers/create', {
+      Image: 'ubuntu:22.04',
+      Cmd:   ['nvidia-smi', QUERY, FMT],
+      HostConfig: {
+        AutoRemove: false,
+        Runtime:    'nvidia',
+        DeviceRequests: [{ Driver: 'nvidia', Count: -1, Capabilities: [['gpu']] }],
+      },
    });
-  } catch (_) {
-    return null; // nvidia runtime not available — caller uses /dev scan
+    if (createRes.status !== 201) return;
+    containerId = createRes.data.Id;
+
+    await dockerApi('POST', `/containers/${containerId}/start`);
+
+    // Wait for it to finish (poll status)
+    for (let i = 0; i < 20; i++) {
+      await new Promise(r => setTimeout(r, 500));
+      const inspect = await dockerApi('GET', `/containers/${containerId}/json`);
+      if (!inspect.data?.State?.Running) break;
+    }
+
+    // Grab stdout logs
+    const logRes = await new Promise((resolve, reject) => {
+      const options = {
+        socketPath: '/var/run/docker.sock',
+        path:       `/v1.43/containers/${containerId}/logs?stdout=1&stderr=0`,
+        method:     'GET',
+      };
+      const req = http.request(options, res => {
+        const chunks = [];
+        res.on('data', c => chunks.push(c));
+        res.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
+      });
+      req.on('error', reject);
+      req.end();
+    });
+
+    // Docker log frames have an 8-byte header — strip them
+    const text = logRes.replace(/[\x00-\x07].{7}/g, '').trim();
+    const lines = text.split('\n').filter(l => /^\d+,/.test(l.trim()));
+
+    if (lines.length) {
+      _gpuCache = lines.map(line => {
+        const [idx, name, memMiB, driver] = line.split(',').map(s => s.trim());
+        return {
+          index:     parseInt(idx, 10),
+          name:      name,
+          memory_mb: parseInt(memMiB, 10),
+          driver:    driver,
+          device:    `/dev/nvidia${idx}`,
+          type:      'nvidia',
+        };
+      });
+      console.log(`[gpu] detected ${_gpuCache.length} GPU(s) via nvidia-smi: ${_gpuCache.map(g => g.name).join(', ')}`);
+    } else {
+      _gpuCache = [];
+    }
+  } catch (err) {
+    console.warn('[gpu] nvidia-smi probe failed:', err.message);
+    _gpuCache = [];
+  } finally {
+    if (containerId) {
+      await dockerApi('DELETE', `/containers/${containerId}?force=true`).catch(() => {});
+    }
  }
 }

@ -225,10 +272,9 @@ function detectHardware() {
      capabilities.gpus.push({ device: `/dev/nvidia${i}`, type: 'nvidia', index: i });
    }
  } else {
-    // Try nvidia-smi first for full GPU details; fall back to /dev file scan
-    const smiGpus = detectGpusViaSmi();
-    if (smiGpus !== null) {
-      capabilities.gpus = smiGpus;
+    // Use nvidia-smi cache if populated, otherwise fall back to /dev file scan
+    if (_gpuCache !== null && _gpuCache.length > 0) {
+      capabilities.gpus = _gpuCache;
    } else {
      for (let i = 0; i < 16; i++) {
        try {
@ -304,8 +350,12 @@ async function heartbeat() {
  }
 }

-heartbeat();
-setInterval(heartbeat, HEARTBEAT_MS);
+// Probe GPU info once at startup (async, before first heartbeat).
+// _gpuCache is populated here; heartbeats use the cached value.
+probeGpusViaSmi().then(() => {
+  heartbeat();
+  setInterval(heartbeat, HEARTBEAT_MS);
+});

 // ── HTTP server ───────────────────────────────────────────────────────────
 const server = http.createServer((req, res) => {