fix(promotion): heal dead CIFS mounts + retry file lookup; reset orphans on failure

The promotion worker mounts the growing SMB share, but a CIFS soft-mount can
stay mounted while DEAD (server dropped the connection) — every access then
returns ENOENT, so promotion fails with "Growing file not found" and the asset is
stranded in processing (recurring stuck-migration bug). Fixes:
- ensureGrowingShareMounted now PROBES the mount with a readdir; if dead, it
  lazy-unmounts and remounts fresh (was: returned early if anything was mounted).
- file lookup retries for ~20s (CIFS attribute-cache lag on a freshly written
  master), remounting between attempts.
- on any promotion failure, the asset is reset (pending_migration if the file is
  present, else error) instead of being left in processing forever.
This commit is contained in:
OpenCode 2026-06-05 11:45:10 +00:00
parent 641b033bf4
commit e6f1313065

View file

@ -1,7 +1,7 @@
import { readdir, stat, unlink, mkdir, writeFile } from 'node:fs/promises';
import { execFileSync } from 'node:child_process';
import { join, relative, basename } from 'node:path';
import { createReadStream, existsSync } from 'node:fs';
import { createReadStream, existsSync, readdirSync } from 'node:fs';
import { Queue } from 'bullmq';
import { query } from '../db/client.js';
import { uploadStreamToS3 } from '../s3/client.js';
@ -38,8 +38,19 @@ async function ensureGrowingShareMounted() {
}
try {
if (isMounted(GROWING_PATH)) {
console.log('[promotion] growing share already mounted at', GROWING_PATH);
return;
// A CIFS soft-mount can stay "mounted" yet be DEAD (server dropped the
// connection): the mountpoint exists but every access returns ENOENT/EIO.
// This is the recurring "Growing file not found" / stuck-in-processing bug.
// Probe the mount with a real readdir; if it fails, force-unmount so we
// fall through and remount fresh below.
let healthy = false;
try { readdirSync(GROWING_PATH); healthy = true; } catch (_) { healthy = false; }
if (healthy) {
console.log('[promotion] growing share healthy at', GROWING_PATH);
return;
}
console.warn('[promotion] growing share mounted but DEAD — remounting', GROWING_PATH);
try { execFileSync('umount', ['-l', GROWING_PATH], { stdio: ['ignore', 'ignore', 'pipe'] }); } catch (_) {}
}
await mkdir(GROWING_PATH, { recursive: true }).catch(() => {});
await writeFile(
@ -114,10 +125,26 @@ async function runPromotion(job) {
}
const asset = r.rows[0];
// 3. Resolve local path
// 3. Resolve local path. RETRY: the master is written to the SMB share by the
// capture sidecar's mount; the promotion worker sees it through its OWN CIFS
// mount, which has an attribute cache (actimeo). A just-finalized file (or a
// file written via a different mount) can show a transient negative-lookup
// here for a second or two even though it exists. Without a retry the job
// fails "Growing file not found" and the asset gets stranded — the recurring
// stuck-in-processing bug. Re-check for up to ~20s before giving up.
const localPath = `${GROWING_PATH}/${asset.project_id}/${asset.filename}.mxf`;
if (!existsSync(localPath)) {
throw new Error(`Growing file not found at ${localPath}`);
{
const deadline = Date.now() + 20000;
let seen = existsSync(localPath);
while (!seen && Date.now() < deadline) {
await new Promise(r => setTimeout(r, 2000));
// Re-mount best-effort in case the share dropped, then re-check.
await ensureGrowingShareMounted().catch(() => {});
seen = existsSync(localPath);
}
if (!seen) {
throw new Error(`Growing file not found at ${localPath} after 20s`);
}
}
const s3Key = `projects/${asset.project_id}/masters/${asset.filename}.mxf`;