fix(worker): conform — 2-pass strategy (normalise on trim, demux on concat)

ffmpeg 8.x's concat filter kept dying with the opaque "[fc#0] Error sending frames to consumers: Invalid argument" error, even after we locked fps + sample rate + pixel format + SAR in the filter graph. Mixed sources (AV1+H.264, 23.98+60 fps, 44100+48000 Hz, tv-range+unspecified-range pixel format) just don't survive the concat filter cleanly in this build. Switch to the more reliable 2-pass pattern: (1) At the trim step, re-encode each segment to a uniform intermediate spec: libx264 ultrafast, 1920x1080 (letterboxed), yuv420p, seqFps target rate, 48kHz stereo AAC. This is one ffmpeg invocation per segment. (2) At the concat step, use the concat *demuxer*. Because every input now matches exactly, the demuxer is well-behaved. Transcode the concatenated stream to the final target codec (ProRes 422 HQ etc). This costs an extra intermediate encode (libx264 ultrafast ≈ realtime on this hardware) but eliminates the filter-graph fragility on mixed-source timelines, which is the workload that actually matters. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 15:34:52 -04:00 · 2026-05-28 15:34:52 -04:00 · 686b90294b
commit 686b90294b
parent fcf4c8bbe7
1 changed file with 41 additions and 68 deletions
--- a/services/worker/src/workers/conform.js
+++ b/services/worker/src/workers/conform.js
@ -3,7 +3,7 @@ import { unlink, writeFile, mkdir, rm } from 'fs/promises';
 import { tmpdir } from 'os';
 import { query } from '../db/client.js';
 import { downloadFromS3, uploadToS3 } from '../s3/client.js';
-import { trimSegment, concatSegments, runFFmpeg } from '../ffmpeg/executor.js';
+import { trimSegment, concatSegments, runFFmpeg, getMediaInfo } from '../ffmpeg/executor.js';
 import { parseEDL } from '../edl/parser.js';
 import { XMLParser } from 'fast-xml-parser';

@ -217,8 +217,34 @@ export const conformWorker = async (job) => {
      console.log(`[conform] Downloading segment ${edit.editNumber} from S3 (${sourceKey})`);
      await downloadFromS3(S3_BUCKET, sourceKey, segmentInputPath);

-      console.log(`[conform] Trimming ${edit.editNumber}: ${edit.sourceIn} → ${edit.sourceOut}`);
-      await trimSegment(segmentInputPath, segmentOutputPath, edit.sourceIn, edit.sourceOut);
+      // Trim + normalise in a single ffmpeg pass per segment. We re-encode
+      // here (libx264 ultrafast) so every segment lands at the same spec
+      // — same fps, resolution, pixel format, sample rate, channel layout
+      // — which lets the final concat-demuxer step run reliably even when
+      // the source clips are wildly different (mixed codecs / fps / sample
+      // rate). The double-encode (intermediate h264 → final ProRes) costs
+      // some CPU but avoids the concat filter's opaque "Invalid argument"
+      // failures with disparate sources.
+      console.log(`[conform] Trim + normalise ${edit.editNumber}: ${edit.sourceIn} → ${edit.sourceOut}`);
+      const segMs = await getMediaInfo(segmentInputPath);
+      const segFps = segMs.fps || 30;
+      const inSec = edit.sourceIn / segFps;
+      const durSec = (edit.sourceOut - edit.sourceIn) / segFps;
+      await runFFmpeg([
+        '-ss', String(inSec),
+        '-i', segmentInputPath,
+        '-t', String(durSec),
+        '-vf', `fps=${Math.round(seqFps) || 30},` +
+               `scale=1920:1080:force_original_aspect_ratio=decrease,` +
+               `pad=1920:1080:(ow-iw)/2:(oh-ih)/2,` +
+               `setsar=1,format=yuv420p`,
+        '-af', 'aresample=48000:ocl=stereo',
+        '-c:v', 'libx264', '-preset', 'ultrafast', '-crf', '18',
+        '-pix_fmt', 'yuv420p',
+        '-c:a', 'aac', '-b:a', '320k', '-ar', '48000',
+        '-shortest',
+        '-y', segmentOutputPath,
+      ]);

      concatList.push(segmentOutputPath);
      await unlink(segmentInputPath).catch(() => {});
@ -267,74 +293,21 @@ export const conformWorker = async (job) => {
          '-crf',    quality === 'broadcast' ? '18' : quality === 'high' ? '23' : '28',
        ];

-    // Concat strategy: the demuxer is fast but requires identical specs
-    // (codec, pixel format, framerate, resolution) across all segments.
-    // When sources differ (AV1 + H.264, mixed framerates, etc.) it dies
-    // with "Error sending frames to consumers: Invalid argument". Use the
-    // concat *filter* instead — it normalises each input through a filter
-    // graph before joining, so disparate sources work.
-    //
-    // Normalise every input to the target resolution + 1/SAR + yuv420p so
-    // the encode is deterministic. Output frame rate is unset; ffmpeg
-    // will use whatever the final filter graph emits, which the target
-    // codec accepts.
-    const wantAudio = !(audio === 'none' || audio === 'off');
-    const inputArgs = [];
-    concatList.forEach(p => { inputArgs.push('-i', p); });
-
-    // Build the filter graph. The concat filter in ffmpeg 8.x requires
-    // identical resolution, pixel format, SAR, FRAME RATE and audio
-    // SAMPLE RATE / CHANNEL LAYOUT across all inputs. Different-spec
-    // sources (e.g. a 23.98 fps clip + a 60 fps clip) trip
+    // Concat: every segment was normalised at trim time (uniform fps,
+    // resolution, pixel format, sample rate, stereo). The demuxer can
+    // stream-stitch them and we just need to transcode the result to the
+    // final target codec. This bypasses ffmpeg 8.x's brittle concat-
+    // filter path that was throwing
    //   [fc#0] Error sending frames to consumers: Invalid argument
-    // even though our earlier scale+pad+format pass took care of the
-    // pixel side. Force the time-domain axes too:
-    //   fps=<target>       — resample video to a constant rate
-    //   setpts=PTS-STARTPTS — re-zero PTS so concat's per-input clock
-    //                        resets cleanly
-    //   aresample=48000    — force a single audio sample rate
-    //   asetpts=PTS-STARTPTS — same for audio
-    const targetW = 1920;
-    const targetH = 1080;
-    const targetFps = Math.round(seqFps) || 30;
-    const targetSampleRate = 48000;
-
-    const vLabels = [];
-    const aLabels = [];
-    let normalize = '';
-    for (let i = 0; i < concatList.length; i++) {
-      normalize +=
-        `[${i}:v:0]fps=${targetFps},` +
-        `scale=${targetW}:${targetH}:force_original_aspect_ratio=decrease,` +
-        `pad=${targetW}:${targetH}:(ow-iw)/2:(oh-ih)/2,` +
-        `setsar=1,format=yuv420p,setpts=PTS-STARTPTS[v${i}];`;
-      vLabels.push(`[v${i}]`);
-      if (wantAudio) {
-        normalize +=
-          `[${i}:a:0]aresample=${targetSampleRate},` +
-          `aformat=channel_layouts=stereo:sample_fmts=fltp,` +
-          `asetpts=PTS-STARTPTS[a${i}];`;
-        aLabels.push(`[a${i}]`);
-      }
-    }
-    const n = concatList.length;
-    let concatExpr;
-    if (wantAudio) {
-      const interleaved = [];
-      for (let i = 0; i < n; i++) { interleaved.push(vLabels[i], aLabels[i]); }
-      concatExpr = `${interleaved.join('')}concat=n=${n}:v=1:a=1[outv][outa]`;
-    } else {
-      concatExpr = `${vLabels.join('')}concat=n=${n}:v=1:a=0[outv]`;
-    }
-    const filterComplex = normalize + concatExpr;
-
-    const mapArgs = wantAudio ? ['-map', '[outv]', '-map', '[outa]'] : ['-map', '[outv]'];
-    const encodeAudio = wantAudio ? ['-c:a', 'aac', '-b:a', '320k', '-ar', '48000'] : ['-an'];
+    // on mixed-source timelines.
+    const encodeAudio = (audio === 'none' || audio === 'off')
+      ? ['-an']
+      : ['-c:a', 'aac', '-b:a', '320k', '-ar', '48000'];

    await runFFmpeg([
-      ...inputArgs,
-      '-filter_complex', filterComplex,
-      ...mapArgs,
+      '-f', 'concat',
+      '-safe', '0',
+      '-i', segmentListPath,
      '-c:v', videoCodec,
      ...profileFlag,
      ...qualityArgs,