fix(worker): conform — 2-pass strategy (normalise on trim, demux on concat)

ffmpeg 8.x's concat filter kept dying with the opaque
  [fc#0] Error sending frames to consumers: Invalid argument
even after we locked fps + sample rate + pixel format + SAR in the
filter graph. Mixed sources (AV1+H.264, 23.98+60 fps, 44100+48000 Hz,
tv-range+unspecified-range pixel format) just don't survive the
concat filter cleanly in this build.

Switch to the more reliable 2-pass pattern:

1. At the trim step, re-encode each segment to a uniform intermediate
   spec: libx264 ultrafast, 1920x1080 (letterboxed), yuv420p, constant
   frame rate at the sequence rate (seqFps, rounded; 30 if unset),
   48 kHz stereo AAC. One ffmpeg invocation per segment.

2. At the concat step, use the concat *demuxer*. Because every input
   now matches exactly, the demuxer is well-behaved. Transcode the
   concatenated stream to the final target codec (ProRes 422 HQ etc).

This costs an extra intermediate encode (libx264 ultrafast ≈ realtime
on this hardware) but eliminates the filter-graph fragility on mixed-
source timelines, which is the workload that actually matters.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Claude 2026-05-28 15:34:52 -04:00
parent fcf4c8bbe7
commit 686b90294b

View file

@ -3,7 +3,7 @@ import { unlink, writeFile, mkdir, rm } from 'fs/promises';
import { tmpdir } from 'os';
import { query } from '../db/client.js';
import { downloadFromS3, uploadToS3 } from '../s3/client.js';
import { trimSegment, concatSegments, runFFmpeg } from '../ffmpeg/executor.js';
import { trimSegment, concatSegments, runFFmpeg, getMediaInfo } from '../ffmpeg/executor.js';
import { parseEDL } from '../edl/parser.js';
import { XMLParser } from 'fast-xml-parser';
@ -217,8 +217,34 @@ export const conformWorker = async (job) => {
console.log(`[conform] Downloading segment ${edit.editNumber} from S3 (${sourceKey})`);
await downloadFromS3(S3_BUCKET, sourceKey, segmentInputPath);
console.log(`[conform] Trimming ${edit.editNumber}: ${edit.sourceIn}${edit.sourceOut}`);
await trimSegment(segmentInputPath, segmentOutputPath, edit.sourceIn, edit.sourceOut);
// Trim + normalise in a single ffmpeg pass per segment. We re-encode
// here (libx264 ultrafast) so every segment lands at the same spec
// — same fps, resolution, pixel format, sample rate, channel layout
// — which lets the final concat-demuxer step run reliably even when
// the source clips are wildly different (mixed codecs / fps / sample
// rate). The double-encode (intermediate h264 → final ProRes) costs
// some CPU but avoids the concat filter's opaque "Invalid argument"
// failures with disparate sources.
console.log(`[conform] Trim + normalise ${edit.editNumber}: ${edit.sourceIn}${edit.sourceOut}`);
const segMs = await getMediaInfo(segmentInputPath);
const segFps = segMs.fps || 30;
const inSec = edit.sourceIn / segFps;
const durSec = (edit.sourceOut - edit.sourceIn) / segFps;
await runFFmpeg([
'-ss', String(inSec),
'-i', segmentInputPath,
'-t', String(durSec),
'-vf', `fps=${Math.round(seqFps) || 30},` +
`scale=1920:1080:force_original_aspect_ratio=decrease,` +
`pad=1920:1080:(ow-iw)/2:(oh-ih)/2,` +
`setsar=1,format=yuv420p`,
'-af', 'aresample=48000:ocl=stereo',
'-c:v', 'libx264', '-preset', 'ultrafast', '-crf', '18',
'-pix_fmt', 'yuv420p',
'-c:a', 'aac', '-b:a', '320k', '-ar', '48000',
'-shortest',
'-y', segmentOutputPath,
]);
concatList.push(segmentOutputPath);
await unlink(segmentInputPath).catch(() => {});
@ -267,74 +293,21 @@ export const conformWorker = async (job) => {
'-crf', quality === 'broadcast' ? '18' : quality === 'high' ? '23' : '28',
];
// Concat strategy: the demuxer is fast but requires identical specs
// (codec, pixel format, framerate, resolution) across all segments.
// When sources differ (AV1 + H.264, mixed framerates, etc.) it dies
// with "Error sending frames to consumers: Invalid argument". Use the
// concat *filter* instead — it normalises each input through a filter
// graph before joining, so disparate sources work.
//
// Normalise every input to the target resolution + 1/SAR + yuv420p so
// the encode is deterministic. Output frame rate is unset; ffmpeg
// will use whatever the final filter graph emits, which the target
// codec accepts.
const wantAudio = !(audio === 'none' || audio === 'off');
const inputArgs = [];
concatList.forEach(p => { inputArgs.push('-i', p); });
// Build the filter graph. The concat filter in ffmpeg 8.x requires
// identical resolution, pixel format, SAR, FRAME RATE and audio
// SAMPLE RATE / CHANNEL LAYOUT across all inputs. Different-spec
// sources (e.g. a 23.98 fps clip + a 60 fps clip) trip
// Concat: every segment was normalised at trim time (uniform fps,
// resolution, pixel format, sample rate, stereo). The demuxer can
// stream-stitch them and we just need to transcode the result to the
// final target codec. This bypasses ffmpeg 8.x's brittle concat-
// filter path that was throwing
// [fc#0] Error sending frames to consumers: Invalid argument
// even though our earlier scale+pad+format pass took care of the
// pixel side. Force the time-domain axes too:
// fps=<target> — resample video to a constant rate
// setpts=PTS-STARTPTS — re-zero PTS so concat's per-input clock
// resets cleanly
// aresample=48000 — force a single audio sample rate
// asetpts=PTS-STARTPTS — same for audio
const targetW = 1920;
const targetH = 1080;
const targetFps = Math.round(seqFps) || 30;
const targetSampleRate = 48000;
const vLabels = [];
const aLabels = [];
let normalize = '';
for (let i = 0; i < concatList.length; i++) {
normalize +=
`[${i}:v:0]fps=${targetFps},` +
`scale=${targetW}:${targetH}:force_original_aspect_ratio=decrease,` +
`pad=${targetW}:${targetH}:(ow-iw)/2:(oh-ih)/2,` +
`setsar=1,format=yuv420p,setpts=PTS-STARTPTS[v${i}];`;
vLabels.push(`[v${i}]`);
if (wantAudio) {
normalize +=
`[${i}:a:0]aresample=${targetSampleRate},` +
`aformat=channel_layouts=stereo:sample_fmts=fltp,` +
`asetpts=PTS-STARTPTS[a${i}];`;
aLabels.push(`[a${i}]`);
}
}
const n = concatList.length;
let concatExpr;
if (wantAudio) {
const interleaved = [];
for (let i = 0; i < n; i++) { interleaved.push(vLabels[i], aLabels[i]); }
concatExpr = `${interleaved.join('')}concat=n=${n}:v=1:a=1[outv][outa]`;
} else {
concatExpr = `${vLabels.join('')}concat=n=${n}:v=1:a=0[outv]`;
}
const filterComplex = normalize + concatExpr;
const mapArgs = wantAudio ? ['-map', '[outv]', '-map', '[outa]'] : ['-map', '[outv]'];
const encodeAudio = wantAudio ? ['-c:a', 'aac', '-b:a', '320k', '-ar', '48000'] : ['-an'];
// on mixed-source timelines.
const encodeAudio = (audio === 'none' || audio === 'off')
? ['-an']
: ['-c:a', 'aac', '-b:a', '320k', '-ar', '48000'];
await runFFmpeg([
...inputArgs,
'-filter_complex', filterComplex,
...mapArgs,
'-f', 'concat',
'-safe', '0',
'-i', segmentListPath,
'-c:v', videoCodec,
...profileFlag,
...qualityArgs,