fix(worker): conform — concat-filter for mixed source formats

ffmpeg concat demuxer dies with "Error sending frames to consumers:
Invalid argument" when input segments don't share codec / pixel format
/ framerate / resolution. Mixed-source timelines hit this every time —
e.g. an AV1 clip + an H.264 clip going through the same concat.

Switch to the concat *filter*. It re-encodes through a filter graph
so disparate inputs are normalised inline. Each input is scaled to
1920x1080 with letterbox, format=yuv420p, audio resampled. concat=n=N
joins them into [outv]/[outa].

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Claude 2026-05-28 15:04:55 -04:00
parent 6412b5c252
commit 94b6710e2d

View file

@ -267,14 +267,63 @@ export const conformWorker = async (job) => {
'-crf', quality === 'broadcast' ? '18' : quality === 'high' ? '23' : '28',
];
// Concat strategy: the concat *demuxer* is fast but requires identical
// specs (codec, pixel format, framerate, resolution) across all segments.
// When sources differ (AV1 + H.264, mixed framerates, etc.) it dies with
// "Error sending frames to consumers: Invalid argument". Use the concat
// *filter* instead: every input is opened separately and normalised in a
// filter graph before being joined, so disparate sources work.
//
// Each video stream is scaled to fit inside targetW x targetH, padded
// (letterbox/pillarbox) to exactly that size, forced to square pixels
// (SAR 1:1) and converted to yuv420p. No output frame rate is forced
// here; per the ffmpeg concat filter documentation, segments with
// different frame rates yield variable-frame-rate output.
const wantAudio = !(audio === 'none' || audio === 'off');

// One `-i <path>` pair per segment, in timeline order. The position of
// each path in this list is the input index used by the filter graph
// below.
const inputArgs = [];
for (const segmentPath of concatList) {
  inputArgs.push('-i', segmentPath);
}

// Output frame size shared by every profile.
const targetW = 1920;
const targetH = 1080;

// Per-input normalisation chains plus the labels they produce.
const vLabels = [];
const aLabels = [];
const normalizeChains = [];
for (let i = 0; i < concatList.length; i++) {
  // scale with force_original_aspect_ratio=decrease, then pad to the box:
  // keeps mixed-aspect sources inside the frame without distortion.
  normalizeChains.push(
    `[${i}:v:0]scale=${targetW}:${targetH}:force_original_aspect_ratio=decrease,pad=${targetW}:${targetH}:(ow-iw)/2:(oh-ih)/2,setsar=1,format=yuv420p[v${i}]`,
  );
  vLabels.push(`[v${i}]`);
  if (wantAudio) {
    // NOTE(review): there is no fallback for inputs without an audio
    // stream. The graph references [i:a:0] for every input, so a silent
    // clip is expected to make ffmpeg reject the graph. Substituting
    // generated silence (e.g. anullsrc) would need per-input stream
    // probing, which is not visible in this part of the function.
    normalizeChains.push(`[${i}:a:0]aresample=async=1:first_pts=0[a${i}]`);
    aLabels.push(`[a${i}]`);
  }
}
// Every chain is terminated with ';' so the concat expression can be
// appended directly.
const normalize = normalizeChains.map((chain) => `${chain};`).join('');

const n = concatList.length;
// The concat filter expects its pads grouped per segment: with audio the
// labels are interleaved as [v0][a0][v1][a1]…, otherwise video only.
const concatExpr = wantAudio
  ? `${vLabels.map((vLabel, i) => `${vLabel}${aLabels[i]}`).join('')}concat=n=${n}:v=1:a=1[outv][outa]`
  : `${vLabels.join('')}concat=n=${n}:v=1:a=0[outv]`;
const filterComplex = normalize + concatExpr;

// Map only the concat outputs; when audio is dropped, disable it outright.
const mapArgs = wantAudio ? ['-map', '[outv]', '-map', '[outa]'] : ['-map', '[outv]'];
const encodeAudio = wantAudio ? ['-c:a', 'aac', '-b:a', '320k', '-ar', '48000'] : ['-an'];
// Run the conform encode: inputs, filter graph, stream mapping, video
// codec/profile/quality options, audio options, then the output path
// (`-y` overwrites an existing file).
await runFFmpeg([
// NOTE(review): these three arguments open segmentListPath through the
// concat demuxer as the FIRST input, ahead of inputArgs. filterComplex
// addresses inputs as [0:v:0]…[n-1:v:0], so if these arguments are still
// live every index is shifted by one and the last segment is never read.
// Confirm whether they were meant to be dropped when switching to the
// concat filter.
'-f', 'concat',
'-safe', '0',
'-i', segmentListPath,
...inputArgs,
'-filter_complex', filterComplex,
...mapArgs,
'-c:v', videoCodec,
...profileFlag,
...qualityArgs,
// NOTE(review): audioFlag is defined outside this excerpt; confirm it does
// not conflict with (or duplicate) the audio options in encodeAudio.
...audioFlag,
...encodeAudio,
'-y', outputPath,
]);