diff --git a/services/worker/src/workers/conform.js b/services/worker/src/workers/conform.js index dc6f7c0..da2dfc9 100644 --- a/services/worker/src/workers/conform.js +++ b/services/worker/src/workers/conform.js @@ -3,7 +3,7 @@ import { unlink, writeFile, mkdir, rm } from 'fs/promises'; import { tmpdir } from 'os'; import { query } from '../db/client.js'; import { downloadFromS3, uploadToS3 } from '../s3/client.js'; -import { trimSegment, concatSegments, runFFmpeg } from '../ffmpeg/executor.js'; +import { trimSegment, concatSegments, runFFmpeg, getMediaInfo } from '../ffmpeg/executor.js'; import { parseEDL } from '../edl/parser.js'; import { XMLParser } from 'fast-xml-parser'; @@ -217,8 +217,34 @@ export const conformWorker = async (job) => { console.log(`[conform] Downloading segment ${edit.editNumber} from S3 (${sourceKey})`); await downloadFromS3(S3_BUCKET, sourceKey, segmentInputPath); - console.log(`[conform] Trimming ${edit.editNumber}: ${edit.sourceIn} → ${edit.sourceOut}`); - await trimSegment(segmentInputPath, segmentOutputPath, edit.sourceIn, edit.sourceOut); + // Trim + normalise in a single ffmpeg pass per segment. We re-encode + // here (libx264 ultrafast) so every segment lands at the same spec + // — same fps, resolution, pixel format, sample rate, channel layout + // — which lets the final concat-demuxer step run reliably even when + // the source clips are wildly different (mixed codecs / fps / sample + // rate). The double-encode (intermediate h264 → final ProRes) costs + // some CPU but avoids the concat filter's opaque "Invalid argument" + // failures with disparate sources. + console.log(`[conform] Trim + normalise ${edit.editNumber}: ${edit.sourceIn} → ${edit.sourceOut}`); + const segMs = await getMediaInfo(segmentInputPath); + const segFps = segMs.fps || 30; + const inSec = edit.sourceIn / segFps; + const durSec = (edit.sourceOut - edit.sourceIn) / segFps; + await runFFmpeg([ + '-ss', String(inSec), + '-i', segmentInputPath, + '-t', String(durSec), + '-vf', `fps=${Math.round(seqFps) || 30},` + + `scale=1920:1080:force_original_aspect_ratio=decrease,` + + `pad=1920:1080:(ow-iw)/2:(oh-ih)/2,` + + `setsar=1,format=yuv420p`, + '-af', 'aresample=48000:ocl=stereo', + '-c:v', 'libx264', '-preset', 'ultrafast', '-crf', '18', + '-pix_fmt', 'yuv420p', + '-c:a', 'aac', '-b:a', '320k', '-ar', '48000', + '-shortest', + '-y', segmentOutputPath, + ]); concatList.push(segmentOutputPath); await unlink(segmentInputPath).catch(() => {}); @@ -267,74 +293,21 @@ export const conformWorker = async (job) => { '-crf', quality === 'broadcast' ? '18' : quality === 'high' ? '23' : '28', ]; - // Concat strategy: the demuxer is fast but requires identical specs - // (codec, pixel format, framerate, resolution) across all segments. - // When sources differ (AV1 + H.264, mixed framerates, etc.) it dies - // with "Error sending frames to consumers: Invalid argument". Use the - // concat *filter* instead — it normalises each input through a filter - // graph before joining, so disparate sources work. - // - // Normalise every input to the target resolution + 1/SAR + yuv420p so - // the encode is deterministic. Output frame rate is unset; ffmpeg - // will use whatever the final filter graph emits, which the target - // codec accepts. - const wantAudio = !(audio === 'none' || audio === 'off'); - const inputArgs = []; - concatList.forEach(p => { inputArgs.push('-i', p); }); - - // Build the filter graph. The concat filter in ffmpeg 8.x requires - // identical resolution, pixel format, SAR, FRAME RATE and audio - // SAMPLE RATE / CHANNEL LAYOUT across all inputs. Different-spec - // sources (e.g. a 23.98 fps clip + a 60 fps clip) trip + // Concat: every segment was normalised at trim time (uniform fps, + // resolution, pixel format, sample rate, stereo). The demuxer can + // stream-stitch them and we just need to transcode the result to the + // final target codec. This bypasses ffmpeg 8.x's brittle concat- + // filter path that was throwing // [fc#0] Error sending frames to consumers: Invalid argument - // even though our earlier scale+pad+format pass took care of the - // pixel side. Force the time-domain axes too: - // fps= — resample video to a constant rate - // setpts=PTS-STARTPTS — re-zero PTS so concat's per-input clock - // resets cleanly - // aresample=48000 — force a single audio sample rate - // asetpts=PTS-STARTPTS — same for audio - const targetW = 1920; - const targetH = 1080; - const targetFps = Math.round(seqFps) || 30; - const targetSampleRate = 48000; - - const vLabels = []; - const aLabels = []; - let normalize = ''; - for (let i = 0; i < concatList.length; i++) { - normalize += - `[${i}:v:0]fps=${targetFps},` + - `scale=${targetW}:${targetH}:force_original_aspect_ratio=decrease,` + - `pad=${targetW}:${targetH}:(ow-iw)/2:(oh-ih)/2,` + - `setsar=1,format=yuv420p,setpts=PTS-STARTPTS[v${i}];`; - vLabels.push(`[v${i}]`); - if (wantAudio) { - normalize += - `[${i}:a:0]aresample=${targetSampleRate},` + - `aformat=channel_layouts=stereo:sample_fmts=fltp,` + - `asetpts=PTS-STARTPTS[a${i}];`; - aLabels.push(`[a${i}]`); - } - } - const n = concatList.length; - let concatExpr; - if (wantAudio) { - const interleaved = []; - for (let i = 0; i < n; i++) { interleaved.push(vLabels[i], aLabels[i]); } - concatExpr = `${interleaved.join('')}concat=n=${n}:v=1:a=1[outv][outa]`; - } else { - concatExpr = `${vLabels.join('')}concat=n=${n}:v=1:a=0[outv]`; - } - const filterComplex = normalize + concatExpr; - - const mapArgs = wantAudio ? ['-map', '[outv]', '-map', '[outa]'] : ['-map', '[outv]']; - const encodeAudio = wantAudio ? ['-c:a', 'aac', '-b:a', '320k', '-ar', '48000'] : ['-an']; + // on mixed-source timelines. + const encodeAudio = (audio === 'none' || audio === 'off') + ? ['-an'] + : ['-c:a', 'aac', '-b:a', '320k', '-ar', '48000']; await runFFmpeg([ - ...inputArgs, - '-filter_complex', filterComplex, - ...mapArgs, + '-f', 'concat', + '-safe', '0', + '-i', segmentListPath, '-c:v', videoCodec, ...profileFlag, ...qualityArgs,