fix(capture): revert 16ch audio to clean 2ch — fixes pitch/rate regression

The 16ch interleave in the deltacast bridge produced audio at roughly HALF the correct sample rate (measured 24224 vs 48000 samples/s/ch), which broke A/V sync and pitch. Audio was clean in the working baseline, before the channel selector was introduced, so this reverts the bridge audio thread to the original single-group 2ch extraction and the capture-manager audio input to `-ac 2` with `-use_wallclock_as_timestamps 1` and the master `aresample=async=1` filter. KEPT the good fixes: long-GOP HEVC for non-growing records (NVENC realtime, no frame drops) and the GPU-only codec list. 16ch/channel-select is shelved for a separate, properly-validated change.
2026-06-04 04:33:34 +00:00 · 2026-06-04 04:33:34 +00:00 · fffb6b63b5
commit fffb6b63b5
parent b28393eb76
2 changed files with 28 additions and 137 deletions
--- a/services/capture/deltacast-bridge/main.c
+++ b/services/capture/deltacast-bridge/main.c
@ -24,7 +24,7 @@
 *
 * For each port that acquires signal, emits one JSON line to stderr:
 *   {"port":N,"width":W,"height":H,"fps_num":N,"fps_den":D,
- *    "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":16,
+ *    "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":2,
 *    "slot_id":"deltacast-<device>-<port>"}
 *
 * Compile with -DLEGACY_FIFO=1 to disable shm writes and fall back to
@ -198,14 +198,8 @@ static void *audio_thread(void *arg) {
    PortState *ps = (PortState *)arg;
    const int    AUDIO_RATE    = 48000;
-    /* The bridge ALWAYS captures the full 16 embedded channels (4 SDI audio
+    const int    CHANNELS      = 2;
-     * groups × 1 stereo pair each). Per-recorder channel selection (keep first
+    const size_t FRAME_BYTES   = (size_t)CHANNELS * 2; /* s16le stereo */
     * N) happens downstream in the capture ffmpeg via a channelmap — the bridge
     * publishes one consistent 16ch s16le interleaved stream per port so a
     * single FIFO serves every consumer regardless of how many channels they
     * want. */
    enum { GROUPS = 4, CH_PER_GROUP = 2, CHANNELS = GROUPS * CH_PER_GROUP }; /* = 16 */
    const size_t FRAME_BYTES = (size_t)CHANNELS * 2; /* s16le, 16ch */
    int fps_num = ps->vi.fps_num > 0 ? ps->vi.fps_num : 25;
    int fps_den = ps->vi.fps_den > 0 ? ps->vi.fps_den : 1;
    long samples_per_frame = ((long)AUDIO_RATE * fps_den + fps_num / 2) / fps_num;
@ -215,17 +209,10 @@ static void *audio_thread(void *arg) {
    ULONG max_samples = VHD_GetNbSamples((VHD_VIDEOSTANDARD)ps->video_std,
                                         (VHD_CLOCKDIVISOR)ps->clock_div,
                                         VHD_ASR_48000, 0);
    /* Per-group capture buffer (2ch packed s16le) — one per SDI audio group.
     * Sized for the SDK's stereo block size; we extract each group into its
     * own gbuf[g] then interleave the 4 groups into the 16ch out buffer. */
    ULONG block_size  = VHD_GetBlockSize(VHD_AF_16, VHD_AM_STEREO);
-    size_t gbuf_sz    = ((size_t)max_samples + 64) * (block_size ? block_size : 4);
+    size_t vhd_buf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : FRAME_BYTES);
-    unsigned char *gbuf[GROUPS];
+    size_t buf_sz     = vhd_buf_sz > tick_bytes ? vhd_buf_sz : tick_bytes;
-    for (int g = 0; g < GROUPS; g++) { gbuf[g] = calloc(1, gbuf_sz); if (!gbuf[g]) return NULL; }
+    unsigned char *buf = calloc(1, buf_sz);
    /* Interleaved 16ch output buffer (and the silence buffer reuses it). */
    size_t out_cap  = (size_t)(max_samples + 64) * FRAME_BYTES;
    if (out_cap < tick_bytes) out_cap = tick_bytes;
    unsigned char *buf = calloc(1, out_cap);
    if (!buf) return NULL;
    /* Open the VHD audio stream once for the lifetime of the bridge.
@ -250,18 +237,14 @@ static void *audio_thread(void *arg) {
        VHD_SetStreamProperty(stream, VHD_CORE_SP_TRANSFER_SCHEME,  VHD_TRANSFER_SLAVED);
        VHD_SetStreamProperty(stream, VHD_SDI_SP_INTERFACE,         iface);
-        /* Configure all 4 audio groups as stereo pairs. Each group's packed
+        /* Configure BOTH channels of the stereo pair (group 0). The actual PCM
-         * L/R s16le samples land in pAudioGroups[g].pAudioChannels[0].pData;
+         * samples land in pAudioChannels[0].pData (packed L/R s16le). Channel
-         * channel [1] must still declare Mode+BufferFormat so the SDK
+         * [1] must declare Mode+BufferFormat so the SDK recognizes the pair. */
-         * recognizes the pair. Groups with no embedded audio simply return 0
+        ai.pAudioGroups[0].pAudioChannels[0].Mode         = VHD_AM_STEREO;
-         * samples and are zero-filled during interleave. */
+        ai.pAudioGroups[0].pAudioChannels[0].BufferFormat = VHD_AF_16;
-        for (int g = 0; g < GROUPS; g++) {
+        ai.pAudioGroups[0].pAudioChannels[0].pData        = buf;
-            ai.pAudioGroups[g].pAudioChannels[0].Mode         = VHD_AM_STEREO;
+        ai.pAudioGroups[0].pAudioChannels[1].Mode         = VHD_AM_STEREO;
-            ai.pAudioGroups[g].pAudioChannels[0].BufferFormat = VHD_AF_16;
+        ai.pAudioGroups[0].pAudioChannels[1].BufferFormat = VHD_AF_16;
            ai.pAudioGroups[g].pAudioChannels[0].pData        = gbuf[g];
            ai.pAudioGroups[g].pAudioChannels[1].Mode         = VHD_AM_STEREO;
            ai.pAudioGroups[g].pAudioChannels[1].BufferFormat = VHD_AF_16;
        }
        if (VHD_StartStream(stream) == VHDERR_NOERROR) {
            have_vhd_audio = 1;
@ -315,47 +298,10 @@ static void *audio_thread(void *arg) {
                 * stream length diverge from the video stream length. */
                r = VHD_LockSlotHandle(stream, &slot);
                if (r == VHDERR_NOERROR) {
-                    /* Ask the SDK for up to gbuf_sz bytes per group. After
+                    ai.pAudioGroups[0].pAudioChannels[0].DataSize = (ULONG)buf_sz;
                     * extraction each group's DataSize holds the bytes actually
                     * written (2ch s16le). Group 0 paces the frame count; groups
                     * with no audio report 0 and are zero-filled. */
                    for (int g = 0; g < GROUPS; g++)
                        ai.pAudioGroups[g].pAudioChannels[0].DataSize = (ULONG)gbuf_sz;
                    if (VHD_SlotExtractAudio(slot, &ai) == VHDERR_NOERROR) {
-                        /* Group 0 is the AUTHORITATIVE sample count — it paces the
+                        ULONG sz = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
-                         * audio timeline in lockstep with video (same SDI slot
+                        if (sz > 0 && (size_t)sz <= buf_sz) out_bytes = (size_t)sz;
                         * clock), exactly as the original 2ch path did. We must
                         * emit EXACTLY group 0's frame count per slot; taking a
                         * max across groups would occasionally emit extra frames
                         * and make the audio stream drift LONGER than the video
                         * (heard as a slight pitch-up). Groups 1-3 are sampled at
                         * the same rate; any that return fewer bytes are padded
                         * with silence to group 0's length, never extending it. */
                        ULONG g0 = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
                        size_t frames = (size_t)g0 / 4;   /* 2ch * s16 = 4 bytes/frame */
                        if (frames > 0) {
                            size_t need = frames * FRAME_BYTES;
                            if (need > out_cap) { frames = out_cap / FRAME_BYTES; need = frames * FRAME_BYTES; }
                            /* Interleave: for each sample frame, emit the 2
                             * samples of each group in order → 16ch frame
                             * [G0L G0R G1L G1R G2L G2R G3L G3R]. Groups shorter
                             * than `frames` (or absent) contribute silence. */
                            int16_t *out = (int16_t *)buf;
                            for (size_t f = 0; f < frames; f++) {
                                for (int g = 0; g < GROUPS; g++) {
                                    size_t gframes = (size_t)ai.pAudioGroups[g].pAudioChannels[0].DataSize / 4;
                                    const int16_t *gs = (const int16_t *)gbuf[g];
                                    if (f < gframes) {
                                        out[f * CHANNELS + g * 2 + 0] = gs[f * 2 + 0];
                                        out[f * CHANNELS + g * 2 + 1] = gs[f * 2 + 1];
                                    } else {
                                        out[f * CHANNELS + g * 2 + 0] = 0;
                                        out[f * CHANNELS + g * 2 + 1] = 0;
                                    }
                                }
                            }
                            out_bytes = need;
                        }
                    }
                    VHD_UnlockSlotHandle(slot);
@ -414,7 +360,6 @@ static void *audio_thread(void *arg) {
        VHD_CloseStreamHandle(stream);
    }
    free(buf);
    for (int g = 0; g < GROUPS; g++) free(gbuf[g]);
    return NULL;
 }
@ -815,7 +760,7 @@ int main(int argc, char *argv[]) {
            "\"fps_num\":%d,\"fps_den\":%d,"
            "\"interlaced\":%s,"
            "\"pix_fmt\":\"uyvy422\","
-            "\"audio_channels\":16,\"audio_rate\":48000,"
+            "\"audio_channels\":2,\"audio_rate\":48000,"
            "\"device\":%u,"
            "\"slot_id\":\"%s\"}\n",
            ports[pi],
--- a/services/capture/src/capture-manager.js
+++ b/services/capture/src/capture-manager.js
@ -693,18 +693,7 @@ class CaptureManager {
      const fcFps        = process.env.DELTACAST_FRAMERATE  || '60000/1001';
      const fcInterlaced = process.env.DELTACAST_INTERLACED === '1';
-      // The deltacast bridge now publishes a fixed 16-channel s16le stream per
+      console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath}`);
      // port (all 4 SDI audio groups). The recorder selects how many of those
      // channels to keep in the master — RECORDING_AUDIO_CHANNELS (2/8/16),
      // injected by node-agent from the recorder config. We declare the FIFO as
      // 16ch on input and KEEP THE FIRST N discrete channels downstream (no
      // downmix) via an audio channel-map on the encode output.
      const FIFO_CHANNELS = 16;
      let wantCh = parseInt(process.env.RECORDING_AUDIO_CHANNELS || '2', 10);
      if (!Number.isFinite(wantCh) || wantCh < 1) wantCh = 2;
      if (wantCh > FIFO_CHANNELS) wantCh = FIFO_CHANNELS;
      console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath} ch=${wantCh}/${FIFO_CHANNELS}`);
      // Spawn fc_pipe: opens the framecache slot with its own read cursor and
      // streams raw UYVY422 frames to stdout. ffmpeg reads from the pipe as
@ -724,37 +713,21 @@ class CaptureManager {
      return {
        inputArgs: [
-          // fc_pipe stdout → ffmpeg rawvideo input 0 (video)
+          // fc_pipe stdout → ffmpeg rawvideo input 0 (video).
          // DO NOT use -use_wallclock_as_timestamps here. The framecache ring
          // delivers frame-accurate 60fps from the SDI clock, so -framerate
          // produces correct CFR timestamps from frame 0, immune to ffmpeg
          // startup jitter and NVENC cold-start. Wallclock timestamping caused
          // wrong framerate in the recorded file (e.g. 56.06 instead of 59.94)
          // because arrival-time jitter at ffmpeg startup skewed the PTS.
          '-thread_queue_size', '512',
          '-f', 'rawvideo',
          '-pix_fmt',    'uyvy422',
          '-video_size', fcSize,
          '-framerate',  fcFps,
          '-i', 'pipe:0',
-          // Audio FIFO → ffmpeg input 1. Wall-clock timestamps on the audio
+          // Audio FIFO → ffmpeg input 1. The deltacast bridge writes a 2ch s16le
-          // input are REQUIRED for throughput: without them ffmpeg's audio
+          // 48kHz stream paced by the SDI slot clock (same clock as the video),
-          // reader has no rate reference on the raw s16le FIFO and the demux
+          // so wallclock timestamps + master aresample=async=1 keep A/V locked.
          // thread stalls the whole graph (NVENC sat idle at 9% while frames
          // dropped). With wallclock, audio is paced by arrival and the master
          // -af aresample=async=1 resamples it onto the video CFR timeline so
          // A/V length stays locked. The residual ~1% drift that wallclock used
          // to cause was actually the all-intra HEVC dropping frames (video
          // short); that's fixed by long-GOP HEVC for non-growing records, so
          // wallclock is safe again and necessary.
          // The FIFO carries the full 16ch the bridge publishes; channel
          // SELECTION (keep first N) is applied as an output filter so the
          // discrete broadcast channels are preserved, not downmixed.
          '-use_wallclock_as_timestamps', '1',
          '-thread_queue_size', '512',
          '-f', 's16le',
          '-ar', '48000',
-          '-ac', String(FIFO_CHANNELS),
+          '-ac', '2',
          '-i', audioFifoPath,
        ],
        isNetwork:      false,
@ -762,11 +735,6 @@ class CaptureManager {
        audioFifo:      null,
        interlaced:     fcInterlaced,
        audioInputIndex: 1,             /* audio FIFO is ffmpeg input 1 */
        // Number of source channels available on the FIFO, and how many the
        // recorder wants kept (first N). The encode builder turns wantCh into a
        // channelmap so the master holds exactly those discrete channels.
        sourceAudioChannels: FIFO_CHANNELS,
        wantAudioChannels:   wantCh,
        _fcPipeProcess: fcPipeProcess,  /* stored for clean stop */
      };
    }
@ -1058,25 +1026,10 @@ exit "$BMXRC"
    this._sessionIdForBridge = sessionId;
    const { inputArgs, isNetwork, bridgeProcess = null, audioFifo = null, interlaced = false, audioInputIndex = 0,
-            sourceAudioChannels = null, wantAudioChannels = null } = await this._buildInputArgs({
+          } = await this._buildInputArgs({
      sourceType, sourceBackend, device, port, board, sourceUrl, listen, listenPort, streamKey,
    });
    // Channel selection for the master: when the source FIFO carries more
    // discrete channels than the recorder wants (e.g. 16ch SDI → 2ch master),
    // keep the FIRST N channels as discrete streams (no downmix) via a `pan`
    // filter `c0=c0|c1=c1|…`. effAudioChannels is what the master container
    // actually holds and what `-ac` must declare.
    const effAudioChannels = (sourceAudioChannels && wantAudioChannels)
      ? Math.min(wantAudioChannels, sourceAudioChannels)
      : audioChannels;
    const needChannelSelect = !!(sourceAudioChannels && wantAudioChannels && wantAudioChannels < sourceAudioChannels);
    const channelSelectFilter = needChannelSelect
      ? `pan=${effAudioChannels}c|` + Array.from({ length: effAudioChannels }, (_, i) => `c${i}=c${i}`).join('|')
      : null;
    // Override the codec channel count so -ac matches the selected layout.
    if (sourceAudioChannels && wantAudioChannels) audioChannels = effAudioChannels;
    // ── Pre-roll: discard initial unstable frames ────────────────────────────
    if (bridgeProcess && (sourceType === 'deltacast' || sourceType === 'blackmagic' || sourceType === 'sdi')) {
      console.log(`[capture] pre-rolling: discarding ${PRE_ROLL_SECONDS}s of frames`);
@ -1194,18 +1147,11 @@ exit "$BMXRC"
        // ffmpeg doesn't fail trying to map a nonexistent audio stream.
        const hasAudio = audioInputIndex >= 0 && !isNetFcPipe;
        const masterAudioMap = hasAudio ? ['-map', audioMap] : [];
-        // Master audio: optional first-N channel select (discrete, no downmix),
+        const masterAudioFilter = hasAudio
-        // then async resample to lock A/V sync. Chain both into one -af.
+          ? ['-af', 'aresample=async=1:min_hard_comp=0.100000:first_pts=0'] : [];
        const masterFilterChain = [
          ...(channelSelectFilter ? [channelSelectFilter] : []),
          'aresample=async=1:min_hard_comp=0.100000:first_pts=0',
        ].join(',');
        const masterAudioFilter = hasAudio ? ['-af', masterFilterChain] : [];
        const hlsAudioMap = hasAudio ? ['-map', audioMap] : [];
        // HLS preview is always stereo for browser playback — downmix the first
        // pair regardless of how many channels the master keeps.
        const hlsAudioCodec = hasAudio
-          ? ['-af', 'pan=stereo|c0=c0|c1=c1', '-c:a', 'aac', '-b:a', '128k', '-ar', '44100', '-ac', '2'] : [];
+          ? ['-c:a', 'aac', '-b:a', '128k', '-ar', '44100'] : [];
        hiresArgs = [
          ...inputArgs,
          '-filter_complex', filterStr,