fix(bridge): JOINED single-slot capture — embed audio with each video frame

Re-architect the Deltacast bridge from two independently-buffered VHD streams (DISJOINED_VIDEO + a separate DISJOINED_ANC audio stream) to a single VHD_SDI_STPROC_JOINED RX stream per port. Each locked slot now carries both the video frame and that frame's SDI-embedded audio, so audio is extracted (VHD_SlotExtractAudio) from the SAME slot the video came from — eliminating the constant "audio ahead of video" offset at its root instead of papering over it with --audio-delay-ms. This mirrors Deltacast's own FFmpeg fork (libavdevice/videomaster_common.c): open JOINED, then per frame LockSlot -> GetSlotBuffer(VIDEO) -> SlotExtractAudio(same slot) -> Unlock. Changes (main.c): - main(): RX stream opened with VHD_SDI_STPROC_JOINED (was VHD_SDI_STPROC_DISJOINED_VIDEO). SDI interface still propagated so audio extraction yields real samples. - video_thread(): becomes the single per-frame consumer. After locking each JOINED slot it extracts the embedded audio (audio_extract_slot) and pushes interleaved s16le stereo PCM into a new lock-free SPSC ring (ApcmRing), then writes the video frame to the framecache ring (or legacy video FIFO). Audio is extracted on BOTH the framecache and legacy-FIFO paths, before the video packing check so it is never dropped on a rare packing mismatch. - audio_thread(): no longer opens any VHD stream. It is now purely the audio-FIFO sink: drains the ApcmRing into the named audio FIFO (ffmpeg input 1), flushes the ring to the live edge on reader attach, survives ffmpeg restarts (EPIPE -> reopen), and emits wall-clock-paced silence when the ring is empty (preserves the silence-fallback when the signal carries no embedded audio). - Audio stays 48 kHz stereo s16le to match ffmpeg's expectations. - --audio-delay-ms / FC_AUDIO_DELAY_MS retained but now unnecessary (should be left at 0); kept for compatibility / emergency tuning. Compiles clean (only pre-existing %lu/ULONG format warnings in main()). Not installed, not deployed — branch only.
2026-06-05 12:15:47 +00:00 · 2026-06-05 12:15:47 +00:00 · 80d8b15e8c
commit 80d8b15e8c
parent e6f1313065
1 changed files with 259 additions and 164 deletions
--- a/services/capture/deltacast-bridge/main.c
+++ b/services/capture/deltacast-bridge/main.c
@ -170,6 +170,69 @@ static int write_all(int fd, const unsigned char *p, size_t len) {
    return 0;
 }

+/* ── Embedded-audio PCM ring (SPSC) ───────────────────────────────────────
+ * JOINED architecture: the video_thread extracts the SDI-embedded audio of
+ * each frame from the SAME slot it pulls video from, and pushes that PCM into
+ * this lock-free single-producer/single-consumer ring. The audio_thread is the
+ * single consumer: it drains the ring into the named audio FIFO (ffmpeg input
+ * 1) and survives ffmpeg restarts (EPIPE → reopen) without touching the board.
+ *
+ * Why a ring instead of writing the FIFO directly from video_thread:
+ *   - open(audio_fifo, O_WRONLY) blocks until an ffmpeg reader attaches. If the
+ *     video_thread blocked on that, video capture would stall. The ring keeps
+ *     the board-paced frame loop (video + audio extract) free-running while the
+ *     FIFO lifecycle (blocking open, EPIPE reopen, silence fallback) lives in
+ *     audio_thread.
+ *   - Audio is still bound to its exact video frame because it is extracted on
+ *     the SAME slot in the SAME loop iteration → zero constant offset at root.
+ *
+ * 4 MB holds ~21 s of 48 kHz stereo s16le — far more than any FIFO hiccup. */
+/* Ring capacity in bytes. This MUST stay a power of two: w and r are
+ * free-running size_t counters reduced with `% APCM_RING_BYTES`, and that
+ * reduction only stays continuous across a size_t wrap-around when the
+ * capacity divides SIZE_MAX + 1. */
+#define APCM_RING_BYTES (4u * 1024u * 1024u)
+_Static_assert((APCM_RING_BYTES & (APCM_RING_BYTES - 1u)) == 0,
+               "APCM_RING_BYTES must be a power of two");
+
+/* Single-producer/single-consumer byte ring used to hand PCM from the
+ * video_thread (apcm_push) to the audio_thread (apcm_pop / apcm_drain).
+ * Bytes queued = w - r (unsigned difference). */
+typedef struct {
+    unsigned char    *buf;             /* APCM_RING_BYTES of storage; NULL = ring disabled */
+    _Atomic size_t    w;               /* producer write offset (monotonic), stored only by apcm_push */
+    _Atomic size_t    r;               /* consumer read offset  (monotonic), stored only by apcm_pop/apcm_drain */
+    _Atomic int       have_embedded;   /* set to 1 once a slot yielded a non-empty audio extract */
+} ApcmRing;
+
+/* Producer side (video_thread only): append n bytes of PCM to the ring.
+ *
+ *   ring  hand-off ring; a NULL ring->buf makes this a no-op
+ *   src   bytes to copy in
+ *   n     byte count; 0 is a no-op
+ *
+ * Never blocks. A chunk that does not fit in the free space is rejected
+ * whole: on overflow the NEWEST audio is dropped and what is already queued
+ * is kept, because only the consumer may advance r. */
+static void apcm_push(ApcmRing *ring, const unsigned char *src, size_t n) {
+    if (!ring->buf || n == 0) return;
+    size_t w = atomic_load_explicit(&ring->w, memory_order_relaxed);
+    size_t r = atomic_load_explicit(&ring->r, memory_order_acquire);
+    size_t used = w - r;               /* unsigned difference: wrap-safe */
+    /* Compare against the free space rather than `used + n`, so an oversized
+     * n cannot wrap the sum and slip past the check. */
+    if (used > APCM_RING_BYTES || n > APCM_RING_BYTES - used) return;
+    /* Copy in at most two contiguous pieces: up to the end of the storage,
+     * then the remainder from its start. */
+    size_t off   = w % APCM_RING_BYTES;
+    size_t first = APCM_RING_BYTES - off;
+    if (first > n) first = n;
+    memcpy(ring->buf + off, src, first);
+    memcpy(ring->buf, src + first, n - first);
+    /* Publish the new write offset only after the bytes are in place; pairs
+     * with the consumer's acquire load of w. */
+    atomic_store_explicit(&ring->w, w + n, memory_order_release);
+}
+
+/* Consumer side (audio_thread only): move up to max queued bytes into dst.
+ *
+ *   ring  hand-off ring; a NULL ring->buf yields 0
+ *   dst   destination, at least max bytes long
+ *   max   upper bound on the bytes to copy
+ *
+ * Returns the number of bytes copied (0 when nothing is queued). Never
+ * blocks. */
+static size_t apcm_pop(ApcmRing *ring, unsigned char *dst, size_t max) {
+    if (!ring->buf) return 0;
+    size_t r = atomic_load_explicit(&ring->r, memory_order_relaxed);
+    size_t w = atomic_load_explicit(&ring->w, memory_order_acquire);
+    size_t avail = w - r;              /* unsigned difference: wrap-safe */
+    size_t n = avail < max ? avail : max;
+    if (n == 0) return 0;              /* nothing to copy; r is unchanged */
+    /* Copy out in at most two contiguous pieces: up to the end of the
+     * storage, then the remainder from its start. */
+    size_t off   = r % APCM_RING_BYTES;
+    size_t first = APCM_RING_BYTES - off;
+    if (first > n) first = n;
+    memcpy(dst, ring->buf + off, first);
+    memcpy(dst + first, ring->buf, n - first);
+    /* Hand the consumed space back to the producer only after the copy. */
+    atomic_store_explicit(&ring->r, r + n, memory_order_release);
+    return n;
+}
+
+/* Consumer side: throw away whatever is queued right now by moving the read
+ * offset up to the current write offset. Used to skip a stale backlog when a
+ * fresh reader attaches. No-op when the ring has no storage. */
+static void apcm_drain(ApcmRing *ring) {
+    if (ring->buf) {
+        const size_t live_edge =
+            atomic_load_explicit(&ring->w, memory_order_acquire);
+        atomic_store_explicit(&ring->r, live_edge, memory_order_release);
+    }
+}
+
 /* ── Per-port state ───────────────────────────────────────────────────── */
 typedef struct {
    HANDLE        board;
@ -186,20 +249,37 @@ typedef struct {
    pthread_t     video_tid;
    pthread_t     audio_tid;
    /* streams (owned by threads, set before thread launch) */
-    HANDLE        video_stream;
+    HANDLE        video_stream;     /* JOINED RX stream: carries video + embedded audio */
 #ifndef LEGACY_FIFO
    fc_writer_t  *fc_writer;        /* shm ring buffer writer (NULL = use FIFO fallback) */
 #endif
+    /* JOINED embedded-audio plumbing (producer=video_thread, consumer=audio_thread) */
+    ApcmRing      apcm;             /* video_thread → audio_thread PCM hand-off */
 } PortState;

-/* ── Audio thread ──────────────────────────────────────────────────────
+/* ── Audio thread (JOINED architecture: FIFO sink, no second VHD stream) ──
 *
- *   - Opens FIFO writer (blocks until a reader connects — correct behaviour).
- *   - Feeds continuous wall-clock-paced s16le stereo (real or silence).
- *   - Best-effort VHD audio stream; silence fallback on any failure.
- *   - On EPIPE (ffmpeg reader died): closes and REOPENS the FIFO so the
- *     thread survives an ffmpeg restart without bringing down other ports.
+ * In the JOINED re-architecture the board is opened with ONE RX stream per
+ * port (VHD_SDI_STPROC_JOINED). The video_thread locks each slot and extracts
+ * BOTH the video frame and that frame's SDI-EMBEDDED audio from the SAME slot,
+ * pushing the interleaved s16le stereo PCM into ps->apcm. Because the audio
+ * is the embedded audio of the exact frame, it is inherently sync'd with that
+ * frame — zero constant offset at the root (no separate DISJOINED_ANC stream,
+ * no independent buffer queue racing ahead of video).
+ *
+ * This thread NO LONGER opens a VHD stream. Its sole job is FIFO lifecycle:
+ *   - Open the named audio FIFO (blocks until ffmpeg input 1 attaches).
+ *   - On reader attach, flush the ring backlog to the LIVE edge.
+ *   - Drain ps->apcm → FIFO. When the ring is momentarily empty, emit
+ *     wall-clock-paced silence so ffmpeg input 1 never starves (also the
+ *     silence-fallback when the signal carries no embedded audio at all).
+ *   - On EPIPE (ffmpeg reader died): close and REOPEN the FIFO so the thread
+ *     survives an ffmpeg restart without bringing down other ports.
 *     EPIPE never sets g_stop — only SIGTERM/SIGINT does that.
+ *
+ * The legacy --audio-delay-ms knob is still honoured (prepended once on reader
+ * attach) but should be UNNECESSARY now that audio rides with its frame; leave
+ * it at the default 0.
 */
 static void *audio_thread(void *arg) {
    PortState *ps = (PortState *)arg;
@ -213,61 +293,15 @@ static void *audio_thread(void *arg) {
    if (samples_per_frame < 1) samples_per_frame = 1;
    size_t tick_bytes = (size_t)samples_per_frame * FRAME_BYTES;

-    ULONG max_samples = VHD_GetNbSamples((VHD_VIDEOSTANDARD)ps->video_std,
-                                         (VHD_CLOCKDIVISOR)ps->clock_div,
-                                         VHD_ASR_48000, 0);
-    ULONG block_size  = VHD_GetBlockSize(VHD_AF_16, VHD_AM_STEREO);
-    size_t vhd_buf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : FRAME_BYTES);
-    size_t buf_sz     = vhd_buf_sz > tick_bytes ? vhd_buf_sz : tick_bytes;
+    long frame_ns = (long)(1000000000.0 * (double)fps_den / (double)fps_num);
+
+    /* Scratch buffer: large enough for a generous burst pulled from the ring
+     * in one go (several frames of audio) plus the per-tick silence buffer. */
+    size_t buf_sz = tick_bytes * 8;
+    if (buf_sz < 65536) buf_sz = 65536;
    unsigned char *buf = calloc(1, buf_sz);
    if (!buf) return NULL;

-    /* Open the VHD audio stream once for the lifetime of the bridge.
-     * The stream stays open across reader reconnects — no need to reopen it. */
-    HANDLE stream = NULL;
-    int have_vhd_audio = 0;
-    VHD_AUDIOINFO ai;
-    memset(&ai, 0, sizeof(ai));
-
-    ULONG r = VHD_OpenStreamHandle(ps->board, rx_streamtype(ps->port),
-                                   VHD_SDI_STPROC_DISJOINED_ANC,
-                                   NULL, &stream, NULL);
-    if (r == VHDERR_NOERROR) {
-        /* Per Deltacast SDK Sample_RXAudio.cpp: VHD_SDI_SP_INTERFACE must be
-         * propagated to the audio stream, otherwise VHD_SlotExtractAudio
-         * returns 0 samples (silent capture). */
-        ULONG iface = 0;
-        VHD_GetStreamProperty(stream, VHD_SDI_SP_INTERFACE, &iface);
-
-        VHD_SetStreamProperty(stream, VHD_SDI_SP_VIDEO_STANDARD,    ps->video_std);
-        VHD_SetStreamProperty(stream, VHD_SDI_SP_CLOCK_SYSTEM,      ps->clock_div);
-        VHD_SetStreamProperty(stream, VHD_CORE_SP_TRANSFER_SCHEME,  VHD_TRANSFER_SLAVED);
-        VHD_SetStreamProperty(stream, VHD_SDI_SP_INTERFACE,         iface);
-
-        /* Configure BOTH channels of the stereo pair (group 0). The actual PCM
-         * samples land in pAudioChannels[0].pData (packed L/R s16le). Channel
-         * [1] must declare Mode+BufferFormat so the SDK recognizes the pair. */
-        ai.pAudioGroups[0].pAudioChannels[0].Mode         = VHD_AM_STEREO;
-        ai.pAudioGroups[0].pAudioChannels[0].BufferFormat = VHD_AF_16;
-        ai.pAudioGroups[0].pAudioChannels[0].pData        = buf;
-        ai.pAudioGroups[0].pAudioChannels[1].Mode         = VHD_AM_STEREO;
-        ai.pAudioGroups[0].pAudioChannels[1].BufferFormat = VHD_AF_16;
-
-        if (VHD_StartStream(stream) == VHDERR_NOERROR) {
-            have_vhd_audio = 1;
-        } else {
-            fprintf(stderr, "[audio:%u] VHD_StartStream failed — feeding silence\n", ps->port);
-            VHD_CloseStreamHandle(stream);
-            stream = NULL;
-        }
-    } else {
-        fprintf(stderr, "[audio:%u] VHD_OpenStreamHandle failed (%lu) — feeding silence\n",
-                ps->port, r);
-    }
-
-    long frame_ns = (long)(1000000000.0 * (double)fps_den / (double)fps_num);
-    HANDLE slot = NULL;
-
    /* Outer loop: reopen the FIFO writer each time a reader connects.
     * This allows the bridge to survive ffmpeg session stop/restart on a port
     * without affecting any other port's threads. */
@ -283,55 +317,21 @@ static void *audio_thread(void *arg) {
        }
        fcntl(fd, F_SETPIPE_SZ, 1024 * 1024);

-        /* ── Flush the VHD audio slot backlog to the LIVE edge ──────────────
+        /* ── Flush the embedded-audio ring backlog to the LIVE edge ─────────
         * While no reader is attached (recorder idle/standby), the open() above
-         * blocks but the VHD audio stream keeps running, so its internal slot
-         * queue fills with buffered audio. Without flushing, the first thing a
-         * newly-attached reader (the record ffmpeg) receives is that backlog —
-         * several seconds of stale/sync-warmup audio that plays as leading
-         * silence and pushes the audio stream out of alignment with the live
-         * video. Drain all immediately-available slots (non-blocking via the
-         * SDK timeout) so we hand the reader the LIVE edge, frame-aligned with
-         * the video that fc_pipe is delivering right now. */
-        if (have_vhd_audio) {
-            /* Drain the QUEUED backlog only: keep discarding slots while each
-             * lock returns FAST (the board hands back already-buffered slots in
-             * well under a frame period). The first lock that takes ~a full frame
-             * period means the queue is empty and we're now waiting on a LIVE
-             * slot — at that point we've reached the live edge, so stop WITHOUT
-             * consuming it (the inner loop will pick it up and write it). */
-            const long fast_ns = frame_ns / 2;     /* "immediate" threshold */
-            int flushed = 0;
-            for (;;) {
-                struct timespec a, b;
-                clock_gettime(CLOCK_MONOTONIC, &a);
-                HANDLE fslot = NULL;
-                ULONG fr = VHD_LockSlotHandle(stream, &fslot);
-                clock_gettime(CLOCK_MONOTONIC, &b);
-                if (fr != VHDERR_NOERROR) break;   /* TIMEOUT/error => drained */
-                long lock_ns = (b.tv_sec - a.tv_sec) * 1000000000L + (b.tv_nsec - a.tv_nsec);
-                VHD_UnlockSlotHandle(fslot);
-                if (lock_ns >= fast_ns) break;     /* waited for a live slot => stop */
-                if (++flushed > 8192) break;       /* hard safety cap */
-            }
-            if (flushed > 0)
-                fprintf(stderr, "[audio:%u] flushed %d stale slots on reader attach\n",
-                        ps->port, flushed);
-        }
-
-        /* Reset wall-clock baseline after potentially blocking on open().
-         * Only used for the SILENCE fallback path (no hardware audio). */
-        struct timespec next;
-        clock_gettime(CLOCK_MONOTONIC, &next);
+         * blocks but the video_thread keeps free-running and pushing the
+         * embedded audio of every live frame into ps->apcm. Without flushing,
+         * the first thing a newly-attached reader (the record ffmpeg) receives
+         * is that backlog — seconds of stale audio that plays as leading
+         * mis-sync. Discard everything queued so we hand the reader the LIVE
+         * edge, frame-aligned with the video fc_pipe is delivering right now. */
+        apcm_drain(&ps->apcm);

        /* ── Fixed A/V alignment: prepend g_audio_delay_ms of leading silence ──
-         * The video path is buffered deeper than this audio FIFO, so audio would
-         * otherwise arrive at the muxer ahead of its matching video frame. Writing
-         * N ms of silence here (once, right after reaching the live edge) shifts
-         * the entire audio timeline N ms LATER, re-aligning it with video. The
-         * samples are real PCM zeros at 48 kHz so they consume exactly N ms of the
-         * audio timeline — ffmpeg derives audio PTS from sample count, so this is a
-         * precise, drift-free delay. */
+         * Retained for compatibility; with JOINED capture audio already rides
+         * with its frame so this should stay 0. When set, the real PCM zeros at
+         * 48 kHz consume exactly N ms of the audio timeline (ffmpeg derives
+         * audio PTS from sample count) — a precise, drift-free shift. */
        if (g_audio_delay_ms > 0) {
            long delay_samples = (long)AUDIO_RATE * g_audio_delay_ms / 1000;
            size_t delay_bytes = (size_t)delay_samples * FRAME_BYTES;
@ -349,61 +349,49 @@ static void *audio_thread(void *arg) {
                        ps->port, g_audio_delay_ms, delay_samples);
        }

-        /* Inner loop: feed audio into the open FIFO until reader exits (EPIPE). */
+        /* Wall-clock baseline for the silence-fill cadence. */
+        struct timespec next;
+        clock_gettime(CLOCK_MONOTONIC, &next);
+
+        /* Inner loop: drain the ring into the FIFO until the reader exits.
+         *
+         * Pacing model:
+         *   - Whenever the ring has embedded PCM, write ALL of it immediately.
+         *     The producer (video_thread) is paced by the board's JOINED slot
+         *     cadence = the true SDI clock, so the volume of bytes the ring
+         *     accumulates per unit time exactly tracks video. We never pad or
+         *     resample it, so the audio timeline length matches video length
+         *     (no progressive drift).
+         *   - When the ring is empty for a whole frame interval (no embedded
+         *     audio on the signal, or a brief gap), emit exactly one frame of
+         *     silence, wall-clock paced, so ffmpeg input 1 never starves. */
+        int wrote_real_since_log = 0;
        while (!atomic_load(&g_stop) && !atomic_load(&g_port_stop[ps->port])) {
-            size_t out_bytes = 0;
-
-            if (have_vhd_audio) {
-                /* HARDWARE-PACED PATH (the normal case).
-                 * VHD_LockSlotHandle blocks until the board has the next audio
-                 * slot ready — this slot is generated from the SAME SDI signal
-                 * as the video, so blocking here paces audio in lockstep with
-                 * video at the TRUE hardware rate. We write ONLY the real
-                 * samples the board gives us (no silence padding, no wall-clock
-                 * sleep) so the audio timeline length exactly tracks video.
-                 * This is the fix for progressive A/V drift: mixing wall-clock
-                 * paced silence with variable-length real reads made the audio
-                 * stream length diverge from the video stream length. */
-                r = VHD_LockSlotHandle(stream, &slot);
-                if (r == VHDERR_NOERROR) {
-                    ai.pAudioGroups[0].pAudioChannels[0].DataSize = (ULONG)buf_sz;
-                    if (VHD_SlotExtractAudio(slot, &ai) == VHDERR_NOERROR) {
-                        ULONG sz = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
-                        if (sz > 0 && (size_t)sz <= buf_sz) out_bytes = (size_t)sz;
-                    }
-                    VHD_UnlockSlotHandle(slot);
-
-                    if (out_bytes > 0) {
-                        if (write_all(fd, buf, out_bytes) < 0) {
-                            fprintf(stderr, "[audio:%u] EPIPE — waiting for next reader\n", ps->port);
-                            break;
-                        }
-                    }
-                    /* No wall-clock sleep — the board's slot cadence is the clock. */
-                    continue;
-                } else if (r == VHDERR_TIMEOUT) {
-                    /* No slot yet — loop and try again (do NOT inject silence,
-                     * that would add extra samples and cause drift). */
-                    continue;
-                } else {
-                    fprintf(stderr, "[audio:%u] lock error %lu — degrading to silence\n",
-                            ps->port, r);
-                    VHD_StopStream(stream);
-                    VHD_CloseStreamHandle(stream);
-                    stream = NULL;
-                    have_vhd_audio = 0;
-                    clock_gettime(CLOCK_MONOTONIC, &next); /* rebase silence clock */
+            size_t got = apcm_pop(&ps->apcm, buf, buf_sz);
+            if (got > 0) {
+                if (write_all(fd, buf, got) < 0) {
+                    fprintf(stderr, "[audio:%u] EPIPE — waiting for next reader\n", ps->port);
+                    break;
                }
+                if (!wrote_real_since_log &&
+                    atomic_load_explicit(&ps->apcm.have_embedded, memory_order_relaxed)) {
+                    fprintf(stderr, "[audio:%u] streaming SDI-embedded audio (JOINED slot)\n",
+                            ps->port);
+                    wrote_real_since_log = 1;
+                }
+                /* Re-baseline the silence clock so we don't burst silence right
+                 * after a real chunk; the next empty interval starts from now. */
+                clock_gettime(CLOCK_MONOTONIC, &next);
+                /* Small yield to avoid a busy spin when the ring is being fed in
+                 * sub-frame increments; the board cadence refills it promptly. */
+                struct timespec ts = {0, frame_ns / 4 > 0 ? frame_ns / 4 : 250000L};
+                nanosleep(&ts, NULL);
+                continue;
            }

-            /* SILENCE FALLBACK PATH (no hardware audio available).
-             * Wall-clock paced one-frame-of-silence per video-frame interval so
-             * ffmpeg's input 1 never starves and audio length still tracks
-             * real time. */
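+            /* Ring empty at this poll → emit one frame of silence, paced.
+             * NOTE(review): nothing visible here checks that a whole frame
+             * interval has elapsed since the last real chunk; the poll that
+             * follows the frame_ns/4 yield above can find the ring empty
+             * between two frames and interleave silence with real PCM —
+             * confirm against the pacing code below. */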
            memset(buf, 0, tick_bytes);
-            out_bytes = tick_bytes;
-
-            if (write_all(fd, buf, out_bytes) < 0) {
+            if (write_all(fd, buf, tick_bytes) < 0) {
                fprintf(stderr, "[audio:%u] EPIPE — waiting for next reader\n", ps->port);
                break;
            }
@ -423,18 +411,84 @@ static void *audio_thread(void *arg) {
        close(fd);
    }

-    if (stream) {
-        VHD_StopStream(stream);
-        VHD_CloseStreamHandle(stream);
-    }
    free(buf);
    return NULL;
 }

+/* ── Embedded-audio extraction context (used inside the JOINED video loop) ─
+ * Set up once per video_thread by audio_extract_init(), reused for every slot
+ * by audio_extract_slot(), and released by audio_extract_free(). The
+ * VHD_AUDIOINFO is configured for a single stereo pair (group 0) in s16le,
+ * exactly as the old DISJOINED_ANC audio path was — the SDK lands packed L/R
+ * s16le PCM in pAudioChannels[0].pData with the byte count in .DataSize. */
+typedef struct {
+    int            enabled;     /* 1 = scratch buffer allocated and ai configured; 0 = extraction disabled */
+    unsigned char *buf;         /* scratch PCM landing buffer (heap, owned by this context) */
+    size_t         buf_sz;      /* capacity of buf in bytes; offered as DataSize before each extract */
+    VHD_AUDIOINFO  ai;          /* descriptor handed to VHD_SlotExtractAudio; channel 0 pData points at buf */
+} AudioExtract;
+
+/* Prepare an extraction context for one port.
+ *
+ *   ax  context to initialise (fully overwritten)
+ *   ps  port state; only video_std and clock_div are read
+ *
+ * On success ax->enabled is 1 and ax->buf holds ax->buf_sz zeroed bytes.
+ * If the scratch allocation fails, ax->enabled stays 0 and extraction is
+ * simply skipped for this port. */
+static void audio_extract_init(AudioExtract *ax, PortState *ps) {
+    memset(ax, 0, sizeof(*ax));
+
+    /* Size the landing buffer from the SDK's per-frame sample count for this
+     * standard/clock, padded by 64 samples of headroom. */
+    const ULONG  frame_samples   = VHD_GetNbSamples((VHD_VIDEOSTANDARD)ps->video_std,
+                                                    (VHD_CLOCKDIVISOR)ps->clock_div,
+                                                    VHD_ASR_48000, 0);
+    const ULONG  sdk_block_bytes = VHD_GetBlockSize(VHD_AF_16, VHD_AM_STEREO);
+    const size_t stereo_s16      = (size_t)2 /*ch*/ * 2 /*s16*/;
+    const size_t per_sample      = sdk_block_bytes ? sdk_block_bytes : stereo_s16;
+
+    size_t capacity = ((size_t)frame_samples + 64) * per_sample;
+    if (capacity < 65536)
+        capacity = 65536;               /* never go below 64 KiB */
+    ax->buf_sz = capacity;
+
+    ax->buf = calloc(1, ax->buf_sz);
+    if (ax->buf == NULL) {
+        ax->enabled = 0;
+        return;
+    }
+
+    /* Stereo pair in group 0: both channels declare Mode + BufferFormat so the
+     * SDK recognises the pair (same as the legacy DISJOINED_ANC config); only
+     * channel 0 gets the data pointer, which is where the packed L/R s16le
+     * PCM lands. */
+    memset(&ax->ai, 0, sizeof(ax->ai));
+    ax->ai.pAudioGroups[0].pAudioChannels[1].BufferFormat = VHD_AF_16;
+    ax->ai.pAudioGroups[0].pAudioChannels[1].Mode         = VHD_AM_STEREO;
+    ax->ai.pAudioGroups[0].pAudioChannels[0].pData        = ax->buf;
+    ax->ai.pAudioGroups[0].pAudioChannels[0].BufferFormat = VHD_AF_16;
+    ax->ai.pAudioGroups[0].pAudioChannels[0].Mode         = VHD_AM_STEREO;
+    ax->enabled = 1;
+}
+
+/* Pull the SDI-embedded audio out of one locked slot and queue it on the
+ * port's PCM ring.
+ *
+ *   ax    extraction context from audio_extract_init()
+ *   ps    port state owning the apcm ring
+ *   slot  slot handle; the caller must still hold it locked
+ *
+ * Best-effort: a disabled context, an SDK error, a zero-length result or a
+ * result larger than the scratch buffer all push nothing, leaving the
+ * audio_thread to cover the gap with paced silence. */
+static void audio_extract_slot(AudioExtract *ax, PortState *ps, HANDLE slot) {
+    if (!ax->enabled)
+        return;
+
+    /* Offer the whole scratch buffer; the byte count is read back from the
+     * same field after the call. */
+    ax->ai.pAudioGroups[0].pAudioChannels[0].DataSize = (ULONG)ax->buf_sz;
+    if (VHD_SlotExtractAudio(slot, &ax->ai) != VHDERR_NOERROR)
+        return;
+
+    const ULONG pcm_bytes = ax->ai.pAudioGroups[0].pAudioChannels[0].DataSize;
+    if (pcm_bytes == 0 || (size_t)pcm_bytes > ax->buf_sz)
+        return;
+
+    /* Flag first, then publish the bytes. */
+    atomic_store_explicit(&ps->apcm.have_embedded, 1, memory_order_relaxed);
+    apcm_push(&ps->apcm, ax->buf, (size_t)pcm_bytes);
+}
+
+/* Release the scratch buffer and mark the context disabled. Safe to call on
+ * a context whose allocation failed (buf == NULL). */
+static void audio_extract_free(AudioExtract *ax) {
+    free(ax->buf);                      /* free(NULL) is a no-op */
+    ax->enabled = 0;
+    ax->buf = NULL;
+}
+
 /* ── Video thread ─────────────────────────────────────────────────────── */
 static void *video_thread(void *arg) {
    PortState *ps = (PortState *)arg;

+    /* JOINED: set up embedded-audio extraction once; reused for every slot. */
+    AudioExtract ax;
+    audio_extract_init(&ax, ps);
+    if (ax.enabled)
+        fprintf(stderr, "[video:%u] JOINED audio extraction armed (buf=%zu)\n",
+                ps->port, ax.buf_sz);
+    else
+        fprintf(stderr, "[video:%u] WARN: audio extract buffer alloc failed — silence only\n",
+                ps->port);
+
 #ifndef LEGACY_FIFO
    /* ── Framecache shm path (primary) ──────────────────────────────────
     * Write frames directly into the shared memory ring buffer.
@ -452,6 +506,14 @@ static void *video_thread(void *arg) {
            HANDLE slot = NULL;
            ULONG r = VHD_LockSlotHandle(ps->video_stream, &slot);
            if (r == VHDERR_NOERROR) {
+                /* ── JOINED: extract this frame's embedded audio FIRST ──────
+                 * Same locked slot as the video below ⇒ the audio is the SDI-
+                 * embedded audio of this exact frame ⇒ inherently sync'd. Push
+                 * it to the ring for audio_thread to drain to the FIFO. Done
+                 * before the video packing check so audio is never dropped on a
+                 * (rare) video packing mismatch. */
+                audio_extract_slot(&ax, ps, slot);
+
                BYTE  *buf = NULL;
                ULONG  sz  = 0;
                if (VHD_GetSlotBuffer(slot, VHD_SDI_BT_VIDEO, &buf, &sz) == VHDERR_NOERROR) {
@ -481,6 +543,7 @@ static void *video_thread(void *arg) {
                break;
            }
        }
+        audio_extract_free(&ax);
        return NULL;
    }
    /* fc_writer == NULL → fall through to FIFO path */
@ -518,6 +581,10 @@ static void *video_thread(void *arg) {
        while (!atomic_load(&g_stop) && !atomic_load(&g_port_stop[ps->port])) {
            ULONG r = VHD_LockSlotHandle(ps->video_stream, &slot);
            if (r == VHDERR_NOERROR) {
+                /* JOINED: extract this frame's embedded audio on the SAME slot
+                 * (before the video packing check so audio is never dropped). */
+                audio_extract_slot(&ax, ps, slot);
+
                BYTE  *buf = NULL;
                ULONG  sz  = 0;
                if (VHD_GetSlotBuffer(slot, VHD_SDI_BT_VIDEO, &buf, &sz) == VHDERR_NOERROR) {
@ -550,6 +617,7 @@ static void *video_thread(void *arg) {
        if (fatal) break;
    }

+    audio_extract_free(&ax);
    return NULL;
 }

@ -796,13 +864,22 @@ int main(int argc, char *argv[]) {
        }
 #endif

-        /* Open video stream. */
+        /* Open the RX stream in JOINED processing mode.
+         *
+         * JOINED (vs. the old DISJOINED_VIDEO + a separate DISJOINED_ANC audio
+         * stream) means a single stream delivers slots that carry BOTH the
+         * video frame AND its SDI-embedded ancillary audio. The video_thread
+         * locks each slot once and pulls video (VHD_GetSlotBuffer) and that
+         * frame's audio (VHD_SlotExtractAudio) from the SAME slot, so audio is
+         * inherently frame-synchronised — eliminating the constant "audio ahead
+         * of video" offset that two independently-buffered streams produced.
+         * (Pattern per Deltacast's own FFmpeg fork: libavdevice/videomaster_common.c.) */
        HANDLE vs = NULL;
        ULONG r = VHD_OpenStreamHandle(board, rx_streamtype(ports[pi]),
-                                       VHD_SDI_STPROC_DISJOINED_VIDEO,
+                                       VHD_SDI_STPROC_JOINED,
                                       NULL, &vs, NULL);
        if (r != VHDERR_NOERROR) {
-            fprintf(stderr, "{\"error\":\"VHD_OpenStreamHandle video failed port %u rc=%lu\"}\n",
+            fprintf(stderr, "{\"error\":\"VHD_OpenStreamHandle JOINED failed port %u rc=%lu\"}\n",
                    ports[pi], r);
            continue;
        }
@ -847,10 +924,24 @@ int main(int argc, char *argv[]) {
            p->slot_id);
        fflush(stderr);

-        /* Launch audio thread (blocks until reader connects to audio FIFO). */
+        /* Allocate the embedded-audio hand-off ring BEFORE launching threads.
+         * Producer = video_thread (JOINED slot extract), consumer = audio_thread
+         * (FIFO sink). If allocation fails the bridge still runs video + paced
+         * silence audio (apcm_push/pop are no-ops on a NULL/empty ring). */
+        p->apcm.buf = calloc(1, APCM_RING_BYTES);
+        atomic_store(&p->apcm.w, 0);
+        atomic_store(&p->apcm.r, 0);
+        atomic_store(&p->apcm.have_embedded, 0);
+        if (!p->apcm.buf)
+            fprintf(stderr, "[port:%u] WARN: apcm ring alloc failed — audio will be silence\n",
+                    ports[pi]);
+
+        /* Launch audio thread (FIFO sink: drains apcm ring → audio FIFO,
+         * blocks until reader connects; paced silence when ring empty). */
        pthread_create(&p->audio_tid, NULL, audio_thread, p);

-        /* Launch video thread (blocks until reader connects to video FIFO). */
+        /* Launch video thread (JOINED: video → framecache/FIFO AND embedded
+         * audio → apcm ring; blocks until a video reader connects in FIFO mode). */
        pthread_create(&p->video_tid, NULL, video_thread, p);

        active_count++;
@ -880,6 +971,10 @@ int main(int argc, char *argv[]) {
            ps[i].fc_writer = NULL;
        }
 #endif
+        if (ps[i].apcm.buf) {
+            free(ps[i].apcm.buf);
+            ps[i].apcm.buf = NULL;
+        }
    }
    VHD_CloseBoardHandle(board);