fix(capture): revert 16ch audio to clean 2ch — fixes pitch/rate regression

The 16ch interleave in the deltacast bridge produced audio at HALF the correct
sample rate (measured 24224 vs 48000 samples/s/ch), which broke A/V sync and
pitch. Audio was clean before the channel selector was introduced, so this
commit restores that working baseline: the bridge audio thread goes back to the
original single-group 2ch extraction, and the capture-manager audio input goes
back to `-ac 2` with `-use_wallclock_as_timestamps 1` and the master
`aresample=async=1` filter.

KEPT the good fixes: long-GOP HEVC for non-growing records (NVENC runs in
realtime, no frame drops) and the GPU-only codec list. 16ch capture and channel
selection are shelved for a separate, properly validated change.
This commit is contained in:
Zac Gaetano 2026-06-04 04:33:34 +00:00
parent b28393eb76
commit fffb6b63b5
2 changed files with 28 additions and 137 deletions

View file

@ -24,7 +24,7 @@
* *
* For each port that acquires signal, emits one JSON line to stderr: * For each port that acquires signal, emits one JSON line to stderr:
* {"port":N,"width":W,"height":H,"fps_num":N,"fps_den":D, * {"port":N,"width":W,"height":H,"fps_num":N,"fps_den":D,
* "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":16, * "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":2,
* "slot_id":"deltacast-<device>-<port>"} * "slot_id":"deltacast-<device>-<port>"}
* *
* Compile with -DLEGACY_FIFO=1 to disable shm writes and fall back to * Compile with -DLEGACY_FIFO=1 to disable shm writes and fall back to
@ -198,14 +198,8 @@ static void *audio_thread(void *arg) {
PortState *ps = (PortState *)arg; PortState *ps = (PortState *)arg;
const int AUDIO_RATE = 48000; const int AUDIO_RATE = 48000;
/* The bridge ALWAYS captures the full 16 embedded channels (4 SDI audio const int CHANNELS = 2;
* groups × 1 stereo pair each). Per-recorder channel selection (keep first const size_t FRAME_BYTES = (size_t)CHANNELS * 2; /* s16le stereo */
* N) happens downstream in the capture ffmpeg via a channelmap — the bridge
* publishes one consistent 16ch s16le interleaved stream per port so a
* single FIFO serves every consumer regardless of how many channels they
* want. */
enum { GROUPS = 4, CH_PER_GROUP = 2, CHANNELS = GROUPS * CH_PER_GROUP }; /* = 16 */
const size_t FRAME_BYTES = (size_t)CHANNELS * 2; /* s16le, 16ch */
int fps_num = ps->vi.fps_num > 0 ? ps->vi.fps_num : 25; int fps_num = ps->vi.fps_num > 0 ? ps->vi.fps_num : 25;
int fps_den = ps->vi.fps_den > 0 ? ps->vi.fps_den : 1; int fps_den = ps->vi.fps_den > 0 ? ps->vi.fps_den : 1;
long samples_per_frame = ((long)AUDIO_RATE * fps_den + fps_num / 2) / fps_num; long samples_per_frame = ((long)AUDIO_RATE * fps_den + fps_num / 2) / fps_num;
@ -215,17 +209,10 @@ static void *audio_thread(void *arg) {
ULONG max_samples = VHD_GetNbSamples((VHD_VIDEOSTANDARD)ps->video_std, ULONG max_samples = VHD_GetNbSamples((VHD_VIDEOSTANDARD)ps->video_std,
(VHD_CLOCKDIVISOR)ps->clock_div, (VHD_CLOCKDIVISOR)ps->clock_div,
VHD_ASR_48000, 0); VHD_ASR_48000, 0);
/* Per-group capture buffer (2ch packed s16le) — one per SDI audio group.
* Sized for the SDK's stereo block size; we extract each group into its
* own gbuf[g] then interleave the 4 groups into the 16ch out buffer. */
ULONG block_size = VHD_GetBlockSize(VHD_AF_16, VHD_AM_STEREO); ULONG block_size = VHD_GetBlockSize(VHD_AF_16, VHD_AM_STEREO);
size_t gbuf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : 4); size_t vhd_buf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : FRAME_BYTES);
unsigned char *gbuf[GROUPS]; size_t buf_sz = vhd_buf_sz > tick_bytes ? vhd_buf_sz : tick_bytes;
for (int g = 0; g < GROUPS; g++) { gbuf[g] = calloc(1, gbuf_sz); if (!gbuf[g]) return NULL; } unsigned char *buf = calloc(1, buf_sz);
/* Interleaved 16ch output buffer (and the silence buffer reuses it). */
size_t out_cap = (size_t)(max_samples + 64) * FRAME_BYTES;
if (out_cap < tick_bytes) out_cap = tick_bytes;
unsigned char *buf = calloc(1, out_cap);
if (!buf) return NULL; if (!buf) return NULL;
/* Open the VHD audio stream once for the lifetime of the bridge. /* Open the VHD audio stream once for the lifetime of the bridge.
@ -250,18 +237,14 @@ static void *audio_thread(void *arg) {
VHD_SetStreamProperty(stream, VHD_CORE_SP_TRANSFER_SCHEME, VHD_TRANSFER_SLAVED); VHD_SetStreamProperty(stream, VHD_CORE_SP_TRANSFER_SCHEME, VHD_TRANSFER_SLAVED);
VHD_SetStreamProperty(stream, VHD_SDI_SP_INTERFACE, iface); VHD_SetStreamProperty(stream, VHD_SDI_SP_INTERFACE, iface);
/* Configure all 4 audio groups as stereo pairs. Each group's packed /* Configure BOTH channels of the stereo pair (group 0). The actual PCM
* L/R s16le samples land in pAudioGroups[g].pAudioChannels[0].pData; * samples land in pAudioChannels[0].pData (packed L/R s16le). Channel
* channel [1] must still declare Mode+BufferFormat so the SDK * [1] must declare Mode+BufferFormat so the SDK recognizes the pair. */
* recognizes the pair. Groups with no embedded audio simply return 0 ai.pAudioGroups[0].pAudioChannels[0].Mode = VHD_AM_STEREO;
* samples and are zero-filled during interleave. */ ai.pAudioGroups[0].pAudioChannels[0].BufferFormat = VHD_AF_16;
for (int g = 0; g < GROUPS; g++) { ai.pAudioGroups[0].pAudioChannels[0].pData = buf;
ai.pAudioGroups[g].pAudioChannels[0].Mode = VHD_AM_STEREO; ai.pAudioGroups[0].pAudioChannels[1].Mode = VHD_AM_STEREO;
ai.pAudioGroups[g].pAudioChannels[0].BufferFormat = VHD_AF_16; ai.pAudioGroups[0].pAudioChannels[1].BufferFormat = VHD_AF_16;
ai.pAudioGroups[g].pAudioChannels[0].pData = gbuf[g];
ai.pAudioGroups[g].pAudioChannels[1].Mode = VHD_AM_STEREO;
ai.pAudioGroups[g].pAudioChannels[1].BufferFormat = VHD_AF_16;
}
if (VHD_StartStream(stream) == VHDERR_NOERROR) { if (VHD_StartStream(stream) == VHDERR_NOERROR) {
have_vhd_audio = 1; have_vhd_audio = 1;
@ -315,47 +298,10 @@ static void *audio_thread(void *arg) {
* stream length diverge from the video stream length. */ * stream length diverge from the video stream length. */
r = VHD_LockSlotHandle(stream, &slot); r = VHD_LockSlotHandle(stream, &slot);
if (r == VHDERR_NOERROR) { if (r == VHDERR_NOERROR) {
/* Ask the SDK for up to gbuf_sz bytes per group. After ai.pAudioGroups[0].pAudioChannels[0].DataSize = (ULONG)buf_sz;
* extraction each group's DataSize holds the bytes actually
* written (2ch s16le). Group 0 paces the frame count; groups
* with no audio report 0 and are zero-filled. */
for (int g = 0; g < GROUPS; g++)
ai.pAudioGroups[g].pAudioChannels[0].DataSize = (ULONG)gbuf_sz;
if (VHD_SlotExtractAudio(slot, &ai) == VHDERR_NOERROR) { if (VHD_SlotExtractAudio(slot, &ai) == VHDERR_NOERROR) {
/* Group 0 is the AUTHORITATIVE sample count — it paces the ULONG sz = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
* audio timeline in lockstep with video (same SDI slot if (sz > 0 && (size_t)sz <= buf_sz) out_bytes = (size_t)sz;
* clock), exactly as the original 2ch path did. We must
* emit EXACTLY group 0's frame count per slot; taking a
* max across groups would occasionally emit extra frames
* and make the audio stream drift LONGER than the video
* (heard as a slight pitch-up). Groups 1-3 are sampled at
* the same rate; any that return fewer bytes are padded
* with silence to group 0's length, never extending it. */
ULONG g0 = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
size_t frames = (size_t)g0 / 4; /* 2ch * s16 = 4 bytes/frame */
if (frames > 0) {
size_t need = frames * FRAME_BYTES;
if (need > out_cap) { frames = out_cap / FRAME_BYTES; need = frames * FRAME_BYTES; }
/* Interleave: for each sample frame, emit the 2
* samples of each group in order → 16ch frame
* [G0L G0R G1L G1R G2L G2R G3L G3R]. Groups shorter
* than `frames` (or absent) contribute silence. */
int16_t *out = (int16_t *)buf;
for (size_t f = 0; f < frames; f++) {
for (int g = 0; g < GROUPS; g++) {
size_t gframes = (size_t)ai.pAudioGroups[g].pAudioChannels[0].DataSize / 4;
const int16_t *gs = (const int16_t *)gbuf[g];
if (f < gframes) {
out[f * CHANNELS + g * 2 + 0] = gs[f * 2 + 0];
out[f * CHANNELS + g * 2 + 1] = gs[f * 2 + 1];
} else {
out[f * CHANNELS + g * 2 + 0] = 0;
out[f * CHANNELS + g * 2 + 1] = 0;
}
}
}
out_bytes = need;
}
} }
VHD_UnlockSlotHandle(slot); VHD_UnlockSlotHandle(slot);
@ -414,7 +360,6 @@ static void *audio_thread(void *arg) {
VHD_CloseStreamHandle(stream); VHD_CloseStreamHandle(stream);
} }
free(buf); free(buf);
for (int g = 0; g < GROUPS; g++) free(gbuf[g]);
return NULL; return NULL;
} }
@ -815,7 +760,7 @@ int main(int argc, char *argv[]) {
"\"fps_num\":%d,\"fps_den\":%d," "\"fps_num\":%d,\"fps_den\":%d,"
"\"interlaced\":%s," "\"interlaced\":%s,"
"\"pix_fmt\":\"uyvy422\"," "\"pix_fmt\":\"uyvy422\","
"\"audio_channels\":16,\"audio_rate\":48000," "\"audio_channels\":2,\"audio_rate\":48000,"
"\"device\":%u," "\"device\":%u,"
"\"slot_id\":\"%s\"}\n", "\"slot_id\":\"%s\"}\n",
ports[pi], ports[pi],

View file

@ -693,18 +693,7 @@ class CaptureManager {
const fcFps = process.env.DELTACAST_FRAMERATE || '60000/1001'; const fcFps = process.env.DELTACAST_FRAMERATE || '60000/1001';
const fcInterlaced = process.env.DELTACAST_INTERLACED === '1'; const fcInterlaced = process.env.DELTACAST_INTERLACED === '1';
// The deltacast bridge now publishes a fixed 16-channel s16le stream per console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath}`);
// port (all 4 SDI audio groups). The recorder selects how many of those
// channels to keep in the master — RECORDING_AUDIO_CHANNELS (2/8/16),
// injected by node-agent from the recorder config. We declare the FIFO as
// 16ch on input and KEEP THE FIRST N discrete channels downstream (no
// downmix) via an audio channel-map on the encode output.
const FIFO_CHANNELS = 16;
let wantCh = parseInt(process.env.RECORDING_AUDIO_CHANNELS || '2', 10);
if (!Number.isFinite(wantCh) || wantCh < 1) wantCh = 2;
if (wantCh > FIFO_CHANNELS) wantCh = FIFO_CHANNELS;
console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath} ch=${wantCh}/${FIFO_CHANNELS}`);
// Spawn fc_pipe: opens the framecache slot with its own read cursor and // Spawn fc_pipe: opens the framecache slot with its own read cursor and
// streams raw UYVY422 frames to stdout. ffmpeg reads from the pipe as // streams raw UYVY422 frames to stdout. ffmpeg reads from the pipe as
@ -724,37 +713,21 @@ class CaptureManager {
return { return {
inputArgs: [ inputArgs: [
// fc_pipe stdout → ffmpeg rawvideo input 0 (video) // fc_pipe stdout → ffmpeg rawvideo input 0 (video).
// DO NOT use -use_wallclock_as_timestamps here. The framecache ring
// delivers frame-accurate 60fps from the SDI clock, so -framerate
// produces correct CFR timestamps from frame 0, immune to ffmpeg
// startup jitter and NVENC cold-start. Wallclock timestamping caused
// wrong framerate in the recorded file (e.g. 56.06 instead of 59.94)
// because arrival-time jitter at ffmpeg startup skewed the PTS.
'-thread_queue_size', '512', '-thread_queue_size', '512',
'-f', 'rawvideo', '-f', 'rawvideo',
'-pix_fmt', 'uyvy422', '-pix_fmt', 'uyvy422',
'-video_size', fcSize, '-video_size', fcSize,
'-framerate', fcFps, '-framerate', fcFps,
'-i', 'pipe:0', '-i', 'pipe:0',
// Audio FIFO → ffmpeg input 1. Wall-clock timestamps on the audio // Audio FIFO → ffmpeg input 1. The deltacast bridge writes a 2ch s16le
// input are REQUIRED for throughput: without them ffmpeg's audio // 48kHz stream paced by the SDI slot clock (same clock as the video),
// reader has no rate reference on the raw s16le FIFO and the demux // so wallclock timestamps + master aresample=async=1 keep A/V locked.
// thread stalls the whole graph (NVENC sat idle at 9% while frames
// dropped). With wallclock, audio is paced by arrival and the master
// -af aresample=async=1 resamples it onto the video CFR timeline so
// A/V length stays locked. The residual ~1% drift that wallclock used
// to cause was actually the all-intra HEVC dropping frames (video
// short); that's fixed by long-GOP HEVC for non-growing records, so
// wallclock is safe again and necessary.
// The FIFO carries the full 16ch the bridge publishes; channel
// SELECTION (keep first N) is applied as an output filter so the
// discrete broadcast channels are preserved, not downmixed.
'-use_wallclock_as_timestamps', '1', '-use_wallclock_as_timestamps', '1',
'-thread_queue_size', '512', '-thread_queue_size', '512',
'-f', 's16le', '-f', 's16le',
'-ar', '48000', '-ar', '48000',
'-ac', String(FIFO_CHANNELS), '-ac', '2',
'-i', audioFifoPath, '-i', audioFifoPath,
], ],
isNetwork: false, isNetwork: false,
@ -762,11 +735,6 @@ class CaptureManager {
audioFifo: null, audioFifo: null,
interlaced: fcInterlaced, interlaced: fcInterlaced,
audioInputIndex: 1, /* audio FIFO is ffmpeg input 1 */ audioInputIndex: 1, /* audio FIFO is ffmpeg input 1 */
// Number of source channels available on the FIFO, and how many the
// recorder wants kept (first N). The encode builder turns wantCh into a
// channelmap so the master holds exactly those discrete channels.
sourceAudioChannels: FIFO_CHANNELS,
wantAudioChannels: wantCh,
_fcPipeProcess: fcPipeProcess, /* stored for clean stop */ _fcPipeProcess: fcPipeProcess, /* stored for clean stop */
}; };
} }
@ -1058,25 +1026,10 @@ exit "$BMXRC"
this._sessionIdForBridge = sessionId; this._sessionIdForBridge = sessionId;
const { inputArgs, isNetwork, bridgeProcess = null, audioFifo = null, interlaced = false, audioInputIndex = 0, const { inputArgs, isNetwork, bridgeProcess = null, audioFifo = null, interlaced = false, audioInputIndex = 0,
sourceAudioChannels = null, wantAudioChannels = null } = await this._buildInputArgs({ } = await this._buildInputArgs({
sourceType, sourceBackend, device, port, board, sourceUrl, listen, listenPort, streamKey, sourceType, sourceBackend, device, port, board, sourceUrl, listen, listenPort, streamKey,
}); });
// Channel selection for the master: when the source FIFO carries more
// discrete channels than the recorder wants (e.g. 16ch SDI → 2ch master),
// keep the FIRST N channels as discrete streams (no downmix) via a `pan`
// filter `c0=c0|c1=c1|…`. effAudioChannels is what the master container
// actually holds and what `-ac` must declare.
const effAudioChannels = (sourceAudioChannels && wantAudioChannels)
? Math.min(wantAudioChannels, sourceAudioChannels)
: audioChannels;
const needChannelSelect = !!(sourceAudioChannels && wantAudioChannels && wantAudioChannels < sourceAudioChannels);
const channelSelectFilter = needChannelSelect
? `pan=${effAudioChannels}c|` + Array.from({ length: effAudioChannels }, (_, i) => `c${i}=c${i}`).join('|')
: null;
// Override the codec channel count so -ac matches the selected layout.
if (sourceAudioChannels && wantAudioChannels) audioChannels = effAudioChannels;
// ── Pre-roll: discard initial unstable frames ──────────────────────────── // ── Pre-roll: discard initial unstable frames ────────────────────────────
if (bridgeProcess && (sourceType === 'deltacast' || sourceType === 'blackmagic' || sourceType === 'sdi')) { if (bridgeProcess && (sourceType === 'deltacast' || sourceType === 'blackmagic' || sourceType === 'sdi')) {
console.log(`[capture] pre-rolling: discarding ${PRE_ROLL_SECONDS}s of frames`); console.log(`[capture] pre-rolling: discarding ${PRE_ROLL_SECONDS}s of frames`);
@ -1194,18 +1147,11 @@ exit "$BMXRC"
// ffmpeg doesn't fail trying to map a nonexistent audio stream. // ffmpeg doesn't fail trying to map a nonexistent audio stream.
const hasAudio = audioInputIndex >= 0 && !isNetFcPipe; const hasAudio = audioInputIndex >= 0 && !isNetFcPipe;
const masterAudioMap = hasAudio ? ['-map', audioMap] : []; const masterAudioMap = hasAudio ? ['-map', audioMap] : [];
// Master audio: optional first-N channel select (discrete, no downmix), const masterAudioFilter = hasAudio
// then async resample to lock A/V sync. Chain both into one -af. ? ['-af', 'aresample=async=1:min_hard_comp=0.100000:first_pts=0'] : [];
const masterFilterChain = [
...(channelSelectFilter ? [channelSelectFilter] : []),
'aresample=async=1:min_hard_comp=0.100000:first_pts=0',
].join(',');
const masterAudioFilter = hasAudio ? ['-af', masterFilterChain] : [];
const hlsAudioMap = hasAudio ? ['-map', audioMap] : []; const hlsAudioMap = hasAudio ? ['-map', audioMap] : [];
// HLS preview is always stereo for browser playback — downmix the first
// pair regardless of how many channels the master keeps.
const hlsAudioCodec = hasAudio const hlsAudioCodec = hasAudio
? ['-af', 'pan=stereo|c0=c0|c1=c1', '-c:a', 'aac', '-b:a', '128k', '-ar', '44100', '-ac', '2'] : []; ? ['-c:a', 'aac', '-b:a', '128k', '-ar', '44100'] : [];
hiresArgs = [ hiresArgs = [
...inputArgs, ...inputArgs,
'-filter_complex', filterStr, '-filter_complex', filterStr,