fix(capture): revert 16ch audio to clean 2ch — fixes pitch/rate regression

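The 16ch interleave in the deltacast bridge produced audio at roughly HALF the
correct sample rate (measured 24224 vs 48000 samples/s/ch), which broke A/V sync
and pitch. Likely cause (to be confirmed before 16ch returns): the bridge packed
4 groups x 2 channels = 8 samples per frame while the stream was declared and
read as 16ch. Audio was clean on the baseline that predates the channel
selector, so revert the bridge audio thread to the original single-group 2ch
extraction, and revert the capture-manager audio input to `-ac 2` with wallclock
timestamps and aresample.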

KEPT the good fixes: long-GOP HEVC for non-growing records (NVENC runs in real
time with no frame drops) and the GPU-only codec list. 16ch capture and channel
selection are shelved for a separate, properly validated change.
This commit is contained in:
Zac Gaetano 2026-06-04 04:33:34 +00:00
parent b28393eb76
commit fffb6b63b5
2 changed files with 28 additions and 137 deletions

View file

@ -24,7 +24,7 @@
*
* For each port that acquires signal, emits one JSON line to stderr:
* {"port":N,"width":W,"height":H,"fps_num":N,"fps_den":D,
* "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":16,
* "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":2,
* "slot_id":"deltacast-<device>-<port>"}
*
* Compile with -DLEGACY_FIFO=1 to disable shm writes and fall back to
@ -198,14 +198,8 @@ static void *audio_thread(void *arg) {
PortState *ps = (PortState *)arg;
const int AUDIO_RATE = 48000;
/* The bridge ALWAYS captures the full 16 embedded channels (4 SDI audio
* groups × 1 stereo pair each). Per-recorder channel selection (keep first
* N) happens downstream in the capture ffmpeg via a channelmap; the bridge
* publishes one consistent 16ch s16le interleaved stream per port so a
* single FIFO serves every consumer regardless of how many channels they
* want. */
enum { GROUPS = 4, CH_PER_GROUP = 2, CHANNELS = GROUPS * CH_PER_GROUP }; /* = 16 */
const size_t FRAME_BYTES = (size_t)CHANNELS * 2; /* s16le, 16ch */
const int CHANNELS = 2;
const size_t FRAME_BYTES = (size_t)CHANNELS * 2; /* s16le stereo */
int fps_num = ps->vi.fps_num > 0 ? ps->vi.fps_num : 25;
int fps_den = ps->vi.fps_den > 0 ? ps->vi.fps_den : 1;
long samples_per_frame = ((long)AUDIO_RATE * fps_den + fps_num / 2) / fps_num;
@ -215,17 +209,10 @@ static void *audio_thread(void *arg) {
ULONG max_samples = VHD_GetNbSamples((VHD_VIDEOSTANDARD)ps->video_std,
(VHD_CLOCKDIVISOR)ps->clock_div,
VHD_ASR_48000, 0);
/* Per-group capture buffer (2ch packed s16le) — one per SDI audio group.
* Sized for the SDK's stereo block size; we extract each group into its
* own gbuf[g] then interleave the 4 groups into the 16ch out buffer. */
ULONG block_size = VHD_GetBlockSize(VHD_AF_16, VHD_AM_STEREO);
size_t gbuf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : 4);
unsigned char *gbuf[GROUPS];
for (int g = 0; g < GROUPS; g++) { gbuf[g] = calloc(1, gbuf_sz); if (!gbuf[g]) return NULL; }
/* Interleaved 16ch output buffer (and the silence buffer reuses it). */
size_t out_cap = (size_t)(max_samples + 64) * FRAME_BYTES;
if (out_cap < tick_bytes) out_cap = tick_bytes;
unsigned char *buf = calloc(1, out_cap);
size_t vhd_buf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : FRAME_BYTES);
size_t buf_sz = vhd_buf_sz > tick_bytes ? vhd_buf_sz : tick_bytes;
unsigned char *buf = calloc(1, buf_sz);
if (!buf) return NULL;
/* Open the VHD audio stream once for the lifetime of the bridge.
@ -250,18 +237,14 @@ static void *audio_thread(void *arg) {
VHD_SetStreamProperty(stream, VHD_CORE_SP_TRANSFER_SCHEME, VHD_TRANSFER_SLAVED);
VHD_SetStreamProperty(stream, VHD_SDI_SP_INTERFACE, iface);
/* Configure all 4 audio groups as stereo pairs. Each group's packed
* L/R s16le samples land in pAudioGroups[g].pAudioChannels[0].pData;
* channel [1] must still declare Mode+BufferFormat so the SDK
* recognizes the pair. Groups with no embedded audio simply return 0
* samples and are zero-filled during interleave. */
for (int g = 0; g < GROUPS; g++) {
ai.pAudioGroups[g].pAudioChannels[0].Mode = VHD_AM_STEREO;
ai.pAudioGroups[g].pAudioChannels[0].BufferFormat = VHD_AF_16;
ai.pAudioGroups[g].pAudioChannels[0].pData = gbuf[g];
ai.pAudioGroups[g].pAudioChannels[1].Mode = VHD_AM_STEREO;
ai.pAudioGroups[g].pAudioChannels[1].BufferFormat = VHD_AF_16;
}
/* Configure BOTH channels of the stereo pair (group 0). The actual PCM
* samples land in pAudioChannels[0].pData (packed L/R s16le). Channel
* [1] must declare Mode+BufferFormat so the SDK recognizes the pair. */
ai.pAudioGroups[0].pAudioChannels[0].Mode = VHD_AM_STEREO;
ai.pAudioGroups[0].pAudioChannels[0].BufferFormat = VHD_AF_16;
ai.pAudioGroups[0].pAudioChannels[0].pData = buf;
ai.pAudioGroups[0].pAudioChannels[1].Mode = VHD_AM_STEREO;
ai.pAudioGroups[0].pAudioChannels[1].BufferFormat = VHD_AF_16;
if (VHD_StartStream(stream) == VHDERR_NOERROR) {
have_vhd_audio = 1;
@ -315,47 +298,10 @@ static void *audio_thread(void *arg) {
* stream length diverge from the video stream length. */
r = VHD_LockSlotHandle(stream, &slot);
if (r == VHDERR_NOERROR) {
/* Ask the SDK for up to gbuf_sz bytes per group. After
* extraction each group's DataSize holds the bytes actually
* written (2ch s16le). Group 0 paces the frame count; groups
* with no audio report 0 and are zero-filled. */
for (int g = 0; g < GROUPS; g++)
ai.pAudioGroups[g].pAudioChannels[0].DataSize = (ULONG)gbuf_sz;
ai.pAudioGroups[0].pAudioChannels[0].DataSize = (ULONG)buf_sz;
if (VHD_SlotExtractAudio(slot, &ai) == VHDERR_NOERROR) {
/* Group 0 is the AUTHORITATIVE sample count — it paces the
* audio timeline in lockstep with video (same SDI slot
* clock), exactly as the original 2ch path did. We must
* emit EXACTLY group 0's frame count per slot; taking a
* max across groups would occasionally emit extra frames
* and make the audio stream drift LONGER than the video
* (heard as a slight pitch-up). Groups 1-3 are sampled at
* the same rate; any that return fewer bytes are padded
* with silence to group 0's length, never extending it. */
ULONG g0 = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
size_t frames = (size_t)g0 / 4; /* 2ch * s16 = 4 bytes/frame */
if (frames > 0) {
size_t need = frames * FRAME_BYTES;
if (need > out_cap) { frames = out_cap / FRAME_BYTES; need = frames * FRAME_BYTES; }
/* Interleave: for each sample frame, emit the 2
* samples of each group in order 16ch frame
* [G0L G0R G1L G1R G2L G2R G3L G3R]. Groups shorter
* than `frames` (or absent) contribute silence. */
int16_t *out = (int16_t *)buf;
for (size_t f = 0; f < frames; f++) {
for (int g = 0; g < GROUPS; g++) {
size_t gframes = (size_t)ai.pAudioGroups[g].pAudioChannels[0].DataSize / 4;
const int16_t *gs = (const int16_t *)gbuf[g];
if (f < gframes) {
out[f * CHANNELS + g * 2 + 0] = gs[f * 2 + 0];
out[f * CHANNELS + g * 2 + 1] = gs[f * 2 + 1];
} else {
out[f * CHANNELS + g * 2 + 0] = 0;
out[f * CHANNELS + g * 2 + 1] = 0;
}
}
}
out_bytes = need;
}
ULONG sz = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
if (sz > 0 && (size_t)sz <= buf_sz) out_bytes = (size_t)sz;
}
VHD_UnlockSlotHandle(slot);
@ -414,7 +360,6 @@ static void *audio_thread(void *arg) {
VHD_CloseStreamHandle(stream);
}
free(buf);
for (int g = 0; g < GROUPS; g++) free(gbuf[g]);
return NULL;
}
@ -815,7 +760,7 @@ int main(int argc, char *argv[]) {
"\"fps_num\":%d,\"fps_den\":%d,"
"\"interlaced\":%s,"
"\"pix_fmt\":\"uyvy422\","
"\"audio_channels\":16,\"audio_rate\":48000,"
"\"audio_channels\":2,\"audio_rate\":48000,"
"\"device\":%u,"
"\"slot_id\":\"%s\"}\n",
ports[pi],

View file

@ -693,18 +693,7 @@ class CaptureManager {
const fcFps = process.env.DELTACAST_FRAMERATE || '60000/1001';
const fcInterlaced = process.env.DELTACAST_INTERLACED === '1';
// The deltacast bridge now publishes a fixed 16-channel s16le stream per
// port (all 4 SDI audio groups). The recorder selects how many of those
// channels to keep in the master — RECORDING_AUDIO_CHANNELS (2/8/16),
// injected by node-agent from the recorder config. We declare the FIFO as
// 16ch on input and KEEP THE FIRST N discrete channels downstream (no
// downmix) via an audio channel-map on the encode output.
const FIFO_CHANNELS = 16;
let wantCh = parseInt(process.env.RECORDING_AUDIO_CHANNELS || '2', 10);
if (!Number.isFinite(wantCh) || wantCh < 1) wantCh = 2;
if (wantCh > FIFO_CHANNELS) wantCh = FIFO_CHANNELS;
console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath} ch=${wantCh}/${FIFO_CHANNELS}`);
console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath}`);
// Spawn fc_pipe: opens the framecache slot with its own read cursor and
// streams raw UYVY422 frames to stdout. ffmpeg reads from the pipe as
@ -724,37 +713,21 @@ class CaptureManager {
return {
inputArgs: [
// fc_pipe stdout → ffmpeg rawvideo input 0 (video)
// DO NOT use -use_wallclock_as_timestamps here. The framecache ring
// delivers frame-accurate 60fps from the SDI clock, so -framerate
// produces correct CFR timestamps from frame 0, immune to ffmpeg
// startup jitter and NVENC cold-start. Wallclock timestamping caused
// wrong framerate in the recorded file (e.g. 56.06 instead of 59.94)
// because arrival-time jitter at ffmpeg startup skewed the PTS.
// fc_pipe stdout → ffmpeg rawvideo input 0 (video).
'-thread_queue_size', '512',
'-f', 'rawvideo',
'-pix_fmt', 'uyvy422',
'-video_size', fcSize,
'-framerate', fcFps,
'-i', 'pipe:0',
// Audio FIFO → ffmpeg input 1. Wall-clock timestamps on the audio
// input are REQUIRED for throughput: without them ffmpeg's audio
// reader has no rate reference on the raw s16le FIFO and the demux
// thread stalls the whole graph (NVENC sat idle at 9% while frames
// dropped). With wallclock, audio is paced by arrival and the master
// -af aresample=async=1 resamples it onto the video CFR timeline so
// A/V length stays locked. The residual ~1% drift that wallclock used
// to cause was actually the all-intra HEVC dropping frames (video
// short); that's fixed by long-GOP HEVC for non-growing records, so
// wallclock is safe again and necessary.
// The FIFO carries the full 16ch the bridge publishes; channel
// SELECTION (keep first N) is applied as an output filter so the
// discrete broadcast channels are preserved, not downmixed.
// Audio FIFO → ffmpeg input 1. The deltacast bridge writes a 2ch s16le
// 48kHz stream paced by the SDI slot clock (same clock as the video),
// so wallclock timestamps + master aresample=async=1 keep A/V locked.
'-use_wallclock_as_timestamps', '1',
'-thread_queue_size', '512',
'-f', 's16le',
'-ar', '48000',
'-ac', String(FIFO_CHANNELS),
'-ac', '2',
'-i', audioFifoPath,
],
isNetwork: false,
@ -762,11 +735,6 @@ class CaptureManager {
audioFifo: null,
interlaced: fcInterlaced,
audioInputIndex: 1, /* audio FIFO is ffmpeg input 1 */
// Number of source channels available on the FIFO, and how many the
// recorder wants kept (first N). The encode builder turns wantCh into a
// channelmap so the master holds exactly those discrete channels.
sourceAudioChannels: FIFO_CHANNELS,
wantAudioChannels: wantCh,
_fcPipeProcess: fcPipeProcess, /* stored for clean stop */
};
}
@ -1058,25 +1026,10 @@ exit "$BMXRC"
this._sessionIdForBridge = sessionId;
const { inputArgs, isNetwork, bridgeProcess = null, audioFifo = null, interlaced = false, audioInputIndex = 0,
sourceAudioChannels = null, wantAudioChannels = null } = await this._buildInputArgs({
} = await this._buildInputArgs({
sourceType, sourceBackend, device, port, board, sourceUrl, listen, listenPort, streamKey,
});
// Channel selection for the master: when the source FIFO carries more
// discrete channels than the recorder wants (e.g. 16ch SDI → 2ch master),
// keep the FIRST N channels as discrete streams (no downmix) via a `pan`
// filter `c0=c0|c1=c1|…`. effAudioChannels is what the master container
// actually holds and what `-ac` must declare.
const effAudioChannels = (sourceAudioChannels && wantAudioChannels)
? Math.min(wantAudioChannels, sourceAudioChannels)
: audioChannels;
const needChannelSelect = !!(sourceAudioChannels && wantAudioChannels && wantAudioChannels < sourceAudioChannels);
const channelSelectFilter = needChannelSelect
? `pan=${effAudioChannels}c|` + Array.from({ length: effAudioChannels }, (_, i) => `c${i}=c${i}`).join('|')
: null;
// Override the codec channel count so -ac matches the selected layout.
if (sourceAudioChannels && wantAudioChannels) audioChannels = effAudioChannels;
// ── Pre-roll: discard initial unstable frames ────────────────────────────
if (bridgeProcess && (sourceType === 'deltacast' || sourceType === 'blackmagic' || sourceType === 'sdi')) {
console.log(`[capture] pre-rolling: discarding ${PRE_ROLL_SECONDS}s of frames`);
@ -1194,18 +1147,11 @@ exit "$BMXRC"
// ffmpeg doesn't fail trying to map a nonexistent audio stream.
const hasAudio = audioInputIndex >= 0 && !isNetFcPipe;
const masterAudioMap = hasAudio ? ['-map', audioMap] : [];
// Master audio: optional first-N channel select (discrete, no downmix),
// then async resample to lock A/V sync. Chain both into one -af.
const masterFilterChain = [
...(channelSelectFilter ? [channelSelectFilter] : []),
'aresample=async=1:min_hard_comp=0.100000:first_pts=0',
].join(',');
const masterAudioFilter = hasAudio ? ['-af', masterFilterChain] : [];
const masterAudioFilter = hasAudio
? ['-af', 'aresample=async=1:min_hard_comp=0.100000:first_pts=0'] : [];
const hlsAudioMap = hasAudio ? ['-map', audioMap] : [];
// HLS preview is always stereo for browser playback — downmix the first
// pair regardless of how many channels the master keeps.
const hlsAudioCodec = hasAudio
? ['-af', 'pan=stereo|c0=c0|c1=c1', '-c:a', 'aac', '-b:a', '128k', '-ar', '44100', '-ac', '2'] : [];
? ['-c:a', 'aac', '-b:a', '128k', '-ar', '44100'] : [];
hiresArgs = [
...inputArgs,
'-filter_complex', filterStr,