fix(capture): revert 16ch audio to clean 2ch — fixes pitch/rate regression
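The 16ch interleave in the Deltacast bridge produced audio at roughly half the correct sample rate (measured 24224 vs. the expected 48000 samples/s/ch), which broke A/V sync and pitch. A likely cause, judging by the removed code (to be confirmed): the bridge's `CHANNELS = GROUPS * CH_PER_GROUP` evaluates to 8 (4 × 2), not the 16 its comments claim, while the capture manager declared the FIFO to ffmpeg as `-ac 16`, so every 16ch frame ffmpeg read actually consumed two 8ch frames. To return to the working baseline (audio was clean before the channel selector was introduced), this commit reverts the bridge audio thread to the original single-group 2ch extraction and the capture-manager audio input to `-ac 2` + wallclock timestamps + `aresample`. It KEEPS the good fixes: long-GOP HEVC for non-growing records (NVENC realtime, no frame drops) and the GPU-only codec list. 16ch capture and channel selection are shelved for a separate, properly validated change.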
This commit is contained in:
parent
b28393eb76
commit
fffb6b63b5
2 changed files with 28 additions and 137 deletions
|
|
@ -24,7 +24,7 @@
|
|||
*
|
||||
* For each port that acquires signal, emits one JSON line to stderr:
|
||||
* {"port":N,"width":W,"height":H,"fps_num":N,"fps_den":D,
|
||||
* "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":16,
|
||||
* "pix_fmt":"uyvy422","audio_rate":48000,"audio_channels":2,
|
||||
* "slot_id":"deltacast-<device>-<port>"}
|
||||
*
|
||||
* Compile with -DLEGACY_FIFO=1 to disable shm writes and fall back to
|
||||
|
|
@ -198,14 +198,8 @@ static void *audio_thread(void *arg) {
|
|||
PortState *ps = (PortState *)arg;
|
||||
|
||||
const int AUDIO_RATE = 48000;
|
||||
/* The bridge ALWAYS captures the full 16 embedded channels (4 SDI audio
|
||||
* groups × 1 stereo pair each). Per-recorder channel selection (keep first
|
||||
* N) happens downstream in the capture ffmpeg via a channelmap — the bridge
|
||||
* publishes one consistent 16ch s16le interleaved stream per port so a
|
||||
* single FIFO serves every consumer regardless of how many channels they
|
||||
* want. */
|
||||
enum { GROUPS = 4, CH_PER_GROUP = 2, CHANNELS = GROUPS * CH_PER_GROUP }; /* = 16 */
|
||||
const size_t FRAME_BYTES = (size_t)CHANNELS * 2; /* s16le, 16ch */
|
||||
const int CHANNELS = 2;
|
||||
const size_t FRAME_BYTES = (size_t)CHANNELS * 2; /* s16le stereo */
|
||||
int fps_num = ps->vi.fps_num > 0 ? ps->vi.fps_num : 25;
|
||||
int fps_den = ps->vi.fps_den > 0 ? ps->vi.fps_den : 1;
|
||||
long samples_per_frame = ((long)AUDIO_RATE * fps_den + fps_num / 2) / fps_num;
|
||||
|
|
@ -215,17 +209,10 @@ static void *audio_thread(void *arg) {
|
|||
ULONG max_samples = VHD_GetNbSamples((VHD_VIDEOSTANDARD)ps->video_std,
|
||||
(VHD_CLOCKDIVISOR)ps->clock_div,
|
||||
VHD_ASR_48000, 0);
|
||||
/* Per-group capture buffer (2ch packed s16le) — one per SDI audio group.
|
||||
* Sized for the SDK's stereo block size; we extract each group into its
|
||||
* own gbuf[g] then interleave the 4 groups into the 16ch out buffer. */
|
||||
ULONG block_size = VHD_GetBlockSize(VHD_AF_16, VHD_AM_STEREO);
|
||||
size_t gbuf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : 4);
|
||||
unsigned char *gbuf[GROUPS];
|
||||
for (int g = 0; g < GROUPS; g++) { gbuf[g] = calloc(1, gbuf_sz); if (!gbuf[g]) return NULL; }
|
||||
/* Interleaved 16ch output buffer (and the silence buffer reuses it). */
|
||||
size_t out_cap = (size_t)(max_samples + 64) * FRAME_BYTES;
|
||||
if (out_cap < tick_bytes) out_cap = tick_bytes;
|
||||
unsigned char *buf = calloc(1, out_cap);
|
||||
size_t vhd_buf_sz = ((size_t)max_samples + 64) * (block_size ? block_size : FRAME_BYTES);
|
||||
size_t buf_sz = vhd_buf_sz > tick_bytes ? vhd_buf_sz : tick_bytes;
|
||||
unsigned char *buf = calloc(1, buf_sz);
|
||||
if (!buf) return NULL;
|
||||
|
||||
/* Open the VHD audio stream once for the lifetime of the bridge.
|
||||
|
|
@ -250,18 +237,14 @@ static void *audio_thread(void *arg) {
|
|||
VHD_SetStreamProperty(stream, VHD_CORE_SP_TRANSFER_SCHEME, VHD_TRANSFER_SLAVED);
|
||||
VHD_SetStreamProperty(stream, VHD_SDI_SP_INTERFACE, iface);
|
||||
|
||||
/* Configure all 4 audio groups as stereo pairs. Each group's packed
|
||||
* L/R s16le samples land in pAudioGroups[g].pAudioChannels[0].pData;
|
||||
* channel [1] must still declare Mode+BufferFormat so the SDK
|
||||
* recognizes the pair. Groups with no embedded audio simply return 0
|
||||
* samples and are zero-filled during interleave. */
|
||||
for (int g = 0; g < GROUPS; g++) {
|
||||
ai.pAudioGroups[g].pAudioChannels[0].Mode = VHD_AM_STEREO;
|
||||
ai.pAudioGroups[g].pAudioChannels[0].BufferFormat = VHD_AF_16;
|
||||
ai.pAudioGroups[g].pAudioChannels[0].pData = gbuf[g];
|
||||
ai.pAudioGroups[g].pAudioChannels[1].Mode = VHD_AM_STEREO;
|
||||
ai.pAudioGroups[g].pAudioChannels[1].BufferFormat = VHD_AF_16;
|
||||
}
|
||||
/* Configure BOTH channels of the stereo pair (group 0). The actual PCM
|
||||
* samples land in pAudioChannels[0].pData (packed L/R s16le). Channel
|
||||
* [1] must declare Mode+BufferFormat so the SDK recognizes the pair. */
|
||||
ai.pAudioGroups[0].pAudioChannels[0].Mode = VHD_AM_STEREO;
|
||||
ai.pAudioGroups[0].pAudioChannels[0].BufferFormat = VHD_AF_16;
|
||||
ai.pAudioGroups[0].pAudioChannels[0].pData = buf;
|
||||
ai.pAudioGroups[0].pAudioChannels[1].Mode = VHD_AM_STEREO;
|
||||
ai.pAudioGroups[0].pAudioChannels[1].BufferFormat = VHD_AF_16;
|
||||
|
||||
if (VHD_StartStream(stream) == VHDERR_NOERROR) {
|
||||
have_vhd_audio = 1;
|
||||
|
|
@ -315,47 +298,10 @@ static void *audio_thread(void *arg) {
|
|||
* stream length diverge from the video stream length. */
|
||||
r = VHD_LockSlotHandle(stream, &slot);
|
||||
if (r == VHDERR_NOERROR) {
|
||||
/* Ask the SDK for up to gbuf_sz bytes per group. After
|
||||
* extraction each group's DataSize holds the bytes actually
|
||||
* written (2ch s16le). Group 0 paces the frame count; groups
|
||||
* with no audio report 0 and are zero-filled. */
|
||||
for (int g = 0; g < GROUPS; g++)
|
||||
ai.pAudioGroups[g].pAudioChannels[0].DataSize = (ULONG)gbuf_sz;
|
||||
ai.pAudioGroups[0].pAudioChannels[0].DataSize = (ULONG)buf_sz;
|
||||
if (VHD_SlotExtractAudio(slot, &ai) == VHDERR_NOERROR) {
|
||||
/* Group 0 is the AUTHORITATIVE sample count — it paces the
|
||||
* audio timeline in lockstep with video (same SDI slot
|
||||
* clock), exactly as the original 2ch path did. We must
|
||||
* emit EXACTLY group 0's frame count per slot; taking a
|
||||
* max across groups would occasionally emit extra frames
|
||||
* and make the audio stream drift LONGER than the video
|
||||
* (heard as a slight pitch-up). Groups 1-3 are sampled at
|
||||
* the same rate; any that return fewer bytes are padded
|
||||
* with silence to group 0's length, never extending it. */
|
||||
ULONG g0 = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
|
||||
size_t frames = (size_t)g0 / 4; /* 2ch * s16 = 4 bytes/frame */
|
||||
if (frames > 0) {
|
||||
size_t need = frames * FRAME_BYTES;
|
||||
if (need > out_cap) { frames = out_cap / FRAME_BYTES; need = frames * FRAME_BYTES; }
|
||||
/* Interleave: for each sample frame, emit the 2
|
||||
* samples of each group in order → 16ch frame
|
||||
* [G0L G0R G1L G1R G2L G2R G3L G3R]. Groups shorter
|
||||
* than `frames` (or absent) contribute silence. */
|
||||
int16_t *out = (int16_t *)buf;
|
||||
for (size_t f = 0; f < frames; f++) {
|
||||
for (int g = 0; g < GROUPS; g++) {
|
||||
size_t gframes = (size_t)ai.pAudioGroups[g].pAudioChannels[0].DataSize / 4;
|
||||
const int16_t *gs = (const int16_t *)gbuf[g];
|
||||
if (f < gframes) {
|
||||
out[f * CHANNELS + g * 2 + 0] = gs[f * 2 + 0];
|
||||
out[f * CHANNELS + g * 2 + 1] = gs[f * 2 + 1];
|
||||
} else {
|
||||
out[f * CHANNELS + g * 2 + 0] = 0;
|
||||
out[f * CHANNELS + g * 2 + 1] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
out_bytes = need;
|
||||
}
|
||||
ULONG sz = ai.pAudioGroups[0].pAudioChannels[0].DataSize;
|
||||
if (sz > 0 && (size_t)sz <= buf_sz) out_bytes = (size_t)sz;
|
||||
}
|
||||
VHD_UnlockSlotHandle(slot);
|
||||
|
||||
|
|
@ -414,7 +360,6 @@ static void *audio_thread(void *arg) {
|
|||
VHD_CloseStreamHandle(stream);
|
||||
}
|
||||
free(buf);
|
||||
for (int g = 0; g < GROUPS; g++) free(gbuf[g]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -815,7 +760,7 @@ int main(int argc, char *argv[]) {
|
|||
"\"fps_num\":%d,\"fps_den\":%d,"
|
||||
"\"interlaced\":%s,"
|
||||
"\"pix_fmt\":\"uyvy422\","
|
||||
"\"audio_channels\":16,\"audio_rate\":48000,"
|
||||
"\"audio_channels\":2,\"audio_rate\":48000,"
|
||||
"\"device\":%u,"
|
||||
"\"slot_id\":\"%s\"}\n",
|
||||
ports[pi],
|
||||
|
|
|
|||
|
|
@ -693,18 +693,7 @@ class CaptureManager {
|
|||
const fcFps = process.env.DELTACAST_FRAMERATE || '60000/1001';
|
||||
const fcInterlaced = process.env.DELTACAST_INTERLACED === '1';
|
||||
|
||||
// The deltacast bridge now publishes a fixed 16-channel s16le stream per
|
||||
// port (all 4 SDI audio groups). The recorder selects how many of those
|
||||
// channels to keep in the master — RECORDING_AUDIO_CHANNELS (2/8/16),
|
||||
// injected by node-agent from the recorder config. We declare the FIFO as
|
||||
// 16ch on input and KEEP THE FIRST N discrete channels downstream (no
|
||||
// downmix) via an audio channel-map on the encode output.
|
||||
const FIFO_CHANNELS = 16;
|
||||
let wantCh = parseInt(process.env.RECORDING_AUDIO_CHANNELS || '2', 10);
|
||||
if (!Number.isFinite(wantCh) || wantCh < 1) wantCh = 2;
|
||||
if (wantCh > FIFO_CHANNELS) wantCh = FIFO_CHANNELS;
|
||||
|
||||
console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath} ch=${wantCh}/${FIFO_CHANNELS}`);
|
||||
console.log(`[framecache] slot=${slotId} size=${fcSize} fps=${fcFps} audio=${audioFifoPath}`);
|
||||
|
||||
// Spawn fc_pipe: opens the framecache slot with its own read cursor and
|
||||
// streams raw UYVY422 frames to stdout. ffmpeg reads from the pipe as
|
||||
|
|
@ -724,37 +713,21 @@ class CaptureManager {
|
|||
|
||||
return {
|
||||
inputArgs: [
|
||||
// fc_pipe stdout → ffmpeg rawvideo input 0 (video)
|
||||
// DO NOT use -use_wallclock_as_timestamps here. The framecache ring
|
||||
// delivers frame-accurate 60fps from the SDI clock, so -framerate
|
||||
// produces correct CFR timestamps from frame 0, immune to ffmpeg
|
||||
// startup jitter and NVENC cold-start. Wallclock timestamping caused
|
||||
// wrong framerate in the recorded file (e.g. 56.06 instead of 59.94)
|
||||
// because arrival-time jitter at ffmpeg startup skewed the PTS.
|
||||
// fc_pipe stdout → ffmpeg rawvideo input 0 (video).
|
||||
'-thread_queue_size', '512',
|
||||
'-f', 'rawvideo',
|
||||
'-pix_fmt', 'uyvy422',
|
||||
'-video_size', fcSize,
|
||||
'-framerate', fcFps,
|
||||
'-i', 'pipe:0',
|
||||
// Audio FIFO → ffmpeg input 1. Wall-clock timestamps on the audio
|
||||
// input are REQUIRED for throughput: without them ffmpeg's audio
|
||||
// reader has no rate reference on the raw s16le FIFO and the demux
|
||||
// thread stalls the whole graph (NVENC sat idle at 9% while frames
|
||||
// dropped). With wallclock, audio is paced by arrival and the master
|
||||
// -af aresample=async=1 resamples it onto the video CFR timeline so
|
||||
// A/V length stays locked. The residual ~1% drift that wallclock used
|
||||
// to cause was actually the all-intra HEVC dropping frames (video
|
||||
// short); that's fixed by long-GOP HEVC for non-growing records, so
|
||||
// wallclock is safe again and necessary.
|
||||
// The FIFO carries the full 16ch the bridge publishes; channel
|
||||
// SELECTION (keep first N) is applied as an output filter so the
|
||||
// discrete broadcast channels are preserved, not downmixed.
|
||||
// Audio FIFO → ffmpeg input 1. The deltacast bridge writes a 2ch s16le
|
||||
// 48kHz stream paced by the SDI slot clock (same clock as the video),
|
||||
// so wallclock timestamps + master aresample=async=1 keep A/V locked.
|
||||
'-use_wallclock_as_timestamps', '1',
|
||||
'-thread_queue_size', '512',
|
||||
'-f', 's16le',
|
||||
'-ar', '48000',
|
||||
'-ac', String(FIFO_CHANNELS),
|
||||
'-ac', '2',
|
||||
'-i', audioFifoPath,
|
||||
],
|
||||
isNetwork: false,
|
||||
|
|
@ -762,11 +735,6 @@ class CaptureManager {
|
|||
audioFifo: null,
|
||||
interlaced: fcInterlaced,
|
||||
audioInputIndex: 1, /* audio FIFO is ffmpeg input 1 */
|
||||
// Number of source channels available on the FIFO, and how many the
|
||||
// recorder wants kept (first N). The encode builder turns wantCh into a
|
||||
// channelmap so the master holds exactly those discrete channels.
|
||||
sourceAudioChannels: FIFO_CHANNELS,
|
||||
wantAudioChannels: wantCh,
|
||||
_fcPipeProcess: fcPipeProcess, /* stored for clean stop */
|
||||
};
|
||||
}
|
||||
|
|
@ -1058,25 +1026,10 @@ exit "$BMXRC"
|
|||
|
||||
this._sessionIdForBridge = sessionId;
|
||||
const { inputArgs, isNetwork, bridgeProcess = null, audioFifo = null, interlaced = false, audioInputIndex = 0,
|
||||
sourceAudioChannels = null, wantAudioChannels = null } = await this._buildInputArgs({
|
||||
} = await this._buildInputArgs({
|
||||
sourceType, sourceBackend, device, port, board, sourceUrl, listen, listenPort, streamKey,
|
||||
});
|
||||
|
||||
// Channel selection for the master: when the source FIFO carries more
|
||||
// discrete channels than the recorder wants (e.g. 16ch SDI → 2ch master),
|
||||
// keep the FIRST N channels as discrete streams (no downmix) via a `pan`
|
||||
// filter `c0=c0|c1=c1|…`. effAudioChannels is what the master container
|
||||
// actually holds and what `-ac` must declare.
|
||||
const effAudioChannels = (sourceAudioChannels && wantAudioChannels)
|
||||
? Math.min(wantAudioChannels, sourceAudioChannels)
|
||||
: audioChannels;
|
||||
const needChannelSelect = !!(sourceAudioChannels && wantAudioChannels && wantAudioChannels < sourceAudioChannels);
|
||||
const channelSelectFilter = needChannelSelect
|
||||
? `pan=${effAudioChannels}c|` + Array.from({ length: effAudioChannels }, (_, i) => `c${i}=c${i}`).join('|')
|
||||
: null;
|
||||
// Override the codec channel count so -ac matches the selected layout.
|
||||
if (sourceAudioChannels && wantAudioChannels) audioChannels = effAudioChannels;
|
||||
|
||||
// ── Pre-roll: discard initial unstable frames ────────────────────────────
|
||||
if (bridgeProcess && (sourceType === 'deltacast' || sourceType === 'blackmagic' || sourceType === 'sdi')) {
|
||||
console.log(`[capture] pre-rolling: discarding ${PRE_ROLL_SECONDS}s of frames`);
|
||||
|
|
@ -1194,18 +1147,11 @@ exit "$BMXRC"
|
|||
// ffmpeg doesn't fail trying to map a nonexistent audio stream.
|
||||
const hasAudio = audioInputIndex >= 0 && !isNetFcPipe;
|
||||
const masterAudioMap = hasAudio ? ['-map', audioMap] : [];
|
||||
// Master audio: optional first-N channel select (discrete, no downmix),
|
||||
// then async resample to lock A/V sync. Chain both into one -af.
|
||||
const masterFilterChain = [
|
||||
...(channelSelectFilter ? [channelSelectFilter] : []),
|
||||
'aresample=async=1:min_hard_comp=0.100000:first_pts=0',
|
||||
].join(',');
|
||||
const masterAudioFilter = hasAudio ? ['-af', masterFilterChain] : [];
|
||||
const masterAudioFilter = hasAudio
|
||||
? ['-af', 'aresample=async=1:min_hard_comp=0.100000:first_pts=0'] : [];
|
||||
const hlsAudioMap = hasAudio ? ['-map', audioMap] : [];
|
||||
// HLS preview is always stereo for browser playback — downmix the first
|
||||
// pair regardless of how many channels the master keeps.
|
||||
const hlsAudioCodec = hasAudio
|
||||
? ['-af', 'pan=stereo|c0=c0|c1=c1', '-c:a', 'aac', '-b:a', '128k', '-ar', '44100', '-ac', '2'] : [];
|
||||
? ['-c:a', 'aac', '-b:a', '128k', '-ar', '44100'] : [];
|
||||
hiresArgs = [
|
||||
...inputArgs,
|
||||
'-filter_complex', filterStr,
|
||||
|
|
|
|||
Loading…
Reference in a new issue