fix(deltacast-bridge): serialize VHD_OpenBoardHandle via flock to prevent BufMngr wedge

Concurrent VHD_OpenBoardHandle calls from multiple capture sidecars
trigger delta_x300 BufMngr.c:781 array-index-out-of-bounds, wedging all
RX channels until the module is reloaded. The node-agent stagger only
delays container start — the bridge binary starts ~2s later and can still
race. This fix acquires an exclusive flock on /dev/shm/deltacast/bridge.lock
before VHD_OpenBoardHandle and holds it until the input signal locks, then
waits a further 4s before releasing so the board's buffer queues stabilize.
The flock is also released if VHD_OpenBoardHandle fails or no signal arrives
within the timeout, so the next bridge is never permanently blocked. All 8
channels can now start safely by serializing through the same lock file
mounted into every sidecar.
This commit is contained in:
Zac Gaetano 2026-06-01 18:54:00 -04:00
parent b324878db9
commit 7d704d3af3

View file

@ -17,6 +17,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/file.h>
#include <time.h>
#include <unistd.h>
@ -272,10 +273,35 @@ int main(int argc, char *argv[]) {
return 1;
}
/* ── Serialize board open via flock ──────────────────────────────
* delta_x300 BufMngr.c:781 has an array-index-out-of-bounds bug that
* fires when two VHD_OpenBoardHandle calls race on the same board.
* Use a cross-container exclusive lock on a file in /dev/shm/deltacast/
* (already bind-mounted into every capture sidecar) to guarantee only
* one bridge runs OpenBoardHandle + signal-wait at a time. The lock is
* released after signal lock succeeds (plus a settle delay) or on
* failure so the next bridge is never permanently blocked. */
const char *lock_path = "/dev/shm/deltacast/bridge.lock";
int lock_fd = open(lock_path, O_CREAT | O_RDWR, 0666);
if (lock_fd >= 0) {
fprintf(stderr, "[board] waiting for board-open lock (port %u)...\n", port_id);
if (flock(lock_fd, LOCK_EX) != 0) {
fprintf(stderr, "[board] flock failed: %s — proceeding without lock\n", strerror(errno));
close(lock_fd);
lock_fd = -1;
} else {
fprintf(stderr, "[board] lock acquired (port %u)\n", port_id);
}
} else {
fprintf(stderr, "[board] could not open lock file %s: %s — proceeding without lock\n",
lock_path, strerror(errno));
}
/* ── Open board ───────────────────────────────────────────────── */
HANDLE board = NULL;
if (VHD_OpenBoardHandle(device_id, &board, NULL, 0) != VHDERR_NOERROR) {
fprintf(stderr, "{\"error\":\"VHD_OpenBoardHandle failed for board %u\"}\n", device_id);
if (lock_fd >= 0) { flock(lock_fd, LOCK_UN); close(lock_fd); }
return 1;
}
@ -312,9 +338,22 @@ int main(int argc, char *argv[]) {
"{\"error\":\"no signal on board %u port %u within %ds\"}\n",
device_id, port_id, sig_timeout);
VHD_CloseBoardHandle(board);
if (lock_fd >= 0) { flock(lock_fd, LOCK_UN); close(lock_fd); }
return 1;
}
/* Signal locked. Hold the board-open lock for a settle period so the
* board's RX buffer queues are fully initialised before the next bridge
* calls OpenBoardHandle. 4 seconds is enough for 1080p59.94 @ queue-depth 8. */
if (lock_fd >= 0) {
struct timespec settle = {4, 0};
nanosleep(&settle, NULL);
flock(lock_fd, LOCK_UN);
close(lock_fd);
lock_fd = -1;
fprintf(stderr, "[board] lock released (port %u) — streaming\n", port_id);
}
ULONG clock_div = VHD_CLOCKDIV_1;
VHD_GetChannelProperty(board, VHD_RX_CHANNEL, port_id,
VHD_SDI_CP_CLOCK_DIVISOR, &clock_div);