From c30a6163c8b13ef1902c0ffcfc5c6ae7917756ae Mon Sep 17 00:00:00 2001 From: Zac Gaetano Date: Sat, 16 May 2026 13:35:22 -0400 Subject: [PATCH] fix(engine): self-healing NDI discovery + unified poll loop When a process spawns and NDI Find returns zero sources at cold start, the finder can stay stuck on zero forever even when other processes can see Teams' broadcasts. Observed today: a user's PID launched at 12:50, ran for 9+ minutes showing 0 sources, while a parallel PID launched at 12:59 immediately discovered 2 sources. Same exe, same install, same Teams meeting, same medium-integrity SAFER token. The first process's finder simply got into a bad state at construction (suspected: NIC-bind race against mDNS responder readiness, or a SAFER-token quirk in the NDI runtime's IPC layer). The fix: auto-rebuild the finder when (a) we've never seen a source and 5s have passed since startup, or (b) the source set has been empty for 15s after previously containing entries. Both paths back off (>=5s and >=10s between rebuilds respectively) so we don't churn during legitimate empty periods. Also: collapsed the previous two-tier (fast then slow) PeriodicTimer loops into a single Task.Delay loop with a dynamic interval. Same behavior (200ms for first 3s, then operator-configured pollInterval), less code, easier to thread the self-healing logic through. The finder is still disposed in a try/finally so cancellation paths don't leak. 246/246 tests still passing. The Discovery tests use PollOnce directly so RunAsync changes don't affect them. 
--- .../Discovery/NdiDiscoveryService.cs | 153 +++++++++++------- 1 file changed, 92 insertions(+), 61 deletions(-) diff --git a/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs b/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs index e4d34bc..0ec626c 100644 --- a/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs +++ b/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs @@ -74,84 +74,95 @@ public sealed class NdiDiscoveryService foreach (var name in currentSet) _previous.Add(name); } - /// Long-running poll loop. Cancel the token to stop. + /// + /// Long-running poll loop with cold-start ramp + self-healing. + /// Cancel the token to stop. + /// + /// Cadence: 200ms for the first 3 seconds (fast cold-start mDNS settling), + /// then the configured <paramref name="pollInterval"/>. + /// + /// Self-healing: certain process spawns end up with an NDI finder that + /// returns 0 sources forever even when sources are visible to other + /// processes (suspected cause: medium-integrity SAFER token from runas + /// /trustlevel doesn't talk to NDI's mDNS responder reliably; could also + /// be a NIC-bind race at finder construction). To recover, we rebuild + /// the finder when: + /// + /// We've never seen a source AND it's been >5s since startup AND + /// it's been >5s since the last rebuild. + /// We previously saw sources but the set has been empty for >15s + /// AND it's been >10s since the last rebuild. + /// + /// Both rules apply backoff so we don't churn during legitimate empty + /// periods (no meeting active, etc.) — the rebuild is cheap but the log + /// noise isn't useful. + /// public async Task RunAsync(TimeSpan pollInterval, CancellationToken cancellationToken) { - // Wrap the whole method in try/finally so _finder is always disposed, - // including the cancellation paths inside the fast-poll loop. 
try { - // First poll happens IMMEDIATELY — PeriodicTimer.WaitForNextTickAsync - // waits the full interval before its first tick, which created a - // noticeable cold-start window where the UI showed "no ndi sources yet" - // for ~500ms (or whatever the interval is) before discovery had a chance - // to fire. Operators launching into a meeting that was already broadcasting - // saw an empty table and assumed it was broken. Poll once up front to - // pull whatever the NDI runtime has already cached, then settle into - // the regular poll cadence. - try { PollOnce(); } - catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed; falling through to timer loop."); } + // Immediate first poll — PeriodicTimer.WaitForNextTickAsync would + // wait the full interval otherwise, costing us 200-500ms at cold + // start when operators are most impatient. + try { PollOnce(); } catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed."); } - // Aggressive poll cadence for the first ~3 seconds so cold-start mDNS - // resolution surfaces quickly (mDNS responses can lag the initial socket - // setup by 200-1500ms depending on the network and the responder); after - // that ramp down to the operator-configured pollInterval. - var fastInterval = TimeSpan.FromMilliseconds(200); var startedAt = DateTimeOffset.UtcNow; - var rampToFullAfter = TimeSpan.FromSeconds(3); + var fastUntil = startedAt + TimeSpan.FromSeconds(3); + var fastInterval = TimeSpan.FromMilliseconds(200); + DateTimeOffset? lastSeenAt = _previous.Count > 0 ? startedAt : null; + var lastRebuildAt = startedAt; - using (var fastTimer = new PeriodicTimer(fastInterval)) + while (!cancellationToken.IsCancellationRequested) { - try + var now = DateTimeOffset.UtcNow; + var interval = now < fastUntil ? 
fastInterval : pollInterval; + try { await Task.Delay(interval, cancellationToken); } + catch (OperationCanceledException) { break; } + + now = DateTimeOffset.UtcNow; + + // Operator-requested rebuild (Refresh discovery in the UI) wins. + if (Interlocked.Exchange(ref _refreshRequested, 0) == 1) { - while (DateTimeOffset.UtcNow - startedAt < rampToFullAfter) - { - if (!await fastTimer.WaitForNextTickAsync(cancellationToken)) break; - if (Interlocked.Exchange(ref _refreshRequested, 0) == 1) - { - try - { - _logger.LogInformation("Rebuilding NDI finder on operator request."); - _finder.Dispose(); - _finder = _interop.CreateFinder(_discoveryGroups); - _previous.Clear(); - } - catch (Exception ex) - { - _logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder."); - } - } - try { PollOnce(); } - catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); } - } + RebuildFinder("operator request"); + lastRebuildAt = now; } - catch (OperationCanceledException) { return; } - } - - using var timer = new PeriodicTimer(pollInterval); - try - { - while (await timer.WaitForNextTickAsync(cancellationToken)) + // Auto-healing rebuilds. Only one path can fire per tick. + else if (_previous.Count == 0) { - if (Interlocked.Exchange(ref _refreshRequested, 0) == 1) + var sinceStart = now - startedAt; + var sinceRebuild = now - lastRebuildAt; + if (lastSeenAt is null) { - try + // Never seen a source. After 5s of empty results, try a rebuild + // every 5s. This handles the cold-start case where the initial + // finder didn't bind to the right network interface. 
+ if (sinceStart > TimeSpan.FromSeconds(5) && sinceRebuild > TimeSpan.FromSeconds(5)) { - _logger.LogInformation("Rebuilding NDI finder on operator request."); - _finder.Dispose(); - _finder = _interop.CreateFinder(_discoveryGroups); - _previous.Clear(); - } - catch (Exception ex) - { - _logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder."); + RebuildFinder("auto-heal: never saw a source"); + lastRebuildAt = now; + } + } + else + { + // We've seen sources before but currently see nothing. After + // 15s of empty results, try a rebuild every 10s. This handles + // the "Teams stopped broadcasting then started again but our + // finder didn't pick up the new advertisements" case. + var sinceLastSeen = now - lastSeenAt.Value; + if (sinceLastSeen > TimeSpan.FromSeconds(15) && sinceRebuild > TimeSpan.FromSeconds(10)) + { + RebuildFinder("auto-heal: source set went empty 15s ago"); + lastRebuildAt = now; } } - try { PollOnce(); } - catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); } } + + try { PollOnce(); } + catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); } + + if (_previous.Count > 0) lastSeenAt = DateTimeOffset.UtcNow; } - catch (OperationCanceledException) { /* expected */ } } finally { @@ -159,6 +170,26 @@ public sealed class NdiDiscoveryService } } + /// + /// Dispose the current finder and create a fresh one against the cached + /// discovery groups. Clears the seen-set so all currently-visible sources + /// will re-fire as on the next poll. 
+ /// + private void RebuildFinder(string reason) + { + try + { + _logger.LogInformation("Rebuilding NDI finder ({Reason}).", reason); + _finder.Dispose(); + _finder = _interop.CreateFinder(_discoveryGroups); + _previous.Clear(); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Finder rebuild failed ({Reason}); continuing with existing finder.", reason); + } + } + /// /// Updates the cached discovery-groups string used by future finder rebuilds. /// Call after this to actually pick up the change.