diff --git a/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs b/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs index e4d34bc..0ec626c 100644 --- a/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs +++ b/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs @@ -74,84 +74,95 @@ public sealed class NdiDiscoveryService foreach (var name in currentSet) _previous.Add(name); } - /// Long-running poll loop. Cancel the token to stop. + /// + /// Long-running poll loop with cold-start ramp + self-healing. + /// Cancel the token to stop. + /// + /// Cadence: 200ms for the first 3 seconds (fast cold-start mDNS settling), + /// then the configured <paramref name="pollInterval"/>. + /// + /// Self-healing: certain process spawns end up with an NDI finder that + /// returns 0 sources forever even when sources are visible to other + /// processes (suspected cause: medium-integrity SAFER token from runas + /// /trustlevel doesn't talk to NDI's mDNS responder reliably; could also + /// be a NIC-bind race at finder construction). To recover, we rebuild + /// the finder when: + /// + /// We've never seen a source AND it's been >5s since startup AND + /// it's been >5s since the last rebuild. + /// We previously saw sources but the set has been empty for >15s + /// AND it's been >10s since the last rebuild. + /// + /// Both rules apply backoff so we don't churn during legitimate empty + /// periods (no meeting active, etc.) — the rebuild is cheap but the log + /// noise isn't useful. + /// public async Task RunAsync(TimeSpan pollInterval, CancellationToken cancellationToken) { - // Wrap the whole method in try/finally so _finder is always disposed, - // including the cancellation paths inside the fast-poll loop. 
try { - // First poll happens IMMEDIATELY — PeriodicTimer.WaitForNextTickAsync - // waits the full interval before its first tick, which created a - // noticeable cold-start window where the UI showed "no ndi sources yet" - // for ~500ms (or whatever the interval is) before discovery had a chance - // to fire. Operators launching into a meeting that was already broadcasting - // saw an empty table and assumed it was broken. Poll once up front to - // pull whatever the NDI runtime has already cached, then settle into - // the regular poll cadence. - try { PollOnce(); } - catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed; falling through to timer loop."); } + // Immediate first poll — PeriodicTimer.WaitForNextTickAsync would + // wait the full interval otherwise, costing us 200-500ms at cold + // start when operators are most impatient. + try { PollOnce(); } catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed."); } - // Aggressive poll cadence for the first ~3 seconds so cold-start mDNS - // resolution surfaces quickly (mDNS responses can lag the initial socket - // setup by 200-1500ms depending on the network and the responder); after - // that ramp down to the operator-configured pollInterval. - var fastInterval = TimeSpan.FromMilliseconds(200); var startedAt = DateTimeOffset.UtcNow; - var rampToFullAfter = TimeSpan.FromSeconds(3); + var fastUntil = startedAt + TimeSpan.FromSeconds(3); + var fastInterval = TimeSpan.FromMilliseconds(200); + DateTimeOffset? lastSeenAt = _previous.Count > 0 ? startedAt : null; + var lastRebuildAt = startedAt; - using (var fastTimer = new PeriodicTimer(fastInterval)) + while (!cancellationToken.IsCancellationRequested) { - try + var now = DateTimeOffset.UtcNow; + var interval = now < fastUntil ? 
fastInterval : pollInterval; + try { await Task.Delay(interval, cancellationToken); } + catch (OperationCanceledException) { break; } + + now = DateTimeOffset.UtcNow; + + // Operator-requested rebuild (Refresh discovery in the UI) wins. + if (Interlocked.Exchange(ref _refreshRequested, 0) == 1) { - while (DateTimeOffset.UtcNow - startedAt < rampToFullAfter) - { - if (!await fastTimer.WaitForNextTickAsync(cancellationToken)) break; - if (Interlocked.Exchange(ref _refreshRequested, 0) == 1) - { - try - { - _logger.LogInformation("Rebuilding NDI finder on operator request."); - _finder.Dispose(); - _finder = _interop.CreateFinder(_discoveryGroups); - _previous.Clear(); - } - catch (Exception ex) - { - _logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder."); - } - } - try { PollOnce(); } - catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); } - } + RebuildFinder("operator request"); + // RebuildFinder logs and swallows its own failures, so the backoff + // clock below advances whether or not the rebuild succeeded. + lastRebuildAt = now; } - catch (OperationCanceledException) { return; } - } - - using var timer = new PeriodicTimer(pollInterval); - try - { - while (await timer.WaitForNextTickAsync(cancellationToken)) + // Auto-healing rebuilds. Only one path can fire per tick. + else if (_previous.Count == 0) { - if (Interlocked.Exchange(ref _refreshRequested, 0) == 1) + var sinceStart = now - startedAt; + var sinceRebuild = now - lastRebuildAt; + if (lastSeenAt is null) { - try + // Never seen a source. After 5s of empty results, try a rebuild + // every 5s. This handles the cold-start case where the initial + // finder didn't bind to the right network interface. 
+ if (sinceStart > TimeSpan.FromSeconds(5) && sinceRebuild > TimeSpan.FromSeconds(5)) { - _logger.LogInformation("Rebuilding NDI finder on operator request."); - _finder.Dispose(); - _finder = _interop.CreateFinder(_discoveryGroups); - _previous.Clear(); - } - catch (Exception ex) - { - _logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder."); + RebuildFinder("auto-heal: never saw a source"); + lastRebuildAt = now; + } + } + else + { + // We've seen sources before but currently see nothing. After + // 15s of empty results, try a rebuild every 10s. This handles + // the "Teams stopped broadcasting then started again but our + // finder didn't pick up the new advertisements" case. + var sinceLastSeen = now - lastSeenAt.Value; + if (sinceLastSeen > TimeSpan.FromSeconds(15) && sinceRebuild > TimeSpan.FromSeconds(10)) + { + RebuildFinder("auto-heal: source set went empty 15s ago"); + lastRebuildAt = now; } } - try { PollOnce(); } - catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); } } + + try { PollOnce(); } + catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); } + + // Remember when the seen-set was last non-empty; the 15s empty-set + // rule above measures from this stamp. + if (_previous.Count > 0) lastSeenAt = DateTimeOffset.UtcNow; } - catch (OperationCanceledException) { /* expected */ } } finally { @@ -159,6 +170,26 @@ public sealed class NdiDiscoveryService } } + /// + /// Dispose the current finder and create a fresh one against the cached + /// discovery groups. Clears the seen-set so all currently-visible sources + /// will re-fire as on the next poll. 
+    ///
+    private void RebuildFinder(string reason)
+    {
+        // reason: short human-readable cause, written into both log messages.
+        // Never throws: every failure is logged as a warning and swallowed so
+        // the poll loop that calls this keeps running.
+        try
+        {
+            _logger.LogInformation("Rebuilding NDI finder ({Reason}).", reason);
+
+            // Create the replacement BEFORE disposing the current finder. If
+            // CreateFinder throws, _finder is left untouched and still usable,
+            // which is what the warning in the outer catch promises
+            // ("continuing with existing finder"). Disposing first would leave
+            // _finder pointing at an already-disposed object on failure.
+            // NOTE(review): two finders coexist for a moment — confirm the
+            // interop layer permits more than one live finder.
+            var replacement = _interop.CreateFinder(_discoveryGroups);
+            var retired = _finder;
+            _finder = replacement;
+
+            // Forget the seen-set so currently-visible sources are reported
+            // again by the next poll against the new finder.
+            _previous.Clear();
+
+            try
+            {
+                retired.Dispose();
+            }
+            catch (Exception ex)
+            {
+                // The swap already happened; a failed cleanup of the old
+                // finder must not be reported as a failed rebuild.
+                _logger.LogWarning(ex, "Disposing the previous NDI finder failed ({Reason}); continuing with the new finder.", reason);
+            }
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Finder rebuild failed ({Reason}); continuing with existing finder.", reason);
+        }
+    }
+
     ///
     /// Updates the cached discovery-groups string used by future finder rebuilds.
     /// Call after this to actually pick up the change.