diff --git a/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs b/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs
index e4d34bc..0ec626c 100644
--- a/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs
+++ b/src/TeamsISO.Engine/Discovery/NdiDiscoveryService.cs
@@ -74,84 +74,95 @@ public sealed class NdiDiscoveryService
foreach (var name in currentSet) _previous.Add(name);
}
- /// Long-running poll loop. Cancel the token to stop.
+ ///
+ /// Long-running poll loop with cold-start ramp + self-healing.
+ /// Cancel the token to stop.
+ ///
+ /// Cadence: 200ms for the first 3 seconds (fast cold-start mDNS settling),
+ /// then the configured <paramref name="pollInterval"/>.
+ ///
+ /// Self-healing: certain process spawns end up with an NDI finder that
+ /// returns 0 sources forever even when sources are visible to other
+ /// processes (suspected cause: medium-integrity SAFER token from runas
+ /// /trustlevel doesn't talk to NDI's mDNS responder reliably; could also
+ /// be a NIC-bind race at finder construction). To recover, we rebuild
+ /// the finder when:
+ ///
+ /// - We've never seen a source AND it's been >5s since startup AND
+ /// it's been >5s since the last rebuild.
+ /// - We previously saw sources but the set has been empty for >15s
+ /// AND it's been >10s since the last rebuild.
+ ///
+ /// Both rules apply backoff so we don't churn during legitimate empty
+ /// periods (no meeting active, etc.) — the rebuild is cheap but the log
+ /// noise isn't useful.
+ ///
public async Task RunAsync(TimeSpan pollInterval, CancellationToken cancellationToken)
{
- // Wrap the whole method in try/finally so _finder is always disposed,
- // including the cancellation paths inside the fast-poll loop.
try
{
- // First poll happens IMMEDIATELY — PeriodicTimer.WaitForNextTickAsync
- // waits the full interval before its first tick, which created a
- // noticeable cold-start window where the UI showed "no ndi sources yet"
- // for ~500ms (or whatever the interval is) before discovery had a chance
- // to fire. Operators launching into a meeting that was already broadcasting
- // saw an empty table and assumed it was broken. Poll once up front to
- // pull whatever the NDI runtime has already cached, then settle into
- // the regular poll cadence.
- try { PollOnce(); }
- catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed; falling through to timer loop."); }
+ // Immediate first poll — PeriodicTimer.WaitForNextTickAsync would
+ // wait the full interval otherwise, costing us 200-500ms at cold
+ // start when operators are most impatient.
+ try { PollOnce(); } catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed."); }
- // Aggressive poll cadence for the first ~3 seconds so cold-start mDNS
- // resolution surfaces quickly (mDNS responses can lag the initial socket
- // setup by 200-1500ms depending on the network and the responder); after
- // that ramp down to the operator-configured pollInterval.
- var fastInterval = TimeSpan.FromMilliseconds(200);
var startedAt = DateTimeOffset.UtcNow;
- var rampToFullAfter = TimeSpan.FromSeconds(3);
+ var fastUntil = startedAt + TimeSpan.FromSeconds(3);
+ var fastInterval = TimeSpan.FromMilliseconds(200);
+ DateTimeOffset? lastSeenAt = _previous.Count > 0 ? startedAt : null;
+ var lastRebuildAt = startedAt;
- using (var fastTimer = new PeriodicTimer(fastInterval))
+ while (!cancellationToken.IsCancellationRequested)
{
- try
+ var now = DateTimeOffset.UtcNow;
+ var interval = now < fastUntil ? fastInterval : pollInterval;
+ try { await Task.Delay(interval, cancellationToken); }
+ catch (OperationCanceledException) { break; }
+
+ now = DateTimeOffset.UtcNow;
+
+ // Operator-requested rebuild (Refresh discovery in the UI) wins.
+ if (Interlocked.Exchange(ref _refreshRequested, 0) == 1)
{
- while (DateTimeOffset.UtcNow - startedAt < rampToFullAfter)
- {
- if (!await fastTimer.WaitForNextTickAsync(cancellationToken)) break;
- if (Interlocked.Exchange(ref _refreshRequested, 0) == 1)
- {
- try
- {
- _logger.LogInformation("Rebuilding NDI finder on operator request.");
- _finder.Dispose();
- _finder = _interop.CreateFinder(_discoveryGroups);
- _previous.Clear();
- }
- catch (Exception ex)
- {
- _logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder.");
- }
- }
- try { PollOnce(); }
- catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); }
- }
+ RebuildFinder("operator request");
+ lastRebuildAt = now;
}
- catch (OperationCanceledException) { return; }
- }
-
- using var timer = new PeriodicTimer(pollInterval);
- try
- {
- while (await timer.WaitForNextTickAsync(cancellationToken))
+ // Auto-healing rebuilds. Only one path can fire per tick.
+ else if (_previous.Count == 0)
{
- if (Interlocked.Exchange(ref _refreshRequested, 0) == 1)
+ var sinceStart = now - startedAt;
+ var sinceRebuild = now - lastRebuildAt;
+ if (lastSeenAt is null)
{
- try
+ // Never seen a source. After 5s of empty results, try a rebuild
+ // every 5s. This handles the cold-start case where the initial
+ // finder didn't bind to the right network interface.
+ if (sinceStart > TimeSpan.FromSeconds(5) && sinceRebuild > TimeSpan.FromSeconds(5))
{
- _logger.LogInformation("Rebuilding NDI finder on operator request.");
- _finder.Dispose();
- _finder = _interop.CreateFinder(_discoveryGroups);
- _previous.Clear();
- }
- catch (Exception ex)
- {
- _logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder.");
+ RebuildFinder("auto-heal: never saw a source");
+ lastRebuildAt = now;
+ }
+ }
+ else
+ {
+ // We've seen sources before but currently see nothing. After
+ // 15s of empty results, try a rebuild every 10s. This handles
+ // the "Teams stopped broadcasting then started again but our
+ // finder didn't pick up the new advertisements" case.
+ var sinceLastSeen = now - lastSeenAt.Value;
+ if (sinceLastSeen > TimeSpan.FromSeconds(15) && sinceRebuild > TimeSpan.FromSeconds(10))
+ {
+ RebuildFinder("auto-heal: source set went empty 15s ago");
+ lastRebuildAt = now;
}
}
- try { PollOnce(); }
- catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); }
}
+
+ try { PollOnce(); }
+ catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); }
+
+ if (_previous.Count > 0) lastSeenAt = DateTimeOffset.UtcNow;
}
- catch (OperationCanceledException) { /* expected */ }
}
finally
{
@@ -159,6 +170,26 @@ public sealed class NdiDiscoveryService
}
}
+    /// <summary>
+    /// Replaces the current NDI finder with a fresh one created against the
+    /// cached discovery groups, then clears the seen-set so the next poll
+    /// starts from an empty baseline (presumably re-reporting every source
+    /// that is still visible).
+    /// </summary>
+    /// <remarks>
+    /// The replacement is created BEFORE the old finder is disposed. If
+    /// creation throws, the old finder is left untouched and still usable,
+    /// so the "continuing with existing finder" warning is actually true and
+    /// later polls never run against a disposed finder. Failures are logged
+    /// and swallowed; this method does not throw.
+    /// </remarks>
+    /// <param name="reason">Short human-readable cause, included in the log lines.</param>
+    private void RebuildFinder(string reason)
+    {
+        try
+        {
+            _logger.LogInformation("Rebuilding NDI finder ({Reason}).", reason);
+
+            // Create first: a throw here must not leave _finder pointing at a
+            // disposed instance.
+            // NOTE(review): assumes two finders may briefly coexist in one
+            // process — confirm against the interop layer.
+            var replacement = _interop.CreateFinder(_discoveryGroups);
+            var retired = _finder;
+            _finder = replacement;
+            _previous.Clear();
+
+            // The swap has already succeeded; a failing Dispose on the old
+            // finder should not be reported as a failed rebuild.
+            try
+            {
+                retired.Dispose();
+            }
+            catch (Exception ex)
+            {
+                _logger.LogWarning(ex, "Disposing the previous NDI finder failed ({Reason}); continuing with the new finder.", reason);
+            }
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Finder rebuild failed ({Reason}); continuing with existing finder.", reason);
+        }
+    }
+
///
/// Updates the cached discovery-groups string used by future finder rebuilds.
/// Request a finder rebuild after this to actually pick up the change.