fix(engine): self-healing NDI discovery + unified poll loop
Some checks failed
CI / build-and-test (push) Failing after 26s
Some checks failed
CI / build-and-test (push) Failing after 26s
When a process spawns and NDI Find returns zero sources at cold start, the finder can stay stuck on zero forever even when other processes can see Teams' broadcasts. Observed today: a user's process launched at 12:50 ran for 9+ minutes showing 0 sources, while a parallel process launched at 12:59 immediately discovered 2 sources. Same exe, same install, same Teams meeting, same medium-integrity SAFER token. The first process's finder simply got into a bad state at construction (suspected: NIC-bind race against mDNS responder readiness, or a SAFER-token quirk in the NDI runtime's IPC layer). The fix: auto-rebuild the finder when (a) we've never seen a source and more than 5s have passed since startup, or (b) the source set has been empty for more than 15s after previously containing entries. Both paths back off (more than 5s and more than 10s between rebuilds, respectively) so we don't churn during legitimate empty periods. Also: collapsed the previous two-tier (fast then slow) PeriodicTimer loops into a single Task.Delay loop with a dynamic interval. Same behavior (200ms for the first 3s, then the operator-configured pollInterval), less code, and easier to thread the self-healing logic through. The finder is still disposed in a try/finally so cancellation paths don't leak. 246/246 tests still passing. The Discovery tests use PollOnce directly, so RunAsync changes don't affect them.
This commit is contained in:
parent
54ee578fe9
commit
c30a6163c8
1 changed file with 92 additions and 61 deletions
|
|
@ -74,84 +74,95 @@ public sealed class NdiDiscoveryService
|
|||
foreach (var name in currentSet) _previous.Add(name);
|
||||
}
|
||||
|
||||
/// <summary>Long-running poll loop. Cancel the token to stop.</summary>
|
||||
/// <summary>
|
||||
/// Long-running poll loop with cold-start ramp + self-healing.
|
||||
/// Cancel the token to stop.
|
||||
///
|
||||
/// Cadence: 200ms for the first 3 seconds (fast cold-start mDNS settling),
|
||||
/// then the configured <paramref name="pollInterval"/>.
|
||||
///
|
||||
/// Self-healing: certain process spawns end up with an NDI finder that
|
||||
/// returns 0 sources forever even when sources are visible to other
|
||||
/// processes (suspected cause: medium-integrity SAFER token from runas
|
||||
/// /trustlevel doesn't talk to NDI's mDNS responder reliably; could also
|
||||
/// be a NIC-bind race at finder construction). To recover, we rebuild
|
||||
/// the finder when:
|
||||
/// <list type="number">
|
||||
/// <item>We've never seen a source AND it's been >5s since startup AND
|
||||
/// it's been >5s since the last rebuild.</item>
|
||||
/// <item>We previously saw sources but the set has been empty for >15s
|
||||
/// AND it's been >10s since the last rebuild.</item>
|
||||
/// </list>
|
||||
/// Both rules apply backoff so we don't churn during legitimate empty
|
||||
/// periods (no meeting active, etc.) — the rebuild is cheap but the log
|
||||
/// noise isn't useful.
|
||||
/// </summary>
|
||||
public async Task RunAsync(TimeSpan pollInterval, CancellationToken cancellationToken)
|
||||
{
|
||||
// Wrap the whole method in try/finally so _finder is always disposed,
|
||||
// including the cancellation paths inside the fast-poll loop.
|
||||
try
|
||||
{
|
||||
// First poll happens IMMEDIATELY — PeriodicTimer.WaitForNextTickAsync
|
||||
// waits the full interval before its first tick, which created a
|
||||
// noticeable cold-start window where the UI showed "no ndi sources yet"
|
||||
// for ~500ms (or whatever the interval is) before discovery had a chance
|
||||
// to fire. Operators launching into a meeting that was already broadcasting
|
||||
// saw an empty table and assumed it was broken. Poll once up front to
|
||||
// pull whatever the NDI runtime has already cached, then settle into
|
||||
// the regular poll cadence.
|
||||
try { PollOnce(); }
|
||||
catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed; falling through to timer loop."); }
|
||||
// Immediate first poll — PeriodicTimer.WaitForNextTickAsync would
|
||||
// wait the full interval otherwise, costing us 200-500ms at cold
|
||||
// start when operators are most impatient.
|
||||
try { PollOnce(); } catch (Exception ex) { _logger.LogWarning(ex, "Initial discovery poll failed."); }
|
||||
|
||||
// Aggressive poll cadence for the first ~3 seconds so cold-start mDNS
|
||||
// resolution surfaces quickly (mDNS responses can lag the initial socket
|
||||
// setup by 200-1500ms depending on the network and the responder); after
|
||||
// that ramp down to the operator-configured pollInterval.
|
||||
var fastInterval = TimeSpan.FromMilliseconds(200);
|
||||
var startedAt = DateTimeOffset.UtcNow;
|
||||
var rampToFullAfter = TimeSpan.FromSeconds(3);
|
||||
var fastUntil = startedAt + TimeSpan.FromSeconds(3);
|
||||
var fastInterval = TimeSpan.FromMilliseconds(200);
|
||||
DateTimeOffset? lastSeenAt = _previous.Count > 0 ? startedAt : null;
|
||||
var lastRebuildAt = startedAt;
|
||||
|
||||
using (var fastTimer = new PeriodicTimer(fastInterval))
|
||||
while (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
var interval = now < fastUntil ? fastInterval : pollInterval;
|
||||
try { await Task.Delay(interval, cancellationToken); }
|
||||
catch (OperationCanceledException) { break; }
|
||||
|
||||
now = DateTimeOffset.UtcNow;
|
||||
|
||||
// Operator-requested rebuild (Refresh discovery in the UI) wins.
|
||||
if (Interlocked.Exchange(ref _refreshRequested, 0) == 1)
|
||||
{
|
||||
while (DateTimeOffset.UtcNow - startedAt < rampToFullAfter)
|
||||
{
|
||||
if (!await fastTimer.WaitForNextTickAsync(cancellationToken)) break;
|
||||
if (Interlocked.Exchange(ref _refreshRequested, 0) == 1)
|
||||
{
|
||||
try
|
||||
{
|
||||
_logger.LogInformation("Rebuilding NDI finder on operator request.");
|
||||
_finder.Dispose();
|
||||
_finder = _interop.CreateFinder(_discoveryGroups);
|
||||
_previous.Clear();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder.");
|
||||
}
|
||||
}
|
||||
try { PollOnce(); }
|
||||
catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); }
|
||||
}
|
||||
RebuildFinder("operator request");
|
||||
lastRebuildAt = now;
|
||||
}
|
||||
catch (OperationCanceledException) { return; }
|
||||
}
|
||||
|
||||
using var timer = new PeriodicTimer(pollInterval);
|
||||
try
|
||||
{
|
||||
while (await timer.WaitForNextTickAsync(cancellationToken))
|
||||
// Auto-healing rebuilds. Only one path can fire per tick.
|
||||
else if (_previous.Count == 0)
|
||||
{
|
||||
if (Interlocked.Exchange(ref _refreshRequested, 0) == 1)
|
||||
var sinceStart = now - startedAt;
|
||||
var sinceRebuild = now - lastRebuildAt;
|
||||
if (lastSeenAt is null)
|
||||
{
|
||||
try
|
||||
// Never seen a source. After 5s of empty results, try a rebuild
|
||||
// every 5s. This handles the cold-start case where the initial
|
||||
// finder didn't bind to the right network interface.
|
||||
if (sinceStart > TimeSpan.FromSeconds(5) && sinceRebuild > TimeSpan.FromSeconds(5))
|
||||
{
|
||||
_logger.LogInformation("Rebuilding NDI finder on operator request.");
|
||||
_finder.Dispose();
|
||||
_finder = _interop.CreateFinder(_discoveryGroups);
|
||||
_previous.Clear();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Finder refresh failed; continuing with existing finder.");
|
||||
RebuildFinder("auto-heal: never saw a source");
|
||||
lastRebuildAt = now;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// We've seen sources before but currently see nothing. After
|
||||
// 15s of empty results, try a rebuild every 10s. This handles
|
||||
// the "Teams stopped broadcasting then started again but our
|
||||
// finder didn't pick up the new advertisements" case.
|
||||
var sinceLastSeen = now - lastSeenAt.Value;
|
||||
if (sinceLastSeen > TimeSpan.FromSeconds(15) && sinceRebuild > TimeSpan.FromSeconds(10))
|
||||
{
|
||||
RebuildFinder("auto-heal: source set went empty 15s ago");
|
||||
lastRebuildAt = now;
|
||||
}
|
||||
}
|
||||
try { PollOnce(); }
|
||||
catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); }
|
||||
}
|
||||
|
||||
try { PollOnce(); }
|
||||
catch (Exception ex) { _logger.LogWarning(ex, "Discovery poll failed; will retry on next tick."); }
|
||||
|
||||
if (_previous.Count > 0) lastSeenAt = DateTimeOffset.UtcNow;
|
||||
}
|
||||
catch (OperationCanceledException) { /* expected */ }
|
||||
}
|
||||
finally
|
||||
{
|
||||
|
|
@ -159,6 +170,26 @@ public sealed class NdiDiscoveryService
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Dispose the current finder and create a fresh one against the cached
/// discovery groups. On success the seen-set is cleared so all
/// currently-visible sources will re-fire as <see cref="DiscoveryEvent.Added"/>
/// on the next poll.
/// </summary>
/// <remarks>
/// Exceptions from disposal or creation are logged as warnings rather than
/// propagated. The two steps are guarded separately so that a throwing
/// <c>Dispose</c> cannot prevent the replacement finder from being created,
/// and so that the log accurately reports which step failed.
/// </remarks>
/// <param name="reason">Short human-readable trigger for the rebuild; included in every log line.</param>
private void RebuildFinder(string reason)
{
    _logger.LogInformation("Rebuilding NDI finder ({Reason}).", reason);

    // The old finder is torn down before its replacement is created.
    // NOTE(review): presumably the NDI runtime needs the old instance gone
    // for a rebuild to reset its discovery state; confirm before reordering.
    try
    {
        _finder.Dispose();
    }
    catch (Exception ex)
    {
        // A failed teardown must not block the rebuild; otherwise a finder
        // whose Dispose keeps throwing could never be replaced.
        _logger.LogWarning(ex, "Disposing the previous NDI finder failed ({Reason}); creating a replacement anyway.", reason);
    }

    try
    {
        _finder = _interop.CreateFinder(_discoveryGroups);

        // Only forget the seen-set once a working finder is in place.
        _previous.Clear();
    }
    catch (Exception ex)
    {
        // At this point Dispose has already been attempted on the old finder,
        // so there is no "existing finder" to fall back on; say so plainly.
        _logger.LogWarning(ex, "Finder rebuild failed ({Reason}); the previous finder was already disposed, so polls may fail until a later rebuild succeeds.", reason);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates the cached discovery-groups string used by future finder rebuilds.
|
||||
/// Call <see cref="RequestRefresh"/> after this to actually pick up the change.
|
||||
|
|
|
|||
Loading…
Reference in a new issue