Skip to content

Commit

Permalink
Add operational resolution retries in OperationalSessionSetup. (#25991)
Browse files Browse the repository at this point in the history
Apparently some border routers are broken and don't actually do the "multicast"
part of mDNS right, leading to resolve failures if the advertisement is not
available during the initial unicast-response probe.  Work around that by
retrying operational discovery a few times during commissioning.
  • Loading branch information
bzbarsky-apple authored and pull[bot] committed Oct 31, 2023
1 parent af14412 commit 756f5dc
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 5 deletions.
59 changes: 54 additions & 5 deletions src/app/OperationalSessionSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,10 @@ CHIP_ERROR OperationalSessionSetup::LookupPeerAddress()
{
++mAttemptsDone;
}
if (mResolveAttemptsAllowed > 0)
{
--mResolveAttemptsAllowed;
}
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

// NOTE: This is public API that can be used to update our stored peer
Expand Down Expand Up @@ -502,9 +506,44 @@ void OperationalSessionSetup::OnNodeAddressResolutionFailed(const PeerId & peerI
ChipLogError(Discovery, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: operational discovery failed: %" CHIP_ERROR_FORMAT,
mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId()), reason.Format());

// Does it make sense to ScheduleSessionSetupReattempt() here? DNS-SD
// resolution has its own retry/backoff mechanisms, so if it's failed we
// have already done a lot of that.
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// If we're in a mode where we would generally retry CASE, retry operational
// discovery once. That allows us to more-gracefully handle broken networks
// where multicast DNS does not actually work and hence only the initial
// unicast DNS-SD queries get a response.
//
// We check for State::ResolvingAddress just in case in the meantime
// something weird happened and we are no longer trying to resolve an
// address.
if (mState == State::ResolvingAddress && mResolveAttemptsAllowed > 0)
{
ChipLogProgress(Discovery, "Retrying operational DNS-SD discovery. Attempts remaining: %u", mResolveAttemptsAllowed);

// Pretend like our previous attempt (i.e. call to LookupPeerAddress)
// has not happened for purposes of the generic attempt counters, so we
// don't mess up the counters for our actual CASE retry logic.
if (mRemainingAttempts < UINT8_MAX)
{
++mRemainingAttempts;
}
if (mAttemptsDone > 0)
{
--mAttemptsDone;
}

CHIP_ERROR err = LookupPeerAddress();
if (err == CHIP_NO_ERROR)
{
// We need to notify our consumer that the resolve will take more
// time, but we don't actually know how much time it will take,
// because the resolver does not expose that information. Just use
// one minute to be safe.
using namespace chip::System::Clock::Literals;
NotifyRetryHandlers(reason, 60_s16);
return;
}
}
#endif

// No need to modify any variables in `this` since call below releases `this`.
DequeueConnectionCallbacks(reason);
Expand All @@ -531,6 +570,11 @@ void OperationalSessionSetup::UpdateAttemptCount(uint8_t attemptCount)
{
mRemainingAttempts = attemptCount;
}

if (attemptCount > mResolveAttemptsAllowed)
{
mResolveAttemptsAllowed = attemptCount;
}
}

CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay)
Expand Down Expand Up @@ -619,11 +663,16 @@ void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, const Reliab
System::Clock::Timeout messageTimeout = CASESession::ComputeSigma1ResponseTimeout(remoteMrpConfig);
auto timeoutSecs = std::chrono::duration_cast<System::Clock::Seconds16>(messageTimeout);
// Add 1 second in case we had fractional milliseconds in messageTimeout.
timeoutSecs += System::Clock::Seconds16(1);
using namespace chip::System::Clock::Literals;
NotifyRetryHandlers(error, timeoutSecs + 1_s16 + retryDelay);
}

void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, System::Clock::Seconds16 timeoutEstimate)
{
for (auto * item = mConnectionRetry.First(); item && item != &mConnectionRetry; item = item->mNext)
{
auto cb = Callback::Callback<OnDeviceConnectionRetry>::FromCancelable(item);
cb->mCall(cb->mContext, mPeerId, error, timeoutSecs + retryDelay);
cb->mCall(cb->mContext, mPeerId, error, timeoutEstimate);
}
}
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
Expand Down
8 changes: 8 additions & 0 deletions src/app/OperationalSessionSetup.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate,
uint8_t mRemainingAttempts = 0;
uint8_t mAttemptsDone = 0;

uint8_t mResolveAttemptsAllowed = 0;

Callback::CallbackDeque mConnectionRetry;
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

Expand Down Expand Up @@ -354,6 +356,12 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate,
*/
void NotifyRetryHandlers(CHIP_ERROR error, const ReliableMessageProtocolConfig & remoteMrpConfig,
System::Clock::Seconds16 retryDelay);

/**
* A version of NotifyRetryHandlers that passes in a retry timeout estimate
* directly.
*/
void NotifyRetryHandlers(CHIP_ERROR error, System::Clock::Seconds16 timeoutEstimate);
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
};

Expand Down

0 comments on commit 756f5dc

Please sign in to comment.