Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Longhaul: Tolerance adjustments #5916

Merged
merged 7 commits into from
Jan 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 36 additions & 9 deletions test/modules/TestResultCoordinator/Reports/CountingReport.cs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ public CountingReport(

public override bool IsPassed => this.IsPassedHelper();

// Tolerances are needed due to a combination of false-positive failures and real product-issues.
// Connectivity tolerances:
// - [All-Cases]: Fail the tests if we have > 20% missing C2D messages
// - [Nested-Edge] [Broker-Enabled]: Fail tests if we have > 10% missing custom mqtt messages
// Longhaul tolerances:
// - [Nested-Edge] [Broker-Enabled]: Fail the tests if we have > 20% missing custom mqtt messages
// - [Nested-Edge] [Broker-Enabled]: Fail the tests if we have > 1% missing iothub messages
bool IsPassedHelper()
{
return this.TotalExpectCount > 0 && this.TotalDuplicateExpectedResultCount == 0 && this.EventHubSpecificReportComponents.Match(
Expand All @@ -126,19 +133,39 @@ bool IsPassedHelper()
},
() =>
{
// Product issue for C2D messages connected to edgehub over mqtt.
// We should remove this failure tolerance when fixed.
if (this.TestDescription.Contains(C2dTestDescription))
if (this.TestMode == TestMode.Connectivity)
{
return ((double)this.TotalMatchCount / this.TotalExpectCount) > .8d;
}
else if (this.TestDescription == GenericMqttTelemetryTestDescription && this.TestMode == TestMode.Connectivity)
{
return ((double)this.TotalMatchCount / this.TotalExpectCount) > .9d;
// Product issue for C2D messages connected to edgehub.
if (this.TestDescription.Contains(C2dTestDescription))
{
return ((double)this.TotalMatchCount / this.TotalExpectCount) > .8d;
}
// Product issue for custom mqtt telemetry.
else if (this.Topology == Topology.Nested && this.MqttBrokerEnabled && this.TestDescription == GenericMqttTelemetryTestDescription)
{
return ((double)this.TotalMatchCount / this.TotalExpectCount) > .9d;
}
else
{
return this.TotalExpectCount == this.TotalMatchCount;
}
}
else
{
return this.TotalExpectCount == this.TotalMatchCount;
// Product issue for custom mqtt telemetry.
if (this.Topology == Topology.Nested && this.MqttBrokerEnabled && this.TestDescription.Contains(GenericMqttTelemetryTestDescription))
{
return ((double)this.TotalMatchCount / this.TotalExpectCount) > .8d;
}
// Product issue for messages when broker is enabled.
else if (this.Topology == Topology.Nested && this.MqttBrokerEnabled && this.TestDescription.Contains(MessagesTestDescription))
{
return ((double)this.TotalMatchCount / this.TotalExpectCount) > .99d;
}
else
{
return this.TotalExpectCount == this.TotalMatchCount;
}
}
});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,16 @@ public DirectMethodLongHaulReport(

public override bool IsPassed => this.IsPassedHelper();

// Tolerances are needed because the tests sometimes produce false-positive failures.
// Here is a description of these tolerances:
// - [All-Cases] Status code 0: Fail the tests if we get > 1 in 1000 direct methods with status code 0.
// - [All-Cases] Resource error: Fail the tests if we get > 1 in 1000 direct methods with resource error.
// - [Broker-Enabled] Unauthorized: Fail the tests if we get > 1 in 1000 direct methods with Unauthorized error (no tolerance when the broker is disabled).
// - [All-Cases] Transient error: Fail the tests if we get > 1 in 1000 direct methods with transient error.
// - [Broker-Enabled] NotImplemented: Fail the tests if we get > 1 in 1000 direct methods with NotImplemented error (no tolerance when the broker is disabled).
// - [Nested-Edge] [Broker-Enabled] DeviceNotFound: Fail the tests if we get > 1 in 400 direct methods with DeviceNotFound
// - [Nested-Edge] [Non-Broker] DeviceNotFound: Fail the tests if we get > 1 in 250 direct methods with DeviceNotFound
// - [Single-Node] DeviceNotFound: Fail the tests if we get > 1 in 200 direct methods with DeviceNotFound
bool IsPassedHelper()
{
if (this.Other.Sum(x => x.Value) > 0)
Expand All @@ -85,20 +95,65 @@ bool IsPassedHelper()
}

bool senderAndReceiverSuccessesPass = this.SenderSuccesses <= this.ReceiverSuccesses;
long allStatusCount = this.SenderSuccesses + this.StatusCodeZero + this.Unauthorized + this.DeviceNotFound + this.TransientError + this.ResourceError + this.NotImplemented + this.Other.Sum(x => x.Value);

double statusCodeZeroThreshold;
double unauthorizedThreshold;
double deviceNotFoundThreshold;
double transientErrorThreshold;
double resourceErrorThreshold;
double notImplementedThreshold;

// The SDK does not allow edgehub to de-register from iothub subscriptions, which results in DirectMethod clients sometimes receiving status code 0.
// Github issue: https://github.com/Azure/iotedge/issues/681
// We expect to get this status sometimes because of edgehub restarts, but if we receive too many we should fail the tests.
// TODO: When the SDK allows edgehub to de-register from subscriptions and we make the fix in edgehub, then we can fail tests for any status code 0.
long allStatusCount = this.SenderSuccesses + this.StatusCodeZero + this.Other.Sum(x => x.Value);
bool statusCodeZeroBelowThreshold = (this.StatusCodeZero == 0) || (this.StatusCodeZero < ((double)allStatusCount / 1000));
bool unauthorizedBelowThreshold = (this.Unauthorized == 0) || (this.Unauthorized < ((double)allStatusCount / 1000));
bool deviceNotFoundBelowThreshold = (this.DeviceNotFound == 0) || (this.DeviceNotFound < ((double)allStatusCount / 100));
bool transientErrorBelowThreshold = (this.TransientError == 0) || (this.TransientError < ((double)allStatusCount / 100));
bool resourceErrorBelowThreshold = (this.ResourceError == 0) || (this.ResourceError < ((double)allStatusCount / 100));
statusCodeZeroThreshold = (double)allStatusCount / 1000;

// Sometimes transient network/resource errors are caught necessitating a tolerance.
transientErrorThreshold = (double)allStatusCount / 1000;
resourceErrorThreshold = (double)allStatusCount / 1000;

// Sometimes iothub returns Unauthorized or NotImplemented that then later recovers.
// Only occurs with broker enabled, so only apply tolerance in this case.
if (this.MqttBrokerEnabled)
{
unauthorizedThreshold = (double)allStatusCount / 1000;
notImplementedThreshold = (double)allStatusCount / 1000;
}
else
{
unauthorizedThreshold = (double)allStatusCount / double.MaxValue;
notImplementedThreshold = (double)allStatusCount / double.MaxValue;
}

// DeviceNotFound typically happens when EdgeHub restarts and is offline.
// For different test suites this happens at different rates.
// 1) Single node runs arm devices, so this tolerance is a bit lenient.
// 2) Nested non-broker has some product issue where we need some tolerance.
// 3) Nested broker-enabled is the most stable.
if (this.Topology == Topology.SingleNode && !this.MqttBrokerEnabled)
{
deviceNotFoundThreshold = (double)allStatusCount / 200;
}
else if (this.Topology == Topology.Nested && !this.MqttBrokerEnabled)
{
deviceNotFoundThreshold = (double)allStatusCount / 250;
}
else
{
deviceNotFoundThreshold = (double)allStatusCount / 400;
}

bool statusCodeZeroBelowThreshold = (this.StatusCodeZero == 0) || (this.StatusCodeZero < statusCodeZeroThreshold);
bool unauthorizedBelowThreshold = (this.Unauthorized == 0) || (this.Unauthorized < unauthorizedThreshold);
bool deviceNotFoundBelowThreshold = (this.DeviceNotFound == 0) || (this.DeviceNotFound < deviceNotFoundThreshold);
bool transientErrorBelowThreshold = (this.TransientError == 0) || (this.TransientError < transientErrorThreshold);
bool resourceErrorBelowThreshold = (this.ResourceError == 0) || (this.ResourceError < resourceErrorThreshold);
bool notImplementedBelowThreshold = (this.NotImplemented == 0) || (this.NotImplemented < notImplementedThreshold);

// Pass if below the thresholds, and sender and receiver got same amount of successes (or receiver has no results)
return statusCodeZeroBelowThreshold && unauthorizedBelowThreshold && deviceNotFoundBelowThreshold && transientErrorBelowThreshold && senderAndReceiverSuccessesPass;
return statusCodeZeroBelowThreshold && unauthorizedBelowThreshold && deviceNotFoundBelowThreshold && transientErrorBelowThreshold && senderAndReceiverSuccessesPass && notImplementedBelowThreshold;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ namespace TestResultCoordinator.Reports.LegacyTwin

sealed class LegacyTwinReportGenerator : ITestResultReportGenerator
{
const double BigToleranceProportion = .005;
const double LittleToleranceProportion = .001;
static readonly ILogger Logger = ModuleUtil.CreateLogger(nameof(LegacyTwinReportGenerator));
readonly string trackingId;

Expand Down Expand Up @@ -47,15 +49,9 @@ public async Task<ITestResultReport> CreateReportAsync()
{
Logger.LogInformation($"Start to generate report by {nameof(LegacyTwinReportGenerator)} for Sources [{this.SenderSource}] ");
IDictionary<int, int> results = new Dictionary<int, int>();
bool isPassed = true;
while (await this.SenderTestResults.MoveNextAsync())
{
int status = int.Parse(this.SenderTestResults.Current.Result.Substring(0, 3));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this parse command always work? Maybe you can use TryParse here to guarantee that you don't get any exceptions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't seen this ever fail. Since I didn't change this in the PR and it isn't a problem, I'd rather leave it as-is.

if (status > 299)
{
isPassed = false;
}

if (results.ContainsKey(status))
{
results[status] = results[status] + 1;
Expand All @@ -66,6 +62,8 @@ public async Task<ITestResultReport> CreateReportAsync()
}
}

bool isPassed = this.IsPassed(results);

var report = new LegacyTwinReport(
this.TestDescription,
this.trackingId,
Expand All @@ -77,5 +75,78 @@ public async Task<ITestResultReport> CreateReportAsync()
Logger.LogInformation($"Successfully finished creating LegacyTwinReport for Source [{this.SenderSource}]");
return report;
}

// Tolerances are needed due to a combination of false-positive failures and real product-issues.
// - [Nested-Edge] [Broker-Enabled]: Sometimes we get an excessive amount of 501 and 504 status codes.
// Fail the tests if we have > 0.1% of either code.
// + (501) We don't receive some desired properties in module-registered twin desired property callback
// + (504) Module cannot make reported property update
bool IsPassed(IDictionary<int, int> statusCodesToCount)
{
bool isPassed = true;
int totalResults = statusCodesToCount.Sum(x => x.Value);

if (totalResults == 0)
{
return false;
}

if (this.Topology == Topology.Nested && this.MqttBrokerEnabled)
{
// See TwinTester/StatusCode.cs for reference.
int[] bigToleranceStatusCodes = { };
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed everything requiring a big tolerance, as the case that made me initially create this hasn't reproduced. But chose to leave it for now in case we need it later.

int[] littleToleranceStatusCodes = { 501, 504 };
isPassed = this.GeneratePassResult(statusCodesToCount, bigToleranceStatusCodes, littleToleranceStatusCodes);
}
else
{
List<int> statusCodes = statusCodesToCount.Keys.ToList();
IEnumerable<int> failingStatusCodes = statusCodes.Where(s =>
{
string statusCode = s.ToString();
return !statusCode.StartsWith("2");
});

isPassed = failingStatusCodes.Count() == 0;
}

return isPassed;
}

// Decides whether a set of per-status-code counts is acceptable under the given tolerances.
// - statusCodesToCount: maps each observed status code to the number of times it was seen.
// - bigToleranceStatusCodes: codes allowed to make up at most BigToleranceProportion of all results.
// - littleToleranceStatusCodes: codes allowed to make up at most LittleToleranceProportion of all results.
// Codes whose decimal text starts with "2" never fail the run. A code found in both tolerance lists
// is judged against the big tolerance. Any other code present as a key fails the run regardless of its count.
// Returns true when no entry fails.
bool GeneratePassResult(IDictionary<int, int> statusCodesToCount, int[] bigToleranceStatusCodes, int[] littleToleranceStatusCodes)
{
    int totalResults = statusCodesToCount.Sum(x => x.Value);
    foreach (KeyValuePair<int, int> statusCodeToCount in statusCodesToCount)
    {
        int statusCode = statusCodeToCount.Key;
        int statusCodeCount = statusCodeToCount.Value;

        // ignore the status codes indicating some success
        // The check is on machine-readable digits, so format with the invariant culture and compare
        // ordinally instead of depending on the current thread culture.
        string statusCodeText = statusCode.ToString(System.Globalization.CultureInfo.InvariantCulture);
        if (statusCodeText.StartsWith("2", System.StringComparison.Ordinal))
        {
            continue;
        }

        // Pick the tolerance that applies to this failing code; the big list is consulted first.
        double allowedProportion;
        if (bigToleranceStatusCodes.Contains(statusCode))
        {
            allowedProportion = BigToleranceProportion;
        }
        else if (littleToleranceStatusCodes.Contains(statusCode))
        {
            allowedProportion = LittleToleranceProportion;
        }
        else
        {
            // A failing code with no tolerance configured fails the run outright.
            return false;
        }

        if ((double)statusCodeCount / totalResults > allowedProportion)
        {
            return false;
        }
    }

    return true;
}
}
}