From 78bee73323aaa1c096d5c49ecacc8cc9a78e8436 Mon Sep 17 00:00:00 2001
From: Beatriz Rizental
Date: Tue, 4 Aug 2020 16:04:30 +0200
Subject: [PATCH] Bug 1644364 - Move logic to limit max retries on recoverable failures to glean-core (#1120)

---
 .dictionary                                 |   1 +
 CHANGELOG.md                                |   5 +
 .../mozilla/telemetry/glean/net/Upload.kt   |  10 +-
 .../glean/scheduler/PingUploadWorker.kt     |  17 +--
 glean-core/csharp/Glean/Net/BaseUploader.cs |  13 +-
 .../ios/Glean/Net/HttpPingUploader.swift    |  11 +-
 .../python/glean/net/ping_upload_worker.py  |  17 +--
 glean-core/python/tests/test_network.py     |  12 +-
 glean-core/src/lib.rs                       |   2 +-
 glean-core/src/upload/mod.rs                | 122 +++++++++++++++---
 10 files changed, 138 insertions(+), 72 deletions(-)

diff --git a/.dictionary b/.dictionary
index 305943a687..00706bf21c 100644
--- a/.dictionary
+++ b/.dictionary
@@ -101,6 +101,7 @@ docstrings
 dokka
 dropdown
 enqueue
+enqueued
 enum
 envs
 exe
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d31badd0d2..338a27d981 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,12 +3,17 @@
 [Full changelog](https://github.com/mozilla/glean/compare/v32.0.0...main)
 
 * General
+  * Move logic to limit the number of retries on ping uploading "recoverable failures" to glean-core. ([#1120](https://github.com/mozilla/glean/pull/1120))
+    * The functionality to limit the number of retries in these cases was introduced to the Glean SDK in `v31.1.0`. This change moves that logic into glean-core to avoid code duplication throughout the language bindings.
   * Update `glean_parser` to `v1.28.3`
     * BUGFIX: Generate valid C# code when using Labeled metric types.
     * BUGFIX: Support `HashSet` and `Dictionary` in the C# generated code.
 * C#
   * Add support for the String List metric type.
   * Enable generating the C# APIs using the glean_parser.
+* Python
+  * BUGFIX: Limit the number of retries for 5xx server errors on ping uploads. ([#1120](https://github.com/mozilla/glean/pull/1120))
+    * This kind of failure yields a "recoverable error", which means the ping gets re-enqueued. That can cause infinite loops in the ping upload worker. For Python we were incorrectly limiting the number of retries only for I/O errors, another type of "recoverable error".
 
 # v32.0.0 (2020-08-03)
diff --git a/glean-core/android/src/main/java/mozilla/telemetry/glean/net/Upload.kt b/glean-core/android/src/main/java/mozilla/telemetry/glean/net/Upload.kt
index a5f352faa7..ac6b1fc80b 100644
--- a/glean-core/android/src/main/java/mozilla/telemetry/glean/net/Upload.kt
+++ b/glean-core/android/src/main/java/mozilla/telemetry/glean/net/Upload.kt
@@ -117,7 +117,15 @@ internal sealed class PingUploadTask {
     object Wait : PingUploadTask()
 
     /**
-     * A flag signaling that the pending pings queue is empty and requester is done.
+     * A flag signaling that the requester doesn't need to request any more upload tasks at this moment.
+     *
+     * There are two possibilities for this scenario:
+     * * The pending pings queue is empty and there are no more pings to request;
+     * * The requester has reported more than the maximum allowed recoverable upload failures
+     *   in the same uploading window[1] and should stop requesting at this moment.
+     *
+     * [1]: An "uploading window" starts when a requester gets a new `PingUploadTask::Upload(PingRequest)`
+     * response and finishes when they finally get a `PingUploadTask::Done` or `PingUploadTask::Wait` response.
     */
    object Done : PingUploadTask()
 }
diff --git a/glean-core/android/src/main/java/mozilla/telemetry/glean/scheduler/PingUploadWorker.kt b/glean-core/android/src/main/java/mozilla/telemetry/glean/scheduler/PingUploadWorker.kt
index 501ea4566e..389104a4b6 100644
--- a/glean-core/android/src/main/java/mozilla/telemetry/glean/scheduler/PingUploadWorker.kt
+++ b/glean-core/android/src/main/java/mozilla/telemetry/glean/scheduler/PingUploadWorker.kt
@@ -19,7 +19,6 @@ import mozilla.telemetry.glean.Glean
 import mozilla.telemetry.glean.net.FfiPingUploadTask
 import mozilla.telemetry.glean.utils.testFlushWorkManagerJob
 import mozilla.telemetry.glean.net.PingUploadTask
-import mozilla.telemetry.glean.rust.Constants
 
 /**
  * Build the constraints around which the worker can be run, such as whether network
@@ -53,8 +52,6 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
     companion object {
         internal const val PING_WORKER_TAG = "mozac_service_glean_ping_upload_worker"
 
-        internal const val MAX_RETRIES = 3
-
         /**
          * Function to aid in properly enqueuing the worker in [WorkManager]
          *
@@ -98,11 +95,6 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
      */
     @Suppress("ReturnCount")
     override fun doWork(): Result {
-        // We counte the number of failures,
-        // if we get more than three failures in upload we will stop this worker.
-        //
-        // This is a hack before a more robust mechanism is implemented in Rust.
-        var uploadFailures = 0
         do {
             // Create a slot of memory for the task: glean-core will write data into
             // the allocated memory.
@@ -121,17 +113,14 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
                         Glean.configuration
                     ).toFfi()
 
-                    if (result == Constants.UPLOAD_RESULT_RECOVERABLE) {
-                        uploadFailures++
-                    }
-
                     // Process the upload response
                     LibGleanFFI.INSTANCE.glean_process_ping_upload_response(incomingTask, result)
                 }
                 PingUploadTask.Wait -> return Result.retry()
                 PingUploadTask.Done -> return Result.success()
             }
-        } while (uploadFailures < MAX_RETRIES)
-        return Result.failure()
+        } while (true)
+        // Limits are enforced by glean-core to avoid an infinite loop here.
+        // Whenever a limit is reached, this binding will receive `PingUploadTask.Done` and step out.
     }
 }
diff --git a/glean-core/csharp/Glean/Net/BaseUploader.cs b/glean-core/csharp/Glean/Net/BaseUploader.cs
index 2191e9c47f..c7cc1985b9 100644
--- a/glean-core/csharp/Glean/Net/BaseUploader.cs
+++ b/glean-core/csharp/Glean/Net/BaseUploader.cs
@@ -22,9 +22,6 @@ internal class BaseUploader
         // How many times to attempt waiting when told to by glean-core's upload API.
         private const int MAX_WAIT_ATTEMPTS = 3;
 
-        // Maximum number of recoverable errors allowed before aborting the ping uploader.
-        private const int MAX_RETRIES = 3;
-
         private readonly IPingUploader uploader;
 
         ///
@@ -100,9 +97,10 @@ internal void TriggerUpload(Configuration config)
            // FOR TESTING Implement the upload worker here and call this from Glean.cs
            int waitAttempts = 0;
 
-           int uploadFailures = 0;
-           while (uploadFailures < MAX_RETRIES)
+           // Limits are enforced by glean-core to avoid an infinite loop here.
+           // Whenever a limit is reached, this binding will receive `UploadTaskTag.Done` and step out.
+           while (true)
            {
                FfiUploadTask incomingTask = new FfiUploadTask();
                LibGleanFFI.glean_get_upload_task(ref incomingTask);
@@ -123,11 +121,6 @@ internal void TriggerUpload(Configuration config)
                // Delegate the actual upload and get its return value.
                UploadResult result = Upload(path, body, headers, config);
 
-               if (result is RecoverableFailure)
-               {
-                   uploadFailures += 1;
-               }
-
                // Copy the `FfiUploadTask` to unmanaged memory, because
                // `glean_process_ping_upload` assumes it has to free the memory.
                IntPtr ptrCopy = Marshal.AllocHGlobal(Marshal.SizeOf(incomingTask));
diff --git a/glean-core/ios/Glean/Net/HttpPingUploader.swift b/glean-core/ios/Glean/Net/HttpPingUploader.swift
index 5b22b6ee02..8213472c50 100644
--- a/glean-core/ios/Glean/Net/HttpPingUploader.swift
+++ b/glean-core/ios/Glean/Net/HttpPingUploader.swift
@@ -18,9 +18,6 @@ public class HttpPingUploader {
         static let recoverableErrorStatusCode: UInt16 = 500
         // For this error, the ping data will be deleted and no retry happens
         static let unrecoverableErrorStatusCode: UInt16 = 400
-
-        // Maximum number of recoverable errors allowed before aborting the ping uploader
-        static let maxRetries = 3
     }
 
     private let logger = Logger(tag: Constants.logTag)
@@ -98,8 +95,9 @@ public class HttpPingUploader {
     /// It will report back the task status to Glean, which will take care of deleting pending ping files.
     /// It will continue upload as long as it can fetch new tasks.
     func process() {
-        var uploadFailures = 0
-        while uploadFailures < Constants.maxRetries {
+        // Limits are enforced by glean-core to avoid an infinite loop here.
+        // Whenever a limit is reached, this binding will receive `.done` and step out.
+        while true {
             var incomingTask = FfiPingUploadTask()
             glean_get_upload_task(&incomingTask)
             let task = incomingTask.toPingUploadTask()
@@ -107,9 +105,6 @@ public class HttpPingUploader {
             switch task {
             case let .upload(request):
                 self.upload(path: request.path, data: request.body, headers: request.headers) { result in
-                    if case .recoverableFailure = result {
-                        uploadFailures += 1
-                    }
                     glean_process_ping_upload_response(&incomingTask, result.toFfi())
                 }
             case .wait:
diff --git a/glean-core/python/glean/net/ping_upload_worker.py b/glean-core/python/glean/net/ping_upload_worker.py
index 7a4022ba3e..efdd5326ad 100644
--- a/glean-core/python/glean/net/ping_upload_worker.py
+++ b/glean-core/python/glean/net/ping_upload_worker.py
@@ -17,7 +17,6 @@
 from .._glean_ffi import ffi as ffi_support  # type: ignore
 from .._dispatcher import Dispatcher
 from .._process_dispatcher import ProcessDispatcher
-from .ping_uploader import RecoverableFailure
 
 
 log = logging.getLogger(__name__)
@@ -26,9 +25,6 @@
 # How many times to attempt waiting when told to by glean-core's upload API.
 MAX_WAIT_ATTEMPTS = 3
 
-# Maximum number of recoverable errors allowed before aborting the ping uploader
-MAX_RETRIES = 3
-
 
 class PingUploadWorker:
     @classmethod
@@ -120,9 +116,9 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
 
     wait_attempts = 0
 
-    upload_failures = 0
-
-    while upload_failures < MAX_RETRIES:
+    # Limits are enforced by glean-core to avoid an infinite loop here.
+    # Whenever a limit is reached, this binding will receive `UploadTaskTag.DONE` and step out.
+    while True:
         incoming_task = ffi_support.new("FfiPingUploadTask *")
         _ffi.lib.glean_get_upload_task(incoming_task)
 
@@ -146,9 +142,6 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
                 url_path, body, _parse_ping_headers(headers, doc_id), configuration
             )
 
-            if isinstance(upload_result, RecoverableFailure):
-                upload_failures = upload_failures + 1
-
             # Process the response.
            _ffi.lib.glean_process_ping_upload_response(
                incoming_task, upload_result.to_ffi()
@@ -161,9 +154,7 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
             else:
                 return False
         elif tag == UploadTaskTag.DONE:
-            break
-
-    return True
+            return True
 
 
 __all__ = ["PingUploadWorker"]
diff --git a/glean-core/python/tests/test_network.py b/glean-core/python/tests/test_network.py
index 349b831acf..cc6d29b7f4 100644
--- a/glean-core/python/tests/test_network.py
+++ b/glean-core/python/tests/test_network.py
@@ -81,11 +81,12 @@ def test_500_error_submit(safe_httpserver, monkeypatch):
     ProcessDispatcher._wait_for_last_process()
 
     # This kind of recoverable error will be tried 10 times
-    assert 10 == len(safe_httpserver.requests)
+    # The number of retries is defined in glean-core
+    assert 3 == len(safe_httpserver.requests)
 
     metric = get_upload_failure_metric()
     assert not metric["status_code_4xx"].test_has_value()
-    assert 10 == metric["status_code_5xx"].test_get_value()
+    assert 3 == metric["status_code_5xx"].test_get_value()
 
 
 def test_500_error_submit_concurrent_writing(slow_httpserver, monkeypatch):
@@ -113,12 +114,13 @@ def test_500_error_submit_concurrent_writing(slow_httpserver, monkeypatch):
         counter.add()
         times += 1
 
-    # This kind of recoverable error will be tried 10 times
-    assert 10 == len(slow_httpserver.requests)
+    # This kind of recoverable error will be tried 3 times
+    # The number of retries is defined in glean-core
+    assert 3 == len(slow_httpserver.requests)
 
     metric = get_upload_failure_metric()
     assert not metric["status_code_4xx"].test_has_value()
-    assert 10 == metric["status_code_5xx"].test_get_value()
+    assert 3 == metric["status_code_5xx"].test_get_value()
 
     assert times > 0
     assert times == counter.test_get_value()
diff --git a/glean-core/src/lib.rs b/glean-core/src/lib.rs
index 90363f993e..557f06d76c 100644
--- a/glean-core/src/lib.rs
+++ b/glean-core/src/lib.rs
@@ -490,7 +490,7 @@ impl Glean {
     ///
     /// * `Wait` - which means the requester should ask again later;
     /// * `Upload(PingRequest)` - which means there is a ping to upload. This wraps the actual request object;
-    /// * `Done` - which means there are no more pings queued right now.
+    /// * `Done` - which means the requester should stop asking for now.
     ///
     /// # Return value
     ///
diff --git a/glean-core/src/upload/mod.rs b/glean-core/src/upload/mod.rs
index d224f26707..7fd2cecf03 100644
--- a/glean-core/src/upload/mod.rs
+++ b/glean-core/src/upload/mod.rs
@@ -11,7 +11,7 @@
 use std::collections::VecDeque;
 use std::path::PathBuf;
-use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
 use std::sync::{Arc, RwLock, RwLockWriteGuard};
 use std::thread;
 use std::time::{Duration, Instant};
@@ -26,6 +26,11 @@
 mod directory;
 mod request;
 mod result;
 
+/// The maximum recoverable failures allowed per uploading window.
+///
+/// Limiting this is necessary to avoid infinite loops on requesting upload tasks.
+const MAX_RECOVERABLE_FAILURES_PER_UPLOADING_WINDOW: u32 = 3;
+
 // The maximum size in bytes a ping body may have to be eligible for upload.
 const PING_BODY_MAX_SIZE: usize = 1024 * 1024; // 1 MB
 
@@ -112,7 +117,16 @@ pub enum PingUploadTask {
     /// A flag signaling that the pending pings directories are not done being processed,
     /// thus the requester should wait and come back later.
     Wait,
-    /// A flag signaling that the pending pings queue is empty and requester is done.
+    /// A flag signaling that the requester doesn't need to request any more upload tasks at this moment.
+    ///
+    /// There are two possibilities for this scenario:
+    /// * The pending pings queue is empty and there are no more pings to request;
+    /// * The requester has reported more than MAX_RECOVERABLE_FAILURES_PER_UPLOADING_WINDOW
+    ///   recoverable upload failures in the same uploading window[1]
+    ///   and should stop requesting at this moment.
+    ///
+    /// [1]: An "uploading window" starts when a requester gets a new `PingUploadTask::Upload(PingRequest)`
+    /// response and finishes when they finally get a `PingUploadTask::Done` or `PingUploadTask::Wait` response.
     Done,
 }
 
@@ -127,6 +141,8 @@ pub struct PingUploadManager {
     processed_pending_pings: Arc<AtomicBool>,
     /// A vector to store the pending pings processed off-thread.
     pending_pings: Arc<RwLock<Vec<PingRequest>>>,
+    /// The number of upload failures for the current uploading window.
+    recoverable_failure_count: AtomicU32,
     /// A ping counter to help rate limit the ping uploads.
     ///
     /// To keep resource usage in check,
@@ -187,9 +203,10 @@ impl PingUploadManager {
         Self {
             queue,
+            directory_manager,
             processed_pending_pings,
             pending_pings,
-            directory_manager,
+            recoverable_failure_count: AtomicU32::new(0),
             rate_limiter: None,
             language_binding_name: language_binding_name.into(),
             upload_metrics: UploadMetrics::new(),
@@ -200,11 +217,12 @@ impl PingUploadManager {
         self.processed_pending_pings.load(Ordering::SeqCst)
     }
 
-    /// Checks if a ping with a certain `document_id` is already enqueued.
-    fn is_enqueued(queue: &VecDeque<PingRequest>, document_id: &str) -> bool {
-        queue
-            .iter()
-            .any(|request| request.document_id == document_id)
+    fn recoverable_failure_count(&self) -> u32 {
+        self.recoverable_failure_count.load(Ordering::SeqCst)
+    }
+
+    fn reset_recoverable_failure_count(&self) {
+        self.recoverable_failure_count.store(0, Ordering::SeqCst);
     }
 
     /// Attempts to build a ping request from a ping file payload.
@@ -261,7 +279,10 @@ impl PingUploadManager {
             .expect("Can't write to pending pings queue.");
 
         // Checks if a ping with this `document_id` is already enqueued.
-        if Self::is_enqueued(&queue, &document_id) {
+        if queue
+            .iter()
+            .any(|request| request.document_id == document_id)
+        {
             log::trace!(
                 "Attempted to enqueue a duplicate ping {} at {}.",
                 document_id,
@@ -326,17 +347,7 @@ impl PingUploadManager {
         queue
     }
 
-    /// Gets the next `PingUploadTask`.
-    ///
-    /// ## Arguments
-    ///
-    /// * `glean` - The Glean object holding the database.
-    /// * `log_ping` - Whether to log the ping before returning.
-    ///
-    /// # Return value
-    ///
-    /// `PingUploadTask` - see [`PingUploadTask`](enum.PingUploadTask.html) for more information.
-    pub fn get_upload_task(&self, glean: &Glean, log_ping: bool) -> PingUploadTask {
+    fn get_upload_task_internal(&self, glean: &Glean, log_ping: bool) -> PingUploadTask {
         if !self.has_processed_pings_dir() {
             log::info!(
                 "Tried getting an upload task, but processing is ongoing. Will come back later."
@@ -352,6 +363,14 @@ impl PingUploadManager {
             self.enqueue_ping(glean, &document_id, &path, &body, headers);
         }
 
+        if self.recoverable_failure_count() >= MAX_RECOVERABLE_FAILURES_PER_UPLOADING_WINDOW {
+            log::warn!(
+                "Reached maximum recoverable failures for the current uploading window. You are done."
+            );
+
+            return PingUploadTask::Done;
+        }
+
         let mut queue = self
             .queue
             .write()
@@ -393,6 +412,25 @@ impl PingUploadManager {
             }
         }
     }
 
+    /// Gets the next `PingUploadTask`.
+    ///
+    /// ## Arguments
+    ///
+    /// * `glean` - The Glean object holding the database.
+    /// * `log_ping` - Whether to log the ping before returning.
+    ///
+    /// # Return value
+    ///
+    /// `PingUploadTask` - see [`PingUploadTask`](enum.PingUploadTask.html) for more information.
+    pub fn get_upload_task(&self, glean: &Glean, log_ping: bool) -> PingUploadTask {
+        let task = self.get_upload_task_internal(glean, log_ping);
+        if task == PingUploadTask::Done || task == PingUploadTask::Wait {
+            self.reset_recoverable_failure_count()
+        }
+
+        task
+    }
+
     /// Processes the response from an attempt to upload a ping.
     ///
     /// Based on the HTTP status of said response,
@@ -466,6 +504,8 @@ impl PingUploadManager {
                     status
                 );
                 self.enqueue_ping_from_file(glean, &document_id);
+                self.recoverable_failure_count
+                    .fetch_add(1, Ordering::SeqCst);
             }
         };
     }
@@ -1054,4 +1094,46 @@ mod test {
             PingUploadTask::Done
         );
     }
+
+    #[test]
+    fn maximum_of_recoverable_errors_is_enforced_for_uploading_window() {
+        let (mut glean, _) = new_glean(None);
+
+        // Wait for processing of pending pings directory to finish.
+        while glean.get_upload_task() == PingUploadTask::Wait {
+            thread::sleep(Duration::from_millis(10));
+        }
+
+        // Register a ping for testing
+        let ping_type = PingType::new("test", true, /* send_if_empty */ true, vec![]);
+        glean.register_ping_type(&ping_type);
+
+        // Submit the ping multiple times
+        let n = 5;
+        for _ in 0..n {
+            glean.submit_ping(&ping_type, None).unwrap();
+        }
+
+        // Return the max recoverable error failures in a row
+        for _ in 0..MAX_RECOVERABLE_FAILURES_PER_UPLOADING_WINDOW {
+            match glean.get_upload_task() {
+                PingUploadTask::Upload(req) => {
+                    glean.process_ping_upload_response(&req.document_id, RecoverableFailure)
+                }
+                _ => panic!("Expected upload manager to return the next request!"),
+            }
+        }
+
+        // Verify that after returning the max amount of recoverable failures,
+        // we are done even though we haven't gotten all the enqueued requests.
+        assert_eq!(glean.get_upload_task(), PingUploadTask::Done);
+
+        // Verify all requests are returned when we try again.
+        for _ in 0..n {
+            match glean.get_upload_task() {
+                PingUploadTask::Upload(_) => {}
+                _ => panic!("Expected upload manager to return the next request!"),
+            }
+        }
+    }
 }
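Editor's note: the patch above moves the retry cap into glean-core, so a language binding simply loops on the upload-task API and gets `Done` once glean-core has seen too many recoverable failures in one uploading window. The sketch below is a toy model of that contract only; it is not code from the patch, and all names in it (ToyUploadManager, the string tags, the module-level constant) are hypothetical.

# Toy model of the per-window recoverable-failure limit (illustrative only).
from collections import deque

MAX_RECOVERABLE_FAILURES_PER_UPLOADING_WINDOW = 3


class ToyUploadManager:
    def __init__(self, pending):
        self.queue = deque(pending)      # pending ping requests
        self.recoverable_failures = 0    # recoverable failures in the current window

    def get_upload_task(self):
        # Hitting the limit (or draining the queue) ends the uploading window:
        # the caller gets "done" and the counter resets for the next window.
        if (
            self.recoverable_failures >= MAX_RECOVERABLE_FAILURES_PER_UPLOADING_WINDOW
            or not self.queue
        ):
            self.recoverable_failures = 0
            return ("done", None)
        return ("upload", self.queue.popleft())

    def process_upload_response(self, request, status):
        if 500 <= status < 600:
            # Recoverable failure: re-enqueue the ping and count it against the window.
            self.queue.append(request)
            self.recoverable_failures += 1


# A binding-side worker no longer needs its own retry counter; it loops until told it is done.
manager = ToyUploadManager(["ping-1", "ping-2"])
while True:
    tag, request = manager.get_upload_task()
    if tag == "done":
        break
    manager.process_upload_response(request, status=500)  # simulate a server that always 500s

With a server that always returns 500, the loop above stops after three recoverable failures instead of spinning forever, which is the behavior the per-binding MAX_RETRIES counters used to provide and that glean-core now enforces centrally.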