Failed batches processing with dead-letter queue #1713

Merged: 32 commits, Dec 22, 2020

Commits
3c875e3
Batch branch recovery
lapaniku Dec 3, 2020
763b61f
DL test
lapaniku Dec 4, 2020
6234111
Cortex conf
lapaniku Dec 4, 2020
043ff54
Batch.py fix
lapaniku Dec 4, 2020
8c388e2
Batch.py fix 2
lapaniku Dec 4, 2020
76e2955
Revert batch.py
lapaniku Dec 4, 2020
8f322ff
SQS region
lapaniku Dec 4, 2020
f9d6a37
Disable job clean up cron
vishalbollu Dec 7, 2020
307d547
Set default queue visibility timeout to 60 seconds
vishalbollu Dec 7, 2020
a766aed
Batch.py renewal update
lapaniku Dec 9, 2020
0ad5fab
Merge branch 'failed_batches' of https://github.com/lapaniku/cortex i…
lapaniku Dec 9, 2020
cb4f057
Merge branch 'master' into failed_batches
vishalbollu Dec 14, 2020
2d2a0bc
Add message renewal and on job complete message retries
vishalbollu Dec 14, 2020
383d583
Failed predict test
lapaniku Dec 14, 2020
0751891
Fix sqs client reference
lapaniku Dec 15, 2020
f9fb08f
Update batch.py
vishalbollu Dec 15, 2020
beb1844
Cast uuid to str
vishalbollu Dec 15, 2020
2b05878
Update batch.py
vishalbollu Dec 15, 2020
d5724ef
Update batch.py
vishalbollu Dec 15, 2020
252f794
Add DLQ to job submission and retry batches on exception in Python
vishalbollu Dec 18, 2020
242b811
Merge branch 'master' into failed_batches
vishalbollu Dec 18, 2020
69a01c3
Clean up PR
vishalbollu Dec 18, 2020
91b8cda
Remove print statements
vishalbollu Dec 20, 2020
391b0a6
Merge branch 'master' into failed_batches
vishalbollu Dec 21, 2020
c16f502
Update job status calculation criteria
vishalbollu Dec 21, 2020
5141f29
Remove testing examples
vishalbollu Dec 21, 2020
7523aa2
Cleanup PR
vishalbollu Dec 21, 2020
bc51679
Respond to PR comments
vishalbollu Dec 22, 2020
d1f41ff
Merge branch 'master' into failed_batches
vishalbollu Dec 22, 2020
ddab204
Merge branch 'master' into failed_batches
vishalbollu Dec 22, 2020
44461bf
Update batch.py
vishalbollu Dec 22, 2020
b456fa6
Merge branch 'master' into failed_batches
vishalbollu Dec 22, 2020
4 changes: 2 additions & 2 deletions cli/cmd/lib_batch_apis.go
@@ -121,7 +121,7 @@ func batchAPITable(batchAPI schema.APIResponse) string {
			{Title: "job id"},
			{Title: "status"},
			{Title: "progress"}, // (succeeded/total)
			{Title: "failed", Hidden: totalFailed == 0},
			{Title: "failed attempts", Hidden: totalFailed == 0},
			{Title: "start time"},
			{Title: "duration"},
		},
@@ -200,7 +200,7 @@ func getJob(env cliconfig.Environment, apiName string, jobID string) (string, er
		Headers: []table.Header{
			{Title: "total"},
			{Title: "succeeded"},
			{Title: "failed"},
			{Title: "failed attempts"},
			{Title: "avg time per batch"},
		},
		Rows: [][]interface{}{
28 changes: 26 additions & 2 deletions docs/workloads/batch/endpoints.md
@@ -33,6 +33,10 @@ POST <batch_api_endpoint>:
{
    "workers": <int>,              # the number of workers to allocate for this job (required)
    "timeout": <int>,              # duration in seconds since the submission of a job before it is terminated (optional)
    "sqs_dead_letter_queue": {     # specify a queue to redirect failed batches (optional)
        "arn": <string>,               # ARN of the dead letter queue, e.g. arn:aws:sqs:us-west-2:123456789:failed.fifo
        "max_receive_count": <int>     # number of times a batch is allowed to be handled by a worker before it is considered failed and transferred to the dead letter queue (must be >= 1)
    },
    "item_list": {
        "items": [                 # a list of items that can be of any type (required)
            <any>,
@@ -54,6 +58,10 @@ RESPONSE:
"api_id": <string>,
"sqs_url": <string>,
"timeout": <int>,
"sqs_dead_letter_queue": {
"arn": <string>,
"max_receive_count": <int>
},
"created_time": <string> # e.g. 2020-07-16T14:56:10.276007415Z
}
```
@@ -76,6 +84,10 @@ POST <batch_api_endpoint>:
{
    "workers": <int>,              # the number of workers to allocate for this job (required)
    "timeout": <int>,              # duration in seconds since the submission of a job before it is terminated (optional)
    "sqs_dead_letter_queue": {     # specify a queue to redirect failed batches (optional)
        "arn": <string>,               # ARN of the dead letter queue, e.g. arn:aws:sqs:us-west-2:123456789:failed.fifo
        "max_receive_count": <int>     # number of times a batch is allowed to be handled by a worker before it is considered failed and transferred to the dead letter queue (must be >= 1)
    },
    "file_path_lister": {
        "s3_paths": [<string>],    # can be S3 prefixes or complete S3 paths (required)
        "includes": [<string>],    # glob patterns (optional)
@@ -96,6 +108,10 @@ RESPONSE:
"api_id": <string>,
"sqs_url": <string>,
"timeout": <int>,
"sqs_dead_letter_queue": {
"arn": <string>,
"max_receive_count": <int>
},
"created_time": <string> # e.g. 2020-07-16T14:56:10.276007415Z
}
```
@@ -117,6 +133,10 @@ POST <batch_api_endpoint>:
{
    "workers": <int>,              # the number of workers to allocate for this job (required)
    "timeout": <int>,              # duration in seconds since the submission of a job before it is terminated (optional)
    "sqs_dead_letter_queue": {     # specify a queue to redirect failed batches (optional)
        "arn": <string>,               # ARN of the dead letter queue, e.g. arn:aws:sqs:us-west-2:123456789:failed.fifo
        "max_receive_count": <int>     # number of times a batch is allowed to be handled by a worker before it is considered failed and transferred to the dead letter queue (must be >= 1)
    },
    "delimited_files": {
        "s3_paths": [<string>],    # can be S3 prefixes or complete S3 paths (required)
        "includes": [<string>],    # glob patterns (optional)
@@ -137,6 +157,10 @@ RESPONSE:
"api_id": <string>,
"sqs_url": <string>,
"timeout": <int>,
"sqs_dead_letter_queue": {
"arn": <string>,
"max_receive_count": <int>
},
"created_time": <string> # e.g. 2020-07-16T14:56:10.276007415Z
}
```
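To make the schema above concrete, here is a minimal submission sketch in Python. The endpoint URL, queue ARN, S3 paths, and the `batch_size` field are hypothetical placeholders, not values from this PR. Note that the job queue is FIFO (the worker sends `MessageDeduplicationId`/`MessageGroupId`), and SQS requires the dead letter queue of a FIFO queue to also be FIFO, hence the `.fifo` suffix in the example ARN.

```python
# Hypothetical sketch: submit a batch job that redirects failed batches to a
# pre-created FIFO dead letter queue. Endpoint, ARN, and paths are placeholders.
import requests

job = {
    "workers": 2,
    "timeout": 3600,
    "sqs_dead_letter_queue": {
        "arn": "arn:aws:sqs:us-west-2:123456789:failed.fifo",
        "max_receive_count": 3,  # up to 3 attempts per batch before it moves to the DLQ
    },
    "item_list": {
        "items": ["s3://my-bucket/images/0001.jpg", "s3://my-bucket/images/0002.jpg"],
        "batch_size": 1,  # assumed field; see the full item_list schema above
    },
}

response = requests.post("https://abc123.elb.us-west-2.amazonaws.com/my-api", json=job)
print(response.json())
```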
@@ -163,8 +187,8 @@ RESPONSE:
"batches_in_queue": <int> # number of batches remaining in the queue
"batch_metrics": {
"succeeded": <int> # number of succeeded batches
"failed": int # number of failed batches
"avg_time_per_batch": <float> (optional) # only available if batches have been completed
"failed": int # number of failed attempts
"avg_time_per_batch": <float> (optional) # average time spent working on a batch (only considers successful attempts)
},
"worker_counts": { # worker counts are only available while a job is running
"pending": <int>, # number of workers that are waiting for compute resources to be provisioned
3 changes: 2 additions & 1 deletion docs/workloads/batch/example.md
@@ -5,7 +5,8 @@ Deploy batch APIs that can orchestrate distributed batch inference jobs on large
## Key features

* Distributed inference
* Fault tolerance with queues
* Automatic batch retries
* Collect failed batches for debugging (see the sketch after this list)
* Metrics and log aggregation
* `on_job_complete` webhook
* Scale to 0
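
As a hypothetical illustration of the debugging workflow above, failed batch payloads can be read back from the dead letter queue after the job finishes; the queue name and region below are placeholders:

```python
# Hypothetical sketch: pull failed batch payloads from the dead letter queue.
import json

import boto3

sqs = boto3.client("sqs", region_name="us-west-2")
queue_url = sqs.get_queue_url(QueueName="failed.fifo")["QueueUrl"]

while True:
    response = sqs.receive_message(QueueUrl=queue_url, MaxNumberOfMessages=10, WaitTimeSeconds=2)
    messages = response.get("Messages", [])
    if not messages:
        break
    for message in messages:
        print("failed batch:", json.loads(message["Body"]))
```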
226 changes: 146 additions & 80 deletions pkg/cortex/serve/start/batch.py
@@ -21,6 +21,7 @@
import threading
import math
import pathlib
import uuid

import boto3
import botocore
@@ -34,7 +35,11 @@
from cortex_internal.lib.exceptions import UserRuntimeException

API_LIVENESS_UPDATE_PERIOD = 5 # seconds
MAXIMUM_MESSAGE_VISIBILITY = 60 * 60 * 12 # 12 hours is the maximum message visibility
SQS_POLL_WAIT_TIME = 10 # seconds
MESSAGE_NOT_FOUND_SLEEP = 10 # seconds
INITIAL_MESSAGE_VISIBILITY = 30 # seconds
MESSAGE_RENEWAL_PERIOD = 15 # seconds
JOB_COMPLETE_MESSAGE_RENEWAL = 10 # seconds
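# note: the renewal period (15s) is half the initial visibility timeout (30s),
# so a message's visibility is always extended well before it can expire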

local_cache = {
"api_spec": None,
Expand All @@ -48,6 +53,10 @@
}


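# shared state between sqs_loop and the per-message renewal threads: adding a
# receipt handle to stop_renewal (under receipt_handle_mutex) tells its
# renewal thread to exit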
receipt_handle_mutex = threading.Lock()
stop_renewal = set()


def dimensions():
    return [
        {"Name": "APIName", "Value": local_cache["api_spec"].name},
@@ -67,6 +76,40 @@ def time_per_batch_metric(total_time_seconds):
    return {"MetricName": "TimePerBatch", "Dimensions": dimensions(), "Value": total_time_seconds}


def renew_message_visibility(receipt_handle: str):
    queue_url = local_cache["job_spec"]["sqs_url"]
    interval = MESSAGE_RENEWAL_PERIOD
    new_timeout = INITIAL_MESSAGE_VISIBILITY
    cur_time = time.time()

    while True:
        time.sleep((cur_time + interval) - time.time())
        cur_time += interval
        new_timeout += interval

        with receipt_handle_mutex:
            if receipt_handle in stop_renewal:
                stop_renewal.remove(receipt_handle)
                break

            try:
                local_cache["sqs_client"].change_message_visibility(
                    QueueUrl=queue_url, ReceiptHandle=receipt_handle, VisibilityTimeout=new_timeout
                )
            except botocore.exceptions.ClientError as e:
                if e.response["Error"]["Code"] == "InvalidParameterValue":
                    # unexpected; this error is thrown when attempting to renew a message that has been deleted
                    continue
                elif e.response["Error"]["Code"] == "AWS.SimpleQueueService.NonExistentQueue":
                    # there may be a delay between the cron deleting the queue and this worker stopping
                    logger().info(
                        "failed to renew message visibility because the queue was not found"
                    )
                else:
                    stop_renewal.remove(receipt_handle)
                    raise e


def build_predict_args(payload, batch_id):
    args = {}

@@ -102,49 +145,6 @@ def get_total_messages_in_queue():
    return visible_count, not_visible_count


def handle_on_complete(message):
    job_spec = local_cache["job_spec"]
    predictor_impl = local_cache["predictor_impl"]
    sqs_client = local_cache["sqs_client"]
    queue_url = job_spec["sqs_url"]
    receipt_handle = message["ReceiptHandle"]

    try:
        if not getattr(predictor_impl, "on_job_complete", None):
            sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle)
            return True

        should_run_on_job_complete = False

        while True:
            visible_count, not_visible_count = get_total_messages_in_queue()

            # if there are other messages that are visible, release this message and get the other ones (should rarely happen for FIFO)
            if visible_count > 0:
                sqs_client.change_message_visibility(
                    QueueUrl=queue_url, ReceiptHandle=receipt_handle, VisibilityTimeout=0
                )
                return False

            if should_run_on_job_complete:
                # double check that the queue is still empty (except for the job_complete message)
                if not_visible_count <= 1:
                    logger().info("executing on_job_complete")
                    predictor_impl.on_job_complete()
                    sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle)
                    return True
                else:
                    should_run_on_job_complete = False

            if not_visible_count <= 1:
                should_run_on_job_complete = True

            time.sleep(20)
    except:
        sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle)
        raise


def sqs_loop():
    job_spec = local_cache["job_spec"]
    api_spec = local_cache["api_spec"]
@@ -159,52 +159,118 @@ def sqs_loop():
        response = sqs_client.receive_message(
            QueueUrl=queue_url,
            MaxNumberOfMessages=1,
            WaitTimeSeconds=10,
            VisibilityTimeout=MAXIMUM_MESSAGE_VISIBILITY,
            WaitTimeSeconds=SQS_POLL_WAIT_TIME,
            VisibilityTimeout=INITIAL_MESSAGE_VISIBILITY,
            MessageAttributeNames=["All"],
        )

        if response.get("Messages") is None or len(response["Messages"]) == 0:
            if no_messages_found_in_previous_iteration:
                logger().info("no batches left in queue, exiting...")
                return
            else:
            visible_messages, invisible_messages = get_total_messages_in_queue()
            if visible_messages + invisible_messages == 0:
                if no_messages_found_in_previous_iteration:
                    logger().info("no batches left in queue, exiting...")
                    return
                no_messages_found_in_previous_iteration = True
                continue
        else:
            no_messages_found_in_previous_iteration = False

        message = response["Messages"][0]
            time.sleep(MESSAGE_NOT_FOUND_SLEEP)
            continue

        no_messages_found_in_previous_iteration = False
        message = response["Messages"][0]
        receipt_handle = message["ReceiptHandle"]

        if "MessageAttributes" in message and "job_complete" in message["MessageAttributes"]:
            handled_on_complete = handle_on_complete(message)
            if handled_on_complete:
                logger().info("no batches left in queue, job has been completed")
                return
        renewer = threading.Thread(
            target=renew_message_visibility, args=(receipt_handle,), daemon=True
        )
        renewer.start()

        if is_on_job_complete(message):
            handle_on_job_complete(message)
        else:
            handle_batch_message(message)


def is_on_job_complete(message) -> bool:
return "MessageAttributes" in message and "job_complete" in message["MessageAttributes"]


def handle_batch_message(message):
    job_spec = local_cache["job_spec"]
    predictor_impl = local_cache["predictor_impl"]
    sqs_client = local_cache["sqs_client"]
    queue_url = job_spec["sqs_url"]
    receipt_handle = message["ReceiptHandle"]
    api_spec = local_cache["api_spec"]

    start_time = time.time()

    try:
        logger().info(f"processing batch {message['MessageId']}")
        payload = json.loads(message["Body"])
        batch_id = message["MessageId"]
        predictor_impl.predict(**build_predict_args(payload, batch_id))

        api_spec.post_metrics(
            [success_counter_metric(), time_per_batch_metric(time.time() - start_time)]
        )
    except:
        api_spec.post_metrics([failed_counter_metric()])
        logger().exception(f"failed processing batch {message['MessageId']}")
        with receipt_handle_mutex:
            stop_renewal.add(receipt_handle)
            if job_spec.get("sqs_dead_letter_queue") is not None:
                sqs_client.change_message_visibility(  # return message
                    QueueUrl=queue_url, ReceiptHandle=receipt_handle, VisibilityTimeout=0
                )
            else:
                sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle)
    else:
        with receipt_handle_mutex:
            stop_renewal.add(receipt_handle)
            sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle)
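
A note on the mechanism in handle_batch_message above: when sqs_dead_letter_queue is configured, a failed batch is not sent to the DLQ explicitly. Its visibility timeout is reset to 0 so it returns to the job queue, and the queue's own redrive policy moves it to the dead letter queue once it has been received max_receive_count times. As a hypothetical sketch (queue names, region, and ARN are placeholders), a FIFO queue wired up that way could be created like this:

```python
# Hypothetical sketch: create a FIFO job queue whose redrive policy moves a
# message to the dead letter queue after it has been received 3 times.
import json

import boto3

sqs = boto3.client("sqs", region_name="us-west-2")
sqs.create_queue(
    QueueName="job.fifo",
    Attributes={
        "FifoQueue": "true",
        "VisibilityTimeout": "60",  # matches the 60-second default set in this PR
        "RedrivePolicy": json.dumps(
            {
                "deadLetterTargetArn": "arn:aws:sqs:us-west-2:123456789:failed.fifo",
                "maxReceiveCount": "3",
            }
        ),
    },
)
```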


def handle_on_job_complete(message):
    job_spec = local_cache["job_spec"]
    predictor_impl = local_cache["predictor_impl"]
    sqs_client = local_cache["sqs_client"]
    queue_url = job_spec["sqs_url"]
    receipt_handle = message["ReceiptHandle"]

    should_run_on_job_complete = False
    try:
        while True:
            visible_messages, invisible_messages = get_total_messages_in_queue()
            total_messages = visible_messages + invisible_messages
            if total_messages > 1:
                new_message_id = uuid.uuid4()
                time.sleep(JOB_COMPLETE_MESSAGE_RENEWAL)
                sqs_client.send_message(
                    QueueUrl=queue_url,
                    MessageBody='"job_complete"',
                    MessageAttributes={
                        "job_complete": {"StringValue": "true", "DataType": "String"},
                        "api_name": {"StringValue": job_spec["api_name"], "DataType": "String"},
                        "job_id": {"StringValue": job_spec["job_id"], "DataType": "String"},
                    },
                    MessageDeduplicationId=str(new_message_id),
                    MessageGroupId=str(new_message_id),
                )
                break
            else:
            # sometimes on_job_complete message will be released if there are other messages still to be processed
            continue

        try:
            logger().info(f"processing batch {message['MessageId']}")

            start_time = time.time()

            payload = json.loads(message["Body"])
            batch_id = message["MessageId"]
            predictor_impl.predict(**build_predict_args(payload, batch_id))

            api_spec.post_metrics(
                [success_counter_metric(), time_per_batch_metric(time.time() - start_time)]
            )
        except Exception:
            api_spec.post_metrics(
                [failed_counter_metric(), time_per_batch_metric(time.time() - start_time)]
            )
            logger().exception("failed to process batch")
        finally:
                if should_run_on_job_complete:
                    if getattr(predictor_impl, "on_job_complete", None):
                        logger().info("executing on_job_complete")
                        predictor_impl.on_job_complete()
                    break
                should_run_on_job_complete = True
            time.sleep(10)  # verify that the queue is empty one more time
    except:
        logger().exception("failed to handle on_job_complete")
        raise
    finally:
        with receipt_handle_mutex:
            stop_renewal.add(receipt_handle)
            sqs_client.delete_message(QueueUrl=queue_url, ReceiptHandle=receipt_handle)

