Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding monitoring to send clean logging emails #420

Merged
merged 4 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 0 additions & 15 deletions sds_data_manager/constructs/indexer_lambda_construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def __init__(
vpc_subnets,
rds_security_group,
data_bucket,
sns_topic,
maxinelasp marked this conversation as resolved.
Show resolved Hide resolved
layers: list,
**kwargs,
) -> None:
Expand Down Expand Up @@ -134,20 +133,6 @@ def __init__(
),
)

# Uses batch job status of failure
# to trigger a sns topic
batch_job_failure_rule = events.Rule(
self,
"batchJobFailure",
rule_name="batch-job-failure",
event_pattern=events.EventPattern(
source=["aws.batch"],
detail_type=["Batch Job State Change"],
detail={"status": ["FAILED"]},
),
)

# Add the Lambda function as the target for the rules
imap_data_arrival_rule.add_target(targets.LambdaFunction(indexer_lambda))
batch_job_status_rule.add_target(targets.LambdaFunction(indexer_lambda))
batch_job_failure_rule.add_target(targets.SnsTopic(sns_topic))
80 changes: 80 additions & 0 deletions sds_data_manager/constructs/monitoring_lambda_construct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Configure monitoring formatter lambda."""

import aws_cdk as cdk
from aws_cdk import aws_events as events
from aws_cdk import aws_events_targets as targets
from aws_cdk import aws_iam as iam
from aws_cdk import aws_lambda as lambda_
from constructs import Construct


class MonitoringLambda(Construct):
    """Construct for monitoring lambda."""

    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        code: lambda_.Code,
        sns_topic,
        **kwargs,
    ) -> None:
        """MonitoringLambda Construct.

        Wires an EventBridge rule for failed Batch jobs to a lambda that
        formats the failure details and publishes them to an SNS topic.

        Parameters
        ----------
        scope : Construct
            Parent construct.
        construct_id : str
            A unique string identifier for this construct.
        code : aws_lambda.Code
            Lambda code bundle
        sns_topic : aws_sns.Topic
            SNS Topic for sending notifications so that external
            resources can subscribe to for alerts.
        kwargs : dict
            Keyword arguments

        """
        super().__init__(scope, construct_id, **kwargs)

        # Lambda that formats failed-job details and publishes them to SNS.
        # The topic ARN is handed to the handler through its environment.
        formatter_fn = lambda_.Function(
            self,
            id="MonitoringLambda",
            function_name="monitoring",
            code=code,
            handler="SDSCode.pipeline_lambdas.monitoring.lambda_handler",
            runtime=lambda_.Runtime.PYTHON_3_12,
            architecture=lambda_.Architecture.ARM_64,
            timeout=cdk.Duration.minutes(1),
            memory_size=1000,
            allow_public_subnet=True,
            environment={
                "SNS_TOPIC_ARN": sns_topic.topic_arn,
            },
        )

        # The lambda publishes the formatted alert to the topic and reads the
        # failed job's CloudWatch logs, so grant both permissions here.
        sns_topic.grant_publish(formatter_fn)
        formatter_fn.add_to_role_policy(
            iam.PolicyStatement(
                actions=["logs:GetLogEvents"],
                resources=["arn:aws:logs:*:*:log-group:/aws/batch/*"],
            )
        )

        # EventBridge rule: any Batch job entering the FAILED state
        # triggers the formatter lambda.
        failed_job_rule = events.Rule(
            self,
            "batchJobFailure",
            rule_name="batch-job-failed",
            event_pattern=events.EventPattern(
                source=["aws.batch"],
                detail_type=["Batch Job State Change"],
                detail={"status": ["FAILED"]},
            ),
        )
        failed_job_rule.add_target(targets.LambdaFunction(formatter_fn))
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Lambda function to send a formatted SNS notification when a Batch job fails."""

import os

import boto3

sns_client = boto3.client("sns")
logs_client = boto3.client("logs")
maxinelasp marked this conversation as resolved.
Show resolved Hide resolved


def lambda_handler(event, context):
    """Lambda handler to send an SNS notification when a Batch job fails.

    Lambda will format the message and retrieve logging from the failed job,
    before sending a message to the notification service (SNS topic defined by
    the environment variable "SNS_TOPIC_ARN").

    Parameters
    ----------
    event : dict
        The JSON formatted document with the data required for the
        lambda function to process. Source event is from AWS Batch.
    context : obj
        The context object for the lambda function

    Returns
    -------
    dict
        Status code and confirmation body once the notification is sent.

    Raises
    ------
    ValueError
        If the event did not originate from AWS Batch. Any other source
        means the trigger is misconfigured, so we fail loudly rather than
        email a malformed notification.
    """
    # Guard against misconfigured triggers: only AWS Batch state-change
    # events are valid inputs to this handler.
    if event.get("source") != "aws.batch":
        raise ValueError(
            f"Unexpected event source: {event.get('source')!r}; expected 'aws.batch'"
        )

    # Extract relevant details from the event
    detail = event.get("detail", {})

    job_name = detail.get("jobName", "Unknown")
    job_id = detail.get("jobId", "Unknown")

    # The log stream name may be reported per-attempt (under "attempts") or
    # directly under "container", depending on how the job failed; check both.
    log_stream_name = (
        detail.get("attempts", [{}])[0].get("container", {}).get("logStreamName")
        or detail.get("container", {}).get("logStreamName")
    )

    status_reason = detail.get("statusReason", "No reason provided")

    # Fetch logs if logStreamName is available. A failed log fetch is
    # reported inside the email body instead of aborting the alert.
    logs = []
    if log_stream_name:
        log_group_name = "/aws/batch/job"
        try:
            response = logs_client.get_log_events(
                logGroupName=log_group_name, logStreamName=log_stream_name, limit=10
            )
            logs = [log_event["message"] for log_event in response.get("events", [])]
        except Exception as e:
            logs.append(f"Could not fetch logs: {e!s}")

    # Join with newlines so each CloudWatch event lands on its own line
    # (a plain "".join would run the log lines together).
    log_text = "\n".join(logs) if logs else "No logs available"

    # Format email message
    formatted_message = f"""
Batch Job Failed!

Job Name: {job_name}
Job ID: {job_id}
Status Reason: {status_reason}

Logs (Last 10 lines):
{log_text}
"""

    print(f"Formatted Message: {formatted_message}")

    # Send the formatted message to the SNS topic
    sns_client.publish(
        TopicArn=os.environ["SNS_TOPIC_ARN"],
        Subject=f"Batch Job Failure: {job_name}",
        Message=formatted_message,
    )

    return {"statusCode": 200, "body": "Notification sent"}
tech3371 marked this conversation as resolved.
Show resolved Hide resolved
9 changes: 8 additions & 1 deletion sds_data_manager/utils/stackbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
instrument_lambdas,
lambda_layer_construct,
monitoring_construct,
monitoring_lambda_construct,
networking_construct,
processing_construct,
route53_hosted_zone,
Expand Down Expand Up @@ -185,10 +186,16 @@ def build_sds(
vpc_subnets=rds_construct.rds_subnet_selection,
rds_security_group=rds_construct.rds_security_group,
data_bucket=data_bucket.data_bucket,
sns_topic=monitoring.sns_topic_notifications,
layers=[db_lambda_layer],
)

monitoring_lambda_construct.MonitoringLambda(
scope=sdc_stack,
construct_id="MonitoringLambda",
code=lambda_code,
sns_topic=monitoring.sns_topic_notifications,
)

sds_api_manager_construct.SdsApiManager(
scope=sdc_stack,
construct_id="SdsApiManager",
Expand Down
Loading