From e54849843f9d2ca728da569285c0d1bc27d7f0f0 Mon Sep 17 00:00:00 2001
From: "Jonathan D." <3976137+Jonathan34@users.noreply.github.com>
Date: Wed, 14 Aug 2024 11:31:30 -0500
Subject: [PATCH] Fix support case partitioning (#5)

* remove resource bucket as we use a workshop static url for the lambda package
* update readme
* update readme
* change the sequence for the shell to avoid errors if CFN fails
* check if bucket exists and permission with boto 3
* remove unused import
* fix support case partitioning
* add a convenient script to package lambda
* add changelog
* update gitignore
* fix import and timezone warning
* update lambda collector version
* update formatting of the deployment script
* Change the way the data is partitioned in S3 to avoid duplicates
* update changelog
* fix pylint warning
* update instructions
---
 .gitignore                                     |  1 +
 CHANGELOG.md                                   | 10 +++++
 src/support_collector/README.md                |  2 +-
 src/support_collector/deploy_collector.sh      | 17 ++++----
 .../member_account_resources.yaml              |  2 +-
 src/support_collector/package_lambda.sh        | 16 ++++++++
 .../lambda_function.py                         |  3 --
 .../support-collector-lambda/upload_cases.py   | 39 +++++++++++--------
 .../support-collector-lambda/upload_health.py  | 19 +++++----
 .../support-collector-lambda/upload_ta.py      | 18 ++++-----
 .../support-collector-lambda/utils.py          | 17 ++++++++
 11 files changed, 97 insertions(+), 47 deletions(-)
 create mode 100644 CHANGELOG.md
 create mode 100755 src/support_collector/package_lambda.sh
 create mode 100644 src/support_collector/support-collector-lambda/utils.py

diff --git a/.gitignore b/.gitignore
index 3b08ee7..b1a6abf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ src/support_collector/__pycache__
 src/support_collector/.python-version
 src/support_collector/individual-account-deployments/temp_dir/
 .DS_Store
+src/support_collector/temp_dir/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..94761d6
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,10 @@
+# Changelog
+
+## Support Collector Lambda v1.0.1
+
+* Partition support cases and Health data using their creation date in S3 (YYYY/MM) to avoid saving duplicates on the daily sync
+* Flatten Trusted Advisor checks in S3 to avoid duplicates during daily sync.
+
+## Support Collector Lambda v1.0.0
+
+* Update to Python 3.11 runtime
diff --git a/src/support_collector/README.md b/src/support_collector/README.md
index 5da3578..ad3f149 100644
--- a/src/support_collector/README.md
+++ b/src/support_collector/README.md
@@ -93,7 +93,7 @@ To deploy this solution, you will need to know the ``, `/dev/null || true
+cd ..
diff --git a/src/support_collector/support-collector-lambda/lambda_function.py b/src/support_collector/support-collector-lambda/lambda_function.py
index 599a044..c45dbaa 100644
--- a/src/support_collector/support-collector-lambda/lambda_function.py
+++ b/src/support_collector/support-collector-lambda/lambda_function.py
@@ -1,9 +1,6 @@
 import importlib
 
-import boto3
-
 
 def lambda_handler(event, context):
-    lambda_client = boto3.client("lambda")
     account_id = context.invoked_function_arn.split(":")[4]
     # Get PAST_NO_OF_DAYS from event parameters
diff --git a/src/support_collector/support-collector-lambda/upload_cases.py b/src/support_collector/support-collector-lambda/upload_cases.py
index e5b2859..ce34053 100644
--- a/src/support_collector/support-collector-lambda/upload_cases.py
+++ b/src/support_collector/support-collector-lambda/upload_cases.py
@@ -1,32 +1,38 @@
 import json
-import datetime
+from datetime import datetime, timedelta, timezone
 from collections import defaultdict
 import logging
 
 import boto3
 from botocore.exceptions import ClientError
 
+from utils import convert_time_to_month_year
+
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
 session = boto3.Session()
 
-
 def save_to_s3(cases_by_account, bucket_name):
     region = session.region_name
     s3 = session.client("s3", region_name=region)
-    current_date = datetime.datetime.now().strftime(
-        "%Y-%m-%d"
-    )  # Format the date as YYYY-MM-DD
     print(f"The Support cases are being uploaded to S3 bucket {bucket_name}...")
     for account_id, cases in cases_by_account.items():
         for case in cases:
-            case_id = case["case"]["displayId"]  # Extracting case ID for filename
-            case_json = json.dumps(case, ensure_ascii=False).encode(
-                "utf-8"
-            )  # Serialize case data to JSON with UTF-8 encoding
-            file_key = f"support-cases/{account_id}/{current_date}/{case_id}.json"
+            # Extracting case ID for filename
+            case_id = case["case"]["displayId"]
+
+            # Extracting creation time for partitioning in S3
+            time_created = case["case"]["timeCreated"]
+            # Convert the time_created in the format "2024-07-23T15:49:29.995Z" to "2024/07"
+            creation_date = convert_time_to_month_year(iso_datetime=time_created)
+
+            # Serialize case data to JSON with UTF-8 encoding
+            case_json = json.dumps(case, ensure_ascii=False).encode("utf-8")
+
+            file_key = f"support-cases/{account_id}/{creation_date}/{case_id}.json"
             s3.put_object(Bucket=bucket_name, Key=file_key, Body=case_json)
+            print(f"Uploaded {file_key}")
     print("Support cases upload done!")
@@ -45,14 +51,14 @@ def get_support_cases(credentials):
     return cases
 
 
-def describe_cases(after_time, resolved):
+def describe_cases(after_time, include_resolved):
     """
     Describe support cases over a period of time, optionally filtering by status.
 
     :param after_time: The start time to include for cases.
     :param before_time: The end time to include for cases.
-    :param resolved: True to include resolved cases in the results,
+    :param include_resolved: True to include resolved cases in the results,
         otherwise results are open cases.
     :return: The final status of the case.
""" @@ -62,7 +68,7 @@ def describe_cases(after_time, resolved): paginator = support_client.get_paginator("describe_cases") for page in paginator.paginate( afterTime=after_time, - includeResolvedCases=resolved, + includeResolvedCases=include_resolved, includeCommunications=True, language="en", ): @@ -85,11 +91,10 @@ def describe_cases(after_time, resolved): def list_all_cases(days): - include_communications = True - end_date = datetime.datetime.utcnow().date() - start_date = end_date - datetime.timedelta(days) + include_resolved = True + start_date = datetime.now(timezone.utc).date() - timedelta(days) start_time = str(start_date) - all_cases = describe_cases(start_time, include_communications) + all_cases = describe_cases(start_time, include_resolved) return all_cases diff --git a/src/support_collector/support-collector-lambda/upload_health.py b/src/support_collector/support-collector-lambda/upload_health.py index 79a1e2a..a3861f0 100644 --- a/src/support_collector/support-collector-lambda/upload_health.py +++ b/src/support_collector/support-collector-lambda/upload_health.py @@ -4,6 +4,7 @@ import logging import boto3 + # Set up logging logging.basicConfig(level=logging.INFO) @@ -20,22 +21,26 @@ def default(self, o): def save_to_s3(events_by_account, bucket_name): region = session.region_name s3 = session.client("s3", region_name=region) - current_date = datetime.datetime.utcnow().strftime( - "%Y-%m-%d" - ) # Format the date as YYYY-MM-DD print(f"The Health events are being uploaded to S3 bucket {bucket_name}...") for account_id, account_events in events_by_account.items(): for event_dict in account_events: event = event_dict["event"] - arn = ( - event["arn"].split(":")[-1].replace("/", "_") - ) # Clean ARN for use as filename + + # Clean ARN for use as filename + arn = event["arn"].split(":")[-1].replace("/", "_") event_json = json.dumps( event, cls=DatetimeEncoder, ensure_ascii=False ).encode("utf-8") - file_key = f"health/{account_id}/{current_date}/{arn}.json" # Construct the file key using account_id, date, and arn + + # Extracting start time for partitioning in S3 + dt = event["startTime"] + start_date = f"{dt.year}/{dt.month}" + + # Construct the file key using account_id, date, and arn + file_key = f"health/{account_id}/{start_date}/{arn}.json" s3.put_object(Bucket=bucket_name, Key=file_key, Body=event_json) + print(f"Uploaded {file_key}") print("Health upload done!") diff --git a/src/support_collector/support-collector-lambda/upload_ta.py b/src/support_collector/support-collector-lambda/upload_ta.py index cb18e66..bf67450 100644 --- a/src/support_collector/support-collector-lambda/upload_ta.py +++ b/src/support_collector/support-collector-lambda/upload_ta.py @@ -1,5 +1,4 @@ import json -import datetime from collections import defaultdict import boto3 @@ -19,26 +18,24 @@ def save_to_s3(recommendations_by_account, bucket_name): region = session.region_name s3 = session.client("s3", region_name=region) - current_date = datetime.datetime.utcnow().strftime( - "%Y-%m-%d" - ) # Using UTC date to standardize the timestamps across regions print(f"The TA recommendations are being uploaded to S3 bucket {bucket_name}...") for account_id, recommendations in recommendations_by_account.items(): for recommendation in recommendations: status = recommendation["recommendation"]["status"].lower() + # Filter for warning or error status if status in [ "warning", "error", "yellow", "red", - ]: # Filter for warning or error status - check_id = recommendation["recommendation"][ - "checkId" - ] # Extract 
+            ]:
+                # Extract the checkId from the recommendation
+                check_id = recommendation["recommendation"]["checkId"]
+                # Get the description from the checks_info_dict
                 description = checks_info_dict.get(check_id, {}).get(
                     "description", "No description provided"
-                )  # Get the description from the checks_info_dict
+                )
                 # Update the recommendation with name and modified description
                 recommendation["recommendation"][
                     "description"
                 ] =
                 recommendation_json = json.dumps(
                     recommendation, ensure_ascii=False
                 ).encode("utf-8")
-                file_key = f"ta/{account_id}/{current_date}/{check_id}.json"  # Construct the file key using account_id, date, and checkId
+                # Construct the file key using account_id, date, and checkId
+                file_key = f"ta/{account_id}/{check_id}.json"
                 s3.put_object(
                     Bucket=bucket_name, Key=file_key, Body=recommendation_json
                 )
diff --git a/src/support_collector/support-collector-lambda/utils.py b/src/support_collector/support-collector-lambda/utils.py
new file mode 100644
index 0000000..558abfb
--- /dev/null
+++ b/src/support_collector/support-collector-lambda/utils.py
@@ -0,0 +1,17 @@
+from datetime import datetime
+
+
+def convert_time_to_month_year(iso_datetime):
+    # Parse the time_created string into a datetime object
+    # dt = datetime.strptime(iso_datetime, "%Y-%m-%dT%H:%M:%S.%fZ")
+    iso_date = iso_datetime.replace("Z", "+00:00")
+    dt = datetime.fromisoformat(iso_date)
+
+    # Extract the year and month components
+    year = dt.year
+    month = dt.month
+
+    # Format the year and month as "YYYY/MM"
+    month_year = f"{year}/{month:02d}"
+
+    return month_year
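
Reviewer sketch, not part of the patch: a minimal, runnable illustration of the S3 keys this change produces. The account ID, case payload, and check ID below are made-up sample values, and the date conversion simply mirrors the convert_time_to_month_year helper added in utils.py.

from datetime import datetime

def month_year(iso_datetime):
    # Mirrors utils.convert_time_to_month_year: "2024-07-23T15:49:29.995Z" -> "2024/07"
    dt = datetime.fromisoformat(iso_datetime.replace("Z", "+00:00"))
    return f"{dt.year}/{dt.month:02d}"

# Hypothetical sample values for illustration only
account_id = "111122223333"
case = {"case": {"displayId": "1234567890", "timeCreated": "2024-07-23T15:49:29.995Z"}}

# Support cases (and Health events) are now partitioned by creation month,
# so a daily re-run writes the same key again instead of adding a new dated copy.
case_key = (
    f"support-cases/{account_id}/"
    f"{month_year(case['case']['timeCreated'])}/{case['case']['displayId']}.json"
)
print(case_key)  # support-cases/111122223333/2024/07/1234567890.json

# Trusted Advisor results drop the date segment entirely: one object per checkId.
ta_key = f"ta/{account_id}/exampleCheckId.json"
print(ta_key)  # ta/111122223333/exampleCheckId.json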