From bc3fa66867eaeb008eb493185c7072c910218303 Mon Sep 17 00:00:00 2001 From: Noah Beard Date: Fri, 25 Nov 2022 20:10:05 -0500 Subject: [PATCH 1/3] Add additional logging to mqtt5 decoder, adjust canary to monitor OS metrics, fix canary bugs --- codebuild/CanaryWrapper.py | 133 ++++++++------- codebuild/CanaryWrapper_24_7.py | 188 +++++++++++---------- codebuild/CanaryWrapper_Classes.py | 90 ++++++---- codebuild/CanaryWrapper_MetricFunctions.py | 13 +- source/v5/mqtt5_decoder.c | 7 + 5 files changed, 235 insertions(+), 196 deletions(-) diff --git a/codebuild/CanaryWrapper.py b/codebuild/CanaryWrapper.py index fe894d8b..c089ffd5 100644 --- a/codebuild/CanaryWrapper.py +++ b/codebuild/CanaryWrapper.py @@ -123,7 +123,8 @@ new_metric_unit="Percent", new_metric_alarm_threshold=70, new_metric_reports_to_skip=1, - new_metric_alarm_severity=5) + new_metric_alarm_severity=5, + is_percent=True) data_snapshot.register_metric( new_metric_name="total_memory_usage_value", new_metric_function=get_metric_total_memory_usage_value, @@ -134,7 +135,8 @@ new_metric_unit="Percent", new_metric_alarm_threshold=70, new_metric_reports_to_skip=0, - new_metric_alarm_severity=5) + new_metric_alarm_severity=5, + is_percent=True) # Print diagnosis information data_snapshot.output_diagnosis_information(command_parser_arguments.dependencies) @@ -217,51 +219,25 @@ def application_thread(): finished_email_body = "MQTT5 Short Running Canary Wrapper has stopped." finished_email_body += "\n\n" - # Find out why we stopped - if (snapshot_monitor.had_internal_error == True): - if (snapshot_monitor.has_cut_ticket == True): - # We do not need to cut a ticket here - it's cut by the snapshot monitor! - print ("ERROR - Snapshot monitor stopped due to metric in alarm!", flush=True) - finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!" - finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered) - wrapper_error_occurred = True - else: - print ("ERROR - Snapshot monitor stopped due to internal error!", flush=True) - cut_ticket_using_cloudwatch( - git_repo_name=command_parser_arguments.git_repo_name, - git_hash=command_parser_arguments.git_hash, - git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace, - git_fixed_namespace_text="mqtt5_canary", - cloudwatch_region="us-east-1", - ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason, - ticket_reason="Snapshot monitor stopped due to internal error", - ticket_allow_duplicates=True, - ticket_category=command_parser_arguments.ticket_category, - ticket_item=command_parser_arguments.ticket_item, - ticket_group=command_parser_arguments.ticket_group, - ticket_type=command_parser_arguments.ticket_type, - ticket_severity=4) - wrapper_error_occurred = True - finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error." - finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason - - elif (application_monitor.error_has_occurred == True): - if (application_monitor.error_due_to_credentials == True): - print ("INFO - Stopping application due to error caused by credentials") - print ("Please fix your credentials and then restart this application again", flush=True) - wrapper_error_occurred = True - send_finished_email = False - else: - # Is the error something in the canary failed? - if (application_monitor.error_code != 0): + try: + # Find out why we stopped + if (snapshot_monitor.had_internal_error == True): + if (snapshot_monitor.has_cut_ticket == True): + # We do not need to cut a ticket here - it's cut by the snapshot monitor! + print ("ERROR - Snapshot monitor stopped due to metric in alarm!", flush=True) + finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!" + finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered) + wrapper_error_occurred = True + else: + print ("ERROR - Snapshot monitor stopped due to internal error!", flush=True) cut_ticket_using_cloudwatch( git_repo_name=command_parser_arguments.git_repo_name, git_hash=command_parser_arguments.git_hash, git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace, git_fixed_namespace_text="mqtt5_canary", cloudwatch_region="us-east-1", - ticket_description="The Short Running Canary exited with a non-zero exit code! This likely means something in the canary failed.", - ticket_reason="The Short Running Canary exited with a non-zero exit code", + ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason, + ticket_reason="Snapshot monitor stopped due to internal error", ticket_allow_duplicates=True, ticket_category=command_parser_arguments.ticket_category, ticket_item=command_parser_arguments.ticket_item, @@ -269,29 +245,60 @@ def application_thread(): ticket_type=command_parser_arguments.ticket_type, ticket_severity=4) wrapper_error_occurred = True - finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code! This means something in the Canary application itself failed" + finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error." + finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason + + elif (application_monitor.error_has_occurred == True): + if (application_monitor.error_due_to_credentials == True): + print ("INFO - Stopping application due to error caused by credentials") + print ("Please fix your credentials and then restart this application again", flush=True) + wrapper_error_occurred = True + send_finished_email = False else: - print ("INFO - Stopping application. No error has occurred, application has stopped normally", flush=True) - finished_email_body += "Short Running Canary finished successfully and run without errors!" - wrapper_error_occurred = False - else: - print ("ERROR - Short Running Canary stopped due to unknown reason!", flush=True) - cut_ticket_using_cloudwatch( - git_repo_name=command_parser_arguments.git_repo_name, - git_hash=command_parser_arguments.git_hash, - git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace, - git_fixed_namespace_text="mqtt5_canary", - cloudwatch_region="us-east-1", - ticket_description="The Short Running Canary stopped for an unknown reason!", - ticket_reason="The Short Running Canary stopped for unknown reason", - ticket_allow_duplicates=True, - ticket_category=command_parser_arguments.ticket_category, - ticket_item=command_parser_arguments.ticket_item, - ticket_group=command_parser_arguments.ticket_group, - ticket_type=command_parser_arguments.ticket_type, - ticket_severity=4) - wrapper_error_occurred = True - finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!" + # Is the error something in the canary failed? + if (application_monitor.error_code != 0): + cut_ticket_using_cloudwatch( + git_repo_name=command_parser_arguments.git_repo_name, + git_hash=command_parser_arguments.git_hash, + git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace, + git_fixed_namespace_text="mqtt5_canary", + cloudwatch_region="us-east-1", + ticket_description="The Short Running Canary exited with a non-zero exit code! This likely means something in the canary failed.", + ticket_reason="The Short Running Canary exited with a non-zero exit code", + ticket_allow_duplicates=True, + ticket_category=command_parser_arguments.ticket_category, + ticket_item=command_parser_arguments.ticket_item, + ticket_group=command_parser_arguments.ticket_group, + ticket_type=command_parser_arguments.ticket_type, + ticket_severity=4) + wrapper_error_occurred = True + finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code! This means something in the Canary application itself failed" + else: + print ("INFO - Stopping application. No error has occurred, application has stopped normally", flush=True) + application_monitor.print_stdout() + finished_email_body += "Short Running Canary finished successfully and run without errors!" + wrapper_error_occurred = False + else: + print ("ERROR - Short Running Canary stopped due to unknown reason!", flush=True) + cut_ticket_using_cloudwatch( + git_repo_name=command_parser_arguments.git_repo_name, + git_hash=command_parser_arguments.git_hash, + git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace, + git_fixed_namespace_text="mqtt5_canary", + cloudwatch_region="us-east-1", + ticket_description="The Short Running Canary stopped for an unknown reason!", + ticket_reason="The Short Running Canary stopped for unknown reason", + ticket_allow_duplicates=True, + ticket_category=command_parser_arguments.ticket_category, + ticket_item=command_parser_arguments.ticket_item, + ticket_group=command_parser_arguments.ticket_group, + ticket_type=command_parser_arguments.ticket_type, + ticket_severity=4) + wrapper_error_occurred = True + finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!" + except Exception as e: + print ("ERROR: Could not (possibly) cut ticket due to exception!") + print ("Exception: " + str(e), flush=True) # Clean everything up and stop snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred) diff --git a/codebuild/CanaryWrapper_24_7.py b/codebuild/CanaryWrapper_24_7.py index d4fa3a0c..877b8259 100644 --- a/codebuild/CanaryWrapper_24_7.py +++ b/codebuild/CanaryWrapper_24_7.py @@ -123,7 +123,8 @@ new_metric_unit="Percent", new_metric_alarm_threshold=70, new_metric_reports_to_skip=1, - new_metric_alarm_severity=5) + new_metric_alarm_severity=5, + is_percent=True) data_snapshot.register_metric( new_metric_name="total_memory_usage_value", new_metric_function=get_metric_total_memory_usage_value, @@ -134,7 +135,8 @@ new_metric_unit="Percent", new_metric_alarm_threshold=70, new_metric_reports_to_skip=0, - new_metric_alarm_severity=5) + new_metric_alarm_severity=5, + is_percent=True) data_snapshot.register_dashboard_widget("Process CPU Usage - Percentage", ["total_cpu_usage"], 60) data_snapshot.register_dashboard_widget("Process Memory Usage - Percentage", ["total_memory_usage_percent"], 60) @@ -250,120 +252,124 @@ def application_thread(): finished_email_body = "MQTT5 24/7 Canary Wrapper has stopped." finished_email_body += "\n\n" - # Find out why we stopped - # S3 Monitor - if (s3_monitor.had_internal_error == True): - if (s3_monitor.error_due_to_credentials == False): - print ("ERROR - S3 monitor stopped due to internal error!") - cut_ticket_using_cloudwatch( - git_repo_name=canary_local_git_repo_stub, - git_hash=canary_local_git_hash_stub, - git_hash_as_namespace=False, - git_fixed_namespace_text=canary_local_git_fixed_namespace, - cloudwatch_region=canary_region_stub, - ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + s3_monitor.internal_error_reason, - ticket_reason="S3 monitor stopped due to internal error", - ticket_allow_duplicates=True, - ticket_category="AWS", - ticket_type="SDKs and Tools", - ticket_item="IoT SDK for CPP", - ticket_group="AWS IoT Device SDK", - ticket_severity=4) - finished_email_body += "Failure due to S3 monitor stopping due to an internal error." - finished_email_body += " Reason given for error: " + s3_monitor.internal_error_reason - wrapper_error_occurred = True - # Snapshot Monitor - elif (snapshot_monitor.had_internal_error == True): - if (snapshot_monitor.has_cut_ticket == True): - # We do not need to cut a ticket here - it's cut by the snapshot monitor! - print ("ERROR - Snapshot monitor stopped due to metric in alarm!") - finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!" - finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered) - finished_email_body += "\nNOTE - this shouldn't occur in the 24/7 Canary! If it does, then the wrapper needs adjusting." - wrapper_error_occurred = True - else: - print ("ERROR - Snapshot monitor stopped due to internal error!") - cut_ticket_using_cloudwatch( - git_repo_name=canary_local_git_repo_stub, - git_hash=canary_local_git_hash_stub, - git_hash_as_namespace=False, - git_fixed_namespace_text=canary_local_git_fixed_namespace, - cloudwatch_region=canary_region_stub, - ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason, - ticket_reason="Snapshot monitor stopped due to internal error", - ticket_allow_duplicates=True, - ticket_category="AWS", - ticket_type="SDKs and Tools", - ticket_item="IoT SDK for CPP", - ticket_group="AWS IoT Device SDK", - ticket_severity=4) - wrapper_error_occurred = True - finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error." - finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason - # Application Monitor - elif (application_monitor.error_has_occurred == True): - if (application_monitor.error_due_to_credentials == True): - print ("INFO - Stopping application due to error caused by credentials") - print ("Please fix your credentials and then restart this application again") - wrapper_error_occurred = True - send_finished_email = False - else: - # Is the error something in the canary failed? - if (application_monitor.error_code != 0): + try: + # Find out why we stopped + # S3 Monitor + if (s3_monitor.had_internal_error == True): + if (s3_monitor.error_due_to_credentials == False): + print ("ERROR - S3 monitor stopped due to internal error!") cut_ticket_using_cloudwatch( git_repo_name=canary_local_git_repo_stub, git_hash=canary_local_git_hash_stub, git_hash_as_namespace=False, git_fixed_namespace_text=canary_local_git_fixed_namespace, cloudwatch_region=canary_region_stub, - ticket_description="The 24/7 Canary exited with a non-zero exit code! This likely means something in the canary failed.", - ticket_reason="The 24/7 Canary exited with a non-zero exit code", + ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + s3_monitor.internal_error_reason, + ticket_reason="S3 monitor stopped due to internal error", ticket_allow_duplicates=True, ticket_category="AWS", ticket_type="SDKs and Tools", ticket_item="IoT SDK for CPP", ticket_group="AWS IoT Device SDK", - ticket_severity=3) + ticket_severity=4) + finished_email_body += "Failure due to S3 monitor stopping due to an internal error." + finished_email_body += " Reason given for error: " + s3_monitor.internal_error_reason + wrapper_error_occurred = True + # Snapshot Monitor + elif (snapshot_monitor.had_internal_error == True): + if (snapshot_monitor.has_cut_ticket == True): + # We do not need to cut a ticket here - it's cut by the snapshot monitor! + print ("ERROR - Snapshot monitor stopped due to metric in alarm!") + finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!" + finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered) + finished_email_body += "\nNOTE - this shouldn't occur in the 24/7 Canary! If it does, then the wrapper needs adjusting." wrapper_error_occurred = True - finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code!" - finished_email_body += " This means something in the Canary application itself failed" else: + print ("ERROR - Snapshot monitor stopped due to internal error!") cut_ticket_using_cloudwatch( git_repo_name=canary_local_git_repo_stub, git_hash=canary_local_git_hash_stub, git_hash_as_namespace=False, git_fixed_namespace_text=canary_local_git_fixed_namespace, cloudwatch_region=canary_region_stub, - ticket_description="The 24/7 Canary exited with a zero exit code but did not restart!", - ticket_reason="The 24/7 Canary exited with a zero exit code but did not restart", + ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason, + ticket_reason="Snapshot monitor stopped due to internal error", ticket_allow_duplicates=True, ticket_category="AWS", ticket_type="SDKs and Tools", ticket_item="IoT SDK for CPP", ticket_group="AWS IoT Device SDK", - ticket_severity=3) + ticket_severity=4) + wrapper_error_occurred = True + finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error." + finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason + # Application Monitor + elif (application_monitor.error_has_occurred == True): + if (application_monitor.error_due_to_credentials == True): + print ("INFO - Stopping application due to error caused by credentials") + print ("Please fix your credentials and then restart this application again") wrapper_error_occurred = True - finished_email_body += "Failure due to MQTT5 application stopping and not automatically restarting!" - finished_email_body += " This shouldn't occur and means something is wrong with the Canary wrapper!" - # Other - else: - print ("ERROR - 24/7 Canary stopped due to unknown reason!") - cut_ticket_using_cloudwatch( - git_repo_name=canary_local_git_repo_stub, - git_hash=canary_local_git_hash_stub, - git_hash_as_namespace=False, - git_fixed_namespace_text=canary_local_git_fixed_namespace, - cloudwatch_region=canary_region_stub, - ticket_description="The 24/7 Canary stopped for an unknown reason!", - ticket_reason="The 24/7 Canary stopped for unknown reason", - ticket_allow_duplicates=True, - ticket_category="AWS", - ticket_type="SDKs and Tools", - ticket_item="IoT SDK for CPP", - ticket_group="AWS IoT Device SDK", - ticket_severity=3) - wrapper_error_occurred = True - finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!" + send_finished_email = False + else: + # Is the error something in the canary failed? + if (application_monitor.error_code != 0): + cut_ticket_using_cloudwatch( + git_repo_name=canary_local_git_repo_stub, + git_hash=canary_local_git_hash_stub, + git_hash_as_namespace=False, + git_fixed_namespace_text=canary_local_git_fixed_namespace, + cloudwatch_region=canary_region_stub, + ticket_description="The 24/7 Canary exited with a non-zero exit code! This likely means something in the canary failed.", + ticket_reason="The 24/7 Canary exited with a non-zero exit code", + ticket_allow_duplicates=True, + ticket_category="AWS", + ticket_type="SDKs and Tools", + ticket_item="IoT SDK for CPP", + ticket_group="AWS IoT Device SDK", + ticket_severity=3) + wrapper_error_occurred = True + finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code!" + finished_email_body += " This means something in the Canary application itself failed" + else: + cut_ticket_using_cloudwatch( + git_repo_name=canary_local_git_repo_stub, + git_hash=canary_local_git_hash_stub, + git_hash_as_namespace=False, + git_fixed_namespace_text=canary_local_git_fixed_namespace, + cloudwatch_region=canary_region_stub, + ticket_description="The 24/7 Canary exited with a zero exit code but did not restart!", + ticket_reason="The 24/7 Canary exited with a zero exit code but did not restart", + ticket_allow_duplicates=True, + ticket_category="AWS", + ticket_type="SDKs and Tools", + ticket_item="IoT SDK for CPP", + ticket_group="AWS IoT Device SDK", + ticket_severity=3) + wrapper_error_occurred = True + finished_email_body += "Failure due to MQTT5 application stopping and not automatically restarting!" + finished_email_body += " This shouldn't occur and means something is wrong with the Canary wrapper!" + # Other + else: + print ("ERROR - 24/7 Canary stopped due to unknown reason!") + cut_ticket_using_cloudwatch( + git_repo_name=canary_local_git_repo_stub, + git_hash=canary_local_git_hash_stub, + git_hash_as_namespace=False, + git_fixed_namespace_text=canary_local_git_fixed_namespace, + cloudwatch_region=canary_region_stub, + ticket_description="The 24/7 Canary stopped for an unknown reason!", + ticket_reason="The 24/7 Canary stopped for unknown reason", + ticket_allow_duplicates=True, + ticket_category="AWS", + ticket_type="SDKs and Tools", + ticket_item="IoT SDK for CPP", + ticket_group="AWS IoT Device SDK", + ticket_severity=3) + wrapper_error_occurred = True + finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!" + except Exception as e: + print ("ERROR: Could not (possibly) cut ticket due to exception!") + print ("Exception: " + str(e), flush=True) # Clean everything up and stop snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred) diff --git a/codebuild/CanaryWrapper_Classes.py b/codebuild/CanaryWrapper_Classes.py index c31c0d5d..ee5d3c3e 100644 --- a/codebuild/CanaryWrapper_Classes.py +++ b/codebuild/CanaryWrapper_Classes.py @@ -18,7 +18,7 @@ class DataSnapshot_Metric(): def __init__(self, metric_name, metric_function, metric_dimensions=[], metric_unit="None", metric_alarm_threshold=None, metric_alarm_severity=6, - git_hash="", git_repo_name="", reports_to_skip=0): + git_hash="", git_repo_name="", reports_to_skip=0, is_percent=False): self.metric_name = metric_name self.metric_function = metric_function self.metric_dimensions = metric_dimensions @@ -29,6 +29,7 @@ def __init__(self, metric_name, metric_function, metric_dimensions=[], self.metric_value = None self.reports_to_skip = reports_to_skip self.metric_alarm_severity = metric_alarm_severity + self.is_percent = is_percent # Gets the latest metric value from the metric_function callback def get_metric_value(self, psutil_process : psutil.Process): @@ -486,8 +487,9 @@ def lambda_send_email(self, message, subject): # * (OPTIONAL) new_reports_to_skip is the number of reports this metric will return nothing, but will get it's value. # * Useful for CPU calculations that require deltas # * (OPTIONAL) new_metric_alarm_severity is the severity of the ticket if this alarm is triggered. A severity of 6+ means no ticket. + # * (OPTIONAL) is_percent whether or not to display the metric as a percent when printing it (default=false) def register_metric(self, new_metric_name, new_metric_function, new_metric_unit="None", - new_metric_alarm_threshold=None, new_metric_reports_to_skip=0, new_metric_alarm_severity=6): + new_metric_alarm_threshold=None, new_metric_reports_to_skip=0, new_metric_alarm_severity=6, is_percent=False): new_metric_dimensions = [] @@ -508,7 +510,8 @@ def register_metric(self, new_metric_name, new_metric_function, new_metric_unit= metric_alarm_severity=new_metric_alarm_severity, git_hash=self.git_hash, git_repo_name=self.git_repo_name, - reports_to_skip=new_metric_reports_to_skip + reports_to_skip=new_metric_reports_to_skip, + is_percent=is_percent ) self.metrics.append(new_metric) # append an empty list so we can track it's metrics over time @@ -567,12 +570,16 @@ def _find_cloudwatch_widget(self, name): # Prints the metrics to the console def export_metrics_console(self): datetime_now = datetime.datetime.now() - datetime_string = datetime_now.strftime("%d-%m-%Y/%H-%M-%S") + datetime_string = datetime_now.strftime("%d-%m-%Y/%H:%M:%S") self.print_message("\n[DataSnapshot] Metric report: " + str(self.metric_report_number) + " (" + datetime_string + ")") for metric in self.metrics: - self.print_message(" " + metric.metric_name + - " - value: " + str(metric.metric_value)) + if (metric.is_percent == True): + self.print_message(" " + metric.metric_name + + " - value: " + str(metric.metric_value) + "%") + else: + self.print_message(" " + metric.metric_name + + " - value: " + str(metric.metric_value)) self.print_message("") # Sends all registered metrics to Cloudwatch. @@ -886,13 +893,15 @@ def __init__(self, wrapper_application_path, wrapper_application_arguments, wrap self.wrapper_application_restart_on_finish = wrapper_application_restart_on_finish self.data_snapshot=data_snapshot + self.stdout_file_path = "Canary_Stdout_File.txt" + def start_monitoring(self): self.print_message("[ApplicationMonitor] Starting to monitor application...") if (self.application_process == None): try: canary_command = self.wrapper_application_path + " " + self.wrapper_application_arguments - self.application_process = subprocess.Popen(canary_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="utf-8") + self.application_process = subprocess.Popen(canary_command + " | tee " + self.stdout_file_path, shell=True) self.application_process_psutil = psutil.Process(self.application_process.pid) self.print_message ("[ApplicationMonitor] Application started...") except Exception as e: @@ -912,7 +921,8 @@ def restart_monitoring(self): try: self.stop_monitoring() self.start_monitoring() - self.print_message("[ApplicationMonitor] Restarted monitor application!") + self.print_message("\n[ApplicationMonitor] Restarted monitor application!") + self.print_message("================================================================================") except Exception as e: self.print_message("[ApplicationMonitor] ERROR - Could not restart Canary/Application due to exception!") self.print_message("[ApplicationMonitor] Exception: " + str(e)) @@ -934,18 +944,18 @@ def stop_monitoring(self): self.application_process.terminate() self.application_process.wait() self.print_message ("[ApplicationMonitor] Stopped monitor application!") - - if self.application_process.stdout != None: - self.print_message("\nApplication STDOUT:\n") - self.print_message("=========================================\n") - for line in self.application_process.stdout: - self.print_message(line) - self.application_process.stdout.close() - self.print_message("\n=========================================\n") self.application_process = None + self.print_stdout() else: self.print_message ("[ApplicationMonitor] ERROR - cannot stop monitor application because no process is found!") + def print_stdout(): + # Print the STDOUT file + if (os.path.isfile(self.stdout_file_path)): + self.print_message("Just finished Application STDOUT: ") + with open(self.stdout_file_path, "r") as stdout_file: + self.print_message(stdout_file.read()) + os.remove(self.stdout_file_path) def monitor_loop_function(self, time_passed=30): if (self.application_process != None): @@ -1182,8 +1192,13 @@ def cut_ticket_using_cloudwatch( git_namespace_prepend_text = git_repo_name + "-" + git_hash git_metric_namespace = git_namespace_prepend_text - cloudwatch_client = boto3.client('cloudwatch', cloudwatch_region) - ticket_alarm_name = git_repo_name + "-" + git_hash + "-AUTO-TICKET" + try: + cloudwatch_client = boto3.client('cloudwatch', cloudwatch_region) + ticket_alarm_name = git_repo_name + "-" + git_hash + "-AUTO-TICKET" + except Exception as e: + print ("ERROR - could not create Cloudwatch client to make ticket metric alarm due to exception!") + print ("Exception: " + str(e), flush=True) + return new_metric_dimensions = [] if (git_hash_as_namespace == False): @@ -1203,23 +1218,28 @@ def cut_ticket_using_cloudwatch( ticket_alarm_description = f"AUTO CUT CANARY WRAPPER TICKET\n\nREASON: {ticket_reason}\n\nDESCRIPTION: {ticket_description}\n\n" - # Regsiter a metric alarm so it can auto-cut a ticket for us - cloudwatch_client.put_metric_alarm( - AlarmName=ticket_alarm_name, - AlarmDescription=ticket_alarm_description, - MetricName=ticket_alarm_name, - Namespace=git_metric_namespace, - Statistic="Maximum", - Dimensions=new_metric_dimensions, - Period=60, # How long (in seconds) is an evaluation period? - EvaluationPeriods=1, # How many periods does it need to be invalid for? - DatapointsToAlarm=1, # How many data points need to be invalid? - Threshold=1, - ComparisonOperator="GreaterThanOrEqualToThreshold", - # The data above does not really matter - it just needs to be valid input data. - # This is the part that tells Cloudwatch to cut the ticket - AlarmActions=[ticket_arn] - ) + # Register a metric alarm so it can auto-cut a ticket for us + try: + cloudwatch_client.put_metric_alarm( + AlarmName=ticket_alarm_name, + AlarmDescription=ticket_alarm_description, + MetricName=ticket_alarm_name, + Namespace=git_metric_namespace, + Statistic="Maximum", + Dimensions=new_metric_dimensions, + Period=60, # How long (in seconds) is an evaluation period? + EvaluationPeriods=1, # How many periods does it need to be invalid for? + DatapointsToAlarm=1, # How many data points need to be invalid? + Threshold=1, + ComparisonOperator="GreaterThanOrEqualToThreshold", + # The data above does not really matter - it just needs to be valid input data. + # This is the part that tells Cloudwatch to cut the ticket + AlarmActions=[ticket_arn] + ) + except Exception as e: + print ("ERROR - could not create ticket metric alarm due to exception!") + print ("Exception: " + str(e), flush=True) + return # Trigger the alarm so it cuts the ticket try: diff --git a/codebuild/CanaryWrapper_MetricFunctions.py b/codebuild/CanaryWrapper_MetricFunctions.py index a53d896d..05b1934e 100644 --- a/codebuild/CanaryWrapper_MetricFunctions.py +++ b/codebuild/CanaryWrapper_MetricFunctions.py @@ -12,24 +12,24 @@ def get_metric_total_cpu_usage(psutil_process : psutil.Process): if (psutil_process == None): print ("ERROR - No psutil.process passed! Cannot gather metric!", flush=True) return None - # We always need to skip the first CPU poll on a new process + # We always need to skip the first CPU poll if (cache_cpu_psutil_process != psutil_process): - psutil_process.cpu_percent(interval=None) + psutil.cpu_percent(interval=None) cache_cpu_psutil_process = psutil_process return None - return psutil_process.cpu_percent(interval=None) + return psutil.cpu_percent(interval=None) except Exception as e: print ("ERROR - exception occurred gathering metrics!") print ("Exception: " + str(e), flush=True) return None - +# Note: This value is in BYTES. def get_metric_total_memory_usage_value(psutil_process : psutil.Process): try: if (psutil_process == None): print ("ERROR - No psutil.process passed! Cannot gather metric!", flush=True) return None - return psutil_process.memory_info().rss + return psutil.virtual_memory()[3] except Exception as e: print ("ERROR - exception occurred gathering metrics!") print ("Exception: " + str(e), flush=True) @@ -41,9 +41,8 @@ def get_metric_total_memory_usage_percent(psutil_process : psutil.Process): if (psutil_process == None): print ("ERROR - No psutil.process passed! Cannot gather metric!", flush=True) return None - return psutil_process.memory_percent() + return psutil.virtual_memory()[2] except Exception as e: print ("ERROR - exception occurred gathering metrics!") print ("Exception: " + str(e), flush=True) return None - diff --git a/source/v5/mqtt5_decoder.c b/source/v5/mqtt5_decoder.c index c1840025..b39990a1 100644 --- a/source/v5/mqtt5_decoder.c +++ b/source/v5/mqtt5_decoder.c @@ -284,6 +284,7 @@ static int s_read_connack_property( done: if (result != AWS_OP_SUCCESS) { + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read CONNACK property decode failure", packet_cursor); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -426,6 +427,7 @@ static int s_read_publish_property( done: if (result != AWS_OP_SUCCESS) { + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read PUBLISH property decode failure", packet_cursor); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -589,6 +591,7 @@ static int s_read_puback_property( done: if (result != AWS_OP_SUCCESS) { + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read PUBACK property decode failure", packet_cursor); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -697,6 +700,7 @@ static int s_read_suback_property( done: if (result != AWS_OP_SUCCESS) { + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read SUBACK property decode failure", packet_cursor); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -795,6 +799,7 @@ static int s_read_unsuback_property( done: if (result != AWS_OP_SUCCESS) { + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read UNSUBACK property decode failure", packet_cursor); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -924,6 +929,7 @@ static int s_read_disconnect_property( done: if (result == AWS_OP_ERR) { + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read DISCONNECT property decode failure", packet_cursor); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -1024,6 +1030,7 @@ static int s_aws_mqtt5_decoder_decode_packet(struct aws_mqtt5_decoder *decoder) enum aws_mqtt5_packet_type packet_type = (enum aws_mqtt5_packet_type)(decoder->packet_first_byte >> 4); aws_mqtt5_decoding_fn *decoder_fn = decoder->options.decoder_table->decoders_by_packet_type[packet_type]; if (decoder_fn == NULL) { + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Decoder decode packet function missing!", decoder); return aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } From a62d831485b7f56bc23f845aa9e367fa4e361689 Mon Sep 17 00:00:00 2001 From: Noah Beard Date: Fri, 25 Nov 2022 20:22:03 -0500 Subject: [PATCH 2/3] Do not print the pointer, this is just for debugging anyway --- source/v5/mqtt5_decoder.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/source/v5/mqtt5_decoder.c b/source/v5/mqtt5_decoder.c index b39990a1..2ead2311 100644 --- a/source/v5/mqtt5_decoder.c +++ b/source/v5/mqtt5_decoder.c @@ -284,7 +284,7 @@ static int s_read_connack_property( done: if (result != AWS_OP_SUCCESS) { - AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read CONNACK property decode failure", packet_cursor); + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "Read CONNACK property decode failure"); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -427,7 +427,7 @@ static int s_read_publish_property( done: if (result != AWS_OP_SUCCESS) { - AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read PUBLISH property decode failure", packet_cursor); + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "Read PUBLISH property decode failure"); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -591,7 +591,7 @@ static int s_read_puback_property( done: if (result != AWS_OP_SUCCESS) { - AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read PUBACK property decode failure", packet_cursor); + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "Read PUBACK property decode failure"); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -700,7 +700,7 @@ static int s_read_suback_property( done: if (result != AWS_OP_SUCCESS) { - AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read SUBACK property decode failure", packet_cursor); + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "Read SUBACK property decode failure"); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -799,7 +799,7 @@ static int s_read_unsuback_property( done: if (result != AWS_OP_SUCCESS) { - AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read UNSUBACK property decode failure", packet_cursor); + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "Read UNSUBACK property decode failure"); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -929,7 +929,7 @@ static int s_read_disconnect_property( done: if (result == AWS_OP_ERR) { - AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Read DISCONNECT property decode failure", packet_cursor); + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "Read DISCONNECT property decode failure"); aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } @@ -1030,7 +1030,7 @@ static int s_aws_mqtt5_decoder_decode_packet(struct aws_mqtt5_decoder *decoder) enum aws_mqtt5_packet_type packet_type = (enum aws_mqtt5_packet_type)(decoder->packet_first_byte >> 4); aws_mqtt5_decoding_fn *decoder_fn = decoder->options.decoder_table->decoders_by_packet_type[packet_type]; if (decoder_fn == NULL) { - AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "id=%p: Decoder decode packet function missing!", decoder); + AWS_LOGF_ERROR(AWS_LS_MQTT5_CLIENT, "Decoder decode packet function missing"); return aws_raise_error(AWS_ERROR_MQTT5_DECODE_PROTOCOL_ERROR); } From 4d9f57cc6dd45bef08111e4bdce067822cc30486 Mon Sep 17 00:00:00 2001 From: Noah Beard Date: Fri, 25 Nov 2022 20:37:38 -0500 Subject: [PATCH 3/3] Forgot to add self --- codebuild/CanaryWrapper_Classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebuild/CanaryWrapper_Classes.py b/codebuild/CanaryWrapper_Classes.py index ee5d3c3e..202675df 100644 --- a/codebuild/CanaryWrapper_Classes.py +++ b/codebuild/CanaryWrapper_Classes.py @@ -949,7 +949,7 @@ def stop_monitoring(self): else: self.print_message ("[ApplicationMonitor] ERROR - cannot stop monitor application because no process is found!") - def print_stdout(): + def print_stdout(self): # Print the STDOUT file if (os.path.isfile(self.stdout_file_path)): self.print_message("Just finished Application STDOUT: ")