diff --git a/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py b/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py index 002282d594763..da01711ebb16c 100644 --- a/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py +++ b/providers/amazon/src/airflow/providers/amazon/aws/hooks/glue.py @@ -565,7 +565,13 @@ def _log_results(self, result: dict[str, Any]) -> None: Rule_3 ColumnLength "marketplace" between 1 and 2 FAIL {'Column.marketplace.MaximumLength': 9.0, 'Column.marketplace.MinimumLength': 3.0} Value: 9.0 does not meet the constraint requirement! """ - import pandas as pd + try: + import pandas as pd + except ImportError: + self.log.warning( + "Pandas is not installed. Please install pandas to see the detailed Data Quality results." + ) + return pd.set_option("display.max_rows", None) pd.set_option("display.max_columns", None) diff --git a/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py b/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py index df3e98401ae74..9043f3cc5c314 100644 --- a/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py +++ b/providers/amazon/tests/unit/amazon/aws/hooks/test_glue.py @@ -684,6 +684,90 @@ def test_validate_evaluation_results(self, mock_conn, caplog): "AWS Glue data quality ruleset evaluation run, total number of rules failed: 0" ] + @mock.patch.object(AwsBaseHook, "conn") + def test_validate_evaluation_results_show_results_True(self, mock_conn, caplog): + response_evaluation_run = {"RunId": self.RUN_ID, "ResultIds": ["resultId1"]} + + response_batch_result = { + "RunId": self.RUN_ID, + "ResultIds": ["resultId1"], + "Results": [ + { + "ResultId": "resultId1", + "RulesetName": "rulesetOne", + "RuleResults": [ + { + "Name": "Rule_1", + "Description": "RowCount between 150000 and 600000", + "EvaluatedMetrics": {"Dataset.*.RowCount": 300000.0}, + "Result": "PASS", + } + ], + } + ], + } + mock_conn.get_data_quality_ruleset_evaluation_run.return_value = response_evaluation_run + + mock_conn.batch_get_data_quality_result.return_value = response_batch_result + + with caplog.at_level(logging.INFO, logger=self.glue.log.name): + caplog.clear() + self.glue.validate_evaluation_run_results(evaluation_run_id=self.RUN_ID, show_results=True) + + mock_conn.get_data_quality_ruleset_evaluation_run.assert_called_once_with(RunId=self.RUN_ID) + mock_conn.batch_get_data_quality_result.assert_called_once_with( + ResultIds=response_evaluation_run["ResultIds"] + ) + # The messages have extra spaces to create spacing in the output, the number of consecutive spaces + # may vary. Remove any sequence of spaces greater than 1 before asserting. + messages = [" ".join(msg.split()) for msg in caplog.messages] + assert messages == [ + "AWS Glue data quality ruleset evaluation result for RulesetName: rulesetOne RulesetEvaluationRunId: None Score: None", + "Name Description EvaluatedMetrics Result 0 Rule_1 RowCount between 150000 and 600000 {'Dataset.*.RowCount': 300000.0} PASS", + "AWS Glue data quality ruleset evaluation run, total number of rules failed: 0", + ] + + @mock.patch.object(AwsBaseHook, "conn") + def test_validate_evaluation_results_show_results_True_no_pandas(self, mock_conn, caplog): + response_evaluation_run = {"RunId": self.RUN_ID, "ResultIds": ["resultId1"]} + + response_batch_result = { + "RunId": self.RUN_ID, + "ResultIds": ["resultId1"], + "Results": [ + { + "ResultId": "resultId1", + "RulesetName": "rulesetOne", + "RuleResults": [ + { + "Name": "Rule_1", + "Description": "RowCount between 150000 and 600000", + "EvaluatedMetrics": {"Dataset.*.RowCount": 300000.0}, + "Result": "PASS", + } + ], + } + ], + } + mock_conn.get_data_quality_ruleset_evaluation_run.return_value = response_evaluation_run + + mock_conn.batch_get_data_quality_result.return_value = response_batch_result + + # Emulate/mock the import of pandas failing with ModlueNotFoundError + with mock.patch.dict("sys.modules", {"pandas": None}): + with caplog.at_level(logging.INFO, logger=self.glue.log.name): + caplog.clear() + self.glue.validate_evaluation_run_results(evaluation_run_id=self.RUN_ID, show_results=True) + + mock_conn.get_data_quality_ruleset_evaluation_run.assert_called_once_with(RunId=self.RUN_ID) + mock_conn.batch_get_data_quality_result.assert_called_once_with( + ResultIds=response_evaluation_run["ResultIds"] + ) + assert caplog.messages == [ + "Pandas is not installed. Please install pandas to see the detailed Data Quality results.", + "AWS Glue data quality ruleset evaluation run, total number of rules failed: 0", + ] + @mock.patch.object(AwsBaseHook, "conn") def test_validate_evaluation_results_should_fail_when_any_rules_failed(self, mock_conn, caplog): response_batch_result = {