diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
index ef0586758f..db830ab2c2 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -15,7 +15,6 @@
 # their own benchmark regression config, currenlty place
 # here for lambda
 
-
 COMPILER_BENCHMARK_CONFIG = BenchmarkConfig(
     name="Compiler Benchmark Regression",
     id="compiler_regression",
@@ -44,6 +43,9 @@
         }
         """,
     ),
+    hud_info={
+        "url": "https://hud.pytorch.org/benchmark/compilers",
+    },
     # set baseline from past 7 days using avg, and compare with the last 1 day
     policy=Policy(
         frequency=Frequency(value=1, unit="days"),
@@ -67,7 +69,7 @@
             "compression_ratio": RegressionPolicy(
                 name="compression_ratio",
                 condition="greater_equal",
-                threshold=0.9,
+                threshold=0.95,
                 baseline_aggregation="max",
             ),
         },
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
index c262b35939..0de99b1ce1 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/config_model.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -241,6 +241,7 @@ class BenchmarkConfig:
     id: str
     source: BenchmarkApiSource
     policy: Policy
+    hud_info: Optional[dict[str, Any]] = None
 
 
 @dataclass
diff --git a/aws/lambda/benchmark_regression_summary_report/common/report_manager.py b/aws/lambda/benchmark_regression_summary_report/common/report_manager.py
index e86084c134..79b550c88b 100644
--- a/aws/lambda/benchmark_regression_summary_report/common/report_manager.py
+++ b/aws/lambda/benchmark_regression_summary_report/common/report_manager.py
@@ -12,6 +12,63 @@
     get_regression_status,
     PerGroupResult,
 )
+from jinja2 import Template
+
+
+logger = logging.getLogger()
+REPORT_MD_TEMPLATE = """# Benchmark Report {{ id }}
+config_id: `{{ report_id }}`
+
+We have detected **{{ status }}** in benchmark results for `{{ report_id }}` (id: `{{ id }}`).
+(HUD benchmark regression page coming soon...)
+
+> **Status:** {{ status }} · **Frequency:** {{ frequency }}
+
+## Summary
+| Metric | Value |
+| :-- | --: |
+| Total | {{ summary.total_count | default(0) }} |
+| Regressions | {{ summary.regression_count | default(0) }} |
+| Suspicious | {{ summary.suspicious_count | default(0) }} |
+| No Regression | {{ summary.no_regression_count | default(0) }} |
+| Insufficient Data | {{ summary.insufficient_data_count | default(0) }} |
+
+## Data Windows
+Baseline is a single reference value (e.g., mean, max, min, latest) aggregated from the previous few days,
+used to detect regressions by comparing against metric values in the target window.
+
+### Baseline window (used to calculate baseline value)
+- **Start:** `{{ baseline.start.timestamp | default('') }}` (commit: `{{ baseline.start.commit | default('') }}`)
+- **End:** `{{ baseline.end.timestamp | default('') }}` (commit: `{{ baseline.end.commit | default('') }}`)
+
+### Target window (used to compare against baseline value)
+- **Start:** `{{ target.start.timestamp | default('') }}` (commit: `{{ target.start.commit | default('') }}`)
+- **End:** `{{ target.end.timestamp | default('') }}` (commit: `{{ target.end.commit | default('') }}`)
+
+{% if regression_items and regression_items|length > 0 %}
+## Regression Glance
+{% if url %}
+Use the items below in [HUD]({{ url }}) to see each regression.
+{% endif %}
+
+{% set items = regression_items if regression_items|length <= 10 else regression_items[:10] %}
+{% if regression_items|length > 10 %}
+… (showing first 10 only, total {{ regression_items|length }} regressions)
+{% endif %}
+{% for item in items %}
+{% set kv = item.group_info|dictsort %}
+{{ "" }}|{% for k, _ in kv %}{{ k }} |{% endfor %}{{ "\n" -}}
+|{% for _k, _ in kv %}---|{% endfor %}{{ "\n" -}}
+|{% for _k, v in kv %}{{ v }} |{% endfor %}{{ "\n\n" -}}
+{% if item.baseline_point -%}
+- **baseline**: {{ item.baseline_point.value }},
+- **startTime**: {{ item.baseline_point.timestamp }}, **endTime**: {{ target.end.timestamp }}
+- **lcommit**: `{{ item.baseline_point.commit }}`, **rcommit**: `{{ target.end.commit }}`
+{{ "\n" }}
+{%- endif %}
+{% endfor %}
+{% endif %}
+"""
 
 
 logger = logging.getLogger()
 
@@ -64,10 +121,64 @@ def run(
         main method used to insert the report to db and create github comment in targeted issue
         """
         try:
-            self.insert_to_db(cc)
+            applied_insertion = self.insert_to_db(cc)
         except Exception as e:
             logger.error(f"failed to insert report to db, error: {e}")
             raise
+        if not applied_insertion:
+            logger.info("[%s] skip notification, already exists in db", self.config_id)
+            return
+        self.notify_github_comment(github_token)
+
+    def notify_github_comment(self, github_token: str):
+        if self.status != "regression":
+            logger.info(
+                "[%s] no regression found, skip notification",
+                self.config_id,
+            )
+            return
+
+        github_notification = self.config.policy.get_github_notification_config()
+        if not github_notification:
+            logger.info(
+                "[%s] no github notification config found, skip notification",
+                self.config_id,
+            )
+            return
+        logger.info("[%s] preparing github comment content", self.config_id)
+        content = self._to_markdown()
+        if self.is_dry_run:
+            logger.info(
+                "[%s] dry run, skip sending comment to github, report(%s)",
+                self.config_id,
+                self.id,
+            )
+            logger.info("[dry run] printing comment content")
+            print(json.dumps(content, indent=2, default=str))
+            logger.info("[dry run] Done! Finished printing comment content")
+            return
+        logger.info("[%s] create comment to github issue", self.config_id)
+        github_notification.create_github_comment(content, github_token)
+        logger.info("[%s] done. comment is sent to github", self.config_id)
+
+    def _to_markdown(self):
+        self.regression_items = self._collect_regression_items()
+        url = ""
+        if self.config.hud_info:
+            url = self.config.hud_info.get("url", "")
+
+        md = Template(REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True).render(
+            id=self.id,
+            url=url,
+            status=self.status,
+            report_id=self.config_id,
+            summary=self.report["summary"],
+            baseline=self.baseline,
+            target=self.target,
+            frequency=self.config.policy.frequency.get_text(),
+            regression_items=self.regression_items,
+        )
+        return md
 
     def _collect_regression_items(self) -> list[PerGroupResult]:
         items = []
diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
index 75212dd980..6d0998d8c7 100644
--- a/aws/lambda/benchmark_regression_summary_report/lambda_function.py
+++ b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -18,7 +18,9 @@
 from dateutil.parser import isoparse
 
 
+# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
 BENCHMARK_REGRESSION_REPORT_TABLE = "fortesting.benchmark_regression_report"
+BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]
 
 logging.basicConfig(
     level=logging.INFO,
@@ -33,9 +35,6 @@
     "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""),
 }
 
-# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
-BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]
-
 def format_ts_with_t(ts: int) -> str:
     return dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).strftime(
@@ -54,7 +53,6 @@ def get_clickhouse_client(
         return clickhouse_connect.get_client(
             host=host, user=user, password=password, secure=True, verify=False
         )
-
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
     )
@@ -77,8 +75,10 @@ def __init__(
         config_id: str,
         end_time: int,
         is_dry_run: bool = False,
+        is_pass_check: bool = False,
     ) -> None:
         self.is_dry_run = is_dry_run
+        self.is_pass_check = is_pass_check
         self.config_id = config_id
         self.end_time = end_time
@@ -127,6 +127,7 @@ def process(
         should_generate = self._should_generate_report(
             cc, self.end_time, self.config_id, report_freq
         )
+
         if not should_generate:
             self.log_info(
                 "Skip generate report",
@@ -138,7 +139,6 @@
             f"with frequency {report_freq.get_text()}..."
         )
 
-        self.log_info("get target data")
        target, ls, le = self.get_target(config, self.end_time)
         if not target:
             self.log_info(
@@ -157,12 +157,11 @@
         regression_report = generator.generate()
         if self.is_dry_run:
             print(json.dumps(regression_report, indent=2, default=str))
-            return
-
         reportManager = ReportManager(
             config=config,
             regression_report=regression_report,
             db_table_name=BENCHMARK_REGRESSION_REPORT_TABLE,
+            is_dry_run=self.is_dry_run,
         )
         reportManager.run(cc, ENVS["GITHUB_TOKEN"])
         return
@@ -172,7 +171,8 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
         target_s = end_time - data_range.comparison_timedelta_s()
         target_e = end_time
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}]"
+            "getting target data for time range "
+            f"[{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}] ..."
         )
         target_data = self._fetch_from_benchmark_ts_api(
             config_id=config.id,
@@ -181,7 +181,7 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
             source=config.source,
         )
         self.log_info(
-            f"found {len(target_data.time_series)} # of data, with time range {target_data.time_range}",
+            f"done. found {len(target_data.time_series)} data groups, with time range {target_data.time_range}",
         )
         if not target_data.time_range or not target_data.time_range.end:
             return None, target_s, target_e
@@ -196,7 +196,8 @@ def get_baseline(self, config: BenchmarkConfig, end_time: int):
         baseline_s = end_time - data_range.total_timedelta_s()
         baseline_e = end_time - data_range.comparison_timedelta_s()
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
+            "getting baseline data for time range "
+            f"[{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}] ..."
         )
         # fetch baseline from api
         raw_data = self._fetch_from_benchmark_ts_api(
@@ -207,11 +208,7 @@
         )
 
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
-        )
-
-        self.log_info(
-            f"found {len(raw_data.time_series)} # of data, with time range {raw_data.time_range}",
+            f"done. found {len(raw_data.time_series)} data groups, with time range {raw_data.time_range}",
         )
 
         baseline_latest_ts = int(isoparse(raw_data.time_range.end).timestamp())
@@ -269,11 +266,8 @@
             )
             elapsed_ms = (time.perf_counter() - t0) * 1000.0
 
-            logger.info(
-                "[%s] call OK in %.1f ms (query_len=%d)",
-                config_id,
-                elapsed_ms,
-                len(query),
+            self.log_info(
+                f"call OK in {elapsed_ms:.1f} ms (query_len={len(query)})",
             )
             return resp.data
         except requests.exceptions.HTTPError as e:
@@ -290,7 +284,7 @@
                 else str(e)
             )
             self.log_error(
-                f"[{config_id}] call FAILED in {elapsed_ms} ms: {err_msg}",
+                f"call FAILED in {elapsed_ms:.1f} ms: {err_msg}",
             )
             raise
@@ -348,6 +342,12 @@ def _get_latest_record_ts(
             f"time_boundary({format_ts_with_t(time_boundary)})"
             f"based on latest_record_ts({format_ts_with_t(latest_record_ts)})",
         )
+        # when dry_run and is_pass_check are both True, allow generating the report even if the time check is not met
+        if self.is_dry_run and self.is_pass_check:
+            should_generate = True
+            self.log_info(
+                f"[{f.get_text()}] dry_run and is_pass_check are True, force generating report for print only",
+            )
         return should_generate
@@ -357,7 +357,12 @@ def main(
     args: Optional[argparse.Namespace] = None,
     *,
     is_dry_run: bool = False,
+    is_forced: bool = False,
 ):
+    if not is_dry_run and is_forced:
+        is_forced = False
+        logger.info("force mode is only allowed together with dry-run; disabling force")
+
     if not github_access_token:
         raise ValueError("Missing environment variable GITHUB_TOKEN")
@@ -378,7 +383,10 @@
     # caution, raise exception may lead lambda to retry
     try:
         processor = BenchmarkSummaryProcessor(
-            config_id=config_id, end_time=end_time_ts, is_dry_run=is_dry_run
+            config_id=config_id,
+            end_time=end_time_ts,
+            is_dry_run=is_dry_run,
+            is_pass_check=is_forced,
         )
         processor.process(args=args)
     except Exception as e:
@@ -419,6 +427,12 @@ def parse_args() -> argparse.Namespace:
         action="store_false",
         help="Disable dry-run mode",
     )
+    parser.add_argument(
+        "--force",
+        dest="force",
+        action="store_true",
+        help="Enable force mode; only allowed when dry-run is enabled",
+    )
     parser.add_argument(
         "--config-id",
         type=str,
@@ -466,6 +480,7 @@ def local_run() -> None:
     main(
         github_access_token=args.github_access_token,
         args=args,
         is_dry_run=args.dry_run,
+        is_forced=args.force,
     )
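
For reference, the "Regression Glance" section of REPORT_MD_TEMPLATE emits one small markdown table per regression item by sorting each item's group_info with jinja2's dictsort filter. Below is a minimal, self-contained sketch of that pattern; the items list is a hypothetical stand-in for PerGroupResult rows, not data from the actual report pipeline.

from jinja2 import Template

# Hypothetical stand-in for PerGroupResult.group_info entries.
items = [
    {"group_info": {"suite": "torchbench", "model": "resnet50", "metric": "speedup"}},
]

# Same pattern as REPORT_MD_TEMPLATE: sort group_info by key, then emit a
# one-row markdown table (header, separator, values) per regression item.
snippet = (
    "{% for item in items %}"
    "{% set kv = item.group_info|dictsort %}"
    "|{% for k, _ in kv %}{{ k }} |{% endfor %}\n"
    "|{% for _k, _ in kv %}---|{% endfor %}\n"
    "|{% for _k, v in kv %}{{ v }} |{% endfor %}\n"
    "{% endfor %}"
)

print(Template(snippet).render(items=items))
# |metric |model |suite |
# |---|---|---|
# |speedup |resnet50 |torchbench |

The full template in ReportManager is rendered with trim_blocks=True and lstrip_blocks=True to control block whitespace; this sketch keeps each tag inline on a single line, so it needs neither option.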