Merged

Commits (42)
75c7da7
Update
yangw-dev Sep 4, 2025
73a3e51
Update (base update)
yangw-dev Sep 4, 2025
20cb745
Update
yangw-dev Sep 4, 2025
e9e4444
Update (base update)
yangw-dev Sep 4, 2025
206396c
Update
yangw-dev Sep 4, 2025
f792a3f
Update (base update)
yangw-dev Sep 4, 2025
4825bfa
Update
yangw-dev Sep 4, 2025
2e27f44
Update (base update)
yangw-dev Sep 4, 2025
75b6dca
Update
yangw-dev Sep 4, 2025
a0a53db
Update (base update)
yangw-dev Sep 4, 2025
f5d15f7
Update
yangw-dev Sep 4, 2025
d73c464
Update
yangw-dev Sep 4, 2025
bd61cde
Update
yangw-dev Sep 4, 2025
429d591
Update
yangw-dev Sep 4, 2025
e6f975a
Update
yangw-dev Sep 4, 2025
cfe4ac0
Update
yangw-dev Sep 4, 2025
a181938
Update
yangw-dev Sep 4, 2025
26d1ae2
Update
yangw-dev Sep 4, 2025
d1130ec
Update (base update)
yangw-dev Sep 4, 2025
c97fa59
Update
yangw-dev Sep 4, 2025
ef68c2d
Update (base update)
yangw-dev Sep 4, 2025
99b810b
Update
yangw-dev Sep 5, 2025
b69950d
Update (base update)
yangw-dev Sep 5, 2025
366011a
Update
yangw-dev Sep 5, 2025
223fa02
Update
yangw-dev Sep 5, 2025
5aa529e
Update
yangw-dev Sep 5, 2025
96d0d8f
Update
yangw-dev Sep 5, 2025
1d0ca44
Update
yangw-dev Sep 5, 2025
d0cf57b
Update
yangw-dev Sep 5, 2025
9f90749
Update (base update)
yangw-dev Sep 6, 2025
d99d17e
Update
yangw-dev Sep 6, 2025
a78a422
Update (base update)
yangw-dev Sep 6, 2025
9951e0e
Update
yangw-dev Sep 6, 2025
cfa7d93
Update (base update)
yangw-dev Sep 8, 2025
f1871ca
Update
yangw-dev Sep 8, 2025
18c2dfc
Update (base update)
yangw-dev Sep 9, 2025
6eba598
Update
yangw-dev Sep 9, 2025
6e77e83
Update
yangw-dev Sep 9, 2025
df968dc
Merge branch 'main' into gh/yangw-dev/5/head
yangw-dev Sep 10, 2025
6de2b38
Merge branch 'main' into gh/yangw-dev/5/head
yangw-dev Sep 10, 2025
51f1cd5
Update report_manager.py
yangw-dev Sep 10, 2025
7795d9c
Update report_manager.py
yangw-dev Sep 10, 2025
@@ -15,7 +15,6 @@
# their own benchmark regression config; currently placed
# here for the lambda


COMPILER_BENCHMARK_CONFIG = BenchmarkConfig(
name="Compiler Benchmark Regression",
id="compiler_regression",
@@ -44,6 +43,9 @@
}
""",
),
hud_info={
"url": "https://hud.pytorch.org/benchmark/compilers",
},
# set baseline from past 7 days using avg, and compare with the last 1 day
policy=Policy(
frequency=Frequency(value=1, unit="days"),
@@ -67,7 +69,7 @@
"compression_ratio": RegressionPolicy(
name="compression_ratio",
condition="greater_equal",
threshold=0.9,
threshold=0.95,
baseline_aggregation="max",
),
},
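The tightened policy above raises the bar for `compression_ratio` from 0.9 to 0.95 of the aggregated baseline. A minimal sketch of the check this configuration appears to describe (the actual evaluator lives elsewhere in this package; scaling the threshold by the aggregated baseline is an assumption):

```python
# Illustrative only: one plausible reading of condition="greater_equal",
# threshold=0.95, baseline_aggregation="max" (not the PR's actual evaluator).
def is_regression(target_values: list[float], baseline_values: list[float]) -> bool:
    baseline = max(baseline_values)  # baseline_aggregation="max"
    # condition="greater_equal": target values must stay >= threshold * baseline
    return any(v < 0.95 * baseline for v in target_values)
```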
@@ -241,6 +241,7 @@ class BenchmarkConfig:
id: str
source: BenchmarkApiSource
policy: Policy
hud_info: Optional[dict[str, Any]] = None


@dataclass
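The new optional `hud_info` field defaults to `None`, so existing configs keep working. A hedged sketch of setting it and reading it back the way `_to_markdown` below does (`source` and `policy` are elided with `...`, and the dataclass may require more fields than shown):

```python
# Sketch: attach a HUD link to a config and read it back defensively.
config = BenchmarkConfig(
    name="Compiler Benchmark Regression",
    id="compiler_regression",
    source=...,  # BenchmarkApiSource, as in COMPILER_BENCHMARK_CONFIG above
    policy=...,  # Policy, as in COMPILER_BENCHMARK_CONFIG above
    hud_info={"url": "https://hud.pytorch.org/benchmark/compilers"},
)
url = config.hud_info.get("url", "") if config.hud_info else ""
```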
@@ -12,6 +12,63 @@
get_regression_status,
PerGroupResult,
)
from jinja2 import Template


logger = logging.getLogger()
REPORT_MD_TEMPLATE = """# Benchmark Report {{ id }}
config_id: `{{ report_id }}`

We have detected **{{ status }}** in benchmark results for `{{ report_id }}` (id: `{{ id }}`).
(HUD benchmark regression page coming soon...)

> **Status:** {{ status }} · **Frequency:** {{ frequency }}

## Summary
| Metric | Value |
| :-- | --: |
| Total | {{ summary.total_count | default(0) }} |
| Regressions | {{ summary.regression_count | default(0) }} |
| Suspicious | {{ summary.suspicious_count | default(0) }} |
| No Regression | {{ summary.no_regression_count | default(0) }} |
| Insufficient Data | {{ summary.insufficient_data_count | default(0) }} |

## Data Windows
Baseline is a single reference value (e.g., mean, max, min, latest) aggregated from the previous few days,
used to detect regressions by comparing against metric values in the target window.

### Baseline window (used to calculate baseline value)
- **Start:** `{{ baseline.start.timestamp | default('') }}` (commit: `{{ baseline.start.commit | default('') }}`)
- **End:** `{{ baseline.end.timestamp | default('') }}` (commit: `{{ baseline.end.commit | default('') }}`)

### Target window (used to compare against baseline value)
- **Start:** `{{ target.start.timestamp | default('') }}` (commit: `{{ target.start.commit | default('') }}`)
- **End:** `{{ target.end.timestamp | default('') }}` (commit: `{{ target.end.commit | default('') }}`)

{% if regression_items and regression_items|length > 0 %}
## Regressions at a Glance
{% if url %}
Use the items below in [HUD]({{ url }}) to inspect each regression.
{% endif %}

{% set items = regression_items if regression_items|length <= 10 else regression_items[:10] %}
{% if regression_items|length > 10 %}
… (showing first 10 only, total {{ regression_items|length }} regressions)
{% endif %}
{% for item in items %}
{% set kv = item.group_info|dictsort %}
{{ "" }}|{% for k, _ in kv %}{{ k }} |{% endfor %}{{ "\n" -}}
|{% for _k, _ in kv %}---|{% endfor %}{{ "\n" -}}
|{% for _k, v in kv %}{{ v }} |{% endfor %}{{ "\n\n" -}}
{% if item.baseline_point -%}
- **baseline**: {{ item.baseline_point.value }},
- **startTime**: {{ item.baseline_point.timestamp }}, **endTime**: {{ target.end.timestamp }}
- **lcommit**: `{{ item.baseline_point.commit }}`, **rcommit**: `{{ target.end.commit }}`
{{ "\n" }}
{%- endif %}
{% endfor %}
{% endif %}
"""


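For a feel of the output, the template can be rendered standalone; the sample values below are invented for illustration and only mirror the shapes that `_to_markdown` passes in:

```python
# Standalone render of REPORT_MD_TEMPLATE with made-up data (illustration only).
from jinja2 import Template

sample = {
    "id": "report-001",  # hypothetical report id
    "report_id": "compiler_regression",
    "status": "regression",
    "frequency": "1 days",
    "url": "https://hud.pytorch.org/benchmark/compilers",
    "summary": {"total_count": 12, "regression_count": 2},
    "baseline": {"start": {}, "end": {}},
    "target": {"start": {}, "end": {}},
    "regression_items": [],
}
print(Template(REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True).render(**sample))
```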
@@ -64,10 +121,64 @@ def run(
main method used to insert the report into the db and create a github comment in the targeted issue
"""
try:
self.insert_to_db(cc)
applied_insertion = self.insert_to_db(cc)
except Exception as e:
logger.error(f"failed to insert report to db, error: {e}")
raise
if not applied_insertion:
logger.info("[%s] skip notification, already exists in db", self.config_id)
return
self.notify_github_comment(github_token)

def notify_github_comment(self, github_token: str):
if self.status != "regression":
logger.info(
"[%s] no regression found, skip notification",
self.config_id,
)
return

github_notification = self.config.policy.get_github_notification_config()
if not github_notification:
logger.info(
"[%s] no github notification config found, skip notification",
self.config_id,
)
return
logger.info("[%s] prepareing gitub comment content", self.config_id)
content = self._to_markdown()
if self.is_dry_run:
logger.info(
"[%s]dry run, skip sending comment to github, report(%s)",
self.config_id,
self.id,
)
logger.info("[dry run] printing comment content")
print(json.dumps(content, indent=2, default=str))
logger.info("[dry run] Done! Finish printing comment content")
return
logger.info("[%s] create comment to github issue", self.config_id)
github_notification.create_github_comment(content, github_token)
logger.info("[%s] done. comment is sent to github", self.config_id)

def _to_markdown(self):
self.regression_items = self._collect_regression_items()
url = ""
if self.config.hud_info:
url = self.config.hud_info.get("url", "")

md = Template(REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True).render(
id=self.id,
url=url,
status=self.status,
report_id=self.config_id,
summary=self.report["summary"],
baseline=self.baseline,
target=self.target,
frequency=self.config.policy.frequency.get_text(),
regression_items=self.regression_items,
)
return md

def _collect_regression_items(self) -> list[PerGroupResult]:
items = []
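Taken together, `run()` now inserts first and only notifies when the insert actually landed, which dedupes repeated lambda invocations. A hedged usage sketch mirroring the call site in lambda_function.py below:

```python
# Sketch: insert-then-notify flow; nothing is posted if the report row
# already existed, or if status != "regression", or in dry-run mode.
manager = ReportManager(
    config=config,
    regression_report=regression_report,
    is_dry_run=True,  # dry runs print the rendered markdown instead of commenting
)
manager.run(cc, github_token)
```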
59 changes: 37 additions & 22 deletions aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -18,7 +18,9 @@
from dateutil.parser import isoparse


# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
BENCHMARK_REGRESSION_REPORT_TABLE = "fortesting.benchmark_regression_report"
BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]

logging.basicConfig(
level=logging.INFO,
@@ -33,9 +35,6 @@
"CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""),
}

# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]


def format_ts_with_t(ts: int) -> str:
return dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).strftime(
@@ -54,7 +53,6 @@ def get_clickhouse_client(
return clickhouse_connect.get_client(
host=host, user=user, password=password, secure=True, verify=False
)

return clickhouse_connect.get_client(
host=host, user=user, password=password, secure=True
)
@@ -77,8 +75,10 @@ def __init__(
config_id: str,
end_time: int,
is_dry_run: bool = False,
is_pass_check: bool = False,
) -> None:
self.is_dry_run = is_dry_run
self.is_pass_check = is_pass_check
self.config_id = config_id
self.end_time = end_time

@@ -127,6 +127,7 @@ def process(
should_generate = self._should_generate_report(
cc, self.end_time, self.config_id, report_freq
)

if not should_generate:
self.log_info(
"Skip generate report",
@@ -138,7 +139,6 @@
f"with frequency {report_freq.get_text()}..."
)

self.log_info("get target data")
target, ls, le = self.get_target(config, self.end_time)
if not target:
self.log_info(
@@ -157,12 +157,11 @@
regression_report = generator.generate()
if self.is_dry_run:
print(json.dumps(regression_report, indent=2, default=str))
return

reportManager = ReportManager(
config=config,
regression_report=regression_report,
db_table_name=BENCHMARK_REGRESSION_REPORT_TABLE,
is_dry_run=self.is_dry_run,
)
reportManager.run(cc, ENVS["GITHUB_TOKEN"])
return
@@ -172,7 +171,8 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
target_s = end_time - data_range.comparison_timedelta_s()
target_e = end_time
self.log_info(
f"get baseline data for time range [{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}]"
"getting target data for time range "
f"[{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}] ..."
)
target_data = self._fetch_from_benchmark_ts_api(
config_id=config.id,
Expand All @@ -181,7 +181,7 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
source=config.source,
)
self.log_info(
f"found {len(target_data.time_series)} # of data, with time range {target_data.time_range}",
f"done. found {len(target_data.time_series)} # of data groups, with time range {target_data.time_range}",
)
if not target_data.time_range or not target_data.time_range.end:
return None, target_s, target_e
@@ -196,7 +198,8 @@ def get_baseline(self, config: BenchmarkConfig, end_time: int):
baseline_s = end_time - data_range.total_timedelta_s()
baseline_e = end_time - data_range.comparison_timedelta_s()
self.log_info(
f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
"getting baseline data for time range "
f"[{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}] ..."
)
# fetch baseline from api
raw_data = self._fetch_from_benchmark_ts_api(
@@ -207,11 +208,7 @@
)

self.log_info(
f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
)

self.log_info(
f"found {len(raw_data.time_series)} # of data, with time range {raw_data.time_range}",
f"Done. found {len(raw_data.time_series)} # of data, with time range {raw_data.time_range}",
)

baseline_latest_ts = int(isoparse(raw_data.time_range.end).timestamp())
@@ -269,11 +266,8 @@ def _fetch_from_benchmark_ts_api(
)

elapsed_ms = (time.perf_counter() - t0) * 1000.0
logger.info(
"[%s] call OK in %.1f ms (query_len=%d)",
config_id,
elapsed_ms,
len(query),
self.log_info(
f"call OK in {elapsed_ms} ms (query_len={len(query)})",
)
return resp.data
except requests.exceptions.HTTPError as e:
@@ -290,7 +284,7 @@
else str(e)
)
self.log_error(
f"[{config_id}] call FAILED in {elapsed_ms} ms: {err_msg}",
f"call FAILED in {elapsed_ms} ms: {err_msg}",
)
raise

@@ -348,6 +342,12 @@ def _get_latest_record_ts(
f"time_boundary({format_ts_with_t(time_boundary)})"
f"based on latest_record_ts({format_ts_with_t(latest_record_ts)})",
)
# when both dry_run and is_pass_check are True, generate the report even if the time check is not met
if self.is_dry_run and self.is_pass_check:
should_generate = True
self.log_info(
f"[{f.get_text()}] dry_run is True, is_pass_check is True, force generate report for print only",
)
return should_generate


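Distilled, the gate above reduces to a single predicate; a standalone restatement (names are illustrative, the real logic stays inline in the method):

```python
def should_report(time_check_passed: bool, is_dry_run: bool, is_pass_check: bool) -> bool:
    # The frequency/time check can only be bypassed when BOTH dry-run and
    # pass-check are enabled; main() below clears the force flag otherwise.
    return time_check_passed or (is_dry_run and is_pass_check)
```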
@@ -357,7 +357,12 @@ def main(
args: Optional[argparse.Namespace] = None,
*,
is_dry_run: bool = False,
is_forced: bool = False,
):
if not is_dry_run and is_forced:
is_forced = False
logger.info("is_dry_run is False, force must be disabled, this is not allowed")

if not github_access_token:
raise ValueError("Missing environment variable GITHUB_TOKEN")

@@ -378,7 +383,10 @@
# caution, raise exception may lead lambda to retry
try:
processor = BenchmarkSummaryProcessor(
config_id=config_id, end_time=end_time_ts, is_dry_run=is_dry_run
config_id=config_id,
end_time=end_time_ts,
is_dry_run=is_dry_run,
is_pass_check=is_forced,
)
processor.process(args=args)
except Exception as e:
@@ -419,6 +427,12 @@ def parse_args() -> argparse.Namespace:
action="store_false",
help="Disable dry-run mode",
)
parser.add_argument(
"--force",
dest="force",
action="store_true",
help="Enable force mode, this only allowed when dry-run is enabled",
)
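# Usage note (illustrative): dry-run is enabled by default, so a local
# invocation that bypasses the frequency gate could look like:
#   python lambda_function.py --config-id compiler_regression --force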
parser.add_argument(
"--config-id",
type=str,
@@ -466,6 +480,7 @@ def local_run() -> None:
github_access_token=args.github_access_token,
args=args,
is_dry_run=args.dry_run,
is_forced=args.force,
)

