Benchmark Comparison & Alarm Regression #2
name: Benchmark Comparison & Alarm Regression

on:
  workflow_run:
    workflows: ["Production"]
    types: [completed]

permissions:
  contents: read
  actions: read
  pull-requests: write
  checks: write

jobs:
  comment-if-regressed:
    runs-on: ubuntu-latest
    if: >
      github.event.workflow_run.event == 'pull_request' &&
      contains(fromJson('["success","neutral"]'), github.event.workflow_run.conclusion)
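    # The steps below download the triggering run's benchmark artifacts, compare them against
    # recent baselines stored in W&B, and publish the outcome as a PR check (plus a PR comment
    # when something regressed).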
    steps:
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install deps
        run: |
          python -m pip install --quiet --upgrade wandb frozendict

      - name: Download artifacts from triggering run
        id: dl
        uses: actions/download-artifact@v4
        with:
          pattern: speed-test-*
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
          path: ./artifacts

      - name: Show downloaded files
        run: |
          echo "Downloaded into ${{ steps.dl.outputs.download-path }}"
          ls -la ${{ steps.dl.outputs.download-path }} || true
          (command -v tree >/dev/null && tree -a ${{ steps.dl.outputs.download-path }}) || true
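      # Compare the fresh results against W&B baselines: this step writes check_output.md and
      # one CSV per metric, and exports CHECK_OUTPUT / HAS_REGRESSIONS / HAS_ALERTS via GITHUB_ENV.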
      - name: Check regressions + build outputs
        id: analyze
        env:
          # Note that secrets are not passed to workflows that are triggered by a pull request from a fork
          # --- W&B ---
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          WANDB_ENTITY: genesis-ai-company
          WANDB_PROJECT: genesis-benchmarks
          WANDB_SILENT: "true"
          # --- Parameters ---
          MAX_VALID_REVISIONS: 5
          MAX_FETCH_REVISIONS: 40
          RUNTIME_REGRESSION_TOLERANCE_PCT: 10
          COMPILE_REGRESSION_TOLERANCE_PCT: 10
          # Input/Output paths
          ARTIFACTS_DIR: ${{ steps.dl.outputs.download-path }}
          CHECK_BODY_PATH: check_output.md
          CSV_RUNTIME_PATH: runtime_fps.csv
          CSV_COMPILE_PATH: compile_time.csv
          EXIT_CODE_REGRESSION: 42
          EXIT_CODE_ALERT: 43
        run: |
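          # Run the inline analysis script below; its exit code is captured in EXIT_CODE while
          # the trailing `|| true` keeps the step from failing, so results can be reported as a
          # PR check instead of a hard job failure.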
          { python - << 'PY'; EXIT_CODE=$?; } || true
          import os, sys, json, math, statistics
          import csv
          import wandb
          from frozendict import frozendict
          from pathlib import Path
          # ----- arguments -----
          MAX_VALID_REVISIONS = int(os.environ["MAX_VALID_REVISIONS"])
          MAX_FETCH_REVISIONS = int(os.environ["MAX_FETCH_REVISIONS"])
          METRICS_TOL = {
              "runtime_fps": float(os.environ["RUNTIME_REGRESSION_TOLERANCE_PCT"]),
              "compile_time": float(os.environ["COMPILE_REGRESSION_TOLERANCE_PCT"]),
          }
          artifacts_dir = Path(os.environ["ARTIFACTS_DIR"]).expanduser().resolve()
          check_body_path = Path(os.environ["CHECK_BODY_PATH"]).expanduser()
          csv_files = {
              "runtime_fps": Path(os.environ["CSV_RUNTIME_PATH"]).expanduser().resolve(),
              "compile_time": Path(os.environ["CSV_COMPILE_PATH"]).expanduser().resolve(),
          }
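          # The helpers below parse the artifact summaries, which are lines of "key=value"
          # fields separated by "|" (hypothetical example, field names for illustration only):
          #   solver=Newton | n_envs=4096 | compile_time=12.34 | runtime_fps=123456.0 | realtime_factor=2.10
          # Any field that is not one of the metric keys is treated as a benchmark-ID parameter.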
          # ---------- helpers ----------
          METRIC_KEYS = ("compile_time", "runtime_fps", "realtime_factor")

          def parse_benchmark_id(bid: str) -> dict:
              kv = {}
              if bid:
                  for token in bid.split("-"):
                      token = token.strip()
                      if token and "=" in token:
                          k, v = token.split("=", 1)
                          kv[k.strip()] = v.strip()
              return kv

          def normalize_benchmark_id(bid: str) -> frozendict:
              return frozendict(parse_benchmark_id(bid))

          def get_param_names(bids: tuple[tuple[str, ...], ...]) -> tuple[str, ...]:
              """
              Merge a sequence of key tuples into a single tuple that:
              - Preserves the relative order of keys within each tuple
              - Gives precedence to later tuples when conflicts arise
              """
              merged = list(bids[-1])
              merged_set = set(merged)
              for tup in bids[:-1]:
                  for key in tup:
                      if key not in merged_set:
                          merged.append(key)
                          merged_set.add(key)
              return tuple(merged)

          def sort_key(d):
              # Sort by parameter values, pushing benchmarks that lack a parameter to the end
              key_list = []
              for col in params_name:
                  if col in d:
                      key_list.append((0, d[col]))
                  else:
                      key_list.append((1, None))
              return key_list

          def artifacts_parse_csv_summary(current_txt_path):
              # Metric fields are popped out of each line; the remaining "key=value" fields
              # form the benchmark ID.
              out = {}
              for line in current_txt_path.read_text().splitlines():
                  kv = dict(map(str.strip, p.split("=", 1)) for p in line.split("|") if "=" in p)
                  if not kv:
                      # Skip empty or malformed lines
                      continue
                  record = {}
                  for k in METRIC_KEYS:
                      try:
                          record[k] = float(kv.pop(k))
                      except (ValueError, TypeError, KeyError):
                          pass
                  nbid = frozendict(kv)
                  out[nbid] = record
              return out

          def fmt_num(v, is_int: bool):
              return f"{int(v):,}" if is_int else f"{v:.2f}"
          # ----- load artifacts (current results) -----
          current_csv_paths = list(artifacts_dir.rglob("speed_test*.txt"))
          if not current_csv_paths:
              # Nothing to compare: write an empty report and exit gracefully
              check_body_path.touch()
              sys.exit(0)
          current_bm = {}
          for csv_path in current_csv_paths:
              current_bm |= artifacts_parse_csv_summary(csv_path)
          bids_set = frozenset(current_bm.keys())
          assert bids_set
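          # current_bm maps each normalized benchmark ID (a frozendict of parameters) to the
          # metrics measured for this PR; baselines for the same IDs are fetched from W&B next.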
          # ----- W&B baselines -----
          if not os.environ.get("WANDB_API_KEY"):
              # The secret may be empty if it is not configured: skip the comparison gracefully
              print("WANDB_API_KEY is not set")
              sys.exit(0)
          ENTITY = os.environ["WANDB_ENTITY"]
          PROJECT = os.environ["WANDB_PROJECT"]
          api = wandb.Api()
          runs_iter = api.runs(f"{ENTITY}/{PROJECT}", order="-created_at")
          revs = set()
          records_by_rev = {}
          for run in runs_iter:
              # Abort if still not complete after checking enough runs.
              # This would happen if a new benchmark has been added and not enough past data is available yet.
              if len(revs) == MAX_FETCH_REVISIONS:
                  break
              # Stop early if enough complete records have been collected
              records_is_complete = [bids_set.issubset(record.keys()) for record in records_by_rev.values()]
              if sum(records_is_complete) == MAX_VALID_REVISIONS:
                  break
              # Load config and summary, with support for legacy runs
              config, summary = run.config, run.summary
              if isinstance(config, str):
                  config = {k: v["value"] for k, v in json.loads(run.config).items() if not k.startswith("_")}
              if isinstance(summary._json_dict, str):
                  summary = json.loads(summary._json_dict)
              # Extract revision commit and branch
              try:
                  rev, branch = config["revision"].split("@", 1)
                  revs.add(rev)
              except ValueError:
                  # Ignore this run if the revision has been corrupted for some unknown reason
                  continue
              # Ignore runs associated with a commit that is not part of the official repository
              if not branch.startswith("Genesis-Embodied-AI/"):
                  continue
              # Skip runs that did not finish for some reason
              if run.state != "finished":
                  continue
              # Do not store new records if the desired number of revisions has already been reached
              if len(records_by_rev) == MAX_VALID_REVISIONS and rev not in records_by_rev:
                  continue
              # Extract the benchmark ID and normalize it so that it does not depend on key ordering.
              # Note that the rigid body benchmark suite is the only one supported for now.
              sid, bid = config["benchmark_id"].split("-", 1)
              if sid != "rigid_body":
                  continue
              # Make sure that the stats are valid
              try:
                  is_valid = True
                  for k in METRIC_KEYS:
                      v = summary[k]
                      if not isinstance(v, (float, int)) or math.isnan(v):
                          is_valid = False
                          break
                  if not is_valid:
                      continue
              except KeyError:
                  continue
              # Store all the records into a dict, keyed by revision then benchmark ID
              nbid = normalize_benchmark_id(bid)
              records_by_rev.setdefault(rev, {})[nbid] = {
                  metric: summary[metric] for metric in METRIC_KEYS
              }
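          # records_by_rev maps each recent revision to {benchmark ID: metrics}; since runs were
          # fetched newest first, the first stored revision is the most recent baseline commit.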
          # ----- build the two comparison tables -----
          # Parse benchmark IDs into key-value dicts while preserving order
          params_name = get_param_names(tuple(tuple(kv.keys()) for kv in current_bm.keys()))
          reg_found, alert_found = False, False
          tables = {}
          rows_for_csv = {"runtime_fps": [], "compile_time": []}
          info = {}
          for metric, alias, sign in (("runtime_fps", "FPS", 1), ("compile_time", "compile", -1)):
              rows_md = []
              header_cells = (
                  "status",
                  *params_name,
                  f"current {alias}",
                  f"baseline {alias} [last (mean ± std)] (*1)",
                  f"Δ {alias} (*2)",
              )
              header = "| " + " | ".join(header_cells) + " |"
              align = "|:------:|" + "|".join([":---" for _ in params_name]) + "|---:|---:|---:|"
              for bid in sorted(current_bm.keys(), key=sort_key):
                  value_cur = current_bm[bid][metric]
                  is_int = isinstance(value_cur, int) or value_cur.is_integer()
                  value_repr = fmt_num(value_cur, is_int)
                  params_repr = [bid.get(k, "-") for k in params_name]
                  # Use a fixed key set so that every CSV row shares the same header
                  info = {
                      "status": "n/a",
                      **dict(zip(params_name, params_repr)),
                      "current": value_cur,
                      "baseline_last": None,
                      "baseline_mean": None,
                      "baseline_min": None,
                      "baseline_max": None,
                  }
                  values_prev = [
                      record[bid][metric]
                      for record in records_by_rev.values()
                      if bid in record
                  ]
                  if values_prev:
                      value_last = values_prev[0]
                      value_ref = statistics.fmean(values_prev)
                      delta = (value_cur - value_last) / value_last * 100.0
                      info["baseline_last"] = int(value_last) if is_int else float(value_last)
                      stats_repr = f"{fmt_num(value_last, is_int)}"
                      delta_repr = f"{delta:+.1f}%"
                      if len(values_prev) == MAX_VALID_REVISIONS:
                          info["baseline_mean"] = int(value_ref) if is_int else float(value_ref)
                          info["baseline_min"] = int(min(values_prev)) if is_int else float(min(values_prev))
                          info["baseline_max"] = int(max(values_prev)) if is_int else float(max(values_prev))
                          value_std = statistics.stdev(values_prev)
                          stats_repr += f" ({fmt_num(value_ref, is_int)} ± {fmt_num(value_std, is_int)})"
                          if sign * delta < -METRICS_TOL[metric]:
                              info["status"] = "regression"
                              delta_repr = f"**{delta_repr}**"
                              picto = "🔴"
                              reg_found = True
                          elif sign * delta > METRICS_TOL[metric]:
                              info["status"] = "alert"
                              delta_repr = f"**{delta_repr}**"
                              picto = "⚠️"
                              alert_found = True
                          else:
                              info["status"] = "ok"
                              picto = "✅"
                      else:
                          info["status"] = "n/a"
                          picto = "ℹ️"
                  else:
                      picto, stats_repr, delta_repr = "ℹ️", "---", "---"
                  rows_md.append("| " + " | ".join((picto, *params_repr, value_repr, stats_repr, delta_repr)) + " |")
                  rows_for_csv[metric].append(info)
              tables[metric] = [header, align] + rows_md
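          # Sign convention: a change beyond tolerance in the unfavorable direction (FPS down,
          # compile time up) counts as a regression; an equally large change in the favorable
          # direction is reported as an alert, as a likely sign that the setup changed.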
          # ----- baseline commit list (MD) -----
          blist = [f"- Commit {i}: {sha}" for i, sha in enumerate(records_by_rev.keys(), 1)]
          baseline_block = ["**Baselines considered:** " + f"**{len(records_by_rev)}** commits"] + blist
          # ----- CHECK body (always written) -----
          thr_repr = ", ".join(
              f"{alias} ± {METRICS_TOL[metric]:.0f}%"
              for metric, alias in (("runtime_fps", "runtime"), ("compile_time", "compile"))
          )
          check_body = "\n".join(
              [
                  *baseline_block,
                  "",
                  f"Thresholds: {thr_repr}",
                  "",
                  "### Runtime FPS",
                  *tables["runtime_fps"],
                  "",
                  "### Compile Time",
                  *tables["compile_time"],
                  "",
                  f"- (*1) last: most recent commit on main; mean/std: statistics over the last {MAX_VALID_REVISIONS} commits, when available.",
                  "- (*2) Δ: relative difference between PR and last commit on main, i.e. (PR - main) / main * 100%.",
              ]
          )
          # ----- COMMENT body (only if regressions) -----
          if reg_found:
              comment_body = "\n".join([":warning: **Benchmark Regression Detected**", check_body])
          else:
              comment_body = ""
          # ----- CSV files, one per metric -----
          for metric in ("runtime_fps", "compile_time"):
              with csv_files[metric].open("w", newline="", encoding="utf-8") as f:
                  w = csv.DictWriter(f, fieldnames=info.keys())
                  w.writeheader()
                  for rec in rows_for_csv[metric]:
                      w.writerow(rec)
          # ----- write Markdown results -----
          check_body_path.write_text(check_body + "\n", encoding="utf-8")
          # Exit with a dedicated code so that the shell wrapper can tell regressions and alerts apart
          if reg_found:
              exit_code = int(os.environ["EXIT_CODE_REGRESSION"])
          elif alert_found:
              exit_code = int(os.environ["EXIT_CODE_ALERT"])
          else:
              exit_code = 0
          sys.exit(exit_code)
          PY
          # Enable command trace to ease debugging
          set -o xtrace
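          # Multi-line values cannot be written to "$GITHUB_ENV" as a plain KEY=value line;
          # GitHub Actions requires the delimiter syntax (KEY<<DELIMITER ... DELIMITER),
          # hence the __EOF__ block below.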
          # Expose outputs to later steps
          if [ -f "$CHECK_BODY_PATH" ]; then
            {
              echo 'CHECK_OUTPUT<<__EOF__'
              cat "$CHECK_BODY_PATH"
              echo '__EOF__'
            } >> "$GITHUB_ENV"
          else
            echo "CHECK_OUTPUT=" >> "$GITHUB_ENV"
          fi
          # Export status
          echo "HAS_REGRESSIONS=$([ "$EXIT_CODE" = "$EXIT_CODE_REGRESSION" ] && echo 1 || echo 0)" >> "$GITHUB_ENV"
          echo "HAS_ALERTS=$([ "$EXIT_CODE" = "$EXIT_CODE_ALERT" ] && echo 1 || echo 0)" >> "$GITHUB_ENV"
      - name: Upload benchmark comparisons in CSV
        id: upload
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-comparison-tables
          path: |
            runtime_fps.csv
            compile_time.csv
          if-no-files-found: warn
      - name: Publish PR check
        id: publish_check
        uses: actions/github-script@v8
        env:
          CHECK_NAME: Benchmark Comparison
          CHECK_OUTPUT: ${{ env.CHECK_OUTPUT }}
          HAS_REGRESSIONS: ${{ env.HAS_REGRESSIONS }}
          HAS_ALERTS: ${{ env.HAS_ALERTS }}
          ARTIFACT_URL: ${{ steps.upload.outputs.artifact-url }}
        with:
          script: |
            const artifactUrl = process.env.ARTIFACT_URL || '';
            let body = process.env.CHECK_OUTPUT || '';
            if (body && artifactUrl) {
              body += `\n\n**Artifact:** [Download raw data](${artifactUrl})`;
            }
            let summary;
            let conclusion = 'success';
            if ((process.env.HAS_REGRESSIONS || '0') === '1') {
              summary = '🔴 Regressions detected. See tables below.';
              conclusion = 'failure';
            } else if ((process.env.HAS_ALERTS || '0') === '1') {
              summary = '⚠️ Large deviation detected. See tables below.';
            } else {
              summary = '✅ No regressions detected. See tables below.';
            }
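            // Alerts only change the summary text; the check conclusion stays 'success',
            // so only genuine regressions turn the check red.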
            const check = await github.rest.checks.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              head_sha: context.payload.workflow_run.head_sha,
              name: process.env.CHECK_NAME,
              status: 'completed',
              conclusion: conclusion,
              output: {
                title: process.env.CHECK_NAME,
                summary,
                text: body || undefined
              }
            });
            core.setOutput("check-url", check.data.html_url);
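      # A comment is only posted when something looks off (regression or alert); the full
      # tables live in the check above, which keeps routine results out of the PR thread.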
      - name: Add PR comment
        if: ${{ env.HAS_REGRESSIONS == '1' || env.HAS_ALERTS == '1' }}
        uses: actions/github-script@v8
        env:
          HAS_REGRESSIONS: ${{ env.HAS_REGRESSIONS }}
          REPORT_URL: ${{ steps.publish_check.outputs.check-url }}
        with:
          script: |
            // Getting the PR number when using 'workflow_run' is tricky. For reference, see:
            // * https://docs.github.com/en/webhooks/webhook-events-and-payloads#workflow_run
            // * https://stackoverflow.com/a/75420270/4820605
            const { data } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
              owner: context.payload.workflow_run.head_repository.owner.login,
              repo: context.payload.workflow_run.head_repository.name,
              commit_sha: context.payload.workflow_run.head_sha,
            });
            if (!data || !data.length) {
              core.info('No associated PR; skipping comment.');
              return;
            }
            const title = (process.env.HAS_REGRESSIONS || '0') === '1'
              ? 'Benchmark Regression Detected' : 'Abnormal Benchmark Result Detected';
            const comment = `:warning: **${title}**
            ➡️ **[Report](${process.env.REPORT_URL})**`;
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: data[0].number,
              body: comment
            });