Benchmark Comparison & Alarm Regression #2

Workflow file for this run

name: Benchmark Comparison & Alarm Regression
on:
workflow_run:
workflows: ["Production"]
types: [completed]
permissions:
contents: read
actions: read
pull-requests: write
checks: write
jobs:
comment-if-regressed:
runs-on: ubuntu-latest
if: >
github.event.workflow_run.event == 'pull_request' &&
contains(fromJson('["success","neutral"]'), github.event.workflow_run.conclusion)
steps:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install deps
run: |
python -m pip install --quiet --upgrade wandb frozendict
- name: Download artifacts from triggering run
id: dl
uses: actions/download-artifact@v4
with:
pattern: speed-test-*
run-id: ${{ github.event.workflow_run.id }}
github-token: ${{ secrets.GITHUB_TOKEN }}
path: ./artifacts
- name: Show downloaded files
run: |
echo "Downloaded into ${{ steps.dl.outputs.download-path }}"
ls -la ${{ steps.dl.outputs.download-path }} || true
(command -v tree >/dev/null && tree -a ${{ steps.dl.outputs.download-path }}) || true
- name: Check regressions + build outputs
id: analyze
env:
# Note that secrets are not passed to workflows that are triggered by a pull request from a fork
# --- W&B ---
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
WANDB_ENTITY: genesis-ai-company
WANDB_PROJECT: genesis-benchmarks
WANDB_SILENT: "true"
# --- Parameters ---
MAX_VALID_REVISIONS: 5
MAX_FETCH_REVISIONS: 40
RUNTIME_REGRESSION_TOLERANCE_PCT: 10
COMPILE_REGRESSION_TOLERANCE_PCT: 10
# Input/Output paths
ARTIFACTS_DIR: ${{ steps.dl.outputs.download-path }}
CHECK_BODY_PATH: check_output.md
CSV_RUNTIME_PATH: runtime_fps.csv
CSV_COMPILE_PATH: compile_time.csv
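# Sentinel exit codes used by the Python script to report regressions/alerts back to the shell wrapper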
EXIT_CODE_REGRESSION: 42
EXIT_CODE_ALERT: 43
run: |
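# Run the analysis inline; '|| true' on the brace group keeps the step from failing here, while
# EXIT_CODE captures the Python exit status so it can be exported as HAS_REGRESSIONS/HAS_ALERTS below.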
{ python - << 'PY'; EXIT_CODE=$?; } || true
import os, sys, json, re, math, statistics
import wandb
from frozendict import frozendict
from pathlib import Path
import csv
# ----- arguments -----
MAX_VALID_REVISIONS = int(os.environ["MAX_VALID_REVISIONS"])
MAX_FETCH_REVISIONS = int(os.environ["MAX_FETCH_REVISIONS"])
METRICS_TOL = {
"runtime_fps": float(os.environ["RUNTIME_REGRESSION_TOLERANCE_PCT"]),
"compile_time": float(os.environ["COMPILE_REGRESSION_TOLERANCE_PCT"]),
}
artifacts_dir = Path(os.environ["ARTIFACTS_DIR"]).expanduser().resolve()
check_body_path = Path(os.environ["CHECK_BODY_PATH"]).expanduser()
csv_files = {
"runtime_fps": Path(os.environ["CSV_RUNTIME_PATH"]).expanduser().resolve(),
"compile_time": Path(os.environ["CSV_COMPILE_PATH"]).expanduser().resolve(),
}
# ---------- helpers ----------
METRIC_KEYS = ("compile_time", "runtime_fps", "realtime_factor")
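# Benchmark IDs are treated as dash-separated "key=value" tokens; parsing them into (frozen) mappings
# lets current results be matched against W&B baselines regardless of key ordering.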
def parse_benchmark_id(bid: str) -> dict:
kv = {}
if bid:
for token in bid.split("-"):
token = token.strip()
if token and "=" in token:
k, v = token.split("=", 1)
kv[k.strip()] = v.strip()
return kv
def normalize_benchmark_id(bid: str) -> frozendict[str, str]:
return frozendict(parse_benchmark_id(bid))
def get_param_names(bids: tuple[tuple[str, ...], ...]) -> tuple[str, ...]:
"""
Merge a list of tuples into a single tuple of keys that:
- Preserves the relative order of keys within each tuple
- Gives precedence to later tuples when conflicts arise
"""
merged = list(bids[-1])
merged_set = set(merged)
for tup in bids[:-1]:
for key in tup:
if key not in merged_set:
merged.append(key)
merged_set.add(key)
return tuple(merged)
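# Sort rows by their parameter values (as strings) in params_name order, placing rows that lack a
# given parameter after those that define it.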
def sort_key(d):
key_list = []
for col in params_name:
if col in d:
val = d[col]
key_list.append((0, val))
else:
key_list.append((1, None))
return key_list
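# Each speed_test*.txt line is expected to hold pipe-separated "key=value" pairs; the metric keys are
# extracted as floats and the remaining pairs form the frozen benchmark ID.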
def artifacts_parse_csv_summary(current_txt_path):
out = {}
for line in current_txt_path.read_text().splitlines():
kv = dict(map(str.strip, p.split("=", 1)) for p in line.split("|") if "=" in p)
record = {}
for k in METRIC_KEYS:
try:
record[k] = float(kv.pop(k))
except (ValueError, TypeError, KeyError):
pass
nbid = frozendict(kv)
out[nbid] = record
return out
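# Render integer-valued metrics with thousands separators and fractional ones with two decimals.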
def fmt_num(v, is_int: bool):
return f"{int(v):,}" if is_int else f"{v:.2f}"
# ----- load artifacts (current results) -----
current_csv_paths = list(artifacts_dir.rglob("speed_test*.txt"))
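# No summaries means the benchmarks did not run; write an empty check body and exit cleanly.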
if not current_csv_paths:
check_body_path.touch()
sys.exit(0)
current_bm = {}
for csv_path in current_csv_paths:
current_bm |= artifacts_parse_csv_summary(csv_path)
bids_set = frozenset(current_bm.keys())
assert bids_set
# ----- W&B baselines -----
if not "WANDB_API_KEY" in os.environ:
print("WANDB_API_KEY is not set")
sys.exit(0)
ENTITY = os.environ["WANDB_ENTITY"]
PROJECT = os.environ["WANDB_PROJECT"]
api = wandb.Api()
runs_iter = api.runs(f"{ENTITY}/{PROJECT}", order="-created_at")
revs = set()
records_by_rev = {}
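# Walk W&B runs from newest to oldest, grouping metric records per revision until enough complete
# baselines have been collected or the fetch budget is exhausted.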
for i, run in enumerate(runs_iter):
# Stop if the baselines are still incomplete after fetching MAX_FETCH_REVISIONS distinct revisions.
# This can happen when a benchmark was added recently and not enough past data is available yet.
if len(revs) == MAX_FETCH_REVISIONS:
break
# Stop early once enough complete baseline records have been collected
records_is_complete = [bids_set.issubset(record.keys()) for record in records_by_rev.values()]
if sum(records_is_complete) == MAX_VALID_REVISIONS:
break
# Load config and summary, with support of legacy runs
config, summary = run.config, run.summary
if isinstance(config, str):
config = {k: v["value"] for k, v in json.loads(run.config).items() if not k.startswith("_")}
if isinstance(summary._json_dict, str):
summary = json.loads(summary._json_dict)
# Extract revision commit and branch
try:
rev, branch = config["revision"].split("@", 1)
revs.add(rev)
except ValueError:
# Ignore this run if the revision has been corrupted for some unknown reason
continue
# Ignore runs associated with a commit that is not part of the official repository
if not branch.startswith('Genesis-Embodied-AI/'):
continue
# Skip runs that did not finish for some reason
if run.state != "finished":
continue
# Do not store new records if the desired number of revisions has already been reached
if len(records_by_rev) == MAX_VALID_REVISIONS and rev not in records_by_rev:
continue
# Extract the benchmark ID and normalize it so that it does not depend on key ordering.
# Note that the rigid body benchmark suite is the only one supported for now.
sid, bid = config["benchmark_id"].split("-", 1)
if sid != "rigid_body":
continue
# Make sure that stats are valid
try:
is_valid = True
for k in METRIC_KEYS:
v = summary[k]
if not isinstance(v, (float, int)) or math.isnan(v):
is_valid = False
break
if not is_valid:
continue
except KeyError:
continue
# Store all the records into a dict
nbid = normalize_benchmark_id(bid)
records_by_rev.setdefault(rev, {})[nbid] = {
metric: summary[metric] for metric in METRIC_KEYS
}
# ----- build TWO tables -----
# Parse benchmark IDs into key-value dicts while preserving order
params_name = get_param_names(tuple((tuple(kv.keys())) for kv in current_bm.keys()))
reg_found, alert_found = False, False
tables = {}
rows_for_csv = {"runtime_fps": [], "compile_time": []}
info = {}
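# 'sign' encodes the improvement direction: +1 for runtime_fps (higher is better) and -1 for
# compile_time (lower is better), so 'sign * delta < -tolerance' flags a regression for both metrics.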
for metric, alias, sign in (("runtime_fps", "FPS", 1), ("compile_time", "compile", -1)):
rows_md = []
header_cells = (
"status",
*params_name,
f"current {alias}",
f"baseline {alias} [last (mean ± std)] (*1)",
f"Δ {alias} (*2)"
)
header = "| " + " | ".join(header_cells) + " |"
align = "|:------:|" + "|".join([":---" for _ in params_name]) + "|---:|---:|---:|"
for bid in sorted(current_bm.keys(), key=sort_key):
value_cur = current_bm[bid][metric]
is_int = isinstance(value_cur, int) or value_cur.is_integer()
value_repr = fmt_num(value_cur, is_int)
params_repr = [bid.get(k, "-") for k in params_name]
info = {
**dict(zip(params_name, params_repr)),
"status": "n/a",
"current": value_cur,
"baseline_last": None,
"baseline_mean": None,
"baseline_min": None,
"baseline_max": None,
}
values_prev = [
record[bid][metric]
for record in records_by_rev.values()
if bid in record
]
if values_prev:
value_last = values_prev[0]
value_ref = statistics.fmean(values_prev)
delta = (value_cur - value_last) / value_last * 100.0
info["baseline_last"] = int(value_last) if is_int else float(value_last)
stats_repr = f"{fmt_num(value_last, is_int)}"
delta_repr = f"{delta:+.1f}%"
if len(values_prev) == MAX_VALID_REVISIONS:
info["baseline_mean"] = int(value_ref) if is_int else float(value_ref)
info["baseline_min"] = int(min(values_prev)) if is_int else float(min(values_prev))
info["baseline_max"] = int(max(values_prev)) if is_int else float(max(values_prev))
value_std = statistics.stdev(values_prev)
stats_repr += f" ({fmt_num(value_ref, is_int)} ± {fmt_num(value_std, is_int)})"
if sign * delta < - METRICS_TOL[metric]:
info["status"] = "regression"
delta_repr = f"**{delta_repr}**"
picto = "🔴"
reg_found = True
elif sign * delta > METRICS_TOL[metric]:
info["status"] = "alert"
delta_repr = f"**{delta_repr}**"
picto = "⚠️"
alert_found = True
else:
info["status"] = "ok"
picto = "✅"
else:
info["status"] = "n/a"
picto = "ℹ️"
else:
picto, stats_repr, delta_repr = "ℹ️", "---", "---"
rows_md.append("| " + " | ".join((picto, *params_repr, value_repr, stats_repr, delta_repr)) + " |")
rows_for_csv[metric].append(info)
tables[metric] = [header, align] + rows_md
# ----- baseline commit list (MD) -----
blist = [f"- Commit {i}: {sha}" for i, sha in enumerate(records_by_rev.keys(), 1)]
baseline_block = ["**Baselines considered:** " + f"**{len(records_by_rev)}** commits"] + blist
# ----- CHECK body (always) -----
thr_repr = ", ".join(
f"{alias} ± {METRICS_TOL[metric]:.0f}%"
for metric, alias in (("runtime_fps", "runtime"), ("compile_time", "compile"))
)
check_body = "\n".join(
[
*baseline_block,
"",
f"Thresholds: {thr_repr}",
"",
"### Runtime FPS",
*tables["runtime_fps"],
"",
"### Compile Time",
*tables["compile_time"],
"",
f"- (*1) last: last commit on main, mean/std: stats over revs {MAX_VALID_REVISIONS} commits if available.",
f"- (*2) Δ: relative difference between PR and last commit on main, i.e. (PR - main) / main * 100%.",
]
)
# ----- COMMENT body (only if regressions) -----
if reg_found:
comment_body = "\n".join([":warning: **Benchmark Regression Detected**", check_body])
else:
comment_body = ""
# CSV file
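# Every row was initialized with the same key set, so the keys of the last record can serve as the
# CSV field names.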
for metric in ("runtime_fps", "compile_time"):
with csv_files[metric].open("w", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=info.keys())
w.writeheader()
for rec in rows_for_csv[metric]:
w.writerow(rec)
# write md results
check_body_path.write_text(check_body + "\n", encoding="utf-8")
# Exit with error code
if reg_found:
exit_code = int(os.environ["EXIT_CODE_REGRESSION"])
elif alert_found:
exit_code = int(os.environ["EXIT_CODE_ALERT"])
else:
exit_code = 0
sys.exit(exit_code)
PY
# Enable command trace to ease debugging
set -o xtrace
# Expose outputs to later steps
if [ -f "$CHECK_BODY_PATH" ]; then
{
echo 'CHECK_OUTPUT<<__EOF__'
cat "$CHECK_BODY_PATH"
echo '__EOF__'
} >> "$GITHUB_ENV"
else
echo "CHECK_OUTPUT=" >> "$GITHUB_ENV"
fi
# Export status
echo "HAS_REGRESSIONS=$([ "$EXIT_CODE" = "$EXIT_CODE_REGRESSION" ] && echo 1 || echo 0)" >> "$GITHUB_ENV"
echo "HAS_ALERTS=$([ "$EXIT_CODE" = "$EXIT_CODE_ALERT" ] && echo 1 || echo 0)" >> "$GITHUB_ENV"
- name: Upload benchmark comparisons in CSV
id: upload
uses: actions/upload-artifact@v4
with:
name: benchmark-comparison-tables
path: |
runtime_fps.csv
compile_time.csv
if-no-files-found: warn
- name: Publish PR check
id: publish_check
uses: actions/github-script@v8
env:
CHECK_NAME: Benchmark Comparison
CHECK_OUTPUT: ${{ env.CHECK_OUTPUT }}
HAS_REGRESSIONS: ${{ env.HAS_REGRESSIONS }}
HAS_ALERTS: ${{ env.HAS_ALERTS }}
ARTIFACT_URL: ${{ steps.upload.outputs.artifact-url }}
with:
script: |
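// Attach a check run to the head commit of the triggering workflow run; alerts keep the default
// 'success' conclusion, only regressions mark the check as failed.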
const artifactUrl = process.env.ARTIFACT_URL || '';
let body = process.env.CHECK_OUTPUT || '';
if (body && artifactUrl) {
body += `\n\n**Artifact:** [Download raw data](${artifactUrl})`;
}
let summary;
let conclusion = 'success';
if ((process.env.HAS_REGRESSIONS || '0') === '1') {
summary = '🔴 Regressions detected. See tables below.';
conclusion = 'failure';
} else if ((process.env.HAS_ALERTS || '0') === '1') {
summary = '⚠️ Large deviation detected. See tables below.';
} else {
summary = '✅ No regressions detected. See tables below.';
}
const check = await github.rest.checks.create({
owner: context.repo.owner,
repo: context.repo.repo,
head_sha: context.payload.workflow_run.head_sha,
name: process.env.CHECK_NAME,
status: 'completed',
conclusion: conclusion,
output: {
title: process.env.CHECK_NAME,
summary,
text: body || undefined
}
});
core.setOutput("check-url", check.data.html_url);
- name: Add PR comment
if: ${{ env.HAS_REGRESSIONS == '1' || env.HAS_ALERTS == '1' }}
uses: actions/github-script@v8
env:
HAS_REGRESSIONS: ${{ env.HAS_REGRESSIONS }}
REPORT_URL: ${{ steps.publish_check.outputs.check-url }}
with:
script: |
// Getting PR number when using 'workflow_run' is tricky. For reference, see:
// * https://docs.github.com/en/webhooks/webhook-events-and-payloads#workflow_run
// * https://stackoverflow.com/a/75420270/4820605
const { data } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
owner: context.payload.workflow_run.head_repository.owner.login,
repo: context.payload.workflow_run.head_repository.name,
commit_sha: context.payload.workflow_run.head_sha,
});
if (!data || !data.length) {
core.info('No associated PR; skipping comment.');
return;
}
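// Build a short comment linking to the detailed check report; the wording depends on whether a
// hard regression or only an alert was detected.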
const title = (process.env.HAS_REGRESSIONS || '0') === '1'
? 'Benchmark Regression Detected' : 'Abnormal Benchmark Result Detected';
const comment = `:warning: **${title}**
➡️ **[Report](${process.env.REPORT_URL})**`;
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: data[0].number,
body: comment
});