Benchmark Comparison & Alarm Regression #2

Workflow file for this run

name: Benchmark Comparison & Alarm Regression
on:
workflow_run:
workflows: ["Production"]
types: [completed]
permissions:
contents: read
actions: read
pull-requests: write
checks: write
jobs:
comment-if-regressed:
runs-on: ubuntu-latest
if: >
github.event.workflow_run.event == 'pull_request' &&
contains(fromJson('["success","neutral"]'), github.event.workflow_run.conclusion)
steps:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install deps
run: |
python -m pip install --quiet --upgrade wandb frozendict
- name: Download artifacts from triggering run
id: dl
uses: actions/download-artifact@v4
with:
pattern: speed-test-*
run-id: ${{ github.event.workflow_run.id }}
github-token: ${{ secrets.GITHUB_TOKEN }}
path: ./artifacts
- name: Show downloaded files
run: |
echo "Downloaded into ${{ steps.dl.outputs.download-path }}"
ls -la ${{ steps.dl.outputs.download-path }} || true
(command -v tree >/dev/null && tree -a ${{ steps.dl.outputs.download-path }}) || true
- name: Check regressions + build outputs
id: analyze
env:
# Note that secrets are not passed to workflows that are triggered by a pull request from a fork
# --- W&B ---
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
WANDB_ENTITY: genesis-ai-company
WANDB_PROJECT: genesis-benchmarks
WANDB_SILENT: "true"
# --- Parameters ---
MAX_VALID_REVISIONS: 5
MAX_FETCH_REVISIONS: 40
RUNTIME_REGRESSION_TOLERANCE_PCT: 10
COMPILE_REGRESSION_TOLERANCE_PCT: 10
# Input/Output paths
ARTIFACTS_DIR: ${{ steps.dl.outputs.download-path }}
CHECK_BODY_PATH: check_output.md
CSV_RUNTIME_PATH: runtime_fps.csv
CSV_COMPILE_PATH: compile_time.csv
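# Sentinel exit codes used by the Python script to report regressions/alerts back to the shell wrapper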
EXIT_CODE_REGRESSION: 42
EXIT_CODE_ALERT: 43
run: |
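# Run the analysis inline; '|| true' on the brace group keeps the step from failing here, while
# EXIT_CODE captures the Python exit status so it can be exported as HAS_REGRESSIONS/HAS_ALERTS below.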
{ python - << 'PY'; EXIT_CODE=$?; } || true
import os, sys, json, re, math, statistics
import wandb
from frozendict import frozendict
from pathlib import Path
import csv
# ----- arguments -----
MAX_VALID_REVISIONS = int(os.environ["MAX_VALID_REVISIONS"])
MAX_FETCH_REVISIONS = int(os.environ["MAX_FETCH_REVISIONS"])
METRICS_TOL = {
"runtime_fps": float(os.environ["RUNTIME_REGRESSION_TOLERANCE_PCT"]),
"compile_time": float(os.environ["COMPILE_REGRESSION_TOLERANCE_PCT"]),
}
artifacts_dir = Path(os.environ["ARTIFACTS_DIR"]).expanduser().resolve()
check_body_path = Path(os.environ["CHECK_BODY_PATH"]).expanduser()
csv_files = {
"runtime_fps": Path(os.environ["CSV_RUNTIME_PATH"]).expanduser().resolve(),
"compile_time": Path(os.environ["CSV_COMPILE_PATH"]).expanduser().resolve(),
}
# ---------- helpers ----------
METRIC_KEYS = ("compile_time", "runtime_fps", "realtime_factor")
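# Benchmark IDs are treated as dash-separated "key=value" tokens; parsing them into (frozen) mappings
# lets current results be matched against W&B baselines regardless of key ordering.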
def parse_benchmark_id(bid: str) -> dict:
kv = {}
if bid:
for token in bid.split("-"):
token = token.strip()
if token and "=" in token:
k, v = token.split("=", 1)
kv[k.strip()] = v.strip()
return kv
def normalize_benchmark_id(bid: str) -> frozendict[str, str]:
return frozendict(parse_benchmark_id(bid))
def get_param_names(bids: tuple[tuple[str, ...], ...]) -> tuple[str, ...]:
"""
Merge a list of tuples into a single tuple of keys that:
- Preserves the relative order of keys within each tuple
- Gives precedence to later tuples when conflicts arise
"""
merged = list(bids[-1])
merged_set = set(merged)
for tup in bids[:-1]:
for key in tup:
if key not in merged_set:
merged.append(key)
merged_set.add(key)
return tuple(merged)
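# Sort rows by their parameter values (as strings) in params_name order, placing rows that lack a
# given parameter after those that define it.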
def sort_key(d):
key_list = []
for col in params_name:
if col in d:
val = d[col]
key_list.append((0, val))
else:
key_list.append((1, None))
return key_list
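# Each speed_test*.txt line is expected to hold pipe-separated "key=value" pairs; the metric keys are
# extracted as floats and the remaining pairs form the frozen benchmark ID.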
def artifacts_parse_csv_summary(current_txt_path):
out = {}
for line in current_txt_path.read_text().splitlines():
kv = dict(map(str.strip, p.split("=", 1)) for p in line.split("|") if "=" in p)
record = {}
for k in METRIC_KEYS:
try:
record[k] = float(kv.pop(k))
except (ValueError, TypeError, KeyError):
pass
nbid = frozendict(kv)
out[nbid] = record
return out
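# Render integer-valued metrics with thousands separators and fractional ones with two decimals.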
def fmt_num(v, is_int: bool):
return f"{int(v):,}" if is_int else f"{v:.2f}"
# ----- load artifacts (current results) -----
current_csv_paths = list(artifacts_dir.rglob("speed_test*.txt"))
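# No summaries means the benchmarks did not run; write an empty check body and exit cleanly.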
if not current_csv_paths:
check_body_path.touch()
sys.exit(0)
current_bm = {}
for csv_path in current_csv_paths:
current_bm |= artifacts_parse_csv_summary(csv_path)
bids_set = frozenset(current_bm.keys())
assert bids_set
# ----- W&B baselines -----
if not "WANDB_API_KEY" in os.environ:
print("WANDB_API_KEY is not set")
sys.exit(0)
ENTITY = os.environ["WANDB_ENTITY"]
PROJECT = os.environ["WANDB_PROJECT"]
api = wandb.Api()
runs_iter = api.runs(f"{ENTITY}/{PROJECT}", order="-created_at")
revs = set()
records_by_rev = {}
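# Walk W&B runs from newest to oldest, grouping metric records per revision until enough complete
# baselines have been collected or the fetch budget is exhausted.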
for i, run in enumerate(runs_iter):
# Stop if the baselines are still incomplete after fetching MAX_FETCH_REVISIONS distinct revisions.
# This can happen when a benchmark was added recently and not enough past data is available yet.
if len(revs) == MAX_FETCH_REVISIONS:
break
# Stop early once enough complete baseline records have been collected
records_is_complete = [bids_set.issubset(record.keys()) for record in records_by_rev.values()]
if sum(records_is_complete) == MAX_VALID_REVISIONS:
break
# Load config and summary, with support of legacy runs
config, summary = run.config, run.summary
if isinstance(config, str):
config = {k: v["value"] for k, v in json.loads(run.config).items() if not k.startswith("_")}
if isinstance(summary._json_dict, str):
summary = json.loads(summary._json_dict)
# Extract revision commit and branch
try:
rev, branch = config["revision"].split("@", 1)
revs.add(rev)
except ValueError:
# Ignore this run if the revision has been corrupted for some unknown reason
continue
# Ignore runs associated with a commit that is not part of the official repository
if not branch.startswith('Genesis-Embodied-AI/'):
continue
# Skip runs that did not finish for some reason
if run.state != "finished":
continue
# Do not store new records if the desired number of revisions has already been reached
if len(records_by_rev) == MAX_VALID_REVISIONS and rev not in records_by_rev:
continue
# Extract the benchmark ID and normalize it so that it does not depend on key ordering.
# Note that the rigid body benchmark suite is the only one supported for now.
sid, bid = config["benchmark_id"].split("-", 1)
if sid != "rigid_body":
continue
# Make sure that stats are valid
try:
is_valid = True
for k in METRIC_KEYS:
v = summary[k]
if not isinstance(v, (float, int)) or math.isnan(v):
is_valid = False
break
if not is_valid:
continue
except KeyError:
continue
# Store all the records into a dict
nbid = normalize_benchmark_id(bid)
records_by_rev.setdefault(rev, {})[nbid] = {
metric: summary[metric] for metric in METRIC_KEYS
}
# ----- build TWO tables -----
# Parse benchmark IDs into key-value dicts while preserving order
params_name = get_param_names(tuple((tuple(kv.keys())) for kv in current_bm.keys()))
reg_found, alert_found = False, False
tables = {}
rows_for_csv = {"runtime_fps": [], "compile_time": []}
info = {}
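# 'sign' encodes the improvement direction: +1 for runtime_fps (higher is better) and -1 for
# compile_time (lower is better), so 'sign * delta < -tolerance' flags a regression for both metrics.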
for metric, alias, sign in (("runtime_fps", "FPS", 1), ("compile_time", "compile", -1)):
rows_md = []
header_cells = (
"status",
*params_name,
f"current {alias}",
f"baseline {alias} [last (mean ± std)] (*1)",
f"Δ {alias} (*2)"
)
header = "| " + " | ".join(header_cells) + " |"
align = "|:------:|" + "|".join([":---" for _ in params_name]) + "|---:|---:|---:|"
for bid in sorted(current_bm.keys(), key=sort_key):
value_cur = current_bm[bid][metric]
is_int = isinstance(value_cur, int) or value_cur.is_integer()
value_repr = fmt_num(value_cur, is_int)
params_repr = [bid.get(k, "-") for k in params_name]
info = {
**dict(zip(params_name, params_repr)),
"status": "n/a",
"current": value_cur,
"baseline_last": None,
"baseline_mean": None,
"baseline_min": None,
"baseline_max": None,
}
values_prev = [
record[bid][metric]
for record in records_by_rev.values()
if bid in record
]
if values_prev:
value_last = values_prev[0]
value_ref = statistics.fmean(values_prev)
delta = (value_cur - value_last) / value_last * 100.0
info["baseline_last"] = int(value_last) if is_int else float(value_last)
stats_repr = f"{fmt_num(value_last, is_int)}"
delta_repr = f"{delta:+.1f}%"
if len(values_prev) == MAX_VALID_REVISIONS:
info["baseline_mean"] = int(value_ref) if is_int else float(value_ref)
info["baseline_min"] = int(min(values_prev)) if is_int else float(min(values_prev))
info["baseline_max"] = int(max(values_prev)) if is_int else float(max(values_prev))
value_std = statistics.stdev(values_prev)
stats_repr += f" ({fmt_num(value_ref, is_int)} ± {fmt_num(value_std, is_int)})"
if sign * delta < - METRICS_TOL[metric]:
info["status"] = "regression"
delta_repr = f"**{delta_repr}**"
picto = "🔴"
reg_found = True
elif sign * delta > METRICS_TOL[metric]:
info["status"] = "alert"
delta_repr = f"**{delta_repr}**"
picto = "⚠️"
alert_found = True
else:
info["status"] = "ok"
picto = "✅"
else:
info["status"] = "n/a"
picto = "ℹ️"
else:
picto, stats_repr, delta_repr = "ℹ️", "---", "---"
rows_md.append("| " + " | ".join((picto, *params_repr, value_repr, stats_repr, delta_repr)) + " |")
rows_for_csv[metric].append(info)
tables[metric] = [header, align] + rows_md
# ----- baseline commit list (MD) -----
blist = [f"- Commit {i}: {sha}" for i, sha in enumerate(records_by_rev.keys(), 1)]
baseline_block = ["**Baselines considered:** " + f"**{len(records_by_rev)}** commits"] + blist
# ----- CHECK body (always) -----
thr_repr = ", ".join(
f"{alias} ± {METRICS_TOL[metric]:.0f}%"
for metric, alias in (("runtime_fps", "runtime"), ("compile_time", "compile"))
)
check_body = "\n".join(
[
*baseline_block,
"",
f"Thresholds: {thr_repr}",
"",
"### Runtime FPS",
*tables["runtime_fps"],
"",
"### Compile Time",
*tables["compile_time"],
"",
f"- (*1) last: last commit on main, mean/std: stats over revs {MAX_VALID_REVISIONS} commits if available.",
f"- (*2) Δ: relative difference between PR and last commit on main, i.e. (PR - main) / main * 100%.",
]
)
# ----- COMMENT body (only if regressions) -----
if reg_found:
comment_body = "\n".join([":warning: **Benchmark Regression Detected**", check_body])
else:
comment_body = ""
# CSV file
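# Every row was initialized with the same key set, so the keys of the last record can serve as the
# CSV field names.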
for metric in ("runtime_fps", "compile_time"):
with csv_files[metric].open("w", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=info.keys())
w.writeheader()
for rec in rows_for_csv[metric]:
w.writerow(rec)
# write md results
check_body_path.write_text(check_body + "\n", encoding="utf-8")
# Exit with error code
if reg_found:
exit_code = int(os.environ["EXIT_CODE_REGRESSION"])
elif alert_found:
exit_code = int(os.environ["EXIT_CODE_ALERT"])
else:
exit_code = 0
sys.exit(exit_code)
PY
# Enable command trace to ease debugging
set -o xtrace
# Expose outputs to later steps
if [ -f "$CHECK_BODY_PATH" ]; then
{
echo 'CHECK_OUTPUT<<__EOF__'
cat "$CHECK_BODY_PATH"
echo '__EOF__'
} >> "$GITHUB_ENV"
else
echo "CHECK_OUTPUT=" >> "$GITHUB_ENV"
fi
# Export status
echo "HAS_REGRESSIONS=$([ "$EXIT_CODE" = "$EXIT_CODE_REGRESSION" ] && echo 1 || echo 0)" >> "$GITHUB_ENV"
echo "HAS_ALERTS=$([ "$EXIT_CODE" = "$EXIT_CODE_ALERT" ] && echo 1 || echo 0)" >> "$GITHUB_ENV"
- name: Upload benchmark comparisons in CSV
id: upload
uses: actions/upload-artifact@v4
with:
name: benchmark-comparison-tables
path: |
runtime_fps.csv
compile_time.csv
if-no-files-found: warn
- name: Publish PR check
id: publish_check
uses: actions/github-script@v8
env:
CHECK_NAME: Benchmark Comparison
CHECK_OUTPUT: ${{ env.CHECK_OUTPUT }}
HAS_REGRESSIONS: ${{ env.HAS_REGRESSIONS }}
HAS_ALERTS: ${{ env.HAS_ALERTS }}
ARTIFACT_URL: ${{ steps.upload.outputs.artifact-url }}
with:
script: |
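// Attach a check run to the head commit of the triggering workflow run; alerts keep the default
// 'success' conclusion, only regressions mark the check as failed.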
const artifactUrl = process.env.ARTIFACT_URL || '';
let body = process.env.CHECK_OUTPUT || '';
if (body && artifactUrl) {
body += `\n\n**Artifact:** [Download raw data](${artifactUrl})`;
}
let summary;
let conclusion = 'success';
if ((process.env.HAS_REGRESSIONS || '0') === '1') {
summary = '🔴 Regressions detected. See tables below.';
conclusion = 'failure';
} else if ((process.env.HAS_ALERTS || '0') === '1') {
summary = '⚠️ Large deviation detected. See tables below.';
} else {
summary = '✅ No regressions detected. See tables below.';
}
const check = await github.rest.checks.create({
owner: context.repo.owner,
repo: context.repo.repo,
head_sha: context.payload.workflow_run.head_sha,
name: process.env.CHECK_NAME,
status: 'completed',
conclusion: conclusion,
output: {
title: process.env.CHECK_NAME,
summary,
text: body || undefined
}
});
core.setOutput("check-url", check.data.html_url);
- name: Add PR comment
if: ${{ env.HAS_REGRESSIONS == '1' || env.HAS_ALERTS == '1' }}
uses: actions/github-script@v8
env:
HAS_REGRESSIONS: ${{ env.HAS_REGRESSIONS }}
REPORT_URL: ${{ steps.publish_check.outputs.check-url }}
with:
script: |
// Getting PR number when using 'workflow_run' is tricky. For reference, see:
// * https://docs.github.com/en/webhooks/webhook-events-and-payloads#workflow_run
// * https://stackoverflow.com/a/75420270/4820605
const { data } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
owner: context.payload.workflow_run.head_repository.owner.login,
repo: context.payload.workflow_run.head_repository.name,
commit_sha: context.payload.workflow_run.head_sha,
});
if (!data || !data.length) {
core.info('No associated PR; skipping comment.');
return;
}
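// Build a short comment linking to the detailed check report; the wording depends on whether a
// hard regression or only an alert was detected.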
const title = (process.env.HAS_REGRESSIONS || '0') === '1'
? 'Benchmark Regression Detected' : 'Abnormal Benchmark Result Detected';
const comment = `:warning: **${title}**
➡️ **[Report](${process.env.REPORT_URL})**`;
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: data[0].number,
body: comment
});