Skip to content

Commit 653c927

Browse files
authored
chore: //ci/githubstats:query add 'impact' column (#8627)
We should prioritise fixing tests where failures have the most impact. So this adds an `impact` column to the `top` query, computed as the product of the number of `non_success` runs and the `duration_p90`. A better metric than `non_success * duration_p90` would be the total time spent on failed runs; however, we don't track that data (yet), so we use this approximation for now and may switch to the better metric later.
1 parent 36a1507 commit 653c927

File tree

2 files changed

+92
-47
lines changed

2 files changed

+92
-47
lines changed

ci/githubstats/query.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ def top(args):
146146
headers = [desc[0] for desc in cursor.description]
147147
df = pd.DataFrame(cursor, columns=headers)
148148

149+
df["impact"] = df["impact"].apply(normalize_duration)
149150
df["duration_p90"] = df["duration_p90"].apply(normalize_duration)
150151

151152
# Find the CODEOWNERS for each test target:
@@ -171,13 +172,14 @@ def top(args):
171172
"left", # label
172173
"decimal", # total
173174
"decimal", # non_success
174-
"decimal", # non_success%
175175
"decimal", # flaky
176-
"decimal", # flaky%
177176
"decimal", # timeout
178-
"decimal", # timeout%
179177
"decimal", # fail
180-
"decimal", # fail%
178+
"decimal", # non_success%
179+
"decimal", # flaky%
180+
"decimal", # timeout%
181+
"decimal", # fail%
182+
"right", # impact
181183
"right", # duration_p90
182184
"left", # owners
183185
]
@@ -264,6 +266,11 @@ def direct_url_to_buildbuddy(url):
264266
print(tabulate(df[columns], headers="keys", tablefmt=args.tablefmt, colalign=colalignments))
265267

266268

269+
# argparse formatter to allow newlines in --help.
270+
class RawDefaultsFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
271+
pass
272+
273+
267274
def main():
268275
parser = argparse.ArgumentParser(prog="bazel run //ci/githubstats:query --")
269276

@@ -304,8 +311,19 @@ def main():
304311
top_parser = subparsers.add_parser(
305312
"top",
306313
parents=[common_parser, filter_parser],
314+
formatter_class=RawDefaultsFormatter,
307315
help="Get the top non-successful / flaky / failed / timed-out tests in the last period",
308-
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
316+
epilog="""
317+
Examples:
318+
# Show the top 10 most flaky tests in the last week
319+
bazel run //ci/githubstats:query -- top 10 flaky% --week
320+
321+
# Show the top 5 tests on PRs where failures had the highest impact in the last week
322+
bazel run //ci/githubstats:query -- top 5 impact --prs --week
323+
324+
# Show the 100 slowest tests in the last month
325+
bazel run //ci/githubstats:query -- top 100 duration_p90 --month
326+
""",
309327
)
310328
top_parser.add_argument(
311329
"N", type=int, nargs="?", default=10, help="If specified, limits the number of tests to show"
@@ -317,16 +335,29 @@ def main():
317335
choices=[
318336
"total",
319337
"non_success",
320-
"non_success%",
321338
"flaky",
322-
"flaky%",
323339
"timeout",
324-
"timeout%",
325340
"fail",
341+
"non_success%",
342+
"flaky%",
343+
"timeout%",
326344
"fail%",
345+
"impact",
327346
"duration_p90",
328347
],
329-
help="COLUMN to order by and have the condition flags like --gt, --ge, etc. apply to",
348+
help="""COLUMN to order by and have the condition flags like --gt, --ge, etc. apply to.
349+
350+
total:\t\tTotal runs in the specified period
351+
non_success:\tNumber of non-successful runs in the specified period
352+
flaky:\t\tNumber of flaky runs in the specified period
353+
timeout:\tNumber of timed-out runs in the specified period
354+
fail:\t\tNumber of failed runs in the specified period
355+
non_success%%:\tPercentage of non-successful runs in the specified period
356+
flaky%%:\t\tPercentage of flaky runs in the specified period
357+
timeout%%:\tPercentage of timed-out runs in the specified period
358+
fail%%:\t\tPercentage of failed runs in the specified period
359+
impact:\t\tnon_success * duration_p90. A rough estimate on the impact of failures
360+
duration_p90:\t90th percentile duration of all runs in the specified period""",
330361
)
331362

332363
condition_group = top_parser.add_mutually_exclusive_group()
@@ -349,8 +380,13 @@ def main():
349380
last_runs_parser = subparsers.add_parser(
350381
"last",
351382
parents=[common_parser, filter_parser],
383+
formatter_class=RawDefaultsFormatter,
352384
help="Get the last runs of the specified test in the given period",
353-
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
385+
epilog="""
386+
Examples:
387+
# Show the last flaky runs of the rent_subnet_test in the last week
388+
bazel run //ci/githubstats:query -- last --flaky //rs/tests/nns:rent_subnet_test --week
389+
""",
354390
)
355391
last_runs_parser.add_argument("--success", action="store_true", help="Include successful runs")
356392
last_runs_parser.add_argument("--flaky", action="store_true", help="Include flaky runs")

ci/githubstats/top.sql

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,47 @@
1-
WITH "top" AS (
2-
SELECT
3-
label,
4-
5-
COUNT(*) AS "total",
6-
7-
SUM(CASE WHEN overall_status <> 1 THEN 1 ELSE 0 END) AS "non_success",
8-
ROUND((SUM(CASE WHEN overall_status <> 1 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "non_success%",
9-
10-
SUM(CASE WHEN overall_status = 2 THEN 1 ELSE 0 END) AS "flaky",
11-
ROUND((SUM(CASE WHEN overall_status = 2 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "flaky%",
12-
13-
SUM(CASE WHEN overall_status = 3 THEN 1 ELSE 0 END) AS "timeout",
14-
ROUND((SUM(CASE WHEN overall_status = 3 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "timeout%",
15-
16-
SUM(CASE WHEN overall_status = 4 THEN 1 ELSE 0 END) AS "fail",
17-
ROUND((SUM(CASE WHEN overall_status = 4 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "fail%",
18-
19-
percentile_disc(0.9) WITHIN GROUP (ORDER BY total_run_duration) * INTERVAL '1 second' AS "duration_p90"
20-
21-
FROM
22-
workflow_runs AS wr JOIN
23-
bazel_invocations AS bi ON wr.id = bi.run_id JOIN
24-
bazel_tests AS bt ON bi.build_id = bt.build_id
25-
26-
WHERE
27-
({hide} = '' OR bt.label NOT LIKE {hide})
28-
AND ('{period}' = '' OR bt.first_start_time > now() - ('1 {period}'::interval))
29-
AND (NOT {only_prs} OR wr.event_type = 'pull_request')
30-
AND ({branch} = '' OR wr.head_branch LIKE {branch})
31-
32-
GROUP BY label
33-
34-
ORDER BY {order_by} DESC
35-
36-
LIMIT {N}
37-
)
1+
WITH
2+
"core" AS (
3+
SELECT
4+
label,
5+
COUNT(*) AS "total",
6+
SUM(CASE WHEN overall_status <> 1 THEN 1 ELSE 0 END) AS "non_success",
7+
SUM(CASE WHEN overall_status = 2 THEN 1 ELSE 0 END) AS "flaky",
8+
SUM(CASE WHEN overall_status = 3 THEN 1 ELSE 0 END) AS "timeout",
9+
SUM(CASE WHEN overall_status = 4 THEN 1 ELSE 0 END) AS "fail",
10+
percentile_disc(0.9) WITHIN GROUP (ORDER BY total_run_duration) * INTERVAL '1 second' AS "duration_p90"
11+
12+
FROM
13+
workflow_runs AS wr JOIN
14+
bazel_invocations AS bi ON wr.id = bi.run_id JOIN
15+
bazel_tests AS bt ON bi.build_id = bt.build_id
16+
17+
WHERE
18+
({hide} = '' OR bt.label NOT LIKE {hide})
19+
AND ('{period}' = '' OR bt.first_start_time > now() - ('1 {period}'::interval))
20+
AND (NOT {only_prs} OR wr.event_type = 'pull_request')
21+
AND ({branch} = '' OR wr.head_branch LIKE {branch})
22+
23+
GROUP BY label
24+
),
25+
"top" AS (
26+
SELECT
27+
label,
28+
"total",
29+
"non_success",
30+
"flaky",
31+
"timeout",
32+
"fail",
33+
ROUND(("non_success" * 100.0) / "total", 1) AS "non_success%",
34+
ROUND(("flaky" * 100.0) / "total", 1) AS "flaky%",
35+
ROUND(("timeout" * 100.0) / "total", 1) AS "timeout%",
36+
ROUND(("fail" * 100.0) / "total", 1) AS "fail%",
37+
"non_success" * "duration_p90" AS "impact",
38+
"duration_p90"
39+
40+
FROM
41+
"core"
42+
43+
ORDER BY {order_by} DESC
44+
45+
LIMIT {N}
46+
)
3847
SELECT * FROM "top" WHERE {condition}

0 commit comments

Comments
 (0)