Skip to content

Commit 653c927

Browse files
authored
chore: //ci/githubstats:query add 'impact' column (#8627)
We should prioritise fixing tests where failures have the most impact. So this adds an `impact` column to the `top` query, computed as the product of the number of `non_success` runs and the `duration_p90`. A better metric than `non_success * duration_p90` would be the total time spent on failed runs; however, we don't track that data (yet), so we use this approximation for now and may switch to the better metric later.
1 parent 36a1507 commit 653c927

File tree

2 files changed

+92
-47
lines changed

2 files changed

+92
-47
lines changed

ci/githubstats/query.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ def top(args):
146146
headers = [desc[0] for desc in cursor.description]
147147
df = pd.DataFrame(cursor, columns=headers)
148148

149+
df["impact"] = df["impact"].apply(normalize_duration)
149150
df["duration_p90"] = df["duration_p90"].apply(normalize_duration)
150151

151152
# Find the CODEOWNERS for each test target:
@@ -171,13 +172,14 @@ def top(args):
171172
"left", # label
172173
"decimal", # total
173174
"decimal", # non_success
174-
"decimal", # non_success%
175175
"decimal", # flaky
176-
"decimal", # flaky%
177176
"decimal", # timeout
178-
"decimal", # timeout%
179177
"decimal", # fail
180-
"decimal", # fail%
178+
"decimal", # non_success%
179+
"decimal", # flaky%
180+
"decimal", # timeout%
181+
"decimal", # fail%
182+
"right", # impact
181183
"right", # duration_p90
182184
"left", # owners
183185
]
@@ -264,6 +266,11 @@ def direct_url_to_buildbuddy(url):
264266
print(tabulate(df[columns], headers="keys", tablefmt=args.tablefmt, colalign=colalignments))
265267

266268

269+
# argparse formatter to allow newlines in --help.
270+
class RawDefaultsFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
271+
pass
272+
273+
267274
def main():
268275
parser = argparse.ArgumentParser(prog="bazel run //ci/githubstats:query --")
269276

@@ -304,8 +311,19 @@ def main():
304311
top_parser = subparsers.add_parser(
305312
"top",
306313
parents=[common_parser, filter_parser],
314+
formatter_class=RawDefaultsFormatter,
307315
help="Get the top non-successful / flaky / failed / timed-out tests in the last period",
308-
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
316+
epilog="""
317+
Examples:
318+
# Show the top 10 most flaky tests in the last week
319+
bazel run //ci/githubstats:query -- top 10 flaky% --week
320+
321+
# Show the top 5 tests on PRs where failures had the highest impact in the last week
322+
bazel run //ci/githubstats:query -- top 5 impact --prs --week
323+
324+
# Show the 100 slowest tests in the last month
325+
bazel run //ci/githubstats:query -- top 100 duration_p90 --month
326+
""",
309327
)
310328
top_parser.add_argument(
311329
"N", type=int, nargs="?", default=10, help="If specified, limits the number of tests to show"
@@ -317,16 +335,29 @@ def main():
317335
choices=[
318336
"total",
319337
"non_success",
320-
"non_success%",
321338
"flaky",
322-
"flaky%",
323339
"timeout",
324-
"timeout%",
325340
"fail",
341+
"non_success%",
342+
"flaky%",
343+
"timeout%",
326344
"fail%",
345+
"impact",
327346
"duration_p90",
328347
],
329-
help="COLUMN to order by and have the condition flags like --gt, --ge, etc. apply to",
348+
help="""COLUMN to order by and have the condition flags like --gt, --ge, etc. apply to.
349+
350+
total:\t\tTotal runs in the specified period
351+
non_success:\tNumber of non-successful runs in the specified period
352+
flaky:\t\tNumber of flaky runs in the specified period
353+
timeout:\tNumber of timed-out runs in the specified period
354+
fail:\t\tNumber of failed runs in the specified period
355+
non_success%%:\tPercentage of non-successful runs in the specified period
356+
flaky%%:\t\tPercentage of flaky runs in the specified period
357+
timeout%%:\tPercentage of timed-out runs in the specified period
358+
fail%%:\t\tPercentage of failed runs in the specified period
359+
impact:\t\tnon_success * duration_p90. A rough estimate on the impact of failures
360+
duration_p90:\t90th percentile duration of all runs in the specified period""",
330361
)
331362

332363
condition_group = top_parser.add_mutually_exclusive_group()
@@ -349,8 +380,13 @@ def main():
349380
last_runs_parser = subparsers.add_parser(
350381
"last",
351382
parents=[common_parser, filter_parser],
383+
formatter_class=RawDefaultsFormatter,
352384
help="Get the last runs of the specified test in the given period",
353-
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
385+
epilog="""
386+
Examples:
387+
# Show the last flaky runs of the rent_subnet_test in the last week
388+
bazel run //ci/githubstats:query -- last --flaky //rs/tests/nns:rent_subnet_test --week
389+
""",
354390
)
355391
last_runs_parser.add_argument("--success", action="store_true", help="Include successful runs")
356392
last_runs_parser.add_argument("--flaky", action="store_true", help="Include flaky runs")

ci/githubstats/top.sql

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,47 @@
1-
WITH "top" AS (
2-
SELECT
3-
label,
4-
5-
COUNT(*) AS "total",
6-
7-
SUM(CASE WHEN overall_status <> 1 THEN 1 ELSE 0 END) AS "non_success",
8-
ROUND((SUM(CASE WHEN overall_status <> 1 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "non_success%",
9-
10-
SUM(CASE WHEN overall_status = 2 THEN 1 ELSE 0 END) AS "flaky",
11-
ROUND((SUM(CASE WHEN overall_status = 2 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "flaky%",
12-
13-
SUM(CASE WHEN overall_status = 3 THEN 1 ELSE 0 END) AS "timeout",
14-
ROUND((SUM(CASE WHEN overall_status = 3 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "timeout%",
15-
16-
SUM(CASE WHEN overall_status = 4 THEN 1 ELSE 0 END) AS "fail",
17-
ROUND((SUM(CASE WHEN overall_status = 4 THEN 1 ELSE 0 END) * 100.0) / COUNT(*), 1) AS "fail%",
18-
19-
percentile_disc(0.9) WITHIN GROUP (ORDER BY total_run_duration) * INTERVAL '1 second' AS "duration_p90"
20-
21-
FROM
22-
workflow_runs AS wr JOIN
23-
bazel_invocations AS bi ON wr.id = bi.run_id JOIN
24-
bazel_tests AS bt ON bi.build_id = bt.build_id
25-
26-
WHERE
27-
({hide} = '' OR bt.label NOT LIKE {hide})
28-
AND ('{period}' = '' OR bt.first_start_time > now() - ('1 {period}'::interval))
29-
AND (NOT {only_prs} OR wr.event_type = 'pull_request')
30-
AND ({branch} = '' OR wr.head_branch LIKE {branch})
31-
32-
GROUP BY label
33-
34-
ORDER BY {order_by} DESC
35-
36-
LIMIT {N}
37-
)
1+
WITH
2+
"core" AS (
3+
SELECT
4+
label,
5+
COUNT(*) AS "total",
6+
SUM(CASE WHEN overall_status <> 1 THEN 1 ELSE 0 END) AS "non_success",
7+
SUM(CASE WHEN overall_status = 2 THEN 1 ELSE 0 END) AS "flaky",
8+
SUM(CASE WHEN overall_status = 3 THEN 1 ELSE 0 END) AS "timeout",
9+
SUM(CASE WHEN overall_status = 4 THEN 1 ELSE 0 END) AS "fail",
10+
percentile_disc(0.9) WITHIN GROUP (ORDER BY total_run_duration) * INTERVAL '1 second' AS "duration_p90"
11+
12+
FROM
13+
workflow_runs AS wr JOIN
14+
bazel_invocations AS bi ON wr.id = bi.run_id JOIN
15+
bazel_tests AS bt ON bi.build_id = bt.build_id
16+
17+
WHERE
18+
({hide} = '' OR bt.label NOT LIKE {hide})
19+
AND ('{period}' = '' OR bt.first_start_time > now() - ('1 {period}'::interval))
20+
AND (NOT {only_prs} OR wr.event_type = 'pull_request')
21+
AND ({branch} = '' OR wr.head_branch LIKE {branch})
22+
23+
GROUP BY label
24+
),
25+
"top" AS (
26+
SELECT
27+
label,
28+
"total",
29+
"non_success",
30+
"flaky",
31+
"timeout",
32+
"fail",
33+
ROUND(("non_success" * 100.0) / "total", 1) AS "non_success%",
34+
ROUND(("flaky" * 100.0) / "total", 1) AS "flaky%",
35+
ROUND(("timeout" * 100.0) / "total", 1) AS "timeout%",
36+
ROUND(("fail" * 100.0) / "total", 1) AS "fail%",
37+
"non_success" * "duration_p90" AS "impact",
38+
"duration_p90"
39+
40+
FROM
41+
"core"
42+
43+
ORDER BY {order_by} DESC
44+
45+
LIMIT {N}
46+
)
3847
SELECT * FROM "top" WHERE {condition}

0 commit comments

Comments
 (0)