-
Notifications
You must be signed in to change notification settings - Fork 11
Replace Base.walkdir with variation that skips hidden directories
#236
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
Verified this works on the original case (reported by @charnik) And also had Claude write a benchmark script to show the impact: $ cat walkdir_benchmark.jl
# Benchmark comparing _walkdir vs Base.walkdir performance
# when there are many files in hidden directories
using ReTestItems: _walkdir
const BENCH_DIR = joinpath(@__DIR__, "test_tree")
function setup_benchmark_tree(; num_hidden_dirs=10, num_subdirs_per_hidden=50, num_files_per_dir=100)
# Clean up any existing benchmark directory
rm(BENCH_DIR; force=true, recursive=true)
mkpath(BENCH_DIR)
# Create some normal directories with a few files
for i in 1:5
normal_dir = joinpath(BENCH_DIR, "normal_dir_$i")
mkpath(normal_dir)
for j in 1:10
touch(joinpath(normal_dir, "file_$j.jl"))
end
end
# Create hidden directories with LOTS of files and subdirectories
for i in 1:num_hidden_dirs
hidden_dir = joinpath(BENCH_DIR, ".hidden_dir_$i")
mkpath(hidden_dir)
for j in 1:num_subdirs_per_hidden
subdir = joinpath(hidden_dir, "subdir_$j")
mkpath(subdir)
for k in 1:num_files_per_dir
touch(joinpath(subdir, "file_$k.txt"))
end
end
end
# Count what we created
total_hidden_files = num_hidden_dirs * num_subdirs_per_hidden * num_files_per_dir
total_hidden_dirs = num_hidden_dirs * (1 + num_subdirs_per_hidden) # parent + subdirs
total_normal_files = 5 * 10
total_normal_dirs = 5
println("Created benchmark tree:")
println(" Normal directories: $total_normal_dirs with $total_normal_files files")
println(" Hidden directories: $total_hidden_dirs with $total_hidden_files files")
println()
end
function count_walkdir_iterations(walkdir_func, root; collect_dirs=false)
count = 0
visited_dirs = String[]
for (dir, dirs, files) in walkdir_func(root)
count += 1
if collect_dirs
push!(visited_dirs, dir)
end
end
return count, visited_dirs
end
function benchmark_walkdir(walkdir_func, root, name; warmup=true, show_dirs=false)
# Warmup run
if warmup
count_walkdir_iterations(walkdir_func, root)
end
# Timed runs
times = Float64[]
iterations = 0
visited_dirs = String[]
for _ in 1:5
t = @elapsed begin
iterations, visited_dirs = count_walkdir_iterations(walkdir_func, root; collect_dirs=show_dirs)
end
push!(times, t)
end
avg_time = sum(times) / length(times)
min_time = minimum(times)
println("$name:")
println(" Directories visited: $iterations")
if show_dirs && !isempty(visited_dirs)
println(" Directory names:")
for dir in visited_dirs
relpath_dir = relpath(dir, root)
println(" - $relpath_dir")
end
end
println(" Average time: $(round(avg_time * 1000, digits=3)) ms")
println(" Minimum time: $(round(min_time * 1000, digits=3)) ms")
println()
return (iterations=iterations, avg_time=avg_time, min_time=min_time)
end
function run_benchmark()
println("=" ^ 60)
println("Benchmark: _walkdir vs Base.walkdir with hidden directories")
println("=" ^ 60)
println()
# Setup
setup_benchmark_tree(num_hidden_dirs=10, num_subdirs_per_hidden=50, num_files_per_dir=100)
# Run benchmarks
println("-" ^ 40)
println("Results:")
println("-" ^ 40)
base_result = benchmark_walkdir(Base.walkdir, BENCH_DIR, "Base.walkdir")
custom_result = benchmark_walkdir(_walkdir, BENCH_DIR, "_walkdir (skips hidden)"; show_dirs=true)
# Summary
println("=" ^ 60)
println("Summary:")
println("=" ^ 60)
speedup = base_result.avg_time / custom_result.avg_time
dirs_skipped = base_result.iterations - custom_result.iterations
println(" Directories skipped by _walkdir: $dirs_skipped")
println(" Speedup: $(round(speedup, digits=2))x faster")
println()
# Cleanup
println("Cleaning up benchmark directory...")
rm(BENCH_DIR; force=true, recursive=true)
println("Done!")
end
# Run the benchmark
run_benchmark()$ julia --project walkdir_benchmark.jl
============================================================
Benchmark: _walkdir vs Base.walkdir with hidden directories
============================================================
Created benchmark tree:
Normal directories: 5 with 50 files
Hidden directories: 510 with 50000 files
----------------------------------------
Results:
----------------------------------------
Base.walkdir:
Directories visited: 516
Average time: 197.247 ms
Minimum time: 191.929 ms
_walkdir (skips hidden):
Directories visited: 6
Directory names:
- .
- normal_dir_1
- normal_dir_2
- normal_dir_3
- normal_dir_4
- normal_dir_5
Average time: 0.781 ms
Minimum time: 0.748 ms
============================================================
Summary:
============================================================
Directories skipped by _walkdir: 510
Speedup: 252.7x faster
Cleaning up benchmark directory...
Done! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull request overview
This PR replaces Base.walkdir with a custom _walkdir function that automatically skips hidden directories (those starting with '.'), eliminating the need for manual filtering in the caller code.
Changes:
- Introduced
_walkdirfunction that wrapsBase.walkdirand filters out hidden directories before descending into them - Replaced
Base.walkdircall with_walkdirininclude_testfiles! - Removed redundant
hidden_reregex and directory-level filtering since hidden directories are now excluded at traversal time
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
Co-authored-by: Copilot <[email protected]>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull request overview
Copilot reviewed 1 out of 2 changed files in this pull request and generated 2 comments.
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
Drvi
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think #225 would fix this in a way that I'd personally prefer, but this seems to work
|
We could do it, but it would completely replace this PR since #225 also reimplements the walkdir function |
@charnik reported seeing long "scanning for test items" times, e.g.:
This comes from having large "hidden directories", including in this case
.julia, at the root of the repoBecause ReTestItems.jl always searches down from the root of the repo looking for test files we were descending into hidden directories. This PR fixes that so we now skip hidden directories.
On that use-case, this PR reduces the scan time down from
5.26to0.21seconds.