diff --git a/test/performance/matrix_multiplication.cpp b/test/performance/matrix_multiplication.cpp index 8b37b001f7df..da71865cbad7 100644 --- a/test/performance/matrix_multiplication.cpp +++ b/test/performance/matrix_multiplication.cpp @@ -1,5 +1,7 @@ #include "Halide.h" #include "halide_benchmark.h" +#include "halide_test_dirs.h" + #include using namespace Halide; @@ -25,10 +27,11 @@ int main(int argc, char **argv) { return 0; } - const int matrix_size = 992; + constexpr int matrix_size = 992; + constexpr float gflops = 2.0f * matrix_size * matrix_size * matrix_size / 1e9f; - ImageParam A(type_of(), 2); - ImageParam B(type_of(), 2); + ImageParam A(type_of(), 2, "A"); + ImageParam B(type_of(), 2, "B"); Var x("x"), y("y"); RDom k(0, matrix_size); @@ -46,17 +49,34 @@ int main(int argc, char **argv) { // // Using 16 threads (and no hyperthreading), hits 2080 GFlops (67% of peak) // and 1310 GFLops (85% of peak) respectively. + // + // On Apple M3 Max, single-threaded hits ~114 GFlops (89% of peak), and + // ~1270 GFlops using 16 cores. const int vec = target.natural_vector_size(); - // Size the inner loop tiles to fit into the number of registers available - // on the target, using either 12 accumulator registers or 24. - const int inner_tile_x = 3 * vec; - const int inner_tile_y = (target.has_feature(Target::AVX512) || target.arch != Target::X86) ? 8 : 4; + // On 64-bit ARM, there are 32 NEON registers. Using inner_tile_x=4*vec + // with inner_tile_y=4 leaves 10 spare NEON registers, which lets LLVM + // assign an independent GP base address to each A row. This avoids the + // ld1r post-increment serial dependency chain that occurs with 8 rows + // (where only 2 temp registers cycle between rows), and produces balanced + // load/compute throughput (4 cycles each at 4 FP units and 2 load ports). + const bool is_aarch64 = target.arch == Target::ARM && target.bits == 64; + const bool is_avx512 = target.has_feature(Target::AVX512); - // The shape of the outer tiling - const int tile_y = matrix_size / 4; - const int tile_k = matrix_size / 16; + // Size the inner loop tiles to fit into the number of registers available + // on the target. + // ARM64 NEON: 4×4=16 accumulators (22/32 NEON regs). + // AVX-512: 3×8=24 accumulators (27/32 ZMM regs). + // AVX2 (default): 3×4=12 accumulators. + const int inner_tile_x = is_aarch64 ? 4 * vec : 3 * vec; + const int inner_tile_y = is_avx512 ? 8 : 4; + + // The shape of the outer tiling. On ARM64, use a narrower y-tile and wider + // k-tile so the B panel (inner_tile_x × matrix_k × 4 bytes = ~62KB) fits in + // L1 alongside the C accumulator buffer. + const int tile_y = matrix_size / (is_aarch64 ? 8 : 4); + const int tile_k = matrix_size / (is_aarch64 ? 4 : 16); Var xy("xy"), xi("xi"), yi("yi"), yii("yii"); @@ -117,11 +137,26 @@ int main(int argc, char **argv) { A.set(mat_A); B.set(mat_B); + // TODO: we really need a generic performance testing harness + if (Internal::get_env_variable("DUMP_SCHEDULE") == "1") { + Target simple_target = get_jit_target_from_environment() + .with_feature(Target::NoAsserts) + .with_feature(Target::NoBoundsQuery) + .with_feature(Target::NoRuntime); + const auto temp_dir = Internal::get_test_tmp_dir(); + const auto asm_path = temp_dir + "matrix_mul.S"; + const auto stmt_path = temp_dir + "matrix_mul.stmt"; + out.compile_to_assembly(asm_path, {A, B}, simple_target); + out.compile_to_conceptual_stmt(stmt_path, {A, B}, Text, simple_target); + printf("Assembly dumped to %s\n", asm_path.c_str()); + printf("Halide IR dumped to %s\n", stmt_path.c_str()); + } + + // warm up one round (pre-compile jit) out.realize(output); - double t = benchmark([&]() { - out.realize(output); - }); + double elapsed = benchmark([&] { out.realize(output); }); + printf("Benchmark: %fms, %f GFLOP/s\n", elapsed * 1e3, (gflops / elapsed)); // check results Buffer output_ref(matrix_size, matrix_size); @@ -130,32 +165,22 @@ int main(int argc, char **argv) { simple_version(mat_A.data(), mat_B.data(), output_ref.data(), mat_A.width(), mat_A.stride(1)); out.realize(output_halide); - bool halide_correct = true; - for (int iy = 0; iy < matrix_size && halide_correct; iy++) { - for (int ix = 0; ix < matrix_size; ix++) { - halide_correct = halide_correct && (std::abs(output_ref(ix, iy) - output_halide(ix, iy)) < 0.001f); + bool halide_correct = [&] { + for (int iy = 0; iy < matrix_size; iy++) { + for (int ix = 0; ix < matrix_size; ix++) { + if (std::abs(output_ref(ix, iy) - output_halide(ix, iy)) > 0.001f) { + return false; + } + } } - } + return true; + }(); - if (halide_correct) { - printf("Halide results - OK\n"); - } else { - printf("Halide results - FAIL\n"); + if (!halide_correct) { + printf("FAIL - matrix values were incorrect\n"); return 1; } - // Uncomment to see the generated assembly. - /* - { - Target t("host-no_asserts-no_runtime-no_bounds_query"); - out.compile_to_assembly("/dev/stdout", matrix_mul.infer_arguments(), t); - } - */ - - float gflops = 2.0f * matrix_size * matrix_size * matrix_size / 1e9f; - - printf("Halide: %fms, %f GFLOP/s\n\n", t * 1e3, (gflops / t)); - printf("Success!\n"); return 0; }