Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 59 additions & 34 deletions test/performance/matrix_multiplication.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "Halide.h"
#include "halide_benchmark.h"
#include "halide_test_dirs.h"

#include <cstdio>

using namespace Halide;
Expand All @@ -25,10 +27,11 @@ int main(int argc, char **argv) {
return 0;
}

const int matrix_size = 992;
constexpr int matrix_size = 992;
constexpr float gflops = 2.0f * matrix_size * matrix_size * matrix_size / 1e9f;

ImageParam A(type_of<float>(), 2);
ImageParam B(type_of<float>(), 2);
ImageParam A(type_of<float>(), 2, "A");
ImageParam B(type_of<float>(), 2, "B");

Var x("x"), y("y");
RDom k(0, matrix_size);
Expand All @@ -46,17 +49,34 @@ int main(int argc, char **argv) {
//
// Using 16 threads (and no hyperthreading), hits 2080 GFlops (67% of peak)
// and 1310 GFlops (85% of peak) respectively.
//
// On Apple M3 Max, single-threaded hits ~114 GFlops (89% of peak), and
// ~1270 GFlops using 16 cores.

const int vec = target.natural_vector_size<float>();

// Size the inner loop tiles to fit into the number of registers available
// on the target, using either 12 accumulator registers or 24.
const int inner_tile_x = 3 * vec;
const int inner_tile_y = (target.has_feature(Target::AVX512) || target.arch != Target::X86) ? 8 : 4;
// On 64-bit ARM, there are 32 NEON registers. Using inner_tile_x=4*vec
// with inner_tile_y=4 leaves 10 spare NEON registers, which lets LLVM
// assign an independent GP base address to each A row. This avoids the
// ld1r post-increment serial dependency chain that occurs with 8 rows
// (where only 2 temp registers cycle between rows), and produces balanced
// load/compute throughput (4 cycles each at 4 FP units and 2 load ports).
const bool is_aarch64 = target.arch == Target::ARM && target.bits == 64;
const bool is_avx512 = target.has_feature(Target::AVX512);

// The shape of the outer tiling
const int tile_y = matrix_size / 4;
const int tile_k = matrix_size / 16;
// Size the inner loop tiles to fit into the number of registers available
// on the target.
// ARM64 NEON: 4×4=16 accumulators (22/32 NEON regs).
// AVX-512: 3×8=24 accumulators (27/32 ZMM regs).
// AVX2 (default): 3×4=12 accumulators.
const int inner_tile_x = is_aarch64 ? 4 * vec : 3 * vec;
const int inner_tile_y = is_avx512 ? 8 : 4;

// The shape of the outer tiling. On ARM64, use a narrower y-tile and wider
// k-tile so the B panel (inner_tile_x × matrix_size × 4 bytes = ~62KB) fits in
// L1 alongside the C accumulator buffer.
const int tile_y = matrix_size / (is_aarch64 ? 8 : 4);
const int tile_k = matrix_size / (is_aarch64 ? 4 : 16);

Var xy("xy"), xi("xi"), yi("yi"), yii("yii");

Expand Down Expand Up @@ -117,11 +137,26 @@ int main(int argc, char **argv) {
A.set(mat_A);
B.set(mat_B);

// TODO: we really need a generic performance testing harness

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we create an issue to track this?

if (Internal::get_env_variable("DUMP_SCHEDULE") == "1") {
Target simple_target = get_jit_target_from_environment()
.with_feature(Target::NoAsserts)
.with_feature(Target::NoBoundsQuery)
.with_feature(Target::NoRuntime);
const auto temp_dir = Internal::get_test_tmp_dir();
const auto asm_path = temp_dir + "matrix_mul.S";
const auto stmt_path = temp_dir + "matrix_mul.stmt";
out.compile_to_assembly(asm_path, {A, B}, simple_target);
out.compile_to_conceptual_stmt(stmt_path, {A, B}, Text, simple_target);
printf("Assembly dumped to %s\n", asm_path.c_str());
printf("Halide IR dumped to %s\n", stmt_path.c_str());
}

// warm up one round (pre-compile jit)
out.realize(output);

double t = benchmark([&]() {
out.realize(output);
});
double elapsed = benchmark([&] { out.realize(output); });
printf("Benchmark: %fms, %f GFLOP/s\n", elapsed * 1e3, (gflops / elapsed));

// check results
Buffer<float> output_ref(matrix_size, matrix_size);
Expand All @@ -130,32 +165,22 @@ int main(int argc, char **argv) {
simple_version(mat_A.data(), mat_B.data(), output_ref.data(), mat_A.width(), mat_A.stride(1));
out.realize(output_halide);

bool halide_correct = true;
for (int iy = 0; iy < matrix_size && halide_correct; iy++) {
for (int ix = 0; ix < matrix_size; ix++) {
halide_correct = halide_correct && (std::abs(output_ref(ix, iy) - output_halide(ix, iy)) < 0.001f);
bool halide_correct = [&] {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We also need a generic approximate-equality check for floats, but maybe that's part of our generic perf testing harness.

for (int iy = 0; iy < matrix_size; iy++) {
for (int ix = 0; ix < matrix_size; ix++) {
if (std::abs(output_ref(ix, iy) - output_halide(ix, iy)) > 0.001f) {
return false;
}
}
}
}
return true;
}();

if (halide_correct) {
printf("Halide results - OK\n");
} else {
printf("Halide results - FAIL\n");
if (!halide_correct) {
printf("FAIL - matrix values were incorrect\n");
return 1;
}

// Uncomment to see the generated assembly.
Comment thread
shoaibkamil marked this conversation as resolved.
/*
{
Target t("host-no_asserts-no_runtime-no_bounds_query");
out.compile_to_assembly("/dev/stdout", matrix_mul.infer_arguments(), t);
}
*/

float gflops = 2.0f * matrix_size * matrix_size * matrix_size / 1e9f;

printf("Halide: %fms, %f GFLOP/s\n\n", t * 1e3, (gflops / t));

printf("Success!\n");
return 0;
}
Loading