Changes from all commits (27 commits)
c7da77d  test_convnd_fwd (johannes-graner, Jan 7, 2026)
e00ef08  test_convnd_bwd_data (johannes-graner, Jan 7, 2026)
2f83bac  test_conv_bwd_data_scale (johannes-graner, Jan 7, 2026)
2f9b366  test_grouped_convnd_fwd_clamp (johannes-graner, Jan 7, 2026)
0c106d2  test_grouped_convnd_fwd_scale (johannes-graner, Jan 8, 2026)
9e95a2a  multiple A/B tensors and D tensor for fwd GPU ref (johannes-graner, Jan 8, 2026)
7004943  test_grouped_convnd_fwd_scaleadd_ab (johannes-graner, Jan 8, 2026)
3298801  test_grouped_convnd_fwd_bias_clamp (johannes-graner, Jan 8, 2026)
2e36ef8  test_grouped_convnd_fwd_bilinear (johannes-graner, Jan 8, 2026)
2992269  test_grouped_convnd_fwd_gk_bias_clamp (johannes-graner, Jan 8, 2026)
e2f75fa  Extend GPU reference to enable batchnorm epilogue (johannes-graner, Jan 9, 2026)
6da4576  test_grouped_convnd_fwd{,_gk}_bias_bnorm_clamp (johannes-graner, Jan 9, 2026)
64cf835  test_grouped_conv_bwd_data_bilinear (johannes-graner, Jan 9, 2026)
1556359  test_grouped_convnd_bwd_weight_bilinear (johannes-graner, Jan 9, 2026)
9c2a899  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 9, 2026)
0d5b27d  Add missing template instantiation (johannes-graner, Jan 9, 2026)
85f2d93  Perform operations in float in reference (johannes-graner, Jan 12, 2026)
684c9ed  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 12, 2026)
c9f0a5c  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 13, 2026)
a3b2475  Slightly increase tolerance for batchnorm profiler (johannes-graner, Jan 13, 2026)
a7c1f22  Revert "Slightly increase tolerance for batchnorm profiler" (johannes-graner, Jan 14, 2026)
7016a7f  Revert "test_grouped_convnd_fwd{,_gk}_bias_bnorm_clamp" (johannes-graner, Jan 14, 2026)
5d81d9d  Revert "Extend GPU reference to enable batchnorm epilogue" (johannes-graner, Jan 14, 2026)
d057d28  Clarify variable names (johannes-graner, Jan 15, 2026)
16a9c61  Refactor elementwise ops into helper functions (johannes-graner, Jan 15, 2026)
822dae0  Merge branch 'develop' into jograner/extend-gpu-reference (johannes-graner, Jan 15, 2026)
06b03ca  Make helpers C++17-compatible (johannes-graner, Jan 15, 2026)
@@ -1631,6 +1631,13 @@ struct ConvInvscale
e = type_convert<f8_t>(c / scale_in_ / scale_wei_ / scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
e = type_convert<f8_t>(c_float / scale_in_ / scale_wei_ / scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;
@@ -1656,6 +1663,13 @@ struct ConvScale
e = type_convert<f8_t>(c * scale_in_ * scale_wei_ * scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
e = type_convert<f8_t>(c_float * scale_in_ * scale_wei_ * scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;
@@ -1683,6 +1697,15 @@ struct ConvScaleRelu
e = type_convert<f8_t>(x * scale_out_);
};

template <>
__host__ __device__ void operator()<f8_t, f8_t>(f8_t& e, const f8_t& c) const
{
const float c_float = type_convert<float>(c);
float x;
Relu{}.template operator()<float>(x, c_float * scale_in_ * scale_wei_);
e = type_convert<f8_t>(x * scale_out_);
};

float scale_in_;
float scale_wei_;
float scale_out_;
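The three f8_t specializations above all follow the same pattern from the "Perform operations in float in reference" commit: widen the f8 value to float, apply every scale factor in float, and only then narrow back to f8, so no intermediate result is rounded to 8 bits. A standalone toy sketch of that pattern follows; Fp8Stub, widen, and narrow are hypothetical stand-ins for CK's f8_t and type_convert, and the fixed-point encoding is purely illustrative.

#include <cmath>
#include <cstdint>
#include <cstdio>

struct Fp8Stub { uint8_t bits; }; // placeholder for a real 8-bit float type

float widen(Fp8Stub v)  { return static_cast<float>(v.bits) / 16.0f; }                        // toy decode
Fp8Stub narrow(float v) { return Fp8Stub{static_cast<uint8_t>(std::lround(v * 16.0f))}; }     // toy encode

struct ConvScaleSketch
{
    float scale_in_;
    float scale_wei_;
    float scale_out_;

    // Mirrors the f8 -> f8 specialization: do the arithmetic in float, narrow once at the end.
    void operator()(Fp8Stub& e, const Fp8Stub& c) const
    {
        const float c_float = widen(c);
        e = narrow(c_float * scale_in_ * scale_wei_ * scale_out_);
    }
};

int main()
{
    const ConvScaleSketch op{0.5f, 2.0f, 1.0f};
    const Fp8Stub c{32};
    Fp8Stub e{};
    op(e, c);
    std::printf("scaled value: %f\n", widen(e)); // 2.0 * 0.5 * 2.0 * 1.0 = 2.0
    return 0;
}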

(Three large file diffs are not rendered by default.)

@@ -22,9 +22,39 @@ struct SimpleDeviceMem
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&p_mem_), mem_size));
}

// Delete copy operations (resource should not be copied)
SimpleDeviceMem(const SimpleDeviceMem&) = delete;
SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

// Define move operations
SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_(other.p_mem_)
{
other.p_mem_ = nullptr;
}

SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
{
if(this != &other)
{
if(p_mem_)
{
(void)hipFree(p_mem_);
}
p_mem_ = other.p_mem_;
other.p_mem_ = nullptr;
}
return *this;
}

void* GetDeviceBuffer() { return p_mem_; }

~SimpleDeviceMem() { (void)hipFree(p_mem_); }
~SimpleDeviceMem()
{
if(p_mem_)
{
(void)hipFree(p_mem_);
}
}

void* p_mem_;
};
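A usage sketch for the move-enabled SimpleDeviceMem above (not part of the diff): with copies deleted and moves transferring ownership, instances can live in standard containers without a double hipFree. It assumes the constructor shown at the top of the struct takes the buffer size in bytes.

#include <utility>
#include <vector>

inline void simple_device_mem_usage_sketch()
{
    std::vector<SimpleDeviceMem> buffers;
    buffers.emplace_back(1024);          // construct in place, owns a device allocation
    SimpleDeviceMem tmp(2048);
    buffers.push_back(std::move(tmp));   // ownership moves; tmp.p_mem_ is now nullptr
    // When buffers goes out of scope, each element frees its allocation exactly once,
    // and the moved-from tmp's destructor is a no-op because of the nullptr check.
}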
@@ -173,5 +203,90 @@ __global__ void strided_copy_kernel(const DataType* __restrict__ src,
}
}

namespace detail {

// Helper for parameter pack expansion (A/B tensors)
template <typename ResultType, typename Op, typename DataType, std::size_t... Is>
__device__ __forceinline__ void apply_multi_tensor_impl(ResultType& result,
Op&& element_op,
const DataType* const* tensor_ptrs,
long_index_t element_offset,
std::index_sequence<Is...>)
{
element_op(result, tensor_ptrs[Is][element_offset]...);
}

// Generic helper for A and B tensors (works in all directions)
template <index_t NumExtraTensors, typename DataType, typename ResultType, typename Op>
__device__ __forceinline__ void apply_multi_tensor_elementwise_op(ResultType& result,
Op&& element_op,
const DataType* primary_ptr,
const DataType* const* extra_ptrs,
long_index_t extra_base_offset,
long_index_t element_offset)
{
const DataType* tensor_ptrs[NumExtraTensors + 1];
tensor_ptrs[0] = primary_ptr;

static_for<1, NumExtraTensors + 1, 1>{}(
[&](auto i) { tensor_ptrs[i] = extra_ptrs[i - 1] + extra_base_offset; });

apply_multi_tensor_impl(result,
element_op,
tensor_ptrs,
element_offset,
std::make_index_sequence<NumExtraTensors + 1>{});
}

// Helper for parameter pack expansion (D tensors)
template <typename OutDataType, typename Op, std::size_t... Is>
__device__ __forceinline__ void apply_d_tensor_impl(OutDataType& result_out,
Op&& element_op,
float computed_value,
const float* d_values,
std::index_sequence<Is...>)
{
float temp_out;
element_op(temp_out, computed_value, d_values[Is]...);
result_out = type_convert<OutDataType>(temp_out);
}

// Specialized helper for D tensors with stride calculations and float conversion
template <index_t NumDTensors, typename DDataType, typename OutDataType, typename Op>
__device__ __forceinline__ void apply_d_tensor_elementwise_op(OutDataType& result_out,
Op&& element_op,
float computed_value,
const DDataType* const* p_ds,
const index_t* const* p_d_strides,
index_t g,
index_t n,
index_t c_or_k,
long_index_t spatial_linear_index)
{
if constexpr(NumDTensors == 0)
{
element_op(result_out, computed_value);
}
else
{
float d_values[NumDTensors];

// Compute all D tensor indices and convert to float
static_for<0, NumDTensors, 1>{}([&](auto i) {
const long_index_t d_idx = g * p_d_strides[i][0] + n * p_d_strides[i][1] +
c_or_k * p_d_strides[i][2] + spatial_linear_index;
d_values[i] = type_convert<float>(p_ds[i][d_idx]);
});

apply_d_tensor_impl(result_out,
element_op,
computed_value,
d_values,
std::make_index_sequence<NumDTensors>{});
}
}

} // namespace detail

} // namespace ref
} // namespace ck
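The *_impl helpers above use std::index_sequence to expand a fixed-size set of extra-tensor values into one variadic call to the elementwise functor. A self-contained C++17 sketch of that expansion trick follows; apply and apply_impl are hypothetical names, and Bilinear is a simplified analogue of CK's bilinear op (e = alpha * c + beta * d).

#include <array>
#include <cstdio>
#include <utility>

struct Bilinear
{
    float alpha;
    float beta;
    void operator()(float& e, float c, float d) const { e = alpha * c + beta * d; }
};

template <typename Op, std::size_t N, std::size_t... Is>
void apply_impl(float& out, const Op& op, float c, const std::array<float, N>& ds,
                std::index_sequence<Is...>)
{
    op(out, c, ds[Is]...); // expands to op(out, c, ds[0], ds[1], ...)
}

template <typename Op, std::size_t N>
void apply(float& out, const Op& op, float c, const std::array<float, N>& ds)
{
    apply_impl(out, op, c, ds, std::make_index_sequence<N>{});
}

int main()
{
    float out = 0.0f;
    apply(out, Bilinear{2.0f, 3.0f}, 1.5f, std::array<float, 1>{{4.0f}});
    std::printf("bilinear result: %f\n", out); // 2*1.5 + 3*4 = 15
    return 0;
}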
56 changes: 52 additions & 4 deletions profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -17,6 +17,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"

namespace ck {
namespace profiler {
@@ -129,7 +130,10 @@ bool profile_conv_bwd_data_impl(int do_verification,
out_device_buf.ToDevice(output.mData.data());
wei_device_buf.ToDevice(weight.mData.data());

if(do_verification)
// profile device Conv instances
bool pass = true;

if(do_verification == 1)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
InDataType,
@@ -154,6 +158,27 @@
ref_invoker.Run(ref_argument);
}

// GPU reference (compute once, compare in kernel loop)
Tensor<InDataType> gpu_ref_input(in_g_n_c_wis_desc);
if(do_verification == 2)
{
DeviceMem gpu_ref_in_dev(sizeof(InDataType) *
input_device_result.mDesc.GetElementSpaceSize());
gpu_ref_in_dev.SetZero(); // bwd data needs zero initialization

ck::ref::naive_conv_bwd_data<InLayout, WeiLayout, OutLayout>(
static_cast<InDataType*>(gpu_ref_in_dev.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);

hip_check_error(hipDeviceSynchronize());
gpu_ref_in_dev.FromDevice(gpu_ref_input.mData.data());
}

using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NDimSpatial,
InLayout,
WeiLayout,
@@ -176,8 +201,6 @@
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances
bool pass = true;

for(auto& op_ptr : op_ptrs)
{
@@ -235,7 +258,7 @@
best_gb_per_sec = gb_per_sec;
}

if(do_verification)
if(do_verification == 1)
{
in_device_buf.FromDevice(input_device_result.mData.data());

@@ -255,6 +278,31 @@
show_data_nhwc_layout(input_host_result);
std::cout << std::endl;

std::cout << "out_device: ";
show_data_nhwc_layout(input_device_result);
std::cout << std::endl;
}
}
else if(do_verification == 2)
{
in_device_buf.FromDevice(input_device_result.mData.data());

pass = pass & ck::utils::check_err(input_device_result, gpu_ref_input);

if(do_log)
{
std::cout << "in : ";
show_data_nhwc_layout(output);
std::cout << std::endl;

std::cout << "wei: ";
show_data_nhwc_layout(weight);
std::cout << std::endl;

std::cout << "out_gpu_ref : ";
show_data_nhwc_layout(gpu_ref_input);
std::cout << std::endl;

std::cout << "out_device: ";
show_data_nhwc_layout(input_device_result);
std::cout << std::endl;
45 changes: 41 additions & 4 deletions profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -21,6 +21,7 @@
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"

namespace ck {
namespace profiler {
@@ -107,8 +108,11 @@ bool profile_conv_fwd_impl(int do_verification,
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data());

// profile device op instances
bool pass = true;

// run reference op
if(do_verification)
if(do_verification == 1)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
@@ -135,6 +139,24 @@

ref_invoker.Run(ref_argument);
}
// GPU reference (compute once, compare in kernel loop)
Tensor<OutDataType> gpu_ref_output(out_g_n_k_wos_desc);
if(do_verification == 2)
{
DeviceMem gpu_ref_out_dev(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());

ck::ref::naive_conv_fwd<InLayout, WeiLayout, OutLayout>(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(gpu_ref_out_dev.GetDeviceBuffer()),
conv_param,
in_element_op,
wei_element_op,
out_element_op);

hip_check_error(hipDeviceSynchronize());
gpu_ref_out_dev.FromDevice(gpu_ref_output.mData.data());
}

using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NDimSpatial,
InLayout,
@@ -158,8 +180,6 @@
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;

for(auto& op_ptr : op_ptrs)
{
@@ -217,7 +237,7 @@
best_gb_per_sec = gb_per_sec;
}

if(do_verification)
if(do_verification == 1)
{
out_device_buf.FromDevice(device_output.mData.data());

@@ -233,6 +253,23 @@
<< std::endl;
}
}
else if(do_verification == 2)
{
out_device_buf.FromDevice(device_output.mData.data());

pass = pass & ck::utils::check_err(device_output, gpu_ref_output);

if(do_log)
{
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
LogRangeAsType<float>(
std::cout << "gpu_ref_output : ", gpu_ref_output.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
<< std::endl;
}
}
}
else
{
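Both profiler changes follow the same control flow: the reference result is produced once, before the instance loop, and every device instance is compared against it, with do_verification == 1 selecting the CPU reference and == 2 the naive GPU reference. A stripped-down skeleton of that flow is sketched below; profile_with_reference and its parameters are illustrative, not the profiler API.

#include <functional>
#include <vector>

inline bool profile_with_reference(
    int do_verification,
    const std::function<std::vector<float>()>& cpu_reference,
    const std::function<std::vector<float>()>& gpu_reference,
    const std::vector<std::function<std::vector<float>()>>& instances)
{
    std::vector<float> ref;
    if(do_verification == 1)
        ref = cpu_reference(); // host reference, computed once
    else if(do_verification == 2)
        ref = gpu_reference(); // naive GPU reference, computed once

    bool pass = true;
    for(const auto& run_instance : instances)
    {
        const std::vector<float> out = run_instance();
        if(do_verification != 0)
            pass = pass && (out == ref); // stand-in for ck::utils::check_err
    }
    return pass;
}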