Skip to content

Commit 705e239

Browse files
committed
Merge branch 'develop' into aosewski/trasfer_concept
2 parents 0921a77 + 644cdbe commit 705e239

File tree

119 files changed

+6880
-2910
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

119 files changed

+6880
-2910
lines changed

.github/CODEOWNERS

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
1+
* @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @tenpercent @ThomasNing @coderfeli @cgmillette @shumway @vidyasagar-amd @vpietila-amd @Snektron
22
# Documentation files
3-
docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
4-
*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
5-
*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
6-
.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd @ddembeckAMD
3+
docs/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @cgmillette @shumway @vidyasagar-amd @ddembeckAMD @vpietila-amd @Snektron
4+
*.md @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @cgmillette @shumway @vidyasagar-amd @ddembeckAMD @vpietila-amd @Snektron
5+
*.rst @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @cgmillette @shumway @vidyasagar-amd @ddembeckAMD @vpietila-amd @Snektron
6+
.readthedocs.yaml @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @cgmillette @shumway @vidyasagar-amd @ddembeckAMD @vpietila-amd @Snektron
77
# Header directory for Doxygen documentation
8-
library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @aska-0096 @cgmillette @shumway @vidyasagar-amd
8+
library/include/ @ROCm/rocm-documentation @illsilin @carlushuang @qianfengz @aosewski @poyenc @geyyer @bartekxk @andriy-ca @afagaj @asleepzzz @ThomasNing @coderfeli @cgmillette @shumway @vidyasagar-amd @vpietila-amd @Snektron

CMakeLists.txt

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,12 @@ endif()
3131
# Default installation path
3232
if(NOT WIN32)
3333
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
34+
else()
35+
set(CMAKE_INSTALL_PREFIX "C:/dist/TheRock" CACHE PATH "")
3436
endif()
3537

3638
set(version 1.2.0)
37-
# Check support for CUDA/HIP in Cmake
38-
project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
39+
project(composable_kernel VERSION ${version} LANGUAGES CXX)
3940
include(CTest)
4041

4142
option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON)
@@ -162,7 +163,13 @@ execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMI
162163
configure_file(include/ck/version.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/version.h)
163164

164165
set(ROCM_SYMLINK_LIBS OFF)
165-
find_package(ROCM REQUIRED PATHS /opt/rocm)
166+
167+
if (WIN32)
168+
find_package(ROCmCMakeBuildTools REQUIRED PATHS C:/dist/TheRock)
169+
set(HIP_PLATFORM "amd" CACHE STRING "HIP platform")
170+
else()
171+
find_package(ROCM REQUIRED PATHS /opt/rocm)
172+
endif()
166173

167174
include(ROCMInstallTargets)
168175
include(ROCMPackageConfigHelpers)
@@ -189,7 +196,10 @@ if(GPU_TARGETS)
189196
else()
190197
set(USER_GPU_TARGETS 0)
191198
endif()
199+
192200
find_package(hip REQUIRED)
201+
enable_language(HIP)
202+
193203
# No assumption that HIP kernels are launched with uniform block size for backward compatibility
194204
# SWDEV-413293 and https://reviews.llvm.org/D155213
195205
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")

Jenkinsfile

Lines changed: 6 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -811,41 +811,12 @@ def Build_CK(Map conf=[:]){
811811
archiveArtifacts "perf_*.log"
812812
stash includes: "perf_**.log", name: "perf_log_${arch}"
813813
}
814-
// disable performance tests on gfx1030 for now.
815-
//else if ( arch == "gfx10"){
816-
// run basic tests on gfx1030
817-
// echo "Run gemm performance tests"
818-
// sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10"
819-
// archiveArtifacts "perf_onnx_gemm_gfx10.log"
820-
// stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10"
821-
//}
822-
else if ( arch == "gfx11"){
823-
// run basic tests on gfx11
814+
else if ( arch != "gfx10"){
815+
// run basic tests on gfx11/gfx12/gfx908/gfx950, but not on gfx10, it takes too long
824816
echo "Run gemm performance tests"
825-
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11"
826-
archiveArtifacts "perf_onnx_gemm_gfx11.log"
827-
stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11"
828-
}
829-
else if ( arch == "gfx120" ){
830-
// run basic tests on gfx12
831-
echo "Run gemm performance tests"
832-
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12"
833-
archiveArtifacts "perf_onnx_gemm_gfx12.log"
834-
stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12"
835-
}
836-
else if ( arch == "gfx908" ){
837-
// run basic tests on gfx908
838-
echo "Run performance tests"
839-
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx908"
840-
archiveArtifacts "perf_onnx_gemm_gfx908.log"
841-
stash includes: "perf_onnx_gemm_gfx908.log", name: "perf_log_gfx908"
842-
}
843-
else if ( arch == "gfx950" ){
844-
// run basic tests on gfx950
845-
echo "Run performance tests"
846-
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx950"
847-
archiveArtifacts "perf_onnx_gemm_gfx950.log"
848-
stash includes: "perf_onnx_gemm_gfx950.log", name: "perf_log_gfx950"
817+
sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} ${arch}"
818+
archiveArtifacts "perf_onnx_gemm_*.log"
819+
stash includes: "perf_onnx_gemm_**.log", name: "perf_log_${arch}"
849820
}
850821
}
851822
}
@@ -1049,6 +1020,7 @@ def run_aiter_tests(Map conf=[:]){
10491020
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py"
10501021
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py"
10511022
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py"
1023+
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_batch_prefill.py"
10521024
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py"
10531025
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py"
10541026
sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py"

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,22 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
137137
```
138138
**[See Note on -j](#notes)**
139139
140+
### Building for Windows
141+
142+
Install TheRock and run CMake configure as
143+
144+
```bash
145+
cmake \
146+
-D CMAKE_PREFIX_PATH="C:/dist/TheRock" \
147+
-D CMAKE_CXX_COMPILER="C:/dist/TheRock/bin/hipcc.exe" \
148+
-D CMAKE_BUILD_TYPE=Release \
149+
-D GPU_TARGETS="gfx1151" \
150+
-G Ninja \
151+
..
152+
```
153+
154+
Use Ninja to build either the whole library or individual targets.
155+
140156
## Optional post-install steps
141157
142158
* Build examples and tests:

docs/sphinx/requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
rocm-docs-core[api_reference]==1.31.1
1+
rocm-docs-core[api_reference]==1.31.2
22
sphinxcontrib-bibtex==2.6.5

docs/sphinx/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ requests==2.32.3
237237
# via
238238
# pygithub
239239
# sphinx
240-
rocm-docs-core[api-reference]==1.31.1
240+
rocm-docs-core[api-reference]==1.31.2
241241
# via -r requirements.in
242242
rpds-py==0.24.0
243243
# via

example/01_gemm/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,7 @@ add_example_executable(example_gemm_wmma_fp16_fp8_v3 gemm_wmma_fp16_fp8_v3.cpp)
149149
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_fp8_v3)
150150
add_example_executable(example_gemm_wmma_fp16_pk_i4_v3_b_scale gemm_wmma_fp16_pk_i4_v3_b_scale.cpp)
151151
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_pk_i4_v3_b_scale)
152+
add_example_executable(example_gemm_wmma_fp8_bpreshuffle gemm_wmma_fp8_bpreshuffle.cpp)
153+
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp8_bpreshuffle)
154+
add_example_executable(example_gemm_wmma_fp16_bpreshuffle gemm_wmma_fp16_bpreshuffle.cpp)
155+
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16_bpreshuffle)
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2+
// SPDX-License-Identifier: MIT
3+
4+
#include "common.hpp"
5+
6+
#include "ck/ck.hpp"
7+
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
8+
#include "ck/library/utility/check_err.hpp"
9+
#include "ck/library/utility/device_memory.hpp"
10+
#include "ck/library/utility/host_tensor.hpp"
11+
#include "ck/library/utility/host_tensor_generator.hpp"
12+
#include "ck/library/utility/literals.hpp"
13+
#include "ck/stream_config.hpp"
14+
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
15+
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_preshuffle.hpp"
16+
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
17+
#include "ck/utility/data_type.hpp"
18+
#include "ck/utility/get_id.hpp"
19+
#include "ck/utility/scheduler_enum.hpp"
20+
21+
#include <cstddef>
22+
#include <iostream>
23+
#include <type_traits>
24+
25+
using F16 = ck::half_t;
26+
using F32 = float;
27+
28+
using ADataType = F16;
29+
using BDataType = F16;
30+
using AccDataType = F32;
31+
using CShuffleDataType = F32;
32+
using CDataType = F16;
33+
using ComputeTypeA = F16;
34+
using ComputeTypeB = F16;
35+
36+
using ALayout = Row;
37+
using BLayout = Col;
38+
using CLayout = Row;
39+
40+
using AElementOp = PassThrough;
41+
using BElementOp = PassThrough;
42+
using CElementOp = PassThrough;
43+
44+
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
45+
46+
static constexpr bool PermuteA = false;
47+
static constexpr bool PermuteB = false;
48+
static constexpr int KPack = 8; // int4 -> 32, fp8 -> 16, fp16 -> 8
49+
// clang-format off
50+
using DeviceOpInstance =
51+
ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3_BPreshuffle<
52+
ALayout, BLayout, CLayout,
53+
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
54+
AElementOp, BElementOp, CElementOp, GemmDefault,
55+
128,
56+
32, 128, 128,
57+
8, 8,
58+
16, 16,
59+
2, 2,
60+
S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>,
61+
2, 8, 8, 0,
62+
S<16, 8, 1>, S<1, 0, 2>, S<1, 0, 2>,
63+
2, 8, 8, 0,
64+
1, 1, S<1, 16, 1, 8>, S<4, 4, 1>,
65+
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB>;
66+
// clang-format on
67+
68+
#include "run_gemm_wmma_bpreshuffle_example.inc"
69+
70+
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2+
// SPDX-License-Identifier: MIT
3+
4+
#include "common.hpp"
5+
6+
#include "ck/ck.hpp"
7+
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
8+
#include "ck/library/utility/check_err.hpp"
9+
#include "ck/library/utility/device_memory.hpp"
10+
#include "ck/library/utility/host_tensor.hpp"
11+
#include "ck/library/utility/host_tensor_generator.hpp"
12+
#include "ck/library/utility/literals.hpp"
13+
#include "ck/stream_config.hpp"
14+
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
15+
#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_preshuffle.hpp"
16+
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
17+
#include "ck/utility/amd_ck_fp8.hpp"
18+
#include "ck/utility/data_type.hpp"
19+
#include "ck/utility/get_id.hpp"
20+
#include "ck/utility/scheduler_enum.hpp"
21+
22+
#include <cstddef>
23+
#include <iostream>
24+
#include <type_traits>
25+
26+
using F8 = ck::f8_t;
27+
using F16 = ck::half_t;
28+
using F32 = float;
29+
30+
using ADataType = F8;
31+
using BDataType = F8;
32+
using AccDataType = F32;
33+
using CShuffleDataType = F32;
34+
using CDataType = F16;
35+
using ComputeTypeA = F8;
36+
using ComputeTypeB = F8;
37+
38+
using ALayout = Row;
39+
using BLayout = Col;
40+
using CLayout = Row;
41+
42+
using AElementOp = PassThrough;
43+
using BElementOp = PassThrough;
44+
using CElementOp = PassThrough;
45+
46+
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
47+
48+
static constexpr bool PermuteA = false;
49+
static constexpr bool PermuteB = false;
50+
static constexpr int KPack = 16; // int4 -> 32, fp8 -> 16, fp16 -> 8
51+
// clang-format off
52+
using DeviceOpInstance =
53+
ck::tensor_operation::device::DeviceGemm_Wmma_CShuffleV3_BPreshuffle<
54+
ALayout, BLayout, CLayout,
55+
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
56+
AElementOp, BElementOp, CElementOp, GemmDefault,
57+
256,
58+
32, 128, 256,
59+
16, 16,
60+
16, 16,
61+
2, 1,
62+
S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>,
63+
2, 16, 16, 0,
64+
S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>,
65+
2, 16, 16, 0,
66+
1, 1, S<1, 16, 1, 16>, S<8, 8, 1>,
67+
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB>;
68+
// clang-format on
69+
70+
#include "run_gemm_wmma_bpreshuffle_example.inc"
71+
72+
int main(int argc, char* argv[]) { return !run_gemm_splitk_example(argc, argv); }

0 commit comments

Comments
 (0)