Skip to content

Commit a3cf63e

Browse files
committed
fix CUDA architectures cmake logic
1 parent 9d77dcb commit a3cf63e

File tree

1 file changed

+26
-3
lines changed

1 file changed

+26
-3
lines changed

transformer_engine/common/CMakeLists.txt

Lines changed: 26 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,11 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
3636
endif()
3737
endif()
3838

39-
# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures
39+
# Process CMAKE_CUDA_ARCHITECTURES to separate standard, generic, and specific architectures.
40+
# - NVTE_STANDARD_ARCHS: pre-Blackwell archs (e.g. 75, 80, 89, 90). Applied to all CUDA sources.
41+
# - NVTE_GENERIC_ARCHS: Blackwell family heads (e.g. 100, 120). Applied to non-arch-specific sources only.
42+
# - NVTE_SPECIFIC_ARCHS: Blackwell specific targets (e.g. 100a, 120f). Applied to arch-specific sources only.
43+
set(NVTE_STANDARD_ARCHS)
4044
set(NVTE_GENERIC_ARCHS)
4145
set(NVTE_SPECIFIC_ARCHS)
4246

@@ -79,6 +83,12 @@ if(NOT arch_120_index EQUAL -1)
7983
endif()
8084
endif()
8185

86+
# Copy the remaining standard (pre-Blackwell) architectures into NVTE_STANDARD_ARCHS.
87+
# These are applied to all CUDA sources (both generic and arch-specific).
88+
foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
89+
list(APPEND NVTE_STANDARD_ARCHS "${arch}")
90+
endforeach()
91+
8292
# cuDNN frontend API
8393
set(CUDNN_FRONTEND_INCLUDE_DIR
8494
"${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
@@ -192,9 +202,13 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s
192202
${transformer_engine_cuda_sources}
193203
${transformer_engine_cpp_sources})
194204

195-
# Set compile options for CUDA sources with generic architectures
205+
# Set compile options for CUDA sources with generic architectures.
206+
# These get standard archs (pre-Blackwell) + generic Blackwell family heads.
196207
foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
197208
set(arch_compile_options)
209+
foreach(arch IN LISTS NVTE_STANDARD_ARCHS)
210+
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
211+
endforeach()
198212
foreach(arch IN LISTS NVTE_GENERIC_ARCHS)
199213
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
200214
endforeach()
@@ -209,9 +223,14 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
209223
endif()
210224
endforeach()
211225

212-
# Set compile options for CUDA sources with specific architectures
226+
# Set compile options for CUDA sources with arch-specific features.
227+
# These get standard archs (pre-Blackwell) + Blackwell specific targets (a/f suffix).
228+
# They must NOT get generic Blackwell archs, as they use family/arch-specific PTX features.
213229
foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
214230
set(arch_compile_options)
231+
foreach(arch IN LISTS NVTE_STANDARD_ARCHS)
232+
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
233+
endforeach()
215234
foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS)
216235
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
217236
endforeach()
@@ -232,6 +251,10 @@ list(APPEND transformer_engine_SOURCES
232251
endif()
233252

234253
add_library(transformer_engine SHARED ${transformer_engine_SOURCES})
254+
# Disable CMake's automatic architecture flag injection.
255+
# All architectures are handled explicitly via per-source COMPILE_OPTIONS
256+
# using NVTE_STANDARD_ARCHS, NVTE_GENERIC_ARCHS, and NVTE_SPECIFIC_ARCHS above.
257+
set_target_properties(transformer_engine PROPERTIES CUDA_ARCHITECTURES OFF)
235258
target_include_directories(transformer_engine PUBLIC
236259
"${CMAKE_CURRENT_SOURCE_DIR}/include")
237260

0 commit comments

Comments
 (0)