@@ -36,7 +36,11 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
3636 endif ()
3737endif ()
3838
39- # Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures
39+ # Process CMAKE_CUDA_ARCHITECTURES to separate standard, generic, and specific architectures.
40+ # - NVTE_STANDARD_ARCHS: pre-Blackwell archs (e.g. 75, 80, 89, 90). Applied to all CUDA sources.
41+ # - NVTE_GENERIC_ARCHS: Blackwell family heads (e.g. 100, 120). Applied to non-arch-specific sources only.
42+ # - NVTE_SPECIFIC_ARCHS: Blackwell specific targets (e.g. 100a, 120f). Applied to arch-specific sources only.
# Clear the three architecture buckets so that the list(APPEND ...) calls
# further down start from an empty list regardless of any earlier value.
unset(NVTE_STANDARD_ARCHS)
unset(NVTE_GENERIC_ARCHS)
unset(NVTE_SPECIFIC_ARCHS)
4246
@@ -79,6 +83,12 @@ if(NOT arch_120_index EQUAL -1)
7983 endif ()
8084endif ()
8185
# Collect every architecture still listed in CMAKE_CUDA_ARCHITECTURES into
# NVTE_STANDARD_ARCHS. These standard (pre-Blackwell) architectures are applied
# to all CUDA sources, both generic and arch-specific.
#
# The values are appended verbatim, with no padding: each entry is later
# spliced into "--generate-code=arch=compute_<arch>,code=sm_<arch>", so any
# whitespace around the number would corrupt the compiler flag.
# NOTE(review): entries such as "native", "all" or "80-real" would not form a
# valid compute_/sm_ pair; confirm they are rejected or normalised upstream.
list(APPEND NVTE_STANDARD_ARCHS ${CMAKE_CUDA_ARCHITECTURES})
91+
# cuDNN frontend API
# Header directory of the cudnn-frontend checkout under 3rdparty/, resolved
# relative to the directory containing this CMakeLists.txt. The path must not
# contain a space between the variable expansion and the relative suffix.
set(CUDNN_FRONTEND_INCLUDE_DIR
    "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
@@ -192,9 +202,13 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s
192202 ${transformer_engine_cuda_sources}
193203 ${transformer_engine_cpp_sources} )
194204
195- # Set compile options for CUDA sources with generic architectures
205+ # Set compile options for CUDA sources with generic architectures.
206+ # These get standard archs (pre-Blackwell) + generic Blackwell family heads.
196207foreach (cuda_source IN LISTS transformer_engine_cuda_sources)
# Build the nvcc architecture flags for this generic CUDA source: one
# --generate-code entry per standard arch, followed by one per generic arch.
# Each flag is a single whitespace-free token; a space inside the quoted
# string would be passed to nvcc as part of the option and break it.
set(arch_compile_options)
foreach(arch IN LISTS NVTE_STANDARD_ARCHS NVTE_GENERIC_ARCHS)
  list(APPEND arch_compile_options
       "--generate-code=arch=compute_${arch},code=sm_${arch}")
endforeach()
@@ -209,9 +223,14 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
209223 endif ()
210224endforeach ()
211225
212- # Set compile options for CUDA sources with specific architectures
226+ # Set compile options for CUDA sources with arch-specific features.
227+ # These get standard archs (pre-Blackwell) + Blackwell specific targets (a/f suffix).
228+ # They must NOT get generic Blackwell archs, as they use family/arch-specific PTX features.
213229foreach (cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
# Build the nvcc architecture flags for this arch-specific CUDA source: one
# --generate-code entry per standard arch, followed by one per specific
# (suffixed) arch. NVTE_GENERIC_ARCHS is deliberately not included here.
# Each flag is a single whitespace-free token; a space inside the quoted
# string would be passed to nvcc as part of the option and break it.
set(arch_compile_options)
foreach(arch IN LISTS NVTE_STANDARD_ARCHS NVTE_SPECIFIC_ARCHS)
  list(APPEND arch_compile_options
       "--generate-code=arch=compute_${arch},code=sm_${arch}")
endforeach()
@@ -232,6 +251,10 @@ list(APPEND transformer_engine_SOURCES
232251endif ()
233252
add_library(transformer_engine SHARED ${transformer_engine_SOURCES})

# Switch off CMake's automatic CUDA architecture flags for this target.
# Architecture selection is done explicitly through per-source COMPILE_OPTIONS
# built from NVTE_STANDARD_ARCHS, NVTE_GENERIC_ARCHS and NVTE_SPECIFIC_ARCHS,
# so letting CMake add its own flags as well would duplicate or contradict them.
set_property(TARGET transformer_engine PROPERTY CUDA_ARCHITECTURES OFF)
# Expose the public headers in include/ (next to this CMakeLists.txt) to the
# library itself and to every target that links against it.
target_include_directories(transformer_engine PUBLIC
                           "${CMAKE_CURRENT_SOURCE_DIR}/include")
237260
0 commit comments