From 7cff674a175e68ad8762f36e1dbe53f505bd6109 Mon Sep 17 00:00:00 2001 From: pfischill Date: Wed, 29 Apr 2026 15:22:23 +0200 Subject: [PATCH 01/39] Add IPPL_ENABLE_FINUFFT/CUFFTMP options and FetchContent integration - Top-level CMakeLists.txt: add option(IPPL_ENABLE_FINUFFT) and option(IPPL_ENABLE_CUFFTMP), both defaulting OFF. - cmake/Dependencies.cmake: add FetchContent block for (CU)FINUFFT and the cuFFTMp / NVSHMEM discovery block. Bump Kokkos default to 5.0.0 and enable Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE. - cmake/InstallIppl.cmake: install finufft / cufinufft targets when built in-tree. - cmake/AddIpplTest.cmake: pass TEST_LINK_LIBS through to non-integration tests as well. - src/CMakeLists.txt: link Heffte/finufft/cufinufft only when FFT and the corresponding option are enabled; embed build metadata as an opt-in to reduce rebuild churn. - .gitignore: exclude .idea*, cmake-build-*. --- .gitignore | 4 + CMakeLists.txt | 3 + cmake/AddIpplTest.cmake | 4 + cmake/AutoTunePresets.cmake | 140 +++++++++++++++++ cmake/Dependencies.cmake | 95 +++++++++++- cmake/InstallIppl.cmake | 8 +- cmake/IpplAutoTunePresets.h.in | 15 ++ cmake/auto_tune/README.md | 37 +++++ .../auto_tune/sm_90/gather_sweep_optimal.csv | 2 + .../auto_tune/sm_90/tile_sweep_sa_optimal.csv | 145 ++++++++++++++++++ src/CMakeLists.txt | 22 ++- 11 files changed, 468 insertions(+), 7 deletions(-) create mode 100644 cmake/AutoTunePresets.cmake create mode 100644 cmake/IpplAutoTunePresets.h.in create mode 100644 cmake/auto_tune/README.md create mode 100644 cmake/auto_tune/sm_90/gather_sweep_optimal.csv create mode 100644 cmake/auto_tune/sm_90/tile_sweep_sa_optimal.csv diff --git a/.gitignore b/.gitignore index ce4b1e9f1..570288809 100644 --- a/.gitignore +++ b/.gitignore @@ -20,5 +20,9 @@ build*/ .vscode .cache +# IDEA IDE +.idea* +cmake-build-* + # mac .DS_Store diff --git a/CMakeLists.txt b/CMakeLists.txt index 54c104885..a2a325bda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,9 @@ option(IPPL_MARK_FAILING_TESTS OFF) option(IPPL_ENABLE_SCRIPTS "Generate job script templates for some benchmarks/tests" OFF) +option(IPPL_ENABLE_FINUFFT "Enable (CU)FINUFFT" OFF) +option(IPPL_ENABLE_CUFFTMP "Enable cuFFTMp distributed FFT backend" OFF) + # "Build IPPL as a shared library (ON) or static library (OFF)" OFF) if(IPPL_DYL) # set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE) message(WARNING "IPPL_DYL is deprecated; use # -DBUILD_SHARED_LIBS=ON instead.") endif() diff --git a/cmake/AddIpplTest.cmake b/cmake/AddIpplTest.cmake index d4476392e..9985152bb 100644 --- a/cmake/AddIpplTest.cmake +++ b/cmake/AddIpplTest.cmake @@ -203,4 +203,8 @@ function(add_ippl_test TEST_NAME) set_tests_properties(${_ctest_name} PROPERTIES ${TEST_PROPERTIES}) endif() endif() + + if(TEST_LINK_LIBS AND NOT TEST_INTEGRATION) + target_link_libraries(${TEST_NAME} PRIVATE ${TEST_LINK_LIBS}) + endif() endfunction() diff --git a/cmake/AutoTunePresets.cmake b/cmake/AutoTunePresets.cmake new file mode 100644 index 000000000..95ce06a6f --- /dev/null +++ b/cmake/AutoTunePresets.cmake @@ -0,0 +1,140 @@ +# ============================================================================ +# AutoTunePresets.cmake +# +# Detect the build's exec-space arch tag, copy any matching preset CSVs from +# `cmake/auto_tune//` into the build's `share/ippl/auto_tune/`, and +# expose the resulting path to the library via the generated header +# `IpplAutoTunePresets.h`. +# +# At runtime, TileSizeCache::load() and GatherCache::load() consult that +# path after env / cwd lookups, so a fresh checkout on a known arch already +# uses tuned parameters without anyone running the sweep. +# ============================================================================ + +function(ippl_configure_autotune_presets) + # ---- Pick a tag that matches the layout under cmake/auto_tune/. ------- + # + # Try Kokkos_ARCH_* cache variables first (they're reliable even when + # CMAKE_CUDA_ARCHITECTURES / CMAKE_HIP_ARCHITECTURES is "native"). Fall + # back to the numeric arch list if Kokkos didn't pin one. + set(_tag "") + + if("CUDA" IN_LIST IPPL_PLATFORMS) + set(_arch_map + "KEPLER30:30" "KEPLER32:32" "KEPLER35:35" "KEPLER37:37" + "MAXWELL50:50" "MAXWELL52:52" "MAXWELL53:53" + "PASCAL60:60" "PASCAL61:61" + "VOLTA70:70" "VOLTA72:72" + "TURING75:75" + "AMPERE80:80" "AMPERE86:86" "AMPERE87:87" + "ADA89:89" + "HOPPER90:90" + "BLACKWELL100:100" "BLACKWELL120:120") + foreach(_entry ${_arch_map}) + string(REPLACE ":" ";" _pair "${_entry}") + list(GET _pair 0 _name) + list(GET _pair 1 _sm) + if(Kokkos_ARCH_${_name}) + set(_tag "sm_${_sm}") + break() + endif() + endforeach() + + if(NOT _tag AND CMAKE_CUDA_ARCHITECTURES) + list(GET CMAKE_CUDA_ARCHITECTURES 0 _first_arch) + string(REGEX REPLACE "[^0-9].*$" "" _first_arch "${_first_arch}") + if(_first_arch) + set(_tag "sm_${_first_arch}") + endif() + endif() + elseif("HIP" IN_LIST IPPL_PLATFORMS) + # Kokkos uses two naming conventions across versions: AMD_GFX* (newer) + # and VEGA*/NAVI* (older). Check both. + set(_hip_arch_map + # AMD_GFX* (Kokkos >= ~4.x) + "AMD_GFX906:gfx906" "AMD_GFX908:gfx908" "AMD_GFX90A:gfx90a" + "AMD_GFX940:gfx940" "AMD_GFX942:gfx942" + "AMD_GFX1030:gfx1030" "AMD_GFX1100:gfx1100" "AMD_GFX1103:gfx1103" + # VEGA*/NAVI* (older Kokkos) + "VEGA906:gfx906" "VEGA908:gfx908" "VEGA90A:gfx90a" + "VEGA940:gfx940" "VEGA942:gfx942" + "NAVI1030:gfx1030" "NAVI1100:gfx1100") + foreach(_entry ${_hip_arch_map}) + string(REPLACE ":" ";" _pair "${_entry}") + list(GET _pair 0 _name) + list(GET _pair 1 _gfx) + if(Kokkos_ARCH_${_name}) + set(_tag "${_gfx}") + break() + endif() + endforeach() + + if(NOT _tag AND CMAKE_HIP_ARCHITECTURES) + list(GET CMAKE_HIP_ARCHITECTURES 0 _first_arch) + # CMAKE_HIP_ARCHITECTURES entries already look like "gfx90a"; strip + # any trailing flags / colons just in case. + string(REGEX REPLACE "[:].*$" "" _first_arch "${_first_arch}") + if(_first_arch) + set(_tag "${_first_arch}") + endif() + endif() + elseif("OPENMP" IN_LIST IPPL_PLATFORMS) + set(_tag "openmp") + else() + set(_tag "serial") + endif() + + set(_src_dir "${CMAKE_SOURCE_DIR}/cmake/auto_tune/${_tag}") + set(_dst_dir "${CMAKE_BINARY_DIR}/share/ippl/auto_tune") + + # Wipe any stale presets from a previous configure (e.g. arch changed or the + # source preset directory was emptied). Otherwise the runtime would happily + # keep loading a CSV produced for a different backend, leading to + # team_size-too-large aborts on host backends. + file(REMOVE + "${_dst_dir}/tile_sweep_sa_optimal.csv" + "${_dst_dir}/gather_sweep_optimal.csv") + + file(MAKE_DIRECTORY "${_dst_dir}") + + set(_have_scatter FALSE) + set(_have_gather FALSE) + + if(_tag AND IS_DIRECTORY "${_src_dir}") + if(EXISTS "${_src_dir}/tile_sweep_sa_optimal.csv") + configure_file("${_src_dir}/tile_sweep_sa_optimal.csv" + "${_dst_dir}/tile_sweep_sa_optimal.csv" COPYONLY) + set(_have_scatter TRUE) + endif() + if(EXISTS "${_src_dir}/gather_sweep_optimal.csv") + configure_file("${_src_dir}/gather_sweep_optimal.csv" + "${_dst_dir}/gather_sweep_optimal.csv" COPYONLY) + set(_have_gather TRUE) + endif() + endif() + + if(_have_scatter OR _have_gather) + message(STATUS "📊 IPPL auto-tune presets: using ${_tag} (" + "scatter=${_have_scatter}, gather=${_have_gather})") + else() + if(_tag) + message(STATUS "📊 IPPL auto-tune presets: none for ${_tag} (drop CSVs in ${_src_dir})") + else() + message(STATUS "📊 IPPL auto-tune presets: no tag resolved") + endif() + endif() + + # Bake into a generated header consumed by TileSizeCache / GatherCache. + set(IPPL_AUTOTUNE_PRESET_DIR "${_dst_dir}") + set(IPPL_AUTOTUNE_ARCH_TAG "${_tag}") + configure_file( + "${CMAKE_SOURCE_DIR}/cmake/IpplAutoTunePresets.h.in" + "${CMAKE_BINARY_DIR}/include/IpplAutoTunePresets.h" + @ONLY) + + # Install the preset directory next to the library so installed binaries + # can find it (TileSizeCache also tries an install-relative fallback). + install(DIRECTORY "${_dst_dir}/" + DESTINATION "share/ippl/auto_tune" + FILES_MATCHING PATTERN "*.csv") +endfunction() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 70bb39314..0d4eb53ae 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -41,7 +41,8 @@ endif() # ------------------------------------------------------------------------------ if("OPENMP" IN_LIST IPPL_PLATFORMS) find_package(OpenMP REQUIRED) - colour_message(STATUS ${Green} "✅ OpenMP platform requested OpenMP found ${OPENMP_VERSION}") + colour_message(STATUS ${Green} + "✅ OpenMP platform requested, OpenMP found ${OpenMP_CXX_VERSION}") endif() # ------------------------------------------------------------------------------ @@ -113,6 +114,8 @@ function(set_kokkos_options) set(Kokkos_ENABLE_LIBDL ON CACHE BOOL "Enable LIBDL" FORCE) endif() endif() + + set(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE ON) endfunction() # ----------------------------------------------------------------------------- @@ -168,7 +171,7 @@ endfunction() # ------------------------------------------------------------------------------ # set the default version of kokkos we will ask for if not already set if(NOT Kokkos_VERSION_DEFAULT) - set(Kokkos_VERSION_DEFAULT 4.7.01) + set(Kokkos_VERSION_DEFAULT 5.0.0) endif() # if the user has not asked for a specific version, we will use a default if(NOT Kokkos_VERSION) @@ -317,3 +320,91 @@ if(IPPL_ENABLE_TESTS) "${DOWNLOADED_HEADERS_DIR}/stb_image_write.h") message(STATUS "✅ stb_image_write loaded for testing FDTD solver.") endif() + +# ------------------------------------------------------------------------------ +# (CU)FINUFFT +# ------------------------------------------------------------------------------ +if(IPPL_ENABLE_FFT AND IPPL_ENABLE_FINUFFT) + message(STATUS "Fetching (CU)FINUFFT") + FetchContent_Declare( + finufft + GIT_REPOSITORY https://github.com/flatironinstitute/finufft.git + GIT_SHALLOW TRUE + ) + if("CUDA" IN_LIST IPPL_PLATFORMS) + set(FINUFFT_USE_CUDA ON CACHE BOOL "") + add_compile_definitions(ENABLE_GPU_NUFFT) + add_compile_definitions(FINUFFT_USE_CUDA) + endif() + set(FINUFFT_USE_CPU ON CACHE BOOL "") + + # cufinufft's CUDA RDC fatbin registration segfaults at startup when its + # device code is wrapped in a .so. Force static for the FetchContent build + # regardless of the global BUILD_SHARED_LIBS setting. + set(_ippl_saved_bsl ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS OFF) + + # Kokkos updates CMAKE_CUDA_ARCHITECTURES only inside its own subproject + # scope. A downstream FetchContent like cufinufft would otherwise inherit + # CMake's default arch (which doesn't match the GPU) and emit kernels that + # fail at launch with cudaErrorInvalidResourceHandle. Translate the + # Kokkos_ARCH_* selection into CMAKE_CUDA_ARCHITECTURES once, here, so + # every downstream CUDA TU sees it. + if("CUDA" IN_LIST IPPL_PLATFORMS) + set(_ippl_arch_map + "KEPLER30:30" "KEPLER32:32" "KEPLER35:35" "KEPLER37:37" + "MAXWELL50:50" "MAXWELL52:52" "MAXWELL53:53" + "PASCAL60:60" "PASCAL61:61" + "VOLTA70:70" "VOLTA72:72" + "TURING75:75" + "AMPERE80:80" "AMPERE86:86" "AMPERE87:87" + "ADA89:89" + "HOPPER90:90" + "BLACKWELL100:100" "BLACKWELL120:120") + foreach(_entry ${_ippl_arch_map}) + string(REPLACE ":" ";" _pair ${_entry}) + list(GET _pair 0 _name) + list(GET _pair 1 _sm) + if(Kokkos_ARCH_${_name}) + set(CMAKE_CUDA_ARCHITECTURES ${_sm} CACHE STRING "" FORCE) + break() + endif() + endforeach() + endif() + + FetchContent_MakeAvailable(finufft) + + set(BUILD_SHARED_LIBS ${_ippl_saved_bsl}) + + add_compile_definitions(ENABLE_FINUFFT) +endif() + +# ------------------------------------------------------------------------------ +# CuFFTMp +# ------------------------------------------------------------------------------ +# CuFFTMp is opt-in; users must point NVSHMEM_HOME (and optionally +# CUFFTMP_ROOT via CMAKE_PREFIX_PATH) at their installation. +if(IPPL_ENABLE_FFT AND IPPL_ENABLE_CUFFTMP) + set(NVSHMEM_HOME $ENV{NVSHMEM_HOME}) + + find_library(NVSHMEM_HOST_LIBRARY + NAMES nvshmem_host + HINTS ${NVSHMEM_HOME} + PATH_SUFFIXES lib + ) + + find_library(CUFFTMP_LIBRARY + NAMES cufftMp + ) + + if(NVSHMEM_HOST_LIBRARY) + message(STATUS "Found NVSHMEM host library: ${NVSHMEM_HOST_LIBRARY}") + set(NVSHMEM_FOUND TRUE) + else() + message(FATAL_ERROR "NVSHMEM not found. Set NVSHMEM_HOME to your installation directory.") + set(NVSHMEM_FOUND FALSE) + endif() + + # The actual link is done at the ippl target in src/CMakeLists.txt + add_compile_definitions(IPPL_ENABLE_CUFFTMP) +endif() diff --git a/cmake/InstallIppl.cmake b/cmake/InstallIppl.cmake index ed0a2656d..4733cb542 100644 --- a/cmake/InstallIppl.cmake +++ b/cmake/InstallIppl.cmake @@ -121,6 +121,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/IPPLConfig.cmake" # Fix/Hack: Ensure extern dependencies are exported correctly if they were built in-tree. This is # needed for Heffte because it doesn't fully use CMake's export target mechanism # ------------------------------------------------------- -if(TARGET Heffte) - install(TARGETS Heffte EXPORT ipplTargets DESTINATION lib) -endif() +foreach(_ippl_extern_dep IN ITEMS Heffte finufft finufft_common cufinufft) + if(TARGET ${_ippl_extern_dep}) + install(TARGETS ${_ippl_extern_dep} EXPORT ipplTargets DESTINATION lib) + endif() +endforeach() diff --git a/cmake/IpplAutoTunePresets.h.in b/cmake/IpplAutoTunePresets.h.in new file mode 100644 index 000000000..feb6cdf1a --- /dev/null +++ b/cmake/IpplAutoTunePresets.h.in @@ -0,0 +1,15 @@ +#ifndef IPPL_AUTO_TUNE_PRESETS_H +#define IPPL_AUTO_TUNE_PRESETS_H + +// Generated by cmake/AutoTunePresets.cmake. + +// Absolute path to the build-tree directory holding the per-arch preset +// CSVs that ship with this IPPL build. Empty string if no preset matched +// the configured arch. +#define IPPL_AUTOTUNE_PRESET_DIR "@IPPL_AUTOTUNE_PRESET_DIR@" + +// Short human-readable label for the resolved arch (e.g. "sm_90", "openmp", +// "serial"). Used for log messages. +#define IPPL_AUTOTUNE_ARCH_TAG "@IPPL_AUTOTUNE_ARCH_TAG@" + +#endif // IPPL_AUTO_TUNE_PRESETS_H diff --git a/cmake/auto_tune/README.md b/cmake/auto_tune/README.md new file mode 100644 index 000000000..81d4419ae --- /dev/null +++ b/cmake/auto_tune/README.md @@ -0,0 +1,37 @@ +# Auto-tune presets + +Pre-generated scatter / gather sweep CSVs that ship with IPPL. At configure +time, `ippl_configure_autotune_presets()` (in `cmake/AutoTunePresets.cmake`) +picks the subdirectory that matches the current build: + +| Build | Tag | Lookup directory | +|---------------------------------------------------------|-----------|---------------------------| +| `IPPL_PLATFORMS=CUDA`, `Kokkos_ARCH_HOPPER90` | `sm_90` | `cmake/auto_tune/sm_90/` | +| `IPPL_PLATFORMS=CUDA`, `Kokkos_ARCH_AMPERE86` | `sm_86` | `cmake/auto_tune/sm_86/` | +| `IPPL_PLATFORMS=HIP`, `Kokkos_ARCH_AMD_GFX942` | `gfx942` | `cmake/auto_tune/gfx942/` | +| `IPPL_PLATFORMS=HIP`, `Kokkos_ARCH_AMD_GFX90A` | `gfx90a` | `cmake/auto_tune/gfx90a/` | +| `IPPL_PLATFORMS=OPENMP` (no GPU) | `openmp` | `cmake/auto_tune/openmp/` | +| Serial only | `serial` | `cmake/auto_tune/serial/` | + +If the matching directory contains `tile_sweep_sa_optimal.csv` and/or +`gather_sweep_optimal.csv`, those files are copied into +`/share/ippl/auto_tune/` and the build-tree path is baked into the +library via the generated header `IpplAutoTunePresets.h`. + +At runtime, `TileSizeCache::load()` and `GatherCache::load()` consult the +following sources in order: + +1. `IPPL_TILE_CSV` / `IPPL_GATHER_CSV` env var +2. `tile_sweep_sa_optimal.csv` / `gather_sweep_optimal.csv` in cwd +3. The shipped preset for this build's arch (this directory) +4. Built-in defaults seeded by `Ippl::initialize` + +To add a new preset: + +1. Build IPPL for the target arch. +2. Run any executable with `IPPL_AUTO_TUNE=full` to produce the two CSVs in + the run directory. +3. Copy them into `cmake/auto_tune//` and commit. + +Subsequent IPPL builds for that arch will pick up the CSVs automatically; +no env var or extra steps are needed at runtime. diff --git a/cmake/auto_tune/sm_90/gather_sweep_optimal.csv b/cmake/auto_tune/sm_90/gather_sweep_optimal.csv new file mode 100644 index 000000000..bfa99857c --- /dev/null +++ b/cmake/auto_tune/sm_90/gather_sweep_optimal.csv @@ -0,0 +1,2 @@ +method,kernel_width,tile_x,tile_y,tile_z,throughput_Mpts_s +Atomic,2,1,1,1,29494.75 diff --git a/cmake/auto_tune/sm_90/tile_sweep_sa_optimal.csv b/cmake/auto_tune/sm_90/tile_sweep_sa_optimal.csv new file mode 100644 index 000000000..1948bfc7c --- /dev/null +++ b/cmake/auto_tune/sm_90/tile_sweep_sa_optimal.csv @@ -0,0 +1,145 @@ +method,value_type,kernel_width,rho,best_tile_x,best_tile_y,best_tile_z,best_team_size,best_oversubscription_factor,best_z_batches,throughput_Mpts_s,time_ms,kernel_evaluations,preflight_rejections +Atomic,real,1,0.5000,1,1,1,32,1,1,666.69,0.0000,0,0 +Tiled,real,1,0.5000,8,8,8,32,4,1,96.26,0.0000,0,0 +OutputFocused,real,1,0.5000,5,5,5,256,4,2,79.10,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,32,1,1,658.94,0.0000,0,0 +Tiled,real,2,0.5000,2,2,2,128,1,1,95.22,0.0000,0,0 +OutputFocused,real,2,0.5000,4,4,4,128,2,8,79.09,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,32,1,1,1651.65,0.0000,0,0 +Tiled,real,1,2.0000,5,5,5,128,4,1,356.06,0.0000,0,0 +OutputFocused,real,1,2.0000,2,2,2,128,4,8,198.88,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,32,1,1,1642.38,0.0000,0,0 +Tiled,real,2,2.0000,5,5,5,256,1,1,355.88,0.0000,0,0 +OutputFocused,real,2,2.0000,5,5,5,512,1,2,198.88,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,32,1,1,2654.68,0.0000,0,0 +Tiled,real,1,8.0000,4,4,4,256,1,1,1140.51,0.0000,0,0 +OutputFocused,real,1,8.0000,5,5,5,128,4,8,363.52,0.0000,0,0 +Atomic,real,2,8.0000,1,1,1,32,1,1,2646.96,0.0000,0,0 +Tiled,real,2,8.0000,8,8,8,128,4,1,1145.45,0.0000,0,0 +OutputFocused,real,2,8.0000,6,6,6,512,4,4,359.74,0.0000,0,0 +Atomic,real,1,32.0000,1,1,1,32,1,1,3125.03,0.0000,0,0 +Tiled,real,1,32.0000,2,2,2,128,4,1,2557.89,0.0000,0,0 +OutputFocused,real,1,32.0000,5,5,5,64,1,4,423.01,0.0000,0,0 +Atomic,real,2,32.0000,1,1,1,32,1,1,3124.43,0.0000,0,0 +Tiled,real,2,32.0000,6,6,6,64,4,1,2559.89,0.0000,0,0 +OutputFocused,real,2,32.0000,6,6,6,64,2,4,422.84,0.0000,0,0 +Atomic,real,1,0.5000,1,1,1,32,1,1,2190.45,0.0000,0,0 +Tiled,real,1,0.5000,8,8,8,32,4,1,653.40,0.0000,0,0 +OutputFocused,real,1,0.5000,3,3,3,256,2,8,492.15,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,32,1,1,2193.97,0.0000,0,0 +Tiled,real,2,0.5000,3,3,3,64,1,1,656.75,0.0000,0,0 +OutputFocused,real,2,0.5000,6,6,6,64,4,4,492.32,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,32,1,1,2944.76,0.0000,0,0 +Tiled,real,1,2.0000,5,5,5,64,1,1,2092.27,0.0000,0,0 +OutputFocused,real,1,2.0000,4,4,4,512,4,8,978.42,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,32,1,1,2938.93,0.0000,0,0 +Tiled,real,2,2.0000,4,4,4,32,1,1,2093.60,0.0000,0,0 +OutputFocused,real,2,2.0000,3,3,3,512,2,8,977.89,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,32,1,1,3214.24,0.0000,0,0 +Tiled,real,1,8.0000,5,5,5,128,4,1,3948.80,0.0000,0,0 +OutputFocused,real,1,8.0000,4,4,4,512,4,4,1224.44,0.0000,0,0 +Atomic,real,2,8.0000,1,1,1,32,1,1,3216.60,0.0000,0,0 +Tiled,real,2,8.0000,4,4,4,128,1,1,3962.42,0.0000,0,0 +OutputFocused,real,2,8.0000,5,5,5,512,2,2,1224.40,0.0000,0,0 +Atomic,real,1,32.0000,1,1,1,32,1,1,3296.90,0.0000,0,0 +Tiled,real,1,32.0000,8,8,8,128,1,1,5430.52,0.0000,0,0 +OutputFocused,real,1,32.0000,4,4,4,128,2,2,1331.69,0.0000,0,0 +Atomic,real,2,32.0000,1,1,1,32,1,1,3288.92,0.0000,0,0 +Tiled,real,2,32.0000,2,2,2,256,1,1,5338.84,0.0000,0,0 +OutputFocused,real,2,32.0000,6,6,6,512,4,1,1333.33,0.0000,0,0 +Atomic,real,1,0.5000,1,1,1,32,1,1,3085.32,0.0000,0,0 +Tiled,real,1,0.5000,4,4,4,256,2,1,2426.80,0.0000,0,0 +OutputFocused,real,1,0.5000,3,3,3,64,4,2,1144.89,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,32,1,1,3038.68,0.0000,0,0 +Tiled,real,2,0.5000,4,4,4,64,4,1,2292.99,0.0000,0,0 +OutputFocused,real,2,0.5000,6,6,6,64,2,2,1143.69,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,32,1,1,3259.23,0.0000,0,0 +Tiled,real,1,2.0000,5,5,5,64,2,1,4282.99,0.0000,0,0 +OutputFocused,real,1,2.0000,3,3,3,64,1,4,1245.79,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,32,1,1,3245.75,0.0000,0,0 +Tiled,real,2,2.0000,3,3,3,64,4,1,4165.13,0.0000,0,0 +OutputFocused,real,2,2.0000,4,4,4,512,2,4,1246.08,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,32,1,1,3308.47,0.0000,0,0 +Tiled,real,1,8.0000,8,8,8,256,1,1,5417.90,0.0000,0,0 +OutputFocused,real,1,8.0000,2,2,2,256,2,2,1297.83,0.0000,0,0 +Atomic,real,2,8.0000,1,1,1,32,1,1,3303.67,0.0000,0,0 +Tiled,real,2,8.0000,6,6,6,32,2,1,5371.93,0.0000,0,0 +OutputFocused,real,2,8.0000,5,5,5,256,1,4,1298.39,0.0000,0,0 +Atomic,real,1,32.0000,1,1,1,32,1,1,3318.94,0.0000,0,0 +Tiled,real,1,32.0000,6,6,6,256,2,1,5728.31,0.0000,0,0 +OutputFocused,real,1,32.0000,6,6,6,512,2,8,1315.34,0.0000,0,0 +Atomic,real,2,32.0000,1,1,1,32,1,1,3320.60,0.0000,0,0 +Tiled,real,2,32.0000,3,3,3,128,4,1,5742.30,0.0000,0,0 +OutputFocused,real,2,32.0000,6,6,6,512,1,1,1315.69,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,32,1,1,666.69,0.0000,0,0 +Tiled,complex,1,0.5000,8,8,8,32,4,1,96.26,0.0000,0,0 +OutputFocused,complex,1,0.5000,5,5,5,256,4,2,79.10,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,32,1,1,658.94,0.0000,0,0 +Tiled,complex,2,0.5000,2,2,2,128,1,1,95.22,0.0000,0,0 +OutputFocused,complex,2,0.5000,4,4,4,128,2,8,79.09,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,32,1,1,1651.65,0.0000,0,0 +Tiled,complex,1,2.0000,5,5,5,128,4,1,356.06,0.0000,0,0 +OutputFocused,complex,1,2.0000,2,2,2,128,4,8,198.88,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,32,1,1,1642.38,0.0000,0,0 +Tiled,complex,2,2.0000,5,5,5,256,1,1,355.88,0.0000,0,0 +OutputFocused,complex,2,2.0000,5,5,5,512,1,2,198.88,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,32,1,1,2654.68,0.0000,0,0 +Tiled,complex,1,8.0000,4,4,4,256,1,1,1140.51,0.0000,0,0 +OutputFocused,complex,1,8.0000,5,5,5,128,4,8,363.52,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,32,1,1,2646.96,0.0000,0,0 +Tiled,complex,2,8.0000,8,8,8,128,4,1,1145.45,0.0000,0,0 +OutputFocused,complex,2,8.0000,6,6,6,512,4,4,359.74,0.0000,0,0 +Atomic,complex,1,32.0000,1,1,1,32,1,1,3125.03,0.0000,0,0 +Tiled,complex,1,32.0000,2,2,2,128,4,1,2557.89,0.0000,0,0 +OutputFocused,complex,1,32.0000,5,5,5,64,1,4,423.01,0.0000,0,0 +Atomic,complex,2,32.0000,1,1,1,32,1,1,3124.43,0.0000,0,0 +Tiled,complex,2,32.0000,6,6,6,64,4,1,2559.89,0.0000,0,0 +OutputFocused,complex,2,32.0000,6,6,6,64,2,4,422.84,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,32,1,1,2190.45,0.0000,0,0 +Tiled,complex,1,0.5000,8,8,8,32,4,1,653.40,0.0000,0,0 +OutputFocused,complex,1,0.5000,3,3,3,256,2,8,492.15,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,32,1,1,2193.97,0.0000,0,0 +Tiled,complex,2,0.5000,3,3,3,64,1,1,656.75,0.0000,0,0 +OutputFocused,complex,2,0.5000,6,6,6,64,4,4,492.32,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,32,1,1,2944.76,0.0000,0,0 +Tiled,complex,1,2.0000,5,5,5,64,1,1,2092.27,0.0000,0,0 +OutputFocused,complex,1,2.0000,4,4,4,512,4,8,978.42,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,32,1,1,2938.93,0.0000,0,0 +Tiled,complex,2,2.0000,4,4,4,32,1,1,2093.60,0.0000,0,0 +OutputFocused,complex,2,2.0000,3,3,3,512,2,8,977.89,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,32,1,1,3214.24,0.0000,0,0 +Tiled,complex,1,8.0000,5,5,5,128,4,1,3948.80,0.0000,0,0 +OutputFocused,complex,1,8.0000,4,4,4,512,4,4,1224.44,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,32,1,1,3216.60,0.0000,0,0 +Tiled,complex,2,8.0000,4,4,4,128,1,1,3962.42,0.0000,0,0 +OutputFocused,complex,2,8.0000,5,5,5,512,2,2,1224.40,0.0000,0,0 +Atomic,complex,1,32.0000,1,1,1,32,1,1,3296.90,0.0000,0,0 +Tiled,complex,1,32.0000,8,8,8,128,1,1,5430.52,0.0000,0,0 +OutputFocused,complex,1,32.0000,4,4,4,128,2,2,1331.69,0.0000,0,0 +Atomic,complex,2,32.0000,1,1,1,32,1,1,3288.92,0.0000,0,0 +Tiled,complex,2,32.0000,2,2,2,256,1,1,5338.84,0.0000,0,0 +OutputFocused,complex,2,32.0000,6,6,6,512,4,1,1333.33,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,32,1,1,3085.32,0.0000,0,0 +Tiled,complex,1,0.5000,4,4,4,256,2,1,2426.80,0.0000,0,0 +OutputFocused,complex,1,0.5000,3,3,3,64,4,2,1144.89,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,32,1,1,3038.68,0.0000,0,0 +Tiled,complex,2,0.5000,4,4,4,64,4,1,2292.99,0.0000,0,0 +OutputFocused,complex,2,0.5000,6,6,6,64,2,2,1143.69,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,32,1,1,3259.23,0.0000,0,0 +Tiled,complex,1,2.0000,5,5,5,64,2,1,4282.99,0.0000,0,0 +OutputFocused,complex,1,2.0000,3,3,3,64,1,4,1245.79,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,32,1,1,3245.75,0.0000,0,0 +Tiled,complex,2,2.0000,3,3,3,64,4,1,4165.13,0.0000,0,0 +OutputFocused,complex,2,2.0000,4,4,4,512,2,4,1246.08,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,32,1,1,3308.47,0.0000,0,0 +Tiled,complex,1,8.0000,8,8,8,256,1,1,5417.90,0.0000,0,0 +OutputFocused,complex,1,8.0000,2,2,2,256,2,2,1297.83,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,32,1,1,3303.67,0.0000,0,0 +Tiled,complex,2,8.0000,6,6,6,32,2,1,5371.93,0.0000,0,0 +OutputFocused,complex,2,8.0000,5,5,5,256,1,4,1298.39,0.0000,0,0 +Atomic,complex,1,32.0000,1,1,1,32,1,1,3318.94,0.0000,0,0 +Tiled,complex,1,32.0000,6,6,6,256,2,1,5728.31,0.0000,0,0 +OutputFocused,complex,1,32.0000,6,6,6,512,2,8,1315.34,0.0000,0,0 +Atomic,complex,2,32.0000,1,1,1,32,1,1,3320.60,0.0000,0,0 +Tiled,complex,2,32.0000,3,3,3,128,4,1,5742.30,0.0000,0,0 +OutputFocused,complex,2,32.0000,6,6,6,512,1,1,1315.69,0.0000,0,0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c4778b174..9451daeb8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,6 +47,14 @@ target_compile_options( $<$:-O3> >) +# Disable host LTO on the IPPL library when CUDA is enabled. Repeated +# `fatbinData` symbols in nvcc-generated fat binaries fail LTO merging, +# but this only matters for code that links against cuFFT/cuFFTMp/cufinufft +# — keep the rest of the build LTO-eligible. +if("CUDA" IN_LIST IPPL_PLATFORMS) + target_compile_options(ippl PRIVATE $<$:-fno-lto>) +endif() + get_target_property(_ippl_type ippl TYPE) if(_ippl_type) string(REPLACE "_LIBRARY" "" _ippl_type "${_ippl_type}") # -> STATIC / SHARED / OBJECT / INTERFACE @@ -67,7 +75,8 @@ target_sources(ippl PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/Ippl.cpp) target_include_directories( ippl PUBLIC $ $ - PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_BINARY_DIR}/include) # generated IpplAutoTunePresets.h add_subdirectory(Communicate) @@ -103,9 +112,18 @@ include(${PROJECT_SOURCE_DIR}/cmake/PlatformOptions.cmake) target_link_libraries(ippl PUBLIC Kokkos::kokkos MPI::MPI_CXX) if(IPPL_ENABLE_FFT) + if(IPPL_ENABLE_CUFFTMP) + target_link_libraries(ippl PUBLIC ${CUFFTMP_LIBRARY} ${NVSHMEM_HOST_LIBRARY}) + endif() + target_link_libraries(ippl PUBLIC Heffte::Heffte) + if(IPPL_ENABLE_FINUFFT) + target_link_libraries(ippl PUBLIC finufft) + if("CUDA" IN_LIST IPPL_PLATFORMS) + target_link_libraries(ippl PRIVATE cufinufft) + endif() + endif() endif() - # this alias should be created after all target properties have been set add_library(IPPL::ippl ALIAS ippl) From 13950c4f17398b79ad5e5d1f83ee7fdd4df5980a Mon Sep 17 00:00:00 2001 From: pfischill Date: Wed, 29 Apr 2026 15:26:56 +0200 Subject: [PATCH 02/39] FFT: refactor into Backend/Transform layers and add native NUFFT engine - Split the original FFT.h/FFT.hpp into: - Backend/{Backend,CuFFT,CuFFTMp,Heffte,Interface}.h: per-library backend selection and plan management. - Transform/{CC,RC,PrunedCC,PrunedRC,Trig,Common,Transform, NUFFT.{h,hpp}}.h: per-transform front-ends (complex-complex, real-complex, pruned variants, trigonometric, NUFFT). - Traits.h: backend availability traits and selection helpers. - Add a native Kokkos NUFFT (Type-1 / Type-2) engine under FFT/NUFFT/: NativeNUFFT.h driving the upsampling-FFT pipeline, ESKernel.h exponential-of-semicircle spreading kernel with width-specialised Estrin polynomial evaluators, Correction.h deconvolution factors, Quadrature.h Gauss-Legendre quadrature, NUFFTUtilities.h helpers. - The new NUFFT type plugs into the Transform layer via NUFFT.{h,hpp} and supports both the native engine and the (cu)FINUFFT backend behind IPPL_ENABLE_FINUFFT. --- src/FFT/Backend/Backend.h | 14 + src/FFT/Backend/CuFFT.h | 317 ++++++++++++++++++ src/FFT/Backend/CuFFTMp.h | 569 +++++++++++++++++++++++++++++++++ src/FFT/Backend/Heffte.h | 247 ++++++++++++++ src/FFT/FFT.h | 346 +------------------- src/FFT/FFT.hpp | 460 -------------------------- src/FFT/NUFFT/Correction.h | 388 ++++++++++++++++++++++ src/FFT/NUFFT/ESKernel.h | 446 ++++++++++++++++++++++++++ src/FFT/NUFFT/NUFFTUtilities.h | 65 ++++ src/FFT/NUFFT/NativeNUFFT.h | 346 ++++++++++++++++++++ src/FFT/Traits.h | 242 ++++++++++++++ src/FFT/Transform/CC.h | 75 +++++ src/FFT/Transform/Common.h | 47 +++ src/FFT/Transform/NUFFT.h | 263 +++++++++++++++ src/FFT/Transform/NUFFT.hpp | 531 ++++++++++++++++++++++++++++++ src/FFT/Transform/PrunedCC.h | 421 ++++++++++++++++++++++++ src/FFT/Transform/PrunedRC.h | 286 +++++++++++++++++ src/FFT/Transform/RC.h | 94 ++++++ src/FFT/Transform/Transform.h | 11 + src/FFT/Transform/Trig.h | 103 ++++++ 20 files changed, 4469 insertions(+), 802 deletions(-) create mode 100644 src/FFT/Backend/Backend.h create mode 100644 src/FFT/Backend/CuFFT.h create mode 100644 src/FFT/Backend/CuFFTMp.h create mode 100644 src/FFT/Backend/Heffte.h delete mode 100644 src/FFT/FFT.hpp create mode 100644 src/FFT/NUFFT/Correction.h create mode 100644 src/FFT/NUFFT/ESKernel.h create mode 100644 src/FFT/NUFFT/NUFFTUtilities.h create mode 100644 src/FFT/NUFFT/NativeNUFFT.h create mode 100644 src/FFT/Traits.h create mode 100644 src/FFT/Transform/CC.h create mode 100644 src/FFT/Transform/Common.h create mode 100644 src/FFT/Transform/NUFFT.h create mode 100644 src/FFT/Transform/NUFFT.hpp create mode 100644 src/FFT/Transform/PrunedCC.h create mode 100644 src/FFT/Transform/PrunedRC.h create mode 100644 src/FFT/Transform/RC.h create mode 100644 src/FFT/Transform/Transform.h create mode 100644 src/FFT/Transform/Trig.h diff --git a/src/FFT/Backend/Backend.h b/src/FFT/Backend/Backend.h new file mode 100644 index 000000000..005c02529 --- /dev/null +++ b/src/FFT/Backend/Backend.h @@ -0,0 +1,14 @@ +#ifndef IPPL_FFT_BACKEND_H +#define IPPL_FFT_BACKEND_H + +#include "FFT/Backend/Heffte.h" + +#ifdef IPPL_ENABLE_CUFFTMP +#include "FFT/Backend/CuFFTMp.h" +#endif + +#ifdef KOKKOS_ENABLE_CUDA +#include "FFT/Backend/CuFFT.h" +#endif + +#endif // IPPL_FFT_BACKEND_H diff --git a/src/FFT/Backend/CuFFT.h b/src/FFT/Backend/CuFFT.h new file mode 100644 index 000000000..0c9371894 --- /dev/null +++ b/src/FFT/Backend/CuFFT.h @@ -0,0 +1,317 @@ +#ifndef IPPL_FFT_BACKEND_CUFFT_H +#define IPPL_FFT_BACKEND_CUFFT_H + +#ifdef KOKKOS_ENABLE_CUDA + +#include +#include +#include +#include +#include + +#include + +#include "Utility/IpplException.h" +#include "Utility/ParameterList.h" + +namespace ippl { +namespace fft { + + namespace detail { + inline void checkCufftResult(cufftResult result, const char* context) { + if (result != CUFFT_SUCCESS) { + std::string msg = + std::string(context) + " (error code: " + std::to_string(result) + ")"; + throw IpplException("cuFFT", msg.c_str()); + } + } + + inline void checkCudaError(cudaError_t err, const char* context) { + if (err != cudaSuccess) { + std::string msg = std::string(context) + ": " + cudaGetErrorString(err); + throw IpplException("cuFFT", msg.c_str()); + } + } + } // namespace detail + + namespace detail { + template + __global__ void cufftScaleKernel(T* data, size_t n, double scale) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + data[idx].x *= scale; + data[idx].y *= scale; + } + } + } // namespace detail + + //========================================================================= + // CuFFTC2C - Single-node cuFFT with Batched Support + //========================================================================= + + template + class CuFFTC2C { + public: + static_assert(std::is_same_v, + "CuFFTC2C requires Kokkos::CudaSpace"); + static_assert(Dim == 3, "CuFFTC2C only supports 3D"); + static_assert(std::is_same_v || std::is_same_v, + "CuFFTC2C only supports float and double precision"); + + using complex_t = Kokkos::complex; + using cuda_complex_t = std::conditional_t, + cufftComplex, + cufftDoubleComplex>; + + static constexpr cufftType fft_type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + + // Constructor matching HeFFTe/cuFFTMp interface + CuFFTC2C(const heffte::box3d& inbox, + const heffte::box3d& outbox, + MPI_Comm comm, + const ParameterList& /*params*/, + int maxBatchSize = 1) + : maxBatchSize_(maxBatchSize) + , comm_(comm) + { + using detail::checkCudaError; + using detail::checkCufftResult; + + // Extract local dimensions from inbox + for (int d = 0; d < 3; ++d) { + lowerIn_[d] = inbox.low[d]; + upperIn_[d] = inbox.high[d] + 1; + lowerOut_[d] = outbox.low[d]; + upperOut_[d] = outbox.high[d] + 1; + localSize_[d] = upperIn_[d] - lowerIn_[d]; + } + + // Compute global size via MPI reduction + std::array localMax; + for (int d = 0; d < 3; ++d) { + localMax[d] = std::max(upperIn_[d], upperOut_[d]); + } + MPI_Allreduce(localMax.data(), globalSize_.data(), 3, MPI_LONG_LONG, MPI_MAX, comm); + + localElements_ = localSize_[0] * localSize_[1] * localSize_[2]; + globalElements_ = globalSize_[0] * globalSize_[1] * globalSize_[2]; + + // Create CUDA stream + checkCudaError(cudaStreamCreate(&stream_), "Failed to create CUDA stream"); + + // cuFFT expects row-major (C-order) dimensions. The Kokkos views are + // LayoutLeft, so dimension 0 is fastest-varying - pass extents reversed. + int n[3] = { + static_cast(localSize_[2]), + static_cast(localSize_[1]), + static_cast(localSize_[0]) + }; + + int inembed[3] = {n[0], n[1], n[2]}; + int onembed[3] = {n[0], n[1], n[2]}; + int istride = 1; + int ostride = 1; + int idist = static_cast(localElements_); + int odist = static_cast(localElements_); + + // Create batched plan + checkCufftResult( + cufftPlanMany(&planBatched_, 3, n, + inembed, istride, idist, + onembed, ostride, odist, + fft_type, maxBatchSize), + "Failed to create batched cuFFT plan"); + + checkCufftResult(cufftSetStream(planBatched_, stream_), + "Failed to set stream on batched plan"); + + // Create single-transform plan if needed + if (maxBatchSize > 1) { + checkCufftResult( + cufftPlanMany(&planSingle_, 3, n, + inembed, istride, idist, + onembed, ostride, odist, + fft_type, 1), + "Failed to create single cuFFT plan"); + + checkCufftResult(cufftSetStream(planSingle_, stream_), + "Failed to set stream on single plan"); + } else { + planSingle_ = planBatched_; + } + } + + ~CuFFTC2C() { + if (planBatched_) cufftDestroy(planBatched_); + if (maxBatchSize_ > 1 && planSingle_) cufftDestroy(planSingle_); + if (stream_) cudaStreamDestroy(stream_); + } + + // Non-copyable + CuFFTC2C(const CuFFTC2C&) = delete; + CuFFTC2C& operator=(const CuFFTC2C&) = delete; + + // Movable + CuFFTC2C(CuFFTC2C&& other) noexcept + : planBatched_(other.planBatched_) + , planSingle_(other.planSingle_) + , stream_(other.stream_) + , comm_(other.comm_) + , maxBatchSize_(other.maxBatchSize_) + , localElements_(other.localElements_) + , globalElements_(other.globalElements_) + , localSize_(other.localSize_) + , globalSize_(other.globalSize_) + , lowerIn_(other.lowerIn_) + , upperIn_(other.upperIn_) + , lowerOut_(other.lowerOut_) + , upperOut_(other.upperOut_) + { + other.planBatched_ = 0; + other.planSingle_ = 0; + other.stream_ = nullptr; + } + + CuFFTC2C& operator=(CuFFTC2C&& other) noexcept { + if (this != &other) { + if (planBatched_) cufftDestroy(planBatched_); + if (maxBatchSize_ > 1 && planSingle_) cufftDestroy(planSingle_); + if (stream_) cudaStreamDestroy(stream_); + + planBatched_ = other.planBatched_; + planSingle_ = other.planSingle_; + stream_ = other.stream_; + comm_ = other.comm_; + maxBatchSize_ = other.maxBatchSize_; + localElements_ = other.localElements_; + globalElements_ = other.globalElements_; + localSize_ = other.localSize_; + globalSize_ = other.globalSize_; + lowerIn_ = other.lowerIn_; + upperIn_ = other.upperIn_; + lowerOut_ = other.lowerOut_; + upperOut_ = other.upperOut_; + + other.planBatched_ = 0; + other.planSingle_ = 0; + other.stream_ = nullptr; + } + return *this; + } + + // Single transform + void forward(complex_t* in, complex_t* out) { + execute(planSingle_, in, out, CUFFT_FORWARD); + applyScaling(out, localElements_, T(1) / static_cast(globalElements_)); + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + void backward(complex_t* in, complex_t* out) { + execute(planSingle_, in, out, CUFFT_INVERSE); + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + // Batched transform + void forward(int batchSize, complex_t* in, complex_t* out) { + if (batchSize > maxBatchSize_) { + throw IpplException("CuFFTC2C", "Batch size exceeds plan capacity"); + } + + if (batchSize == maxBatchSize_) { + execute(planBatched_, in, out, CUFFT_FORWARD); + } else { + // Execute individual transforms for partial batch + for (int b = 0; b < batchSize; ++b) { + execute(planSingle_, + in + b * localElements_, + out + b * localElements_, + CUFFT_FORWARD); + } + } + applyScaling(out, localElements_ * batchSize, T(1) / static_cast(globalElements_)); + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + void backward(int batchSize, complex_t* in, complex_t* out) { + if (batchSize > maxBatchSize_) { + throw IpplException("CuFFTC2C", "Batch size exceeds plan capacity"); + } + + if (batchSize == maxBatchSize_) { + execute(planBatched_, in, out, CUFFT_INVERSE); + } else { + for (int b = 0; b < batchSize; ++b) { + execute(planSingle_, + in + b * localElements_, + out + b * localElements_, + CUFFT_INVERSE); + } + } + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + // Set external stream + void setStream(cudaStream_t stream) { + stream_ = stream; + detail::checkCufftResult(cufftSetStream(planBatched_, stream_), + "Failed to set stream on batched plan"); + if (maxBatchSize_ > 1) { + detail::checkCufftResult(cufftSetStream(planSingle_, stream_), + "Failed to set stream on single plan"); + } + } + + // Accessors + size_t workspace_size() const { return 0; } // cuFFT manages internally + size_t local_size() const { return localElements_; } + size_t global_size() const { return globalElements_; } + size_t size_inbox() const { return localElements_; } + size_t size_outbox() const { return localElements_; } + int max_batch_size() const { return maxBatchSize_; } + cudaStream_t stream() const { return stream_; } + const std::array& local_dims() const { return localSize_; } + const std::array& global_dims() const { return globalSize_; } + + private: + void execute(cufftHandle plan, complex_t* in, complex_t* out, int direction) { + auto* inPtr = reinterpret_cast(in); + auto* outPtr = reinterpret_cast(out); + + if constexpr (std::is_same_v) { + detail::checkCufftResult(cufftExecC2C(plan, inPtr, outPtr, direction), + "cuFFT C2C execution failed"); + } else { + detail::checkCufftResult(cufftExecZ2Z(plan, inPtr, outPtr, direction), + "cuFFT Z2Z execution failed"); + } + } + + void applyScaling(complex_t* data, size_t count, T scale) { + auto* ptr = reinterpret_cast(data); + constexpr size_t blockSize = 256; + size_t numBlocks = (count + blockSize - 1) / blockSize; + detail::cufftScaleKernel<<>>( + ptr, count, static_cast(scale)); + } + + cufftHandle planBatched_ = 0; + cufftHandle planSingle_ = 0; + cudaStream_t stream_ = nullptr; + MPI_Comm comm_; + + int maxBatchSize_; + size_t localElements_; + size_t globalElements_; + + std::array localSize_; + std::array globalSize_; + std::array lowerIn_, upperIn_; + std::array lowerOut_, upperOut_; + }; + +} // namespace fft +} // namespace ippl + +#endif // KOKKOS_ENABLE_CUDA + +#endif // IPPL_FFT_BACKEND_CUFFT_H \ No newline at end of file diff --git a/src/FFT/Backend/CuFFTMp.h b/src/FFT/Backend/CuFFTMp.h new file mode 100644 index 000000000..7c9c385ce --- /dev/null +++ b/src/FFT/Backend/CuFFTMp.h @@ -0,0 +1,569 @@ +#ifndef IPPL_FFT_BACKEND_CUFFTMP_H +#define IPPL_FFT_BACKEND_CUFFTMP_H + +#include +#include +#include +#include + +#include "Utility/IpplException.h" +#include "Utility/ParameterList.h" + +#include "FFT/Traits.h" + +namespace ippl { + namespace fft { + + namespace detail { + inline void checkCufftResult(cufftResult result, const char* context) { + if (result != CUFFT_SUCCESS) { + std::string msg = + std::string(context) + " (error code: " + std::to_string(result) + ")"; + throw IpplException("cuFFTMp", msg.c_str()); + } + } + + inline void checkCudaError(cudaError_t err, const char* context) { + if (err != cudaSuccess) { + std::string msg = std::string(context) + ": " + cudaGetErrorString(err); + throw IpplException("cuFFTMp", msg.c_str()); + } + } + + // Transpose kernel: LayoutLeft -> LayoutRight + // LayoutLeft: src[i + j*n0 + k*n0*n1] (i is fastest) + // LayoutRight: dst[k + j*n2 + i*n1*n2] (k is fastest) + template + __global__ void transposeL2R(T* __restrict__ dst, const T* __restrict__ src, int n0, + int n1, int n2) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int k = blockIdx.z * blockDim.z + threadIdx.z; + + if (i < n0 && j < n1 && k < n2) { + size_t src_idx = i + j * n0 + k * n0 * n1; // LayoutLeft + size_t dst_idx = k + j * n2 + i * n1 * n2; // LayoutRight + dst[dst_idx] = src[src_idx]; + } + } + + // Transpose kernel: LayoutRight -> LayoutLeft + template + __global__ void transposeR2L(T* __restrict__ dst, const T* __restrict__ src, int n0, + int n1, int n2) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int k = blockIdx.z * blockDim.z + threadIdx.z; + + if (i < n0 && j < n1 && k < n2) { + size_t src_idx = k + j * n2 + i * n1 * n2; // LayoutRight + size_t dst_idx = i + j * n0 + k * n0 * n1; // LayoutLeft + dst[dst_idx] = src[src_idx]; + } + } + } // namespace detail + + namespace detail { + template + __global__ void cufftMpScaleKernel(T* data, size_t n, double scale) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + data[idx].x *= scale; + data[idx].y *= scale; + } + } + } // namespace detail + + //============================================================================= + // cuFFTMp C2C Backend + //============================================================================= + + template + class CuFFTMpC2C { + public: + using complex_t = Kokkos::complex; + using cuda_complex_t = + std::conditional_t, cufftComplex, cufftDoubleComplex>; + + static_assert(std::is_same_v || std::is_same_v, + "cuFFTMp only supports float and double precision"); + static_assert(Dim == 3, "cuFFTMp backend currently only supports 3D transforms"); + static_assert(is_available_v, "cuFFTMp not available"); + + CuFFTMpC2C(const heffte::box3d& inbox, + const heffte::box3d& outbox, MPI_Comm comm, + const ParameterList& /*params*/) + : comm_(comm) { + using detail::checkCudaError; + using detail::checkCufftResult; + + checkCudaError(cudaStreamCreate(&stream_), "Failed to create CUDA stream"); + checkCufftResult(cufftCreate(&handle_), "Failed to create cuFFT handle"); + checkCufftResult(cufftSetStream(handle_, stream_), "Failed to set stream"); + + cufftType type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + + for (int d = 0; d < 3; ++d) { + lower_in_[d] = inbox.low[d]; + upper_in_[d] = inbox.high[d] + 1; + lower_out_[d] = outbox.low[d]; + upper_out_[d] = outbox.high[d] + 1; + } + + for (int d = 0; d < 3; ++d) { + local_size_[d] = upper_in_[d] - lower_in_[d]; + } + + // Row-major strides (required by cuFFTMp - must be decreasing) + std::array strides; + strides[0] = local_size_[1] * local_size_[2]; + strides[1] = local_size_[2]; + strides[2] = 1; + + std::array local_max; + for (int d = 0; d < 3; ++d) { + local_max[d] = std::max(upper_in_[d], upper_out_[d]); + } + MPI_Allreduce(local_max.data(), global_size_.data(), 3, MPI_LONG_LONG, MPI_MAX, + comm); + + int n[3] = {static_cast(global_size_[0]), static_cast(global_size_[1]), + static_cast(global_size_[2])}; + + total_elements_ = static_cast(n[0]) * n[1] * n[2]; + local_elements_ = local_size_[0] * local_size_[1] * local_size_[2]; + + checkCufftResult(cufftMpMakePlanDecomposition( + handle_, 3, n, lower_in_.data(), upper_in_.data(), + strides.data(), lower_out_.data(), upper_out_.data(), + strides.data(), type, &comm_, CUFFT_COMM_MPI, &worksize_), + "Failed to create cuFFTMp decomposition plan"); + + checkCufftResult(cufftXtMalloc(handle_, &desc_, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT), + "Failed to allocate descriptor"); + } + + ~CuFFTMpC2C() { + if (desc_) + cufftXtFree(desc_); + if (handle_) + cufftDestroy(handle_); + if (stream_) + cudaStreamDestroy(stream_); + } + + CuFFTMpC2C(const CuFFTMpC2C&) = delete; + CuFFTMpC2C& operator=(const CuFFTMpC2C&) = delete; + + CuFFTMpC2C(CuFFTMpC2C&& other) noexcept + : handle_(other.handle_) + , comm_(other.comm_) + , stream_(other.stream_) + , desc_(other.desc_) + , worksize_(other.worksize_) + , total_elements_(other.total_elements_) + , local_elements_(other.local_elements_) + , global_size_(other.global_size_) + , local_size_(other.local_size_) + , lower_in_(other.lower_in_) + , upper_in_(other.upper_in_) + , lower_out_(other.lower_out_) + , upper_out_(other.upper_out_) { + other.handle_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + + CuFFTMpC2C& operator=(CuFFTMpC2C&& other) noexcept { + if (this != &other) { + if (desc_) + cufftXtFree(desc_); + if (handle_) + cufftDestroy(handle_); + if (stream_) + cudaStreamDestroy(stream_); + + handle_ = other.handle_; + comm_ = other.comm_; + stream_ = other.stream_; + desc_ = other.desc_; + worksize_ = other.worksize_; + total_elements_ = other.total_elements_; + local_elements_ = other.local_elements_; + global_size_ = other.global_size_; + local_size_ = other.local_size_; + lower_in_ = other.lower_in_; + upper_in_ = other.upper_in_; + lower_out_ = other.lower_out_; + upper_out_ = other.upper_out_; + + other.handle_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + return *this; + } + + void forward(complex_t* in, complex_t* out) { + using detail::checkCudaError; + using detail::checkCufftResult; + + cuda_complex_t* desc_data = + static_cast(desc_->descriptor->data[0]); + + // Transpose input: LayoutLeft -> LayoutRight (into descriptor buffer) + launchTransposeL2R(desc_data, reinterpret_cast(in)); + + // Execute forward FFT + checkCufftResult(cufftXtExecDescriptor(handle_, desc_, desc_, CUFFT_FORWARD), + "Forward FFT execution failed"); + + // Transpose output: LayoutRight -> LayoutLeft + launchTransposeR2L(reinterpret_cast(out), desc_data); + + // Apply scaling (1/N) + T scale = T(1) / static_cast(total_elements_); + applyScaling(reinterpret_cast(out), local_elements_, scale); + + checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + void backward(complex_t* in, complex_t* out) { + using detail::checkCudaError; + using detail::checkCufftResult; + + cuda_complex_t* desc_data = + static_cast(desc_->descriptor->data[0]); + + // Transpose input: LayoutLeft -> LayoutRight (into descriptor buffer) + launchTransposeL2R(desc_data, reinterpret_cast(in)); + + // Execute backward FFT + checkCufftResult(cufftXtExecDescriptor(handle_, desc_, desc_, CUFFT_INVERSE), + "Backward FFT execution failed"); + + // Transpose output: LayoutRight -> LayoutLeft + launchTransposeR2L(reinterpret_cast(out), desc_data); + + checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + std::size_t workspace_size() const { return worksize_; } + + private: + void launchTransposeL2R(cuda_complex_t* dst, const cuda_complex_t* src) { + dim3 block(8, 8, 8); + dim3 grid((local_size_[0] + block.x - 1) / block.x, + (local_size_[1] + block.y - 1) / block.y, + (local_size_[2] + block.z - 1) / block.z); + detail::transposeL2R<<>>( + dst, src, static_cast(local_size_[0]), static_cast(local_size_[1]), + static_cast(local_size_[2])); + } + + void launchTransposeR2L(cuda_complex_t* dst, const cuda_complex_t* src) { + dim3 block(8, 8, 8); + dim3 grid((local_size_[0] + block.x - 1) / block.x, + (local_size_[1] + block.y - 1) / block.y, + (local_size_[2] + block.z - 1) / block.z); + detail::transposeR2L<<>>( + dst, src, static_cast(local_size_[0]), static_cast(local_size_[1]), + static_cast(local_size_[2])); + } + + void applyScaling(cuda_complex_t* data, size_t count, T scale) { + constexpr size_t blockSize = 256; + size_t numBlocks = (count + blockSize - 1) / blockSize; + detail::cufftMpScaleKernel<<>>( + data, count, static_cast(scale)); + } + + cufftHandle handle_ = 0; + MPI_Comm comm_; + cudaStream_t stream_ = nullptr; + cudaLibXtDesc* desc_ = nullptr; + + size_t worksize_ = 0; + size_t total_elements_ = 0; + size_t local_elements_ = 0; + std::array global_size_; + std::array local_size_; + std::array lower_in_, upper_in_; + std::array lower_out_, upper_out_; + }; + + //============================================================================= + // cuFFTMp R2C Backend + //============================================================================= + + template + class CuFFTMpR2C { + public: + using complex_t = Kokkos::complex; + using cuda_complex_t = + std::conditional_t, cufftComplex, cufftDoubleComplex>; + + static_assert(std::is_same_v || std::is_same_v, + "cuFFTMp only supports float and double precision"); + static_assert(Dim == 3, "cuFFTMp backend currently only supports 3D transforms"); + static_assert(is_available_v, "cuFFTMp not available"); + + CuFFTMpR2C(const heffte::box3d& inbox, + const heffte::box3d& outbox, int /*r2c_direction*/, + MPI_Comm comm, const ParameterList& /*params*/) + : comm_(comm) { + using detail::checkCudaError; + using detail::checkCufftResult; + + checkCudaError(cudaStreamCreate(&stream_), "Failed to create CUDA stream"); + checkCufftResult(cufftCreate(&handle_r2c_), "Failed to create R2C handle"); + checkCufftResult(cufftCreate(&handle_c2r_), "Failed to create C2R handle"); + checkCufftResult(cufftSetStream(handle_r2c_, stream_), "Failed to set stream"); + checkCufftResult(cufftSetStream(handle_c2r_, stream_), "Failed to set stream"); + + std::array lower_real, upper_real; + std::array lower_complex, upper_complex; + + for (int d = 0; d < 3; ++d) { + lower_real[d] = inbox.low[d]; + upper_real[d] = inbox.high[d] + 1; + lower_complex[d] = outbox.low[d]; + upper_complex[d] = outbox.high[d] + 1; + } + + for (int d = 0; d < 3; ++d) { + local_real_size_[d] = upper_real[d] - lower_real[d]; + local_complex_size_[d] = upper_complex[d] - lower_complex[d]; + } + + // Row-major strides for real data + std::array strides_real; + strides_real[0] = local_real_size_[1] * local_real_size_[2]; + strides_real[1] = local_real_size_[2]; + strides_real[2] = 1; + + // Row-major strides for complex data + std::array strides_complex; + strides_complex[0] = local_complex_size_[1] * local_complex_size_[2]; + strides_complex[1] = local_complex_size_[2]; + strides_complex[2] = 1; + + std::array local_max; + for (int d = 0; d < 3; ++d) { + local_max[d] = std::max(upper_real[d], upper_complex[d]); + } + MPI_Allreduce(local_max.data(), global_size_.data(), 3, MPI_LONG_LONG, MPI_MAX, + comm); + + int n[3] = {static_cast(global_size_[0]), static_cast(global_size_[1]), + static_cast(global_size_[2])}; + + total_elements_ = static_cast(n[0]) * n[1] * n[2]; + local_real_elements_ = + local_real_size_[0] * local_real_size_[1] * local_real_size_[2]; + local_complex_elements_ = + local_complex_size_[0] * local_complex_size_[1] * local_complex_size_[2]; + + size_t worksize_r2c = 0; + size_t worksize_c2r = 0; + checkCufftResult( + cufftMpMakePlanDecomposition( + handle_r2c_, 3, n, lower_real.data(), upper_real.data(), + strides_real.data(), lower_complex.data(), upper_complex.data(), + strides_complex.data(), CUFFT_R2C, &comm_, CUFFT_COMM_MPI, &worksize_r2c), + "Failed to create R2C plan"); + + checkCufftResult( + cufftMpMakePlanDecomposition( + handle_c2r_, 3, n, lower_real.data(), upper_real.data(), + strides_real.data(), lower_complex.data(), upper_complex.data(), + strides_complex.data(), CUFFT_C2R, &comm_, CUFFT_COMM_MPI, &worksize_c2r), + "Failed to create C2R plan"); + + worksize_ = std::max(worksize_r2c, worksize_c2r); + + checkCufftResult( + cufftXtMalloc(handle_r2c_, &desc_, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT), + "Failed to allocate R2C descriptor"); + } + + ~CuFFTMpR2C() { + if (desc_) + cufftXtFree(desc_); + if (handle_r2c_) + cufftDestroy(handle_r2c_); + if (handle_c2r_) + cufftDestroy(handle_c2r_); + if (stream_) + cudaStreamDestroy(stream_); + } + + CuFFTMpR2C(const CuFFTMpR2C&) = delete; + CuFFTMpR2C& operator=(const CuFFTMpR2C&) = delete; + + CuFFTMpR2C(CuFFTMpR2C&& other) noexcept + : handle_r2c_(other.handle_r2c_) + , handle_c2r_(other.handle_c2r_) + , comm_(other.comm_) + , stream_(other.stream_) + , desc_(other.desc_) + , worksize_(other.worksize_) + , total_elements_(other.total_elements_) + , local_real_elements_(other.local_real_elements_) + , local_complex_elements_(other.local_complex_elements_) + , global_size_(other.global_size_) + , local_real_size_(other.local_real_size_) + , local_complex_size_(other.local_complex_size_) { + other.handle_r2c_ = 0; + other.handle_c2r_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + + CuFFTMpR2C& operator=(CuFFTMpR2C&& other) noexcept { + if (this != &other) { + if (desc_) + cufftXtFree(desc_); + if (handle_r2c_) + cufftDestroy(handle_r2c_); + if (handle_c2r_) + cufftDestroy(handle_c2r_); + if (stream_) + cudaStreamDestroy(stream_); + + handle_r2c_ = other.handle_r2c_; + handle_c2r_ = other.handle_c2r_; + comm_ = other.comm_; + stream_ = other.stream_; + desc_ = other.desc_; + worksize_ = other.worksize_; + total_elements_ = other.total_elements_; + local_real_elements_ = other.local_real_elements_; + local_complex_elements_ = other.local_complex_elements_; + global_size_ = other.global_size_; + local_real_size_ = other.local_real_size_; + local_complex_size_ = other.local_complex_size_; + + other.handle_r2c_ = 0; + other.handle_c2r_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + return *this; + } + + void forward(T* in, complex_t* out) { + using detail::checkCudaError; + using detail::checkCufftResult; + + T* desc_data = static_cast(desc_->descriptor->data[0]); + + // Transpose real input: LayoutLeft -> LayoutRight + launchTransposeRealL2R(desc_data, in); + + checkCufftResult(cufftXtExecDescriptor(handle_r2c_, desc_, desc_, CUFFT_FORWARD), + "R2C execution failed"); + + // Transpose complex output: LayoutRight -> LayoutLeft + cuda_complex_t* complex_desc = + static_cast(desc_->descriptor->data[0]); + launchTransposeComplexR2L(reinterpret_cast(out), complex_desc); + + T scale = T(1) / static_cast(total_elements_); + applyScaling(reinterpret_cast(out), local_complex_elements_, + scale); + + checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + void backward(complex_t* in, T* out) { + using detail::checkCudaError; + using detail::checkCufftResult; + + cuda_complex_t* complex_desc = + static_cast(desc_->descriptor->data[0]); + + // Transpose complex input: LayoutLeft -> LayoutRight + launchTransposeComplexL2R(complex_desc, reinterpret_cast(in)); + + checkCufftResult(cufftXtExecDescriptor(handle_c2r_, desc_, desc_, CUFFT_INVERSE), + "C2R execution failed"); + + // Transpose real output: LayoutRight -> LayoutLeft + T* real_desc = static_cast(desc_->descriptor->data[0]); + launchTransposeRealR2L(out, real_desc); + + checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + std::size_t workspace_size() const { return worksize_; } + + private: + void launchTransposeRealL2R(T* dst, const T* src) { + dim3 block(8, 8, 8); + dim3 grid((local_real_size_[0] + block.x - 1) / block.x, + (local_real_size_[1] + block.y - 1) / block.y, + (local_real_size_[2] + block.z - 1) / block.z); + detail::transposeL2R<<>>( + dst, src, static_cast(local_real_size_[0]), + static_cast(local_real_size_[1]), static_cast(local_real_size_[2])); + } + + void launchTransposeRealR2L(T* dst, const T* src) { + dim3 block(8, 8, 8); + dim3 grid((local_real_size_[0] + block.x - 1) / block.x, + (local_real_size_[1] + block.y - 1) / block.y, + (local_real_size_[2] + block.z - 1) / block.z); + detail::transposeR2L<<>>( + dst, src, static_cast(local_real_size_[0]), + static_cast(local_real_size_[1]), static_cast(local_real_size_[2])); + } + + void launchTransposeComplexL2R(cuda_complex_t* dst, const cuda_complex_t* src) { + dim3 block(8, 8, 8); + dim3 grid((local_complex_size_[0] + block.x - 1) / block.x, + (local_complex_size_[1] + block.y - 1) / block.y, + (local_complex_size_[2] + block.z - 1) / block.z); + detail::transposeL2R<<>>( + dst, src, static_cast(local_complex_size_[0]), + static_cast(local_complex_size_[1]), + static_cast(local_complex_size_[2])); + } + + void launchTransposeComplexR2L(cuda_complex_t* dst, const cuda_complex_t* src) { + dim3 block(8, 8, 8); + dim3 grid((local_complex_size_[0] + block.x - 1) / block.x, + (local_complex_size_[1] + block.y - 1) / block.y, + (local_complex_size_[2] + block.z - 1) / block.z); + detail::transposeR2L<<>>( + dst, src, static_cast(local_complex_size_[0]), + static_cast(local_complex_size_[1]), + static_cast(local_complex_size_[2])); + } + + void applyScaling(cuda_complex_t* data, size_t count, T scale) { + constexpr size_t blockSize = 256; + size_t numBlocks = (count + blockSize - 1) / blockSize; + detail::cufftMpScaleKernel<<>>( + data, count, static_cast(scale)); + } + + cufftHandle handle_r2c_ = 0; + cufftHandle handle_c2r_ = 0; + MPI_Comm comm_; + cudaStream_t stream_ = nullptr; + cudaLibXtDesc* desc_ = nullptr; + + size_t worksize_ = 0; + size_t total_elements_ = 0; + size_t local_real_elements_ = 0; + size_t local_complex_elements_ = 0; + std::array global_size_; + std::array local_real_size_; + std::array local_complex_size_; + }; + + } // namespace fft +} // namespace ippl + +#endif \ No newline at end of file diff --git a/src/FFT/Backend/Heffte.h b/src/FFT/Backend/Heffte.h new file mode 100644 index 000000000..2d413fc4b --- /dev/null +++ b/src/FFT/Backend/Heffte.h @@ -0,0 +1,247 @@ +#ifndef IPPL_FFT_BACKEND_HEFFTE_H +#define IPPL_FFT_BACKEND_HEFFTE_H + +#include +#include +#include + +#include "Utility/ParameterList.h" + +#include "Field/BareField.h" + +#include "FFT/Traits.h" +#include "FieldLayout/FieldLayout.h" + +namespace ippl { + namespace fft { + + template + inline void applyScale(Kokkos::complex* data, T scale, size_t size) { + Kokkos::View*, MemSpace> view(data, size); + Kokkos::parallel_for( + "Heffte_scale_complex", + Kokkos::RangePolicy(0, size), + KOKKOS_LAMBDA(const size_t i) { view(i) *= scale; }); + Kokkos::fence(); + } + + template + inline void applyScaleReal(T* data, T scale, size_t size) { + Kokkos::View view(data, size); + Kokkos::parallel_for( + "Heffte_scale_real", + Kokkos::RangePolicy(0, size), + KOKKOS_LAMBDA(const size_t i) { view(i) *= scale; }); + Kokkos::fence(); + } + + // Helper to compute global FFT size from box coordinates + inline size_t computeGlobalSize(const heffte::box3d& inbox, + const heffte::box3d& outbox, MPI_Comm comm) { + long long local_max[3], global_max[3]; + local_max[0] = std::max(inbox.high[0], outbox.high[0]) + 1; + local_max[1] = std::max(inbox.high[1], outbox.high[1]) + 1; + local_max[2] = std::max(inbox.high[2], outbox.high[2]) + 1; + + MPI_Allreduce(local_max, global_max, 3, MPI_LONG_LONG, MPI_MAX, comm); + + return static_cast(global_max[0]) * static_cast(global_max[1]) + * static_cast(global_max[2]); + } + + template + heffte::plan_options makeHeffteOptions(const ParameterList& params) { + auto opts = heffte::default_options(); + + if (!params.get("use_heffte_defaults")) { + opts.use_pencils = params.get("use_pencils"); + opts.use_reorder = params.get("use_reorder"); + + if constexpr (is_available_v) { + opts.use_gpu_aware = params.get("use_gpu_aware"); + } + + switch (params.get("comm")) { + case a2a: + opts.algorithm = heffte::reshape_algorithm::alltoall; + break; + case a2av: + opts.algorithm = heffte::reshape_algorithm::alltoallv; + break; + case p2p: + opts.algorithm = heffte::reshape_algorithm::p2p; + break; + case p2p_pl: + opts.algorithm = heffte::reshape_algorithm::p2p_plined; + break; + default: + throw IpplException("FFT", "Unknown communication type"); + } + } else { + opts.use_gpu_aware = true; + opts.algorithm = heffte::reshape_algorithm::p2p_plined; + } + return opts; + } + + //============================================================================= + // heFFTe C2C + //============================================================================= + template + class HeffteC2C { + public: + using complex_t = Kokkos::complex; + using backend_t = typename HeffteBackend::c2c; + using heffte_t = heffte::fft3d; + using workspace_t = typename heffte_t::template buffer_container; + + // Standard constructor + HeffteC2C(const heffte::box3d& inbox, const heffte::box3d& outbox, + MPI_Comm comm, const ParameterList& params, int maxBatchSize = 1) + : maxBatchSize_(maxBatchSize) { + static_assert(Dim == 3, "HeFFTe wrapper only supports 3D"); + + auto opts = makeHeffteOptions(params); + heffte_ = std::make_shared(inbox, outbox, comm, opts); + + // Allocate workspace for maximum batch size + workspace_ = workspace_t(heffte_->size_workspace() * maxBatchSize); + + localSize_ = heffte_->size_outbox(); + globalSize_ = computeGlobalSize(inbox, outbox, comm); + } + + // Single transform + void forward(complex_t* in, complex_t* out) { + heffte_->forward(in, out, workspace_.data(), heffte::scale::full); + } + + void backward(complex_t* in, complex_t* out) { + heffte_->backward(in, out, workspace_.data(), heffte::scale::none); + } + + // Batched transform + void forward(int batchSize, complex_t* in, complex_t* out) { + assert(batchSize <= maxBatchSize_ && "Batch size exceeds allocated workspace"); + heffte_->forward(batchSize, in, out, workspace_.data(), heffte::scale::full); + } + + void backward(int batchSize, complex_t* in, complex_t* out) { + assert(batchSize <= maxBatchSize_ && "Batch size exceeds allocated workspace"); + heffte_->backward(batchSize, in, out, workspace_.data(), heffte::scale::none); + } + + // Accessors + size_t workspace_size() const { return heffte_->size_workspace(); } + size_t local_size() const { return localSize_; } + size_t global_size() const { return globalSize_; } + size_t size_inbox() const { return heffte_->size_inbox(); } + size_t size_outbox() const { return heffte_->size_outbox(); } + int max_batch_size() const { return maxBatchSize_; } + + private: + std::shared_ptr heffte_; + workspace_t workspace_; + size_t localSize_; + size_t globalSize_; + int maxBatchSize_; + }; + + //============================================================================= + // heFFTe R2C + //============================================================================= + + template + class HeffteR2C { + public: + using complex_t = Kokkos::complex; + using backend_t = typename HeffteBackend::c2c; + using heffte_t = heffte::fft3d_r2c; + using workspace_t = typename heffte_t::template buffer_container; + + HeffteR2C(const heffte::box3d& inbox, const heffte::box3d& outbox, + int r2c_direction, MPI_Comm comm, const ParameterList& params) { + auto opts = makeHeffteOptions(params); + heffte_ = std::make_shared(inbox, outbox, r2c_direction, comm, opts); + workspace_ = workspace_t(heffte_->size_workspace()); + + local_complex_size_ = heffte_->size_outbox(); + + // For R2C, normalize by the global REAL size + // inbox is the real box + long long local_max[3], global_max[3]; + local_max[0] = inbox.high[0] + 1; + local_max[1] = inbox.high[1] + 1; + local_max[2] = inbox.high[2] + 1; + + MPI_Allreduce(local_max, global_max, 3, MPI_LONG_LONG, MPI_MAX, comm); + + global_real_size_ = static_cast(global_max[0]) + * static_cast(global_max[1]) + * static_cast(global_max[2]); + } + + void forward(T* in, complex_t* out) { + heffte_->forward(in, out, workspace_.data(), heffte::scale::full); + } + + void backward(complex_t* in, T* out) { + heffte_->backward(in, out, workspace_.data(), heffte::scale::none); + } + + private: + std::shared_ptr heffte_; + workspace_t workspace_; + size_t local_complex_size_; + size_t global_real_size_; + }; + + //============================================================================= + // heFFTe Trigonometric (Sine, Cos, Cos1) + //============================================================================= + + template + class HeffteTrig; + +#define IPPL_FFT_DEFINE_HEFFTE_TRIG(TagType, member) \ + template \ + class HeffteTrig { \ + public: \ + using backend_t = typename HeffteBackend::member; \ + using heffte_t = heffte::fft3d; \ + using workspace_t = typename heffte_t::template buffer_container; \ + \ + HeffteTrig(const heffte::box3d& inbox, const heffte::box3d& outbox, \ + MPI_Comm comm, const ParameterList& params) { \ + auto opts = makeHeffteOptions(params); \ + heffte_ = std::make_shared(inbox, outbox, comm, opts); \ + workspace_ = workspace_t(heffte_->size_workspace()); \ + local_size_ = heffte_->size_outbox(); \ + global_size_ = computeGlobalSize(inbox, outbox, comm); \ + } \ + \ + void forward(T* in, T* out) { \ + heffte_->forward(in, out, workspace_.data(), heffte::scale::full); \ + } \ + \ + void backward(T* in, T* out) { \ + heffte_->backward(in, out, workspace_.data(), heffte::scale::none); \ + } \ + \ + private: \ + std::shared_ptr heffte_; \ + workspace_t workspace_; \ + size_t local_size_; \ + size_t global_size_; \ + }; + + IPPL_FFT_DEFINE_HEFFTE_TRIG(SineTransform, sin) + IPPL_FFT_DEFINE_HEFFTE_TRIG(CosTransform, cos) + IPPL_FFT_DEFINE_HEFFTE_TRIG(Cos1Transform, cos1) + +#undef IPPL_FFT_DEFINE_HEFFTE_TRIG + + } // namespace fft +} // namespace ippl + +#endif \ No newline at end of file diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index c9f04c365..aac13eebc 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -1,346 +1,8 @@ -// -// Class FFT -// The FFT class performs complex-to-complex, -// real-to-complex on IPPL Fields. -// FFT is templated on the type of transform to be performed, -// the dimensionality of the Field to transform, and the -// floating-point precision type of the Field (float or double). -// Currently, we use heffte for taking the transforms and the class FFT -// serves as an interface between IPPL and heffte. In making this interface, -// we have referred Cabana library -// https://github.com/ECP-copa/Cabana. -// -// - #ifndef IPPL_FFT_FFT_H #define IPPL_FFT_FFT_H -#include -#include -#include -#include -#include -#include - -#include "Utility/IpplException.h" -#include "Utility/ParameterList.h" - -#include "Field/Field.h" - -#include "FieldLayout/FieldLayout.h" -#include "Index/NDIndex.h" - -namespace heffte { - template <> - struct is_ccomplex> : std::true_type {}; - - template <> - struct is_zcomplex> : std::true_type {}; -} // namespace heffte - -namespace ippl { - - /** - Tag classes for Fourier transforms - */ - class CCTransform {}; - class RCTransform {}; - class SineTransform {}; - class CosTransform {}; - /** - Tag classes for Cosine of type 1 transforms - */ - class Cos1Transform {}; - - enum FFTComm { - a2av = 0, - a2a = 1, - p2p = 2, - p2p_pl = 3 - }; - - enum TransformDirection { - FORWARD, - BACKWARD - }; - - namespace detail { - /*! - * Wrapper type for heFFTe backends, templated - * on the Kokkos memory space - */ - template - struct HeffteBackendType; - -#if defined(Heffte_ENABLE_FFTW) - template <> - struct HeffteBackendType { - using backend = heffte::backend::fftw; - using backendSine = heffte::backend::fftw_sin; - using backendCos = heffte::backend::fftw_cos; - using backendCos1 = heffte::backend::fftw_cos1; - }; -#elif defined(Heffte_ENABLE_MKL) - template <> - struct HeffteBackendType { - using backend = heffte::backend::mkl; - using backendSine = heffte::backend::mkl_sin; - using backendCos = heffte::backend::mkl_cos; - }; -#endif - -#ifdef Heffte_ENABLE_CUDA -#ifdef KOKKOS_ENABLE_CUDA - template <> - struct HeffteBackendType { - using backend = heffte::backend::cufft; - using backendSine = heffte::backend::cufft_sin; - using backendCos = heffte::backend::cufft_cos; - using backendCos1 = heffte::backend::cufft_cos1; - }; -#else -#error cuFFT backend is enabled for heFFTe but CUDA is not enabled for Kokkos! -#endif -#endif - -#ifdef KOKKOS_ENABLE_HIP -#ifdef Heffte_ENABLE_ROCM - template <> - struct HeffteBackendType { - using backend = heffte::backend::rocfft; - using backendSine = heffte::backend::rocfft_sin; - using backendCos = heffte::backend::rocfft_cos; - using backendCos1 = heffte::backend::rocfft_cos1; - }; -#else - template <> - struct HeffteBackendType { - using backend = heffte::backend::stock; - using backendSine = heffte::backend::stock_sin; - using backendCos = heffte::backend::stock_cos; - using backendCos1 = heffte::backend::stock_cos1; - }; -#endif -#endif - -#if !defined(Heffte_ENABLE_MKL) && !defined(Heffte_ENABLE_FFTW) - /** - * Use heFFTe's inbuilt 1D fft computation on CPUs if no - * vendor specific or optimized backend is found - */ - template <> - struct HeffteBackendType { - using backend = heffte::backend::stock; - using backendSine = heffte::backend::stock_sin; - using backendCos = heffte::backend::stock_cos; - using backendCos1 = heffte::backend::stock_cos1; - }; -#endif - - } // namespace detail - - template class FFT, typename Backend, - typename BufferType = typename Field::value_type> - class FFTBase { - constexpr static unsigned Dim = Field::dim; - - public: - using heffteBackend = Backend; - using workspace_t = typename FFT::template buffer_container; - using Layout_t = FieldLayout; - - FFTBase(const Layout_t& layout, const ParameterList& params); - ~FFTBase() = default; - - protected: - FFTBase() = default; - - void domainToBounds(const NDIndex& domain, std::array& low, - std::array& high); - void setup(const heffte::box3d& inbox, const heffte::box3d& outbox, - const ParameterList& params); - - std::shared_ptr> heffte_m; - workspace_t workspace_m; - - template - using temp_view_type = - typename Kokkos::View::uniform_type; - temp_view_type tempField; - }; - -#define IN_PLACE_FFT_BASE_CLASS(Field, Backend) \ - FFTBase::Backend> -#define EXT_FFT_BASE_CLASS(Field, Backend, Type) \ - FFTBase::Backend, \ - typename Type> - - /** - Non-specialized FFT class. We specialize based on Transform tag class - */ - template - class FFT {}; - - /** - complex-to-complex FFT class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(ComplexField, backend) { - constexpr static unsigned Dim = ComplexField::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(ComplexField, backend); - - public: - using Complex_t = typename ComplexField::value_type; - - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(ComplexField& f); - - /*! - * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, ComplexField& f); - }; - - /** - real-to-complex FFT class - */ - template - class FFT - : public EXT_FFT_BASE_CLASS(RealField, backend, - Kokkos::complex) { - constexpr static unsigned Dim = RealField::dim; - using Real_t = typename RealField::value_type; - using Base = EXT_FFT_BASE_CLASS(RealField, backend, - Kokkos::complex); - - public: - using Complex_t = Kokkos::complex; - using ComplexField = typename Field::uniform_type; - - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /** Create a new FFT object with the layout for the input and output Fields - * and parameters for heffte. - */ - FFT(const Layout_t& layoutInput, const Layout_t& layoutOutput, const ParameterList& params); - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute - * @param g Field in which to store the transformation - */ - void warmup(RealField& f, ComplexField& g); - - /*! - * Perform FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute - * @param g Field in which to store the transformation - */ - void transform(TransformDirection direction, RealField& f, ComplexField& g); - - private: - typename Base::template temp_view_type tempFieldComplex; - }; - - /** - Sine transform class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(Field, backendSine) { - constexpr static unsigned Dim = Field::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(Field, backendSine); - - public: - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(Field& f); - - /*! - * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, Field& f); - }; - /** - Cosine transform class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(Field, backendCos) { - constexpr static unsigned Dim = Field::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(Field, backendCos); - - public: - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(Field& f); - - /*! - * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, Field& f); - }; - /** - Cosine type 1 transform class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(Field, backendCos1) { - constexpr static unsigned Dim = Field::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(Field, backendCos1); - - public: - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(Field& f); - - /*! - * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, Field& f); - }; -} // namespace ippl - -#include "FFT/FFT.hpp" - -#endif // IPPL_FFT_FFT_H +#include "Traits.h" +#include "Backend/Backend.h" +#include "Transform/Transform.h" -// vi: set et ts=4 sw=4 sts=4: -// Local Variables: -// mode:c -// c-basic-offset: 4 -// indent-tabs-mode: nil -// require-final-newline: nil -// End: +#endif \ No newline at end of file diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp deleted file mode 100644 index 086f6d732..000000000 --- a/src/FFT/FFT.hpp +++ /dev/null @@ -1,460 +0,0 @@ -// -// Class FFT -// The FFT class performs complex-to-complex, -// real-to-complex on IPPL Fields. -// FFT is templated on the type of transform to be performed, -// the dimensionality of the Field to transform, and the -// floating-point precision type of the Field (float or double). -// Currently, we use heffte for taking the transforms and the class FFT -// serves as an interface between IPPL and heffte. In making this interface, -// we have referred Cabana library. -// https://github.com/ECP-copa/Cabana. -// -// Copyright (c) 2021, Sriramkrishnan Muralikrishnan, -// Paul Scherrer Institut, Villigen PSI, Switzerland -// All rights reserved -// -// This file is part of IPPL. -// -/** - Implementations for FFT constructor/destructor and transforms -*/ - -#include "Utility/IpplTimings.h" - -#include "Field/BareField.h" - -#include "FieldLayout/FieldLayout.h" - -namespace ippl { - - template class FFT, typename Backend, typename T> - FFTBase::FFTBase(const Layout_t& layout, const ParameterList& params) { - std::array low; - std::array high; - - const NDIndex lDom = layout.getLocalNDIndex(); - domainToBounds(lDom, low, high); - - heffte::box3d inbox = {low, high}; - heffte::box3d outbox = {low, high}; - - setup(inbox, outbox, params); - } - - template class FFT, typename Backend, typename T> - void FFTBase::domainToBounds(const NDIndex& domain, - std::array& low, - std::array& high) { - low.fill(0); - high.fill(0); - - /** - * Static cast to detail::long long (uint64_t) is necessary, as heffte::box3d requires it - * like that. - */ - for (size_t d = 0; d < Dim; ++d) { - low[d] = static_cast(domain[d].first()); - high[d] = static_cast(domain[d].length() + domain[d].first() - 1); - } - } - - /** - setup performs the initialization necessary. - */ - template class FFT, typename Backend, typename T> - void FFTBase::setup(const heffte::box3d& inbox, - const heffte::box3d& outbox, - const ParameterList& params) { - heffte::plan_options heffteOptions = heffte::default_options(); - - if (!params.get("use_heffte_defaults")) { - heffteOptions.use_pencils = params.get("use_pencils"); - heffteOptions.use_reorder = params.get("use_reorder"); -#ifdef Heffte_ENABLE_GPU - heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); -#endif - - switch (params.get("comm")) { - case a2a: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; - break; - case a2av: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; - break; - case p2p: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p; - break; - case p2p_pl: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; - break; - default: - throw IpplException("FFT::setup", "Unrecognized heffte communication type"); - } - } - - if constexpr (std::is_same_v, heffte::fft3d>) { - heffte_m = std::make_shared>( - inbox, outbox, Comm->getCommunicator(), heffteOptions); - } else { - heffte_m = std::make_shared>( - inbox, outbox, params.get("r2c_direction"), Comm->getCommunicator(), - heffteOptions); - } - - // heffte::gpu::device_set(Comm->rank() % heffte::gpu::device_count()); - if (workspace_m.size() < heffte_m->size_workspace()) { - workspace_m = workspace_t(heffte_m->size_workspace()); - } - } - - template - void FFT::warmup(ComplexField& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, ComplexField& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) even though this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempField, args - nghost).real(apply(fview, args).real()); - apply(tempField, args - nghost).imag(apply(fview, args).imag()); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args).real() = apply(tempField, args - nghost).real(); - apply(fview, args).imag() = apply(tempField, args - nghost).imag(); - }); - } - - //======================================================================== - // FFT RCTransform Constructors - //======================================================================== - - /** - *Create a new FFT object of type RCTransform, with given input and output - *layouts and heffte parameters. - */ - - template - FFT::FFT(const Layout_t& layoutInput, const Layout_t& layoutOutput, - const ParameterList& params) { - /** - * Heffte requires to pass a 3D array even for 2D and - * 1D FFTs we just have to make the length in other - * dimensions to be 1. - */ - std::array lowInput; - std::array highInput; - std::array lowOutput; - std::array highOutput; - - const NDIndex& lDomInput = layoutInput.getLocalNDIndex(); - const NDIndex& lDomOutput = layoutOutput.getLocalNDIndex(); - - this->domainToBounds(lDomInput, lowInput, highInput); - this->domainToBounds(lDomOutput, lowOutput, highOutput); - - heffte::box3d inbox = {lowInput, highInput}; - heffte::box3d outbox = {lowOutput, highOutput}; - - this->setup(inbox, outbox, params); - } - - template - void FFT::warmup(RealField& f, ComplexField& g) { - this->transform(FORWARD, f, g); - this->transform(BACKWARD, f, g); - } - - template - void FFT::transform(TransformDirection direction, RealField& f, - ComplexField& g) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); - - auto fview = f.getView(); - auto gview = g.getView(); - const int nghostf = f.getNghost(); - const int nghostg = g.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempFieldf = this->tempField; - auto& tempFieldg = this->tempFieldComplex; - if (tempFieldf.size() != f.getOwned().size()) { - tempFieldf = detail::shrinkView("tempFieldf", fview, nghostf); - } - if (tempFieldg.size() != g.getOwned().size()) { - tempFieldg = detail::shrinkView("tempFieldg", gview, nghostg); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos f field in FFT", getRangePolicy(fview, nghostf), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempFieldf, args - nghostf) = apply(fview, args); - }); - ippl::parallel_for( - "copy from Kokkos g field in FFT", getRangePolicy(gview, nghostg), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempFieldg, args - nghostg).real(apply(gview, args).real()); - apply(tempFieldg, args - nghostg).imag(apply(gview, args).imag()); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempFieldf.data(), tempFieldg.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempFieldg.data(), tempFieldf.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos f field FFT", getRangePolicy(fview, nghostf), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempFieldf, args - nghostf); - }); - - ippl::parallel_for( - "copy to Kokkos g field FFT", getRangePolicy(gview, nghostg), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(gview, args).real() = apply(tempFieldg, args - nghostg).real(); - apply(gview, args).imag() = apply(tempFieldg, args - nghostg).imag(); - }); - } - - template - void FFT::warmup(Field& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, Field& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); -#ifdef Heffte_ENABLE_FFTW - if (direction == FORWARD) { - f = f / 8.0; - } -#endif - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempField, args - nghost) = apply(fview, args); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempField, args - nghost); - }); -#ifdef Heffte_ENABLE_FFTW - if (direction == BACKWARD) { - f = f * 8.0; - } -#endif - } - - template - void FFT::warmup(Field& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, Field& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); -#ifdef Heffte_ENABLE_FFTW - if (direction == FORWARD) { - f = f / 8.0; - } -#endif - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempField, args - nghost) = apply(fview, args); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempField, args - nghost); - }); -#ifdef Heffte_ENABLE_FFTW - if (direction == BACKWARD) { - f = f * 8.0; - } -#endif - } - - template - void FFT::warmup(Field& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, Field& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); - -/** - * This rescaling is needed to match the normalization constant - * between fftw and the other gpu interfaces. fftw rescales with an extra factor of 8. - */ -#ifdef Heffte_ENABLE_FFTW - if (direction == FORWARD) { - f = f / 8.0; - } -#endif - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempField, args - nghost) = apply(fview, args); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempField, args - nghost); - }); - -/** - * This rescaling is needed to match the normalization constant - * between fftw and the other gpu interfaces. fftw rescales with an extra factor of 8. - */ -#ifdef Heffte_ENABLE_FFTW - if (direction == BACKWARD) { - f = f * 8.0; - } -#endif - } - -} // namespace ippl - -// vi: set et ts=4 sw=4 sts=4: -// Local Variables: -// mode:c -// c-basic-offset: 4 -// indent-tabs-mode: nil -// require-final-newline: nil -// End: diff --git a/src/FFT/NUFFT/Correction.h b/src/FFT/NUFFT/Correction.h new file mode 100644 index 000000000..f4830c6a7 --- /dev/null +++ b/src/FFT/NUFFT/Correction.h @@ -0,0 +1,388 @@ +#ifndef IPPL_NUFFT_CORRECTION_H +#define IPPL_NUFFT_CORRECTION_H + +#include + +#include +#include + +#include "Types/ViewTypes.h" + +#include "FFT/NUFFT/ESKernel.h" +#include "FFT/NUFFT/Quadrature.h" +#include "FFT/NUFFT/NUFFTUtilities.h" + +namespace ippl { + namespace nufft { + + // ==================================================================== + // Cell-centered phase convention + // ==================================================================== + // + // Scatter/gather use cell-centered DOFs at x_j = (j + 1/2) * h. + // The DFT of the scattered field is: + // + // G_hat_k = phi_hat_k * exp(+i pi k / N) * conj(f_k) + // + // where the exp(+i pi k / N) arises because the kernel is evaluated + // at (x_p / h - j - 1/2) instead of (x_p / h - j). + // + // Type 1: f_k = conj(G_hat_k * factor) + // requires factor = deconv * exp(-i pi k / N) [negative phase] + // + // Type 2: G_hat_k = N * f_k * conj(factor) + // requires conj(factor) = deconv * exp(+i pi k / N) [positive phase] + // + // The factors array always stores the Type-1 factor (negative phase). + // Type-2 functions apply conj(factor) explicitly. + // ==================================================================== + + /** + * @brief Computes deconvolution factors for NUFFT. + */ + template + void compute_deconvolution_factors( + Kokkos::View*, typename ExecSpace::memory_space> factors, + int64_t n_modes, int64_t n_grid, const ESKernel& kernel) { + using complex_type = Kokkos::complex; + + // Set up quadrature + constexpr int q = 100; + auto nodes = Kokkos::View("nodes", q); + auto weights = Kokkos::View("weights", q); + + gauss_legendre(q, nodes, weights); + + const RealType alpha = Kokkos::numbers::pi_v * kernel.width() / n_grid; + const RealType beta = kernel.beta(); + + const int64_t l_n_modes = n_modes; + constexpr int l_q = q; + + Kokkos::parallel_for( + "compute_deconv_factors", Kokkos::RangePolicy(0, n_modes), + KOKKOS_LAMBDA(const int64_t k) { + int freq = (k < l_n_modes / 2) ? k : k - l_n_modes; + RealType ft = 0.0; + + for (int i = 0; i < l_q; ++i) { + const RealType x = nodes(i); + const RealType w = weights(i); + const RealType ker = Kokkos::exp(beta * (Kokkos::sqrt(1.0 - x * x) - 1.0)); + ft += w * ker * Kokkos::cos(freq * alpha * x); + } + + const RealType deconv = 2.0 / (kernel.width() * ft); + + // Phase correction for cell-centered DOFs: exp(-i pi freq / N_grid) + // Type-1 uses this factor directly; Type-2 applies conj(factor). + const RealType phase = Kokkos::numbers::pi_v + * static_cast(freq) + / static_cast(n_grid); + factors(k) = + complex_type(deconv * Kokkos::cos(phase), deconv * Kokkos::sin(phase)); + }); + } + + /** + * @brief Apply deconvolution correction for Type 1 NUFFT (post-FFT). + * + * Type 1: nonuniform points -> uniform Fourier modes. + * After spreading (cell-centered) and FFT, the result satisfies: + * + * G_hat_k = phi_hat_k * exp(+i pi k / N) * conj(f_k) + * + * This function recovers f_k via: + * + * output_k = conj(input_k * factor_k) + * + * where factor_k = deconv_k * exp(-i pi k / N) (stored in `factors`). + * + * Operates locally; input lives on the upsampled grid, output on the + * mode grid. Entries outside the mode band are set to zero. + */ + template + void applyDeconvolutionType1( + FieldIn& input, + const std::array*, typename ExecSpace::memory_space>, + 3>& factors, + FieldOut& output, const Vector& n_modes, const Vector&) { + using complex_type = Kokkos::complex; + + constexpr unsigned Dim = 3; + + auto input_view = input.getView(); + auto output_view = output.getView(); + + const auto& layout = input.getLayout(); + const auto& lDom = layout.getLocalNDIndex(); + + const int nghost_in = input.getNghost(); + const int nghost_out = output.getNghost(); + + Vector local_first, local_last; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + local_last[d] = lDom[d].last(); + } + + auto f0 = factors[0]; + auto f1 = factors[1]; + auto f2 = factors[2]; + + const int nx = static_cast(n_modes[0]); + const int ny = static_cast(n_modes[1]); + const int nz = static_cast(n_modes[2]); + + Kokkos::parallel_for( + "deconv_type1_3d_local", + Kokkos::MDRangePolicy>( + {local_first[0], local_first[1], local_first[2]}, + {local_last[0] + 1, local_last[1] + 1, local_last[2] + 1}), + KOKKOS_LAMBDA(int gi, int gj, int gk) { + // Corner-DC layout: modes in [0, N/2) ∪ [N+N/2, 2N) + auto in_bounds = [&](int g, int n) { + return (g >= 0 && g < n / 2) || (g >= n + n / 2 && g < 2 * n); + }; + + const int li_in = gi - local_first[0] + nghost_in; + const int lj_in = gj - local_first[1] + nghost_in; + const int lk_in = gk - local_first[2] + nghost_in; + const int li_out = gi - local_first[0] + nghost_out; + const int lj_out = gj - local_first[1] + nghost_out; + const int lk_out = gk - local_first[2] + nghost_out; + + if (in_bounds(gi, nx) && in_bounds(gj, ny) && in_bounds(gk, nz)) { + // Map upsampled-grid corner-DC index back to factor index [0, n_modes) + auto rescale = [](int g, int n) { + return (g < n) ? g : g - n; + }; + + // factor = deconv * exp(-i pi freq / N_grid) + const complex_type factor = + f0(rescale(gi, nx)) * f1(rescale(gj, ny)) * f2(rescale(gk, nz)); + + // f_k = conj(G_hat_k * factor) + output_view(li_out, lj_out, lk_out) = + Kokkos::conj(input_view(li_in, lj_in, lk_in) * factor); + } else { + output_view(li_out, lj_out, lk_out) = complex_type(0, 0); + } + }); + + Kokkos::fence(); + } + + /** + * @brief Apply pre-correction for Type 2 NUFFT (pre-IFFT). + * + * Type 2: uniform Fourier modes -> nonuniform points. + * Before the IFFT and cell-centered gather, the mode field must be + * pre-multiplied so that the gather recovers the correct values: + * + * G_hat_k = N * f_k * conj(factor_k) + * + * where factor_k = deconv_k * exp(-i pi k / N) (stored in `factors`), + * so conj(factor_k) = deconv_k * exp(+i pi k / N). + * + * Entries outside the mode band are set to zero. + */ + template + void applyPreCorrectionType2( + FieldIn& input, + const std::array*, typename ExecSpace::memory_space>, + 3>& factors, + FieldOut& output, const Vector& n_modes, const Vector&) { + using complex_type = Kokkos::complex; + + constexpr unsigned Dim = 3; + + auto input_view = input.getView(); + auto output_view = output.getView(); + + const auto& layout = input.getLayout(); + const auto& lDom = layout.getLocalNDIndex(); + + const int nghost_in = input.getNghost(); + const int nghost_out = output.getNghost(); + + Vector local_first, local_last; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + local_last[d] = lDom[d].last(); + } + + Kokkos::deep_copy(output_view, complex_type(0, 0)); + + auto f0 = factors[0]; + auto f1 = factors[1]; + auto f2 = factors[2]; + + const int nx = static_cast(n_modes[0]); + const int ny = static_cast(n_modes[1]); + const int nz = static_cast(n_modes[2]); + + Kokkos::parallel_for( + "precorr_type2_3d_local", + Kokkos::MDRangePolicy>( + {local_first[0], local_first[1], local_first[2]}, + {local_last[0] + 1, local_last[1] + 1, local_last[2] + 1}), + KOKKOS_LAMBDA(int gi, int gj, int gk) { + auto in_bounds = [&](int g, int n) { + return (g >= 0 && g < n / 2) || (g >= n + n / 2 && g < 2 * n); + }; + + const int li_in = gi - local_first[0] + nghost_in; + const int lj_in = gj - local_first[1] + nghost_in; + const int lk_in = gk - local_first[2] + nghost_in; + const int li_out = gi - local_first[0] + nghost_out; + const int lj_out = gj - local_first[1] + nghost_out; + const int lk_out = gk - local_first[2] + nghost_out; + + if (in_bounds(gi, nx) && in_bounds(gj, ny) && in_bounds(gk, nz)) { + auto rescale = [](int g, int n) { + return (g < n) ? g : g - n; + }; + + // factor = deconv * exp(-i pi freq / N_grid) + const complex_type factor = + f0(rescale(gi, nx)) * f1(rescale(gj, ny)) * f2(rescale(gk, nz)); + + // G_hat_k = f_k * conj(factor) [conj gives +i pi phase] + output_view(li_out, lj_out, lk_out) = + input_view(li_in, lj_in, lk_in) * Kokkos::conj(factor); + } else { + output_view(li_out, lj_out, lk_out) = complex_type(0, 0); + } + }); + + Kokkos::fence(); + } + + /** + * @brief Apply deconvolution for Type 1 NUFFT on the pruned mode grid. + * + * Same as applyDeconvolutionType1 but operates directly on a field + * already living in mode space (0..n_modes[d]-1 per dim, corner-DC). + * + * field(gi,gj,gk) <- conj( field(gi,gj,gk) * f0(gi)*f1(gj)*f2(gk) ) + */ + template + void applyDeconvolutionPruned( + FieldType& field, + const std::array*, typename ExecSpace::memory_space>, + 3>& factors, + const Vector& n_modes, const Vector& /*n_grid*/) { + using complex_type = Kokkos::complex; + constexpr unsigned Dim = 3; + + auto view = field.getView(); + auto& layout = field.getLayout(); + auto lDom = layout.getLocalNDIndex(); + + const int nghost = field.getNghost(); + + Vector local_first, local_last; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + local_last[d] = lDom[d].last(); + } + + auto f0 = factors[0]; + auto f1 = factors[1]; + auto f2 = factors[2]; + + const int nx = static_cast(n_modes[0]); + const int ny = static_cast(n_modes[1]); + const int nz = static_cast(n_modes[2]); + + Kokkos::parallel_for( + "deconv_type1_pruned_local", + Kokkos::MDRangePolicy>( + {local_first[0], local_first[1], local_first[2]}, + {local_last[0] + 1, local_last[1] + 1, local_last[2] + 1}), + KOKKOS_LAMBDA(int gi, int gj, int gk) { + if (gi < 0 || gj < 0 || gk < 0 || gi >= nx || gj >= ny || gk >= nz) + return; + + const int li = gi - local_first[0] + nghost; + const int lj = gj - local_first[1] + nghost; + const int lk = gk - local_first[2] + nghost; + + // factor = deconv * exp(-i pi freq / N_grid) + const complex_type factor = f0(gi) * f1(gj) * f2(gk); + + // f_k = conj(G_hat_k * factor) + view(li, lj, lk) = Kokkos::conj(view(li, lj, lk) * factor); + }); + + Kokkos::fence(); + } + + /** + * @brief Apply pre-correction for Type 2 NUFFT on the pruned mode grid. + * + * Same as applyPreCorrectionType2 but operates directly on a field + * already in mode space (0..n_modes[d]-1 per dim, corner-DC). + * + * field(gi,gj,gk) <- field(gi,gj,gk) * conj( f0(gi)*f1(gj)*f2(gk) ) + * + * The conj gives the +i pi phase needed for the IFFT + cell-centered gather + * to recover the correct values. + */ + template + void applyPrecorrectionPruned( + FieldType& field, + const std::array*, typename ExecSpace::memory_space>, + 3>& factors, + const Vector& n_modes, const Vector& /*n_grid*/) { + using complex_type = Kokkos::complex; + constexpr unsigned Dim = 3; + + auto view = field.getView(); + auto& layout = field.getLayout(); + auto lDom = layout.getLocalNDIndex(); + + const int nghost = field.getNghost(); + + Vector local_first, local_last; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + local_last[d] = lDom[d].last(); + } + + auto f0 = factors[0]; + auto f1 = factors[1]; + auto f2 = factors[2]; + + const int nx = static_cast(n_modes[0]); + const int ny = static_cast(n_modes[1]); + const int nz = static_cast(n_modes[2]); + + Kokkos::parallel_for( + "precorr_type2_pruned_local", + Kokkos::MDRangePolicy>( + {local_first[0], local_first[1], local_first[2]}, + {local_last[0] + 1, local_last[1] + 1, local_last[2] + 1}), + KOKKOS_LAMBDA(int gi, int gj, int gk) { + if (gi < 0 || gj < 0 || gk < 0 || gi >= nx || gj >= ny || gk >= nz) + return; + + const int li = gi - local_first[0] + nghost; + const int lj = gj - local_first[1] + nghost; + const int lk = gk - local_first[2] + nghost; + + // factor = deconv * exp(-i pi freq / N_grid) + const complex_type factor = f0(gi) * f1(gj) * f2(gk); + + // G_hat_k = f_k * conj(factor) [conj gives +i pi phase] + view(li, lj, lk) *= factor; + }); + + Kokkos::fence(); + } + + } // namespace nufft +} // namespace ippl + +#endif // IPPL_NUFFT_CORRECTION_H \ No newline at end of file diff --git a/src/FFT/NUFFT/ESKernel.h b/src/FFT/NUFFT/ESKernel.h new file mode 100644 index 000000000..f0a50ebc2 --- /dev/null +++ b/src/FFT/NUFFT/ESKernel.h @@ -0,0 +1,446 @@ +// +// NUFFT ES Kernel +// Exponential-of-Semicircle spreading kernel for NUFFT. +// +#ifndef IPPL_NUFFT_ES_KERNEL_H +#define IPPL_NUFFT_ES_KERNEL_H + +#include + +#include + +namespace ippl { + namespace nufft { + + // ============================================================ + // ES Kernel polynomial evaluation for each width + // Coefficients are inlined to work in device code + // ============================================================ + + // w = 4, beta = 9.2, degree = 5, error < 3.53e-4 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w4(T x) { + const T t = x * x; + T r = T(-0x1.7a61695c211e2p0); + r = r * t + T(0x1.7741c5626c40bp2); + r = r * t + T(-0x1.3ccc694d94a47p3); + r = r * t + T(0x1.22ccba2dfd6a5p3); + r = r * t + T(-0x1.24a8469892c07p2); + r = r * t + T(0x1.ffd1c0e2c6db8p-1); + return r; + } + + // w = 5, beta = 11.5, degree = 7, error < 1.74e-5 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w5(T x) { + const T t = x * x; + T r = T(-0x1.53ef7dfa4cddap0); + r = r * t + T(0x1.c1971511f9acbp2); + r = r * t + T(-0x1.0ed3008908634p4); + r = r * t + T(0x1.8a1fbea0250d2p4); + r = r * t + T(-0x1.7b28be8632757p4); + r = r * t + T(0x1.e1574df952c56p3); + r = r * t + T(-0x1.6fd9a0a46e05ap2); + r = r * t + T(0x1.fffdb8228ab29p-1); + return r; + } + + // w = 6, beta = 13.8, degree = 9, error < 8.50e-7 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w6(T x) { + const T t = x * x; + T r = T(-0x1.2290d68e62bcfp0); + r = r * t + T(0x1.d9edacd03ff7ep2); + r = r * t + T(-0x1.6afaba0b81c05p4); + r = r * t + T(0x1.5e4576519fd7ep5); + r = r * t + T(-0x1.dd17e73ccea97p5); + r = r * t + T(0x1.ddcc59a8e52adp5); + r = r * t + T(-0x1.5d0a435b3428ep5); + r = r * t + T(0x1.612ebf7fd3207p4); + r = r * t + T(-0x1.b996b31b5ba67p2); + r = r * t + T(0x1.ffffe37649033p-1); + return r; + } + + // w = 7, beta = 16.1, degree = 10, error < 4.52e-7 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w7(T x) { + const T t = x * x; + T r = T(0x1.0d43fb941e8cp1); + r = r * t + T(-0x1.d07f762f83e4cp3); + r = r * t + T(0x1.76aa8fe35ec4ep5); + r = r * t + T(-0x1.7b643526b5bfcp6); + r = r * t + T(0x1.0ff008e09251bp7); + r = r * t + T(-0x1.23a681af8ca75p7); + r = r * t + T(0x1.da7c21b010748p6); + r = r * t + T(-0x1.1eb0a0c2150d6p6); + r = r * t + T(0x1.e6250a3d3d121p4); + r = r * t + T(-0x1.0198abec8a4eep3); + r = r * t + T(0x1.fffff0d9a4617p-1); + return r; + } + + // w = 8, beta = 18.4, degree = 12, error < 2.33e-8 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w8(T x) { + const T t = x * x; + T r = T(0x1.d5cd3d519edbcp0); + r = r * t + T(-0x1.da1032722476ep3); + r = r * t + T(0x1.c5cddddf485b9p5); + r = r * t + T(-0x1.1533735f3e975p7); + r = r * t + T(0x1.e9cf3a5cff485p7); + r = r * t + T(-0x1.4e105fc4aa621p8); + r = r * t + T(0x1.6a81ebd6683f4p8); + r = r * t + T(-0x1.3a18274b0a1e3p8); + r = r * t + T(0x1.ab15412d571b6p7); + r = r * t + T(-0x1.b70af68ed9b3fp6); + r = r * t + T(0x1.4028006f3c69bp5); + r = r * t + T(-0x1.26665564ca36p3); + r = r * t + T(0x1.ffffff37dec0bp-1); + return r; + } + + // w = 9, beta = 20.7, degree = 14, error < 1.20e-9 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w9(T x) { + const T t = x * x; + T r = T(0x1.90982143e4ae3p0); + r = r * t + T(-0x1.cd7480dda6f42p3); + r = r * t + T(0x1.fcc7e2c6cba0dp5); + r = r * t + T(-0x1.694ede76783d8p7); + r = r * t + T(0x1.7752cfcde75a6p8); + r = r * t + T(-0x1.31e18cfaaedc8p9); + r = r * t + T(0x1.968a12dc2f151p9); + r = r * t + T(-0x1.bf8cb2e1ddc74p9); + r = r * t + T(0x1.97659066136f6p9); + r = r * t + T(-0x1.2e19d0407765dp9); + r = r * t + T(0x1.6374318a4696bp8); + r = r * t + T(-0x1.3e98a86653c26p7); + r = r * t + T(0x1.97ca274772c9fp5); + r = r * t + T(-0x1.4b33320a95fccp3); + r = r * t + T(0x1.fffffff5b40bp-1); + return r; + } + + // w = 10, beta = 23.0, degree = 16, error < 6.20e-11 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w10(T x) { + const T t = x * x; + T r = T(0x1.5017876c93c63p0); + r = r * t + T(-0x1.b266030e0096ap3); + r = r * t + T(0x1.0e462bf649f05p6); + r = r * t + T(-0x1.b3a48c1d9288p7); + r = r * t + T(0x1.025a87a6770c9p9); + r = r * t + T(-0x1.e4e7636a1af58p9); + r = r * t + T(0x1.780fc81f87388p10); + r = r * t + T(-0x1.ed395a8d098afp10); + r = r * t + T(0x1.1373eb09a6122p11); + r = r * t + T(-0x1.04dade35c1013p11); + r = r * t + T(0x1.9d913dd2b0b9ep10); + r = r * t + T(-0x1.0d05b6008f472p10); + r = r * t + T(0x1.1733f862d30f6p9); + r = r * t + T(-0x1.bbb54324b711ap7); + r = r * t + T(0x1.f9fffe1fb327ap5); + r = r * t + T(-0x1.6fffffec5ff52p3); + r = r * t + T(0x1.ffffffff77951p-1); + return r; + } + + // w = 11, beta = 25.3, degree = 17, error < 3.20e-11 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w11(T x) { + const T t = x * x; + T r = T(-0x1.59009930566c2p1); + r = r * t + T(0x1.cd02dcaa304dcp4); + r = r * t + T(-0x1.27b6792d84287p7); + r = r * t + T(0x1.e93b8598d8732p8); + r = r * t + T(-0x1.2827b4bce0869p10); + r = r * t + T(0x1.1a3cd50b8ebb6p11); + r = r * t + T(-0x1.bb6bf72d60cf3p11); + r = r * t + T(0x1.27231ed89e683p12); + r = r * t + T(-0x1.5123d0d1023adp12); + r = r * t + T(0x1.4ad53cddfcaf1p12); + r = r * t + T(-0x1.14cc6d44fbd4ap12); + r = r * t + T(0x1.856985c4590bep11); + r = r * t + T(-0x1.c3591519272cep10); + r = r * t + T(0x1.a2f8d49451968p9); + r = r * t + T(-0x1.2af4cfa9b5757p8); + r = r * t + T(0x1.33651e1b6dc6p6); + r = r * t + T(-0x1.94ccccc16875dp3); + r = r * t + T(0x1.ffffffffb99d9p-1); + return r; + } + + // w = 12, beta = 27.6, degree = 19, error < 1.65e-12 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w12(T x) { + const T t = x * x; + T r = T(-0x1.279db73ec08b8p1); + r = r * t + T(0x1.b36c753c2aa7dp4); + r = r * t + T(-0x1.3537a8d5dacp7); + r = r * t + T(0x1.1c4ad5335b3c1p9); + r = r * t + T(-0x1.7febf4612070ap10); + r = r * t + T(0x1.99bccfa2b45ffp11); + r = r * t + T(-0x1.6a7e6841cdd94p12); + r = r * t + T(0x1.1229b3df0dde7p13); + r = r * t + T(-0x1.68b72ed61ef3ap13); + r = r * t + T(0x1.9f70e8e221323p13); + r = r * t + T(-0x1.a21e19c0501eap13); + r = r * t + T(0x1.6d0860e26b0f4p13); + r = r * t + T(-0x1.11406ebe54d37p13); + r = r * t + T(0x1.594e7a8cbab51p12); + r = r * t + T(-0x1.68bd26f203d21p11); + r = r * t + T(0x1.2ed3db062aa1p10); + r = r * t + T(-0x1.8820826cc8dc9p8); + r = r * t + T(0x1.6f147ad4fda94p6); + r = r * t + T(-0x1.b9999998e030ap3); + r = r * t + T(0x1.fffffffffc5dcp-1); + return r; + } + + // w = 13, beta = 29.9, degree = 21, error < 8.48e-14 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w13(T x) { + const T t = x * x; + T r = T(-0x1.f5b0db6099cd3p0); + r = r * t + T(0x1.93556b5289c77p4); + r = r * t + T(-0x1.39b591914c767p7); + r = r * t + T(0x1.3cc43c9c9dd5p9); + r = r * t + T(-0x1.d6e0b91c3f952p10); + r = r * t + T(0x1.1530d489678cfp12); + r = r * t + T(-0x1.0f58cc3b57c6fp13); + r = r * t + T(0x1.c8810a3e2b99cp13); + r = r * t + T(-0x1.50d1cdb6d5abcp14); + r = r * t + T(0x1.b8757eaf1ebap14); + r = r * t + T(-0x1.ffa40c1c3b2a6p14); + r = r * t + T(0x1.07216db3d2af2p15); + r = r * t + T(-0x1.dc0307c1e538p14); + r = r * t + T(0x1.76f66049cee1ap14); + r = r * t + T(-0x1.fc0d2356bef94p13); + r = r * t + T(0x1.2354fa3750e5cp13); + r = r * t + T(-0x1.14f973d66e8afp12); + r = r * t + T(0x1.a85e5aa22f6bap10); + r = r * t + T(-0x1.f6e308d0ea5eap8); + r = r * t + T(0x1.b00e1479f5c26p6); + r = r * t + T(-0x1.de6666665ae9p3); + r = r * t + T(0x1.ffffffffffd04p-1); + return r; + } + + // w = 14, beta = 32.2, degree = 22, error < 4.69e-14 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w14(T x) { + const T t = x * x; + T r = T(0x1.0354efce10e2dp2); + r = r * t + T(-0x1.ae25418562cf8p5); + r = r * t + T(0x1.58827d3735653p8); + r = r * t + T(-0x1.65245773d1613p10); + r = r * t + T(0x1.0f606d2c65324p12); + r = r * t + T(-0x1.44f69b9ed1fe3p13); + r = r * t + T(0x1.41dfc1b9c2d3ap14); + r = r * t + T(-0x1.10df39932e40cp15); + r = r * t + T(0x1.9536924063e52p15); + r = r * t + T(-0x1.0b15d6e06f48p16); + r = r * t + T(0x1.3a48e2051a128p16); + r = r * t + T(-0x1.49fabd3679aeep16); + r = r * t + T(0x1.33b8d92983ebfp16); + r = r * t + T(-0x1.f9f712f3c39a1p15); + r = r * t + T(0x1.6b15557642d58p15); + r = r * t + T(-0x1.c13113d9b8e7fp14); + r = r * t + T(0x1.d7772ec3bd135p13); + r = r * t + T(-0x1.9b2d52745b9d6p12); + r = r * t + T(0x1.21a51b7b71831p11); + r = r * t + T(-0x1.3c60dfe4bed5dp9); + r = r * t + T(0x1.f651eb8483554p6); + r = r * t + T(-0x1.019999999621ep4); + r = r * t + T(0x1.ffffffffffe5bp-1); + return r; + } + + // w = 15, beta = 34.5, degree = 24, error < 2.46e-15 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w15(T x) { + const T t = x * x; + T r = T(0x1.bcc3fd37452d5p1); + r = r * t + T(-0x1.8eb424f00a28dp5); + r = r * t + T(0x1.5a19d25b45029p8); + r = r * t + T(-0x1.85c332c280f8dp10); + r = r * t + T(0x1.424ce09ce2f3ap12); + r = r * t + T(-0x1.a49341a23526dp13); + r = r * t + T(0x1.c69265bf4a9d7p14); + r = r * t + T(-0x1.a5637540ea295p15); + r = r * t + T(0x1.57650b877caaep16); + r = r * t + T(-0x1.f3de53de15289p16); + r = r * t + T(0x1.47a7b42596f6ep17); + r = r * t + T(-0x1.83c339a13039ep17); + r = r * t + T(0x1.9d812c92ca63fp17); + r = r * t + T(-0x1.8b81cbd899304p17); + r = r * t + T(0x1.51124b72fffedp17); + r = r * t + T(-0x1.fbb1cbaa2b528p16); + r = r * t + T(0x1.4e6e67a82c9d4p16); + r = r * t + T(-0x1.7c8f736d5aefp15); + r = r * t + T(0x1.70230df3fd36ap14); + r = r * t + T(-0x1.28835f02a5b9bp13); + r = r * t + T(0x1.829acbfacfe6bp11); + r = r * t + T(-0x1.87a0ffff89722p9); + r = r * t + T(0x1.20effffffa61ap7); + r = r * t + T(-0x1.13ffffffffc9fp4); + r = r * t + T(0x1.fffffffffffebp-1); + return r; + } + + // ============================================================ + // Runtime width dispatch + // ============================================================ + + /** + * @brief Runtime width dispatch for ES kernel evaluation + * @param x Evaluation point in [0, 1] + * @param w Kernel width (4-15) + * @return Approximation to exp(beta * (sqrt(1-x^2) - 1)) + */ + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval(T x, int w) { + switch (w) { + case 4: + return es_kernel_eval_w4(x); + case 5: + return es_kernel_eval_w5(x); + case 6: + return es_kernel_eval_w6(x); + case 7: + return es_kernel_eval_w7(x); + case 8: + return es_kernel_eval_w8(x); + case 9: + return es_kernel_eval_w9(x); + case 10: + return es_kernel_eval_w10(x); + case 11: + return es_kernel_eval_w11(x); + case 12: + return es_kernel_eval_w12(x); + case 13: + return es_kernel_eval_w13(x); + case 14: + return es_kernel_eval_w14(x); + case 15: + return es_kernel_eval_w15(x); + default: + // Fallback to exact evaluation for unsupported widths + return Kokkos::exp(T(2.30) * w * (Kokkos::sqrt(T(1) - x * x) - T(1))); + } + } + + // ============================================================ + // Compile-time width dispatch (template version) + // ============================================================ + + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval(T x) { + if constexpr (W == 4) { + return es_kernel_eval_w4(x); + } else if constexpr (W == 5) { + return es_kernel_eval_w5(x); + } else if constexpr (W == 6) { + return es_kernel_eval_w6(x); + } else if constexpr (W == 7) { + return es_kernel_eval_w7(x); + } else if constexpr (W == 8) { + return es_kernel_eval_w8(x); + } else if constexpr (W == 9) { + return es_kernel_eval_w9(x); + } else if constexpr (W == 10) { + return es_kernel_eval_w10(x); + } else if constexpr (W == 11) { + return es_kernel_eval_w11(x); + } else if constexpr (W == 12) { + return es_kernel_eval_w12(x); + } else if constexpr (W == 13) { + return es_kernel_eval_w13(x); + } else if constexpr (W == 14) { + return es_kernel_eval_w14(x); + } else if constexpr (W == 15) { + return es_kernel_eval_w15(x); + } else { + // Fallback to exact evaluation for unsupported widths + return Kokkos::exp(T(2.30) * W * (Kokkos::sqrt(T(1) - x * x) - T(1))); + } + } + + // ============================================================ + // ESKernel class + // ============================================================ + + /** + * @brief Exponential-of-Semicircle (ES) spreading kernel. + * + * The ES kernel is defined as: + * phi(x) = exp(beta * (sqrt(1 - x^2) - 1)) for |x| < 1 + * = 0 otherwise + * + * Width (w) and beta are derived from a user-chosen error tolerance. + * + * @tparam T Floating point type (float or double) + */ + template + class ESKernel { + public: + static constexpr bool has_width_template = true; + // Upper bound on the runtime width; matches the precomputed + // ES-kernel polynomial expansions (w = 4..15) used by NativeNUFFT. + static constexpr int max_width = 15; + using value_type = T; + + static constexpr T default_tol = T(1e-10); + static constexpr T beta_factor = T(2.30); + + /** + * @brief Construct kernel with given tolerance. + * @param tol Error tolerance for NUFFT accuracy + */ + KOKKOS_INLINE_FUNCTION explicit ESKernel(T tol = default_tol) + : w_(static_cast(Kokkos::ceil(Kokkos::log10(T(1.0) / tol))) + 1) + , beta_(beta_factor * w_) + , tol_(tol) {} + + /** + * @brief Construct kernel with explicit width and beta. + * @param width Kernel width (number of grid points) + * @param beta Beta parameter controlling decay + */ + KOKKOS_INLINE_FUNCTION ESKernel(int width, T beta) + : w_(width) + , beta_(beta) + , tol_(default_tol) {} + + /** + * @brief Evaluate the ES kernel at position x in [-1, 1]. + * @param x Normalized position + * @return Normalized kernel value + */ + KOKKOS_INLINE_FUNCTION T operator()(T x) const { + x = Kokkos::abs(x); + return x >= T(1.0) ? T(0.0) : es_kernel_eval(x, w_); + } + + template + KOKKOS_INLINE_FUNCTION T eval(T x) const { + return x >= T(1.0) ? T(0.0) : es_kernel_eval(x); + } + + KOKKOS_INLINE_FUNCTION int width() const { return w_; } + KOKKOS_INLINE_FUNCTION T beta() const { return beta_; } + KOKKOS_INLINE_FUNCTION T tol() const { return tol_; } + + private: + int w_; + T beta_; + T tol_; + }; + + } // namespace NUFFT +} // namespace ippl + + +#endif // IPPL_NUFFT_ES_KERNEL_H diff --git a/src/FFT/NUFFT/NUFFTUtilities.h b/src/FFT/NUFFT/NUFFTUtilities.h new file mode 100644 index 000000000..37cc4b305 --- /dev/null +++ b/src/FFT/NUFFT/NUFFTUtilities.h @@ -0,0 +1,65 @@ +#ifndef IPPL_NUFFT_UTILITIES_H +#define IPPL_NUFFT_UTILITIES_H + +/** + * @file NUFFTUtilities.h + * @brief NUFFT utility helpers used by the native Kokkos NUFFT engine. + */ + +#include +#include + +#include + +namespace ippl { +namespace nufft { + + /** + * @brief Computes Gauss-Legendre quadrature nodes and weights on [-1, 1]. + * + * Runs on host and copies results to the device-accessible views. + */ + template + void gauss_legendre(int n, + Kokkos::View &nodes, + Kokkos::View &weights) { + constexpr RealType eps = std::numeric_limits::epsilon(); + + auto h_nodes = Kokkos::create_mirror_view(Kokkos::HostSpace(), nodes); + auto h_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(), weights); + + for (int i = 0; i < n; ++i) { + RealType x = std::cos(Kokkos::numbers::pi_v * (i + 0.75) / (n + 0.5)); + RealType pp, delta; + + do { + RealType p1 = 1.0, p2 = 0.0; + for (int j = 0; j < n; ++j) { + const RealType p3 = p2; + p2 = p1; + p1 = ((2.0 * j + 1.0) * x * p2 - j * p3) / (j + 1.0); + } + pp = n * (x * p1 - p2) / (x * x - 1.0); + delta = p1 / pp; + x -= delta; + } while (std::abs(delta) > eps); + + h_nodes(i) = x; + h_weights(i) = 2.0 / ((1.0 - x * x) * pp * pp); + } + + // Exploit symmetry + for (int i = 0; i < n / 2; ++i) { + const int j = n - 1 - i; + h_nodes(j) = -h_nodes(i); + h_weights(j) = h_weights(i); + } + + Kokkos::deep_copy(nodes, h_nodes); + Kokkos::deep_copy(weights, h_weights); + } + +} // namespace nufft +} // namespace ippl + +#endif // IPPL_NUFFT_UTILITIES_H diff --git a/src/FFT/NUFFT/NativeNUFFT.h b/src/FFT/NUFFT/NativeNUFFT.h new file mode 100644 index 000000000..449ec1c4e --- /dev/null +++ b/src/FFT/NUFFT/NativeNUFFT.h @@ -0,0 +1,346 @@ +// +// Native NUFFT Implementation +// NUFFT using kernel-based scatter/gather and heFFTe FFT. +// Does not depend on external NUFFT libraries. +// +#ifndef IPPL_NATIVE_NUFFT_H +#define IPPL_NATIVE_NUFFT_H + +#include + +#include +#include +#include +#include + +#include "Types/Vector.h" + +#include "Utility/IpplTimings.h" + +#include "Field/Field.h" + +#include "../../Interpolation/Scatter/ScatterConfig.h" +#include "Correction.h" +#include "FFT/FFT.h" +#include "FFT/NUFFT/ESKernel.h" +#include "FFT/NUFFT/NUFFTUtilities.h" +#include "Particle/ParticleAttrib.h" + +namespace ippl { + namespace nufft { + + /** + * @brief NUFFT implementation. + * + * Type 1: Spread from nonuniform points to uniform Fourier modes + * (scatter -> FFT -> deconvolution) + * + * Type 2: Interpolate uniform Fourier modes at nonuniform points + * (correction -> FFT -> gather) + * + * @tparam Dim Number of dimensions + * @tparam T Floating point type + * @tparam ExecSpace Kokkos execution space + */ + template + class NativeNUFFT { + public: + using execution_space = ExecSpace; + using memory_space = typename ExecSpace::memory_space; + using complex_type = Kokkos::complex; + using size_type = size_t; + + // View types + using complex_view_1d = Kokkos::View; + using real_view_1d = Kokkos::View; + + // Field types + using Mesh_t = UniformCartesian; + using Centering_t = Cell; + using ComplexField = + Field::uniform_type; + using Layout_t = FieldLayout; + + struct Config { + T tol = T(1e-6); // Error tolerance + T sigma = T(2.0); // Upsampling factor + Interpolation::ScatterConfig scatter_config; + Interpolation::GatherConfig gather_config; + }; + + struct TimingInfo { + T spread = 0; + T fft = 0; + T correct = 0; + T total = 0; + }; + + private: + Config cfg_; + ESKernel kernel_; + Vector n_modes_; + Vector n_grid_; + + // Deconvolution factors per dimension + std::array factors_; + + // Upsampled grid for FFT + std::unique_ptr grid_field_; + std::unique_ptr grid_layout_; + std::unique_ptr grid_mesh_; + + // heFFTe FFT object + std::unique_ptr> heffte_fft_; + std::unique_ptr> pruned_fft_; + + TimingInfo timing_; + bool initialized_ = false; + bool use_upsampled_ = false; + + public: + /** + * @brief Construct NativeNUFFT with given mode counts. + * + * @param n_modes Number of Fourier modes per dimension + * @param cfg Configuration parameters + */ + NativeNUFFT(const Vector& n_modes, bool use_upsampled, Config cfg = {}) + : cfg_(cfg) + , kernel_(cfg.tol) + , n_modes_(n_modes) + , use_upsampled_(use_upsampled) { + // Compute upsampled grid sizes + for (unsigned d = 0; d < Dim; ++d) { + n_grid_[d] = std::bit_ceil( + std::max(cfg_.sigma * n_modes_[d], 2 * kernel_.width())); + } + initialized_ = false; + } + + /** + * @brief Initialize the NUFFT with a layout. + * + * Must be called before transform operations. + * + * @param comm MPI communicator + */ + void initialize(const Layout_t& modes_layout, const MPI_Comm& comm = MPI_COMM_WORLD) { + if (initialized_) + return; + + static IpplTimings::TimerRef initTimer = IpplTimings::getTimer("NativeNUFFT::init"); + IpplTimings::startTimer(initTimer); + + // Create index domain for upsampled grid + NDIndex domain; + for (unsigned d = 0; d < Dim; ++d) { + domain[d] = Index(n_grid_[d]); + } + + // Create decomposition + std::array isParallel; + isParallel.fill(true); + + // For uneven kernel width W, a point in the upper half of the last + // segment needs both the cell value at the end and the kernel support + // that extends w/2 into the halo, hence (W + 1) / 2 ghost layers. + const int nghost = (kernel_.width()) / 2 + 1; + + grid_layout_ = std::make_unique(comm, domain, isParallel, true, nghost); + + // Create mesh for upsampled grid + Vector origin, hx; + for (unsigned d = 0; d < Dim; ++d) { + origin[d] = 0; + T extent = T(2.0) * Kokkos::numbers::pi_v; + hx[d] = extent / n_grid_[d]; + } + + grid_mesh_ = std::make_unique(domain, hx, origin); + + // Native NUFFT uses field ghosts directly instead of creating an + // extended grid; carry kernel-width ghost cells on the field itself. + grid_field_ = std::make_unique(*grid_mesh_, *grid_layout_, nghost); + + // Initialize heFFTe FFT + if (use_upsampled_) { + ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", 3); + heffte_fft_ = + std::make_unique>(*grid_layout_, fftParams); + } else { + ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", 2); + fftParams.add("num_concurrent_ffts", 4); + PruningParams pruning_params; + // Set pruning params to output the desired n_modes_, not n_grid_/2 + for (int d = 0; d < static_cast(Dim); ++d) { + pruning_params.n_modes[d] = n_modes_[d]; + } + + pruned_fft_ = std::make_unique>( + *grid_layout_, modes_layout, pruning_params, fftParams); + } + + // Precompute deconvolution factors + ESKernel nufft_kernel(cfg_.tol); + for (unsigned d = 0; d < Dim; ++d) { + factors_[d] = complex_view_1d("deconv_factors", n_modes_[d]); + + ippl::nufft::compute_deconvolution_factors( + factors_[d], static_cast(n_modes_[d]), + static_cast(n_grid_[d]), nufft_kernel); + } + + Kokkos::fence(); + + initialized_ = true; + IpplTimings::stopTimer(initTimer); + } + + /** + * @brief Type 1 NUFFT: Spread from nonuniform points to uniform Fourier modes. + * + * Computes f_k = sum_j c_j * exp(i * k * x_j) for uniform k. + * + * @tparam Properties ParticleAttrib properties + * @tparam OutField Output field type + * @param R Particle positions in [0, 2*pi)^Dim + * @param Q Particle values (input) + * @param f Output Fourier modes field + * @param upsampled_output Whether the output field is the usampeld grid. Required on + * distributed for now + */ + template + void type1(const ParticleAttrib, Properties...>& R, + const ParticleAttrib& Q, OutField& f, + bool upsampled_output = false) { + if (!initialized_) { + throw IpplException("NativeNUFFT::type1", + "NUFFT not initialized. Call initialize() first."); + } + static IpplTimings::TimerRef NativeNUFFT1Timer = IpplTimings::getTimer("NativeNUFFT1"); + IpplTimings::startTimer(NativeNUFFT1Timer); + + + static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("scatterTimerNUFFT1"); + IpplTimings::startTimer(scatterTimer); + Kokkos::deep_copy(grid_field_->getView(), 0.0); + + Q.scatter_kernel(*grid_field_, R, kernel_, cfg_.scatter_config); + Kokkos::fence(); + IpplTimings::stopTimer(scatterTimer); + + static IpplTimings::TimerRef fftTimer = IpplTimings::getTimer("FFTNUFFT1"); + IpplTimings::startTimer(fftTimer); + // Step 2: Inverse FFT + if (upsampled_output) { + performFFT(-1); + } else { + performPrunedFFT(1, f); + } + IpplTimings::stopTimer(fftTimer); + + static IpplTimings::TimerRef deconvolutionTimer = IpplTimings::getTimer("deconvolutionNUFFT1"); + IpplTimings::startTimer(deconvolutionTimer); + // Step 3: Deconvolution and truncation to output modes + if (upsampled_output) { + applyDeconvolutionType1( + *grid_field_, factors_, f, n_modes_, n_grid_); + } else { + applyDeconvolutionPruned(f, factors_, n_modes_, n_grid_); + } + IpplTimings::stopTimer(deconvolutionTimer); + IpplTimings::stopTimer(NativeNUFFT1Timer); + + } + + + template + void type2(InField& f, const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, bool upsampled_output = false) { + if (!initialized_) { + throw IpplException("NativeNUFFT::type2", + "NUFFT not initialized. Call initialize() first."); + } + static IpplTimings::TimerRef NativeNUFFT2Timer = IpplTimings::getTimer("NativeNUFFT2"); + IpplTimings::startTimer(NativeNUFFT2Timer); + + // ============================================================ + // Step 1: Apply pre-correction + // ============================================================ + static IpplTimings::TimerRef PrecorrectionTimer = IpplTimings::getTimer("PrecorrectionNUFFT2"); + IpplTimings::startTimer(PrecorrectionTimer); + + if (upsampled_output) { + applyPreCorrectionType2( + f, factors_, *grid_field_, n_modes_, n_grid_); + } else { + applyPrecorrectionPruned(f, factors_, n_modes_, n_grid_); + } + IpplTimings::stopTimer(PrecorrectionTimer); + + Kokkos::fence(); + + // ============================================================ + // Step 2: Inverse FFT + // ============================================================ + static IpplTimings::TimerRef FFTTimer = IpplTimings::getTimer("FFTNUFFT2"); + IpplTimings::startTimer(FFTTimer); + if (upsampled_output) { + performFFT(-1); // operates on grid_field_ + } else { + performPrunedFFT(-1, f); // writes into grid_field_ + } + IpplTimings::stopTimer(FFTTimer); + Kokkos::fence(); + + // ============================================================ + // Step 3: Gather/interpolate at particle positions + // ============================================================ + static IpplTimings::TimerRef GatherTimer = IpplTimings::getTimer("GatherNUFFT2"); + IpplTimings::startTimer(GatherTimer); + Q.gather(*grid_field_, R, kernel_, false, cfg_.gather_config); + IpplTimings::stopTimer(GatherTimer); + Kokkos::fence(); + IpplTimings::stopTimer(NativeNUFFT2Timer); + } + + // Accessors + const TimingInfo& timing() const { return timing_; } + const ESKernel& kernel() const { return kernel_; } + Vector gridSize() const { return n_grid_; } + Vector numModes() const { return n_modes_; } + + void resetTimings() { + timing_ = TimingInfo{}; + } + + void performFFT(int sign) { + TransformDirection direction = (sign < 0) ? BACKWARD : FORWARD; + heffte_fft_->transform(direction, *grid_field_); + } + + void performPrunedFFT(int sign, auto& output_field) { + TransformDirection direction = (sign < 0) ? BACKWARD : FORWARD; + if (sign < 0) { + pruned_fft_->transform(direction, output_field, *grid_field_, 1); + } else { + pruned_fft_->transform(direction, *grid_field_, output_field, -1); + } + } + }; + + } // namespace NUFFT +} // namespace ippl + +#endif // IPPL_NATIVE_NUFFT_H diff --git a/src/FFT/Traits.h b/src/FFT/Traits.h new file mode 100644 index 000000000..31815aaa2 --- /dev/null +++ b/src/FFT/Traits.h @@ -0,0 +1,242 @@ +#ifndef IPPL_FFT_TRAITS_H +#define IPPL_FFT_TRAITS_H + +#include + +#ifdef IPPL_ENABLE_CUFFTMP +#include +#endif + +#include +#include +#include + +namespace ippl { + // Transform tags + struct CCTransform {}; + struct RCTransform {}; + struct SineTransform {}; + struct CosTransform {}; + struct Cos1Transform {}; + struct NUFFTransform {}; + struct PrunedCCTransform {}; + struct PrunedRCTransform {}; + + // Direction + enum TransformDirection { + FORWARD, + BACKWARD + }; + + // Communication strategy + enum FFTComm { + a2a = 0, + a2av = 1, + p2p = 2, + p2p_pl = 3 + }; + + // Pruning parameters + template + struct PruningParams { + Vector n_modes{}; + + PruningParams() = default; + + template + explicit PruningParams(const Vec& modes) { + for (unsigned d = 0; d < Dim; ++d) { + n_modes[d] = modes[d]; + } + } + }; + + // Primary template - specialized per transform type + template + class FFT; + + namespace fft { + + //============================================================================= + // Feature Tags + //============================================================================= + + struct FFTW {}; + struct MKL {}; + struct CuFFT {}; + struct RocFFT {}; + struct HeffteGPU {}; + struct CuFFTMp {}; + struct Finufft {}; + struct GPUFinufft {}; + + //============================================================================= + // Unified Feature Detection: is_available + //============================================================================= + + template + struct is_available : std::false_type {}; + +#ifdef Heffte_ENABLE_FFTW + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_MKL + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_CUDA + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_ROCM + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_GPU + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef IPPL_ENABLE_CUFFTMP + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef ENABLE_FINUFFT + template <> + struct is_available : std::true_type {}; +#endif + +#if defined(ENABLE_FINUFFT) && defined(ENABLE_GPU_NUFFT) + template <> + struct is_available : std::true_type {}; +#endif + + template + inline constexpr bool is_available_v = is_available::value; + + //============================================================================= + // heFFTe Backend Selection by Memory Space + //============================================================================= + + template + struct HeffteBackend { + // Default: stock backend + using c2c = heffte::backend::stock; + using sin = heffte::backend::stock_sin; + using cos = heffte::backend::stock_cos; + using cos1 = heffte::backend::stock_cos1; + }; + + // Host: FFTW > MKL > Stock +#if defined(Heffte_ENABLE_FFTW) + template <> + struct HeffteBackend { + using c2c = heffte::backend::fftw; + using sin = heffte::backend::fftw_sin; + using cos = heffte::backend::fftw_cos; + using cos1 = heffte::backend::fftw_cos1; + }; +#elif defined(Heffte_ENABLE_MKL) + template <> + struct HeffteBackend { + using c2c = heffte::backend::mkl; + using sin = heffte::backend::mkl_sin; + using cos = heffte::backend::mkl_cos; + using cos1 = heffte::backend::mkl_cos1; + }; +#endif + +#ifdef KOKKOS_ENABLE_CUDA + template <> + struct HeffteBackend { + using c2c = heffte::backend::cufft; + using sin = heffte::backend::cufft_sin; + using cos = heffte::backend::cufft_cos; + using cos1 = heffte::backend::cufft_cos1; + }; +#endif + +#ifdef KOKKOS_ENABLE_HIP + template <> + struct HeffteBackend { +#ifdef Heffte_ENABLE_ROCM + using c2c = heffte::backend::rocfft; + using sin = heffte::backend::rocfft_sin; + using cos = heffte::backend::rocfft_cos; + using cos1 = heffte::backend::rocfft_cos1; +#else + using c2c = heffte::backend::stock; + using sin = heffte::backend::stock_sin; + using cos = heffte::backend::stock_cos; + using cos1 = heffte::backend::stock_cos1; +#endif + }; +#endif + + //============================================================================= + // GPU Stream Support + //============================================================================= + + template + struct Stream { + using stream_type = int; // Dummy + using exec_space = Kokkos::DefaultExecutionSpace; + + static void create(stream_type&) {} + static void destroy(stream_type&) {} + static void sync(stream_type&) {} + static exec_space instance(stream_type&) { return exec_space(); } + }; + +#ifdef KOKKOS_ENABLE_CUDA + template <> + struct Stream { + using stream_type = cudaStream_t; + using exec_space = Kokkos::Cuda; + + static void create(stream_type& s) { cudaStreamCreate(&s); } + static void destroy(stream_type& s) { cudaStreamDestroy(s); } + static void sync(stream_type& s) { cudaStreamSynchronize(s); } + static exec_space instance(stream_type& s) { return exec_space(s); } + }; +#endif + +#ifdef KOKKOS_ENABLE_HIP + template <> + struct Stream { + using stream_type = hipStream_t; + using exec_space = Kokkos::HIP; + + static void create(stream_type& s) { hipStreamCreate(&s); } + static void destroy(stream_type& s) { hipStreamDestroy(s); } + static void sync(stream_type& s) { hipStreamSynchronize(s); } + static exec_space instance(stream_type& s) { return exec_space(s); } + }; +#endif + + //============================================================================= + // FFTW Trig Scaling + //============================================================================= + + inline constexpr double fftw_trig_scale() { + return is_available_v ? 8.0 : 1.0; + } + } // namespace fft + +} // namespace ippl + +// Register Kokkos complex with heFFTe +namespace heffte { + template <> + struct is_ccomplex> : std::true_type {}; + template <> + struct is_zcomplex> : std::true_type {}; +} // namespace heffte + +#endif \ No newline at end of file diff --git a/src/FFT/Transform/CC.h b/src/FFT/Transform/CC.h new file mode 100644 index 000000000..3cbaee13c --- /dev/null +++ b/src/FFT/Transform/CC.h @@ -0,0 +1,75 @@ +#ifndef IPPL_FFT_TRANSFORM_CC_H +#define IPPL_FFT_TRANSFORM_CC_H + +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +namespace ippl { + + template + class FFT { + public: + static constexpr unsigned Dim = ComplexField::dim; + + using Complex_t = typename ComplexField::value_type; + using T = typename Complex_t::value_type; + using MemSpace = typename ComplexField::memory_space; + using ExecSpace = typename ComplexField::execution_space; + using Layout_t = FieldLayout; + +#ifdef IPPL_ENABLE_CUFFTMP + using Backend_t = fft::CuFFTMpC2C; +#else + using Backend_t = fft::HeffteC2C; +#endif + using TempView_t = Kokkos::View; + + FFT(const Layout_t& layout, const ParameterList& params) { + static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); + + std::array low, high; + fft::domainToBounds(layout.getLocalNDIndex(), low, high); + heffte::box3d box{low, high}; + + backend_ = std::make_unique(box, box, Comm->getCommunicator(), params); + } + + void warmup(ComplexField& f) { + transform(FORWARD, f); + transform(BACKWARD, f); + } + + void transform(TransformDirection direction, ComplexField& f) { + auto view = f.getView(); + const int ng = f.getNghost(); + + ensureTemp(f); + fft::copyToTemp(temp_, view, ng); + + if (direction == FORWARD) { + backend_->forward(temp_.data(), temp_.data()); + } else { + backend_->backward(temp_.data(), temp_.data()); + } + + fft::copyFromTemp(view, temp_, ng); + } + + private: + std::unique_ptr backend_; + TempView_t temp_; + + void ensureTemp(const ComplexField& f) { + if (temp_.size() != f.getOwned().size()) { + temp_ = detail::shrinkView("fft_cc_temp", f.getView(), f.getNghost()); + } + } + }; + +} // namespace ippl + +#endif \ No newline at end of file diff --git a/src/FFT/Transform/Common.h b/src/FFT/Transform/Common.h new file mode 100644 index 000000000..bbce436dd --- /dev/null +++ b/src/FFT/Transform/Common.h @@ -0,0 +1,47 @@ +#ifndef IPPL_COMMON_H +#define IPPL_COMMON_H + +#include + +#include "Utility/ViewUtils.h" + +namespace ippl::fft { + template + inline void domainToBounds(const NDIndex& domain, std::array& low, + std::array& high) { + low.fill(0); + high.fill(0); + for (unsigned d = 0; d < Dim; ++d) { + low[d] = static_cast(domain[d].first()); + high[d] = static_cast(domain[d].first() + domain[d].length() - 1); + } + } + + template + inline void copyToTemp(OutputViewT& output, const InputViewT& input, int n_ghost) { + Kokkos::parallel_for( + "FFT_CC_toTemp", + Kokkos::MDRangePolicy>( + {n_ghost, n_ghost, n_ghost}, {static_cast(input.extent(0)) - n_ghost, + static_cast(input.extent(1)) - n_ghost, + static_cast(input.extent(2)) - n_ghost}), + KOKKOS_LAMBDA(int i, int j, int k) { + output(i - n_ghost, j - n_ghost, k - n_ghost) = input(i, j, k); + }); + } + template + inline void copyFromTemp(OutputViewT& output, const InputViewT& input, int n_ghost) { + Kokkos::parallel_for( + "FFT_CC_fromTemp", + Kokkos::MDRangePolicy>( + {n_ghost, n_ghost, n_ghost}, {static_cast(output.extent(0)) - n_ghost, + static_cast(output.extent(1)) - n_ghost, + static_cast(output.extent(2)) - n_ghost}), + KOKKOS_LAMBDA(int i, int j, int k) { + output(i, j, k) = input(i - n_ghost, j - n_ghost, k - n_ghost); + }); + } + +} // namespace ippl::fft + +#endif // IPPL_COMMON_H diff --git a/src/FFT/Transform/NUFFT.h b/src/FFT/Transform/NUFFT.h new file mode 100644 index 000000000..80007e2a2 --- /dev/null +++ b/src/FFT/Transform/NUFFT.h @@ -0,0 +1,263 @@ +#ifndef IPPL_FFT_TRANSFORM_NUFFT_H +#define IPPL_FFT_TRANSFORM_NUFFT_H + +#include +#include +#include + +#include "Utility/IpplException.h" +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/NUFFT/NativeNUFFT.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +#ifdef ENABLE_FINUFFT +#include +#ifdef ENABLE_GPU_NUFFT +#include +#endif +#endif + +namespace ippl { + + // Forward declaration + template + class ParticleAttrib; + + namespace detail { + +#ifdef ENABLE_FINUFFT + /** + * @brief Type traits for FINUFFT backend selection + * + * Provides unified interface for CPU (finufft) and GPU (cufinufft) backends + */ + template + struct FinufftTraits; + +#ifdef ENABLE_GPU_NUFFT + template <> + struct FinufftTraits { + using ComplexType = cuFloatComplex; + using PlanType = cufinufftf_plan; + using OptsType = cufinufft_opts; + using CountType = int; // cufinufft uses int for point counts + + static void defaultOpts(OptsType* opts) { cufinufft_default_opts(opts); } + + static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf, + float tol, PlanType* plan, OptsType* opts) { + return cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts); + } + + static int setpts(PlanType plan, CountType M, float* x, float* y, float* z, + CountType N, float* s, float* t, float* u) { + return cufinufftf_setpts(plan, M, x, y, z, N, s, t, u); + } + + static int execute(PlanType plan, ComplexType* c, ComplexType* f) { + return cufinufftf_execute(plan, c, f); + } + + static int destroy(PlanType plan) { return cufinufftf_destroy(plan); } + }; + + template <> + struct FinufftTraits { + using ComplexType = cuDoubleComplex; + using PlanType = cufinufft_plan; + using OptsType = cufinufft_opts; + using CountType = int; + + static void defaultOpts(OptsType* opts) { cufinufft_default_opts(opts); } + + static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf, + double tol, PlanType* plan, OptsType* opts) { + return cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts); + } + + static int setpts(PlanType plan, CountType M, double* x, double* y, double* z, + CountType N, double* s, double* t, double* u) { + return cufinufft_setpts(plan, M, x, y, z, N, s, t, u); + } + + static int execute(PlanType plan, ComplexType* c, ComplexType* f) { + return cufinufft_execute(plan, c, f); + } + + static int destroy(PlanType plan) { return cufinufft_destroy(plan); } + }; + +#else // CPU FINUFFT + + template <> + struct FinufftTraits { + using ComplexType = std::complex; + using PlanType = finufftf_plan; + using OptsType = finufft_opts; + using CountType = int64_t; + + static void defaultOpts(OptsType* opts) { finufft_default_opts(opts); } + + static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf, + float tol, PlanType* plan, OptsType* opts) { + return finufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts); + } + + static int setpts(PlanType plan, CountType M, float* x, float* y, float* z, + CountType N, float* s, float* t, float* u) { + return finufftf_setpts(plan, M, x, y, z, N, s, t, u); + } + + static int execute(PlanType plan, ComplexType* c, ComplexType* f) { + return finufftf_execute(plan, c, f); + } + + static int destroy(PlanType plan) { return finufftf_destroy(plan); } + }; + + template <> + struct FinufftTraits { + using ComplexType = std::complex; + using PlanType = finufft_plan; + using OptsType = finufft_opts; + using CountType = int64_t; + + static void defaultOpts(OptsType* opts) { finufft_default_opts(opts); } + + static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf, + double tol, PlanType* plan, OptsType* opts) { + return finufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts); + } + + static int setpts(PlanType plan, CountType M, double* x, double* y, double* z, + CountType N, double* s, double* t, double* u) { + return finufft_setpts(plan, M, x, y, z, N, s, t, u); + } + + static int execute(PlanType plan, ComplexType* c, ComplexType* f) { + return finufft_execute(plan, c, f); + } + + static int destroy(PlanType plan) { return finufft_destroy(plan); } + }; + +#endif // ENABLE_GPU_NUFFT +#endif // ENABLE_FINUFFT + + } // namespace detail + + /** + * @brief Non-Uniform FFT implementation + * + * Supports both native implementation and FINUFFT backend (CPU/GPU). + * Type 1: Non-uniform points -> uniform grid (spreading/adjoint) + * Type 2: Uniform grid -> non-uniform points (interpolation) + */ + template + class FFT { + public: + static constexpr unsigned Dim = RealField::dim; + + using T = typename RealField::value_type; + using Complex_t = Kokkos::complex; + using MemSpace = typename RealField::memory_space; + using ExecSpace = typename RealField::execution_space; + using Layout_t = FieldLayout; + + using ComplexField = + typename Field::uniform_type; + + using NativeNUFFT_t = nufft::NativeNUFFT; + +#ifdef ENABLE_FINUFFT + using Traits_t = detail::FinufftTraits; + using FinufftComplex_t = typename Traits_t::ComplexType; + using FinufftPlan_t = typename Traits_t::PlanType; + using FinufftOpts_t = typename Traits_t::OptsType; + using FinufftCount_t = typename Traits_t::CountType; +#endif + + private: + // Configuration + int type_m; + T tol_m; + bool useFinufft_m; + bool useUpsampledInputs_m; + bool useR2C_m; + int r2cDir_m; + bool lockMethod_m; + + std::array nModes_m{1, 1, 1}; + + // Native NUFFT backend + std::unique_ptr nativeNufft_m; + +#ifdef ENABLE_FINUFFT + // FINUFFT backend + FinufftPlan_t finufftPlan_m{}; + + // Temporary buffers for FINUFFT + using FieldViewType = Kokkos::View; + using ParticleRealView = Kokkos::View; + using ParticleCplxView = Kokkos::View; + + FieldViewType tempField_m; + std::array tempR_m; + ParticleCplxView tempQ_m; +#endif + + public: + /** + * @brief Construct NUFFT transform + * + * @param layout Field layout + * @param localNp Local number of particles + * @param type Transform type (1 or 2) + * @param params Configuration parameters + */ + FFT(const Layout_t& layout, std::size_t localNp, int type, const ParameterList& params); + + ~FFT(); + + // Non-copyable, non-movable (due to FINUFFT plan) + FFT(const FFT&) = delete; + FFT& operator=(const FFT&) = delete; + FFT(FFT&&) = delete; + FFT& operator=(FFT&&) = delete; + + /** + * @brief Execute NUFFT transform + * + * Type 1: Spreads particle data Q at positions R onto field f + * Type 2: Interpolates field f to positions R, storing results in Q + */ + template + void transform(const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, ComplexField& f); + + // These must be public due to NVCC extended lambda restrictions + template + void transformNative(const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, ComplexField& f); + + template + void transformFinufft(const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, ComplexField& f); + private: + void initBackend(const Layout_t& layout, const ParameterList& params); + void initNative(const Layout_t& layout, const ParameterList& params); + void cleanupBackend(); + + void initFinufft(const ParameterList& params); + void allocateFinufftBuffers(const Layout_t& layout, std::size_t localNp); + }; + +} // namespace ippl + +#include "FFT/Transform/NUFFT.hpp" + +#endif // IPPL_FFT_TRANSFORM_NUFFT_H \ No newline at end of file diff --git a/src/FFT/Transform/NUFFT.hpp b/src/FFT/Transform/NUFFT.hpp new file mode 100644 index 000000000..539b30a4a --- /dev/null +++ b/src/FFT/Transform/NUFFT.hpp @@ -0,0 +1,531 @@ +#ifndef IPPL_FFT_TRANSFORM_NUFFT_HPP +#define IPPL_FFT_TRANSFORM_NUFFT_HPP + +namespace ippl { + + namespace detail { + + //===================================================================== + // Functors for NUFFT operations (must be outside class for NVCC) + //===================================================================== + + /** + * @brief Functor to scale particle positions by a factor + */ + template + struct ScalePositionsFunctor { + RView Rview_m; + Vector scale_m; + + ScalePositionsFunctor(RView Rview, Vector scale) + : Rview_m(Rview) + , scale_m(scale) {} + + KOKKOS_INLINE_FUNCTION void operator()(std::size_t i) const { + for (unsigned d = 0; d < Dim; ++d) { + Rview_m(i)[d] *= scale_m[d]; + } + } + }; + +#ifdef ENABLE_FINUFFT + + /** + * @brief Functor to copy field data to FINUFFT temp buffer with optional ifftshift + */ + template + struct CopyFieldToTempFunctor { + FieldView fview_m; + TempFieldView tempField_m; + int nghost_m; + int nx_m, ny_m, nz_m; + bool applyShift_m; + + CopyFieldToTempFunctor(FieldView fview, TempFieldView tempField, int nghost, int nx = 0, + int ny = 0, int nz = 0, bool applyShift = false) + : fview_m(fview) + , tempField_m(tempField) + , nghost_m(nghost) + , nx_m(nx) + , ny_m(ny) + , nz_m(nz) + , applyShift_m(applyShift) {} + + KOKKOS_INLINE_FUNCTION void operator()(int i, int j, int k) const { + int li = i - nghost_m; + int lj = j - nghost_m; + int lk = k - nghost_m; + + int di = li, dj = lj, dk = lk; + if (applyShift_m) { + di = (li + nx_m / 2) % nx_m; + dj = (lj + ny_m / 2) % ny_m; + dk = (lk + nz_m / 2) % nz_m; + } + + auto& dst = tempField_m(di, dj, dk); + auto src = fview_m(i, j, k); +#ifdef ENABLE_GPU_NUFFT + dst.x = src.real(); + dst.y = src.imag(); +#else + dst.real(src.real()); + dst.imag(src.imag()); +#endif + } + }; + + /** + * @brief Functor to copy field data from FINUFFT temp buffer with fftshift + */ + template + struct CopyFieldFromTempFunctor { + FieldView fview_m; + TempFieldView tempField_m; + int nghost_m; + int nx_m, ny_m, nz_m; + + CopyFieldFromTempFunctor(FieldView fview, TempFieldView tempField, int nghost, int nx, + int ny, int nz) + : fview_m(fview) + , tempField_m(tempField) + , nghost_m(nghost) + , nx_m(nx) + , ny_m(ny) + , nz_m(nz) {} + + KOKKOS_INLINE_FUNCTION void operator()(int i, int j, int k) const { + int li = i - nghost_m; + int lj = j - nghost_m; + int lk = k - nghost_m; + + int si = (li + nx_m / 2) % nx_m; + int sj = (lj + ny_m / 2) % ny_m; + int sk = (lk + nz_m / 2) % nz_m; + + auto src = tempField_m(si, sj, sk); +#ifdef ENABLE_GPU_NUFFT + fview_m(i, j, k).real() = src.x; + fview_m(i, j, k).imag() = src.y; +#else + fview_m(i, j, k).real() = src.real(); + fview_m(i, j, k).imag() = src.imag(); +#endif + } + }; + + /** + * @brief Functor to copy particle data to FINUFFT temp buffers + */ + template + struct CopyParticlesToTempFunctor { + RView Rview_m; + QView Qview_m; + TempRView tempRx_m, tempRy_m, tempRz_m; + TempQView tempQ_m; + Vector scale_m; + + CopyParticlesToTempFunctor(RView Rview, QView Qview, TempRView tempRx, TempRView tempRy, + TempRView tempRz, TempQView tempQ, Vector scale) + : Rview_m(Rview) + , Qview_m(Qview) + , tempRx_m(tempRx) + , tempRy_m(tempRy) + , tempRz_m(tempRz) + , tempQ_m(tempQ) + , scale_m(scale) {} + + KOKKOS_INLINE_FUNCTION void operator()(std::size_t i) const { + tempRx_m(i) = Rview_m(i)[0] * scale_m[0]; + tempRy_m(i) = Rview_m(i)[1] * scale_m[1]; + tempRz_m(i) = Rview_m(i)[2] * scale_m[2]; + +#ifdef ENABLE_GPU_NUFFT + tempQ_m(i).x = Qview_m(i); + tempQ_m(i).y = T(0); +#else + tempQ_m(i).real(Qview_m(i)); + tempQ_m(i).imag(T(0)); +#endif + } + }; + + /** + * @brief Functor to copy particle data from FINUFFT temp buffer + */ + template + struct CopyParticlesFromTempFunctor { + QView Qview_m; + TempQView tempQ_m; + + CopyParticlesFromTempFunctor(QView Qview, TempQView tempQ) + : Qview_m(Qview) + , tempQ_m(tempQ) {} + + KOKKOS_INLINE_FUNCTION void operator()(std::size_t i) const { +#ifdef ENABLE_GPU_NUFFT + Qview_m(i) = tempQ_m(i).x; +#else + Qview_m(i) = tempQ_m(i).real(); +#endif + } + }; + +#endif // ENABLE_FINUFFT + + } // namespace detail + + //========================================================================= + // Constructor / Destructor + //========================================================================= + + template + FFT::FFT(const Layout_t& layout, std::size_t localNp, int type, + const ParameterList& params) + : type_m(type) + , tol_m(params.get("tolerance", T(1e-6))) + , useFinufft_m(params.get("use_finufft", false)) + , useUpsampledInputs_m(params.get("use_upsampled_inputs", false)) + , useR2C_m(params.get("use_r2c", false)) + , r2cDir_m(params.get("r2c_direction", 0)) + , lockMethod_m(params.get("lock_method", false)) { + const auto& domain = layout.getDomain(); + for (unsigned d = 0; d < Dim; ++d) { + nModes_m[d] = domain[d].length(); + } + +#ifdef ENABLE_FINUFFT + allocateFinufftBuffers(layout, localNp); +#else + (void)localNp; +#endif + + initBackend(layout, params); + } + + template + FFT::~FFT() { + cleanupBackend(); + } + + //========================================================================= + // Backend Initialization + //========================================================================= + + template + void FFT::initBackend(const Layout_t& layout, + const ParameterList& params) { + if constexpr (fft::is_available_v) { + if (useFinufft_m) { + initFinufft(params); + return; + } + } + + initNative(layout, params); + } + + template + void FFT::initNative(const Layout_t& layout, + const ParameterList& params) { + Vector nModesVec; + for (unsigned d = 0; d < Dim; ++d) { + nModesVec[d] = nModes_m[d]; + } + + typename NativeNUFFT_t::Config cfg; + cfg.tol = tol_m; + cfg.sigma = params.get("sigma", T(2.0)); + + cfg.scatter_config = Interpolation::ScatterConfig::template get_default(); + cfg.gather_config = Interpolation::GatherConfig::template get_default(); + + cfg.scatter_config.lock_method = lockMethod_m; + + std::string spreadMethod = params.get("spread_method", "none"); + if (spreadMethod == "atomic") { + cfg.scatter_config.method = Interpolation::ScatterMethod::Atomic; + } else if (spreadMethod == "output_focused" + || spreadMethod == "output_focused_zbatched") { + // The "_zbatched" alias is kept so old test parameter sets still + // resolve; both map to OutputFocused (the z_batches knob lives on + // ScatterConfig, not in the method enum). + cfg.scatter_config.method = Interpolation::ScatterMethod::OutputFocused; + } else if (spreadMethod == "tiled") { + cfg.scatter_config.method = Interpolation::ScatterMethod::Tiled; + } + + std::string gatherMethod = params.get("gather_method", "none"); + if (gatherMethod == "atomic") { + cfg.gather_config.method = Interpolation::GatherMethod::Atomic; + } else if (gatherMethod == "atomic_sort") { + cfg.gather_config.method = Interpolation::GatherMethod::AtomicSort; + } + + if (params.contains("tile_size_3d")) { + cfg.scatter_config.tile_size.fill(params.get("tile_size_3d")); + } + if (params.contains("team_size")) { + cfg.scatter_config.team_size = params.get("team_size"); + cfg.gather_config.team_size = params.get("team_size"); + } + + nativeNufft_m = std::make_unique(nModesVec, useUpsampledInputs_m, cfg); + nativeNufft_m->initialize(layout, Comm->getCommunicator()); + } + + template + void FFT::cleanupBackend() { +#ifdef ENABLE_FINUFFT + if (useFinufft_m && finufftPlan_m) { + Traits_t::destroy(finufftPlan_m); + finufftPlan_m = FinufftPlan_t{}; + } +#endif + } + + template + void FFT::allocateFinufftBuffers(const Layout_t& layout, + std::size_t localNp) { +#ifdef ENABLE_FINUFFT + const auto& lDom = layout.getLocalNDIndex(); + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + + for (unsigned d = 0; d < Dim; ++d) { + Kokkos::realloc(tempR_m[d], localNp); + } + Kokkos::realloc(tempQ_m, localNp); +#else + (void)layout; + (void)localNp; + + throw std::runtime_error("FINUFFT is not activated. Rebuild with -DIPPL_ENABLE_FINUFFT=ON"); +#endif + } + + template + void FFT::initFinufft(const ParameterList& params) { +#ifdef ENABLE_FINUFFT + FinufftOpts_t opts; + Traits_t::defaultOpts(&opts); + +#ifdef ENABLE_GPU_NUFFT + opts.gpu_method = params.get("gpu_method", opts.gpu_method); + opts.gpu_sort = params.get("gpu_sort", opts.gpu_sort); + opts.gpu_kerevalmeth = params.get("gpu_kerevalmeth", opts.gpu_kerevalmeth); + opts.gpu_binsizex = params.get("gpu_binsizex", opts.gpu_binsizex); + opts.gpu_binsizey = params.get("gpu_binsizey", opts.gpu_binsizey); + opts.gpu_binsizez = params.get("gpu_binsizez", opts.gpu_binsizez); + opts.gpu_maxsubprobsize = params.get("gpu_maxsubprobsize", opts.gpu_maxsubprobsize); + opts.gpu_maxbatchsize = 0; +#else + opts.spread_sort = params.get("spread_sort", opts.spread_sort); + opts.spread_kerevalmeth = params.get("spread_kerevalmeth", opts.spread_kerevalmeth); + opts.nthreads = params.get("nthreads", opts.nthreads); +#endif + + int iflag = (type_m == 1) ? -1 : 1; + int dim = static_cast(Dim); + + int err = Traits_t::makeplan(type_m, dim, nModes_m.data(), iflag, 1, tol_m, &finufftPlan_m, + &opts); + + if (err != 0) { + throw IpplException("FFT", "FINUFFT makeplan failed"); + } +#else + (void)params; + throw std::runtime_error("FINUFFT is not activated. Rebuild with -DIPPL_ENABLE_FINUFFT=ON"); +#endif // ENABLE_FINUFFT + } + + //========================================================================= + // Transform Dispatch + //========================================================================= + + template + template + void FFT::transform( + const ParticleAttrib, Properties...>& R, ParticleAttrib& Q, + ComplexField& f) { + if constexpr (fft::is_available_v) { + if (useFinufft_m) { + transformFinufft(R, Q, f); + return; + } + } + + transformNative(R, Q, f); + } + + //========================================================================= + // Native NUFFT Transform + //========================================================================= + + template + template + void FFT::transformNative( + const ParticleAttrib, Properties...>& R, ParticleAttrib& Q, + ComplexField& f) { + const auto localNp = R.getParticleCount(); + const auto& layout = f.getLayout(); + const auto& mesh = f.get_mesh(); + const auto& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + + Vector Len; + for (unsigned d = 0; d < Dim; ++d) { + int fullLength = domain[d].length(); + if (useR2C_m && static_cast(d) == r2cDir_m) { + fullLength = 2 * (fullLength - 1); + } + Len[d] = dx[d] * fullLength; + } + + constexpr T twoPi = T(2.0 * M_PI); + auto Rview = R.getView(); + + Vector scaleToTwoPi, scaleBack; + for (unsigned d = 0; d < Dim; ++d) { + scaleToTwoPi[d] = twoPi / Len[d]; + scaleBack[d] = Len[d] / twoPi; + } + + using ScaleFunctor = detail::ScalePositionsFunctor; + Kokkos::parallel_for("NUFFT_scale_to_2pi", Kokkos::RangePolicy(0, localNp), + ScaleFunctor(Rview, scaleToTwoPi)); + + if (type_m == 1) { + nativeNufft_m->type1(R, Q, f, useUpsampledInputs_m); + } else if (type_m == 2) { + nativeNufft_m->type2(f, R, Q, useUpsampledInputs_m); + } else { + throw IpplException("FFT", "Only type 1 and type 2 NUFFT supported"); + } + + Kokkos::parallel_for("NUFFT_scale_back", Kokkos::RangePolicy(0, localNp), + ScaleFunctor(Rview, scaleBack)); + } + + //========================================================================= + // FINUFFT Transform + //========================================================================= + template + template + void FFT::transformFinufft( + const ParticleAttrib, Properties...>& R, ParticleAttrib& Q, + ComplexField& f) { +#ifdef ENABLE_FINUFFT + const auto localNp = R.getParticleCount(); + const auto& layout = f.getLayout(); + const auto& mesh = f.get_mesh(); + const auto& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + const int nghost = f.getNghost(); + + Vector Len; + for (unsigned d = 0; d < Dim; ++d) { + Len[d] = dx[d] * domain[d].length(); + } + + constexpr T twoPi = T(2.0 * M_PI); + + auto fview = f.getView(); + auto Rview = R.getView(); + auto Qview = Q.getView(); + + const auto& lDom = layout.getLocalNDIndex(); + if (tempField_m.extent(0) != static_cast(lDom[0].length()) + || tempField_m.extent(1) != static_cast(lDom[1].length()) + || tempField_m.extent(2) != static_cast(lDom[2].length())) { + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + } + + if (tempQ_m.extent(0) < localNp) { + Kokkos::realloc(tempQ_m, localNp); + } + + for (unsigned d = 0; d < Dim; ++d) { + if (tempR_m[d].extent(0) < localNp) { + Kokkos::realloc(tempR_m[d], localNp); + } + } + + auto tempField = tempField_m; + auto tempQ = tempQ_m; + auto tempRx = tempR_m[0]; + auto tempRy = tempR_m[1]; + auto tempRz = tempR_m[2]; + + Vector scale; + for (unsigned d = 0; d < Dim; ++d) { + scale[d] = twoPi / Len[d]; + } + + using mdrange_type = Kokkos::MDRangePolicy>; + using CopyToTemp = detail::CopyFieldToTempFunctor; + + int nx = lDom[0].length(); + int ny = lDom[1].length(); + int nz = lDom[2].length(); + bool needShift = (type_m == 2); + + Kokkos::parallel_for( + "FINUFFT_copy_field_to_temp", + mdrange_type({nghost, nghost, nghost}, {static_cast(fview.extent(0)) - nghost, + static_cast(fview.extent(1)) - nghost, + static_cast(fview.extent(2)) - nghost}), + CopyToTemp(fview, tempField, nghost, nx, ny, nz, needShift)); + + using CopyParticles = + detail::CopyParticlesToTempFunctor; + + Kokkos::parallel_for("FINUFFT_copy_particles_to_temp", localNp, + CopyParticles(Rview, Qview, tempRx, tempRy, tempRz, tempQ, scale)); + + Kokkos::fence(); + + int err = Traits_t::setpts(finufftPlan_m, static_cast(localNp), + tempRx.data(), tempRy.data(), tempRz.data(), FinufftCount_t{0}, + nullptr, nullptr, nullptr); + + if (err != 0) { + throw IpplException("FFT", "FINUFFT setpts failed"); + } + + err = Traits_t::execute(finufftPlan_m, tempQ.data(), tempField.data()); + + if (err != 0) { + throw IpplException("FFT", "FINUFFT execute failed"); + } + + Kokkos::fence(); + + if (type_m == 1) { + using CopyFromTemp = + detail::CopyFieldFromTempFunctor; + + Kokkos::parallel_for("FINUFFT_copy_field_from_temp", + mdrange_type({nghost, nghost, nghost}, + {static_cast(fview.extent(0)) - nghost, + static_cast(fview.extent(1)) - nghost, + static_cast(fview.extent(2)) - nghost}), + CopyFromTemp(fview, tempField, nghost, nx, ny, nz)); + } else if (type_m == 2) { + using CopyBack = detail::CopyParticlesFromTempFunctor; + + Kokkos::parallel_for("FINUFFT_copy_particles_from_temp", localNp, + CopyBack(Qview, tempQ)); + } +#else + throw std::runtime_error("FINUFFT is not activated. Rebuild with -DIPPL_ENABLE_FINUFFT=ON"); + (void)R; + (void)Q; + (void)f; +#endif // ENABLE_FINUFFT + } +} // namespace ippl + +#endif // IPPL_FFT_TRANSFORM_NUFFT_HPP \ No newline at end of file diff --git a/src/FFT/Transform/PrunedCC.h b/src/FFT/Transform/PrunedCC.h new file mode 100644 index 000000000..ca293a0d4 --- /dev/null +++ b/src/FFT/Transform/PrunedCC.h @@ -0,0 +1,421 @@ +#ifndef IPPL_FFT_TRANSFORM_PRUNEDCC_H +#define IPPL_FFT_TRANSFORM_PRUNEDCC_H + +#include +#include +#include + +#include "Utility/IpplTimings.h" +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +namespace ippl { + + namespace detail { + // Wrapper to safely evaluate OpenMP at compile-time to prevent nested + // parallelism errors when Kokkos is using the OpenMP execution space. + template + inline void runConcurrentBatch(int count, Func&& func) { + if constexpr (IsCPU) { + // Serial outer loop for CPU (Kokkos will parallelize the inner loops) + for (int local = 0; local < count; ++local) { + func(local); + } + } else { + // OpenMP outer loop for GPU (to overlap asynchronous stream launches) +#if defined(_OPENMP) +#pragma omp parallel for +#endif + for (int local = 0; local < count; ++local) { + func(local); + } + } + } + } // namespace detail + + //========================================================================= + // Pruned Complex-to-Complex Transform + //========================================================================= + + template + class FFT { + public: + static constexpr unsigned Dim = ComplexField::dim; + static constexpr int NumSubFFTs = 1 << Dim; + + using Complex_t = typename ComplexField::value_type; + using T = typename Complex_t::value_type; + using MemSpace = typename ComplexField::memory_space; + using ExecSpace = typename ComplexField::execution_space; + using Layout_t = FieldLayout; + + using Backend_t = fft::HeffteC2C; + using GPUOps = fft::Stream; + using Stream_t = typename GPUOps::stream_type; + using DeviceExec = typename GPUOps::exec_space; + using TempView_t = Kokkos::View; + + FFT(const Layout_t& layoutIn, const Layout_t& layoutOut, const PruningParams& pruning, + const ParameterList& params) + : pruning_(pruning) + , numConcurrent_(std::clamp(params.get("num_concurrent_ffts", 4), 1, NumSubFFTs)) { + static_assert(Dim == 3, "Pruned FFT currently only supports 3D"); + + auto& prunedLayout = + (layoutOut.getLocalNDIndex().size() < layoutIn.getLocalNDIndex().size()) ? layoutOut + : layoutIn; + + std::array low, high; + fft::domainToBounds(prunedLayout.getLocalNDIndex(), low, high); + heffte::box3d box{low, high}; + + for (int s = 0; s < numConcurrent_; ++s) { + MPI_Comm_dup(Comm->getCommunicator(), &comms_[s]); + GPUOps::create(streams_[s]); + backends_[s] = std::make_unique(box, box, comms_[s], params); + } + } + + ~FFT() { + for (int s = 0; s < numConcurrent_; ++s) { + GPUOps::destroy(streams_[s]); + MPI_Comm_free(&comms_[s]); + } + } + + void transform(TransformDirection direction, ComplexField& input, ComplexField& output, + int dir = 1) { + if (direction == FORWARD) { + forwardPruned(dir, input, output); + } else { + backwardPruned(dir, input, output); + } + } + + void forwardPruned(int dir, ComplexField& input, ComplexField& output); + void backwardPruned(int dir, ComplexField& input, ComplexField& output); + + private: + PruningParams pruning_; + int numConcurrent_; + + std::array, NumSubFFTs> backends_; + std::array temps_; + std::array comms_{}; + std::array streams_{}; + }; + + //------------------------------------------------------------------------- + // Forward Pruned C2C Implementation + //------------------------------------------------------------------------- + + template + void FFT::forwardPruned(int dir, ComplexField& input, + ComplexField& output) { + static IpplTimings::TimerRef twiddleTimer = IpplTimings::getTimer("TwiddleAdd"); + static IpplTimings::TimerRef subFFTTimer = IpplTimings::getTimer("subFFTs"); + + auto inView = input.getView(); + auto outView = output.getView(); + const int ngIn = input.getNghost(); + const int ngOut = output.getNghost(); + + const auto& lDomPruned = output.getLayout().getLocalNDIndex(); + const auto& gDomFull = input.getLayout().getDomain(); + const auto& modes = pruning_.n_modes; + + // Ensure temps + for (int s = 0; s < numConcurrent_; ++s) { + if (temps_[s].size() != output.getOwned().size()) { + temps_[s] = detail::shrinkView("pruned_temp_" + std::to_string(s), outView, ngOut); + } + } + + Kokkos::deep_copy(outView, Complex_t(0, 0)); + + double scale = 1.0; + if (dir == 1) { + for (unsigned d = 0; d < Dim; ++d) { + scale *= double(modes[d]) / double(gDomFull[d].length()); + } + } + + std::array, NumSubFFTs> offsets; + for (int k = 0; k < NumSubFFTs; ++k) { + for (unsigned d = 0; d < Dim; ++d) { + offsets[k][d] = (k >> d) & 1; + } + } + + Vector localFirst; + for (unsigned d = 0; d < Dim; ++d) { + localFirst[d] = lDomPruned[d].first(); + } + + auto owned = output.getOwned(); + const int numBatches = (NumSubFFTs + numConcurrent_ - 1) / numConcurrent_; + + constexpr bool is_cpu = false +#ifdef KOKKOS_ENABLE_SERIAL + || std::is_same_v +#endif +#ifdef KOKKOS_ENABLE_OPENMP + || std::is_same_v +#endif + ; + + for (int batch = 0; batch < numBatches; ++batch) { + const int start = batch * numConcurrent_; + const int end = std::min(start + numConcurrent_, NumSubFFTs); + const int count = end - start; + + IpplTimings::startTimer(subFFTTimer); + + // Using the wrapper to evaluate thread parallelism safely + detail::runConcurrentBatch(count, [&](int local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + auto copy_lambda = KOKKOS_LAMBDA(int i0, int i1, int i2) { + int si = i0 * 2 + offs[0] + ngIn; + int sj = i1 * 2 + offs[1] + ngIn; + int sk = i2 * 2 + offs[2] + ngIn; + temp(i0, i1, i2) = inView(si, sj, sk); + }; + + if constexpr (is_cpu) { + // CPU execution: Dispatch cleanly without stream instances + Kokkos::parallel_for( + "strided_copy_forward", + Kokkos::MDRangePolicy>( + {0, 0, 0}, {long(owned[0].length()), long(owned[1].length()), + long(owned[2].length())}), + copy_lambda); + } else { + // GPU execution: Dispatch to designated streams to overlap sub-FFT work + auto exec = GPUOps::instance(streams_[local]); + Kokkos::parallel_for("strided_copy_forward", + Kokkos::MDRangePolicy>( + exec, {0, 0, 0}, + {long(owned[0].length()), long(owned[1].length()), + long(owned[2].length())}), + copy_lambda); + GPUOps::sync(streams_[local]); // Ensure copy finishes before HeFFTe reads it + } + + if (dir == 1) { + backends_[local]->forward(temp.data(), temp.data()); + } else { + backends_[local]->backward(temp.data(), temp.data()); + } + }); + + Kokkos::fence(); + IpplTimings::stopTimer(subFFTTimer); + + IpplTimings::startTimer(twiddleTimer); + + for (int local = 0; local < count; ++local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + Kokkos::parallel_for( + "twiddle_add_forward", + Kokkos::MDRangePolicy>( + {ngOut, ngOut, ngOut}, + {int(outView.extent(0)) - ngOut, int(outView.extent(1)) - ngOut, + int(outView.extent(2)) - ngOut}), + KOKKOS_LAMBDA(int i, int j, int kk) { + int gi = i - ngOut + localFirst[0]; + int gj = j - ngOut + localFirst[1]; + int gk = kk - ngOut + localFirst[2]; + + int64_t f0 = (gi < int64_t(modes[0]) / 2) + ? gi + : int64_t(gDomFull[0].length()) - int64_t(modes[0]) + gi; + int64_t f1 = (gj < int64_t(modes[1]) / 2) + ? gj + : int64_t(gDomFull[1].length()) - int64_t(modes[1]) + gj; + int64_t f2 = (gk < int64_t(modes[2]) / 2) + ? gk + : int64_t(gDomFull[2].length()) - int64_t(modes[2]) + gk; + + Complex_t w(1.0, 0.0); + auto twiddle = [&](int64_t freq, int64_t N) { + double ang = -dir * 2.0 * M_PI * double(freq) / double(N); + return Complex_t(Kokkos::cos(ang), Kokkos::sin(ang)); + }; + + if (offs[0]) + w *= twiddle(f0, gDomFull[0].length()); + if (offs[1]) + w *= twiddle(f1, gDomFull[1].length()); + if (offs[2]) + w *= twiddle(f2, gDomFull[2].length()); + + auto val = temp(i - ngOut, j - ngOut, kk - ngOut); + outView(i, j, kk) += w * val * scale; + }); + } + + IpplTimings::stopTimer(twiddleTimer); + } + } + + //------------------------------------------------------------------------- + // Backward Pruned C2C Implementation + //------------------------------------------------------------------------- + + template + void FFT::backwardPruned(int dir, ComplexField& input, + ComplexField& output) { + static IpplTimings::TimerRef subIFFTTimer = IpplTimings::getTimer("subIFFTs"); + static IpplTimings::TimerRef stridedWriteTimer = IpplTimings::getTimer("StridedWrite"); + + auto inView = input.getView(); // Pruned frequency domain + auto outView = output.getView(); // Full spatial domain + const int ngIn = input.getNghost(); + const int ngOut = output.getNghost(); + + const auto& lDomPruned = input.getLayout().getLocalNDIndex(); + const auto& gDomFull = output.getLayout().getDomain(); + const auto& modes = pruning_.n_modes; + + for (int s = 0; s < numConcurrent_; ++s) { + if (temps_[s].size() != input.getOwned().size()) { + temps_[s] = + detail::shrinkView("pruned_ifft_temp_" + std::to_string(s), inView, ngIn); + } + } + + Kokkos::deep_copy(outView, Complex_t(0, 0)); + + std::array, NumSubFFTs> offsets; + for (int k = 0; k < NumSubFFTs; ++k) { + for (unsigned d = 0; d < Dim; ++d) { + offsets[k][d] = (k >> d) & 1; + } + } + + Vector localFirst; + for (unsigned d = 0; d < Dim; ++d) { + localFirst[d] = lDomPruned[d].first(); + } + + auto owned = input.getOwned(); + const int numBatches = (NumSubFFTs + numConcurrent_ - 1) / numConcurrent_; + + constexpr bool is_cpu = false +#ifdef KOKKOS_ENABLE_SERIAL + || std::is_same_v +#endif +#ifdef KOKKOS_ENABLE_OPENMP + || std::is_same_v +#endif + ; + + for (int batch = 0; batch < numBatches; ++batch) { + const int start = batch * numConcurrent_; + const int end = std::min(start + numConcurrent_, NumSubFFTs); + const int count = end - start; + + IpplTimings::startTimer(subIFFTTimer); + + // Using the wrapper to evaluate thread parallelism safely + detail::runConcurrentBatch(count, [&](int local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + auto multiply_lambda = KOKKOS_LAMBDA(int i0, int i1, int i2) { + int gi = i0 + localFirst[0]; + int gj = i1 + localFirst[1]; + int gk = i2 + localFirst[2]; + + int64_t f0 = (gi < int64_t(modes[0]) / 2) + ? gi + : int64_t(gDomFull[0].length()) - int64_t(modes[0]) + gi; + int64_t f1 = (gj < int64_t(modes[1]) / 2) + ? gj + : int64_t(gDomFull[1].length()) - int64_t(modes[1]) + gj; + int64_t f2 = (gk < int64_t(modes[2]) / 2) + ? gk + : int64_t(gDomFull[2].length()) - int64_t(modes[2]) + gk; + + Complex_t w(1.0, 0.0); + auto twiddle = [&](int64_t freq, int64_t N) { + double ang = dir * 2.0 * M_PI * double(freq) / double(N); + return Complex_t(Kokkos::cos(ang), Kokkos::sin(ang)); + }; + + if (offs[0]) + w *= twiddle(f0, gDomFull[0].length()); + if (offs[1]) + w *= twiddle(f1, gDomFull[1].length()); + if (offs[2]) + w *= twiddle(f2, gDomFull[2].length()); + + auto input_val = inView(i0 + ngIn, i1 + ngIn, i2 + ngIn); + temp(i0, i1, i2) = w * input_val; + }; + + if constexpr (is_cpu) { + Kokkos::parallel_for( + "twiddle_multiply_backward", + Kokkos::MDRangePolicy>( + {0, 0, 0}, {long(owned[0].length()), long(owned[1].length()), + long(owned[2].length())}), + multiply_lambda); + } else { + auto exec = GPUOps::instance(streams_[local]); + Kokkos::parallel_for("twiddle_multiply_backward", + Kokkos::MDRangePolicy>( + exec, {0, 0, 0}, + {long(owned[0].length()), long(owned[1].length()), + long(owned[2].length())}), + multiply_lambda); + GPUOps::sync(streams_[local]); // Ensure copy finishes before HeFFTe reads it + } + + if (dir == -1) { + backends_[local]->forward(temp.data(), temp.data()); + } else { + backends_[local]->backward(temp.data(), temp.data()); + } + }); + + Kokkos::fence(); + IpplTimings::stopTimer(subIFFTTimer); + + IpplTimings::startTimer(stridedWriteTimer); + + for (int local = 0; local < count; ++local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + Kokkos::parallel_for( + "strided_write_backward", + Kokkos::MDRangePolicy>( + {0, 0, 0}, {long(owned[0].length()), long(owned[1].length()), + long(owned[2].length())}), + KOKKOS_LAMBDA(int i0, int i1, int i2) { + int oi = i0 * 2 + offs[0] + ngOut; + int oj = i1 * 2 + offs[1] + ngOut; + int ok = i2 * 2 + offs[2] + ngOut; + outView(oi, oj, ok) = temp(i0, i1, i2); + }); + } + + IpplTimings::stopTimer(stridedWriteTimer); + } + } +} // namespace ippl + +#endif \ No newline at end of file diff --git a/src/FFT/Transform/PrunedRC.h b/src/FFT/Transform/PrunedRC.h new file mode 100644 index 000000000..e38229a61 --- /dev/null +++ b/src/FFT/Transform/PrunedRC.h @@ -0,0 +1,286 @@ +#ifndef IPPL_FFT_TRANSFORM_PRUNEDRC_H +#define IPPL_FFT_TRANSFORM_PRUNEDRC_H + +#include +#include + +#include "Utility/IpplTimings.h" +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +namespace ippl { + //========================================================================= + // Pruned Real-to-Complex Transform + // + // The key insight that makes this completely communication-free per + // transform is the choice of heFFTe outbox: + // + // R2C dim (d_r): + // Each rank's outbox lower bound in d_r = pruned field lower bound. + // The last rank (hi_pruned == K_r - 1) extends its outbox to cover + // [K_r, N_r/2], which are the "extra" full-complex modes discarded by + // the pruning. This ensures every rank's outbox contains exactly the + // full-complex indices its pruned modes map to (fi = gi, direct). + // + // Non-R2C dims: + // Every rank owns the FULL extent [0, N_d - 1]. The wrapping formula + // fj = gj < K/2 ? gj : N-K+gj always maps into [0, N-1], which is + // always within this rank's outbox. + // + // Result: the local index in tempComplexFull_ is computable from the + // pruned global index with no inter-rank communication. + //========================================================================= + + template + class FFT { + public: + static constexpr unsigned Dim = RealField::dim; + + using T = typename RealField::value_type; + using Complex_t = Kokkos::complex; + using MemSpace = typename RealField::memory_space; + using ExecSpace = typename RealField::execution_space; + using Layout_t = FieldLayout; + + using ComplexField = + typename Field::uniform_type; + +#ifdef IPPL_ENABLE_CUFFTMP + using Backend_t = fft::CuFFTMpR2C; +#else + using Backend_t = fft::HeffteR2C; +#endif + using TempReal_t = Kokkos::View; + using TempComplex_t = Kokkos::View; + + FFT(const Layout_t& layoutReal, + const Layout_t& layoutComplexFull, // kept for API compatibility; outbox is recomputed + const Layout_t& layoutComplexPruned, const PruningParams& pruning, + const ParameterList& params); + + void transform(TransformDirection direction, RealField& f, ComplexField& g); + + private: + PruningParams pruning_; + std::unique_ptr backend_; + int r2c_dir_ = 0; + + // Local outbox origin and dimensions (= tempComplexFull_ dimensions). + // For the R2C dim: origin = pruned field's lower bound on this rank. + // For non-R2C dims: origin = 0, size = full global extent N_d. + std::array lowComplexFull_ = {}; + std::array fullComplexDims_ = {}; + + // Global real-grid sizes for the wrapping formula: + // fj = gj < K/2 ? gj : N - K + gj + std::array globalRealDims_ = {}; + + TempReal_t tempReal_; + TempComplex_t tempComplexFull_; + }; + + //========================================================================= + // Constructor + //========================================================================= + + template + FFT::FFT(const Layout_t& layoutReal, + const Layout_t& /*layoutComplexFull*/, + const Layout_t& layoutComplexPruned, + const PruningParams& pruning, + const ParameterList& params) + : pruning_(pruning) { + static_assert(Dim == 3, "PrunedRCTransform currently only supports 3D"); + + r2c_dir_ = params.get("r2c_direction", 0); + + // Global real-grid sizes + const auto& gDomReal = layoutReal.getDomain(); + for (int d = 0; d < 3; ++d) + globalRealDims_[d] = gDomReal[d].length(); + + // Inbox: this rank's local real slab + std::array lowReal, highReal; + fft::domainToBounds(layoutReal.getLocalNDIndex(), lowReal, highReal); + + // Outbox: custom — aligned to pruned in r2c_dir, full extent everywhere else + // + // In r2c_dir d_r: + // outbox lo = pruned lo (so fi = gi_p is always within this rank's buffer) + // outbox hi = pruned hi (except the last rank, which absorbs [K_r, N_r/2]) + // + // In every other dim d: + // outbox = [0, N_d - 1] (whole extent; wrapping formula always stays local) + const auto& lDomPruned = layoutComplexPruned.getLocalNDIndex(); + + std::array lowOut, highOut; + for (int d = 0; d < 3; ++d) { + if (d == r2c_dir_) { + const long long lo_p = lDomPruned[d].first(); + const long long hi_p = lDomPruned[d].last(); + const long long K_d = static_cast(pruning_.n_modes[d]); + const long long N_d = globalRealDims_[d]; + const long long top = N_d / 2; // last valid index of R2C output (N/2+1 elements) + + lowOut[d] = lo_p; + // If this rank owns the last pruned mode, extend outbox to cover + // all remaining full-complex elements [K_d .. N_d/2] + highOut[d] = (hi_p == K_d - 1) ? top : hi_p; + } else { + // Every rank owns the full extent of this dimension + lowOut[d] = 0; + highOut[d] = globalRealDims_[d] - 1; + } + } + + heffte::box3d inbox{lowReal, highReal}; + heffte::box3d outbox{lowOut, highOut}; + + backend_ = + std::make_unique(inbox, outbox, r2c_dir_, Comm->getCommunicator(), params); + + for (int d = 0; d < 3; ++d) { + lowComplexFull_[d] = lowOut[d]; + fullComplexDims_[d] = static_cast(highOut[d] - lowOut[d] + 1); + } + } + + //========================================================================= + // transform + //========================================================================= + + template + void FFT::transform(TransformDirection direction, RealField& f, + ComplexField& g) { + auto fview = f.getView(); + auto gview = g.getView(); + const int ngf = f.getNghost(); + const int ngg = g.getNghost(); + + // Ensure temp buffers + if (tempReal_.size() != f.getOwned().size()) + tempReal_ = detail::shrinkView("pruned_r2c_real", fview, ngf); + + const std::size_t fullSize = + fullComplexDims_[0] * fullComplexDims_[1] * fullComplexDims_[2]; + if (tempComplexFull_.size() != fullSize) + tempComplexFull_ = TempComplex_t("pruned_r2c_complex_full", fullComplexDims_[0], + fullComplexDims_[1], fullComplexDims_[2]); + + // Pruned domain info for the kernel + const auto& lDomPruned = g.getLayout().getLocalNDIndex(); + const long long lp0 = lDomPruned[0].first(); + const long long lp1 = lDomPruned[1].first(); + const long long lp2 = lDomPruned[2].first(); + + // Mode counts and real-grid sizes for the wrapping formula + const long long K0 = pruning_.n_modes[0]; + const long long K1 = pruning_.n_modes[1]; + const long long K2 = pruning_.n_modes[2]; + const long long N0 = globalRealDims_[0]; + const long long N1 = globalRealDims_[1]; + const long long N2 = globalRealDims_[2]; + + // outbox origins — only non-zero in r2c_dir + const long long lcf0 = lowComplexFull_[0]; + const long long lcf1 = lowComplexFull_[1]; + const long long lcf2 = lowComplexFull_[2]; + const int r2c = r2c_dir_; + + auto owned = g.getOwned(); // ghost-free extent of pruned field + + if (direction == FORWARD) { + // 1. Strip ghosts → tempReal_ + auto tempreal = tempReal_; + Kokkos::parallel_for( + "r2c_copy_real_fwd", + Kokkos::MDRangePolicy>( + {ngf, ngf, ngf}, {int(fview.extent(0)) - ngf, int(fview.extent(1)) - ngf, + int(fview.extent(2)) - ngf}), + KOKKOS_LAMBDA(int i, int j, int k) { + tempreal(i - ngf, j - ngf, k - ngf) = fview(i, j, k); + }); + Kokkos::fence(); + + // 2. Distributed R2C FFT → tempComplexFull_ + backend_->forward(tempReal_.data(), tempComplexFull_.data()); + + // 3. Extract pruned modes from tempComplexFull_ → pruned output field + // + // Every access is local by construction of the outbox: + // fi0_l = gi0 - lcf0 (R2C dim 0 assumed here; generalises below) + // fi1_l = wrap(gi1) (lcf1=0 for non-R2C dims) + // fi2_l = wrap(gi2) + auto& tcf = tempComplexFull_; + Kokkos::parallel_for( + "extract_pruned_r2c_fwd", + Kokkos::MDRangePolicy>( + {0, 0, 0}, + {int(owned[0].length()), int(owned[1].length()), int(owned[2].length())}), + KOKKOS_LAMBDA(int i0, int i1, int i2) { + const long long gi0 = i0 + lp0; + const long long gi1 = i1 + lp1; + const long long gi2 = i2 + lp2; + + const int fi0 = + (r2c == 0) ? int(gi0 - lcf0) : int((gi0 < K0 / 2) ? gi0 : (N0 - K0 + gi0)); + const int fi1 = + (r2c == 1) ? int(gi1 - lcf1) : int((gi1 < K1 / 2) ? gi1 : (N1 - K1 + gi1)); + const int fi2 = + (r2c == 2) ? int(gi2 - lcf2) : int((gi2 < K2 / 2) ? gi2 : (N2 - K2 + gi2)); + + gview(i0 + ngg, i1 + ngg, i2 + ngg) = tcf(fi0, fi1, fi2); + }); + + } else { // BACKWARD + + // 1. Zero-fill tempComplexFull_, then scatter pruned modes into it + Kokkos::deep_copy(tempComplexFull_, Complex_t(0, 0)); + + auto& tcf = tempComplexFull_; + Kokkos::parallel_for( + "scatter_pruned_r2c_bwd", + Kokkos::MDRangePolicy>( + {0, 0, 0}, + {int(owned[0].length()), int(owned[1].length()), int(owned[2].length())}), + KOKKOS_LAMBDA(int i0, int i1, int i2) { + const long long gi0 = i0 + lp0; + const long long gi1 = i1 + lp1; + const long long gi2 = i2 + lp2; + + const int fi0 = + (r2c == 0) ? int(gi0 - lcf0) : int((gi0 < K0 / 2) ? gi0 : (N0 - K0 + gi0)); + const int fi1 = + (r2c == 1) ? int(gi1 - lcf1) : int((gi1 < K1 / 2) ? gi1 : (N1 - K1 + gi1)); + const int fi2 = + (r2c == 2) ? int(gi2 - lcf2) : int((gi2 < K2 / 2) ? gi2 : (N2 - K2 + gi2)); + + tcf(fi0, fi1, fi2) = gview(i0 + ngg, i1 + ngg, i2 + ngg); + }); + Kokkos::fence(); + + // 2. Distributed C2R backward + backend_->backward(tempComplexFull_.data(), tempReal_.data()); + Kokkos::fence(); + + // 3. Copy tempReal_ back (restore ghost padding) + auto tempreal = tempReal_; + Kokkos::parallel_for( + "r2c_copy_real_bwd", + Kokkos::MDRangePolicy>( + {ngf, ngf, ngf}, {int(fview.extent(0)) - ngf, int(fview.extent(1)) - ngf, + int(fview.extent(2)) - ngf}), + KOKKOS_LAMBDA(int i, int j, int k) { + fview(i, j, k) = tempreal(i - ngf, j - ngf, k - ngf); + }); + } + } + +} // namespace ippl + +#endif \ No newline at end of file diff --git a/src/FFT/Transform/RC.h b/src/FFT/Transform/RC.h new file mode 100644 index 000000000..28356448b --- /dev/null +++ b/src/FFT/Transform/RC.h @@ -0,0 +1,94 @@ +#ifndef IPPL_FFT_TRANSFORM_RC_H +#define IPPL_FFT_TRANSFORM_RC_H + +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +namespace ippl { + + template + class FFT { + public: + static constexpr unsigned Dim = RealField::dim; + + using T = typename RealField::value_type; + using Complex_t = Kokkos::complex; + using MemSpace = typename RealField::memory_space; + using ExecSpace = typename RealField::execution_space; + using Layout_t = FieldLayout; + + using ComplexField = typename Field::uniform_type; + +#ifdef IPPL_ENABLE_CUFFTMP + using Backend_t = fft::CuFFTMpR2C; +#else + using Backend_t = fft::HeffteR2C; +#endif + using TempReal_t = Kokkos::View; + using TempComplex_t = Kokkos::View; + + FFT(const Layout_t& layoutIn, const Layout_t& layoutOut, const ParameterList& params) { + static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); + + std::array lowIn, highIn, lowOut, highOut; + fft::domainToBounds(layoutIn.getLocalNDIndex(), lowIn, highIn); + fft::domainToBounds(layoutOut.getLocalNDIndex(), lowOut, highOut); + + int r2c_dir = params.get("r2c_direction", 0); + backend_ = std::make_unique(heffte::box3d{lowIn, highIn}, + heffte::box3d{lowOut, highOut}, + r2c_dir, Comm->getCommunicator(), params); + } + + void warmup(RealField& f, ComplexField& g) { + transform(FORWARD, f, g); + transform(BACKWARD, f, g); + } + + void transform(TransformDirection direction, RealField& f, ComplexField& g) { + auto fview = f.getView(); + auto gview = g.getView(); + const int ngf = f.getNghost(); + const int ngg = g.getNghost(); + + ensureTemps(f, g); + fft::copyToTemp(tempReal_, fview, ngf); + fft::copyToTemp(tempComplex_, gview, + ngg); + + if (direction == FORWARD) { + backend_->forward(tempReal_.data(), tempComplex_.data()); + } else { + backend_->backward(tempComplex_.data(), tempReal_.data()); + } + + fft::copyFromTemp(fview, tempReal_, + ngf); + fft::copyFromTemp( + gview, tempComplex_, ngg); + } + + private: + std::unique_ptr backend_; + TempReal_t tempReal_; + TempComplex_t tempComplex_; + + void ensureTemps(const RealField& f, const ComplexField& g) { + if (tempReal_.size() != f.getOwned().size()) { + tempReal_ = detail::shrinkView("fft_rc_real", f.getView(), f.getNghost()); + } + if (tempComplex_.size() != g.getOwned().size()) { + tempComplex_ = detail::shrinkView("fft_rc_complex", g.getView(), g.getNghost()); + } + } + }; + +} // namespace ippl + +#endif diff --git a/src/FFT/Transform/Transform.h b/src/FFT/Transform/Transform.h new file mode 100644 index 000000000..1fef01005 --- /dev/null +++ b/src/FFT/Transform/Transform.h @@ -0,0 +1,11 @@ +#ifndef IPPL_FFT_TRANSFORM_HPP +#define IPPL_FFT_TRANSFORM_HPP + +#include "FFT/Transform/CC.h" +#include "FFT/Transform/NUFFT.h" +#include "FFT/Transform/PrunedCC.h" +#include "FFT/Transform/PrunedRC.h" +#include "FFT/Transform/RC.h" +#include "FFT/Transform/Trig.h" + +#endif \ No newline at end of file diff --git a/src/FFT/Transform/Trig.h b/src/FFT/Transform/Trig.h new file mode 100644 index 000000000..91f6f3070 --- /dev/null +++ b/src/FFT/Transform/Trig.h @@ -0,0 +1,103 @@ +#ifndef IPPL_FFT_TRANSFORM_TRIG_H +#define IPPL_FFT_TRANSFORM_TRIG_H + +#include "Utility/ParameterList.h" +#include "Utility/ViewUtils.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + + +namespace ippl { + + namespace fft { + + // Shared base for all trigonometric transforms + template + class TrigBase { + public: + static constexpr unsigned Dim = Field::dim; + + using T = typename Field::value_type; + using MemSpace = typename Field::memory_space; + using ExecSpace = typename Field::execution_space; + using Layout_t = FieldLayout; + using Backend_t = HeffteTrig; + using TempView_t = Kokkos::View; + + TrigBase(const Layout_t& layout, const ParameterList& params) { + static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); + + std::array low, high; + domainToBounds(layout.getLocalNDIndex(), low, high); + heffte::box3d box{low, high}; + + backend_ = std::make_unique(box, box, Comm->getCommunicator(), params); + } + + void warmup(Field& f) { + transform(FORWARD, f); + transform(BACKWARD, f); + } + + void transform(TransformDirection direction, Field& f) { + // FFTW scaling + if constexpr (is_available_v) { + if (direction == FORWARD) + f = f / fftw_trig_scale(); + } + + auto view = f.getView(); + const int ng = f.getNghost(); + + ensureTemp(f); + fft::copyToTemp(temp_, view, ng); + + if (direction == FORWARD) { + backend_->forward(temp_.data(), temp_.data()); + } else { + backend_->backward(temp_.data(), temp_.data()); + } + + fft::copyFromTemp(view, temp_, ng); + + if constexpr (is_available_v) { + if (direction == BACKWARD) + f = f * fftw_trig_scale(); + } + } + + private: + std::unique_ptr backend_; + TempView_t temp_; + + void ensureTemp(const Field& f) { + if (temp_.size() != f.getOwned().size()) { + temp_ = ippl::detail::shrinkView("fft_trig_temp", f.getView(), f.getNghost()); + } + } + }; + + } // namespace fft + + // Public specializations + template + class FFT : public fft::TrigBase { + using fft::TrigBase::TrigBase; + }; + + template + class FFT : public fft::TrigBase { + using fft::TrigBase::TrigBase; + }; + + template + class FFT : public fft::TrigBase { + using fft::TrigBase::TrigBase; + }; + +} // namespace ippl + +#endif \ No newline at end of file From dbbe59a4ff600d0a8014073bd4fbbabddeeb4bbd Mon Sep 17 00:00:00 2001 From: pfischill Date: Wed, 29 Apr 2026 15:30:47 +0200 Subject: [PATCH 03/39] Interpolation: scatter/gather modernization with binning and tiling - Add Scatter/{Scatter,GridParallelScatter,AtomicScatter,TiledScatter, ScatterConfig,ScatterArgumentsBase,TileSizeCache}.h: a configurable scatter front-end that selects between atomic, output-focused, and tiled grid-parallel back-ends, with an optional tile-size cache for hardware-specific tuning data loaded at runtime. - Add Gather/{Gather,AtomicGather,GatherConfig,GatherArgumentsBase}.h: matching gather front-end (Atomic / AtomicSort variants). - Add Binning.h: bin-sort utilities used to group particles by tile so scatter/gather kernels can avoid atomics on hot tiles. - Add Kernels.h: shared kernel-evaluation helpers consumed by the new scatter/gather kernels. - Add WidthDispatcher.h: compile-time switch from runtime kernel width to the corresponding template specialization. - Add CoordinateTransform.h: physical <-> grid coordinate helpers used by both scatter/gather and the native NUFFT. - CIC.hpp: minor signature alignment to the new kernel concept. --- src/Interpolation/Binning.h | 323 +++++++ src/Interpolation/CIC.h | 12 +- src/Interpolation/CIC.hpp | 13 +- src/Interpolation/CMakeLists.txt | 6 +- src/Interpolation/CoordinateTransform.h | 126 +++ src/Interpolation/Gather/AtomicGather.h | 113 +++ src/Interpolation/Gather/Gather.h | 111 +++ .../Gather/GatherArgumentsBase.h | 85 ++ src/Interpolation/Gather/GatherConfig.h | 271 ++++++ src/Interpolation/Kernels.h | 165 ++++ src/Interpolation/Scatter/AtomicScatter.h | 263 ++++++ src/Interpolation/Scatter/AutoTune.cpp | 814 ++++++++++++++++++ src/Interpolation/Scatter/AutoTune.h | 65 ++ .../Scatter/GridParallelScatter.h | 750 ++++++++++++++++ src/Interpolation/Scatter/Scatter.h | 349 ++++++++ .../Scatter/ScatterArgumentsBase.h | 84 ++ src/Interpolation/Scatter/ScatterConfig.h | 276 ++++++ src/Interpolation/Scatter/TileSizeCache.cpp | 297 +++++++ src/Interpolation/Scatter/TileSizeCache.h | 256 ++++++ src/Interpolation/Scatter/TiledScatter.h | 293 +++++++ src/Interpolation/WidthDispatcher.h | 52 ++ 21 files changed, 4704 insertions(+), 20 deletions(-) create mode 100644 src/Interpolation/Binning.h create mode 100644 src/Interpolation/CoordinateTransform.h create mode 100644 src/Interpolation/Gather/AtomicGather.h create mode 100644 src/Interpolation/Gather/Gather.h create mode 100644 src/Interpolation/Gather/GatherArgumentsBase.h create mode 100644 src/Interpolation/Gather/GatherConfig.h create mode 100644 src/Interpolation/Kernels.h create mode 100644 src/Interpolation/Scatter/AtomicScatter.h create mode 100644 src/Interpolation/Scatter/AutoTune.cpp create mode 100644 src/Interpolation/Scatter/AutoTune.h create mode 100644 src/Interpolation/Scatter/GridParallelScatter.h create mode 100644 src/Interpolation/Scatter/Scatter.h create mode 100644 src/Interpolation/Scatter/ScatterArgumentsBase.h create mode 100644 src/Interpolation/Scatter/ScatterConfig.h create mode 100644 src/Interpolation/Scatter/TileSizeCache.cpp create mode 100644 src/Interpolation/Scatter/TileSizeCache.h create mode 100644 src/Interpolation/Scatter/TiledScatter.h create mode 100644 src/Interpolation/WidthDispatcher.h diff --git a/src/Interpolation/Binning.h b/src/Interpolation/Binning.h new file mode 100644 index 000000000..747a941cd --- /dev/null +++ b/src/Interpolation/Binning.h @@ -0,0 +1,323 @@ +#ifndef IPPL_INTERPOLATION_BINNING_H +#define IPPL_INTERPOLATION_BINNING_H + +#include + +#include + +#include "CoordinateTransform.h" +#include "Particle/ParticleLayout.h" +#include "Particle/SortBuffer.h" + +namespace ippl { + namespace Interpolation { + namespace detail { + + template + struct BinningResult { + Kokkos::View permute; + Kokkos::View bin_offsets; + Vector num_tiles; + }; + + /** + * @brief Functor to compute bin index for a particle position + * + * Maps physical particle positions to tile indices for tiled scatter/gather. + * Particles are assigned to tiles based on their stencil center location. + * + * @tparam Dim Spatial dimension + * @tparam RealType Floating point type + */ + template + struct BinComputer { + Vector n_grid_global; + Vector local_offset; + Vector tile_size; + Vector num_tiles; + int kernel_width; + CoordinateTransform transform; + + /** + * @brief Compute 1D tile index for a coordinate value + */ + KOKKOS_INLINE_FUNCTION int compute_tile_1d(RealType val, int dim) const { + const RealType grid_pos = transform.toGridCoordinate(val, dim); + const int center = + transform.getStencilCenter(grid_pos - RealType(0.5), kernel_width); + const int local_c = center - local_offset[dim]; + return Kokkos::clamp(local_c / tile_size[dim], 0, num_tiles[dim] - 1); + } + + /** + * @brief Compute flat bin index for a particle position + * + * Uses row-major ordering (dimension Dim-1 varies fastest) to match + * the decoding in TiledScatter. + */ + template + KOKKOS_INLINE_FUNCTION int operator()(const PositionType& pos) const { + int bin_idx = 0; + int stride = 1; + + for (int d = Dim - 1; d >= 0; --d) { + const int tile_d = compute_tile_1d(pos[d], d); + bin_idx += tile_d * stride; + stride *= num_tiles[d]; + } + + return bin_idx; + } + }; + + /** + * @brief Sort particles by bin using Kokkos::sort_by_key and compute bin offsets + * + * Algorithm: + * 1. Compute bin key for each particle + * 2. Sort (keys, permutation) pairs by key using Kokkos::sort_by_key (maps to vendor) + * 3. Compute bin offsets by finding transitions in sorted keys + * + * @tparam Dim Spatial dimension + * @tparam RealType Floating point type + * @tparam PositionViewType Position view type + * @tparam ExecSpace Kokkos execution space + * @tparam PermuteViewType Permutation view type + * @tparam OffsetViewType Bin offset view type + * @tparam KeyViewType Bin key view type + * + * @param positions Particle positions in physical coordinates + * @param n_grid_global Global grid dimensions + * @param n_grid_local Local grid dimensions (kept for API compatibility; unused) + * @param local_offset First global index of local domain + * @param tile_size Tile size per dimension + * @param kernel_width Interpolation kernel width + * @param origin Mesh origin + * @param invdx Inverse mesh spacing + * @param[out] permute Permutation array (sorted particle indices) + * @param[out] bin_offsets Start index of each bin in permute array + * @param[inout] bin_keys Temporary storage for bin keys + * @param n_particles Number of particles + * @param num_tiles Number of tiles per dimension + */ + template + void bin_sort(PositionViewType positions, Vector n_grid_global, + [[maybe_unused]] Vector n_grid_local, + Vector local_offset, Vector tile_size, + int kernel_width, Vector origin, + Vector invdx, PermuteViewType& permute, + OffsetViewType& bin_offsets, KeyViewType& bin_keys, size_t n_particles, + Vector num_tiles) { + using key_type = typename KeyViewType::non_const_value_type; + + static IpplTimings::TimerRef binSortTimer = IpplTimings::getTimer("binSort"); + IpplTimings::startTimer(binSortTimer); + + // Total number of bins + size_t n_bins = 1; + for (unsigned d = 0; d < Dim; ++d) { + n_bins *= num_tiles[d]; + } + + // Create bin computer functor + CoordinateTransform transform(origin, invdx, n_grid_global); + BinComputer bin_computer{n_grid_global, local_offset, tile_size, + num_tiles, kernel_width, transform}; + + // Step 1: Compute bin keys and initialize permutation + static IpplTimings::TimerRef keyTimer = IpplTimings::getTimer("binComputeKeys"); + IpplTimings::startTimer(keyTimer); + + Kokkos::parallel_for( + "BinSort::ComputeKeys", Kokkos::RangePolicy(0, n_particles), + KOKKOS_LAMBDA(const size_t i) { + bin_keys(i) = static_cast(bin_computer(positions(i))); + permute(i) = i; + }); + Kokkos::fence(); + IpplTimings::stopTimer(keyTimer); + + // Step 2: Sort by key + static IpplTimings::TimerRef sortTimer = IpplTimings::getTimer("binSortByKey"); + IpplTimings::startTimer(sortTimer); + + if (n_particles > 0) { + auto keys_sub = + Kokkos::subview(bin_keys, std::make_pair(size_t(0), n_particles)); + auto permute_sub = + Kokkos::subview(permute, std::make_pair(size_t(0), n_particles)); + + // The CUB radix-sort fast path is only available when the + // *execution space* is Kokkos::Cuda; on a CUDA-enabled + // build the test fixture also instantiates Kokkos::Serial + // (and possibly OpenMP), where ExecSpace().cuda_stream() + // doesn't exist. Gate accordingly. +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + auto& bufs = ippl::detail::getDefaultBinSortBuffers< + typename KeyViewType::memory_space>(); + bufs.ensureCapacity(n_particles, n_bins + 1); + auto keys_out_sub = Kokkos::subview( + bufs.keysOut(), std::make_pair(size_t(0), n_particles)); + auto perm_out_sub = Kokkos::subview( + bufs.permOut(), std::make_pair(size_t(0), n_particles)); + + cudaStream_t cuda_stream = ExecSpace().cuda_stream(); + + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceRadixSort::SortPairs( + d_temp, temp_bytes, keys_sub.data(), keys_out_sub.data(), + permute_sub.data(), perm_out_sub.data(), + static_cast(n_particles), 0, sizeof(key_type) * 8, cuda_stream); + + bufs.ensureTempStorage(temp_bytes); + d_temp = bufs.tempStorage().data(); + + auto err = cub::DeviceRadixSort::SortPairs( + d_temp, temp_bytes, keys_sub.data(), keys_out_sub.data(), + permute_sub.data(), perm_out_sub.data(), + static_cast(n_particles), 0, sizeof(key_type) * 8, cuda_stream); + + if (err != cudaSuccess) { + printf("CUB SortPairs failed: %s\n", cudaGetErrorString(err)); + Kokkos::abort("CUB Radix Sort failed."); + } + + Kokkos::deep_copy(ExecSpace(), keys_sub, keys_out_sub); + Kokkos::deep_copy(ExecSpace(), permute_sub, perm_out_sub); + } else { + Kokkos::Experimental::sort_by_key(ExecSpace(), keys_sub, permute_sub); + } +#else + Kokkos::Experimental::sort_by_key(ExecSpace(), keys_sub, permute_sub); +#endif + } + Kokkos::fence(); + IpplTimings::stopTimer(sortTimer); + + // Step 3: Compute bin offsets from sorted keys + static IpplTimings::TimerRef offsetTimer = + IpplTimings::getTimer("binComputeOffsets"); + IpplTimings::startTimer(offsetTimer); + + // Initialize all offsets to n_particles (empty bins point to end) + Kokkos::parallel_for( + "BinSort::InitOffsets", Kokkos::RangePolicy(0, n_bins + 1), + KOKKOS_LAMBDA(const size_t i) { bin_offsets(i) = n_particles; }); + Kokkos::fence(); + + if (n_particles > 0) { + // Step A: each particle writes its index into the slot of + // its own bin iff it is the first particle in that bin. + // Each thread does O(1) work — no inner loop over the gap + // to the previous transition. + Kokkos::parallel_for( + "BinSort::MarkStarts", Kokkos::RangePolicy(0, n_particles), + KOKKOS_LAMBDA(const size_t i) { + const auto curr_bin = bin_keys(i); + if (i == 0 || bin_keys(i - 1) != curr_bin) { + bin_offsets(static_cast(curr_bin)) = i; + } + }); + Kokkos::fence(); + + // Step B: right-to-left inclusive min-scan to fill empty + // bins (their start = next non-empty bin's start). Empty + // bins still hold the n_particles sentinel after Step A. + // Sequential single-thread launch is intentional: the + // dependency is purely linear and n_bins is small relative + // to n_particles in any realistic configuration. + Kokkos::parallel_for( + "BinSort::PropagateEmpties", + Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + for (size_t b = n_bins; b-- > 0;) { + if (bin_offsets(b) > bin_offsets(b + 1)) { + bin_offsets(b) = bin_offsets(b + 1); + } + } + }); + } + Kokkos::fence(); + IpplTimings::stopTimer(offsetTimer); + + IpplTimings::stopTimer(binSortTimer); + } + + /** + * @brief High-level interface for particle binning + * + * Bins particles into tiles for tiled scatter/gather operations. + * Uses sort-based binning for efficient GPU execution. + * + * @tparam ParticleT Particle coordinate type + * @tparam FieldT Field value type + * @tparam ParticleProperties Additional particle attribute properties + * @tparam Dim Spatial dimension + * + * @param particles Particle position attribute + * @param fieldLayout Field layout + * @param mesh Uniform Cartesian mesh + * @param tile_size Tile size per dimension + * @param kernel_width Interpolation kernel width + * + * @return Tuple of (permutation, bin_offsets, num_tiles) + */ + template + auto bin_particles( + const ParticleAttrib, ParticleProperties...>& particles, + FieldLayout fieldLayout, UniformCartesian mesh, + Vector tile_size, int kernel_width) { + using AttribType = std::decay_t; + using ExecSpace = typename AttribType::execution_space; + using memory_space = typename AttribType::memory_space; + + // Extract grid information + const NDIndex& lDom = fieldLayout.getLocalNDIndex(); + const NDIndex& gDom = fieldLayout.getDomain(); + + Vector ngrid_global; + Vector ngrid_local; + Vector local_offset; + for (unsigned d = 0; d < Dim; ++d) { + ngrid_global[d] = gDom[d].length(); + ngrid_local[d] = lDom[d].length(); + local_offset[d] = lDom[d].first(); + } + + // Compute number of tiles (+1 for boundary particles). + Vector num_tiles; + size_t total_tiles = 1; + for (unsigned d = 0; d < Dim; ++d) { + num_tiles[d] = (ngrid_local[d] + tile_size[d] - 1) / tile_size[d] + 1; + total_tiles *= num_tiles[d]; + } + + auto particle_view = particles.getView(); + const auto invdx = 1.0 / mesh.getMeshSpacing(); + const size_t n_particles = particles.getParticleCount(); + + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + // n_bins + 1 slots needed for bin_offsets + bufs.ensureCapacity(n_particles, total_tiles + 1); + + auto& permute = bufs.permute(); + auto& bin_offsets = bufs.binOffsets(); + auto& bin_keys = bufs.binKeys(); + + bin_sort, ExecSpace>( + particle_view, ngrid_global, ngrid_local, local_offset, tile_size, + kernel_width, mesh.getOrigin(), invdx, permute, bin_offsets, bin_keys, + n_particles, num_tiles); + + return std::make_tuple(permute, bin_offsets, num_tiles); + } + + } // namespace detail + } // namespace Interpolation +} // namespace ippl + +#endif // IPPL_INTERPOLATION_BINNING_H \ No newline at end of file diff --git a/src/Interpolation/CIC.h b/src/Interpolation/CIC.h index 5adac510d..ec1ea6534 100644 --- a/src/Interpolation/CIC.h +++ b/src/Interpolation/CIC.h @@ -9,7 +9,7 @@ namespace ippl { namespace detail { - /*! + /** * Computes the weight for a given point for a given axial direction * @tparam Point index of the point * @tparam Index index of the axis @@ -22,7 +22,7 @@ namespace ippl { KOKKOS_INLINE_FUNCTION constexpr typename Weights::value_type interpolationWeight( const Weights& wlo, const Weights& whi); - /*! + /** * Computes the index for a given point for a given axis * @tparam Point index of the point * @tparam Index index of the axis @@ -34,7 +34,7 @@ namespace ippl { KOKKOS_INLINE_FUNCTION constexpr typename Indices::value_type interpolationIndex( const Indices& args); - /*! + /** * Scatters to a field at a single point * @tparam ScatterPoint the index of the point to which we are scattering * @tparam Index the sequence 0...Dim - 1 @@ -54,7 +54,7 @@ namespace ippl { const Vector& wlo, const Vector& whi, const Vector& args, const T& val); - /*! + /** * Scatters the particle attribute to the field. * * The coordinates to which an attribute must be scattered is given by 2^n, @@ -81,7 +81,7 @@ namespace ippl { const Vector& wlo, const Vector& whi, const Vector& args, T val = 1); - /*! + /** * Gathers from a field at a single point * @tparam GatherPoint the index of the point from which data is gathered * @tparam Index the sequence 0...Dim - 1 @@ -101,7 +101,7 @@ namespace ippl { const Vector& wlo, const Vector& whi, const Vector& args); - /*! + /** * Gathers the particle attribute from a field (see scatter_field for more details) * @tparam GatherPoint... the indices of the points from which to gather (sequence 0 to * 2^Dim) diff --git a/src/Interpolation/CIC.hpp b/src/Interpolation/CIC.hpp index cbe8064f1..d54b19d7a 100644 --- a/src/Interpolation/CIC.hpp +++ b/src/Interpolation/CIC.hpp @@ -1,9 +1,4 @@ -// -// Class CIC -// First order/cloud-in-cell grid interpolation. Currently implemented as -// global functions, but in order to support higher or lower order interpolation, -// these should be moved into structs. -// +#include namespace ippl { namespace detail { @@ -15,9 +10,6 @@ namespace ippl { } else { return whi[Index]; } - // device code cannot throw exceptions, but we need a - // dummy return to silence the warning - return 0; } template @@ -28,9 +20,6 @@ namespace ippl { } else { return args[Index]; } - // device code cannot throw exceptions, but we need a - // dummy return to silence the warning - return 0; } template ) and are installed centrally by InstallIppl.cmake. diff --git a/src/Interpolation/CoordinateTransform.h b/src/Interpolation/CoordinateTransform.h new file mode 100644 index 000000000..ce9824aa4 --- /dev/null +++ b/src/Interpolation/CoordinateTransform.h @@ -0,0 +1,126 @@ +#ifndef IPPL_INTERPOLATION_COORDINATE_TRANSFORM_H +#define IPPL_INTERPOLATION_COORDINATE_TRANSFORM_H + +#include + +#include "Types/Vector.h" + +#include "Index/Index.h" + +namespace ippl::Interpolation { + + // Forward declare size types from InterpolationUtil.h + using local_index_type = decltype(Index{}.first()); + using size_type = ippl::detail::size_type; + + /** + * @brief Unified coordinate transformation for scatter/gather operations + * + * This class handles the transformation from physical coordinates to grid + * coordinates using mesh information (origin and spacing). + * + * @tparam T Floating-point type for coordinates + * @tparam Dim Spatial dimension + */ + template + struct CoordinateTransform { + using Vector_t = ippl::Vector; + using VectorInt_t = ippl::Vector; + + const Vector_t origin_; // Physical origin from mesh + const Vector_t invdx_; // Inverse of mesh spacing (1/dx) + const VectorInt_t ngrid_global_; // Global grid dimensions + + /** + * @brief Construct from mesh parameters + * + * @param origin Physical origin of the domain + * @param invdx Inverse mesh spacing (1/dx) + * @param ngrid_global Global grid dimensions + */ + KOKKOS_INLINE_FUNCTION CoordinateTransform(const Vector_t& origin, const Vector_t& invdx, + const VectorInt_t& ngrid_global) + : origin_(origin) + , invdx_(invdx) + , ngrid_global_(ngrid_global) {} + + /** + * @brief Transform physical position to grid coordinates [0, ngrid) + * + * This function scales from physical domain to grid domain: (x - origin) / dx + * + * @param physical_pos Physical position in the specified dimension + * @param dim Dimension index + * @return Grid coordinate in [0, ngrid) + */ + KOKKOS_INLINE_FUNCTION T toGridCoordinate(T physical_pos, unsigned dim) const { + return (physical_pos - origin_[dim]) * invdx_[dim]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION T toGridCoordinate(T physical_pos) const { + return (physical_pos - origin_[D]) * invdx_[D]; + } + + /** + * @brief Round a grid coordinate to the cell index that anchors the stencil. + * + * Uses width-dependent rounding: + * - Odd width: round to nearest (symmetric stencil around the particle) + * - Even width: floor (asymmetric stencil) + * + * The stencil leftmost cell is then `center - (width - 1) / 2`, computed + * by `getStencilBase`. + * + * @param grid_pos Grid coordinate (output of toGridCoordinate) + * @param width Kernel width + * @return Center cell index + */ + KOKKOS_INLINE_FUNCTION int getStencilCenter(T grid_pos, int width) const { + const bool odd = (width & 1); + return odd ? static_cast(Kokkos::round(grid_pos)) + : static_cast(Kokkos::floor(grid_pos)); + } + + template + KOKKOS_FORCEINLINE_FUNCTION int getStencilCenter(T grid_pos) const { + if constexpr (Width & 1) + return static_cast(Kokkos::round(grid_pos)); + else + return static_cast(Kokkos::floor(grid_pos)); + } + + /** + * @brief Get base grid index for kernel stencil + * + * Uses width-dependent rounding to determine the base index: + * - Odd width: round to nearest (symmetric stencil around particle) + * - Even width: floor (asymmetric stencil) + * + * The stencil extends from [base_idx, base_idx + width). + * + * For odd widths (e.g., w=3): + * - Grid point at 2.7 rounds to 3 + * - Stencil covers indices [3-(3-1)/2, 3+(3-1)/2] = [2, 3, 4] + * + * For even widths (e.g., w=4): + * - Grid point at 2.7 floors to 2 + * - Stencil covers indices [2-(4-1)/2, 2+(4-1)/2+1] = [0, 1, 2, 3] + * + * @param grid_pos Grid coordinate (output of toGridCoordinate) + * @param width Kernel width + * @return Base index for the kernel stencil (leftmost index) + */ + KOKKOS_INLINE_FUNCTION int getStencilBase(T grid_pos, int width) const { + return getStencilCenter(grid_pos, width) - (width - 1) / 2; + } + + template + KOKKOS_FORCEINLINE_FUNCTION int getStencilBase(T grid_pos) const { + return getStencilCenter(grid_pos) - (Width - 1) / 2; + } + }; + +} // namespace ippl::Interpolation + +#endif // IPPL_INTERPOLATION_COORDINATE_TRANSFORM_H diff --git a/src/Interpolation/Gather/AtomicGather.h b/src/Interpolation/Gather/AtomicGather.h new file mode 100644 index 000000000..b6c598647 --- /dev/null +++ b/src/Interpolation/Gather/AtomicGather.h @@ -0,0 +1,113 @@ +#ifndef IPPL_ATOMIC_GATHER_H +#define IPPL_ATOMIC_GATHER_H + +#include + +#include "Interpolation/CoordinateTransform.h" +#include "Interpolation/Gather/GatherArgumentsBase.h" +#include "Interpolation/WidthDispatcher.h" + +namespace ippl::Interpolation::detail { + template + struct AtomicGather { + static constexpr bool requires_binning = UseSorting; + static constexpr unsigned Dim = Types::Dim; + + using RealType = typename Types::RealType; + using ValueType = typename Types::ValueType; + using memory_space = typename Types::memory_space; + using execution_space = typename Types::execution_space; + + struct Arguments : GatherArgumentsBase { + using PermuteView = Kokkos::View; + PermuteView permute; // Only used when UseSorting = true + + template + static Arguments create(const Field& field, const Positions& pos, Values& vals, + const Kernel& k, const GatherConfig& cfg, + const GatherBinningResult& binning = {}) { + Arguments a; + a.initBase(field, pos, vals, k, cfg.add_to_attribute); + if constexpr (UseSorting) { + a.permute = binning.permute; + } + return a; + } + }; + Arguments args; + + struct Stencil { + Kokkos::Array base; // Stencil leftmost indices in all dims + Kokkos::Array, Dim> kw; // Precomputed kernel evals + }; + + KOKKOS_INLINE_FUNCTION void operator()(size_t j) const { + using result_type = decltype(args.grid)::non_const_value_type; + + // Get actual particle index (sorted or direct) + const size_t p = UseSorting ? args.permute(j) : j; + + // Build stencil + CoordinateTransform transform{args.origin, args.invdx, args.n_grid}; + Stencil stencil{}; + for_constexpr(std::make_integer_sequence{}, [&] { + const RealType g_pos = transform.toGridCoordinate(args.x(p)[d], d); + const RealType g_pos_cc = g_pos - RealType(0.5); + const int idx0 = transform.getStencilBase(g_pos_cc, W); + + stencil.base[d] = idx0 - args.local_offset[d] + args.nghost; + + auto& kernel_vals = stencil.kw[d]; + for (int i = 0; i < W; ++i) { + kernel_vals[i] = + args.kernel((g_pos - (RealType(idx0 + i) + RealType(0.5))) * args.inv_hw); + } + }); + + // Gather W^d stencil around non-uniform pt + result_type out = result_type(0); + auto rec = [&](auto&& self, RealType wprod, auto... idx) -> void { + const int bD = get(stencil.base); + const auto& kD = get(stencil.kw); + + for (int i = 0; i < W; ++i) { + const RealType w = wprod * kD[i]; + if constexpr (D == 0) { + out += args.grid(bD + i, idx...) * w; + } else { + self.template operator()(self, w, bD + i, idx...); + } + } + }; + rec.template operator()(rec, RealType(1)); + + if (args.add_to_attribute) { + if constexpr (std::is_same_v, result_type> + && std::is_same_v) { + args.values(p) = args.values(p) + out.real(); + } else { + args.values(p) = args.values(p) + out; + } + } else { + if constexpr (std::is_same_v, result_type> + && std::is_same_v) { + args.values(p) = out.real(); + } else { + args.values(p) = out; + } + } + } + + void run(size_t n_particles) { + auto policy = Kokkos::RangePolicy(0, n_particles); + // `Kokkos::Experimental::prefer` + DesiredOccupancy is still in + // the Experimental namespace (Kokkos 5.x). The interface may + // move; if it does, drop the prefer() and pass `policy` directly. + auto const policy_tuned = Kokkos::Experimental::prefer( + policy, Kokkos::Experimental::DesiredOccupancy{Kokkos::AUTO}); + Kokkos::parallel_for("AtomicGather", policy_tuned, *this); + } + }; +} // namespace ippl::Interpolation::detail + +#endif // IPPL_ATOMIC_GATHER_H \ No newline at end of file diff --git a/src/Interpolation/Gather/Gather.h b/src/Interpolation/Gather/Gather.h new file mode 100644 index 000000000..6174f049d --- /dev/null +++ b/src/Interpolation/Gather/Gather.h @@ -0,0 +1,111 @@ +#ifndef IPPL_GATHER_H +#define IPPL_GATHER_H + +#include "Interpolation/Binning.h" +#include "Interpolation/Gather/AtomicGather.h" +#include "Interpolation/Gather/GatherArgumentsBase.h" +#include "Interpolation/Gather/GatherConfig.h" +#include "Interpolation/WidthDispatcher.h" +#include "Particle/ParticleAttrib.h" + +namespace ippl { + + namespace Interpolation::detail { + template + struct DeduceGatherTypes { + using FieldTr = ippl::detail::FieldTraits>; + using PosTr = ippl::detail::AttribTraits>; + using ValTr = ippl::detail::AttribTraits>; + using VecTr = ippl::detail::VectorTraits; + + using type = GatherTypes, + typename FieldTr::view_type, typename PosTr::view_type, + typename ValTr::view_type>; + }; + + template + using DeducedGatherTypes = + typename DeduceGatherTypes::type; + + } // namespace Interpolation::detail + + template + class Gather { + public: + Gather(const Kernel& kernel, const Interpolation::GatherConfig& config = {}) + : kernel_m(kernel) + , config_m(config) {} + + template + void operator()(Field& field, + const ParticleAttrib, PosProps...>& positions, + ParticleAttrib& values) { + using Types = + Interpolation::detail::DeducedGatherTypes; + + switch (config_m.method) { + case Interpolation::GatherMethod::Atomic: + dispatch(field, positions, values); + break; + case Interpolation::GatherMethod::AtomicSort: + dispatch(field, positions, values); + break; + case Interpolation::GatherMethod::Tiled: + // TODO: Implement tiled, fallback to atomic + dispatch(field, positions, values); + break; + case Interpolation::GatherMethod::Native: + // TODO: Implement native, fallback to atomic + dispatch(field, positions, values); + break; + } + } + + private: + template