create cuSPARSE backend and spmv (#40)

yhmtsai · BenBrock · web-flow · commit 8e4ad018e057 · 2025-05-15T17:39:06.000+02:00
* add cusparse spmv and corresponding cmake * setup CMake to use the same test and example with thrust * add the cusparse example without thrust * Add cuSPARSE to CI. (#50) * add the cuSPARSE into README * accept list into add_device_test --------- Co-authored-by: Benjamin Brock <brock@cs.berkeley.edu>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -132,3 +132,21 @@ jobs:
       shell: bash -l {0}
       run: |
         ./build/test/gtest/spblas-tests
+
+  # Configures the project with the cuSPARSE backend enabled, builds it, and
+  # runs the gtest binary on a runner labelled `gpu_nvidia`.
+  cusparse:
+    runs-on: 'gpu_nvidia'
+    steps:
+    - uses: actions/checkout@v4
+    # Configure step: loads CMake through environment modules and points
+    # CMAKE_PREFIX_PATH at the CMake package directory of the CUDA install.
+    # NOTE(review): the CUDA path is specific to this runner image; confirm it
+    # when the runner is updated.
+    - name: CMake
+      shell: bash -l {0}
+      run: |
+        module load cmake
+        cmake -B build -DENABLE_CUSPARSE=ON -DCMAKE_PREFIX_PATH=/usr/local/cuda/targets/x86_64-linux/lib/cmake
+    # Build step: parallel make using every core reported by `nproc`.
+    - name: Build
+      shell: bash -l {0}
+      run: |
+        make -C build -j `nproc`
+    # Test step: runs the gtest executable produced by the build.
+    - name: Test
+      shell: bash -l {0}
+      run: |
+        ./build/test/gtest/spblas-tests
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,6 +8,7 @@ set(CMAKE_CXX_FLAGS "-O3 -march=native")
 
 option(ENABLE_SANITIZERS "Enable Clang sanitizers" OFF)
 option(ENABLE_ROCSPARSE "Enable rocSPARSE" OFF)
+option(ENABLE_CUSPARSE "Enable cuSPARSE" OFF)
 
 # Get includes, which declares the `spblas` library
 add_subdirectory(include)
@@ -73,6 +74,13 @@ if (ENABLE_ROCSPARSE)
   set(CMAKE_HIP_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()
 
+# cuSPARSE backend.
+#
+# Marks the build as using a GPU backend, locates the CUDA toolkit, and
+# attaches both the CUDA libraries and the SPBLAS_ENABLE_CUSPARSE definition
+# to the `spblas` interface target.  Using usage requirements (instead of
+# appending -D to the global CMAKE_CXX_FLAGS) makes the definition reach every
+# target that links `spblas`, including targets of a parent project that pulls
+# this one in via add_subdirectory()/FetchContent.
+if (ENABLE_CUSPARSE)
+  set(SPBLAS_GPU_BACKEND ON)
+  find_package(CUDAToolkit REQUIRED)
+  target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
+  target_compile_definitions(spblas INTERFACE SPBLAS_ENABLE_CUSPARSE)
+endif()
+
 # turn on/off debug logging
 if (LOG_LEVEL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLOG_LEVEL=${LOG_LEVEL}") # SPBLAS_DEBUG | SPBLAS_WARNING | SPBLAS_TRACE | SPBLAS_INFO
@@ -137,6 +145,13 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
     GIT_TAG 11.1.3)
   FetchContent_MakeAvailable(fmt)
 
+  # Locate the Thrust implementation that matches the enabled GPU backend
+  # before the examples and tests are added.  rocSPARSE takes precedence if
+  # both backends are enabled.
+  if (ENABLE_ROCSPARSE)
+    find_package(rocthrust REQUIRED)
+  elseif (ENABLE_CUSPARSE)
+    # Per the original author's note, this must be compiled on a node that has
+    # an NVIDIA GPU available.
+    find_package(Thrust REQUIRED)
+    # Creates the `Thrust` target that the device examples link against.
+    thrust_create_target(Thrust)
+  endif()
   add_subdirectory(examples)
   add_subdirectory(test)
 endif()
diff --git a/README.md b/README.md
@@ -104,7 +104,7 @@ brock@slothius:~/src/spblas-reference$ CXX=g++-13 cmake -B build
 
 ### Compiling with a Vendor Backend
 A vendor backend can be enabled by passing in an `-DENABLE_{BACKEND}=ON` switch
-to `cmake`.  Currently, oneMKL, ArmPL, and rocSPARSE are the supported vendor
+to `cmake`.  Currently, oneMKL, ArmPL, rocSPARSE, and cuSPARSE are the supported vendor
 backends.
 
 ### Compiling with oneMKL
@@ -141,6 +141,17 @@ have ROCm installed in a non-standard location.
 brock@slothius:~/src/spblas-reference$ cmake -B build -DENABLE_ROCSPARSE=ON -DCMAKE_PREFIX_PATH=/opt/rocm-6.1.2
 ```
 
+### Compiling with cuSPARSE
+In order to compile with cuSPARSE, CUDA must be installed and the install
+location of CUDA added to `CMAKE_PREFIX_PATH`.  Your package manager will likely
+take care of this for you, but you can also manually specify the location if you
+have CUDA installed in a non-standard location.
+
+```bash
+# Explicitly set the location of CUDA using `CMAKE_PREFIX_PATH`.
+brock@slothius:~/src/spblas-reference$ cmake -B build -DENABLE_CUSPARSE=ON -DCMAKE_PREFIX_PATH=/usr/local/cuda-12.6
+```
+
 #### Compiling with GCC on Mac OS
 There is a known linking issue when compiling with GCC on recent versions of
 Mac OS.  This will cause a link error inside of `ld::AtomPlacement::findAtom()`.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -11,10 +11,12 @@ if (NOT SPBLAS_GPU_BACKEND)
   add_example(matrix_opt_example)
   add_example(spmm_csc)
 else()
-  find_package(rocthrust REQUIRED)
   add_subdirectory(device)
 endif()
 
 if (ENABLE_ROCSPARSE)
   add_subdirectory(rocsparse)
 endif()
+# cuSPARSE-specific examples (these use the CUDA runtime API directly rather
+# than Thrust).
+if (ENABLE_CUSPARSE)
+  add_subdirectory(cusparse)
+endif()
diff --git a/examples/cusparse/CMakeLists.txt b/examples/cusparse/CMakeLists.txt
@@ -0,0 +1,6 @@
+# add_cuda_example(<example_name>)
+#
+# Builds the executable <example_name> from <example_name>.cpp in the current
+# directory and links it against `spblas` and `fmt`.
+#
+# Example:
+#   add_cuda_example(cusparse_simple_spmv)
+function(add_cuda_example example_name)
+  add_executable(${example_name} ${example_name}.cpp)
+  # Executables have no consumers, so their link dependencies are PRIVATE.
+  target_link_libraries(${example_name} PRIVATE spblas fmt)
+endfunction()
+
+add_cuda_example(cusparse_simple_spmv)
diff --git a/examples/cusparse/cusparse_simple_spmv.cpp b/examples/cusparse/cusparse_simple_spmv.cpp
@@ -0,0 +1,88 @@
+#include <iostream>
+#include <spblas/spblas.hpp>
+
+#include <cuda_runtime.h>
+
+#include "util.hpp"
+
+#include <fmt/core.h>
+#include <fmt/ranges.h>
+
+// Example driver: generates a random CSR matrix on the host, copies it and two
+// dense vectors to the device, runs spblas::multiply (SpMV) on the device
+// views, copies the result back, and releases all device buffers.
+int main(int argc, char** argv) {
+  using value_t = float;
+  using index_t = spblas::index_t;
+  using offset_t = spblas::offset_t;
+
+  index_t m = 100;
+  index_t n = 100;
+  index_t nnz_in = 10;
+
+  fmt::print("\n\t###########################################################"
+             "######################");
+  fmt::print("\n\t### Running SpMV Example:");
+  fmt::print("\n\t###");
+  fmt::print("\n\t###   y = alpha * A * x");
+  fmt::print("\n\t###");
+  fmt::print("\n\t### with ");
+  fmt::print("\n\t### A, in CSR format, of size ({}, {}) with nnz = {}", m, n,
+             nnz_in);
+  fmt::print("\n\t### x, a dense vector, of size ({}, {})", n, 1);
+  fmt::print("\n\t### y, a dense vector, of size ({}, {})", m, 1);
+  fmt::print("\n\t### using float and spblas::index_t (size = {} bytes)",
+             sizeof(spblas::index_t));
+  fmt::print("\n\t###########################################################"
+             "######################");
+  fmt::print("\n");
+
+  // Host-side CSR arrays.
+  auto&& [values, rowptr, colind, shape, nnz] =
+      spblas::generate_csr<value_t, index_t, offset_t>(m, n, nnz_in);
+
+  // Device copies of the CSR arrays.
+  value_t* d_values = nullptr;
+  offset_t* d_rowptr = nullptr;
+  index_t* d_colind = nullptr;
+
+  CUDA_CHECK(cudaMalloc(&d_values, values.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMalloc(&d_rowptr, rowptr.size() * sizeof(offset_t)));
+  CUDA_CHECK(cudaMalloc(&d_colind, colind.size() * sizeof(index_t)));
+
+  CUDA_CHECK(cudaMemcpy(d_values, values.data(),
+                        values.size() * sizeof(value_t), cudaMemcpyDefault));
+  CUDA_CHECK(cudaMemcpy(d_rowptr, rowptr.data(),
+                        rowptr.size() * sizeof(offset_t), cudaMemcpyDefault));
+  CUDA_CHECK(cudaMemcpy(d_colind, colind.data(),
+                        colind.size() * sizeof(index_t), cudaMemcpyDefault));
+
+  // Non-owning CSR view over the device arrays.
+  spblas::csr_view<value_t, index_t, offset_t> a(d_values, d_rowptr, d_colind,
+                                                 shape, nnz);
+
+  // Scale every value of `a` by 5 in place.
+  // scale(5.f, a);
+
+  std::vector<value_t> x(n, 1);
+  std::vector<value_t> y(m, 0);
+
+  value_t* d_x = nullptr;
+  value_t* d_y = nullptr;
+
+  CUDA_CHECK(cudaMalloc(&d_x, x.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMalloc(&d_y, y.size() * sizeof(value_t)));
+
+  CUDA_CHECK(
+      cudaMemcpy(d_x, x.data(), x.size() * sizeof(value_t), cudaMemcpyDefault));
+  CUDA_CHECK(
+      cudaMemcpy(d_y, y.data(), y.size() * sizeof(value_t), cudaMemcpyDefault));
+
+  std::span<value_t> x_span(d_x, n);
+  std::span<value_t> y_span(d_y, m);
+
+  {
+    // y = A * x
+    // The state lives in its own scope so that it is destroyed before the
+    // device buffers it was used with are released below.
+    spblas::spmv_state_t state;
+    spblas::multiply(state, a, x_span, y_span);
+
+    CUDA_CHECK(cudaMemcpy(y.data(), d_y, y.size() * sizeof(value_t),
+                          cudaMemcpyDefault));
+  }
+
+  // Release every device buffer allocated above.
+  CUDA_CHECK(cudaFree(d_y));
+  CUDA_CHECK(cudaFree(d_x));
+  CUDA_CHECK(cudaFree(d_colind));
+  CUDA_CHECK(cudaFree(d_rowptr));
+  CUDA_CHECK(cudaFree(d_values));
+
+  fmt::print("\tExample is completed!\n");
+
+  return 0;
+}
diff --git a/examples/cusparse/util.hpp b/examples/cusparse/util.hpp
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+// CUDA_CHECK(expression)
+//
+// Evaluates `expression` (which must yield a cudaError_t) exactly once.  If
+// the result is not cudaSuccess, prints the numeric code, the CUDA error
+// string and the source location to std::cerr.  Execution continues after the
+// report; the macro neither throws nor aborts.
+//
+// The argument is parenthesized and the temporary uses a macro-private name
+// so that the expansion cannot capture a caller variable called `status`.
+#define CUDA_CHECK(expression)                                                 \
+  do {                                                                         \
+    const cudaError_t cuda_check_status_ = (expression);                       \
+    if (cuda_check_status_ != cudaSuccess) {                                   \
+      std::cerr << "CUDA error " << cuda_check_status_ << ": "                 \
+                << cudaGetErrorString(cuda_check_status_) << " at "            \
+                << __FILE__ << ":" << __LINE__ << std::endl;                   \
+    }                                                                          \
+  } while (false)
diff --git a/examples/device/CMakeLists.txt b/examples/device/CMakeLists.txt
@@ -1,12 +1,12 @@
 function(add_device_example example_name)
+  # Create the executable first: the cuSPARSE branch below calls
+  # target_link_libraries(), which requires the target to exist already.
+  add_executable(${example_name} ${example_name}.cpp)
   if (ENABLE_ROCSPARSE)
+    # Compile the example source as HIP for the rocSPARSE backend.
     set_source_files_properties(${example_name}.cpp PROPERTIES LANGUAGE HIP)
-  # elseif (ENABLE_CUSPARSE)
-    # cuSPARSE linking details will go here.
+  elseif (ENABLE_CUSPARSE)
+    # `Thrust` is the target created by thrust_create_target(Thrust) in the
+    # top-level CMakeLists.txt.
+    target_link_libraries(${example_name} Thrust)
   else()
     message(FATAL_ERROR "Device backend not found.")
   endif()
-  add_executable(${example_name} ${example_name}.cpp)
   target_link_libraries(${example_name} spblas fmt)
 endfunction()
 
diff --git a/include/spblas/backend/backend.hpp b/include/spblas/backend/backend.hpp
@@ -21,3 +21,7 @@
 #ifdef SPBLAS_ENABLE_ROCSPARSE
 #include <spblas/vendor/rocsparse/rocsparse.hpp>
 #endif
+
+#ifdef SPBLAS_ENABLE_CUSPARSE
+#include <spblas/vendor/cusparse/cusparse.hpp>
+#endif
diff --git a/include/spblas/detail/types.hpp b/include/spblas/detail/types.hpp
@@ -19,6 +19,10 @@
 #include <spblas/vendor/rocsparse/types.hpp>
 #endif
 
+#ifdef SPBLAS_ENABLE_CUSPARSE
+#include <spblas/vendor/cusparse/types.hpp>
+#endif
+
 namespace spblas {
 
 #ifndef SPBLAS_VENDOR_BACKEND
diff --git a/include/spblas/spblas.hpp b/include/spblas/spblas.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
 #if defined(SPBLAS_ENABLE_ONEMKL_SYCL) || defined(SPBLAS_ENABLE_ARMPL) ||      \
-    defined(SPBLAS_ENABLE_AOCLSPARSE) || defined(SPBLAS_ENABLE_ROCSPARSE)
-
+    defined(SPBLAS_ENABLE_AOCLSPARSE) || defined(SPBLAS_ENABLE_ROCSPARSE) ||   \
+    defined(SPBLAS_ENABLE_CUSPARSE)
 #define SPBLAS_VENDOR_BACKEND true
 #endif
 
diff --git a/include/spblas/vendor/cusparse/cuda_allocator.hpp b/include/spblas/vendor/cusparse/cuda_allocator.hpp
@@ -0,0 +1,71 @@
+#pragma once
+
+#include "exception.hpp"
+#include <cuda_runtime.h>
+
+namespace spblas {
+
+namespace cusparse {
+
+// Allocator that obtains memory with cudaMallocAsync and releases it with
+// cudaFreeAsync on the CUDA stream stored in the allocator (the default
+// stream, nullptr, unless one is supplied).
+//
+// Two allocators compare equal only when they hold the same stream.  The
+// `Alignment` parameter is carried through `rebind` but is not otherwise used
+// by this class.
+template <typename T, std::size_t Alignment = 0>
+class cuda_allocator {
+public:
+  using value_type = T;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using reference = T&;
+  using const_reference = const T&;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+
+  cuda_allocator() noexcept = default;
+  cuda_allocator(cudaStream_t stream) noexcept : stream_(stream) {}
+
+  // Rebinding copy: the new allocator shares the source allocator's stream.
+  template <typename U>
+  cuda_allocator(const cuda_allocator<U, Alignment>& other) noexcept
+      : stream_(other.stream()) {}
+
+  cuda_allocator(const cuda_allocator&) = default;
+  cuda_allocator& operator=(const cuda_allocator&) = default;
+  ~cuda_allocator() = default;
+
+  using is_always_equal = std::false_type;
+
+  // Allocates storage for `size` objects of type T on this allocator's stream.
+  //
+  // Throws std::bad_array_new_length if `size * sizeof(T)` would overflow
+  // std::size_t, and std::bad_alloc if cudaMallocAsync reports an error.
+  pointer allocate(std::size_t size) {
+    if (size > static_cast<std::size_t>(-1) / sizeof(T)) {
+      throw std::bad_array_new_length{};
+    }
+
+    void* ptr = nullptr;
+    throw_if_failure(cudaMallocAsync(&ptr, size * sizeof(T), stream()));
+
+    return static_cast<T*>(ptr);
+  }
+
+  // Releases storage previously returned by allocate() on this allocator's
+  // stream.  Null pointers are ignored; the element count is unused.
+  //
+  // Allocator deallocate() must not throw (it is called from container
+  // destructors), so a failure reported by cudaFreeAsync is discarded here
+  // rather than being turned into an exception.
+  void deallocate(pointer ptr, std::size_t /*n*/ = 0) noexcept {
+    if (ptr != nullptr) {
+      static_cast<void>(cudaFreeAsync(ptr, stream()));
+    }
+  }
+
+  bool operator==(const cuda_allocator&) const = default;
+  bool operator!=(const cuda_allocator&) const = default;
+
+  template <typename U>
+  struct rebind {
+    using other = cuda_allocator<U, Alignment>;
+  };
+
+  // Stream on which allocations and deallocations are ordered.
+  cudaStream_t stream() const noexcept {
+    return this->stream_;
+  }
+
+private:
+  // Maps any CUDA failure during allocation to std::bad_alloc.
+  static void throw_if_failure(cudaError_t error) {
+    if (error != cudaSuccess) {
+      throw std::bad_alloc{};
+    }
+  }
+
+  cudaStream_t stream_ = nullptr;
+};
+
+} // namespace cusparse
+
+} // namespace spblas
diff --git a/include/spblas/vendor/cusparse/cusparse.hpp b/include/spblas/vendor/cusparse/cusparse.hpp
@@ -0,0 +1,3 @@
+#pragma once
+
+#include "multiply.hpp"
diff --git a/include/spblas/vendor/cusparse/exception.hpp b/include/spblas/vendor/cusparse/exception.hpp
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cusparse.h>
+#include <stdexcept>
+#include <string>
+
+namespace spblas {
+
+namespace __cusparse {
+
+// Throw a std::runtime_error if `error_code` is not cudaSuccess.
+//
+// The message is `prefix` followed by the CUDA error name and the CUDA error
+// string.  Declared `inline` because it is defined in a header: without it,
+// including this header from more than one translation unit produces
+// multiple-definition link errors.
+inline void throw_if_error(cudaError_t error_code, std::string prefix = "") {
+  if (error_code == cudaSuccess) {
+    return;
+  }
+  std::string name = cudaGetErrorName(error_code);
+  std::string message = cudaGetErrorString(error_code);
+  throw std::runtime_error(prefix + "CUDA encountered an error " + name +
+                           ": \"" + message + "\"");
+}
+
+// Throw a std::runtime_error if `error_code` is not CUSPARSE_STATUS_SUCCESS.
+//
+// The message names the status enumerator for the codes listed below and
+// reports "unknown error" for any other value.  Declared `inline` because it
+// is defined in a header: without it, including this header from more than
+// one translation unit produces multiple-definition link errors.
+inline void throw_if_error(cusparseStatus_t error_code) {
+  if (error_code == CUSPARSE_STATUS_SUCCESS) {
+    return;
+  }
+
+  const char* name = "unknown error";
+  switch (error_code) {
+  case CUSPARSE_STATUS_NOT_INITIALIZED:
+    name = "CUSPARSE_STATUS_NOT_INITIALIZED";
+    break;
+  case CUSPARSE_STATUS_ALLOC_FAILED:
+    name = "CUSPARSE_STATUS_ALLOC_FAILED";
+    break;
+  case CUSPARSE_STATUS_INVALID_VALUE:
+    name = "CUSPARSE_STATUS_INVALID_VALUE";
+    break;
+  case CUSPARSE_STATUS_ARCH_MISMATCH:
+    name = "CUSPARSE_STATUS_ARCH_MISMATCH";
+    break;
+  case CUSPARSE_STATUS_EXECUTION_FAILED:
+    name = "CUSPARSE_STATUS_EXECUTION_FAILED";
+    break;
+  case CUSPARSE_STATUS_INTERNAL_ERROR:
+    name = "CUSPARSE_STATUS_INTERNAL_ERROR";
+    break;
+  case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+    name = "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    break;
+  case CUSPARSE_STATUS_NOT_SUPPORTED:
+    name = "CUSPARSE_STATUS_NOT_SUPPORTED";
+    break;
+  case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES:
+    name = "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES";
+    break;
+  default:
+    break;
+  }
+
+  throw std::runtime_error(std::string("cuSPARSE encountered an error: \"") +
+                           name + "\"");
+}
+
+} // namespace __cusparse
+
+} // namespace spblas
diff --git a/include/spblas/vendor/cusparse/multiply.hpp b/include/spblas/vendor/cusparse/multiply.hpp
diff --git a/include/spblas/vendor/cusparse/types.hpp b/include/spblas/vendor/cusparse/types.hpp
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
diff --git a/test/gtest/device/spmv_test.cpp b/test/gtest/device/spmv_test.cpp