Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
146 commits
Select commit Hold shift + click to select a range
bedcf4c
first commit
tarang-jain Jul 9, 2025
09f9a22
Merge branch 'branch-25.08' of https://github.com/rapidsai/cuvs into …
tarang-jain Jul 9, 2025
51836d4
index header
tarang-jain Jul 9, 2025
23ef877
populate functions;ivf_list type;kmeans_predict
tarang-jain Jul 11, 2025
a7fce8e
Merge branch 'branch-25.08' of https://github.com/rapidsai/cuvs into …
tarang-jain Jul 11, 2025
6a98a88
hamming_op
tarang-jain Jul 12, 2025
76c9ee5
Merge branch 'branch-25.08' into binary-kmeans
tarang-jain Jul 12, 2025
916a4cf
rm binary_ivf
tarang-jain Jul 12, 2025
8ec4d59
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Jul 12, 2025
1941b2e
modify ivf_flat_build
tarang-jain Jul 12, 2025
cd00b83
rm binary_ivf_flat
tarang-jain Jul 12, 2025
4cffe84
rm unused
tarang-jain Jul 12, 2025
2bc9007
updates
tarang-jain Jul 12, 2025
7803850
quantize
tarang-jain Jul 12, 2025
ff7be4a
cleanup
tarang-jain Jul 12, 2025
3149192
pre-commit
tarang-jain Jul 12, 2025
dd1b0d4
update kmeans_predict
tarang-jain Jul 12, 2025
2b9bef4
src kmeans
tarang-jain Jul 12, 2025
6ec32d8
style
tarang-jain Jul 12, 2025
5c59753
corrections to logic
tarang-jain Jul 14, 2025
2271809
clang
tarang-jain Jul 14, 2025
29388e9
testing
tarang-jain Jul 14, 2025
4a491cd
correct tests
tarang-jain Jul 14, 2025
4c27acd
binary_index flag
tarang-jain Jul 14, 2025
7085405
correct mdspan,reduction_op
tarang-jain Jul 14, 2025
2075bd0
Merge branch 'branch-25.08' into binary-kmeans
tarang-jain Jul 15, 2025
d06b4f5
correct checks
tarang-jain Jul 15, 2025
13de163
Merge branch 'branch-25.08' of https://github.com/rapidsai/cuvs into …
tarang-jain Jul 15, 2025
e79d520
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Jul 15, 2025
c7f3ade
Merge branch 'branch-25.10' of https://github.com/rapidsai/cuvs into …
tarang-jain Jul 31, 2025
e7fdd4c
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Aug 1, 2025
3ce25af
fused-kernel
tarang-jain Aug 1, 2025
1f565ef
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Aug 1, 2025
2c2a91d
cleanup; syntax
tarang-jain Aug 1, 2025
b9a6a65
cleanup
tarang-jain Aug 1, 2025
0f19648
temp-fix-compilation-errors
tarang-jain Aug 1, 2025
2de4bcc
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Aug 5, 2025
f572e0b
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Aug 5, 2025
66d8b94
uint32_t for storing dists
tarang-jain Aug 6, 2025
46f9676
Merge branch 'branch-25.10' of https://github.com/rapidsai/cuvs into …
tarang-jain Aug 6, 2025
fa2193f
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Aug 6, 2025
8f18ce0
no use_norms check
tarang-jain Aug 6, 2025
c7084ac
coarse search
tarang-jain Aug 6, 2025
1ca868b
debug; fix dim() error
tarang-jain Aug 6, 2025
42859fb
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Aug 6, 2025
f0597fa
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Aug 12, 2025
ca97da6
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Aug 25, 2025
8ad530b
Merge branch 'branch-25.10' of https://github.com/rapidsai/cuvs into …
tarang-jain Aug 26, 2025
17449e4
debug
tarang-jain Sep 2, 2025
f8e6e86
Merge branch 'branch-25.10' of https://github.com/rapidsai/cuvs into …
tarang-jain Sep 2, 2025
6aa2401
fix float underflow
tarang-jain Sep 4, 2025
d1717f5
rm new file
tarang-jain Sep 4, 2025
3b284cd
Merge branch 'branch-25.10' of https://github.com/rapidsai/cuvs into …
tarang-jain Sep 4, 2025
f34103d
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Sep 5, 2025
86a6593
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Sep 11, 2025
ef37c59
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Sep 15, 2025
5e794fe
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Sep 16, 2025
359d458
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Sep 16, 2025
d999591
Merge branch 'branch-25.10' into binary-kmeans
tarang-jain Sep 17, 2025
f242561
bug fixes
tarang-jain Sep 17, 2025
5517ae3
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Sep 17, 2025
4aa2881
Merge branch 'branch-25.12' into binary-kmeans
tarang-jain Sep 25, 2025
b0d78b6
Merge branch 'branch-25.12' into binary-kmeans
tarang-jain Sep 26, 2025
184f025
Merge branch 'branch-25.12' into binary-kmeans
tarang-jain Sep 29, 2025
113e242
Merge branch 'branch-25.12' into binary-kmeans
tarang-jain Oct 6, 2025
540db62
correct fused instantiation
tarang-jain Oct 8, 2025
d9eed14
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Oct 8, 2025
7ceed82
Merge branch 'branch-25.12' into binary-kmeans
tarang-jain Oct 9, 2025
ac261be
Merge branch 'branch-25.12' into binary-kmeans
tarang-jain Oct 10, 2025
b66b8ee
cleanup
tarang-jain Oct 13, 2025
4a19ec1
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Oct 13, 2025
80d5f9c
Merge branch 'branch-25.12' of https://github.com/rapidsai/cuvs into …
tarang-jain Oct 13, 2025
d2b1184
Merge branch 'branch-25.12' of https://github.com/rapidsai/cuvs into …
tarang-jain Oct 15, 2025
ab1083e
Merge branch 'branch-25.12' into binary-kmeans
tarang-jain Oct 16, 2025
9a26624
thorough equivalence testing checkpoint
tarang-jain Oct 16, 2025
8a2098f
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Oct 16, 2025
d4aae95
cleanup-1
tarang-jain Oct 16, 2025
f5c61dc
cleanup-2
tarang-jain Oct 16, 2025
54ae1a2
cleanup-3
tarang-jain Oct 16, 2025
76fc1f4
cleanup-4 (style)
tarang-jain Oct 16, 2025
a8a3a0a
cleanup-6 tests
tarang-jain Oct 17, 2025
05363f5
cleanup-7 (tests)
tarang-jain Oct 17, 2025
de3bcc1
cleanup-8 tests
tarang-jain Oct 17, 2025
a02e563
cleanup-8 (tests)
tarang-jain Oct 17, 2025
b6181a0
cleanup-8 (tests)
tarang-jain Oct 17, 2025
5f640c1
use raft ci artifacts from pr
tarang-jain Oct 17, 2025
cc0fa18
wheels artifacts from raft pr
tarang-jain Oct 17, 2025
de606fa
style
tarang-jain Oct 17, 2025
385d8c7
shell style check
tarang-jain Oct 17, 2025
ae49732
Remove RAFT Dask channel from conda packages script
tarang-jain Oct 17, 2025
566b247
fix visible sections in simt_kernel
tarang-jain Oct 17, 2025
a24d13b
Merge branch 'branch-25.12' of https://github.com/rapidsai/cuvs into …
tarang-jain Oct 17, 2025
b4ea915
raft artifact for python build
tarang-jain Oct 17, 2025
34d2ec5
Merge branch 'branch-25.12' of https://github.com/rapidsai/cuvs into …
tarang-jain Oct 17, 2025
55b13d8
add skip flags
tarang-jain Oct 20, 2025
f44a7d8
Merge branch 'branch-25.12' of https://github.com/rapidsai/cuvs into …
tarang-jain Oct 20, 2025
942dc96
Merge branch 'main' into binary-kmeans
tarang-jain Oct 22, 2025
8d99bc7
set binary_index_ flag explicitly
tarang-jain Oct 22, 2025
5153555
Merge branch 'main' into binary-kmeans
tarang-jain Oct 22, 2025
ca41dc8
Merge branch 'main' of https://github.com/rapidsai/cuvs into binary-k…
tarang-jain Oct 25, 2025
182dadf
correct recall threshold
tarang-jain Oct 25, 2025
99eea8a
merge upstream
tarang-jain Oct 28, 2025
5220772
style and correct tests
tarang-jain Oct 28, 2025
5fc6f4b
rm extra test
tarang-jain Oct 28, 2025
37378d7
Merge branch 'main' into binary-kmeans
tarang-jain Oct 28, 2025
b948a33
rm test
tarang-jain Oct 28, 2025
c5e07b1
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Oct 28, 2025
cbf83b8
Merge branch 'main' into binary-kmeans
tarang-jain Nov 4, 2025
c14f26f
Merge branch 'main' into binary-kmeans
tarang-jain Nov 7, 2025
b3eac6f
Merge branch 'main' into binary-kmeans
tarang-jain Nov 10, 2025
03a01a1
Merge branch 'main' into binary-kmeans
achirkin Nov 12, 2025
7b409c7
on the fly dataset expansion
tarang-jain Nov 26, 2025
080a5ea
Merge branch 'main' into binary-kmeans
tarang-jain Nov 26, 2025
107a3e4
address PR reviews
tarang-jain Nov 26, 2025
1caabeb
Merge branch 'binary-kmeans' of https://github.com/tarang-jain/cuvs i…
tarang-jain Nov 26, 2025
3681320
simplify bitwise_decode_op
tarang-jain Nov 26, 2025
118a3d7
commit suggestion
tarang-jain Nov 26, 2025
3399656
clang
tarang-jain Nov 26, 2025
d4fe3fe
Merge branch 'main' into binary-kmeans
tarang-jain Dec 2, 2025
653364e
merge upstream
tarang-jain Dec 3, 2025
ef34908
address reviews
tarang-jain Dec 3, 2025
d12a211
style
tarang-jain Dec 3, 2025
91c6734
undo ci changes
tarang-jain Dec 3, 2025
42f1bb7
fix kmeans mapping op
tarang-jain Dec 3, 2025
e59a357
other fixes to kmeans for binary data
tarang-jain Dec 3, 2025
d13c0c9
fix compilation errors
tarang-jain Dec 4, 2025
8162293
fix compilation
tarang-jain Dec 4, 2025
fcae247
fix compilation
tarang-jain Dec 4, 2025
a9599bd
doc
tarang-jain Dec 4, 2025
ffbcdcc
Merge branch 'main' of https://github.com/rapidsai/cuvs into binary-k…
tarang-jain Dec 4, 2025
3ffba85
simplify ivf-flat build
tarang-jain Dec 4, 2025
c0a99e2
fix compilation errors
tarang-jain Dec 5, 2025
dbb6423
bug fixes
tarang-jain Dec 5, 2025
248911c
debug
tarang-jain Dec 5, 2025
0656ee6
more corrections to kmeans
tarang-jain Dec 5, 2025
1423356
Merge branch 'main' into binary-kmeans
tarang-jain Dec 19, 2025
a25ddac
Merge branch 'main' into binary-kmeans
tarang-jain Dec 22, 2025
e8a8152
Merge branch 'main' of https://github.com/rapidsai/cuvs into binary-k…
tarang-jain Dec 23, 2025
8feebb8
bug fixes
tarang-jain Dec 23, 2025
997ddde
debug
tarang-jain Dec 24, 2025
89b54a1
working impl;rm debug statements
tarang-jain Dec 24, 2025
07354d1
rm debug prints:
tarang-jain Dec 24, 2025
07e1837
Merge branch 'main' into binary-kmeans
tarang-jain Dec 30, 2025
510bafd
Merge branch 'release/26.02' into binary-kmeans
tarang-jain Jan 20, 2026
0ddce5d
Merge branch 'release/26.02' into binary-kmeans
tarang-jain Jan 23, 2026
d546471
Merge branch 'release/26.02' into binary-kmeans
tarang-jain Jan 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ if(NOT BUILD_CPU_ONLY)
src/distance/detail/pairwise_matrix/dispatch_rbf.cu
src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int64_t.cu
src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int64_t.cu
src/distance/detail/pairwise_matrix/dispatch_bitwise_hamming_uint8_t_uint32_t_uint32_t_int64_t.cu
src/distance/distance.cu
src/distance/pairwise_distance.cu
src/distance/sparse_distance.cu
Expand Down
9 changes: 9 additions & 0 deletions cpp/include/cuvs/cluster/kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,15 @@ struct balanced_params : base_params {
* Number of training iterations
*/
uint32_t n_iters = 20;

/**
* If true, treats uint8_t input data as bit-packed binary data where each byte contains 8 bits.
* Bits are expanded on-the-fly to {-1, +1} floats during training.
* When enabled:
* - Input data dimension represents packed dimension (actual_dim / 8)
* - Output centroids dimension is expanded (packed_dim * 8)
*/
bool is_packed_binary = false;
};

/**
Expand Down
14 changes: 13 additions & 1 deletion cpp/include/cuvs/neighbors/ivf_flat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ struct index : cuvs::neighbors::index {
raft::device_matrix_view<float, uint32_t, raft::row_major> centers() noexcept;
raft::device_matrix_view<const float, uint32_t, raft::row_major> centers() const noexcept;

/** packed k-means cluster centers corresponding to the lists [n_lists, dim] when the
* BitwiseHamming metric is selected */
raft::device_matrix_view<uint8_t, int64_t, raft::row_major> binary_centers() noexcept;
raft::device_matrix_view<const uint8_t, int64_t, raft::row_major> binary_centers() const noexcept;

/**
* (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metric [n_lists].
*
Expand All @@ -229,7 +234,10 @@ struct index : cuvs::neighbors::index {
/** Total length of the index. */
IdxT size() const noexcept;

/** Dimensionality of the data. */
/** Dimensionality of the data.
* @note For binary index, this returns the dimensionality of the byte dataset, which is the
* number of bits / 8.
*/
uint32_t dim() const noexcept;

/** Number of clusters/inverted lists. */
Expand All @@ -255,6 +263,8 @@ struct index : cuvs::neighbors::index {

void check_consistency();

bool binary_index() const noexcept;

private:
/**
* TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum
Expand All @@ -267,7 +277,9 @@ struct index : cuvs::neighbors::index {
std::vector<std::shared_ptr<list_data<T, IdxT>>> lists_;
raft::device_vector<uint32_t, uint32_t> list_sizes_;
raft::device_matrix<float, uint32_t, raft::row_major> centers_;
raft::device_matrix<uint8_t, int64_t, raft::row_major> binary_centers_;
std::optional<raft::device_vector<float, uint32_t>> center_norms_;
bool binary_index_;

// Computed members
raft::device_vector<T*, uint32_t> data_ptrs_;
Expand Down
395 changes: 322 additions & 73 deletions cpp/src/cluster/detail/kmeans_balanced.cuh

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions cpp/src/cluster/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ void fit(const raft::resources& handle,
raft::device_matrix_view<MathT, IndexT> centroids,
MappingOpT mapping_op = raft::identity_op())
{
RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
RAFT_EXPECTS(X.extent(1) == centroids.extent(1) ||
(params.is_packed_binary && X.extent(1) * 8 == centroids.extent(1)),
"Number of features in dataset and centroids are different");
RAFT_EXPECTS(static_cast<uint64_t>(X.extent(0)) * static_cast<uint64_t>(X.extent(1)) <=
static_cast<uint64_t>(std::numeric_limits<IndexT>::max()),
Expand Down Expand Up @@ -279,14 +280,16 @@ void calc_centers_and_sizes(const raft::resources& handle,
raft::device_matrix_view<MathT, IndexT> centroids,
raft::device_vector_view<CounterT, IndexT> cluster_sizes,
bool reset_counters = true,
bool is_packed_binary = false,
MappingOpT mapping_op = raft::identity_op())
{
RAFT_EXPECTS(X.extent(0) == labels.extent(0),
"Number of rows in dataset and labels are different");
RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
"Number of features in dataset and centroids are different");
RAFT_EXPECTS(
is_packed_binary ? X.extent(1) * 8 == centroids.extent(1) : X.extent(1) == centroids.extent(1),
"Number of features in dataset and centroids are different");
RAFT_EXPECTS(centroids.extent(0) == cluster_sizes.extent(0),
"Number of rows in centroids and clusyer_sizes are different");
"Number of rows in centroids and cluster_sizes are different");

cuvs::cluster::kmeans::detail::calc_centers_and_sizes(
handle,
Expand All @@ -298,6 +301,7 @@ void calc_centers_and_sizes(const raft::resources& handle,
X.extent(0),
labels.data_handle(),
reset_counters,
is_packed_binary,
mapping_op,
raft::resource::get_workspace_resource(handle));
}
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/distance/detail/distance_ops/all_ops.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -9,6 +9,7 @@
#include "cutlass.cuh"

// The distance operations:
#include "../distance_ops/bitwise_hamming.cuh"
#include "../distance_ops/canberra.cuh"
#include "../distance_ops/correlation.cuh"
#include "../distance_ops/cosine.cuh"
Expand Down
60 changes: 60 additions & 0 deletions cpp/src/distance/detail/distance_ops/bitwise_hamming.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include <cuda_runtime.h>
#include <stdint.h>

#include <cstddef>      // size_t, used by shared_mem_size()
#include <type_traits>  // std::is_same_v, used by the static_assert in core()

namespace cuvs::distance::detail::ops {

/**
 * @brief the Bitwise Hamming distance matrix calculation
 * It computes the following equation:
 *
 * c_ij = sum_k popcount(x_ik XOR y_kj)
 *
 * where x and y are binary data packed as uint8_t
 */
template <typename DataType, typename AccType, typename IdxType>
struct bitwise_hamming_distance_op {
  using DataT = DataType;
  using AccT  = AccType;
  using IdxT  = IdxType;

  /** Number of (packed) columns; kept for parity with the other distance ops. */
  IdxT k;

  bitwise_hamming_distance_op(IdxT k_) noexcept : k(k_) {}

  // Hamming distance needs no row norms, and the per-element work is a single
  // XOR + popcount, so the cheap inner-loop path is used.
  static constexpr bool use_norms            = false;
  static constexpr bool expensive_inner_loop = false;

  template <typename Policy>
  static constexpr size_t shared_mem_size()
  {
    return Policy::SmemSize;
  }

  __device__ __forceinline__ void core(AccT& acc, DataT& x, DataT& y) const
  {
    static_assert(std::is_same_v<DataT, uint8_t>, "BitwiseHamming only supports uint8_t");
    // `x ^ y` promotes the uint8_t operands to int; mask back to the low byte so
    // __popc only counts the 8 data bits.
    // TODO: one __popc per byte is wasteful; a 4-byte storage type would let a
    // single __popc cover 32 bits (potential follow-up, see discussion in review).
    acc += static_cast<AccT>(__popc(static_cast<uint32_t>(x ^ y) & 0xffu));
  }

  // No norm-based correction is needed for Hamming distance; epilog is a no-op.
  template <typename Policy>
  __device__ __forceinline__ void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
                                         AccT* regxn,
                                         AccT* regyn,
                                         IdxT gridStrideX,
                                         IdxT gridStrideY) const
  {
  }
};

}  // namespace cuvs::distance::detail::ops
30 changes: 23 additions & 7 deletions cpp/src/distance/detail/fused_distance_nn.cuh
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "distance_ops/l2_exp.cuh" // ops::l2_exp_distance_op
#include "fused_distance_nn/cutlass_base.cuh"
#include "fused_distance_nn/fused_bitwise_hamming_nn.cuh"
#include "fused_distance_nn/fused_cosine_nn.cuh"
#include "fused_distance_nn/fused_l2_nn.cuh"
#include "fused_distance_nn/helper_structs.cuh"
Expand Down Expand Up @@ -68,16 +69,31 @@ void fusedDistanceNNImpl(OutT* min,

switch (metric) {
case cuvs::distance::DistanceType::CosineExpanded:
fusedCosineNN<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, stream);
if constexpr (std::is_same_v<DataT, uint8_t> || std::is_same_v<DataT, int8_t>) {
RAFT_FAIL("Cosine distance is not supported for uint8_t/int8_t data types");
} else {
fusedCosineNN<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, stream);
}
break;
case cuvs::distance::DistanceType::L2SqrtExpanded:
case cuvs::distance::DistanceType::L2Expanded:
// initOutBuffer is take care by fusedDistanceNNImpl() so we set it false to fusedL2NNImpl.
fusedL2NNImpl<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, false, stream);
if constexpr (std::is_same_v<DataT, uint8_t> || std::is_same_v<DataT, int8_t>) {
RAFT_FAIL("L2 distance is not supported for uint8_t/int8_t data types");
} else {
fusedL2NNImpl<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, false, stream);
}
break;
default: assert("only cosine/l2 metric is supported with fusedDistanceNN\n"); break;
case cuvs::distance::DistanceType::BitwiseHamming:
if constexpr (std::is_same_v<DataT, uint8_t>) {
fusedBitwiseHammingNN<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, stream);
} else {
RAFT_FAIL("BitwiseHamming distance only supports uint8_t data type");
}
break;
default: RAFT_FAIL("only cosine/l2/bitwise hamming metric is supported with fusedDistanceNN");
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */
#pragma once

#include "../distance_ops/bitwise_hamming.cuh"  // ops::bitwise_hamming_distance_op
#include "../pairwise_distance_base.cuh"        // PairwiseDistances
#include "helper_structs.cuh"
#include "simt_kernel.cuh"

namespace cuvs {
namespace distance {
namespace detail {

/**
 * @brief Fused BitwiseHamming distance and 1-nearest-neighbor
 *
 * For each of the m rows of x, finds the nearest of the n rows of y under the
 * bitwise Hamming distance and reduces the (index, distance) result into `min`.
 *
 * This implementation is only meaningful for uint8_t data type.
 * The if constexpr in fusedDistanceNNImpl ensures it's only called for uint8_t.
 */
template <typename DataT,
          typename OutT,
          typename IdxT,
          typename Policy,
          typename ReduceOpT,
          typename KVPReduceOpT>
void fusedBitwiseHammingNN(OutT* min,
                           const DataT* x,
                           const DataT* y,
                           const DataT* xn,
                           const DataT* yn,
                           IdxT m,
                           IdxT n,
                           IdxT k,
                           int* workspace,
                           ReduceOpT redOp,
                           KVPReduceOpT pairRedOp,
                           bool sqrt,
                           cudaStream_t stream)
{
  using P = Policy;

  // Distances are accumulated as 32-bit popcount sums; the output pairs a row
  // index with such a distance.
  using kv_pair_type     = raft::KeyValuePair<IdxT, uint32_t>;
  using distance_op_type = ops::bitwise_hamming_distance_op<DataT, uint32_t, IdxT>;
  distance_op_type distance_op{k};

  // NOTE(review): this sentinel is the max of DataT (255 for uint8_t) while the
  // accumulator is uint32_t; accumulated distances can exceed 255 once k > 31
  // bytes — confirm the kernel's init value is wide enough for large dimensions.
  constexpr auto maxVal = std::numeric_limits<DataT>::max();

  auto kernel = fusedDistanceNNkernel<DataT,
                                      kv_pair_type,
                                      IdxT,
                                      P,
                                      ReduceOpT,
                                      KVPReduceOpT,
                                      distance_op_type,
                                      raft::identity_op>;

  constexpr size_t shmemSize = P::SmemSize;
  dim3 block_dim(P::Nthreads);
  dim3 grid_dim = launchConfigGenerator<P>(m, n, shmemSize, kernel);

  // Hamming distance uses no row norms, so xn/yn are not forwarded (nullptr).
  kernel<<<grid_dim, block_dim, shmemSize, stream>>>(min,
                                                     x,
                                                     y,
                                                     nullptr,
                                                     nullptr,
                                                     m,
                                                     n,
                                                     k,
                                                     maxVal,
                                                     workspace,
                                                     redOp,
                                                     pairRedOp,
                                                     distance_op,
                                                     raft::identity_op{});

  RAFT_CUDA_TRY(cudaGetLastError());
}

}  // namespace detail
}  // namespace distance
}  // namespace cuvs
25 changes: 19 additions & 6 deletions cpp/src/distance/detail/fused_distance_nn/helper_structs.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand Down Expand Up @@ -27,8 +27,15 @@ namespace detail {
template <typename LabelT, typename DataT>
struct KVPMinReduceImpl {
  typedef raft::KeyValuePair<LabelT, DataT> KVP;
  // Pick the pair with the smaller distance; when the distances are equal,
  // prefer the smaller key (index) so the reduction result is deterministic
  // regardless of the order in which pairs are combined.
  // (The superseded non-tie-breaking one-line overloads were removed; keeping
  // both would be a duplicate-definition error.)
  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b)
  {
    return (b.value < a.value || (b.value == a.value && b.key < a.key)) ? b : a;
  }
  DI KVP operator()(const KVP& a, const KVP& b)
  {
    return (b.value < a.value || (b.value == a.value && b.key < a.key)) ? b : a;
  }

};  // KVPMinReduce

Expand All @@ -38,14 +45,16 @@ struct MinAndDistanceReduceOpImpl {

DI void operator()(LabelT rid, KVP* out, const KVP& other) const
{
if (other.value < out->value) {
// Use index as tiebreaker for consistent behavior when distances are equal
if (other.value < out->value || (other.value == out->value && other.key < out->key)) {
out->key = other.key;
out->value = other.value;
}
}
DI void operator()(LabelT rid, volatile KVP* out, const KVP& other) const
{
if (other.value < out->value) {
// Use index as tiebreaker for consistent behavior when distances are equal
if (other.value < out->value || (other.value == out->value && other.key < out->key)) {
out->key = other.key;
out->value = other.value;
}
Expand Down Expand Up @@ -123,7 +132,11 @@ struct kvp_cg_min_reduce_op {
using AccTypeT = AccType;
using IndexT = Index;
// functor signature.
__host__ __device__ KVP operator()(KVP a, KVP b) const { return a.value < b.value ? a : b; }
// Use index as tiebreaker for consistent behavior when distances are equal
__host__ __device__ KVP operator()(KVP a, KVP b) const
{
return (a.value < b.value || (a.value == b.value && a.key < b.key)) ? a : b;
}

__host__ __device__ AccType operator()(AccType a, AccType b) const { return min(a, b); }

Expand Down
Loading
Loading