Skip to content

Commit c8ed610

Browse files
authored
[NPU] fix llama_infer, cherry-pick #1324 (#1326)
1 parent 4a07d97 commit c8ed610

File tree

2 files changed

+4
-8
lines changed

2 files changed

+4
-8
lines changed

backends/npu/custom_op/llama_infer/atb_ops/fused_blha_layer_op_utils.cc

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -587,16 +587,14 @@ void FusedBlhaGlobalVar::update_out_encoder(const phi::CustomContext &dev_ctx,
587587
for (auto i = 0; i < batch_size; ++i) {
588588
if (seqlens_encoder[i] > 0) {
589589
in_offset += seqlens_encoder[i] * emb_dim;
590+
out_offset = i * emb_dim;
590591
ACL_CHECK(aclrtMemcpyAsync(
591592
out_data + out_offset * sizeof(phi::float16),
592593
emb_dim * sizeof(phi::float16),
593594
in_data + (in_offset - emb_dim) * sizeof(phi::float16),
594595
emb_dim * sizeof(phi::float16),
595596
ACL_MEMCPY_DEVICE_TO_DEVICE,
596597
reinterpret_cast<aclrtStream>(dev_ctx.stream())));
597-
out_offset += emb_dim;
598-
} else if (seqlens_decoder[i] > 0) {
599-
out_offset += emb_dim;
600598
}
601599
}
602600
}
@@ -622,9 +620,8 @@ void FusedBlhaGlobalVar::update_out_decoder(const phi::CustomContext &dev_ctx,
622620

623621
int64_t in_offset = 0, out_offset = 0;
624622
for (auto i = 0; i < batch_size; ++i) {
625-
if (seqlens_encoder[i] > 0) {
626-
out_offset += emb_dim;
627-
} else if (seqlens_decoder[i] > 0) {
623+
if (seqlens_decoder[i] > 0) {
624+
out_offset = i * emb_dim;
628625
ACL_CHECK(
629626
aclrtMemcpyAsync(out_data + out_offset * sizeof(phi::float16),
630627
emb_dim * sizeof(phi::float16),
@@ -633,7 +630,6 @@ void FusedBlhaGlobalVar::update_out_decoder(const phi::CustomContext &dev_ctx,
633630
ACL_MEMCPY_DEVICE_TO_DEVICE,
634631
reinterpret_cast<aclrtStream>(dev_ctx.stream())));
635632
in_offset += emb_dim;
636-
out_offset += emb_dim;
637633
}
638634
}
639635
}

backends/npu/custom_op/llama_infer/atb_ops/remove_padding_op.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ std::vector<paddle::Tensor> RemovePaddingOp(const paddle::Tensor& x,
5858
paddle::experimental::DeviceContextPool::Instance().Get(place));
5959

6060
auto x_shape = x.shape();
61-
const int bsz = x_shape[0];
61+
const int bsz = seqlen.numel();
6262
const int padding_len = x_shape[1];
6363

6464
auto seqlen_host = seqlen.copy_to(paddle::CPUPlace(), true);

0 commit comments

Comments (0)