Skip to content

Commit 912e8cb

Browse files
committed
feat: support executor_impl_factory, dp_is_decode and remove mrope interface.
1 parent 6a9ac5b commit 912e8cb

29 files changed

Lines changed: 238 additions & 267 deletions

xllm/core/common/global_flags.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -84,21 +84,11 @@ DEFINE_bool(enable_customize_mla_kernel, false, "enable customize mla kernel");
8484

8585
// --- graph mode execution config ---
8686

87-
DEFINE_bool(enable_acl_graph,
88-
false,
89-
"Whether to enable ACL graph execution for decode phase.");
90-
9187
DEFINE_int32(max_seq_len_for_graph_mode,
9288
0,
9389
"Maximum number of tokens per sequence for graph execution. "
9490
"If 0, use model max_position_embeddings.");
9591

96-
DEFINE_bool(enable_acl_graph_no_padding,
97-
false,
98-
"Whether to enable ACL graph execution for decode phase without "
99-
"padding. If true, graph will be captured with every actual num "
100-
"tokens, as stride is 1.");
101-
10292
DEFINE_bool(enable_graph,
10393
false,
10494
"Whether to enable graph execution for decode phase.");

xllm/core/common/global_flags.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,8 @@ DECLARE_string(rank_tablefile);
8383

8484
DECLARE_bool(enable_mla);
8585

86-
DECLARE_bool(enable_acl_graph);
87-
8886
DECLARE_int32(max_seq_len_for_graph_mode);
8987

90-
DECLARE_bool(enable_acl_graph_no_padding);
91-
9288
DECLARE_bool(enable_graph);
9389

9490
DECLARE_bool(enable_graph_no_padding);

xllm/core/distributed_runtime/llm_engine.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -858,8 +858,8 @@ std::vector<RawForwardInput> LLMEngine::prepare_inputs(
858858
std::vector<RawForwardInput> batched_inputs;
859859
batched_inputs.reserve(dp_size_);
860860
// some dp related variables
861-
std::vector<int32_t> dp_global_token_nums;
862-
dp_global_token_nums.resize(dp_size_);
861+
std::vector<int32_t> dp_global_token_nums(dp_size_);
862+
std::vector<int32_t> dp_is_decode(dp_size_, 0);
863863
bool global_empty_kv_cache = true;
864864
// when enable dp, we need to check the forward type of each batch
865865
// and set the empty forward type of each batch to the same value as the first
@@ -878,6 +878,8 @@ std::vector<RawForwardInput> LLMEngine::prepare_inputs(
878878
!batched_inputs[dp_rank].batch_forward_type.is_empty()) {
879879
batch_forward_type = batched_inputs[dp_rank].batch_forward_type;
880880
}
881+
dp_is_decode[dp_rank] = batch_forward_type.is_decode() &&
882+
batched_inputs[dp_rank].q_max_seq_len == 1;
881883
}
882884

883885
// eplb related
@@ -889,6 +891,7 @@ std::vector<RawForwardInput> LLMEngine::prepare_inputs(
889891
// update dp_global_token_nums and global_empty_kv_cache
890892
for (auto dp_rank = 0; dp_rank < dp_size_; ++dp_rank) {
891893
batched_inputs[dp_rank].dp_global_token_nums = dp_global_token_nums;
894+
batched_inputs[dp_rank].dp_is_decode = dp_is_decode;
892895
batched_inputs[dp_rank].global_empty_kv_cache = global_empty_kv_cache;
893896
if (FLAGS_enable_eplb) {
894897
batched_inputs[dp_rank].eplb_info = eplb_info;

xllm/core/distributed_runtime/vlm_engine.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -434,8 +434,8 @@ std::vector<RawForwardInput> VLMEngine::prepare_inputs(
434434
std::vector<RawForwardInput> batched_inputs;
435435
batched_inputs.reserve(dp_size_);
436436
// some dp related variables
437-
std::vector<int32_t> dp_global_token_nums;
438-
dp_global_token_nums.resize(dp_size_);
437+
std::vector<int32_t> dp_global_token_nums(dp_size_);
438+
std::vector<int32_t> dp_is_decode(dp_size_, 0);
439439
bool global_empty_kv_cache = true;
440440

441441
for (auto dp_rank = 0; dp_rank < dp_size_; ++dp_rank) {
@@ -445,12 +445,16 @@ std::vector<RawForwardInput> VLMEngine::prepare_inputs(
445445
batched_inputs[dp_rank].flatten_tokens_vec.size();
446446
global_empty_kv_cache =
447447
batched_inputs[dp_rank].empty_kv_cache && global_empty_kv_cache;
448+
dp_is_decode[dp_rank] =
449+
batched_inputs[dp_rank].batch_forward_type.is_decode() &&
450+
batched_inputs[dp_rank].q_max_seq_len == 1;
448451
}
449452

450453
// update dp_global_token_nums and global_empty_kv_cache
451454
for (auto dp_rank = 0; dp_rank < dp_size_; ++dp_rank) {
452455
batched_inputs[dp_rank].dp_global_token_nums = dp_global_token_nums;
453456
batched_inputs[dp_rank].global_empty_kv_cache = global_empty_kv_cache;
457+
batched_inputs[dp_rank].dp_is_decode = std::move(dp_is_decode);
454458
}
455459

456460
return batched_inputs;

xllm/core/framework/model/causal_lm.h

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,6 @@ class CausalLM : public torch::nn::Module {
119119
LOG(FATAL) << "Method 'set_word_embedding' is not implemented/supported by "
120120
"this model.";
121121
}
122-
virtual void skip_mrope() {}
123-
virtual void apply_mrope(const torch::Tensor positions,
124-
torch::Tensor& cos_pos,
125-
torch::Tensor& sin_pos) {}
126122
};
127123

128124
template <typename Model>
@@ -192,20 +188,6 @@ class CausalLMImpl : public CausalLM {
192188

193189
const torch::TensorOptions& options() const override { return options_; }
194190

195-
void skip_mrope() override {
196-
#if defined(USE_MLU)
197-
model_->skip_mrope();
198-
#endif
199-
}
200-
201-
void apply_mrope(const torch::Tensor positions,
202-
torch::Tensor& cos_pos,
203-
torch::Tensor& sin_pos) override {
204-
#if defined(USE_MLU)
205-
model_->apply_mrope(positions, cos_pos, sin_pos);
206-
#endif
207-
}
208-
209191
private:
210192
Model model_;
211193

xllm/core/framework/model/model_input_params.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ struct ModelInputParams {
127127

128128
params.mm_data = mm_data.to(device);
129129
params.dp_global_token_nums = dp_global_token_nums;
130+
params.dp_is_decode = dp_is_decode;
130131
params.embedding_ids = std::move(embedding_ids);
131132
params.extra_token_ids = std::move(extra_token_ids);
132133
params.dp_ep_padding_data = dp_ep_padding_data;
@@ -178,7 +179,7 @@ struct ModelInputParams {
178179
print_tensor(new_cache_slots, "ModelInputParams: new_cache_slots", 4);
179180
print_tensor(block_tables, "ModelInputParams: block_tables", 4);
180181
LOG(INFO) << "ModelInputParams: dp_global_token_nums is "
181-
<< dp_global_token_nums;
182+
<< dp_global_token_nums << ", dp_is_decode: " << dp_is_decode;
182183
}
183184

184185
int32_t get_q_seq_len(int32_t seq_idx) const {
@@ -240,6 +241,7 @@ struct ModelInputParams {
240241

241242
// num tokens of all workers,mainly used for dp case
242243
std::vector<int32_t> dp_global_token_nums;
244+
std::vector<int32_t> dp_is_decode;
243245
// whether the kv-cache is empty for all sequences,mainly used for dp case
244246
bool global_empty_kv_cache = true;
245247

xllm/core/layers/npu/npu_base_layer.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,11 @@ atb::Status BaseLayer::execute_node(atb_speed::Model::Node& node,
7878
// However, libtorch_npu current stream is set to default stream after
7979
// capture ends, causing inconsistency between ATB context and the actual
8080
// execution stream
81-
if (FLAGS_enable_acl_graph) {
81+
if (FLAGS_enable_graph) {
8282
void* stream = c10_npu::getCurrentNPUStream(device_.index()).stream();
8383
context_->SetExecuteStream(stream);
8484
}
85-
// if (FLAGS_enable_acl_graph && !graph_captured_) {
85+
// if (FLAGS_enable_graph && !graph_captured_) {
8686
// void* stream = c10_npu::getCurrentNPUStream(device_.index()).stream();
8787
// aclmdlRICaptureStatus status;
8888
// aclmdlRI modelRI;

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ void Glm4MoeDecoderImpl::initialize_basic_parameters(
121121

122122
// not support MTP model yet
123123
param.enableAclGraph =
124-
FLAGS_enable_acl_graph && !is_prefill && args.n_layers() > 1;
124+
FLAGS_enable_graph && !is_prefill && args.n_layers() > 1;
125125

126126
param.moeLinearTransposeType = (layer_id_ < args.first_k_dense_replace())
127127
? std::vector<int>{-1, -1, -1, -1}
@@ -460,7 +460,7 @@ void Glm4MoeDecoderImpl::build_node_variant_pack(
460460
node.variantPack.inTensors.at(input_idx++) =
461461
atb_speed::Utils::AtTensor2Tensor(tensor_placeholder_);
462462

463-
if (FLAGS_enable_acl_graph && !is_prefill &&
463+
if (FLAGS_enable_graph && !is_prefill &&
464464
input_params.graph_buffer.tiling_data.defined()) {
465465
node.variantPack.inTensors.at(input_idx++) =
466466
atb_speed::Utils::AtTensor2Tensor(

xllm/core/runtime/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ include(cc_library)
22
include(cc_binary)
33
include(cc_test)
44

5+
56
cc_library(
67
NAME
78
runtime
@@ -12,6 +13,7 @@ cc_library(
1213
params_utils.h
1314
executor.h
1415
executor_impl.h
16+
executor_impl_factory.h
1517
base_executor_impl.h
1618
dit_executor.h
1719
$<$<BOOL:${USE_NPU}>:acl_graph_executor_impl.h>
@@ -29,6 +31,7 @@ cc_library(
2931
forward_shared_memory_manager.h
3032
SRCS
3133
executor.cpp
34+
executor_impl_factory.cpp
3235
base_executor_impl.cpp
3336
dit_executor.cpp
3437
$<$<BOOL:${USE_NPU}>:acl_graph_executor_impl.cpp>

xllm/core/runtime/acl_graph_executor_impl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@ void AclGraph::print_graph_tensors() const {
849849
// bucket will be [1, 2, 4, 8, 16, 32, 48, 64, ..., max_seqs_per_batch]
850850
uint32_t AclGraphExecutorImpl::get_bucket_num_tokens(
851851
uint32_t num_tokens) const {
852-
if (FLAGS_enable_acl_graph_no_padding) {
852+
if (FLAGS_enable_graph_no_padding) {
853853
return num_tokens;
854854
}
855855
if (num_tokens <= 1) {

0 commit comments

Comments (0)