Skip to content

Commit ccb4145

Browse files
committed
feat: support graph on mlu device and fix fused norm bug.
1 parent b38a15b commit ccb4145

21 files changed

+691
-289
lines changed

xllm/core/common/global_flags.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,14 +90,24 @@ DEFINE_bool(enable_acl_graph,
9090

9191
DEFINE_int32(max_seq_len_for_graph_mode,
9292
0,
93-
"Maximum number of tokens per sequence for ACL graph execution. "
93+
"Maximum number of tokens per sequence for graph execution. "
9494
"If 0, use model max_position_embeddings.");
9595

9696
DEFINE_bool(enable_acl_graph_no_padding,
9797
false,
9898
"Whether to enable ACL graph execution for decode phase without "
9999
"padding. If true, graph will be captured with every actual num "
100100
"tokens, as stride is 1.");
101+
102+
DEFINE_bool(enable_graph,
103+
false,
104+
"Whether to enable graph execution for decode phase.");
105+
106+
DEFINE_bool(enable_graph_no_padding,
107+
false,
108+
"Whether to enable graph execution for decode phase without "
109+
"padding. If true, graph will be captured with every actual num "
110+
"tokens, as stride is 1.");
101111
// --- vlm config ---
102112

103113
DEFINE_int32(limit_image_per_prompt,

xllm/core/common/global_flags.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,10 @@ DECLARE_int32(max_seq_len_for_graph_mode);
8989

9090
DECLARE_bool(enable_acl_graph_no_padding);
9191

92+
DECLARE_bool(enable_graph);
93+
94+
DECLARE_bool(enable_graph_no_padding);
95+
9296
DECLARE_bool(enable_chunked_prefill);
9397

9498
DECLARE_string(master_node_addr);

xllm/core/framework/model/causal_lm.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ class CausalLM : public torch::nn::Module {
6969
virtual void set_lm_head(layer::LmHead& head) = 0;
7070
virtual layer::WordEmbedding get_word_embedding() = 0;
7171
virtual void set_word_embedding(layer::WordEmbedding& embedding) = 0;
72+
virtual void skip_mrope() {}
73+
virtual void apply_mrope(const torch::Tensor positions,
74+
torch::Tensor& cos_pos,
75+
torch::Tensor& sin_pos) {}
7276
};
7377

7478
template <typename Model>
@@ -118,6 +122,20 @@ class CausalLMImpl : public CausalLM {
118122

119123
const torch::TensorOptions& options() const override { return options_; }
120124

125+
void skip_mrope() override {
126+
#if defined(USE_MLU)
127+
model_->skip_mrope();
128+
#endif
129+
}
130+
131+
void apply_mrope(const torch::Tensor positions,
132+
torch::Tensor& cos_pos,
133+
torch::Tensor& sin_pos) override {
134+
#if defined(USE_MLU)
135+
model_->apply_mrope(positions, cos_pos, sin_pos);
136+
#endif
137+
}
138+
121139
private:
122140
Model model_;
123141

xllm/core/layers/common/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ cc_library(
1616
qwen3_moe_decoder_layer.h
1717
linear.h
1818
word_embedding_impl.h
19-
layer_utils.h
2019
indexer.h
2120
deep_ep.h
2221
SRCS
@@ -32,7 +31,6 @@ cc_library(
3231
qwen3_moe_decoder_layer.cpp
3332
linear.cpp
3433
word_embedding_impl.cpp
35-
layer_utils.cpp
3634
indexer.cpp
3735
deep_ep.cpp
3836
DEPS

xllm/core/layers/common/layer_utils.cpp

Lines changed: 0 additions & 39 deletions
This file was deleted.

xllm/core/layers/common/layer_utils.h

Lines changed: 0 additions & 28 deletions
This file was deleted.

xllm/core/layers/common/qwen2_decoder_layer.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ limitations under the License.
1717

1818
#include <glog/logging.h>
1919

20-
#include "layer_utils.h"
21-
2220
namespace xllm {
2321
namespace layer {
2422

xllm/core/layers/common/rms_norm.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>> RMSNormImpl::forward(
5555
if (residual.has_value()) {
5656
residual.value() = residual.value().reshape({-1, norm_dim_});
5757
if (Device::type_str() == "mlu") {
58-
residual_out = torch::empty_like(residual.value());
58+
residual_out = residual.value();
5959
}
6060
}
6161

@@ -67,6 +67,7 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>> RMSNormImpl::forward(
6767
fused_layernorm_params.weight = weight_;
6868
fused_layernorm_params.eps = eps_;
6969
fused_layernorm_params.mode = mode_;
70+
fused_layernorm_params.store_output_before_norm = residual_out.has_value();
7071
if (bias_.defined()) {
7172
fused_layernorm_params.beta = bias_;
7273
}

xllm/core/runtime/CMakeLists.txt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ include(cc_binary)
33
include(cc_test)
44

55
cc_library(
6-
NAME
6+
NAME
77
runtime
88
HDRS
99
options.h
@@ -15,6 +15,7 @@ cc_library(
1515
base_executor_impl.h
1616
dit_executor.h
1717
$<$<BOOL:${USE_NPU}>:acl_graph_executor_impl.h>
18+
$<$<BOOL:${USE_MLU}>:mlu_graph_executor_impl.h>
1819
worker.h
1920
worker_impl.h
2021
llm_worker_impl.h
@@ -36,6 +37,7 @@ cc_library(
3637
base_executor_impl.cpp
3738
dit_executor.cpp
3839
$<$<BOOL:${USE_NPU}>:acl_graph_executor_impl.cpp>
40+
$<$<BOOL:${USE_MLU}>:mlu_graph_executor_impl.cpp>
3941
worker.cpp
4042
worker_impl.cpp
4143
llm_worker_impl.cpp
@@ -82,7 +84,7 @@ cc_library(
8284
)
8385

8486
cc_library(
85-
NAME
87+
NAME
8688
master
8789
HDRS
8890
llm_master.h
@@ -137,8 +139,9 @@ target_link_libraries(acl_graph_executor_test
137139
)
138140
# Use --whole-archive for spdlog to ensure all symbols are available for xllm_kernels
139141
# This resolves the undefined reference issues in xllm_kernels
140-
target_link_options(acl_graph_executor_test PRIVATE
141-
"-Wl,--whole-archive"
142+
target_link_options(acl_graph_executor_test PRIVATE
143+
"-Wl,--whole-archive"
142144
"${CMAKE_BINARY_DIR}/third_party/spdlog/libspdlog.a"
143145
"-Wl,--no-whole-archive")
144146
endif()
147+

xllm/core/runtime/base_executor_impl.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ torch::Tensor BaseExecutorImpl::run(const torch::Tensor& tokens,
3535
const torch::Tensor& positions,
3636
std::vector<KVCache>& kv_caches,
3737
const ModelInputParams& params) {
38+
COUNTER_INC(num_model_execution_total_eager);
3839
return model_->forward(tokens, positions, kv_caches, params);
3940
}
4041

0 commit comments

Comments (0)