amd
diff --git a/‎applications/llama_3.2_1b/configs/llama32_1b.json‎
Lines changed: 1 addition & 1 deletion b/‎applications/llama_3.2_1b/configs/llama32_1b.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎applications/llama_3.2_1b/src/block/feed_forward.py‎
Lines changed: 1 addition & 1 deletion b/‎applications/llama_3.2_1b/src/block/feed_forward.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎applications/llama_3.2_1b/src/operator/aie_base.py‎
Lines changed: 3 additions & 0 deletions b/‎applications/llama_3.2_1b/src/operator/aie_base.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎applications/llama_3.2_1b/src/operator/aie_elementwise_mul.py‎
Lines changed: 52 additions & 52 deletions b/‎applications/llama_3.2_1b/src/operator/aie_elementwise_mul.py‎
Lines changed: 52 additions & 52 deletions
@@ -22,7 +22,7 @@
     "use_aie_norm2": true,
     "use_aie_residual": true,
     "use_aie_regular_mha": false,
-    "use_aie_fused_mha": false,
+    "use_aie_fused_mha": true,
     "use_aie_final_gemm": false,
     "rope_freq": {
       "factor": 32.0,
 
@@ -6,7 +6,6 @@
 # SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import logging
 import torch
 import torch.nn as nn
 from ..utils import torch_to_numpy, assign
@@ -129,6 +128,7 @@ def forward(self, x):
             or (len(x.shape) == 3 and x.shape[0] == 1 and x.shape[1] == 1)
         )
 
+        is_prefill = not is_vector or not self.cfg["use_kv_cache"]
         is_decode_with_kv = is_vector and self.cfg["use_kv_cache"]
 
         if self.cfg["use_aie_ffn_swiglu"]:
 
@@ -60,6 +60,9 @@ def prepare_runtime(cls):
             cls.static_data_pool[buffer_data] = bo
 
         for op in cls.registered_operators:
+            if len(op.kernels) == 0:
+                # Operator is likely used as a sub-operator in another operator and does not need any setup.
+                continue
             logging.info(f"Preparing runtime for AIE operator: {op.__class__.__name__}")
 
             # Set up for each kernel
 
@@ -18,52 +18,18 @@
 from pathlib import Path
 
 
-def get_elementwise_mul_artifacts(
-    base_dir,
-    device_type,
-    size,
-    tile_size=2048,
-    num_columns=4,
-    num_channels=2,
-    prefix="eltwise_mul_",
-):
-    file_name_base = f"{prefix}{num_columns}c_{num_channels}ch_{size}_{tile_size}t"
-
-    mlir_artifact = PythonGeneratedMLIRArtifact.new(
-        f"{file_name_base}.mlir",
-        import_path=base_dir / "example" / "elementwise_mul" / "eltwise_mul.py",
-        callback_fn="my_eltwise_mul",
-        callback_args=[
-            device_type,
-            size,
-            num_columns,
-            num_channels,
-            tile_size,
-            0,
-        ],
-    )
-
-    xclbin_artifact = XclbinArtifact.new(
-        f"{file_name_base}.xclbin",
-        depends=[
-            mlir_artifact,
-            KernelObjectArtifact.new(
-                f"mul.o", depends=[SourceArtifact.new("aie_kernels/generic/mul.cc")]
-            ),
-        ],
-    )
-
-    insts_artifact = InstsBinArtifact.new(
-        f"{file_name_base}.bin", depends=[mlir_artifact]
-    )
-
-    return xclbin_artifact, insts_artifact
-
-
 class AIEElementwiseMul(AIEOperatorBase):
     """AIE-accelerated element-wise multiplication"""
 
-    def __init__(self, size, num_columns=None, num_channels=None, tile_size=None):
+    def __init__(
+        self,
+        size,
+        num_columns=None,
+        num_channels=None,
+        tile_size=None,
+        trace_size=0,
+        do_set_up=True,
+    ):
         self.size = size
 
         # Enforce ShimDMA limits for elementwise_mul (uses 2 inputs per core)
@@ -80,20 +46,54 @@ def __init__(self, size, num_columns=None, num_channels=None, tile_size=None):
         self.num_columns = num_columns
         self.num_channels = num_channels
         self.tile_size = tile_size
+        self.trace_size = trace_size
+        self.do_set_up = do_set_up
 
         AIEOperatorBase.__init__(self)
 
+    def get_artifacts(self, prefix="eltwise_mul_"):
+        file_name_base = f"{prefix}{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t"
+
+        mlir_artifact = PythonGeneratedMLIRArtifact.new(
+            f"{file_name_base}.mlir",
+            import_path=self.base_dir
+            / "example"
+            / "elementwise_mul"
+            / "eltwise_mul.py",
+            callback_fn="my_eltwise_mul",
+            callback_args=[
+                self.device_manager.device_type,
+                self.size,
+                self.num_columns,
+                self.num_channels,
+                self.tile_size,
+                self.trace_size,
+            ],
+        )
+
+        xclbin_artifact = XclbinArtifact.new(
+            f"{file_name_base}.xclbin",
+            depends=[
+                mlir_artifact,
+                KernelObjectArtifact.new(
+                    f"mul.o", depends=[SourceArtifact.new("aie_kernels/generic/mul.cc")]
+                ),
+            ],
+        )
+
+        insts_artifact = InstsBinArtifact.new(
+            f"{file_name_base}.bin", depends=[mlir_artifact]
+        )
+
+        return xclbin_artifact, insts_artifact
+
     def set_up(self):
+        # If this operator is only used as a sub-operator in another operator that sets it up, we should skip the setup here as those artifacts and buffers may not be needed.
+        if not self.do_set_up:
+            return
+
         # Compilation artifacts
-        xclbin_artifact, insts_artifact = get_elementwise_mul_artifacts(
-            self.base_dir,
-            self.device_manager.device_type,
-            self.size,
-            self.tile_size,
-            self.num_columns,
-            self.num_channels,
-            prefix="",
-        )
+        xclbin_artifact, insts_artifact = self.get_artifacts()
 
         # Override device_type in the mlir_artifact's callback_args if needed
         mlir_artifact = xclbin_artifact.depends[0]