Skip to content

Commit d48746f

Browse files
authored
Support larger matrices in GEMV and offload last-layer GEMV in llama (#64)
* allow larger matrices for GEMV and separate out input/output tile size * fix tests * address comments
1 parent 331dcca commit d48746f

File tree

12 files changed

+227
-111
lines changed

12 files changed

+227
-111
lines changed

aie_kernels/generic/mv.cc

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,39 @@
1515

1616
#include <aie_api/aie.hpp>
1717

18-
void matvec_scalar(uint32_t m, uint32_t k, uint32_t row_offset, bfloat16 *a, bfloat16 *b, bfloat16 *c)
18+
void matvec_scalar(uint32_t m,
19+
uint32_t k,
20+
const bfloat16 *__restrict a,
21+
const bfloat16 *__restrict b,
22+
bfloat16 *__restrict c)
1923
{
2024
for (uint32_t row = 0; row < m; row++) {
2125
float acc = 0;
2226
for (uint32_t i = 0; i < k; i++) {
2327
acc += a[row * k + i] * b[i];
2428
}
25-
c[row + row_offset * m] = static_cast<bfloat16>(acc);
29+
c[row] = static_cast<bfloat16>(acc);
2630
}
2731
}
2832

33+
/*
34+
Matrix-vector multiplication kernel
35+
36+
- m: Number of output rows == number of rows in the input matrix
37+
- k: Number of columns in the input matrix == length of the input vector
38+
- a: Pointer to the input matrix, stored in row-major order
39+
- b: Pointer to the input vector
40+
- c: Pointer to the output vector
41+
- r: Vector size; data from the matrix and vector will be loaded in and processed in chunks of this size
42+
*/
2943
template <uint32_t r>
3044
void matvec_vectorized(uint32_t m,
3145
uint32_t k,
32-
uint32_t row_offset,
3346
const bfloat16 *__restrict a,
3447
const bfloat16 *__restrict b,
3548
bfloat16 *__restrict c)
3649
{
3750
::aie::set_rounding(aie::rounding_mode::conv_even);
38-
c += row_offset * m;
3951
bfloat16 *c_end = c + m;
4052
const bfloat16 *b_end = b + k;
4153
for (; c < c_end; c++) {
@@ -55,24 +67,30 @@ void matvec_vectorized(uint32_t m,
5567

5668
extern "C" {
5769

70+
/* The row offset parameter in the functions below is a workaround. The output will be written to c + row_offset * m.
71+
* This is simpler than to do pointer arithmetic in the calling MLIR code, but that's all this is for -- an offset into
72+
* `c`. */
73+
5874
void matvec_scalar_bf16_bf16(uint32_t m,
5975
uint32_t k,
6076
uint32_t row_offset,
61-
bfloat16 *a_in,
62-
bfloat16 *b_in,
63-
bfloat16 *c_out)
77+
const bfloat16 *__restrict a_in,
78+
const bfloat16 *__restrict b_in,
79+
bfloat16 *__restrict c_out)
6480
{
65-
matvec_scalar(m, k, row_offset, a_in, b_in, c_out);
81+
c_out += row_offset;
82+
matvec_scalar(m, k, a_in, b_in, c_out);
6683
}
6784

6885
void matvec_vectorized_bf16_bf16(uint32_t m,
6986
uint32_t k,
7087
uint32_t row_offset,
71-
bfloat16 *a_in,
72-
bfloat16 *b_in,
73-
bfloat16 *c_out)
88+
const bfloat16 *__restrict a_in,
89+
const bfloat16 *__restrict b_in,
90+
bfloat16 *__restrict c_out)
7491
{
75-
matvec_vectorized<64>(m, k, row_offset, a_in, b_in, c_out);
92+
c_out += row_offset;
93+
matvec_vectorized<64>(m, k, a_in, b_in, c_out);
7694
}
7795

7896
} // extern "C"

applications/llama_3.2_1b/configs/llama32_1b.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,24 @@
88
"hidden_dim": 8192,
99
"n_kv_groups": 8,
1010
"use_kv_cache": true,
11-
"use_aie_gemv": true,
1211
"rope_base": 500000.0,
1312
"dtype": "bfloat16",
1413
"use_aie_final_norm": true,
1514
"use_aie_ffn_gemm": false,
1615
"use_aie_ffn_silu": false,
1716
"use_aie_ffn_mul": false,
1817
"use_aie_ffn_swiglu": true,
18+
"use_aie_ffn_gemv": true,
1919
"use_aie_attn_projection_gemm": true,
20+
"use_aie_gqa_gemv": true,
2021
"use_aie_rope": true,
2122
"use_aie_norm1": true,
2223
"use_aie_norm2": true,
2324
"use_aie_residual": true,
2425
"use_aie_regular_mha": false,
2526
"use_aie_fused_mha": true,
2627
"use_aie_final_gemm": true,
28+
"use_aie_final_gemv": true,
2729
"rope_freq": {
2830
"factor": 32.0,
2931
"low_freq_factor": 1.0,

applications/llama_3.2_1b/src/block/feed_forward.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -115,18 +115,30 @@ def __init__(
115115
cfg["hidden_dim"], cfg["emb_dim"], dtype=cfg["dtype"], bias=False
116116
)
117117

118-
if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
119-
aie_gemv_config = {"num_aie_columns": 1, "is_mv": False}
118+
if self.cfg["use_kv_cache"] and self.cfg["use_aie_ffn_gemv"]:
119+
aie_gemv_config = {"num_aie_columns": 8, "is_mv": False}
120120
# FC1 and FC2: emb_dim -> hidden_dim
121121
self.aie_fc1_gemv = AIEGEMV(
122-
M=self.hidden_dim, K=self.emb_dim, **aie_gemv_config
122+
M=self.hidden_dim,
123+
K=self.emb_dim,
124+
tile_size_input=1,
125+
tile_size_output=self.hidden_dim // 16,
126+
**aie_gemv_config,
123127
)
124128
self.aie_fc2_gemv = AIEGEMV(
125-
M=self.hidden_dim, K=self.emb_dim, **aie_gemv_config
129+
M=self.hidden_dim,
130+
K=self.emb_dim,
131+
tile_size_input=1,
132+
tile_size_output=self.hidden_dim // 16,
133+
**aie_gemv_config,
126134
)
127135
# FC3: hidden_dim -> emb_dim
128136
self.aie_fc3_gemv = AIEGEMV(
129-
M=self.emb_dim, K=self.hidden_dim, **aie_gemv_config
137+
M=self.emb_dim,
138+
K=self.hidden_dim,
139+
tile_size_input=1,
140+
tile_size_output=self.emb_dim // 16,
141+
**aie_gemv_config,
130142
)
131143

132144
# Initialize AIE elementwise multiply
@@ -176,7 +188,7 @@ def forward(self, x):
176188
else:
177189
return self.aie_swiglu_decode(x)
178190

179-
if is_decode_with_kv and self.cfg["use_aie_gemv"]:
191+
if is_decode_with_kv and self.cfg["use_aie_ffn_gemv"]:
180192
x_fc1 = self.aie_fc1_gemv(x)
181193
x_fc2 = self.aie_fc2_gemv(x)
182194
else:
@@ -199,14 +211,14 @@ def forward(self, x):
199211
else:
200212
x = x_fc1_silu * x_fc2
201213

202-
if is_decode_with_kv and self.cfg["use_aie_gemv"]:
214+
if is_decode_with_kv and self.cfg["use_aie_ffn_gemv"]:
203215
result = self.aie_fc3_gemv(x)
204216
return result.view(original_shape)
205217
else:
206218
return self.fc3(x).view(original_shape)
207219

208220
def assign_weights(self, l, fc1, fc2, fc3):
209-
if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
221+
if self.cfg["use_kv_cache"] and self.cfg["use_aie_ffn_gemv"]:
210222
self.aie_fc1_gemv.weight = fc1
211223
self.aie_fc2_gemv.weight = fc2
212224
self.aie_fc3_gemv.weight = fc3

applications/llama_3.2_1b/src/block/gqa.py

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -115,18 +115,42 @@ def __init__(
115115
)
116116

117117
# Initialize AIE GEMV operators for decode phase (when using KV cache)
118-
if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
118+
if self.cfg["use_kv_cache"] and self.cfg["use_aie_gqa_gemv"]:
119119

120120
aie_gemv_config = {
121-
"num_aie_columns": 1,
121+
"num_aie_columns": 8,
122122
"is_mv": False,
123123
"use_static_weight": True,
124124
}
125-
self.aie_query_gemv = AIEGEMV(M=d_out, K=d_in, **aie_gemv_config)
125+
self.aie_query_gemv = AIEGEMV(
126+
M=d_out,
127+
K=d_in,
128+
tile_size_input=1,
129+
tile_size_output=d_out // 16,
130+
**aie_gemv_config,
131+
)
126132
kv_out_dim = num_kv_groups * self.head_dim
127-
self.aie_key_gemv = AIEGEMV(M=kv_out_dim, K=d_in, **aie_gemv_config)
128-
self.aie_value_gemv = AIEGEMV(M=kv_out_dim, K=d_in, **aie_gemv_config)
129-
self.aie_out_proj_gemv = AIEGEMV(M=d_out, K=d_out, **aie_gemv_config)
133+
self.aie_key_gemv = AIEGEMV(
134+
M=kv_out_dim,
135+
K=d_in,
136+
tile_size_input=1,
137+
tile_size_output=kv_out_dim // 16,
138+
**aie_gemv_config,
139+
)
140+
self.aie_value_gemv = AIEGEMV(
141+
M=kv_out_dim,
142+
K=d_in,
143+
tile_size_input=1,
144+
tile_size_output=kv_out_dim // 16,
145+
**aie_gemv_config,
146+
)
147+
self.aie_out_proj_gemv = AIEGEMV(
148+
M=d_out,
149+
K=d_out,
150+
tile_size_input=1,
151+
tile_size_output=d_out // 16,
152+
**aie_gemv_config,
153+
)
130154

131155
# Initialize AIE GEMM operators
132156
if self.cfg["use_aie_attn_projection_gemm"]:
@@ -159,7 +183,7 @@ def forward(self, x, mask, angles, input_pos=None):
159183
is_decode = input_pos is not None
160184

161185
# Choose between GEMM (prefill) and GEMV (decode) based on KV cache usage
162-
if self.cfg["use_kv_cache"] and is_decode and self.cfg["use_aie_gemv"]:
186+
if self.cfg["use_kv_cache"] and is_decode and self.cfg["use_aie_gqa_gemv"]:
163187
# Decode phase with KV cache - use GEMV for single token
164188
# weight.T @ input, which is vector-matrix multiplication (So, is_mv=False)
165189
x_flat = x.reshape(1, -1) # Shape: (1, d_in)
@@ -376,7 +400,7 @@ def my_mha(queries, keys, values):
376400
context_vec = context_vec.reshape(b, num_tokens, self.d_out)
377401

378402
# Choose output projection based on phase
379-
if self.cfg["use_kv_cache"] and is_decode and self.cfg["use_aie_gemv"]:
403+
if self.cfg["use_kv_cache"] and is_decode and self.cfg["use_aie_gqa_gemv"]:
380404
context_vec_flat = context_vec.reshape(1, -1)
381405
output_flat = self.aie_out_proj_gemv(context_vec_flat)
382406
context_vec = output_flat.reshape(b, num_tokens, self.d_out)
@@ -390,7 +414,7 @@ def my_mha(queries, keys, values):
390414
return context_vec
391415

392416
def assign_weights(self, l, w_query, w_key, w_value, w_out_proj):
393-
if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
417+
if self.cfg["use_kv_cache"] and self.cfg["use_aie_gqa_gemv"]:
394418
self.aie_query_gemv.weight = w_query
395419
self.aie_key_gemv.weight = w_key
396420
self.aie_value_gemv.weight = w_value

applications/llama_3.2_1b/src/model_with_json.py

Lines changed: 24 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,7 @@
1212
from pathlib import Path
1313
from src.block.transformer import TransformerBlock
1414
from operators.rope.rope_utils import compute_rope_params
15-
from operators import (
16-
AIERMSNorm,
17-
AIEGEMM,
18-
)
15+
from operators import AIERMSNorm, AIEGEMM, AIEGEMV
1916
from rich.console import Console
2017
from rich.text import Text
2118

@@ -35,20 +32,22 @@ def dtype_from_string(inp):
3532
config_options = {
3633
"dtype": (dtype_from_string, torch.float32, "Data type"),
3734
"use_kv_cache": (bool, False, "[Model] KV Cache"),
38-
"use_aie_gemv": (bool, False, "[Decode] GEMV"),
3935
"use_aie_rope": (bool, False, "[Attention] Rope"),
4036
"use_aie_attn_projection_gemm": (bool, False, "[Attention] QKV GEMM"),
4137
"use_aie_regular_mha": (bool, False, "[Attention] Regular MHA"),
4238
"use_aie_fused_mha": (bool, False, "[Attention] Fused MHA"),
39+
"use_aie_gqa_gemv": (bool, False, "[Attention] GEMV (Decode)"),
4340
"use_aie_ffn_gemm": (bool, False, "[FFN] GEMM"),
4441
"use_aie_ffn_mul": (bool, False, "[FFN] Elementwise Mul"),
4542
"use_aie_ffn_silu": (bool, False, "[FFN] SiLU"),
4643
"use_aie_ffn_swiglu": (bool, False, "[FFN] Runlist-based SwiGLU"),
44+
"use_aie_ffn_gemv": (bool, False, "[FFN] GEMV (Decode)"),
4745
"use_aie_residual": (bool, False, "[Transformer] Residual Addition"),
4846
"use_aie_norm1": (bool, False, "[Transformer] Pre Norm"),
4947
"use_aie_norm2": (bool, False, "[Transformer] Post Norm"),
5048
"use_aie_final_norm": (bool, False, "[Transformer] Final Norm"),
5149
"use_aie_final_gemm": (bool, False, "[Transformer] Final GEMM"),
50+
"use_aie_final_gemv": (bool, False, "[Transformer] Final GEMV"),
5251
}
5352
# fmt: on
5453

@@ -190,25 +189,32 @@ def __init__(
190189
M_for_gemm = self.prompt_length
191190
else:
192191
M_for_gemm = self.prompt_length + self.num_tokens
193-
self.out_head_aie = AIEGEMM(
192+
self.out_head_prefill = AIEGEMM(
194193
M=M_for_gemm,
195194
K=self.cfg["emb_dim"],
196195
N=self.cfg["vocab_size"],
197196
**aie_config_prefill,
198197
)
198+
aie_gemv_config = {
199+
"num_aie_columns": 8,
200+
"is_mv": True,
201+
"use_static_weight": True,
202+
"num_aie_columns": 8,
203+
"tile_size_input": 4,
204+
"tile_size_output": 32,
205+
}
206+
# FC1 and FC2: emb_dim -> hidden_dim
207+
if self.cfg["use_aie_final_gemv"]:
208+
self.out_head_decode = AIEGEMV(
209+
M=self.cfg["vocab_size"], K=self.cfg["emb_dim"], **aie_gemv_config
210+
)
199211
else:
200212
self.out_head = nn.Linear(
201213
self.cfg["emb_dim"],
202214
self.cfg["vocab_size"],
203215
bias=False,
204216
dtype=self.cfg["dtype"],
205217
)
206-
self.out_head = nn.Linear(
207-
self.cfg["emb_dim"],
208-
self.cfg["vocab_size"],
209-
bias=False,
210-
dtype=self.cfg["dtype"],
211-
)
212218

213219
# Reusable utilities
214220
cos, sin = compute_rope_params(
@@ -269,12 +275,10 @@ def forward(self, in_idx, input_pos=None, use_kv_cache=False):
269275
x = self.final_norm(x)
270276

271277
if self.cfg["use_aie_final_gemm"]:
272-
if is_decode_with_kv and self.cfg["use_aie_gemv"]:
273-
# TODO: Create GEMV operator
274-
# logits = self.aie_out_head_gemv(x)
275-
logits = self.out_head(x) # Running on CPU
278+
if is_decode_with_kv and self.cfg["use_aie_final_gemv"]:
279+
logits = self.out_head_decode(x)
276280
else:
277-
logits = self.out_head_aie(x)
281+
logits = self.out_head_prefill(x)
278282
else:
279283
logits = self.out_head(x)
280284

@@ -292,20 +296,11 @@ def assign_weights(self, final_norm, out_head, out_head_name):
292296
f"model.norm.weight",
293297
)
294298

295-
self.out_head.weight = assign(
296-
self.out_head.weight,
297-
out_head,
298-
out_head_name,
299-
)
300-
# TODO: Offload GEMV to NPU
301-
# if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
302-
# self.aie_out_head_gemv.weight = out_head
303299
if self.cfg["use_aie_final_gemm"]:
304300
# Want column-major for B
305-
self.out_head_aie.weight = out_head.T
306-
# TODO: Create separate linear layers for prefill and decode (with gemm/gemv)
307-
# if self.cfg["use_kv_cache"]:
308-
# self.out_head.weight = out_head.T
301+
self.out_head_prefill.weight = out_head.T
302+
if self.cfg["use_aie_final_gemv"]:
303+
self.out_head_decode.weight = out_head.T
309304
else:
310305
self.out_head.weight = assign(
311306
self.out_head.weight,

operators/common/aie_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def add_buffer(self, name, count, dtype=bfloat16, static_data=None):
6868
if static_data is not None:
6969
assert (
7070
static_data.nbytes <= self.buffers[name]
71-
), f"Static data for buffer {name} exceeds allocated size."
71+
), f"Static data for buffer {name} exceeds allocated size: expected {self.buffers[name]} bytes, got {static_data.nbytes} bytes."
7272
static_data_bytes = static_data.flatten().view(np.uint8).tobytes()
7373
if static_data_bytes not in self.context.static_data_pool:
7474
self.context.static_data_pool[static_data_bytes] = None

operators/common/compilation.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ def compile(self, artifacts):
361361
"--no-xbridge",
362362
"--peano",
363363
str(self.peano_dir),
364+
"--dynamic-objFifos",
364365
]
365366
do_compile_xclbin = mlir_source in mlir_sources_to_xclbins
366367
do_compile_insts_bin = mlir_source in mlir_sources_to_insts_bins

0 commit comments

Comments (0)