Skip to content

Commit 6c7123b

Browse files
committed
address Curt's and Pranathi's comments
1 parent e8b947d commit 6c7123b

File tree

3 files changed

+8
-12
lines changed

3 files changed

+8
-12
lines changed

applications/llama_3.2_1b/src/block/feed_forward.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -25,10 +25,6 @@ def __init__(
2525
super().__init__()
2626
self.cfg = cfg.copy()
2727

28-
assert cfg["use_aie_ffn_swiglu"] != (
29-
cfg["use_aie_ffn_silu"] or cfg["use_aie_ffn_gemm"] or cfg["use_aie_ffn_mul"]
30-
), "Cannot mix fused SwiGLU with individual AIE operators."
31-
3228
self.emb_dim = cfg["emb_dim"]
3329
self.hidden_dim = cfg["hidden_dim"]
3430

@@ -106,8 +102,8 @@ def forward(self, x):
106102
is_prefill = not is_vector or not self.cfg["use_kv_cache"]
107103

108104
if is_vector and self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
109-
x_fc1 = self.aie_fc1_gemv(None, x)
110-
x_fc2 = self.aie_fc2_gemv(None, x)
105+
x_fc1 = self.aie_fc1_gemv(x)
106+
x_fc2 = self.aie_fc2_gemv(x)
111107
else:
112108
x_fc1 = self.fc1(x)
113109
x_fc2 = self.fc2(x)
@@ -120,7 +116,7 @@ def forward(self, x):
120116
x = x_fc1_silu * x_fc2
121117

122118
if is_vector and self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
123-
result = self.aie_fc3_gemv(None, x)
119+
result = self.aie_fc3_gemv(x)
124120
return result.view(original_shape)
125121
else:
126122
return self.fc3(x).view(original_shape)

applications/llama_3.2_1b/src/block/gqa.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -166,15 +166,15 @@ def forward(self, x, mask, angles, input_pos=None):
166166
x_flat = x.reshape(1, -1) # Shape: (1, d_in)
167167
input_dtype = x.dtype
168168

169-
queries_flat = self.aie_query_gemv(None, x_flat)
169+
queries_flat = self.aie_query_gemv(x_flat)
170170
queries = queries_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
171171

172-
keys_flat = self.aie_key_gemv(None, x_flat)
172+
keys_flat = self.aie_key_gemv(x_flat)
173173
keys = keys_flat.reshape(
174174
b, num_tokens, self.num_kv_groups * self.head_dim
175175
).to(input_dtype)
176176

177-
values_flat = self.aie_value_gemv(None, x_flat)
177+
values_flat = self.aie_value_gemv(x_flat)
178178
values = values_flat.reshape(
179179
b, num_tokens, self.num_kv_groups * self.head_dim
180180
).to(input_dtype)
@@ -384,7 +384,7 @@ def my_mha(queries, keys, values):
384384
# Choose output projection based on phase
385385
if self.cfg["use_kv_cache"] and is_decode and self.cfg["use_aie_gemv"]:
386386
context_vec_flat = context_vec.reshape(1, -1)
387-
output_flat = self.aie_out_proj_gemv(None, context_vec_flat)
387+
output_flat = self.aie_out_proj_gemv(context_vec_flat)
388388
context_vec = output_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
389389
elif self.cfg["use_aie_attn_projection_gemm"]:
390390
context_vec_flat = context_vec.reshape(-1, self.d_out)

applications/llama_3.2_1b/src/operator/aie_gemv.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,7 @@ def set_up(self):
107107
self.add_buffer("output", self.M)
108108
self.add_to_runlist("gemv", "matrix", "vector", "output")
109109

110-
def forward(self, matrix, vector):
110+
def forward(self, vector, matrix=None):
111111
"""Forward pass through GEMV operation
112112
113113
Args:

0 commit comments

Comments (0)