@@ -97,11 +97,17 @@ def __init__(
9797
9898 # Initialize AIE RoPE operator
9999 if self .cfg ["use_aie_rope" ]:
100- self .aie_rope = AIERope (
101- num_aie_columns = 1 ,
102- num_channels = 1 ,
100+ self .aie_rope_prefill = AIERope (
103101 size = self .prompt_length * self .head_dim ,
104102 last_dim = self .head_dim ,
103+ num_aie_columns = 1 ,
104+ method_type = 0 ,
105+ )
106+ self .aie_rope_decode = AIERope (
107+ size = self .head_dim ,
108+ last_dim = self .head_dim ,
109+ num_aie_columns = 1 ,
110+ method_type = 0 ,
105111 )
106112
107113 # Initialize fused AIE MHA operator
@@ -158,6 +164,10 @@ def forward(self, x, mask, angles, input_pos=None):
158164 is_prefill = input_pos is None
159165 is_decode = input_pos is not None
160166
167+ # Step 1.
168+ # ---
169+ # Linear projections -- calculate queries, keys and values by multiplying embedding vector (in decode) or matrix (in prefill) with weight matrices
170+
161171 # Choose between GEMM (prefill) and GEMV (decode) based on KV cache usage
162172 if self .cfg ["use_kv_cache" ] and is_decode and self .cfg ["use_aie_gemv" ]:
163173 # Decode phase with KV cache - use GEMV for single token
@@ -195,10 +205,21 @@ def forward(self, x, mask, angles, input_pos=None):
195205 keys = self .W_key (x )
196206 values = self .W_value (x )
197207
208+ # Each attention head gets its own slice of the embedding dimension.
209+ # For each head, we have query, key and value.
210+ # In grouped-query attention, the keys and values are shared across groups of heads.
211+ # Therefore, we have self.num_heads queries, and self.num_kv_groups (== self.num_heads in case of regular attention) keys and values.
212+ # Each head can be applied independently to its subslice of the embedding dimension.
198213 keys = keys .view (b , num_tokens , self .num_kv_groups , self .head_dim )
199214 values = values .view (b , num_tokens , self .num_kv_groups , self .head_dim )
200215 queries = queries .view (b , num_tokens , self .num_heads , self .head_dim )
201216
217+ # Step 2.
218+ # ---
219+ # Apply positional encoding to keys and queries.
220+ # The positional embedding is applied independently to each head.
221+ # It modifies the embedding vectors to encode where in the sequence each token is located.
222+
202223 # Determine angle slice based on KV cache usage and phase
203224 if self .cfg ["use_kv_cache" ] and is_decode :
204225 # Decode phase with KV cache: use single position
@@ -208,27 +229,28 @@ def forward(self, x, mask, angles, input_pos=None):
208229 # Prefill phase or no KV cache: use all tokens
209230 angle_slice = angles [:num_tokens , :]
210231
211- # Apply RoPE with AIE or CPU fallback
232+ # Apply RoPE with AIE
212233 def apply_rope_and_transpose (tensor , num_heads_dim , angle_slice ):
213- expected_seq_len = (
214- 1 if (self .cfg ["use_kv_cache" ] and is_decode ) else self .prompt_length
215- )
216- can_use_aie = (
217- self .cfg ["use_aie_rope" ]
218- and tensor .shape [- 1 ] == self .head_dim
219- and tensor .shape [- 2 ] == expected_seq_len
234+ transposed = (
235+ tensor .view (num_tokens , num_heads_dim , self .head_dim )
236+ .transpose (0 , 1 )
237+ .contiguous ()
220238 )
221-
222- if can_use_aie :
223- # AIE RoPE path: flatten -> apply -> reshape -> transpose
224- tensor = self .aie_rope ( tensor . view ( b , num_tokens , - 1 ) , angle_slice )
225- return tensor . view (
226- b , num_tokens , num_heads_dim , self .head_dim
227- ). transpose ( 1 , 2 )
239+ angle_slice = angle_slice . to ( dtype = tensor . dtype )
240+ if self . cfg [ "use_aie_rope" ] :
241+ if is_prefill :
242+ result = self .aie_rope_prefill ( transposed , angle_slice )
243+ else :
244+ result = self .aie_rope_decode ( transposed , angle_slice )
245+ result = result . view ( b , num_heads_dim , num_tokens , self . head_dim )
228246 else :
229- # CPU RoPE path: transpose -> apply
230- tensor = tensor .transpose (1 , 2 )
231- return apply_rope (tensor , angle_slice )
247+ result = apply_rope (
248+ transposed .view (1 , num_heads_dim , num_tokens , self .head_dim ),
249+ angle_slice ,
250+ )
251+ # ref = apply_rope(transposed.view(1, num_heads_dim, num_tokens, self.head_dim), angle_slice)
252+ # assert torch.allclose(ref, result, atol=0.7, rtol=0.07), "AIE RoPE result does not match reference"
253+ return result
232254
233255 keys = apply_rope_and_transpose (keys , self .num_kv_groups , angle_slice )
234256 queries = apply_rope_and_transpose (queries , self .num_heads , angle_slice )
@@ -248,10 +270,18 @@ def apply_rope_and_transpose(tensor, num_heads_dim, angle_slice):
248270 keys = cached_keys
249271 values = cached_values
250272
251- # Expand keys and values to match query heads for all cases (grouped query attention)
273+ # Step 3.
274+ # ---
275+ # Since the keys and values are shared across groups of heads in grouped-query attention,
276+ # we now expand (repeat) the same keys and values so that each head has its own keys and values.
252277 keys = keys .repeat_interleave (self .group_size , dim = 1 )
253278 values = values .repeat_interleave (self .group_size , dim = 1 )
254279
280+ # Step 4.
281+ # ---
282+ # Compute attention scores (independently for each head), apply softmax to get attention weights, then apply those weights to the attention values to get output.
283+ # Attention scores are the dot-product of queries and keys.
284+
255285 # Use fused AIE MHA if enabled and conditions are met
256286 if is_prefill or not self .cfg ["use_kv_cache" ]:
257287 if (
0 commit comments