Skip to content

Commit 331dcca

Browse files
kurtis-b-1 and andrej
authored
Offload Final Linear Layer (#48)
* Use output directory instead of /tmp directory for building * Can run GEMM with a wide B matrix * Moved creating buffers for buffer lists B/C inside golden reference so that the major order logic is there instead * Simplifying some logic in the operator * Further simplification of the gemm operator * Formatting * Adjust comments for C tile streaming through shim DMA based on the separate_c_tiles parameter * Remove using the separated C tile runtime streams in test * Can offload the last linear layer, but TTFT goes way up--reason seems to be that the actual forward operation for last linear layer is ~4.5s, likely due to reading the output buffers and converting the output from np to torch since the GEMM kernel itself should take ~200ms with bfp16 emulation enabled * Modified the torch_to_numpy/numpy_to_torch conversions to use zero-copy reinterprets, and removed unnecessary .to() calls which could result in extram unnecessary passes over memory * Make read_buffer for BOs zero-copy * Fix functionality when copy is True with read_buffer() * Run decode stage on CPU for final linear layer, which fixes toks per sec but the outptut tokens still inconsistent with CPU-only inference * Fix CPU final linear layer run with KV cache enabled and formatting * Use map view for writing buffers like with reading buffers * Make separate_c_tiles parameter based on partition_N value * Formatting * Use corect shapes for forward pass (padded N vs actual N) * Formatting * Clean up code and comments in new versions of read_buffer()/write_buffer() methods * Fix comments in numpy/torch conversion utils * fixes after merge * format * fixes --------- Co-authored-by: andrej <[email protected]>
1 parent a4b6ffe commit 331dcca

File tree

12 files changed

+485
-162
lines changed

12 files changed

+485
-162
lines changed

applications/llama_3.2_1b/configs/llama32_1b.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"use_aie_residual": true,
2424
"use_aie_regular_mha": false,
2525
"use_aie_fused_mha": true,
26-
"use_aie_final_gemm": false,
26+
"use_aie_final_gemm": true,
2727
"rope_freq": {
2828
"factor": 32.0,
2929
"low_freq_factor": 1.0,

applications/llama_3.2_1b/inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ def set_prefill_time():
400400
parser.add_argument(
401401
"--prompt_len",
402402
type=int,
403-
default=64,
403+
default=2048,
404404
help="Truncate prompt to this many tokens.",
405405
)
406406
parser.add_argument(

applications/llama_3.2_1b/src/block/gqa.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -163,38 +163,33 @@ def forward(self, x, mask, angles, input_pos=None):
163163
# Decode phase with KV cache - use GEMV for single token
164164
# weight.T @ input, which is vector-matrix multiplication (So, is_mv=False)
165165
x_flat = x.reshape(1, -1) # Shape: (1, d_in)
166-
input_dtype = x.dtype
167166

168167
queries_flat = self.aie_query_gemv(x_flat)
169-
queries = queries_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
168+
queries = queries_flat.reshape(b, num_tokens, self.d_out)
170169

171170
keys_flat = self.aie_key_gemv(x_flat)
172-
keys = keys_flat.reshape(
173-
b, num_tokens, self.num_kv_groups * self.head_dim
174-
).to(input_dtype)
171+
keys = keys_flat.reshape(b, num_tokens, self.num_kv_groups * self.head_dim)
175172

176173
values_flat = self.aie_value_gemv(x_flat)
177174
values = values_flat.reshape(
178175
b, num_tokens, self.num_kv_groups * self.head_dim
179-
).to(input_dtype)
176+
)
180177

181178
elif self.cfg["use_aie_attn_projection_gemm"]:
182179
# Prefill phase - use GEMM for multiple tokens
183180
x_flat = x.reshape(-1, d_in)
184181
input_dtype = x.dtype
185182

186183
queries_flat = self.aie_query(x_flat)
187-
queries = queries_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
184+
queries = queries_flat.reshape(b, num_tokens, self.d_out)
188185

189186
keys_flat = self.aie_key(x_flat)
190-
keys = keys_flat.reshape(
191-
b, num_tokens, self.num_kv_groups * self.head_dim
192-
).to(input_dtype)
187+
keys = keys_flat.reshape(b, num_tokens, self.num_kv_groups * self.head_dim)
193188

194189
values_flat = self.aie_value(x_flat)
195190
values = values_flat.reshape(
196191
b, num_tokens, self.num_kv_groups * self.head_dim
197-
).to(input_dtype)
192+
)
198193
else:
199194
queries = self.W_query(x)
200195
keys = self.W_key(x)
@@ -348,9 +343,9 @@ def apply_rope_and_transpose(tensor, num_heads_dim, angle_slice):
348343
def my_mha(queries, keys, values):
349344
inv_scale = 1 / np.sqrt(values.shape[-1])
350345
context_vec = torch.nn.functional.scaled_dot_product_attention(
351-
queries.to(torch.bfloat16).to("cpu"),
352-
keys.to(torch.bfloat16).to("cpu"),
353-
values.to(torch.bfloat16).to("cpu"),
346+
queries,
347+
keys,
348+
values,
354349
dropout_p=0.0,
355350
is_causal=True,
356351
scale=inv_scale,
@@ -384,11 +379,11 @@ def my_mha(queries, keys, values):
384379
if self.cfg["use_kv_cache"] and is_decode and self.cfg["use_aie_gemv"]:
385380
context_vec_flat = context_vec.reshape(1, -1)
386381
output_flat = self.aie_out_proj_gemv(context_vec_flat)
387-
context_vec = output_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
382+
context_vec = output_flat.reshape(b, num_tokens, self.d_out)
388383
elif self.cfg["use_aie_attn_projection_gemm"]:
389384
context_vec_flat = context_vec.reshape(-1, self.d_out)
390385
output_flat = self.aie_out_proj(context_vec_flat)
391-
context_vec = output_flat.reshape(b, num_tokens, self.d_out).to(input_dtype)
386+
context_vec = output_flat.reshape(b, num_tokens, self.d_out)
392387
else:
393388
context_vec = self.out_proj(context_vec)
394389

applications/llama_3.2_1b/src/model_with_json.py

Lines changed: 86 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212
from pathlib import Path
1313
from src.block.transformer import TransformerBlock
1414
from operators.rope.rope_utils import compute_rope_params
15-
from operators import AIERMSNorm
15+
from operators import (
16+
AIERMSNorm,
17+
AIEGEMM,
18+
)
1619
from rich.console import Console
1720
from rich.text import Text
1821

@@ -169,7 +172,37 @@ def __init__(
169172
self.cfg["emb_dim"], eps=1e-5, dtype=self.cfg["dtype"]
170173
)
171174

172-
# Depedns on use_aie_final_gemm
175+
# Offload final linear layer if enabled
176+
if self.cfg.get("use_aie_final_gemm", False):
177+
# Since this GEMM has such a large N dimension, partition the N dimension by 4,
178+
# and GEMM will execute for a workload of that smaller N dimension across different buffers of B and C
179+
aie_config_prefill = {
180+
"num_aie_columns": 8,
181+
"tile_m": 64,
182+
"tile_k": 64,
183+
"tile_n": 64,
184+
"b_col_maj": True,
185+
"use_static_weight": True,
186+
"separate_c_tiles": True,
187+
"partition_N": 4,
188+
}
189+
if self.cfg["use_kv_cache"]:
190+
M_for_gemm = self.prompt_length
191+
else:
192+
M_for_gemm = self.prompt_length + self.num_tokens
193+
self.out_head_aie = AIEGEMM(
194+
M=M_for_gemm,
195+
K=self.cfg["emb_dim"],
196+
N=self.cfg["vocab_size"],
197+
**aie_config_prefill,
198+
)
199+
else:
200+
self.out_head = nn.Linear(
201+
self.cfg["emb_dim"],
202+
self.cfg["vocab_size"],
203+
bias=False,
204+
dtype=self.cfg["dtype"],
205+
)
173206
self.out_head = nn.Linear(
174207
self.cfg["emb_dim"],
175208
self.cfg["vocab_size"],
@@ -194,6 +227,22 @@ def forward(self, in_idx, input_pos=None, use_kv_cache=False):
194227
tok_embeds = self.tok_emb(in_idx)
195228
x = tok_embeds
196229

230+
# Check if input is a vector (decode phase) or matrix (prefill phase)
231+
# Handle 1D: (emb_dim,), 2D: (1, emb_dim), or 3D: (1, 1, emb_dim)
232+
is_vector = (
233+
len(x.shape) == 1
234+
or (len(x.shape) == 2 and x.shape[0] == 1)
235+
or (len(x.shape) == 3 and x.shape[0] == 1 and x.shape[1] == 1)
236+
)
237+
238+
# (batch, sequence, embedding) where sequence=1 indicates decode
239+
if len(x.shape) == 3:
240+
is_decode_with_kv = (x.shape[1] == 1) and self.cfg["use_kv_cache"]
241+
elif len(x.shape) == 2:
242+
is_decode_with_kv = (x.shape[0] == 1) and self.cfg["use_kv_cache"]
243+
else:
244+
is_decode_with_kv = False
245+
197246
num_tokens = x.shape[1]
198247

199248
# During generation phase with KV cache, don't create a mask
@@ -219,19 +268,47 @@ def forward(self, in_idx, input_pos=None, use_kv_cache=False):
219268
else:
220269
x = self.final_norm(x)
221270

222-
logits = self.out_head(x.to(self.cfg["dtype"]))
271+
if self.cfg["use_aie_final_gemm"]:
272+
if is_decode_with_kv and self.cfg["use_aie_gemv"]:
273+
# TODO: Create GEMV operator
274+
# logits = self.aie_out_head_gemv(x)
275+
logits = self.out_head(x) # Running on CPU
276+
else:
277+
logits = self.out_head_aie(x)
278+
else:
279+
logits = self.out_head(x)
223280

224281
return logits
225282

226-
def assign_weights(self, final_norm):
283+
def assign_weights(self, final_norm, out_head, out_head_name):
227284
if self.cfg.get("use_aie_final_norm", False):
228285
self.aie_final_norm_prefill.weight = final_norm
229286
if self.cfg["use_kv_cache"]:
230287
self.aie_final_norm_decode.weight = final_norm
231-
return
288+
else:
289+
self.final_norm.weight = assign(
290+
self.final_norm.weight,
291+
final_norm,
292+
f"model.norm.weight",
293+
)
232294

233-
self.final_norm.weight = assign(
234-
self.final_norm.weight,
235-
final_norm,
236-
f"model.norm.weight",
295+
self.out_head.weight = assign(
296+
self.out_head.weight,
297+
out_head,
298+
out_head_name,
237299
)
300+
# TODO: Offload GEMV to NPU
301+
# if self.cfg["use_kv_cache"] and self.cfg["use_aie_gemv"]:
302+
# self.aie_out_head_gemv.weight = out_head
303+
if self.cfg["use_aie_final_gemm"]:
304+
# Want column-major for B
305+
self.out_head_aie.weight = out_head.T
306+
# TODO: Create separate linear layers for prefill and decode (with gemm/gemv)
307+
# if self.cfg["use_kv_cache"]:
308+
# self.out_head.weight = out_head.T
309+
else:
310+
self.out_head.weight = assign(
311+
self.out_head.weight,
312+
out_head,
313+
out_head_name,
314+
)

applications/llama_3.2_1b/src/utils.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -126,15 +126,13 @@ def load_weights_into_llama(model, param_config, params):
126126
)
127127

128128
# Load output layer weights
129-
model.assign_weights(params["model.norm.weight"])
130-
131129
if "lm_head.weight" in params.keys():
132-
model.out_head.weight = assign(
133-
model.out_head.weight, params["lm_head.weight"], "lm_head.weight"
130+
model.assign_weights(
131+
params["model.norm.weight"], params["lm_head.weight"], "lm_head.weight"
134132
)
135133
else:
136-
model.out_head.weight = assign(
137-
model.out_head.weight,
134+
model.assign_weights(
135+
params["model.norm.weight"],
138136
params["model.embed_tokens.weight"],
139137
"model.embed_tokens.weight",
140138
)

operators/common/aie_base.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -89,25 +89,41 @@ def add_to_runlist(self, kernel_name, *args):
8989
def get_bo(self, buffer_name):
9090
return self.buffer_bos[buffer_name]
9191

92-
def read_buffer(self, buffer_name, shape, dtype=bfloat16):
92+
def read_buffer(self, buffer_name, shape, copy=False, dtype=bfloat16):
9393
"""Read buffer and return values as a numpy array"""
94-
size = np.prod(shape) * np.dtype(dtype).itemsize
95-
output_bytes = self.get_bo(buffer_name).read(size, 0)
96-
output_data_flat = np.frombuffer(output_bytes, dtype=dtype)
97-
return output_data_flat.reshape(*shape)
94+
# Create a byte accessible memory view of the buffer object
95+
mv = self.get_bo(buffer_name).map()
96+
97+
# Interpret the buffer as a 1-dimensional array then change its view to the expected shape
98+
arr = np.frombuffer(mv, dtype=dtype, count=np.prod(shape)).reshape(shape)
99+
100+
# Return an independent copy of the array if needed
101+
return arr.copy() if copy else arr
98102

99103
def read_buffer_as_torch(self, buffer_name, shape, dtype=bfloat16):
100104
return numpy_to_torch(self.read_buffer(buffer_name, shape, dtype))
101105

102106
def write_buffer(self, buffer_name, array):
103107
"""Write buffer from a numpy array into a XRT buffer object"""
104-
if isinstance(array, torch.Tensor):
105-
numpy_array = torch_to_numpy(array)
106-
else:
107-
numpy_array = array
108108
if buffer_name in self.buffer_static_data:
109109
raise RuntimeError(f"Cannot write to static buffer: {buffer_name}")
110-
self.get_bo(buffer_name).write(numpy_array.flatten().view(np.uint8), 0)
110+
111+
# Normalize the source
112+
if isinstance(array, torch.Tensor):
113+
src = torch_to_numpy(array)
114+
else:
115+
src = np.asarray(array)
116+
117+
# Create a flattened 1D byte view of the source
118+
src_bytes = src.ravel().view(np.uint8)
119+
120+
bo = self.get_bo(buffer_name)
121+
mv = bo.map() # byte accessible memory view
122+
# Interpret the buffer as a 1-dimensional array
123+
dst_bytes = np.frombuffer(mv, dtype=np.uint8, count=bo.size())
124+
125+
# The BO is an existing array, so copyto() can be called, which doesn't create a new array
126+
np.copyto(dst_bytes[: src_bytes.size], src_bytes, casting="no")
111127

112128
@abstractmethod
113129
def set_up_artifacts(self):

operators/common/aie_context.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,48 @@ def prepare_runtime(self):
9898
)
9999

100100
# If multiple buffers (of the same binned size) are used in the
101-
# same kernel invocation, they require separate allocations.
101+
# same kernel invocation OR across different invocations with shared
102+
# buffers, they require separate allocations.
102103
conflicting_buffers = {} # map buffer -> {set of conflicting buffers}
103-
for kernel, *args in op.runlist:
104+
buffer_to_runlist_entries = {} # map buffer -> set of runlist entry indices
105+
106+
# First pass: track which buffers appear in which runlist entries
107+
for idx, (kernel, *args) in enumerate(op.runlist):
108+
for arg in args:
109+
buffer_to_runlist_entries.setdefault(arg, set()).add(idx)
110+
111+
# Second pass: determine conflicts
112+
for idx, (kernel, *args) in enumerate(op.runlist):
104113
for arg in args:
105114
if arg in op.buffer_static_data:
115+
# Static buffers never conflict
106116
continue
107117
pool_sz = get_pool_sz(op.buffers[arg])
118+
119+
# Buffers conflict if they're in the same runlist entry
108120
conflicting_args = {
109121
a for a in args if get_pool_sz(op.buffers[a]) == pool_sz
110122
} - {arg}
123+
124+
# Also conflict with buffers in other runlist entries that share
125+
# a buffer with this entry
126+
for other_arg in args:
127+
if other_arg == arg:
128+
continue
129+
for other_idx in buffer_to_runlist_entries.get(
130+
other_arg, set()
131+
):
132+
if other_idx != idx:
133+
_, *other_args = op.runlist[other_idx]
134+
conflicting_args.update(
135+
{
136+
a
137+
for a in other_args
138+
if get_pool_sz(op.buffers[a]) == pool_sz
139+
and a != arg
140+
}
141+
)
142+
111143
conflicting_buffers[arg] = conflicting_buffers.get(
112144
arg, set()
113145
).union(conflicting_args)

operators/common/utils.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,31 @@
2323

2424

2525
def torch_to_numpy(tensor: torch.Tensor) -> np.ndarray:
26-
if tensor.dtype == torch.bfloat16:
27-
float_arr = tensor.float().detach().cpu().numpy()
28-
return float_arr.astype(bfloat16)
29-
return tensor.detach().cpu().numpy()
26+
# Detach (to drop grad) and ensure on CPU
27+
t = tensor.detach()
28+
if t.device.type != "cpu":
29+
t = t.cpu()
30+
# Ensure contiguous for safe view operations
31+
if not t.is_contiguous():
32+
t = t.contiguous()
33+
34+
if t.dtype == torch.bfloat16:
35+
# View the same memory as uint16, then as NumPy bfloat16
36+
# This avoids numeric conversion and extra passes over memory.
37+
u16_np = t.view(torch.uint16).numpy() # shares memory
38+
return u16_np.view(np.dtype("bfloat16")) # reinterpret
39+
40+
return t.numpy()
3041

3142

3243
def numpy_to_torch(array: np.ndarray) -> torch.Tensor:
33-
device = torch.device("cpu")
34-
if array.dtype == bfloat16:
35-
return torch.from_numpy(array.astype(np.float32)).to(torch.bfloat16).to(device)
36-
return torch.from_numpy(array).to(device)
44+
# Ensure contiguous to let from_numpy create a view
45+
if not array.flags["C_CONTIGUOUS"]:
46+
array = np.ascontiguousarray(array)
47+
48+
if array.dtype == np.dtype("bfloat16"):
49+
# reinterpret the same memory as uint16, then view as torch.bfloat16
50+
t_u16 = torch.from_numpy(array.view(np.uint16))
51+
return t_u16.view(torch.bfloat16) # view
52+
53+
return torch.from_numpy(array)

0 commit comments

Comments
 (0)