Refactor GQA: Use einsum broadcasting & enable JAX fast-path

ayulockedin · ayulockedin · commit eea7502fca2b · 2026-01-08T22:00:25.000Z
diff --git a/flax/nnx/nn/attention.py b/flax/nnx/nn/attention.py
@@ -129,7 +129,7 @@ def dot_product_attention_weights(
     query = query.reshape(query.shape[:-2] + (k_heads, n_rep, query.shape[-1]))
     # Expand Key: [..., K, H_k, D] -> [..., K, H_k, 1, D]
     key = jnp.expand_dims(key, axis=-2)
-    
+
     # Contract: q(h)gd, k(h)1d -> hgqk (h=H_k, g=n_rep)
     einsum_str = '...qhgd,...kh1d->...hgqk'
   else:
@@ -140,7 +140,7 @@ def dot_product_attention_weights(
   # calculate attention matrix
   depth = query.shape[-1]
   query = query / jnp.sqrt(depth).astype(dtype)
-  
+
   # attn weight shape is (batch..., num_heads, q_length, kv_length)
   attn_weights = jnp.einsum(einsum_str, query, key, precision=precision)
 
@@ -174,7 +174,7 @@ def dot_product_attention_weights(
     keep_prob = 1.0 - dropout_rate
     # Note: We use original key.ndim because we might have expanded key dim
     ndim_base = key.ndim - 1 if is_gqa else key.ndim
-    
+
     if broadcast_dropout:
       # dropout is broadcast across the batch + head dimensions
       dropout_shape = tuple([1] * (ndim_base - 2)) + attn_weights.shape[-2:]
@@ -261,10 +261,9 @@ def dot_product_attention(
   ), 'q, k, v batch dims must match.'
   assert key.shape[-3] == value.shape[-3], 'k, v lengths must match.'
 
-  # Criteria that invoke the more optimized dot product attention
-  # We skip this optimization for GQA (mismatched heads) to use manual broadcasting
-  if (dropout_rate == 0.0 and module == None and 
-      query.shape[-2] == key.shape[-2] == value.shape[-2]):
+  # Criteria that invoke the more optimized dot product attention.
+  # GQA (mismatched q/kv head counts) is no longer excluded from this path.
+  if dropout_rate == 0.0 and module is None:
     # make sure qkv batch are compressed to one dim
     query_shape = query.shape
     if len(query_shape) > 4:
@@ -303,7 +302,7 @@ def reshape_4d(x):
       v_heads = value.shape[-2]
       if q_heads % v_heads != 0:
          raise ValueError(f"Query heads ({q_heads}) must be multiple of Value heads ({v_heads})")
-      
+
       n_rep = q_heads // v_heads
       # Reshape weights: [..., H_v, n_rep, Q, K]
       attn_weights = attn_weights.reshape(attn_weights.shape[:-3] + (v_heads, n_rep) + attn_weights.shape[-2:])