From 4d00344a8daf598615b1a9645e23b58584d42968 Mon Sep 17 00:00:00 2001
From: Nikolay Malkovsky <malkovskynv@gmail.com>
Date: Sat, 30 May 2026 12:02:13 +0300
Subject: [PATCH 1/7] Few implementations of RMQ

---
 .../benchmarks/scripts/plot_benchmarks.py     | 159 +++++++++
 agentic/cpp/skills/diagnose-segfault/SKILL.md | 195 +++++++++++
 include/pixie/bits.h                          | 115 +++++++
 include/pixie/rmq.h                           |   7 +
 include/pixie/rmq/bp_plus_minus_one_rmq.h     | 258 +++++++++++++++
 include/pixie/rmq/cartesian_tree_rmq.h        | 223 +++++++++++++
 include/pixie/rmq/rmq_base.h                  |  56 ++++
 include/pixie/rmq/segment_tree.h              | 116 +++++++
 include/pixie/rmq/sparse_table.h              | 104 ++++++
 src/benchmarks/bench_rmq.cpp                  | 188 +++++++++++
 src/tests/excess_positions_tests.cpp          | 114 +++++++
 src/tests/rmq_tests.cpp                       | 308 ++++++++++++++++++
 12 files changed, 1843 insertions(+)
 create mode 100644 agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py
 create mode 100644 agentic/cpp/skills/diagnose-segfault/SKILL.md
 create mode 100644 include/pixie/rmq.h
 create mode 100644 include/pixie/rmq/bp_plus_minus_one_rmq.h
 create mode 100644 include/pixie/rmq/cartesian_tree_rmq.h
 create mode 100644 include/pixie/rmq/rmq_base.h
 create mode 100644 include/pixie/rmq/segment_tree.h
 create mode 100644 include/pixie/rmq/sparse_table.h
 create mode 100644 src/benchmarks/bench_rmq.cpp
 create mode 100644 src/tests/rmq_tests.cpp

diff --git a/agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py b/agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py
new file mode 100644
index 0000000..abe5040
--- /dev/null
+++ b/agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""Generic Google Benchmark JSON plotter.
+
+Plots timing and (optionally) hardware counter data from Google Benchmark
+--benchmark_format=json output. Correctly handles repetition output by
+averaging raw iterations and skipping aggregate entries.
+
+Usage:
+    python3 plot_benchmarks.py results.json [output_prefix]
+    python3 plot_benchmarks.py results.json report --min-n 16 --max-n 1048576
+"""
+import json
+import sys
+import argparse
+import matplotlib.pyplot as plt
+import numpy as np
+
+def load_benchmark_json(path):
+    with open(path, mode='r') as handle:
+        return json.loads(handle.read())
+
+def extract_series(data, metric='time', min_n=None, max_n=None):
+    """Extract series from benchmark JSON.
+
+    metric='time'     -> real_time (ns)
+    metric='counter'  -> first numeric user counter (e.g. CACHE-MISSES)
+    """
+    counter_key = None
+    if metric == 'counter':
+        # Use the first numeric, non-standard field of any iteration entry.
+        known = ('name', 'run_name', 'run_type', 'repetitions', 'threads',
+                 'repetition_index', 'iterations', 'real_time', 'cpu_time',
+                 'time_unit', 'items_per_second', 'aggregate_name',
+                 'aggregate_unit', 'family_index',
+                 'per_family_instance_index')
+        for bench in data.get('benchmarks', []):
+            if bench.get('run_type', 'iteration') != 'iteration':
+                continue
+            for k, v in bench.items():
+                # Skip string/bool fields such as 'label' or 'error_occurred'.
+                if k not in known and type(v) in (int, float):
+                    counter_key = k
+                    break
+            if counter_key:
+                break
+        if not counter_key:
+            return {}
+
+    raw = {}
+    for bench in data.get('benchmarks', []):
+        if bench.get('run_type', 'iteration') != 'iteration':
+            continue
+
+        name = bench['name']
+        parts = name.split('/')
+        if len(parts) < 2:
+            continue
+
+        bench_name = parts[0]
+        try:
+            n = int(parts[1])
+        except ValueError:
+            continue
+
+        if min_n is not None and n < min_n:
+            continue
+        if max_n is not None and n > max_n:
+            continue
+
+        if metric == 'time':
+            val = bench.get('real_time', bench.get('cpu_time', 0))
+        else:
+            val = bench.get(counter_key)
+
+        if val is None or val == 0:
+            continue
+
+        key = (bench_name, n)
+        raw.setdefault(key, []).append(val)
+
+    series = {}
+    for (bench_name, n), vals in raw.items():
+        series.setdefault(bench_name, []).append((n, sum(vals) / len(vals)))
+
+    for name in series:
+        series[name].sort(key=lambda x: x[0])
+
+    return series
+
+def plot_series(series, output_prefix, ylabel, title_suffix):
+    # Nothing to draw: report it and leave without creating a figure.
+    if not series:
+        print(f"No data to plot for {title_suffix}")
+        return
+    fig, ax = plt.subplots(figsize=(12, 8))
+    palette = plt.cm.tab10(np.linspace(0, 1, len(series)))
+
+    # One line per benchmark family, drawn in name order.
+    for color, (label, pts) in zip(palette, sorted(series.items())):
+        ax.plot([n for n, _ in pts], [value for _, value in pts],
+                marker='o', markersize=3, label=label, color=color)
+
+    ax.set_xscale('log')
+    ax.set_xlabel('Benchmark parameter n')
+    ax.set_ylabel(ylabel)
+    ax.set_title(f'Benchmark Results - {title_suffix}')
+    ax.legend(loc='upper left', fontsize='small')
+    ax.grid(True, which='both', ls='--', alpha=0.5)
+    fig.tight_layout()
+    fig.savefig(f'{output_prefix}.png', dpi=150)
+    fig.savefig(f'{output_prefix}.svg')
+    print(f'Saved {output_prefix}.png and {output_prefix}.svg')
+    plt.close(fig)
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Plot Google Benchmark JSON results.')
+    parser.add_argument('json_path', help='Path to benchmark JSON file')
+    parser.add_argument('output_prefix', nargs='?', default='report',
+                        help='Output file prefix (default: report)')
+    parser.add_argument('--min-n', type=int, default=None,
+                        help='Minimum parameter value to include (inclusive)')
+    parser.add_argument('--max-n', type=int, default=None,
+                        help='Maximum parameter value to include (inclusive)')
+    args = parser.parse_args()
+
+    data = load_benchmark_json(args.json_path)
+
+    # Timing plot
+    time_series = extract_series(data, metric='time',
+                                 min_n=args.min_n, max_n=args.max_n)
+    if time_series:
+        plot_series(time_series, args.output_prefix,
+                    'Time per iteration (ns)', 'Time')
+
+    # Counter plot
+    counter_series = extract_series(data, metric='counter',
+                                    min_n=args.min_n, max_n=args.max_n)
+    if counter_series:
+        # Counter name for labels: first numeric non-standard field found.
+        counter_name = 'Counter'
+        known = ('name', 'run_name', 'run_type', 'repetitions', 'threads',
+                 'repetition_index', 'iterations', 'real_time', 'cpu_time',
+                 'time_unit', 'items_per_second', 'aggregate_name',
+                 'aggregate_unit', 'family_index',
+                 'per_family_instance_index')
+        for bench in data.get('benchmarks', []):
+            if bench.get('run_type', 'iteration') != 'iteration':
+                continue
+            extra = [k for k, v in bench.items()
+                     if k not in known and type(v) in (int, float)]
+            if extra:  # keep scanning entries that carry no counter at all
+                counter_name = extra[0]
+                break
+        plot_series(counter_series, f'{args.output_prefix}_{counter_name.lower().replace("-", "_")}',
+                    f'{counter_name} per iteration', counter_name)
+
+if __name__ == '__main__':
+    main()
diff --git a/agentic/cpp/skills/diagnose-segfault/SKILL.md b/agentic/cpp/skills/diagnose-segfault/SKILL.md
new file mode 100644
index 0000000..91c8950
--- /dev/null
+++ b/agentic/cpp/skills/diagnose-segfault/SKILL.md
@@ -0,0 +1,195 @@
+---
+name: diagnose-segfault
+description: Diagnose C++ crashes and memory-safety errors with AddressSanitizer, GDB, and core dumps. Use when a C++ binary crashes with SIGSEGV, SIGABRT, heap-buffer-overflow, use-after-free, stack-buffer-overflow, double-free, suspected memory corruption, or an available core file.
+---
+
+# C++ Segfault and Memory Error Diagnosis
+
+Use this skill to find the first bad access or corrupting operation, not just
+the frame where the process finally crashed.
+
+## When To Use
+
+- A C++ binary crashes with `Segmentation fault`, `SIGSEGV`, or `SIGABRT`.
+- AddressSanitizer reports `ERROR: AddressSanitizer:`.
+- A test reports heap-buffer-overflow, stack-buffer-overflow, use-after-free,
+  double-free, global-buffer-overflow, or similar memory-safety failures.
+- A core dump exists and the user wants root-cause analysis.
+- Memory corruption is suspected but the immediate failure is ambiguous.
+
+For repository-specific binary names, CMake options, or known reproducer
+patterns, also read `agentic/local/cpp/skills/diagnose-segfault/EXAMPLES.md`
+when present.
+
+## Workflow 1: ASan First
+
+Prefer AddressSanitizer when the issue is reproducible. It usually reports the
+bad access with file and line information.
+
+### Build With ASan
+
+For CMake projects, first check whether the repository already has an ASan
+preset or option. If not, configure a dedicated debug build:
+
+```bash
+cmake -B build/asan -DCMAKE_BUILD_TYPE=Debug -DENABLE_ADDRESS_SANITIZER=ON
+cmake --build build/asan -j
+```
+
+For non-CMake builds, compile and link with:
+
+```bash
+-fsanitize=address -fno-omit-frame-pointer -g -O1
+```
+
+Use `-O0` instead of `-O1` when you expect to inspect many local variables in
+GDB.
+
+### Run The Minimal Reproducer
+
+Run the specific binary, test case, or input that triggers the crash. For Google
+Test binaries, prefer a narrow filter:
+
+```bash
+./build/asan/unittests --gtest_filter="SuiteName.TestName"
+```
+
+### Read The ASan Report
+
+Focus on:
+
+| Report section | Meaning |
+|---|---|
+| `ERROR: AddressSanitizer: <type>` | Error class |
+| `READ/WRITE of size N` | Access direction and size |
+| First user-code frame | Exact bad access |
+| Allocation/deallocation stack | Object lifetime and ownership |
+| Shadow-byte legend | Boundary or lifetime category |
+| `SUMMARY:` | One-line location summary |
+
+Useful options:
+
+```bash
+ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=1
+ASAN_OPTIONS=halt_on_error=0
+ASAN_OPTIONS=print_stats=1
+```
+
+Disable leak detection while diagnosing a crash if leak noise hides the primary
+failure:
+
+```bash
+ASAN_OPTIONS=detect_leaks=0 ./build/asan/unittests
+```
+
+## Workflow 2: GDB Live Debugging
+
+Use GDB when ASan is unavailable, when the crash is not a direct memory-safety
+violation, or when variable inspection is needed.
+
+Build with debug symbols:
+
+```bash
+cmake -B build/debug -DCMAKE_BUILD_TYPE=Debug
+cmake --build build/debug -j
+```
+
+Run under GDB:
+
+```bash
+gdb --args <binary> [arguments...]
+```
+
+Core commands:
+
+```gdb
+run
+bt full
+info registers
+info locals
+info args
+frame N
+list
+print <expr>
+thread apply all bt
+```
+
+Make C++ values easier to inspect:
+
+```gdb
+set print pretty on
+set print object on
+set pagination off
+```
+
+## Workflow 3: ASan Under GDB
+
+Use this when ASan points at a bad access but the pointer or lifetime corruption
+comes from an earlier frame.
+
+```bash
+gdb --args <asan-binary> [arguments...]
+```
+
+Break on ASan reporting or abort:
+
+```gdb
+break __asan::ReportGenericError
+catch signal SIGABRT
+run
+bt full
+```
+
+Then inspect the last user-code frames before ASan internals.
+
+## Workflow 4: Core Dump Analysis
+
+Use when the crash already happened or reproduction is expensive.
+
+Enable core dumps for future runs if needed:
+
+```bash
+ulimit -c unlimited
+```
+
+Analyze:
+
+```bash
+gdb <binary> <core-file>
+```
+
+Useful commands:
+
+```gdb
+bt full
+info threads
+thread apply all bt
+frame N
+info locals
+info args
+print <expr>
+```
+
+## Common ASan Errors
+
+| Error type | Typical cause |
+|---|---|
+| `heap-buffer-overflow` | Read or write past heap allocation bounds |
+| `stack-buffer-overflow` | Read or write past a local stack object |
+| `global-buffer-overflow` | Read or write past global/static storage |
+| `heap-use-after-free` | Access after `delete`, `free`, or container invalidation |
+| `stack-use-after-return` | Pointer/reference to a returned stack frame |
+| `double-free` | Object released twice |
+| `alloc-dealloc-mismatch` | Mixed allocation APIs, such as `new[]` with `free` |
+
+## Best Practices
+
+1. Build with `-g`; reports without symbols are often not actionable.
+2. Prefer the smallest reproducer over full-suite runs.
+3. Rebuild after toggling sanitizer or debug options.
+4. Treat the first ASan error as primary; later errors are often fallout.
+5. Check container iterator/reference invalidation around the reported object.
+6. Validate the fix with the same reproducer under ASan before running broader
+   tests.
+7. If ASan is too slow for a large input, use GDB on the same input or reduce
+   the input while preserving the crash.
diff --git a/include/pixie/bits.h b/include/pixie/bits.h
index 7d26597..f1250eb 100644
--- a/include/pixie/bits.h
+++ b/include/pixie/bits.h
@@ -2,6 +2,7 @@
 
 #include <immintrin.h>
 
+#include <algorithm>
 #include <bit>
 #include <cstddef>
 #include <cstdint>
@@ -805,8 +806,46 @@ static inline const __m256i excess_lut_bit2 = _mm256_set1_epi8(4);
 static inline const __m256i excess_lut_bit3 = _mm256_set1_epi8(8);
 static inline const __m128i excess_lut_nibble_mask = _mm_set1_epi8(0x0F);
 // clang-format on
+
+static inline __m256i excess_bit_masks_16x_i16() noexcept {
+  // Lane i holds the single-bit mask 1 << i (lane 15 is the sign bit).
+  return _mm256_set_epi16(static_cast<int16_t>(0x8000), 0x4000, 0x2000, 0x1000,
+                          0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040,
+                          0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
+}
+
+static inline __m256i excess_prefix_sum_16x_i16(__m256i v) noexcept {
+  // Inclusive prefix sums of sixteen int16 lanes: out[i] = v[0] + ... + v[i].
+  // Log-step scan; the byte shifts stay inside each 128-bit lane, so this
+  // first yields two independent 8-element prefix sums.
+  __m256i sums = _mm256_add_epi16(v, _mm256_slli_si256(v, 2));
+  sums = _mm256_add_epi16(sums, _mm256_slli_si256(sums, 4));
+  sums = _mm256_add_epi16(sums, _mm256_slli_si256(sums, 8));
+
+  // Propagate the total of the low lane (its last element) into the high lane.
+  const __m128i low = _mm256_extracti128_si256(sums, 0);
+  const __m128i high = _mm256_extracti128_si256(sums, 1);
+  const int16_t low_total = static_cast<int16_t>(_mm_extract_epi16(low, 7));
+  const __m128i high_fixed = _mm_add_epi16(high, _mm_set1_epi16(low_total));
+
+  const __m256i merged = _mm256_castsi128_si256(low);
+  return _mm256_inserti128_si256(merged, high_fixed, 1);
+}
 #endif
 
+/**
+ * @brief Minimum prefix excess in a 128-bit bitstring range.
+ * @details Prefix positions are offsets in `[0, 128]`; position 0 is the
+ * empty prefix and position `k` is the excess after consuming the first `k`
+ * bits. The query range `[left, right]` is inclusive. Ties return the first
+ * offset attaining the minimum. A range with `left > right` yields the default
+ * `{0, 128}`; offset 128 is also a valid answer, so it is not a unique flag.
+ */
+struct ExcessMin128Result {
+  int min_excess = 0;   // minimum prefix excess over the queried range
+  size_t offset = 128;  // prefix position of the first minimum
+};
+
 /**
  * @brief Find every prefix whose excess equals target_x in a 128-bit bitstring.
  *
@@ -935,6 +974,82 @@ static inline int prefix_excess_128(const uint64_t* s,
   return 2 * ones - static_cast<int>(end_offset);
 }
 
+/**
+ * @brief Return the minimum prefix excess and first attaining offset.
+ * @param s 2 little-endian uint64_t words (bit 0 of s[0] is the first bit).
+ * @param left First prefix position to consider, inclusive (clamped to 128).
+ * @param right Last prefix position to consider, inclusive (clamped to 128).
+ */
+static inline ExcessMin128Result excess_min_128(const uint64_t* s,
+                                                size_t left,
+                                                size_t right) noexcept {
+  if (left > right) {
+    return {};  // inverted range: default result {0, 128}
+  }
+  left = std::min<size_t>(left, 128);
+  right = std::min<size_t>(right, 128);
+
+  int best = prefix_excess_128(s, left);  // value at `left` is the baseline
+  size_t best_offset = left;
+  if (left == right) {
+    return {best, best_offset};
+  }
+
+#ifdef PIXIE_AVX2_SUPPORT
+  static const __m256i masks = excess_bit_masks_16x_i16();
+  static const __m256i zero = _mm256_setzero_si256();
+  static const __m256i pos = _mm256_set1_epi16(1);
+  static const __m256i neg = _mm256_set1_epi16(-1);
+
+  int carry = 0;  // excess contributed by all earlier chunks
+  alignas(32) int16_t prefix_values[16];
+  for (size_t chunk = 0; chunk < 8; ++chunk) {  // 8 chunks of 16 bits
+    const size_t chunk_bit = chunk * 16;
+    const uint16_t bits =
+        chunk < 4
+            ? static_cast<uint16_t>((s[0] >> (chunk * 16)) & 0xFFFFu)
+            : static_cast<uint16_t>((s[1] >> ((chunk - 4) * 16)) & 0xFFFFu);
+    const int delta = 2 * static_cast<int>(std::popcount(bits)) - 16;
+
+    if (chunk_bit + 1 <= right && chunk_bit + 16 >= left) {  // overlaps range
+      const __m256i selected = _mm256_and_si256(
+          _mm256_set1_epi16(static_cast<int16_t>(bits)), masks);
+      const __m256i is_zero = _mm256_cmpeq_epi16(selected, zero);
+      const __m256i steps = _mm256_blendv_epi8(pos, neg, is_zero);  // +1 or -1
+      const __m256i pref =
+          _mm256_add_epi16(excess_prefix_sum_16x_i16(steps),
+                           _mm256_set1_epi16(static_cast<int16_t>(carry)));
+      _mm256_store_si256(reinterpret_cast<__m256i*>(prefix_values), pref);
+
+      for (size_t lane = 0; lane < 16; ++lane) {
+        const size_t offset = chunk_bit + lane + 1;  // prefix position
+        if (offset < left || offset > right) {
+          continue;
+        }
+        const int value = prefix_values[lane];
+        if (value < best) {  // strict `<` keeps the first minimum
+          best = value;
+          best_offset = offset;
+        }
+      }
+    }
+    carry += delta;  // running excess at the start of the next chunk
+  }
+#else
+  int current = 0;  // excess of the prefix consumed so far
+  for (size_t bit = 0; bit < right; ++bit) {
+    current += ((s[bit >> 6] >> (bit & 63)) & 1ull) != 0 ? 1 : -1;
+    const size_t offset = bit + 1;
+    if (offset >= left && current < best) {  // strict `<`: first minimum wins
+      best = current;
+      best_offset = offset;
+    }
+  }
+#endif
+
+  return {best, best_offset};
+}
+
 /**
  * @brief Find the first prefix reaching target_x in a 128-bit bitstring.
  *
diff --git a/include/pixie/rmq.h b/include/pixie/rmq.h
new file mode 100644
index 0000000..452dfbe
--- /dev/null
+++ b/include/pixie/rmq.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <pixie/rmq/bp_plus_minus_one_rmq.h>
+#include <pixie/rmq/cartesian_tree_rmq.h>
+#include <pixie/rmq/rmq_base.h>
+#include <pixie/rmq/segment_tree.h>
+#include <pixie/rmq/sparse_table.h>
diff --git a/include/pixie/rmq/bp_plus_minus_one_rmq.h b/include/pixie/rmq/bp_plus_minus_one_rmq.h
new file mode 100644
index 0000000..57af0e1
--- /dev/null
+++ b/include/pixie/rmq/bp_plus_minus_one_rmq.h
@@ -0,0 +1,258 @@
+#pragma once
+
+#include <pixie/bits.h>
+#include <pixie/rmq/sparse_table.h>
+
+#include <algorithm>
+#include <array>
+#include <bit>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <span>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace pixie::rmq {
+
+/**
+ * @brief FCB-style RMQ backend for arrays with adjacent differences ±1.
+ *
+ * @details The indexed depth sequence is represented by BP deltas: bit 1 means
+ * the next depth is current + 1, and bit 0 means current - 1. A sequence with
+ * @p depth_count depth positions has @p depth_count - 1 delta bits. Blocks
+ * match the 128-bit excess primitives in `bits.h`; only the absolute minimum
+ * value of each block is stored, and positions are recovered by rescanning the
+ * selected block.
+ *
+ * @tparam Index Unsigned integer type used for stored positions.
+ * @tparam BlockSize Number of depth positions per microblock.
+ */
+template <class Index = std::size_t, std::size_t BlockSize = 128>
+class BpPlusMinusOneRmq {
+ public:
+  static_assert(BlockSize == 128);
+
+  static constexpr std::size_t npos = std::numeric_limits<std::size_t>::max();
+  static constexpr Index invalid_index = std::numeric_limits<Index>::max();
+
+  BpPlusMinusOneRmq() = default;
+
+  BpPlusMinusOneRmq(std::span<const std::uint64_t> bits,
+                    std::size_t depth_count)
+      : input_bits_(bits), depth_count_(depth_count) {
+    build();
+  }
+
+  BpPlusMinusOneRmq(const BpPlusMinusOneRmq& other)
+      : input_bits_(other.input_bits_),
+        depth_count_(other.depth_count_),
+        block_min_values_(other.block_min_values_) {
+    reset_macro_rmq();  // rebuild over this object's own block minima
+  }
+
+  BpPlusMinusOneRmq& operator=(const BpPlusMinusOneRmq& other) {
+    if (this == &other) {
+      return *this;
+    }
+    input_bits_ = other.input_bits_;
+    depth_count_ = other.depth_count_;
+    block_min_values_ = other.block_min_values_;
+    reset_macro_rmq();
+    return *this;
+  }
+
+  BpPlusMinusOneRmq(BpPlusMinusOneRmq&& other) noexcept
+      : input_bits_(std::exchange(other.input_bits_, {})),
+        depth_count_(std::exchange(other.depth_count_, 0)),  // empties `other`
+        block_min_values_(std::move(other.block_min_values_)) {
+    reset_macro_rmq();  // NOTE(review): may allocate inside a noexcept move
+  }
+
+  BpPlusMinusOneRmq& operator=(BpPlusMinusOneRmq&& other) noexcept {
+    if (this == &other) {
+      return *this;
+    }
+    input_bits_ = std::exchange(other.input_bits_, {});
+    depth_count_ = std::exchange(other.depth_count_, 0);  // `other` is empty
+    block_min_values_ = std::move(other.block_min_values_);
+    reset_macro_rmq();
+    return *this;
+  }
+
+  std::size_t size() const { return depth_count_; }
+
+  bool empty() const { return depth_count_ == 0; }
+
+  std::size_t arg_min(std::size_t left, std::size_t right) const {
+    if (left > right || right >= depth_count_) {
+      return npos;
+    }
+
+    const std::size_t left_block = left / BlockSize;
+    const std::size_t right_block = right / BlockSize;
+    if (left_block == right_block) {
+      return scan_block_range(left_block, left % BlockSize, right % BlockSize)
+          .position;
+    }
+
+    Candidate answer = scan_block_range(left_block, left % BlockSize,
+                                        block_size(left_block) - 1);
+    answer =
+        better(answer, scan_block_range(right_block, 0, right % BlockSize));
+
+    if (left_block + 1 < right_block) {  // whole blocks strictly in between
+      const std::size_t block_position =
+          macro_rmq_.arg_min(left_block + 1, right_block - 1);
+      if (block_position !=
+          SparseTable<std::int64_t, std::less<std::int64_t>, Index>::npos) {
+        answer = better(answer, scan_full_block(block_position));
+      }
+    }
+
+    return answer.position;
+  }
+
+ private:
+  struct Candidate {
+    std::size_t position = npos;
+    std::int64_t value = std::numeric_limits<std::int64_t>::max();
+  };
+
+  std::size_t block_size(std::size_t block) const {
+    const std::size_t begin = block * BlockSize;
+    return std::min(BlockSize, depth_count_ - begin);
+  }
+
+  Candidate better(Candidate left, Candidate right) const {
+    if (left.position == npos) {
+      return right;
+    }
+    if (right.position == npos) {
+      return left;
+    }
+    if (right.value < left.value) {
+      return right;
+    }
+    if (left.value < right.value) {
+      return left;
+    }
+    return right.position < left.position ? right : left;  // tie: leftmost
+  }
+
+  void build() {
+    block_min_values_.clear();
+    macro_rmq_ = SparseTable<std::int64_t, std::less<std::int64_t>, Index>();
+
+    if (depth_count_ == 0) {
+      return;
+    }
+    if (depth_count_ > static_cast<std::size_t>(invalid_index)) {
+      throw std::length_error("RMQ ±1 index type is too small");
+    }
+    if (input_bits_.size() < (depth_count_ - 1 + 63) / 64) {
+      throw std::invalid_argument("RMQ ±1 bit span is too small");
+    }
+
+    const std::size_t block_count = (depth_count_ + BlockSize - 1) / BlockSize;
+    block_min_values_.reserve(block_count);
+
+    std::int64_t base_depth = 0;  // depth at the first position of the block
+    for (std::size_t block = 0; block < block_count; ++block) {
+      const std::size_t begin = block * BlockSize;
+      const std::size_t size = std::min(BlockSize, depth_count_ - begin);
+      std::int64_t min_depth = base_depth;
+      std::int64_t current_depth = base_depth;
+      for (std::size_t offset = 1; offset < size; ++offset) {
+        const std::size_t delta_position = begin + offset - 1;
+        const bool up = bit(delta_position);
+        current_depth += up ? 1 : -1;
+        if (current_depth < min_depth) {
+          min_depth = current_depth;
+        }
+      }
+
+      block_min_values_.push_back(min_depth);
+      if (block + 1 < block_count) {
+        base_depth += block_excess(begin, next_block_delta_count(begin));
+      }
+    }
+
+    reset_macro_rmq();
+  }
+
+  bool bit(std::size_t position) const {
+    return ((input_bits_[position >> 6] >> (position & 63)) & 1u) != 0;
+  }
+
+  std::uint64_t word_or_zero(std::size_t word) const {
+    return word < input_bits_.size() ? input_bits_[word] : 0;
+  }
+
+  std::array<std::uint64_t, 2> block_bits(std::size_t block) const {
+    const std::size_t first_word = block * (BlockSize / 64);
+    return {word_or_zero(first_word), word_or_zero(first_word + 1)};
+  }
+
+  std::size_t next_block_delta_count(std::size_t begin) const {
+    const std::size_t next_begin = begin + BlockSize;
+    return std::min(next_begin, depth_count_ - 1) - begin;
+  }
+
+  std::int64_t block_excess(std::size_t begin, std::size_t delta_count) const {
+    std::int64_t excess = 0;
+    for (std::size_t i = 0; i < delta_count; ++i) {
+      excess += bit(begin + i) ? 1 : -1;
+    }
+    return excess;
+  }
+
+  std::int64_t block_base_depth(std::size_t block,
+                                const std::array<std::uint64_t, 2>& bits,
+                                std::size_t size) const {
+    const ExcessMin128Result full_min =
+        excess_min_128(bits.data(), 0, size - 1);
+    return block_min_values_[block] - full_min.min_excess;
+  }
+
+  Candidate scan_block_range(std::size_t block,
+                             std::size_t left_offset,
+                             std::size_t right_offset) const {
+    const std::size_t begin = block * BlockSize;
+    const std::size_t size = block_size(block);
+    right_offset = std::min(right_offset, size - 1);
+    const auto bits = block_bits(block);
+    const std::int64_t base_depth = block_base_depth(block, bits, size);
+    const ExcessMin128Result result =
+        excess_min_128(bits.data(), left_offset, right_offset);
+    if (result.offset == npos || result.offset >= size) {
+      return {};
+    }
+    return {begin + result.offset, base_depth + result.min_excess};
+  }
+
+  Candidate scan_full_block(std::size_t block) const {
+    const std::size_t begin = block * BlockSize;
+    const std::size_t size = block_size(block);
+    const auto bits = block_bits(block);
+    const ExcessMin128Result result = excess_min_128(bits.data(), 0, size - 1);
+    if (result.offset == npos || result.offset >= size) {
+      return {};
+    }
+    return {begin + result.offset, block_min_values_[block]};
+  }
+
+  void reset_macro_rmq() {
+    macro_rmq_ = SparseTable<std::int64_t, std::less<std::int64_t>, Index>(
+        std::span<const std::int64_t>(block_min_values_));
+  }
+
+  std::span<const std::uint64_t> input_bits_;  // not owned
+  std::size_t depth_count_ = 0;
+  std::vector<std::int64_t> block_min_values_;  // absolute minimum per block
+  SparseTable<std::int64_t, std::less<std::int64_t>, Index> macro_rmq_;
+};
+
+}  // namespace pixie::rmq
diff --git a/include/pixie/rmq/cartesian_tree_rmq.h b/include/pixie/rmq/cartesian_tree_rmq.h
new file mode 100644
index 0000000..db7d315
--- /dev/null
+++ b/include/pixie/rmq/cartesian_tree_rmq.h
@@ -0,0 +1,223 @@
+#pragma once
+
+#include <pixie/rmq/bp_plus_minus_one_rmq.h>
+#include <pixie/rmq/rmq_base.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <span>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace pixie::rmq {
+
+/**
+ * @brief General RMQ via Cartesian-tree reduction to ±1 RMQ.
+ *
+ * @details Builds a stable min Cartesian tree over the indexed values, takes
+ * its Euler tour, and answers original RMQ queries as LCA queries implemented
+ * by RMQ over the Euler depth sequence. The indexed values are not owned and
+ * must outlive this object.
+ *
+ * @tparam T Value type in the indexed array.
+ * @tparam Compare Strict weak ordering used to choose minima.
+ * @tparam Index Unsigned integer type used for stored positions.
+ */
+template <class T, class Compare = std::less<T>, class Index = std::size_t>
+class CartesianTreeRmq
+    : public RmqBase<CartesianTreeRmq<T, Compare, Index>, T> {
+ public:
+  static constexpr std::size_t npos =
+      RmqBase<CartesianTreeRmq<T, Compare, Index>, T>::npos;
+  static constexpr Index invalid_index = std::numeric_limits<Index>::max();
+
+  CartesianTreeRmq() = default;
+
+  explicit CartesianTreeRmq(std::span<const T> values,
+                            Compare compare = Compare())
+      : values_(values), compare_(compare) {
+    build();
+  }
+
+  CartesianTreeRmq(const CartesianTreeRmq& other)
+      : values_(other.values_),
+        compare_(other.compare_),
+        left_child_(other.left_child_),
+        right_child_(other.right_child_),
+        first_occurrence_(other.first_occurrence_),
+        euler_nodes_(other.euler_nodes_),
+        depths_(other.depths_),
+        euler_delta_bits_(other.euler_delta_bits_) {
+    reset_depth_rmq();  // depth_rmq_ must view this object's own delta bits
+  }
+
+  CartesianTreeRmq& operator=(const CartesianTreeRmq& other) {
+    if (this == &other) {
+      return *this;
+    }
+    values_ = other.values_;
+    compare_ = other.compare_;
+    left_child_ = other.left_child_;
+    right_child_ = other.right_child_;
+    first_occurrence_ = other.first_occurrence_;
+    euler_nodes_ = other.euler_nodes_;
+    depths_ = other.depths_;
+    euler_delta_bits_ = other.euler_delta_bits_;
+    reset_depth_rmq();
+    return *this;
+  }
+
+  CartesianTreeRmq(CartesianTreeRmq&& other) noexcept
+      : values_(std::exchange(other.values_, {})),  // `other` becomes empty
+        compare_(std::move(other.compare_)),
+        left_child_(std::move(other.left_child_)),
+        right_child_(std::move(other.right_child_)),
+        first_occurrence_(std::move(other.first_occurrence_)),
+        euler_nodes_(std::move(other.euler_nodes_)),
+        depths_(std::move(other.depths_)),
+        euler_delta_bits_(std::move(other.euler_delta_bits_)) {
+    reset_depth_rmq();  // NOTE(review): allocates inside a noexcept move
+  }
+
+  CartesianTreeRmq& operator=(CartesianTreeRmq&& other) noexcept {
+    if (this == &other) {
+      return *this;
+    }
+    values_ = std::exchange(other.values_, {});  // `other` becomes empty
+    compare_ = std::move(other.compare_);
+    left_child_ = std::move(other.left_child_);
+    right_child_ = std::move(other.right_child_);
+    first_occurrence_ = std::move(other.first_occurrence_);
+    euler_nodes_ = std::move(other.euler_nodes_);
+    depths_ = std::move(other.depths_);
+    euler_delta_bits_ = std::move(other.euler_delta_bits_);
+    reset_depth_rmq();
+    return *this;
+  }
+
+  std::size_t size_impl() const { return values_.size(); }
+
+  T value_at_impl(std::size_t position) const { return values_[position]; }
+
+  std::size_t arg_min_impl(std::size_t left, std::size_t right) const {
+    if (left > right || right >= values_.size()) {
+      return npos;
+    }
+    std::size_t first = first_occurrence_[left];
+    std::size_t second = first_occurrence_[right];
+    if (first > second) {
+      std::swap(first, second);
+    }
+    const std::size_t euler_position = depth_rmq_.arg_min(first, second);
+    return euler_position == npos ? npos : euler_nodes_[euler_position];
+  }
+
+  std::span<const Index> euler_nodes() const { return euler_nodes_; }
+
+  std::span<const std::int64_t> euler_depths() const { return depths_; }
+
+ private:
+  void build() {
+    // Called only from the value constructor, on default-initialized (empty)
+    // derived members, so there is no previous state to clear.
+    if (values_.empty()) {
+      return;
+    }
+    if (values_.size() > static_cast<std::size_t>(invalid_index)) {
+      throw std::length_error("Cartesian RMQ index type is too small");
+    }
+
+    left_child_.assign(values_.size(), invalid_index);
+    right_child_.assign(values_.size(), invalid_index);
+
+    const std::size_t root = build_cartesian_tree();
+    first_occurrence_.assign(values_.size(), invalid_index);
+    euler_nodes_.reserve(2 * values_.size() - 1);
+    depths_.reserve(2 * values_.size() - 1);
+    euler_tour(root, 0);
+    build_euler_delta_bits();
+    reset_depth_rmq();
+  }
+
+  std::size_t build_cartesian_tree() {
+    std::vector<Index> stack;  // rightmost path of the tree built so far
+    stack.reserve(values_.size());
+
+    for (std::size_t i = 0; i < values_.size(); ++i) {
+      Index last = invalid_index;
+      while (!stack.empty() && compare_(values_[i], values_[stack.back()])) {
+        last = stack.back();
+        stack.pop_back();
+      }
+      if (last != invalid_index) {
+        left_child_[i] = last;
+      }
+      if (!stack.empty()) {
+        right_child_[stack.back()] = static_cast<Index>(i);
+      }
+      stack.push_back(static_cast<Index>(i));
+    }
+
+    return stack.front();
+  }
+
+  void euler_tour(std::size_t node, std::int64_t depth) {
+    // Explicit stack instead of recursion: monotone input yields a path-shaped
+    // tree of height n, which would overflow the call stack.
+    std::vector<std::pair<Index, int>> path{{static_cast<Index>(node), 0}};
+    append_euler(node, depth);
+    while (!path.empty()) {
+      auto& [current, handled] = path.back();  // handled = children taken
+      if (handled < 2) {
+        const Index child =
+            handled++ == 0 ? left_child_[current] : right_child_[current];
+        if (child != invalid_index) {
+          append_euler(child, depth + static_cast<std::int64_t>(path.size()));
+          path.emplace_back(child, 0);
+        }
+      } else if (path.pop_back(); !path.empty()) {  // back in the parent
+        append_euler(path.back().first,
+                     depth + static_cast<std::int64_t>(path.size()) - 1);
+      }
+    }
+  }
+
+  void append_euler(std::size_t node, std::int64_t depth) {
+    if (first_occurrence_[node] == invalid_index) {
+      first_occurrence_[node] = static_cast<Index>(euler_nodes_.size());
+    }
+    euler_nodes_.push_back(static_cast<Index>(node));
+    depths_.push_back(depth);
+  }
+
+  void reset_depth_rmq() {
+    depth_rmq_ = BpPlusMinusOneRmq<Index>(
+        std::span<const std::uint64_t>(euler_delta_bits_), depths_.size());
+  }
+
+  void build_euler_delta_bits() {
+    euler_delta_bits_.assign((depths_.size() - 1 + 63) / 64, 0);
+    for (std::size_t i = 1; i < depths_.size(); ++i) {
+      const std::int64_t delta = depths_[i] - depths_[i - 1];
+      if (delta == 1) {  // bit i-1 set means depth rises from i-1 to i
+        euler_delta_bits_[(i - 1) >> 6] |= std::uint64_t{1} << ((i - 1) & 63);
+      }
+    }
+  }
+
+  std::span<const T> values_;  // not owned
+  Compare compare_;
+  std::vector<Index> left_child_;
+  std::vector<Index> right_child_;
+  std::vector<Index> first_occurrence_;
+  std::vector<Index> euler_nodes_;
+  std::vector<std::int64_t> depths_;
+  std::vector<std::uint64_t> euler_delta_bits_;
+  BpPlusMinusOneRmq<Index> depth_rmq_;
+};
+
+}  // namespace pixie::rmq
diff --git a/include/pixie/rmq/rmq_base.h b/include/pixie/rmq/rmq_base.h
new file mode 100644
index 0000000..cc919da
--- /dev/null
+++ b/include/pixie/rmq/rmq_base.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <cstddef>
+#include <limits>
+
+namespace pixie::rmq {
+
+/**
+ * @brief CRTP front end shared by the static range-minimum-query indexes.
+ *
+ * The derived class @p Impl supplies `size_impl()`, `arg_min_impl(left,
+ * right)` and `value_at_impl(position)`; this base forwards to them.
+ * Implementations are non-owning indexes over an external random-access
+ * array. Ranges are inclusive and zero-based, the first position attaining
+ * the minimum is reported, and invalid ranges yield `npos`.
+ */
+template <class Impl, class Value>
+class RmqBase {
+ public:
+  /**
+   * @brief Sentinel position meaning "no valid answer".
+   */
+  static constexpr std::size_t npos = std::numeric_limits<std::size_t>::max();
+
+  /**
+   * @brief Number of values covered by the index.
+   */
+  std::size_t size() const { return derived().size_impl(); }
+
+  /**
+   * @brief True when the index covers no values.
+   */
+  bool empty() const { return size() == 0; }
+
+  /**
+   * @brief Position of the first minimum in [@p left, @p right].
+   */
+  std::size_t arg_min(std::size_t left, std::size_t right) const {
+    return derived().arg_min_impl(left, right);
+  }
+
+  /**
+   * @brief Minimum value in [@p left, @p right].
+   * @details When `arg_min` reports `npos`, a default-constructed value is
+   * returned instead.
+   */
+  Value range_min(std::size_t left, std::size_t right) const {
+    const std::size_t where = arg_min(left, right);
+    if (where != npos) {
+      return derived().value_at_impl(where);
+    }
+    return Value{};
+  }
+
+ private:
+  /// Downcast to the concrete implementation (CRTP).
+  const Impl& derived() const { return static_cast<const Impl&>(*this); }
+};
+
+}  // namespace pixie::rmq
diff --git a/include/pixie/rmq/segment_tree.h b/include/pixie/rmq/segment_tree.h
new file mode 100644
index 0000000..5e734a8
--- /dev/null
+++ b/include/pixie/rmq/segment_tree.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <pixie/rmq/rmq_base.h>
+
+#include <algorithm>
+#include <bit>
+#include <cstddef>
+#include <functional>
+#include <limits>
+#include <span>
+#include <stdexcept>
+#include <vector>
+
+namespace pixie::rmq {
+
+/**
+ * @brief Static RMQ baseline backed by an iterative (bottom-up) segment tree.
+ *
+ * @details Each node of a flat binary tree stores the position of the first
+ * minimum of its segment; leaves start at `bit_ceil(n)`. Query time is
+ * O(log n), build time is O(n), and storage is O(n) indices. The indexed
+ * values are referenced through a span, not copied, so they must outlive this
+ * object.
+ *
+ * @tparam T Value type in the indexed array.
+ * @tparam Compare Strict weak ordering used to choose minima.
+ * @tparam Index Unsigned integer type used for stored positions.
+ */
+template <class T, class Compare = std::less<T>, class Index = std::size_t>
+class SegmentTree : public RmqBase<SegmentTree<T, Compare, Index>, T> {
+  using Base = RmqBase<SegmentTree<T, Compare, Index>, T>;
+
+ public:
+  /// Position returned for invalid query ranges.
+  static constexpr std::size_t npos = Base::npos;
+  /// Marker stored in tree nodes that cover no value (padding leaves).
+  static constexpr Index invalid_index = std::numeric_limits<Index>::max();
+
+  SegmentTree() = default;
+
+  /**
+   * @brief Build the tree over @p values.
+   * @throws std::length_error if `values.size()` exceeds `invalid_index`.
+   */
+  explicit SegmentTree(std::span<const T> values, Compare compare = Compare())
+      : values_(values), compare_(compare) {
+    build();
+  }
+
+  std::size_t size_impl() const { return values_.size(); }
+
+  T value_at_impl(std::size_t position) const { return values_[position]; }
+
+  /**
+   * @brief Position of the first minimum in [@p left, @p right].
+   * @return `npos` when `left > right` or `right` is out of bounds.
+   */
+  std::size_t arg_min_impl(std::size_t left, std::size_t right) const {
+    if (right < left || values_.size() <= right) {
+      return npos;
+    }
+
+    // Walk both range ends from the leaves towards the root, folding in every
+    // node that lies fully inside the range.
+    std::size_t lo = left + leaf_base_;
+    std::size_t hi = right + leaf_base_;
+    std::size_t best = npos;
+    for (; lo <= hi; lo >>= 1, hi >>= 1) {
+      if ((lo & 1u) != 0) {
+        // `lo` is a right child: take it and step to the next subtree.
+        best = better(best, tree_[lo]);
+        ++lo;
+      }
+      if ((hi & 1u) == 0) {
+        // `hi` is a left child: take it and step to the previous subtree.
+        best = better(best, tree_[hi]);
+        if (hi == 0) {
+          break;
+        }
+        --hi;
+      }
+    }
+    return best;
+  }
+
+ private:
+  /**
+   * @brief Pick the preferred of two candidate positions.
+   * @details A candidate equal to `npos` or `invalid_index` loses to the
+   * other one. Equivalent values resolve to the smaller position.
+   */
+  std::size_t better(std::size_t left, std::size_t right) const {
+    const auto missing = [](std::size_t position) {
+      return position == npos || position == invalid_index;
+    };
+    if (missing(left)) {
+      return right;
+    }
+    if (missing(right)) {
+      return left;
+    }
+    if (compare_(values_[right], values_[left])) {
+      return right;
+    }
+    if (compare_(values_[left], values_[right])) {
+      return left;
+    }
+    return std::min(left, right);
+  }
+
+  /// (Re)build `tree_` and `leaf_base_` from `values_`.
+  void build() {
+    tree_.clear();
+    leaf_base_ = 0;
+    const std::size_t count = values_.size();
+    if (count == 0) {
+      return;
+    }
+    if (count > static_cast<std::size_t>(invalid_index)) {
+      throw std::length_error("RMQ segment tree index type is too small");
+    }
+
+    // Leaves occupy [leaf_base_, 2 * leaf_base_); unused ones stay invalid.
+    leaf_base_ = std::bit_ceil(count);
+    tree_.assign(2 * leaf_base_, invalid_index);
+    for (std::size_t position = 0; position < count; ++position) {
+      tree_[leaf_base_ + position] = static_cast<Index>(position);
+    }
+    // Internal nodes, bottom-up: node k combines children 2k and 2k + 1.
+    for (std::size_t node = leaf_base_ - 1; node >= 1; --node) {
+      const std::size_t first_child = node << 1;
+      tree_[node] = static_cast<Index>(
+          better(tree_[first_child], tree_[first_child | 1]));
+    }
+  }
+
+  std::span<const T> values_;
+  Compare compare_;
+  std::size_t leaf_base_ = 0;
+  std::vector<Index> tree_;
+};
+
+}  // namespace pixie::rmq
diff --git a/include/pixie/rmq/sparse_table.h b/include/pixie/rmq/sparse_table.h
new file mode 100644
index 0000000..d2ea6ab
--- /dev/null
+++ b/include/pixie/rmq/sparse_table.h
@@ -0,0 +1,104 @@
+#pragma once
+
+#include <pixie/rmq/rmq_base.h>
+
+#include <algorithm>
+#include <bit>
+#include <cstddef>
+#include <functional>
+#include <limits>
+#include <span>
+#include <stdexcept>
+#include <vector>
+
+namespace pixie::rmq {
+
+/**
+ * @brief Static RMQ baseline backed by a sparse table.
+ *
+ * @details Level `k` stores, for every start `i`, the position of the first
+ * minimum of the `2^k` values beginning at `i`. Query time is O(1), build
+ * time is O(n log n), and storage is O(n log n) indices. The indexed values
+ * are referenced through a span, not copied, so they must outlive this
+ * object.
+ *
+ * @tparam T Value type in the indexed array.
+ * @tparam Compare Strict weak ordering used to choose minima.
+ * @tparam Index Unsigned integer type used for stored positions.
+ */
+template <class T, class Compare = std::less<T>, class Index = std::size_t>
+class SparseTable : public RmqBase<SparseTable<T, Compare, Index>, T> {
+  using Base = RmqBase<SparseTable<T, Compare, Index>, T>;
+
+ public:
+  /// Position returned for invalid query ranges.
+  static constexpr std::size_t npos = Base::npos;
+  /// Largest value of `Index`; the array size may not exceed it.
+  static constexpr Index invalid_index = std::numeric_limits<Index>::max();
+
+  SparseTable() = default;
+
+  /**
+   * @brief Build the table over @p values.
+   * @throws std::length_error if `values.size()` exceeds `invalid_index`.
+   */
+  explicit SparseTable(std::span<const T> values, Compare compare = Compare())
+      : values_(values), compare_(compare) {
+    build();
+  }
+
+  std::size_t size_impl() const { return values_.size(); }
+
+  T value_at_impl(std::size_t position) const { return values_[position]; }
+
+  /**
+   * @brief Position of the first minimum in [@p left, @p right].
+   * @return `npos` when `left > right` or `right` is out of bounds.
+   */
+  std::size_t arg_min_impl(std::size_t left, std::size_t right) const {
+    if (right < left || values_.size() <= right) {
+      return npos;
+    }
+    // Cover the range with two (possibly overlapping) power-of-two windows:
+    // one anchored at `left`, the other ending at `right`.
+    const std::size_t level = std::bit_width(right - left + 1) - 1;
+    const std::size_t window = std::size_t{1} << level;
+    const std::vector<Index>& row = table_[level];
+    const std::size_t head = row[left];
+    const std::size_t tail = row[right + 1 - window];
+    return better(head, tail);
+  }
+
+ private:
+  /**
+   * @brief Pick the preferred of two candidate positions.
+   * @details An `npos` candidate loses to the other one. Equivalent values
+   * resolve to the smaller position.
+   */
+  std::size_t better(std::size_t left, std::size_t right) const {
+    if (left == npos) {
+      return right;
+    }
+    if (right == npos) {
+      return left;
+    }
+    if (compare_(values_[right], values_[left])) {
+      return right;
+    }
+    if (compare_(values_[left], values_[right])) {
+      return left;
+    }
+    return std::min(left, right);
+  }
+
+  /// (Re)build every level of `table_` from `values_`.
+  void build() {
+    table_.clear();
+    const std::size_t count = values_.size();
+    if (count == 0) {
+      return;
+    }
+    if (count > static_cast<std::size_t>(invalid_index)) {
+      throw std::length_error("RMQ sparse table index type is too small");
+    }
+
+    // Level 0: each window of length one is its own minimum.
+    table_.emplace_back(count);
+    for (std::size_t position = 0; position < count; ++position) {
+      table_[0][position] = static_cast<Index>(position);
+    }
+
+    // Level k joins two adjacent windows of level k - 1 (length `half` each).
+    for (std::size_t half = 1, window = 2; window <= count;
+         half = window, window <<= 1) {
+      table_.emplace_back(count - window + 1);
+      // Take the references only after the outer vector has grown.
+      std::vector<Index>& current = table_.back();
+      const std::vector<Index>& previous = table_[table_.size() - 2];
+      for (std::size_t start = 0; start < current.size(); ++start) {
+        current[start] = static_cast<Index>(
+            better(previous[start], previous[start + half]));
+      }
+    }
+  }
+
+  std::span<const T> values_;
+  Compare compare_;
+  std::vector<std::vector<Index>> table_;
+};
+
+}  // namespace pixie::rmq
diff --git a/src/benchmarks/bench_rmq.cpp b/src/benchmarks/bench_rmq.cpp
new file mode 100644
index 0000000..4067e7e
--- /dev/null
+++ b/src/benchmarks/bench_rmq.cpp
@@ -0,0 +1,188 @@
+#include <benchmark/benchmark.h>
+#include <pixie/rmq.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <random>
+#include <span>
+#include <utility>
+#include <vector>
+
+namespace {
+
+constexpr std::uint64_t kSeed = 42;
+constexpr std::size_t kQueryCount = 32768;
+using Index = std::size_t;
+
+// Benchmark input for the value-based RMQ structures: an array of values plus
+// a fixed set of pre-generated query ranges (see make_dataset).
+struct Dataset {
+  // Number of values in `values`.
+  std::size_t size = 0;
+  // Upper bound on the number of positions a generated query range may span.
+  std::size_t max_width = 0;
+  // The array the RMQ structures index.
+  std::vector<std::int64_t> values;
+  // Inclusive [left, right] query ranges, replayed cyclically by the benchmark.
+  std::vector<std::pair<std::size_t, std::size_t>> ranges;
+};
+
+// Benchmark input for the +/-1 RMQ: a depth sequence whose adjacent entries
+// differ by exactly one, its bit-packed step encoding, and pre-generated query
+// ranges (see make_depth_dataset).
+struct DepthDataset {
+  // Number of entries in `depths`.
+  std::size_t size = 0;
+  // Upper bound on the number of positions a generated query range may span.
+  std::size_t max_width = 0;
+  // The depth sequence itself.
+  std::vector<std::int64_t> depths;
+  // Bit i - 1 is set when depths[i] - depths[i - 1] == 1, clear otherwise.
+  std::vector<std::uint64_t> bits;
+  // Inclusive [left, right] query ranges, replayed cyclically by the benchmark.
+  std::vector<std::pair<std::size_t, std::size_t>> ranges;
+};
+
+// Builds a deterministic dataset of `size` random values together with
+// kQueryCount random inclusive query ranges spanning at most `max_width`
+// positions each. The generator is seeded from kSeed, `size` and `max_width`,
+// so equal arguments always produce equal data. Requires size > 0 and
+// max_width > 0.
+Dataset make_dataset(std::size_t size, std::size_t max_width) {
+  std::mt19937_64 rng(kSeed ^ (size * 0x9E3779B185EBCA87ull) ^
+                      (max_width * 0xBF58476D1CE4E5B9ull));
+
+  Dataset dataset;
+  dataset.size = size;
+  dataset.max_width = max_width;
+
+  // Values are drawn first so the query ranges see the same generator state
+  // on every run.
+  dataset.values.resize(size);
+  std::uniform_int_distribution<std::int64_t> value_dist(-1'000'000, 1'000'000);
+  for (std::int64_t& value : dataset.values) {
+    value = value_dist(rng);
+  }
+
+  dataset.ranges.resize(kQueryCount);
+  std::uniform_int_distribution<std::size_t> left_dist(0, size - 1);
+  for (std::pair<std::size_t, std::size_t>& range : dataset.ranges) {
+    const std::size_t begin = left_dist(rng);
+    // The range may neither exceed max_width nor run past the array end.
+    const std::size_t longest = std::min(max_width, size - begin);
+    std::uniform_int_distribution<std::size_t> width_dist(1, longest);
+    range.first = begin;
+    range.second = begin + width_dist(rng) - 1;
+  }
+  return dataset;
+}
+
+// Builds a deterministic +/-1 random walk of `size` depths (starting at 0),
+// its bit-packed step encoding, and kQueryCount random inclusive query ranges
+// spanning at most `max_width` positions each. Requires size > 0 and
+// max_width > 0.
+DepthDataset make_depth_dataset(std::size_t size, std::size_t max_width) {
+  std::mt19937_64 rng(kSeed ^ (size * 0xD6E8FEB86659FD93ull) ^
+                      (max_width * 0xA5A3564E27F88695ull));
+
+  DepthDataset dataset;
+  dataset.size = size;
+  dataset.max_width = max_width;
+
+  // One step bit per adjacent pair of depths; `(size + 62) / 64` is the same
+  // word count as `(size - 1 + 63) / 64`.
+  dataset.depths.resize(size);
+  dataset.bits.assign((size + 62) / 64, 0);
+  for (std::size_t bit = 0; bit + 1 < dataset.depths.size(); ++bit) {
+    const bool up = (rng() & 1u) != 0;
+    dataset.depths[bit + 1] = dataset.depths[bit] + (up ? 1 : -1);
+    if (up) {
+      dataset.bits[bit >> 6] |= std::uint64_t{1} << (bit & 63);
+    }
+  }
+
+  dataset.ranges.resize(kQueryCount);
+  std::uniform_int_distribution<std::size_t> left_dist(0, size - 1);
+  for (std::pair<std::size_t, std::size_t>& range : dataset.ranges) {
+    const std::size_t begin = left_dist(rng);
+    // The range may neither exceed max_width nor run past the sequence end.
+    const std::size_t longest = std::min(max_width, size - begin);
+    std::uniform_int_distribution<std::size_t> width_dist(1, longest);
+    range.first = begin;
+    range.second = begin + width_dist(rng) - 1;
+  }
+  return dataset;
+}
+
+// Benchmark body for RMQ structures built from a span of int64 values.
+// state.range(0) is the array size and state.range(1) the maximum query
+// width; each iteration answers one pre-generated query, cycling through the
+// query set.
+template <class Rmq>
+void run_queries(benchmark::State& state) {
+  const auto size = static_cast<std::size_t>(state.range(0));
+  const auto max_width = static_cast<std::size_t>(state.range(1));
+  const Dataset dataset = make_dataset(size, max_width);
+  const std::span<const std::int64_t> values(dataset.values);
+  const Rmq rmq(values);
+
+  std::size_t next_query = 0;
+  for (auto _ : state) {
+    const auto [left, right] =
+        dataset.ranges[next_query % dataset.ranges.size()];
+    ++next_query;
+    std::size_t answer = rmq.arg_min(left, right);
+    // Keep the query from being optimized away.
+    benchmark::DoNotOptimize(answer);
+  }
+
+  state.counters["N"] = static_cast<double>(size);
+  state.counters["max_width"] = static_cast<double>(max_width);
+  state.counters["index_bytes"] = static_cast<double>(sizeof(Index));
+}
+
+// Benchmark body for RMQ structures built from a bit-packed +/-1 depth
+// sequence. state.range(0) is the sequence length and state.range(1) the
+// maximum query width; each iteration answers one pre-generated query,
+// cycling through the query set.
+template <class Rmq>
+void run_depth_queries(benchmark::State& state) {
+  const auto size = static_cast<std::size_t>(state.range(0));
+  const auto max_width = static_cast<std::size_t>(state.range(1));
+  const DepthDataset dataset = make_depth_dataset(size, max_width);
+  const std::span<const std::uint64_t> step_bits(dataset.bits);
+  const Rmq rmq(step_bits, dataset.depths.size());
+
+  std::size_t next_query = 0;
+  for (auto _ : state) {
+    const auto [left, right] =
+        dataset.ranges[next_query % dataset.ranges.size()];
+    ++next_query;
+    std::size_t answer = rmq.arg_min(left, right);
+    // Keep the query from being optimized away.
+    benchmark::DoNotOptimize(answer);
+  }
+
+  state.counters["N"] = static_cast<double>(size);
+  state.counters["max_width"] = static_cast<double>(max_width);
+  state.counters["index_bytes"] = static_cast<double>(sizeof(Index));
+}
+
+// Registers all four RMQ benchmarks for every (array size, maximum query
+// width) combination. For each size the widths are the preset widths that
+// fit, plus the full array size, in ascending order without duplicates.
+void register_benchmarks() {
+  using Less = std::less<std::int64_t>;
+  using BenchFn = void (*)(benchmark::State&);
+
+  const std::vector<std::size_t> sizes = {1ull << 10, 1ull << 14, 1ull << 18,
+                                          1ull << 22, 1ull << 26};
+  const std::vector<std::size_t> widths = {64, 4096, 1ull << 18, 1ull << 22,
+                                           1ull << 26};
+
+  for (const std::size_t size : sizes) {
+    std::vector<std::size_t> effective_widths;
+    for (const std::size_t width : widths) {
+      if (width <= size) {
+        effective_widths.push_back(width);
+      }
+    }
+    effective_widths.push_back(size);
+    std::sort(effective_widths.begin(), effective_widths.end());
+    const auto duplicates =
+        std::unique(effective_widths.begin(), effective_widths.end());
+    effective_widths.erase(duplicates, effective_widths.end());
+
+    for (const std::size_t width : effective_widths) {
+      // Shared registration recipe; only the name and body differ per case.
+      const auto add = [size, width](const char* name, BenchFn body) {
+        benchmark::RegisterBenchmark(name, body)
+            ->Args({static_cast<std::int64_t>(size),
+                    static_cast<std::int64_t>(width)})
+            ->Unit(benchmark::kNanosecond);
+      };
+
+      add("rmq_sparse_table",
+          &run_queries<pixie::rmq::SparseTable<std::int64_t, Less, Index>>);
+      add("rmq_segment_tree",
+          &run_queries<pixie::rmq::SegmentTree<std::int64_t, Less, Index>>);
+      add("rmq_cartesian_tree",
+          &run_queries<
+              pixie::rmq::CartesianTreeRmq<std::int64_t, Less, Index>>);
+      add("rmq_bp_plus_minus_one",
+          &run_depth_queries<pixie::rmq::BpPlusMinusOneRmq<Index>>);
+    }
+  }
+}
+
+}  // namespace
+
+// Entry point: hands the command line to Google Benchmark, registers the RMQ
+// benchmarks programmatically, runs whichever ones the flags select, and
+// shuts the library down.
+int main(int argc, char** argv) {
+  // NOTE(review): presumably re-executes the process with ASLR disabled to
+  // reduce run-to-run noise — confirm against the Google Benchmark version in
+  // use.
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  benchmark::Initialize(&argc, argv);
+  register_benchmarks();
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/src/tests/excess_positions_tests.cpp b/src/tests/excess_positions_tests.cpp
index 57d69c8..d37a7d3 100644
--- a/src/tests/excess_positions_tests.cpp
+++ b/src/tests/excess_positions_tests.cpp
@@ -8,6 +8,7 @@
 #include <cstdlib>
 #include <numeric>
 #include <random>
+#include <utility>
 
 using pixie::experimental::excess_positions_512_branching_lut;
 using pixie::experimental::excess_positions_512_byte_lut;
@@ -68,6 +69,41 @@ static int naive_prefix_excess_128(const uint64_t* s, size_t end_offset) {
   return cur;
 }
 
+// Reference implementation for excess_min_128: scans prefix offsets of the
+// 128-bit block `s` (bit set = +1, bit clear = -1) and returns the smallest
+// prefix excess among offsets in [left, right] together with the first offset
+// attaining it. Offsets above 128 are clamped to 128; left > right yields a
+// default-constructed result.
+static ExcessMin128Result naive_excess_min_128(const uint64_t* s,
+                                               size_t left,
+                                               size_t right) {
+  if (left > right) {
+    return {};
+  }
+  left = std::min<size_t>(left, 128);
+  right = std::min<size_t>(right, 128);
+
+  int running = 0;
+  int best = 0;
+  size_t best_offset = 0;
+  // Offset 0 (empty prefix, excess 0) is a candidate only when left == 0.
+  bool found = (left == 0);
+  for (size_t offset = 1; offset <= right; ++offset) {
+    const size_t bit = offset - 1;
+    const bool is_set = ((s[bit >> 6] >> (bit & 63)) & 1ull) != 0;
+    running += is_set ? 1 : -1;
+    // Strict comparison keeps the earliest offset on ties.
+    if (offset >= left && (!found || running < best)) {
+      best = running;
+      best_offset = offset;
+      found = true;
+    }
+  }
+  if (!found) {
+    best = naive_prefix_excess_128(s, left);
+    best_offset = left;
+  }
+  return {best, best_offset};
+}
+
 static size_t naive_forward_search_128(const uint64_t* s,
                                        int target_x,
                                        size_t start_offset) {
@@ -170,6 +206,84 @@ TEST(ExcessPositions128, PrefixExcessMatchesNaive) {
   }
 }
 
+// Compares excess_min_128 with the naive reference on hand-picked bit
+// patterns and ranges, including word-boundary, single-offset, end-of-block
+// and beyond-the-block ranges.
+TEST(ExcessPositions128, MinMatchesNaiveFixedCases) {
+  using Block = std::array<uint64_t, 2>;
+  using Range = std::pair<size_t, size_t>;
+
+  const std::array<Block, 5> cases = {{
+      {0, 0},
+      {UINT64_MAX, UINT64_MAX},
+      {0xAAAAAAAAAAAAAAAAull, 0x5555555555555555ull},
+      {0x0123456789ABCDEFull, 0xFEDCBA9876543210ull},
+      {0x0000FFFF0000FFFFull, 0xFFFF0000FFFF0000ull},
+  }};
+  const std::array<Range, 12> ranges = {{
+      {0, 128},
+      {0, 0},
+      {1, 1},
+      {63, 65},
+      {64, 64},
+      {64, 128},
+      {3, 6},
+      {5, 5},
+      {127, 128},
+      {128, 128},
+      {120, 127},
+      {129, 140},
+  }};
+
+  for (const Block& block : cases) {
+    for (const Range& range : ranges) {
+      const size_t left = range.first;
+      const size_t right = range.second;
+      const ExcessMin128Result actual =
+          excess_min_128(block.data(), left, right);
+      const ExcessMin128Result expected =
+          naive_excess_min_128(block.data(), left, right);
+      EXPECT_EQ(actual.min_excess, expected.min_excess)
+          << "left=" << left << " right=" << right;
+      EXPECT_EQ(actual.offset, expected.offset)
+          << "left=" << left << " right=" << right;
+    }
+  }
+}
+
+// With the alternating pattern 1,0,1,0,... the prefix excess is 0 at every
+// even offset, so the reported minimum must be the first eligible even offset.
+TEST(ExcessPositions128, MinReturnsFirstTie) {
+  const std::array<uint64_t, 2> alternating = {0x5555555555555555ull,
+                                               0x5555555555555555ull};
+
+  const ExcessMin128Result from_start =
+      excess_min_128(alternating.data(), 0, 128);
+  EXPECT_EQ(from_start.min_excess, 0);
+  EXPECT_EQ(from_start.offset, 0u);
+
+  const ExcessMin128Result from_one =
+      excess_min_128(alternating.data(), 1, 128);
+  EXPECT_EQ(from_one.min_excess, 0);
+  EXPECT_EQ(from_one.offset, 2u);
+}
+
+// A range with left > right must yield excess 0 at the sentinel offset 128.
+TEST(ExcessPositions128, MinInvalidRangeUsesSentinel) {
+  const std::array<uint64_t, 2> zeros = {0, 0};
+  const ExcessMin128Result outcome = excess_min_128(zeros.data(), 17, 16);
+  EXPECT_EQ(outcome.min_excess, 0);
+  EXPECT_EQ(outcome.offset, 128u);
+}
+
+// Differential test: 1000 random 128-bit blocks, 32 random ordered ranges
+// each, compared against the naive reference. The seed is fixed so failures
+// are reproducible.
+TEST(ExcessPositions128, MinMatchesNaiveRandom) {
+  std::mt19937_64 rng(43);
+  std::uniform_int_distribution<size_t> offset_dist(0, 128);
+
+  for (int t = 0; t < 1000; ++t) {
+    const std::array<uint64_t, 2> block = {rng(), rng()};
+    for (int query = 0; query < 32; ++query) {
+      size_t left = offset_dist(rng);
+      size_t right = offset_dist(rng);
+      if (right < left) {
+        std::swap(left, right);
+      }
+      const ExcessMin128Result actual =
+          excess_min_128(block.data(), left, right);
+      const ExcessMin128Result expected =
+          naive_excess_min_128(block.data(), left, right);
+      ASSERT_EQ(actual.min_excess, expected.min_excess)
+          << "case=" << t << " left=" << left << " right=" << right;
+      ASSERT_EQ(actual.offset, expected.offset)
+          << "case=" << t << " left=" << left << " right=" << right;
+    }
+  }
+}
+
 TEST(ExcessPositions128, ForwardAndBackwardSearchMatchNaive) {
   std::mt19937_64 rng(42);
   const std::array<size_t, 8> offsets = {0, 1, 63, 64, 65, 126, 127, 128};
diff --git a/src/tests/rmq_tests.cpp b/src/tests/rmq_tests.cpp
new file mode 100644
index 0000000..e727353
--- /dev/null
+++ b/src/tests/rmq_tests.cpp
@@ -0,0 +1,308 @@
+#include <gtest/gtest.h>
+#include <pixie/rmq.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <random>
+#include <span>
+#include <vector>
+
+namespace {
+
+// Linear-scan reference: position of the first element of
+// values[left..right] that no other element in the range precedes under
+// `compare`. Returns npos for left > right or right out of bounds.
+template <class T, class Compare>
+std::size_t naive_arg_min(std::span<const T> values,
+                          std::size_t left,
+                          std::size_t right,
+                          Compare compare) {
+  const bool valid = left <= right && right < values.size();
+  if (!valid) {
+    return pixie::rmq::SparseTable<T, Compare>::npos;
+  }
+  std::size_t winner = left;
+  std::size_t candidate = left;
+  while (candidate < right) {
+    ++candidate;
+    // Replace only on a strict improvement so the earliest position wins.
+    if (compare(values[candidate], values[winner])) {
+      winner = candidate;
+    }
+  }
+  return winner;
+}
+
+// Checks arg_min and range_min of `rmq` against the linear-scan reference
+// for every valid inclusive range of `values`.
+template <class Rmq, class T, class Compare>
+void check_all_ranges(const Rmq& rmq,
+                      std::span<const T> values,
+                      Compare compare) {
+  ASSERT_EQ(rmq.size(), values.size());
+  const std::size_t count = values.size();
+  for (std::size_t left = 0; left < count; ++left) {
+    for (std::size_t right = left; right < count; ++right) {
+      const std::size_t want = naive_arg_min(values, left, right, compare);
+      EXPECT_EQ(rmq.arg_min(left, right), want)
+          << "range=[" << left << "," << right << "]";
+      EXPECT_EQ(rmq.range_min(left, right), values[want])
+          << "range=[" << left << "," << right << "]";
+    }
+  }
+}
+
+// Like check_all_ranges, but verifies arg_min only; range_min is never
+// called on `rmq`.
+template <class Rmq, class T, class Compare>
+void check_all_arg_min_ranges(const Rmq& rmq,
+                              std::span<const T> values,
+                              Compare compare) {
+  ASSERT_EQ(rmq.size(), values.size());
+  const std::size_t count = values.size();
+  for (std::size_t left = 0; left < count; ++left) {
+    for (std::size_t right = left; right < count; ++right) {
+      const std::size_t want = naive_arg_min(values, left, right, compare);
+      EXPECT_EQ(rmq.arg_min(left, right), want)
+          << "range=[" << left << "," << right << "]";
+    }
+  }
+}
+
+// Packs the steps of a depth sequence into 64-bit words: bit k (least
+// significant bit first) is set when depths[k + 1] - depths[k] == 1 and
+// clear otherwise.
+std::vector<std::uint64_t> pack_depth_deltas(
+    std::span<const std::int64_t> depths) {
+  const std::size_t count = depths.size();
+  // `count - 1` bits are needed; `(count + 62) / 64` is the same word count
+  // as `(count - 1 + 63) / 64` for every `count`, including zero.
+  std::vector<std::uint64_t> words((count + 62) / 64, 0);
+  for (std::size_t bit = 0; bit + 1 < count; ++bit) {
+    const bool step_up = depths[bit + 1] - depths[bit] == 1;
+    if (step_up) {
+      words[bit >> 6] |= std::uint64_t{1} << (bit & 63);
+    }
+  }
+  return words;
+}
+
+}  // namespace
+
+// Every range of a small array with repeated minima, sparse table.
+TEST(RmqSparseTable, ExhaustiveSmallArray) {
+  const std::vector<int> values = {4, 1, 3, 1, 5, 0, 0, 2};
+  const std::span<const int> view(values);
+  const pixie::rmq::SparseTable<int> rmq(values);
+  check_all_ranges(rmq, view, std::less<int>());
+}
+
+// Every range of a small array with repeated minima, segment tree.
+TEST(RmqSegmentTree, ExhaustiveSmallArray) {
+  const std::vector<int> values = {4, 1, 3, 1, 5, 0, 0, 2};
+  const std::span<const int> view(values);
+  const pixie::rmq::SegmentTree<int> rmq(values);
+  check_all_ranges(rmq, view, std::less<int>());
+}
+
+// Every range of a short +/-1 depth sequence.
+TEST(RmqBpPlusMinusOne, ExhaustiveSmallDepthArray) {
+  const std::vector<std::int64_t> depths = {0, 1, 0, 1, 2, 1, 2, 1, 0};
+  const std::span<const std::int64_t> view(depths);
+  const std::vector<std::uint64_t> bits = pack_depth_deltas(depths);
+  const pixie::rmq::BpPlusMinusOneRmq<> rmq(bits, depths.size());
+  check_all_arg_min_ranges(rmq, view, std::less<std::int64_t>());
+}
+
+// Every range of a 385-entry depth sequence built from a period-7 step
+// pattern.
+TEST(RmqBpPlusMinusOne, CrossBlockRanges) {
+  std::vector<std::int64_t> depths(385);
+  for (std::size_t i = 1; i < depths.size(); ++i) {
+    const std::size_t phase = i % 7;
+    const bool up = phase == 0 || phase == 1 || phase == 4;
+    depths[i] = depths[i - 1] + (up ? 1 : -1);
+  }
+
+  const std::span<const std::int64_t> view(depths);
+  const std::vector<std::uint64_t> bits = pack_depth_deltas(depths);
+  const pixie::rmq::BpPlusMinusOneRmq<> rmq(bits, depths.size());
+  check_all_arg_min_ranges(rmq, view, std::less<std::int64_t>());
+}
+
+// Hand-picked ranges that start, end or sit exactly on multiples of 128.
+TEST(RmqBpPlusMinusOne, BoundaryRangesAround128PositionBlocks) {
+  std::vector<std::int64_t> depths(260);
+  for (std::size_t i = 1; i < depths.size(); ++i) {
+    const bool up = i % 5 == 0 || i % 11 == 0;
+    depths[i] = depths[i - 1] + (up ? 1 : -1);
+  }
+
+  const std::span<const std::int64_t> view(depths);
+  const std::vector<std::uint64_t> bits = pack_depth_deltas(depths);
+  const pixie::rmq::BpPlusMinusOneRmq<> rmq(bits, depths.size());
+  const std::vector<std::pair<std::size_t, std::size_t>> ranges = {
+      {0, 0},     {126, 127}, {127, 128}, {128, 128},
+      {128, 255}, {129, 255}, {255, 256}, {0, 259},
+  };
+
+  for (const auto& range : ranges) {
+    const std::size_t left = range.first;
+    const std::size_t right = range.second;
+    EXPECT_EQ(rmq.arg_min(left, right),
+              naive_arg_min(view, left, right, std::less<std::int64_t>()))
+        << "range=[" << left << "," << right << "]";
+  }
+}
+
+// Ranges of growing length that all end near the tail of an 8193-entry
+// sequence.
+TEST(RmqBpPlusMinusOne, LongSequenceRangesNearEnd) {
+  std::vector<std::int64_t> depths(8193);
+  for (std::size_t i = 1; i < depths.size(); ++i) {
+    const bool up = i % 13 == 0 || i % 17 == 0 || i % 19 == 0;
+    depths[i] = depths[i - 1] + (up ? 1 : -1);
+  }
+
+  const std::size_t count = depths.size();
+  const std::span<const std::int64_t> view(depths);
+  const std::vector<std::uint64_t> bits = pack_depth_deltas(depths);
+  const pixie::rmq::BpPlusMinusOneRmq<> rmq(bits, depths.size());
+  const std::vector<std::pair<std::size_t, std::size_t>> ranges = {
+      {count - 1, count - 1},
+      {count - 128, count - 1},
+      {count - 513, count - 3},
+      {count - 4097, count - 7},
+  };
+
+  for (const auto& range : ranges) {
+    const std::size_t left = range.first;
+    const std::size_t right = range.second;
+    EXPECT_EQ(rmq.arg_min(left, right),
+              naive_arg_min(view, left, right, std::less<std::int64_t>()))
+        << "range=[" << left << "," << right << "]";
+  }
+}
+
+// The sequence alternates 0,1,0,1,..., so the minimum 0 occurs at every even
+// position; a query starting on an even position must report that start.
+TEST(RmqBpPlusMinusOne, CrossBlockTieKeepsFirstPosition) {
+  std::vector<std::int64_t> depths(384);
+  for (std::size_t i = 1; i < depths.size(); ++i) {
+    depths[i] = (i % 2 == 0) ? 0 : 1;
+  }
+
+  const std::span<const std::int64_t> view(depths);
+  const std::vector<std::uint64_t> bits = pack_depth_deltas(depths);
+  const pixie::rmq::BpPlusMinusOneRmq<> rmq(bits, depths.size());
+  const std::size_t left = 120;
+  const std::size_t right = 260;
+
+  EXPECT_EQ(rmq.arg_min(left, right), left);
+  EXPECT_EQ(rmq.arg_min(left, right),
+            naive_arg_min(view, left, right, std::less<std::int64_t>()));
+}
+
+// Two depths need one step bit; constructing from an empty bit span must
+// throw std::invalid_argument.
+TEST(RmqBpPlusMinusOne, RejectsTooSmallBitSpan) {
+  const std::vector<std::uint64_t> no_bits;
+  EXPECT_THROW((pixie::rmq::BpPlusMinusOneRmq<>(no_bits, 2)),
+               std::invalid_argument);
+}
+
+// Every range of a small array with repeated minima, Cartesian-tree RMQ.
+TEST(RmqCartesianTree, ExhaustiveSmallArray) {
+  const std::vector<int> values = {4, 1, 3, 1, 5, 0, 0, 2};
+  const std::span<const int> view(values);
+  const pixie::rmq::CartesianTreeRmq<int> rmq(values);
+  check_all_ranges(rmq, view, std::less<int>());
+}
+
+// The minimum 2 occurs at positions 1, 2 and 4; every structure must report
+// the leftmost occurrence inside the queried range.
+TEST(Rmq, FirstMinimumTieBreaking) {
+  const std::vector<int> values = {7, 2, 2, 3, 2};
+  const pixie::rmq::SparseTable<int> sparse(values);
+  const pixie::rmq::SegmentTree<int> segment(values);
+  const pixie::rmq::CartesianTreeRmq<int> cartesian(values);
+
+  // Whole array: leftmost 2 is at position 1.
+  EXPECT_EQ(sparse.arg_min(0, 4), 1u);
+  EXPECT_EQ(segment.arg_min(0, 4), 1u);
+  EXPECT_EQ(cartesian.arg_min(0, 4), 1u);
+
+  // Suffix starting at 2: leftmost 2 is the range start itself.
+  EXPECT_EQ(sparse.arg_min(2, 4), 2u);
+  EXPECT_EQ(segment.arg_min(2, 4), 2u);
+  EXPECT_EQ(cartesian.arg_min(2, 4), 2u);
+}
+
+// Reversed and out-of-bounds ranges, and any query on an empty index, must
+// report npos; range_min then falls back to a default-constructed value.
+TEST(Rmq, InvalidAndEmptyRanges) {
+  using Sparse = pixie::rmq::SparseTable<int>;
+  using Segment = pixie::rmq::SegmentTree<int>;
+  using Cartesian = pixie::rmq::CartesianTreeRmq<int>;
+
+  const std::vector<int> values = {3, 1, 2};
+  const Sparse sparse(values);
+  const Segment segment(values);
+  const Cartesian cartesian(values);
+
+  // left > right.
+  EXPECT_EQ(sparse.arg_min(2, 1), Sparse::npos);
+  EXPECT_EQ(segment.arg_min(2, 1), Segment::npos);
+  EXPECT_EQ(cartesian.arg_min(2, 1), Cartesian::npos);
+  // right one past the last valid position.
+  EXPECT_EQ(sparse.arg_min(0, values.size()), Sparse::npos);
+  EXPECT_EQ(segment.arg_min(0, values.size()), Segment::npos);
+  EXPECT_EQ(cartesian.arg_min(0, values.size()), Cartesian::npos);
+  // range_min on an invalid range yields int{}.
+  EXPECT_EQ(sparse.range_min(2, 1), 0);
+  EXPECT_EQ(segment.range_min(2, 1), 0);
+  EXPECT_EQ(cartesian.range_min(2, 1), 0);
+
+  const std::vector<int> empty;
+  const Sparse empty_sparse(empty);
+  const Segment empty_segment(empty);
+  const Cartesian empty_cartesian(empty);
+  EXPECT_TRUE(empty_sparse.empty());
+  EXPECT_TRUE(empty_segment.empty());
+  EXPECT_TRUE(empty_cartesian.empty());
+  EXPECT_EQ(empty_sparse.arg_min(0, 0), Sparse::npos);
+  EXPECT_EQ(empty_segment.arg_min(0, 0), Segment::npos);
+  EXPECT_EQ(empty_cartesian.arg_min(0, 0), Cartesian::npos);
+}
+
+// With std::greater the structures answer range-maximum queries; the
+// maximum 8 occurs twice and the first occurrence must be reported.
+TEST(Rmq, ComparatorCanSelectMaximum) {
+  using Greater = std::greater<int>;
+
+  const std::vector<int> values = {1, 8, 3, 8, 4};
+  const std::span<const int> view(values);
+  const pixie::rmq::SparseTable<int, Greater> sparse(values);
+  const pixie::rmq::SegmentTree<int, Greater> segment(values);
+  const pixie::rmq::CartesianTreeRmq<int, Greater> cartesian(values);
+
+  check_all_ranges(sparse, view, Greater());
+  check_all_ranges(segment, view, Greater());
+  check_all_ranges(cartesian, view, Greater());
+  EXPECT_EQ(sparse.arg_min(0, 4), 1u);
+  EXPECT_EQ(segment.arg_min(0, 4), 1u);
+  EXPECT_EQ(cartesian.arg_min(0, 4), 1u);
+}
+
+// Strictly increasing and strictly decreasing inputs, checked over every
+// range.
+TEST(RmqCartesianTree, MonotoneArrays) {
+  const std::vector<int> increasing = {1, 2, 3, 4, 5, 6};
+  const pixie::rmq::CartesianTreeRmq<int> increasing_rmq(increasing);
+  check_all_ranges(increasing_rmq, std::span<const int>(increasing),
+                   std::less<int>());
+
+  const std::vector<int> decreasing = {6, 5, 4, 3, 2, 1};
+  const pixie::rmq::CartesianTreeRmq<int> decreasing_rmq(decreasing);
+  check_all_ranges(decreasing_rmq, std::span<const int>(decreasing),
+                   std::less<int>());
+}
+
+// The original, a copy-assigned instance and a move-constructed instance
+// must all answer every query correctly.
+TEST(RmqCartesianTree, CopyAndMoveRebuildInternalSpans) {
+  using Rmq = pixie::rmq::CartesianTreeRmq<int>;
+
+  const std::vector<int> values = {5, 4, 3, 2, 1, 2, 3};
+  const std::span<const int> view(values);
+  const Rmq original(values);
+  Rmq copied(original);
+  Rmq assigned;
+  assigned = copied;
+  Rmq moved(std::move(copied));
+
+  check_all_ranges(original, view, std::less<int>());
+  check_all_ranges(assigned, view, std::less<int>());
+  check_all_ranges(moved, view, std::less<int>());
+}
+
+// Adjacent entries of the exposed Euler-tour depth sequence must differ by
+// exactly one.
+TEST(RmqCartesianTree, EulerDepthsArePlusMinusOne) {
+  const std::vector<int> values = {4, 1, 3, 2, 5};
+  const pixie::rmq::CartesianTreeRmq<int> rmq(values);
+  const auto depths = rmq.euler_depths();
+
+  ASSERT_FALSE(depths.empty());
+  for (std::size_t next = 1; next < depths.size(); ++next) {
+    const auto step = depths[next] - depths[next - 1];
+    EXPECT_EQ(std::abs(step), 1);
+  }
+}
+
+// Differential test on random arrays of sizes 1, 18, 35, ..., 256 with values
+// in [-50, 50] (so ties occur), all ranges, all three value-based structures.
+TEST(Rmq, DifferentialRandom) {
+  std::mt19937_64 rng(42);
+  std::uniform_int_distribution<int> value_dist(-50, 50);
+  for (std::size_t size = 1; size <= 257; size += 17) {
+    std::vector<int> values(size);
+    std::generate(values.begin(), values.end(),
+                  [&] { return value_dist(rng); });
+    const std::span<const int> view(values);
+
+    const pixie::rmq::SparseTable<int> sparse(values);
+    const pixie::rmq::SegmentTree<int> segment(values);
+    const pixie::rmq::CartesianTreeRmq<int> cartesian(values);
+    check_all_ranges(sparse, view, std::less<int>());
+    check_all_ranges(segment, view, std::less<int>());
+    check_all_ranges(cartesian, view, std::less<int>());
+  }
+}
+
+// Differential test on random +/-1 walks of sizes 1, 18, 35, ..., 256, all
+// ranges.
+TEST(RmqBpPlusMinusOne, DifferentialRandomWalks) {
+  std::mt19937_64 rng(77);
+  for (std::size_t size = 1; size <= 257; size += 17) {
+    std::vector<std::int64_t> depths(size);
+    for (std::size_t i = 1; i < depths.size(); ++i) {
+      const bool up = (rng() & 1u) != 0;
+      depths[i] = depths[i - 1] + (up ? 1 : -1);
+    }
+
+    const std::span<const std::int64_t> view(depths);
+    const std::vector<std::uint64_t> bits = pack_depth_deltas(depths);
+    const pixie::rmq::BpPlusMinusOneRmq<> rmq(bits, depths.size());
+    check_all_arg_min_ranges(rmq, view, std::less<std::int64_t>());
+  }
+}

From 53573984231b2102851547e62f42f20bb5271a57 Mon Sep 17 00:00:00 2001
From: Nikolay Malkovsky <malkovskynv@gmail.com>
Date: Sat, 30 May 2026 20:37:55 +0300
Subject: [PATCH 2/7] optimization skill

---
 .../skills/optimization-experiment/SKILL.md   | 193 ++++++++++++++++++
 .../optimization-experiment/EXAMPLES.md       |  86 ++++++++
 2 files changed, 279 insertions(+)
 create mode 100644 agentic/cpp/skills/optimization-experiment/SKILL.md
 create mode 100644 agentic/local/cpp/skills/optimization-experiment/EXAMPLES.md

diff --git a/agentic/cpp/skills/optimization-experiment/SKILL.md b/agentic/cpp/skills/optimization-experiment/SKILL.md
new file mode 100644
index 0000000..75904f6
--- /dev/null
+++ b/agentic/cpp/skills/optimization-experiment/SKILL.md
@@ -0,0 +1,193 @@
+---
+name: optimization-experiment
+description: Run iterative C++ optimization experiments for a target function or class by adding same-API experimental variants, validating correctness, benchmarking, comparing results, and deciding whether to promote a faster implementation.
+---
+
+# Optimization Experiment Skill
+
+Use this skill when a user wants to improve performance of a specific C++
+function, class, algorithm, or hot path through benchmark-driven experiments.
+
+This workflow depends on:
+
+1. `../benchmarks/SKILL.md` for Google Benchmark build/run commands, JSON output,
+   hardware counters, pinning, and perf profiling.
+2. `../benchmarks-affected/SKILL.md` when changes need an affected benchmark
+   scope.
+3. `../benchmarks-compare-revisions/SKILL.md` when comparing committed
+   revisions.
+
+## Goal
+
+Iterate from a production implementation to one or more experimental
+implementations, prove semantic equivalence, measure the impact, and decide
+whether a candidate is worth promoting.
+
+The standard loop is:
+
+```text
+target -> benchmark baseline -> experimental same-API variant
+       -> correctness check -> benchmark compare -> keep / revise / discard
+```
+
+Stop when a candidate is clearly better on the intended workload without
+correctness or maintenance regressions, or when the remaining ideas are too weak
+to justify more iteration.
+
+## Step 1 - Identify the Target and Contract
+
+Start from the requested function/class and inspect the real implementation.
+Record:
+
+- public signature/API and call sites that must not change
+- input domains, invalid-input behavior, boundary conditions, and tie-breaking
+- compile-time feature gates such as SIMD flags or platform-specific paths
+- existing tests and reference implementations
+- existing benchmarks that should move if the optimization succeeds
+
+Do not optimize before the contract is clear. If behavior is ambiguous, add or
+find tests before changing implementation.
+
+## Step 2 - Establish Benchmark Coverage
+
+Find benchmark rows that directly exercise the target. Prefer narrow benchmark
+filters over full-suite runs during iteration.
+
+If coverage is missing or too broad, add focused benchmark cases before adding
+the optimized implementation. Include cases for:
+
+- the expected common path
+- boundary and alignment-sensitive paths
+- short, medium, and long ranges or sizes when width matters
+- random or mixed workloads when real calls are not fixed-shape
+- current production behavior and each experimental variant
+
+Capture a baseline JSON before implementation changes:
+
+```bash
+BENCH_CPU=${BENCH_CPU:-0}
+taskset -c "${BENCH_CPU}" <benchmark-binary> \
+  --benchmark_filter="${FILTER}" \
+  --benchmark_report_aggregates_only=true \
+  --benchmark_display_aggregates_only=true \
+  --benchmark_out=/tmp/<target>_baseline.json \
+  --benchmark_out_format=json
+```
+
+Use `../benchmarks/SKILL.md` for exact build directories, Release versus
+diagnostic builds, hardware-counter setup, and retry policy.
+
+## Step 3 - Add Experimental Same-API Variants
+
+Add candidate implementations beside production code in an experimental area,
+namespace, header, or benchmark-local adapter that is already consistent with
+the repository.
+
+Rules:
+
+- keep the callable signature/API identical to production where practical
+- preserve public semantics exactly, including invalid inputs and tie-breaking
+- keep production callers unchanged during experiments
+- make variants benchmark-selectable by name
+- avoid unrelated refactors while measuring
+- keep losing variants only when they document useful evidence or support future
+  comparison
+
+For C++ libraries with feature-gated implementations, provide correct fallbacks
+for unsupported targets or compile configurations.
+
+## Step 4 - Validate Correctness Before Timing
+
+Run relevant tests before trusting benchmark numbers. Add tests when the
+experimental implementation introduces new risk.
+
+Prefer:
+
+- fixed edge cases for boundaries, empty/sentinel behavior, and exact ties
+- randomized differential tests against a scalar or naive reference
+- tests for feature-gated fallback builds when the code has SIMD or platform
+  branches
+- targeted regression tests for any bug found during benchmarking
+
+Do not compare performance for a candidate that has not passed the correctness
+checks for the same semantics as production.
+
+## Step 5 - Benchmark and Compare
+
+Run timing benchmarks from Release builds. Save JSON for every meaningful
+baseline and candidate.
+
+Use diagnostic builds with hardware counters when timing changes need
+explanation:
+
+- cycles and instructions for core execution cost
+- cache counters for memory behavior
+- branch counters when early exits or dispatch logic are involved
+
+Compare both absolute timings and relative deltas. Watch for cases where a
+candidate wins the cherry-picked row but regresses neighboring or realistic
+workloads.
+
+When results are noisy:
+
+- pin to a CPU with `taskset` when available
+- increase repetitions or minimum benchmark time
+- rerun the narrow benchmark filter once
+- avoid changing benchmark scope between baseline and candidate
+
+## Step 6 - Iterate Deliberately
+
+For each candidate, decide one of:
+
+- **Promote**: repeatedly faster on intended rows, no important regressions,
+  correct and maintainable.
+- **Keep experimental**: interesting or workload-specific, but not production
+  ready.
+- **Discard**: slower, too complex, too narrow, or semantically risky.
+
+Use benchmark data to choose the next idea. Examples:
+
+- higher instruction count: try fewer operations or simpler dispatch
+- lower instructions but higher cycles: look for stalls, memory effects, or
+  dependency chains
+- short-range regressions: try a narrower dispatch condition
+- alignment-sensitive rows: try splitting aligned and unaligned paths
+
+When no idea wins convincingly, document the best result and stop rather than
+overfitting.
+
+## Step 7 - Finalize the Result
+
+If promoting a candidate to production:
+
+- keep the public API unchanged unless the user explicitly requested otherwise
+- keep or update tests that protect the optimized behavior
+- remove accidental benchmark-only scaffolding from production code
+- preserve experimental variants only when useful for future research
+
+If leaving work experimental:
+
+- add a short note near the experimental code with benchmark date, command, and
+  the relevant table or JSON artifact path
+- clearly state that production callers do not use the experimental variant
+- explain which workload the variant helps and where it loses
+
+The final response should include:
+
+- what changed
+- correctness checks run
+- benchmark command or JSON artifacts
+- concise result table
+- recommendation: promote, keep experimenting, or stop
+
+## Guardrails
+
+1. Benchmark before optimizing; otherwise there is no trustworthy baseline.
+2. Never change semantics to win a benchmark.
+3. Never compare Debug timings.
+4. Keep production and experimental code paths distinguishable.
+5. Prefer focused benchmark filters during iteration, then broaden before
+   promotion.
+6. Treat hardware counters as explanatory data, not a replacement for timing.
+7. Record enough benchmark context that future agents do not confuse
+   experimental wins with production behavior.
diff --git a/agentic/local/cpp/skills/optimization-experiment/EXAMPLES.md b/agentic/local/cpp/skills/optimization-experiment/EXAMPLES.md
new file mode 100644
index 0000000..91aa68f
--- /dev/null
+++ b/agentic/local/cpp/skills/optimization-experiment/EXAMPLES.md
@@ -0,0 +1,86 @@
+# Pixie Optimization Experiment Examples
+
+Use these notes with `agentic/cpp/skills/optimization-experiment/SKILL.md` for
+Pixie-specific optimization work.
+
+## Example: `excess_min_128`
+
+The `excess_min_128` experiment is a good template for future hot-path work:
+
+1. **Lock semantics first**: preserve inclusive prefix range `[left, right]`,
+   prefix offsets in `[0, 128]`, first-minimum tie-breaking, and the
+   invalid-range sentinel (offset 128).
+2. **Add focused benchmarks before trusting results**: include full-range,
+   RMQ-style `[0, 127]`, aligned short ranges, non-aligned short ranges,
+   cross-word ranges, point ranges, and a reproducible random-range benchmark.
+3. **Keep candidates same-API**: experimental variants should accept the same
+   parameters and return the same result type as production, so benchmarks can
+   swap implementations without adapter logic.
+4. **Use experimental space for ideas**: keep exploratory variants in
+   `include/pixie/experimental/` unless they are clearly production-ready.
+5. **Promote narrowly**: when a variant wins only under a specific condition,
+   promote only that condition. For `excess_min_128`, byte-LUT was useful only
+   for byte-aligned short ranges, so the production fallback was narrowed instead
+   of applied to every short range.
+6. **Mark cold dispatches unlikely**: if a promoted optimization is for a narrow
+   special case, mark its branch `[[unlikely]]` when the expected workload agrees.
+7. **Record benchmark evidence near experimental code**: add a top-of-header
+   benchmark note in `/** ... */` form when future agents might confuse
+   experimental winners with production behavior.
+
+## Benchmark Pattern
+
+Prefer a diagnostic run that can explain timing changes, not just report them:
+
+```bash
+taskset -c 0 build/benchmarks-diagnostic_local/excess_positions_benchmarks \
+  --benchmark_filter='BM_ExcessMin128(_|/|$)' \
+  --benchmark_repetitions=3 \
+  --benchmark_perf_counters=CYCLES,INSTRUCTIONS,CACHE-MISSES \
+  --benchmark_counters_tabular=true \
+  --benchmark_out=/tmp/pixie_excess_min_counters.json \
+  --benchmark_out_format=json
+```
+
+For final comparison tables, include at least:
+
+- CPU time
+- cycles
+- instructions
+- cache misses when collected
+- a random or mixed workload row
+- the rows that justified any narrowed production dispatch condition
+
+## Documentation Pattern
+
+For experimental algorithms, use Doxygen block comments:
+
+```cpp
+/**
+ * @brief Short algorithm name and purpose.
+ *
+ * @details Workflow:
+ *
+ *   input -> transform -> candidate result -> final reduction
+ *
+ * Explain the non-obvious tradeoff, such as avoiding lane crossing, reducing
+ * loop iterations, or paying scalar boundary work.
+ */
+```
+
+Keep long benchmark tables as a top-level `/** ... */` note when they describe
+the whole experimental header rather than one symbol.
+
+## Non-Obvious Lessons From the `excess_min_128` Experiment
+
+- A faster micro-kernel can regress neighboring ranges if dispatch is too broad;
+  benchmark aligned and non-aligned cases separately.
+- Random-range benchmarks are useful as a sanity check against overfitting fixed
+  rows.
+- Hardware counters helped distinguish lower-level cost: cycles and instructions
+  were more actionable than cache misses for the `excess_min_128` candidates.
+- Production and experimental code must stay visually distinct. A benchmark win
+  in `pixie::experimental` is only a candidate, not a caller-visible change.
+- When result names encode width unnecessarily, prefer the stable semantic name
+  (`ExcessResult`) over width-specific detail (`ExcessResult128`) unless the API
+  truly needs multiple widths.

From 1e6ba4307f98f0555f33d37e8ebdf9cbee375222 Mon Sep 17 00:00:00 2001
From: Nikolay Malkovsky <malkovskynv@gmail.com>
Date: Sat, 30 May 2026 22:49:26 +0300
Subject: [PATCH 3/7] Experiments on excess min

---
 CMakeLists.txt                                |  17 +
 include/pixie/bits.h                          | 266 +++++--
 include/pixie/experimental/excess.h           | 724 ++++++++++++++++++
 include/pixie/rmq/bp_plus_minus_one_rmq.h     |   7 +-
 .../excess_positions_benchmarks.cpp           | 274 ++++++-
 .../excess_positions_benchmark_results.md     |   9 +
 src/tests/excess_positions_tests.cpp          | 159 +++-
 7 files changed, 1378 insertions(+), 78 deletions(-)
 create mode 100644 src/docs/excess_positions_benchmark_results.md

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e648102..66df9bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -174,6 +174,15 @@ if(PIXIE_TESTS)
         gtest
         gtest_main
         ${PIXIE_DIAGNOSTICS_LIBS})
+
+    add_executable(rmq_tests
+        src/tests/rmq_tests.cpp)
+    target_include_directories(rmq_tests
+        PUBLIC include)
+    target_link_libraries(rmq_tests
+        gtest
+        gtest_main
+        ${PIXIE_DIAGNOSTICS_LIBS})
 endif()
 
 # ---------------------------------------------------------------------------
@@ -209,6 +218,14 @@ if(PIXIE_BENCHMARKS)
         benchmark
         ${PIXIE_DIAGNOSTICS_LIBS})
 
+    add_executable(bench_rmq
+        src/benchmarks/bench_rmq.cpp)
+    target_include_directories(bench_rmq
+        PUBLIC include)
+    target_link_libraries(bench_rmq
+        benchmark
+        ${PIXIE_DIAGNOSTICS_LIBS})
+
     if(PIXIE_THIRD_PARTY_BACKENDS)
         add_executable(bench_rmm_sdsl
             src/benchmarks/bench_rmm_sdsl.cpp)
diff --git a/include/pixie/bits.h b/include/pixie/bits.h
index f1250eb..b103600 100644
--- a/include/pixie/bits.h
+++ b/include/pixie/bits.h
@@ -3,6 +3,7 @@
 #include <immintrin.h>
 
 #include <algorithm>
+#include <array>
 #include <bit>
 #include <cstddef>
 #include <cstdint>
@@ -798,15 +799,48 @@ static inline const __m256i excess_lut_pos2 = _mm256_setr_epi8(
     -1,  1,  1,  3,
     -3, -1, -1,  1,
     -1,  1,  1,  3);
+static inline const __m256i excess_lut_min = _mm256_setr_epi8(
+    -4, -2, -2,  0,
+    -2,  0, -1,  1,
+    -3, -1, -1,  1,
+    -2,  0, -1,  1,
+    -4, -2, -2,  0,
+    -2,  0, -1,  1,
+    -3, -1, -1,  1,
+    -2,  0, -1,  1);
+static inline constexpr int8_t excess_lut_min_offset[16] = {
+    4, 4, 4, 4, 2, 2, 1, 1, 3, 3, 1, 1, 2, 2, 1, 1};
 static inline const __m256i excess_lut_pack_multiplier =
     _mm256_set1_epi16(0x1001);
 static inline const __m256i excess_lut_bit0 = _mm256_set1_epi8(1);
 static inline const __m256i excess_lut_bit1 = _mm256_set1_epi8(2);
 static inline const __m256i excess_lut_bit2 = _mm256_set1_epi8(4);
 static inline const __m256i excess_lut_bit3 = _mm256_set1_epi8(8);
+static inline const __m256i excess_lut_nibble_index = _mm256_setr_epi8(
+     0,  1,  2,  3,
+     4,  5,  6,  7,
+     8,  9, 10, 11,
+    12, 13, 14, 15,
+    16, 17, 18, 19,
+    20, 21, 22, 23,
+    24, 25, 26, 27,
+    28, 29, 30, 31);
 static inline const __m128i excess_lut_nibble_mask = _mm_set1_epi8(0x0F);
 // clang-format on
 
+static inline __m256i excess_nibbles_128_avx2(const uint64_t* s) noexcept {
+  __m128i word_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
+  __m128i lo_nibbles = _mm_and_si128(word_vec, excess_lut_nibble_mask);
+  __m128i hi_nibbles =
+      _mm_and_si128(_mm_srli_epi16(word_vec, 4), excess_lut_nibble_mask);
+
+  __m128i unpack_lo = _mm_unpacklo_epi8(lo_nibbles, hi_nibbles);
+  __m128i unpack_hi = _mm_unpackhi_epi8(lo_nibbles, hi_nibbles);
+
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(unpack_lo), unpack_hi,
+                                 1);
+}
+
 static inline __m256i excess_bit_masks_16x_i16() noexcept {
   return _mm256_setr_epi16(0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020,
                            0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800,
@@ -841,11 +875,59 @@ static inline __m256i excess_prefix_sum_16x_i16(__m256i v) noexcept {
  * offset attaining the minimum. Invalid ranges return offset 128 as a
  * sentinel.
  */
-struct ExcessMin128Result {
+struct ExcessResult {
   int min_excess = 0;
   size_t offset = 128;
 };
 
+constexpr int8_t excess_byte_delta_value(uint8_t x) {
+  return static_cast<int8_t>(2 * std::popcount(x) - 8);
+}
+
+constexpr int8_t excess_byte_min_prefix_value(uint8_t x) {
+  int cur = 0;
+  int best = 0;
+  for (int bit = 0; bit < 8; ++bit) {
+    cur += ((x >> bit) & 1u) != 0 ? 1 : -1;
+    if (bit == 0 || cur < best) {
+      best = cur;
+    }
+  }
+  return static_cast<int8_t>(best);
+}
+
+constexpr int8_t excess_byte_min_prefix_offset_value(uint8_t x) {
+  int cur = 0;
+  int best = 0;
+  int best_offset = 1;
+  for (int bit = 0; bit < 8; ++bit) {
+    cur += ((x >> bit) & 1u) != 0 ? 1 : -1;
+    if (bit == 0 || cur < best) {
+      best = cur;
+      best_offset = bit + 1;
+    }
+  }
+  return static_cast<int8_t>(best_offset);
+}
+
+template <typename Fn>
+constexpr std::array<int8_t, 256> excess_make_byte_lut(Fn fn) {
+  std::array<int8_t, 256> out{};
+  for (size_t i = 0; i < out.size(); ++i) {
+    out[i] = fn(static_cast<uint8_t>(i));
+  }
+  return out;
+}
+
+static inline constexpr std::array<int8_t, 256> excess_byte_delta_lut =
+    excess_make_byte_lut([](uint8_t x) { return excess_byte_delta_value(x); });
+static inline constexpr std::array<int8_t, 256> excess_byte_min_lut =
+    excess_make_byte_lut(
+        [](uint8_t x) { return excess_byte_min_prefix_value(x); });
+static inline constexpr std::array<int8_t, 256> excess_byte_min_offset_lut =
+    excess_make_byte_lut(
+        [](uint8_t x) { return excess_byte_min_prefix_offset_value(x); });
+
 /**
  * @brief Find every prefix whose excess equals target_x in a 128-bit bitstring.
  *
@@ -879,22 +961,13 @@ static inline int excess_positions_128(const uint64_t* s,
   const __m256i vbit1 = excess_lut_bit1;
   const __m256i vbit2 = excess_lut_bit2;
   const __m256i vbit3 = excess_lut_bit3;
-  const __m128i vnibble_mask = excess_lut_nibble_mask;
 
   const int d = 2 * target_x - block_delta;
   if (d < -128 || d > 128) {
     return block_delta;
   }
 
-  __m128i word_vec = _mm_loadu_si128((const __m128i*)s);
-  __m128i lo_nibbles = _mm_and_si128(word_vec, vnibble_mask);
-  __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word_vec, 4), vnibble_mask);
-
-  __m128i unpack_lo = _mm_unpacklo_epi8(lo_nibbles, hi_nibbles);
-  __m128i unpack_hi = _mm_unpackhi_epi8(lo_nibbles, hi_nibbles);
-
-  __m256i nibbles =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(unpack_lo), unpack_hi, 1);
+  __m256i nibbles = excess_nibbles_128_avx2(s);
 
   __m256i ps = _mm256_shuffle_epi8(vdelta, nibbles);
   ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 1));
@@ -974,15 +1047,59 @@ static inline int prefix_excess_128(const uint64_t* s,
   return 2 * ones - static_cast<int>(end_offset);
 }
 
+static inline ExcessResult excess_min_128_byte_lut_short(
+    const uint64_t* s,
+    size_t left,
+    size_t right) noexcept {
+  int best = prefix_excess_128(s, left);
+  size_t best_offset = left;
+  if (left == right) {
+    return {best, best_offset};
+  }
+
+  int current = best;
+  size_t bit = left;
+  for (; bit < right && (bit & 7u) != 0; ++bit) {
+    current += ((s[bit >> 6] >> (bit & 63)) & 1ull) != 0 ? 1 : -1;
+    const size_t offset = bit + 1;
+    if (current < best) {
+      best = current;
+      best_offset = offset;
+    }
+  }
+
+  for (; bit + 8 <= right; bit += 8) {
+    const uint8_t byte =
+        static_cast<uint8_t>((s[bit >> 6] >> (bit & 63)) & 0xFFu);
+    const int candidate = current + excess_byte_min_lut[byte];
+    if (candidate < best) {
+      best = candidate;
+      best_offset = bit + static_cast<size_t>(excess_byte_min_offset_lut[byte]);
+    }
+    current += excess_byte_delta_lut[byte];
+  }
+
+  for (; bit < right; ++bit) {
+    current += ((s[bit >> 6] >> (bit & 63)) & 1ull) != 0 ? 1 : -1;
+    const size_t offset = bit + 1;
+    if (current < best) {
+      best = current;
+      best_offset = offset;
+    }
+  }
+
+  return {best, best_offset};
+}
+
 /**
  * @brief Return the minimum prefix excess and first attaining offset.
  * @param s 2 little-endian uint64_t words (bit 0 of s[0] is the first bit).
  * @param left First prefix position to consider, inclusive.
  * @param right Last prefix position to consider, inclusive.
  */
-static inline ExcessMin128Result excess_min_128(const uint64_t* s,
-                                                size_t left,
-                                                size_t right) noexcept {
+static inline ExcessResult excess_min_128(const uint64_t* s,
+                                          size_t left,
+                                          size_t right) noexcept {
   if (left > right) {
     return {};
   }
@@ -994,46 +1111,93 @@ static inline ExcessMin128Result excess_min_128(const uint64_t* s,
   if (left == right) {
     return {best, best_offset};
   }
+  if (right - left <= 32 && (left & 7u) == 0 && (right & 7u) == 0)
+      [[unlikely]] {
+    return excess_min_128_byte_lut_short(s, left, right);
+  }
 
 #ifdef PIXIE_AVX2_SUPPORT
-  static const __m256i masks = excess_bit_masks_16x_i16();
-  static const __m256i zero = _mm256_setzero_si256();
-  static const __m256i pos = _mm256_set1_epi16(1);
-  static const __m256i neg = _mm256_set1_epi16(-1);
-
-  int carry = 0;
-  alignas(32) int16_t prefix_values[16];
-  for (size_t chunk = 0; chunk < 8; ++chunk) {
-    const size_t chunk_bit = chunk * 16;
-    const uint16_t bits =
-        chunk < 4
-            ? static_cast<uint16_t>((s[0] >> (chunk * 16)) & 0xFFFFu)
-            : static_cast<uint16_t>((s[1] >> ((chunk - 4) * 16)) & 0xFFFFu);
-    const int delta = 2 * static_cast<int>(std::popcount(bits)) - 16;
-
-    if (chunk_bit + 1 <= right && chunk_bit + 16 >= left) {
-      const __m256i selected = _mm256_and_si256(
-          _mm256_set1_epi16(static_cast<int16_t>(bits)), masks);
-      const __m256i is_zero = _mm256_cmpeq_epi16(selected, zero);
-      const __m256i steps = _mm256_blendv_epi8(pos, neg, is_zero);
-      const __m256i pref =
-          _mm256_add_epi16(excess_prefix_sum_16x_i16(steps),
-                           _mm256_set1_epi16(static_cast<int16_t>(carry)));
-      _mm256_store_si256(reinterpret_cast<__m256i*>(prefix_values), pref);
-
-      for (size_t lane = 0; lane < 16; ++lane) {
-        const size_t offset = chunk_bit + lane + 1;
-        if (offset < left || offset > right) {
-          continue;
-        }
-        const int value = prefix_values[lane];
-        if (value < best) {
-          best = value;
-          best_offset = offset;
-        }
-      }
+  int current = best;
+  size_t bit = left;
+  for (; bit < right && (bit & 3u) != 0; ++bit) {
+    current += ((s[bit >> 6] >> (bit & 63)) & 1ull) != 0 ? 1 : -1;
+    const size_t offset = bit + 1;
+    if (current < best) {
+      best = current;
+      best_offset = offset;
+    }
+  }
+
+  const size_t first_full_nibble = bit >> 2;
+  const size_t last_full_nibble = right >> 2;
+  if (first_full_nibble < last_full_nibble) {
+    const __m256i nibbles = excess_nibbles_128_avx2(s);
+
+    __m256i ps = _mm256_shuffle_epi8(excess_lut_delta, nibbles);
+    ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 1));
+    ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 2));
+    ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 4));
+    ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 8));
+
+    __m128i ps_lo = _mm256_castsi256_si128(ps);
+    __m128i ps_hi = _mm256_extracti128_si256(ps, 1);
+    __m128i carry =
+        _mm_set1_epi8(static_cast<int8_t>(_mm_extract_epi8(ps_lo, 15)));
+    ps_hi = _mm_add_epi8(ps_hi, carry);
+    ps = _mm256_inserti128_si256(_mm256_castsi128_si256(ps_lo), ps_hi, 1);
+
+    __m256i b = _mm256_permute2x128_si256(ps, ps, 0x08);
+    const __m256i excl_ps = _mm256_alignr_epi8(ps, b, 15);
+    const __m256i candidates =
+        _mm256_add_epi8(excl_ps, _mm256_shuffle_epi8(excess_lut_min, nibbles));
+
+    const __m256i idx = excess_lut_nibble_index;
+    const int first_minus_one_value = static_cast<int>(first_full_nibble) - 1;
+    const __m256i first_minus_one =
+        _mm256_set1_epi8(static_cast<int8_t>(first_minus_one_value));
+    const __m256i last =
+        _mm256_set1_epi8(static_cast<int8_t>(last_full_nibble));
+    const __m256i active = _mm256_and_si256(
+        _mm256_cmpgt_epi8(idx, first_minus_one), _mm256_cmpgt_epi8(last, idx));
+    const __m256i masked_candidates =
+        _mm256_blendv_epi8(_mm256_set1_epi8(127), candidates, active);
+
+    __m128i min128 =
+        _mm_min_epi8(_mm256_castsi256_si128(masked_candidates),
+                     _mm256_extracti128_si256(masked_candidates, 1));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 8));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 4));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 2));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 1));
+
+    const int candidate_min =
+        static_cast<int>(static_cast<int8_t>(_mm_extract_epi8(min128, 0)));
+    if (candidate_min < best) {
+      const __m256i equal_min = _mm256_cmpeq_epi8(
+          masked_candidates,
+          _mm256_set1_epi8(static_cast<int8_t>(candidate_min)));
+      const uint32_t equal_mask =
+          static_cast<uint32_t>(_mm256_movemask_epi8(equal_min));
+      const uint32_t nibble_index = std::countr_zero(equal_mask);
+      const uint64_t word = s[nibble_index >> 4];
+      const uint8_t nibble =
+          static_cast<uint8_t>((word >> ((nibble_index & 15u) * 4u)) & 0xFu);
+      best = candidate_min;
+      best_offset = static_cast<size_t>(nibble_index) * 4u +
+                    static_cast<size_t>(excess_lut_min_offset[nibble]);
+    }
+
+    bit = last_full_nibble * 4;
+    current = prefix_excess_128(s, bit);
+  }
+
+  for (; bit < right; ++bit) {
+    current += ((s[bit >> 6] >> (bit & 63)) & 1ull) != 0 ? 1 : -1;
+    const size_t offset = bit + 1;
+    if (current < best) {
+      best = current;
+      best_offset = offset;
     }
-    carry += delta;
   }
 #else
   int current = 0;
diff --git a/include/pixie/experimental/excess.h b/include/pixie/experimental/excess.h
index ef64dde..d5a0e66 100644
--- a/include/pixie/experimental/excess.h
+++ b/include/pixie/experimental/excess.h
@@ -2,13 +2,579 @@
 
 #include <pixie/bits.h>
 
+#include <algorithm>
+#include <array>
 #include <bit>
 #include <cstddef>
 #include <cstdint>
 
+/**
+ * Benchmark note:
+ *
+ * This header keeps experimental and historical excess_min/excess_positions
+ * variants for comparison benchmarks. Production excess code lives in
+ * pixie/bits.h; a benchmark win here should be treated as a candidate to port,
+ * not evidence that callers use this variant already.
+ *
+ * Diagnostic run, 2026-05-30:
+ *   taskset -c 0 build/benchmarks-diagnostic_local/excess_positions_benchmarks
+ *     --benchmark_filter='BM_ExcessMin128(_|/|$)'
+ *     --benchmark_repetitions=3
+ *     --benchmark_perf_counters=CYCLES,INSTRUCTIONS,CACHE-MISSES
+ *     --benchmark_counters_tabular=true
+ *
+ * Production excess_min_128:
+ *   range      ns    cycles  instructions  cache misses
+ *   0-128     8.1    35.8   117.9         0.001
+ *   0-127    11.6    50.9   174.9         0.001
+ *   0-16      3.7    16.4    88.5         0.000
+ *   0-32      5.6    24.7   124.8         0.000
+ *   0-48      9.1    40.5   122.1         0.000
+ *   0-64      8.1    36.2   121.4         0.000
+ *   0-31     12.1    54.2   177.6         0.000
+ *   1-17     14.4    64.6   213.3         0.000
+ *   3-35     13.5    60.1   212.4         0.000
+ *   5-37     12.9    57.5   215.3         0.000
+ *   32-64     7.4    33.1   153.0         0.000
+ *   33-65    14.3    63.6   214.6         0.001
+ *   64-96     7.8    34.2   148.9         0.000
+ *   61-93    16.0    69.4   218.5         0.002
+ *   96-128    6.5    28.1   151.9         0.001
+ *   56-72     5.2    22.7   116.5         0.000
+ *   60-68     8.5    38.1   148.7         0.000
+ *   63-64     3.1    13.3    84.0         0.000
+ *   17-17     2.1     9.1    47.0         0.000
+ *   Random   11.3    49.7   197.0         0.001
+ *
+ * New non-aligned/random range timings, ns:
+ *   range   Prod  Scalar  Nibble  Byte  Hybrid  Expand16  Lane64  Split64  Skip
+ *   1-17    14.4    14.6    10.1  12.5    11.9      30.3    13.3     15.9  16.3
+ *   3-35    13.5    27.1    12.5  14.2    16.1      46.1    13.1     12.7  14.1
+ *   5-37    12.9    27.0    14.6  15.2    18.5      44.7    13.5     13.0  14.7
+ *   33-65   14.3    26.6    13.3  17.8    17.4      42.1    14.5     13.3  13.6
+ *   61-93   16.0    26.6    15.4  14.8    17.7      43.8    14.0     13.6  11.4
+ *   Random  11.3    39.9    18.9  14.7    11.5      68.2    11.9     12.4  12.6
+ *
+ * Random range hardware counters:
+ *   variant       ns    cycles  instructions
+ *   Production   11.3    49.7   197.0
+ *   ScalarBits   39.9   175.9   721.9
+ *   NibbleLUT    18.9    83.2   291.2
+ *   ByteLUT      14.7    63.9   261.2
+ *   HybridLUT    11.5    50.8   209.8
+ *   Expand16     68.2   300.8   879.8
+ *   Lane64SSE    11.9    52.1   224.1
+ *   Split64SSE   12.4    54.1   236.9
+ *   ShortSkip    12.6    55.7   252.5
+ *
+ * Diagnostic run, 2026-05-30:
+ *   taskset -c 0 build/benchmarks-diagnostic_local/excess_positions_benchmarks
+ *     --benchmark_filter='BM_ExcessPositions512'
+ *     --benchmark_repetitions=3
+ *     --benchmark_perf_counters=CYCLES,INSTRUCTIONS,CACHE-MISSES
+ *     --benchmark_counters_tabular=true
+ *
+ * excess_positions_512 random target in [-128, 128]:
+ *   variant        ns    cycles  instructions  cache misses
+ *   Production    11.4    50.8   188.9         0.001
+ *   LUTAVX512     12.8    56.8   195.5         0.002
+ *   BranchingLUT  16.7    73.4   261.4         0.003
+ *   ExpandAVX512  21.0    93.6   266.6         0.003
+ *   Expand8       24.6   109.9   449.7         0.002
+ *   Expand        46.8   207.7   784.8         0.006
+ *   ByteLUT       49.7   221.4   754.5         0.008
+ *   Scalar       374.2  1656.0  7716.6         0.041
+ *
+ * excess_positions_512 fixed-target timings, ns:
+ *   variant       -64   -8    0     8    64
+ *   Production    11.6  18.0  18.3  19.1  12.3
+ *   LUTAVX512     13.4  17.9  19.2  21.3  13.2
+ *   BranchingLUT  19.1  28.9  28.7  28.3  16.8
+ *   ExpandAVX512  22.7  36.6  36.3  36.2  22.5
+ *   Expand8       17.6  52.7  47.5  46.9  17.7
+ *   Expand        51.0  85.9  86.1  85.5  53.9
+ *   ByteLUT       34.7  77.4  76.9  79.0  34.3
+ *   Scalar       367.2 433.5 466.3 428.1 364.6
+ */
+
 namespace pixie::experimental {
 
+namespace detail {
+
+constexpr int8_t nibble_delta(uint8_t x) {
+  return static_cast<int8_t>(2 * std::popcount(x) - 4);  // ones - (4 - ones)
+}
+
+constexpr int8_t byte_delta(uint8_t x) {
+  return static_cast<int8_t>(2 * std::popcount(x) - 8);  // ones - (8 - ones)
+}
+
+constexpr int8_t min_prefix(uint8_t x, int bits) {
+  int cur = 0;   // running sum: +1 per set bit, -1 per clear bit
+  int best = 0;  // replaced at bit 0, so the empty prefix never counts
+  for (int bit = 0; bit < bits; ++bit) {
+    cur += ((x >> bit) & 1u) != 0 ? 1 : -1;
+    if (bit == 0 || cur < best) {
+      best = cur;
+    }
+  }
+  return static_cast<int8_t>(best);
+}
+
+constexpr int8_t min_prefix_offset(uint8_t x, int bits) {
+  int cur = 0;
+  int best = 0;
+  int best_offset = 1;  // offsets are 1-based: bit i ends at offset i + 1
+  for (int bit = 0; bit < bits; ++bit) {
+    cur += ((x >> bit) & 1u) != 0 ? 1 : -1;
+    if (bit == 0 || cur < best) {  // strict '<' keeps the earliest minimum
+      best = cur;
+      best_offset = bit + 1;
+    }
+  }
+  return static_cast<int8_t>(best_offset);
+}
+
+template <size_t N, typename Fn>
+constexpr std::array<int8_t, N> make_lut(Fn fn) {
+  std::array<int8_t, N> out{};
+  for (size_t i = 0; i < N; ++i) {
+    out[i] = fn(static_cast<uint8_t>(i));  // entry i = fn(i as a byte)
+  }
+  return out;
+}
+
+static inline constexpr std::array<int8_t, 16> kNibbleDelta =
+    make_lut<16>([](uint8_t x) { return nibble_delta(x); });
+static inline constexpr std::array<int8_t, 16> kNibbleMin =
+    make_lut<16>([](uint8_t x) { return min_prefix(x, 4); });
+static inline constexpr std::array<int8_t, 16> kNibbleMinOffset =
+    make_lut<16>([](uint8_t x) { return min_prefix_offset(x, 4); });
+static inline constexpr std::array<int8_t, 256> kByteDelta =
+    make_lut<256>([](uint8_t x) { return byte_delta(x); });
+static inline constexpr std::array<int8_t, 256> kByteMin =
+    make_lut<256>([](uint8_t x) { return min_prefix(x, 8); });
+static inline constexpr std::array<int8_t, 256> kByteMinOffset =
+    make_lut<256>([](uint8_t x) { return min_prefix_offset(x, 8); });
+
+static inline void scan_bit(const uint64_t* s,
+                            size_t bit,
+                            int& current,
+                            int& best,
+                            size_t& best_offset) noexcept {
+  current += ((s[bit >> 6] >> (bit & 63)) & 1ull) != 0 ? 1 : -1;
+  const size_t offset = bit + 1;  // prefix offset reached after this bit
+  if (current < best) {           // strict: a tie keeps the earlier offset
+    best = current;
+    best_offset = offset;
+  }
+}
+
+}  // namespace detail
+
+/**
+ * @brief Reference scalar excess_min_128 implementation.
+ *
+ * @details Starts from prefix_excess_128(s, left) at offset left, then scans
+ * bits left..right-1 one at a time; bit i advances to prefix offset i + 1.
+ * Only a strictly smaller value replaces the best, so ties keep the first
+ * offset that reached the minimum.
+ *
+ * @param s Bit words; both offsets are clamped to 128 before use.
+ * @param left,right Inclusive range of prefix offsets to examine.
+ * @return {minimum, first offset reaching it}, or {} when left > right.
+ */
+static inline ExcessResult excess_min_128_scalar_bits(const uint64_t* s,
+                                                      size_t left,
+                                                      size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  left = std::min<size_t>(left, 128);
+  right = std::min<size_t>(right, 128);
+
+  int best = prefix_excess_128(s, left);
+  size_t best_offset = left;
+  if (left == right) {
+    return {best, best_offset};
+  }
+
+  int current = best;
+  for (size_t bit = left; bit < right; ++bit) {
+    detail::scan_bit(s, bit, current, best, best_offset);
+  }
+  return {best, best_offset};
+}
+
+/**
+ * @brief Scalar 4-bit LUT excess_min_128 experiment.
+ *
+ * @details Workflow:
+ *
+ *   unaligned bits -> full nibbles -> trailing bits
+ *                      | delta
+ *                      | local min
+ *                      ` first local min offset
+ *
+ * Bits up to the next multiple of 4 and bits after the last whole nibble go
+ * through detail::scan_bit. Each whole nibble in between is resolved with
+ * kNibbleMin, kNibbleMinOffset and kNibbleDelta, so no table lookup sees bits
+ * outside [left, right). Returns {} when left > right.
+ */
+static inline ExcessResult excess_min_128_nibble_lut(const uint64_t* s,
+                                                     size_t left,
+                                                     size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  left = std::min<size_t>(left, 128);
+  right = std::min<size_t>(right, 128);
+
+  int best = prefix_excess_128(s, left);
+  size_t best_offset = left;
+  if (left == right) {
+    return {best, best_offset};
+  }
+
+  int current = best;
+  size_t bit = left;
+  for (; bit < right && (bit & 3u) != 0; ++bit) {
+    detail::scan_bit(s, bit, current, best, best_offset);
+  }
+
+  for (; bit + 4 <= right; bit += 4) {
+    const uint8_t nibble =
+        static_cast<uint8_t>((s[bit >> 6] >> (bit & 63)) & 0xFu);
+    const int candidate = current + detail::kNibbleMin[nibble];
+    if (candidate < best) {
+      best = candidate;
+      best_offset = bit + static_cast<size_t>(detail::kNibbleMinOffset[nibble]);
+    }
+    current += detail::kNibbleDelta[nibble];
+  }
+
+  for (; bit < right; ++bit) {
+    detail::scan_bit(s, bit, current, best, best_offset);
+  }
+  return {best, best_offset};
+}
+
+/**
+ * @brief Scalar 8-bit LUT excess_min_128 experiment.
+ *
+ * @details Workflow:
+ *
+ *   unaligned bits -> full bytes -> trailing bits
+ *                     | delta
+ *                     | local min
+ *                     ` first local min offset
+ *
+ * Bits up to the next multiple of 8 and bits after the last whole byte go
+ * through detail::scan_bit. Each whole byte in between is resolved with
+ * kByteMin, kByteMinOffset and kByteDelta. Unaligned ranges therefore pay up
+ * to 7 scalar steps on each side. Returns {} when left > right.
+ */
+static inline ExcessResult excess_min_128_byte_lut(const uint64_t* s,
+                                                   size_t left,
+                                                   size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  left = std::min<size_t>(left, 128);
+  right = std::min<size_t>(right, 128);
+
+  int best = prefix_excess_128(s, left);
+  size_t best_offset = left;
+  if (left == right) {
+    return {best, best_offset};
+  }
+
+  int current = best;
+  size_t bit = left;
+  for (; bit < right && (bit & 7u) != 0; ++bit) {
+    detail::scan_bit(s, bit, current, best, best_offset);
+  }
+
+  for (; bit + 8 <= right; bit += 8) {
+    const uint8_t byte =
+        static_cast<uint8_t>((s[bit >> 6] >> (bit & 63)) & 0xFFu);
+    const int candidate = current + detail::kByteMin[byte];
+    if (candidate < best) {
+      best = candidate;
+      best_offset = bit + static_cast<size_t>(detail::kByteMinOffset[byte]);
+    }
+    current += detail::kByteDelta[byte];
+  }
+
+  for (; bit < right; ++bit) {
+    detail::scan_bit(s, bit, current, best, best_offset);
+  }
+  return {best, best_offset};
+}
+
+/**
+ * @brief Hybrid dispatch over scalar, byte-LUT, nibble-LUT, and production.
+ *
+ * @details Workflow:
+ *
+ *   width <= 2                   -> scalar bits
+ *   width <= 64 and byte aligned -> byte LUT
+ *   width <= 32                  -> nibble LUT
+ *   otherwise                    -> production excess_min_128
+ *
+ * This variant probes whether the fastest implementation depends primarily on
+ * query width and boundary alignment. Width and alignment use offsets clamped
+ * to 128; the original left/right are forwarded to the selected callee.
+ */
+static inline ExcessResult excess_min_128_hybrid_lut(const uint64_t* s,
+                                                     size_t left,
+                                                     size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  const size_t clamped_left = std::min<size_t>(left, 128);
+  const size_t clamped_right = std::min<size_t>(right, 128);
+  const size_t width = clamped_right - clamped_left;
+
+  if (width <= 2) {
+    return excess_min_128_scalar_bits(s, left, right);
+  }
+  if (width <= 64 && (clamped_left & 7u) == 0 && (clamped_right & 7u) == 0) {
+    return excess_min_128_byte_lut(s, left, right);
+  }
+  if (width <= 32) {
+    return excess_min_128_nibble_lut(s, left, right);
+  }
+  return excess_min_128(s, left, right);
+}
+
 #ifdef PIXIE_AVX2_SUPPORT
+// clang-format off
+static inline const __m128i excess_lut_delta_128 = _mm_setr_epi8(
+    -4, -2, -2,  0,
+    -2,  0,  0,  2,
+    -2,  0,  0,  2,
+     0,  2,  2,  4);  // entry n = 2 * popcount(n) - 4
+static inline const __m128i excess_lut_min_128 = _mm_setr_epi8(
+    -4, -2, -2,  0,
+    -2,  0, -1,  1,
+    -3, -1, -1,  1,
+    -2,  0, -1,  1);  // entry n = min running +1/-1 sum over bits 0..3 of n
+static inline const __m128i excess_lut_nibble_index_128 = _mm_setr_epi8(
+     0,  1,  2,  3,
+     4,  5,  6,  7,
+     8,  9, 10, 11,
+    12, 13, 14, 15);  // lane i holds i
+static inline const __m128i excess_lut_nibble_mask_128 = _mm_set1_epi8(0x0F);
+// clang-format on
+
+namespace detail {
+
+static inline __m128i excess_nibbles_64_sse(uint64_t word) noexcept {
+  const __m128i word_vec = _mm_cvtsi64_si128(static_cast<int64_t>(word));
+  const __m128i lo_nibbles =
+      _mm_and_si128(word_vec, excess_lut_nibble_mask_128);
+  const __m128i hi_nibbles =
+      _mm_and_si128(_mm_srli_epi16(word_vec, 4), excess_lut_nibble_mask_128);
+  return _mm_unpacklo_epi8(lo_nibbles, hi_nibbles);  // lane i = nibble i
+}
+
+static inline __m128i excess_prefix_sum_16x_i8(__m128i v) noexcept {
+  __m128i x = v;
+  __m128i t = _mm_slli_si128(x, 1);
+  x = _mm_add_epi8(x, t);
+  t = _mm_slli_si128(x, 2);
+  x = _mm_add_epi8(x, t);
+  t = _mm_slli_si128(x, 4);
+  x = _mm_add_epi8(x, t);
+  t = _mm_slli_si128(x, 8);
+  return _mm_add_epi8(x, t);  // lane i = v[0] + ... + v[i], mod 2^8
+}
+
+static inline void scan_full_nibbles_64_sse(uint64_t word,
+                                            int lane_base_excess,
+                                            size_t lane_base_offset,
+                                            size_t first_nibble,
+                                            size_t last_nibble,
+                                            int& best,
+                                            size_t& best_offset) noexcept {
+  if (first_nibble >= last_nibble) {  // empty nibble range: nothing to scan
+    return;
+  }
+
+  const __m128i nibbles = excess_nibbles_64_sse(word);
+  __m128i ps =
+      excess_prefix_sum_16x_i8(_mm_shuffle_epi8(excess_lut_delta_128, nibbles));
+  const __m128i excl_ps = _mm_alignr_epi8(ps, _mm_setzero_si128(), 15);
+  const __m128i candidates = _mm_add_epi8(
+      _mm_add_epi8(_mm_set1_epi8(static_cast<int8_t>(lane_base_excess)),
+                   excl_ps),
+      _mm_shuffle_epi8(excess_lut_min_128, nibbles));
+
+  const __m128i idx = excess_lut_nibble_index_128;
+  const __m128i first_minus_one =
+      _mm_set1_epi8(static_cast<int8_t>(static_cast<int>(first_nibble) - 1));
+  const __m128i last = _mm_set1_epi8(static_cast<int8_t>(last_nibble));
+  const __m128i active = _mm_and_si128(_mm_cmpgt_epi8(idx, first_minus_one),
+                                       _mm_cmpgt_epi8(last, idx));
+  const __m128i masked_candidates =
+      _mm_blendv_epi8(_mm_set1_epi8(127), candidates, active);
+
+  __m128i min128 = masked_candidates;  // rotate + min folds to one value
+  min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 8));
+  min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 4));
+  min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 2));
+  min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 1));
+
+  const int candidate_min =
+      static_cast<int>(static_cast<int8_t>(_mm_extract_epi8(min128, 0)));
+  if (candidate_min < best) {  // strict: a tie keeps the earlier offset
+    const __m128i equal_min = _mm_cmpeq_epi8(
+        masked_candidates, _mm_set1_epi8(static_cast<int8_t>(candidate_min)));
+    const uint32_t equal_mask =
+        static_cast<uint32_t>(_mm_movemask_epi8(equal_min));
+    const uint32_t nibble_index = std::countr_zero(equal_mask);  // lowest hit
+    const uint8_t nibble =
+        static_cast<uint8_t>((word >> (nibble_index * 4u)) & 0xFu);
+    best = candidate_min;
+    best_offset = lane_base_offset + static_cast<size_t>(nibble_index) * 4u +
+                  static_cast<size_t>(kNibbleMinOffset[nibble]);
+  }
+}
+
+static inline ExcessResult excess_min_128_split64_sse_impl(
+    const uint64_t* s,
+    size_t left,
+    size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  left = std::min<size_t>(left, 128);
+  right = std::min<size_t>(right, 128);
+
+  int best = prefix_excess_128(s, left);
+  size_t best_offset = left;
+  if (left == right) {
+    return {best, best_offset};
+  }
+
+  int current = best;
+  size_t bit = left;
+  for (; bit < right && (bit & 3u) != 0; ++bit) {
+    scan_bit(s, bit, current, best, best_offset);
+  }
+
+  size_t first_full_nibble = bit >> 2;
+  const size_t last_full_nibble = right >> 2;
+  while (first_full_nibble < last_full_nibble) {  // one pass per 64-bit word
+    const size_t word_index = first_full_nibble >> 4;
+    const size_t lane_first = first_full_nibble & 15u;
+    const size_t lane_last =
+        std::min<size_t>(last_full_nibble - word_index * 16u, 16);
+    const size_t lane_base_offset = word_index * 64u;
+    scan_full_nibbles_64_sse(
+        s[word_index], prefix_excess_128(s, lane_base_offset), lane_base_offset,
+        lane_first, lane_last, best, best_offset);
+    first_full_nibble = word_index * 16u + lane_last;
+  }
+
+  bit = std::max(bit, first_full_nibble * 4u);
+  current = prefix_excess_128(s, bit);  // the SSE pass did not update current
+  for (; bit < right; ++bit) {
+    scan_bit(s, bit, current, best, best_offset);
+  }
+
+  return {best, best_offset};
+}
+
+}  // namespace detail
+
+/**
+ * @brief Single-64-bit-lane SSE excess_min_128 experiment.
+ *
+ * @details Workflow:
+ *
+ *   scalar boundary -> one 64-bit word as 16 nibbles -> scalar tail
+ *                     fallback to production if full nibbles cross words
+ *
+ * This variant tests whether short ranges benefit from avoiding the 128-bit
+ * cross-lane prefix work used by broader vector paths. It only handles the
+ * single-word full-nibble case; multi-word ranges fall back to production.
+ */
+static inline ExcessResult excess_min_128_lane64_sse(const uint64_t* s,
+                                                     size_t left,
+                                                     size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  const size_t clamped_left = std::min<size_t>(left, 128);
+  const size_t clamped_right = std::min<size_t>(right, 128);
+  const size_t first_full_nibble = ((clamped_left + 3u) & ~size_t{3}) >> 2;
+  const size_t last_full_nibble = clamped_right >> 2;
+  if (first_full_nibble < last_full_nibble &&
+      (first_full_nibble >> 4) != ((last_full_nibble - 1u) >> 4)) {
+    return excess_min_128(s, left, right);  // whole nibbles span both words
+  }
+  return detail::excess_min_128_split64_sse_impl(s, left, right);
+}
+
+/**
+ * @brief Split-64-bit-lane SSE excess_min_128 experiment.
+ *
+ * @details Workflow:
+ *
+ *   scalar boundary -> word 0 full nibbles -> word 1 full nibbles -> tail
+ *                       16-nibble SSE         16-nibble SSE
+ *
+ * Forwards unchanged to detail::excess_min_128_split64_sse_impl, which scans
+ * each 64-bit word as its own 16-nibble vector and recomputes the word's base
+ * excess with prefix_excess_128 instead of carrying a vector prefix across.
+ */
+static inline ExcessResult excess_min_128_split64_sse(const uint64_t* s,
+                                                      size_t left,
+                                                      size_t right) noexcept {
+  return detail::excess_min_128_split64_sse_impl(s, left, right);
+}
+
+/**
+ * @brief Short-range dispatch experiment.
+ *
+ * @details Workflow:
+ *
+ *   width <= 2              -> scalar bits
+ *   full nibbles in 1 word  -> lane64 SSE
+ *   width <= 80             -> split64 SSE
+ *   otherwise               -> production excess_min_128
+ *
+ * This variant tests two ideas together: avoid 128-bit lane crossing when a
+ * query is contained in one 64-bit word, and skip a few production iterations
+ * for medium ranges where split-lane scans may be cheaper.
+ */
+static inline ExcessResult excess_min_128_short_skip(const uint64_t* s,
+                                                     size_t left,
+                                                     size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  const size_t clamped_left = std::min<size_t>(left, 128);
+  const size_t clamped_right = std::min<size_t>(right, 128);
+  const size_t width = clamped_right - clamped_left;
+  if (width <= 2) {
+    return excess_min_128_scalar_bits(s, left, right);
+  }
+
+  const size_t first_full_nibble = ((clamped_left + 3u) & ~size_t{3}) >> 2;
+  const size_t last_full_nibble = clamped_right >> 2;
+  if (first_full_nibble < last_full_nibble &&
+      (first_full_nibble >> 4) == ((last_full_nibble - 1u) >> 4)) {
+    return excess_min_128_lane64_sse(s, left, right);  // repeats this check
+  }
+  if (width <= 80) {
+    return excess_min_128_split64_sse(s, left, right);
+  }
+  return excess_min_128(s, left, right);
+}
+
 // clang-format off
 static inline const __m256i excess_branch_lut_em4 = _mm256_setr_epi8(
     0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -133,6 +699,88 @@ static inline int8_t excess_last_prefix_32x_i8(__m256i pref) noexcept {
   return (int8_t)_mm_extract_epi8(hi, 15);
 }
 
+/**
+ * @brief AVX2 expand-to-i16 excess_min_128 experiment.
+ *
+ * @details Workflow:
+ *
+ *   16 input bits -> 16 x i16 +/-1 -> vector prefix sum -> store -> scalar min
+ *
+ * The implementation scans eight 16-bit chunks. For chunks overlapping the
+ * query, bits are expanded to signed +/-1 i16 lanes and prefix summed with the
+ * running carry. The vector result is stored to memory, then relevant lanes are
+ * checked scalar for the first strict minimum.
+ */
+static inline ExcessResult excess_min_128_expand16_avx2(const uint64_t* s,
+                                                        size_t left,
+                                                        size_t right) noexcept {
+  if (left > right) {
+    return {};
+  }
+  left = std::min<size_t>(left, 128);
+  right = std::min<size_t>(right, 128);
+
+  int best = prefix_excess_128(s, left);
+  size_t best_offset = left;
+  if (left == right) {
+    return {best, best_offset};
+  }
+
+  const __m256i masks = excess_bit_masks_16x();
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i pos = _mm256_set1_epi16(1);
+  const __m256i neg = _mm256_set1_epi16(-1);
+
+  int carry = 0;  // summed delta of the chunks already passed
+  alignas(32) int16_t prefix_values[16];
+  for (size_t chunk = 0; chunk < 8; ++chunk) {
+    const size_t chunk_bit = chunk * 16;
+    const uint16_t bits =
+        chunk < 4
+            ? static_cast<uint16_t>((s[0] >> (chunk * 16)) & 0xFFFFu)
+            : static_cast<uint16_t>((s[1] >> ((chunk - 4) * 16)) & 0xFFFFu);
+    const int delta = 2 * static_cast<int>(std::popcount(bits)) - 16;
+
+    if (chunk_bit + 1 <= right && chunk_bit + 16 >= left) {
+      const __m256i selected = _mm256_and_si256(
+          _mm256_set1_epi16(static_cast<int16_t>(bits)), masks);
+      const __m256i is_zero = _mm256_cmpeq_epi16(selected, zero);
+      const __m256i steps = _mm256_blendv_epi8(pos, neg, is_zero);
+      const __m256i pref =
+          _mm256_add_epi16(excess_prefix_sum_16x_i16(steps),
+                           _mm256_set1_epi16(static_cast<int16_t>(carry)));
+      _mm256_store_si256(reinterpret_cast<__m256i*>(prefix_values), pref);
+
+      for (size_t lane = 0; lane < 16; ++lane) {
+        const size_t offset = chunk_bit + lane + 1;
+        if (offset < left || offset > right) {  // outside [left, right]
+          continue;
+        }
+        const int value = prefix_values[lane];
+        if (value < best) {  // strict: a tie keeps the earlier offset
+          best = value;
+          best_offset = offset;
+        }
+      }
+    }
+    carry += delta;
+  }
+
+  return {best, best_offset};
+}
+
+/**
+ * @brief Historical AVX2 branching-LUT excess_positions_512 variant.
+ *
+ * @details Workflow:
+ *
+ *   128-bit block -> 32 nibbles -> prefix sums -> branch by target-relative
+ *                                 -> LUT masks -> packed output bits
+ *
+ * Each 128-bit block is converted to nibbles, prefix-summed, and filtered by
+ * reachability. A family of per-target LUTs produces within-nibble match masks,
+ * which are packed back to the output words.
+ */
 static inline void excess_positions_512_branching_lut(const uint64_t* s,
                                                       int target_x,
                                                       uint64_t* out) noexcept {
@@ -249,6 +897,13 @@ static inline void excess_positions_512_branching_lut(const uint64_t* s,
   }
 }
 #else
+/**
+ * @brief Scalar fallback for the branching-LUT positions variant.
+ *
+ * @details Used when AVX2 is not enabled. Delegates to production
+ * excess_positions_512 so callers can benchmark the same symbol across build
+ * configurations.
+ */
 static inline void excess_positions_512_branching_lut(const uint64_t* s,
                                                       int target_x,
                                                       uint64_t* out) noexcept {
@@ -378,6 +1033,19 @@ static inline uint64_t excess_repeat_byte(int value) noexcept {
          static_cast<uint8_t>(static_cast<int8_t>(value));
 }
 
+/**
+ * @brief AVX-512 nibble-LUT excess_positions_512 experiment.
+ *
+ * @details Workflow:
+ *
+ *   256 input bits -> 64 nibbles -> two 128-bit logical halves
+ *                  -> nibble prefix/LUT matches -> packed output masks
+ *
+ * The implementation processes four words at a time. It computes reachability
+ * for each 128-bit half, converts bytes to nibbles, builds exclusive prefix
+ * sums, compares nibble-local positions against the target, and packs matches
+ * back to four output words.
+ */
 static inline void excess_positions_512_lut_avx512(const uint64_t* s,
                                                    int target_x,
                                                    uint64_t* out) noexcept {
@@ -463,6 +1131,13 @@ static inline void excess_positions_512_lut_avx512(const uint64_t* s,
   }
 }
 #else
+/**
+ * @brief Fallback for the AVX-512 LUT positions variant.
+ *
+ * @details Used when AVX-512 is not enabled. Delegates to production
+ * excess_positions_512 so callers can benchmark the same symbol across build
+ * configurations.
+ */
 static inline void excess_positions_512_lut_avx512(const uint64_t* s,
                                                    int target_x,
                                                    uint64_t* out) noexcept {
@@ -470,6 +1145,18 @@ static inline void excess_positions_512_lut_avx512(const uint64_t* s,
 }
 #endif
 
+/**
+ * @brief Expand-to-i16 excess_positions_512 experiment.
+ *
+ * @details Workflow:
+ *
+ *   16 input bits -> 16 x i16 +/-1 -> vector prefix sum -> compare target
+ *                 -> pext mask -> output word
+ *
+ * With AVX2, this variant scans 16-bit chunks, expands them to i16 prefix
+ * lanes, compares absolute prefix values against the target, and compresses the
+ * comparison mask into output bits. Without AVX2 it uses a scalar scan.
+ */
 static inline void excess_positions_512_expand(const uint64_t* s,
                                                int target_x,
                                                uint64_t* out) noexcept {
@@ -541,6 +1228,18 @@ static inline void excess_positions_512_expand(const uint64_t* s,
 #endif
 }
 
+/**
+ * @brief AVX2 expand-to-i8 excess_positions_512 experiment.
+ *
+ * @details Workflow:
+ *
+ *   32 input bits -> 32 x i8 +/-1 -> byte prefix sum -> compare target
+ *                 -> movemask -> output word
+ *
+ * This variant handles 32-bit chunks. It first checks whether the target is
+ * reachable within the chunk, then expands bits to byte lanes and uses the
+ * vector comparison mask directly as output bits.
+ */
 static inline void excess_positions_512_expand8(const uint64_t* s,
                                                 int target_x,
                                                 uint64_t* out) noexcept {
@@ -601,6 +1300,18 @@ static inline void excess_positions_512_expand8(const uint64_t* s,
 #endif
 }
 
+/**
+ * @brief AVX-512 expand-to-i8 excess_positions_512 experiment.
+ *
+ * @details Workflow:
+ *
+ *   64 input bits -> 64 x i8 +/-1 -> byte prefix sum -> k-mask output
+ *
+ * This variant processes one 64-bit word per vector. If the target is
+ * unreachable in the word, it advances by popcount delta only. Otherwise it
+ * expands the word to byte lanes, prefix-sums the lanes, compares against the
+ * target, and stores the resulting AVX-512 mask as the output word.
+ */
 static inline void excess_positions_512_expand_avx512(const uint64_t* s,
                                                       int target_x,
                                                       uint64_t* out) noexcept {
@@ -681,6 +1392,19 @@ struct ExcessByteLut {
 
 inline constexpr ExcessByteLut kExcessByteLut;
 
+/**
+ * @brief Scalar byte-LUT excess_positions_512 experiment.
+ *
+ * @details Workflow:
+ *
+ *   byte -> relative target in [-8, 8] -> LUT match mask
+ *        -> byte delta -> next byte base excess
+ *
+ * The table stores, for each byte, all bit positions that reach each local
+ * target and the byte delta. The scan walks 64 bytes, emits a mask when the
+ * relative target is in range, and then advances the running excess by the
+ * byte delta.
+ */
 static inline void excess_positions_512_byte_lut(const uint64_t* s,
                                                  int target_x,
                                                  uint64_t* out) noexcept {
diff --git a/include/pixie/rmq/bp_plus_minus_one_rmq.h b/include/pixie/rmq/bp_plus_minus_one_rmq.h
index 57af0e1..e796b93 100644
--- a/include/pixie/rmq/bp_plus_minus_one_rmq.h
+++ b/include/pixie/rmq/bp_plus_minus_one_rmq.h
@@ -212,8 +212,7 @@ class BpPlusMinusOneRmq {
   std::int64_t block_base_depth(std::size_t block,
                                 const std::array<std::uint64_t, 2>& bits,
                                 std::size_t size) const {
-    const ExcessMin128Result full_min =
-        excess_min_128(bits.data(), 0, size - 1);
+    const ExcessResult full_min = excess_min_128(bits.data(), 0, size - 1);
     return block_min_values_[block] - full_min.min_excess;
   }
 
@@ -225,7 +224,7 @@ class BpPlusMinusOneRmq {
     right_offset = std::min(right_offset, size - 1);
     const auto bits = block_bits(block);
     const std::int64_t base_depth = block_base_depth(block, bits, size);
-    const ExcessMin128Result result =
+    const ExcessResult result =
         excess_min_128(bits.data(), left_offset, right_offset);
     if (result.offset == npos || result.offset >= size) {
       return {};
@@ -237,7 +236,7 @@ class BpPlusMinusOneRmq {
     const std::size_t begin = block * BlockSize;
     const std::size_t size = block_size(block);
     const auto bits = block_bits(block);
-    const ExcessMin128Result result = excess_min_128(bits.data(), 0, size - 1);
+    const ExcessResult result = excess_min_128(bits.data(), 0, size - 1);
     if (result.offset == npos || result.offset >= size) {
       return {};
     }
diff --git a/src/benchmarks/excess_positions_benchmarks.cpp b/src/benchmarks/excess_positions_benchmarks.cpp
index 4bbf28f..0e0850f 100644
--- a/src/benchmarks/excess_positions_benchmarks.cpp
+++ b/src/benchmarks/excess_positions_benchmarks.cpp
@@ -8,12 +8,22 @@
 #include <random>
 #include <vector>
 
+using pixie::experimental::excess_min_128_byte_lut;
+using pixie::experimental::excess_min_128_hybrid_lut;
 using pixie::experimental::excess_positions_512_branching_lut;
 using pixie::experimental::excess_positions_512_byte_lut;
 using pixie::experimental::excess_positions_512_expand;
 using pixie::experimental::excess_positions_512_expand8;
 using pixie::experimental::excess_positions_512_expand_avx512;
 using pixie::experimental::excess_positions_512_lut_avx512;
+#ifdef PIXIE_AVX2_SUPPORT
+using pixie::experimental::excess_min_128_expand16_avx2;
+using pixie::experimental::excess_min_128_lane64_sse;
+using pixie::experimental::excess_min_128_short_skip;
+using pixie::experimental::excess_min_128_split64_sse;
+#endif
+using pixie::experimental::excess_min_128_nibble_lut;
+using pixie::experimental::excess_min_128_scalar_bits;
 
 static std::vector<std::array<uint64_t, 8>> make_blocks(
     size_t num_blocks = 4096) {
@@ -27,6 +37,193 @@ static std::vector<std::array<uint64_t, 8>> make_blocks(
   return blocks;
 }
 
+static std::vector<std::array<uint64_t, 2>> make_128_blocks(
+    size_t num_blocks = 4096) {
+  std::mt19937_64 rng(42);
+  std::vector<std::array<uint64_t, 2>> blocks(num_blocks);
+  for (auto& b : blocks) {
+    b = {rng(), rng()};
+  }
+  return blocks;
+}
+
+static std::vector<std::pair<size_t, size_t>> make_128_ranges(
+    size_t num_ranges = 4096) {
+  std::mt19937_64 rng(43);
+  std::uniform_int_distribution<size_t> offset_dist(0, 128);
+  std::vector<std::pair<size_t, size_t>> ranges(num_ranges);
+  for (auto& range : ranges) {
+    size_t left = offset_dist(rng);
+    size_t right = offset_dist(rng);
+    if (left > right) {
+      std::swap(left, right);
+    }
+    range = {left, right};
+  }
+  return ranges;
+}
+
+static std::vector<int> make_512_targets(size_t num_targets = 4096) {
+  std::mt19937 rng(44);
+  std::uniform_int_distribution<int> target_dist(-128, 128);
+  std::vector<int> targets(num_targets);
+  for (int& target : targets) {
+    target = target_dist(rng);
+  }
+  return targets;
+}
+
+static void BM_ExcessMin128(benchmark::State& state) {
+  const size_t left = static_cast<size_t>(state.range(0));
+  const size_t right = static_cast<size_t>(state.range(1));
+  const auto blocks = make_128_blocks();
+  const size_t num_blocks = blocks.size();
+
+  size_t idx = 0;
+  for (auto _ : state) {
+    const auto& s = blocks[idx % num_blocks];
+    ExcessResult result = excess_min_128(s.data(), left, right);
+    benchmark::DoNotOptimize(result.min_excess);
+    benchmark::DoNotOptimize(result.offset);
+    ++idx;
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_ExcessMin128)
+    ->ArgNames({"left", "right"})
+    ->Args({0, 128})
+    ->Args({0, 127})
+    ->Args({0, 16})
+    ->Args({0, 32})
+    ->Args({0, 48})
+    ->Args({0, 64})
+    ->Args({0, 31})
+    ->Args({1, 17})
+    ->Args({3, 35})
+    ->Args({5, 37})
+    ->Args({32, 64})
+    ->Args({33, 65})
+    ->Args({64, 96})
+    ->Args({61, 93})
+    ->Args({96, 128})
+    ->Args({56, 72})
+    ->Args({60, 68})
+    ->Args({63, 64})
+    ->Args({17, 17});
+
+template <ExcessResult (*Fn)(const uint64_t*, size_t, size_t)>
+static void BM_ExcessMin128Variant(benchmark::State& state) {
+  const size_t left = static_cast<size_t>(state.range(0));
+  const size_t right = static_cast<size_t>(state.range(1));
+  const auto blocks = make_128_blocks();
+  const size_t num_blocks = blocks.size();
+
+  size_t idx = 0;
+  for (auto _ : state) {
+    const auto& s = blocks[idx % num_blocks];
+    ExcessResult result = Fn(s.data(), left, right);
+    benchmark::DoNotOptimize(result.min_excess);
+    benchmark::DoNotOptimize(result.offset);
+    ++idx;
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+#define PIXIE_BENCH_EXCESS_MIN_VARIANT(name, fn) \
+  BENCHMARK_TEMPLATE(BM_ExcessMin128Variant, fn) \
+      ->Name(name)                               \
+      ->ArgNames({"left", "right"})              \
+      ->Args({0, 128})                           \
+      ->Args({0, 127})                           \
+      ->Args({0, 16})                            \
+      ->Args({0, 32})                            \
+      ->Args({0, 48})                            \
+      ->Args({0, 64})                            \
+      ->Args({0, 31})                            \
+      ->Args({1, 17})                            \
+      ->Args({3, 35})                            \
+      ->Args({5, 37})                            \
+      ->Args({32, 64})                           \
+      ->Args({33, 65})                           \
+      ->Args({64, 96})                           \
+      ->Args({61, 93})                           \
+      ->Args({96, 128})                          \
+      ->Args({56, 72})                           \
+      ->Args({60, 68})                           \
+      ->Args({63, 64})                           \
+      ->Args({17, 17})
+
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_ScalarBits",
+                               excess_min_128_scalar_bits);
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_NibbleLUT",
+                               excess_min_128_nibble_lut);
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_ByteLUT",
+                               excess_min_128_byte_lut);
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_HybridLUT",
+                               excess_min_128_hybrid_lut);
+#ifdef PIXIE_AVX2_SUPPORT
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_Expand16AVX2",
+                               excess_min_128_expand16_avx2);
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_Lane64SSE",
+                               excess_min_128_lane64_sse);
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_Split64SSE",
+                               excess_min_128_split64_sse);
+PIXIE_BENCH_EXCESS_MIN_VARIANT("BM_ExcessMin128_ShortSkip",
+                               excess_min_128_short_skip);
+#endif
+
+#undef PIXIE_BENCH_EXCESS_MIN_VARIANT
+
+template <ExcessResult (*Fn)(const uint64_t*, size_t, size_t)>
+static void BM_ExcessMin128RandomRange(benchmark::State& state) {
+  const auto blocks = make_128_blocks();
+  const auto ranges = make_128_ranges();
+  const size_t num_blocks = blocks.size();
+  const size_t num_ranges = ranges.size();
+
+  size_t idx = 0;
+  for (auto _ : state) {
+    const auto& s = blocks[idx % num_blocks];
+    const auto [left, right] = ranges[idx % num_ranges];
+    ExcessResult result = Fn(s.data(), left, right);
+    benchmark::DoNotOptimize(result.min_excess);
+    benchmark::DoNotOptimize(result.offset);
+    ++idx;
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+#define PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT(name, fn) \
+  BENCHMARK_TEMPLATE(BM_ExcessMin128RandomRange, fn)->Name(name)
+
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_RandomRange",
+                                      excess_min_128);
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_ScalarBits_RandomRange",
+                                      excess_min_128_scalar_bits);
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_NibbleLUT_RandomRange",
+                                      excess_min_128_nibble_lut);
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_ByteLUT_RandomRange",
+                                      excess_min_128_byte_lut);
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_HybridLUT_RandomRange",
+                                      excess_min_128_hybrid_lut);
+#ifdef PIXIE_AVX2_SUPPORT
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT(
+    "BM_ExcessMin128_Expand16AVX2_RandomRange",
+    excess_min_128_expand16_avx2);
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_Lane64SSE_RandomRange",
+                                      excess_min_128_lane64_sse);
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_Split64SSE_RandomRange",
+                                      excess_min_128_split64_sse);
+PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT("BM_ExcessMin128_ShortSkip_RandomRange",
+                                      excess_min_128_short_skip);
+#endif
+
+#undef PIXIE_BENCH_EXCESS_MIN_RANDOM_VARIANT
+
 static void BM_ExcessPositions512(benchmark::State& state) {
   const int target_x = state.range(0);
   const auto blocks = make_blocks();
@@ -183,6 +380,22 @@ BENCHMARK(BM_ExcessPositions512_ExpandAVX512)
     ->Args({8})
     ->Args({64});
 
+static void excess_positions_512_scalar_benchmark(const uint64_t* s,
+                                                  int target_x,
+                                                  uint64_t* out) noexcept {
+  for (int w = 0; w < 8; ++w) {
+    out[w] = 0;
+  }
+  int cur = 0;
+  for (size_t i = 0; i < 512; ++i) {
+    const int bit = int((s[i >> 6] >> (i & 63)) & 1ull);
+    cur += bit ? +1 : -1;
+    if (cur == target_x) {
+      out[i >> 6] |= (uint64_t{1} << (i & 63));
+    }
+  }
+}
+
 static void BM_ExcessPositions512_Scalar(benchmark::State& state) {
   const int target_x = state.range(0);
   const auto blocks = make_blocks();
@@ -193,17 +406,7 @@ static void BM_ExcessPositions512_Scalar(benchmark::State& state) {
 
   for (auto _ : state) {
     const auto& s = blocks[idx % num_blocks];
-    for (int w = 0; w < 8; ++w) {
-      out[w] = 0;
-    }
-    int cur = 0;
-    for (size_t i = 0; i < 512; ++i) {
-      const int bit = int((s[i >> 6] >> (i & 63)) & 1ull);
-      cur += bit ? +1 : -1;
-      if (cur == target_x) {
-        out[i >> 6] |= (uint64_t{1} << (i & 63));
-      }
-    }
+    excess_positions_512_scalar_benchmark(s.data(), target_x, out);
     benchmark::DoNotOptimize(out);
     ++idx;
   }
@@ -244,3 +447,52 @@ BENCHMARK(BM_ExcessPositions512_ByteLUT)
     ->Args({0})
     ->Args({8})
     ->Args({64});
+
+template <void (*Fn)(const uint64_t*, int, uint64_t*)>
+static void BM_ExcessPositions512RandomTarget(benchmark::State& state) {
+  const auto blocks = make_blocks();
+  const auto targets = make_512_targets();
+  const size_t num_blocks = blocks.size();
+  const size_t num_targets = targets.size();
+
+  alignas(64) uint64_t out[8];
+  size_t idx = 0;
+
+  for (auto _ : state) {
+    const auto& s = blocks[idx % num_blocks];
+    Fn(s.data(), targets[idx % num_targets], out);
+    benchmark::DoNotOptimize(out);
+    ++idx;
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+#define PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(name, fn) \
+  BENCHMARK_TEMPLATE(BM_ExcessPositions512RandomTarget, fn)->Name(name)
+
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM("BM_ExcessPositions512_RandomTarget",
+                                        excess_positions_512);
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(
+    "BM_ExcessPositions512_BranchingLUT_RandomTarget",
+    excess_positions_512_branching_lut);
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(
+    "BM_ExcessPositions512_LUTAVX512_RandomTarget",
+    excess_positions_512_lut_avx512);
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(
+    "BM_ExcessPositions512_Expand_RandomTarget",
+    excess_positions_512_expand);
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(
+    "BM_ExcessPositions512_Expand8_RandomTarget",
+    excess_positions_512_expand8);
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(
+    "BM_ExcessPositions512_ExpandAVX512_RandomTarget",
+    excess_positions_512_expand_avx512);
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(
+    "BM_ExcessPositions512_Scalar_RandomTarget",
+    excess_positions_512_scalar_benchmark);
+PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM(
+    "BM_ExcessPositions512_ByteLUT_RandomTarget",
+    excess_positions_512_byte_lut);
+
+#undef PIXIE_BENCH_EXCESS_POSITIONS_512_RANDOM
diff --git a/src/docs/excess_positions_benchmark_results.md b/src/docs/excess_positions_benchmark_results.md
new file mode 100644
index 0000000..17adf71
--- /dev/null
+++ b/src/docs/excess_positions_benchmark_results.md
@@ -0,0 +1,9 @@
+| Method | X=-64 | X=-8 | X=0 | X=8 | X=64 |
+|---|---:|---:|---:|---:|---:|
+| BranchingLUT | 15.71 ns | 26.56 ns | 26.55 ns | 26.43 ns | 15.37 ns |
+| Current | 10.73 ns | 18.09 ns | 18.58 ns | 18.24 ns | 10.43 ns |
+| Expand | 60.70 ns | 88.41 ns | 87.38 ns | 88.50 ns | 56.77 ns |
+| Expand8 | 19.13 ns | 53.05 ns | 47.83 ns | 49.35 ns | 17.44 ns |
+| ExpandAVX512 | 23.13 ns | 38.39 ns | 38.56 ns | 39.24 ns | 23.19 ns |
+| LUTAVX512 | 12.33 ns | 18.34 ns | 18.06 ns | 18.21 ns | 12.75 ns |
+| Scalar | 304.42 ns | 389.58 ns | 446.94 ns | 399.80 ns | 316.23 ns |
diff --git a/src/tests/excess_positions_tests.cpp b/src/tests/excess_positions_tests.cpp
index d37a7d3..62cdf3a 100644
--- a/src/tests/excess_positions_tests.cpp
+++ b/src/tests/excess_positions_tests.cpp
@@ -10,12 +10,22 @@
 #include <random>
 #include <utility>
 
+using pixie::experimental::excess_min_128_byte_lut;
+using pixie::experimental::excess_min_128_hybrid_lut;
 using pixie::experimental::excess_positions_512_branching_lut;
 using pixie::experimental::excess_positions_512_byte_lut;
 using pixie::experimental::excess_positions_512_expand;
 using pixie::experimental::excess_positions_512_expand8;
 using pixie::experimental::excess_positions_512_expand_avx512;
 using pixie::experimental::excess_positions_512_lut_avx512;
+#ifdef PIXIE_AVX2_SUPPORT
+using pixie::experimental::excess_min_128_expand16_avx2;
+using pixie::experimental::excess_min_128_lane64_sse;
+using pixie::experimental::excess_min_128_short_skip;
+using pixie::experimental::excess_min_128_split64_sse;
+#endif
+using pixie::experimental::excess_min_128_nibble_lut;
+using pixie::experimental::excess_min_128_scalar_bits;
 
 static void naive_excess_positions_512(const uint64_t* s,
                                        int target_x,
@@ -69,9 +79,9 @@ static int naive_prefix_excess_128(const uint64_t* s, size_t end_offset) {
   return cur;
 }
 
-static ExcessMin128Result naive_excess_min_128(const uint64_t* s,
-                                               size_t left,
-                                               size_t right) {
+static ExcessResult naive_excess_min_128(const uint64_t* s,
+                                         size_t left,
+                                         size_t right) {
   if (left > right) {
     return {};
   }
@@ -170,6 +180,23 @@ static void check_matches_naive(Fn fn,
       << fn_name << " case=" << case_id << " x=" << target_x;
 }
 
+template <typename Fn>
+static void check_min_matches_naive(Fn fn,
+                                    const char* fn_name,
+                                    const uint64_t* s,
+                                    size_t left,
+                                    size_t right,
+                                    int case_id = 0) {
+  const ExcessResult result = fn(s, left, right);
+  const ExcessResult expected = naive_excess_min_128(s, left, right);
+  ASSERT_EQ(result.min_excess, expected.min_excess)
+      << fn_name << " case=" << case_id << " left=" << left
+      << " right=" << right;
+  ASSERT_EQ(result.offset, expected.offset)
+      << fn_name << " case=" << case_id << " left=" << left
+      << " right=" << right;
+}
+
 TEST(ExcessPositions128, MatchesNaiveMasksAndDelta) {
   const std::array<std::array<uint64_t, 2>, 4> cases = {{
       {0, 0},
@@ -231,9 +258,8 @@ TEST(ExcessPositions128, MinMatchesNaiveFixedCases) {
 
   for (const auto& s : cases) {
     for (const auto [left, right] : ranges) {
-      const ExcessMin128Result result = excess_min_128(s.data(), left, right);
-      const ExcessMin128Result expected =
-          naive_excess_min_128(s.data(), left, right);
+      const ExcessResult result = excess_min_128(s.data(), left, right);
+      const ExcessResult expected = naive_excess_min_128(s.data(), left, right);
       EXPECT_EQ(result.min_excess, expected.min_excess)
           << "left=" << left << " right=" << right;
       EXPECT_EQ(result.offset, expected.offset)
@@ -245,18 +271,54 @@ TEST(ExcessPositions128, MinMatchesNaiveFixedCases) {
 TEST(ExcessPositions128, MinReturnsFirstTie) {
   const std::array<uint64_t, 2> s = {0x5555555555555555ull,
                                      0x5555555555555555ull};
-  const ExcessMin128Result result = excess_min_128(s.data(), 0, 128);
+  const ExcessResult result = excess_min_128(s.data(), 0, 128);
   EXPECT_EQ(result.min_excess, 0);
   EXPECT_EQ(result.offset, 0u);
 
-  const ExcessMin128Result shifted = excess_min_128(s.data(), 1, 128);
+  const ExcessResult shifted = excess_min_128(s.data(), 1, 128);
   EXPECT_EQ(shifted.min_excess, 0);
   EXPECT_EQ(shifted.offset, 2u);
+
+  const ExcessResult left_tie = excess_min_128(s.data(), 2, 128);
+  EXPECT_EQ(left_tie.min_excess, 0);
+  EXPECT_EQ(left_tie.offset, 2u);
+}
+
+TEST(ExcessPositions128, MinHandlesRightBoundary) {
+  const std::array<uint64_t, 2> s = {0, 0};
+
+  const ExcessResult without_last = excess_min_128(s.data(), 0, 127);
+  EXPECT_EQ(without_last.min_excess, -127);
+  EXPECT_EQ(without_last.offset, 127u);
+
+  const ExcessResult with_last = excess_min_128(s.data(), 0, 128);
+  EXPECT_EQ(with_last.min_excess, -128);
+  EXPECT_EQ(with_last.offset, 128u);
+}
+
+TEST(ExcessPositions128, MinPartialNibbleBoundsExcludeOuterMin) {
+  const std::array<uint64_t, 2> s = {0, 0};
+
+  const ExcessResult short_prefix = excess_min_128(s.data(), 1, 2);
+  EXPECT_EQ(short_prefix.min_excess, -2);
+  EXPECT_EQ(short_prefix.offset, 2u);
+
+  const ExcessResult short_suffix = excess_min_128(s.data(), 2, 3);
+  EXPECT_EQ(short_suffix.min_excess, -3);
+  EXPECT_EQ(short_suffix.offset, 3u);
+}
+
+TEST(ExcessPositions128, MinPositiveRangeKeepsLeftBoundary) {
+  const std::array<uint64_t, 2> s = {UINT64_MAX, UINT64_MAX};
+
+  const ExcessResult result = excess_min_128(s.data(), 64, 128);
+  EXPECT_EQ(result.min_excess, 64);
+  EXPECT_EQ(result.offset, 64u);
 }
 
 TEST(ExcessPositions128, MinInvalidRangeUsesSentinel) {
   const std::array<uint64_t, 2> s = {0, 0};
-  const ExcessMin128Result result = excess_min_128(s.data(), 17, 16);
+  const ExcessResult result = excess_min_128(s.data(), 17, 16);
   EXPECT_EQ(result.min_excess, 0);
   EXPECT_EQ(result.offset, 128u);
 }
@@ -273,9 +335,8 @@ TEST(ExcessPositions128, MinMatchesNaiveRandom) {
       if (left > right) {
         std::swap(left, right);
       }
-      const ExcessMin128Result result = excess_min_128(s.data(), left, right);
-      const ExcessMin128Result expected =
-          naive_excess_min_128(s.data(), left, right);
+      const ExcessResult result = excess_min_128(s.data(), left, right);
+      const ExcessResult expected = naive_excess_min_128(s.data(), left, right);
       ASSERT_EQ(result.min_excess, expected.min_excess)
           << "case=" << t << " left=" << left << " right=" << right;
       ASSERT_EQ(result.offset, expected.offset)
@@ -284,6 +345,80 @@ TEST(ExcessPositions128, MinMatchesNaiveRandom) {
   }
 }
 
+TEST(ExcessPositions128Experimental, MinVariantsMatchNaive) {
+  const std::array<std::array<uint64_t, 2>, 6> cases = {{
+      {0, 0},
+      {UINT64_MAX, UINT64_MAX},
+      {0xAAAAAAAAAAAAAAAAull, 0x5555555555555555ull},
+      {0x0123456789ABCDEFull, 0xFEDCBA9876543210ull},
+      {0x0000FFFF0000FFFFull, 0xFFFF0000FFFF0000ull},
+      {0x1111222233334444ull, 0x8888777766665555ull},
+  }};
+  const std::array<std::pair<size_t, size_t>, 25> ranges = {{
+      {0, 128},   {0, 127},   {0, 16},    {0, 31},    {0, 32},
+      {0, 48},    {0, 64},    {32, 64},   {64, 96},   {96, 128},
+      {56, 72},   {60, 68},   {63, 64},   {17, 17},   {1, 2},
+      {2, 3},     {3, 6},     {5, 5},     {64, 64},   {64, 128},
+      {127, 128}, {128, 128}, {120, 127}, {129, 140}, {17, 16},
+  }};
+
+  int case_id = 0;
+  for (const auto& s : cases) {
+    for (const auto [left, right] : ranges) {
+      check_min_matches_naive(excess_min_128_scalar_bits, "scalar_bits",
+                              s.data(), left, right, case_id);
+      check_min_matches_naive(excess_min_128_nibble_lut, "nibble_lut", s.data(),
+                              left, right, case_id);
+      check_min_matches_naive(excess_min_128_byte_lut, "byte_lut", s.data(),
+                              left, right, case_id);
+      check_min_matches_naive(excess_min_128_hybrid_lut, "hybrid_lut", s.data(),
+                              left, right, case_id);
+#ifdef PIXIE_AVX2_SUPPORT
+      check_min_matches_naive(excess_min_128_expand16_avx2, "expand16_avx2",
+                              s.data(), left, right, case_id);
+      check_min_matches_naive(excess_min_128_lane64_sse, "lane64_sse", s.data(),
+                              left, right, case_id);
+      check_min_matches_naive(excess_min_128_split64_sse, "split64_sse",
+                              s.data(), left, right, case_id);
+      check_min_matches_naive(excess_min_128_short_skip, "short_skip", s.data(),
+                              left, right, case_id);
+#endif
+      ++case_id;
+    }
+  }
+}
+
+TEST(ExcessPositions128Experimental, MinVariantsMatchNaiveRandom) {
+  std::mt19937_64 rng(44);
+  std::uniform_int_distribution<size_t> offset_dist(0, 140);
+
+  for (int t = 0; t < 500; ++t) {
+    const std::array<uint64_t, 2> s = {rng(), rng()};
+    for (int q = 0; q < 16; ++q) {
+      const size_t left = offset_dist(rng);
+      const size_t right = offset_dist(rng);
+      check_min_matches_naive(excess_min_128_scalar_bits, "scalar_bits",
+                              s.data(), left, right, t);
+      check_min_matches_naive(excess_min_128_nibble_lut, "nibble_lut", s.data(),
+                              left, right, t);
+      check_min_matches_naive(excess_min_128_byte_lut, "byte_lut", s.data(),
+                              left, right, t);
+      check_min_matches_naive(excess_min_128_hybrid_lut, "hybrid_lut", s.data(),
+                              left, right, t);
+#ifdef PIXIE_AVX2_SUPPORT
+      check_min_matches_naive(excess_min_128_expand16_avx2, "expand16_avx2",
+                              s.data(), left, right, t);
+      check_min_matches_naive(excess_min_128_lane64_sse, "lane64_sse", s.data(),
+                              left, right, t);
+      check_min_matches_naive(excess_min_128_split64_sse, "split64_sse",
+                              s.data(), left, right, t);
+      check_min_matches_naive(excess_min_128_short_skip, "short_skip", s.data(),
+                              left, right, t);
+#endif
+    }
+  }
+}
+
 TEST(ExcessPositions128, ForwardAndBackwardSearchMatchNaive) {
   std::mt19937_64 rng(42);
   const std::array<size_t, 8> offsets = {0, 1, 63, 64, 65, 126, 127, 128};

From 952e9afa8f9c1b2392e08359f373756bbcf67ccc Mon Sep 17 00:00:00 2001
From: Nikolay Malkovsky <malkovskynv@gmail.com>
Date: Sat, 30 May 2026 22:50:46 +0300
Subject: [PATCH 4/7] chore: remove agentic/cpp subtree for submodule migration

---
 agentic/cpp/README.md                         |   15 -
 agentic/cpp/commands/README.md                |    6 -
 agentic/cpp/commands/benchmarks-affected.md   |   34 -
 agentic/cpp/commands/perf-review.md           |  149 ---
 agentic/cpp/commands/ping.md                  |    7 -
 .../cpp/skills/benchmarks-affected/SKILL.md   |   81 --
 .../analyze_benchmarks_affected.py            | 1132 -----------------
 .../benchmarks-compare-revisions/SKILL.md     |  225 ----
 agentic/cpp/skills/benchmarks/SKILL.md        |  209 ---
 .../benchmarks/scripts/plot_benchmarks.py     |  159 ---
 agentic/cpp/skills/cmake/SKILL.md             |   96 --
 agentic/cpp/skills/diagnose-segfault/SKILL.md |  195 ---
 .../skills/optimization-experiment/SKILL.md   |  193 ---
 agentic/cpp/skills/paper-search/SKILL.md      |  106 --
 .../paper-search/references/api_reference.md  |   32 -
 .../paper-search/scripts/search_papers.py     |  333 -----
 agentic/cpp/skills/pdf/SKILL.md               |  112 --
 agentic/cpp/skills/setup-cpp-repo/SKILL.md    |  136 --
 .../references/project_structure.md           |  117 --
 .../scripts/init_cpp_project.py               | 1053 ---------------
 20 files changed, 4390 deletions(-)
 delete mode 100644 agentic/cpp/README.md
 delete mode 100644 agentic/cpp/commands/README.md
 delete mode 100644 agentic/cpp/commands/benchmarks-affected.md
 delete mode 100644 agentic/cpp/commands/perf-review.md
 delete mode 100644 agentic/cpp/commands/ping.md
 delete mode 100644 agentic/cpp/skills/benchmarks-affected/SKILL.md
 delete mode 100755 agentic/cpp/skills/benchmarks-affected/analyze_benchmarks_affected.py
 delete mode 100644 agentic/cpp/skills/benchmarks-compare-revisions/SKILL.md
 delete mode 100644 agentic/cpp/skills/benchmarks/SKILL.md
 delete mode 100644 agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py
 delete mode 100644 agentic/cpp/skills/cmake/SKILL.md
 delete mode 100644 agentic/cpp/skills/diagnose-segfault/SKILL.md
 delete mode 100644 agentic/cpp/skills/optimization-experiment/SKILL.md
 delete mode 100644 agentic/cpp/skills/paper-search/SKILL.md
 delete mode 100644 agentic/cpp/skills/paper-search/references/api_reference.md
 delete mode 100755 agentic/cpp/skills/paper-search/scripts/search_papers.py
 delete mode 100644 agentic/cpp/skills/pdf/SKILL.md
 delete mode 100644 agentic/cpp/skills/setup-cpp-repo/SKILL.md
 delete mode 100644 agentic/cpp/skills/setup-cpp-repo/references/project_structure.md
 delete mode 100755 agentic/cpp/skills/setup-cpp-repo/scripts/init_cpp_project.py

diff --git a/agentic/cpp/README.md b/agentic/cpp/README.md
deleted file mode 100644
index 68a2262..0000000
--- a/agentic/cpp/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Shared C++ Agent Skills
-
-This subtree contains reusable C++ agent skills and related commands.
-
-Keep this tree generic:
-
-- Do not add project-specific benchmark names, CMake options, or paths.
-- Keep reusable scripts beside the skills that use them.
-- Put project-specific examples in the consuming repository under
-  `agentic/local/cpp/skills/<skill-name>/EXAMPLES.md`.
-
-When using a skill in a project, read:
-
-1. `agentic/cpp/skills/<skill-name>/SKILL.md`
-2. `agentic/local/cpp/skills/<skill-name>/EXAMPLES.md`, if present
diff --git a/agentic/cpp/commands/README.md b/agentic/cpp/commands/README.md
deleted file mode 100644
index 2a24b20..0000000
--- a/agentic/cpp/commands/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Shared C++ Agent Commands
-
-Reusable command definitions for C++ projects belong here.
-
-Keep project-specific commands in the consuming repository under
-`agentic/local/cpp/commands`.
diff --git a/agentic/cpp/commands/benchmarks-affected.md b/agentic/cpp/commands/benchmarks-affected.md
deleted file mode 100644
index 158cb9e..0000000
--- a/agentic/cpp/commands/benchmarks-affected.md
+++ /dev/null
@@ -1,34 +0,0 @@
----
-description: Scan current branch and report impacted benchmark targets/functions.
----
-
-# Benchmarks Affected
-
-Identify which benchmark binaries and benchmark functions are affected by changes on the current branch.
-
-Use the `benchmarks-affected` skill as the single source of truth for workflow details and guardrails.
-Do not duplicate or override the skill instructions in this command.
-
-## Inputs
-
-- Optional `--baseline <ref>` (default: `main`)
-- Optional `--compile-commands <path>`
-- Optional `--no-include-working-tree`
-- Optional `--format <text|json>` (default: `text`)
-
-## Workflow
-
-1. Execute the `benchmarks-affected` skill workflow.
-2. Pass through command inputs to the analyzer invocation defined by the skill.
-3. Report results with these sections:
-   - Changed files
-   - Affected benchmark targets
-   - Affected benchmark functions
-   - Suggested `--benchmark_filter` regex
-   - Any warnings/failures
-
-## Output rules
-
-1. If `affected_benchmarks` is non-empty, prioritize those names.
-2. If `affected_benchmarks` is empty but benchmark targets are affected, mark result as partial and include target-level impact.
-3. Do not run full benchmark suites in this command; this command is for impact discovery only.
diff --git a/agentic/cpp/commands/perf-review.md b/agentic/cpp/commands/perf-review.md
deleted file mode 100644
index 8ef1865..0000000
--- a/agentic/cpp/commands/perf-review.md
+++ /dev/null
@@ -1,149 +0,0 @@
----
-description: Benchmark-driven PR performance review versus target branch
----
-
-# Perf Review Workflow
-
-You are performing a performance review for the current PR branch.
-
-Non-negotiable requirements:
-1. Benchmark timing plus profiling data is the highest-priority judgment tool.
-2. Compare source branch versus target branch and report relevant benchmark metric changes.
-3. Provide analysis and a final verdict: does the PR improve performance or not.
-
-## Inputs
-
-- Optional argument `--target <branch>`: target branch override.
-- Optional argument `--filter <regex>`: benchmark filter regex.
-- Optional argument `--no-counters`: disable hardware-counter collection.
-
-If arguments are omitted:
-- Default target branch to PR base branch from `gh pr view --json baseRefName` when available.
-- Fall back target branch to `main`.
-
-Filter handling:
-- If `--filter` is provided, pass it through.
-- Else use the filter produced by `benchmarks-affected` through `benchmarks-compare-revisions`.
-- If no filter can be derived, run conservative full-binary compare for impacted binaries.
-
-## Step 1 - Resolve branches and hashes
-
-1. Resolve contender from current checkout (`HEAD`) and compute short hash.
-2. Resolve baseline branch using precedence: `--target` -> PR base from `gh pr view --json baseRefName` -> `main`.
-3. Resolve baseline short hash.
-4. Print branch/hash mapping before benchmark execution.
-
-## Step 2 - Run timing and hardware-counter comparison via skill (single source of truth)
-
-Use `benchmarks-compare-revisions` as the single source of truth for revision builds, benchmark scope, compare.py flow, retry policy, and guardrails.
-
-Pass-through inputs:
-- Baseline ref/hash from Step 1.
-- Contender ref/hash from Step 1.
-- Optional `--filter` override.
-- Counter mode: default on (`COLLECT_COUNTERS=1`) on Linux, disabled when `--no-counters` is provided.
-
-Consume outputs from `benchmarks-compare-revisions`:
-- Baseline and contender benchmark JSON artifacts.
-- compare.py output per binary.
-- Effective filter used.
-- Scope metadata from `benchmarks-affected` (`affected_benchmark_targets`, `affected_benchmarks`) when available.
-- `counters_available` status and, when unavailable, explicit reason.
-- Baseline and contender counter JSON artifacts (when available).
-- Derived counter metrics per benchmark (IPC, cache miss rate, branch mispredict rate).
-- Counter anomaly list and ready-to-embed counter summary table.
-
-Execution guardrails:
-- Run benchmarks sequentially.
-- No background jobs (`nohup`, `&`).
-- Use Release timing builds only.
-- If timing comparison fails, return blocked verdict with exact failure points.
-
-## Step 3 - Consume delegated hardware-counter outputs
-
-Hardware-counter collection is delegated to `benchmarks-compare-revisions`.
-
-Pass-through inputs:
-- `COLLECT_COUNTERS=1` by default on Linux (unless `--no-counters` is provided).
-- Same baseline/contender refs and effective filter used in Step 2.
-
-Consume outputs:
-- Counter preflight result.
-- Counter JSON artifacts for both revisions.
-- Derived metrics (IPC, cache miss rate, branch mispredict rate).
-- Anomaly list and counter summary table for report embedding.
-
-If counters are unavailable (`counters_available=false`), continue with timing-only review and explicitly mark profiling as unavailable in the report.
-
-## Step 4 - Analyze timing and counter data
-
-Timing classification per benchmark entry:
-- Improvement: time delta < -5%
-- Regression: time delta > +5%
-- Neutral: between -5% and +5%
-
-Aggregate per binary:
-- Number of improvements/regressions/neutral
-- Net average percentage change
-- Largest regression and largest improvement
-
-Counter correlation:
-- Use skill-provided hardware counter summary and anomaly list to explain major timing changes.
-- Do not recompute derived counter metrics in this command.
-
-Judgment priority:
-- Base verdict primarily on benchmark timing comparison.
-- Use counter data as explanatory evidence and confidence signal.
-
-Noise-control expectations:
-- Include at least one control benchmark family expected to be unaffected by the code change.
-- Treat isolated swings without pattern as noise unless reproduced across related sizes/fill ratios.
-
-## Step 5 - Produce final markdown report
-
-Return a structured markdown report with this shape:
-
-```markdown
-## Performance Review: <contender_branch> vs <baseline_branch>
-
-### Configuration
-- Baseline: <branch> (<hash>)
-- Contender: <branch> (<hash>)
-- Platform: <os/arch>
-- Benchmarks run: <binary list>
-- Filter: <regex or none>
-- Hardware counters: available / unavailable
-
-### Timing Summary
-| Binary | Improvements | Regressions | Neutral | Net Change |
-|---|---:|---:|---:|---:|
-| ... | N | N | N | +/-X% |
-
-### Detailed Timing Results
-<Annotated compare.py outputs by binary>
-
-### Hardware Counter Profile (if available)
-| Benchmark | IPC (base->new) | Cache Miss Rate (base->new) | Branch Mispredict (base->new) |
-|---|---:|---:|---:|
-| ... | X.XX -> Y.YY | A.A% -> B.B% | C.C% -> D.D% |
-
-### Key Findings
-- <Most important regressions/improvements>
-- <Counter-based explanations for key timing shifts>
-
-### Verdict
-**[IMPROVES PERFORMANCE | REGRESSES PERFORMANCE | NO SIGNIFICANT CHANGE]**
-
-<1-2 sentence justification grounded in benchmark metrics, with profiling context if available>
-```
-
-Verdict rules:
-- `IMPROVES PERFORMANCE`: improvements outnumber regressions, no severe regression (>10%), and net average change is favorable.
-- `REGRESSES PERFORMANCE`: any severe regression (>10%) or regressions dominate with net unfavorable average.
-- `NO SIGNIFICANT CHANGE`: mostly neutral changes or mixed results that approximately cancel out.
-
-## Failure Handling
-
-- If required builds fail or timing comparison cannot run, output a blocked review with exact failure points and no misleading verdict.
-- If only profiling fails (`counters_available=false` from delegated skill output), continue with timing-based verdict and explicitly list profiling limitation.
-- If JSON output is invalid/truncated, discard it and rerun that benchmark command once with tighter filter and explicit output redirection.
diff --git a/agentic/cpp/commands/ping.md b/agentic/cpp/commands/ping.md
deleted file mode 100644
index b5edaf7..0000000
--- a/agentic/cpp/commands/ping.md
+++ /dev/null
@@ -1,7 +0,0 @@
----
-description: Test command that replies with pong
----
-
-Respond with exactly `pong`.
-Do not add any other words.
-Do not add quotes or punctuation.
diff --git a/agentic/cpp/skills/benchmarks-affected/SKILL.md b/agentic/cpp/skills/benchmarks-affected/SKILL.md
deleted file mode 100644
index 2072dcb..0000000
--- a/agentic/cpp/skills/benchmarks-affected/SKILL.md
+++ /dev/null
@@ -1,81 +0,0 @@
----
-name: benchmarks-affected
-description: Analyze current branch versus a baseline and extract affected benchmark targets and benchmark functions using compile_commands and clang AST.
----
-
-# Benchmarks Affected Skill
-
-Use this skill to identify exactly which benchmark binaries and benchmark functions are affected by code changes on the current branch.
-
-It implements a two-stage workflow:
-
-1. `compile_commands.json` analysis to determine affected compile targets.
-2. Clang AST analysis to determine affected benchmark functions.
-
-## Goal
-
-Given `HEAD` and a baseline branch (default `main`), produce:
-
-- Changed files.
-- Affected targets (with emphasis on benchmark targets).
-- Exact benchmark functions impacted by the changes.
-- A ready-to-use Google Benchmark filter regex.
-
-## Prerequisites
-
-1. Build tree with benchmarks enabled and compile database exported. Use the
-repository's normal benchmark-enabling CMake options:
-
-```bash
-BUILD_SUFFIX=local
-cmake -B build/benchmarks-all_${BUILD_SUFFIX} \
-  -DCMAKE_BUILD_TYPE=Release \
-  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-cmake --build build/benchmarks-all_${BUILD_SUFFIX} --config Release -j
-```
-
-2. `clang++` must be available on `PATH` (used for AST dump).
-
-For repository-specific invocations, check
-`agentic/local/cpp/skills/benchmarks-affected/EXAMPLES.md` when present.
-
-## Run
-
-```bash
-python3 agentic/cpp/skills/benchmarks-affected/analyze_benchmarks_affected.py \
-  --baseline main \
-  --compile-commands build/benchmarks-all_local/compile_commands.json \
-  --format json
-```
-
-If `--compile-commands` is omitted, the script auto-selects the most recently modified `build/**/compile_commands.json`.
-Working tree changes are included by default. Use `--no-include-working-tree` to restrict analysis to `<baseline>...HEAD` only.
-
-## Output
-
-The analyzer reports:
-
-- `affected_targets`: impacted CMake targets inferred from compile dependency analysis.
-- `affected_benchmark_targets`: subset of benchmark binaries impacted.
-- `affected_benchmarks`: precise benchmark function names from AST-level call analysis.
-- `suggested_filter_regex`: regex to pass as `--benchmark_filter`.
-
-## How to Use Findings
-
-1. Build only impacted benchmark binaries where feasible.
-2. Run benchmark binaries with the suggested filter:
-
-```bash
-FILTER='^(BM_Foo|BM_Bar)(/|$)'
-BENCH_CPU=${BENCH_CPU:-0}
-taskset -c "${BENCH_CPU}" build/benchmarks-all_local/benchmarks --benchmark_filter="${FILTER}"
-```
-
-3. If impact mapping is broad/uncertain, run full binary for selected benchmark target(s).
-
-## Guardrails
-
-1. Keep baseline comparison at merge-base style diff: `<baseline>...HEAD`.
-2. Use Release binaries for timing runs.
-3. If AST parse fails for a TU, still trust compile target impact and mark benchmark-function scope as partial.
-4. If benchmark infra (`CMakeLists.txt`, benchmark source layout) changed, fall back to conservative benchmark selection.
diff --git a/agentic/cpp/skills/benchmarks-affected/analyze_benchmarks_affected.py b/agentic/cpp/skills/benchmarks-affected/analyze_benchmarks_affected.py
deleted file mode 100755
index 4dcae85..0000000
--- a/agentic/cpp/skills/benchmarks-affected/analyze_benchmarks_affected.py
+++ /dev/null
@@ -1,1132 +0,0 @@
-#!/usr/bin/env python3
-
-from __future__ import annotations
-
-import argparse
-import concurrent.futures
-import json
-import os
-import re
-import shlex
-import shutil
-import subprocess
-import sys
-from collections import defaultdict
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any
-
-
-def is_project_source(source: Path, repo_root: Path) -> bool:
-    """Exclude third-party deps and generated build files."""
-    try:
-        rel = source.relative_to(repo_root)
-    except ValueError:
-        return False
-    rel_text = rel.as_posix()
-    if rel_text.startswith("build/") or "_deps/" in rel_text:
-        return False
-    return True
-
-
-KNOWN_BENCHMARK_TARGETS = {"benchmarks"}
-
-HEADER_EXTENSIONS = {
-    ".h",
-    ".hh",
-    ".hpp",
-    ".hxx",
-    ".inc",
-    ".ipp",
-    ".tcc",
-}
-
-BUILD_INFRA_FILES = {
-    "CMakeLists.txt",
-    "CMakePresets.json",
-}
-
-DIFF_HUNK_RE = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
-
-CPP_FUNCTION_START_RE = re.compile(
-    r"^\s*"
-    r"(?:template\s*<[^>]*>\s*)?"
-    r"(?:(?:inline|constexpr|consteval|constinit|static|friend|virtual|explicit)\s+)*"
-    r"[A-Za-z_~][\w:<>,\s\*&\[\]]*\s+"
-    r"([~A-Za-z_][A-Za-z0-9_]*)\s*"
-    r"\([^;{}]*\)\s*"
-    r"(?:const\s*)?"
-    r"(?:noexcept\s*)?"
-    r"(?:->\s*[^\{]+)?\{"
-)
-
-
-@dataclass
-class CompileCommandEntry:
-    directory: Path
-    source: Path
-    arguments: list[str]
-    output: Path | None
-    target: str | None
-    dependencies: set[Path] = field(default_factory=set)
-    dep_error: str | None = None
-
-
-@dataclass
-class AstImpactResult:
-    benchmark_names: set[str] = field(default_factory=set)
-    affected_names: set[str] = field(default_factory=set)
-    ast_error: str | None = None
-
-
-def run_command(
-    args: list[str],
-    cwd: Path,
-    check: bool = True,
-    timeout: float | None = 60.0,
-) -> subprocess.CompletedProcess[str]:
-    return subprocess.run(
-        args,
-        cwd=str(cwd),
-        text=True,
-        capture_output=True,
-        check=check,
-        timeout=timeout,
-    )
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description=(
-            "Analyze benchmark impact between baseline and HEAD via "
-            "compile_commands dependency mapping and clang AST analysis."
-        )
-    )
-    parser.add_argument(
-        "--baseline",
-        default="main",
-        help="Baseline ref used as <baseline>...HEAD (default: main).",
-    )
-    parser.add_argument(
-        "--head",
-        default="HEAD",
-        help="Contender ref (default: HEAD).",
-    )
-    parser.add_argument(
-        "--compile-commands",
-        default=None,
-        help=(
-            "Path to compile_commands.json. If omitted, auto-discovers most "
-            "recent build/**/compile_commands.json."
-        ),
-    )
-    parser.add_argument(
-        "--clangxx",
-        default=None,
-        help="clang++ executable for AST dump (auto-detected if omitted).",
-    )
-    parser.add_argument(
-        "--format",
-        choices=["text", "json"],
-        default="text",
-        help="Output format (default: text).",
-    )
-    parser.add_argument(
-        "--include-working-tree",
-        dest="include_working_tree",
-        action="store_true",
-        default=True,
-        help=(
-            "Include local unstaged/staged changes in changed-files set, "
-            "in addition to <baseline>...<head> (default: enabled)."
-        ),
-    )
-    parser.add_argument(
-        "--no-include-working-tree",
-        dest="include_working_tree",
-        action="store_false",
-        help="Disable working-tree inclusion and only analyze <baseline>...<head>.",
-    )
-    return parser.parse_args()
-
-
-def git_repo_root() -> Path:
-    proc = run_command(["git", "rev-parse", "--show-toplevel"], cwd=Path.cwd())
-    return Path(proc.stdout.strip()).resolve()
-
-
-def resolve_compile_commands(repo_root: Path, explicit_path: str | None) -> Path:
-    if explicit_path:
-        compile_path = Path(explicit_path)
-        if not compile_path.is_absolute():
-            compile_path = (repo_root / compile_path).resolve()
-        if not compile_path.exists():
-            raise FileNotFoundError(f"compile_commands.json not found: {compile_path}")
-        return compile_path
-
-    candidates = sorted(
-        repo_root.glob("build/**/compile_commands.json"),
-        key=lambda path: path.stat().st_mtime,
-        reverse=True,
-    )
-    if not candidates:
-        raise FileNotFoundError(
-            "No compile_commands.json found under build/**. "
-            "Configure with -DCMAKE_EXPORT_COMPILE_COMMANDS=ON first."
-        )
-    return candidates[0].resolve()
-
-
-def load_compile_commands(
-    compile_commands_path: Path,
-    repo_root: Path,
-) -> list[CompileCommandEntry]:
-    entries: list[CompileCommandEntry] = []
-    data = json.loads(compile_commands_path.read_text(encoding="utf-8"))
-    for raw_entry in data:
-        directory = Path(raw_entry["directory"]).resolve()
-
-        raw_source = Path(raw_entry["file"])
-        if raw_source.is_absolute():
-            source = raw_source.resolve()
-        else:
-            source = (directory / raw_source).resolve()
-
-        if not is_project_source(source, repo_root):
-            continue
-
-        if "arguments" in raw_entry:
-            arguments = [str(arg) for arg in raw_entry["arguments"]]
-        else:
-            arguments = shlex.split(raw_entry["command"])
-
-        output = infer_output_path(arguments, directory)
-        target = infer_cmake_target_from_output(output)
-
-        entries.append(
-            CompileCommandEntry(
-                directory=directory,
-                source=source,
-                arguments=arguments,
-                output=output,
-                target=target,
-            )
-        )
-    return entries
-
-
-def infer_output_path(arguments: list[str], directory: Path) -> Path | None:
-    output_token: str | None = None
-    for idx, arg in enumerate(arguments):
-        if arg == "-o" and idx + 1 < len(arguments):
-            output_token = arguments[idx + 1]
-        elif arg.startswith("-o") and len(arg) > 2:
-            output_token = arg[2:]
-        elif arg.startswith("/Fo") and len(arg) > 3:
-            output_token = arg[3:]
-
-    if output_token is None:
-        return None
-
-    out_path = Path(output_token)
-    if out_path.is_absolute():
-        return out_path.resolve()
-    return (directory / out_path).resolve()
-
-
-def infer_cmake_target_from_output(output: Path | None) -> str | None:
-    if output is None:
-        return None
-    parts = output.parts
-    for index, part in enumerate(parts):
-        if part == "CMakeFiles" and index + 1 < len(parts):
-            target_part = parts[index + 1]
-            if target_part.endswith(".dir"):
-                return target_part[: -len(".dir")]
-            return target_part
-    return None
-
-
-def git_changed_files(repo_root: Path, baseline: str, head: str) -> set[Path]:
-    diff_range = f"{baseline}...{head}"
-    proc = run_command(["git", "diff", "--name-only", diff_range], cwd=repo_root)
-    changed_files: set[Path] = set()
-    for line in proc.stdout.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        changed_files.add((repo_root / line).resolve())
-    return changed_files
-
-
-def git_working_tree_changed_files(repo_root: Path) -> set[Path]:
-    changed_files: set[Path] = set()
-    commands = [
-        ["git", "diff", "--name-only"],
-        ["git", "diff", "--name-only", "--cached"],
-    ]
-    for cmd in commands:
-        proc = run_command(cmd, cwd=repo_root)
-        for line in proc.stdout.splitlines():
-            line = line.strip()
-            if not line:
-                continue
-            changed_files.add((repo_root / line).resolve())
-    return changed_files
-
-
-def parse_changed_lines_from_diff_text(
-    diff_text: str,
-    repo_root: Path,
-) -> dict[Path, set[int]]:
-    changed_lines: dict[Path, set[int]] = defaultdict(set)
-
-    current_file: Path | None = None
-    in_hunk = False
-    new_line = 0
-
-    for raw_line in diff_text.splitlines():
-        if raw_line.startswith("+++ "):
-            file_token = raw_line[4:].strip()
-            if file_token == "/dev/null":
-                current_file = None
-                in_hunk = False
-                continue
-            if file_token.startswith("b/"):
-                file_token = file_token[2:]
-            current_file = (repo_root / file_token).resolve()
-            in_hunk = False
-            continue
-
-        hunk_match = DIFF_HUNK_RE.match(raw_line)
-        if hunk_match:
-            in_hunk = current_file is not None
-            new_line = int(hunk_match.group(1))
-            continue
-
-        if not in_hunk or current_file is None:
-            continue
-
-        if raw_line.startswith("+") and not raw_line.startswith("+++"):
-            changed_lines[current_file].add(new_line)
-            new_line += 1
-            continue
-
-        if raw_line.startswith("-") and not raw_line.startswith("---"):
-            continue
-
-        if raw_line.startswith(" "):
-            new_line += 1
-            continue
-
-    return changed_lines
-
-
-def git_changed_line_map(
-    repo_root: Path,
-    baseline: str,
-    head: str,
-    include_working_tree: bool,
-) -> dict[Path, set[int]]:
-    changed_lines: dict[Path, set[int]] = defaultdict(set)
-
-    proc = run_command(
-        ["git", "diff", "--unified=0", f"{baseline}...{head}"],
-        cwd=repo_root,
-    )
-    baseline_map = parse_changed_lines_from_diff_text(proc.stdout, repo_root)
-    for path, lines in baseline_map.items():
-        changed_lines[path].update(lines)
-
-    if include_working_tree:
-        for cmd in (
-            ["git", "diff", "--unified=0"],
-            ["git", "diff", "--cached", "--unified=0"],
-        ):
-            wt_proc = run_command(cmd, cwd=repo_root)
-            wt_map = parse_changed_lines_from_diff_text(wt_proc.stdout, repo_root)
-            for path, lines in wt_map.items():
-                changed_lines[path].update(lines)
-
-    return changed_lines
-
-
-def extract_changed_symbol_names_from_file(
-    file_path: Path,
-    changed_lines: set[int],
-) -> set[str]:
-    if not changed_lines or not file_path.exists():
-        return set()
-
-    lines = file_path.read_text(encoding="utf-8", errors="replace").splitlines()
-    symbols: set[str] = set()
-
-    line_index = 1
-    max_line = len(lines)
-    while line_index <= max_line:
-        line = lines[line_index - 1]
-        match = CPP_FUNCTION_START_RE.match(line)
-        if not match:
-            line_index += 1
-            continue
-
-        symbol_name = match.group(1)
-        start_line = line_index
-        brace_depth = line.count("{") - line.count("}")
-        end_line = start_line
-
-        while brace_depth > 0 and end_line < max_line:
-            end_line += 1
-            body_line = lines[end_line - 1]
-            brace_depth += body_line.count("{") - body_line.count("}")
-
-        if any(start_line <= line_no <= end_line for line_no in changed_lines):
-            symbols.add(symbol_name)
-
-        line_index = end_line + 1
-
-    return symbols
-
-
-def collect_changed_symbol_names(
-    changed_line_map: dict[Path, set[int]],
-) -> set[str]:
-    symbol_names: set[str] = set()
-    for file_path, changed_lines in changed_line_map.items():
-        symbol_names.update(
-            extract_changed_symbol_names_from_file(file_path, changed_lines)
-        )
-    return symbol_names
-
-
-def clean_command_for_dependency_scan(arguments: list[str]) -> list[str]:
-    cleaned: list[str] = []
-    skip_next = False
-    flags_with_value = {
-        "-o",
-        "-MF",
-        "-MT",
-        "-MQ",
-        "-MJ",
-        "-Xclang",
-    }
-    standalone_drop = {
-        "-c",
-        "-MD",
-        "-MMD",
-        "-MP",
-        "-MM",
-        "-M",
-        "-E",
-        "-S",
-    }
-
-    index = 0
-    while index < len(arguments):
-        arg = arguments[index]
-        if skip_next:
-            skip_next = False
-            index += 1
-            continue
-
-        if arg in flags_with_value:
-            skip_next = True
-            index += 1
-            continue
-        if arg in standalone_drop:
-            index += 1
-            continue
-        if arg.startswith("-o") and len(arg) > 2:
-            index += 1
-            continue
-        if arg.startswith("-MF") and len(arg) > 3:
-            index += 1
-            continue
-        if arg.startswith("-MT") and len(arg) > 3:
-            index += 1
-            continue
-        if arg.startswith("-MQ") and len(arg) > 3:
-            index += 1
-            continue
-        if arg.startswith("-MJ") and len(arg) > 3:
-            index += 1
-            continue
-
-        cleaned.append(arg)
-        index += 1
-
-    return cleaned
-
-
-def parse_makefile_dependencies(stdout_text: str) -> list[str]:
-    flattened = stdout_text.replace("\\\n", " ").replace("\n", " ")
-    if ":" not in flattened:
-        return []
-    dep_payload = flattened.split(":", 1)[1].strip()
-    if not dep_payload:
-        return []
-    return shlex.split(dep_payload)
-
-
-def compute_tu_dependencies(entry: CompileCommandEntry) -> None:
-    dep_cmd = clean_command_for_dependency_scan(entry.arguments)
-    if not dep_cmd:
-        entry.dep_error = "Empty compile command after sanitization"
-        entry.dependencies = {entry.source}
-        return
-
-    dep_cmd.extend(["-MM", "-MF", "-", "-MT", "__benchmark_affected_tu__"])
-    source_arg = str(entry.source)
-    if source_arg not in dep_cmd:
-        dep_cmd.append(source_arg)
-
-    try:
-        proc = run_command(dep_cmd, cwd=entry.directory, check=False)
-    except FileNotFoundError as exc:
-        entry.dep_error = str(exc)
-        entry.dependencies = {entry.source}
-        return
-
-    dependencies: set[Path] = {entry.source}
-    if proc.returncode != 0:
-        stderr = proc.stderr.strip()
-        entry.dep_error = (
-            stderr if stderr else f"Dependency scan failed ({proc.returncode})"
-        )
-        entry.dependencies = dependencies
-        return
-
-    for dep in parse_makefile_dependencies(proc.stdout):
-        dep_path = Path(dep)
-        resolved = (
-            dep_path.resolve()
-            if dep_path.is_absolute()
-            else (entry.directory / dep_path).resolve()
-        )
-        dependencies.add(resolved)
-
-    entry.dependencies = dependencies
-
-
-def is_build_infra_change(repo_root: Path, changed: set[Path]) -> bool:
-    for path in changed:
-        if path.name in BUILD_INFRA_FILES:
-            return True
-        try:
-            rel = path.relative_to(repo_root)
-        except ValueError:
-            continue
-        rel_text = rel.as_posix()
-        if rel_text.startswith("cmake/"):
-            return True
-    return False
-
-
-def identify_benchmark_targets(
-    entries: list[CompileCommandEntry], repo_root: Path
-) -> set[str]:
-    benchmark_targets: set[str] = set()
-    targets_present = {entry.target for entry in entries if entry.target}
-    for entry in entries:
-        if entry.target is None:
-            continue
-        try:
-            rel = entry.source.relative_to(repo_root)
-            rel_text = rel.as_posix()
-        except ValueError:
-            rel_text = entry.source.as_posix()
-
-        if rel_text.startswith("src/benchmarks/"):
-            benchmark_targets.add(entry.target)
-
-    benchmark_targets.update(targets_present.intersection(KNOWN_BENCHMARK_TARGETS))
-    return benchmark_targets
-
-
-def is_benchmark_source(source: Path, repo_root: Path) -> bool:
-    try:
-        rel_text = source.relative_to(repo_root).as_posix()
-    except ValueError:
-        return False
-    return rel_text.startswith("src/benchmarks/")
-
-
-def dedupe_entries_by_target_source(
-    entries: list[CompileCommandEntry],
-) -> list[CompileCommandEntry]:
-    deduped: list[CompileCommandEntry] = []
-    seen: set[tuple[str | None, Path]] = set()
-    for entry in entries:
-        key = (entry.target, entry.source)
-        if key in seen:
-            continue
-        seen.add(key)
-        deduped.append(entry)
-    return deduped
-
-
-def discover_clangxx(explicit: str | None) -> str:
-    if explicit:
-        return explicit
-
-    candidates = [
-        "clang++",
-        "clang++-19",
-        "clang++-18",
-        "clang++-17",
-        "clang++-16",
-    ]
-    for candidate in candidates:
-        resolved = shutil.which(candidate)
-        if resolved:
-            return resolved
-    raise FileNotFoundError(
-        "clang++ was not found on PATH. Provide --clangxx to select a clang compiler."
-    )
-
-
-def clean_command_for_ast(arguments: list[str], clangxx: str) -> list[str]:
-    cleaned = clean_command_for_dependency_scan(arguments)
-    if not cleaned:
-        return []
-    cleaned[0] = clangxx
-    cleaned.extend(["-Xclang", "-ast-dump=json", "-fsyntax-only"])
-    return cleaned
-
-
-def normalize_path_candidate(path_text: str | None, working_dir: Path) -> Path | None:
-    if not path_text:
-        return None
-    path = Path(path_text)
-    if path.is_absolute():
-        return path.resolve()
-    return (working_dir / path).resolve()
-
-
-def file_from_loc(loc: dict[str, Any] | None, working_dir: Path) -> Path | None:
-    if not isinstance(loc, dict):
-        return None
-    if "file" in loc:
-        return normalize_path_candidate(str(loc["file"]), working_dir)
-    for nested_key in ("spellingLoc", "expansionLoc", "includedFrom"):
-        nested_loc = loc.get(nested_key)
-        if isinstance(nested_loc, dict):
-            resolved = file_from_loc(nested_loc, working_dir)
-            if resolved is not None:
-                return resolved
-    return None
-
-
-def iter_ast_nodes(node: Any):
-    if isinstance(node, dict):
-        yield node
-        inner = node.get("inner", [])
-        if isinstance(inner, list):
-            for child in inner:
-                yield from iter_ast_nodes(child)
-    elif isinstance(node, list):
-        for item in node:
-            yield from iter_ast_nodes(item)
-
-
-def referenced_decl_file(node: dict[str, Any], working_dir: Path) -> Path | None:
-    referenced = node.get("referencedDecl")
-    if not isinstance(referenced, dict):
-        return None
-    return file_from_loc(referenced.get("loc"), working_dir)
-
-
-def node_references_changed_symbol(
-    node: dict[str, Any],
-    changed_symbol_names: set[str],
-) -> bool:
-    if not changed_symbol_names:
-        return False
-
-    for subnode in iter_ast_nodes(node):
-        if not isinstance(subnode, dict):
-            continue
-
-        kind = subnode.get("kind")
-        if kind == "MemberExpr":
-            member_name = subnode.get("name")
-            if isinstance(member_name, str) and member_name in changed_symbol_names:
-                return True
-
-        if kind == "DeclRefExpr":
-            ref_decl = subnode.get("referencedDecl")
-            if not isinstance(ref_decl, dict):
-                continue
-            ref_name = ref_decl.get("name")
-            if isinstance(ref_name, str) and ref_name in changed_symbol_names:
-                return True
-
-    return False
-
-
-def call_expr_callee_name(call_expr: dict[str, Any]) -> str | None:
-    for node in iter_ast_nodes(call_expr):
-        if not isinstance(node, dict):
-            continue
-        if node.get("kind") != "DeclRefExpr":
-            continue
-        referenced = node.get("referencedDecl")
-        if isinstance(referenced, dict) and isinstance(referenced.get("name"), str):
-            return referenced["name"]
-    return None
-
-
-def string_literals_in_node(node: dict[str, Any]) -> list[str]:
-    values: list[str] = []
-    for cur in iter_ast_nodes(node):
-        if not isinstance(cur, dict):
-            continue
-        if cur.get("kind") != "StringLiteral":
-            continue
-        value = cur.get("value")
-        if isinstance(value, str):
-            if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
-                value = value[1:-1]
-            values.append(value)
-    return values
-
-
-def benchmark_names_from_source(source: Path) -> set[str]:
-    names: set[str] = set()
-    if not source.exists():
-        return names
-    text = source.read_text(encoding="utf-8", errors="replace")
-    for match in re.finditer(r"BENCHMARK\(\s*([A-Za-z_][A-Za-z0-9_]*)\s*\)", text):
-        names.add(match.group(1))
-    for match in re.finditer(r"register_op\(\s*\"([^\"]+)\"", text):
-        names.add(match.group(1))
-    return names
-
-
-def ast_analyze_entry(
-    entry: CompileCommandEntry,
-    changed_files: set[Path],
-    changed_symbol_names: set[str],
-    clangxx: str,
-) -> AstImpactResult:
-    result = AstImpactResult()
-
-    ast_cmd = clean_command_for_ast(entry.arguments, clangxx)
-    if not ast_cmd:
-        result.ast_error = "Failed to build AST command"
-        return result
-
-    try:
-        proc = run_command(ast_cmd, cwd=entry.directory, check=False)
-    except FileNotFoundError as exc:
-        result.ast_error = str(exc)
-        return result
-
-    if proc.returncode != 0:
-        stderr = proc.stderr.strip()
-        result.ast_error = (
-            stderr if stderr else f"AST command failed ({proc.returncode})"
-        )
-        return result
-
-    try:
-        ast_root = json.loads(proc.stdout)
-    except json.JSONDecodeError as exc:
-        result.ast_error = f"Invalid AST JSON: {exc}"
-        return result
-
-    function_callees: dict[str, set[str]] = defaultdict(set)
-    direct_impacted_functions: set[str] = set()
-    dynamic_benchmarks_by_function: dict[str, set[str]] = defaultdict(set)
-
-    for node in iter_ast_nodes(ast_root):
-        if not isinstance(node, dict):
-            continue
-
-        if node.get("kind") not in {"FunctionDecl", "CXXMethodDecl"}:
-            continue
-
-        function_name = node.get("name")
-        if not isinstance(function_name, str) or not function_name:
-            continue
-
-        if function_name.startswith("BM_"):
-            result.benchmark_names.add(function_name)
-
-        function_callees.setdefault(function_name, set())
-
-        function_loc = file_from_loc(node.get("loc"), entry.directory)
-        is_directly_impacted = function_loc in changed_files
-        if not is_directly_impacted:
-            is_directly_impacted = node_references_changed_symbol(
-                node, changed_symbol_names
-            )
-
-        for subnode in iter_ast_nodes(node):
-            if not isinstance(subnode, dict):
-                continue
-
-            sub_kind = subnode.get("kind")
-            if sub_kind in {"CallExpr", "CXXMemberCallExpr", "CXXOperatorCallExpr"}:
-                callee = call_expr_callee_name(subnode)
-                if callee:
-                    function_callees[function_name].add(callee)
-
-                if callee == "register_op":
-                    literal_values = string_literals_in_node(subnode)
-                    if literal_values:
-                        dynamic_benchmarks_by_function[function_name].add(
-                            literal_values[0]
-                        )
-
-            if not is_directly_impacted:
-                ref_file = referenced_decl_file(subnode, entry.directory)
-                if ref_file in changed_files:
-                    is_directly_impacted = True
-
-        if is_directly_impacted:
-            direct_impacted_functions.add(function_name)
-
-    # Reverse call-graph propagation: if a function is directly impacted,
-    # every caller in this TU is impacted as well (fixed-point DFS/BFS).
-    callers_of: dict[str, set[str]] = defaultdict(set)
-    for caller, callees in function_callees.items():
-        for callee in callees:
-            callers_of[callee].add(caller)
-
-    impacted_functions = set(direct_impacted_functions)
-    stack = list(direct_impacted_functions)
-    while stack:
-        callee_name = stack.pop()
-        for caller_name in callers_of.get(callee_name, set()):
-            if caller_name in impacted_functions:
-                continue
-            impacted_functions.add(caller_name)
-            stack.append(caller_name)
-
-    for function_name in impacted_functions:
-        if function_name.startswith("BM_"):
-            result.affected_names.add(function_name)
-
-    for function_name, names in dynamic_benchmarks_by_function.items():
-        result.benchmark_names.update(names)
-        if function_name in impacted_functions:
-            result.affected_names.update(names)
-
-    return result
-
-
-def regex_for_benchmarks(names: set[str]) -> str | None:
-    if not names:
-        return None
-    ordered = sorted(names)
-    body = "|".join(re.escape(name) for name in ordered)
-    return rf"^({body})(/|$)"
-
-
-def relpath_or_abs(path: Path, root: Path) -> str:
-    try:
-        return path.relative_to(root).as_posix()
-    except ValueError:
-        return path.as_posix()
-
-
-def main() -> int:
-    cli = parse_args()
-
-    try:
-        repo_root = git_repo_root()
-        changed_files = git_changed_files(repo_root, cli.baseline, cli.head)
-        if cli.include_working_tree:
-            changed_files.update(git_working_tree_changed_files(repo_root))
-        changed_line_map = git_changed_line_map(
-            repo_root,
-            cli.baseline,
-            cli.head,
-            cli.include_working_tree,
-        )
-        changed_symbol_names = collect_changed_symbol_names(changed_line_map)
-        compile_commands_path = resolve_compile_commands(
-            repo_root, cli.compile_commands
-        )
-        entries = load_compile_commands(compile_commands_path, repo_root)
-    except FileNotFoundError as exc:
-        print(f"error: {exc}", file=sys.stderr)
-        return 2
-    except subprocess.CalledProcessError as exc:
-        stderr = (exc.stderr or "").strip()
-        if stderr:
-            print(f"error: {stderr}", file=sys.stderr)
-        else:
-            print(f"error: command failed: {' '.join(exc.cmd)}", file=sys.stderr)
-        return 2
-
-    target_to_entries: dict[str, list[CompileCommandEntry]] = defaultdict(list)
-    source_to_entries: dict[Path, list[CompileCommandEntry]] = defaultdict(list)
-    for entry in entries:
-        source_to_entries[entry.source].append(entry)
-        if entry.target:
-            target_to_entries[entry.target].append(entry)
-
-    benchmark_targets = identify_benchmark_targets(entries, repo_root)
-    all_targets = {entry.target for entry in entries if entry.target}
-    benchmark_entries = dedupe_entries_by_target_source(
-        [entry for entry in entries if entry.target in benchmark_targets]
-    )
-
-    infra_change = is_build_infra_change(repo_root, changed_files)
-    relevant_changed_files = {
-        path
-        for path in changed_files
-        if is_project_source(path, repo_root)
-        or path.name in BUILD_INFRA_FILES
-        or relpath_or_abs(path, repo_root).startswith("cmake/")
-    }
-    has_header_changes = any(
-        path.suffix.lower() in HEADER_EXTENSIONS for path in relevant_changed_files
-    )
-    benchmark_source_extensions = {".c", ".cc", ".cpp", ".cxx"}
-    only_benchmark_source_changes = bool(relevant_changed_files) and all(
-        is_benchmark_source(path, repo_root)
-        and path.suffix.lower() in benchmark_source_extensions
-        for path in relevant_changed_files
-    )
-
-    directly_affected_targets: set[str] = set()
-    for changed_path in changed_files:
-        for entry in source_to_entries.get(changed_path, []):
-            if entry.target:
-                directly_affected_targets.add(entry.target)
-
-    dependency_scan_entries: list[CompileCommandEntry] = []
-    if not infra_change and not only_benchmark_source_changes:
-        if has_header_changes:
-            dependency_scan_entries = dedupe_entries_by_target_source(entries)
-        else:
-            dependency_scan_entries = benchmark_entries
-
-    if dependency_scan_entries:
-        with concurrent.futures.ThreadPoolExecutor(
-            max_workers=min(8, (os.cpu_count() or 4))
-        ) as pool:
-            list(pool.map(compute_tu_dependencies, dependency_scan_entries))
-
-    affected_targets: set[str] = set(directly_affected_targets)
-    for entry in dependency_scan_entries:
-        has_changed_dependency = any(dep in changed_files for dep in entry.dependencies)
-        if has_changed_dependency and entry.target:
-            affected_targets.add(entry.target)
-
-    if infra_change:
-        affected_targets.update(all_targets)
-
-    dependency_impacted_benchmark_targets = affected_targets.intersection(
-        benchmark_targets
-    )
-    impacted_benchmark_entries = [
-        entry
-        for entry in benchmark_entries
-        if entry.target in dependency_impacted_benchmark_targets
-    ]
-
-    ast_errors: dict[str, str] = {}
-    benchmark_target_to_names: dict[str, set[str]] = defaultdict(set)
-    benchmark_target_to_affected: dict[str, set[str]] = defaultdict(set)
-    warnings: list[str] = []
-    ast_fallback_used = False
-    ast_entries_scanned = 0
-
-    if impacted_benchmark_entries:
-        try:
-            clangxx = discover_clangxx(cli.clangxx)
-        except FileNotFoundError as exc:
-            clangxx = ""
-            warnings.append(str(exc))
-
-        if not clangxx:
-            ast_fallback_used = True
-            for entry in impacted_benchmark_entries:
-                target_name = entry.target or "<unknown-target>"
-                fallback_names = benchmark_names_from_source(entry.source)
-                benchmark_target_to_names[target_name].update(fallback_names)
-                benchmark_target_to_affected[target_name].update(fallback_names)
-        else:
-            max_ast_workers = min(2, (os.cpu_count() or 2))
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=max_ast_workers
-            ) as pool:
-                futures = {
-                    pool.submit(
-                        ast_analyze_entry,
-                        entry,
-                        changed_files,
-                        changed_symbol_names,
-                        clangxx,
-                    ): entry
-                    for entry in impacted_benchmark_entries
-                }
-                ast_entries_scanned = len(futures)
-                for future in concurrent.futures.as_completed(futures):
-                    entry = futures[future]
-                    target_name = entry.target or "<unknown-target>"
-                    source_path = entry.source
-                    source_is_changed = source_path in changed_files
-
-                    try:
-                        ast_result = future.result(timeout=120)
-                    except Exception as exc:
-                        ast_result = AstImpactResult(
-                            ast_error=f"AST worker failed: {exc}"
-                        )
-
-                    if ast_result.ast_error:
-                        ast_errors[relpath_or_abs(source_path, repo_root)] = (
-                            ast_result.ast_error
-                        )
-
-                    benchmark_names = ast_result.benchmark_names
-                    if not benchmark_names:
-                        benchmark_names = benchmark_names_from_source(source_path)
-                    benchmark_target_to_names[target_name].update(benchmark_names)
-
-                    if ast_result.affected_names:
-                        benchmark_target_to_affected[target_name].update(
-                            ast_result.affected_names
-                        )
-                    elif source_is_changed or ast_result.ast_error:
-                        benchmark_target_to_affected[target_name].update(
-                            benchmark_names
-                        )
-                        if benchmark_names:
-                            ast_fallback_used = True
-
-    if infra_change and benchmark_targets:
-        for target_name in sorted(benchmark_targets):
-            for entry in target_to_entries.get(target_name, []):
-                names = benchmark_names_from_source(entry.source)
-                benchmark_target_to_names[target_name].update(names)
-                benchmark_target_to_affected[target_name].update(names)
-
-    if infra_change:
-        affected_benchmark_targets = sorted(benchmark_targets)
-    else:
-        affected_benchmark_targets = sorted(
-            target for target, names in benchmark_target_to_affected.items() if names
-        )
-
-    all_affected_benchmarks: set[str] = set()
-    for names in benchmark_target_to_affected.values():
-        all_affected_benchmarks.update(names)
-
-    dep_scan_failures = {
-        relpath_or_abs(entry.source, repo_root): entry.dep_error
-        for entry in dependency_scan_entries
-        if entry.dep_error
-    }
-
-    scope_mode = "normal"
-    if infra_change:
-        scope_mode = "infra_fallback"
-    elif ast_fallback_used:
-        scope_mode = "ast_fallback"
-
-    report: dict[str, Any] = {
-        "baseline": cli.baseline,
-        "head": cli.head,
-        "include_working_tree": cli.include_working_tree,
-        "changed_symbols": sorted(changed_symbol_names),
-        "compile_commands": relpath_or_abs(compile_commands_path, repo_root),
-        "changed_files": sorted(
-            relpath_or_abs(path, repo_root) for path in changed_files
-        ),
-        "affected_targets": sorted(affected_targets),
-        "affected_benchmark_targets": affected_benchmark_targets,
-        "affected_benchmarks": {
-            target: sorted(names)
-            for target, names in sorted(benchmark_target_to_affected.items())
-            if names
-        },
-        "suggested_filter_regex": regex_for_benchmarks(all_affected_benchmarks),
-        "dependency_entries_scanned": len(dependency_scan_entries),
-        "benchmark_entries_scanned": len(benchmark_entries),
-        "ast_entries_scanned": ast_entries_scanned,
-        "scope_mode": scope_mode,
-        "dependency_scan_failures": dep_scan_failures,
-        "ast_failures": ast_errors,
-        "warnings": warnings,
-    }
-
-    if cli.format == "json":
-        json.dump(report, sys.stdout, indent=2)
-        sys.stdout.write("\n")
-        return 0
-
-    print(f"Baseline: {cli.baseline}")
-    print(f"Head: {cli.head}")
-    print(f"Compile commands: {report['compile_commands']}")
-    print(f"Scope mode: {report['scope_mode']}")
-    print(
-        "Scan counts: "
-        f"dependency={report['dependency_entries_scanned']}, "
-        f"benchmark={report['benchmark_entries_scanned']}, "
-        f"ast={report['ast_entries_scanned']}"
-    )
-    print("")
-
-    print(f"Changed files ({len(report['changed_files'])}):")
-    for item in report["changed_files"]:
-        print(f"- {item}")
-    if not report["changed_files"]:
-        print("- none")
-
-    print("")
-    print(f"Affected targets ({len(report['affected_targets'])}):")
-    for item in report["affected_targets"]:
-        print(f"- {item}")
-    if not report["affected_targets"]:
-        print("- none")
-
-    print("")
-    print(f"Affected benchmark targets ({len(report['affected_benchmark_targets'])}):")
-    for item in report["affected_benchmark_targets"]:
-        print(f"- {item}")
-    if not report["affected_benchmark_targets"]:
-        print("- none")
-
-    print("")
-    print("Affected benchmark functions:")
-    if report["affected_benchmarks"]:
-        for target, names in report["affected_benchmarks"].items():
-            print(f"- {target}:")
-            for name in names:
-                print(f"  - {name}")
-    else:
-        print("- none")
-
-    print("")
-    print("Suggested --benchmark_filter regex:")
-    print(report["suggested_filter_regex"] or "none")
-
-    if dep_scan_failures:
-        print("")
-        print("Dependency scan failures:")
-        for source, error in dep_scan_failures.items():
-            print(f"- {source}: {error}")
-
-    if ast_errors:
-        print("")
-        print("AST failures:")
-        for source, error in ast_errors.items():
-            print(f"- {source}: {error}")
-
-    if warnings:
-        print("")
-        print("Warnings:")
-        for warning in warnings:
-            print(f"- {warning}")
-
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/agentic/cpp/skills/benchmarks-compare-revisions/SKILL.md b/agentic/cpp/skills/benchmarks-compare-revisions/SKILL.md
deleted file mode 100644
index caa9542..0000000
--- a/agentic/cpp/skills/benchmarks-compare-revisions/SKILL.md
+++ /dev/null
@@ -1,225 +0,0 @@
----
-name: benchmarks-compare-revisions
-description: Compare benchmark performance between two git revisions via Google Benchmark compare.py, with optional hardware-counter comparison from diagnostic libpfm builds.
----
-
-# Benchmarks Compare Revisions Skill
-
-Use this skill to compare performance between two git revisions.
-
-This workflow now depends on:
-
-1. `../benchmarks-affected/SKILL.md` to determine affected benchmark targets/functions and produce a benchmark filter.
-2. `../benchmarks/SKILL.md` for build/run operational details.
-
-## Goal
-
-Build two separate benchmark binaries using short commit hashes as build suffixes, compare timing results with Google Benchmark compare.py, and optionally compare hardware counters across the same revisions.
-
-## Step 0 — Choose revisions, hashes, and options
-
-Pick a baseline and a contender revision. Use short commit hashes to suffix build directories so builds do not collide.
-
-Optional behavior flags:
-
-- `COLLECT_COUNTERS=1` to enable hardware-counter collection and analysis in addition to timing comparison.
-- `COLLECT_COUNTERS=0` to run timing-only comparison.
-
-Counter collection is Linux-only and requires:
-
-- diagnostic builds with `BENCHMARK_ENABLE_LIBPFM=ON`
-- perf permissions on the host (for access to performance counters)
-
-Example:
-```bash
-BASELINE=abc1234
-CONTENDER=def5678
-```
-
-## Step 1 — Compute affected benchmark scope first
-
-Run `benchmarks-affected` from the contender checkout to derive the compare scope.
-
-Do not duplicate `benchmarks-affected` internals here (compile database selection, AST analysis, or fallback heuristics). Follow that skill directly and consume only its outputs.
-
-Inputs to pass through:
-
-- `--baseline ${BASELINE}`
-- optional compile-commands path if auto-detection is not desired
-- optional output format (`json` recommended for parsing)
-
-Consume these outputs from `benchmarks-affected`:
-
-- `suggested_filter_regex` -> set `FILTER`
-- `affected_benchmark_targets` -> optionally constrain which benchmark binary/binaries to run
-- `affected_benchmarks` -> function-level scope for validation/reporting
-
-If `FILTER` is empty, fall back to full benchmark binary compare (conservative mode).
-
-## Step 2 — Build both revisions
-
-Use the existing benchmarks skill build steps, but set the build suffix to include the short hash for each revision.
-
-Always build Release timing binaries.
-
-If `COLLECT_COUNTERS=1`, also build diagnostic binaries (RelWithDebInfo + libpfm) for both revisions.
-
-```bash
-# Baseline
-BUILD_SUFFIX=bench_${BASELINE}
-git checkout ${BASELINE}
-# Follow ../benchmarks/SKILL.md timing build instructions with this suffix
-# If COLLECT_COUNTERS=1, also follow the diagnostic build instructions with this suffix
-
-# Contender
-BUILD_SUFFIX=bench_${CONTENDER}
-git checkout ${CONTENDER}
-# Follow ../benchmarks/SKILL.md timing build instructions with this suffix
-# If COLLECT_COUNTERS=1, also follow the diagnostic build instructions with this suffix
-```
-
-Expected build trees:
-
-- Timing: `build/benchmarks-all_bench_<short-hash>`
-- Counters (optional): `build/benchmarks-diagnostic_bench_<short-hash>`
-
-## Step 3 — Compare using compare.py
-
-Use Google Benchmark compare tooling with a JSON-first flow to avoid long-running binary-vs-binary retries.
-
-Locate compare.py from the Google Benchmark dependency (installed under the build tree):
-```bash
-COMPARE_PY=build/benchmarks-all_bench_${BASELINE}/_deps/googlebenchmark-src/tools/compare.py
-```
-
-Verify Python deps once (compare.py imports numpy/scipy):
-```bash
-python3 -c "import numpy, scipy"
-```
-
-Generate baseline/contender JSON sequentially with explicit file outputs:
-```bash
-BENCH_CPU=${BENCH_CPU:-0}
-BENCH_RUN="taskset -c ${BENCH_CPU}"
-BASE_JSON=/tmp/bench_${BASELINE}.json
-CONT_JSON=/tmp/bench_${CONTENDER}.json
-
-${BENCH_RUN} build/benchmarks-all_bench_${BASELINE}/benchmarks \
-  --benchmark_report_aggregates_only=true \
-  --benchmark_display_aggregates_only=true \
-  --benchmark_format=json \
-  --benchmark_out=${BASE_JSON} > /tmp/bench_${BASELINE}.log 2>&1
-
-${BENCH_RUN} build/benchmarks-all_bench_${CONTENDER}/benchmarks \
-  --benchmark_report_aggregates_only=true \
-  --benchmark_display_aggregates_only=true \
-  --benchmark_format=json \
-  --benchmark_out=${CONT_JSON} > /tmp/bench_${CONTENDER}.log 2>&1
-```
-
-Validate JSON before comparing:
-```bash
-python3 -m json.tool ${BASE_JSON} > /dev/null
-python3 -m json.tool ${CONT_JSON} > /dev/null
-```
-
-Run the comparison:
-```bash
-python3 ${COMPARE_PY} -a benchmarks ${BASE_JSON} ${CONT_JSON}
-```
-
-Use the affected filter from Step 1 when generating JSON files:
-```bash
-if [ -n "${FILTER}" ]; then
-  FILTER_ARG="--benchmark_filter=${FILTER}"
-else
-  FILTER_ARG=""
-fi
-
-${BENCH_RUN} build/benchmarks-all_bench_${BASELINE}/benchmarks ${FILTER_ARG} --benchmark_report_aggregates_only=true --benchmark_display_aggregates_only=true ...
-${BENCH_RUN} build/benchmarks-all_bench_${CONTENDER}/benchmarks ${FILTER_ARG} --benchmark_report_aggregates_only=true --benchmark_display_aggregates_only=true ...
-```
-
-## Step 3b — Compare hardware counters (optional, Linux only)
-
-Run this step only when `COLLECT_COUNTERS=1`.
-
-1. Preflight first with one tiny counter-enabled benchmark run from a diagnostic binary. If output includes warnings such as `Failed to get a file descriptor for performance counter`, mark counters unavailable and skip counter collection.
-2. Run baseline and contender diagnostic binaries sequentially with explicit JSON outputs and the same filter scope:
-
-```bash
-BASE_COUNTERS_JSON=/tmp/bench_counters_${BASELINE}.json
-CONT_COUNTERS_JSON=/tmp/bench_counters_${CONTENDER}.json
-
-${BENCH_RUN} build/benchmarks-diagnostic_bench_${BASELINE}/benchmarks \
-  ${FILTER_ARG} \
-  --benchmark_counters_tabular=true \
-  --benchmark_format=json \
-  --benchmark_out=${BASE_COUNTERS_JSON} > /tmp/bench_counters_${BASELINE}.log 2>&1
-
-${BENCH_RUN} build/benchmarks-diagnostic_bench_${CONTENDER}/benchmarks \
-  ${FILTER_ARG} \
-  --benchmark_counters_tabular=true \
-  --benchmark_format=json \
-  --benchmark_out=${CONT_COUNTERS_JSON} > /tmp/bench_counters_${CONTENDER}.log 2>&1
-```
-
-3. Validate JSON files before consuming:
-
-```bash
-python3 -m json.tool ${BASE_COUNTERS_JSON} > /dev/null
-python3 -m json.tool ${CONT_COUNTERS_JSON} > /dev/null
-```
-
-4. Collect and compare these counter families when present:
-
-- `instructions`, `cycles`
-- `cache-misses`, `cache-references`
-- `branch-misses`, `branches`
-- `L1-dcache-load-misses`
-
-5. Compute derived metrics when denominators are non-zero:
-
-- IPC = `instructions / cycles`
-- Cache miss rate = `cache-misses / cache-references`
-- Branch mispredict rate = `branch-misses / branches`
-
-6. Pair baseline and contender rows by benchmark name, compute deltas, and flag anomalies where timing direction conflicts with key counter direction.
-
-7. Emit a canonical summary table for downstream consumers:
-
-```markdown
-| Benchmark | IPC (base -> new) | Cache Miss Rate (base -> new) | Branch Mispredict (base -> new) | Anomaly? |
-|---|---:|---:|---:|---|
-```
-
-## Retry and Timeout Policy
-
-1. Run benchmarks sequentially; do not background with `nohup`/`&`.
-2. If a run times out, narrow filter and retry once.
-3. Maximum retries per benchmark group: 1.
-4. If still failing, emit blocked/partial findings instead of repeated attempts.
-
-Apply this policy to both timing and counter runs.
-
-## Step 4 — Record findings
-
-Capture and return:
-
-- compare.py output (terminal transcript or redirected file)
-- effective filter used
-- timing JSON artifacts for baseline and contender
-- `counters_available` (`true`/`false`)
-- if `counters_available=false`, a reason string (unsupported OS, missing libpfm, perf permission denied, preflight failure)
-- if counters are available: counter JSON artifacts, derived metrics table, and anomaly list
-
-## Best Practices / Guardrails
-
-1. **Release only**: never compare Debug binaries.
-2. **Short hash suffixes**: keep build dirs isolated per revision (example: `bench_<short-hash>`).
-3. **Same host, same conditions**: do not compare across different machines or power profiles.
-4. **Filter from analysis**: use `benchmarks-affected` output instead of hand-crafted filters whenever possible.
-5. **Pin process and frequency**: use `taskset -c ${BENCH_CPU:-0}` for all benchmark executions and follow benchmark skill guidance on CPU governor.
-6. **Counter collection is optional and Linux-only**: when unavailable, return timing-only outputs with `counters_available=false`.
-7. **Always preflight counters**: do not run full counter collection if preflight fails.
-8. **Keep build types separated**: timing uses `benchmarks-all_*` Release builds; counters use `benchmarks-diagnostic_*` RelWithDebInfo builds; never Debug.
diff --git a/agentic/cpp/skills/benchmarks/SKILL.md b/agentic/cpp/skills/benchmarks/SKILL.md
deleted file mode 100644
index 92af231..0000000
--- a/agentic/cpp/skills/benchmarks/SKILL.md
+++ /dev/null
@@ -1,209 +0,0 @@
----
-name: benchmarks
-description: Run Google Benchmark binaries, including filtering, hardware counters, and perf profiling.
----
-
-# Benchmarks Skill
-
-You now have expertise in running and interpreting Google Benchmark suites.
-Follow these workflows:
-
-## Build Directory Convention
-
-Use a short commit hash suffix for committed revisions:
-
-```bash
-BUILD_SUFFIX=$(git rev-parse --short HEAD)
-```
-
-If the worktree has uncommitted changes, append a descriptive suffix so results
-cannot be confused with a clean HEAD build:
-
-```bash
-BUILD_SUFFIX=$(git rev-parse --short HEAD)-dirty
-```
-
-If not a git repository, use
-
-```bash
-BUILD_SUFFIX=agent
-```
-
-## CRITICAL: Never Run Benchmarks from a Debug Build
-
-> **Always pass `--config Release` (or `--config RelWithDebInfo`) to `cmake --build`.**
-> Multi-config generators (MSVC, Xcode) default to `Debug` if no `--config` is given.
-> Google Benchmark will print `***WARNING*** Library was built as DEBUG` and timings will
-> be 3-10x slower and meaningless. Always verify the binary path contains `Release/` or
-> `RelWithDebInfo/`, never `Debug/`.
-
-## Step 1 — Build
-
-If benchmarks affected by the changes are easily tractable build only related targets.
-
-**Pure timing (benchmarks, Release):**
-```bash
-cmake -B build/benchmarks_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Release
-cmake --build build/benchmarks_${BUILD_SUFFIX} --config Release -j
-```
-
-**Hardware counters / verbose report (benchmarks-diagnostic, RelWithDebInfo, Linux only):**
-```bash
-cmake -B build/benchmarks-diagnostic_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=RelWithDebInfo -DBENCHMARK_ENABLE_LIBPFM=ON
-cmake --build build/benchmarks-diagnostic_${BUILD_SUFFIX} --config RelWithDebInfo -j
-```
-
-For repository-specific benchmark examples, check
-`agentic/local/cpp/skills/benchmarks/EXAMPLES.md` when present.
-
-## Step 2 — Run
-
-Prefer running benchmarks with filtering passing the benchmarks that should be affected.
-
-Unless the user explicitly asks otherwise, pin benchmark execution to one CPU with
-`taskset` to reduce scheduler noise. Use CPU 0 by default, or override with
-`BENCH_CPU=<id>` when a better isolated/performance core is known:
-
-```bash
-BENCH_CPU=${BENCH_CPU:-0}
-BENCH_RUN="taskset -c ${BENCH_CPU}"
-```
-
-If `taskset` is unavailable or fails on the host, report that benchmark results
-are unpinned and more noisy.
-
-Execution guardrails:
-- Run benchmark commands sequentially in CI.
-- Avoid background jobs (`nohup`, `&`) for benchmark collection.
-- Always write machine-readable results with `--benchmark_out` when data is later parsed.
-
-### Available benchmark binaries
-
-Discover benchmark binary names from the repository's build system. Common
-locations include `build/**/<binary>` for single-config generators and
-`build/**/Release/<binary>` for multi-config generators. Repository-specific
-binary lists belong in the repo-local benchmark examples overlay.
-
-Binary paths vary by generator type:
-
-| Generator | Path pattern |
-|-----------|-------------|
-| MSVC / Xcode (multi-config) | `build/<preset>_<suffix>/Release/<binary>` |
-| Ninja / Make (single-config) | `build/<preset>_<suffix>/<binary>` |
-
-### Run all benchmarks in a binary
-
-```bash
-# Multi-config (MSVC/Xcode)
-${BENCH_RUN} build/benchmarks_${BUILD_SUFFIX}/Release/benchmarks
-
-# Single-config (Ninja/Make)
-${BENCH_RUN} build/benchmarks_${BUILD_SUFFIX}/benchmarks
-```
-
-### Filter benchmarks with a regex (FILTER parameter)
-
-```bash
-FILTER="BM_Foo"   # change to match benchmark names in the target binary
-
-# Multi-config
-${BENCH_RUN} build/benchmarks_${BUILD_SUFFIX}/Release/benchmarks --benchmark_filter="${FILTER}"
-
-# Single-config
-${BENCH_RUN} build/benchmarks_${BUILD_SUFFIX}/benchmarks --benchmark_filter="${FILTER}"
-```
-
-Examples:
-```bash
-# Only one benchmark family
-... --benchmark_filter="BM_Foo"
-
-# Only one layout/parameter family
-... --benchmark_filter="BM_Foo.*Variant"
-
-# List all available benchmark names without running
-... --benchmark_list_tests=true
-```
-
-### Run with hardware counters (benchmarks-diagnostic build, Linux only)
-
-The `--benchmark_perf_counters` flag requests hardware counter collection via libpfm. Counter names are platform-specific but common ones include `CYCLES`, `INSTRUCTIONS`, `CACHE-MISSES`, `CACHE-REFERENCES`, `BRANCH-MISSES`, `BRANCH-INSTRUCTIONS`.
-
-```bash
-${BENCH_RUN} build/benchmarks-diagnostic_${BUILD_SUFFIX}/RelWithDebInfo/benchmarks \
-  --benchmark_filter="${FILTER}" \
-  --benchmark_perf_counters=CYCLES,INSTRUCTIONS,CACHE-MISSES \
-  --benchmark_counters_tabular=true
-```
-
-### Save results to file
-
-```bash
-${BENCH_RUN} build/benchmarks_${BUILD_SUFFIX}/Release/benchmarks \
-  --benchmark_filter="${FILTER}" \
-  --benchmark_report_aggregates_only=true \
-  --benchmark_display_aggregates_only=true \
-  --benchmark_format=json \
-  --benchmark_out=results.json
-```
-
-Validate output before consuming:
-```bash
-python3 -m json.tool results.json > /dev/null
-```
-
-## Step 3 — Profile with perf (Linux only)
-
-Use when hardware counters alone are not enough and you need a full call-graph profile for post-processing.
-
-**Record:**
-```bash
-perf record -g -F 999 \
-  -- ${BENCH_RUN} build/benchmarks-diagnostic_${BUILD_SUFFIX}/benchmarks \
-  --benchmark_filter="${FILTER}" \
-  --benchmark_min_time=5s
-```
-
-**Quick report (terminal):**
-```bash
-perf report --stdio
-```
-
-**Flame graph (requires FlameGraph scripts):**
-```bash
-perf script | stackcollapse-perf.pl | flamegraph.pl > flamegraph.html
-```
-
-**Export for external tools (Hotspot, Firefox Profiler):**
-```bash
-perf script -F +pid > perf.data.txt
-# or open with `hotspot perf.data`
-```
-
-## Useful Benchmark Flags
-
-| Flag | Purpose |
-|------|---------|
-| `--benchmark_filter=<regex>` | Run only matching benchmarks |
-| `--benchmark_list_tests=true` | List names without running |
-| `--benchmark_repetitions=<n>` | Repeat each benchmark n times |
-| `--benchmark_min_time=<Ns\|Xs>` | Minimum run time per benchmark |
-| `--benchmark_format=json` | Machine-readable output |
-| `--benchmark_out=<file>` | Save output to file |
-| `--benchmark_perf_counters=CYCLES,INSTRUCTIONS,...` | Collect hardware perf counters (requires libpfm build) |
-| `--benchmark_counters_tabular=true` | Align user/perf counter columns into a table |
-| `--benchmark_time_unit=ms` | Change display unit (ns/us/ms/s) |
-
-## Best Practices
-
-1. **Never run from a Debug binary**: always use `--config Release` at build time; check path contains `Release/`
-2. **Use benchmarks for clean timing**: Release optimizations, no debug info, no libpfm overhead
-3. **Use benchmarks-diagnostic for hardware counters**: RelWithDebInfo + libpfm; Linux only
-4. **Use perf for deep profiling**: when counters point to a hotspot but don't explain it
-5. **Pin benchmark process** with `taskset -c ${BENCH_CPU:-0}` unless unavailable
-6. **Pin CPU frequency** before timing runs: `sudo cpupower frequency-set -g performance`
-7. **Filter to reduce noise**: narrow the filter regex to the benchmark under investigation
-8. **Save JSON output** when comparing before/after changes: use `--benchmark_out` and diff the files
-9. **Fail fast on environment issues**: precheck Python deps used by compare tooling (`numpy`, `scipy`)
-10. **Use explicit retry limits**: on timeout, narrow scope and retry once; avoid repeated full-suite attempts
-11. **Preflight perf counters**: run a tiny counter-enabled benchmark first; if counters unavailable, skip counter workflow
diff --git a/agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py b/agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py
deleted file mode 100644
index abe5040..0000000
--- a/agentic/cpp/skills/benchmarks/scripts/plot_benchmarks.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env python3
-"""Generic Google Benchmark JSON plotter.
-
-Plots timing and (optionally) hardware counter data from Google Benchmark
---benchmark_format=json output. Correctly handles repetition output by
-averaging raw iterations and skipping aggregate entries.
-
-Usage:
-    python3 plot_benchmarks.py results.json [output_prefix]
-    python3 plot_benchmarks.py results.json report --min-n 16 --max-n 1048576
-"""
-import json
-import sys
-import argparse
-import matplotlib.pyplot as plt
-import numpy as np
-
-def load_benchmark_json(path):
-    with open(path, 'r') as f:
-        return json.load(f)
-
-def extract_series(data, metric='time', min_n=None, max_n=None):
-    """Extract series from benchmark JSON.
-
-    metric='time'     -> real_time (ns)
-    metric='counter'  -> first available hardware counter (e.g. CACHE-MISSES)
-    """
-    # Determine counter key if requested
-    counter_key = None
-    if metric == 'counter':
-        # Find first counter key from the first iteration entry
-        for bench in data.get('benchmarks', []):
-            if bench.get('run_type', 'iteration') != 'iteration':
-                continue
-            for k in bench.keys():
-                if k not in ('name', 'run_name', 'run_type', 'repetitions',
-                             'repetition_index', 'threads', 'iterations',
-                             'real_time', 'cpu_time', 'time_unit',
-                             'items_per_second', 'aggregate_name',
-                             'aggregate_unit', 'family_index',
-                             'per_family_instance_index'):
-                    counter_key = k
-                    break
-            if counter_key:
-                break
-        if not counter_key:
-            return {}
-
-    raw = {}
-    for bench in data.get('benchmarks', []):
-        if bench.get('run_type', 'iteration') != 'iteration':
-            continue
-
-        name = bench['name']
-        parts = name.split('/')
-        if len(parts) < 2:
-            continue
-
-        bench_name = parts[0]
-        try:
-            n = int(parts[1])
-        except ValueError:
-            continue
-
-        if min_n is not None and n < min_n:
-            continue
-        if max_n is not None and n > max_n:
-            continue
-
-        if metric == 'time':
-            val = bench.get('real_time', bench.get('cpu_time', 0))
-        else:
-            val = bench.get(counter_key)
-
-        if val is None or val == 0:
-            continue
-
-        key = (bench_name, n)
-        raw.setdefault(key, []).append(val)
-
-    series = {}
-    for (bench_name, n), vals in raw.items():
-        series.setdefault(bench_name, []).append((n, sum(vals) / len(vals)))
-
-    for name in series:
-        series[name].sort(key=lambda x: x[0])
-
-    return series
-
-def plot_series(series, output_prefix, ylabel, title_suffix):
-    if not series:
-        print(f"No data to plot for {title_suffix}")
-        return
-
-    fig, ax = plt.subplots(figsize=(12, 8))
-    colors = plt.cm.tab10(np.linspace(0, 1, len(series)))
-
-    for idx, (name, points) in enumerate(sorted(series.items())):
-        xs = [p[0] for p in points]
-        ys = [p[1] for p in points]
-        ax.plot(xs, ys, marker='o', markersize=3, label=name, color=colors[idx])
-
-    ax.set_xscale('log')
-    ax.set_xlabel('Benchmark parameter n')
-    ax.set_ylabel(ylabel)
-    ax.set_title(f'Benchmark Results - {title_suffix}')
-    ax.legend(loc='upper left', fontsize='small')
-    ax.grid(True, which='both', ls='--', alpha=0.5)
-    fig.tight_layout()
-    fig.savefig(f'{output_prefix}.png', dpi=150)
-    fig.savefig(f'{output_prefix}.svg')
-    print(f'Saved {output_prefix}.png and {output_prefix}.svg')
-    plt.close(fig)
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Plot Google Benchmark JSON results.')
-    parser.add_argument('json_path', help='Path to benchmark JSON file')
-    parser.add_argument('output_prefix', nargs='?', default='report',
-                        help='Output file prefix (default: report)')
-    parser.add_argument('--min-n', type=int, default=None,
-                        help='Minimum parameter value to include (inclusive)')
-    parser.add_argument('--max-n', type=int, default=None,
-                        help='Maximum parameter value to include (inclusive)')
-    args = parser.parse_args()
-
-    data = load_benchmark_json(args.json_path)
-
-    # Timing plot
-    time_series = extract_series(data, metric='time',
-                                 min_n=args.min_n, max_n=args.max_n)
-    if time_series:
-        plot_series(time_series, args.output_prefix,
-                    'Time per iteration (ns)', 'Time')
-
-    # Counter plot
-    counter_series = extract_series(data, metric='counter',
-                                    min_n=args.min_n, max_n=args.max_n)
-    if counter_series:
-        # Determine counter name for labels
-        counter_name = 'Counter'
-        for bench in data.get('benchmarks', []):
-            if bench.get('run_type', 'iteration') != 'iteration':
-                continue
-            for k in bench.keys():
-                if k not in ('name', 'run_name', 'run_type', 'repetitions',
-                             'repetition_index', 'threads', 'iterations',
-                             'real_time', 'cpu_time', 'time_unit',
-                             'items_per_second', 'aggregate_name',
-                             'aggregate_unit', 'family_index',
-                             'per_family_instance_index'):
-                    counter_name = k
-                    break
-            break
-        plot_series(counter_series, f'{args.output_prefix}_{counter_name.lower().replace("-", "_")}',
-                    f'{counter_name} per iteration', counter_name)
-
-if __name__ == '__main__':
-    main()
diff --git a/agentic/cpp/skills/cmake/SKILL.md b/agentic/cpp/skills/cmake/SKILL.md
deleted file mode 100644
index a659e1a..0000000
--- a/agentic/cpp/skills/cmake/SKILL.md
+++ /dev/null
@@ -1,96 +0,0 @@
----
-name: cmake
-description: Compile and build CMake projects, including configuring build types, options, and running test binaries.
----
-
-# CMake Build Skill
-
-You now have expertise in building and configuring CMake projects. Follow these workflows:
-
-## Build Directory Convention
-
-Use a short commit hash suffix for committed revisions:
-
-```bash
-BUILD_SUFFIX=$(git rev-parse --short HEAD)
-```
-
-If the worktree has uncommitted changes, append a descriptive suffix so generated
-artifacts cannot be confused with a clean HEAD build:
-
-```bash
-BUILD_SUFFIX=$(git rev-parse --short HEAD)-dirty
-```
-
-If not a git repository, use
-
-```bash
-BUILD_SUFFIX=agent
-```
-
-Build directories follow the pattern `build/<preset_name>_<suffix>`.
-
-## Using Presets (Preferred When Available)
-
-> **Important**: `cmake --preset` sets cache variables and generator but its `binaryDir` cannot be
-> overridden from the command line. To use a preset's settings with a custom build dir, pass the
-> relevant `-D` flags explicitly together with `-B`. Use `--preset` only to discover what flags a
-> preset applies.
-
-**List available presets:**
-```bash
-cmake --list-presets
-```
-
-**Replicate a preset's settings with a custom suffix build dir:**
-
-Release:
-```bash
-cmake -B build/release_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Release
-cmake --build build/release_${BUILD_SUFFIX} -j
-```
-
-Debug:
-```bash
-cmake -B build/debug_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Debug
-cmake --build build/debug_${BUILD_SUFFIX} -j
-```
-
-AddressSanitizer:
-```bash
-cmake -B build/asan_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Debug -DENABLE_ADDRESS_SANITIZER=ON
-cmake --build build/asan_${BUILD_SUFFIX} -j
-```
-
-Coverage:
-```bash
-cmake -B build/coverage_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Debug -DENABLE_COVERAGE=ON
-cmake --build build/coverage_${BUILD_SUFFIX} -j
-```
-
-Benchmarks:
-```bash
-cmake -B build/benchmarks_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Release -DBUILD_BENCHMARKS=ON
-cmake --build build/benchmarks_${BUILD_SUFFIX} -j
-```
-
-## Additional Feature Options
-
-Feature flags are project-specific. Inspect `CMakeLists.txt`,
-`CMakePresets.json`, or `cmake -LAH <build-dir>` before toggling options. For
-repository-specific examples, check
-`agentic/local/cpp/skills/cmake/EXAMPLES.md` when present.
-
-**Example feature toggle:**
-```bash
-cmake -B build/release_${BUILD_SUFFIX} -DCMAKE_BUILD_TYPE=Release -DENABLE_FEATURE=ON
-cmake --build build/release_${BUILD_SUFFIX} -j
-```
-
-## Best Practices
-
-1. **Use out-of-source builds**: Keep build artifacts in `build/<preset_name>_<suffix>` directories
-2. **Presets fix binaryDir**: `--preset` cannot be combined with `-B` to change the build dir; replicate `-D` flags manually with `-B` instead
-3. **Reconfigure when options change**: Rerun the `cmake -B ...` step when toggling options
-4. **Clean build directory when needed**: Delete the entire build folder for a fresh configuration
-5. **Match build type to task**: Release for performance work, Debug/ASan for correctness
diff --git a/agentic/cpp/skills/diagnose-segfault/SKILL.md b/agentic/cpp/skills/diagnose-segfault/SKILL.md
deleted file mode 100644
index 91c8950..0000000
--- a/agentic/cpp/skills/diagnose-segfault/SKILL.md
+++ /dev/null
@@ -1,195 +0,0 @@
----
-name: diagnose-segfault
-description: Diagnose C++ crashes and memory-safety errors with AddressSanitizer, GDB, and core dumps. Use when a C++ binary crashes with SIGSEGV, SIGABRT, heap-buffer-overflow, use-after-free, stack-buffer-overflow, double-free, suspected memory corruption, or an available core file.
----
-
-# C++ Segfault and Memory Error Diagnosis
-
-Use this skill to find the first bad access or corrupting operation, not just
-the frame where the process finally crashed.
-
-## When To Use
-
-- A C++ binary crashes with `Segmentation fault`, `SIGSEGV`, or `SIGABRT`.
-- AddressSanitizer reports `ERROR: AddressSanitizer:`.
-- A test reports heap-buffer-overflow, stack-buffer-overflow, use-after-free,
-  double-free, global-buffer-overflow, or similar memory-safety failures.
-- A core dump exists and the user wants root-cause analysis.
-- Memory corruption is suspected but the immediate failure is ambiguous.
-
-For repository-specific binary names, CMake options, or known reproducer
-patterns, also read `agentic/local/cpp/skills/diagnose-segfault/EXAMPLES.md`
-when present.
-
-## Workflow 1: ASan First
-
-Prefer AddressSanitizer when the issue is reproducible. It usually reports the
-bad access with file and line information.
-
-### Build With ASan
-
-For CMake projects, first check whether the repository already has an ASan
-preset or option. If not, configure a dedicated debug build:
-
-```bash
-cmake -B build/asan -DCMAKE_BUILD_TYPE=Debug -DENABLE_ADDRESS_SANITIZER=ON
-cmake --build build/asan -j
-```
-
-For non-CMake builds, compile and link with:
-
-```bash
--fsanitize=address -fno-omit-frame-pointer -g -O1
-```
-
-Use `-O0` instead of `-O1` when you expect to inspect many local variables in
-GDB.
-
-### Run The Minimal Reproducer
-
-Run the specific binary, test case, or input that triggers the crash. For Google
-Test binaries, prefer a narrow filter:
-
-```bash
-./build/asan/unittests --gtest_filter="SuiteName.TestName"
-```
-
-### Read The ASan Report
-
-Focus on:
-
-| Report section | Meaning |
-|---|---|
-| `ERROR: AddressSanitizer: <type>` | Error class |
-| `READ/WRITE of size N` | Access direction and size |
-| First user-code frame | Exact bad access |
-| Allocation/deallocation stack | Object lifetime and ownership |
-| Shadow-byte legend | Boundary or lifetime category |
-| `SUMMARY:` | One-line location summary |
-
-Useful options:
-
-```bash
-ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=1
-ASAN_OPTIONS=halt_on_error=0
-ASAN_OPTIONS=print_stats=1
-```
-
-Disable leak detection while diagnosing a crash if leak noise hides the primary
-failure:
-
-```bash
-ASAN_OPTIONS=detect_leaks=0 ./build/asan/unittests
-```
-
-## Workflow 2: GDB Live Debugging
-
-Use GDB when ASan is unavailable, when the crash is not a direct memory-safety
-violation, or when variable inspection is needed.
-
-Build with debug symbols:
-
-```bash
-cmake -B build/debug -DCMAKE_BUILD_TYPE=Debug
-cmake --build build/debug -j
-```
-
-Run under GDB:
-
-```bash
-gdb --args <binary> [arguments...]
-```
-
-Core commands:
-
-```gdb
-run
-bt full
-info registers
-info locals
-info args
-frame N
-list
-print <expr>
-thread apply all bt
-```
-
-Make C++ values easier to inspect:
-
-```gdb
-set print pretty on
-set print object on
-set pagination off
-```
-
-## Workflow 3: ASan Under GDB
-
-Use this when ASan points at a bad access but the pointer or lifetime corruption
-comes from an earlier frame.
-
-```bash
-gdb --args <asan-binary> [arguments...]
-```
-
-Break on ASan reporting or abort:
-
-```gdb
-break __asan::ReportGenericError
-catch signal SIGABRT
-run
-bt full
-```
-
-Then inspect the last user-code frames before ASan internals.
-
-## Workflow 4: Core Dump Analysis
-
-Use when the crash already happened or reproduction is expensive.
-
-Enable core dumps for future runs if needed:
-
-```bash
-ulimit -c unlimited
-```
-
-Analyze:
-
-```bash
-gdb <binary> <core-file>
-```
-
-Useful commands:
-
-```gdb
-bt full
-info threads
-thread apply all bt
-frame N
-info locals
-info args
-print <expr>
-```
-
-## Common ASan Errors
-
-| Error type | Typical cause |
-|---|---|
-| `heap-buffer-overflow` | Read or write past heap allocation bounds |
-| `stack-buffer-overflow` | Read or write past a local stack object |
-| `global-buffer-overflow` | Read or write past global/static storage |
-| `heap-use-after-free` | Access after `delete`, `free`, or container invalidation |
-| `stack-use-after-return` | Pointer/reference to a returned stack frame |
-| `double-free` | Object released twice |
-| `alloc-dealloc-mismatch` | Mixed allocation APIs, such as `new[]` with `free` |
-
-## Best Practices
-
-1. Build with `-g`; reports without symbols are often not actionable.
-2. Prefer the smallest reproducer over full-suite runs.
-3. Rebuild after toggling sanitizer or debug options.
-4. Treat the first ASan error as primary; later errors are often fallout.
-5. Check container iterator/reference invalidation around the reported object.
-6. Validate the fix with the same reproducer under ASan before running broader
-   tests.
-7. If ASan is too slow for a large input, use GDB on the same input or reduce
-   the input while preserving the crash.
diff --git a/agentic/cpp/skills/optimization-experiment/SKILL.md b/agentic/cpp/skills/optimization-experiment/SKILL.md
deleted file mode 100644
index 75904f6..0000000
--- a/agentic/cpp/skills/optimization-experiment/SKILL.md
+++ /dev/null
@@ -1,193 +0,0 @@
----
-name: optimization-experiment
-description: Run iterative C++ optimization experiments for a target function or class by adding same-API experimental variants, validating correctness, benchmarking, comparing results, and deciding whether to promote a faster implementation.
----
-
-# Optimization Experiment Skill
-
-Use this skill when a user wants to improve performance of a specific C++
-function, class, algorithm, or hot path through benchmark-driven experiments.
-
-This workflow depends on:
-
-1. `../benchmarks/SKILL.md` for Google Benchmark build/run commands, JSON output,
-   hardware counters, pinning, and perf profiling.
-2. `../benchmarks-affected/SKILL.md` when changes need an affected benchmark
-   scope.
-3. `../benchmarks-compare-revisions/SKILL.md` when comparing committed
-   revisions.
-
-## Goal
-
-Iterate from a production implementation to one or more experimental
-implementations, prove semantic equivalence, measure the impact, and decide
-whether a candidate is worth promoting.
-
-The standard loop is:
-
-```text
-target -> benchmark baseline -> experimental same-API variant
-       -> correctness check -> benchmark compare -> keep / revise / discard
-```
-
-Stop when a candidate is clearly better on the intended workload without
-correctness or maintenance regressions, or when the remaining ideas are too weak
-to justify more iteration.
-
-## Step 1 - Identify the Target and Contract
-
-Start from the requested function/class and inspect the real implementation.
-Record:
-
-- public signature/API and call sites that must not change
-- input domains, invalid-input behavior, boundary conditions, and tie-breaking
-- compile-time feature gates such as SIMD flags or platform-specific paths
-- existing tests and reference implementations
-- existing benchmarks that should move if the optimization succeeds
-
-Do not optimize before the contract is clear. If behavior is ambiguous, add or
-find tests before changing implementation.
-
-## Step 2 - Establish Benchmark Coverage
-
-Find benchmark rows that directly exercise the target. Prefer narrow benchmark
-filters over full-suite runs during iteration.
-
-If coverage is missing or too broad, add focused benchmark cases before adding
-the optimized implementation. Include cases for:
-
-- the expected common path
-- boundary and alignment-sensitive paths
-- short, medium, and long ranges or sizes when width matters
-- random or mixed workloads when real calls are not fixed-shape
-- current production behavior and each experimental variant
-
-Capture a baseline JSON before implementation changes:
-
-```bash
-BENCH_CPU=${BENCH_CPU:-0}
-taskset -c "${BENCH_CPU}" <benchmark-binary> \
-  --benchmark_filter="${FILTER}" \
-  --benchmark_report_aggregates_only=true \
-  --benchmark_display_aggregates_only=true \
-  --benchmark_out=/tmp/<target>_baseline.json \
-  --benchmark_out_format=json
-```
-
-Use `../benchmarks/SKILL.md` for exact build directories, Release versus
-diagnostic builds, hardware-counter setup, and retry policy.
-
-## Step 3 - Add Experimental Same-API Variants
-
-Add candidate implementations beside production code in an experimental area,
-namespace, header, or benchmark-local adapter that is already consistent with
-the repository.
-
-Rules:
-
-- keep the callable signature/API identical to production where practical
-- preserve public semantics exactly, including invalid inputs and tie-breaking
-- keep production callers unchanged during experiments
-- make variants benchmark-selectable by name
-- avoid unrelated refactors while measuring
-- keep losing variants only when they document useful evidence or support future
-  comparison
-
-For C++ libraries with feature-gated implementations, provide correct fallbacks
-for unsupported targets or compile configurations.
-
-## Step 4 - Validate Correctness Before Timing
-
-Run relevant tests before trusting benchmark numbers. Add tests when the
-experimental implementation introduces new risk.
-
-Prefer:
-
-- fixed edge cases for boundaries, empty/sentinel behavior, and exact ties
-- randomized differential tests against a scalar or naive reference
-- tests for feature-gated fallback builds when the code has SIMD or platform
-  branches
-- targeted regression tests for any bug found during benchmarking
-
-Do not compare performance for a candidate that has not passed the correctness
-checks for the same semantics as production.
-
-## Step 5 - Benchmark and Compare
-
-Run timing benchmarks from Release builds. Save JSON for every meaningful
-baseline and candidate.
-
-Use diagnostic builds with hardware counters when timing changes need
-explanation:
-
-- cycles and instructions for core execution cost
-- cache counters for memory behavior
-- branch counters when early exits or dispatch logic are involved
-
-Compare both absolute timings and relative deltas. Watch for cases where a
-candidate wins the cherry-picked row but regresses neighboring or realistic
-workloads.
-
-When results are noisy:
-
-- pin to a CPU with `taskset` when available
-- increase repetitions or minimum benchmark time
-- rerun the narrow benchmark filter once
-- avoid changing benchmark scope between baseline and candidate
-
-## Step 6 - Iterate Deliberately
-
-For each candidate, decide one of:
-
-- **Promote**: repeatedly faster on intended rows, no important regressions,
-  correct and maintainable.
-- **Keep experimental**: interesting or workload-specific, but not production
-  ready.
-- **Discard**: slower, too complex, too narrow, or semantically risky.
-
-Use benchmark data to choose the next idea. Examples:
-
-- higher instruction count suggests fewer operations or simpler dispatch
-- lower instructions but higher cycles suggests stalls, memory, or dependency
-  chains
-- short-range regressions suggest a narrower dispatch condition
-- alignment-sensitive rows suggest splitting aligned and unaligned paths
-
-When no idea wins convincingly, document the best result and stop rather than
-overfitting.
-
-## Step 7 - Finalize the Result
-
-If promoting a candidate to production:
-
-- keep the public API unchanged unless the user explicitly requested otherwise
-- keep or update tests that protect the optimized behavior
-- remove accidental benchmark-only scaffolding from production code
-- preserve experimental variants only when useful for future research
-
-If leaving work experimental:
-
-- add a short note near the experimental code with benchmark date, command, and
-  the relevant table or JSON artifact path
-- clearly state that production callers do not use the experimental variant
-- explain which workload the variant helps and where it loses
-
-The final response should include:
-
-- what changed
-- correctness checks run
-- benchmark command or JSON artifacts
-- concise result table
-- recommendation: promote, keep experimenting, or stop
-
-## Guardrails
-
-1. Benchmark before optimizing; otherwise there is no trustworthy baseline.
-2. Never change semantics to win a benchmark.
-3. Never compare Debug timings.
-4. Keep production and experimental code paths distinguishable.
-5. Prefer focused benchmark filters during iteration, then broaden before
-   promotion.
-6. Treat hardware counters as explanatory data, not a replacement for timing.
-7. Record enough benchmark context that future agents do not confuse
-   experimental wins with production behavior.
diff --git a/agentic/cpp/skills/paper-search/SKILL.md b/agentic/cpp/skills/paper-search/SKILL.md
deleted file mode 100644
index 27f108c..0000000
--- a/agentic/cpp/skills/paper-search/SKILL.md
+++ /dev/null
@@ -1,106 +0,0 @@
----
-name: paper-search
-description: "Search for academic papers across Semantic Scholar, arXiv, and CrossRef APIs. Returns unified results with title, authors, year, abstract, DOI, venue, and citation counts. Integrates with Zotero MCP tools for adding found papers to a Zotero library and generating BibTeX entries. Use when the user asks to find papers, search for related work, look up a DOI, or discover references on a topic."
----
-
-# Paper Search
-
-Search external academic APIs for papers. Provides a unified interface across Semantic Scholar, arXiv, and CrossRef with optional Zotero integration.
-
-## Workflow
-
-### 1. Search for Papers
-
-Run the search script from the skill's `scripts/` directory:
-
-```bash
-python3 scripts/search_papers.py --query "topic" --source semantic_scholar --limit 10 --format compact
-```
-
-Available sources:
-- `semantic_scholar` — Default. Best for comprehensive search with citation counts.
-- `arxiv` — Best for preprints and recent unpublished work.
-- `crossref` — Best for published works and DOI-based metadata.
-- `all` — Query all three sources (slower, results combined).
-
-Output formats:
-- `json` — Full JSON output (default). Good for programmatic use.
-- `compact` — Human-readable summary with title, authors, year, venue, citations, and truncated abstract.
-
-### 2. DOI Lookup
-
-Look up a specific paper by DOI:
-
-```bash
-python3 scripts/search_papers.py --doi "10.1145/1234567.1234568" --format compact
-```
-
-### 3. Download PDFs
-
-Download open-access PDFs directly from search results:
-
-```bash
-python3 scripts/search_papers.py --query "wavelet tree" --source arxiv --limit 3 --download ~/papers
-```
-
-- arXiv papers always have PDFs available.
-- Semantic Scholar provides `openAccessPdf` URLs when available.
-- CrossRef may provide PDF links via publisher APIs.
-
-The `--download` flag adds a `downloaded_path` field to each result in JSON output.
-
-### 4. Add to Zotero
-
-**Option A: Via DOI/URL (metadata only)**
-
-After finding relevant papers, add them to Zotero using the Zotero MCP tools:
-
-- `zotero_add_by_doi` — Preferred when DOI is available. Fetches full metadata from CrossRef.
-- `zotero_add_by_url` — Use for arXiv papers or when only a URL is available.
-
-**Option B: Via downloaded PDF (metadata + attachment)**
-
-Download the PDF first, then add to Zotero with the PDF file:
-
-```bash
-# Step 1: Download PDFs and get paths in JSON
-python3 scripts/search_papers.py --doi "10.1007/978-3-540-73420-8_13" --download ~/papers --format json
-
-# Step 2: Use zotero_zotero_add_from_file with the downloaded_path
-```
-
-The agent should call `zotero_zotero_add_from_file` with the `downloaded_path` from the JSON output. This attaches the PDF to the Zotero item and attempts DOI-based metadata extraction.
-
-**Option C: Download + Zotero in one step**
-
-Use `--zotero` to download PDFs with paths formatted for easy Zotero import:
-
-```bash
-python3 scripts/search_papers.py -q "succinct data structures" -s arxiv -n 3 --zotero --download ~/papers
-```
-
-After adding papers, update the semantic search database:
-
-```
-zotero_update_search_database
-```
-
-### 5. Generate BibTeX
-
-For papers already in Zotero, use `zotero_get_item_metadata` with `format: "bibtex"` to get BibTeX entries. Alternatively, use `zotero_fetch` for full metadata.
-
-For papers NOT in Zotero, BibTeX can be constructed from the search results' JSON fields (`authors`, `year`, `title`, `venue`, `doi`).
-
-## Guidance
-
-- Start with `semantic_scholar` for general queries — it has the broadest coverage and citation data.
-- Use `arxiv` when looking for very recent work or preprints in CS/ML/physics.
-- Use `crossref` for DOI lookups or when Semantic Scholar returns no results.
-- When using `--source all`, results may contain duplicates (same paper from different sources). Deduplicate by DOI or title similarity.
-- Citation counts are approximate and may differ across sources.
-- arXiv results return the arXiv ID (e.g., `2301.12345`) which can be used with `zotero_add_by_url` via `https://arxiv.org/abs/2301.12345`.
-
-## API Quirks
-
-- **arXiv `atom:id` is NOT a DOI** — it contains an arXiv URL like `http://arxiv.org/abs/2301.12345`. Store the extracted ID in `arxiv_id` only; set `doi` to `None` for arXiv results. Writing the arXiv URL into `doi` produces invalid DOI metadata downstream (e.g., Zotero import).
-- **CrossRef `select` must include `link`** — the `link` field is needed for `pdf_url` extraction. If omitted from `select`, the API won't return link metadata and `pdf_url` will silently be empty for all CrossRef results.
diff --git a/agentic/cpp/skills/paper-search/references/api_reference.md b/agentic/cpp/skills/paper-search/references/api_reference.md
deleted file mode 100644
index dcb5aa5..0000000
--- a/agentic/cpp/skills/paper-search/references/api_reference.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# External Paper Search APIs
-
-## Semantic Scholar
-
-- **Base URL**: `https://api.semanticscholar.org/graph/v1/`
-- **Rate limit**: 1 req/sec without API key, 10 req/sec with key
-- **No auth required** for basic usage
-- **Fields**: title, authors, year, abstract, externalIds (DOI, ArXiv), venue, citationCount
-- **Best for**: Comprehensive academic search with citation counts
-
-## arXiv
-
-- **Base URL**: `http://export.arxiv.org/api/query`
-- **Rate limit**: Be nice, ~3 sec between requests
-- **No auth required**
-- **Returns**: XML (Atom feed)
-- **Best for**: Preprints, recent work not yet published
-
-## CrossRef
-
-- **Base URL**: `https://api.crossref.org/`
-- **Rate limit**: 50 req/sec with polite pool (include `mailto` header)
-- **No auth required**
-- **Best for**: DOI lookup, published works, metadata enrichment
-
-## Zotero Integration
-
-After finding papers via external search, use Zotero MCP tools:
-
-1. `zotero_add_by_doi` — Add paper by DOI (fetches metadata from CrossRef)
-2. `zotero_add_by_url` — Add paper by URL (arXiv, DOI URLs)
-3. `zotero_update_search_database` — Update semantic search index after adding
diff --git a/agentic/cpp/skills/paper-search/scripts/search_papers.py b/agentic/cpp/skills/paper-search/scripts/search_papers.py
deleted file mode 100755
index c65351d..0000000
--- a/agentic/cpp/skills/paper-search/scripts/search_papers.py
+++ /dev/null
@@ -1,333 +0,0 @@
-#!/usr/bin/env python3
-"""Search external APIs for academic papers.
-
-Sources: Semantic Scholar, arXiv, CrossRef.
-Outputs unified JSON to stdout.
-
-Usage:
-    python3 search_papers.py --query "wavelet tree succinct" --source semantic_scholar --limit 10
-    python3 search_papers.py --query "succinct data structures" --source arxiv --limit 5
-    python3 search_papers.py --doi "10.1145/123" --source crossref
-    python3 search_papers.py --query "rank select" --source all --limit 5
-    python3 search_papers.py --query "wavelet tree" --source arxiv --limit 1 --download ~/papers
-    python3 search_papers.py --doi "10.1007/978-3-540-73420-8_13" --download ~/papers --zotero
-"""
-
-import argparse
-import json
-import os
-import re
-import sys
-import time
-import urllib.error
-import urllib.parse
-import urllib.request
-from pathlib import Path
-from typing import Any
-
-
-def _get(url: str, headers: dict[str, str] | None = None, timeout: int = 30,
-         retries: int = 2) -> dict:
-    for attempt in range(retries + 1):
-        req = urllib.request.Request(url, headers=headers or {})
-        try:
-            with urllib.request.urlopen(req, timeout=timeout) as resp:
-                return json.loads(resp.read().decode())
-        except urllib.error.HTTPError as e:
-            if e.code == 429 and attempt < retries:
-                wait = 2 ** attempt
-                print(f"Rate limited, retrying in {wait}s...", file=sys.stderr)
-                time.sleep(wait)
-                continue
-            print(f"HTTP {e.code}: {e.reason} for {url}", file=sys.stderr)
-            return {}
-        except urllib.error.URLError as e:
-            print(f"URL error: {e.reason} for {url}", file=sys.stderr)
-            return {}
-    return {}
-
-
-def search_semantic_scholar(query: str, limit: int = 10) -> list[dict[str, Any]]:
-    """Search Semantic Scholar API."""
-    params = urllib.parse.urlencode({
-        "query": query,
-        "limit": limit,
-        "fields": "title,authors,year,abstract,externalIds,venue,publicationDate,citationCount,url,openAccessPdf",
-    })
-    url = f"https://api.semanticscholar.org/graph/v1/paper/search?{params}"
-    data = _get(url, headers={"Accept": "application/json"})
-    results = []
-    for paper in data.get("data", []):
-        ext_ids = paper.get("externalIds") or {}
-        pdf_info = paper.get("openAccessPdf") or {}
-        results.append({
-            "source": "semantic_scholar",
-            "title": paper.get("title", ""),
-            "authors": [a.get("name", "") for a in paper.get("authors", [])],
-            "year": paper.get("year"),
-            "abstract": paper.get("abstract", ""),
-            "doi": ext_ids.get("DOI"),
-            "arxiv_id": ext_ids.get("ArXiv"),
-            "venue": paper.get("venue", ""),
-            "citation_count": paper.get("citationCount"),
-            "url": paper.get("url", ""),
-            "pdf_url": pdf_info.get("url"),
-        })
-    return results
-
-
-def search_arxiv(query: str, limit: int = 10) -> list[dict[str, Any]]:
-    """Search arXiv API."""
-    words = query.split()
-    if len(words) == 1:
-        search_term = f"all:{query}"
-    elif len(words) == 2:
-        # Phrase search for 2-word queries
-        search_term = f'all:"{query}"'
-    else:
-        # Use OR of phrase and individual terms for 3+ words
-        # This catches exact phrase matches AND papers with all terms
-        phrase = f'all:"{query}"'
-        and_terms = " AND ".join(f"all:{w}" for w in words)
-        search_term = f"({phrase}) OR ({and_terms})"
-    params = urllib.parse.urlencode({
-        "search_query": search_term,
-        "start": 0,
-        "max_results": limit,
-    })
-    url = f"http://export.arxiv.org/api/query?{params}"
-    req = urllib.request.Request(url)
-    try:
-        with urllib.request.urlopen(req, timeout=30) as resp:
-            xml_data = resp.read().decode()
-    except (urllib.error.URLError, urllib.error.HTTPError) as e:
-        print(f"arXiv API error: {e}", file=sys.stderr)
-        return []
-
-    import xml.etree.ElementTree as ET
-    root = ET.fromstring(xml_data)
-    ns = {"atom": "http://www.w3.org/2005/Atom"}
-    results = []
-    for entry in root.findall("atom:entry", ns):
-        title = entry.findtext("atom:title", "", ns).strip().replace("\n", " ")
-        abstract = entry.findtext("atom:summary", "", ns).strip().replace("\n", " ")
-        authors = [a.findtext("atom:name", "", ns) for a in entry.findall("atom:author", ns)]
-        published = entry.findtext("atom:published", "", ns)
-        year = int(published[:4]) if published else None
-        arxiv_id = ""
-        for link in entry.findall("atom:link", ns):
-            href = link.get("href", "")
-            if "arxiv.org/abs/" in href:
-                arxiv_id = href.split("/abs/")[-1]
-                break
-        results.append({
-            "source": "arxiv",
-            "title": title,
-            "authors": authors,
-            "year": year,
-            "abstract": abstract,
-            "doi": None,
-            "arxiv_id": arxiv_id,
-            "venue": "arXiv",
-            "citation_count": None,
-            "url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
-            "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else None,
-        })
-    return results
-
-
-def search_crossref(query: str, limit: int = 10) -> list[dict[str, Any]]:
-    """Search CrossRef API."""
-    params = urllib.parse.urlencode({
-        "query": query,
-        "rows": limit,
-        "select": "DOI,title,author,published-print,abstract,container-title,is-referenced-by-count,URL,type,link",
-    })
-    url = f"https://api.crossref.org/works?{params}"
-    data = _get(url, headers={"Accept": "application/json"})
-    results = []
-    for item in data.get("message", {}).get("items", []):
-        title_list = item.get("title", [])
-        title = title_list[0] if title_list else ""
-        authors = []
-        for a in item.get("author", []):
-            name = f"{a.get('given', '')} {a.get('family', '')}".strip()
-            if name:
-                authors.append(name)
-        pub_date = item.get("published-print", {}).get("date-parts", [[None]])
-        year = pub_date[0][0] if pub_date and pub_date[0] else None
-        venue_list = item.get("container-title", [])
-        venue = venue_list[0] if venue_list else ""
-        pdf_url = None
-        for link in item.get("link", []):
-            if "pdf" in link.get("content-type", ""):
-                pdf_url = link.get("URL")
-                break
-        results.append({
-            "source": "crossref",
-            "title": title,
-            "authors": authors,
-            "year": year,
-            "abstract": item.get("abstract", ""),
-            "doi": item.get("DOI"),
-            "arxiv_id": None,
-            "venue": venue,
-            "citation_count": item.get("is-referenced-by-count"),
-            "url": item.get("URL", ""),
-            "pdf_url": pdf_url,
-        })
-    return results
-
-
-def lookup_doi(doi: str) -> dict[str, Any] | None:
-    """Look up a single paper by DOI via CrossRef."""
-    url = f"https://api.crossref.org/works/{urllib.parse.quote(doi, safe='')}"
-    data = _get(url)
-    item = data.get("message")
-    if not item:
-        return None
-    title_list = item.get("title", [])
-    title = title_list[0] if title_list else ""
-    authors = []
-    for a in item.get("author", []):
-        name = f"{a.get('given', '')} {a.get('family', '')}".strip()
-        if name:
-            authors.append(name)
-    pub_date = item.get("published-print", {}).get("date-parts", [[None]])
-    year = pub_date[0][0] if pub_date and pub_date[0] else None
-    venue_list = item.get("container-title", [])
-    venue = venue_list[0] if venue_list else ""
-    pdf_url = None
-    for link in item.get("link", []):
-        if "pdf" in link.get("content-type", ""):
-            pdf_url = link.get("URL")
-            break
-    return {
-        "source": "crossref",
-        "title": title,
-        "authors": authors,
-        "year": year,
-        "abstract": item.get("abstract", ""),
-        "doi": item.get("DOI"),
-        "arxiv_id": None,
-        "venue": venue,
-        "citation_count": item.get("is-referenced-by-count"),
-        "url": item.get("URL", ""),
-        "pdf_url": pdf_url,
-    }
-
-
-SOURCES = {
-    "semantic_scholar": search_semantic_scholar,
-    "arxiv": search_arxiv,
-    "crossref": search_crossref,
-}
-
-
-def _sanitize_filename(title: str) -> str:
-    """Generate a clean filename from paper title."""
-    name = re.sub(r'[^\w\s-]', '', title.lower())
-    name = re.sub(r'[\s]+', '_', name.strip())
-    return name[:80]
-
-
-def download_pdf(url: str, output_dir: str, paper: dict[str, Any]) -> str | None:
-    """Download a PDF and return the local path."""
-    filename = _sanitize_filename(paper.get("title", "paper")) + ".pdf"
-    output_path = Path(output_dir) / filename
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-
-    req = urllib.request.Request(url, headers={
-        "User-Agent": "Mozilla/5.0 (academic paper-search script)"
-    })
-    try:
-        with urllib.request.urlopen(req, timeout=60) as resp:
-            content_type = resp.headers.get("Content-Type", "")
-            if "pdf" not in content_type and "octet-stream" not in content_type:
-                print(f"Warning: unexpected content type '{content_type}' for {url}",
-                      file=sys.stderr)
-            with open(output_path, "wb") as f:
-                f.write(resp.read())
-        print(f"Downloaded: {output_path}", file=sys.stderr)
-        return str(output_path)
-    except (urllib.error.URLError, urllib.error.HTTPError) as e:
-        print(f"Download failed for {url}: {e}", file=sys.stderr)
-        return None
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Search for academic papers")
-    parser.add_argument("--query", "-q", help="Search query")
-    parser.add_argument("--doi", help="Look up a specific DOI")
-    parser.add_argument("--source", "-s", default="semantic_scholar",
-                        choices=["semantic_scholar", "arxiv", "crossref", "all"],
-                        help="API source (default: semantic_scholar)")
-    parser.add_argument("--limit", "-n", type=int, default=10,
-                        help="Max results per source (default: 10)")
-    parser.add_argument("--format", "-f", default="json",
-                        choices=["json", "compact"],
-                        help="Output format (default: json)")
-    parser.add_argument("--download", "-d", metavar="DIR",
-                        help="Download PDFs to DIR (requires pdf_url in results)")
-    parser.add_argument("--zotero", "-z", action="store_true",
-                        help="Download PDFs and output paths for Zotero import (implies --download)")
-    args = parser.parse_args()
-
-    if not args.query and not args.doi:
-        parser.error("Either --query or --doi is required")
-
-    if args.zotero and not args.download:
-        args.download = "."
-
-    results = []
-    if args.doi:
-        paper = lookup_doi(args.doi)
-        if paper:
-            results.append(paper)
-    elif args.source == "all":
-        for name, func in SOURCES.items():
-            try:
-                results.extend(func(args.query, args.limit))
-            except Exception as e:
-                print(f"Error searching {name}: {e}", file=sys.stderr)
-            time.sleep(1)
-    else:
-        results = SOURCES[args.source](args.query, args.limit)
-
-    if args.download:
-        for r in results:
-            pdf_url = r.get("pdf_url")
-            if pdf_url:
-                path = download_pdf(pdf_url, args.download, r)
-                r["downloaded_path"] = path
-            else:
-                r["downloaded_path"] = None
-
-    if args.format == "json":
-        print(json.dumps(results, indent=2))
-    else:
-        for i, r in enumerate(results, 1):
-            authors = ", ".join(r["authors"][:3])
-            if len(r["authors"]) > 3:
-                authors += " et al."
-            doi_str = f"  DOI: {r['doi']}" if r.get("doi") else ""
-            arxiv_str = f"  arXiv: {r['arxiv_id']}" if r.get("arxiv_id") else ""
-            cite_str = f"  Citations: {r['citation_count']}" if r.get("citation_count") else ""
-            pdf_str = f"  PDF: {r['pdf_url']}" if r.get("pdf_url") else "  PDF: N/A"
-            dl_str = ""
-            if r.get("downloaded_path"):
-                dl_str = f"  Downloaded: {r['downloaded_path']}"
-            print(f"[{i}] {r['title']}")
-            print(f"    {authors} ({r.get('year', '?')}) — {r.get('venue', '')}")
-            print(f"    {r.get('url', '')}{doi_str}{arxiv_str}{cite_str}")
-            print(f"    {pdf_str}{dl_str}")
-            if r.get("abstract"):
-                abstract = r["abstract"][:200]
-                if len(r["abstract"]) > 200:
-                    abstract += "..."
-                print(f"    {abstract}")
-            print()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/agentic/cpp/skills/pdf/SKILL.md b/agentic/cpp/skills/pdf/SKILL.md
deleted file mode 100644
index ddbce00..0000000
--- a/agentic/cpp/skills/pdf/SKILL.md
+++ /dev/null
@@ -1,112 +0,0 @@
----
-name: pdf
-description: Process PDF files - extract text, create PDFs, merge documents. Use when user asks to read PDF, create PDF, or work with PDF files.
----
-
-# PDF Processing Skill
-
-You now have expertise in PDF manipulation. Follow these workflows:
-
-## Reading PDFs
-
-**Option 1: Quick text extraction (preferred)**
-```bash
-# Using pdftotext (poppler-utils)
-pdftotext input.pdf -  # Output to stdout
-pdftotext input.pdf output.txt  # Output to file
-
-# If pdftotext not available, try:
-python3 -c "
-import fitz  # PyMuPDF
-doc = fitz.open('input.pdf')
-for page in doc:
-    print(page.get_text())
-"
-```
-
-**Option 2: Page-by-page with metadata**
-```python
-import fitz  # pip install pymupdf
-
-doc = fitz.open("input.pdf")
-print(f"Pages: {len(doc)}")
-print(f"Metadata: {doc.metadata}")
-
-for i, page in enumerate(doc):
-    text = page.get_text()
-    print(f"--- Page {i+1} ---")
-    print(text)
-```
-
-## Creating PDFs
-
-**Option 1: From Markdown (recommended)**
-```bash
-# Using pandoc
-pandoc input.md -o output.pdf
-
-# With custom styling
-pandoc input.md -o output.pdf --pdf-engine=xelatex -V geometry:margin=1in
-```
-
-**Option 2: Programmatically**
-```python
-from reportlab.lib.pagesizes import letter
-from reportlab.pdfgen import canvas
-
-c = canvas.Canvas("output.pdf", pagesize=letter)
-c.drawString(100, 750, "Hello, PDF!")
-c.save()
-```
-
-**Option 3: From HTML**
-```bash
-# Using wkhtmltopdf
-wkhtmltopdf input.html output.pdf
-
-# Or with Python
-python3 -c "
-import pdfkit
-pdfkit.from_file('input.html', 'output.pdf')
-"
-```
-
-## Merging PDFs
-
-```python
-import fitz
-
-result = fitz.open()
-for pdf_path in ["file1.pdf", "file2.pdf", "file3.pdf"]:
-    doc = fitz.open(pdf_path)
-    result.insert_pdf(doc)
-result.save("merged.pdf")
-```
-
-## Splitting PDFs
-
-```python
-import fitz
-
-doc = fitz.open("input.pdf")
-for i in range(len(doc)):
-    single = fitz.open()
-    single.insert_pdf(doc, from_page=i, to_page=i)
-    single.save(f"page_{i+1}.pdf")
-```
-
-## Key Libraries
-
-| Task | Library | Install |
-|------|---------|---------|
-| Read/Write/Merge | PyMuPDF | `pip install pymupdf` |
-| Create from scratch | ReportLab | `pip install reportlab` |
-| HTML to PDF | pdfkit | `pip install pdfkit` + wkhtmltopdf |
-| Text extraction | pdftotext | `brew install poppler` / `apt install poppler-utils` |
-
-## Best Practices
-
-1. **Always check if tools are installed** before using them
-2. **Handle encoding issues** - PDFs may contain various character encodings
-3. **Large PDFs**: Process page by page to avoid memory issues
-4. **OCR for scanned PDFs**: Use `pytesseract` if text extraction returns empty
diff --git a/agentic/cpp/skills/setup-cpp-repo/SKILL.md b/agentic/cpp/skills/setup-cpp-repo/SKILL.md
deleted file mode 100644
index 83201a4..0000000
--- a/agentic/cpp/skills/setup-cpp-repo/SKILL.md
+++ /dev/null
@@ -1,136 +0,0 @@
----
-name: setup-cpp-repo
-description: Scaffold a new C++20 repository with CMake, Google Test, Google Benchmark, CI workflows, Doxygen docs, and Chromium code style. Use when the user asks to create a new C++ project, set up a C++ library, or initialize a C++ repository with modern tooling.
----
-
-# setup-cpp-repo
-
-## Overview
-
-This skill generates a complete modern C++20 project scaffold. The generated
-repository is header-only by default and includes:
-
-- CMake build system with presets
-- Google Test for unit testing
-- Google Benchmark for performance benchmarks
-- Doxygen documentation with doxygen-awesome-css theme
-- GitHub Actions CI workflows (ASan, lint, coverage, docs)
-- Chromium C++ code style via `.clang-format`
-- `AGENTS.md` for AI coding assistant guidelines
-
-## When to Use This Skill
-
-Use this skill when:
-- The user wants to create a new C++ library or project from scratch
-- The user asks for a "C++ project template" or "C++ repo setup"
-- The user needs CMake + Google Test + benchmark scaffolding
-- The user wants header-only library conventions with optional SIMD-oriented
-  build flags and Doxygen docs
-
-Do **not** use this skill when:
-- Working with an existing codebase (use the `cmake` skill instead)
-- The project is not C++ (use a different skill)
-- The user only wants a single file or snippet
-
-## Workflow
-
-### Step 1: Gather Parameters
-
-Ask the user for (or infer from context):
-- **Project name** (required): Hyphenated lowercase identifier, e.g., `my-lib`
-- **Namespace** (optional): C++ namespace. Defaults to project name with hyphens removed, e.g., `mylib`
-- **Output directory** (optional): Where to create the project. Defaults to current directory.
-
-### Step 2: Run the Generator
-
-Execute the generation script:
-
-```bash
-python3 agentic/cpp/skills/setup-cpp-repo/scripts/init_cpp_project.py \
-    --name <project-name> \
-    [--namespace <namespace>] \
-    [--output-dir <directory>]
-```
-
-For concrete examples, check
-`agentic/local/cpp/skills/setup-cpp-repo/EXAMPLES.md` when present.
-
-### Step 3: Verify the Scaffold
-
-After generation, the project structure should look like:
-
-```
-<project-name>/
-├── CMakeLists.txt
-├── CMakePresets.json
-├── .clang-format
-├── .gitignore
-├── README.md
-├── AGENTS.md
-├── include/
-│   └── <namespace>/
-│       └── <project_snake>.hpp
-├── src/
-│   ├── tests/
-│   │   └── unittests.cpp
-│   ├── benchmarks/
-│   │   └── benchmarks.cpp
-│   └── docs/
-│       ├── Doxyfile.in
-│       └── images/
-├── scripts/
-│   └── coverage_report.sh
-└── .github/
-    └── workflows/
-        ├── build-test.yml
-        ├── linter.yml
-        ├── coverage.yml
-        └── doxygen.yml
-```
-
-### Step 4: Initial Build and Test
-
-Change into the project directory and run an initial build to verify everything works:
-
-```bash
-cd <project-name>
-cmake --preset release
-cmake --build --preset release -j
-./build/release/unittests
-```
-
-If the build and tests pass, the scaffold is ready.
-
-### Step 5: Hand Off to cmake Skill
-
-After project creation, use the **`cmake` skill** (`../cmake/SKILL.md`) for all subsequent build operations. The `cmake` skill documents:
-- Build directory conventions with git short-hash suffixes
-- How to replicate preset settings with custom build directories
-- AddressSanitizer, coverage, and benchmark workflows
-- Best practices for out-of-source builds
-
-## Customization Guide
-
-### Adding More Test Executables
-
-Edit `CMakeLists.txt` and add new `add_executable` blocks under the `if(<PROJECT_UPPER>_TESTS)` section, following the pattern of the existing `unittests` target.
-
-Update `scripts/coverage_report.sh` to run any new test binaries.
-
-Update `.github/workflows/build-test.yml` to execute new test binaries in CI.
-
-### Adding More Benchmark Executables
-
-Edit `CMakeLists.txt` and add new `add_executable` blocks under the `if(<PROJECT_UPPER>_BENCHMARKS)` section, following the pattern of the existing `benchmarks` target.
-
-### Adding Third-Party Dependencies
-
-For header-only libraries, prefer `FetchContent` in `CMakeLists.txt`. For compiled libraries, consider vendoring or using a package manager (Conan, vcpkg).
-
-### Modifying Doxygen Configuration
-
-Edit `src/docs/Doxyfile.in`. The generated version is intentionally minimal (only non-default settings). Add or override settings as needed. Run `doxygen -g` to see all available options.
-
-## Reference
-
-See `references/project_structure.md` for a detailed breakdown of every generated file and its purpose.
diff --git a/agentic/cpp/skills/setup-cpp-repo/references/project_structure.md b/agentic/cpp/skills/setup-cpp-repo/references/project_structure.md
deleted file mode 100644
index 6bf3236..0000000
--- a/agentic/cpp/skills/setup-cpp-repo/references/project_structure.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Generated Project Structure Reference
-
-This document describes every file and directory generated by `init_cpp_project.py` and its purpose.
-
-## Root Files
-
-### `CMakeLists.txt`
-Main CMake configuration. Defines:
-- C++20 standard requirements
-- `MARCH` cache variable (defaults to `native`)
-- Optional SIMD fallback flag when the generated project enables SIMD-specific
-  code paths
-- `ENABLE_ADDRESS_SANITIZER` option for ASan builds
-- `<PROJECT_UPPER>_COVERAGE` option for gcov instrumentation
-- Build options: `<PROJECT_UPPER>_TESTS`, `<PROJECT_UPPER>_BENCHMARKS`, `<PROJECT_UPPER>_DIAGNOSTICS`, `<PROJECT_UPPER>_DOCS`
-- FetchContent dependencies: Google Test, Google Benchmark, spdlog (diagnostics only), Doxygen theme
-- Test executable: `unittests`
-- Benchmark executable: `benchmarks`
-- Custom target: `docs` (when Doxygen is enabled)
-
-### `CMakePresets.json`
-CMake presets (version 4) with a hidden `base` preset. Defines presets for:
-- `debug` — Debug build
-- `release` — Release build
-- `benchmarks` — Release with benchmarks enabled
-- `benchmarks-diagnostic` — RelWithDebInfo with diagnostics and libpfm
-- `docs` — Documentation build
-- `coverage` — Debug with coverage instrumentation
-- `asan` — Debug with AddressSanitizer
-
-### `.clang-format`
-Chromium-based C++ formatting configuration. Simplified from the full Chromium style by removing Windows-specific include priorities and IPC macro block definitions. Key settings:
-- `BasedOnStyle: Chromium`
-- `Standard: Cpp11`
-- `InsertBraces: true`
-- `InsertNewlineAtEOF: true`
-- `IncludeBlocks: Regroup` with generic priority categories
-
-### `.gitignore`
-Standard C++ project ignores:
-- `build/`, `.vscode/`, `Testing/`
-- `plans/*`, `venv/`, `docs/*`
-- `CMakeUserPresets.json`
-- `_deps/`, gcov outputs (`*.gcda`, `*.gcno`, `*.gcov`)
-
-### `README.md`
-Minimal project README used as the Doxygen main page.
-
-### `AGENTS.md`
-Project documentation for AI coding assistants. Contains:
-- Project overview and architecture conventions
-- Technology stack (C++20, CMake, Google Test, Google Benchmark)
-- Build commands with all CMake options
-- Testing patterns and style guidelines
-- Common tasks for AI agents (adding components, modifying SIMD code, adding tests)
-- Performance philosophy
-
-## Directories
-
-### `include/<namespace>/`
-Header-only library API. Contains a placeholder header (`<project_snake>.hpp`) with:
-- Doxygen file documentation
-- Example function in the project's namespace
-- `#pragma once` guard
-
-### `src/tests/`
-Unit test scaffold. Contains `unittests.cpp` with:
-- Google Test includes
-- Basic assertion test against the placeholder header
-- `gtest_main` supplies the test runner entry point
-
-### `src/benchmarks/`
-Benchmark scaffold. Contains `benchmarks.cpp` with:
-- Google Benchmark includes
-- Example benchmark using `benchmark::DoNotOptimize`
-- `BENCHMARK_MAIN()` macro
-
-### `src/docs/`
-Doxygen configuration. Contains:
-- `Doxyfile.in` — Trimmed Doxygen config (~300 lines vs. 1100+ in full). Only non-default settings are specified. Key templated values:
-  - `PROJECT_NAME`
-  - `INPUT` (points to `include/` and `README.md`)
-  - `STRIP_FROM_PATH` (strips source dir from file paths)
-  - `IMAGE_PATH`
-  - `HTML_EXTRA_STYLESHEET` (doxygen-awesome-css)
-  - `USE_MDFILE_AS_MAINPAGE`
-- `images/` — Empty directory for documentation images
-
-### `scripts/`
-Utility scripts. Contains:
-- `coverage_report.sh` — Runs the `coverage` CMake preset, executes tests, and generates gcov reports. Excludes `_deps/`, `third_party/`, and `src/benchmarks/` from coverage.
-
-### `.github/workflows/`
-CI/CD workflows:
-
-#### `build-test.yml`
-Builds the project with AddressSanitizer and runs unit tests on `ubuntu-latest`. Triggered on pushes and PRs to `main`.
-
-#### `linter.yml`
-Runs `clang-format --dry-run --Werror` on all C/C++ files. Triggered on pushes to `main` and all PRs.
-
-#### `coverage.yml`
-Runs the coverage script and uploads results to Codecov. Also uploads coverage artifacts. Triggered on pushes and PRs to `main`.
-
-#### `doxygen.yml`
-Installs Doxygen, builds documentation with the `docs` preset, and deploys HTML output to GitHub Pages. Triggered on pushes to `main` and manual dispatch.
-
-## Template Substitution
-
-All generated files use these placeholders, replaced by the script:
-
-| Placeholder | Example input | Example output |
-|-------------|---------------|----------------|
-| `{{PROJECT_NAME}}` | `my-lib` | `my-lib` |
-| `{{NAMESPACE}}` | `mylib` | `mylib` |
-| `{{PROJECT_NAME_UPPER}}` | `MY_LIB` | `MY_LIB` |
-| `{{HEADER_NAME}}` | `my_lib.hpp` | `my_lib.hpp` |
diff --git a/agentic/cpp/skills/setup-cpp-repo/scripts/init_cpp_project.py b/agentic/cpp/skills/setup-cpp-repo/scripts/init_cpp_project.py
deleted file mode 100755
index d76624f..0000000
--- a/agentic/cpp/skills/setup-cpp-repo/scripts/init_cpp_project.py
+++ /dev/null
@@ -1,1053 +0,0 @@
-#!/usr/bin/env python3
-"""
-init_cpp_project.py - Scaffold a new C++20 repository following modern C++ conventions.
-
-Usage:
-    init_cpp_project.py --name <project-name> [--namespace <namespace>] [--output-dir <dir>]
-
-Example:
-    init_cpp_project.py --name my-lib --namespace mylib --output-dir .
-"""
-
-import argparse
-import os
-import sys
-from pathlib import Path
-
-
-# ---------------------------------------------------------------------------
-# Helper functions
-# ---------------------------------------------------------------------------
-
-def to_upper(name: str) -> str:
-    """Convert project name to uppercase with underscores."""
-    return name.replace("-", "_").upper()
-
-
-def to_snake(name: str) -> str:
-    """Convert project name to snake_case for filenames."""
-    return name.replace("-", "_")
-
-
-# ---------------------------------------------------------------------------
-# Templates
-# ---------------------------------------------------------------------------
-
-CMAKE_LISTS_TXT = """cmake_minimum_required(VERSION 3.18)
-project({{PROJECT_NAME}})
-
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-set(MARCH "native" CACHE STRING "march compiler flag")
-add_compile_options("-march=${MARCH}")
-message(STATUS "MARCH is '${MARCH}'")
-
-option({{PROJECT_NAME_UPPER}}_DISABLE_WIDE_SIMD "Disable wide SIMD instructions" OFF)
-if({{PROJECT_NAME_UPPER}}_DISABLE_WIDE_SIMD)
-    add_compile_options("-mno-avx512f")
-    message(STATUS "{{PROJECT_NAME_UPPER}}_DISABLE_WIDE_SIMD is ON")
-endif()
-
-option(ENABLE_ADDRESS_SANITIZER "Enable AddressSanitizer" OFF)
-if(ENABLE_ADDRESS_SANITIZER)
-    add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-    add_link_options(-fsanitize=address)
-    message(STATUS "AddressSanitizer is ON")
-endif()
-
-option({{PROJECT_NAME_UPPER}}_COVERAGE "Enable coverage instrumentation" OFF)
-if({{PROJECT_NAME_UPPER}}_COVERAGE)
-    add_compile_options(-O0 -g --coverage)
-    add_link_options(--coverage)
-    message(STATUS "Coverage instrumentation is ON")
-endif()
-
-# ---------------------------------------------------------------------------
-# Build options
-# ---------------------------------------------------------------------------
-option({{PROJECT_NAME_UPPER}}_TESTS "Build unit tests" ON)
-option({{PROJECT_NAME_UPPER}}_BENCHMARKS "Build benchmarks" OFF)
-option({{PROJECT_NAME_UPPER}}_DIAGNOSTICS "Include diagnostic logs" OFF)
-option({{PROJECT_NAME_UPPER}}_DOCS "Build Doxygen documentation" OFF)
-
-if({{PROJECT_NAME_UPPER}}_DIAGNOSTICS)
-    add_compile_definitions({{PROJECT_NAME_UPPER}}_DIAGNOSTICS)
-    set({{PROJECT_NAME_UPPER}}_DIAGNOSTICS_LIBS spdlog::spdlog_header_only)
-endif()
-
-# ---------------------------------------------------------------------------
-# Dependencies (fetched only when needed)
-# ---------------------------------------------------------------------------
-include(FetchContent)
-
-if({{PROJECT_NAME_UPPER}}_DIAGNOSTICS)
-    set(SPDLOG_BUILD_SHARED OFF CACHE BOOL "" FORCE)
-    set(SPDLOG_BUILD_EXAMPLE OFF CACHE BOOL "" FORCE)
-    set(SPDLOG_BUILD_TESTING OFF CACHE BOOL "" FORCE)
-    set(SPDLOG_INSTALL OFF CACHE BOOL "" FORCE)
-    FetchContent_Declare(
-        spdlog
-        GIT_REPOSITORY https://github.com/gabime/spdlog.git
-        GIT_TAG v1.14.1
-    )
-    FetchContent_MakeAvailable(spdlog)
-endif()
-
-if({{PROJECT_NAME_UPPER}}_BENCHMARKS)
-    FetchContent_Declare(
-        googlebenchmark
-        GIT_REPOSITORY https://github.com/google/benchmark.git
-        GIT_TAG v1.9.4
-    )
-    set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Disable Google Benchmark tests")
-    FetchContent_MakeAvailable(googlebenchmark)
-endif()
-
-if({{PROJECT_NAME_UPPER}}_TESTS)
-    if(NOT TARGET gtest_main)
-        FetchContent_Declare(
-            googletest
-            GIT_REPOSITORY https://github.com/google/googletest.git
-            GIT_TAG v1.17.0
-        )
-        FetchContent_MakeAvailable(googletest)
-    endif()
-    include(GoogleTest)
-endif()
-
-# ---------------------------------------------------------------------------
-# Unit tests
-# ---------------------------------------------------------------------------
-if({{PROJECT_NAME_UPPER}}_TESTS)
-    enable_testing()
-
-    add_executable(unittests
-        src/tests/unittests.cpp)
-    target_include_directories(unittests
-        PUBLIC include)
-    target_link_libraries(unittests
-        gtest_main
-        ${{{PROJECT_NAME_UPPER}}_DIAGNOSTICS_LIBS})
-    gtest_discover_tests(unittests)
-endif()
-
-# ---------------------------------------------------------------------------
-# Benchmarks
-# ---------------------------------------------------------------------------
-if({{PROJECT_NAME_UPPER}}_BENCHMARKS)
-    add_executable(benchmarks
-        src/benchmarks/benchmarks.cpp)
-    target_include_directories(benchmarks
-        PUBLIC include)
-    target_link_libraries(benchmarks
-        benchmark
-        benchmark_main
-        ${{{PROJECT_NAME_UPPER}}_DIAGNOSTICS_LIBS})
-endif()
-
-# ---------------------------------------------------------------------------
-# Documentation (Doxygen)
-# ---------------------------------------------------------------------------
-if({{PROJECT_NAME_UPPER}}_DOCS)
-    find_package(Doxygen REQUIRED)
-
-    FetchContent_Declare(
-        doxygen-awesome-css
-        URL https://github.com/jothepro/doxygen-awesome-css/archive/refs/heads/main.zip
-    )
-    FetchContent_MakeAvailable(doxygen-awesome-css)
-
-    FetchContent_GetProperties(doxygen-awesome-css SOURCE_DIR AWESOME_CSS_DIR)
-
-    set(DOXYFILE_IN ${CMAKE_CURRENT_SOURCE_DIR}/src/docs/Doxyfile.in)
-    set(DOXYFILE_OUT ${CMAKE_CURRENT_BINARY_DIR}/docs/Doxyfile)
-    configure_file(${DOXYFILE_IN} ${DOXYFILE_OUT} @ONLY)
-
-    add_custom_target(docs
-        COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYFILE_OUT}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-        COMMENT "Generating API documentation with Doxygen"
-        VERBATIM)
-endif()
-"""
-
-CMAKE_PRESETS_JSON = """{
-    "version": 4,
-    "cmakeMinimumRequired": {
-        "major": 3,
-        "minor": 18,
-        "patch": 0
-    },
-    "configurePresets": [
-        {
-            "name": "base",
-            "hidden": true,
-            "cacheVariables": {
-                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON"
-            }
-        },
-        {
-            "name": "debug",
-            "displayName": "Debug",
-            "inherits": "base",
-            "binaryDir": "${sourceDir}/build/debug",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Debug"
-            }
-        },
-        {
-            "name": "release",
-            "displayName": "Release",
-            "inherits": "base",
-            "binaryDir": "${sourceDir}/build/release",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Release"
-            }
-        },
-        {
-            "name": "benchmarks",
-            "displayName": "Benchmarks",
-            "inherits": "base",
-            "binaryDir": "${sourceDir}/build/benchmarks",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Release",
-                "{{PROJECT_NAME_UPPER}}_BENCHMARKS": "ON"
-            }
-        },
-        {
-            "name": "benchmarks-diagnostic",
-            "displayName": "Benchmarks diagnostic build",
-            "inherits": "base",
-            "binaryDir": "${sourceDir}/build/release-with-deb",
-            "cacheVariables": {
-                "BENCHMARK_ENABLE_LIBPFM": "ON",
-                "CMAKE_BUILD_TYPE": "RelWithDebInfo",
-                "{{PROJECT_NAME_UPPER}}_DIAGNOSTICS": "ON",
-                "{{PROJECT_NAME_UPPER}}_BENCHMARKS": "ON"
-            }
-        },
-        {
-            "name": "docs",
-            "displayName": "Docs",
-            "inherits": "base",
-            "binaryDir": "${sourceDir}/build/docs",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Release",
-                "{{PROJECT_NAME_UPPER}}_DOCS": "ON"
-            }
-        },
-        {
-            "name": "coverage",
-            "displayName": "Coverage",
-            "inherits": "base",
-            "binaryDir": "${sourceDir}/build/coverage",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Debug",
-                "{{PROJECT_NAME_UPPER}}_BENCHMARKS": "OFF",
-                "{{PROJECT_NAME_UPPER}}_COVERAGE": "ON"
-            }
-        },
-        {
-            "name": "asan",
-            "displayName": "AddressSanitizer",
-            "inherits": "base",
-            "binaryDir": "${sourceDir}/build/asan",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Debug",
-                "{{PROJECT_NAME_UPPER}}_BENCHMARKS": "OFF",
-                "ENABLE_ADDRESS_SANITIZER": "ON"
-            }
-        }
-    ],
-    "buildPresets": [
-        {
-            "name": "debug",
-            "displayName": "Build Debug",
-            "configurePreset": "debug"
-        },
-        {
-            "name": "release",
-            "displayName": "Build Release",
-            "configurePreset": "release"
-        },
-        {
-            "name": "benchmarks",
-            "displayName": "Build Benchmarks",
-            "configurePreset": "benchmarks"
-        },
-        {
-            "name": "benchmarks-diagnostic",
-            "displayName": "Benchmarks diagnostic",
-            "configurePreset": "benchmarks-diagnostic"
-        },
-        {
-            "name": "docs",
-            "displayName": "Build Docs",
-            "configurePreset": "docs",
-            "targets": [
-                "docs"
-            ]
-        },
-        {
-            "name": "coverage",
-            "displayName": "Build Coverage",
-            "configurePreset": "coverage"
-        },
-        {
-            "name": "asan",
-            "displayName": "Build AddressSanitizer",
-            "configurePreset": "asan"
-        }
-    ]
-}
-"""
-
-CLANG_FORMAT = """# Defines the Chromium style for automatic reformatting.
-# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
-BasedOnStyle: Chromium
-# This defaults to 'Auto'. Explicitly set it for a while, so that
-# 'vector<vector<int> >' in existing files gets formatted to
-# 'vector<vector<int>>'. ('Auto' means that clang-format will only use
-# 'int>>' if the file already contains at least one such instance.)
-Standard: Cpp11
-
-# TODO(crbug.com/1392808): Remove when InsertBraces has been upstreamed into
-# the Chromium style (is implied by BasedOnStyle: Chromium).
-InsertBraces: true
-InsertNewlineAtEOF: true
-
-# Sort #includes by following
-# https://google.github.io/styleguide/cppguide.html#Names_and_Order_of_Includes
-IncludeBlocks: Regroup
-IncludeCategories:
-  # C system headers.
-  - Regex:           '^<.*\\.h>'
-    Priority:        1
-  # C++ standard library headers.
-  - Regex:           '^<.*>'
-    Priority:        2
-  # Project headers (quoted includes).
-  - Regex:           '^".*"'
-    Priority:        3
-  # Other libraries.
-  - Regex:           '.*'
-    Priority:        4
-"""
-
-GITIGNORE = """build/
-.vscode/
-Testing/
-plans/*
-venv/
-docs/*
-src/docs/presentations/*
-CMakeUserPresets.json
-_deps/
-*.gcda
-*.gcno
-*.gcov
-"""
-
-README_MD = """# {{PROJECT_NAME}}
-
-{{PROJECT_NAME}} is a C++20 header-only library.
-
-## Build
-
-```bash
-cmake --preset release
-cmake --build --preset release -j
-./build/release/unittests
-```
-"""
-
-AGENTS_MD = """# AGENTS.md - AI Coding Assistant Guidelines for {{PROJECT_NAME}}
-
-## Project Overview
-
-{{PROJECT_NAME}} is a **C++20 header-only library**. It provides [TODO: brief description].
-
-## Skills
-
-Shared C++ agent skills live in `agentic/cpp/skills` when this repository
-vendors the shared skills subtree. Project-specific examples live in
-`agentic/local/cpp/skills`.
-
-## Architecture
-
-### Project Layout Conventions
-
-- **`include/`**: Header-only library API (all implementations here, no `.cpp` files)
-- **`src/*_tests.cpp`**: Unit tests (Google Test)
-- **`src/*_benchmarks.cpp`**: Performance benchmarks (Google Benchmark)
-- **`src/docs/`**: Doxygen configuration
-
-### Key Design Decisions
-
-1. **Header-only library**: All code in `include/`; no compiled library.
-2. **Non-owning spans**: Use `std::span<const T>` for external data where appropriate.
-3. **SIMD conditional compilation**: Use `#ifdef {{PROJECT_NAME_UPPER}}_AVX512_SUPPORT` / `{{PROJECT_NAME_UPPER}}_AVX2_SUPPORT` with scalar fallbacks.
-4. **Target domain**: Optimized for practical data sizes.
-5. **Platform**: Linux/Unix is the primary target platform.
-
-### Why Header-Only?
-
-- **SIMD flexibility**: Users compile with their target `-march` flags.
-- **Better inlining**: Compiler sees full implementation.
-- **No ABI issues**: Works across compilers and standard library versions.
-- **Easy integration**: Users just `#include` headers.
-- **Template-friendly**: No explicit instantiation needed.
-
-## Technology Stack
-
-- **Language**: C++20 (required features: `std::span`, `std::popcount`, `<bit>`)
-- **Build**: CMake >= 3.18
-- **Testing**: Google Test v1.17.0
-- **Benchmarking**: Google Benchmark v1.9.4
-- **SIMD**: AVX-512 (primary), AVX2 (fallback), scalar fallbacks
-- **Style**: Chromium C++ style (`.clang-format`)
-
-### Dependencies
-
-The library itself is header-only and has **no runtime dependencies**. Build-time dependencies are managed via CMake FetchContent and controlled by options:
-
-| Option | Default | What it enables |
-|--------|---------|-----------------|
-| `{{PROJECT_NAME_UPPER}}_TESTS` | `ON` | Unit tests (fetches Google Test) |
-| `{{PROJECT_NAME_UPPER}}_BENCHMARKS` | `OFF` | Benchmarks (fetches Google Benchmark) |
-
-## Build Commands
-
-```bash
-# Standard build (Release)
-cmake -B build/release -DCMAKE_BUILD_TYPE=Release
-cmake --build build/release -j
-
-# Debug build
-cmake -B build/debug -DCMAKE_BUILD_TYPE=Debug
-cmake --build build/debug -j
-
-# Without wide SIMD
-cmake -B build/release -D{{PROJECT_NAME_UPPER}}_DISABLE_WIDE_SIMD=ON
-cmake --build build/release -j
-
-# With AddressSanitizer
-cmake -B build/asan -DENABLE_ADDRESS_SANITIZER=ON
-cmake --build build/asan -j
-
-# Custom march flag
-cmake -B build/release -DMARCH=icelake-client
-cmake --build build/release -j
-
-# Tests only (no benchmarks)
-cmake -B build/release -D{{PROJECT_NAME_UPPER}}_BENCHMARKS=OFF
-cmake --build build/release -j
-```
-
-## Testing
-
-### Running Tests
-
-```bash
-./build/release/unittests
-```
-
-### Testing Patterns
-
-- **Differential testing**: Compare against naive reference implementations.
-- **Randomized testing**: Random inputs with configurable seed.
-- **Exhaustive short inputs**: Test all patterns for small sizes.
-
-## Code Style Guidelines
-
-1. **Formatting**: Run `clang-format` before committing (Chromium style)
-2. **Namespace**: All library code in `{{NAMESPACE}}` namespace
-3. **Documentation**: Use Doxygen-style comments for public API
-4. **Constants**: Use `constexpr` for compile-time values
-5. **Alignment**: Be aware of data alignment; prefer 64-byte aligned array allocations where performance matters
-
-## CI/CD Workflows
-
-- **build-test.yml**: Builds and runs tests with AddressSanitizer
-- **linter.yml**: Clang-format checks on all C/C++ files
-- **coverage.yml**: Coverage reporting with codecov upload
-- **doxygen.yml**: Documentation generation and GitHub Pages deployment
-
-## Common Tasks for AI Agents
-
-### Adding a New Component
-
-1. Create header in `include/{{NAMESPACE}}/` with Doxygen documentation
-2. Add unit tests in `src/tests/<name>_tests.cpp`
-3. Add benchmarks in `src/benchmarks/<name>_benchmarks.cpp`
-4. Update `CMakeLists.txt` with new executables
-5. Run `clang-format` on new files
-
-### Modifying SIMD Code
-
-1. Provide implementations for:
-   - Wide SIMD (`#ifdef {{PROJECT_NAME_UPPER}}_AVX512_SUPPORT`)
-   - AVX2 (`#ifdef {{PROJECT_NAME_UPPER}}_AVX2_SUPPORT`)
-   - Scalar fallback
-2. Test with `-D{{PROJECT_NAME_UPPER}}_DISABLE_WIDE_SIMD=ON` to verify fallback works
-3. Benchmark to ensure performance is maintained
-
-### Adding Tests
-
-1. Use Google Test framework
-2. Include naive reference implementation for differential testing
-3. Add edge cases: empty input, single element, boundary conditions
-4. Use random testing with configurable seed for reproducibility
-
-## Performance Philosophy
-
-- **Goal**: Best practical performance (not just asymptotic complexity)
-- **Approach**: Benchmark-driven optimization using Google Benchmark
-- **SIMD**: Leverage vectorized operations where beneficial
-- **Cache efficiency**: Align data structures to cache line boundaries (64 bytes)
-"""
-
-DOXYFILE_IN = """# Doxyfile
-
-DOXYFILE_ENCODING      = UTF-8
-PROJECT_NAME           = "{{PROJECT_NAME}}"
-PROJECT_NUMBER         =
-PROJECT_BRIEF          =
-PROJECT_LOGO           =
-PROJECT_ICON           =
-OUTPUT_DIRECTORY       = docs
-CREATE_SUBDIRS         = NO
-CREATE_SUBDIRS_LEVEL   = 8
-ALLOW_UNICODE_NAMES    = NO
-OUTPUT_LANGUAGE        = English
-BRIEF_MEMBER_DESC      = YES
-REPEAT_BRIEF           = YES
-ABBREVIATE_BRIEF       = "The $name class" \
-                         "The $name widget" \
-                         "The $name file" \
-                         is \
-                         provides \
-                         specifies \
-                         contains \
-                         represents \
-                         a \
-                         an \
-                         the
-ALWAYS_DETAILED_SEC    = NO
-INLINE_INHERITED_MEMB  = NO
-FULL_PATH_NAMES        = YES
-STRIP_FROM_PATH        = @CMAKE_CURRENT_SOURCE_DIR@
-STRIP_FROM_INC_PATH    =
-SHORT_NAMES            = NO
-JAVADOC_AUTOBRIEF      = NO
-JAVADOC_BANNER         = NO
-QT_AUTOBRIEF           = NO
-MULTILINE_CPP_IS_BRIEF = NO
-PYTHON_DOCSTRING       = YES
-INHERIT_DOCS           = YES
-SEPARATE_MEMBER_PAGES  = NO
-TAB_SIZE               = 4
-ALIASES                =
-OPTIMIZE_OUTPUT_FOR_C  = NO
-OPTIMIZE_OUTPUT_JAVA   = NO
-OPTIMIZE_FOR_FORTRAN   = NO
-OPTIMIZE_OUTPUT_VHDL   = NO
-OPTIMIZE_OUTPUT_SLICE  = NO
-EXTENSION_MAPPING      =
-MARKDOWN_SUPPORT       = YES
-MARKDOWN_STRICT        = YES
-TOC_INCLUDE_HEADINGS   = 6
-MARKDOWN_ID_STYLE      = DOXYGEN
-AUTOLINK_SUPPORT       = YES
-AUTOLINK_IGNORE_WORDS  =
-BUILTIN_STL_SUPPORT    = NO
-CPP_CLI_SUPPORT        = NO
-SIP_SUPPORT            = NO
-IDL_PROPERTY_SUPPORT   = YES
-DISTRIBUTE_GROUP_DOC   = NO
-GROUP_NESTED_COMPOUNDS = NO
-SUBGROUPING            = YES
-INLINE_GROUPED_CLASSES = NO
-INLINE_SIMPLE_STRUCTS  = NO
-TYPEDEF_HIDES_STRUCT   = NO
-LOOKUP_CACHE_SIZE      = 0
-NUM_PROC_THREADS       = 1
-TIMESTAMP              = NO
-EXTRACT_ALL            = NO
-EXTRACT_PRIVATE        = NO
-EXTRACT_PRIV_VIRTUAL   = NO
-EXTRACT_PACKAGE        = NO
-EXTRACT_STATIC         = NO
-EXTRACT_LOCAL_CLASSES  = YES
-EXTRACT_LOCAL_METHODS  = NO
-EXTRACT_ANON_NSPACES   = NO
-RESOLVE_UNNAMED_PARAMS = YES
-HIDE_UNDOC_MEMBERS     = NO
-HIDE_UNDOC_CLASSES     = NO
-HIDE_UNDOC_NAMESPACES  = YES
-HIDE_FRIEND_COMPOUNDS  = NO
-HIDE_IN_BODY_DOCS      = NO
-INTERNAL_DOCS          = NO
-CASE_SENSE_NAMES       = SYSTEM
-HIDE_SCOPE_NAMES       = NO
-HIDE_COMPOUND_REFERENCE= NO
-SHOW_HEADERFILE        = YES
-SHOW_INCLUDE_FILES     = YES
-SHOW_GROUPED_MEMB_INC  = NO
-FORCE_LOCAL_INCLUDES   = NO
-INLINE_INFO            = YES
-SORT_MEMBER_DOCS       = YES
-SORT_BRIEF_DOCS        = NO
-SORT_MEMBERS_CTORS_1ST = NO
-SORT_GROUP_NAMES       = NO
-SORT_BY_SCOPE_NAME     = NO
-STRICT_PROTO_MATCHING  = NO
-GENERATE_TODOLIST      = YES
-GENERATE_TESTLIST      = YES
-GENERATE_BUGLIST       = YES
-GENERATE_DEPRECATEDLIST= YES
-ENABLED_SECTIONS       =
-MAX_INITIALIZER_LINES  = 30
-SHOW_USED_FILES        = YES
-SHOW_FILES             = YES
-SHOW_NAMESPACES        = YES
-FILE_VERSION_FILTER    =
-LAYOUT_FILE            =
-CITE_BIB_FILES         =
-EXTERNAL_TOOL_PATH     =
-QUIET                  = NO
-WARNINGS               = YES
-WARN_IF_UNDOCUMENTED   = YES
-WARN_IF_DOC_ERROR      = YES
-WARN_IF_INCOMPLETE_DOC = YES
-WARN_NO_PARAMDOC       = NO
-WARN_IF_UNDOC_ENUM_VAL = NO
-WARN_LAYOUT_FILE       = YES
-WARN_AS_ERROR          = NO
-WARN_FORMAT            = "$file:$line: $text"
-WARN_LINE_FORMAT       = "at line $line of file $file"
-WARN_LOGFILE           =
-INPUT                  = @CMAKE_CURRENT_SOURCE_DIR@/include \
-                         @CMAKE_CURRENT_SOURCE_DIR@/README.md
-INPUT_ENCODING         = UTF-8
-INPUT_FILE_ENCODING    =
-FILE_PATTERNS          = *.c \
-                         *.cc \
-                         *.cxx \
-                         *.cpp \
-                         *.h \
-                         *.hh \
-                         *.hxx \
-                         *.hpp
-RECURSIVE              = YES
-EXCLUDE                =
-EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
-EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           =
-EXAMPLE_PATTERNS       = *
-EXAMPLE_RECURSIVE      = NO
-IMAGE_PATH             = @CMAKE_CURRENT_SOURCE_DIR@/src/docs/images
-INPUT_FILTER           =
-FILTER_PATTERNS        =
-FILTER_SOURCE_FILES    = NO
-FILTER_SOURCE_PATTERNS =
-USE_MDFILE_AS_MAINPAGE = @CMAKE_CURRENT_SOURCE_DIR@/README.md
-IMPLICIT_DIR_DOCS      = YES
-FORTRAN_COMMENT_AFTER  = 72
-SOURCE_BROWSER         = NO
-INLINE_SOURCES         = NO
-STRIP_CODE_COMMENTS    = YES
-REFERENCED_BY_RELATION = NO
-REFERENCES_RELATION    = NO
-REFERENCES_LINK_SOURCE = YES
-SOURCE_TOOLTIPS        = YES
-USE_HTAGS              = NO
-VERBATIM_HEADERS       = YES
-CLANG_ASSISTED_PARSING = NO
-CLANG_ADD_INC_PATHS    = YES
-CLANG_OPTIONS          =
-CLANG_DATABASE_PATH    =
-ALPHABETICAL_INDEX     = YES
-IGNORE_PREFIX          =
-GENERATE_HTML          = YES
-HTML_OUTPUT            = html
-HTML_FILE_EXTENSION    = .html
-HTML_HEADER            =
-HTML_FOOTER            =
-HTML_STYLESHEET        =
-HTML_EXTRA_STYLESHEET  = @AWESOME_CSS_DIR@/doxygen-awesome.css
-HTML_EXTRA_FILES       =
-HTML_COLORSTYLE        = AUTO_LIGHT
-HTML_COLORSTYLE_HUE    = 220
-HTML_COLORSTYLE_SAT    = 100
-HTML_COLORSTYLE_GAMMA  = 80
-HTML_DYNAMIC_MENUS     = YES
-HTML_DYNAMIC_SECTIONS  = NO
-HTML_CODE_FOLDING      = YES
-HTML_COPY_CLIPBOARD    = YES
-HTML_PROJECT_COOKIE    =
-HTML_INDEX_NUM_ENTRIES = 100
-GENERATE_DOCSET        = NO
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-DOCSET_FEEDURL         =
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-DOCSET_PUBLISHER_NAME  = Publisher
-GENERATE_HTMLHELP      = NO
-CHM_FILE               =
-HHC_LOCATION           =
-GENERATE_CHI           = NO
-CHM_INDEX_ENCODING     =
-BINARY_TOC             = NO
-TOC_EXPAND             = NO
-SITEMAP_URL            =
-GENERATE_QHP           = NO
-QCH_FILE               =
-QHP_NAMESPACE          = org.doxygen.Project
-QHP_VIRTUAL_FOLDER     = doc
-QHP_CUST_FILTER_NAME   =
-QHP_CUST_FILTER_ATTRS  =
-QHP_SECT_FILTER_ATTRS  =
-QHG_LOCATION           =
-GENERATE_ECLIPSEHELP   = NO
-ECLIPSE_DOC_ID         = org.doxygen.Project
-DISABLE_INDEX          = NO
-GENERATE_TREEVIEW      = YES
-PAGE_OUTLINE_PANEL     = YES
-FULL_SIDEBAR           = NO
-ENUM_VALUES_PER_LINE   = 4
-SHOW_ENUM_VALUES       = NO
-TREEVIEW_WIDTH         = 250
-EXT_LINKS_IN_WINDOW    = NO
-OBFUSCATE_EMAILS       = YES
-HTML_FORMULA_FORMAT    = png
-FORMULA_FONTSIZE       = 10
-FORMULA_MACROFILE      =
-USE_MATHJAX            = NO
-MATHJAX_VERSION        = MathJax_2
-MATHJAX_FORMAT         = HTML-CSS
-MATHJAX_RELPATH        =
-MATHJAX_EXTENSIONS     =
-MATHJAX_CODEFILE       =
-SEARCHENGINE           = YES
-SERVER_BASED_SEARCH    = NO
-EXTERNAL_SEARCH        = NO
-SEARCHENGINE_URL       =
-SEARCHDATA_FILE        = searchdata.xml
-EXTERNAL_SEARCH_ID     =
-EXTRA_SEARCH_MAPPINGS  =
-GENERATE_LATEX         = NO
-"""
-
-COVERAGE_REPORT_SH = """#!/usr/bin/env bash
-set -euo pipefail
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-BUILD_DIR="${ROOT_DIR}/build/coverage"
-
-cmake --preset coverage
-cmake --build --preset coverage
-
-"${BUILD_DIR}/unittests"
-
-cd "${BUILD_DIR}"
-find . -name "*.gcda" > gcov_files.txt
-while read -r f; do
-  case "${f}" in
-    *"/_deps/"*|*"/third_party/"*|*"/src/benchmarks/"*)
-      ;;
-    *)
-      gcov -pb "${f}" >> coverage.txt
-      ;;
-  esac
-done < gcov_files.txt
-echo "gcov report written to ${BUILD_DIR}/coverage.txt"
-"""
-
-BUILD_TEST_YML = """name: Tests (ASan)
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  build-and-test:
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Create Build Directory
-      run: mkdir build
-
-    - name: Configure CMake
-      working-directory: ./build
-      run: cmake -D{{PROJECT_NAME_UPPER}}_DISABLE_WIDE_SIMD=ON -DENABLE_ADDRESS_SANITIZER=ON -D{{PROJECT_NAME_UPPER}}_BENCHMARKS=OFF ..
-
-    - name: Build Project
-      working-directory: ./build
-      run: make -j
-
-    - name: Run Unittests
-      working-directory: ./build
-      run: ./unittests
-"""
-
-LINTER_YML = """name: Clang Format Lint
-
-on:
-  pull_request:
-  push:
-    branches: [main]
-
-jobs:
-  clang-format:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install clang-format
-        run: sudo apt-get update && sudo apt-get install -y clang-format
-
-      - name: Run clang-format check
-        run: |
-          mapfile -t FILES < <(find include src -type f \\( -name '*.cpp' -o -name '*.hpp' -o -name '*.cc' -o -name '*.c' -o -name '*.h' \\))
-          clang-format --version
-          if [ ${#FILES[@]} -eq 0 ]; then
-            echo "No C/C++ files found."
-            exit 0
-          fi
-
-          clang-format --dry-run --Werror "${FILES[@]}"
-"""
-
-COVERAGE_YML = """name: coverage
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  coverage:
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Create Build Directory
-      run: mkdir build
-
-    - name: Run coverage
-      run: ./scripts/coverage_report.sh
-
-    - name: Upload to Codecov
-      uses: codecov/codecov-action@v4
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        files: build/coverage/coverage.txt
-        flags: gcov
-        fail_ci_if_error: false
-
-    - name: Upload coverage artifacts
-      uses: actions/upload-artifact@v4
-      with:
-        name: coverage-gcov
-        path: |
-          build/coverage/coverage.txt
-          build/coverage/*.gcov
-"""
-
-DOXYGEN_YML = """# Simple workflow for deploying static content to GitHub Pages
-name: Deploy static content to Pages
-
-on:
-  # Runs on pushes targeting the default branch
-  push:
-    branches: ["main"]
-
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
-
-# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
-permissions:
-  contents: read
-  pages: write
-  id-token: write
-
-# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
-# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
-concurrency:
-  group: "pages"
-  cancel-in-progress: false
-
-jobs:
-  # Single deploy job since we're just deploying
-  deploy:
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Doxygen v1.13.2
-        run: |
-          transformed_version=$(echo "1.13.2" | tr '.' '_')
-          wget https://github.com/doxygen/doxygen/releases/download/Release_${transformed_version}/doxygen-1.13.2.linux.bin.tar.gz
-          tar -xzf doxygen-1.13.2.linux.bin.tar.gz
-          sudo mv doxygen-1.13.2/bin/doxygen /usr/local/bin/doxygen
-        shell: bash
-      - name: Cmake configure
-        run: cmake -S ${{github.workspace}} -B ${{github.workspace}}/build -D{{PROJECT_NAME_UPPER}}_DOCS=ON -D{{PROJECT_NAME_UPPER}}_TESTS=OFF -D{{PROJECT_NAME_UPPER}}_BENCHMARKS=OFF
-      - name: Build docs
-        run: cmake --build ${{github.workspace}}/build --target docs
-      - name: Upload artifact
-        uses: actions/upload-pages-artifact@v3
-        with:
-          # Upload entire repository
-          path: ${{github.workspace}}/build/docs/html
-      - name: Deploy to GitHub Pages
-        id: deployment
-        uses: actions/deploy-pages@v4
-"""
-
-HEADER_HPP = """#pragma once
-
-/**
- * @file {{HEADER_NAME}}
- * @brief Main header for the {{PROJECT_NAME}} library
- */
-
-namespace {{NAMESPACE}} {
-
-/**
- * @brief Example function.
- *
- * TODO: Replace with actual library functionality.
- */
-inline int example() {
-    return 42;
-}
-
-}  // namespace {{NAMESPACE}}
-"""
-
-UNITTESTS_CPP = """#include <gtest/gtest.h>
-
-#include "{{NAMESPACE}}/{{HEADER_NAME}}"
-
-TEST(ExampleTest, BasicAssertion) {
-    EXPECT_EQ({{NAMESPACE}}::example(), 42);
-}
-"""
-
-BENCHMARKS_CPP = """#include <benchmark/benchmark.h>
-
-#include "{{NAMESPACE}}/{{HEADER_NAME}}"
-
-static void BM_Example(benchmark::State& state) {
-    for (auto _ : state) {
-        benchmark::DoNotOptimize({{NAMESPACE}}::example());
-    }
-}
-
-BENCHMARK(BM_Example);
-
-BENCHMARK_MAIN();
-"""
-
-
-# ---------------------------------------------------------------------------
-# Generation logic
-# ---------------------------------------------------------------------------
-
-def generate(args: argparse.Namespace) -> None:
-    project_name = args.name
-    namespace = args.namespace or project_name.replace("-", "")
-    project_name_upper = to_upper(project_name)
-    header_name = f"{to_snake(project_name)}.hpp"
-    output_dir = Path(args.output_dir).resolve() / project_name
-
-    if output_dir.exists():
-        print(f"Error: output directory already exists: {output_dir}")
-        sys.exit(1)
-
-    substitutions = {
-        "{{PROJECT_NAME}}": project_name,
-        "{{NAMESPACE}}": namespace,
-        "{{PROJECT_NAME_UPPER}}": project_name_upper,
-        "{{HEADER_NAME}}": header_name,
-    }
-
-    def sub(text: str) -> str:
-        for key, value in substitutions.items():
-            text = text.replace(key, value)
-        return text
-
-    # Create directories
-    (output_dir / "include" / namespace).mkdir(parents=True)
-    (output_dir / "src" / "tests").mkdir(parents=True)
-    (output_dir / "src" / "benchmarks").mkdir(parents=True)
-    (output_dir / "src" / "docs").mkdir(parents=True)
-    (output_dir / "src" / "docs" / "images").mkdir(parents=True)
-    (output_dir / "scripts").mkdir(parents=True)
-    (output_dir / ".github" / "workflows").mkdir(parents=True)
-
-    # Write files
-    files = {
-        output_dir / "CMakeLists.txt": sub(CMAKE_LISTS_TXT),
-        output_dir / "CMakePresets.json": sub(CMAKE_PRESETS_JSON),
-        output_dir / ".clang-format": sub(CLANG_FORMAT),
-        output_dir / ".gitignore": sub(GITIGNORE),
-        output_dir / "README.md": sub(README_MD),
-        output_dir / "AGENTS.md": sub(AGENTS_MD),
-        output_dir / "src" / "docs" / "Doxyfile.in": sub(DOXYFILE_IN),
-        output_dir / "scripts" / "coverage_report.sh": sub(COVERAGE_REPORT_SH),
-        output_dir / ".github" / "workflows" / "build-test.yml": sub(BUILD_TEST_YML),
-        output_dir / ".github" / "workflows" / "linter.yml": sub(LINTER_YML),
-        output_dir / ".github" / "workflows" / "coverage.yml": sub(COVERAGE_YML),
-        output_dir / ".github" / "workflows" / "doxygen.yml": sub(DOXYGEN_YML),
-        output_dir / "include" / namespace / header_name: sub(HEADER_HPP),
-        output_dir / "src" / "tests" / "unittests.cpp": sub(UNITTESTS_CPP),
-        output_dir / "src" / "benchmarks" / "benchmarks.cpp": sub(BENCHMARKS_CPP),
-    }
-
-    for path, content in files.items():
-        path.write_text(content)
-        print(f"Created: {path.relative_to(output_dir.parent)}")
-
-    # Make coverage script executable
-    (output_dir / "scripts" / "coverage_report.sh").chmod(0o755)
-
-    print(f"\\nProject '{project_name}' generated successfully at {output_dir}")
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Scaffold a new C++20 repository following modern C++ conventions."
-    )
-    parser.add_argument("--name", required=True, help="Project name (e.g., my-lib)")
-    parser.add_argument(
-        "--namespace",
-        help="C++ namespace (defaults to project name with hyphens removed)",
-    )
-    parser.add_argument(
-        "--output-dir",
-        default=".",
-        help="Output directory (default: current directory)",
-    )
-    args = parser.parse_args()
-    generate(args)
-
-
-if __name__ == "__main__":
-    main()

From 8b6b095abe7700533537c5d3ee9b7335997c1026 Mon Sep 17 00:00:00 2001
From: Nikolay Malkovsky <malkovskynv@gmail.com>
Date: Sat, 30 May 2026 22:51:26 +0300
Subject: [PATCH 5/7] chore: add agentic/cpp as a git submodule

---
 .gitmodules | 3 +++
 agentic/cpp | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 agentic/cpp

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..6f6274b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "agentic/cpp"]
+	path = agentic/cpp
+	url = https://github.com/Malkovsky/ai_for_cpp.git
diff --git a/agentic/cpp b/agentic/cpp
new file mode 160000
index 0000000..e1e5aeb
--- /dev/null
+++ b/agentic/cpp
@@ -0,0 +1 @@
+Subproject commit e1e5aeb54ff729b22245d09d6d9cdc4e14b0a60d

From adc0492b902ad6f28c02670b166a865744799409 Mon Sep 17 00:00:00 2001
From: Nikolay Malkovsky <malkovskynv@gmail.com>
Date: Sat, 30 May 2026 23:28:00 +0300
Subject: [PATCH 6/7] Updated agentic

---
 agentic/cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/agentic/cpp b/agentic/cpp
index e1e5aeb..4fa94cd 160000
--- a/agentic/cpp
+++ b/agentic/cpp
@@ -1 +1 @@
-Subproject commit e1e5aeb54ff729b22245d09d6d9cdc4e14b0a60d
+Subproject commit 4fa94cdcfbd0ddbcd77492324b45fcb82e4f85bd

From 3bf81cdb0e0822cbc49231fd21c552b69c30839a Mon Sep 17 00:00:00 2001
From: Nikolay Malkovsky <malkovskynv@gmail.com>
Date: Sun, 31 May 2026 21:44:15 +0300
Subject: [PATCH 7/7] Improved implementation

---
 include/pixie/bits.h                          | 280 +++++++++-
 include/pixie/rmq/bp_plus_minus_one_rmq.h     | 383 ++++++++++++--
 include/pixie/rmq/cartesian_tree_rmq.h        | 116 +++++
 include/pixie/rmq/rmq_base.h                  |  21 +
 include/pixie/rmq/segment_tree.h              |  54 ++
 include/pixie/rmq/sparse_table.h              |  52 ++
 src/benchmarks/bench_rmq.cpp                  | 492 +++++++++++++++++-
 .../excess_positions_benchmarks.cpp           |  64 +++
 src/tests/excess_positions_tests.cpp          |  81 +++
 src/tests/rmq_tests.cpp                       |  21 +
 10 files changed, 1510 insertions(+), 54 deletions(-)

diff --git a/include/pixie/bits.h b/include/pixie/bits.h
index b103600..57e5c16 100644
--- a/include/pixie/bits.h
+++ b/include/pixie/bits.h
@@ -880,6 +880,17 @@ struct ExcessResult {
   size_t offset = 128;
 };
 
+/**
+ * @brief Pair of boundary minimum results for adjacent BP query blocks.
+ *
+ * @details Offsets are block-local: `suffix` is relative to the left (suffix)
+ * block and `prefix` is relative to the right (prefix) block.
+ */
+struct ExcessBoundaryPairResult {
+  ExcessResult suffix;  ///< Minimum over the tail range of the left block.
+  ExcessResult prefix;  ///< Minimum over the head range of the right block.
+};
+
 constexpr int8_t excess_byte_delta_value(uint8_t x) {
   return static_cast<int8_t>(2 * std::popcount(x) - 8);
 }
@@ -1106,15 +1117,16 @@ static inline ExcessResult excess_min_128(const uint64_t* s,
   left = std::min<size_t>(left, 128);
   right = std::min<size_t>(right, 128);
 
+  if (right - left <= 32 && (left & 7u) == 0 && (right & 7u) == 0)
+      [[unlikely]] {
+    return excess_min_128_byte_lut_short(s, left, right);
+  }
+
   int best = prefix_excess_128(s, left);
   size_t best_offset = left;
   if (left == right) {
     return {best, best_offset};
   }
-  if (right - left <= 32 && (left & 7u) == 0 && (right & 7u) == 0)
-      [[unlikely]] {
-    return excess_min_128_byte_lut_short(s, left, right);
-  }
 
 #ifdef PIXIE_AVX2_SUPPORT
   int current = best;
@@ -1130,7 +1142,10 @@ static inline ExcessResult excess_min_128(const uint64_t* s,
 
   const size_t first_full_nibble = bit >> 2;
   const size_t last_full_nibble = right >> 2;
-  if (first_full_nibble < last_full_nibble) {
+  const size_t right_partial_width = bit < right ? (right & 3u) : 0;
+  const size_t end_nibble =
+      last_full_nibble + (right_partial_width == 0 ? 0 : 1);
+  if (first_full_nibble < end_nibble) {
     const __m256i nibbles = excess_nibbles_128_avx2(s);
 
     __m256i ps = _mm256_shuffle_epi8(excess_lut_delta, nibbles);
@@ -1148,19 +1163,34 @@ static inline ExcessResult excess_min_128(const uint64_t* s,
 
     __m256i b = _mm256_permute2x128_si256(ps, ps, 0x08);
     const __m256i excl_ps = _mm256_alignr_epi8(ps, b, 15);
-    const __m256i candidates =
-        _mm256_add_epi8(excl_ps, _mm256_shuffle_epi8(excess_lut_min, nibbles));
+    __m256i local_min = _mm256_shuffle_epi8(excess_lut_min, nibbles);
+    if (right_partial_width != 0) {
+      __m256i partial_min = _mm256_shuffle_epi8(excess_lut_pos0, nibbles);
+      if (right_partial_width >= 2) {
+        partial_min = _mm256_min_epi8(
+            partial_min, _mm256_shuffle_epi8(excess_lut_pos1, nibbles));
+      }
+      if (right_partial_width >= 3) {
+        partial_min = _mm256_min_epi8(
+            partial_min, _mm256_shuffle_epi8(excess_lut_pos2, nibbles));
+      }
+      local_min = _mm256_blendv_epi8(
+          local_min, partial_min,
+          _mm256_cmpeq_epi8(
+              excess_lut_nibble_index,
+              _mm256_set1_epi8(static_cast<int8_t>(last_full_nibble))));
+    }
+    const __m256i partial_candidates = _mm256_add_epi8(excl_ps, local_min);
 
     const __m256i idx = excess_lut_nibble_index;
     const int first_minus_one_value = static_cast<int>(first_full_nibble) - 1;
     const __m256i first_minus_one =
         _mm256_set1_epi8(static_cast<int8_t>(first_minus_one_value));
-    const __m256i last =
-        _mm256_set1_epi8(static_cast<int8_t>(last_full_nibble));
+    const __m256i last = _mm256_set1_epi8(static_cast<int8_t>(end_nibble));
     const __m256i active = _mm256_and_si256(
         _mm256_cmpgt_epi8(idx, first_minus_one), _mm256_cmpgt_epi8(last, idx));
     const __m256i masked_candidates =
-        _mm256_blendv_epi8(_mm256_set1_epi8(127), candidates, active);
+        _mm256_blendv_epi8(_mm256_set1_epi8(127), partial_candidates, active);
 
     __m128i min128 =
         _mm_min_epi8(_mm256_castsi256_si128(masked_candidates),
@@ -1183,12 +1213,25 @@ static inline ExcessResult excess_min_128(const uint64_t* s,
       const uint8_t nibble =
           static_cast<uint8_t>((word >> ((nibble_index & 15u) * 4u)) & 0xFu);
       best = candidate_min;
-      best_offset = static_cast<size_t>(nibble_index) * 4u +
-                    static_cast<size_t>(excess_lut_min_offset[nibble]);
+      if (right_partial_width != 0 && nibble_index == last_full_nibble) {
+        int local = 0;
+        int local_best = 0;
+        size_t local_offset = 1;
+        for (size_t i = 0; i < right_partial_width; ++i) {
+          local += ((nibble >> i) & 1u) != 0 ? 1 : -1;
+          if (i == 0 || local < local_best) {
+            local_best = local;
+            local_offset = i + 1;
+          }
+        }
+        best_offset = static_cast<size_t>(nibble_index) * 4u + local_offset;
+      } else {
+        best_offset = static_cast<size_t>(nibble_index) * 4u +
+                      static_cast<size_t>(excess_lut_min_offset[nibble]);
+      }
     }
 
-    bit = last_full_nibble * 4;
-    current = prefix_excess_128(s, bit);
+    bit = end_nibble * 4;
   }
 
   for (; bit < right; ++bit) {
@@ -1214,6 +1257,215 @@ static inline ExcessResult excess_min_128(const uint64_t* s,
   return {best, best_offset};
 }
 
+/**
+ * @brief Compute disjoint suffix/prefix boundary minima for two 128-bit blocks.
+ *
+ * @details Computes `excess_min_128(suffix_s, suffix_left, 127)` and
+ * `excess_min_128(prefix_s, 0, prefix_right)`. When `suffix_left >
+ * prefix_right`, the AVX2 path packs the two disjoint local ranges into one
+ * nibble stream and shares the prefix-sum/reduction work. Non-disjoint ranges
+ * or offsets above 127 fall back to two independent `excess_min_128` calls.
+ *
+ * @param suffix_s Left boundary block (two 64-bit words).
+ * @param suffix_left First local prefix offset in the suffix range.
+ * @param prefix_s Right boundary block (two 64-bit words).
+ * @param prefix_right Last local prefix offset in the prefix range.
+ * @return `{suffix, prefix}` pair of minima, each with a block-local offset.
+ */
+static inline ExcessBoundaryPairResult excess_min_128_disjoint_suffix_prefix(
+    const uint64_t* suffix_s,
+    size_t suffix_left,
+    const uint64_t* prefix_s,
+    size_t prefix_right) noexcept {
+  suffix_left = std::min<size_t>(suffix_left, 128);
+  prefix_right = std::min<size_t>(prefix_right, 128);
+  if (suffix_left <= prefix_right || suffix_left > 127 || prefix_right > 127) {
+    return {excess_min_128(suffix_s, suffix_left, 127),
+            excess_min_128(prefix_s, 0, prefix_right)};
+  }
+
+#ifdef PIXIE_AVX2_SUPPORT
+  ExcessResult prefix{0, 0};  // Empty range [0, 0]: excess 0 at offset 0.
+
+  int suffix_best = prefix_excess_128(suffix_s, suffix_left);
+  ExcessResult suffix{suffix_best, suffix_left};
+  size_t suffix_bit = suffix_left;
+  int suffix_current = suffix_best;
+  for (; suffix_bit < 127 && (suffix_bit & 3u) != 0; ++suffix_bit) {
+    suffix_current +=
+        ((suffix_s[suffix_bit >> 6] >> (suffix_bit & 63)) & 1ull) != 0 ? 1 : -1;
+    const size_t offset = suffix_bit + 1;
+    if (suffix_current < suffix.min_excess) {
+      suffix = {suffix_current, offset};
+    }
+  }
+
+  alignas(32) int8_t lane_nibbles[32] = {};
+  alignas(32) int8_t lane_indices[32] = {};
+  alignas(32) int8_t prefix_active[32] = {};
+  alignas(32) int8_t suffix_active[32] = {};
+  alignas(32) int8_t partial_widths[32] = {};
+  alignas(32) int8_t prefix_partial[32] = {};
+  alignas(32) int8_t suffix_partial[32] = {};
+
+  size_t lane = 0;
+  int prefix_artificial_delta = 0;  // Delta sum over packed prefix nibbles.
+  const size_t prefix_last_nibble = prefix_right >> 2;
+  const size_t prefix_partial_width = prefix_right & 3u;
+  const size_t prefix_end_nibble =
+      prefix_last_nibble + (prefix_partial_width == 0 ? 0 : 1);
+  for (size_t nibble_index = 0; nibble_index < prefix_end_nibble;
+       ++nibble_index) {
+    const uint64_t word = prefix_s[nibble_index >> 4];
+    const uint8_t nibble =
+        static_cast<uint8_t>((word >> ((nibble_index & 15u) * 4u)) & 0xFu);
+    lane_nibbles[lane] = static_cast<int8_t>(nibble);
+    lane_indices[lane] = static_cast<int8_t>(nibble_index);
+    prefix_active[lane] = static_cast<int8_t>(0xFFu);
+    if (prefix_partial_width != 0 && nibble_index == prefix_last_nibble) {
+      partial_widths[lane] = static_cast<int8_t>(prefix_partial_width);
+      prefix_partial[lane] = static_cast<int8_t>(0xFFu);
+    }
+    prefix_artificial_delta += 2 * std::popcount(nibble) - 4;
+    ++lane;
+  }
+
+  if (suffix_bit < 127) {
+    const size_t suffix_first_nibble = suffix_bit >> 2;
+    for (size_t nibble_index = suffix_first_nibble; nibble_index < 32;
+         ++nibble_index) {
+      const uint64_t word = suffix_s[nibble_index >> 4];
+      const uint8_t nibble =
+          static_cast<uint8_t>((word >> ((nibble_index & 15u) * 4u)) & 0xFu);
+      lane_nibbles[lane] = static_cast<int8_t>(nibble);
+      lane_indices[lane] = static_cast<int8_t>(nibble_index);
+      suffix_active[lane] = static_cast<int8_t>(0xFFu);
+      if (nibble_index == 31) {
+        partial_widths[lane] = 3;  // Bit 127 would yield offset 128 > 127.
+        suffix_partial[lane] = static_cast<int8_t>(0xFFu);
+      }
+      ++lane;
+    }
+  }
+
+  if (lane > 32) {  // Defensive: disjoint ranges need <= 32 lanes.
+    return {excess_min_128(suffix_s, suffix_left, 127),
+            excess_min_128(prefix_s, 0, prefix_right)};
+  }
+
+  __m256i nibbles =
+      _mm256_load_si256(reinterpret_cast<const __m256i*>(lane_nibbles));
+  __m256i ps = _mm256_shuffle_epi8(excess_lut_delta, nibbles);
+  ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 1));
+  ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 2));
+  ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 4));
+  ps = _mm256_add_epi8(ps, _mm256_slli_si256(ps, 8));
+
+  __m128i ps_lo = _mm256_castsi256_si128(ps);
+  __m128i ps_hi = _mm256_extracti128_si256(ps, 1);
+  __m128i carry =
+      _mm_set1_epi8(static_cast<int8_t>(_mm_extract_epi8(ps_lo, 15)));
+  ps_hi = _mm_add_epi8(ps_hi, carry);
+  ps = _mm256_inserti128_si256(_mm256_castsi128_si256(ps_lo), ps_hi, 1);
+
+  __m256i b = _mm256_permute2x128_si256(ps, ps, 0x08);
+  const __m256i excl_ps = _mm256_alignr_epi8(ps, b, 15);
+
+  __m256i local_min = _mm256_shuffle_epi8(excess_lut_min, nibbles);
+  const __m256i width =
+      _mm256_load_si256(reinterpret_cast<const __m256i*>(partial_widths));
+  __m256i partial_min = _mm256_shuffle_epi8(excess_lut_pos0, nibbles);
+  partial_min = _mm256_min_epi8(
+      partial_min,
+      _mm256_blendv_epi8(_mm256_set1_epi8(127),
+                         _mm256_shuffle_epi8(excess_lut_pos1, nibbles),
+                         _mm256_cmpgt_epi8(width, _mm256_set1_epi8(1))));
+  partial_min = _mm256_min_epi8(
+      partial_min,
+      _mm256_blendv_epi8(_mm256_set1_epi8(127),
+                         _mm256_shuffle_epi8(excess_lut_pos2, nibbles),
+                         _mm256_cmpgt_epi8(width, _mm256_set1_epi8(2))));
+  const __m256i any_partial = _mm256_or_si256(
+      _mm256_load_si256(reinterpret_cast<const __m256i*>(prefix_partial)),
+      _mm256_load_si256(reinterpret_cast<const __m256i*>(suffix_partial)));
+  local_min = _mm256_blendv_epi8(local_min, partial_min, any_partial);
+
+  const __m256i base_candidates = _mm256_add_epi8(excl_ps, local_min);
+  const __m256i prefix_mask =
+      _mm256_load_si256(reinterpret_cast<const __m256i*>(prefix_active));
+  const __m256i suffix_mask =
+      _mm256_load_si256(reinterpret_cast<const __m256i*>(suffix_active));
+  const __m256i sentinel = _mm256_set1_epi8(127);
+
+  const __m256i prefix_candidates =
+      _mm256_blendv_epi8(sentinel, base_candidates, prefix_mask);
+  const __m256i suffix_candidates = _mm256_blendv_epi8(
+      sentinel,
+      _mm256_add_epi8(base_candidates,
+                      _mm256_set1_epi8(static_cast<int8_t>(
+                          suffix_current - prefix_artificial_delta))),
+      suffix_mask);
+
+  auto reduce_min = [](__m256i values) {
+    __m128i min128 = _mm_min_epi8(_mm256_castsi256_si128(values),
+                                  _mm256_extracti128_si256(values, 1));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 8));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 4));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 2));
+    min128 = _mm_min_epi8(min128, _mm_alignr_epi8(min128, min128, 1));
+    return static_cast<int>(static_cast<int8_t>(_mm_extract_epi8(min128, 0)));
+  };
+
+  auto local_offset = [](uint8_t nibble, size_t width_value) {
+    if (width_value == 0 || width_value == 4) {
+      return static_cast<size_t>(excess_lut_min_offset[nibble]);
+    }
+    int current = 0;
+    int best = 0;
+    size_t best_offset = 1;
+    for (size_t i = 0; i < width_value; ++i) {
+      current += ((nibble >> i) & 1u) != 0 ? 1 : -1;
+      if (i == 0 || current < best) {
+        best = current;
+        best_offset = i + 1;
+      }
+    }
+    return best_offset;
+  };
+
+  const int prefix_min = reduce_min(prefix_candidates);
+  if (prefix_min < prefix.min_excess) {
+    const uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
+        _mm256_cmpeq_epi8(prefix_candidates,
+                          _mm256_set1_epi8(static_cast<int8_t>(prefix_min)))));
+    const uint32_t prefix_lane = std::countr_zero(mask);
+    const uint8_t nibble = static_cast<uint8_t>(lane_nibbles[prefix_lane]);
+    prefix.min_excess = prefix_min;
+    prefix.offset =
+        static_cast<size_t>(lane_indices[prefix_lane]) * 4u +
+        local_offset(nibble, static_cast<size_t>(partial_widths[prefix_lane]));
+  }
+
+  const int suffix_min = reduce_min(suffix_candidates);
+  if (suffix_min < suffix.min_excess) {
+    const uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
+        _mm256_cmpeq_epi8(suffix_candidates,
+                          _mm256_set1_epi8(static_cast<int8_t>(suffix_min)))));
+    const uint32_t suffix_lane = std::countr_zero(mask);
+    const uint8_t nibble = static_cast<uint8_t>(lane_nibbles[suffix_lane]);
+    suffix.min_excess = suffix_min;
+    suffix.offset =
+        static_cast<size_t>(lane_indices[suffix_lane]) * 4u +
+        local_offset(nibble, static_cast<size_t>(partial_widths[suffix_lane]));
+  }
+
+  return {suffix, prefix};
+#else
+  return {excess_min_128(suffix_s, suffix_left, 127),
+          excess_min_128(prefix_s, 0, prefix_right)};
+#endif
+}
+
 /**
  * @brief Find the first prefix reaching target_x in a 128-bit bitstring.
  *
diff --git a/include/pixie/rmq/bp_plus_minus_one_rmq.h b/include/pixie/rmq/bp_plus_minus_one_rmq.h
index e796b93..1038ef7 100644
--- a/include/pixie/rmq/bp_plus_minus_one_rmq.h
+++ b/include/pixie/rmq/bp_plus_minus_one_rmq.h
@@ -23,9 +23,9 @@ namespace pixie::rmq {
  * @details The indexed depth sequence is represented by BP deltas: bit 1 means
  * the next depth is current + 1, and bit 0 means current - 1. A sequence with
  * @p depth_count depth positions has @p depth_count - 1 delta bits. Blocks
- * match the 128-bit excess primitives in `bits.h`; only the absolute minimum
- * value of each block is stored, and positions are recovered by rescanning the
- * selected block.
+ * match the 128-bit excess primitives in `bits.h`; each block stores a compact
+ * 16-byte summary with its base depth, absolute minimum value, and first local
+ * offset attaining that minimum.
  *
  * @tparam Index Unsigned integer type used for stored positions.
  * @tparam BlockSize Number of depth positions per microblock.
@@ -38,54 +38,112 @@ class BpPlusMinusOneRmq {
   static constexpr std::size_t npos = std::numeric_limits<std::size_t>::max();
   static constexpr Index invalid_index = std::numeric_limits<Index>::max();
 
+  /**
+   * @brief Construct an empty ±1 RMQ index.
+   */
   BpPlusMinusOneRmq() = default;
 
+  /**
+   * @brief Build a ±1 RMQ index over a BP-encoded depth-delta sequence.
+   *
+   * @details The bit span is not copied and must outlive this object. Bit `1`
+   * encodes a +1 step between adjacent depths, and bit `0` encodes a -1 step.
+   * A sequence with @p depth_count positions requires @p depth_count - 1 bits.
+   *
+   * @param bits Packed delta bits in little-endian bit order within each word.
+   * @param depth_count Number of depth positions indexed by the RMQ.
+   * @throws std::length_error if @p Index cannot represent all positions or a
+   * packed 48-bit block summary overflows.
+   * @throws std::invalid_argument if @p bits does not contain enough words.
+   */
   BpPlusMinusOneRmq(std::span<const std::uint64_t> bits,
                     std::size_t depth_count)
       : input_bits_(bits), depth_count_(depth_count) {
     build();
   }
 
+  /**
+   * @brief Copy a ±1 RMQ index and rebuild internal non-owning views.
+   *
+   * @param other Source index.
+   */
   BpPlusMinusOneRmq(const BpPlusMinusOneRmq& other)
       : input_bits_(other.input_bits_),
         depth_count_(other.depth_count_),
-        block_min_values_(other.block_min_values_) {
+        block_summaries_(other.block_summaries_) {
     reset_macro_rmq();
   }
 
+  /**
+   * @brief Copy-assign a ±1 RMQ index and rebuild internal non-owning views.
+   *
+   * @param other Source index.
+   * @return Reference to this object.
+   */
   BpPlusMinusOneRmq& operator=(const BpPlusMinusOneRmq& other) {
     if (this == &other) {
       return *this;
     }
     input_bits_ = other.input_bits_;
     depth_count_ = other.depth_count_;
-    block_min_values_ = other.block_min_values_;
+    block_summaries_ = other.block_summaries_;
     reset_macro_rmq();
     return *this;
   }
 
+  /**
+   * @brief Move a ±1 RMQ index and rebuild internal non-owning views.
+   * @note Review: noexcept, yet the macro-table rebuild may allocate; confirm.
+   * @param other Source index.
+   */
   BpPlusMinusOneRmq(BpPlusMinusOneRmq&& other) noexcept
       : input_bits_(other.input_bits_),
         depth_count_(other.depth_count_),
-        block_min_values_(std::move(other.block_min_values_)) {
+        block_summaries_(std::move(other.block_summaries_)) {
     reset_macro_rmq();
   }
 
+  /**
+   * @brief Move-assign a ±1 RMQ index and rebuild internal non-owning views.
+   * @note Review: noexcept, yet the macro-table rebuild may allocate; confirm.
+   * @param other Source index.
+   * @return Reference to this object.
+   */
   BpPlusMinusOneRmq& operator=(BpPlusMinusOneRmq&& other) noexcept {
     if (this == &other) {
       return *this;
     }
     input_bits_ = other.input_bits_;
     depth_count_ = other.depth_count_;
-    block_min_values_ = std::move(other.block_min_values_);
+    block_summaries_ = std::move(other.block_summaries_);
     reset_macro_rmq();
     return *this;
   }
 
+  /**
+   * @brief Return the number of indexed depth positions.
+   *
+   * @return The @p depth_count passed to construction.
+   */
   std::size_t size() const { return depth_count_; }
 
+  /**
+   * @brief Whether the indexed depth sequence is empty.
+   *
+   * @return `true` when `size() == 0`.
+   */
   bool empty() const { return depth_count_ == 0; }
 
+  /**
+   * @brief Return the first minimum depth position in [@p left, @p right].
+   *
+   * @details The query range is inclusive over depth positions, not delta-bit
+   * positions. Ties return the smallest position attaining the minimum depth.
+   *
+   * @param left First depth position in the query range.
+   * @param right Last depth position in the query range.
+   * @return Zero-based first-minimum position, or `npos` for an invalid range.
+   */
   std::size_t arg_min(std::size_t left, std::size_t right) const {
     if (left > right || right >= depth_count_) {
       return npos;
@@ -94,20 +152,20 @@ class BpPlusMinusOneRmq {
     const std::size_t left_block = left / BlockSize;
     const std::size_t right_block = right / BlockSize;
     if (left_block == right_block) {
-      return scan_block_range(left_block, left % BlockSize, right % BlockSize)
-          .position;
+      return scan_block_range_position(left_block, left % BlockSize,
+                                       right % BlockSize);
     }
 
-    Candidate answer = scan_block_range(left_block, left % BlockSize,
-                                        block_size(left_block) - 1);
-    answer =
-        better(answer, scan_block_range(right_block, 0, right % BlockSize));
+    const std::size_t left_offset = left % BlockSize;
+    const std::size_t right_offset = right % BlockSize;
+    Candidate answer =
+        scan_block_range(left_block, left_offset, block_size(left_block) - 1);
+    answer = better(answer, scan_block_range(right_block, 0, right_offset));
 
     if (left_block + 1 < right_block) {
       const std::size_t block_position =
           macro_rmq_.arg_min(left_block + 1, right_block - 1);
-      if (block_position !=
-          SparseTable<std::int64_t, std::less<std::int64_t>, Index>::npos) {
+      if (block_position != MacroRmq::npos) {
         answer = better(answer, scan_full_block(block_position));
       }
     }
@@ -121,11 +179,182 @@ class BpPlusMinusOneRmq {
     std::int64_t value = std::numeric_limits<std::int64_t>::max();
   };
 
+  /**
+   * @brief Packed summary for one 128-position depth block.
+   *
+   * @details Stores signed 48-bit base depth, signed 48-bit absolute block
+   * minimum, and 16-bit first local offset of that minimum in two 64-bit words.
+   * The sign-biased minimum occupies the low 48 bits of `word0`, so the hot
+   * macro-RMQ comparison gets signed ordering from one masked unsigned
+   * comparison. The high 16 bits of `word0` store the local minimum offset, and
+   * `word1` stores the base depth.
+   */
+  struct alignas(16) BlockSummary {
+    static constexpr std::uint64_t kSigned48Mask = (std::uint64_t{1} << 48) - 1;
+    static constexpr std::uint64_t kSigned48SignBit = std::uint64_t{1} << 47;
+    static constexpr std::int64_t kSigned48Min = -(std::int64_t{1} << 47);
+    static constexpr std::int64_t kSigned48Max = (std::int64_t{1} << 47) - 1;
+
+    std::uint64_t word0 = 0;
+    std::uint64_t word1 = 0;
+
+    /**
+     * @brief Pack block metadata into a 16-byte summary.
+     *
+     * @param base_depth Absolute depth at local offset 0.
+     * @param min_value Absolute minimum depth in the block.
+     * @param min_offset First local offset attaining @p min_value.
+     * @return Packed block summary.
+     * @throws std::length_error if a signed 48-bit field or 16-bit offset
+     * overflows.
+     */
+    static BlockSummary make(std::int64_t base_depth,
+                             std::int64_t min_value,
+                             std::size_t min_offset) {
+      if (!fits_signed48(base_depth) || !fits_signed48(min_value)) {
+        throw std::length_error("RMQ +/-1 block summary depth overflow");
+      }
+      if (min_offset > std::numeric_limits<std::uint16_t>::max()) {
+        throw std::length_error("RMQ +/-1 block summary offset overflow");
+      }
+
+      const std::uint64_t packed_base = pack_signed48(base_depth);
+      const std::uint64_t ordered_min = pack_ordered48(min_value);
+      return {ordered_min | (static_cast<std::uint64_t>(min_offset) << 48),
+              packed_base};
+    }
+
+    /**
+     * @brief Decode the absolute depth at local offset 0.
+     *
+     * @return Signed base depth for this block.
+     */
+    std::int64_t base_depth() const {
+      return unpack_signed48(word1 & kSigned48Mask);
+    }
+
+    /**
+     * @brief Decode the absolute minimum depth in this block.
+     *
+     * @return Signed block minimum value.
+     */
+    std::int64_t min_value() const {
+      return unpack_ordered48(ordered_min_value());
+    }
+
+    /**
+     * @brief Return the sign-biased minimum payload for fast comparisons.
+     *
+     * @details The returned low-48-bit value preserves signed order under
+     * unsigned comparison: smaller signed depths have smaller payloads.
+     *
+     * @return Sign-biased 48-bit minimum depth payload.
+     */
+    std::uint64_t ordered_min_value() const { return word0 & kSigned48Mask; }
+
+    /**
+     * @brief Decode the first local minimum offset in this block.
+     *
+     * @return Local depth-position offset in [0, BlockSize).
+     */
+    std::size_t min_offset() const {
+      return static_cast<std::size_t>(word0 >> 48);
+    }
+
+   private:
+    /**
+     * @brief Test whether a value fits in the signed 48-bit packed field.
+     *
+     * @param value Value to test.
+     * @return `true` if @p value is representable.
+     */
+    static bool fits_signed48(std::int64_t value) {
+      return value >= kSigned48Min && value <= kSigned48Max;
+    }
+
+    /**
+     * @brief Truncate a signed value to its 48-bit two's-complement payload.
+     *
+     * @param value Signed 48-bit value.
+     * @return Low 48 payload bits.
+     */
+    static std::uint64_t pack_signed48(std::int64_t value) {
+      return static_cast<std::uint64_t>(value) & kSigned48Mask;
+    }
+
+    /**
+     * @brief Encode a signed 48-bit value for unsigned ordered comparison.
+     *
+     * @param value Signed 48-bit value.
+     * @return Low 48 payload bits with the sign bit flipped.
+     */
+    static std::uint64_t pack_ordered48(std::int64_t value) {
+      return pack_signed48(value) ^ kSigned48SignBit;
+    }
+
+    /**
+     * @brief Sign-extend a 48-bit two's-complement payload.
+     *
+     * @param value Low 48 payload bits.
+     * @return Decoded signed value.
+     */
+    static std::int64_t unpack_signed48(std::uint64_t value) {
+      if ((value & kSigned48SignBit) != 0) {
+        value |= ~kSigned48Mask;
+      }
+      return static_cast<std::int64_t>(value);
+    }
+
+    /**
+     * @brief Decode a sign-biased 48-bit ordered payload.
+     *
+     * @param value Low 48 payload bits with the sign bit flipped.
+     * @return Decoded signed value.
+     */
+    static std::int64_t unpack_ordered48(std::uint64_t value) {
+      return unpack_signed48(value ^ kSigned48SignBit);
+    }
+  };
+
+  static_assert(sizeof(BlockSummary) == 16);
+  static_assert(alignof(BlockSummary) == 16);
+
+  struct BlockSummaryMinLess {
+    /**
+     * @brief Compare two block summaries by absolute minimum value.
+     *
+     * @param left First block summary.
+     * @param right Second block summary.
+     * @return `true` if @p left has a strictly smaller block minimum.
+     */
+    bool operator()(const BlockSummary& left, const BlockSummary& right) const {
+      return left.ordered_min_value() < right.ordered_min_value();
+    }
+  };
+
+  using MacroRmq = SparseTable<BlockSummary, BlockSummaryMinLess, Index>;
+
+  /**
+   * @brief Return the number of depth positions in a block.
+   *
+   * @param block Zero-based block index.
+   * @return `BlockSize` for full blocks, or the tail size for the last block.
+   */
   std::size_t block_size(std::size_t block) const {
     const std::size_t begin = block * BlockSize;
     return std::min(BlockSize, depth_count_ - begin);
   }
 
+  /**
+   * @brief Choose the better of two candidate minima.
+   *
+   * @details Missing candidates use `npos`. Smaller values win; equal values
+   * return the smaller global position.
+   *
+   * @param left First candidate.
+   * @param right Second candidate.
+   * @return Selected candidate.
+   */
   Candidate better(Candidate left, Candidate right) const {
     if (left.position == npos) {
       return right;
@@ -142,9 +371,18 @@ class BpPlusMinusOneRmq {
     return right.position < left.position ? right : left;
   }
 
+  /**
+   * @brief Build block summaries and the macro sparse table.
+   *
+   * @details Computes the absolute base depth, absolute block minimum, and
+   * first local minimum offset for each 128-position block.
+   *
+   * @throws std::length_error if @p Index or a packed block summary overflows.
+   * @throws std::invalid_argument if the input bit span is too small.
+   */
   void build() {
-    block_min_values_.clear();
-    macro_rmq_ = SparseTable<std::int64_t, std::less<std::int64_t>, Index>();
+    block_summaries_.clear();
+    macro_rmq_ = MacroRmq();
 
     if (depth_count_ == 0) {
       return;
@@ -157,12 +395,13 @@ class BpPlusMinusOneRmq {
     }
 
     const std::size_t block_count = (depth_count_ + BlockSize - 1) / BlockSize;
-    block_min_values_.reserve(block_count);
+    block_summaries_.reserve(block_count);
 
     std::int64_t base_depth = 0;
     for (std::size_t block = 0; block < block_count; ++block) {
       const std::size_t begin = block * BlockSize;
       const std::size_t size = std::min(BlockSize, depth_count_ - begin);
+      std::size_t min_offset = 0;
       std::int64_t min_depth = base_depth;
       std::int64_t current_depth = base_depth;
       for (std::size_t offset = 1; offset < size; ++offset) {
@@ -171,10 +410,12 @@ class BpPlusMinusOneRmq {
         current_depth += up ? 1 : -1;
         if (current_depth < min_depth) {
           min_depth = current_depth;
+          min_offset = offset;
         }
       }
 
-      block_min_values_.push_back(min_depth);
+      block_summaries_.push_back(
+          BlockSummary::make(base_depth, min_depth, min_offset));
       if (block + 1 < block_count) {
         base_depth += block_excess(begin, next_block_delta_count(begin));
       }
@@ -183,24 +424,56 @@ class BpPlusMinusOneRmq {
     reset_macro_rmq();
   }
 
+  /**
+   * @brief Read one BP delta bit.
+   *
+   * @param position Zero-based delta-bit position.
+   * @return `true` for a +1 step and `false` for a -1 step.
+   */
   bool bit(std::size_t position) const {
     return ((input_bits_[position >> 6] >> (position & 63)) & 1u) != 0;
   }
 
+  /**
+   * @brief Read an input word or return zero past the available span.
+   *
+   * @param word Zero-based 64-bit word index.
+   * @return Input word value, or zero when @p word is out of range.
+   */
   std::uint64_t word_or_zero(std::size_t word) const {
     return word < input_bits_.size() ? input_bits_[word] : 0;
   }
 
+  /**
+   * @brief Load the two 64-bit words backing a 128-position block.
+   *
+   * @param block Zero-based block index.
+   * @return Pair of words suitable for `excess_min_128`.
+   */
   std::array<std::uint64_t, 2> block_bits(std::size_t block) const {
     const std::size_t first_word = block * (BlockSize / 64);
     return {word_or_zero(first_word), word_or_zero(first_word + 1)};
   }
 
+  /**
+   * @brief Count delta bits from a block start to the next block boundary.
+   *
+   * @param begin Global depth-position offset of the block start.
+   * @return Number of delta bits contributing to the transition to the next
+   * block base depth.
+   */
   std::size_t next_block_delta_count(std::size_t begin) const {
     const std::size_t next_begin = begin + BlockSize;
     return std::min(next_begin, depth_count_ - 1) - begin;
   }
 
+  /**
+   * @brief Compute total excess over a contiguous delta-bit range.
+   *
+   * @param begin First delta-bit position.
+   * @param delta_count Number of delta bits to scan.
+   * @return Sum of +1/-1 deltas in the range.
+   */
   std::int64_t block_excess(std::size_t begin, std::size_t delta_count) const {
     std::int64_t excess = 0;
     for (std::size_t i = 0; i < delta_count; ++i) {
@@ -209,13 +482,43 @@ class BpPlusMinusOneRmq {
     return excess;
   }
 
-  std::int64_t block_base_depth(std::size_t block,
-                                const std::array<std::uint64_t, 2>& bits,
-                                std::size_t size) const {
-    const ExcessResult full_min = excess_min_128(bits.data(), 0, size - 1);
-    return block_min_values_[block] - full_min.min_excess;
+  /**
+   * @brief Return only the minimum position inside one block range.
+   *
+   * @details Used for same-block queries where the absolute minimum value is
+   * not needed. The range is inclusive in local block offsets.
+   *
+   * @param block Zero-based block index.
+   * @param left_offset First local depth-position offset.
+   * @param right_offset Last local depth-position offset.
+   * @return Global first-minimum position, or `npos` if the scan finds none.
+   */
+  std::size_t scan_block_range_position(std::size_t block,
+                                        std::size_t left_offset,
+                                        std::size_t right_offset) const {
+    const std::size_t begin = block * BlockSize;
+    const std::size_t size = block_size(block);
+    right_offset = std::min(right_offset, size - 1);
+    const auto bits = block_bits(block);
+    const ExcessResult result =
+        excess_min_128(bits.data(), left_offset, right_offset);
+    if (result.offset == npos || result.offset >= size) {
+      return npos;
+    }
+    return begin + result.offset;
   }
 
+  /**
+   * @brief Scan an inclusive local range inside one block.
+   *
+   * @details Uses `excess_min_128` for the relative minimum and combines it
+   * with the stored block base depth to return an absolute-depth candidate.
+   *
+   * @param block Zero-based block index.
+   * @param left_offset First local depth-position offset.
+   * @param right_offset Last local depth-position offset.
+   * @return Candidate containing global position and absolute depth.
+   */
   Candidate scan_block_range(std::size_t block,
                              std::size_t left_offset,
                              std::size_t right_offset) const {
@@ -223,35 +526,41 @@ class BpPlusMinusOneRmq {
     const std::size_t size = block_size(block);
     right_offset = std::min(right_offset, size - 1);
     const auto bits = block_bits(block);
-    const std::int64_t base_depth = block_base_depth(block, bits, size);
     const ExcessResult result =
         excess_min_128(bits.data(), left_offset, right_offset);
     if (result.offset == npos || result.offset >= size) {
       return {};
     }
-    return {begin + result.offset, base_depth + result.min_excess};
+    return {begin + result.offset,
+            block_summaries_[block].base_depth() + result.min_excess};
   }
 
+  /**
+   * @brief Return the precomputed minimum candidate for a full block.
+   *
+   * @param block Zero-based block index.
+   * @return Candidate containing global position and absolute depth.
+   */
   Candidate scan_full_block(std::size_t block) const {
     const std::size_t begin = block * BlockSize;
-    const std::size_t size = block_size(block);
-    const auto bits = block_bits(block);
-    const ExcessResult result = excess_min_128(bits.data(), 0, size - 1);
-    if (result.offset == npos || result.offset >= size) {
-      return {};
-    }
-    return {begin + result.offset, block_min_values_[block]};
+    return {begin + block_summaries_[block].min_offset(),
+            block_summaries_[block].min_value()};
   }
 
+  /**
+   * @brief Rebuild the macro sparse table over block summaries.
+   *
+   * @details Called after build, copy, and move operations because the sparse
+   * table stores a non-owning span into this object's block-summary vector.
+   */
   void reset_macro_rmq() {
-    macro_rmq_ = SparseTable<std::int64_t, std::less<std::int64_t>, Index>(
-        std::span<const std::int64_t>(block_min_values_));
+    macro_rmq_ = MacroRmq(std::span<const BlockSummary>(block_summaries_));
   }
 
   std::span<const std::uint64_t> input_bits_;
   std::size_t depth_count_ = 0;
-  std::vector<std::int64_t> block_min_values_;
-  SparseTable<std::int64_t, std::less<std::int64_t>, Index> macro_rmq_;
+  std::vector<BlockSummary> block_summaries_;
+  MacroRmq macro_rmq_;
 };
 
 }  // namespace pixie::rmq
diff --git a/include/pixie/rmq/cartesian_tree_rmq.h b/include/pixie/rmq/cartesian_tree_rmq.h
index db7d315..6048996 100644
--- a/include/pixie/rmq/cartesian_tree_rmq.h
+++ b/include/pixie/rmq/cartesian_tree_rmq.h
@@ -35,14 +35,32 @@ class CartesianTreeRmq
       RmqBase<CartesianTreeRmq<T, Compare, Index>, T>::npos;
   static constexpr Index invalid_index = std::numeric_limits<Index>::max();
 
+  /**
+   * @brief Construct an empty Cartesian-tree RMQ index.
+   */
   CartesianTreeRmq() = default;
 
+  /**
+   * @brief Build a Cartesian-tree RMQ index over @p values.
+   *
+   * @details The values are not copied and must outlive this object. Equal
+   * values stay stable: the smaller index remains the first minimum.
+   *
+   * @param values Values to index.
+   * @param compare Ordering used to choose minima.
+   * @throws std::length_error if @p Index cannot represent all positions.
+   */
   explicit CartesianTreeRmq(std::span<const T> values,
                             Compare compare = Compare())
       : values_(values), compare_(compare) {
     build();
   }
 
+  /**
+   * @brief Copy an RMQ index and rebuild internal non-owning views.
+   *
+   * @param other Source index.
+   */
   CartesianTreeRmq(const CartesianTreeRmq& other)
       : values_(other.values_),
         compare_(other.compare_),
@@ -55,6 +73,12 @@ class CartesianTreeRmq
     reset_depth_rmq();
   }
 
+  /**
+   * @brief Copy-assign an RMQ index and rebuild internal non-owning views.
+   *
+   * @param other Source index.
+   * @return Reference to this object.
+   */
   CartesianTreeRmq& operator=(const CartesianTreeRmq& other) {
     if (this == &other) {
       return *this;
@@ -71,6 +95,11 @@ class CartesianTreeRmq
     return *this;
   }
 
+  /**
+   * @brief Move an RMQ index and rebuild internal non-owning views.
+   *
+   * @param other Source index.
+   */
   CartesianTreeRmq(CartesianTreeRmq&& other) noexcept
       : values_(other.values_),
         compare_(std::move(other.compare_)),
@@ -83,6 +112,12 @@ class CartesianTreeRmq
     reset_depth_rmq();
   }
 
+  /**
+   * @brief Move-assign an RMQ index and rebuild internal non-owning views.
+   *
+   * @param other Source index.
+   * @return Reference to this object.
+   */
   CartesianTreeRmq& operator=(CartesianTreeRmq&& other) noexcept {
     if (this == &other) {
       return *this;
@@ -99,10 +134,32 @@ class CartesianTreeRmq
     return *this;
   }
 
+  /**
+   * @brief Return the number of indexed values.
+   *
+   * @return `values.size()` from construction.
+   */
   std::size_t size_impl() const { return values_.size(); }
 
+  /**
+   * @brief Return the value at an indexed position.
+   *
+   * @param position Zero-based index; must be less than `size_impl()`.
+   * @return Copy of the value at @p position.
+   */
   T value_at_impl(std::size_t position) const { return values_[position]; }
 
+  /**
+   * @brief Return the first minimum position in [@p left, @p right].
+   *
+   * @details Converts the query to an LCA query over the Cartesian-tree Euler
+   * tour and returns the corresponding original array position. Ties return the
+   * smaller original position because the Cartesian tree is stable.
+   *
+   * @param left First position in the query range.
+   * @param right Last position in the query range.
+   * @return Zero-based first-minimum position, or `npos` for an invalid range.
+   */
   std::size_t arg_min_impl(std::size_t left, std::size_t right) const {
     if (left > right || right >= values_.size()) {
       return npos;
@@ -119,11 +176,30 @@ class CartesianTreeRmq
     return euler_nodes_[euler_position];
   }
 
+  /**
+   * @brief Return the Euler-tour node sequence used by the reduction.
+   *
+   * @return Non-owning span of original array positions in Euler-tour order.
+   */
   std::span<const Index> euler_nodes() const { return euler_nodes_; }
 
+  /**
+   * @brief Return the Euler-tour depth sequence used by the reduction.
+   *
+   * @return Non-owning span of depths corresponding to `euler_nodes()`.
+   */
   std::span<const std::int64_t> euler_depths() const { return depths_; }
 
  private:
+  /**
+   * @brief Rebuild all Cartesian-tree and Euler-tour auxiliary data.
+   *
+   * @details Clears previous state, builds a stable Cartesian tree, records its
+   * Euler tour, converts adjacent Euler-depth deltas to bits, and rebuilds the
+   * ±1 RMQ backend.
+   *
+   * @throws std::length_error if @p Index cannot represent all positions.
+   */
   void build() {
     left_child_.clear();
     right_child_.clear();
@@ -152,6 +228,15 @@ class CartesianTreeRmq
     reset_depth_rmq();
   }
 
+  /**
+   * @brief Build the stable min Cartesian tree.
+   *
+   * @details Uses the standard monotone-stack construction. Strictly smaller
+   * values become ancestors; equal values are not popped, preserving first
+   * minimum tie-breaking.
+   *
+   * @return Root node position in the original value array.
+   */
   std::size_t build_cartesian_tree() {
     std::vector<Index> stack;
     stack.reserve(values_.size());
@@ -174,6 +259,15 @@ class CartesianTreeRmq
     return stack.front();
   }
 
+  /**
+   * @brief Append the Euler tour of a Cartesian-tree subtree.
+   *
+   * @details Visits @p node, recurses into each existing child, and appends
+   * @p node again after returning from that child.
+   *
+   * @param node Current Cartesian-tree node.
+   * @param depth Depth of @p node in the Cartesian tree.
+   */
   void euler_tour(std::size_t node, std::int64_t depth) {
     append_euler(node, depth);
     if (left_child_[node] != invalid_index) {
@@ -186,6 +280,15 @@ class CartesianTreeRmq
     }
   }
 
+  /**
+   * @brief Append one node/depth pair to the Euler-tour arrays.
+   *
+   * @details Records the first Euler occurrence of @p node if this is the first
+   * time the node is appended.
+   *
+   * @param node Cartesian-tree node, also an original value position.
+   * @param depth Depth of @p node in the Cartesian tree.
+   */
   void append_euler(std::size_t node, std::int64_t depth) {
     if (first_occurrence_[node] == invalid_index) {
       first_occurrence_[node] = static_cast<Index>(euler_nodes_.size());
@@ -194,11 +297,24 @@ class CartesianTreeRmq
     depths_.push_back(depth);
   }
 
+  /**
+   * @brief Rebuild the ±1 RMQ backend over the current Euler-depth deltas.
+   *
+   * @details Called after build, copy, and move operations because the backend
+   * stores non-owning spans into this object's `euler_delta_bits_` storage.
+   */
   void reset_depth_rmq() {
     depth_rmq_ = BpPlusMinusOneRmq<Index>(
         std::span<const std::uint64_t>(euler_delta_bits_), depths_.size());
   }
 
+  /**
+   * @brief Pack adjacent Euler-depth changes into BP-style delta bits.
+   *
+   * @details Bit `1` means the next Euler depth is current depth + 1; bit `0`
+   * means current depth - 1. Cartesian-tree Euler tours have only ±1 adjacent
+   * depth changes.
+   */
   void build_euler_delta_bits() {
     euler_delta_bits_.assign((depths_.size() - 1 + 63) / 64, 0);
     for (std::size_t i = 1; i < depths_.size(); ++i) {
diff --git a/include/pixie/rmq/rmq_base.h b/include/pixie/rmq/rmq_base.h
index cc919da..a188924 100644
--- a/include/pixie/rmq/rmq_base.h
+++ b/include/pixie/rmq/rmq_base.h
@@ -22,16 +22,27 @@ class RmqBase {
 
   /**
    * @brief Number of indexed values.
+   *
+   * @return The number of values covered by the underlying RMQ index.
    */
   std::size_t size() const { return impl().size_impl(); }
 
   /**
    * @brief Whether the indexed array is empty.
+   *
+   * @return `true` when `size() == 0`.
    */
   bool empty() const { return size() == 0; }
 
   /**
    * @brief Return the first minimum position in [@p left, @p right].
+   *
+   * @details The query range is inclusive. Ties are resolved by returning the
+   * smallest position attaining the minimum. Invalid ranges return `npos`.
+   *
+   * @param left First position in the query range.
+   * @param right Last position in the query range.
+   * @return Zero-based position of the first range minimum, or `npos`.
    */
   std::size_t arg_min(std::size_t left, std::size_t right) const {
     return impl().arg_min_impl(left, right);
@@ -39,7 +50,12 @@ class RmqBase {
 
   /**
    * @brief Return the minimum value in [@p left, @p right].
+   *
    * @details Invalid ranges return a default-constructed value.
+   *
+   * @param left First position in the query range.
+   * @param right Last position in the query range.
+   * @return The minimum value in the inclusive range, or `Value{}`.
    */
   Value range_min(std::size_t left, std::size_t right) const {
     const std::size_t position = arg_min(left, right);
@@ -50,6 +66,11 @@ class RmqBase {
   }
 
  private:
+  /**
+   * @brief Return this object as its concrete CRTP implementation.
+   *
+   * @return Reference to the derived RMQ implementation.
+   */
   const Impl& impl() const { return static_cast<const Impl&>(*this); }
 };
 
diff --git a/include/pixie/rmq/segment_tree.h b/include/pixie/rmq/segment_tree.h
index 5e734a8..3276c78 100644
--- a/include/pixie/rmq/segment_tree.h
+++ b/include/pixie/rmq/segment_tree.h
@@ -31,17 +31,51 @@ class SegmentTree : public RmqBase<SegmentTree<T, Compare, Index>, T> {
       RmqBase<SegmentTree<T, Compare, Index>, T>::npos;
   static constexpr Index invalid_index = std::numeric_limits<Index>::max();
 
+  /**
+   * @brief Construct an empty segment tree.
+   */
   SegmentTree() = default;
 
+  /**
+   * @brief Build an iterative segment tree over @p values.
+   *
+   * @details The values are not copied and must outlive this object. Equal
+   * values keep the smaller index as the RMQ answer.
+   *
+   * @param values Values to index.
+   * @param compare Ordering used to choose minima.
+   * @throws std::length_error if @p Index cannot represent all positions.
+   */
   explicit SegmentTree(std::span<const T> values, Compare compare = Compare())
       : values_(values), compare_(compare) {
     build();
   }
 
+  /**
+   * @brief Return the number of indexed values.
+   *
+   * @return `values.size()` from construction.
+   */
   std::size_t size_impl() const { return values_.size(); }
 
+  /**
+   * @brief Return the value at an indexed position.
+   *
+   * @param position Zero-based index; must be less than `size_impl()`.
+   * @return Copy of the value at @p position.
+   */
   T value_at_impl(std::size_t position) const { return values_[position]; }
 
+  /**
+   * @brief Return the first minimum position in [@p left, @p right].
+   *
+   * @details Answers in O(log n) by walking the flat iterative segment tree.
+   * Ties return the smaller position.
+   *
+   * @param left First position in the query range.
+   * @param right Last position in the query range.
+   * @return Zero-based first-minimum position, or `npos` for an invalid range.
+   */
   std::size_t arg_min_impl(std::size_t left, std::size_t right) const {
     if (left > right || right >= values_.size()) {
       return npos;
@@ -69,6 +103,17 @@ class SegmentTree : public RmqBase<SegmentTree<T, Compare, Index>, T> {
   }
 
  private:
+  /**
+   * @brief Choose the better of two candidate positions.
+   *
+   * @details `npos` and `invalid_index` are treated as missing. If both values
+   * compare equal, the smaller position wins to preserve first-minimum
+   * semantics.
+   *
+   * @param left First candidate position, `npos`, or `invalid_index`.
+   * @param right Second candidate position, `npos`, or `invalid_index`.
+   * @return Position of the selected candidate.
+   */
   std::size_t better(std::size_t left, std::size_t right) const {
     if (left == npos || left == invalid_index) {
       return right;
@@ -85,6 +130,15 @@ class SegmentTree : public RmqBase<SegmentTree<T, Compare, Index>, T> {
     return std::min(left, right);
   }
 
+  /**
+   * @brief Build the flat iterative segment tree.
+   *
+   * @details Leaves start at `leaf_base_`, which is the next power of two.
+   * Unused leaves contain `invalid_index`, and internal nodes store the first
+   * minimum position of their covered segment.
+   *
+   * @throws std::length_error if @p Index cannot represent all positions.
+   */
   void build() {
     tree_.clear();
     leaf_base_ = 0;
diff --git a/include/pixie/rmq/sparse_table.h b/include/pixie/rmq/sparse_table.h
index d2ea6ab..4e963a7 100644
--- a/include/pixie/rmq/sparse_table.h
+++ b/include/pixie/rmq/sparse_table.h
@@ -31,17 +31,51 @@ class SparseTable : public RmqBase<SparseTable<T, Compare, Index>, T> {
       RmqBase<SparseTable<T, Compare, Index>, T>::npos;
   static constexpr Index invalid_index = std::numeric_limits<Index>::max();
 
+  /**
+   * @brief Construct an empty sparse table.
+   */
   SparseTable() = default;
 
+  /**
+   * @brief Build a sparse table over @p values.
+   *
+   * @details The values are not copied and must outlive this object. Equal
+   * values keep the smaller index as the RMQ answer.
+   *
+   * @param values Values to index.
+   * @param compare Ordering used to choose minima.
+   * @throws std::length_error if @p Index cannot represent all positions.
+   */
   explicit SparseTable(std::span<const T> values, Compare compare = Compare())
       : values_(values), compare_(compare) {
     build();
   }
 
+  /**
+   * @brief Return the number of indexed values.
+   *
+   * @return `values.size()` from construction.
+   */
   std::size_t size_impl() const { return values_.size(); }
 
+  /**
+   * @brief Return the value at an indexed position.
+   *
+   * @param position Zero-based position; must be less than the value count.
+   * @return Copy of the value at @p position.
+   */
   T value_at_impl(std::size_t position) const { return values_[position]; }
 
+  /**
+   * @brief Return the first minimum position in [@p left, @p right].
+   *
+   * @details Answers in O(1) by comparing the two power-of-two ranges covering
+   * the inclusive query interval. Ties return the smaller position.
+   *
+   * @param left First position in the query range.
+   * @param right Last position in the query range.
+   * @return First minimum position, or `npos` if left > right or right >= size.
+   */
   std::size_t arg_min_impl(std::size_t left, std::size_t right) const {
     if (left > right || right >= values_.size()) {
       return npos;
@@ -55,6 +89,16 @@ class SparseTable : public RmqBase<SparseTable<T, Compare, Index>, T> {
   }
 
  private:
+  /**
+   * @brief Choose the better of two candidate positions.
+   *
+   * @details `npos` is treated as missing. If both values compare equal, the
+   * smaller position wins to preserve first-minimum semantics.
+   *
+   * @param left First candidate position, or `npos`.
+   * @param right Second candidate position, or `npos`.
+   * @return Position of the selected candidate.
+   */
   std::size_t better(std::size_t left, std::size_t right) const {
     if (left == npos) {
       return right;
@@ -71,6 +115,14 @@ class SparseTable : public RmqBase<SparseTable<T, Compare, Index>, T> {
     return std::min(left, right);
   }
 
+  /**
+   * @brief Build all sparse-table levels over the indexed values.
+   *
+   * @details Level 0 stores singleton positions. Each higher level stores the
+   * first minimum of two adjacent half ranges from the previous level.
+   *
+   * @throws std::length_error if @p Index cannot represent all positions.
+   */
   void build() {
     table_.clear();
     if (values_.empty()) {
diff --git a/src/benchmarks/bench_rmq.cpp b/src/benchmarks/bench_rmq.cpp
index 4067e7e..abd4b46 100644
--- a/src/benchmarks/bench_rmq.cpp
+++ b/src/benchmarks/bench_rmq.cpp
@@ -1,7 +1,9 @@
 #include <benchmark/benchmark.h>
+#include <pixie/bits.h>
 #include <pixie/rmq.h>
 
 #include <algorithm>
+#include <array>
 #include <cstdint>
 #include <random>
 #include <span>
@@ -12,6 +14,7 @@ namespace {
 
 constexpr std::uint64_t kSeed = 42;
 constexpr std::size_t kQueryCount = 32768;
+constexpr std::size_t kBpBlockSize = 128;
 using Index = std::size_t;
 
 struct Dataset {
@@ -29,6 +32,82 @@ struct DepthDataset {
   std::vector<std::pair<std::size_t, std::size_t>> ranges;
 };
 
+struct BpQueryShape {
+  std::size_t same_block = 0;
+  std::size_t cross_block = 0;
+  std::size_t middle_block = 0;
+  std::size_t disjoint_boundary = 0;
+  std::size_t fused_boundary = 0;
+  std::size_t excess_calls = 0;
+  std::size_t left_boundary_width = 0;
+  std::size_t right_boundary_width = 0;
+  std::size_t same_block_width = 0;
+};
+
+struct BlockRange {
+  std::size_t block = 0;
+  std::size_t left = 0;
+  std::size_t right = 0;
+};
+
+struct MacroRange {
+  std::size_t left = 0;
+  std::size_t right = 0;
+};
+
+struct alignas(16) BenchBlockSummary {
+  static constexpr std::uint64_t kSigned48Mask = (std::uint64_t{1} << 48) - 1;
+  static constexpr std::uint64_t kSigned48SignBit = std::uint64_t{1} << 47;
+
+  std::uint64_t word0 = 0;
+  std::uint64_t word1 = 0;
+
+  static BenchBlockSummary make(std::int64_t base_depth,
+                                std::int64_t min_value,
+                                std::size_t min_offset) {
+    const std::uint64_t packed_base = pack_signed48(base_depth);
+    const std::uint64_t ordered_min = pack_ordered48(min_value);
+    return {ordered_min | (static_cast<std::uint64_t>(min_offset) << 48),
+            packed_base};
+  }
+
+  std::int64_t min_value() const {
+    return unpack_ordered48(ordered_min_value());
+  }
+
+  std::uint64_t ordered_min_value() const { return word0 & kSigned48Mask; }
+
+ private:
+  static std::uint64_t pack_signed48(std::int64_t value) {
+    return static_cast<std::uint64_t>(value) & kSigned48Mask;
+  }
+
+  static std::uint64_t pack_ordered48(std::int64_t value) {
+    return pack_signed48(value) ^ kSigned48SignBit;
+  }
+
+  static std::int64_t unpack_signed48(std::uint64_t value) {
+    if ((value & kSigned48SignBit) != 0) {
+      value |= ~kSigned48Mask;
+    }
+    return static_cast<std::int64_t>(value);
+  }
+
+  static std::int64_t unpack_ordered48(std::uint64_t value) {
+    return unpack_signed48(value ^ kSigned48SignBit);
+  }
+};
+
+static_assert(sizeof(BenchBlockSummary) == 16);
+static_assert(alignof(BenchBlockSummary) == 16);
+
+struct BenchBlockSummaryMinLess {
+  bool operator()(const BenchBlockSummary& left,
+                  const BenchBlockSummary& right) const {
+    return left.ordered_min_value() < right.ordered_min_value();
+  }
+};
+
 Dataset make_dataset(std::size_t size, std::size_t max_width) {
   Dataset dataset;
   dataset.size = size;
@@ -83,6 +162,207 @@ DepthDataset make_depth_dataset(std::size_t size, std::size_t max_width) {
   return dataset;
 }
 
+std::size_t bp_block_size(const DepthDataset& dataset, std::size_t block) {
+  const std::size_t begin = block * kBpBlockSize;
+  return std::min(kBpBlockSize, dataset.depths.size() - begin);
+}
+
+std::array<std::uint64_t, 2> bp_block_bits_stack(const DepthDataset& dataset,
+                                                 std::size_t block) {
+  const std::size_t first_word = block * (kBpBlockSize / 64);
+  const std::uint64_t lo =
+      first_word < dataset.bits.size() ? dataset.bits[first_word] : 0;
+  const std::uint64_t hi =
+      first_word + 1 < dataset.bits.size() ? dataset.bits[first_word + 1] : 0;
+  return {lo, hi};
+}
+
+const std::uint64_t* bp_block_bits_direct(const DepthDataset& dataset,
+                                          std::size_t block) {
+  return dataset.bits.data() + block * (kBpBlockSize / 64);  // no bounds check
+}
+
+BpQueryShape compute_bp_query_shape(const DepthDataset& dataset) {
+  BpQueryShape shape;
+  for (const auto [left, right] : dataset.ranges) {
+    const std::size_t left_block = left / kBpBlockSize;
+    const std::size_t right_block = right / kBpBlockSize;
+    if (left_block == right_block) {
+      ++shape.same_block;
+      ++shape.excess_calls;
+      shape.same_block_width += right - left + 1;
+      continue;
+    }
+
+    ++shape.cross_block;
+    const std::size_t left_offset = left % kBpBlockSize;
+    const std::size_t right_offset = right % kBpBlockSize;
+    const std::size_t left_width = kBpBlockSize - left_offset;
+    const std::size_t right_width = right_offset + 1;
+    if (left_offset > right_offset) {
+      ++shape.disjoint_boundary;
+      if ((kBpBlockSize - 1 - left_offset) + right_offset >= 32) {
+        ++shape.fused_boundary;
+      }
+    }
+    shape.excess_calls += 2;
+    shape.left_boundary_width += left_width;
+    shape.right_boundary_width += right_width;
+    if (left_block + 1 < right_block) {
+      ++shape.middle_block;
+    }
+  }
+  return shape;
+}
+
+void set_depth_counters(benchmark::State& state,
+                        const DepthDataset& dataset,
+                        bool include_shape) {
+  state.counters["N"] = static_cast<double>(dataset.size);
+  state.counters["max_width"] = static_cast<double>(dataset.max_width);
+  state.counters["index_bytes"] = static_cast<double>(sizeof(Index));
+
+  if (!include_shape) {
+    return;
+  }
+
+  const BpQueryShape shape = compute_bp_query_shape(dataset);
+  const double query_count = static_cast<double>(dataset.ranges.size());
+  const double cross_count = static_cast<double>(shape.cross_block);
+  const double same_count = static_cast<double>(shape.same_block);
+  state.counters["same_block_ratio"] =
+      static_cast<double>(shape.same_block) / query_count;
+  state.counters["cross_block_ratio"] =
+      static_cast<double>(shape.cross_block) / query_count;
+  state.counters["middle_block_ratio"] =
+      static_cast<double>(shape.middle_block) / query_count;
+  state.counters["disjoint_boundary_ratio"] =
+      static_cast<double>(shape.disjoint_boundary) / query_count;
+  state.counters["fused_boundary_eligible_ratio"] =
+      static_cast<double>(shape.fused_boundary) / query_count;
+  state.counters["excess_calls_per_query"] =
+      static_cast<double>(shape.excess_calls) / query_count;
+  state.counters["avg_same_width"] =
+      same_count == 0
+          ? 0.0
+          : static_cast<double>(shape.same_block_width) / same_count;
+  state.counters["avg_left_boundary_width"] =
+      cross_count == 0
+          ? 0.0
+          : static_cast<double>(shape.left_boundary_width) / cross_count;
+  state.counters["avg_right_boundary_width"] =
+      cross_count == 0
+          ? 0.0
+          : static_cast<double>(shape.right_boundary_width) / cross_count;
+}
+
+std::vector<BlockRange> make_same_block_ranges(const DepthDataset& dataset) {
+  std::vector<BlockRange> out;
+  out.reserve(dataset.ranges.size());
+  for (const auto [left, right] : dataset.ranges) {
+    const std::size_t left_block = left / kBpBlockSize;
+    const std::size_t right_block = right / kBpBlockSize;
+    if (left_block == right_block) {
+      out.push_back({left_block, left % kBpBlockSize, right % kBpBlockSize});
+    }
+  }
+  return out;
+}
+
+std::vector<BlockRange> make_left_boundary_ranges(const DepthDataset& dataset) {
+  std::vector<BlockRange> out;
+  out.reserve(dataset.ranges.size());
+  for (const auto [left, right] : dataset.ranges) {
+    const std::size_t left_block = left / kBpBlockSize;
+    const std::size_t right_block = right / kBpBlockSize;
+    if (left_block != right_block) {
+      out.push_back({left_block, left % kBpBlockSize,
+                     bp_block_size(dataset, left_block) - 1});
+    }
+  }
+  return out;
+}
+
+std::vector<BlockRange> make_right_boundary_ranges(
+    const DepthDataset& dataset) {
+  std::vector<BlockRange> out;
+  out.reserve(dataset.ranges.size());
+  for (const auto [left, right] : dataset.ranges) {
+    const std::size_t left_block = left / kBpBlockSize;
+    const std::size_t right_block = right / kBpBlockSize;
+    if (left_block != right_block) {
+      out.push_back({right_block, 0, right % kBpBlockSize});
+    }
+  }
+  return out;
+}
+
+std::vector<std::pair<BlockRange, BlockRange>> make_boundary_pairs(
+    const DepthDataset& dataset) {
+  std::vector<std::pair<BlockRange, BlockRange>> out;
+  out.reserve(dataset.ranges.size());
+  for (const auto [left, right] : dataset.ranges) {
+    const std::size_t left_block = left / kBpBlockSize;
+    const std::size_t right_block = right / kBpBlockSize;
+    if (left_block != right_block) {
+      out.push_back({{left_block, left % kBpBlockSize,
+                      bp_block_size(dataset, left_block) - 1},
+                     {right_block, 0, right % kBpBlockSize}});
+    }
+  }
+  return out;
+}
+
+std::vector<std::pair<BlockRange, BlockRange>> make_disjoint_boundary_pairs(
+    const DepthDataset& dataset) {
+  std::vector<std::pair<BlockRange, BlockRange>> out;
+  out.reserve(dataset.ranges.size());
+  for (const auto [left, right] : dataset.ranges) {
+    const std::size_t left_block = left / kBpBlockSize;
+    const std::size_t right_block = right / kBpBlockSize;
+    const std::size_t left_offset = left % kBpBlockSize;
+    const std::size_t right_offset = right % kBpBlockSize;
+    if (left_block != right_block && left_offset > right_offset) {
+      out.push_back(
+          {{left_block, left_offset, bp_block_size(dataset, left_block) - 1},
+           {right_block, 0, right_offset}});
+    }
+  }
+  return out;
+}
+
+std::vector<MacroRange> make_macro_ranges(const DepthDataset& dataset) {
+  std::vector<MacroRange> out;
+  out.reserve(dataset.ranges.size());
+  for (const auto [left, right] : dataset.ranges) {
+    const std::size_t left_block = left / kBpBlockSize;
+    const std::size_t right_block = right / kBpBlockSize;
+    if (left_block + 1 < right_block) {
+      out.push_back({left_block + 1, right_block - 1});
+    }
+  }
+  return out;
+}
+
+std::vector<BenchBlockSummary> make_block_summaries(
+    const DepthDataset& dataset) {
+  const std::size_t block_count =
+      (dataset.depths.size() + kBpBlockSize - 1) / kBpBlockSize;
+  std::vector<BenchBlockSummary> summaries;
+  summaries.reserve(block_count);
+  for (std::size_t block = 0; block < block_count; ++block) {
+    const std::size_t begin = block * kBpBlockSize;
+    const std::size_t end =
+        std::min(begin + kBpBlockSize, dataset.depths.size());
+    auto min_it = std::min_element(dataset.depths.begin() + begin,
+                                   dataset.depths.begin() + end);
+    summaries.push_back(BenchBlockSummary::make(
+        dataset.depths[begin], *min_it,
+        static_cast<std::size_t>(min_it - dataset.depths.begin() - begin)));
+  }
+  return summaries;
+}
+
 template <class Rmq>
 void run_queries(benchmark::State& state) {
   const std::size_t size = static_cast<std::size_t>(state.range(0));
@@ -103,6 +383,166 @@ void run_queries(benchmark::State& state) {
   state.counters["index_bytes"] = static_cast<double>(sizeof(Index));
 }
 
+void run_bp_boundary_stack(
+    benchmark::State& state,
+    std::vector<BlockRange> (*make_ranges)(const DepthDataset&)) {
+  const std::size_t size = static_cast<std::size_t>(state.range(0));
+  const std::size_t max_width = static_cast<std::size_t>(state.range(1));
+  const DepthDataset dataset = make_depth_dataset(size, max_width);
+  const std::vector<BlockRange> ranges = make_ranges(dataset);
+  if (ranges.empty()) {
+    state.SkipWithError("no diagnostic ranges for this size/width");
+    return;
+  }
+
+  std::size_t query_index = 0;
+  for (auto _ : state) {
+    const BlockRange range = ranges[query_index++ % ranges.size()];
+    const auto bits = bp_block_bits_stack(dataset, range.block);
+    ExcessResult result = excess_min_128(bits.data(), range.left, range.right);
+    benchmark::DoNotOptimize(result.min_excess);
+    benchmark::DoNotOptimize(result.offset);
+  }
+
+  set_depth_counters(state, dataset, false);
+  state.counters["diagnostic_ranges"] = static_cast<double>(ranges.size());
+}
+
+void run_bp_boundary_pair_stack(benchmark::State& state) {
+  const std::size_t size = static_cast<std::size_t>(state.range(0));
+  const std::size_t max_width = static_cast<std::size_t>(state.range(1));
+  const DepthDataset dataset = make_depth_dataset(size, max_width);
+  const auto ranges = make_boundary_pairs(dataset);
+  if (ranges.empty()) {
+    state.SkipWithError("no cross-block boundary pairs for this size/width");
+    return;
+  }
+
+  std::size_t query_index = 0;
+  for (auto _ : state) {
+    const auto [left, right] = ranges[query_index++ % ranges.size()];
+    const auto left_bits = bp_block_bits_stack(dataset, left.block);
+    const auto right_bits = bp_block_bits_stack(dataset, right.block);
+    ExcessResult left_result =
+        excess_min_128(left_bits.data(), left.left, left.right);
+    ExcessResult right_result =
+        excess_min_128(right_bits.data(), right.left, right.right);
+    benchmark::DoNotOptimize(left_result.min_excess);
+    benchmark::DoNotOptimize(left_result.offset);
+    benchmark::DoNotOptimize(right_result.min_excess);
+    benchmark::DoNotOptimize(right_result.offset);
+  }
+
+  set_depth_counters(state, dataset, false);
+  state.counters["diagnostic_ranges"] = static_cast<double>(ranges.size());
+}
+
+void run_bp_boundary_pair_direct(benchmark::State& state) {
+  const std::size_t size = static_cast<std::size_t>(state.range(0));
+  const std::size_t max_width = static_cast<std::size_t>(state.range(1));
+  const DepthDataset dataset = make_depth_dataset(size, max_width);
+  const auto ranges = make_boundary_pairs(dataset);
+  if (ranges.empty()) {
+    state.SkipWithError("no cross-block boundary pairs for this size/width");
+    return;
+  }
+
+  std::size_t query_index = 0;
+  for (auto _ : state) {
+    const auto [left, right] = ranges[query_index++ % ranges.size()];
+    ExcessResult left_result = excess_min_128(
+        bp_block_bits_direct(dataset, left.block), left.left, left.right);
+    ExcessResult right_result = excess_min_128(
+        bp_block_bits_direct(dataset, right.block), right.left, right.right);
+    benchmark::DoNotOptimize(left_result.min_excess);
+    benchmark::DoNotOptimize(left_result.offset);
+    benchmark::DoNotOptimize(right_result.min_excess);
+    benchmark::DoNotOptimize(right_result.offset);
+  }
+
+  set_depth_counters(state, dataset, false);
+  state.counters["diagnostic_ranges"] = static_cast<double>(ranges.size());
+}
+
+void run_bp_boundary_pair_disjoint_direct(benchmark::State& state) {
+  const std::size_t size = static_cast<std::size_t>(state.range(0));
+  const std::size_t max_width = static_cast<std::size_t>(state.range(1));
+  const DepthDataset dataset = make_depth_dataset(size, max_width);
+  const auto ranges = make_disjoint_boundary_pairs(dataset);
+  if (ranges.empty()) {
+    state.SkipWithError("no disjoint cross-block boundary pairs");
+    return;
+  }
+
+  std::size_t query_index = 0;
+  for (auto _ : state) {
+    const auto [left, right] = ranges[query_index++ % ranges.size()];
+    ExcessResult left_result = excess_min_128(
+        bp_block_bits_direct(dataset, left.block), left.left, left.right);
+    ExcessResult right_result = excess_min_128(
+        bp_block_bits_direct(dataset, right.block), right.left, right.right);
+    benchmark::DoNotOptimize(left_result.min_excess);
+    benchmark::DoNotOptimize(left_result.offset);
+    benchmark::DoNotOptimize(right_result.min_excess);
+    benchmark::DoNotOptimize(right_result.offset);
+  }
+
+  set_depth_counters(state, dataset, false);
+  state.counters["diagnostic_ranges"] = static_cast<double>(ranges.size());
+}
+
+void run_bp_boundary_pair_fused(benchmark::State& state) {
+  const std::size_t size = static_cast<std::size_t>(state.range(0));
+  const std::size_t max_width = static_cast<std::size_t>(state.range(1));
+  const DepthDataset dataset = make_depth_dataset(size, max_width);
+  const auto ranges = make_disjoint_boundary_pairs(dataset);
+  if (ranges.empty()) {
+    state.SkipWithError("no disjoint cross-block boundary pairs");
+    return;
+  }
+
+  std::size_t query_index = 0;
+  for (auto _ : state) {
+    const auto [left, right] = ranges[query_index++ % ranges.size()];
+    ExcessBoundaryPairResult result = excess_min_128_disjoint_suffix_prefix(
+        bp_block_bits_direct(dataset, left.block), left.left,
+        bp_block_bits_direct(dataset, right.block), right.right);
+    benchmark::DoNotOptimize(result.suffix.min_excess);
+    benchmark::DoNotOptimize(result.suffix.offset);
+    benchmark::DoNotOptimize(result.prefix.min_excess);
+    benchmark::DoNotOptimize(result.prefix.offset);
+  }
+
+  set_depth_counters(state, dataset, false);
+  state.counters["diagnostic_ranges"] = static_cast<double>(ranges.size());
+}
+
+void run_bp_macro_only(benchmark::State& state) {
+  const std::size_t size = static_cast<std::size_t>(state.range(0));
+  const std::size_t max_width = static_cast<std::size_t>(state.range(1));
+  const DepthDataset dataset = make_depth_dataset(size, max_width);
+  const std::vector<MacroRange> ranges = make_macro_ranges(dataset);
+  if (ranges.empty()) {  // bail out before paying for the macro table build
+    state.SkipWithError("no middle-block macro ranges for this size/width");
+    return;
+  }
+  const std::vector<BenchBlockSummary> block_summaries =
+      make_block_summaries(dataset);
+  const pixie::rmq::SparseTable<BenchBlockSummary, BenchBlockSummaryMinLess,
+                                Index>
+      macro_rmq{std::span<const BenchBlockSummary>(block_summaries)};
+
+  std::size_t query_index = 0;
+  for (auto _ : state) {
+    const MacroRange range = ranges[query_index++ % ranges.size()];
+    std::size_t result = macro_rmq.arg_min(range.left, range.right);
+    benchmark::DoNotOptimize(result);
+  }
+
+  set_depth_counters(state, dataset, false);
+  state.counters["diagnostic_ranges"] = static_cast<double>(ranges.size());
+}
+
 template <class Rmq>
 void run_depth_queries(benchmark::State& state) {
   const std::size_t size = static_cast<std::size_t>(state.range(0));
@@ -119,9 +559,7 @@ void run_depth_queries(benchmark::State& state) {
     benchmark::DoNotOptimize(result);
   }
 
-  state.counters["N"] = static_cast<double>(size);
-  state.counters["max_width"] = static_cast<double>(max_width);
-  state.counters["index_bytes"] = static_cast<double>(sizeof(Index));
+  set_depth_counters(state, dataset, true);
 }
 
 void register_benchmarks() {
@@ -172,6 +610,54 @@ void register_benchmarks() {
           ->Args({static_cast<std::int64_t>(size),
                   static_cast<std::int64_t>(width)})
           ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_same_block_boundary",
+                                   [](benchmark::State& state) {
+                                     run_bp_boundary_stack(
+                                         state, make_same_block_ranges);
+                                   })
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_left_boundary",
+                                   [](benchmark::State& state) {
+                                     run_bp_boundary_stack(
+                                         state, make_left_boundary_ranges);
+                                   })
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_right_boundary",
+                                   [](benchmark::State& state) {
+                                     run_bp_boundary_stack(
+                                         state, make_right_boundary_ranges);
+                                   })
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_boundary_pair_stack",
+                                   run_bp_boundary_pair_stack)
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_boundary_pair_direct",
+                                   run_bp_boundary_pair_direct)
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_boundary_pair_disjoint_direct",
+                                   run_bp_boundary_pair_disjoint_direct)
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_boundary_pair_fused",
+                                   run_bp_boundary_pair_fused)
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
+      benchmark::RegisterBenchmark("rmq_bp_diag_macro_only", run_bp_macro_only)
+          ->Args({static_cast<std::int64_t>(size),
+                  static_cast<std::int64_t>(width)})
+          ->Unit(benchmark::kNanosecond);
     }
   }
 }
diff --git a/src/benchmarks/excess_positions_benchmarks.cpp b/src/benchmarks/excess_positions_benchmarks.cpp
index 0e0850f..148ebe6 100644
--- a/src/benchmarks/excess_positions_benchmarks.cpp
+++ b/src/benchmarks/excess_positions_benchmarks.cpp
@@ -63,6 +63,19 @@ static std::vector<std::pair<size_t, size_t>> make_128_ranges(
   return ranges;
 }
 
+static std::vector<std::pair<size_t, size_t>> make_disjoint_boundary_ranges(
+    size_t num_ranges = 4096) {
+  std::mt19937_64 rng(45);
+  std::uniform_int_distribution<size_t> prefix_dist(0, 126);
+  std::vector<std::pair<size_t, size_t>> ranges(num_ranges);
+  for (auto& [suffix_left, prefix_right] : ranges) {
+    prefix_right = prefix_dist(rng);
+    std::uniform_int_distribution<size_t> suffix_dist(prefix_right + 1, 127);
+    suffix_left = suffix_dist(rng);
+  }
+  return ranges;
+}
+
 static std::vector<int> make_512_targets(size_t num_targets = 4096) {
   std::mt19937 rng(44);
   std::uniform_int_distribution<int> target_dist(-128, 128);
@@ -113,6 +126,57 @@ BENCHMARK(BM_ExcessMin128)
     ->Args({63, 64})
     ->Args({17, 17});
 
+static void BM_ExcessMin128BoundaryPairIndependent(benchmark::State& state) {
+  const auto blocks = make_128_blocks();
+  const auto ranges = make_disjoint_boundary_ranges();
+  const size_t num_blocks = blocks.size();
+  const size_t num_ranges = ranges.size();
+
+  size_t idx = 0;
+  for (auto _ : state) {
+    const auto& suffix = blocks[idx % num_blocks];
+    const auto& prefix = blocks[(idx + 1) % num_blocks];
+    const auto [suffix_left, prefix_right] = ranges[idx % num_ranges];
+    ExcessResult suffix_result =
+        excess_min_128(suffix.data(), suffix_left, 127);
+    ExcessResult prefix_result = excess_min_128(prefix.data(), 0, prefix_right);
+    benchmark::DoNotOptimize(suffix_result.min_excess);
+    benchmark::DoNotOptimize(suffix_result.offset);
+    benchmark::DoNotOptimize(prefix_result.min_excess);
+    benchmark::DoNotOptimize(prefix_result.offset);
+    ++idx;
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_ExcessMin128BoundaryPairIndependent);
+
+static void BM_ExcessMin128BoundaryPairFused(benchmark::State& state) {
+  const auto blocks = make_128_blocks();
+  const auto ranges = make_disjoint_boundary_ranges();
+  const size_t num_blocks = blocks.size();
+  const size_t num_ranges = ranges.size();
+
+  size_t idx = 0;
+  for (auto _ : state) {
+    const auto& suffix = blocks[idx % num_blocks];
+    const auto& prefix = blocks[(idx + 1) % num_blocks];
+    const auto [suffix_left, prefix_right] = ranges[idx % num_ranges];
+    ExcessBoundaryPairResult result = excess_min_128_disjoint_suffix_prefix(
+        suffix.data(), suffix_left, prefix.data(), prefix_right);
+    benchmark::DoNotOptimize(result.suffix.min_excess);
+    benchmark::DoNotOptimize(result.suffix.offset);
+    benchmark::DoNotOptimize(result.prefix.min_excess);
+    benchmark::DoNotOptimize(result.prefix.offset);
+    ++idx;
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+BENCHMARK(BM_ExcessMin128BoundaryPairFused);
+
 template <ExcessResult (*Fn)(const uint64_t*, size_t, size_t)>
 static void BM_ExcessMin128Variant(benchmark::State& state) {
   const size_t left = static_cast<size_t>(state.range(0));
diff --git a/src/tests/excess_positions_tests.cpp b/src/tests/excess_positions_tests.cpp
index 62cdf3a..c5c43f7 100644
--- a/src/tests/excess_positions_tests.cpp
+++ b/src/tests/excess_positions_tests.cpp
@@ -162,6 +162,28 @@ static size_t count_matches(const uint64_t* out) {
   return cnt;
 }
 
+static void check_boundary_pair_matches_independent(
+    const std::array<uint64_t, 2>& suffix,
+    size_t suffix_left,
+    const std::array<uint64_t, 2>& prefix,
+    size_t prefix_right) {
+  const ExcessBoundaryPairResult result = excess_min_128_disjoint_suffix_prefix(
+      suffix.data(), suffix_left, prefix.data(), prefix_right);
+  const ExcessResult expected_suffix =
+      excess_min_128(suffix.data(), suffix_left, 127);
+  const ExcessResult expected_prefix =
+      excess_min_128(prefix.data(), 0, prefix_right);
+
+  ASSERT_EQ(result.suffix.min_excess, expected_suffix.min_excess)
+      << "suffix_left=" << suffix_left << " prefix_right=" << prefix_right;
+  ASSERT_EQ(result.suffix.offset, expected_suffix.offset)
+      << "suffix_left=" << suffix_left << " prefix_right=" << prefix_right;
+  ASSERT_EQ(result.prefix.min_excess, expected_prefix.min_excess)
+      << "suffix_left=" << suffix_left << " prefix_right=" << prefix_right;
+  ASSERT_EQ(result.prefix.offset, expected_prefix.offset)
+      << "suffix_left=" << suffix_left << " prefix_right=" << prefix_right;
+}
+
 template <typename Fn>
 static void check_matches_naive(Fn fn,
                                 const char* fn_name,
@@ -345,6 +367,65 @@ TEST(ExcessPositions128, MinMatchesNaiveRandom) {
   }
 }
 
+TEST(ExcessPositions128, DisjointBoundaryPairMatchesIndependentFixedCases) {
+  const std::array<std::array<uint64_t, 2>, 5> cases = {{
+      {0, 0},
+      {UINT64_MAX, UINT64_MAX},
+      {0xAAAAAAAAAAAAAAAAull, 0x5555555555555555ull},
+      {0x0123456789ABCDEFull, 0xFEDCBA9876543210ull},
+      {0x0000FFFF0000FFFFull, 0xFFFF0000FFFF0000ull},
+  }};
+  const std::array<std::pair<size_t, size_t>, 10> ranges = {{
+      {1, 0},
+      {4, 3},
+      {17, 16},
+      {32, 31},
+      {63, 62},
+      {64, 32},
+      {65, 63},
+      {96, 31},
+      {127, 0},
+      {120, 119},
+  }};
+
+  for (const auto& suffix : cases) {
+    for (const auto& prefix : cases) {
+      for (const auto [suffix_left, prefix_right] : ranges) {
+        check_boundary_pair_matches_independent(suffix, suffix_left, prefix,
+                                                prefix_right);
+      }
+    }
+  }
+}
+
+TEST(ExcessPositions128, DisjointBoundaryPairMatchesIndependentRandom) {
+  std::mt19937_64 rng(45);
+  std::uniform_int_distribution<size_t> prefix_dist(0, 126);
+
+  for (int t = 0; t < 1000; ++t) {
+    const std::array<uint64_t, 2> suffix = {rng(), rng()};
+    const std::array<uint64_t, 2> prefix = {rng(), rng()};
+    for (int q = 0; q < 16; ++q) {
+      const size_t prefix_right = prefix_dist(rng);
+      std::uniform_int_distribution<size_t> suffix_dist(prefix_right + 1, 127);
+      const size_t suffix_left = suffix_dist(rng);
+      check_boundary_pair_matches_independent(suffix, suffix_left, prefix,
+                                              prefix_right);
+    }
+  }
+}
+
+TEST(ExcessPositions128, BoundaryPairFallbackMatchesIndependent) {
+  const std::array<uint64_t, 2> suffix = {0x0123456789ABCDEFull,
+                                          0xFEDCBA9876543210ull};
+  const std::array<uint64_t, 2> prefix = {0x0000FFFF0000FFFFull,
+                                          0xFFFF0000FFFF0000ull};
+
+  check_boundary_pair_matches_independent(suffix, 32, prefix, 32);
+  check_boundary_pair_matches_independent(suffix, 0, prefix, 127);
+  check_boundary_pair_matches_independent(suffix, 128, prefix, 0);
+}
+
 TEST(ExcessPositions128Experimental, MinVariantsMatchNaive) {
   const std::array<std::array<uint64_t, 2>, 6> cases = {{
       {0, 0},
diff --git a/src/tests/rmq_tests.cpp b/src/tests/rmq_tests.cpp
index e727353..5f13d8b 100644
--- a/src/tests/rmq_tests.cpp
+++ b/src/tests/rmq_tests.cpp
@@ -166,6 +166,27 @@ TEST(RmqBpPlusMinusOne, CrossBlockTieKeepsFirstPosition) {
                           std::less<std::int64_t>()));
 }
 
+TEST(RmqBpPlusMinusOne, DisjointBoundaryRangeMatchesNaive) {
+  std::vector<std::int64_t> depths(384);
+  for (std::size_t i = 1; i < depths.size(); ++i) {
+    const bool up = (i % 9 == 0) || (i % 9 == 3) || (i % 11 == 0);
+    depths[i] = depths[i - 1] + (up ? 1 : -1);
+  }
+
+  const std::vector<std::uint64_t> bits = pack_depth_deltas(depths);
+  const pixie::rmq::BpPlusMinusOneRmq<> rmq(bits, depths.size());
+  const std::vector<std::pair<std::size_t, std::size_t>> ranges = {
+      {96, 160}, {120, 140}, {127, 129}, {190, 258}, {250, 260},
+  };
+
+  for (const auto [left, right] : ranges) {
+    EXPECT_EQ(rmq.arg_min(left, right),
+              naive_arg_min(std::span<const std::int64_t>(depths), left, right,
+                            std::less<std::int64_t>()))
+        << "range=[" << left << "," << right << "]";
+  }
+}
+
 TEST(RmqBpPlusMinusOne, RejectsTooSmallBitSpan) {
   const std::vector<std::uint64_t> bits;
   EXPECT_THROW((pixie::rmq::BpPlusMinusOneRmq<>(bits, 2)),