Skip to content

Commit f11e97a

Browse files
committed
perf(parquet): vectorize amd64 SSE4/AVX2 bool unpacking for ~9x throughput
Replace the scalar bit-by-bit implementations of _bytes_to_bools_sse4 and _bytes_to_bools_avx2 with actual SIMD vectorized code. The previous implementations were auto-generated by c2goasm from clang output that failed to auto-vectorize, resulting in purely scalar code (movzx/shr/and/mov, one bit at a time) despite being labeled as SSE4 and AVX2.

SSE4: uses PSHUFB to broadcast 2 input bytes into 16 XMM lanes, then PAND+PCMPEQB for a parallel bit-test and PAND to normalize to 0/1. Processes 2 bytes (16 bools) per iteration.

AVX2: uses VPBROADCASTD+VPSHUFB to broadcast 4 input bytes into 32 YMM lanes, then VPAND+VPCMPEQB+VPAND for a parallel bit-test and normalize. Processes 4 bytes (32 bools) per iteration. Includes VZEROUPPER to avoid SSE-AVX transition penalties.

Both include scalar tails for edge cases with fewer output slots than one vector width.

Benchmarks on AMD Ryzen 7 7800X3D (linux/amd64):
BytesToBools/64B 146.0ns -> 15.60ns (9.4x, 418->3913 MiB/s)
BytesToBools/256B 562.3ns -> 63.36ns (8.9x, 434->3853 MiB/s)
BytesToBools/1KB 2247ns -> 253.9ns (8.8x, 435->3846 MiB/s)
BytesToBools/4KB 8970ns -> 1018ns (8.8x, 436->3838 MiB/s)
BytesToBools/16KB 35798ns -> 4044ns (8.9x, 437->3864 MiB/s)
geomean: -88.8% latency, +795% throughput
1 parent 25f2367 commit f11e97a

File tree

6 files changed

+341
-165
lines changed

6 files changed

+341
-165
lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ jobs:
330330
name: TinyGo
331331
runs-on: ubuntu-latest
332332
env:
333-
TINYGO_VERSION: 0.38.0
333+
TINYGO_VERSION: 0.40.1
334334
timeout-minutes: 20
335335
steps:
336336
- name: Checkout

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
module github.com/apache/arrow-go/v18
1818

19-
go 1.24.0
19+
go 1.25.0
2020

2121
require (
2222
github.com/andybalholm/brotli v1.2.0

parquet/internal/utils/unpack_bool.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,11 @@ package utils
2020
// bytesToBoolsGo is the portable fallback that expands packed bits into bools,
// least-significant bit first: out[8*i+j] reports whether bit j of in[i] is set.
//
// Conversion stops when either in is exhausted or out is full. If out has
// fewer than 8*len(in) slots, the last byte that still has room is expanded
// only partially; slots of out beyond 8*len(in) are left untouched.
func bytesToBoolsGo(in []byte, out []bool) {
	// Number of input bytes whose 8 bits all fit in out. Establishing this
	// once removes the per-bit bounds check from the hot loop.
	full := len(out) / 8
	if full > len(in) {
		full = len(in)
	}

	for i, b := range in[:full] {
		dst := out[8*i : 8*i+8]
		for j := range dst {
			dst[j] = b&(1<<j) != 0
		}
	}

	// If input remains, out has fewer than 8 free slots left: expand as many
	// low bits of the next byte as fit.
	if full < len(in) {
		b := in[full]
		tail := out[8*full:]
		for j := range tail {
			tail[j] = b&(1<<j) != 0
		}
	}
}
Lines changed: 120 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,88 +1,127 @@
11
//+build !noasm !appengine
2-
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
32

4-
TEXT ·_bytes_to_bools_avx2(SB), $0-32
3+
// AVX2 vectorized bytes-to-bools using VPBROADCASTD + VPSHUFB + VPCMPEQB.
4+
// Processes 4 input bytes → 32 output bools per vector iteration.
5+
// Replaces the original c2goasm-generated scalar code which used zero SIMD.
6+
7+
#include "textflag.h"
8+
9+
// shuffle_avx2 is the VPSHUFB index table applied after VPBROADCASTD has
// replicated the 4 input bytes into every dword of the register.
// VPSHUFB operates on two independent 128-bit lanes, so each index selects
// a byte from within its own half of the register.
10+
// Lower lane: byte 0 → lanes 0-7, byte 1 → lanes 8-15
11+
// Upper lane: byte 2 → lanes 16-23, byte 3 → lanes 24-31
12+
DATA shuffle_avx2<>+0x00(SB)/8, $0x0000000000000000
13+
DATA shuffle_avx2<>+0x08(SB)/8, $0x0101010101010101
14+
DATA shuffle_avx2<>+0x10(SB)/8, $0x0202020202020202
15+
DATA shuffle_avx2<>+0x18(SB)/8, $0x0303030303030303
16+
GLOBL shuffle_avx2<>(SB), (NOPTR+RODATA), $32
17+
18+
// bitmask_avx2 holds one single-bit mask per output lane:
// [1, 2, 4, 8, 16, 32, 64, 128] × 4
// (the little-endian bytes of 0x8040201008040201), so output lane k tests
// bit k%8 of the input byte that was shuffled into it.
19+
DATA bitmask_avx2<>+0x00(SB)/8, $0x8040201008040201
20+
DATA bitmask_avx2<>+0x08(SB)/8, $0x8040201008040201
21+
DATA bitmask_avx2<>+0x10(SB)/8, $0x8040201008040201
22+
DATA bitmask_avx2<>+0x18(SB)/8, $0x8040201008040201
23+
GLOBL bitmask_avx2<>(SB), (NOPTR+RODATA), $32
24+
25+
// ones_avx2 is [1, 1, 1, ...] × 32.
// ANDing the 0xFF/0x00 result of VPCMPEQB with it normalizes every lane to
// 0x01/0x00 before the lanes are stored to the output.
26+
DATA ones_avx2<>+0x00(SB)/8, $0x0101010101010101
27+
DATA ones_avx2<>+0x08(SB)/8, $0x0101010101010101
28+
DATA ones_avx2<>+0x10(SB)/8, $0x0101010101010101
29+
DATA ones_avx2<>+0x18(SB)/8, $0x0101010101010101
30+
GLOBL ones_avx2<>(SB), (NOPTR+RODATA), $32
31+
32+
// _bytes_to_bools_avx2 expands packed bits into one byte per bit (0x00 or
// 0x01), least-significant bit first.
//
// Arguments (frame $0-32, four 8-byte words):
//   in+0(FP)      pointer to the packed input bytes
//   len+8(FP)     number of input bytes
//   out+16(FP)    pointer to the output bytes
//   outlen+24(FP) number of output slots; no store goes past this bound
//
// Registers: DI=in, SI=len, DX=out, R13=outlen, R8=input index,
// R9=output index, Y3/Y4/Y5=shuffle/bitmask/ones constants.
//
// Every vector instruction is VEX-encoded. Legacy-SSE MOVD/MOVQ inside the
// loops would execute while the upper YMM halves are dirty and trigger
// SSE/AVX transition penalties (or false dependencies) on Intel cores.
TEXT ·_bytes_to_bools_avx2(SB), NOSPLIT, $0-32

	MOVQ in+0(FP), DI
	MOVQ len+8(FP), SI
	MOVQ out+16(FP), DX
	MOVQ outlen+24(FP), R13

	// len is compared as a 64-bit quantity everywhere below, so the
	// empty/negative check must be 64-bit as well. A 32-bit TESTL would
	// bail out for lengths whose low 32 bits are zero or have bit 31 set.
	TESTQ SI, SI
	JLE   done

	VMOVDQU shuffle_avx2<>(SB), Y3
	VMOVDQU bitmask_avx2<>(SB), Y4
	VMOVDQU ones_avx2<>(SB), Y5

	XORQ R8, R8
	XORQ R9, R9

// Main loop: 4 input bytes -> 32 output bytes per iteration. Runs only while
// at least 4 input bytes and 32 output slots remain.
loop32:
	MOVQ SI, AX
	SUBQ R8, AX
	CMPQ AX, $4
	JL   loop8

	MOVQ R13, AX
	SUBQ R9, AX
	CMPQ AX, $32
	JL   loop8

	// Broadcast the 4 input bytes straight from memory into every dword,
	// then spread byte k of the group across output lanes 8k..8k+7.
	VPBROADCASTD (DI)(R8*1), Y0
	VPSHUFB      Y3, Y0, Y0
	VPAND        Y4, Y0, Y1      // isolate the bit each lane is responsible for
	VPCMPEQB     Y4, Y1, Y1      // 0xFF where that bit is set, 0x00 otherwise
	VPAND        Y5, Y1, Y1      // normalize 0xFF -> 0x01
	VMOVDQU      Y1, (DX)(R9*1)

	ADDQ $4, R8
	ADDQ $32, R9
	JMP  loop32

// Byte loop: 1 input byte -> 8 output bytes while at least 8 slots remain.
loop8:
	CMPQ R8, SI
	JGE  avx_done

	MOVQ R13, AX
	SUBQ R9, AX
	CMPQ AX, $8
	JL   scalar

	// Every lane holds the same input byte, so no shuffle is needed; only
	// the low 8 lanes are stored. The 128-bit VEX forms suffice here.
	VPBROADCASTB (DI)(R8*1), X0
	VPAND        X4, X0, X1
	VPCMPEQB     X4, X1, X1
	VPAND        X5, X1, X1
	VMOVQ        X1, (DX)(R9*1)

	ADDQ $1, R8
	ADDQ $8, R9
	JMP  loop8

// Scalar tail: fewer than 8 output slots remain; emit one bit at a time and
// stop as soon as the output is full.
scalar:
	CMPQ R8, SI
	JGE  avx_done
	CMPQ R9, R13
	JGE  avx_done

	MOVBLZX (DI)(R8*1), AX
	XORQ    CX, CX              // CX = bit index within the current byte

scalar_bit:
	CMPQ CX, $8
	JGE  scalar_next
	CMPQ R9, R13
	JGE  avx_done

	MOVL AX, R11
	SHRL CL, R11
	ANDL $1, R11
	MOVB R11, (DX)(R9*1)

	INCQ CX
	INCQ R9
	JMP  scalar_bit

scalar_next:
	INCQ R8
	JMP  scalar

// Clear the upper YMM halves so later legacy-SSE code in the process does not
// pay a transition penalty. The early-exit path skips this because it has not
// touched any YMM register.
avx_done:
	VZEROUPPER

done:
	RET
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
17+
package utils_test
18+
19+
import (
20+
"fmt"
21+
"math/rand"
22+
"testing"
23+
24+
"github.com/apache/arrow-go/v18/parquet/internal/utils"
25+
)
26+
27+
func BenchmarkBytesToBools(b *testing.B) {
28+
for _, nBytes := range []int{64, 256, 1024, 4096, 16384} {
29+
in := make([]byte, nBytes)
30+
rng := rand.New(rand.NewSource(42))
31+
for i := range in {
32+
in[i] = byte(rng.Intn(256))
33+
}
34+
out := make([]bool, nBytes*8)
35+
36+
b.Run(fmt.Sprintf("bytes=%d", nBytes), func(b *testing.B) {
37+
b.SetBytes(int64(nBytes))
38+
for i := 0; i < b.N; i++ {
39+
utils.BytesToBools(in, out)
40+
}
41+
})
42+
}
43+
}
44+
45+
func TestBytesToBoolsCorrectness(t *testing.T) {
46+
rng := rand.New(rand.NewSource(12345))
47+
48+
for _, nBytes := range []int{1, 2, 3, 7, 8, 15, 16, 31, 32, 63, 64, 100, 256, 1024} {
49+
t.Run(fmt.Sprintf("bytes=%d", nBytes), func(t *testing.T) {
50+
in := make([]byte, nBytes)
51+
for i := range in {
52+
in[i] = byte(rng.Intn(256))
53+
}
54+
55+
outlen := nBytes * 8
56+
got := make([]bool, outlen)
57+
want := make([]bool, outlen)
58+
59+
for i, b := range in {
60+
for j := 0; j < 8; j++ {
61+
want[8*i+j] = (b & (1 << j)) != 0
62+
}
63+
}
64+
65+
utils.BytesToBools(in, got)
66+
67+
for i := 0; i < outlen; i++ {
68+
if got[i] != want[i] {
69+
byteIdx := i / 8
70+
bitIdx := i % 8
71+
t.Fatalf("mismatch at index %d (byte %d, bit %d): got %v, want %v (input byte = 0x%02x)",
72+
i, byteIdx, bitIdx, got[i], want[i], in[byteIdx])
73+
}
74+
}
75+
})
76+
}
77+
}
78+
79+
func TestBytesToBoolsOutlenSmaller(t *testing.T) {
80+
in := []byte{0xFF, 0xAA, 0x55}
81+
for outlen := 1; outlen <= 24; outlen++ {
82+
t.Run(fmt.Sprintf("outlen=%d", outlen), func(t *testing.T) {
83+
got := make([]bool, outlen)
84+
want := make([]bool, outlen)
85+
86+
for i, b := range in {
87+
for j := 0; j < 8; j++ {
88+
idx := 8*i + j
89+
if idx >= outlen {
90+
break
91+
}
92+
want[idx] = (b & (1 << j)) != 0
93+
}
94+
}
95+
96+
utils.BytesToBools(in, got)
97+
98+
for i := 0; i < outlen; i++ {
99+
if got[i] != want[i] {
100+
t.Fatalf("outlen=%d: mismatch at index %d: got %v, want %v", outlen, i, got[i], want[i])
101+
}
102+
}
103+
})
104+
}
105+
}

0 commit comments

Comments
 (0)