@@ -93,7 +93,8 @@ pub fn radix_sort_to_indices(rows: &Rows) -> Vec<u32> {
9393 let n = rows. num_rows ( ) ;
9494 let mut indices: Vec < u32 > = ( 0 ..n as u32 ) . collect ( ) ;
9595 let mut temp = vec ! [ 0u32 ; n] ;
96- msd_radix_sort ( & mut indices, & mut temp, rows, 0 , true ) ;
96+ let mut bytes = vec ! [ 0u8 ; n] ;
97+ msd_radix_sort ( & mut indices, & mut temp, & mut bytes, rows, 0 , true ) ;
9798 indices
9899}
99100
@@ -121,6 +122,7 @@ unsafe fn row_byte(rows: &Rows, idx: u32, byte_pos: usize) -> u8 {
121122fn msd_radix_sort (
122123 src : & mut [ u32 ] ,
123124 dst : & mut [ u32 ] ,
125+ bytes : & mut [ u8 ] ,
124126 rows : & Rows ,
125127 byte_pos : usize ,
126128 result_in_src : bool ,
@@ -155,15 +157,16 @@ fn msd_radix_sort(
155157 return ;
156158 }
157159
158- // Both the histogram and scatter loops read each row's byte via
159- // row_unchecked. Pre-extracting bytes into a contiguous buffer was
160- // tried but benchmarked slower — the extra write pass costs more
161- // than the second read through row offsets already hot in cache.
160+ // Extract bytes and build histogram in one pass. The bytes buffer
161+ // is reused across levels so the scatter loop can read from a flat
162+ // array instead of chasing pointers through Rows a second time.
163+ let bytes = & mut bytes [ ..n ] ;
162164 let mut counts = [ 0u32 ; 256 ] ;
163- for & idx in & * src {
164- // SAFETY: indices contains a permutation of 0..rows.num_rows()
165- let byte = unsafe { row_byte ( rows, idx, byte_pos) } ;
166- counts[ byte as usize ] += 1 ;
165+ for ( i, & idx) in src. iter ( ) . enumerate ( ) {
166+ // SAFETY: every entry of src is a row index in 0..rows.num_rows()
167+ let b = unsafe { row_byte ( rows, idx, byte_pos) } ;
168+ bytes[ i] = b;
169+ counts[ b as usize ] += 1 ;
167170 }
168171
169172 let mut offsets = [ 0u32 ; 257 ] ;
@@ -175,17 +178,16 @@ fn msd_radix_sort(
175178
176179 // No scatter happened — data is still in src, roles unchanged.
177180 if num_buckets == 1 {
178- msd_radix_sort ( src, dst, rows, byte_pos + 1 , result_in_src) ;
181+ msd_radix_sort ( src, dst, bytes , rows, byte_pos + 1 , result_in_src) ;
179182 return ;
180183 }
181184
182- // Scatter src → dst
185+ // Scatter src → dst using the pre-extracted bytes
183186 let mut write_pos = offsets;
184- for & idx in & * src {
185- // SAFETY: indices contains a permutation of 0..rows.num_rows()
186- let byte = unsafe { row_byte ( rows, idx, byte_pos) } as usize ;
187- dst[ write_pos[ byte] as usize ] = idx;
188- write_pos[ byte] += 1 ;
187+ for ( i, & idx) in src. iter ( ) . enumerate ( ) {
188+ let b = bytes[ i] as usize ;
189+ dst[ write_pos[ b] as usize ] = idx;
190+ write_pos[ b] += 1 ;
189191 }
190192
191193 // Recurse with roles swapped: after scatter the data lives in dst,
@@ -199,6 +201,7 @@ fn msd_radix_sort(
199201 msd_radix_sort (
200202 & mut dst[ start..end] ,
201203 & mut src[ start..end] ,
204+ & mut bytes[ start..end] ,
202205 rows,
203206 byte_pos + 1 ,
204207 !result_in_src,
0 commit comments