Skip to content

Commit 44cb7be

Browse files
authored
Inline quintic extension (#197)
* perf(quintic): force-inline quintic_mul + packed Mul impls

  LLVM was not inlining quintic_mul despite #[inline]: the monomorphized body is large enough that LLVM's cost heuristic declined to inline it. Each call site paid ~5 cycles of function-call overhead. With quintic_mul called millions of times per proof, this accumulated to ~2.4% of total runtime.

  Zen 4 (c7a.2xlarge): -2.38% on xmss_leaf_1400sigs, p=0.0, revert-A/B confirmed.

* perf(quintic): force-inline quintic_square, quintic_mul_packed, MulAssign

  Extends the previous commit's inlining pattern to additional multiplication-related functions: quintic_square, all platform-specific quintic_mul_packed variants (AVX-512/AVX2/NEON/fallback), and MulAssign<Self>/MulAssign<QEF>.

  Testing established the I-cache budget boundary for forced inlining on Zen 4: these 9 functions are the optimal set. Inlining more (e.g. Add/Sub/Neg) causes a regression from expanded code size.

  Zen 4 (c7a.2xlarge): additional -1.25% on xmss_leaf_1400sigs, p=0.0, revert-A/B confirmed. Combined with the previous commit: ~-3.6% total.
1 parent e5cd331 commit 44cb7be

File tree

3 files changed

+10
-10
lines changed

3 files changed

+10
-10
lines changed

crates/backend/koala-bear/src/quintic_extension/extension.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ impl<F: TwoAdicField + QuinticExtendable> TwoAdicField for QuinticExtensionField
527527
}
528528

529529
/// Quintic extension field multiplication in F[X]/(X^5 + X^2 - 1).
530-
#[inline]
530+
#[inline(always)]
531531
pub fn quintic_mul<T: Copy + Sub<Output = T>>(
532532
a: &[T; 5],
533533
b: &[T; 5],
@@ -546,7 +546,7 @@ pub fn quintic_mul<T: Copy + Sub<Output = T>>(
546546
]
547547
}
548548

549-
#[inline]
549+
#[inline(always)]
550550
pub(crate) fn quintic_square<F, R>(a: &[R; 5], res: &mut [R; 5])
551551
where
552552
F: Field,

crates/backend/koala-bear/src/quintic_extension/packed_extension.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ where
461461
{
462462
type Output = Self;
463463

464-
#[inline]
464+
#[inline(always)]
465465
fn mul(self, rhs: Self) -> Self {
466466
Self {
467467
value: super::extension::quintic_mul(&self.value, &rhs.value, PF::dot_product::<5>),
@@ -476,7 +476,7 @@ where
476476
{
477477
type Output = Self;
478478

479-
#[inline]
479+
#[inline(always)]
480480
fn mul(self, rhs: QuinticExtensionField<F>) -> Self {
481481
let b: [PF; 5] = rhs.value.map(|x| x.into());
482482
Self {
@@ -516,7 +516,7 @@ where
516516
F: QuinticExtendable,
517517
PF: PackedField<Scalar = F>,
518518
{
519-
#[inline]
519+
#[inline(always)]
520520
fn mul_assign(&mut self, rhs: Self) {
521521
*self = *self * rhs;
522522
}
@@ -527,7 +527,7 @@ where
527527
F: QuinticExtendable,
528528
PF: PackedField<Scalar = F>,
529529
{
530-
#[inline]
530+
#[inline(always)]
531531
fn mul_assign(&mut self, rhs: QuinticExtensionField<F>) {
532532
*self = *self * rhs;
533533
}

crates/backend/koala-bear/src/quintic_extension/packing.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,15 @@ use crate::KoalaBear;
88
all(target_arch = "aarch64", target_feature = "neon"),
99
all(target_arch = "x86_64", target_feature = "avx2",)
1010
)))]
11-
#[inline]
11+
#[inline(always)]
1212
pub(crate) fn quintic_mul_packed(a: &[KoalaBear; 5], b: &[KoalaBear; 5], res: &mut [KoalaBear; 5]) {
1313
use field::PrimeCharacteristicRing;
1414
*res = super::extension::quintic_mul(a, b, KoalaBear::dot_product::<5>);
1515
}
1616

1717
#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
1818
/// Multiplication in the quintic extension field KoalaBear[X]/(X^5 + X^2 - 1).
19-
#[inline]
19+
#[inline(always)]
2020
pub(crate) fn quintic_mul_packed(a: &[KoalaBear; 5], b: &[KoalaBear; 5], res: &mut [KoalaBear; 5]) {
2121
// TODO: This could likely be optimised further with more effort.
2222
// in particular it would benefit from a custom AVX2 implementation.
@@ -73,7 +73,7 @@ pub(crate) fn quintic_mul_packed(a: &[KoalaBear; 5], b: &[KoalaBear; 5], res: &m
7373

7474
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
7575
/// Multiplication in the quintic extension field KoalaBear[X]/(X^5 + X^2 - 1).
76-
#[inline]
76+
#[inline(always)]
7777
pub(crate) fn quintic_mul_packed(a: &[KoalaBear; 5], b: &[KoalaBear; 5], res: &mut [KoalaBear; 5]) {
7878
use crate::{PackedMontyField31AVX512, dot_product_2};
7979
use field::PrimeCharacteristicRing;
@@ -157,7 +157,7 @@ pub(crate) fn quintic_mul_packed(a: &[KoalaBear; 5], b: &[KoalaBear; 5], res: &m
157157

158158
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
159159
/// Multiplication in the quintic extension field KoalaBear[X]/(X^5 + X^2 - 1).
160-
#[inline]
160+
#[inline(always)]
161161
pub(crate) fn quintic_mul_packed(a: &[KoalaBear; 5], b: &[KoalaBear; 5], res: &mut [KoalaBear; 5]) {
162162
// TODO: This could be optimised further with a custom NEON implementation.
163163

0 commit comments

Comments
 (0)