Skip to content

Commit b0ec8e5

Browse files
committed
Use Kahan summation algorithm in the integrator step
1 parent 64a6807 commit b0ec8e5

8 files changed

Lines changed: 97 additions & 81 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.13)
22

33
project(
44
rk4_solver
5-
VERSION 1.1.1
5+
VERSION 1.2.0
66
LANGUAGES CXX
77
)
88

README.md

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,16 +222,15 @@ The benchmark test is a 3rd order linear system compiled using g++ with ```-O3``
222222
223223
| Flags | Loop (million steps per second) | Cumulative Loop (million steps per second) |
224224
| -----------------------------------------------------: | :-----------------------------: | :----------------------------------------: |
225-
| None *(Default)* | 22.7 | 27.5 |
226-
| ```USE_SINGLE_PRECISION``` | 23.4 | 28.7 |
227-
| ```DO_NOT_USE_HEAP``` | 19.7 | 36.1 |
228-
| ```DO_NOT_USE_HEAP``` *and* ```USE_SINGLE_PRECISION``` | 19.7 | 34.1 |
225+
| None *(Default)* | 18.6 | 33.3 |
226+
| ```USE_SINGLE_PRECISION``` | 18.6 | 36.1 |
227+
| ```DO_NOT_USE_HEAP``` | 22.8 | 28.9 |
228+
| ```DO_NOT_USE_HEAP``` *and* ```USE_SINGLE_PRECISION``` | 22.3 | 28.6 |
229229
230230
231231
232232
## 6.1. Discussion
233233
1. Using the ```USE_SINGLE_PRECISION``` flag to use single-precision floats does not affect the performance.
234-
2. Using the ```DO_NOT_USE_HEAP``` flag can negatively affect performance of integration loops without final time.
235-
3. If the problem size can fit the stack size, then using the ```DO_NOT_USE_HEAP``` flag to disable heap allocation can provide a significant performance boost for cumulative integration loops.
236-
234+
2. Using the ```DO_NOT_USE_HEAP``` flag seems to benefit loops without final time, but it hurts cumulative loops. However, what is happening here could be simply due to different memory layouts, so take these results with a grain of salt.
235+
3. If the problem size can fit the stack size, then using the ```DO_NOT_USE_HEAP``` flag to disable heap allocation can provide a performance boost to loops without final time.
237236
**WARNING**: Your stack can easily overflow for large problems with the ```DO_NOT_USE_HEAP``` flag.

benchmarks/cum_loop-benchmark.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ Dynamics dynamics;
3333
int
3434
main()
3535
{
36-
Real_T t_arr[t_dim];
37-
Real_T x_arr[t_dim * x_dim];
36+
Real_T(&t_arr)[t_dim] = *(Real_T(*)[t_dim]) new Real_T[t_dim];
37+
Real_T(&x_arr)[t_dim * x_dim] = *(Real_T(*)[t_dim * x_dim]) new Real_T[t_dim * x_dim];
38+
3839
printf("Cumulatively integrating 3rd order linear ODE for %.3g steps... ",
3940
static_cast<Real_T>(t_dim));
4041

@@ -48,7 +49,8 @@ main()
4849

4950
const Real_T(&x_final)[x_dim] = *matrix_op::select_row<t_dim, x_dim>(t_dim - 1, x_arr);
5051

51-
printf("Done.\nx at t = %.3g s: [%.3g; %.3g; %.3g]\n", t_arr[t_dim - 1], x_final[0], x_final[1], x_final[2]);
52+
printf("Done.\nx at t = %.3g s: [%.3g; %.3g; %.3g]\n", t_arr[t_dim - 1], x_final[0],
53+
x_final[1], x_final[2]);
5254
printf("Score: %.3g steps per second (%g ms)\n",
5355
static_cast<Real_T>(t_dim) / since_sample_ns.count() * 1e9,
5456
static_cast<Real_T>(since_sample_ns.count()) / 1e6);

benchmarks/step-benchmark.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,10 @@ main()
4343
auto now_tp = std::chrono::high_resolution_clock::now();
4444
auto since_sample = sample_tp - now_tp;
4545

46+
rk4_solver::Integrator<x_dim, Dynamics> rk4;
47+
4648
while (true) {
47-
rk4_solver::step(dynamics, &Dynamics::ode_fun, t, x, time_step, 0, x);
49+
rk4.step(dynamics, &Dynamics::ode_fun, t, x, time_step, 0, x);
4850
t = t + time_step;
4951

5052
++step_counter;

examples/step-example.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ main()
2828
{
2929
Real_T x_next[x_dim];
3030
//* integration step
31-
rk4_solver::step(dyn, &Dynamics::ode_fun, t, x, h, i, x_next);
31+
rk4_solver::Integrator<x_dim, Dynamics> rk4;
32+
rk4.step(dyn, &Dynamics::ode_fun, t, x, h, i, x_next);
3233

3334
return 0;
3435
}

include/rk4_solver/cum_loop.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,10 @@ cum_loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t0, const Real_T (&x0)
6767
t_arr[0] = t;
6868
matrix_op::replace_row<T_DIM>(0, x, x_arr);
6969

70+
rk4_solver::Integrator<X_DIM, T> integrator;
71+
7072
for (size_t i = 0; i < T_DIM - 1; ++i) {
71-
step<X_DIM, T>(obj, ode_fun, t, x, h, i, x); //* update x to the next x
73+
integrator.step(obj, ode_fun, t, x, h, i, x); //* update x to the next x
7274

7375
t = t0 + (i + 1) * h; //* update t to the next t
7476

@@ -117,9 +119,10 @@ cum_loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, EventFun_T<X_DIM, T> event_fun, con
117119
t_arr[0] = t;
118120

119121
matrix_op::replace_row<T_DIM>(0, x, x_arr);
122+
rk4_solver::Integrator<X_DIM, T> integrator;
120123

121124
for (; !stop_flag && i < T_DIM - 1; ++i) {
122-
step(obj, ode_fun, t, x, h, i, x); //* update x to the next x
125+
integrator.step(obj, ode_fun, t, x, h, i, x); //* update x to the next x
123126

124127
t = t0 + (i + 1) * h; //* update t to the next t
125128

include/rk4_solver/loop.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,10 @@ loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t0, const Real_T (&x0)[X_D
5656
matrix_op::replace_row<1>(0, x0, x); //* initialize x
5757
*t = t0; //* initialize t
5858

59+
rk4_solver::Integrator<X_DIM, T> integrator;
60+
5961
for (size_t i = 0; i < T_DIM - 1; ++i) {
60-
step<X_DIM, T>(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
62+
integrator.step(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
6163

6264
*t = t0 + (i + 1) * h; //* update t to the next t
6365
}
@@ -92,9 +94,10 @@ loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, EventFun_T<X_DIM, T> event_fun, const R
9294

9395
//* check for events at the initial condition
9496
bool stop_flag = (obj.*event_fun)(*t, x, i, x);
97+
rk4_solver::Integrator<X_DIM, T> integrator;
9598

9699
for (; !stop_flag && i < T_DIM - 1; ++i) {
97-
step(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
100+
integrator.step(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
98101

99102
*t = t0 + (i + 1) * h; //* update t to the next t
100103

include/rk4_solver/step.hpp

Lines changed: 70 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -32,79 +32,85 @@
3232

3333
namespace rk4_solver
3434
{
35-
/*
36-
* Computes the next Runge-Kutta 4th Order step.
37-
* `ode_fun` can be parametrized using the time (row) index `i`.
38-
*
39-
* `step<OPT: X_DIM, T>(obj, ode_fun, t, x, h, i, OUT:x_next)`
40-
*
41-
* 1. `obj`: dynamics object (type `T`)
42-
* 2. `ode_fun`: ode function, member of `obj` (type `T::*`)
43-
* 3. `t`: time [s]
44-
* 4. `x`: state
45-
* 5. `h`: time step [s]
46-
* 6. `i`: time index corresponding to `t`
47-
*
48-
* OUT:
49-
* 7. `x_next`: next state
50-
*/
51-
template <size_t X_DIM, typename T>
52-
void
53-
step(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t, const Real_T (&x)[X_DIM], const Real_T h,
54-
const size_t i, Real_T (&x_next)[X_DIM])
35+
template <size_t X_DIM, typename T> class Integrator
5536
{
56-
constexpr Real_T rk4_weight_0 = 1. / 6.;
57-
constexpr Real_T rk4_weight_1 = 1. / 3.;
58-
#ifdef DO_NOT_USE_HEAP
59-
static Real_T k_0[X_DIM];
60-
static Real_T k_1[X_DIM];
61-
static Real_T k_2[X_DIM];
62-
static Real_T k_3[X_DIM];
63-
static Real_T x_temp[X_DIM];
64-
#else
37+
public:
38+
Integrator()
39+
{
40+
for (size_t i = 0; i < X_DIM; ++i) {
41+
accumulator[i] = 0;
42+
}
43+
}
44+
6545
/*
66-
* `..._ptr`s are of type `Real_T(*)[X_DIM]`.
67-
* They point to `Real_T[X_DIM]`s which are allocated on the heap.
68-
* Dereferencing them gives us rvalue references to `Real_T[X_DIM]`s,
69-
* which can be substituted for `Real_T[X_DIM]`s allocated on the stack.
70-
* (Maybe typedef should be used more.)
46+
* Computes the next Runge-Kutta 4th Order step.
47+
* `ode_fun` can be parametrized using the time (row) index `i`.
48+
*
49+
* `step<OPT: X_DIM, T>(obj, ode_fun, t, x, h, i, OUT:x_next)`
50+
*
51+
* 1. `obj`: dynamics object (type `T`)
52+
* 2. `ode_fun`: ode function, member of `obj` (type `T::*`)
53+
* 3. `t`: time [s]
54+
* 4. `x`: state
55+
* 5. `h`: time step [s]
56+
* 6. `i`: time index corresponding to `t`
57+
*
58+
* OUT:
59+
* 7. `x_next`: next state
7160
*/
72-
static Real_T(*k_0_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
73-
static Real_T(*k_1_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
74-
static Real_T(*k_2_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
75-
static Real_T(*k_3_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
76-
static Real_T(*x_temp_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
77-
Real_T(*dx_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM]{}; //* not static, needs to be zeroed before every loop
78-
Real_T(&k_0)[X_DIM] = *k_0_ptr;
79-
Real_T(&k_1)[X_DIM] = *k_1_ptr;
80-
Real_T(&k_2)[X_DIM] = *k_2_ptr;
81-
Real_T(&k_3)[X_DIM] = *k_3_ptr;
82-
Real_T(&x_temp)[X_DIM] = *x_temp_ptr;
83-
Real_T(&dx)[X_DIM] = *dx_ptr;
84-
#endif
61+
void
62+
step(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t, const Real_T (&x)[X_DIM],
63+
const Real_T h, const size_t i, Real_T (&x_next)[X_DIM])
64+
{
65+
(obj.*ode_fun)(t, x, i, k_0); //* ode_fun(ti, xi)
8566

86-
(obj.*ode_fun)(t, x, i, k_0); //* ode_fun(ti, xi)
67+
//* zero-order hold, i.e. no ODE_FUN(,, i+.5), ODE_FUN(,, i+1,) etc.
68+
matrix_op::weighted_sum(h / 2, k_0, 1., x, x_temp);
69+
(obj.*ode_fun)(t + h / 2, x_temp, i, k_1); //* ode_fun(ti + h/2, xi + h/2*k_0)
8770

88-
//* zero-order hold, i.e. no ODE_FUN(,, i+.5), ODE_FUN(,, i+1,) etc.
89-
matrix_op::weighted_sum(h / 2, k_0, 1., x, x_temp);
90-
(obj.*ode_fun)(t + h / 2, x_temp, i, k_1); //* ode_fun(ti + h/2, xi + h/2*k_0)
71+
matrix_op::weighted_sum(h / 2, k_1, 1., x, x_temp);
72+
(obj.*ode_fun)(t + h / 2, x_temp, i, k_2); //* ode_fun(ti + h/2, xi + h/2*k_1)
9173

92-
matrix_op::weighted_sum(h / 2, k_1, 1., x, x_temp);
93-
(obj.*ode_fun)(t + h / 2, x_temp, i, k_2); //* ode_fun(ti + h/2, xi + h/2*k_1)
74+
matrix_op::weighted_sum(h, k_2, 1., x, x_temp);
75+
(obj.*ode_fun)(t + h, x_temp, i, k_3); //* ode_fun(ti + h, xi + k_2)
9476

95-
matrix_op::weighted_sum(h, k_2, 1., x, x_temp);
96-
(obj.*ode_fun)(t + h, x_temp, i, k_3); //* ode_fun(ti + h, xi + k_2)
77+
constexpr Real_T w0 = 1. / 6.;
78+
constexpr Real_T w1 = 1. / 3.;
9779

98-
//* compensated summation (Kahan summation), probably ffast-math would break it
99-
for (size_t i = 0; i < X_DIM; ++i) {
100-
dx[i] += h *
101-
(rk4_weight_0 * k_0[i] + rk4_weight_1 * k_1[i] + rk4_weight_1 * k_2[i] +
102-
rk4_weight_0 * k_3[i]); //* dx accumulates floating point errors
103-
x_temp[i] = x[i]; //* stores x when x_next is pointing to x's address
104-
x_next[i] = x[i] + dx[i]; //* uncompensated summation
105-
dx[i] -= (x_next[i] - x_temp[i]); //* removes the uncompensated summation from the accumulated error
80+
for (size_t i = 0; i < X_DIM; ++i) {
81+
dx = h * (w0 * k_0[i] + w1 * k_1[i] + w1 * k_2[i] + w0 * k_3[i]);
82+
//* compensated (Kahan) summation, ffast-math might break this
83+
compensated_dx = dx - accumulator[i];
84+
x_temp[i] = x[i] + compensated_dx;
85+
accumulator[i] = (x_temp[i] - x[i]) - compensated_dx;
86+
x_next[i] = x_temp[i];
87+
}
10688
}
107-
}
89+
90+
private:
91+
Real_T dx;
92+
Real_T compensated_dx;
93+
94+
#ifdef DO_NOT_USE_HEAP
95+
Real_T k_0[X_DIM];
96+
Real_T k_1[X_DIM];
97+
Real_T k_2[X_DIM];
98+
Real_T k_3[X_DIM];
99+
Real_T x_temp[X_DIM];
100+
Real_T accumulator[X_DIM];
101+
#else
102+
/*
103+
* Dereferencing pointers that point to `Real_T[X_DIM]`s which are allocated on the heap, in
104+
* order to get rvalue references to the `Real_T[X_DIM]`s.
105+
*/
106+
Real_T (&k_0)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
107+
Real_T (&k_1)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
108+
Real_T (&k_2)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
109+
Real_T (&k_3)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
110+
Real_T (&x_temp)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
111+
Real_T (&accumulator)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
112+
#endif
113+
};
108114
} // namespace rk4_solver
109115

110116
#endif

0 commit comments

Comments
 (0)