Use Kahan summation algorithm in the integrator step

cinaral · cinaral · commit b0ec8e5b2caf · 2023-03-15T18:22:55.000-05:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.13)
 
 project(
 	rk4_solver
-	VERSION 1.1.1
+	VERSION 1.2.0
 	LANGUAGES CXX
 )
 
diff --git a/README.md b/README.md
@@ -222,16 +222,15 @@ The benchmark test is a 3rd order linear system compiled using g++ with ```-O3``
 
 |                                                  Flags | Loop (million steps per second) | Cumulative Loop (million steps per second) |
 | -----------------------------------------------------: | :-----------------------------: | :----------------------------------------: |
-|                                       None *(Default)* |              22.7               |                    27.5                    |
-|                             ```USE_SINGLE_PRECISION``` |              23.4               |                    28.7                    |
-|                                  ```DO_NOT_USE_HEAP``` |              19.7               |                    36.1                    |
-| ```DO_NOT_USE_HEAP``` *and* ```USE_SINGLE_PRECISION``` |              19.7               |                    34.1                    |
+|                                       None *(Default)* |              18.6               |                    33.3                    |
+|                             ```USE_SINGLE_PRECISION``` |              18.6               |                    36.1                    |
+|                                  ```DO_NOT_USE_HEAP``` |              22.8               |                    28.9                    |
+| ```DO_NOT_USE_HEAP``` *and* ```USE_SINGLE_PRECISION``` |              22.3               |                    28.6                    |
 
 
 
 ## 6.1. Discussion
 1. Using the ```USE_SINGLE_PRECISION``` flag to use single-precision floats does not affect the performance.
-2. Using the ```DO_NOT_USE_HEAP``` flag can negatively affect performance of integration loops without final time.
-3. If the problem size can fit the stack size, then using the ```DO_NOT_USE_HEAP``` flag to disable heap allocation can provide a significant performance boost for cumulative integration loops. 
-
+2. Using the ```DO_NOT_USE_HEAP``` flag seems to benefit loops without final time, but it hurts cumulative loops. However, this could simply be due to different memory layouts, so take these results with a grain of salt.
+3. If the problem size can fit the stack size, then using the ```DO_NOT_USE_HEAP``` flag to disable heap allocation can provide a performance boost to loops without final time.
 **WARNING**: Your stack can easily overflow for large problems with the ```DO_NOT_USE_HEAP``` flag.
diff --git a/benchmarks/cum_loop-benchmark.cpp b/benchmarks/cum_loop-benchmark.cpp
@@ -33,8 +33,9 @@ Dynamics dynamics;
 int
 main()
 {
-	Real_T t_arr[t_dim];
-	Real_T x_arr[t_dim * x_dim];
+	Real_T(&t_arr)[t_dim] = *(Real_T(*)[t_dim]) new Real_T[t_dim];
+	Real_T(&x_arr)[t_dim * x_dim] = *(Real_T(*)[t_dim * x_dim]) new Real_T[t_dim * x_dim];
+
 	printf("Cumulatively integrating 3rd order linear ODE for %.3g steps... ",
 	       static_cast<Real_T>(t_dim));
 
@@ -48,7 +49,8 @@ main()
 
 	const Real_T(&x_final)[x_dim] = *matrix_op::select_row<t_dim, x_dim>(t_dim - 1, x_arr);
 
-	printf("Done.\nx at t = %.3g s: [%.3g; %.3g; %.3g]\n", t_arr[t_dim - 1], x_final[0], x_final[1], x_final[2]);
+	printf("Done.\nx at t = %.3g s: [%.3g; %.3g; %.3g]\n", t_arr[t_dim - 1], x_final[0],
+	       x_final[1], x_final[2]);
 	printf("Score: %.3g steps per second (%g ms)\n",
 	       static_cast<Real_T>(t_dim) / since_sample_ns.count() * 1e9,
 	       static_cast<Real_T>(since_sample_ns.count()) / 1e6);
diff --git a/benchmarks/step-benchmark.cpp b/benchmarks/step-benchmark.cpp
@@ -43,8 +43,10 @@ main()
 	auto now_tp = std::chrono::high_resolution_clock::now();
 	auto since_sample = sample_tp - now_tp;
 
+	rk4_solver::Integrator<x_dim, Dynamics> rk4;
+
 	while (true) {
-		rk4_solver::step(dynamics, &Dynamics::ode_fun, t, x, time_step, 0, x);
+		rk4.step(dynamics, &Dynamics::ode_fun, t, x, time_step, 0, x);
 		t = t + time_step;
 
 		++step_counter;
diff --git a/examples/step-example.cpp b/examples/step-example.cpp
@@ -28,7 +28,8 @@ main()
 {
 	Real_T x_next[x_dim];
 	//* integration step
-	rk4_solver::step(dyn, &Dynamics::ode_fun, t, x, h, i, x_next);
+	rk4_solver::Integrator<x_dim, Dynamics> rk4;
+	rk4.step(dyn, &Dynamics::ode_fun, t, x, h, i, x_next);
 	
 	return 0;
 }
diff --git a/include/rk4_solver/cum_loop.hpp b/include/rk4_solver/cum_loop.hpp
@@ -67,8 +67,10 @@ cum_loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t0, const Real_T (&x0)
 	t_arr[0] = t;
 	matrix_op::replace_row<T_DIM>(0, x, x_arr);
 
+	rk4_solver::Integrator<X_DIM, T> integrator;
+
 	for (size_t i = 0; i < T_DIM - 1; ++i) {
-		step<X_DIM, T>(obj, ode_fun, t, x, h, i, x); //* update x to the next x
+		integrator.step(obj, ode_fun, t, x, h, i, x); //* update x to the next x
 
 		t = t0 + (i + 1) * h; //* update t to the next t
 
@@ -117,9 +119,10 @@ cum_loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, EventFun_T<X_DIM, T> event_fun, con
 	t_arr[0] = t;
 
 	matrix_op::replace_row<T_DIM>(0, x, x_arr);
+	rk4_solver::Integrator<X_DIM, T> integrator;
 
 	for (; !stop_flag && i < T_DIM - 1; ++i) {
-		step(obj, ode_fun, t, x, h, i, x); //* update x to the next x
+		integrator.step(obj, ode_fun, t, x, h, i, x); //* update x to the next x
 
 		t = t0 + (i + 1) * h; //* update t to the next t
 
diff --git a/include/rk4_solver/loop.hpp b/include/rk4_solver/loop.hpp
@@ -56,8 +56,10 @@ loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t0, const Real_T (&x0)[X_D
 	matrix_op::replace_row<1>(0, x0, x); //* initialize x
 	*t = t0;                             //* initialize t
 
+	rk4_solver::Integrator<X_DIM, T> integrator;
+
 	for (size_t i = 0; i < T_DIM - 1; ++i) {
-		step<X_DIM, T>(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
+		integrator.step(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
 
 		*t = t0 + (i + 1) * h; //* update t to the next t
 	}
@@ -92,9 +94,10 @@ loop(T &obj, OdeFun_T<X_DIM, T> ode_fun, EventFun_T<X_DIM, T> event_fun, const R
 
 	//* check for events at the initial condition
 	bool stop_flag = (obj.*event_fun)(*t, x, i, x);
+	rk4_solver::Integrator<X_DIM, T> integrator;
 
 	for (; !stop_flag && i < T_DIM - 1; ++i) {
-		step(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
+		integrator.step(obj, ode_fun, *t, x, h, i, x); //* update x to the next x
 
 		*t = t0 + (i + 1) * h; //* update t to the next t
 
diff --git a/include/rk4_solver/step.hpp b/include/rk4_solver/step.hpp
@@ -32,79 +32,85 @@
 
 namespace rk4_solver
 {
-/*
- * Computes the next Runge-Kutta 4th Order step.
- * `ode_fun` can be parametrized using the time (row) index `i`.
- *
- * `step<OPT: X_DIM, T>(obj, ode_fun, t, x, h, i, OUT:x_next)`
- *
- * 1. `obj`: dynamics object (type `T`)
- * 2. `ode_fun`: ode function, member of `obj` (type `T::*`)
- * 3. `t`: time [s]
- * 4. `x`: state
- * 5. `h`: time step [s]
- * 6. `i`: time index corresponding to `t`
- *
- * OUT:
- *	7. `x_next`: next state
- */
-template <size_t X_DIM, typename T>
-void
-step(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t, const Real_T (&x)[X_DIM], const Real_T h,
-     const size_t i, Real_T (&x_next)[X_DIM])
+template <size_t X_DIM, typename T> class Integrator
 {
-	constexpr Real_T rk4_weight_0 = 1. / 6.;
-	constexpr Real_T rk4_weight_1 = 1. / 3.;
-#ifdef DO_NOT_USE_HEAP
-	static Real_T k_0[X_DIM];
-	static Real_T k_1[X_DIM];
-	static Real_T k_2[X_DIM];
-	static Real_T k_3[X_DIM];
-	static Real_T x_temp[X_DIM];
-#else
+  public:
+	Integrator()
+	{
+		for (size_t i = 0; i < X_DIM; ++i) {
+			accumulator[i] = 0;
+		}
+	}
+
 	/*
-	 * `..._ptr`s are of type `Real_T(*)[X_DIM]`.
-	 * They point to `Real_T[X_DIM]`s which are allocated on the heap.
-	 * Dereferencing them gives us rvalue references to `Real_T[X_DIM]`s,
-	 * which can be substituted for `Real_T[X_DIM]`s allocated on the stack.
-	 * (Maybe typedef should be used more.)
+	 * Computes the next Runge-Kutta 4th Order step.
+	 * `ode_fun` can be parametrized using the time (row) index `i`.
+	 *
+	 * `Integrator<X_DIM, T>::step(obj, ode_fun, t, x, h, i, OUT:x_next)`
+	 *
+	 * 1. `obj`: dynamics object (type `T`)
+	 * 2. `ode_fun`: ode function, member of `obj` (type `T::*`)
+	 * 3. `t`: time [s]
+	 * 4. `x`: state
+	 * 5. `h`: time step [s]
+	 * 6. `i`: time index corresponding to `t`
+	 *
+	 * OUT:
+	 *	7. `x_next`: next state
 	 */
-	static Real_T(*k_0_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
-	static Real_T(*k_1_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
-	static Real_T(*k_2_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
-	static Real_T(*k_3_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
-	static Real_T(*x_temp_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM];
-	Real_T(*dx_ptr)[X_DIM] = (Real_T(*)[X_DIM]) new Real_T[X_DIM]{}; //* not static, needs to be zeroed before every loop
-	Real_T(&k_0)[X_DIM] = *k_0_ptr;
-	Real_T(&k_1)[X_DIM] = *k_1_ptr;
-	Real_T(&k_2)[X_DIM] = *k_2_ptr;
-	Real_T(&k_3)[X_DIM] = *k_3_ptr;
-	Real_T(&x_temp)[X_DIM] = *x_temp_ptr;
-	Real_T(&dx)[X_DIM] = *dx_ptr;
-#endif
+	void
+	step(T &obj, OdeFun_T<X_DIM, T> ode_fun, const Real_T t, const Real_T (&x)[X_DIM],
+	     const Real_T h, const size_t i, Real_T (&x_next)[X_DIM])
+	{
+		(obj.*ode_fun)(t, x, i, k_0); //* ode_fun(ti, xi)
 
-	(obj.*ode_fun)(t, x, i, k_0); //* ode_fun(ti, xi)
+		//* zero-order hold, i.e. no ODE_FUN(,, i+.5), ODE_FUN(,, i+1,) etc.
+		matrix_op::weighted_sum(h / 2, k_0, 1., x, x_temp);
+		(obj.*ode_fun)(t + h / 2, x_temp, i, k_1); //* ode_fun(ti + h/2, xi + h/2*k_0)
 
-	//* zero-order hold, i.e. no ODE_FUN(,, i+.5), ODE_FUN(,, i+1,) etc.
-	matrix_op::weighted_sum(h / 2, k_0, 1., x, x_temp);
-	(obj.*ode_fun)(t + h / 2, x_temp, i, k_1); //* ode_fun(ti + h/2, xi + h/2*k_0)
+		matrix_op::weighted_sum(h / 2, k_1, 1., x, x_temp);
+		(obj.*ode_fun)(t + h / 2, x_temp, i, k_2); //* ode_fun(ti + h/2, xi + h/2*k_1)
 
-	matrix_op::weighted_sum(h / 2, k_1, 1., x, x_temp);
-	(obj.*ode_fun)(t + h / 2, x_temp, i, k_2); //* ode_fun(ti + h/2, xi + h/2*k_1)
+		matrix_op::weighted_sum(h, k_2, 1., x, x_temp);
+		(obj.*ode_fun)(t + h, x_temp, i, k_3); //* ode_fun(ti + h, xi + h*k_2)
 
-	matrix_op::weighted_sum(h, k_2, 1., x, x_temp);
-	(obj.*ode_fun)(t + h, x_temp, i, k_3); //* ode_fun(ti + h, xi + k_2)
+		constexpr Real_T w0 = 1. / 6.;
+		constexpr Real_T w1 = 1. / 3.;
 
-	//* compensated summation (Kahan summation), probably ffast-math would break it
-	for (size_t i = 0; i < X_DIM; ++i) {
-		dx[i] += h *
-		    (rk4_weight_0 * k_0[i] + rk4_weight_1 * k_1[i] + rk4_weight_1 * k_2[i] +
-		     rk4_weight_0 * k_3[i]);      //* dx accumulates floating point errors
-		x_temp[i] = x[i];                 //* stores x when x_next is pointing to x's address
-		x_next[i] = x[i] + dx[i];         //* uncompensated summation
-		dx[i] -= (x_next[i] - x_temp[i]); //* removes the uncompensated summation from the accumulated error
+		for (size_t i = 0; i < X_DIM; ++i) {
+			dx = h * (w0 * k_0[i] + w1 * k_1[i] + w1 * k_2[i] + w0 * k_3[i]);
+			//* compensated (Kahan) summation, ffast-math might break this
+			compensated_dx = dx - accumulator[i];
+			x_temp[i] = x[i] + compensated_dx;
+			accumulator[i] = (x_temp[i] - x[i]) - compensated_dx;
+			x_next[i] = x_temp[i];
+		}
 	}
-}
+
+  private:
+	Real_T dx;
+	Real_T compensated_dx;
+
+#ifdef DO_NOT_USE_HEAP
+	Real_T k_0[X_DIM];
+	Real_T k_1[X_DIM];
+	Real_T k_2[X_DIM];
+	Real_T k_3[X_DIM];
+	Real_T x_temp[X_DIM];
+	Real_T accumulator[X_DIM];
+#else
+	/*
+	 * Dereferencing pointers that point to `Real_T[X_DIM]`s which are allocated on the heap, in
+	 * order to get lvalue references to the `Real_T[X_DIM]`s; the arrays are never deleted.
+	 */
+	Real_T (&k_0)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
+	Real_T (&k_1)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
+	Real_T (&k_2)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
+	Real_T (&k_3)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
+	Real_T (&x_temp)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
+	Real_T (&accumulator)[X_DIM] = *(Real_T(*)[X_DIM]) new Real_T[X_DIM];
+#endif
+};
 } // namespace rk4_solver
 
 #endif

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.13)`
`2`	`2`
`3`	`3`	`project(`
`4`	`4`	`rk4_solver`
`5`		`- VERSION 1.1.1`
	`5`	`+ VERSION 1.2.0`
`6`	`6`	`LANGUAGES CXX`
`7`	`7`	`)`
`8`	`8`
Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,8 @@ main()`
`28`	`28`	`{`
`29`	`29`	`Real_T x_next[x_dim];`
`30`	`30`	`//* integration step`
`31`		`- rk4_solver::step(dyn, &Dynamics::ode_fun, t, x, h, i, x_next);`
	`31`	`+ rk4_solver::Integrator<x_dim, Dynamics> rk4;`
	`32`	`+ rk4.step(dyn, &Dynamics::ode_fun, t, x, h, i, x_next);`
`32`	`33`
`33`	`34`	`return 0;`
`34`	`35`	`}`