diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..256143394a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,19 @@ +# Repository Guidelines + +## Project Structure & Module Organization +The repository is organized around modular physics kernels in `EOS/`, `networks/`, `integration/`, and companion directories such as `conductivity/`, `neutrinos/`, `opacity/`, and `rates/`. Shared math utilities and build scripts remain in `util/`, while reusable constants sit in `constants/`. Interfaces consumed by AMReX-based application codes are in `interfaces/`. Unit regression assets reside under `unit_test/`, and Sphinx documentation sources are in `Docs/` with release history tracked in `CHANGES.md`. + +## Build, Test, and Development Commands +Export `AMREX_HOME` and `MICROPHYSICS_HOME` before building so the GNU make stubs locate AMReX sources. Build a standalone unit test in place, e.g. `cd unit_test/burn_cell && make -j4`, which yields `main3d.gnu.ex`. Select physics at compile time with flags like `make NETWORK_DIR=aprox19 EOS_DIR=helmholtz`, and reset a directory via `make clean`. + +## Coding Style & Naming Conventions +C++ sources follow `.editorconfig`: four-space indentation, LF line endings, and tabs only inside Makefiles. Headers keep the uppercase `.H` suffix and implementations use `.cpp`. Favor the existing snake_case routine pattern (`nse_check`, `burn_cell`) and guard autogenerated directories such as `util/autodiff`. Run `.clang-tidy` with `make USE_CLANG_TIDY=TRUE` on significant changes. + +## Testing Guidelines +Each directory beneath `unit_test/` ships a `GNUmakefile`, `inputs_*` controls, and README notes; run the built binary with the matching inputs file (for example `./main3d.gnu.ex inputs_aprox13`). For new coverage, clone an existing `test_*` layout, describe the scenario, and rerun the most relevant case before posting. Capture scratch artifacts via `.gitignore` rather than committing them. + +## Commit & Pull Request Guidelines +Develop on topic branches and open pull requests against `development`; merges to `main` occur monthly. Keep commit subjects concise and imperative, mirroring history such as `add NSE network compatibility docs (#1852)`. Update `CHANGES.md` for user-facing fixes, link issues, and note required AMReX revisions. PRs should summarize physics choices, list tests run, and attach plots or logs when behavior shifts. + +## Documentation & Automation +Sphinx sources in `Docs/` back the published guide; update them with code changes and ensure `make -C Docs html` finishes cleanly. GitHub Actions publishes the result and exercises unit tests, so keep workflows green by limiting optional dependencies and recording new Python requirements in `requirements.txt`. diff --git a/integration/BackwardEuler/be_integrator.H b/integration/BackwardEuler/be_integrator.H index a204f404a5..2778ba28e8 100644 --- a/integration/BackwardEuler/be_integrator.H +++ b/integration/BackwardEuler/be_integrator.H @@ -2,6 +2,7 @@ #define BE_INTEGRATOR_H #include +#include #include #include @@ -11,7 +12,6 @@ #endif #include #include -#include #ifdef STRANG #include #endif @@ -31,6 +31,7 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int single_step (BurnT& state, BeT& be, const amrex::Real dt) { constexpr int int_neqs = integrator_neqs(); + constexpr amrex::Real uround = std::numeric_limits::epsilon(); int ierr = IERR_SUCCESS; bool converged = false; @@ -88,10 +89,41 @@ int single_step (BurnT& state, BeT& be, const amrex::Real dt) if (be.jacobian_type == 1) { jac(be.t, state, be, be.jac); } else { - jac_info_t jac_info; - jac_info.h = dt; - numerical_jac(state, jac_info, be.jac); - be.n_rhs += (NumSpec+1); + // Build a numerical Jacobian by differencing the wrapped RHS. + // This keeps the Jacobian consistent with whatever variables + // the integrator is actually evolving (X or number densities). + amrex::Real fac = 0.0_rt; + for (int i = 1; i <= int_neqs; ++i) { + const amrex::Real wt = (i <= NumSpec) ? + (integrator_rp::rtol_spec * std::abs(be.y(i)) + integrator_rp::atol_spec) : + (integrator_rp::rtol_enuc * std::abs(be.y(i)) + integrator_rp::atol_enuc); + fac += (ydot(i) / wt) * (ydot(i) / wt); + } + fac = std::sqrt(fac / int_neqs); + + amrex::Real r0 = 1000.0_rt * std::abs(dt) * uround * int_neqs * fac; + if (r0 == 0.0_rt) { + r0 = 1.0_rt; + } + + constexpr bool in_jacobian = true; + amrex::Array1D ydot_pert; + for (int j = 1; j <= int_neqs; ++j) { + const amrex::Real yj = be.y(j); + const amrex::Real wt = (j <= NumSpec) ? + (integrator_rp::rtol_spec * std::abs(yj) + integrator_rp::atol_spec) : + (integrator_rp::rtol_enuc * std::abs(yj) + integrator_rp::atol_enuc); + const amrex::Real r = amrex::max(std::sqrt(uround) * std::abs(yj), r0 * wt); + + be.y(j) += r; + rhs(be.t, state, be, ydot_pert, in_jacobian); + for (int i = 1; i <= int_neqs; ++i) { + be.jac(i, j) = (ydot_pert(i) - ydot(i)) / r; + } + be.y(j) = yj; + } + + be.n_rhs += int_neqs; } be.n_jac++; diff --git a/integration/VODE/vode_dvnlsd.H b/integration/VODE/vode_dvnlsd.H index f080958a90..a505bece89 100644 --- a/integration/VODE/vode_dvnlsd.H +++ b/integration/VODE/vode_dvnlsd.H @@ -148,7 +148,7 @@ amrex::Real dvnlsd (int& NFLAG, BurnT& state, DvodeT& vstate) // sometime VODE goes way off tangent. If these mass fractions // are really bad, then let's just bail now - if (integrator_rp::do_corrector_validation && !integrator_rp::use_number_densities) { + if (integrator_rp::do_corrector_validation) { #ifdef SDC const amrex::Real rho_current = state.rho_orig + vstate.tn * state.ydot_a[SRHO]; const amrex::Real thresh = species_failure_tolerance * rho_current; diff --git a/integration/VODE/vode_dvode.H b/integration/VODE/vode_dvode.H index bbbf414398..aeb69725b1 100644 --- a/integration/VODE/vode_dvode.H +++ b/integration/VODE/vode_dvode.H @@ -170,14 +170,18 @@ int dvode (BurnT& state, DvodeT& vstate) if (vstate.n_step == 0) { #ifndef AMREX_USE_GPU + if (!state.suppress_failure_output) { std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: too much accuracy requested at start of integration" << amrex::ResetDisplay << std::endl; + } #endif return IERR_TOO_MUCH_ACCURACY_REQUESTED; } // Too much accuracy requested for machine precision. #ifndef AMREX_USE_GPU - std::cout << "DVODE: too much accuracy requested" << std::endl; + if (!state.suppress_failure_output) { + std::cout << "DVODE: too much accuracy requested" << std::endl; + } #endif for (int i = 1; i <= int_neqs; ++i) { vstate.y(i) = vstate.yh(i,1); @@ -197,7 +201,9 @@ int dvode (BurnT& state, DvodeT& vstate) if (kflag == -1) { // Error test failed repeatedly or with ABS(H) = HMIN. #ifndef AMREX_USE_GPU - std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: error test failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + if (!state.suppress_failure_output) { + std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: error test failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + } #endif // Set Y array, T, and optional output. for (int i = 1; i <= int_neqs; ++i) { @@ -211,7 +217,9 @@ int dvode (BurnT& state, DvodeT& vstate) else if (kflag == -2) { // Convergence failed repeatedly or with ABS(H) = HMIN. #ifndef AMREX_USE_GPU - std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: corrector convergence failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + if (!state.suppress_failure_output) { + std::cout << amrex::Font::Bold << amrex::FGColor::Red << "DVODE: corrector convergence failed repeatedly or with abs(H) = HMIN" << amrex::ResetDisplay << std::endl; + } #endif // Set Y array, T, and optional output. for (int i = 1; i <= int_neqs; ++i) { diff --git a/integration/integrator_setup_strang.H b/integration/integrator_setup_strang.H index 39ad238148..a9fe630674 100644 --- a/integration/integrator_setup_strang.H +++ b/integration/integrator_setup_strang.H @@ -181,7 +181,7 @@ void integrator_cleanup (IntegratorT& int_state, BurnT& state, // If we failed, print out the current state of the integration. - if (! state.success) { + if (! state.success && !state.suppress_failure_output) { #ifndef AMREX_USE_GPU std::cout << amrex::Font::Bold << amrex::FGColor::Red << "[ERROR] integration failed in net" << amrex::ResetDisplay << std::endl; std::cout << "istate = " << istate << std::endl; diff --git a/integration/utils/jacobian_utilities.H b/integration/utils/jacobian_utilities.H index 291fc939fd..52cfb299e2 100644 --- a/integration/utils/jacobian_utilities.H +++ b/integration/utils/jacobian_utilities.H @@ -2,7 +2,8 @@ #define JACOBIAN_UTILITIES_H #ifndef AMREX_USE_GPU -#include +#include +#include #endif #include @@ -31,20 +32,20 @@ print_jacobian (const T& jac) { constexpr int int_neqs = integrator_neqs(); - std::cout << std::format("{:9} ", "*"); + std::cout << std::setw(9) << "*" << " "; for (int i = 1; i <= NumSpec; ++i) { - std::cout << std::format("{:9} ", short_spec_names_cxx[i-1]); + std::cout << std::setw(9) << short_spec_names_cxx[i-1] << " "; } - std::cout << std::format("{:9} ", "e") << std::endl; + std::cout << std::setw(9) << "e" << std::endl; for (int j = 1; j <= int_neqs; ++j) { if (j < int_neqs) { - std::cout << std::format("{:5} ", short_spec_names_cxx[j-1]); + std::cout << std::setw(5) << short_spec_names_cxx[j-1] << " "; } else { - std::cout << std::format("{:5} ", "e"); + std::cout << std::setw(5) << "e" << " "; } for (int i = 1; i <= int_neqs; ++i) { - std::cout << std::format("{:9.3g} ", jac.get(i, j)); + std::cout << std::setw(9) << std::setprecision(3) << jac.get(i, j) << " "; } std::cout << std::endl; } diff --git a/integration/utils/numerical_jacobian.H b/integration/utils/numerical_jacobian.H index 5affd37179..c025acb48a 100644 --- a/integration/utils/numerical_jacobian.H +++ b/integration/utils/numerical_jacobian.H @@ -59,12 +59,16 @@ void numerical_jac(const BurnT& state, const jac_info_t& jac_info, JacNetArray2D BurnT state_delp = state; - // default -- plus convert the dY/dt into dX/dt + // default state RHS - actual_rhs(state, ydotm); + actual_rhs(state_delp, ydotm); - for (int q = 1; q <= NumSpec; q++) { - ydotm(q) *= aion[q-1]; + // We integrate X, not Y, except for primordial chemistry where + // we integrate number densities directly. + if (! integrator_rp::use_number_densities) { + for (int q = 1; q <= NumSpec; q++) { + ydotm(q) *= aion[q-1]; + } } // start by computing |f|, which is the norm of the RHS / weight @@ -112,10 +116,12 @@ void numerical_jac(const BurnT& state, const jac_info_t& jac_info, JacNetArray2D actual_rhs(state_delp, ydotp); - // We integrate X, so convert from the Y we got back from the RHS - - for (int q = 1; q <= NumSpec; q++) { - ydotp(q) *= aion[q-1]; + // We integrate X, not Y, except for primordial chemistry where + // we integrate number densities directly. + if (! integrator_rp::use_number_densities) { + for (int q = 1; q <= NumSpec; q++) { + ydotp(q) *= aion[q-1]; + } } // now fill in all of the rows for this column X_n @@ -150,8 +156,10 @@ void numerical_jac(const BurnT& state, const jac_info_t& jac_info, JacNetArray2D actual_rhs(state_delp, ydotp); - for (int q = 1; q <= NumSpec; q++) { - ydotp(q) *= aion[q-1]; + if (! integrator_rp::use_number_densities) { + for (int q = 1; q <= NumSpec; q++) { + ydotp(q) *= aion[q-1]; + } } // first fill just the last column with dy/dT diff --git a/interfaces/burn_type.H b/interfaces/burn_type.H index 64e25d75be..023796ace3 100644 --- a/interfaces/burn_type.H +++ b/interfaces/burn_type.H @@ -162,6 +162,9 @@ struct burn_t // integrator error code short error_code{}; + // suppress verbose failure diagnostics (useful for GPU retries) + bool suppress_failure_output{}; + }; diff --git a/unit_test/burn_cell_primordial_chem/GNUmakefile b/unit_test/burn_cell_primordial_chem/GNUmakefile index 669e9839c7..3bb548690b 100644 --- a/unit_test/burn_cell_primordial_chem/GNUmakefile +++ b/unit_test/burn_cell_primordial_chem/GNUmakefile @@ -2,20 +2,21 @@ PRECISION = DOUBLE PROFILE = FALSE # Set DEBUG to TRUE if debugging -DEBUG = TRUE +DEBUG = FALSE DIM = 1 COMP = gnu +EXTRACXXFLAGS += -Wno-psabi USE_MPI = FALSE -USE_OMP = FALSE +USE_OMP = TRUE #set USE_CUDA to TRUE to compile and run on GPUs USE_CUDA = FALSE USE_REACT = TRUE # Set USE_MICROPHYSICS_DEBUG to TRUE if debugging -USE_MICROPHYSICS_DEBUG = TRUE +USE_MICROPHYSICS_DEBUG = FALSE EBASE = main @@ -38,3 +39,18 @@ Bpack := ./Make.package Blocs := . include $(MICROPHYSICS_HOME)/unit_test/Make.unit_test + +ifeq ($(USE_CUDA),TRUE) +ifeq ($(shell uname -s),Linux) +ifeq ($(shell uname -m),aarch64) +# NVTX3 can trigger GCC outlined atomics on AArch64; make link+codegen robust. +EXTRACXXFLAGS += -mno-outline-atomics +NVCC_FLAGS += -Xcompiler -mno-outline-atomics +override XTRALIBS += -latomic +endif +endif +endif + +ifeq ($(shell uname),Darwin) +AMREX_LINKER := ./post_link_install_name.sh $(AMREX_LINKER) +endif diff --git a/unit_test/burn_cell_primordial_chem/README.md b/unit_test/burn_cell_primordial_chem/README.md index ed6a65afff..467a545109 100644 --- a/unit_test/burn_cell_primordial_chem/README.md +++ b/unit_test/burn_cell_primordial_chem/README.md @@ -21,3 +21,29 @@ # continuous integration The code is built with the `primordial_chem` network and run with `inputs_primordial_chem`. + +# reference and compare workflow + +Use this two-step workflow to split CPU reference generation from later +comparison runs. + +1. Generate/update the saved CPU reference solution: + + ```bash + ./main1d.gnu.ex inputs_primordial_chem + ``` + + This runs the normal Richardson-enabled path and writes + `burn_cell_final_state.txt`. + +2. Compare a future run against the saved reference: + + ```bash + ./main1d.gnu.ex inputs_primordial_chem --compare-final-state burn_cell_final_state.txt + ``` + + In compare mode, the code does not run Richardson. It performs one batch + run on the active backend (GPU when enabled, otherwise CPU), compares each + zone against the saved reference, and reports per-zone `PASS`/`FAIL`. + For `T`, `e`, and each species number density, the comparison threshold is + `2 * |saved truncation error|` from the reference file. diff --git a/unit_test/burn_cell_primordial_chem/_parameters b/unit_test/burn_cell_primordial_chem/_parameters index a81e1ff435..0d0d88c74e 100644 --- a/unit_test/burn_cell_primordial_chem/_parameters +++ b/unit_test/burn_cell_primordial_chem/_parameters @@ -16,6 +16,18 @@ tfirst real 0.0 # number of steps for the single zone burn nsteps int 1000 +# print multi-zone burn progress every N timesteps (<= 0 disables) +burn_progress_interval int 100 + +# estimate truncation error with three tolerance levels and Richardson extrapolation +richardson_enable bool 1 +richardson_tol_factor real 10.0 +richardson_print_per_zone bool 0 +# non-trace threshold as a fraction of the most abundant species in each zone +richardson_trace_species_cutoff real 0.0 + +# compute per-zone Jacobian stiffness diagnostics (expensive on CPU) +compute_stiffness_diagnostics bool 0 # initial temperature temperature real 1e2 diff --git a/unit_test/burn_cell_primordial_chem/burn_cell.H b/unit_test/burn_cell_primordial_chem/burn_cell.H index f5ced4cb5e..bf8c56d65b 100644 --- a/unit_test/burn_cell_primordial_chem/burn_cell.H +++ b/unit_test/burn_cell_primordial_chem/burn_cell.H @@ -1,21 +1,214 @@ #ifndef BURN_CELL_H #define BURN_CELL_H +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include #include #include #include +#include +#include #include #include amrex::Real grav_constant = 6.674e-8; +namespace { + +using Complex = std::complex; +using ComplexMatrix = std::vector>; + +struct BurnZoneKernel { + burn_t *states_ptr; + const amrex::Real *dt_ptr; + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void operator()(int iz) const noexcept { + amrex::Real dt = dt_ptr[iz]; + if (dt <= 0.0_rt) { + return; + } + burn_t &state = states_ptr[iz]; + state.time = 0.0_rt; + state.suppress_failure_output = true; + burner(state, dt); + } +}; + +AMREX_INLINE +auto identity_complex_matrix(int n) -> ComplexMatrix { + ComplexMatrix I(n, std::vector(n, Complex(0.0_rt, 0.0_rt))); + for (int i = 0; i < n; ++i) { + I[i][i] = Complex(1.0_rt, 0.0_rt); + } + return I; +} + +AMREX_INLINE +auto matmul(const ComplexMatrix &A, const ComplexMatrix &B) -> ComplexMatrix { + const int n = static_cast(A.size()); + ComplexMatrix C(n, std::vector(n, Complex(0.0_rt, 0.0_rt))); + for (int i = 0; i < n; ++i) { + for (int k = 0; k < n; ++k) { + const Complex aik = A[i][k]; + if (std::abs(aik) == 0.0_rt) { + continue; + } + for (int j = 0; j < n; ++j) { + C[i][j] += aik * B[k][j]; + } + } + } + return C; +} + AMREX_INLINE -auto burn_cell_c() -> int { +void qr_decompose_mgs(const ComplexMatrix &A, ComplexMatrix &Q, + ComplexMatrix &R) { + const int n = static_cast(A.size()); + Q.assign(n, std::vector(n, Complex(0.0_rt, 0.0_rt))); + R.assign(n, std::vector(n, Complex(0.0_rt, 0.0_rt))); + + std::vector v(n); + + for (int j = 0; j < n; ++j) { + for (int i = 0; i < n; ++i) { + v[i] = A[i][j]; + } + + for (int i = 0; i < j; ++i) { + Complex rij(0.0_rt, 0.0_rt); + for (int k = 0; k < n; ++k) { + rij += std::conj(Q[k][i]) * v[k]; + } + R[i][j] = rij; + for (int k = 0; k < n; ++k) { + v[k] -= rij * Q[k][i]; + } + } + + amrex::Real vnorm2 = 0.0_rt; + for (int k = 0; k < n; ++k) { + vnorm2 += std::norm(v[k]); + } + const amrex::Real vnorm = std::sqrt(vnorm2); + R[j][j] = Complex(vnorm, 0.0_rt); + + if (vnorm > 0.0_rt) { + for (int k = 0; k < n; ++k) { + Q[k][j] = v[k] / vnorm; + } + } else { + Q[j][j] = Complex(1.0_rt, 0.0_rt); + } + } +} + +AMREX_INLINE +auto jacobian_eigenvalues_qr(const RArray2D &jac) -> std::vector { + constexpr int n = INT_NEQS; + ComplexMatrix A(n, std::vector(n, Complex(0.0_rt, 0.0_rt))); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A[i][j] = Complex(static_cast(jac(i + 1, j + 1)), 0.0_rt); + } + } - burn_t state; + ComplexMatrix Q; + ComplexMatrix R; + const auto I = identity_complex_matrix(n); + + constexpr int max_iter = 256; + constexpr amrex::Real offdiag_tol = 1.0e-12_rt; + + for (int iter = 0; iter < max_iter; ++iter) { + amrex::Real offdiag_norm = 0.0_rt; + for (int i = 1; i < n; ++i) { + for (int j = 0; j < i; ++j) { + offdiag_norm += std::norm(A[i][j]); + } + } + offdiag_norm = std::sqrt(offdiag_norm); + if (offdiag_norm < offdiag_tol) { + break; + } + + const Complex mu = A[n - 1][n - 1]; + ComplexMatrix shifted = A; + for (int i = 0; i < n; ++i) { + shifted[i][i] -= mu * I[i][i]; + } + + qr_decompose_mgs(shifted, Q, R); + A = matmul(R, Q); + for (int i = 0; i < n; ++i) { + A[i][i] += mu * I[i][i]; + } + } + + std::vector eigvals(n, Complex(0.0_rt, 0.0_rt)); + for (int i = 0; i < n; ++i) { + eigvals[i] = A[i][i]; + } + return eigvals; +} + +struct JacobianStiffness { + amrex::Real max_abs_real = 0.0_rt; + amrex::Real min_abs_real = std::numeric_limits::infinity(); + amrex::Real ratio = std::numeric_limits::infinity(); + bool valid = false; +}; + +AMREX_INLINE +auto compute_jacobian_stiffness(const burn_t &state_in) -> JacobianStiffness { + burn_t state = state_in; + RArray2D jac; + actual_jac(state, jac); + + const auto eigvals = jacobian_eigenvalues_qr(jac); + constexpr amrex::Real tiny = 1.0e-300_rt; + + JacobianStiffness result; + for (const auto &lam : eigvals) { + const amrex::Real ar = std::abs(lam.real()); + if (!std::isfinite(ar)) { + continue; + } + result.max_abs_real = std::max(result.max_abs_real, ar); + if (ar > tiny) { + result.min_abs_real = std::min(result.min_abs_real, ar); + } + } + + if (result.max_abs_real > tiny && + std::isfinite(result.min_abs_real) && + result.min_abs_real > tiny) { + result.ratio = result.max_abs_real / result.min_abs_real; + result.valid = true; + } + + return result; +} + +AMREX_INLINE +void init_burn_state(burn_t &state, amrex::Real density_scale, + amrex::Real temp_offset, bool print_initial) { amrex::Real numdens[NumSpec] = {-1.0}; @@ -67,23 +260,28 @@ auto burn_cell_c() -> int { } } - // Echo initial conditions at burn and fill burn state input - - std::cout << "Maximum Time (s): " << unit_test_rp::tmax << std::endl; - std::cout << "State Temperature (K): " << unit_test_rp::temperature << std::endl; for (int n = 0; n < NumSpec; ++n) { - std::cout << "Number Density input (" << short_spec_names_cxx[n] - << "): " << numdens[n] << std::endl; + numdens[n] *= density_scale; + } + + if (print_initial) { + std::cout << "Maximum Time (s): " << unit_test_rp::tmax << std::endl; + std::cout << "State Temperature (K): " + << unit_test_rp::temperature + temp_offset << std::endl; + for (int n = 0; n < NumSpec; ++n) { + std::cout << "Number Density input (" << short_spec_names_cxx[n] + << "): " << numdens[n] << std::endl; + } } - state.T = unit_test_rp::temperature; + state = burn_t{}; + state.T = unit_test_rp::temperature + temp_offset; // find the density in g/cm^3 amrex::Real rhotot = 0.0_rt; for (int n = 0; n < NumSpec; ++n) { state.xn[n] = numdens[n]; - rhotot += state.xn[n] * spmasses[n]; // spmasses contains the masses of all - // species, defined in EOS + rhotot += state.xn[n] * spmasses[n]; } state.rho = rhotot; @@ -105,158 +303,1357 @@ auto burn_cell_c() -> int { } #ifdef DEBUG - for (int n = 0; n < NumSpec; ++n) { - std::cout << "Mass fractions (" << short_spec_names_cxx[n] - << "): " << mfracs[n] << std::endl; - std::cout << "Number Density conserved (" << short_spec_names_cxx[n] - << "): " << state.xn[n] << std::endl; + if (print_initial) { + for (int n = 0; n < NumSpec; ++n) { + std::cout << "Mass fractions (" << short_spec_names_cxx[n] + << "): " << mfracs[n] << std::endl; + std::cout << "Number Density conserved (" << short_spec_names_cxx[n] + << "): " << state.xn[n] << std::endl; + } } #endif - // call the EOS to set initial internal energy e eos(eos_input_rt, state); +} - // name of output file - std::ofstream state_over_time("state_over_time.txt"); +AMREX_INLINE +void enforce_post_burn_constraints(burn_t &state) { - // save the initial state -- we'll use this to determine - // how much things changed over the entire burn - burn_t state_in = state; + amrex::Real inmfracs[NumSpec] = {-1.0}; + amrex::Real insum = 0.0_rt; + for (int nn = 0; nn < NumSpec; ++nn) { + state.xn[nn] = amrex::max(state.xn[nn], small_x); + inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; + insum += inmfracs[nn]; + } - // output the data in columns, one line per timestep - state_over_time << std::setw(10) << "# Time"; - state_over_time << std::setw(15) << "Density"; - state_over_time << std::setw(15) << "Temperature"; - for (int x = 0; x < NumSpec; ++x) { - const std::string &element = short_spec_names_cxx[x]; - state_over_time << std::setw(15) << element; + for (int nn = 0; nn < NumSpec; ++nn) { + inmfracs[nn] /= insum; + // update the number densities with conserved mass fractions + state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; } - state_over_time << std::endl; - amrex::Real t = 0.0; + // update the number density of electrons due to charge conservation + balance_charge(state); + + // reconserve mass fractions post charge conservation + insum = 0.0_rt; + for (int nn = 0; nn < NumSpec; ++nn) { + state.xn[nn] = amrex::max(state.xn[nn], small_x); + inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; + insum += inmfracs[nn]; + } - state_over_time << std::setw(10) << t; - state_over_time << std::setw(15) << state.rho; - state_over_time << std::setw(15) << state.T; - for (int x = 0; x < NumSpec; ++x) { - state_over_time << std::setw(15) << state.xn[x]; + for (int nn = 0; nn < NumSpec; ++nn) { + inmfracs[nn] /= insum; + // update the number densities with conserved mass fractions + state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; } - state_over_time << std::endl; - // loop over steps, burn, and output the current state - // the loop below is similar to that used in krome and pynucastro - amrex::Real dd = rhotot; - amrex::Real dd1 = 0.0_rt; + // get the updated T + eos(eos_input_re, state); +} + +} // namespace + + +AMREX_INLINE +auto burn_cell_multi_c(int n_zones, bool enable_gpu, + const std::string &compare_final_state_file = "") + -> int { + + constexpr amrex::Real rel_tol = 1.0e-10_rt; + constexpr amrex::Real abs_tol = 1.0e-12_rt; - for (int n = 0; n < unit_test_rp::nsteps; n++) { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(n_zones > 0, + "n_zones must be positive"); - dd1 = dd; + struct MultiZoneSummary { + int failures = 0; + int retries = 0; + bool stiffness_enabled = false; + amrex::Real t_min = std::numeric_limits::max(); + amrex::Real t_max = -std::numeric_limits::max(); + amrex::Real stiffness_ratio_min = std::numeric_limits::max(); + amrex::Real stiffness_ratio_max = 0.0_rt; + int stiffness_samples = 0; + bool any_success = false; + }; - amrex::Real rhotmp = 0.0_rt; + auto run_multi_zone = [&](bool use_gpu, std::vector &final_states, + std::vector &final_times, + MultiZoneSummary &summary) -> bool { + amrex::Gpu::HostVector states_h(n_zones); + amrex::Gpu::HostVector dd_h(n_zones); + amrex::Gpu::HostVector dt_h(n_zones); + amrex::Gpu::HostVector time_h(n_zones); + std::vector states_before(n_zones); + std::vector active_h(n_zones, 1); - for (int nn = 0; nn < NumSpec; ++nn) { - rhotmp += state.xn[nn] * spmasses[nn]; + summary = MultiZoneSummary{}; + summary.stiffness_enabled = unit_test_rp::compute_stiffness_diagnostics; + int retry_total = 0; + + for (int iz = 0; iz < n_zones; ++iz) { + amrex::Real temp_offset = 0.0_rt; + if (n_zones > 1) { + temp_offset = 0.5_rt * unit_test_rp::temperature * + (static_cast(iz) / + static_cast(n_zones - 1) - + 0.5_rt); + } + init_burn_state(states_h[iz], 1.0_rt, temp_offset, false); + dd_h[iz] = states_h[iz].rho; + dt_h[iz] = 0.0_rt; + time_h[iz] = 0.0_rt; + active_h[iz] = 1; } - // find the freefall time - amrex::Real tff = std::sqrt(M_PI * 3.0 / (32.0 * rhotmp * grav_constant)); - amrex::Real dt = unit_test_rp::tff_reduc * tff; - // scale the density - dd += dt * (dd / tff); + amrex::Gpu::DeviceVector states_d; + amrex::Gpu::DeviceVector dt_d; + if (use_gpu) { + states_d.resize(n_zones); + dt_d.resize(n_zones); + } -#ifdef DEBUG - std::cout << "step params " << dd << ", " << tff << ", " << rhotmp - << std::endl; + const int max_split_depth = 6; + constexpr amrex::Real min_split_dt = 1.0_rt; + + auto burn_backend = [&](burn_t &state, amrex::Real dt) { + state.time = 0.0_rt; + state.suppress_failure_output = true; + + if (use_gpu) { + amrex::AsyncArray state_device(&state, 1); + burn_t *state_ptr = state_device.data(); + + amrex::ParallelFor( + 1, [state_ptr, dt] AMREX_GPU_DEVICE(int idx) noexcept { + burner(state_ptr[idx], dt); + }); + + state_device.copyToHost(&state, 1); + amrex::Gpu::streamSynchronize(); + } else { + burner(state, dt); + } + }; + + std::function integrate_with_retries; + integrate_with_retries = [&](burn_t &state, amrex::Real dt, + int depth) -> bool { + burn_t backup = state; + burn_backend(state, dt); + if (state.success && state.e > 0.0_rt) { + state.suppress_failure_output = false; + return true; + } + if (depth >= max_split_depth || 0.5_rt * dt <= min_split_dt) { + state = backup; + state.suppress_failure_output = false; + return false; + } + amrex::Real half_dt = 0.5_rt * dt; + state = backup; + if (!integrate_with_retries(state, half_dt, depth + 1)) { + state.suppress_failure_output = false; + return false; + } + bool status = integrate_with_retries(state, half_dt, depth + 1); + state.suppress_failure_output = false; + return status; + }; + + auto precompute_total_steps = [&]() -> int { + auto dd_pc = dd_h; + auto active_pc = active_h; + int steps = 0; + + for (int step = 0; step < unit_test_rp::nsteps; ++step) { + int any_active_int = 0; +#ifdef AMREX_USE_OMP +#pragma omp parallel for schedule(dynamic, 64) reduction(max : any_active_int) #endif + for (int iz = 0; iz < n_zones; ++iz) { + if (active_pc[iz] == 0) { + continue; + } - // stop the test if dt is very small - if (dt < 10) { - break; + amrex::Real dd1 = dd_pc[iz]; + amrex::Real tff = + std::sqrt(M_PI * 3.0_rt / (32.0_rt * dd1 * grav_constant)); + amrex::Real dt = unit_test_rp::tff_reduc * tff; + amrex::Real dd_new = dd1 + dt * (dd1 / tff); + + if (dt < 10.0_rt || dd_new > 2.0e-6_rt) { + active_pc[iz] = 0; + continue; + } + + dd_pc[iz] = dd_new; + any_active_int = 1; + } + + const bool any_active = (any_active_int != 0); + if (!any_active) { + break; + } + ++steps; + } + + return steps; + }; + + const int total_steps = precompute_total_steps(); + + for (int step = 0; step < unit_test_rp::nsteps; ++step) { + int any_active_int = 0; +#ifdef AMREX_USE_OMP +#pragma omp parallel for schedule(dynamic, 64) reduction(max : any_active_int) +#endif + for (int iz = 0; iz < n_zones; ++iz) { + if (active_h[iz] == 0) { + dt_h[iz] = 0.0_rt; + continue; + } + + amrex::Real dd1 = dd_h[iz]; + amrex::Real tff = + std::sqrt(M_PI * 3.0_rt / (32.0_rt * dd1 * grav_constant)); + amrex::Real dt = unit_test_rp::tff_reduc * tff; + amrex::Real dd_new = dd1 + dt * (dd1 / tff); + + if (dt < 10.0_rt || dd_new > 2.0e-6_rt) { + active_h[iz] = 0; + dt_h[iz] = 0.0_rt; + continue; + } + + amrex::Real ratio = dd_new / dd1; + for (int nn = 0; nn < NumSpec; ++nn) { + states_h[iz].xn[nn] *= ratio; + } + states_h[iz].rho *= ratio; + + states_h[iz].suppress_failure_output = true; + states_before[iz] = states_h[iz]; + + dd_h[iz] = dd_new; + dt_h[iz] = dt; + any_active_int = 1; + } + + const bool any_active = (any_active_int != 0); + if (!any_active) { + break; + } + + if (unit_test_rp::burn_progress_interval > 0 && + (step + 1) % unit_test_rp::burn_progress_interval == 0) { + std::cout << "Burn progress: timestep " << (step + 1) << " / " + << total_steps << std::endl; + } + + if (use_gpu) { + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, states_h.begin(), + states_h.end(), states_d.begin()); + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, dt_h.begin(), + dt_h.end(), dt_d.begin()); + amrex::Gpu::streamSynchronize(); + + burn_t *states_ptr = states_d.data(); + const amrex::Real *dt_ptr = dt_d.data(); + + amrex::ParallelFor( + n_zones, [states_ptr, dt_ptr] AMREX_GPU_DEVICE(int idx) noexcept { + amrex::Real dt = dt_ptr[idx]; + if (dt <= 0.0_rt) { + return; + } + burner(states_ptr[idx], dt); + }); + amrex::Gpu::streamSynchronize(); + + amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, states_d.begin(), + states_d.end(), states_h.begin()); + amrex::Gpu::streamSynchronize(); + } else { + burn_t *states_ptr = states_h.data(); + const amrex::Real *dt_ptr = dt_h.data(); +#ifdef AMREX_USE_OMP +#pragma omp parallel for schedule(dynamic, 32) +#endif + for (int iz = 0; iz < n_zones; ++iz) { + amrex::Real dt = dt_ptr[iz]; + if (dt <= 0.0_rt) { + continue; + } + burn_t &state = states_ptr[iz]; + state.time = 0.0_rt; + state.suppress_failure_output = true; + burner(state, dt); + } + } + + int retry_total_step = 0; + amrex::Real stiffness_ratio_min_step = + std::numeric_limits::max(); + amrex::Real stiffness_ratio_max_step = 0.0_rt; + int stiffness_samples_step = 0; + +#ifdef AMREX_USE_OMP +#pragma omp parallel for schedule(dynamic, 32) \ + reduction(+ : retry_total_step, stiffness_samples_step) \ + reduction(min : stiffness_ratio_min_step) reduction(max : stiffness_ratio_max_step) +#endif + for (int iz = 0; iz < n_zones; ++iz) { + if (dt_h[iz] <= 0.0_rt) { + continue; + } + + auto &state = states_h[iz]; + + if (!state.success || state.e <= 0.0_rt) { + burn_t retry_state = states_before[iz]; + retry_state.suppress_failure_output = false; + ++retry_total_step; + if (integrate_with_retries(retry_state, dt_h[iz], 0)) { + state = retry_state; + } else { + state = states_before[iz]; + state.success = false; + active_h[iz] = 0; + dt_h[iz] = 0.0_rt; + continue; + } + } + + state.suppress_failure_output = false; + enforce_post_burn_constraints(state); + if (summary.stiffness_enabled) { + const auto stiff = compute_jacobian_stiffness(state); + if (stiff.valid) { + stiffness_ratio_min_step = + std::min(stiffness_ratio_min_step, stiff.ratio); + stiffness_ratio_max_step = + std::max(stiffness_ratio_max_step, stiff.ratio); + ++stiffness_samples_step; + } + } + time_h[iz] += dt_h[iz]; + } + + retry_total += retry_total_step; + if (summary.stiffness_enabled && stiffness_samples_step > 0) { + summary.stiffness_ratio_min = + std::min(summary.stiffness_ratio_min, stiffness_ratio_min_step); + summary.stiffness_ratio_max = + std::max(summary.stiffness_ratio_max, stiffness_ratio_max_step); + summary.stiffness_samples += stiffness_samples_step; + } } - // stop the test if we have reached very high densities - if (dd > 2e-6) { - break; + summary.failures = 0; + summary.t_min = std::numeric_limits::max(); + summary.t_max = -std::numeric_limits::max(); + summary.any_success = false; + summary.retries = retry_total; + + for (int iz = 0; iz < n_zones; ++iz) { + if (!states_h[iz].success || states_h[iz].e <= 0.0_rt) { + ++summary.failures; + continue; + } + summary.any_success = true; + summary.t_min = std::min(summary.t_min, states_h[iz].T); + summary.t_max = std::max(summary.t_max, states_h[iz].T); } - std::cout << "step " << n << " done" << std::endl; + final_states.assign(states_h.begin(), states_h.end()); + final_times.assign(time_h.begin(), time_h.end()); - // scale the number densities - for (int nn = 0; nn < NumSpec; ++nn) { - state.xn[nn] *= dd / dd1; + return (summary.failures == 0); + }; + + auto print_summary = [&](const std::string &label, + const MultiZoneSummary &summary) { + const std::string prefix = label + " multi-zone"; + std::cout << prefix << " summary:" << std::endl; + std::cout << " burn complete : zones failed = " + << summary.failures << std::endl; + if (summary.any_success) { + std::cout << " temperature range : [" + << summary.t_min << ", " << summary.t_max << "]" << std::endl; + std::cout << " retries applied : " + << summary.retries << std::endl; + if (summary.stiffness_enabled) { + if (summary.stiffness_samples > 0) { + std::cout << " stiffness ratio range : [" + << std::scientific << summary.stiffness_ratio_min << ", " + << summary.stiffness_ratio_max << "]" + << std::defaultfloat << " (samples=" + << summary.stiffness_samples << ")" << std::endl; + } else { + std::cout << " stiffness ratio range : unavailable" << std::endl; + } + } else { + std::cout << " stiffness diagnostics : disabled" << std::endl; + } + } else { + std::cout << " status : no successful zones" + << std::endl; } + }; - // input the scaled density in burn state - state.rho *= dd / dd1; + struct BackendRunResult { + std::vector states; + std::vector times; + MultiZoneSummary summary; + bool success = false; + std::chrono::duration wall{0.0}; + }; - // do the actual integration - burner(state, dt); + struct TolLevelResult { + amrex::Real scale = 1.0_rt; + BackendRunResult cpu; + BackendRunResult gpu; + }; + + const amrex::Real base_rtol_spec = integrator_rp::rtol_spec; + const amrex::Real base_atol_spec = integrator_rp::atol_spec; + const amrex::Real base_rtol_enuc = integrator_rp::rtol_enuc; + const amrex::Real base_atol_enuc = integrator_rp::atol_enuc; + const amrex::Real base_retry_rtol_spec = integrator_rp::retry_rtol_spec; + const amrex::Real base_retry_atol_spec = integrator_rp::retry_atol_spec; + const amrex::Real base_retry_rtol_enuc = integrator_rp::retry_rtol_enuc; + const amrex::Real base_retry_atol_enuc = integrator_rp::retry_atol_enuc; + + const bool compare_against_saved = !compare_final_state_file.empty(); + const bool do_richardson = + !compare_against_saved && unit_test_rp::richardson_enable; + const int n_tol_levels = do_richardson ? 3 : 1; + const std::array richardson_scales = { + 0.5_rt, 1.0_rt, 2.0_rt}; + constexpr int richardson_loose_level = 0; + constexpr int richardson_base_level = 1; + constexpr int richardson_tight_level = 2; + const amrex::Real richardson_ratio = + richardson_scales[richardson_tight_level] / + richardson_scales[richardson_base_level]; + const bool compare_backend_is_gpu = compare_against_saved && enable_gpu; + const bool run_gpu_backend = + compare_against_saved ? compare_backend_is_gpu : enable_gpu; + + auto apply_tolerance_scale = [&](amrex::Real scale) { + integrator_rp::rtol_spec = base_rtol_spec / scale; + integrator_rp::atol_spec = base_atol_spec / scale; + integrator_rp::rtol_enuc = base_rtol_enuc / scale; + integrator_rp::atol_enuc = base_atol_enuc / scale; + + if (base_retry_rtol_spec > 0.0_rt) { + integrator_rp::retry_rtol_spec = base_retry_rtol_spec / scale; + } + if (base_retry_atol_spec > 0.0_rt) { + integrator_rp::retry_atol_spec = base_retry_atol_spec / scale; + } + if (base_retry_rtol_enuc > 0.0_rt) { + integrator_rp::retry_rtol_enuc = base_retry_rtol_enuc / scale; + } + if (base_retry_atol_enuc > 0.0_rt) { + integrator_rp::retry_atol_enuc = base_retry_atol_enuc / scale; + } + }; + + auto restore_base_tolerances = [&]() { + integrator_rp::rtol_spec = base_rtol_spec; + integrator_rp::atol_spec = base_atol_spec; + integrator_rp::rtol_enuc = base_rtol_enuc; + integrator_rp::atol_enuc = base_atol_enuc; + integrator_rp::retry_rtol_spec = base_retry_rtol_spec; + integrator_rp::retry_atol_spec = base_retry_atol_spec; + integrator_rp::retry_rtol_enuc = base_retry_rtol_enuc; + integrator_rp::retry_atol_enuc = base_retry_atol_enuc; + }; + + auto format_tol = [](amrex::Real tol) { + std::ostringstream out; + out << std::scientific << std::setprecision(10) << tol; + return out.str(); + }; + + std::vector level_results(n_tol_levels); + bool cpu_success = true; + bool gpu_success = true; + + for (int ilev = 0; ilev < n_tol_levels; ++ilev) { + amrex::Real scale = do_richardson ? richardson_scales[ilev] : 1.0_rt; + level_results[ilev].scale = scale; + apply_tolerance_scale(scale); + + if (do_richardson) { + std::cout << "Richardson tolerances (level " << (ilev + 1) + << ", scale=" << scale << "):" << std::endl; + std::cout << " rtol_spec / atol_spec : " + << format_tol(integrator_rp::rtol_spec) << " / " + << format_tol(integrator_rp::atol_spec) << std::endl; + std::cout << " rtol_enuc / atol_enuc : " + << format_tol(integrator_rp::rtol_enuc) << " / " + << format_tol(integrator_rp::atol_enuc) << std::endl; + if (integrator_rp::retry_rtol_spec > 0.0_rt && + integrator_rp::retry_atol_spec > 0.0_rt) { + std::cout << " retry_rtol_spec / retry_atol_spec : " + << format_tol(integrator_rp::retry_rtol_spec) << " / " + << format_tol(integrator_rp::retry_atol_spec) << std::endl; + } + if (integrator_rp::retry_rtol_enuc > 0.0_rt && + integrator_rp::retry_atol_enuc > 0.0_rt) { + std::cout << " retry_rtol_enuc / retry_atol_enuc : " + << format_tol(integrator_rp::retry_rtol_enuc) << " / " + << format_tol(integrator_rp::retry_atol_enuc) << std::endl; + } + } - // ensure positivity and normalize - amrex::Real inmfracs[NumSpec] = {-1.0}; - amrex::Real insum = 0.0_rt; - for (int nn = 0; nn < NumSpec; ++nn) { - state.xn[nn] = amrex::max(state.xn[nn], small_x); - inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; - insum += inmfracs[nn]; + std::ostringstream level_label; + if (do_richardson) { + level_label << " (tol level " << (ilev + 1) << ", scale=" << scale + << ")"; } + if (!compare_against_saved || !compare_backend_is_gpu) { + auto cpu_start = std::chrono::steady_clock::now(); + level_results[ilev].cpu.success = + run_multi_zone(false, level_results[ilev].cpu.states, + level_results[ilev].cpu.times, + level_results[ilev].cpu.summary); + auto cpu_end = std::chrono::steady_clock::now(); + level_results[ilev].cpu.wall = cpu_end - cpu_start; + cpu_success = cpu_success && level_results[ilev].cpu.success; - for (int nn = 0; nn < NumSpec; ++nn) { - inmfracs[nn] /= insum; - // update the number densities with conserved mass fractions - state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; + const std::string cpu_label = "CPU" + level_label.str(); + print_summary(cpu_label, level_results[ilev].cpu.summary); + std::cout << " burn walltime (s) : " + << level_results[ilev].cpu.wall.count() << std::endl; + } else if (ilev == 0) { + std::cout << "CPU multi-zone burn skipped (saved-state compare mode)" + << std::endl; } - // update the number density of electrons due to charge conservation - balance_charge(state); + if (run_gpu_backend) { + auto gpu_start = std::chrono::steady_clock::now(); + level_results[ilev].gpu.success = + run_multi_zone(true, level_results[ilev].gpu.states, + level_results[ilev].gpu.times, + level_results[ilev].gpu.summary); + auto gpu_end = std::chrono::steady_clock::now(); + level_results[ilev].gpu.wall = gpu_end - gpu_start; + gpu_success = gpu_success && level_results[ilev].gpu.success; - // reconserve mass fractions post charge conservation - insum = 0; - for (int nn = 0; nn < NumSpec; ++nn) { - state.xn[nn] = amrex::max(state.xn[nn], small_x); - inmfracs[nn] = spmasses[nn] * state.xn[nn] / state.rho; - insum += inmfracs[nn]; + const std::string gpu_label = "GPU" + level_label.str(); + print_summary(gpu_label, level_results[ilev].gpu.summary); + std::cout << " burn walltime (s) : " + << level_results[ilev].gpu.wall.count() << std::endl; + } else if (ilev == 0) { + std::cout << "GPU multi-zone burn skipped (no GPU backend)" + << std::endl; } + } + + restore_base_tolerances(); + + const auto &cpu_states = level_results.front().cpu.states; + const auto &cpu_times = level_results.front().cpu.times; + const auto &gpu_states = level_results.front().gpu.states; + const auto &gpu_times = level_results.front().gpu.times; + const auto &compare_states = compare_backend_is_gpu ? gpu_states : cpu_states; + const auto &compare_times = compare_backend_is_gpu ? gpu_times : cpu_times; + const bool compare_backend_success = + compare_backend_is_gpu ? level_results.front().gpu.success + : level_results.front().cpu.success; - for (int nn = 0; nn < NumSpec; ++nn) { - inmfracs[nn] /= insum; - // update the number densities with conserved mass fractions - state.xn[nn] = inmfracs[nn] * state.rho / spmasses[nn]; + struct RichardsonEstimate { + bool valid = false; + amrex::Real p = 0.0_rt; + amrex::Real err = 0.0_rt; + amrex::Real delta01 = 0.0_rt; + }; + + auto estimate_richardson = [&](amrex::Real y_loose, amrex::Real y_base, + amrex::Real y_tight) -> RichardsonEstimate { + RichardsonEstimate est; + const amrex::Real d01 = y_loose - y_base; + const amrex::Real d12 = y_base - y_tight; + est.delta01 = d01; + + constexpr amrex::Real tiny = 1.0e-300_rt; + const amrex::Real abs_d01 = std::abs(d01); + const amrex::Real abs_d12 = std::abs(d12); + + if (abs_d01 <= tiny && abs_d12 <= tiny) { + est.valid = true; + return est; } - // get the updated T - eos(eos_input_re, state); + if (abs_d01 <= tiny || abs_d12 <= tiny) { + est.err = d01; + return est; + } - t += dt; + amrex::Real ratio = abs_d01 / abs_d12; + if (!(ratio > 0.0_rt) || !std::isfinite(ratio)) { + est.err = d01; + return est; + } + + est.p = std::log(ratio) / std::log(richardson_ratio); + if (!std::isfinite(est.p)) { + est.err = d01; + return est; + } + + amrex::Real denom = std::pow(richardson_ratio, est.p) - 1.0_rt; + if (std::abs(denom) <= tiny || !std::isfinite(denom)) { + est.err = d01; + return est; + } + + est.err = d01 / denom; + est.valid = true; + return est; + }; + + auto within_tol = [&](amrex::Real a, amrex::Real b) -> bool { + amrex::Real diff = std::abs(a - b); + amrex::Real scale = std::max(std::abs(a), std::abs(b)); + return diff <= (abs_tol + rel_tol * scale); + }; + + if (compare_against_saved) { + struct SavedZoneData { + bool present = false; + int zone = -1; + amrex::Real time = 0.0_rt; + amrex::Real rho = 0.0_rt; + amrex::Real T = 0.0_rt; + amrex::Real e = 0.0_rt; + int success = 0; + std::array xn{}; + int trunc_valid = 0; + amrex::Real trunc_err_T = 0.0_rt; + amrex::Real trunc_err_e = 0.0_rt; + std::array trunc_err_xn{}; + }; - state_over_time << std::setw(10) << t; - state_over_time << std::setw(15) << state.rho; - state_over_time << std::setw(12) << state.T; - for (int x = 0; x < NumSpec; ++x) { - state_over_time << std::setw(15) << state.xn[x]; + std::ifstream ifs(compare_final_state_file); + if (!ifs) { + std::cout << "ERROR: unable to open saved final-state file " + << compare_final_state_file << std::endl; + return false; } - state_over_time << std::endl; + + std::vector saved(n_zones); + std::string line; + int parsed_rows = 0; + while (std::getline(ifs, line)) { + if (line.empty() || line[0] == '#') { + continue; + } + + std::istringstream iss(line); + SavedZoneData row; + if (!(iss >> row.zone >> row.time >> row.rho >> row.T >> row.e >> + row.success)) { + std::cout << "ERROR: malformed row in saved-state file: " << line + << std::endl; + return false; + } + for (int n = 0; n < NumSpec; ++n) { + if (!(iss >> row.xn[n])) { + std::cout << "ERROR: missing species values in saved-state file row: " + << line << std::endl; + return false; + } + } + if (!(iss >> row.trunc_valid >> row.trunc_err_T >> row.trunc_err_e)) { + std::cout << "ERROR: missing truncation fields in saved-state file row: " + << line << std::endl; + return false; + } + for (int n = 0; n < NumSpec; ++n) { + if (!(iss >> row.trunc_err_xn[n])) { + std::cout << "ERROR: missing species truncation fields in saved-state " + "file row: " + << line << std::endl; + return false; + } + } + + if (row.zone < 0 || row.zone >= n_zones) { + std::cout << "ERROR: zone index " << row.zone + << " is out of bounds for n_zones = " << n_zones + << std::endl; + return false; + } + row.present = true; + saved[row.zone] = row; + ++parsed_rows; + } + + std::cout << "Loaded " << parsed_rows << " zone rows from " + << compare_final_state_file << std::endl; + + int missing_rows = 0; + int zones_failed = 0; + int zones_passed = 0; + amrex::Real max_saved_err_units = 0.0_rt; + int max_saved_err_zone = -1; + std::string max_saved_err_field = "none"; + + for (int iz = 0; iz < n_zones; ++iz) { + if (!saved[iz].present) { + ++missing_rows; + ++zones_failed; + std::cout << "zone " << iz << " : FAIL (missing from saved file)" + << std::endl; + continue; + } + + const auto &ref = saved[iz]; + std::vector mismatches; + + const bool cur_success = + compare_states[iz].success && compare_states[iz].e > 0.0_rt; + const bool ref_success = (ref.success != 0); + if (cur_success != ref_success) { + mismatches.push_back("success"); + } + + if (!within_tol(compare_times[iz], ref.time)) { + mismatches.push_back("time"); + } + if (!within_tol(compare_states[iz].rho, ref.rho)) { + mismatches.push_back("rho"); + } + + auto within_saved_err = [&](amrex::Real cur, amrex::Real reference, + amrex::Real saved_err) { + const amrex::Real threshold = 2.0_rt * std::abs(saved_err); + return std::abs(cur - reference) <= threshold; + }; + auto update_saved_err_units = [&](const std::string &field, + amrex::Real cur, + amrex::Real reference, + amrex::Real saved_err) { + const amrex::Real diff = std::abs(cur - reference); + amrex::Real units = 0.0_rt; + if (std::abs(saved_err) > 0.0_rt) { + units = diff / std::abs(saved_err); + } else if (diff > 0.0_rt) { + units = std::numeric_limits::infinity(); + } + if (units > max_saved_err_units) { + max_saved_err_units = units; + max_saved_err_zone = iz; + max_saved_err_field = field; + } + }; + + if (ref.trunc_valid != 0) { + update_saved_err_units("T", compare_states[iz].T, ref.T, + ref.trunc_err_T); + if (!within_saved_err(compare_states[iz].T, ref.T, ref.trunc_err_T)) { + mismatches.push_back("T"); + } + update_saved_err_units("e", compare_states[iz].e, ref.e, + ref.trunc_err_e); + if (!within_saved_err(compare_states[iz].e, ref.e, ref.trunc_err_e)) { + mismatches.push_back("e"); + } + for (int n = 0; n < NumSpec; ++n) { + const std::string species_field = + "xn(" + std::string(short_spec_names_cxx[n]) + ")"; + update_saved_err_units(species_field, compare_states[iz].xn[n], + ref.xn[n], ref.trunc_err_xn[n]); + if (!within_saved_err(compare_states[iz].xn[n], ref.xn[n], + ref.trunc_err_xn[n])) { + mismatches.push_back(species_field); + } + } + } else { + if (!within_tol(compare_states[iz].T, ref.T)) { + mismatches.push_back("T"); + } + if (!within_tol(compare_states[iz].e, ref.e)) { + mismatches.push_back("e"); + } + for (int n = 0; n < NumSpec; ++n) { + if (!within_tol(compare_states[iz].xn[n], ref.xn[n])) { + mismatches.push_back("xn(" + std::string(short_spec_names_cxx[n]) + + ")"); + } + } + } + + if (mismatches.empty()) { + ++zones_passed; + std::cout << "zone " << iz << " : PASS" << std::endl; + } else { + ++zones_failed; + std::cout << "zone " << iz << " : FAIL ["; + for (int i = 0; i < static_cast(mismatches.size()); ++i) { + if (i > 0) { + std::cout << ", "; + } + std::cout << mismatches[i]; + } + std::cout << "]" << std::endl; + } + } + + std::cout << "Saved-state comparison summary: passed=" << zones_passed + << ", failed=" << zones_failed + << ", missing_rows=" << missing_rows << std::endl; + if (max_saved_err_zone >= 0) { + std::cout << "Largest saved-state difference in truncation-error units: " + << std::fixed << std::setprecision(3) << max_saved_err_units + << std::defaultfloat << " (zone " << max_saved_err_zone + << ", field " << max_saved_err_field << ")" << std::endl; + } + return (compare_backend_success && zones_failed == 0); } - state_over_time.close(); - - // output diagnostics to the terminal - std::cout << "------------------------------------" << std::endl; - std::cout << "successful? " << state.success << std::endl; - - std::cout << "------------------------------------" << std::endl; - std::cout << "T initial = " << state_in.T << std::endl; - std::cout << "T final = " << state.T << std::endl; - std::cout << "Eint initial = " << state_in.e << std::endl; - std::cout << "Eint final = " << state.e << std::endl; - std::cout << "rho initial = " << state_in.rho << std::endl; - std::cout << "rho final = " << state.rho << std::endl; - - std::cout << "------------------------------------" << std::endl; - std::cout << "New number densities: " << std::endl; - for (int n = 0; n < NumSpec; ++n) { - std::cout << state.xn[n] << std::endl; + + auto report_richardson = [&](const std::string &label, + const std::vector &l0, + const std::vector &l1, + const std::vector &l2) { + if (!do_richardson) { + return; + } + if (l0.size() != l1.size() || l0.size() != l2.size()) { + std::cout << label + << " Richardson estimate skipped (inconsistent level sizes)" + << std::endl; + return; + } + + int valid_zones = 0; + int valid_temp_estimates = 0; + amrex::Real max_abs_temp_err = 0.0_rt; + amrex::Real max_abs_energy_err = 0.0_rt; + amrex::Real max_abs_species_err = 0.0_rt; + amrex::Real max_rel_temp_err = 0.0_rt; + amrex::Real max_rel_energy_err = 0.0_rt; + amrex::Real max_rel_species_err = 0.0_rt; + int species_rel_count = 0; + std::vector species_rel_hit_count(NumSpec, 0); + + const amrex::Real trace_rel_cutoff = + amrex::max(unit_test_rp::richardson_trace_species_cutoff, 0.0_rt); + + auto rel_error = [](amrex::Real err, amrex::Real ref) { + constexpr amrex::Real tiny = 1.0e-300_rt; + return std::abs(err) / amrex::max(std::abs(ref), tiny); + }; + + std::cout << label << " Richardson truncation estimate:" << std::endl; + std::cout << " levels with scales : [" + << richardson_scales[richardson_loose_level] << ", " + << richardson_scales[richardson_base_level] << ", " + << richardson_scales[richardson_tight_level] << "]" + << std::endl; + + for (int iz = 0; iz < n_zones; ++iz) { + const bool ok0 = l0[iz].success && l0[iz].e > 0.0_rt; + const bool ok1 = l1[iz].success && l1[iz].e > 0.0_rt; + const bool ok2 = l2[iz].success && l2[iz].e > 0.0_rt; + if (!(ok0 && ok1 && ok2)) { + continue; + } + + ++valid_zones; + + const RichardsonEstimate t_est = + estimate_richardson(l0[iz].T, l1[iz].T, l2[iz].T); + const RichardsonEstimate e_est = + estimate_richardson(l0[iz].e, l1[iz].e, l2[iz].e); + + if (t_est.valid) { + ++valid_temp_estimates; + } + max_abs_temp_err = std::max(max_abs_temp_err, std::abs(t_est.err)); + max_abs_energy_err = std::max(max_abs_energy_err, std::abs(e_est.err)); + max_rel_temp_err = + std::max(max_rel_temp_err, rel_error(t_est.err, l1[iz].T)); + max_rel_energy_err = + std::max(max_rel_energy_err, rel_error(e_est.err, l1[iz].e)); + + amrex::Real zone_species_linf = 0.0_rt; + amrex::Real zone_rel_species_linf = 0.0_rt; + std::ostringstream zone_species_list; + bool zone_first_species = true; + amrex::Real zone_max_xn = 0.0_rt; + for (int n = 0; n < NumSpec; ++n) { + zone_max_xn = std::max(zone_max_xn, std::abs(l1[iz].xn[n])); + } + const amrex::Real zone_trace_cutoff = trace_rel_cutoff * zone_max_xn; + for (int n = 0; n < NumSpec; ++n) { + const RichardsonEstimate xn_est = + estimate_richardson(l0[iz].xn[n], l1[iz].xn[n], l2[iz].xn[n]); + zone_species_linf = std::max(zone_species_linf, std::abs(xn_est.err)); + if (std::abs(l1[iz].xn[n]) >= zone_trace_cutoff) { + zone_rel_species_linf = std::max( + zone_rel_species_linf, rel_error(xn_est.err, l1[iz].xn[n])); + ++species_rel_count; + ++species_rel_hit_count[n]; + if (!zone_first_species) { + zone_species_list << ", "; + } + zone_species_list << short_spec_names_cxx[n]; + zone_first_species = false; + } + } + max_abs_species_err = std::max(max_abs_species_err, zone_species_linf); + max_rel_species_err = + std::max(max_rel_species_err, zone_rel_species_linf); + + if (unit_test_rp::richardson_print_per_zone) { + std::cout << label << " zone " << iz << ": |E_T|=" << std::abs(t_est.err) + << ", rel(E_T)=" << rel_error(t_est.err, l2[iz].T) + << ", p_T=" << t_est.p << ", |E_e|=" << std::abs(e_est.err) + << ", rel(E_e)=" << rel_error(e_est.err, l2[iz].e) + << ", p_e=" << e_est.p << ", |E_xn|_inf=" << zone_species_linf + << ", rel(E_xn)_inf(non-trace)=" << zone_rel_species_linf + << ", non-trace species: [" + << (zone_first_species ? "none" : zone_species_list.str()) + << "]" + << std::endl; + } + } + + std::cout << label << " Richardson summary:" << std::endl; + std::cout << " valid zones : " << valid_zones << "/" + << n_zones << std::endl; + std::cout << " valid temperature estimates: " << valid_temp_estimates + << std::endl; + std::cout << " max |E_T| : " << max_abs_temp_err + << std::endl; + std::cout << " max |E_e| : " << max_abs_energy_err + << std::endl; + std::cout << " max |E_xn|_inf : " << max_abs_species_err + << std::endl; + std::cout << " max rel(E_T) : " << max_rel_temp_err + << std::endl; + std::cout << " max rel(E_e) : " << max_rel_energy_err + << std::endl; + std::cout << " max rel(E_xn)_inf(non-trace): " << max_rel_species_err + << std::endl; + std::cout << " trace rel cutoff : " << trace_rel_cutoff + << std::endl; + std::cout << " species count (relative) : " << species_rel_count + << std::endl; + + std::ostringstream considered_species; + bool first_species = true; + for (int n = 0; n < NumSpec; ++n) { + if (species_rel_hit_count[n] > 0) { + if (!first_species) { + considered_species << ", "; + } + considered_species << short_spec_names_cxx[n]; + first_species = false; + } + } + std::cout << label << " non-trace species used in relative metric:" + << std::endl; + std::cout << " species : [" + << (first_species ? "none" : considered_species.str()) << "]" + << std::endl; + }; + + if (do_richardson && n_tol_levels == 3) { + report_richardson("CPU", level_results[richardson_loose_level].cpu.states, + level_results[richardson_base_level].cpu.states, + level_results[richardson_tight_level].cpu.states); + } + + struct ZoneTruncationEstimate { + bool valid = false; + amrex::Real temp_err = 0.0_rt; + amrex::Real energy_err = 0.0_rt; + std::array species_err{}; + }; + + std::vector cpu_zone_truncation; + if (do_richardson && n_tol_levels == 3) { + cpu_zone_truncation.resize(n_zones); + const auto &cpu_l0 = level_results[richardson_loose_level].cpu.states; + const auto &cpu_l1 = level_results[richardson_base_level].cpu.states; + const auto &cpu_l2 = level_results[richardson_tight_level].cpu.states; + + for (int iz = 0; iz < n_zones; ++iz) { + auto &est = cpu_zone_truncation[iz]; + const bool ok0 = cpu_l0[iz].success && cpu_l0[iz].e > 0.0_rt; + const bool ok1 = cpu_l1[iz].success && cpu_l1[iz].e > 0.0_rt; + const bool ok2 = cpu_l2[iz].success && cpu_l2[iz].e > 0.0_rt; + if (!(ok0 && ok1 && ok2)) { + continue; + } + + est.valid = true; + est.temp_err = + estimate_richardson(cpu_l0[iz].T, cpu_l1[iz].T, cpu_l2[iz].T).err; + est.energy_err = + estimate_richardson(cpu_l0[iz].e, cpu_l1[iz].e, cpu_l2[iz].e).err; + for (int n = 0; n < NumSpec; ++n) { + est.species_err[n] = + estimate_richardson(cpu_l0[iz].xn[n], cpu_l1[iz].xn[n], + cpu_l2[iz].xn[n]) + .err; + } + } } - return state.success; + { + const std::string output_file = "burn_cell_final_state.txt"; + std::ofstream ofs(output_file); + if (!ofs) { + std::cout << "WARNING: unable to open " << output_file + << " for writing final state output" << std::endl; + } else { + ofs << std::scientific << std::setprecision(16); + ofs << "# burn_cell_primordial_chem final state (CPU, tolerance level 1)\n"; + ofs << "# columns: zone time rho T e success"; + for (int n = 0; n < NumSpec; ++n) { + ofs << " xn_" << short_spec_names_cxx[n]; + } + ofs << " truncation_valid truncation_err_T truncation_err_e"; + for (int n = 0; n < NumSpec; ++n) { + ofs << " truncation_err_xn_" << short_spec_names_cxx[n]; + } + ofs << "\n"; + + for (int iz = 0; iz < n_zones; ++iz) { + ofs << iz << " " << cpu_times[iz] << " " << cpu_states[iz].rho << " " + << cpu_states[iz].T << " " << cpu_states[iz].e << " " + << (cpu_states[iz].success ? 1 : 0); + for (int n = 0; n < NumSpec; ++n) { + ofs << " " << cpu_states[iz].xn[n]; + } + + if (!cpu_zone_truncation.empty() && cpu_zone_truncation[iz].valid) { + ofs << " 1 " << cpu_zone_truncation[iz].temp_err << " " + << cpu_zone_truncation[iz].energy_err; + for (int n = 0; n < NumSpec; ++n) { + ofs << " " << cpu_zone_truncation[iz].species_err[n]; + } + } else { + ofs << " 0 0.0 0.0"; + for (int n = 0; n < NumSpec; ++n) { + ofs << " 0.0"; + } + } + ofs << "\n"; + } + std::cout << "Wrote final state and truncation estimates to " + << output_file << std::endl; + } + } + + const bool use_richardson_gpu_failure = + enable_gpu && do_richardson && (n_tol_levels == 3); + + struct ZoneRichardsonThresholds { + bool valid = false; + amrex::Real temp_err = 0.0_rt; + amrex::Real energy_err = 0.0_rt; + std::array species_err{}; + std::array species_non_trace{}; + }; + + std::vector cpu_zone_thresholds; + if (use_richardson_gpu_failure) { + cpu_zone_thresholds.resize(n_zones); + const auto &cpu_l0 = level_results[richardson_loose_level].cpu.states; + const auto &cpu_l1 = level_results[richardson_base_level].cpu.states; + const auto &cpu_l2 = level_results[richardson_tight_level].cpu.states; + const amrex::Real trace_rel_cutoff = + amrex::max(unit_test_rp::richardson_trace_species_cutoff, 0.0_rt); + + for (int iz = 0; iz < n_zones; ++iz) { + auto &thr = cpu_zone_thresholds[iz]; + const bool ok0 = cpu_l0[iz].success && cpu_l0[iz].e > 0.0_rt; + const bool ok1 = cpu_l1[iz].success && cpu_l1[iz].e > 0.0_rt; + const bool ok2 = cpu_l2[iz].success && cpu_l2[iz].e > 0.0_rt; + if (!(ok0 && ok1 && ok2)) { + continue; + } + + thr.valid = true; + thr.temp_err = + std::abs(estimate_richardson(cpu_l0[iz].T, cpu_l1[iz].T, cpu_l2[iz].T) + .err); + thr.energy_err = + std::abs(estimate_richardson(cpu_l0[iz].e, cpu_l1[iz].e, cpu_l2[iz].e) + .err); + + amrex::Real zone_max_xn = 0.0_rt; + for (int n = 0; n < NumSpec; ++n) { + zone_max_xn = std::max(zone_max_xn, std::abs(cpu_l1[iz].xn[n])); + } + const amrex::Real zone_trace_cutoff = trace_rel_cutoff * zone_max_xn; + + for (int n = 0; n < NumSpec; ++n) { + thr.species_err[n] = std::abs(estimate_richardson( + cpu_l0[iz].xn[n], cpu_l1[iz].xn[n], + cpu_l2[iz].xn[n]) + .err); + thr.species_non_trace[n] = + (std::abs(cpu_l1[iz].xn[n]) >= zone_trace_cutoff) ? 1 : 0; + } + } + } + + struct MismatchStats { + int count = 0; + amrex::Real max_abs_diff = 0.0_rt; + int max_zone = -1; + amrex::Real cpu_value_at_max = 0.0_rt; + amrex::Real gpu_value_at_max = 0.0_rt; + }; + + struct GpuFailureSummary { + int zones_failed = 0; + int temp_failures = 0; + int energy_failures = 0; + int species_failures = 0; + int missing_richardson = 0; + }; + + std::map mismatch_summary; + bool gpu_failure_detected = false; + GpuFailureSummary gpu_failure_summary; + + struct NormAccumulator { + amrex::Real diff_l1 = 0.0_rt; + amrex::Real diff_l2_sq = 0.0_rt; + amrex::Real diff_linf = 0.0_rt; + amrex::Real ref_l1 = 0.0_rt; + amrex::Real ref_l2_sq = 0.0_rt; + amrex::Real ref_linf = 0.0_rt; + void record(amrex::Real diff, amrex::Real ref) { + amrex::Real abs_diff = std::abs(diff); + amrex::Real abs_ref = std::abs(ref); + diff_l1 += abs_diff; + diff_l2_sq += diff * diff; + diff_linf = std::max(diff_linf, abs_diff); + ref_l1 += abs_ref; + ref_l2_sq += ref * ref; + ref_linf = std::max(ref_linf, abs_ref); + } + }; + + std::map field_norms; + + auto record_mismatch = [&](int zone, const std::string &field, + amrex::Real cpu_value, amrex::Real gpu_value) { + auto &stats = mismatch_summary[field]; + stats.count++; + amrex::Real diff = std::abs(cpu_value - gpu_value); + if (stats.max_zone < 0 || diff > stats.max_abs_diff) { + stats.max_abs_diff = diff; + stats.max_zone = zone; + stats.cpu_value_at_max = cpu_value; + stats.gpu_value_at_max = gpu_value; + } + }; + + auto record_difference = [&](const std::string &field, amrex::Real cpu_value, + amrex::Real gpu_value, bool track_norm) { + if (!track_norm) { + return; + } + field_norms[field].record(cpu_value - gpu_value, cpu_value); + }; + + if (enable_gpu) { + for (int iz = 0; iz < n_zones; ++iz) { + bool cpu_zone_success = + cpu_states[iz].success && cpu_states[iz].e > 0.0_rt; + bool gpu_zone_success = + gpu_states[iz].success && gpu_states[iz].e > 0.0_rt; + + if (cpu_zone_success != gpu_zone_success) { + record_mismatch(iz, "success_flag", + cpu_zone_success ? 1.0_rt : 0.0_rt, + gpu_zone_success ? 1.0_rt : 0.0_rt); + record_difference("success_flag", + cpu_zone_success ? 1.0_rt : 0.0_rt, + gpu_zone_success ? 1.0_rt : 0.0_rt, false); + if (use_richardson_gpu_failure) { + gpu_failure_detected = true; + ++gpu_failure_summary.zones_failed; + } + continue; + } + if (!cpu_zone_success) { + continue; + } + + if (!within_tol(cpu_times[iz], gpu_times[iz])) { + record_mismatch(iz, "time", cpu_times[iz], gpu_times[iz]); + } + record_difference("time", cpu_times[iz], gpu_times[iz], false); + + if (!within_tol(cpu_states[iz].rho, gpu_states[iz].rho)) { + record_mismatch(iz, "rho", cpu_states[iz].rho, gpu_states[iz].rho); + } + record_difference("rho", cpu_states[iz].rho, gpu_states[iz].rho, false); + + bool zone_failed_by_richardson = false; + if (use_richardson_gpu_failure) { + const auto &thr = cpu_zone_thresholds[iz]; + if (!thr.valid) { + record_mismatch(iz, "richardson_unavailable", 1.0_rt, 0.0_rt); + zone_failed_by_richardson = true; + ++gpu_failure_summary.missing_richardson; + } else { + if (std::abs(cpu_states[iz].T - gpu_states[iz].T) > + 2.0_rt * thr.temp_err) { + record_mismatch(iz, "T_richardson_fail", cpu_states[iz].T, + gpu_states[iz].T); + zone_failed_by_richardson = true; + ++gpu_failure_summary.temp_failures; + } + if (std::abs(cpu_states[iz].e - gpu_states[iz].e) > + 2.0_rt * thr.energy_err) { + record_mismatch(iz, "e_richardson_fail", cpu_states[iz].e, + gpu_states[iz].e); + zone_failed_by_richardson = true; + ++gpu_failure_summary.energy_failures; + } + for (int n = 0; n < NumSpec; ++n) { + if (thr.species_non_trace[n] == 0) { + continue; + } + if (std::abs(cpu_states[iz].xn[n] - gpu_states[iz].xn[n]) > + 2.0_rt * thr.species_err[n]) { + std::string label = "xn_richardson_fail(" + + std::string(short_spec_names_cxx[n]) + ")"; + record_mismatch(iz, label, cpu_states[iz].xn[n], + gpu_states[iz].xn[n]); + zone_failed_by_richardson = true; + ++gpu_failure_summary.species_failures; + } + } + } + } else { + if (!within_tol(cpu_states[iz].T, gpu_states[iz].T)) { + record_mismatch(iz, "T", cpu_states[iz].T, gpu_states[iz].T); + } + if (!within_tol(cpu_states[iz].e, gpu_states[iz].e)) { + record_mismatch(iz, "e", cpu_states[iz].e, gpu_states[iz].e); + } + for (int n = 0; n < NumSpec; ++n) { + if (!within_tol(cpu_states[iz].xn[n], gpu_states[iz].xn[n])) { + std::string label = + "xn(" + std::string(short_spec_names_cxx[n]) + ")"; + record_mismatch(iz, label, cpu_states[iz].xn[n], + gpu_states[iz].xn[n]); + } + } + } + + record_difference("temperature", cpu_states[iz].T, gpu_states[iz].T, + true); + record_difference("e", cpu_states[iz].e, gpu_states[iz].e, false); + for (int n = 0; n < NumSpec; ++n) { + std::string norm_label = + "number_density(" + std::string(short_spec_names_cxx[n]) + ")"; + record_difference(norm_label, cpu_states[iz].xn[n], + gpu_states[iz].xn[n], true); + } + + if (use_richardson_gpu_failure && zone_failed_by_richardson) { + gpu_failure_detected = true; + ++gpu_failure_summary.zones_failed; + } + } + } + + if (!field_norms.empty()) { + std::cout << "CPU/GPU multi-zone difference norms:" << std::endl; + auto safe_norm_ratio = [](amrex::Real num, amrex::Real denom) { + if (denom > 0.0_rt) { + return num / denom; + } + return (num == 0.0_rt) ? 0.0_rt + : std::numeric_limits::infinity(); + }; + auto print_norm = [&](const std::string &label, + const NormAccumulator &acc) { + amrex::Real diff_l2 = std::sqrt(acc.diff_l2_sq); + amrex::Real ref_l2 = std::sqrt(acc.ref_l2_sq); + std::cout << " " << label + << ": L1 = " << safe_norm_ratio(acc.diff_l1, acc.ref_l1) + << ", L2 = " << safe_norm_ratio(diff_l2, ref_l2) + << ", L_inf = " + << safe_norm_ratio(acc.diff_linf, acc.ref_linf) << std::endl; + }; + auto temp_it = field_norms.find("temperature"); + if (temp_it != field_norms.end()) { + print_norm("temperature", temp_it->second); + } + for (int n = 0; n < NumSpec; ++n) { + std::string label = + "number_density(" + std::string(short_spec_names_cxx[n]) + ")"; + auto it = field_norms.find(label); + if (it != field_norms.end()) { + print_norm(label, it->second); + } + } + } else if (enable_gpu) { + std::cout << "No comparable CPU/GPU multi-zone states to compute norms" + << std::endl; + } + + if (mismatch_summary.empty() && enable_gpu) { + if (use_richardson_gpu_failure) { + std::cout << "CPU/GPU multi-zone states pass Richardson-based GPU " + "failure criterion" + << std::endl; + } else { + std::cout << "CPU/GPU multi-zone states agree within tolerances" + << std::endl; + } + } else if (enable_gpu) { + std::cout << "CPU/GPU mismatches detected:" << std::endl; + for (const auto &entry : mismatch_summary) { + const auto &field = entry.first; + const auto &stats = entry.second; + std::cout << " field " << field << " mismatched in " << stats.count + << " zone(s)"; + if (stats.max_zone >= 0) { + std::cout << "; largest difference at zone " << stats.max_zone + << " (CPU = " << stats.cpu_value_at_max + << ", GPU = " << stats.gpu_value_at_max << ")"; + } + std::cout << std::endl; + } + if (use_richardson_gpu_failure) { + std::cout << "GPU failure summary (|CPU-GPU| > 2*|Richardson error|): " + << "zones_failed=" << gpu_failure_summary.zones_failed + << ", temp_failures=" << gpu_failure_summary.temp_failures + << ", energy_failures=" << gpu_failure_summary.energy_failures + << ", species_failures=" << gpu_failure_summary.species_failures + << ", missing_richardson=" + << gpu_failure_summary.missing_richardson << std::endl; + } + } + + if (!enable_gpu) { + return cpu_success; + } + if (use_richardson_gpu_failure) { + return (cpu_success && gpu_success && !gpu_failure_detected); + } + return (cpu_success && gpu_success && mismatch_summary.empty()); } #endif diff --git a/unit_test/burn_cell_primordial_chem/dtai-1gpu.submit b/unit_test/burn_cell_primordial_chem/dtai-1gpu.submit new file mode 100644 index 0000000000..24719650b0 --- /dev/null +++ b/unit_test/burn_cell_primordial_chem/dtai-1gpu.submit @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH --mem=64G +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-socket=1 +#SBATCH --cpus-per-task=64 +#SBATCH --partition=ghx4 +#SBATCH --time=00:10:00 +#SBATCH --job-name=quokka-1gpu +#SBATCH --account=cvz-dtai-gh +#SBATCH --gpus-per-node=1 +#SBATCH --gpu-bind=verbose,closest + +srun ./main1d.gnu.CUDA.ex inputs_primordial_chem --compare-final-state burn_cell_final_state.txt diff --git a/unit_test/burn_cell_primordial_chem/dtai-cpu.submit b/unit_test/burn_cell_primordial_chem/dtai-cpu.submit new file mode 100644 index 0000000000..7707eb4ca6 --- /dev/null +++ b/unit_test/burn_cell_primordial_chem/dtai-cpu.submit @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH --mem=64G +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-socket=1 +#SBATCH --cpus-per-task=64 +#SBATCH --partition=ghx4 +#SBATCH --time=01:00:00 +#SBATCH --job-name=quokka-1gpu +#SBATCH --account=cvz-dtai-gh +#SBATCH --gpus-per-node=1 +#SBATCH --gpu-bind=verbose,closest + +srun ./main1d.gnu.arm-grace.OMP.ex inputs_primordial_chem diff --git a/unit_test/burn_cell_primordial_chem/inputs_primordial_chem b/unit_test/burn_cell_primordial_chem/inputs_primordial_chem index 27305ba634..0e693eed17 100644 --- a/unit_test/burn_cell_primordial_chem/inputs_primordial_chem +++ b/unit_test/burn_cell_primordial_chem/inputs_primordial_chem @@ -1,15 +1,32 @@ +unit_test.n_zones = 16**3 +integrator.burner_verbose = 0 + +## unit_test runtime parameters unit_test.run_prefix = "burn_cell_primordial_chem_" -# unit_test runtime parameters -unit_test.small_temp = 1.e1 -unit_test.small_dens = 1.e-60 -unit_test.tff_reduc = 1.e-1 +unit_test.small_temp = 10. +unit_test.small_dens = 1.0e-60 +unit_test.tff_reduc = 0.03 + # number of integration steps -unit_test.nsteps = 1000 +unit_test.nsteps = 1e4 +unit_test.burn_progress_interval = 10 + +unit_test.richardson_enable = 1 +unit_test.richardson_tol_factor = 10.0 +# set to 1 to print per-zone Richardson estimates +unit_test.richardson_print_per_zone = 0 +# non-trace threshold as fraction of most abundant species per zone +unit_test.richardson_trace_species_cutoff = 1e-15 +# set to 1 to compute per-zone Jacobian stiffness diagnostics (expensive) +unit_test.compute_stiffness_diagnostics = 0 + # max total time unit_test.tmax = 7.e20 + # initial temperature unit_test.temperature = 1e2 + # initial number densities unit_test.primary_species_1 = 1e-4 unit_test.primary_species_2 = 1e-4 @@ -26,30 +43,45 @@ unit_test.primary_species_12 = 1e-40 unit_test.primary_species_13 = 1e-40 unit_test.primary_species_14 = 0.0775e0 -# integrator runtime parameters +## integrator runtime parameters + # are we using primordial chemistry? then we use number densities integrator.use_number_densities = 1 + # we do not want to subtract the internal energy integrator.subtract_internal_energy = 0 + # we do not want to clip species between 0 and 1 integrator.do_species_clip = 0 + +# in the corrector loop, do we check if the predicted state is +# valid (X > 0) before calling the RHS? +integrator.do_corrector_validation = 1 + # minimum positive value of number densities integrator.SMALL_X_SAFE = 1e-100 -integrator.burner_verbose = 0 + # do you want to use the jacobian calculated in a previous step? integrator.use_jacobian_caching = 0 + # integration will fail if the number density > X_reject_buffer*atol integrator.X_reject_buffer = 1e100 + # Set which jacobian to use # 1 = analytic jacobian # 2 = numerical jacobian -integrator.jacobian = 1 +integrator.jacobian = 2 + # do you want to normalize abundances within VODE? (you don't!) integrator.renormalize_abundances = 0 + # tolerances integrator.rtol_spec = 1.0e-4 integrator.atol_spec = 1.0e-4 +# max substeps +integrator.ode_max_steps = 3000000 + #assumed redshift for Pop III star formation network.redshift = 30.0 @@ -57,4 +89,3 @@ network.redshift = 30.0 # these params help debug the code #amrex.throw_exception = 1 #amrex.signal_handling = 0 - diff --git a/unit_test/burn_cell_primordial_chem/main.cpp b/unit_test/burn_cell_primordial_chem/main.cpp index 5e458f8936..b86109f450 100644 --- a/unit_test/burn_cell_primordial_chem/main.cpp +++ b/unit_test/burn_cell_primordial_chem/main.cpp @@ -1,7 +1,9 @@ #include #include +#include #include +#include #include #include using namespace amrex; @@ -14,7 +16,40 @@ using namespace amrex; int main(int argc, char *argv[]) { - amrex::Initialize(argc, argv); + std::string compare_final_state_file; + std::vector amrex_argv; + amrex_argv.reserve(static_cast(argc)); + amrex_argv.push_back(argv[0]); + + for (int i = 1; i < argc; ++i) { + std::string arg(argv[i]); + if (arg == "--compare-final-state") { + if (i + 1 >= argc) { + std::cerr << "ERROR: --compare-final-state requires a file path" + << std::endl; + return 1; + } + compare_final_state_file = argv[++i]; + continue; + } + + constexpr const char *prefix = "--compare-final-state="; + if (arg.rfind(prefix, 0) == 0) { + compare_final_state_file = arg.substr(std::strlen(prefix)); + if (compare_final_state_file.empty()) { + std::cerr << "ERROR: --compare-final-state requires a non-empty file path" + << std::endl; + return 1; + } + continue; + } + + amrex_argv.push_back(argv[i]); + } + + int amrex_argc = static_cast(amrex_argv.size()); + char **amrex_argv_ptr = amrex_argv.data(); + amrex::Initialize(amrex_argc, amrex_argv_ptr); int success = 0; { @@ -23,11 +58,12 @@ int main(int argc, char *argv[]) { std::string const run_prefix = "burn_cell_primordial_chem_"; std::string input_run_prefix; pp.query("run_prefix", input_run_prefix); + int n_zones = 128; + pp.query("n_zones", n_zones); + amrex::Print() << "n_zones = " << n_zones << "\n"; AMREX_ALWAYS_ASSERT_WITH_MESSAGE(run_prefix == input_run_prefix, "input file is missing or incorrect!"); - std::cout << "starting the single zone burn..." << std::endl; - ParmParse ppa("amr"); init_unit_test(); @@ -39,7 +75,20 @@ int main(int argc, char *argv[]) { // C++ Network, RHS, screening, rates initialization network_init(); - success = burn_cell_c(); + bool enable_gpu = false; +#ifdef AMREX_USE_GPU + enable_gpu = true; +#endif + + std::cout << "\nstarting the multi-zone burn..." << std::endl; + if (!compare_final_state_file.empty()) { + std::cout << "saved-state comparison file: " << compare_final_state_file + << std::endl; + } + int multi_success = + burn_cell_multi_c(n_zones, enable_gpu, compare_final_state_file); + + success = multi_success; } amrex::Finalize(); diff --git a/unit_test/burn_cell_primordial_chem/plot_state_over_time.py b/unit_test/burn_cell_primordial_chem/plot_state_over_time.py new file mode 100644 index 0000000000..48fa7ee710 --- /dev/null +++ b/unit_test/burn_cell_primordial_chem/plot_state_over_time.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +""" +Plot temperature and species fractions versus density from state_over_time.txt using Plotly. +""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Sequence + +import pandas as pd +import plotly.graph_objects as go +import plotly.io as pio +from plotly.subplots import make_subplots + + +def parse_header(path: Path) -> Sequence[str]: + """Extract column names from the leading comment line.""" + with path.open("r", encoding="ascii") as handle: + for line in handle: + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("#"): + return stripped.lstrip("#").split() + # If no comment header is present, fall back to whitespace split. + return stripped.split() + raise ValueError(f"Could not find a header row in {path}") + + +def load_state_data(path: Path) -> pd.DataFrame: + columns = parse_header(path) + # Skip the header we just parsed and load the remainder. + return pd.read_csv( + path, + delim_whitespace=True, + skiprows=1, + names=columns, + comment="#", + ) + + +def plot_state(path: Path, output: Path | None) -> None: + data = load_state_data(path) + + required_columns = {"Density", "Temperature"} + missing = required_columns - set(data.columns) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise KeyError(f"Missing required column(s): {missing_cols}") + + density = data["Density"] + temperature = data["Temperature"] + species_cols = [ + col for col in data.columns if col not in {"Time", "Density", "Temperature"} + ] + + if not species_cols: + raise ValueError("No species fraction columns found in input data.") + + if (density <= 0).any(): + raise ValueError("Density contains non-positive values; cannot use log scale.") + if (temperature <= 0).any(): + raise ValueError("Temperature contains non-positive values; cannot use log scale.") + + fig = make_subplots( + rows=2, + cols=1, + shared_xaxes=True, + vertical_spacing=0.08, + subplot_titles=( + "Temperature vs. Density", + "Species Fractions vs. Density", + ), + ) + + fig.add_trace( + go.Scatter( + x=density, + y=temperature, + mode="lines", + name="Temperature", + hovertemplate=( + "Density: %{x:.3e}
Temperature: %{y:.3e}Temperature" + ), + ), + row=1, + col=1, + ) + + for column in species_cols: + series = data[column] + positive_mask = series > 0 + if positive_mask.any(): + fig.add_trace( + go.Scatter( + x=density[positive_mask], + y=series[positive_mask], + mode="lines", + name=column, + hovertemplate=( + "Species: " + + column + + "
Density: %{x:.3e}" + + "
Fraction: %{y:.3e}" + ), + ), + row=2, + col=1, + ) + + fig.update_xaxes(type="log", row=1, col=1) + fig.update_xaxes(type="log", title_text="Density", row=2, col=1) + fig.update_yaxes(type="log", title_text="Temperature", row=1, col=1) + fig.update_yaxes(type="log", title_text="Species Fraction", row=2, col=1) + + fig.update_layout( + height=800, + legend_title_text="Species", + hovermode="x unified", + margin=dict(l=70, r=200, t=80, b=60), + ) + + if output: + output.parent.mkdir(parents=True, exist_ok=True) + pio.write_html(fig, file=str(output), include_plotlyjs="cdn", full_html=True) + else: + fig.show() + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Plot temperature and species fractions versus density." + ) + parser.add_argument( + "--input", + type=Path, + default=Path("state_over_time.txt"), + help="Path to the state_over_time.txt file (default: state_over_time.txt).", + ) + parser.add_argument( + "--output", + type=Path, + default=None, + help="Optional output path for saving the Plotly figure as HTML.", + ) + args = parser.parse_args() + + plot_state(args.input, args.output) + + +if __name__ == "__main__": + main() diff --git a/unit_test/burn_cell_primordial_chem/post_link_install_name.sh b/unit_test/burn_cell_primordial_chem/post_link_install_name.sh new file mode 100755 index 0000000000..982ebe4978 --- /dev/null +++ b/unit_test/burn_cell_primordial_chem/post_link_install_name.sh @@ -0,0 +1,51 @@ +#!/bin/sh +# +# Wrapper around the AMReX linker command that patches stale Zerobrew GCC +# install_names after linking on macOS. + +set -eu + +real_linker="$1" +shift + +output="" +prev="" +for arg in "$@"; do + if [ "$prev" = "-o" ]; then + output="$arg" + break + fi + prev="$arg" +done + +"$real_linker" "$@" + +if [ "$(uname)" != "Darwin" ]; then + exit 0 +fi + +if [ -z "$output" ] || [ ! -f "$output" ]; then + exit 0 +fi + +if ! command -v otool >/dev/null 2>&1; then + exit 0 +fi + +if ! command -v install_name_tool >/dev/null 2>&1; then + exit 0 +fi + +target_root="/opt/zerobrew/prefix/lib/gcc/current" + +for lib in libgfortran.5.dylib libquadmath.0.dylib libstdc++.6.dylib libgomp.1.dylib; do + new_path="${target_root}/${lib}" + if [ ! -e "$new_path" ]; then + continue + fi + + old_path=$(otool -L "$output" | awk -v lib="$lib" 'index($1, lib) {print $1; exit}') + if [ -n "${old_path}" ] && [ "${old_path}" != "${new_path}" ]; then + install_name_tool -change "$old_path" "$new_path" "$output" + fi +done