diff --git a/CHANGELOG.md b/CHANGELOG.md index ae553aadb26..a40dafc2ce1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,8 @@ section for the next release. - maespa did not compile with new fortran compiler - updated github action to build docker images - PEcAn.SIPNET now accepts relative paths in its input XML (#3418). Previously all files referenced in the autogenerated `job.sh` needed to be specified as absolute paths. +- Refactored and created new version of `SDA_downscale` named `ensemble_downscale`. +- Added helper function `.convert_coords_to_sf()` for consistent conversion of data with lat/lon columns to `sf` points. - R version 4.4 installs Python 3.12 which wants to leverage os managed packages instead, install python3-pika using apt. ### Changed diff --git a/modules/assim.sequential/DESCRIPTION b/modules/assim.sequential/DESCRIPTION index 11b10e2fee2..5066a54b143 100644 --- a/modules/assim.sequential/DESCRIPTION +++ b/modules/assim.sequential/DESCRIPTION @@ -41,7 +41,8 @@ Suggests: ggrepel, glue, ggpubr, gridExtra, + keras3 (>= 1.0.0), itertools, magic (>= 1.5.0), methods, @@ -53,18 +54,18 @@ Suggests: plotrix, plyr (>= 1.8.4), randomForest, - keras3 (>= 1.0.0), + ranger, raster, readr, reshape2 (>= 1.4.2), rlist, sf, + sp, stats, terra, testthat, tictoc, tidyr, - sp, utils, xgboost, XML diff --git a/modules/assim.sequential/NAMESPACE b/modules/assim.sequential/NAMESPACE index f33ff7cf8c2..35bd797d879 100644 --- a/modules/assim.sequential/NAMESPACE +++ b/modules/assim.sequential/NAMESPACE @@ -29,8 +29,10 @@ export(assessParams) export(block_matrix) export(conj_wt_wishart_sampler) export(construct_nimble_H) +export(downscale_metrics) export(downscale_qsub_main) export(dwtmnorm) +export(ensemble_downscale) export(get_ensemble_weights) export(hop_test) export(interactive.plotting.sda) @@ -60,6 +62,7 @@ export(sda.enkf_local) export(sda_assemble) export(sda_weights_site) export(simple.local) +export(subset_ensemble) 
export(stack_covariates_2_geotiff) export(tobit.model) export(tobit2space.model) diff --git a/modules/assim.sequential/R/Helper.functions.R b/modules/assim.sequential/R/Helper.functions.R index 9c3ccfcc31f..fb64b52e5a7 100644 --- a/modules/assim.sequential/R/Helper.functions.R +++ b/modules/assim.sequential/R/Helper.functions.R @@ -6,10 +6,10 @@ #' @description This function performs a simple outlier replacement on all the columns of dataframes inside a list #' @return A list the same dimension as X, with each column of each dataframe #' modified by replacing outlier points with the column median -#' @export +#' @export outlier.detector.boxplot #' @importFrom magrittr %>% #' -outlier.detector.boxplot<-function(X) { +outlier.detector.boxplot <- function(X) { X <- X %>% purrr::map(function(X.tmp){ #X.tmp is all the state variables for each element of the list (site) diff --git a/modules/assim.sequential/R/downscale_function.R b/modules/assim.sequential/R/downscale_function.R index 317926eba1e..8f1420dd037 100644 --- a/modules/assim.sequential/R/downscale_function.R +++ b/modules/assim.sequential/R/downscale_function.R @@ -1,67 +1,113 @@ ##' @title Preprocess Data for Downscaling ##' @name SDA_downscale_preprocess ##' @author Sambhav Dixit +##' @author David LeBauer ##' -##' @param data_path Character. File path for .rds containing ensemble data. -##' @param coords_path Character. File path for .csv file containing the site coordinates, with columns named "lon" and "lat". -##' @param date Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within the file supplied to 'data_path'. -##' @param carbon_pool Character. Carbon pool of interest. Name must match the carbon pool name found within the file supplied to 'data_path'. +##' @param ensemble_data Character or list. Either a file path to an `.rds` file containing ensemble data, or an in-memory list where each element is a data.frame indexed by date. 
+##' @param site_coordinates Character, data.frame, or sf object. Either: +##' - A file path to a `.csv` containing site coordinates with columns `"id"`, `"lat"`, and `"lon"`. +##' - A `data.frame` with the same structure. +##' - An `sf` object with point geometries. +##' @param date Date. The date for the run, must be a year within `ensemble_data`. +##' @param carbon_pool Character. Carbon pool of interest. Name must match the carbon pool name found within the file or object supplied to 'ensemble_data'. ##' @details This function ensures that the specified date and carbon pool are present in the input data. It also checks the validity of the site coordinates and aligns the number of rows between site coordinates and carbon data. ##' ##' @description This function reads and checks the input data, ensuring that the required date and carbon pool exist, and that the site coordinates are valid. ##' ##' @return A list containing The read .rds data , The cleaned site coordinates, and the preprocessed carbon data. 
-SDA_downscale_preprocess <- function(data_path, coords_path, date, carbon_pool) { - # Read the input data and site coordinates - input_data <- readRDS(data_path) - site_coordinates <- readr::read_csv(coords_path) - - # Convert input_data names to Date objects +SDA_downscale_preprocess <- function(ensemble_data, site_coords, date, carbon_pool) { + # Read the input data and site coordinates (if inputs are file paths, load them; otherwise, use as provided) + if (is.character(ensemble_data)) { + input_data <- readRDS(ensemble_data) + } else { + input_data <- ensemble_data + } + + if (is.character(site_coords)) { + site_coordinates <- readr::read_csv(site_coords, show_col_types = FALSE) + } else { + site_coordinates <- site_coords + } + + # If sf object, convert to data.frame with 'lon' and 'lat' + if (inherits(site_coordinates, "sf")) { + site_coordinates <- as.data.frame(sf::st_coordinates(site_coordinates)) |> + dplyr::rename(lon = X, lat = Y) |> + dplyr::bind_cols( + site_coordinates + ) + } + + # Ensure site coordinates have 'lon' and 'lat' columns + if (!all(c("lon", "lat") %in% names(site_coordinates))) { + PEcAn.logger::logger.error("Site coordinates must contain 'lon' and 'lat' columns.") + } + + # Standardize input_data date names YYYY-MM-DD input_date_names <- lubridate::ymd(names(input_data)) names(input_data) <- input_date_names - - # Convert the input date to a Date object - standard_date <- lubridate::ymd(date) - + + # Ensure 'date' is a Date object, if not, convert + if (!inherits(date, "Date")) { + standard_date <- lubridate::ymd(date) + } else { standard_date <- date } + # Ensure the date exists in the input data if (!standard_date %in% input_date_names) { - stop(paste("Date", date, "not found in the input data.")) + PEcAn.logger::logger.error(paste("Date", standard_date, "not found in the input data.")) } - + # Extract the carbon data for the specified focus year index <- which(input_date_names == standard_date) data <- input_data[[index]] - + # Ensure the carbon pool exists in the input data if 
(!carbon_pool %in% names(data)) { - stop(paste("Carbon pool", carbon_pool, "not found in the input data.")) + PEcAn.logger::logger.error("Carbon pool", carbon_pool, "not found in the input data.") } - + carbon_data <- as.data.frame(t(data[which(names(data) == carbon_pool)])) names(carbon_data) <- paste0("ensemble", seq(ncol(carbon_data))) - - # Ensure site coordinates have 'lon' and 'lat' columns - if (!all(c("lon", "lat") %in% names(site_coordinates))) { - stop("Site coordinates must contain 'lon' and 'lat' columns.") - } - + # Ensure the number of rows in site coordinates matches the number of rows in carbon data if (nrow(site_coordinates) != nrow(carbon_data)) { - message("Number of rows in site coordinates does not match the number of rows in carbon data.") - if (nrow(site_coordinates) > nrow(carbon_data)) { - message("Truncating site coordinates to match carbon data rows.") - site_coordinates <- site_coordinates[1:nrow(carbon_data), ] - } else { - message("Truncating carbon data to match site coordinates rows.") - carbon_data <- carbon_data[1:nrow(site_coordinates), ] - } + PEcAn.logger::logger.info( + "Number of rows in site coordinates does not match the number of rows in carbon data." + ) + PEcAn.logger::logger.info( + "There are", nrow(site_coordinates), "row(s) in site coordinates and", + nrow(carbon_data), "row(s) in carbon data." 
+ ) + PEcAn.logger::logger.error("I am not sure how to reconcile these differences.") + # see https://github.com/PecanProject/pecan/pull/3431/files#r1953601359 + # if (nrow(site_coordinates) > nrow(carbon_data)) { + # PEcAn.logger::logger.info("Truncating site coordinates to match carbon data rows.") + # site_coordinates <- site_coordinates[seq_len(nrow(carbon_data)), ] + # } else { + # PEcAn.logger::logger.info("Truncating carbon data to match site coordinates rows.") + # carbon_data <- carbon_data[seq_len(nrow(site_coordinates)), ] + # } } - - message("Preprocessing completed successfully.") + + PEcAn.logger::logger.info("Preprocessing completed successfully.") return(list(input_data = input_data, site_coordinates = site_coordinates, carbon_data = carbon_data)) } +## Helper function to convert coordinates into an sf object +.convert_coords_to_sf <- function(coords) { + if (inherits(coords, "sf")) { + return(coords) + } else if (is.data.frame(coords)) { + if (!all(c("lon", "lat") %in% names(coords))) { + PEcAn.logger::logger.error("Coordinates data frame must contain 'lon' and 'lat'.") + } + return(sf::st_as_sf(coords, coords = c("lon", "lat"), crs = 4326)) + } else { + PEcAn.logger::logger.error("Unsupported coordinates format. 
Must be an sf object or a data.frame.") + } +} + ##' @noRd ##' ##' @title Create folds function @@ -82,28 +128,30 @@ SDA_downscale_preprocess <- function(data_path, coords_path, date, carbon_pool) n <- length(y) indices <- seq_len(n) folds <- split(indices, cut(seq_len(n), breaks = k, labels = FALSE)) - + if (!returnTrain) { - folds <- folds # Test indices are already what we want + folds <- folds # Test indices are already what we want } else { - folds <- lapply(folds, function(x) indices[-x]) # Return training indices + folds <- lapply(folds, function(x) indices[-x]) # Return training indices } - + if (!list) { folds <- unlist(folds) } - + return(folds) } + + ##' @title SDA Downscale Function ##' @name SDA_downscale ##' @author Joshua Ploshay, Sambhav Dixit ##' ##' @param preprocessed List. Preprocessed data returned as an output from the SDA_downscale_preprocess function. -##' @param date Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'preprocessed' from the 'data_path'. -##' @param carbon_pool Character. Carbon pool of interest. Name must match carbon pool name found within file supplied to 'preprocessed' from the 'data_path'. -##' @param covariates SpatRaster stack. Used as predictors in downscaling. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder +##' @param date *Deprecated*. This argument has never been used and will be removed after 2026-04-01 +##' @param carbon_pool Character. Carbon pool of interest. Name must match carbon pool name found within file supplied to 'preprocessed' from the 'ensemble_data'. +##' @param covariates SpatRaster stack or sf object. Used as predictors in downscaling. If providing a raster stack, layers should be named. If providing an sf object, predictor attributes should be present. ##' @param model_type Character. 
Either "rf" for Random Forest or "cnn" for Convolutional Neural Network. Default is Random Forest. ##' @param seed Numeric or NULL. Optional seed for random number generation. Default is NULL. ##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations @@ -112,64 +160,82 @@ SDA_downscale_preprocess <- function(data_path, coords_path, date, carbon_pool) ##' ##' @return A list containing the training and testing data sets, models, predicted maps for each ensemble member, and predictions for testing data. -SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_type = "rf", seed = NULL) { +SDA_downscale <- function(preprocessed, date = NULL, carbon_pool, covariates, model_type = "rf", seed = NULL) { + if (!missing(date)) { + ## If you see this and it is after 2026-04-01, please remove + # 1. line starting with ##' @param date + # 2. date = NULL from function call + # 3. this conditional starting with if (!missing(date)) and ending after the warning below + PEcAn.logger::logger.warn("'date' argument is not used and will be removed on or after 2026-04-01. It is currently ignored.", call. 
= FALSE) + } carbon_data <- preprocessed$carbon_data - - # Convert site coordinates to SpatVector - site_coordinates <- terra::vect(preprocessed$site_coordinates, geom = c("lon", "lat"), crs = "EPSG:4326") - - # Extract predictors from covariates raster using site coordinates - predictors <- as.data.frame(terra::extract(covariates, site_coordinates, ID = FALSE)) - + + # Convert site coordinates to an sf object using the helper function + site_coordinates_sf <- .convert_coords_to_sf(preprocessed$site_coordinates) + + # Extract predictors from covariates support both SpatRaster and sf objects: + if (inherits(covariates, "SpatRaster")) { + # For raster objects, convert the sf coordinates to SpatVector before extraction + predictors <- as.data.frame(terra::extract(covariates, terra::vect(site_coordinates_sf), ID = FALSE)) + } else if (inherits(covariates, "sf")) { + # For sf objects, spatial join + predictors <- sf::st_drop_geometry(sf::st_join(site_coordinates_sf, covariates, join = sf::st_intersects)) + } else { + PEcAn.logger::logger.error("Unsupported covariates object. 
Must be a SpatRaster or an sf object.") + } + # Dynamically get covariate names covariate_names <- names(predictors) - + # Create a single data frame with all predictors and ensemble data full_data <- cbind(carbon_data, predictors) - + # Split the observations into training and testing sets if (!is.null(seed)) { - set.seed(seed) # Only set seed if provided + set.seed(seed) # Only set seed if provided } - sample <- sample(1:nrow(full_data), size = round(0.75 * nrow(full_data))) + sample <- sample(seq_len(nrow(full_data)), + size = round(0.75 * nrow(full_data)) + ) train_data <- full_data[sample, ] test_data <- full_data[-sample, ] - + # Prepare data for both RF and CNN x_data <- as.matrix(full_data[, covariate_names]) y_data <- as.matrix(carbon_data) - + # Calculate scaling parameters from all data scaling_params <- list( mean = colMeans(x_data), sd = apply(x_data, 2, stats::sd) ) - + # Normalize the data x_data_scaled <- scale(x_data, center = scaling_params$mean, scale = scaling_params$sd) - + # Split into training and testing sets x_train <- x_data_scaled[sample, ] x_test <- x_data_scaled[-sample, ] y_train <- y_data[sample, ] y_test <- y_data[-sample, ] - + # Initialize lists for outputs models <- list() maps <- list() predictions <- list() - + if (model_type == "rf") { for (i in seq_along(carbon_data)) { ensemble_col <- paste0("ensemble", i) formula <- stats::as.formula(paste(ensemble_col, "~", paste(covariate_names, collapse = " + "))) models[[i]] <- randomForest::randomForest(formula, - data = train_data, - ntree = 1000, - na.action = stats::na.omit, - keep.forest = TRUE, - importance = TRUE) - + data = train_data, + ntree = 1000, + na.action = stats::na.omit, + keep.forest = TRUE, + importance = TRUE + ) + maps[[i]] <- terra::predict(covariates, model = models[[i]], na.rm = TRUE) predictions[[i]] <- stats::predict(models[[i]], test_data) } @@ -184,33 +250,33 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_typ for (i in 
seq_along(carbon_data)) { all_models <- list() - + # Create k-fold indices fold_indices <- .create_folds(y = seq_len(nrow(x_train)), k = k_folds, list = TRUE, returnTrain = FALSE) - #initialise operations for each fold + # initialise operations for each fold for (fold in 1:k_folds) { cat(sprintf("Processing ensemble %d, fold %d of %d\n", i, fold, k_folds)) - + # Split data into training and validation sets for this fold train_indices <- setdiff(seq_len(nrow(x_train)), fold_indices[[fold]]) val_indices <- fold_indices[[fold]] - + x_train_fold <- x_train[train_indices, , drop = FALSE] y_train_fold <- y_train[train_indices, i] x_val_fold <- x_train[val_indices, , drop = FALSE] y_val_fold <- y_train[val_indices, i] - + # Create bagged models for this fold fold_models <- list() for (bag in 1:num_bags) { # Create bootstrap sample - bootstrap_indices <- sample(1:nrow(x_train_fold), size = nrow(x_train_fold), replace = TRUE) + bootstrap_indices <- sample(seq_len(nrow(x_train_fold)), size = nrow(x_train_fold), replace = TRUE) x_train_bag <- x_train_fold[bootstrap_indices, ] y_train_bag <- y_train_fold[bootstrap_indices] - + # Define the CNN model architecture - # Used dual batch normalization and dropout as the first set of batch normalization and + # Used dual batch normalization and dropout as the first set of batch normalization and model <- keras3::keras_model_sequential() |> # Layer Reshape : Reshape to fit target shape for the convolutional layer keras3::layer_reshape(target_shape = c(ncol(x_train), 1, 1), input_shape = ncol(x_train)) |> @@ -218,15 +284,15 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_typ keras3::layer_conv_2d( filters = 32, kernel_size = c(3, 1), - activation = 'relu', - padding = 'same' + activation = "relu", + padding = "same" ) |> # Flatten: Converts 3D output to 1D for dense layer input keras3::layer_flatten() |> # Dense layer: Learns complex combinations of features keras3::layer_dense( - units = 64, - activation = 
'relu', + units = 64, + activation = "relu", kernel_regularizer = keras3::regularizer_l2(0.01) ) |> # Batch normalization: Normalizes layer inputs, stabilizes learning, reduces internal covariate shift @@ -235,8 +301,8 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_typ keras3::layer_dropout(rate = 0.3) |> # Dense layer: Learns complex combinations of features keras3::layer_dense( - units = 32, - activation = 'relu', + units = 32, + activation = "relu", kernel_regularizer = keras3::regularizer_l2(0.01) ) |> # Batch normalization: Further stabilizes learning in deeper layers @@ -248,26 +314,26 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_typ units = 1, kernel_regularizer = keras3::regularizer_l2(0.01) ) - + # Learning rate scheduler lr_schedule <- keras3::learning_rate_schedule_exponential_decay( initial_learning_rate = 0.001, decay_steps = 1000, decay_rate = 0.9 ) - + # Early stopping callback early_stopping <- keras3::callback_early_stopping( - monitor = 'loss', + monitor = "loss", patience = 10, restore_best_weights = TRUE ) # Compile the model model |> keras3::compile( - loss = 'mean_squared_error', + loss = "mean_squared_error", optimizer = keras3::optimizer_adam(learning_rate = lr_schedule), - metrics = c('mean_absolute_error') + metrics = c("mean_absolute_error") ) # Train the model @@ -283,14 +349,14 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_typ # Store the trained model for this bag in the fold_models list fold_models[[bag]] <- model } - + # Add fold models to all_models list all_models <- c(all_models, fold_models) } - + # Store all models for this ensemble models[[i]] <- all_models - + # Use all models for predictions cnn_ensemble_predict <- function(models, newdata, scaling_params) { newdata <- scale(newdata, center = scaling_params$mean, scale = scaling_params$sd) @@ -302,18 +368,19 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, 
covariates, model_typ prediction_rast <- terra::rast(covariates) # Generate spatial predictions using the trained model - maps[[i]] <- terra::predict(prediction_rast, model = models[[i]], - fun = cnn_ensemble_predict, - scaling_params = scaling_params) + maps[[i]] <- terra::predict(prediction_rast, + model = models[[i]], + fun = cnn_ensemble_predict, + scaling_params = scaling_params + ) # Make predictions on held-out test data predictions[[i]] <- cnn_ensemble_predict(models[[i]], x_data[-sample, ], scaling_params) - } } else { - stop("Invalid model_type. Please choose either 'rf' for Random Forest or 'cnn' for Convolutional Neural Network.") + PEcAn.logger::logger.error("Invalid model_type. Please choose either 'rf' for Random Forest or 'cnn' for Convolutional Neural Network.") } - + # Organize the results into a single output list downscale_output <- list( data = list(training = train_data, testing = test_data), @@ -322,14 +389,14 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_typ predictions = predictions, scaling_params = scaling_params ) - + # Rename each element of the output list with appropriate ensemble numbers for (i in seq_along(carbon_data)) { names(downscale_output$models)[i] <- paste0("ensemble", i) names(downscale_output$maps)[i] <- paste0("ensemble", i) names(downscale_output$predictions)[i] <- paste0("ensemble", i) } - + return(downscale_output) } @@ -344,23 +411,23 @@ SDA_downscale <- function(preprocessed, date, carbon_pool, covariates, model_typ ##' ##' @description This function takes the output from the SDA_downscale function and computes various performance metrics for each ensemble. It provides a way to evaluate the accuracy of the downscaling results without modifying the main downscaling function. 
##' -##' @return A list of metrics for each ensemble, where each element contains MAE , MSE ,R_squared ,actual values from testing data and predicted values for the testing data +##' @return A list of metrics for each ensemble, where each element contains MAE , MSE ,R_squared ,actual values from testing data and predicted values for the testing data SDA_downscale_metrics <- function(downscale_output, carbon_pool) { metrics <- list() - - for (i in 1:length(downscale_output$data)) { + + for (i in seq_along(downscale_output$data)) { actual <- downscale_output$data[[i]]$testing[[paste0(carbon_pool, "_ens", i)]] predicted <- downscale_output$predictions[[i]] - + mse <- mean((actual - predicted)^2) mae <- mean(abs(actual - predicted)) r_squared <- 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2) - + metrics[[i]] <- list(MSE = mse, MAE = mae, R_squared = r_squared, actual = actual, predicted = predicted) } - + names(metrics) <- paste0("ensemble", seq_along(metrics)) - + return(metrics) } diff --git a/modules/assim.sequential/R/ensemble_downscale.R b/modules/assim.sequential/R/ensemble_downscale.R new file mode 100644 index 00000000000..28131c7a45c --- /dev/null +++ b/modules/assim.sequential/R/ensemble_downscale.R @@ -0,0 +1,407 @@ +##' @title Subset ensemble data for downscaling +##' @name SDA_downscale_preprocess +##' @author Sambhav Dixit, David LeBauer +##' +##' @param ensemble_data EFI standard tibble or data.frame +##' @param site_coords data.frame with unique site id +##' @param date Date. The date for the run, must be a date within `ensemble_data`. +##' @param carbon_pool Character. Carbon pool of interest. Name must match the carbon pool name in ensemble_data. +##' found within the file or object supplied to 'ensemble_data'. +##' @details This function subsets ensemble data and ensures that the specified date and +##' carbon pool are present in the ensemble data. 
+##' +##' @return A list containing the cleaned site coordinates and the ensemble carbon output for the +##' specified date and carbon pool. +##' +##' @export + +subset_ensemble <- function(ensemble_data, site_coords, date, carbon_pool) { + # Confirm date is in ensemble data + if (!any(lubridate::date(unique(ensemble_data$datetime)) == lubridate::date(date))) { + PEcAn.logger::logger.error(paste( + "Provided date", date, + "is not found in the ensemble_data input." + )) + } + + # Ensure the carbon pool exists in the input data + if (!carbon_pool %in% unique(ensemble_data$variable)) { + PEcAn.logger::logger.error("Carbon pool", carbon_pool, "not found in the input data.") + } + + # Ensure the sites are in the ensemble data + if (!all(unique(site_coords$site_id) %in% unique(ensemble_data$site_id))) { + PEcAn.logger::logger.error("Some sites in site_coords are not present in the ensemble_data.") + # identify which sites are missing + missing <- setdiff(unique(site_coords$site_id), unique(ensemble_data$site_id)) + setdiff(unique(ensemble_data$site_id), unique(site_coords$site_id)) + length(unique(site_coords$site_id)) # number of sites in site_coords + length(unique(ensemble_data$site_id)) # number of sites in ensemble_data + } + + # Filter the ensemble data to the specified date and carbon pool + ensemble_data <- ensemble_data |> + dplyr::filter( + lubridate::date(datetime) == lubridate::date(date), + site_id %in% unique(site_coords$site_id), + variable == carbon_pool + ) |> + dplyr::select(site_id, ensemble, prediction) # use site_id instead of site + + if (nrow(ensemble_data) == 0) { + PEcAn.logger::logger.error("No carbon data found for the specified carbon pool.") + } + + PEcAn.logger::logger.info("Ensemble data subset completed successfully.") + return(ensemble_data) +} + +## Helper function to convert table with lat, lon into an sf object +.convert_coords_to_sf <- function(coords) { + if (inherits(coords, "sf")) { + return(coords) + } else if 
(is.data.frame(coords)) { + if (!all(c("lon", "lat") %in% names(coords))) { + PEcAn.logger::logger.error("Coordinates data frame must contain 'lon' and 'lat'.") + } + return(sf::st_as_sf(coords, coords = c("lon", "lat"), crs = 4326)) + } else { + PEcAn.logger::logger.error("Unsupported coordinates format. Must be an sf object or a data.frame.") + } +} + +## Helper function to convert sf object into table with lat, lon +.convert_sf_to_coords <- function(sf_obj) { + # Check if it's an sf object + if (!inherits(sf_obj, "sf")) { + PEcAn.logger::logger.error("Input must be an 'sf' object.") + } + + # Extract the geometry into columns named lon/lat + coord_mat <- sf::st_coordinates(sf_obj) + colnames(coord_mat) <- c("lon", "lat") + + # Drop the geometry column from the sf, then bind coordinate columns + out <- sf_obj %>% + sf::st_drop_geometry() %>% + tibble::as_tibble() %>% + dplyr::bind_cols(as.data.frame(coord_mat)) + return(out) +} + +##' @noRd +##' +##' @title Create folds function +##' @name .create_folds +##' @author Sambhav Dixit +##' +##' @param y Vector. A vector of outcome data or indices. +##' @param k Numeric. The number of folds to create. +##' @param list Logical. If TRUE, returns a list of fold indices. If FALSE, returns a vector. +##' @param returnTrain Logical. If TRUE, returns indices for training sets. If FALSE, returns indices for test sets. +##' @details This function creates k-fold indices for cross-validation. It can return either training or test set indices, and the output can be in list or vector format. +##' +##' @description This function generates k-fold indices for cross-validation, allowing for flexible output formats. +##' +##' @return A list of k elements (if list = TRUE), each containing indices for a fold, or a vector of indices (if list = FALSE). 
+ +.create_folds <- function(y, k, list = TRUE, returnTrain = FALSE) { + n <- length(y) + indices <- seq_len(n) + folds <- split(indices, cut(seq_len(n), breaks = k, labels = FALSE)) + + if (!returnTrain) { + folds <- folds # Test indices are already what we want + } else { + folds <- lapply(folds, function(x) indices[-x]) # Return training indices + } + + if (!list) { + folds <- unlist(folds) + } + + return(folds) +} + + + +##' @title Ensemble Downscale +##' @name ensemble_downscale +##' @author Joshua Ploshay, Sambhav Dixit, David LeBauer +##' +##' @param ensemble_data EFI standard tibble or data.frame. Contains carbon data for downscaling. +##' @param site_coords data.frame, tibble, or sf object. Design points. If not sf object, must have +##' 'lon' and 'lat' columns. Must have unique identifier 'site' field. +##' @param covariates table containing numeric predictors to be used in downscaling. +##' Must have unique identifier 'site_id' field and predictor attributes +##' @details This function will downscale forecast data to unmodeled locations using covariates and site locations +##' +##' @return A list containing the model, predictions for all values of covariates as well as test data and test predictions for downstream +##' statistics. +##' +##' @export + +ensemble_downscale <- function(ensemble_data, site_coords, covariates) { + ## TODO + ## - Accept raster stack as covariates + ## - Split into separate train and predict functions + ## - Add CNN functionality, use tidymodels? 
+ + # Dynamically get covariate names + covariate_names <- colnames(covariates |> dplyr::select(-site_id)) + # Drop zero-variance predictors (lead to NaN on scale()) + if (length(covariate_names) > 0) { + cov_num <- as.data.frame(covariates[, covariate_names, drop = FALSE]) + var0 <- vapply(cov_num, function(x) stats::var(x, na.rm = TRUE), numeric(1)) + drop_cols <- names(var0)[is.finite(var0) & var0 == 0] + if (length(drop_cols) > 0) { + PEcAn.logger::logger.warn( + "Dropping zero-variance predictors:", paste(drop_cols, collapse = ", ") + ) + covariate_names <- setdiff(covariate_names, drop_cols) + } + } + + # scale to N(0,1) (defaults of scale function) + scaled_covariates <- covariates |> + dplyr::mutate(dplyr::across(dplyr::all_of(covariate_names), scale)) + # Assert no NA/NaN/Inf after scaling (data should be cleaned earlier) + if (length(covariate_names) > 0) { + preds_df <- as.data.frame(scaled_covariates[, covariate_names, drop = FALSE]) + bad_cols <- names(which(colSums(!is.finite(as.matrix(preds_df))) > 0 | colSums(is.na(preds_df)) > 0)) + if (length(bad_cols) > 0) { + PEcAn.logger::logger.severe( + "Non-finite values present after scaling in predictors:", + paste(bad_cols, collapse = ", "), + " please fix upstream covariates." 
+ ) + } + } + + # Create a single data frame with all predictors and ensemble data + design_pt_data <- ensemble_data |> # from SIPNET ensemble runs + dplyr::left_join(scaled_covariates, by = "site_id") # n = nrow(site_coords) * ensemble_size + + # Split the observations into training and testing sets + n_sites <- nrow(design_pt_data) + + ## TODO: Use groups from the 01_cluster_and_select_design_points.R + global_idx <- if (n_sites > 0) sample(seq_len(n_sites), size = max(1, round(0.8 * n_sites))) else integer(0) + train_data <- if (length(global_idx) > 0) design_pt_data[global_idx, , drop = FALSE] else design_pt_data[0, , drop = FALSE] + test_data <- if (length(global_idx) > 0) design_pt_data[-global_idx, , drop = FALSE] else design_pt_data[0, , drop = FALSE] + + ensembles <- unique(ensemble_data$ensemble) + n_ensembles <- length(ensembles) + + PEcAn.logger::logger.info( + paste("Start downscaling with", n_ensembles, "ensembles.") + ) + + results <- furrr::future_map(seq_along(ensembles), function(i) { + ens_label <- ensembles[i] + formula <- as.formula( + paste("prediction ~", paste(covariate_names, collapse = " + ")) + ) + + # Build a per-ensemble stratified split (80/20), guaranteeing at least 1 train row + ens_data <- design_pt_data |> + dplyr::filter(ensemble == ens_label) + # Assert no NA in per-ensemble predictors (from join) before splitting + if (length(covariate_names) > 0) { + ens_preds <- as.data.frame(ens_data[, covariate_names, drop = FALSE]) + if (nrow(ens_preds) > 0) { + na_cols_ens <- names(which(colSums(is.na(ens_preds)) > 0)) + if (length(na_cols_ens) > 0) { + # Identify a small sample of problematic site_ids for debugging + bad_rows <- which(!stats::complete.cases(ens_preds)) + bad_sites <- unique(ens_data$site_id[bad_rows]) + PEcAn.logger::logger.error( + "NA found in per-ensemble predictors after join for ensemble ", ens_label, + "; columns: ", paste(na_cols_ens, collapse = ", "), + "; example site_ids: ", paste(utils::head(bad_sites, 10), 
collapse = ", ") + ) + } + } + } + n <- nrow(ens_data) + if (n <= 9) { + PEcAn.logger::logger.severe(glue::glue("Only {n} records available for ensemble {ens_label} at selected slice")) + } + if (n >= 10) { + n_train <- max(1, floor(0.8 * n)) + idx <- sample(seq_len(n), size = n_train) + .train_data <- ens_data[idx, , drop = FALSE] + .test_data <- ens_data[-idx, , drop = FALSE] + } + + PEcAn.logger::logger.info( + glue::glue("Fitting model for ensemble {ens_label} ({i}/{n_ensembles}) with {nrow(.train_data)} training points and {nrow(.test_data)} testing points.") + ) + + # nodesize is just a temporary hack + nodesize <- ifelse(nrow(.train_data) < 50, 1, 5) + PEcAn.logger::logger.info("Using nodesize=", nodesize, " for ensemble ", ens_label) + # Assert no NA in training predictors or response (do not impute/drop) + if (length(covariate_names) > 0) { + train_preds <- as.data.frame(.train_data[, covariate_names, drop = FALSE]) + na_cols <- names(which(colSums(is.na(train_preds)) > 0)) + if (length(na_cols) > 0) { + PEcAn.logger::logger.error( + "NA found in training predictors for ensemble ", ens_label, + ": ", paste(na_cols, collapse = ", "), + ". Please fix upstream covariates for these site_ids." + ) + } + } + if (any(is.na(.train_data$prediction))) { + PEcAn.logger::logger.error( + "NA found in training response 'prediction' for ensemble ", ens_label, + ". Please fix upstream ensemble data." 
+ ) + } + # Ensure training proceeds even if global na.action is set to na.fail + model <- randomForest::randomForest(formula, + data = .train_data, + ntree = 1000, + keep.forest = TRUE, + importance = TRUE, + nodesize = nodesize, + na.action = stats::na.fail + ) + + # TODO: enable alternative for speed once RF gets slow: + # model <- ranger::ranger( + # formula, + # data = .train_data, + # num.trees = 1000, # correct replacement for ntree + # importance = "impurity", # matches randomForest default + # num.threads = 4 # optional, speeds up predict() + # ) + PEcAn.logger::logger.info( + "Predicting for ensemble", paste0(" ", ens_label, " (", i, "/", n_ensembles, ")"), + "with", nrow(scaled_covariates), "design points." + ) + start <- Sys.time() + prediction <- predict(model, scaled_covariates) + end <- Sys.time() + ### Optimization notes for when we scale up: + ### for speed as this scales up use ranger::predict + # prediction <- ranger::predict(model, scaled_covariates, + # num.threads = parallel::detectCores() - 1) + ### If predict runs out of memory, can split / apply predict / and combine results + # split_rows <- split(scaled_covariates, ceiling(seq_len(nrow(scaled_covariates)) / 10000)) + # preds <- purrr::map(split_rows, ~ ranger::predict(model, .x)) + # prediction <- do.call(rbind, lapply(preds, function(x) x$predictions)) + + ### for raster stack as covariates + # prediction <- terra::predict(model, scaled_covariates) + + PEcAn.logger::logger.info( + "Prediction for ensemble", i, "completed in", + round(as.numeric(end - start, units = "secs"), 2), "seconds." 
+ ) + + + # Predicting for test data should be much faster + if (length(covariate_names) > 0 && nrow(.test_data) > 0) { + test_preds <- as.data.frame(.test_data[, covariate_names, drop = FALSE]) + na_cols_test <- names(which(colSums(is.na(test_preds)) > 0)) + if (length(na_cols_test) > 0) { + bad_rows_test <- which(!stats::complete.cases(test_preds)) + bad_sites_test <- unique(.test_data$site_id[bad_rows_test]) + PEcAn.logger::logger.error( + "NA found in test predictors for ensemble ", ens_label, + "; columns: ", paste(na_cols_test, collapse = ", "), + "; example site_ids: ", paste(utils::head(bad_sites_test, 10), collapse = ", ") + ) + } + } + test_prediction <- predict(model, .test_data) + + list( + model = model, + prediction = prediction, + test_data = .test_data, + test_prediction = test_prediction + ) + }, + .progress = TRUE, + .options = furrr::furrr_options(seed = TRUE) # Use global seed to silence warnings + ) + + # Organize the results into a single output list + # TODO: need to disambiguate terms design point, sipnet prediction @ design points (which become 'test' + # vs downscaling prediction + downscale_output <- + list( + data = list(training = train_data, testing = test_data), # should these be the scaled versions? + model = purrr::map(results, "model"), + predictions = purrr::map(results, "prediction"), + test_data = purrr::map(results, "test_data"), + test_predictions = purrr::map(results, "test_prediction") + ) + + return(downscale_output) +} + +##' @title Calculate Metrics for Downscaling Results +##' @name downscale_metrics +##' @author Sambhav Dixit, David LeBauer +##' +##' @param downscale_output List. Output from the downscale function, containing data, models, maps, predictions, +##' and test predictions for each ensemble. +##' @param carbon_pool Character. Name of the carbon pool used in the downscaling process. +##' +##' @details This function calculates performance metrics for the downscaling results. 
It computes Mean Squared Error (MSE), +##' Mean Absolute Error (MAE), and R-squared for each ensemble. The function uses the actual values from the testing data and +##' the predictions generated during the downscaling process. +##' +##' @description This function takes the output from the downscale function and computes various performance metrics for each ensemble. +##' It provides a way to evaluate the accuracy of the downscaling results without modifying the main downscaling function. +##' +##' @return A list of metrics for each ensemble, where each element contains MAE , RMSE ,R_squared ,CV, +##' and actual values from testing data and predicted values for the testing data +##' +##' @export +downscale_metrics <- function(downscale_output) { + test_data_list <- lapply(downscale_output$test_data, function(x) dplyr::pull(x, prediction)) + predicted_list <- downscale_output$test_predictions + + metric_fn <- function(actual, predicted) { # Could use PEcAn.benchmark pkg? + + # Ensure no NA values in actual and predicted + if (any(is.na(actual)) || any(is.na(predicted))) { + n_na_act <- sum(is.na(actual)) + n_na_pred <- sum(is.na(predicted)) + PEcAn.logger::logger.error( + "NA values found in actual or predicted data (", + n_na_act, " actual; ", n_na_pred, " predicted )." 
+      )
+    }
+    mean <- mean(actual)
+    RMSE <- sqrt(mean((actual - predicted)^2))
+    MAE <- mean(abs(actual - predicted))
+    R2 <- 1 - sum((actual - predicted)^2) /
+      sum((actual - mean(actual))^2)
+
+    CV <- 100 * RMSE / mean
+
+    if (CV > 500) {
+      PEcAn.logger::logger.error("CV (100 * RMSE / mean) > 500, indicating CV may not be an appropriate metric")
+      CV <- NA
+    }
+
+    stats <- data.frame(
+      mean = mean,
+      RMSE = RMSE,
+      MAE = MAE,
+      R2 = R2,
+      CV = CV
+    ) |>
+      signif(digits = 2)
+  }
+  metrics <- purrr::map2(test_data_list, predicted_list, metric_fn) |>
+    dplyr::bind_rows(.id = "ensemble")
+
+  return(metrics)
+}
diff --git a/modules/assim.sequential/man/SDA_downscale.Rd b/modules/assim.sequential/man/SDA_downscale.Rd
index 89f6810866b..9f33bc83bbc 100644
--- a/modules/assim.sequential/man/SDA_downscale.Rd
+++ b/modules/assim.sequential/man/SDA_downscale.Rd
@@ -6,7 +6,7 @@
 \usage{
 SDA_downscale(
   preprocessed,
-  date,
+  date = NULL,
   carbon_pool,
   covariates,
   model_type = "rf",
@@ -16,11 +16,11 @@ SDA_downscale(
 \arguments{
 \item{preprocessed}{List. Preprocessed data returned as an output from the SDA_downscale_preprocess function.}
 
-\item{date}{Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within file supplied to 'preprocessed' from the 'data_path'.}
+\item{date}{*Deprecated*. This argument has never been used and will be removed after 2026-04-01}
 
-\item{carbon_pool}{Character. Carbon pool of interest. Name must match carbon pool name found within file supplied to 'preprocessed' from the 'data_path'.}
+\item{carbon_pool}{Character. Carbon pool of interest. Name must match carbon pool name found within file supplied to 'preprocessed' from the 'ensemble_data'.}
 
-\item{covariates}{SpatRaster stack. Used as predictors in downscaling. Layers within stack should be named. Recommended that this stack be generated using 'covariates' instructions in assim.sequential/inst folder}
+\item{covariates}{SpatRaster stack or sf object. 
Used as predictors in downscaling. If providing a raster stack, layers should be named. If providing an sf object, predictor attributes should be present.} \item{model_type}{Character. Either "rf" for Random Forest or "cnn" for Convolutional Neural Network. Default is Random Forest.} diff --git a/modules/assim.sequential/man/SDA_downscale_preprocess.Rd b/modules/assim.sequential/man/SDA_downscale_preprocess.Rd index 0e2a9f70bfe..86daeba6a4d 100644 --- a/modules/assim.sequential/man/SDA_downscale_preprocess.Rd +++ b/modules/assim.sequential/man/SDA_downscale_preprocess.Rd @@ -1,29 +1,48 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/downscale_function.R +% Please edit documentation in R/downscale_function.R, R/ensemble_downscale.R \name{SDA_downscale_preprocess} \alias{SDA_downscale_preprocess} +\alias{subset_ensemble} \title{Preprocess Data for Downscaling} \usage{ -SDA_downscale_preprocess(data_path, coords_path, date, carbon_pool) +SDA_downscale_preprocess(ensemble_data, site_coords, date, carbon_pool) + +subset_ensemble(ensemble_data, site_coords, date, carbon_pool) } \arguments{ -\item{data_path}{Character. File path for .rds containing ensemble data.} +\item{ensemble_data}{EFI standard tibble or data.frame} + +\item{site_coords}{data.frame with unique site id} -\item{coords_path}{Character. File path for .csv file containing the site coordinates, with columns named "lon" and "lat".} +\item{date}{Date. The date for the run, must be a date within `ensemble_data`.} -\item{date}{Date. If SDA site run, format is yyyy/mm/dd; if NEON, yyyy-mm-dd. Restricted to years within the file supplied to 'data_path'.} +\item{carbon_pool}{Character. Carbon pool of interest. Name must match the carbon pool name in ensemble_data. +found within the file or object supplied to 'ensemble_data'.} -\item{carbon_pool}{Character. Carbon pool of interest. 
Name must match the carbon pool name found within the file supplied to 'data_path'.} +\item{site_coordinates}{Character, data.frame, or sf object. Either: +- A file path to a `.csv` containing site coordinates with columns `"id"`, `"lat"`, and `"lon"`. +- A `data.frame` with the same structure. +- An `sf` object with point geometries.} } \value{ A list containing The read .rds data , The cleaned site coordinates, and the preprocessed carbon data. + +A list containing the cleaned site coordinates and the ensemble carbon output for the +specified date and carbon pool. } \description{ This function reads and checks the input data, ensuring that the required date and carbon pool exist, and that the site coordinates are valid. } \details{ This function ensures that the specified date and carbon pool are present in the input data. It also checks the validity of the site coordinates and aligns the number of rows between site coordinates and carbon data. + +This function subsets ensemble data and ensures that the specified date and +carbon pool are present in the ensemble data. } \author{ Sambhav Dixit + +David LeBauer + +Sambhav Dixit, David LeBauer } diff --git a/modules/assim.sequential/man/downscale_metrics.Rd b/modules/assim.sequential/man/downscale_metrics.Rd new file mode 100644 index 00000000000..8951c0f1b46 --- /dev/null +++ b/modules/assim.sequential/man/downscale_metrics.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ensemble_downscale.R +\name{downscale_metrics} +\alias{downscale_metrics} +\title{Calculate Metrics for Downscaling Results} +\usage{ +downscale_metrics(downscale_output) +} +\arguments{ +\item{downscale_output}{List. Output from the downscale function, containing data, models, maps, predictions, +and test predictions for each ensemble.} + +\item{carbon_pool}{Character. 
Name of the carbon pool used in the downscaling process.} +} +\value{ +A list of metrics for each ensemble, where each element contains MAE , MSE ,R_squared ,actual values from testing data and predicted values for the testing data +} +\description{ +This function takes the output from the downscale function and computes various performance metrics for each ensemble. +It provides a way to evaluate the accuracy of the downscaling results without modifying the main downscaling function. +} +\details{ +This function calculates performance metrics for the downscaling results. It computes Mean Squared Error (MSE), +Mean Absolute Error (MAE), and R-squared for each ensemble. The function uses the actual values from the testing data and +the predictions generated during the downscaling process. +} +\author{ +Sambhav Dixit, David LeBauer +} diff --git a/modules/assim.sequential/man/ensemble_downscale.Rd b/modules/assim.sequential/man/ensemble_downscale.Rd new file mode 100644 index 00000000000..92d230188c4 --- /dev/null +++ b/modules/assim.sequential/man/ensemble_downscale.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ensemble_downscale.R +\name{ensemble_downscale} +\alias{ensemble_downscale} +\title{Ensemble Downscale} +\usage{ +ensemble_downscale(ensemble_data, site_coords, covariates, seed = NULL) +} +\arguments{ +\item{ensemble_data}{EFI standard tibble or data.frame. Contains carbon data for downscaling.} + +\item{site_coords}{data.frame, tibble, or sf object. Design points. If not sf object, must have +'lon' and 'lat' columns. Must have unique identifier 'site' field.} + +\item{covariates}{table containing numeric predictors to be used in downscaling. +Must have unique identifier 'site_id' field and predictor attributes} + +\item{seed}{Numeric or NULL. Optional seed for random number generation. 
+ Default is NULL.}
+}
+\value{
+A list containing the model, predictions for all values of covariates as well as test data and test predictions for downstream
+statistics.
+}
+\description{
+Ensemble Downscale
+}
+\details{
+This function will downscale forecast data to unmodeled locations using covariates and site locations
+}
+\author{
+Joshua Ploshay, Sambhav Dixit, David LeBauer
+}
diff --git a/modules/assim.sequential/man/outlier.detector.boxplot.Rd b/modules/assim.sequential/man/outlier.detector.boxplot.Rd
index 2c2b7c74030..cc811006502 100644
--- a/modules/assim.sequential/man/outlier.detector.boxplot.Rd
+++ b/modules/assim.sequential/man/outlier.detector.boxplot.Rd
@@ -4,7 +4,7 @@
 \alias{outlier.detector.boxplot}
 \title{outlier.detector.boxplot}
 \usage{
-outlier.detector.boxplot(X)
+outlier.detector.boxplot(X)
 }
 \arguments{
 \item{X}{A list of dataframes}
diff --git a/modules/assim.sequential/tests/testthat/test_downscaling.R b/modules/assim.sequential/tests/testthat/test_downscaling.R
new file mode 100644
index 00000000000..460e45d3767
--- /dev/null
+++ b/modules/assim.sequential/tests/testthat/test_downscaling.R
@@ -0,0 +1,82 @@
+# Test setup: temporary datasets and files
+withr::with_tempdir({
+  temp_ensemble_data_rds <- "ensemble_data.rds"
+  temp_coords_csv <- "final_design_points.csv"
+  unlink(c(temp_ensemble_data_rds, temp_coords_csv))
+
+  set.seed(123)
+  ensemble_data <- list(
+    "2020-01-01" = data.frame(
+      site_id = 1:10,
+      SOC = runif(10, 8, 15),
+      AGB = runif(10, -0.5, 0.5)
+    )
+  )
+  saveRDS(ensemble_data, temp_ensemble_data_rds)
+
+  # Generate test coordinates with 10 values
+  site_coordinates <- data.frame(
+    id = 1:10,
+    lat = runif(10, 33.5, 34.5),
+    lon = runif(10, -118, -117)
+  )
+  write.csv(site_coordinates, temp_coords_csv, row.names = FALSE)
+
+  # Test `SDA_downscale_preprocess`
+  test_that("SDA_downscale_preprocess handles both file and objects as inputs", {
+    # ensemble data from .rds file path, coordinates as in-memory data.frame
+    processed_data <- SDA_downscale_preprocess(
ensemble_data = temp_ensemble_data_rds,
+      date = "2020-01-01",
+      carbon_pool = "SOC",
+      site_coords = site_coordinates
+    )
+    # ensemble data as in-memory list, coordinates from .csv file path
+    processed_data2 <- SDA_downscale_preprocess(
+      ensemble_data = ensemble_data,
+      date = "2020-01-01",
+      carbon_pool = "SOC",
+      site_coords = temp_coords_csv
+    )
+    expect_identical(processed_data, processed_data2)
+
+    expect_true(is.list(processed_data))
+    expect_true("input_data" %in% names(processed_data))
+    expect_true("site_coordinates" %in% names(processed_data))
+    expect_true("carbon_data" %in% names(processed_data))
+    expect_true(is.data.frame(processed_data$site_coordinates))
+    expect_true("id" %in% colnames(processed_data$site_coordinates))
+    expect_true("lat" %in% colnames(processed_data$site_coordinates))
+    expect_true("lon" %in% colnames(processed_data$site_coordinates))
+    expect_true(is.data.frame(processed_data$carbon_data))
+    expect_true("ensemble1" %in% colnames(processed_data$carbon_data))
+  })
+
+  # Generate test raster data
+  r <- terra::rast(ncols = 10, nrows = 10)
+  terra::values(r) <- runif(100)
+
+  # Create preprocessed data object with 10 values
+  preprocessed <- list(
+    input_data = ensemble_data,
+    site_coordinates = site_coordinates,
+    carbon_data = data.frame(
+      ensemble1 = runif(10, 8, 15)
+    )
+  )
+
+  # Test `SDA_downscale`
+  test_that("SDA_downscale works with sf coordinates and raster covariates", {
+    downscaled_results <- SDA_downscale(
+      preprocessed = preprocessed,
+      carbon_pool = "SOC",
+      covariates = r,
+      model_type = "rf"
+    )
+
+    expect_true(is.list(downscaled_results))
+    expect_true("data" %in% names(downscaled_results))
+    expect_true("models" %in% names(downscaled_results))
+    expect_true("maps" %in% names(downscaled_results))
+  })
+})