Skip to content

Commit

Permalink
🔥 Addition of superlearners to ensemble
Browse files Browse the repository at this point in the history
  • Loading branch information
Martin-Jung committed Aug 22, 2024
1 parent f282d58 commit 7dfbc6d
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 5 deletions.
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# ibis.iSDM 0.1.5 (current dev branch)

#### New features
* Support for 'modal' value calculations in `ensemble()` and export of method.
* Support for 'superlearner' in `ensemble()`.
* Support for future processing streamlined. See FAQ section for instructions #18.

#### Minor improvements and bug fixes
* Support for modal value calculations in `ensemble()` and export of method.
* Minor :bug: fix related to misaligned thresholds and negative exponential kernels.
* :fire: :bug: fix for scenario projections that use different grain sizes than for inference.

Expand Down
62 changes: 58 additions & 4 deletions R/ensemble.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
#' standard deviation (\code{"sd"}), the average of all PCA axes except the
#' first \code{"pca"}, the coefficient of variation (\code{"cv"}, Default) or
#' the range between the lowest and highest value (\code{"range"}).
#' @param point A [`sf`] object containing observational data used for model training. Used
#' for method \code{'superlearner'} only (Default: \code{NULL}).
#' @param field_occurrence A [`character`] location of biodiversity point records (Default: \code{'observed'}).
#' @param apply_threshold A [`logical`] flag (Default: \code{TRUE}) specifying
#' whether threshold values should also be created via \code{"method"}. Only
#' applies and works for [`DistributionModel`] and thresholds found.
Expand All @@ -49,6 +52,8 @@
#' * \code{'min.sd'} - Ensemble created by minimizing the uncertainty among predictions.
#' * \code{'threshold.frequency'} - Returns an ensemble based on threshold frequency (simple count). Requires thresholds to be computed.
#' * \code{'pca'} - Calculates a PCA between predictions of each algorithm and then extract the first axis (the one explaining the most variation).
#' * \code{'superlearner'} - Composites two predictions through a 'meta-model' fitted on top
#' (using a [`glm`] by default). Requires binomial data in current Setup.
#'
#' In addition to the different ensemble methods, a minimal threshold
#' (\code{min.value}) can be set that needs to be surpassed for averaging. By
Expand Down Expand Up @@ -95,14 +100,17 @@ NULL
methods::setGeneric("ensemble",
signature = methods::signature("..."),
function(..., method = "mean", weights = NULL, min.value = NULL, layer = "mean",
normalize = FALSE, uncertainty = "cv", apply_threshold = TRUE) standardGeneric("ensemble"))
normalize = FALSE, uncertainty = "cv",
point = NULL, field_occurrence = 'observed',
apply_threshold = TRUE) standardGeneric("ensemble"))

#' @rdname ensemble
methods::setMethod(
"ensemble",
methods::signature("ANY"),
function(..., method = "mean", weights = NULL, min.value = NULL, layer = "mean",
normalize = FALSE, uncertainty = "cv", apply_threshold = TRUE){
normalize = FALSE, uncertainty = "cv",
point = NULL, field_occurrence = 'observed', apply_threshold = TRUE){
if(length(list(...))>1) {
mc <- list(...)
} else {
Expand Down Expand Up @@ -137,10 +145,21 @@ methods::setMethod(

# Check the method
method <- match.arg(method, c('mean', 'weighted.mean', 'median', 'max', 'min','mode',
'superlearner',
'threshold.frequency', 'min.sd', 'pca'), several.ok = FALSE)
# Uncertainty calculation
uncertainty <- match.arg(uncertainty, c('none','sd', 'cv', 'range', 'pca'), several.ok = FALSE)

# Method specific checks
if(method == "superlearner"){
assertthat::assert_that(!is.null(point),
msg = "Ensemble method superlearner requires a specified point data.")
assertthat::assert_that(utils::hasName(point, field_occurrence),
msg = "Field occurrence not found in specified point data.")
assertthat::assert_that(dplyr::n_distinct(point[[field_occurrence]])==2,
msg = "Superlearner currently requires binomial data for ensembling!")
}
# --- #
# For Distribution model ensembles
if( all( sapply(mods, function(z) inherits(z, "DistributionModel")) ) ){
assertthat::assert_that(length(mods)>=2, # Need at least 2 otherwise this does not make sense
Expand Down Expand Up @@ -228,6 +247,23 @@ methods::setMethod(
} else if(method == 'pca'){
# Calculate a pca on the layers and return the first axes
new <- predictor_transform(ras, option = "pca", pca.var = 1)[[1]]
} else if(method == 'superlearner'){
# Run a superlearner with the specified point data.
# Ensure that predictions have unique names
names(ras) <- paste0('model', 1:terra::nlyr(ras))
ex <- terra::extract(ras, point, ID = FALSE)
ex <- cbind(point[,field_occurrence], ex)
fit <- glm(
formula = paste(field_occurrence, "~", paste0(names(ras), collapse = ' + ')) |> as.formula(),
family = binomial(),data = ex
)
# Now predict output with the meta-learner
new <- emptyraster(ras)
new[which(!is.na(ras[[1]])[])] <- terra::predict(
fit, ras, na.rm = FALSE, type = "response",
cores = getOption('ibis.nthread'))
attr(new, "superlearner.coefficients") <- coef(fit)
try({ rm(ex,fit) },silent = TRUE)
}

# Rename
Expand Down Expand Up @@ -349,7 +385,6 @@ methods::setMethod(
} else if(method == 'threshold.frequency'){
# Check that thresholds are available
stop("This function does not (yet) work with directly provided Raster objects.")

} else if(method == 'min.sd'){
# If method 'min.sd' furthermore check that there is a sd object for all
# of them
Expand All @@ -368,6 +403,23 @@ methods::setMethod(
} else if(method == 'pca'){
# Calculate a pca on the layers and return the first axes
new <- predictor_transform(ras, option = "pca", pca.var = 1)[[1]]
} else if(method == 'superlearner'){
# Run a superlearner with the specified point data.
# Ensure that predictions have unique names
names(ras) <- paste0('model', 1:terra::nlyr(ras))
ex <- terra::extract(ras, point, ID = FALSE)
ex <- cbind(point[,field_occurrence], ex)
fit <- glm(
formula = paste(field_occurrence, "~", paste0(names(ras), collapse = ' + ')) |> as.formula(),
family = binomial(),data = ex
)
# Now predict output with the meta-learner
new <- emptyraster(ras)
new[which(!is.na(ras[[1]])[])] <- terra::predict(
fit, ras, na.rm = FALSE, type = "response",
cores = getOption('ibis.nthread'))
attr(new, "superlearner.coefficients") <- coef(fit)
try({ rm(ex,fit) },silent = TRUE)
}
# Rename
names(new) <- paste0("ensemble_", lyr)
Expand All @@ -392,7 +444,7 @@ methods::setMethod(
suppressWarnings( out <- c(out, new) )
}
}

# Check and return output
assertthat::assert_that(is.Raster(out))
return(out)
} else {
Expand Down Expand Up @@ -470,6 +522,8 @@ methods::setMethod(
stop("This has not been reasonably implemented in this context.")
} else if(method == 'pca'){
stop("This has not been reasonably implemented in this context.")
} else if(method == 'superlearner'){
stop("This has not been reasonably implemented in this context.")
}
# Add dimensions to output
if(inherits(mods[[1]], "stars")){
Expand Down
11 changes: 11 additions & 0 deletions man/ensemble.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7dfbc6d

Please sign in to comment.