Binomial models: full data without interaction


Running binomial models for count data, for each land class.

Load packages, read data and source custom scripts

rm(list = ls())
library(mgcv)
#> Loading required package: nlme
#> This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
library(purrr)
library(tibble)

path_proj <- day2day::git_path()
path_data <- file.path(path_proj, "data")
path_processed <- file.path(path_data, "processed")
path_modelled <- file.path(path_data, "modelled")

land_cover <- file.path(path_processed, "uk_1km_dataframe_train_full.fst") %>%
    fst::read_fst()

prefix <- "binom_full_simple"
output_name <- paste0(sub("binom", "model_binomials", prefix), ".rds")
run_model <- TRUE

Running binomial models

Defining formulas for 6 classes of land cover.

form <-  ~ s(gdd) + s(min_tas) + s(max_tas) + s(smd) + s(sms)
form1 <- update(
    form,
    cbind(count_1_arable, count_total - count_1_arable) ~ . + s(elev) + s(slope_nb8)
)
form2 <- update(
    form,
    cbind(count_2_wetland, count_total - count_2_wetland) ~ . + s(elev) + s(slope_nb8)
)
form3 <- update(
    form,
    cbind(count_3_improved_grassland, count_total - count_3_improved_grassland) ~ .
)
form4 <- update(
    form,
    cbind(count_4_forest, count_total - count_4_forest) ~ . + s(elev) + s(slope_nb8)
)
form5 <- update(
    form,
    cbind(count_5_semi_natural_grassland, count_total - count_5_semi_natural_grassland) ~ .
)
# form6 <- update(
#     form,
#     cbind(count_6_urban, count_total - count_6_urban) ~ s(dist) + s(pop) + s(gdhi)
# )

Define model data frame.

form_names <- grep("form[[:digit:]]", ls(), value = TRUE)
make_model_name <-  function(x, prefix) sub("count", prefix, all.vars(get(x))[1])
model_names <- sapply(form_names, make_model_name, prefix = prefix)
model_df <- tibble::tibble(
    name = model_names,
    form = map(form_names, get)
)

Run models.

if (run_model) {
    ini <- Sys.time()
    model_df$model <-  map(model_df$form, ~ gam(., family = binomial(), data = land_cover))
    print(Sys.time() - ini)
} else {
    model_df <- readRDS(file.path(path_modelled, output_name))
}
#> Time difference of 1.903525 hours

Save model

Using compress = FALSE for faster readRDS and saveRDS.

model_df
#> # A tibble: 5 x 3
#>   name                                       form      model 
#>   <chr>                                      <list>    <list>
#> 1 binom_full_simple_1_arable                 <formula> <gam> 
#> 2 binom_full_simple_2_wetland                <formula> <gam> 
#> 3 binom_full_simple_3_improved_grassland     <formula> <gam> 
#> 4 binom_full_simple_4_forest                 <formula> <gam> 
#> 5 binom_full_simple_5_semi_natural_grassland <formula> <gam>
saveRDS(model_df, file = file.path(path_modelled, output_name), compress = FALSE)

Time to execute the task

Only useful when executed with Rscript.

proc.time()
#>     user   system  elapsed 
#> 5602.881  778.108 6862.902