Simulate data for testing
Simulating testing data to check adequacy of bamlss
models with independent random
effects.
Load packages, read data and source custom scripts
rm(list = ls())
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
path_proj <- day2day::git_path()
path_data <- file.path(path_proj, "data")
path_processed <- file.path(path_data, "processed")
Simulate data
\[\text{born_weight} = \beta_0 + g(\text{marital_status}) + f(\text{age}) + f(\text{municipality})\]
# groups parameters
groups_n <- 40
groups_mu <- 0
groups_sigma <- 200
groups_eff <- rnorm(groups_n, groups_mu, groups_sigma)
# marital_status parameters
marital_eff <- c(40, 20, - 100)
# race parameters
race_eff <- seq(1, by = 20, length = 4)
# data parameters
n <- 20000
intercept <- 3000
sigma <- 200
# simulate
bwdata <- tibble::tibble(id = 1:n) %>% mutate(
municipality = sample(1:groups_n, n, replace = TRUE),
marital_status = factor(sample(seq_along(marital_eff), n, replace = TRUE)),
race = factor(sample(seq_along(race_eff), n, replace = TRUE)),
age = 30 + rnorm(n, 10, 10),
born_weight = intercept +
marital_eff[marital_status] +
race_eff[race] -
(age - 30) ^ 2 +
groups_eff[municipality] +
rnorm(n, 0, sigma),
municipality = factor(municipality, 1:groups_n),
fac_fake = factor(sample(0:1, n, replace = TRUE))
)
# discrete age
bwdata <- bwdata %>%
mutate(age_bin = round(age))
Check Simulated data
# intercept
with(bwdata, mean(born_weight))
#> [1] 2806.258
with(bwdata, mean(born_weight - marital_eff[marital_status] - race_eff[race]) +
marital_eff[1] + race_eff[1])
#> [1] 2829.555
intercept + 40 + 1 - mean((bwdata$age - 30) ^ 2) + with(bwdata, mean(groups_eff[municipality]))
#> [1] 2828.273
# ire
groups_sigma^2
#> [1] 40000
# intercept on sigma
log(sigma)
#> [1] 5.298317
# mean per group
setNames(round(groups_eff), seq_along(groups_eff))
#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#> 270 113 -214 -48 15 353 16 -135 -253 -335 94 141 390 -204 142 201 6 160 -215 -260
#> 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
#> 109 1 60 12 -252 -117 53 47 232 -26 92 -246 58 -295 55 -238 -455 246 44 -142
# marital_eff
marital_eff - marital_eff[1]
#> [1] 0 -20 -140
# race_eff
race_eff - race_eff[1]
#> [1] 0 20 40 60
Visualize
ggplot(bwdata, aes(age, born_weight, group = municipality)) +
geom_point(size = 0.7) +
geom_smooth(aes(col = municipality), se = FALSE, lwd = 0.5, show.legend = FALSE) +
ggtitle("Birthweight vs age")
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(bwdata, aes(marital_status, born_weight)) +
geom_violin(aes(fill = marital_status)) +
ggtitle("Birthweight by marital_status")
ggplot(bwdata, aes(municipality, born_weight)) +
geom_violin(aes(fill = municipality), show.legend = FALSE) +
ggtitle("Birthweight by municipality")
Create dummy variables
bwdata <- cbind(
bwdata,
model.matrix(~ marital_status + race, bwdata)[, -1]
)
Save birthweight data for modelling
fst::write_fst(bwdata, path = file.path(path_processed, "bwdata_51_test.fst"))
Time to execute the task
Only useful when executed with Rscript
.
proc.time()
#> user system elapsed
#> 4.759 0.119 4.940