Simulate data for testing


Simulating testing data to check adequacy of bamlss models with independent random effects.

Load packages, read data and source custom scripts

rm(list = ls())
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)

path_proj <- day2day::git_path()
path_data <- file.path(path_proj, "data")
path_processed <- file.path(path_data, "processed")

Simulate data

\[\text{born_weight} = \beta_0 + g(\text{marital_status}) + f(\text{age}) + f(\text{municipality})\]

# groups parameters
groups_n <- 40
groups_mu <- 0
groups_sigma <- 200
groups_eff <- rnorm(groups_n, groups_mu, groups_sigma)

# marital_status parameters
marital_eff <- c(40, 20, - 100)

# race parameters
race_eff <- seq(1, by = 20, length = 4)

# data parameters
n <- 20000
intercept <- 3000
sigma <- 200

# simulate
bwdata <- tibble::tibble(id = 1:n) %>% mutate(
    municipality = sample(1:groups_n, n, replace = TRUE),
    marital_status = factor(sample(seq_along(marital_eff), n, replace = TRUE)),
    race = factor(sample(seq_along(race_eff), n, replace = TRUE)),
    age = 30 + rnorm(n, 10, 10),
    born_weight = intercept +
        marital_eff[marital_status] +
        race_eff[race] -
        (age - 30) ^ 2 +
        groups_eff[municipality] +
        rnorm(n, 0, sigma),
    municipality = factor(municipality, 1:groups_n),
    fac_fake = factor(sample(0:1, n, replace = TRUE))
)

# discrete age
bwdata <- bwdata %>%
    mutate(age_bin = round(age))

Check Simulated data

# intercept
with(bwdata, mean(born_weight))
#> [1] 2806.258
with(bwdata, mean(born_weight - marital_eff[marital_status] - race_eff[race]) +
     marital_eff[1] + race_eff[1])
#> [1] 2829.555
intercept + 40 + 1 - mean((bwdata$age - 30) ^ 2) + with(bwdata, mean(groups_eff[municipality]))
#> [1] 2828.273
# ire
groups_sigma^2
#> [1] 40000
# intercept on sigma
log(sigma)
#> [1] 5.298317
# mean per group
setNames(round(groups_eff), seq_along(groups_eff))
#>    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20 
#>  270  113 -214  -48   15  353   16 -135 -253 -335   94  141  390 -204  142  201    6  160 -215 -260 
#>   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40 
#>  109    1   60   12 -252 -117   53   47  232  -26   92 -246   58 -295   55 -238 -455  246   44 -142
# marital_eff
marital_eff - marital_eff[1]
#> [1]    0  -20 -140
# race_eff
race_eff - race_eff[1]
#> [1]  0 20 40 60

Visualize

ggplot(bwdata, aes(age, born_weight, group = municipality)) +
    geom_point(size = 0.7) +
    geom_smooth(aes(col = municipality), se = FALSE, lwd = 0.5, show.legend = FALSE) +
    ggtitle("Birthweight vs age")
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(bwdata, aes(marital_status, born_weight)) +
    geom_violin(aes(fill = marital_status)) +
    ggtitle("Birthweight by marital_status")

ggplot(bwdata, aes(municipality, born_weight)) +
    geom_violin(aes(fill = municipality), show.legend = FALSE) +
    ggtitle("Birthweight by municipality")

Create dummy variables

bwdata <- cbind(
    bwdata,
    model.matrix(~ marital_status + race, bwdata)[, -1]
    )

Save birthweight data for modelling

fst::write_fst(bwdata, path = file.path(path_processed, "bwdata_51_test.fst"))

Time to execute the task

Only useful when executed with Rscript.

proc.time()
#>    user  system elapsed 
#>   4.759   0.119   4.940