# Simulate data for testing

Simulating testing data to check adequacy of `bamlss`

models with independent random
effects.

## Load packages, read data and source custom scripts

```
rm(list = ls())
library(dplyr)
```

```
#>
#> Attaching package: 'dplyr'
```

```
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
```

```
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
```

```
library(ggplot2)
path_proj <- day2day::git_path()
path_data <- file.path(path_proj, "data")
path_processed <- file.path(path_data, "processed")
```

## Simulate data

\[\text{born_weight} = \beta_0 + g(\text{marital_status}) + f(\text{age}) + f(\text{municipality})\]

```
# groups parameters
groups_n <- 40
groups_mu <- 0
groups_sigma <- 200
groups_eff <- rnorm(groups_n, groups_mu, groups_sigma)
# marital_status parameters
marital_eff <- c(40, 20, - 100)
# race parameters
race_eff <- seq(1, by = 20, length = 4)
# data parameters
n <- 20000
intercept <- 3000
sigma <- 200
# simulate
bwdata <- tibble::tibble(id = 1:n) %>% mutate(
municipality = sample(1:groups_n, n, replace = TRUE),
marital_status = factor(sample(seq_along(marital_eff), n, replace = TRUE)),
race = factor(sample(seq_along(race_eff), n, replace = TRUE)),
age = 30 + rnorm(n, 10, 10),
born_weight = intercept +
marital_eff[marital_status] +
race_eff[race] -
(age - 30) ^ 2 +
groups_eff[municipality] +
rnorm(n, 0, sigma),
municipality = factor(municipality, 1:groups_n),
fac_fake = factor(sample(0:1, n, replace = TRUE))
)
# discrete age
bwdata <- bwdata %>%
mutate(age_bin = round(age))
```

## Check Simulated data

```
# intercept
with(bwdata, mean(born_weight))
```

`#> [1] 2806.258`

```
with(bwdata, mean(born_weight - marital_eff[marital_status] - race_eff[race]) +
marital_eff[1] + race_eff[1])
```

`#> [1] 2829.555`

`intercept + 40 + 1 - mean((bwdata$age - 30) ^ 2) + with(bwdata, mean(groups_eff[municipality]))`

`#> [1] 2828.273`

```
# ire
groups_sigma^2
```

`#> [1] 40000`

```
# intercept on sigma
log(sigma)
```

`#> [1] 5.298317`

```
# mean per group
setNames(round(groups_eff), seq_along(groups_eff))
```

```
#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#> 270 113 -214 -48 15 353 16 -135 -253 -335 94 141 390 -204 142 201 6 160 -215 -260
#> 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
#> 109 1 60 12 -252 -117 53 47 232 -26 92 -246 58 -295 55 -238 -455 246 44 -142
```

```
# marital_eff
marital_eff - marital_eff[1]
```

`#> [1] 0 -20 -140`

```
# race_eff
race_eff - race_eff[1]
```

`#> [1] 0 20 40 60`

## Visualize

```
ggplot(bwdata, aes(age, born_weight, group = municipality)) +
geom_point(size = 0.7) +
geom_smooth(aes(col = municipality), se = FALSE, lwd = 0.5, show.legend = FALSE) +
ggtitle("Birthweight vs age")
```

`#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'`

```
ggplot(bwdata, aes(marital_status, born_weight)) +
geom_violin(aes(fill = marital_status)) +
ggtitle("Birthweight by marital_status")
```

```
ggplot(bwdata, aes(municipality, born_weight)) +
geom_violin(aes(fill = municipality), show.legend = FALSE) +
ggtitle("Birthweight by municipality")
```

## Create dummy variables

```
bwdata <- cbind(
bwdata,
model.matrix(~ marital_status + race, bwdata)[, -1]
)
```

## Save birthweight data for modelling

`fst::write_fst(bwdata, path = file.path(path_processed, "bwdata_51_test.fst"))`

## Time to execute the task

Only useful when executed with `Rscript`

.

`proc.time()`

```
#> user system elapsed
#> 4.759 0.119 4.940
```