Data quality: check missing data patterns
Check the percentage of missing values for each covariate and map where those missing values occur. Because we consider the whole UK area, it is reasonable to see missing values for some variables in some regions.
Load packages, read data and source custom scripts
# NOTE(review): removes every object that `ls()` lists in the environment this
# script runs in. That is harmless in a fresh session, but it wipes the
# caller's workspace if the file is sourced interactively -- consider dropping.
rm(list = ls())
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(colorspace)
# Project directory as reported by day2day::git_path()
# (presumably the git repository root -- TODO confirm).
path_proj <- day2day::git_path()

# Data directories nested below the project directory.
path_data <- file.path(path_proj, "data")
path_processed <- file.path(path_data, "processed")

# Read the processed data frame that the rest of this report inspects.
land_cover <- file.path(path_processed, "uk_1km_dataframe.fst") %>%
  fst::read_fst()
Metadata
# column names
colnames(land_cover)
#> [1] "x" "y"
#> [3] "count_0_outside" "count_1_arable"
#> [5] "count_2_wetland" "count_3_improved_grassland"
#> [7] "count_4_forest" "count_5_semi_natural_grassland"
#> [9] "count_6_urban" "count_7_other"
#> [11] "elev" "slope_nb8"
#> [13] "pop" "dist"
#> [15] "gdhi" "country"
#> [17] "gdd" "max_tas"
#> [19] "max_tasmax" "maxmonth_tas"
#> [21] "min_tas" "min_tasmin"
#> [23] "smd" "sms"
#> [25] "id" "count_total"
#> [27] "count_no0" "count_no_urban"
# total number of rows
nrow(land_cover)
#> [1] 244105
# number of complete rows, i.e. rows without a missing value in any column
sum(complete.cases(land_cover))
#> [1] 226071
Missing data
The following figure shows the covariates that have missing values, ordered by decreasing percentage of missing values.
# Share of missing values in every column except `id`.
miss_per <- vapply(
  dplyr::select(land_cover, -id),
  function(column) sum(is.na(column)) / length(column),
  numeric(1)
)
miss_per <- sort(miss_per, decreasing = TRUE)

# One row per variable; the factor levels follow the sorted order so that the
# bars are drawn from the largest share to the smallest.
miss_per <- data.frame(
  variable = factor(names(miss_per), levels = names(miss_per)),
  percentage = miss_per
)

# Keep only the variables that actually have missing values.
miss_per <- miss_per[which(miss_per$percentage > 0), , drop = FALSE]

# Bar chart of the missing share per variable, with the y axis shown as percent.
ggplot(miss_per, aes(x = variable, y = percentage)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  labs(x = NULL, y = NULL) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))
Visualize the missing-data pattern for all covariates
#' Map one covariate and highlight the cells where it is missing
#'
#' Draws `var` as a tile map over the `x`/`y` columns of `data`, filled with a
#' pseudo-log sequential "Viridis" scale, and overlays dark-red points on
#' every row where `var` is `NA`.
#'
#' @param var Name of the column to map, given as a single string.
#' @param data Data frame containing the columns `x`, `y` and `var`. Defaults
#'   to the `land_cover` object found when the function is called.
#' @return A ggplot object.
gg_mis <- function(var, data = land_cover) {
  if (!is.character(var) || length(var) != 1L || !var %in% names(data)) {
    stop("`var` must be the name of a single column of `data`.", call. = FALSE)
  }

  # The same dark red marks missing tiles and the overlaid points.
  na_colour <- rgb(0.6, 0, 0)
  # Axis labels: divide the coordinate by 1000 and append " km".
  km_labels <- label_number(scale = 1 / 1e3, suffix = " km")
  rows_missing <- dplyr::filter(data, is.na(.data[[var]]))

  ggplot(data, aes(x, y)) +
    # `.data[[var]]` replaces the deprecated `aes_string(fill = var)`.
    geom_tile(aes(fill = .data[[var]])) +
    # Points are drawn on top of the tiles so missing cells stay visible.
    geom_point(data = rows_missing, colour = na_colour, size = rel(0.7)) +
    scale_x_continuous(labels = km_labels) +
    scale_y_continuous(labels = km_labels) +
    scale_fill_continuous_sequential(
      "Viridis",
      trans = "pseudo_log", na.value = na_colour
    ) +
    coord_fixed() +
    # The legend title is set explicitly so it is always the column name.
    labs(x = NULL, y = NULL, fill = var, title = toupper(var)) +
    theme_bw()
}
# One missing-data map per column, skipping the first two column names
# (the "x" and "y" coordinates in the listing above).
lapply(utils::tail(names(land_cover), n = -2L), FUN = gg_mis)
#> [[1]]
#>
#> [[2]]
#>
#> [[3]]
#>
#> [[4]]
#>
#> [[5]]
#>
#> [[6]]
#>
#> [[7]]
#>
#> [[8]]
#>
#> [[9]]
#>
#> [[10]]
#>
#> [[11]]
#>
#> [[12]]
#>
#> [[13]]
#>
#> [[14]]
#>
#> [[15]]
#>
#> [[16]]
#>
#> [[17]]
#>
#> [[18]]
#>
#> [[19]]
#>
#> [[20]]
#>
#> [[21]]
#>
#> [[22]]
#>
#> [[23]]
#>
#> [[24]]
#>
#> [[25]]
#>
#> [[26]]
Time to execute the task
Only useful when the script is executed with Rscript.
# Print the user, system and elapsed time consumed by this R process so far.
proc.time()
#> user system elapsed
#> 425.845 5.259 432.226