Data quality: check missing data patterns


Check the percentage of missing values for each covariate and visualize where each variable has missing values. We are considering the whole UK area, so it is reasonable to see missing values for some variables in some regions.

Load packages, read data and source custom scripts

# Remove the visible objects from the current workspace before running.
# NOTE(review): this does not detach packages or reset options, so it is not
# equivalent to a fresh R session -- consider dropping it and running the
# script in a clean session instead.
rm(list = ls())
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(colorspace)

# Project directories (presumably relative to the git repository root --
# confirm against day2day::git_path())
path_proj <- day2day::git_path()
path_data <- file.path(path_proj, "data")
path_processed <- file.path(path_proj, "data", "processed")

# Data frame read from the processed-data directory
land_cover <- fst::read_fst(
    file.path(path_processed, "uk_1km_dataframe.fst")
)

Metadata

# Column names of the loaded data frame
names(land_cover)
#>  [1] "x"                              "y"                             
#>  [3] "count_0_outside"                "count_1_arable"                
#>  [5] "count_2_wetland"                "count_3_improved_grassland"    
#>  [7] "count_4_forest"                 "count_5_semi_natural_grassland"
#>  [9] "count_6_urban"                  "count_7_other"                 
#> [11] "elev"                           "slope_nb8"                     
#> [13] "pop"                            "dist"                          
#> [15] "gdhi"                           "country"                       
#> [17] "gdd"                            "max_tas"                       
#> [19] "max_tasmax"                     "maxmonth_tas"                  
#> [21] "min_tas"                        "min_tasmin"                    
#> [23] "smd"                            "sms"                           
#> [25] "id"                             "count_total"                   
#> [27] "count_no0"                      "count_no_urban"
# Total number of rows in the loaded data frame
nrow(land_cover)
#> [1] 244105
# Number of rows with no missing value in any column
sum(complete.cases(land_cover))
#> [1] 226071

Missing data

The following figure shows the covariates that have missing values, ordered by decreasing percentage of missing values.

# Fraction of missing values in every column except `id`, largest first
miss_per <- vapply(
    dplyr::select(land_cover, -id),
    function(col) sum(is.na(col)) / length(col),
    numeric(1)
)
miss_per <- sort(miss_per, decreasing = TRUE)

# Use the sorted names as factor levels so the bars keep that order, then
# keep only the columns that actually have missing values
miss_per <- data.frame(
    variable = factor(names(miss_per), names(miss_per)),
    percentage = miss_per
)
miss_per <- miss_per[which(miss_per$percentage > 0), , drop = FALSE]

# Bar chart of the missing fraction, with the y axis labelled as a percentage
ggplot(miss_per, aes(variable, percentage)) +
    geom_col() +
    scale_y_continuous(labels = scales::percent) +
    labs(x = NULL, y = NULL) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))

Visualize the missing-data pattern for all covariates

#' Map one variable over the grid and highlight where it is missing
#'
#' Draws `var` as a tile map over the `x`/`y` columns of `data` (fill on a
#' pseudo-log scale) and overlays a dark-red point on every row where `var`
#' is `NA`.
#'
#' @param var Name of the column to plot, as a single string.
#' @param data Data frame containing the columns `x`, `y` and `var`. Defaults
#'   to the `land_cover` object created earlier in the script.
#' @return A ggplot object whose title is the upper-cased variable name.
gg_mis <- function(var, data = land_cover) {
    stopifnot(
        is.character(var),
        length(var) == 1L,
        var %in% names(data)
    )

    # One colour for both NA tiles and the overlaid points
    na_colour <- rgb(0.6, 0, 0)
    # Axis labels: coordinate divided by 1000, with a " km" suffix
    km_labels <- scales::label_number(scale = 1 / 1e3, suffix = " km")
    # Rows where the variable is missing; base subsetting avoids relying on
    # dplyr::filter masking stats::filter
    missing_rows <- data[is.na(data[[var]]), , drop = FALSE]

    ggplot(data, aes(x, y)) +
        # `.data[[var]]` replaces the deprecated aes_string()
        geom_tile(aes(fill = .data[[var]])) +
        geom_point(
            data = missing_rows, colour = na_colour, size = rel(0.7)
        ) +
        scale_x_continuous(labels = km_labels) +
        scale_y_continuous(labels = km_labels) +
        colorspace::scale_fill_continuous_sequential(
            "Viridis", trans = "pseudo_log", na.value = na_colour
        ) +
        coord_fixed() +
        # Set the legend title explicitly so it stays the plain column name
        labs(x = NULL, y = NULL, fill = var, title = toupper(var)) +
        theme_bw()
}
# Plot every column except the coordinates. Selecting by name rather than by
# position keeps this correct if the column order ever changes.
lapply(setdiff(names(land_cover), c("x", "y")), gg_mis)
#> [[1]]

#> 
#> [[2]]

#> 
#> [[3]]

#> 
#> [[4]]

#> 
#> [[5]]

#> 
#> [[6]]

#> 
#> [[7]]

#> 
#> [[8]]

#> 
#> [[9]]

#> 
#> [[10]]

#> 
#> [[11]]

#> 
#> [[12]]

#> 
#> [[13]]

#> 
#> [[14]]

#> 
#> [[15]]

#> 
#> [[16]]

#> 
#> [[17]]

#> 
#> [[18]]

#> 
#> [[19]]

#> 
#> [[20]]

#> 
#> [[21]]

#> 
#> [[22]]

#> 
#> [[23]]

#> 
#> [[24]]

#> 
#> [[25]]

#> 
#> [[26]]

Time to execute the task

Only useful when executed with Rscript.

# Report the user, system and elapsed time used by this R process so far
proc.time()
#>    user  system elapsed 
#> 425.845   5.259 432.226