slideimp

Choosing between K-NN and PCA imputation

Let’s use the included khanmiss1 dataset.

data(khanmiss1)
obj <- t(khanmiss1)
set.seed(1234)

Instead of randomly placing NA values, we fix the locations of NA values in advance for cross-validation. We will select 30 random genes with 5 random missing values each and repeat 10 times to compare the imputation accuracy between knn_imp() and pca_imp(). The na_location object is a list of 10 elements (one per repetition), where each element contains the 2D coordinates (row and column) of values that will be set to missing and then re-imputed by the K-NN/PCA imputation functions to calculate imputation metrics.

n_genes <- 30
n_samples <- 5
n_reps <- 10
na_location <- replicate(
  n = n_reps,
  {
    chosen_genes <- sample.int(ncol(obj), size = n_genes)
    values <- lapply(
      chosen_genes, \(x) {
        chosen_samples <- sample.int(nrow(obj), size = n_samples)
        matrix(c(chosen_samples, rep(x, n_samples)), ncol = 2, dimnames = list(NULL, c("row", "col")))
      }
    )
    do.call(rbind, values)
  },
  simplify = FALSE
)
na_location[[1]][1:10, ]
#>       row  col
#>  [1,]  49 1004
#>  [2,]  29 1004
#>  [3,]  32 1004
#>  [4,]  63 1004
#>  [5,]  57 1004
#>  [6,]   8  623
#>  [7,]  26  623
#>  [8,]  17  623
#>  [9,]  59  623
#> [10,]  63  623

We proceed to measure the accuracy of the imputation using “good” default parameters for K-NN (k = 10) and PCA (ncp = 5). For PCA imputation, we leverage the {mirai} package for parallelization while the cores argument control parallelization for K-NN.

tune_knn <- tune_imp(obj, parameters = data.frame(k = 10), rep = na_location, cores = 4)
#> Tuning knn_imp
#> Step 1/2: Injecting NA
#> Running in parallel...
#> Step 2/2: Tuning
tune_pca <- tune_imp(obj, parameters = data.frame(ncp = 5), rep = na_location)
#> Tuning pca_imp
#> Step 1/2: Injecting NA
#> Running in sequential...
#> Step 2/2: Tuning

In this case, PCA imputation performs better. Since the data only needs to be imputed once and the run time is so low, using PCA imputation for this dataset is preferable.

rmse_knn <- mean(compute_metrics(tune_knn, metrics = "rmse")$.estimate)
rmse_pca <- mean(compute_metrics(tune_pca, metrics = "rmse")$.estimate)
c("knn" = rmse_knn, "pca" = rmse_pca)
#>      knn      pca 
#> 489.4291 479.9477

Subset and grouped K-NN and PCA imputation

Some DNA methylation analyses only focus on epigenetic clocks, which require only a subset of CpGs to be imputed. The knn_imp() function takes advantage of this to significantly reduce run time. To impute only specific CpGs for K-NN imputation and make use of grouped imputation we can use the group_features() function.

Here, we simulate data where only a subset of CpGs are needed for imputation. PCA imputation does not benefit from subset imputation, but does benefit from grouped imputation.

# 500 features, 50 samples, 2 chromosomes
sim_obj <- sim_mat(n = 500, m = 50, nchr = 2)
# sim_mat returns matrix with features in rows, so we use t() to put features in columns
obj <- t(sim_obj$input)
# `group_feature` contains metadata for the simulated data
head(sim_obj$group_feature)
#>   feature_id group
#> 1      feat1  chr2
#> 2      feat2  chr2
#> 3      feat3  chr2
#> 4      feat4  chr2
#> 5      feat5  chr1
#> 6      feat6  chr1
n_needed <- 100
# Randomly select CpGs that are needed for clock calculation
subset_cpgs <- sample(colnames(obj), size = n_needed)

We define the features list-column as a list of character vectors containing features to be imputed. Elements of the features list-column are subsets of the subset_cpgs character vector. These CpGs belong to either chr1 or chr2, so we split them into two groups, resulting in two rows of the group_df tibble. The aux list-column now contains the remaining CpGs from either chr1 or chr2 that are not imputed themselves but are used to increase the accuracy of imputing the CpGs in the features list-column.

# Construct the group data.frame with one row per group (i.e., chromosome)
group_df <- group_features(obj, sim_obj$group_feature, subset = subset_cpgs)
group_df
#> # A tibble: 2 × 3
#>   features   group aux        
#>   <list>     <chr> <list>     
#> 1 <chr [41]> chr1  <chr [208]>
#> 2 <chr [59]> chr2  <chr [192]>

For demonstration purposes, we use the parameters column to add group-specific parameters. This can be useful if hyperparameters are tuned separately for each group, though this is typically not necessary. When k or ncp is too large for some groups, we can use the parameters column to specify group-specific k or ncp values. k and ncp can also be specified in group_features().

pca_group <- group_df
# For PCA imputation, we use ncp = 5 for the first group and ncp = 10 for the second group
pca_group$parameters <- list(data.frame(ncp = 5), data.frame(ncp = 10))
# For K-NN imputation, we use k = 5 for both groups
knn_group <- group_df
knn_group$parameters <- list(data.frame(k = 5), data.frame(k = 5))

pca_group
#> # A tibble: 2 × 4
#>   features   group aux         parameters  
#>   <list>     <chr> <list>      <list>      
#> 1 <chr [41]> chr1  <chr [208]> <df [1 × 1]>
#> 2 <chr [59]> chr2  <chr [192]> <df [1 × 1]>
knn_group
#> # A tibble: 2 × 4
#>   features   group aux         parameters  
#>   <list>     <chr> <list>      <list>      
#> 1 <chr [41]> chr1  <chr [208]> <df [1 × 1]>
#> 2 <chr [59]> chr2  <chr [192]> <df [1 × 1]>

K-NN subset imputation is very fast.

system.time(
  knn_results <- group_imp(obj, group = knn_group, cores = 4)
)
#> Performing group-wise KNN imputation with group-wise parameters.
#> Running in parallel...
#>    user  system elapsed 
#>       0       0       0
system.time(
  pca_results <- group_imp(obj, group = pca_group)
)
#> Performing group-wise PCA imputation with group-wise parameters.
#>    user  system elapsed 
#>    0.11    0.00    0.10

Sliding Window K-NN and PCA imputation

We simulate the output of the {methylKit} package. Note: WGBS/EM-seq data should be grouped by chromosome before performing sliding window imputation.

sample_names <- paste0("S", 1:10)
positions <- 500

methyl <- tibble::tibble(
  chr = "chr1",
  start = seq_len(positions),
  end = start,
  strand = "+"
)

set.seed(1234)
for (i in seq_along(sample_names)) {
  methyl[[paste0("numCs", i)]] <- sample.int(100, size = positions, replace = TRUE)
  methyl[[paste0("numTs", i)]] <- sample.int(100, size = positions, replace = TRUE)
  methyl[[paste0("coverage", i)]] <- methyl[[paste0("numCs", i)]] + methyl[[paste0("numTs", i)]]
}

methyl[1:5, 1:10]
#> # A tibble: 5 × 10
#>   chr   start   end strand numCs1 numTs1 coverage1 numCs2 numTs2 coverage2
#>   <chr> <int> <int> <chr>   <int>  <int>     <int>  <int>  <int>     <int>
#> 1 chr1      1     1 +          28     18        46     78     20        98
#> 2 chr1      2     2 +          80     25       105     47     95       142
#> 3 chr1      3     3 +          22     85       107     48     99       147
#> 4 chr1      4     4 +           9     22        31     74     37       111
#> 5 chr1      5     5 +           5     54        59     71     92       163

First, we convert the data into a beta matrix with features in the columns. Important: We sort the methyl object by position and name the CpGs by their genomic position. This is important because slide_imp imputes the beta matrix column-wise in windows so the order of the column has to be meaningful.

methyl <- methyl[order(methyl$start), ] # <---- Important, sort!
numCs_matrix <- as.matrix(methyl[, paste0("numCs", seq_along(sample_names))])
cov_matrix <- as.matrix(methyl[, paste0("coverage", seq_along(sample_names))])
beta_matrix <- numCs_matrix / cov_matrix

colnames(beta_matrix) <- sample_names
rownames(beta_matrix) <- methyl$start

beta_matrix <- t(beta_matrix)
# Set 10% of the data to missing
set.seed(1234)
beta_matrix[sample.int(length(beta_matrix), floor(length(beta_matrix) * 0.1))] <- NA
beta_matrix[1:5, 1:5]
#>            1          2         3         4         5
#> S1 0.6086957         NA 0.2056075 0.2903226        NA
#> S2 0.7959184 0.33098592 0.3265306 0.6666667 0.4355828
#> S3 0.6396396         NA        NA 0.7875000 0.4895833
#> S4 0.8863636 0.51111111 0.3211679 0.5000000 0.9479167
#> S5 0.7339450 0.06666667 0.2580645 0.1596639 0.6500000

This example dataset is small, but in actual analyses with millions of CpGs, we can tune the K-NN or PCA imputation on just the first tens of thousands of CpGs. Here, we perform K-NN imputation by specifying k, a window size (n_feat) of 100, and a 10-CpG overlap (n_overlap) between windows.

# For example, we are tuning `k` value here.
slide_knn_params <- tibble::tibble(k = c(10, 20), n_feat = 100, n_overlap = 10)
# Increase rep from 2 in actual analyses
tune_slide_knn <- tune_imp(
  obj = beta_matrix,
  parameters = slide_knn_params,
  cores = 4,
  rep = 2
)
#> Tuning slide_imp
#> Step 1/2: Injecting NA
#> Running in parallel...
#> Step 2/2: Tuning
compute_metrics(tune_slide_knn)
#> # A tibble: 12 × 10
#>        k n_feat n_overlap .progress cores param_set   rep .metric .estimator
#>    <dbl>  <dbl>     <dbl> <lgl>     <dbl>     <int> <int> <chr>   <chr>     
#>  1    10    100        10 FALSE         4         1     1 mae     standard  
#>  2    10    100        10 FALSE         4         1     1 rmse    standard  
#>  3    10    100        10 FALSE         4         1     1 rsq     standard  
#>  4    20    100        10 FALSE         4         2     1 mae     standard  
#>  5    20    100        10 FALSE         4         2     1 rmse    standard  
#>  6    20    100        10 FALSE         4         2     1 rsq     standard  
#>  7    10    100        10 FALSE         4         1     2 mae     standard  
#>  8    10    100        10 FALSE         4         1     2 rmse    standard  
#>  9    10    100        10 FALSE         4         1     2 rsq     standard  
#> 10    20    100        10 FALSE         4         2     2 mae     standard  
#> 11    20    100        10 FALSE         4         2     2 rmse    standard  
#> 12    20    100        10 FALSE         4         2     2 rsq     standard  
#> # ℹ 1 more variable: .estimate <dbl>

Finally, we can use slide_imp() to impute the beta_matrix.

imputed_beta <- slide_imp(obj = beta_matrix, n_feat = 100, n_overlap = 10, k = 10, cores = 4)
#> Step 1/2: Imputing
#>  Processing window 1 of 5
#>  Processing window 2 of 5
#>  Processing window 3 of 5
#>  Processing window 4 of 5
#>  Processing window 5 of 5
#> Step 2/2: Averaging overlapping regions
#> Post-imputation: filling remaining NAs with column means
imputed_beta
#> ImputedMatrix (KNN)
#> Dimensions: 10 x 500
#> 
#>            1          2         3         4         5
#> S1 0.6086957 0.37449497 0.2056075 0.2903226 0.4579225
#> S2 0.7959184 0.33098592 0.3265306 0.6666667 0.4355828
#> S3 0.6396396 0.55105585 0.5863953 0.7875000 0.4895833
#> S4 0.8863636 0.51111111 0.3211679 0.5000000 0.9479167
#> S5 0.7339450 0.06666667 0.2580645 0.1596639 0.6500000
#> 
#> # Showing [1:5, 1:5] of full matrix