Simulate CITE-seq data (original) (raw)

Introduction

In this tutorial, we will show how to use scDesign3 to simulate CITE-seq data and illustrate the similarity between the inputted reference data and synthetic data.

Read in the reference data

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581968")))
print(example_sce)

To save computational time, we only use the top 100 genes and six more genes with protein and RNA abundance information.

keep_gene <- c("CD4",  "CD14", "CD19", "CD34", "CD3E", "CD8A")
keep_adt <- c("ADT_CD4", "ADT_CD14", "ADT_CD19", "ADT_CD34", "ADT_CD3", "ADT_CD8")
keep <- c(keep_gene, keep_adt)
idx <- which(rownames(example_sce) %in% keep)
idx <- c(1:100,idx)
example_sce <- example_sce[idx,]
logcounts(example_sce) <- log1p(counts(example_sce))

Simulation

We input the reference data and use the one-shot [scdesign3()](../reference/scdesign3.html) function to simulate CITE-seq dat using discrete cell types as the covariates for fitting each gene’s marginal distribution.

set.seed(123)
example_simu <- scdesign3(
    sce = example_sce,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    corr_formula = "cell_type",
    copula = "vine",
    DT = TRUE,
    pseudo_obs = FALSE,
    return_model = FALSE,
    nonzerovar = TRUE,
    nonnegative = TRUE
  )

After the simulation, we can create the SinglecellExperiment object using the synthetic count matrix and store the logcounts to the input and synthetic SinglecellExperiment objects.

Then, we obtained the PCA and UMAP for both the inputted reference data and the synthetic data. These sets of embedding will be used for the visualization below.

set.seed(123)
 train_pca_fit <- irlba::prcomp_irlba(t(log1p(counts(example_sce))), 
                                          center = TRUE, 
                                          scale. = FALSE, 
                                          n = 50)
reducedDim(simu_sce, "PCA") <- predict(train_pca_fit, newdata= t(log1p(counts(simu_sce))))
simu_pac_fit <- predict(train_pca_fit, newdata= t(logcounts(simu_sce)))
train_umap_fit <- umap::umap(train_pca_fit$x, n_neighbors = 15, min_dist = 0.1)
simu_umap_fit <-  predict(object = train_umap_fit, data= (reducedDim(simu_sce, "PCA")))
colnames(simu_umap_fit ) <- c("UMAP1", "UMAP2")
reducedDim(simu_sce, "UMAP") <- simu_umap_fit 
train_umap <- train_umap_fit$layout
rownames(train_umap) <- colnames(example_sce)
colnames(train_umap) <- c("UMAP1", "UMAP2")

Visualization

To visualize the results, we select six genes and reformat their UMAP embedding we got in the previous step.

CITE_dat <- bind_rows(expression_train, expression_scDesign3) %>% as_tibble() %>%
            dplyr::mutate_at(vars(-c(UMAP1, UMAP2, Method)), funs(scales::rescale)) %>% tidyr::pivot_longer(-c("UMAP1", "UMAP2", "Method"), names_to = "Feature", values_to = "Expression") %>% dplyr::mutate(Type = if_else(str_detect(Feature, "ADT"), "Protein", "RNA")) %>% dplyr::mutate(Gene = str_replace(Feature, "ADT_", "")) %>% dplyr::mutate(Gene = if_else(Gene == "CD3E", "CD3", Gene))%>% dplyr::mutate(Gene = if_else(Gene == "CD8A", "CD8", Gene))%>% dplyr::filter(Gene %in% c("CD14", "CD3", "CD8", "CD19")) %>% dplyr::mutate(Gene = factor(Gene, levels = c("CD3", "CD8", "CD14", "CD19"))) %>% dplyr::mutate(Method = factor(Method, levels = c("Train data", "scDesign3")))
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#> ℹ Please use a list of either functions or lambdas:
#> 
#> # Simple named list: list(mean = mean, median = median)
#> 
#> # Auto named with `tibble::lst()`: tibble::lst(mean, median)
#> 
#> # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
head(CITE_dat)
#> # A tibble: 6 × 7
#>    UMAP1 UMAP2 Method     Feature  Expression Type    Gene 
#>    <dbl> <dbl> <fct>      <chr>         <dbl> <chr>   <fct>
#> 1 -0.883  14.5 Train data CD14          0     RNA     CD14 
#> 2 -0.883  14.5 Train data CD19          0     RNA     CD19 
#> 3 -0.883  14.5 Train data CD3E          0     RNA     CD3  
#> 4 -0.883  14.5 Train data CD8A          0     RNA     CD8  
#> 5 -0.883  14.5 Train data ADT_CD14      0.623 Protein CD14 
#> 6 -0.883  14.5 Train data ADT_CD19      0.527 Protein CD19

Six genes’ protein and RNA abundances are shown on the cell UMAP embeddings in the inputted reference data and the synthetic data below.

CITE_dat  %>% ggplot(aes(x = UMAP1, y = UMAP2, color = Expression)) + geom_point(size = 0.1, alpha = 0.5) + scale_colour_gradientn(colors = viridis_pal(option = "A", direction = -1)(10), limits=c(0, 1)) + coord_fixed(ratio = 1) + facet_nested(Method ~ Gene + Type ) + theme(aspect.ratio = 1, legend.position = "bottom")  + theme(aspect.ratio = 1, legend.position = "right") + theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    axis.text.y=element_blank(),
    axis.ticks.y=element_blank())

Session information

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.6 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Los_Angeles
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] ggh4x_0.2.5                 scales_1.2.1               
#>  [3] tidyr_1.3.0                 stringr_1.5.0              
#>  [5] ggplot2_3.4.2               dplyr_1.1.2                
#>  [7] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
#>  [9] Biobase_2.60.0              GenomicRanges_1.52.0       
#> [11] GenomeInfoDb_1.36.1         IRanges_2.34.1             
#> [13] S4Vectors_0.38.1            BiocGenerics_0.46.0        
#> [15] MatrixGenerics_1.12.2       matrixStats_1.0.0          
#> [17] scDesign3_0.99.6            BiocStyle_2.28.0           
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0        viridisLite_0.4.2       farver_2.1.1           
#>  [4] bitops_1.0-7            fastmap_1.1.1           RCurl_1.98-1.12        
#>  [7] kde1d_1.0.5             digest_0.6.33           lifecycle_1.0.3        
#> [10] survival_3.5-5          gamlss.dist_6.0-5       magrittr_2.0.3         
#> [13] compiler_4.3.1          rlang_1.1.1             sass_0.4.7             
#> [16] tools_4.3.1             utf8_1.2.3              yaml_2.3.7             
#> [19] knitr_1.43              labeling_0.4.2          askpass_1.1            
#> [22] S4Arrays_1.0.4          mclust_6.0.0            reticulate_1.30        
#> [25] DelayedArray_0.26.6     withr_2.5.0             purrr_1.0.1            
#> [28] rvinecopulib_0.6.3.1.1  desc_1.4.2              grid_4.3.1             
#> [31] fansi_1.0.4             colorspace_2.1-0        MASS_7.3-60            
#> [34] cli_3.6.1               rmarkdown_2.23          crayon_1.5.2           
#> [37] ragg_1.2.5              generics_0.1.3          umap_0.2.10.0          
#> [40] RSpectra_0.16-1         cachem_1.0.8            zlibbioc_1.46.0        
#> [43] splines_4.3.1           assertthat_0.2.1        parallel_4.3.1         
#> [46] BiocManager_1.30.21.1   XVector_0.40.0          vctrs_0.6.3            
#> [49] Matrix_1.6-0            jsonlite_1.8.7          bookdown_0.34          
#> [52] gamlss_5.4-12           irlba_2.3.5.1           systemfonts_1.0.4      
#> [55] jquerylib_0.1.4         glue_1.6.2              pkgdown_2.0.7          
#> [58] rngWELL_0.10-9          stringi_1.7.12          gtable_0.3.3           
#> [61] randtoolbox_2.0.4       munsell_0.5.0           tibble_3.2.1           
#> [64] pillar_1.9.0            htmltools_0.5.5         openssl_2.1.0          
#> [67] gamlss.data_6.0-2       GenomeInfoDbData_1.2.10 R6_2.5.1               
#> [70] textshaping_0.3.6       rprojroot_2.0.3         evaluate_0.21          
#> [73] lattice_0.21-8          highr_0.10              png_0.1-8              
#> [76] memoise_2.0.1           bslib_0.5.0             Rcpp_1.0.11            
#> [79] nlme_3.1-162            mgcv_1.9-0              xfun_0.39              
#> [82] fs_1.6.3                pkgconfig_2.0.3