Single Cell Analysis (original) (raw)

This section demonstrates how to take in raw data (in this case, the output of Cell Ranger v3.0) and go through a popular analysis pipeline to ultimately cluster and annotate the data. Some people prefer to use Cell Ranger’s built-in dimension reduction and clustering analysis and to view the results with Loupe Cell Browser.

Read in and process the data

Note that there are other approaches to reading in single cell transcriptomics data, such as Bioconductor’s DropletUtils.

# load the data
obj <- readRDS(file = path_obj_raw[1])

# add useful metadata
obj[["percent.mt"]] <- PercentageFeatureSet(object = obj, pattern = "^MT-")

Quality control

Metrics summary

# read in and display the summary table
metrics_summary <- read.csv(file = path_metrics_summary[1])
metrics_summary

GEX QC metrics

Before filtering

VlnPlot(object = obj,
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
        pt.size = 0.01, ncol = 3) &
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5),
        axis.title.x = element_blank())

After filtering

Based on the previous plots, a minimum of 200 features and a maximum of 20% mitochondrial seemed like good cutoffs.

# adjust cutoffs as desired
obj <- subset(x = obj, subset = nFeature_RNA > 200 & percent.mt < 20)

VlnPlot(object = obj,
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
        pt.size = 0.01, ncol = 3) &
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5),
        axis.title.x = element_blank())

Standard Seurat pipeline

You can set verbose = FALSE for many of these commands if you don’t want to see outputs.

Most of these are run on the RNA.

# standard log normalization for RNA and centered log for ADT
obj <- NormalizeData(object = obj, normalization.method = "LogNormalize",
                     scale.factor = 10000, assay = "RNA")
## Performing log-normalization
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|

obj <- NormalizeData(object = obj, normalization.method = "CLR",
                     margin = 2, assay = "ADT") # go across cells, not features
## Normalizing across cells
##   |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s

# highly variable features
obj <- FindVariableFeatures(object = obj, selection.method = "vst",
                            nfeatures = 2000)
## Calculating gene variances
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## Calculating feature variances of standardized and clipped values
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|

# scaling so the average expression is 0 and the variance is 1
obj <- ScaleData(object = obj, features = rownames(obj))
## Centering and scaling data matrix
##   |===============================================================================================| 100%

# dimensionality reduction
obj <- RunPCA(object = obj)
## PC_ 1
## Positive:  LYZ, FCN1, CST3, MNDA, CTSS, PSAP, S100A9, FGL2, AIF1, GRN
##     NCF2, LST1, CD68, TYMP, SERPINA1, CYBB, CLEC12A, CSTA, SPI1, TNFAIP2
##     CPVL, VCAN, MPEG1, TYROBP, KLF4, FTL, S100A8, IGSF6, CD14, MS4A6A
## Negative:  CD3D, TRAC, LTB, TRBC2, IL32, CD3G, IL7R, CD69, CD247, TRBC1
##     CD2, CD7, CD27, ARL4C, ISG20, HIST1H4C, SYNE2, GZMM, ITM2A, CCR7
##     RORA, MAL, CXCR4, LEF1, TRAT1, CTSW, GZMA, KLRB1, TRABD2A, CCL5
## PC_ 2
## Positive:  CD79A, MS4A1, IGHM, BANK1, HLA-DQA1, CD79B, IGKC, LINC00926, RALGPS2, TNFRSF13C
##     VPREB3, IGHD, SPIB, CD22, FCRL1, HLA-DQB1, BLK, FAM129C, FCRLA, TCL1A
##     GNG7, TCF4, COBLL1, PAX5, SWAP70, CD40, BCL11A, P2RX5, TSPAN13, ADAM28
## Negative:  NKG7, CST7, GZMA, PRF1, KLRD1, CTSW, FGFBP2, GNLY, GZMH, CCL5
##     GZMM, CD247, KLRF1, HOPX, SPON2, ADGRG1, TRDC, MATK, GZMB, FCGR3A
##     S100A4, CCL4, CLIC3, KLRB1, IL2RB, TBX21, TTC38, ANXA1, PTGDR, PLEKHF1
## PC_ 3
## Positive:  GZMB, NKG7, GNLY, CLIC3, PRF1, KLRD1, FGFBP2, KLRF1, SPON2, CST7
##     GZMH, FCGR3A, ADGRG1, GZMA, HOPX, CTSW, TRDC, CCL4, HLA-DPB1, C12orf75
##     PLAC8, TTC38, PLEK, APOBEC3G, TBX21, PRSS23, CYBA, MATK, SYNGR1, CXXC5
## Negative:  IL7R, MAL, LEF1, TRABD2A, TRAC, CCR7, LTB, CD27, FOS, LRRN3
##     FHIT, TRAT1, RGCC, CAMK4, CD3D, RGS10, CD40LG, FOSB, AQP3, SOCS3
##     FLT3LG, CD3G, SLC2A3, TSHZ2, VIM, S100A12, S100A8, CD28, PLK3, VCAN
## PC_ 4
## Positive:  FCER1A, PLD4, SERPINF1, IL3RA, CLEC10A, GAS6, LILRA4, TPM2, CLEC4C, ENHO
##     FLT3, SMPD3, ITM2C, LGMN, CD1C, P2RY14, PPP1R14B, SCT, PROC, LAMP5
##     RUNX2, AC119428.2, PACSIN1, DNASE1L3, PTCRA, RGS10, UGCG, CLIC2, PPM1J, P2RY6
## Negative:  MS4A1, CD79A, LINC00926, BANK1, TNFRSF13C, VPREB3, CD79B, RALGPS2, IGHD, FCRL1
##     BLK, IGHM, CD22, PAX5, ARHGAP24, CD24, P2RX5, NCF1, S100A12, CD19
##     SWAP70, FCRLA, VNN2, TNFRSF13B, FCER2, IGKC, FCRL2, RBP7, CD40, S100A8
## PC_ 5
## Positive:  BATF3, C1QA, TCF7L2, CTSL, CDKN1C, HLA-DQB1, HES4, SIGLEC10, CLEC10A, ABI3
##     HLA-DQA1, RHOC, CSF1R, ENHO, CAMK1, MTSS1, IFITM3, CD1C, LY6E, FCGR3A
##     HLA-DPA1, CLIC2, HLA-DPB1, YBX1, RRAS, AC064805.1, NR4A1, GBP1, ZNF703, CXCL16
## Negative:  LILRA4, TPM2, CLEC4C, SMPD3, IL3RA, SERPINF1, DERL3, MZB1, SCT, JCHAIN
##     PACSIN1, PROC, S100A12, PTCRA, LINC00996, PADI4, ASIP, KCNK17, ITM2C, EPHB1
##     ALOX5AP, LAMP5, DNASE1L3, MAP1A, S100A8, APP, CYP1B1, VNN2, UGCG, ZFAT

# clustering (adjust dimensions and resolutions as desired)
obj <- FindNeighbors(object = obj, reduction = "pca", dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN

obj <- FindClusters(object = obj, resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 4193
## Number of edges: 154360
##
## Running Louvain algorithm...
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## Maximum modularity in 10 random starts: 0.9097
## Number of communities: 10
## Elapsed time: 0 seconds

obj <- RunUMAP(object = obj, reduction = "pca", dims = 1:20)
## 14:44:35 UMAP embedding parameters a = 0.9922 b = 1.112
## 14:44:35 Read 4193 rows and found 20 numeric columns
## 14:44:35 Using Annoy for neighbor search, n_neighbors = 30
## 14:44:35 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:44:36 Writing NN index file to temp file /tmp/RtmpHhoQAU/filec346711dd9f93
## 14:44:36 Searching Annoy index using 1 thread, search_k = 3000
## 14:44:37 Annoy recall = 100%
## 14:44:38 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:44:39 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:44:39 Commencing optimization for 500 epochs, with 174318 positive edges
## Using method 'umap'
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:44:44 Optimization finished

Dimensionality reduction

Load in the processed object

To avoid having to save multiple large objects, this file already includes the annotations defined in a later section, so we will clear them out here to proceed as normal.

obj <- readRDS(file = path_obj_annotated[1])

# reset to baseline
Idents(object = obj) <- "seurat_clusters"
obj$annotated_clusters <- c()

PCA

Typically this section would go in between the RunPCA() and FindNeighbors() steps above. It has been included here because of how we save and load the objects separately for processing speed.

Now that we’ve run PCA, we can examine an elbow plot as a simple method for selecting how many PCs to choose when identifying neighbors/clusters.

ElbowPlot(object = obj, ndims = 30)

From the plot, it looked like 20 works as a cut-off for the number of PCs to include. We could have also chosen to use 10, but decided to go a little higher.

You can also examine other information which was used for the PCA, such as the top twenty most variable features:

head(VariableFeatures(object = obj), 20)
##  [1] "IGLC2"    "IGLC3"    "HLA-DQA1" "FCER1A"   "C1QA"     "PTGDS"   
##  [7] "GNLY"     "CLEC10A"  "IGHM"     "IGKC"     "CCL4L2"   "JCHAIN"  
## [13] "LYPD2"    "CD1C"     "CCL4"     "C1QB"     "HLA-DPA1" "HLA-DPB1"
## [19] "EREG"     "HLA-DRB1"
# these can be plotted if desired
LabelPoints(plot = VariableFeaturePlot(object = obj),
            points = head(VariableFeatures(object = obj), 20),
            repel = TRUE, xnudge = 0, ynudge = 0) +
  labs(title = "Top Twenty Variable Features") +
  theme(legend.position = "bottom")

UMAP

# you can also use the LabelClusters function to help label individual clusters
plot_seurat_clusters <- UMAPPlot(object = obj, label = TRUE, label.size = 4,
                                 label.box = TRUE) +
                          labs(title = "Initial Clusters") +
                          scale_fill_manual(
                               values = rep("white",
                                            n_distinct(obj$seurat_clusters))) +
                          theme(plot.title = element_text(hjust = 0.5),
                                axis.text = element_blank(),
                                axis.ticks = element_blank())
plot_seurat_clusters

Marker overlays

Note that the following marker genes **are not meant to be an exhaustive list.**They are included as examples of some of the cell types you could look for.

Load marker genes

# sort by Cell_Type and Marker if not already sorted
markers_all <- read.csv(file = path_marker_genes[1])
markers_all %>% slice_sample(n = 10) # example markers

Dot plots

GEX

Here we only demonstrate a few of the cell types included within the provided markers database. We plot those cell types on top of the dot plot to make it easier to see which markers they correspond with, but you can comment that code out if you would rather have a simple dot plot.

select_cell_types <- c("B", "Macrophages", "mDC", "NK", "T")

# do features = unique(markers_all$Marker) to use all possible features
p <- DotPlot(object = obj,
             features = markers_all %>%
                          dplyr::filter(Cell_Type %in% select_cell_types) %>%
                          distinct(Marker) %>% pull(),
             cols = "RdBu", col.min = -1, dot.scale = 3, cluster.idents = TRUE)

# add in the cell type information
# if desired, you could rename the "Cell_Type"s in the original database to be
# more informative e.g. Natural_Killers instead of NK
p$data <- left_join(p$data,
                    markers_all %>%
                      dplyr::filter(Cell_Type %in% select_cell_types) %>%
                      dplyr::rename(features.plot = "Marker"),
                    by = "features.plot", multiple = "all")
# depending on your version of dplyr, you can set `relationship = "many-to-many"`
# to surpress the warning

# plot
p +
  facet_grid(cols = vars(Cell_Type), scales = "free_x", space = "free") +
  theme(strip.text.x = element_text(size = 10)) + RotatedAxis()

ADT

The average expression was set to a minimum of zero to better see the up-regulated features.

# you have to change the default assay for the dot plot
DefaultAssay(object = obj) <- "ADT"

DotPlot(object = obj,
        features = rownames(GetAssayData(object = obj, assay = "ADT",
                                         slot = "counts")),
        cols = "RdBu", col.min = -1, dot.scale = 3, cluster.idents = TRUE) +
  RotatedAxis()
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# reset the default
DefaultAssay(object = obj) <- "RNA"

Feature plots

If you would like to generate feature plots for a cell type within your markers database (if you are using one) e.g. for B cells, you could do features = (dplyr::filter(markers_all, Cell_Type == "B"))$Marker in the following command. Here we just show a few characteristic markers for different cell types for simplicity.

FeaturePlot(object = obj,
            features = c("MS4A1", "CD14", # MS4A1 is another name for CD20
                         "NKG7", "IL3RA", "CD3D", "IL7R"),
            min.cutoff = 0, label = TRUE, label.size = 4,
            ncol = 3, raster = FALSE)

Violin plots

Here the point size is set to zero so that you can just see the violins. We only show a few of the cell surface markers here, but you could plot them all withfeatures = rownames(obj[["ADT"]]). You can also plot the standard GEX markers just like for the feature plots (just make sure to remove assay = "ADT").

VlnPlot(object = obj,
        features = c("CD3-TotalSeqB", "CD14-TotalSeqB",
                     "CD20-TotalSeqB", "CD335-TotalSeqB"),
        pt.size = 0, assay = "ADT", ncol = 2) &
  theme(plot.title = element_text(size = 10),
        axis.text.x = element_text(angle = 0, hjust = 0.5),
        axis.title.x = element_blank())

Annotate cell clusters

These annotations are chosen here more as a proof of general concept instead of a more highly refined and verified approach that you would typically see within a single-cell publication. There are also automated methods for annotation.

Annotations

If you would like to further break down these clusters, you can increase the clustering resolution in the standard pipeline in order to increase the number of clusters.

# mDC = myeloid dendritic cells
# pDC = plasmacytoid dendritic cells
obj_annotations <- rbind(c("0", "Macrophages/mDCs"), # difficult to separate
                         c("1", "T Cells"),
                         c("2", "T Cells"),
                         c("3", "T Cells"),
                         c("4", "Natural Killers"),
                         c("5", "B Cells"),
                         c("6", "Macrophages/mDCs"), # difficult to separate
                         c("7", "Macrophages/mDCs"), # difficult to separate
                         c("8", "T Cells"), # faint signal
                         c("9", "pDCs"))
colnames(obj_annotations) <- c("Cluster", "CellType")
obj_annotations <- data.frame(obj_annotations)

# save the annotations as a csv if you'd like
# write.csv(obj_annotations, file = file.path(path_data, "obj_annotations.csv"))

# prepare the annotation information
annotations <- setNames(obj_annotations$CellType, obj_annotations$Cluster)

# relabel the Seurat clusters
# Idents(obj) <- "seurat_clusters"
obj <- RenameIdents(object = obj, annotations)

# alphabetize the cell types
Idents(obj) <- factor(Idents(object = obj), levels = sort(levels(obj)))

# useful metadata (e.g. if you want to have multiple annotation sets)
obj[["annotated_clusters"]] <- Idents(object = obj)

# save the processed and annotated Seurat object
# saveRDS(obj, file = file.path(path_data, "tenx_pbmc5k_CITEseq_annotated.rds"))

# info about the clusters
obj_annotations %>%
  group_by(CellType) %>%
  transmute(Clusters = paste0(Cluster, collapse = ", ")) %>%
  distinct() %>%
  arrange(CellType)
# metadata file for MCIA
meta <- data.frame("CellType" = Idents(object = obj))

# save the metadata file for MCIA
# write.csv(meta, file = file.path(path_data, "metadata_sc.csv"))

UMAPs

Here we use the selected metadata colors from the MCIA section, so be sure to change that (or just remove the cols = ... to use the default) if you are running this section first. If you don’t want the cluster labels to have the white backgrounds, remove label.box and scale_fill_manual().

# change colors and themes as desired
plot_annotated_clusters <- UMAPPlot(object = obj,
                                    label = TRUE, label.size = 4,
                                    label.box = TRUE,
                                    cols = meta_colors_sc) +
                             labs(title = "Annotated Clusters") +
                             scale_fill_manual(
                               values = rep("white", length(meta_colors_sc))) +
                             theme(plot.title = element_text(hjust = 0.5),
                                   axis.text = element_blank(),
                                   axis.ticks = element_blank()) +
                             NoLegend() # a legend is not needed here

ggpubr::ggarrange(plot_seurat_clusters + NoLegend(), plot_annotated_clusters,
                  heights = c(1, 1), widths = c(1, 1))

You should change the DefaultAssay to/from ADT like in ADT if you don’t want to specify “adt_” before the feature name.

# plot ADT information on top as an example
FeaturePlot(object = obj, features = "adt_CD19-TotalSeqB",
            label = TRUE, label.size = 4, ncol = 1, raster = FALSE)

Check the annotations

For example, if you want to visually check your T cell cluster annotations (just like before, you can also do features = (dplyr::filter(markers_all, Cell_Type == "T"))$Markerto use all of the T cell markers in the markers database):

VlnPlot(object = obj,
        features = c("CD3D", "CD4", "IL7R", "TRAC"),
        cols = meta_colors_sc, pt.size = 0.01, ncol = 2) &
  theme(plot.title = element_text(size = 10),
        axis.title.x = element_blank())

You can also obtain the following bar plot using the output of nipals_multiblock e.g. with ggplot(data.frame(table(nmb_get_metadata(mcia_results_sc))), aes(x = CellType, y = Freq, fill = CellType)) + geom_bar(stat = "identity") (for the base plot).

# examine the total counts
ggplot(obj[[]], aes(x = annotated_clusters, fill = annotated_clusters)) +
  geom_bar(color = "black", linewidth = 0.2) +
  labs(title = "pbmc5k-CITEseq Cell Type Counts",
       x = "Cell Type", y = "Count") +
  scale_fill_manual(values = meta_colors_sc) +
  theme_bw() +
  theme(plot.title = element_text(size = 14, hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major.x = element_blank(),
        legend.position = "none")

Save for MCIA

The file has to be structured in the form of a (large) list, with each omic saved as an element within it. If you would like to append the type of omic to the end of your features, uncomment the commented lines.

# mrna
data_rna <- GetAssayData(object = obj, slot = "data",
                         assay = "RNA")[VariableFeatures(object = obj), ]
data_rna <- as.matrix(data_rna)
data_rna <- t(data_rna) # switch the rows
# colnames(data_rna) <- paste(colnames(data_rna), "mrna", sep = "_")

# adt
data_adt <- GetAssayData(object = obj, slot = "data", assay = "ADT")
data_adt <- as.matrix(data_adt)
data_adt <- t(data_adt) # switch the rows
# colnames(data_adt) <- paste(colnames(data_adt), "adt", sep = "_")

# combined
data_blocks_sc <- list(mrna = data_rna, adt = data_adt)

# examine the contents
data.frame(data_blocks_sc$mrna[1:5, 1:5])
data.frame(data_blocks_sc$adt[1:5, 1:5])

# save(data_blocks_sc, file = file.path(path_data, "data_blocks_sc.Rda"))