Load libraries

library(Seurat)
library(knitr)
library(kableExtra)

Load the Seurat object from part 1

load(file="original_seurat_object.RData")
experiment.aggregate
## An object of class Seurat 
## 12811 features across 2896 samples within 1 assay 
## Active assay: RNA (12811 features)

Some basic QA/QC of the metadata, print tables of the 5% quantiles.

Show 5% quantiles for number of genes per cell per sample

do.call("cbind", tapply(experiment.aggregate$nFeature_RNA, Idents(experiment.aggregate),quantile,probs=seq(0,1,0.05)))
##      UCD_Adj_VitE UCD_Supp_VitE UCD_VitE_Def
## 0%         200.00        246.00       235.00
## 5%         357.15        383.00       381.30
## 10%        418.50        488.90       475.20
## 15%        566.95        592.40       642.90
## 20%        847.00        740.00       787.00
## 25%       1005.50        829.50       933.50
## 30%       1102.00        927.10      1062.00
## 35%       1189.05       1003.65      1156.55
## 40%       1301.20       1079.80      1267.40
## 45%       1427.00       1185.75      1428.90
## 50%       1584.50       1302.50      1555.50
## 55%       1689.30       1423.45      1671.15
## 60%       1814.00       1555.20      1810.00
## 65%       1934.70       1661.05      1907.00
## 70%       2099.00       1805.80      2038.40
## 75%       2250.00       1954.00      2172.50
## 80%       2470.80       2063.40      2336.40
## 85%       2763.55       2287.05      2578.35
## 90%       3001.00       2514.30      2928.90
## 95%       3306.05       2801.20      3319.40
## 100%      4514.00       4238.00      4735.00

Show 5% quantiles for number of UMI per cell per sample

do.call("cbind", tapply(experiment.aggregate$nCount_RNA, Idents(experiment.aggregate),quantile,probs=seq(0,1,0.05)))
##      UCD_Adj_VitE UCD_Supp_VitE UCD_VitE_Def
## 0%         496.00        498.00       499.00
## 5%         551.15        565.95       586.00
## 10%        693.50        673.90       712.30
## 15%        866.90        816.85       894.65
## 20%       1287.80       1062.40      1164.60
## 25%       1623.00       1209.25      1401.25
## 30%       1873.50       1397.90      1690.00
## 35%       2067.00       1571.65      1898.55
## 40%       2282.40       1776.60      2165.60
## 45%       2649.00       2013.25      2581.60
## 50%       3036.00       2231.00      2893.00
## 55%       3390.85       2521.10      3278.10
## 60%       3785.40       2834.40      3647.60
## 65%       4142.45       3239.60      4168.00
## 70%       4622.80       3673.50      4563.00
## 75%       5297.25       4242.75      5018.25
## 80%       6229.80       4693.20      5677.40
## 85%       7924.55       5442.35      6890.60
## 90%       9066.50       6447.30      8489.90
## 95%      10963.05       7869.40     11031.10
## 100%     20634.00      19948.00     26952.00

Show 5% quantiles for number of mitochondrial percentage per cell per sample

round(do.call("cbind", tapply(experiment.aggregate$percent.mito, Idents(experiment.aggregate),quantile,probs=seq(0,1,0.05))), digits = 3)
##      UCD_Adj_VitE UCD_Supp_VitE UCD_VitE_Def
## 0%          0.031         0.000        0.000
## 5%          1.354         0.653        0.652
## 10%         2.059         0.991        1.105
## 15%         2.461         1.409        1.536
## 20%         2.842         1.693        1.794
## 25%         3.179         1.972        2.121
## 30%         3.548         2.215        2.411
## 35%         3.844         2.598        2.718
## 40%         4.233         2.904        3.023
## 45%         4.526         3.152        3.399
## 50%         4.908         3.429        3.749
## 55%         5.346         3.825        4.111
## 60%         5.666         4.209        4.482
## 65%         6.023         4.662        4.856
## 70%         6.482         5.018        5.297
## 75%         6.996         5.483        5.834
## 80%         7.586         5.980        6.497
## 85%         8.356         6.626        7.157
## 90%        10.026         7.529        8.059
## 95%        27.560        10.056       10.858
## 100%       59.279        51.836       64.441

Table of cell cycle

table(experiment.aggregate@meta.data$cell.cycle) %>% kable(caption = "Number of Cells in each Cell Cycle Stage", col.names = c("Stage", "Count"), align = "c") %>% kable_styling()
Number of Cells in each Cell Cycle Stage
Stage Count
G1 2764
G2M 97
S 35

Plot the number of cells each gene is represented by

plot(sort(Matrix::rowSums(GetAssayData(experiment.aggregate) >= 3)) , xlab="gene rank", ylab="number of cells", main="Cells per genes (reads/gene >= 3 )")

Violin plot of 1) number of genes, 2) number of UMI and 3) percent mitochondrial genes

VlnPlot(
  experiment.aggregate,
  features = c("nFeature_RNA", "nCount_RNA","percent.mito"),
  ncol = 1, pt.size = 0.3)

Gene Plot, scatter plot of gene expression across cells, (colored by sample)

FeatureScatter(experiment.aggregate, feature1 = "nCount_RNA", feature2 = "percent.mito")

FeatureScatter(
  experiment.aggregate, "nCount_RNA", "nFeature_RNA",
  pt.size = 0.5)

Cell filtering

We use the information above to filter out cells. Here we choose those that have percent mitochondrial genes max of 10% and unique UMI counts under 20,000 or greater than 500.

experiment.aggregate <- subset(experiment.aggregate, percent.mito <= 10)

experiment.aggregate <- subset(experiment.aggregate, nCount_RNA >= 500 & nCount_RNA <= 20000)

experiment.aggregate
## An object of class Seurat 
## 12811 features across 2681 samples within 1 assay 
## Active assay: RNA (12811 features)

You may also want to filter out additional genes.

When creating the base Seurat object we did filter out some genes, recall Keep all genes expressed in >= 10 cells. After filtering cells and you may want to be more aggressive with the gene filter. Seurat doesn’t supply such a function (that I can find), so below is a function that can do so, it filters genes requiring a min.value (log-normalized) in at least min.cells, here expression of 1 in at least 400 cells.

FilterGenes <-
 function (object, min.value=1, min.cells = 0, genes = NULL) {
   genes.use <- rownames(object)
   if (!is.null(genes)) {
     genes.use <- intersect(genes.use, genes)
     object@data <- GetAssayData(object)[genes.use, ]
   } else if (min.cells > 0) {
     num.cells <- Matrix::rowSums(GetAssayData(object) > min.value)
     genes.use <- names(num.cells[which(num.cells >= min.cells)])
     object = object[genes.use, ]
   }
  object <- LogSeuratCommand(object = object)
  return(object)
}

experiment.aggregate.genes <- FilterGenes(object = experiment.aggregate, min.value = 1, min.cells = 400)
experiment.aggregate.genes
## An object of class Seurat 
## 1117 features across 2681 samples within 1 assay 
## Active assay: RNA (1117 features)
table(experiment.aggregate$orig.ident)
## 
##  UCD_Adj_VitE UCD_Supp_VitE  UCD_VitE_Def 
##           808           947           926

Next we want to normalize the data

After filtering out cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method LogNormalize that normalizes the gene expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and then log-transforms the data.

?NormalizeData
experiment.aggregate <- NormalizeData(
  object = experiment.aggregate,
  normalization.method = "LogNormalize",
  scale.factor = 10000)

Identify variable genes

The function FindVariableFeatures identifies the most highly variable genes (default 2000 genes) by fitting a line to the relationship of log(variance) and log(mean) using loess smoothing, uses this information to standardize the data, then calculates the variance of the standardized data. This helps avoid selecting genes that only appear variable due to their expression level.

?FindVariableFeatures

experiment.aggregate <- FindVariableFeatures(
  object = experiment.aggregate,
  selection.method = "vst")

length(VariableFeatures(experiment.aggregate))
## [1] 2000
top10 <- head(VariableFeatures(experiment.aggregate), 10)

top10
##  [1] "Pvalb"    "Fxyd7"    "S100a16"  "Apoe"     "Crip1"    "Ly86"    
##  [7] "Ifi27l2a" "Sst"      "Pcp4"     "Hbb-bs"
vfp1 <- VariableFeaturePlot(experiment.aggregate)
vfp1 <- LabelPoints(plot = vfp1, points = top10, repel = TRUE)
vfp1

Question(s)

  1. Play some with the filtering parameters, see how results change?
  2. How do the results change if you use selection.method = “dispersion” or selection.method = “mean.var.plot”

Finally, lets save the filtered and normalized data

save(experiment.aggregate, file="pre_sample_corrected.RData")

Get the next Rmd file

download.file("https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2019-single-cell-RNA-sequencing-Workshop-UCD_UCSF/master/scrnaseq_analysis/scRNA_Workshop-PART3.Rmd", "scRNA_Workshop-PART3.Rmd")

Session Information

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] kableExtra_1.1.0 knitr_1.23       Seurat_3.0.2    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-140        tsne_0.1-3          bitops_1.0-6       
##  [4] webshot_0.5.1       RColorBrewer_1.1-2  httr_1.4.0         
##  [7] sctransform_0.2.0   tools_3.6.0         R6_2.4.0           
## [10] irlba_2.3.3         KernSmooth_2.23-15  lazyeval_0.2.2     
## [13] colorspace_1.4-1    withr_2.1.2         npsurv_0.4-0       
## [16] tidyselect_0.2.5    gridExtra_2.3       compiler_3.6.0     
## [19] rvest_0.3.4         xml2_1.2.0          plotly_4.9.0       
## [22] labeling_0.3        caTools_1.17.1.2    scales_1.0.0       
## [25] lmtest_0.9-37       readr_1.3.1         ggridges_0.5.1     
## [28] pbapply_1.4-0       stringr_1.4.0       digest_0.6.19      
## [31] rmarkdown_1.13      R.utils_2.9.0       pkgconfig_2.0.2    
## [34] htmltools_0.3.6     bibtex_0.4.2        highr_0.8          
## [37] htmlwidgets_1.3     rlang_0.3.4         rstudioapi_0.10    
## [40] zoo_1.8-6           jsonlite_1.6        ica_1.0-2          
## [43] gtools_3.8.1        dplyr_0.8.1         R.oo_1.22.0        
## [46] magrittr_1.5        Matrix_1.2-17       Rcpp_1.0.1         
## [49] munsell_0.5.0       ape_5.3             reticulate_1.12    
## [52] R.methodsS3_1.7.1   stringi_1.4.3       yaml_2.2.0         
## [55] gbRd_0.4-11         MASS_7.3-51.4       gplots_3.0.1.1     
## [58] Rtsne_0.15          plyr_1.8.4          grid_3.6.0         
## [61] parallel_3.6.0      gdata_2.18.0        listenv_0.7.0      
## [64] ggrepel_0.8.1       crayon_1.3.4        lattice_0.20-38    
## [67] cowplot_0.9.4       splines_3.6.0       hms_0.4.2          
## [70] SDMTools_1.1-221.1  pillar_1.4.1        igraph_1.2.4.1     
## [73] future.apply_1.3.0  reshape2_1.4.3      codetools_0.2-16   
## [76] glue_1.3.1          evaluate_0.14       lsei_1.2-0         
## [79] metap_1.1           data.table_1.12.2   png_0.1-7          
## [82] Rdpack_0.11-0       gtable_0.3.0        RANN_2.6.1         
## [85] purrr_0.3.2         tidyr_0.8.3         future_1.13.0      
## [88] assertthat_0.2.1    ggplot2_3.2.0       xfun_0.7           
## [91] rsvd_1.0.1          survival_2.44-1.1   viridisLite_0.3.0  
## [94] tibble_2.1.3        cluster_2.1.0       globals_0.12.4     
## [97] fitdistrplus_1.0-14 ROCR_1.0-7