Aim: perform post-quantification quality control checks.

1 File paths/files for workflow

# File paths
source(here::here("R", "file_paths.R"))

load(file.path(path_to_recount, "counts/rse_gene.Rdata"))
gtf <- readGFF(filepath = path_to_ref_gtf)

sample_info <- 
  read_excel(
    path = 
      file.path(
        path_to_raw_data,
        "sample_details/SRP058181_sample_metadata.xlsx"
      )
  ) %>% 
  dplyr::na_if(.,"N/A") %>% 
  dplyr::select(-Proteomics, -Proteomics_SV1, -Proteomics_SV2, -Proteomics_SV3, -Microarray_study_ID) %>% 
  dplyr::mutate(
    sample_id = `RNA-Seq_Samples`,
    Braak_score = Braak_score %>% 
      str_replace_all(c("IV" = "4",
                        "II-III" = "3",
                        "I-II" = "2",
                        "III" = "3",
                        "II" = "2",
                        "I" = "1")) %>% 
      as.integer(),
    Disease_Group = ifelse(Condition == "Control", "Control",
                           ifelse(Condition == "PD" & Dementia == "no", "PD",
                                  ifelse(Condition == "PD" & Dementia == "yes", "PDD", NA))),
    Disease_Group = replace_na(Disease_Group, "PD")
  ) %>% 
  dplyr::inner_join(
    colData(rse_gene) %>% 
      as_tibble() %>% 
      dplyr::select(run, title, mapped_read_count) %>% 
      dplyr::mutate(recount_id = run, 
                    title = str_replace(title, " \\[.*", "")) %>% 
      dplyr::select(-run), 
    by = c("sample_id" = "title")
  ) %>% 
  dplyr::full_join(
    read_delim(
      file.path(
        path_to_results,
        "deconvolution/scaden/SRP058181/scaden_predictions_cells.1000:samples.1000.txt"
      ), 
      delim = "\t"
    )  %>% 
      dplyr::rename(recount_id = X1) %>% 
      dplyr::rename_at(vars(-recount_id),function(x) paste0(x,"_proportion"))) %>% 
  dplyr::select(sample_id, recount_id, Disease_Group, everything(), -Condition, -`RNA-Seq_Samples`) %>% 
  dplyr::arrange(recount_id)

2 Background

Data originally downloaded from recount, SRP058181.
Used data to perform a number of QC checks, including:
1. Checked library sizes and count distributions across samples.
2. Sex checks.
3. How samples clustered using uncorrected gene count data.

3 Creating a DESeqDataSet

Recount2 provide a SummarisedExperiment. DESeq does have a function DESeqDataSet to convert such SummarisedExperiment objects. However, metadata in the summarised experiment does not include columns for sample characteristics including PMI, RIN, etc.
Therefore, chose to re-generate the DESeqDataSet from the count matrix in the SummarisedExperiment.

# Important that rows in the dataframe, 'sample_info', are in the same order as the columns in the count matrix, 'countData'.
countdata <- assay(rse_gene)
countdata <- countdata[ ,sort(colnames(countdata))] # Sorted alphanumerically

coldata <- 
  sample_info %>% 
  dplyr::select(sample_id, recount_id, Disease_Group, everything()) %>% 
  inner_join(colData(rse_gene) %>% 
                      as_tibble() %>% 
                      dplyr::mutate(recount_id = run, 
                                    title = str_replace(title, " \\[.*", "")) %>% 
                      dplyr::select(-run, -title, -characteristics, -mapped_read_count), 
                    by = c("recount_id" = "recount_id")) %>% 
  arrange(recount_id) # Sorted alphanumerically to match sorting in countdata

dds <- DESeqDataSetFromMatrix(countData = countdata,
                              colData = coldata,
                              design = ~ Disease_Group)

saveRDS(
  dds, 
  file = 
    file.path(
      path_to_results,
      "SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
      )
  )

4 Library size and count distribution

dds <- readRDS(
  file.path(
    path_to_results,
    "SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
  )
)

# Plot library size using sum of all gene counts within a sample
ggplot(data = colData(dds) %>% 
         as.data.frame(), 
       aes(x = MarkerGenes::reorder_within(x = recount_id,
                                           by = mapped_read_count,
                                           within = recount_id,
                                           fun = median,
                                           desc = TRUE), 
           y = mapped_read_count,
           fill = RIN)) +
  geom_col() +
  MarkerGenes::scale_x_reordered() + 
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
  labs(x = "Sample ID", y = "Recount2: Mapped read count") +
  theme_rhr

Plot of total library size, as determined by summing unnormalised gene counts across samples. Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.

Figure 4.1: Plot of total library size, as determined by summing unnormalised gene counts across samples. Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.

Average library size is: 83.25 million reads. This is smaller than our average of ~ 155 million reads per sample.
Average read length: 202
As count data is not normally distributed,vst function from DESeq2 used to apply a variance-stabilising transformation. This transformation is roughly similar to putting the data on the log2 scale, while also dealing with the sampling variability of low counts.
The vst function also corrects for library size.

# Get log2 counts
vsd <- vst(object = dds,
           blind = TRUE)

# Check distributions of samples using boxplots
ggplot(data = vsd %>% 
         assay() %>% 
         as_tibble() %>% 
         dplyr::mutate(gene_id = rownames(assay(vsd))) %>% 
         gather(key = sample_id, value = log2_counts_per_million, -gene_id) %>% 
         inner_join(sample_info, by = c("sample_id" = "recount_id")), 
       aes(x = MarkerGenes::reorder_within(x = sample_id,
                                           by = log2_counts_per_million,
                                           within = sample_id,
                                           fun = median,
                                           desc = FALSE), 
           y = log2_counts_per_million, fill = RIN)) + 
  geom_boxplot() + 
  MarkerGenes::scale_x_reordered() +
  geom_hline(yintercept = median(assay(vsd)), size = 0.75, linetype = "dashed") + 
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
  labs(x = "Sample ID", y = "Log2 counts per million") +
  theme_rhr

Plot of log2 counts per million across samples. Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.

Figure 4.2: Plot of log2 counts per million across samples. Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.

There does not appear to be a relationship between median log2CPM and RIN.

5 Sex checks

Following quantification, did a basic sex check. Looked at genes known to be exclusively expressed in female/males e.g. XIST (X-chromosome expressed and used for X-inactivation) and DDX3Y (Y chromosome).
Based on cohort description would not expect any samples to have notable XIST expression.
NOTE: Sample SRR2015746 (C0061) has expression of both XIST (~60,000 counts) and DDX3Y (~800,000 counts) - be sure to remove this sample from further analysis!

# XIST and DDX3Y ensembl IDs
sex_genes <- gtf %>% 
  dplyr::filter(gene_name %in% c("XIST", "DDX3Y")) %>% 
  distinct(gene_name, .keep_all = TRUE) %>% 
  dplyr::select(gene_id, gene_name)

sex_gene_expr <- sex_genes %>% 
  inner_join(dds %>% 
               assay() %>% # Access count data
               as_tibble() %>% 
               dplyr::mutate(gene_id = dds %>% 
                               assay() %>% 
                               rownames() %>% 
                               str_replace("\\..*", "")) %>% 
               dplyr::select(gene_id, everything()), 
             by = c("gene_id" = "gene_id")) %>% 
  gather(key = sample_id, value = counts, -gene_id, -gene_name) %>% 
  inner_join(sample_info, by = c("sample_id" = "recount_id"))

ggplot(sex_gene_expr, aes(x = sample_id, y = counts, fill = Sex)) +
  geom_col() + 
  facet_grid(gene_name~., scales = "free_y") +
  labs(x = "Sample ID", y = "Counts") +
  scale_fill_manual(values = c("#00BFC4")) +
  theme_rhr

Plot of read counts across *DDX3Y* (Y-chromosome expressed) and *XIST* (required for X-chromosome inactivation). Samples are coloured by their assigned sex, as provided in the initial sample information from Imperial. Expression patterns of the female/male-specific genes match the assigned sexes.

Figure 5.1: Plot of read counts across DDX3Y (Y-chromosome expressed) and XIST (required for X-chromosome inactivation). Samples are coloured by their assigned sex, as provided in the initial sample information from Imperial. Expression patterns of the female/male-specific genes match the assigned sexes.

6 Heatmap of sample-to-sample distances

Transformed count matrix can be transposed (i.e. samples as rows and genes as columns), and the distance between samples can be computed and plotted in a heatmap. This can then provide an overview of the similarities/dissimilarities between samples.

sampleDists <- vsd %>% 
  assay() %>% 
  t() %>% # Transpose transformed count matrix, such that samples now rows.
  dist() # Compute distances between rows of the matrix i.e. sample-to-sample distance
  
sampleDistMatrix <- sampleDists %>% 
  as.matrix() # Convert to matrix

rownames(sampleDistMatrix) <- str_c(vsd$Disease_Group)
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col= viridis(n = 20))

Figure 6.1: Heatmap of sample-to-sample distances calculated using vst-transformed count matrix. Samples labelled by name, disease group and sex.

Based on heatmap of sample-to-sample distances, some samples do appear to be grouping by disease, although it is not entirely clear cut.

7 PCA

7.1 Scree plot

A scree plot shows the fraction of total variance explained by each PC.

pca_vsd <- prcomp(t(assay(vsd)))

ggarrange(pcascree(pca_vsd, type = "pev"),
          pcascree(pca_vsd, type = "cev"),
          nrow = 2,
          labels = c("a", "b"))

Plot of (a) the proportion of variance explained by each PC and (b) the cumulative proportion of variance explained with the addition of each PC.

Figure 7.1: Plot of (a) the proportion of variance explained by each PC and (b) the cumulative proportion of variance explained with the addition of each PC.

7.2 Correlation of PC axes with sample covariates

Easiest way to determine associations between PC and co-variates is to perform a correlation analysis between PCs and co-variates.
Figure and table below show that there is a significant association between 3 covariates (RIN, age at death, disease group) and PC1, in addition to age at death and PC73.
Table shows FDR-corrected p-values of association.
Based on the below, will be important to correct downstream analyses for:
- RIN
- Age at death
- Several cell-type proportions

PC_correlates <- correlatePCs(pcaobj = pca_vsd, 
                              coldata = colData(vsd) %>% 
                                as.data.frame() %>% 
                                dplyr::select(Disease_Group, Age_at_death, PMI, RIN, Sex, Motor_onset, Disease_duration, Braak_score,mapped_read_count, contains("proportion")), 
                              pcs = 1:73)

# Some proportions so highly correlated that p-value = 0
# Replace these 0s with p-value lower than minimum of 1.6E-9
PC_correlates_FDR <- PC_correlates %>% 
  as_tibble() %>% 
  dplyr::mutate(PC = str_c("PC_", c(1:73))) %>% 
  dplyr::select(-Sex, -Motor_onset, -Disease_duration) %>% 
  tidyr::gather(key = covariate, value = p, -PC) %>% 
  dplyr::mutate(p = case_when(p == 0 ~ 1 * 10^-12,
                              TRUE ~ p),
                FDR = p.adjust(p, method = "fdr"))

PC_correlates_FDR %>% 
  dplyr::filter(FDR < 0.05) %>% 
  dplyr::arrange(FDR)

PC_correlates_FDR_mat <- PC_correlates_FDR %>% 
  dplyr::select(-p) %>% 
  tidyr::spread(key = covariate, value = FDR) %>% 
  dplyr::mutate(PC_number = str_replace(PC, "PC_", "") %>% 
                  as.numeric()) %>% 
  dplyr::arrange(PC_number) %>% 
  dplyr::select(-PC_number, -PC) %>% 
  as.matrix()

rownames(PC_correlates_FDR_mat) <- PC_correlates_FDR %>% .[["PC"]] %>% unique()

heatmap.2(x = -log10(PC_correlates_FDR_mat[1:40,]) %>% t(), 
          Colv = NA, 
          Rowv = NA,
          dendrogram = "none",
          col = viridis(n = 10),
          trace = "none",
          density.info = "none", 
          key = TRUE, 
          key.xlab = "-log10(FDR-corrected p-value)", 
          cexRow = 0.65, cexCol = 0.65)

Heatmap of significance of (cor)relations between PCA scores (only showing first 40) and sample covariates. Significance computed using Kruskal-Wallis for categorical variables and the cor.test based on Spearman's correlation for continuous variables. P-values were FDR-corrected.

Figure 7.2: Heatmap of significance of (cor)relations between PCA scores (only showing first 40) and sample covariates. Significance computed using Kruskal-Wallis for categorical variables and the cor.test based on Spearman's correlation for continuous variables. P-values were FDR-corrected.

7.3 Clustering by PC1 and PC2

Can attempt to "cluster" samples using the PCA plot. As might be expected, this is sensitive to the number of top genes (as selected by highest row variance) used for the analysis.
The general trend confirms the heatmap above i.e. samples cluster by RIN and age at death.

plot_PCA <- function(data, intgroup, ntop){
  
  pcaData <- plotPCA(data, intgroup= intgroup, 
                     ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
                     returnData = TRUE)
  
  pcVar <- round(100 * attr(pcaData, "percentVar"))
  
  plot <- ggplot(pcaData,
              aes(x = PC1,
                  y = PC2,
                  colour = RIN,
                  shape = Disease_Group,
                  label = name)) + 
    geom_point(size = 2) +
    labs(x = str_c("PC1: ", pcVar[1], "%"),
         y = str_c("PC2: ", pcVar[2], "%")) +
    theme_rhr
  
  list <- list(plot, pcaData)
  names(list) <- c("plot", "pcaData")
  
  return(list)
  
}

pca <- lapply(c(100, 500, 5000, 10000), plot_PCA, data = vsd, intgroup = c("RIN", "Disease_Group"))
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")

ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot), 
          common.legend = TRUE, 
          legend = "right",
          labels = c("a", "b", "c", "d"))

PCA plots using the top a) 100, b) 500, c) 5000 or d) 10000 variable genes (selected by highest row variance). Points are coloured by RIN, and their shape corresponds to disease group.

Figure 7.3: PCA plots using the top a) 100, b) 500, c) 5000 or d) 10000 variable genes (selected by highest row variance). Points are coloured by RIN, and their shape corresponds to disease group.

plot_PCA <- function(data, intgroup, ntop){
  
  pcaData <- plotPCA(data, intgroup= intgroup, 
                     ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
                     returnData = TRUE)
  
  pcVar <- round(100 * attr(pcaData, "percentVar"))
  
  plot <- ggplot(pcaData,
              aes(x = PC1,
                  y = PC2,
                  colour = Age_at_death,
                  shape = Disease_Group,
                  label = name)) + 
    geom_point(size = 2) +
    labs(x = str_c("PC1: ", pcVar[1], "%"),
         y = str_c("PC2: ", pcVar[2], "%")) +
    theme_rhr
  
  list <- list(plot, pcaData)
  names(list) <- c("plot", "pcaData")
  
  return(list)
  
}

pca <- lapply(c(100, 500, 5000, 10000), plot_PCA, data = vsd, intgroup = c("Age_at_death", "Disease_Group"))
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")

ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot), 
          common.legend = TRUE, 
          legend = "right",
          labels = c("a", "b", "c", "d"))

PCA plots using the top a) 100, b) 500, c) 5000 or d) 10000 variable genes (selected by highest row variance). Points are coloured by age at death, and their shape corresponds to disease group.

Figure 7.4: PCA plots using the top a) 100, b) 500, c) 5000 or d) 10000 variable genes (selected by highest row variance). Points are coloured by age at death, and their shape corresponds to disease group.

8 Session Info

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MarkerGenes_0.0.0.9000      EWCE_0.99.2                
##  [3] viridis_0.5.1               viridisLite_0.3.0          
##  [5] rtracklayer_1.46.0          rjson_0.2.20               
##  [7] readxl_1.3.1                RColorBrewer_1.1-2         
##  [9] pcaExplorer_2.12.0          tximport_1.14.2            
## [11] forcats_0.5.1               stringr_1.4.0              
## [13] dplyr_1.0.2                 purrr_0.3.4                
## [15] readr_1.4.0                 tidyr_1.1.1                
## [17] tibble_3.0.3                tidyverse_1.3.0            
## [19] pheatmap_1.0.12             ggpubr_0.4.0               
## [21] ggrepel_0.8.2               ggplot2_3.3.2              
## [23] gplots_3.0.4                ensembldb_2.10.2           
## [25] AnnotationFilter_1.10.0     GenomicFeatures_1.38.2     
## [27] AnnotationDbi_1.48.0        DT_0.15                    
## [29] devtools_2.3.2              usethis_1.6.3              
## [31] DESeq2_1.26.0               SummarizedExperiment_1.16.1
## [33] DelayedArray_0.12.3         BiocParallel_1.20.1        
## [35] matrixStats_0.56.0          Biobase_2.46.0             
## [37] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [39] IRanges_2.20.2              S4Vectors_0.24.4           
## [41] BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.1           SparseM_1.78             AnnotationForge_1.28.0  
##   [4] pkgmaker_0.31.1          bit64_4.0.2              knitr_1.29              
##   [7] data.table_1.13.0        rpart_4.1-15             RCurl_1.98-1.2          
##  [10] doParallel_1.0.15        generics_0.0.2           callr_3.5.1             
##  [13] cowplot_1.0.0            RSQLite_2.2.0            bit_4.0.4               
##  [16] xml2_1.3.2               lubridate_1.7.9          httpuv_1.5.4            
##  [19] assertthat_0.2.1         xfun_0.16                hms_1.0.0               
##  [22] evaluate_0.14            promises_1.1.1           progress_1.2.2          
##  [25] caTools_1.18.0           dbplyr_1.4.4             Rgraphviz_2.30.0        
##  [28] igraph_1.2.5             DBI_1.1.1                geneplotter_1.64.0      
##  [31] htmlwidgets_1.5.3        ellipsis_0.3.1           crosstalk_1.1.0.1       
##  [34] backports_1.1.8          bookdown_0.21            annotate_1.64.0         
##  [37] gridBase_0.4-7           biomaRt_2.42.1           vctrs_0.3.2             
##  [40] here_1.0.0               remotes_2.2.0            abind_1.4-5             
##  [43] cachem_1.0.3             withr_2.2.0              checkmate_2.0.0         
##  [46] GenomicAlignments_1.22.1 prettyunits_1.1.1        cluster_2.1.0           
##  [49] lazyeval_0.2.2           crayon_1.4.1             genefilter_1.68.0       
##  [52] labeling_0.4.2           pkgconfig_2.0.3          pkgload_1.1.0           
##  [55] ProtGenerics_1.18.0      nnet_7.3-12              rlang_0.4.7             
##  [58] lifecycle_0.2.0          registry_0.5-1           BiocFileCache_1.10.2    
##  [61] GOstats_2.52.0           modelr_0.1.8             cellranger_1.1.0        
##  [64] rprojroot_2.0.2          graph_1.64.0             rngtools_1.5            
##  [67] Matrix_1.2-17            carData_3.0-4            reprex_1.0.0            
##  [70] base64enc_0.1-3          processx_3.4.5           png_0.1-7               
##  [73] bitops_1.0-6             shinydashboard_0.7.1     KernSmooth_2.23-15      
##  [76] Biostrings_2.54.0        blob_1.2.1               jpeg_0.1-8.1            
##  [79] rstatix_0.6.0            shinyAce_0.4.1           ggsignif_0.6.0          
##  [82] scales_1.1.1             memoise_2.0.0            GSEABase_1.48.0         
##  [85] magrittr_2.0.1           plyr_1.8.6               bibtex_0.4.3            
##  [88] gdata_2.18.0             zlibbioc_1.32.0          threejs_0.3.3           
##  [91] compiler_3.6.1           d3heatmap_0.6.1.2        Rsamtools_2.2.3         
##  [94] cli_2.2.0.9000           XVector_0.26.0           Category_2.52.1         
##  [97] ps_1.3.4                 htmlTable_2.0.1          Formula_1.2-4           
## [100] MASS_7.3-51.4            tidyselect_1.1.0         stringi_1.5.3           
## [103] shinyBS_0.61             highr_0.8                yaml_2.2.1              
## [106] askpass_1.1              locfit_1.5-9.4           latticeExtra_0.6-29     
## [109] grid_3.6.1               tools_3.6.1              rio_0.5.16              
## [112] rstudioapi_0.13          foreach_1.5.0            foreign_0.8-72          
## [115] gridExtra_2.3            farver_2.0.3             HGNChelper_0.8.1        
## [118] digest_0.6.27            shiny_1.5.0              Rcpp_1.0.5              
## [121] car_3.0-9                broom_0.7.0              later_1.1.0.1           
## [124] httr_1.4.2               ggdendro_0.1.21          colorspace_2.0-0        
## [127] rvest_0.3.6              XML_3.99-0.3             fs_1.5.0                
## [130] topGO_2.38.1             splines_3.6.1            RBGL_1.62.1             
## [133] sessioninfo_1.1.1        xtable_1.8-4             jsonlite_1.7.1          
## [136] testthat_2.3.2           R6_2.5.0                 Hmisc_4.4-1             
## [139] pillar_1.4.6             htmltools_0.5.1.1        mime_0.9                
## [142] NMF_0.23.0               glue_1.4.2               fastmap_1.0.1           
## [145] codetools_0.2-16         pkgbuild_1.1.0           lattice_0.20-38         
## [148] curl_4.3                 gtools_3.8.2             zip_2.1.0               
## [151] GO.db_3.10.0             openxlsx_4.2.3           openssl_1.4.2           
## [154] survival_3.2-7           limma_3.42.2             rmarkdown_2.5           
## [157] desc_1.2.0               munsell_0.5.0            GenomeInfoDbData_1.2.2  
## [160] iterators_1.0.12         haven_2.3.1              reshape2_1.4.4          
## [163] gtable_0.3.0

SRP058181: Post-quantificationQC

Regina H. Reynolds