Aim: perform post-quantification quality control checks.
# File paths
source(here::here("R", "file_paths.R"))
load(file.path(path_to_recount, "counts/rse_gene.Rdata"))
gtf <- readGFF(filepath = path_to_ref_gtf)
sample_info <-
read_excel(
path =
file.path(
path_to_raw_data,
"sample_details/SRP058181_sample_metadata.xlsx"
)
) %>%
dplyr::na_if(.,"N/A") %>%
dplyr::select(-Proteomics, -Proteomics_SV1, -Proteomics_SV2, -Proteomics_SV3, -Microarray_study_ID) %>%
dplyr::mutate(
sample_id = `RNA-Seq_Samples`,
Braak_score = Braak_score %>%
str_replace_all(c("IV" = "4",
"II-III" = "3",
"I-II" = "2",
"III" = "3",
"II" = "2",
"I" = "1")) %>%
as.integer(),
Disease_Group = ifelse(Condition == "Control", "Control",
ifelse(Condition == "PD" & Dementia == "no", "PD",
ifelse(Condition == "PD" & Dementia == "yes", "PDD", NA))),
Disease_Group = replace_na(Disease_Group, "PD")
) %>%
dplyr::inner_join(
colData(rse_gene) %>%
as_tibble() %>%
dplyr::select(run, title, mapped_read_count) %>%
dplyr::mutate(recount_id = run,
title = str_replace(title, " \\[.*", "")) %>%
dplyr::select(-run),
by = c("sample_id" = "title")
) %>%
dplyr::full_join(
read_delim(
file.path(
path_to_results,
"deconvolution/scaden/SRP058181/scaden_predictions_cells.1000:samples.1000.txt"
),
delim = "\t"
) %>%
dplyr::rename(recount_id = X1) %>%
dplyr::rename_at(vars(-recount_id),function(x) paste0(x,"_proportion"))) %>%
dplyr::select(sample_id, recount_id, Disease_Group, everything(), -Condition, -`RNA-Seq_Samples`) %>%
dplyr::arrange(recount_id)
DESeqDataSet
to convert such SummarisedExperiment objects. However, metadata in the summarised experiment does not include columns for sample characteristics including PMI, RIN, etc.# Important that rows in the dataframe, 'sample_info', are in the same order as the columns in the count matrix, 'countData'.
countdata <- assay(rse_gene)
countdata <- countdata[ ,sort(colnames(countdata))] # Sorted alphanumerically
coldata <-
sample_info %>%
dplyr::select(sample_id, recount_id, Disease_Group, everything()) %>%
inner_join(colData(rse_gene) %>%
as_tibble() %>%
dplyr::mutate(recount_id = run,
title = str_replace(title, " \\[.*", "")) %>%
dplyr::select(-run, -title, -characteristics, -mapped_read_count),
by = c("recount_id" = "recount_id")) %>%
arrange(recount_id) # Sorted alphanumerically to match sorting in countdata
dds <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ Disease_Group)
saveRDS(
dds,
file =
file.path(
path_to_results,
"SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
)
)
dds <- readRDS(
file.path(
path_to_results,
"SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
)
)
# Plot library size using sum of all gene counts within a sample
ggplot(data = colData(dds) %>%
as.data.frame(),
aes(x = MarkerGenes::reorder_within(x = recount_id,
by = mapped_read_count,
within = recount_id,
fun = median,
desc = TRUE),
y = mapped_read_count,
fill = RIN)) +
geom_col() +
MarkerGenes::scale_x_reordered() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
labs(x = "Sample ID", y = "Recount2: Mapped read count") +
theme_rhr
Figure 4.1: Plot of total library size, as determined by summing unnormalised gene counts across samples. Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.
vst
function from DESeq2
used to apply a variance-stabilising transformation. This transformation is roughly similar to putting the data on the log2 scale, while also dealing with the sampling variability of low counts.vst
function also corrects for library size.# Get log2 counts
vsd <- vst(object = dds,
blind = TRUE)
# Check distributions of samples using boxplots
ggplot(data = vsd %>%
assay() %>%
as_tibble() %>%
dplyr::mutate(gene_id = rownames(assay(vsd))) %>%
gather(key = sample_id, value = log2_counts_per_million, -gene_id) %>%
inner_join(sample_info, by = c("sample_id" = "recount_id")),
aes(x = MarkerGenes::reorder_within(x = sample_id,
by = log2_counts_per_million,
within = sample_id,
fun = median,
desc = FALSE),
y = log2_counts_per_million, fill = RIN)) +
geom_boxplot() +
MarkerGenes::scale_x_reordered() +
geom_hline(yintercept = median(assay(vsd)), size = 0.75, linetype = "dashed") +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
labs(x = "Sample ID", y = "Log2 counts per million") +
theme_rhr
Figure 4.2: Plot of log2 counts per million across samples. Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.
# XIST and DDX3Y ensembl IDs
sex_genes <- gtf %>%
dplyr::filter(gene_name %in% c("XIST", "DDX3Y")) %>%
distinct(gene_name, .keep_all = TRUE) %>%
dplyr::select(gene_id, gene_name)
sex_gene_expr <- sex_genes %>%
inner_join(dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = dds %>%
assay() %>%
rownames() %>%
str_replace("\\..*", "")) %>%
dplyr::select(gene_id, everything()),
by = c("gene_id" = "gene_id")) %>%
gather(key = sample_id, value = counts, -gene_id, -gene_name) %>%
inner_join(sample_info, by = c("sample_id" = "recount_id"))
ggplot(sex_gene_expr, aes(x = sample_id, y = counts, fill = Sex)) +
geom_col() +
facet_grid(gene_name~., scales = "free_y") +
labs(x = "Sample ID", y = "Counts") +
scale_fill_manual(values = c("#00BFC4")) +
theme_rhr
Figure 5.1: Plot of read counts across DDX3Y (Y-chromosome expressed) and XIST (required for X-chromosome inactivation). Samples are coloured by their assigned sex, as provided in the initial sample information from Imperial. Expression patterns of the female/male-specific genes match the assigned sexes.
sampleDists <- vsd %>%
assay() %>%
t() %>% # Transpose transformed count matrix, such that samples now rows.
dist() # Compute distances between rows of the matrix i.e. sample-to-sample distance
sampleDistMatrix <- sampleDists %>%
as.matrix() # Convert to matrix
rownames(sampleDistMatrix) <- str_c(vsd$Disease_Group)
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col= viridis(n = 20))
Figure 6.1: Heatmap of sample-to-sample distances calculated using vst-transformed count matrix. Samples labelled by name, disease group and sex.
A scree plot shows the fraction of total variance explained by each PC.
pca_vsd <- prcomp(t(assay(vsd)))
ggarrange(pcascree(pca_vsd, type = "pev"),
pcascree(pca_vsd, type = "cev"),
nrow = 2,
labels = c("a", "b"))
Figure 7.1: Plot of (a) the proportion of variance explained by each PC and (b) the cumulative proportion of variance explained with the addition of each PC.
PC_correlates <- correlatePCs(pcaobj = pca_vsd,
coldata = colData(vsd) %>%
as.data.frame() %>%
dplyr::select(Disease_Group, Age_at_death, PMI, RIN, Sex, Motor_onset, Disease_duration, Braak_score,mapped_read_count, contains("proportion")),
pcs = 1:73)
# Some proportions so highly correlated that p-value = 0
# Replace these 0s with p-value lower than minimum of 1.6E-9
PC_correlates_FDR <- PC_correlates %>%
as_tibble() %>%
dplyr::mutate(PC = str_c("PC_", c(1:73))) %>%
dplyr::select(-Sex, -Motor_onset, -Disease_duration) %>%
tidyr::gather(key = covariate, value = p, -PC) %>%
dplyr::mutate(p = case_when(p == 0 ~ 1 * 10^-12,
TRUE ~ p),
FDR = p.adjust(p, method = "fdr"))
PC_correlates_FDR %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::arrange(FDR)
PC_correlates_FDR_mat <- PC_correlates_FDR %>%
dplyr::select(-p) %>%
tidyr::spread(key = covariate, value = FDR) %>%
dplyr::mutate(PC_number = str_replace(PC, "PC_", "") %>%
as.numeric()) %>%
dplyr::arrange(PC_number) %>%
dplyr::select(-PC_number, -PC) %>%
as.matrix()
rownames(PC_correlates_FDR_mat) <- PC_correlates_FDR %>% .[["PC"]] %>% unique()
heatmap.2(x = -log10(PC_correlates_FDR_mat[1:40,]) %>% t(),
Colv = NA,
Rowv = NA,
dendrogram = "none",
col = viridis(n = 10),
trace = "none",
density.info = "none",
key = TRUE,
key.xlab = "-log10(FDR-corrected p-value)",
cexRow = 0.65, cexCol = 0.65)
Figure 7.2: Heatmap of significance of (cor)relations between PCA scores (only showing first 40) and sample covariates. Significance computed using Kruskal-Wallis for categorical variables and the cor.test based on Spearman's correlation for continuous variables. P-values were FDR-corrected.
plot_PCA <- function(data, intgroup, ntop){
pcaData <- plotPCA(data, intgroup= intgroup,
ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
returnData = TRUE)
pcVar <- round(100 * attr(pcaData, "percentVar"))
plot <- ggplot(pcaData,
aes(x = PC1,
y = PC2,
colour = RIN,
shape = Disease_Group,
label = name)) +
geom_point(size = 2) +
labs(x = str_c("PC1: ", pcVar[1], "%"),
y = str_c("PC2: ", pcVar[2], "%")) +
theme_rhr
list <- list(plot, pcaData)
names(list) <- c("plot", "pcaData")
return(list)
}
pca <- lapply(c(100, 500, 5000, 10000), plot_PCA, data = vsd, intgroup = c("RIN", "Disease_Group"))
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")
ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot),
common.legend = TRUE,
legend = "right",
labels = c("a", "b", "c", "d"))
Figure 7.3: PCA plots using the top a) 100, b) 500, c) 5000 or d) 10000 variable genes (selected by highest row variance). Points are coloured by RIN, and their shape corresponds to disease group.
plot_PCA <- function(data, intgroup, ntop){
pcaData <- plotPCA(data, intgroup= intgroup,
ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
returnData = TRUE)
pcVar <- round(100 * attr(pcaData, "percentVar"))
plot <- ggplot(pcaData,
aes(x = PC1,
y = PC2,
colour = Age_at_death,
shape = Disease_Group,
label = name)) +
geom_point(size = 2) +
labs(x = str_c("PC1: ", pcVar[1], "%"),
y = str_c("PC2: ", pcVar[2], "%")) +
theme_rhr
list <- list(plot, pcaData)
names(list) <- c("plot", "pcaData")
return(list)
}
pca <- lapply(c(100, 500, 5000, 10000), plot_PCA, data = vsd, intgroup = c("Age_at_death", "Disease_Group"))
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")
ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot),
common.legend = TRUE,
legend = "right",
labels = c("a", "b", "c", "d"))
Figure 7.4: PCA plots using the top a) 100, b) 500, c) 5000 or d) 10000 variable genes (selected by highest row variance). Points are coloured by age at death, and their shape corresponds to disease group.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] viridis_0.5.1 viridisLite_0.3.0
## [5] rtracklayer_1.46.0 rjson_0.2.20
## [7] readxl_1.3.1 RColorBrewer_1.1-2
## [9] pcaExplorer_2.12.0 tximport_1.14.2
## [11] forcats_0.5.1 stringr_1.4.0
## [13] dplyr_1.0.2 purrr_0.3.4
## [15] readr_1.4.0 tidyr_1.1.1
## [17] tibble_3.0.3 tidyverse_1.3.0
## [19] pheatmap_1.0.12 ggpubr_0.4.0
## [21] ggrepel_0.8.2 ggplot2_3.3.2
## [23] gplots_3.0.4 ensembldb_2.10.2
## [25] AnnotationFilter_1.10.0 GenomicFeatures_1.38.2
## [27] AnnotationDbi_1.48.0 DT_0.15
## [29] devtools_2.3.2 usethis_1.6.3
## [31] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [33] DelayedArray_0.12.3 BiocParallel_1.20.1
## [35] matrixStats_0.56.0 Biobase_2.46.0
## [37] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [39] IRanges_2.20.2 S4Vectors_0.24.4
## [41] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.1 SparseM_1.78 AnnotationForge_1.28.0
## [4] pkgmaker_0.31.1 bit64_4.0.2 knitr_1.29
## [7] data.table_1.13.0 rpart_4.1-15 RCurl_1.98-1.2
## [10] doParallel_1.0.15 generics_0.0.2 callr_3.5.1
## [13] cowplot_1.0.0 RSQLite_2.2.0 bit_4.0.4
## [16] xml2_1.3.2 lubridate_1.7.9 httpuv_1.5.4
## [19] assertthat_0.2.1 xfun_0.16 hms_1.0.0
## [22] evaluate_0.14 promises_1.1.1 progress_1.2.2
## [25] caTools_1.18.0 dbplyr_1.4.4 Rgraphviz_2.30.0
## [28] igraph_1.2.5 DBI_1.1.1 geneplotter_1.64.0
## [31] htmlwidgets_1.5.3 ellipsis_0.3.1 crosstalk_1.1.0.1
## [34] backports_1.1.8 bookdown_0.21 annotate_1.64.0
## [37] gridBase_0.4-7 biomaRt_2.42.1 vctrs_0.3.2
## [40] here_1.0.0 remotes_2.2.0 abind_1.4-5
## [43] cachem_1.0.3 withr_2.2.0 checkmate_2.0.0
## [46] GenomicAlignments_1.22.1 prettyunits_1.1.1 cluster_2.1.0
## [49] lazyeval_0.2.2 crayon_1.4.1 genefilter_1.68.0
## [52] labeling_0.4.2 pkgconfig_2.0.3 pkgload_1.1.0
## [55] ProtGenerics_1.18.0 nnet_7.3-12 rlang_0.4.7
## [58] lifecycle_0.2.0 registry_0.5-1 BiocFileCache_1.10.2
## [61] GOstats_2.52.0 modelr_0.1.8 cellranger_1.1.0
## [64] rprojroot_2.0.2 graph_1.64.0 rngtools_1.5
## [67] Matrix_1.2-17 carData_3.0-4 reprex_1.0.0
## [70] base64enc_0.1-3 processx_3.4.5 png_0.1-7
## [73] bitops_1.0-6 shinydashboard_0.7.1 KernSmooth_2.23-15
## [76] Biostrings_2.54.0 blob_1.2.1 jpeg_0.1-8.1
## [79] rstatix_0.6.0 shinyAce_0.4.1 ggsignif_0.6.0
## [82] scales_1.1.1 memoise_2.0.0 GSEABase_1.48.0
## [85] magrittr_2.0.1 plyr_1.8.6 bibtex_0.4.3
## [88] gdata_2.18.0 zlibbioc_1.32.0 threejs_0.3.3
## [91] compiler_3.6.1 d3heatmap_0.6.1.2 Rsamtools_2.2.3
## [94] cli_2.2.0.9000 XVector_0.26.0 Category_2.52.1
## [97] ps_1.3.4 htmlTable_2.0.1 Formula_1.2-4
## [100] MASS_7.3-51.4 tidyselect_1.1.0 stringi_1.5.3
## [103] shinyBS_0.61 highr_0.8 yaml_2.2.1
## [106] askpass_1.1 locfit_1.5-9.4 latticeExtra_0.6-29
## [109] grid_3.6.1 tools_3.6.1 rio_0.5.16
## [112] rstudioapi_0.13 foreach_1.5.0 foreign_0.8-72
## [115] gridExtra_2.3 farver_2.0.3 HGNChelper_0.8.1
## [118] digest_0.6.27 shiny_1.5.0 Rcpp_1.0.5
## [121] car_3.0-9 broom_0.7.0 later_1.1.0.1
## [124] httr_1.4.2 ggdendro_0.1.21 colorspace_2.0-0
## [127] rvest_0.3.6 XML_3.99-0.3 fs_1.5.0
## [130] topGO_2.38.1 splines_3.6.1 RBGL_1.62.1
## [133] sessioninfo_1.1.1 xtable_1.8-4 jsonlite_1.7.1
## [136] testthat_2.3.2 R6_2.5.0 Hmisc_4.4-1
## [139] pillar_1.4.6 htmltools_0.5.1.1 mime_0.9
## [142] NMF_0.23.0 glue_1.4.2 fastmap_1.0.1
## [145] codetools_0.2-16 pkgbuild_1.1.0 lattice_0.20-38
## [148] curl_4.3 gtools_3.8.2 zip_2.1.0
## [151] GO.db_3.10.0 openxlsx_4.2.3 openssl_1.4.2
## [154] survival_3.2-7 limma_3.42.2 rmarkdown_2.5
## [157] desc_1.2.0 munsell_0.5.0 GenomeInfoDbData_1.2.2
## [160] iterators_1.0.12 haven_2.3.1 reshape2_1.4.4
## [163] gtable_0.3.0