Aim: perform post-quantification quality control checks.
# File paths
source(here::here("R", "file_paths.R"))
load(file.path(path_to_recount, "counts/rse_gene.Rdata"))
gtf <- readGFF(filepath = path_to_ref_gtf)
sample_info <-
read_excel(
path =
file.path(
path_to_raw_data,
"sample_details/SRP058181_sample_metadata.xlsx"
)
) %>%
dplyr::na_if(.,"N/A") %>%
dplyr::select(-Proteomics, -Proteomics_SV1, -Proteomics_SV2, -Proteomics_SV3, -Microarray_study_ID) %>%
dplyr::mutate(
sample_id = `RNA-Seq_Samples`,
Braak_score = Braak_score %>%
str_replace_all(c("IV" = "4",
"II-III" = "3",
"I-II" = "2",
"III" = "3",
"II" = "2",
"I" = "1")) %>%
as.integer(),
Disease_Group = ifelse(Condition == "Control", "Control",
ifelse(Condition == "PD" & Dementia == "no", "PD",
ifelse(Condition == "PD" & Dementia == "yes", "PDD", NA))),
Disease_Group = replace_na(Disease_Group, "PD")
) %>%
dplyr::inner_join(
colData(rse_gene) %>%
as_tibble() %>%
dplyr::select(run, title, mapped_read_count) %>%
dplyr::mutate(recount_id = run,
title = str_replace(title, " \\[.*", "")) %>%
dplyr::select(-run),
by = c("sample_id" = "title")
) %>%
dplyr::full_join(
read_delim(
file.path(
path_to_results,
"deconvolution/scaden/SRP058181/scaden_predictions_cells.1000:samples.1000.txt"
),
delim = "\t"
) %>%
dplyr::rename(recount_id = X1) %>%
dplyr::rename_at(vars(-recount_id),function(x) paste0(x,"_proportion"))) %>%
dplyr::select(sample_id, recount_id, Disease_Group, everything(), -Condition, -`RNA-Seq_Samples`) %>%
dplyr::arrange(recount_id)
DESeqDataSet
to convert such SummarisedExperiment objects. However, metadata in the summarised experiment does not include columns for sample characteristics including PMI, RIN, etc.# Important that rows in the dataframe, 'sample_info', are in the same order as the columns in the count matrix, 'countData'.
countdata <- assay(rse_gene)
countdata <- countdata[ ,sort(colnames(countdata))] # Sorted alphanumerically
coldata <-
sample_info %>%
dplyr::select(sample_id, recount_id, Disease_Group, everything()) %>%
inner_join(colData(rse_gene) %>%
as_tibble() %>%
dplyr::mutate(recount_id = run,
title = str_replace(title, " \\[.*", "")) %>%
dplyr::select(-run, -title, -characteristics, -mapped_read_count),
by = c("recount_id" = "recount_id")) %>%
arrange(recount_id) # Sorted alphanumerically to match sorting in countdata
dds <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ Disease_Group)
saveRDS(
dds,
file =
file.path(
path_to_results,
"SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
)
)
dds <- readRDS(
file.path(
path_to_results,
"SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
)
)
# Plot library size using sum of all gene counts within a sample
ggplot(data = colData(dds) %>%
as.data.frame(),
aes(x = MarkerGenes::reorder_within(x = recount_id,
by = mapped_read_count,
within = recount_id,
fun = median,
desc = TRUE),
y = mapped_read_count,
fill = RIN)) +
geom_col() +
MarkerGenes::scale_x_reordered() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
labs(x = "Sample ID", y = "Recount2: Mapped read count") +
theme_rhr
vst
function from DESeq2
used to apply a variance-stabilising transformation. This transformation is roughly similar to putting the data on the log2 scale, while also dealing with the sampling variability of low counts.vst
function also corrects for library size.# Get log2 counts
vsd <- vst(object = dds,
blind = TRUE)
# Check distributions of samples using boxplots
ggplot(data = vsd %>%
assay() %>%
as_tibble() %>%
dplyr::mutate(gene_id = rownames(assay(vsd))) %>%
gather(key = sample_id, value = log2_counts_per_million, -gene_id) %>%
inner_join(sample_info, by = c("sample_id" = "recount_id")),
aes(x = MarkerGenes::reorder_within(x = sample_id,
by = log2_counts_per_million,
within = sample_id,
fun = median,
desc = FALSE),
y = log2_counts_per_million, fill = RIN)) +
geom_boxplot() +
MarkerGenes::scale_x_reordered() +
geom_hline(yintercept = median(assay(vsd)), size = 0.75, linetype = "dashed") +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
labs(x = "Sample ID", y = "Log2 counts per million") +
theme_rhr
# XIST and DDX3Y ensembl IDs
sex_genes <- gtf %>%
dplyr::filter(gene_name %in% c("XIST", "DDX3Y")) %>%
distinct(gene_name, .keep_all = TRUE) %>%
dplyr::select(gene_id, gene_name)
sex_gene_expr <- sex_genes %>%
inner_join(dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = dds %>%
assay() %>%
rownames() %>%
str_replace("\\..*", "")) %>%
dplyr::select(gene_id, everything()),
by = c("gene_id" = "gene_id")) %>%
gather(key = sample_id, value = counts, -gene_id, -gene_name) %>%
inner_join(sample_info, by = c("sample_id" = "recount_id"))
ggplot(sex_gene_expr, aes(x = sample_id, y = counts, fill = Sex)) +
geom_col() +
facet_grid(gene_name~., scales = "free_y") +
labs(x = "Sample ID", y = "Counts") +
scale_fill_manual(values = c("#00BFC4")) +
theme_rhr
sampleDists <- vsd %>%
assay() %>%
t() %>% # Transpose transformed count matrix, such that samples now rows.
dist() # Compute distances between rows of the matrix i.e. sample-to-sample distance
sampleDistMatrix <- sampleDists %>%
as.matrix() # Convert to matrix
rownames(sampleDistMatrix) <- str_c(vsd$Disease_Group)
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col= viridis(n = 20))
A scree plot shows the fraction of total variance explained by each PC.
pca_vsd <- prcomp(t(assay(vsd)))
ggarrange(pcascree(pca_vsd, type = "pev"),
pcascree(pca_vsd, type = "cev"),
nrow = 2,
labels = c("a", "b"))
PC_correlates <- correlatePCs(pcaobj = pca_vsd,
coldata = colData(vsd) %>%
as.data.frame() %>%
dplyr::select(Disease_Group, Age_at_death, PMI, RIN, Sex, Motor_onset, Disease_duration, Braak_score,mapped_read_count, contains("proportion")),
pcs = 1:73)
# Some proportions so highly correlated that p-value = 0
# Replace these 0s with p-value lower than minimum of 1.6E-9
PC_correlates_FDR <- PC_correlates %>%
as_tibble() %>%
dplyr::mutate(PC = str_c("PC_", c(1:73))) %>%
dplyr::select(-Sex, -Motor_onset, -Disease_duration) %>%
tidyr::gather(key = covariate, value = p, -PC) %>%
dplyr::mutate(p = case_when(p == 0 ~ 1 * 10^-12,
TRUE ~ p),
FDR = p.adjust(p, method = "fdr"))
PC_correlates_FDR %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::arrange(FDR)
PC_correlates_FDR_mat <- PC_correlates_FDR %>%
dplyr::select(-p) %>%
tidyr::spread(key = covariate, value = FDR) %>%
dplyr::mutate(PC_number = str_replace(PC, "PC_", "") %>%
as.numeric()) %>%
dplyr::arrange(PC_number) %>%
dplyr::select(-PC_number, -PC) %>%
as.matrix()
rownames(PC_correlates_FDR_mat) <- PC_correlates_FDR %>% .[["PC"]] %>% unique()
heatmap.2(x = -log10(PC_correlates_FDR_mat[1:40,]) %>% t(),
Colv = NA,
Rowv = NA,
dendrogram = "none",
col = viridis(n = 10),
trace = "none",
density.info = "none",
key = TRUE,
key.xlab = "-log10(FDR-corrected p-value)",
cexRow = 0.65, cexCol = 0.65)
plot_PCA <- function(data, intgroup, ntop){
pcaData <- plotPCA(data, intgroup= intgroup,
ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
returnData = TRUE)
pcVar <- round(100 * attr(pcaData, "percentVar"))
plot <- ggplot(pcaData,
aes(x = PC1,
y = PC2,
colour = RIN,
shape = Disease_Group,
label = name)) +
geom_point(size = 2) +
labs(x = str_c("PC1: ", pcVar[1], "%"),
y = str_c("PC2: ", pcVar[2], "%")) +
theme_rhr
list <- list(plot, pcaData)
names(list) <- c("plot", "pcaData")
return(list)
}
pca <- lapply(c(100, 500, 5000, 10000), plot_PCA, data = vsd, intgroup = c("RIN", "Disease_Group"))
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")
ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot),
common.legend = TRUE,
legend = "right",
labels = c("a", "b", "c", "d"))
plot_PCA <- function(data, intgroup, ntop){
pcaData <- plotPCA(data, intgroup= intgroup,
ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
returnData = TRUE)
pcVar <- round(100 * attr(pcaData, "percentVar"))
plot <- ggplot(pcaData,
aes(x = PC1,
y = PC2,
colour = Age_at_death,
shape = Disease_Group,
label = name)) +
geom_point(size = 2) +
labs(x = str_c("PC1: ", pcVar[1], "%"),
y = str_c("PC2: ", pcVar[2], "%")) +
theme_rhr
list <- list(plot, pcaData)
names(list) <- c("plot", "pcaData")
return(list)
}
pca <- lapply(c(100, 500, 5000, 10000), plot_PCA, data = vsd, intgroup = c("Age_at_death", "Disease_Group"))
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")
ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot),
common.legend = TRUE,
legend = "right",
labels = c("a", "b", "c", "d"))
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] viridis_0.5.1 viridisLite_0.3.0
## [5] rtracklayer_1.46.0 rjson_0.2.20
## [7] readxl_1.3.1 RColorBrewer_1.1-2
## [9] pcaExplorer_2.12.0 tximport_1.14.2
## [11] forcats_0.5.1 stringr_1.4.0
## [13] dplyr_1.0.2 purrr_0.3.4
## [15] readr_1.4.0 tidyr_1.1.1
## [17] tibble_3.0.3 tidyverse_1.3.0
## [19] pheatmap_1.0.12 ggpubr_0.4.0
## [21] ggrepel_0.8.2 ggplot2_3.3.2
## [23] gplots_3.0.4 ensembldb_2.10.2
## [25] AnnotationFilter_1.10.0 GenomicFeatures_1.38.2
## [27] AnnotationDbi_1.48.0 DT_0.15
## [29] devtools_2.3.2 usethis_1.6.3
## [31] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [33] DelayedArray_0.12.3 BiocParallel_1.20.1
## [35] matrixStats_0.56.0 Biobase_2.46.0
## [37] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [39] IRanges_2.20.2 S4Vectors_0.24.4
## [41] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.1 SparseM_1.78 AnnotationForge_1.28.0
## [4] pkgmaker_0.31.1 bit64_4.0.2 knitr_1.29
## [7] data.table_1.13.0 rpart_4.1-15 RCurl_1.98-1.2
## [10] doParallel_1.0.15 generics_0.0.2 callr_3.5.1
## [13] cowplot_1.0.0 RSQLite_2.2.0 bit_4.0.4
## [16] xml2_1.3.2 lubridate_1.7.9 httpuv_1.5.4
## [19] assertthat_0.2.1 xfun_0.16 hms_1.0.0
## [22] evaluate_0.14 promises_1.1.1 progress_1.2.2
## [25] caTools_1.18.0 dbplyr_1.4.4 Rgraphviz_2.30.0
## [28] igraph_1.2.5 DBI_1.1.1 geneplotter_1.64.0
## [31] htmlwidgets_1.5.3 ellipsis_0.3.1 crosstalk_1.1.0.1
## [34] backports_1.1.8 bookdown_0.21 annotate_1.64.0
## [37] gridBase_0.4-7 biomaRt_2.42.1 vctrs_0.3.2
## [40] here_1.0.0 remotes_2.2.0 abind_1.4-5
## [43] cachem_1.0.3 withr_2.2.0 checkmate_2.0.0
## [46] GenomicAlignments_1.22.1 prettyunits_1.1.1 cluster_2.1.0
## [49] lazyeval_0.2.2 crayon_1.4.1 genefilter_1.68.0
## [52] labeling_0.4.2 pkgconfig_2.0.3 pkgload_1.1.0
## [55] ProtGenerics_1.18.0 nnet_7.3-12 rlang_0.4.7
## [58] lifecycle_0.2.0 registry_0.5-1 BiocFileCache_1.10.2
## [61] GOstats_2.52.0 modelr_0.1.8 cellranger_1.1.0
## [64] rprojroot_2.0.2 graph_1.64.0 rngtools_1.5
## [67] Matrix_1.2-17 carData_3.0-4 reprex_1.0.0
## [70] base64enc_0.1-3 processx_3.4.5 png_0.1-7
## [73] bitops_1.0-6 shinydashboard_0.7.1 KernSmooth_2.23-15
## [76] Biostrings_2.54.0 blob_1.2.1 jpeg_0.1-8.1
## [79] rstatix_0.6.0 shinyAce_0.4.1 ggsignif_0.6.0
## [82] scales_1.1.1 memoise_2.0.0 GSEABase_1.48.0
## [85] magrittr_2.0.1 plyr_1.8.6 bibtex_0.4.3
## [88] gdata_2.18.0 zlibbioc_1.32.0 threejs_0.3.3
## [91] compiler_3.6.1 d3heatmap_0.6.1.2 Rsamtools_2.2.3
## [94] cli_2.2.0.9000 XVector_0.26.0 Category_2.52.1
## [97] ps_1.3.4 htmlTable_2.0.1 Formula_1.2-4
## [100] MASS_7.3-51.4 tidyselect_1.1.0 stringi_1.5.3
## [103] shinyBS_0.61 highr_0.8 yaml_2.2.1
## [106] askpass_1.1 locfit_1.5-9.4 latticeExtra_0.6-29
## [109] grid_3.6.1 tools_3.6.1 rio_0.5.16
## [112] rstudioapi_0.13 foreach_1.5.0 foreign_0.8-72
## [115] gridExtra_2.3 farver_2.0.3 HGNChelper_0.8.1
## [118] digest_0.6.27 shiny_1.5.0 Rcpp_1.0.5
## [121] car_3.0-9 broom_0.7.0 later_1.1.0.1
## [124] httr_1.4.2 ggdendro_0.1.21 colorspace_2.0-0
## [127] rvest_0.3.6 XML_3.99-0.3 fs_1.5.0
## [130] topGO_2.38.1 splines_3.6.1 RBGL_1.62.1
## [133] sessioninfo_1.1.1 xtable_1.8-4 jsonlite_1.7.1
## [136] testthat_2.3.2 R6_2.5.0 Hmisc_4.4-1
## [139] pillar_1.4.6 htmltools_0.5.1.1 mime_0.9
## [142] NMF_0.23.0 glue_1.4.2 fastmap_1.0.1
## [145] codetools_0.2-16 pkgbuild_1.1.0 lattice_0.20-38
## [148] curl_4.3 gtools_3.8.2 zip_2.1.0
## [151] GO.db_3.10.0 openxlsx_4.2.3 openssl_1.4.2
## [154] survival_3.2-7 limma_3.42.2 rmarkdown_2.5
## [157] desc_1.2.0 munsell_0.5.0 GenomeInfoDbData_1.2.2
## [160] iterators_1.0.12 haven_2.3.1 reshape2_1.4.4
## [163] gtable_0.3.0