Aim: perform post-quantification quality control checks.
source(here::here("R", "file_paths.R"))
Salmon
quantification, performed a number of QC checks, including:
dds <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD.Rds"
)
)
sample_info <-
colData(dds) %>%
as_tibble()
total_counts <-
dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = rownames(assay(dds))) %>%
gather(key = sample_id, value = counts, -gene_id) %>%
group_by(sample_id) %>%
dplyr::summarise(total_counts = sum(counts)) %>%
inner_join(sample_info, by = "sample_id")
# Plot library size using sum of all gene counts within a sample
ggplot(data = total_counts,
aes(x = MarkerGenes::reorder_within(x = sample_id,
by = total_counts,
within = sample_id,
fun = median,
desc = TRUE),
y = total_counts,
fill = RIN)) +
geom_col() +
MarkerGenes::scale_x_reordered() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
labs(x = "Sample ID", y = "Library size (sum of gene counts across each sample)") +
theme_rhr
sumstats <- total_counts %>%
dplyr::mutate(total_counts_per_million = total_counts/1000000) %>%
dplyr::group_by(Disease_Group) %>%
dplyr::summarise(mean_counts_per_million = mean(total_counts_per_million),
sd_counts_per_million = sd(total_counts_per_million),
median_counts_per_million = median(total_counts_per_million))
sumstats
# write_csv(sumstats,
# path = "/home/rreynolds/projects/Aim2_PDsequencing_wd/paper_draft/tables/counts_per_disease_group_summarystats.csv")
plot <- total_counts %>%
dplyr::mutate(total_counts_per_million = total_counts/1000000) %>%
ggplot(aes(x = Disease_Group, y = total_counts_per_million)) +
geom_boxplot(outlier.shape = NA, width = 0.5, alpha = 0.8) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "lightgray", dotsize = 1.5, alpha = 0.7) +
labs(x = "Disease group", y = "Total counts per million") +
coord_cartesian(ylim = c(0,200)) +
theme_rhr
plot
# ggsave("mapped_reads_per_disease_group.tiff", plot, device = "tiff",
# path = "/home/rreynolds/projects/Aim2_PDsequencing_wd/paper_draft/figures/",
# width = 100,
# height = 100,
# units = "mm",
# dpi = 200,
# limitsize = TRUE)
vst
function from DESeq2
used to apply a variance-stabilising transformation. This transformation is roughly similar to putting the data on the log2 scale, while also dealing with the sampling variability of low counts. See Regularized logarithm transformation from DESeq2 paper for more thorough explanation. N.B. The vst
function corrects for library size.DESeq2
also offers rlog transformation, which accounts for variation in sequencing depth, unlike vst
. vst
is effective at stabilising variance, but does not directly account for differences in size factors (i.e. large variation in sequencing depth).vst
is that it is computationally much faster.dds <- estimateSizeFactors(dds)
# Filter for only genes with normalised counts > 0 in more than or equal to 2 samples, as 0s will otherwise skew distribution.
filter <- rowSums(counts(dds, normalized = TRUE) >= 1) >= 2
filter_dds <- dds[filter,]
# Get transformed counts
rsd_filter <- rlog(object = filter_dds,
blind = TRUE # blind to experimental design
)
vsd_filter <- vst(object = filter_dds,
blind = TRUE # blind to experimental design
)
vsn::meanSdPlot(assay(rsd_filter), ranks = TRUE, xlab = "rank(mean) \n transformed using rlog")
vsn::meanSdPlot(assay(vsd_filter), ranks = TRUE, xlab = "rank(mean) \n transformed using vst")
gtf <- readGFF(filepath = path_to_ref_gtf)
# XIST and DDX3Y ensembl IDs
sex_genes <- gtf %>%
dplyr::filter(gene_name %in% c("XIST", "DDX3Y")) %>%
distinct(gene_name, .keep_all = TRUE) %>%
dplyr::select(gene_id, gene_name)
sex_gene_expr <- sex_genes %>%
inner_join(dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = dds %>%
assay() %>%
rownames() %>%
str_replace("\\..*", "")) %>%
dplyr::select(gene_id, everything()),
by = c("gene_id" = "gene_id")) %>%
gather(key = sample_id, value = counts, -gene_id, -gene_name) %>%
inner_join(sample_info, by = "sample_id")
ggplot(sex_gene_expr, aes(x = sample_id, y = counts, fill = Sex)) +
geom_col() +
facet_grid(gene_name~., scales = "free_y") +
labs(x = "Sample ID", y = "Counts") +
theme_rhr
sampleDists <- vsd %>%
assay() %>%
t() %>% # Transpose transformed count matrix, such that samples now rows.
dist() # Compute distances between rows of the matrix i.e. sample-to-sample distance
sampleDistMatrix <- sampleDists %>%
as.matrix() # Convert to matrix
rownames(sampleDistMatrix) <- str_c(vsd$sample_id, ": ", vsd$Disease_Group, ", ", vsd$Sex)
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col= viridis(n = 20))
prcomp()
to perform PCA. prcomp()
uses singular value decomposition to perform PCA (as opposed to spectral decomposition, as performed by other functions, such as princomp()
). See sthda practical guide or sthda practical guide to FactoMineR
for more information on PCA in R.pca_vsd <- prcomp(t(assay(vsd)))
ggarrange(pcascree(pca_vsd, type = "pev"),
pcascree(pca_vsd, type = "cev"),
labels = c("a", "b"))
# Source function correlate PCs
source(here::here("R", "correlatePCs.R"))
# Select col 1-20 (excludes Unclassified with NAs)
PC_correlates <- correlatePCs(pcaobj = pca_vsd, coldata = colData(vsd)[,1:19], pcs = 1:24)
PC_correlates_FDR <- PC_correlates$pvalue %>%
as_tibble() %>%
dplyr::mutate(PC = str_c("PC_", c(1:24))) %>%
dplyr::select(-sample_id) %>%
tidyr::gather(key = covariate, value = p, -PC) %>%
dplyr::mutate(FDR = p.adjust(p, method = "fdr"))
PC_correlates_FDR %>%
arrange(FDR)
# Include only continuous covariates
heatmap.2(x = PC_correlates$statistic[,4:19] %>% t(),
Colv = NA,
Rowv = NA,
dendrogram = "none",
col = viridis(n = 30),
trace = "none",
density.info = "none",
key = TRUE,
key.xlab = "-log10(FDR-corrected p-value)",
cexRow = 0.65, cexCol = 0.65)
PC_correlates_FDR_mat <- PC_correlates_FDR %>%
dplyr::select(-p) %>%
tidyr::spread(key = covariate, value = FDR) %>%
dplyr::mutate(PC_number = str_replace(PC, "PC_", "") %>%
as.numeric()) %>%
dplyr::arrange(PC_number) %>%
dplyr::select(-PC_number, -PC) %>%
as.matrix()
PC_correlates_FDR_mat <- PC_correlates_FDR_mat[, match(colnames(PC_correlates$statistic), colnames(PC_correlates_FDR_mat))]
rownames(PC_correlates_FDR_mat) <- PC_correlates_FDR %>% .[["PC"]] %>% unique()
heatmap.2(x = -log10(PC_correlates_FDR_mat) %>% t(),
Colv = NA,
Rowv = NA,
dendrogram = "none",
col = viridis(n = 30),
trace = "none",
density.info = "none",
key = TRUE,
key.xlab = "-log10(FDR-corrected p-value)",
cexRow = 0.65, cexCol = 0.65)
PC_correlates_FDR %>%
dplyr::filter(covariate == "Sex") %>%
dplyr::arrange(FDR)
plot_PCA <- function(data, intgroup, ntop, colour, shape){
colour <- enquo(colour)
shape <- enquo(shape)
pcaData <- plotPCA(data, intgroup= intgroup,
ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
returnData = TRUE)
pcVar <- round(100 * attr(pcaData, "percentVar"))
plot <- ggplot(pcaData,
aes(x = PC1,
y = PC2,
colour = !!colour,
shape = !!shape,
label = name)) +
geom_point(size = 2) +
geom_label_repel(data = subset(pcaData, name == "PD416"),
min.segment.length = unit(0, 'lines'),
nudge_x = 5,
nudge_y = -5,
size = 2) +
labs(x = str_c("PC1: ", pcVar[1], "%"),
y = str_c("PC2: ", pcVar[2], "%")) +
theme_rhr
list <- list(plot, pcaData)
names(list) <- c("plot", "pcaData")
return(list)
}
pca <- lapply(c(100, 500, 5000, 10000),
plot_PCA, data = vsd, intgroup = c("RIN", "Sex"),
colour = RIN, shape = Sex)
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")
ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot),
common.legend = TRUE,
legend = "right",
labels = c("a", "b", "c", "d"))
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] vsn_3.54.0 viridis_0.5.1
## [5] viridisLite_0.3.0 rjson_0.2.20
## [7] readxl_1.3.1 rtracklayer_1.46.0
## [9] pcaExplorer_2.12.0 forcats_0.5.1
## [11] stringr_1.4.0 dplyr_1.0.2
## [13] purrr_0.3.4 readr_1.4.0
## [15] tidyr_1.1.1 tibble_3.0.3
## [17] tidyverse_1.3.0 pheatmap_1.0.12
## [19] ggpubr_0.4.0 ggrepel_0.8.2
## [21] ggplot2_3.3.2 gplots_3.0.4
## [23] DT_0.15 devtools_2.3.2
## [25] usethis_1.6.3 DESeq2_1.26.0
## [27] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [29] BiocParallel_1.20.1 matrixStats_0.56.0
## [31] Biobase_2.46.0 GenomicRanges_1.38.0
## [33] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [35] S4Vectors_0.24.4 BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.1 SparseM_1.78 AnnotationForge_1.28.0
## [4] pkgmaker_0.31.1 bit64_4.0.2 knitr_1.29
## [7] data.table_1.13.0 rpart_4.1-15 RCurl_1.98-1.2
## [10] doParallel_1.0.15 generics_0.0.2 preprocessCore_1.48.0
## [13] callr_3.5.1 cowplot_1.0.0 RSQLite_2.2.0
## [16] bit_4.0.4 xml2_1.3.2 lubridate_1.7.9
## [19] httpuv_1.5.4 assertthat_0.2.1 xfun_0.16
## [22] hms_1.0.0 evaluate_0.14 promises_1.1.1
## [25] progress_1.2.2 caTools_1.18.0 dbplyr_1.4.4
## [28] Rgraphviz_2.30.0 igraph_1.2.5 DBI_1.1.1
## [31] geneplotter_1.64.0 htmlwidgets_1.5.3 ellipsis_0.3.1
## [34] crosstalk_1.1.0.1 backports_1.1.8 bookdown_0.21
## [37] annotate_1.64.0 gridBase_0.4-7 biomaRt_2.42.1
## [40] vctrs_0.3.2 here_1.0.0 remotes_2.2.0
## [43] abind_1.4-5 cachem_1.0.3 withr_2.2.0
## [46] checkmate_2.0.0 GenomicAlignments_1.22.1 prettyunits_1.1.1
## [49] cluster_2.1.0 crayon_1.4.1 genefilter_1.68.0
## [52] labeling_0.4.2 pkgconfig_2.0.3 pkgload_1.1.0
## [55] nnet_7.3-12 rlang_0.4.7 lifecycle_0.2.0
## [58] registry_0.5-1 affyio_1.56.0 BiocFileCache_1.10.2
## [61] GOstats_2.52.0 modelr_0.1.8 cellranger_1.1.0
## [64] rprojroot_2.0.2 graph_1.64.0 rngtools_1.5
## [67] Matrix_1.2-17 carData_3.0-4 reprex_1.0.0
## [70] base64enc_0.1-3 processx_3.4.5 png_0.1-7
## [73] bitops_1.0-6 shinydashboard_0.7.1 KernSmooth_2.23-15
## [76] Biostrings_2.54.0 blob_1.2.1 jpeg_0.1-8.1
## [79] rstatix_0.6.0 shinyAce_0.4.1 ggsignif_0.6.0
## [82] scales_1.1.1 hexbin_1.28.1 memoise_2.0.0
## [85] GSEABase_1.48.0 magrittr_2.0.1 plyr_1.8.6
## [88] bibtex_0.4.3 gdata_2.18.0 zlibbioc_1.32.0
## [91] threejs_0.3.3 compiler_3.6.1 RColorBrewer_1.1-2
## [94] d3heatmap_0.6.1.2 Rsamtools_2.2.3 cli_2.2.0.9000
## [97] affy_1.64.0 XVector_0.26.0 Category_2.52.1
## [100] ps_1.3.4 htmlTable_2.0.1 Formula_1.2-4
## [103] MASS_7.3-51.4 tidyselect_1.1.0 stringi_1.5.3
## [106] shinyBS_0.61 highr_0.8 yaml_2.2.1
## [109] askpass_1.1 locfit_1.5-9.4 latticeExtra_0.6-29
## [112] grid_3.6.1 tools_3.6.1 rio_0.5.16
## [115] rstudioapi_0.13 foreach_1.5.0 foreign_0.8-72
## [118] gridExtra_2.3 farver_2.0.3 HGNChelper_0.8.1
## [121] digest_0.6.27 BiocManager_1.30.10 shiny_1.5.0
## [124] Rcpp_1.0.5 car_3.0-9 broom_0.7.0
## [127] later_1.1.0.1 httr_1.4.2 ggdendro_0.1.21
## [130] AnnotationDbi_1.48.0 colorspace_2.0-0 rvest_0.3.6
## [133] XML_3.99-0.3 fs_1.5.0 topGO_2.38.1
## [136] splines_3.6.1 RBGL_1.62.1 sessioninfo_1.1.1
## [139] xtable_1.8-4 jsonlite_1.7.1 testthat_2.3.2
## [142] R6_2.5.0 Hmisc_4.4-1 pillar_1.4.6
## [145] htmltools_0.5.1.1 mime_0.9 NMF_0.23.0
## [148] glue_1.4.2 fastmap_1.0.1 codetools_0.2-16
## [151] pkgbuild_1.1.0 lattice_0.20-38 curl_4.3
## [154] gtools_3.8.2 zip_2.1.0 GO.db_3.10.0
## [157] openxlsx_4.2.3 openssl_1.4.2 survival_3.2-7
## [160] limma_3.42.2 rmarkdown_2.5 desc_1.2.0
## [163] munsell_0.5.0 GenomeInfoDbData_1.2.2 iterators_1.0.12
## [166] haven_2.3.1 reshape2_1.4.4 gtable_0.3.0