Aim: perform post-quantification quality control checks.
source(here::here("R", "file_paths.R"))
Salmon
quantification, performed a number of QC checks, including:
dds <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD.Rds"
)
)
sample_info <-
colData(dds) %>%
as_tibble()
total_counts <-
dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = rownames(assay(dds))) %>%
gather(key = sample_id, value = counts, -gene_id) %>%
group_by(sample_id) %>%
dplyr::summarise(total_counts = sum(counts)) %>%
inner_join(sample_info, by = "sample_id")
# Plot library size using sum of all gene counts within a sample
ggplot(data = total_counts,
aes(x = MarkerGenes::reorder_within(x = sample_id,
by = total_counts,
within = sample_id,
fun = median,
desc = TRUE),
y = total_counts,
fill = RIN)) +
geom_col() +
MarkerGenes::scale_x_reordered() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = median(sample_info$RIN)) +
labs(x = "Sample ID", y = "Library size (sum of gene counts across each sample)") +
theme_rhr
Figure 4.1: Plot of total library size, as determined by summing unnormalised gene counts across samples. Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.
sumstats <- total_counts %>%
dplyr::mutate(total_counts_per_million = total_counts/1000000) %>%
dplyr::group_by(Disease_Group) %>%
dplyr::summarise(mean_counts_per_million = mean(total_counts_per_million),
sd_counts_per_million = sd(total_counts_per_million),
median_counts_per_million = median(total_counts_per_million))
sumstats
# write_csv(sumstats,
# path = "/home/rreynolds/projects/Aim2_PDsequencing_wd/paper_draft/tables/counts_per_disease_group_summarystats.csv")
plot <- total_counts %>%
dplyr::mutate(total_counts_per_million = total_counts/1000000) %>%
ggplot(aes(x = Disease_Group, y = total_counts_per_million)) +
geom_boxplot(outlier.shape = NA, width = 0.5, alpha = 0.8) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "lightgray", dotsize = 1.5, alpha = 0.7) +
labs(x = "Disease group", y = "Total counts per million") +
coord_cartesian(ylim = c(0,200)) +
theme_rhr
plot
Figure 4.2: Plot of total counts per million across each disease group.
# ggsave("mapped_reads_per_disease_group.tiff", plot, device = "tiff",
# path = "/home/rreynolds/projects/Aim2_PDsequencing_wd/paper_draft/figures/",
# width = 100,
# height = 100,
# units = "mm",
# dpi = 200,
# limitsize = TRUE)
vst
function from DESeq2
used to apply a variance-stabilising transformation. This transformation is roughly similar to putting the data on the log2 scale, while also dealing with the sampling variability of low counts. See Regularized logarithm transformation from DESeq2 paper for more thorough explanation. N.B. The vst
function corrects for library size.DESeq2
also offers rlog transformation, which accounts for variation in sequencing depth, unlike vst
. vst
is effective at stabilising variance, but does not directly account for differences in size factors (i.e. large variation in sequencing depth).vst
is that it is computationally much faster.dds <- estimateSizeFactors(dds)
# Filter for only genes with normalised counts > 0 in more than or equal to 2 samples, as 0s will otherwise skew distribution.
filter <- rowSums(counts(dds, normalized = TRUE) >= 1) >= 2
filter_dds <- dds[filter,]
# Get transformed counts
rsd_filter <- rlog(object = filter_dds,
blind = TRUE # blind to experimental design
)
vsd_filter <- vst(object = filter_dds,
blind = TRUE # blind to experimental design
)
vsn::meanSdPlot(assay(rsd_filter), ranks = TRUE, xlab = "rank(mean) \n transformed using rlog")
vsn::meanSdPlot(assay(vsd_filter), ranks = TRUE, xlab = "rank(mean) \n transformed using vst")
Figure 4.3: Plot of log2 counts per million across (a) unfiltered samples and (b) samples filtered for non-zero gene expression (i.e. normalised counts > 0 in more than or equal to 2 samples). Samples are coloured by RIN, with the midpoint of the colour scale (white) indicating the median RIN across all samples.
gtf <- readGFF(filepath = path_to_ref_gtf)
# XIST and DDX3Y ensembl IDs
sex_genes <- gtf %>%
dplyr::filter(gene_name %in% c("XIST", "DDX3Y")) %>%
distinct(gene_name, .keep_all = TRUE) %>%
dplyr::select(gene_id, gene_name)
sex_gene_expr <- sex_genes %>%
inner_join(dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = dds %>%
assay() %>%
rownames() %>%
str_replace("\\..*", "")) %>%
dplyr::select(gene_id, everything()),
by = c("gene_id" = "gene_id")) %>%
gather(key = sample_id, value = counts, -gene_id, -gene_name) %>%
inner_join(sample_info, by = "sample_id")
ggplot(sex_gene_expr, aes(x = sample_id, y = counts, fill = Sex)) +
geom_col() +
facet_grid(gene_name~., scales = "free_y") +
labs(x = "Sample ID", y = "Counts") +
theme_rhr
Figure 5.1: Plot of read counts across DDX3Y (Y-chromosome expressed) and XIST (required for X-chromosome inactivation). Samples are coloured by their assigned sex, as provided in the initial sample information from Imperial. Expression patterns of the female/male-specific genes match the assigned sexes.
sampleDists <- vsd %>%
assay() %>%
t() %>% # Transpose transformed count matrix, such that samples now rows.
dist() # Compute distances between rows of the matrix i.e. sample-to-sample distance
sampleDistMatrix <- sampleDists %>%
as.matrix() # Convert to matrix
rownames(sampleDistMatrix) <- str_c(vsd$sample_id, ": ", vsd$Disease_Group, ", ", vsd$Sex)
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col= viridis(n = 20))
Figure 6.1: Heatmap of sample-to-sample distances calculated using vst-transformed count matrix. Samples labelled by name, disease group and sex.
prcomp()
to perform PCA. prcomp()
uses singular value decomposition to perform PCA (as opposed to spectral decomposition, as performed by other functions, such as princomp()
). See sthda practical guide or sthda practical guide to FactoMineR
for more information on PCA in R.pca_vsd <- prcomp(t(assay(vsd)))
ggarrange(pcascree(pca_vsd, type = "pev"),
pcascree(pca_vsd, type = "cev"),
labels = c("a", "b"))
Figure 7.1: Plot of (a) the proportion of variance explained by each PC and (b) the cumulative proportion of variance explained with the addition of each PC.
# Source function correlate PCs
source(here::here("R", "correlatePCs.R"))
# Select col 1-20 (excludes Unclassified with NAs)
PC_correlates <- correlatePCs(pcaobj = pca_vsd, coldata = colData(vsd)[,1:19], pcs = 1:24)
PC_correlates_FDR <- PC_correlates$pvalue %>%
as_tibble() %>%
dplyr::mutate(PC = str_c("PC_", c(1:24))) %>%
dplyr::select(-sample_id) %>%
tidyr::gather(key = covariate, value = p, -PC) %>%
dplyr::mutate(FDR = p.adjust(p, method = "fdr"))
PC_correlates_FDR %>%
arrange(FDR)
# Include only continuous covariates
heatmap.2(x = PC_correlates$statistic[,4:19] %>% t(),
Colv = NA,
Rowv = NA,
dendrogram = "none",
col = viridis(n = 30),
trace = "none",
density.info = "none",
key = TRUE,
key.xlab = "-log10(FDR-corrected p-value)",
cexRow = 0.65, cexCol = 0.65)
Figure 7.2: Heatmap of correlations between PCA scores and continuous covariates.
PC_correlates_FDR_mat <- PC_correlates_FDR %>%
dplyr::select(-p) %>%
tidyr::spread(key = covariate, value = FDR) %>%
dplyr::mutate(PC_number = str_replace(PC, "PC_", "") %>%
as.numeric()) %>%
dplyr::arrange(PC_number) %>%
dplyr::select(-PC_number, -PC) %>%
as.matrix()
PC_correlates_FDR_mat <- PC_correlates_FDR_mat[, match(colnames(PC_correlates$statistic), colnames(PC_correlates_FDR_mat))]
rownames(PC_correlates_FDR_mat) <- PC_correlates_FDR %>% .[["PC"]] %>% unique()
heatmap.2(x = -log10(PC_correlates_FDR_mat) %>% t(),
Colv = NA,
Rowv = NA,
dendrogram = "none",
col = viridis(n = 30),
trace = "none",
density.info = "none",
key = TRUE,
key.xlab = "-log10(FDR-corrected p-value)",
cexRow = 0.65, cexCol = 0.65)
Figure 7.3: Heatmap of significance of (cor)relations between PCA scores and sample covariates. Significance computed using Kruskal-Wallis for categorical variables and the cor.test based on Spearman's correlation for continuous variables. P-values were FDR-corrected.
PC_correlates_FDR %>%
dplyr::filter(covariate == "Sex") %>%
dplyr::arrange(FDR)
plot_PCA <- function(data, intgroup, ntop, colour, shape){
colour <- enquo(colour)
shape <- enquo(shape)
pcaData <- plotPCA(data, intgroup= intgroup,
ntop = ntop, # number of top genes to use for principal components [default: 500, but when used PC1 only accounted for 10%]
returnData = TRUE)
pcVar <- round(100 * attr(pcaData, "percentVar"))
plot <- ggplot(pcaData,
aes(x = PC1,
y = PC2,
colour = !!colour,
shape = !!shape,
label = name)) +
geom_point(size = 2) +
geom_label_repel(data = subset(pcaData, name == "PD416"),
min.segment.length = unit(0, 'lines'),
nudge_x = 5,
nudge_y = -5,
size = 2) +
labs(x = str_c("PC1: ", pcVar[1], "%"),
y = str_c("PC2: ", pcVar[2], "%")) +
theme_rhr
list <- list(plot, pcaData)
names(list) <- c("plot", "pcaData")
return(list)
}
pca <- lapply(c(100, 500, 5000, 10000),
plot_PCA, data = vsd, intgroup = c("RIN", "Sex"),
colour = RIN, shape = Sex)
names(pca) <- c("top_100", "top_500", "top_5000", "top_10000")
ggarrange(plotlist = list(pca[[1]]$plot, pca[[2]]$plot, pca[[3]]$plot, pca[[4]]$plot),
common.legend = TRUE,
legend = "right",
labels = c("a", "b", "c", "d"))
Figure 7.4: PCA plots using the top a) 100, b) 500, c) 5000 or d) 10000 variable genes (selected by highest row variance). Points are coloured by RIN, and their shape corresponds to sex. Highlighted sample PD416, which in (a) and (b) does not appear to cluster by sex.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] vsn_3.54.0 viridis_0.5.1
## [5] viridisLite_0.3.0 rjson_0.2.20
## [7] readxl_1.3.1 rtracklayer_1.46.0
## [9] pcaExplorer_2.12.0 forcats_0.5.1
## [11] stringr_1.4.0 dplyr_1.0.2
## [13] purrr_0.3.4 readr_1.4.0
## [15] tidyr_1.1.1 tibble_3.0.3
## [17] tidyverse_1.3.0 pheatmap_1.0.12
## [19] ggpubr_0.4.0 ggrepel_0.8.2
## [21] ggplot2_3.3.2 gplots_3.0.4
## [23] DT_0.15 devtools_2.3.2
## [25] usethis_1.6.3 DESeq2_1.26.0
## [27] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [29] BiocParallel_1.20.1 matrixStats_0.56.0
## [31] Biobase_2.46.0 GenomicRanges_1.38.0
## [33] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [35] S4Vectors_0.24.4 BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.1 SparseM_1.78 AnnotationForge_1.28.0
## [4] pkgmaker_0.31.1 bit64_4.0.2 knitr_1.29
## [7] data.table_1.13.0 rpart_4.1-15 RCurl_1.98-1.2
## [10] doParallel_1.0.15 generics_0.0.2 preprocessCore_1.48.0
## [13] callr_3.5.1 cowplot_1.0.0 RSQLite_2.2.0
## [16] bit_4.0.4 xml2_1.3.2 lubridate_1.7.9
## [19] httpuv_1.5.4 assertthat_0.2.1 xfun_0.16
## [22] hms_1.0.0 evaluate_0.14 promises_1.1.1
## [25] progress_1.2.2 caTools_1.18.0 dbplyr_1.4.4
## [28] Rgraphviz_2.30.0 igraph_1.2.5 DBI_1.1.1
## [31] geneplotter_1.64.0 htmlwidgets_1.5.3 ellipsis_0.3.1
## [34] crosstalk_1.1.0.1 backports_1.1.8 bookdown_0.21
## [37] annotate_1.64.0 gridBase_0.4-7 biomaRt_2.42.1
## [40] vctrs_0.3.2 here_1.0.0 remotes_2.2.0
## [43] abind_1.4-5 cachem_1.0.3 withr_2.2.0
## [46] checkmate_2.0.0 GenomicAlignments_1.22.1 prettyunits_1.1.1
## [49] cluster_2.1.0 crayon_1.4.1 genefilter_1.68.0
## [52] labeling_0.4.2 pkgconfig_2.0.3 pkgload_1.1.0
## [55] nnet_7.3-12 rlang_0.4.7 lifecycle_0.2.0
## [58] registry_0.5-1 affyio_1.56.0 BiocFileCache_1.10.2
## [61] GOstats_2.52.0 modelr_0.1.8 cellranger_1.1.0
## [64] rprojroot_2.0.2 graph_1.64.0 rngtools_1.5
## [67] Matrix_1.2-17 carData_3.0-4 reprex_1.0.0
## [70] base64enc_0.1-3 processx_3.4.5 png_0.1-7
## [73] bitops_1.0-6 shinydashboard_0.7.1 KernSmooth_2.23-15
## [76] Biostrings_2.54.0 blob_1.2.1 jpeg_0.1-8.1
## [79] rstatix_0.6.0 shinyAce_0.4.1 ggsignif_0.6.0
## [82] scales_1.1.1 hexbin_1.28.1 memoise_2.0.0
## [85] GSEABase_1.48.0 magrittr_2.0.1 plyr_1.8.6
## [88] bibtex_0.4.3 gdata_2.18.0 zlibbioc_1.32.0
## [91] threejs_0.3.3 compiler_3.6.1 RColorBrewer_1.1-2
## [94] d3heatmap_0.6.1.2 Rsamtools_2.2.3 cli_2.2.0.9000
## [97] affy_1.64.0 XVector_0.26.0 Category_2.52.1
## [100] ps_1.3.4 htmlTable_2.0.1 Formula_1.2-4
## [103] MASS_7.3-51.4 tidyselect_1.1.0 stringi_1.5.3
## [106] shinyBS_0.61 highr_0.8 yaml_2.2.1
## [109] askpass_1.1 locfit_1.5-9.4 latticeExtra_0.6-29
## [112] grid_3.6.1 tools_3.6.1 rio_0.5.16
## [115] rstudioapi_0.13 foreach_1.5.0 foreign_0.8-72
## [118] gridExtra_2.3 farver_2.0.3 HGNChelper_0.8.1
## [121] digest_0.6.27 BiocManager_1.30.10 shiny_1.5.0
## [124] Rcpp_1.0.5 car_3.0-9 broom_0.7.0
## [127] later_1.1.0.1 httr_1.4.2 ggdendro_0.1.21
## [130] AnnotationDbi_1.48.0 colorspace_2.0-0 rvest_0.3.6
## [133] XML_3.99-0.3 fs_1.5.0 topGO_2.38.1
## [136] splines_3.6.1 RBGL_1.62.1 sessioninfo_1.1.1
## [139] xtable_1.8-4 jsonlite_1.7.1 testthat_2.3.2
## [142] R6_2.5.0 Hmisc_4.4-1 pillar_1.4.6
## [145] htmltools_0.5.1.1 mime_0.9 NMF_0.23.0
## [148] glue_1.4.2 fastmap_1.0.1 codetools_0.2-16
## [151] pkgbuild_1.1.0 lattice_0.20-38 curl_4.3
## [154] gtools_3.8.2 zip_2.1.0 GO.db_3.10.0
## [157] openxlsx_4.2.3 openssl_1.4.2 survival_3.2-7
## [160] limma_3.42.2 rmarkdown_2.5 desc_1.2.0
## [163] munsell_0.5.0 GenomeInfoDbData_1.2.2 iterators_1.0.12
## [166] haven_2.3.1 reshape2_1.4.4 gtable_0.3.0