Aim: perform differential gene expression accounting for (i) AoD, RIN, and Sex and (ii) PC axes 1-4, which correlated with AoD, RIN, Sex and the proportions of several cell types. Perform pathway enrichment analyses on results from both correction methods to determine the effect of correcting for cell-type proportion on pathways highlighted.
source(here::here("R", "file_paths.R"))
sample_info <-
read_excel(path =
file.path(
path_to_raw_data,
"sample_details/20201229_MasterFile_SampleInfo.xlsx"
),
sheet = "SampleInfo",
skip = 1) %>%
dplyr::filter(Sample_Type == "Tissue section") %>%
dplyr::select(
CaseNo, Disease_Group, Sex, AoO,
AoD, DD, PMI, aSN, TAU, 'thal AB', 'aCG aSN score',
Genetics, RINe_bulkRNA_Tapestation
) %>%
dplyr::rename(
sample_id = CaseNo,
RIN = RINe_bulkRNA_Tapestation
) %>%
dplyr::mutate(
Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")),
aSN = factor(aSN, levels = c(0,1,2,3,4,5,6)),
TAU = factor(TAU, levels = c(0,1,2,3,4,5,6)),
`thal AB` = factor(`thal AB`, levels = c(0,1,2,3)),
`aCG aSN score` = factor(`aCG aSN score`, levels = c(0,1,2,3))
) %>%
dplyr::inner_join(
read_delim(
file.path(
path_to_results,
"deconvolution/scaden/2020Feb/scaden_summary_results.txt"
),
delim = "\t") %>%
dplyr::mutate(
Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")),
Celltype = str_c(Celltype, "_proportions")
) %>%
dplyr::filter(cells == 1000 & samples == 1000) %>%
dplyr::select(-source, -cells, -samples, -Disease_Group, -Celltype_class) %>%
tidyr::spread(key = Celltype, value = cell_type_proportion)
)
# Create dataframe of file paths and sample names
file_df <- tibble(file_paths = list.files(path = file.path(path_to_bulk_seq_data, "salmon_quant"),
recursive = T,
pattern = "quant.sf",full.names = T),
sample_name = list.files(path = file.path(path_to_bulk_seq_data, "salmon_quant"),
recursive = T,
pattern = "quant.sf",full.names = T) %>%
str_replace("/quant.sf", "") %>%
str_replace("/.*/", "") %>%
str_replace("_.*", "")) %>%
dplyr::filter(!sample_name == "Undetermined")
files <- file_df$file_paths
names(files) <- file_df$sample_name
# # Performed only once. Can load from saved location.
# txdb <- makeTxDbFromEnsembl(organism="Homo sapiens",
# release=97,
# server="ensembldb.ensembl.org")
# saveDb(txdb,
# file = "/data/references/ensembl/txdb_sqlite/v97/Homo_sapiens.GRCh38.97.sqlite")
txdb <- loadDb("/data/references/ensembl/txdb_sqlite/v97/Homo_sapiens.GRCh38.97.sqlite")
tx2gene <- ensembldb::select(txdb,
keys = keys(txdb, keytype = "TXNAME"),
columns = c("GENEID", "TXNAME"),
keytype = "TXNAME")
# Loading in salmon data
txi <- tximport::tximport(files = files,
type = c("salmon"),
tx2gene = tx2gene,
ignoreTxVersion = TRUE, # splits tx id on the "." character to remove version info for easier matching with tx id in tx2gene
dropInfReps = TRUE)
# Check sample names match in metadata and counts file and are also in the correct order
all(colnames(txi$abundance) %in% c(sample_info %>%
dplyr::filter(sample_id %in% colnames(txi$abundance)) %>%
.[["sample_id"]]))
all(colnames(txi$abundance) == c(sample_info %>%
dplyr::filter(sample_id %in% colnames(txi$abundance)) %>%
.[["sample_id"]]))
# Create a DESeqDataSet (dds) object
# Important that rows in the dataframe, 'sample_info', are in the same order as the columns in the count matrix, 'txi'.
# Include RIN and AoD in design due to post-quant QC -- found to be significantly correlated with gene expression
dds <- DESeq2::DESeqDataSetFromTximport(txi = txi,
colData = sample_info %>%
dplyr::filter(sample_id %in% colnames(txi$abundance)) %>%
dplyr::select(-Genetics),
design = ~ Sex + RIN + AoD + Disease_Group)
# Remove blacklist regions from dds
# Load granges with blacklist regions that overlap with ensembl v97 elements and extract gene IDs
blacklist_genes <- readRDS(
file.path(path_to_raw_data, "misc/homo_sapiens_v97_BR.rds")
) %>%
as_tibble() %>%
.[["gene_id"]] %>%
unique()
dds <- subset(dds, !rownames(dds) %in% blacklist_genes)
saveRDS(
dds,
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD.Rds"
)
)
# Raw counts used by DESeq2, but can be useful to have TPM values for visualisation, etc.
# Remove blacklisted genes
tpm <- subset(txi$abundance, !rownames(txi$abundance) %in% blacklist_genes)
saveRDS(
tpm,
file=
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_TPM.Rds")
)
Design of differential expression analysis will be:
## ~Sex + RIN + AoD + Disease_Group
# Perform differential expression - corrects for library size
# No need to correct for gene length at this point, as we are comparing the same gene across sample groups
dds <-
DESeq2::DESeq(dds, test = "Wald")
saveRDS(
dds,
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD_DEG.Rds"
)
)
DESeq2::DESeq(dds, test = "Wald")
-- this can be caused by issues with GLM code that can have trouble converging when there is a single row with many 0's or genes with very small counts and little power. A way of resolving this is by filtering prior to differential expression analyses and increasing the maximum iterations used in the Wald test for convergence of the coefficient vector.filter_count <- DEGreport::degFilter(counts = counts(dds),
metadata = as.data.frame(colData(dds)),
group = "Disease_Group",
min = 1, # All samples in group must have more than expr > 0
minreads = 0)
cat("Genes in final count matrix: ", nrow(filter_count))
## Genes in final count matrix: 28692
filter_dds <- dds[rownames(filter_count),]
# Exactly the same as using DESeq2::DESeq(filter_dds, test = "Wald"), with addition of iterations run in Wald test.
# Iterations usually set to 100
filter_dds <- estimateSizeFactors(filter_dds)
filter_dds <- estimateDispersions(filter_dds)
filter_dds <- nbinomWaldTest(filter_dds, maxit=1000)
saveRDS(
filter_dds,
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_SexRINAoD_DEG.Rds"
)
)
dds <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD_DEG.Rds"
)
)
filter_dds <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_SexRINAoD_DEG.Rds"
)
)
genes_detected <-
dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = rownames(assay(dds))) %>%
gather(key = sample_id, value = counts, -gene_id) %>%
dplyr::filter(counts > 0) %>%
dplyr::inner_join(colData(dds) %>%
as_tibble(), by = "sample_id") %>%
dplyr::distinct(Disease_Group, gene_id) %>%
dplyr::group_by(Disease_Group) %>%
dplyr::summarise(n_genes_detected = n()) %>%
dplyr::inner_join( filter_dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = rownames(assay(filter_dds))) %>%
gather(key = sample_id, value = counts, -gene_id) %>%
dplyr::filter(counts > 0) %>%
dplyr::inner_join(colData(filter_dds) %>%
as_tibble(), by = "sample_id") %>%
dplyr::distinct(Disease_Group, gene_id) %>%
dplyr::group_by(Disease_Group) %>%
dplyr::summarise(n_genes_detected_after_filtering = n()))
genes_detected
DESeq2::plotDispEsts(dds)
comparison[1]_vs_comparison[2]
, where comparison[1]
is the baseline/reference value and comparison[2]
is in reference to the baseline value i.e. you will get log2FC(comparison[2]/comparison[1])# Collect all results and save as RDS object
# Requires looping across and extracting for each comparison
extract_results_dds <- function(dds, sample_info, group_column_name){
# Create vector of groups
groups <- sample_info %>%
.[[group_column_name]] %>%
unique() %>%
sort() %>%
as.character()
# Create dataframe of comparisons
comparisons <-
combn(x = groups,
m = 2) %>%
t() %>%
as_tibble()
# Loop through comparisons and for each extract results
for(i in 1:nrow(comparisons)){
# Filter for comparisons
comparison <- comparisons[i,] %>%
purrr::as_vector()
# Contrast in this order will result in logFC(comparison[2]/comparison[1])
contrast <- c(group_column_name, comparison[2], comparison[1])
print(contrast)
# Unshrunken and shrunk results
res_unshrunken <- results(dds, contrast = contrast, alpha = 0.05)
res <- lfcShrink(dds, contrast = contrast, res = res_unshrunken, type = "normal")
# All genes in tibble format
all_genes <- res_unshrunken %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble() %>%
inner_join(res %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble() %>%
dplyr::rename(log2FoldChange_shrunk = log2FoldChange,
lfcSE_shrunk = lfcSE,
pvalue_shrunk = pvalue,
padj_shrunk = padj)) %>%
dplyr::select(gene, baseMean, log2FoldChange, lfcSE, pvalue, padj, everything(), -stat)
# Extract genes with FDR < 0.05
significant_genes <- all_genes %>%
dplyr::filter(padj < 0.05)
results <- list(res_unshrunken, res, all_genes, significant_genes)
names(results) <- c("dds_results_unshrunken", "dds_results_lfc", "tibble_all_genes", "tibble_significant_genes")
if(i == 1){
all_results <- list(results)
list_name <- str_c(comparison[1], "_vs_", comparison[2])
} else {
all_results <- c(all_results, list(results))
list_name <- c(list_name, str_c(comparison[1], "_vs_", comparison[2]))
}
}
names(all_results) <- list_name
return(all_results)
}
results <- extract_results_dds(dds = filter_dds, sample_info = sample_info, group_column_name = "Disease_Group")
# Save results as .Rds
saveRDS(
results,
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DEG_lists_design_SexRINAoD.Rds"
)
)
source(here::here("R", "biomart_df.R"))
results <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DEG_lists_design_SexRINAoD.Rds"
)
)
significant_genes <- results %>%
lapply(., function(x){
x$tibble_significant_genes %>%
dplyr::mutate(direction_of_effect = ifelse(log2FoldChange < 0, "downregulated", "upregulated"))
}) %>%
qdapTools::list_df2df(., col = "Comparison") %>%
biomart_df(columnToFilter = "gene",
mart = 38,
ensembl_version = "jul2019.archive.ensembl.org",
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filter = "ensembl_gene_id") %>%
dplyr::select(Comparison, direction_of_effect, gene, hgnc_symbol, everything())
## [1] "Number of unique genes to search: 700"
## [1] "Number of matches found:700"
covar_count <- significant_genes %>%
dplyr::group_by(Comparison, direction_of_effect) %>%
summarise(n = n()) %>%
spread(key = direction_of_effect, value = n)
covar_count
genes_list <-
results %>%
lapply(., function(x){
x$tibble_significant_genes %>% .[["gene"]] %>% unique()
})
upset(fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")
# all tested genes as background
all_genes <- results$Control_vs_DLB$tibble_all_genes$gene
# Split significant genes by up and downregulated genes
gprofiler_list <- setNames(significant_genes %>%
# arrange by adjusted p-value
dplyr::arrange(Comparison, direction_of_effect, padj) %>%
dplyr::group_by(Comparison, direction_of_effect) %>%
dplyr::group_split(),
significant_genes %>%
dplyr::arrange(Comparison, direction_of_effect, padj) %>%
tidyr::unite(col = "list_name", c("Comparison", "direction_of_effect"), sep = ":", remove = FALSE) %>%
.[["list_name"]] %>%
unique())
# Run once as ordered query
gprofiler_results_list <- lapply(gprofiler_list, function(x){
x %>%
.[["gene"]] %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
ordered_query = TRUE,
evcodes = TRUE)})
saveRDS(
gprofiler_results_list,
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/gprofiler/PD_tissue_polyA_DEG_design_SexRINAoD_gprofiler.Rds"
)
)
ordered_query = TRUE
, the hypergeometric testing is performed for each possible prefix of the list starting from the first gene and adding next genes one by one. The smallest enrichment P-value is reported for each of the terms along with corresponding gene list length. For different terms this length can vary, especially as the broader terms can be enriched for larger lists only.gprofiler_df_top5 <-
gprofiler_df %>%
dplyr::filter(term_size <= 2000, term_size >= 20) %>%
dplyr::group_by(comparison, direction_of_effect) %>%
dplyr::top_n(n = 5, wt = -p_value) %>%
dplyr::ungroup() %>%
dplyr::mutate(
comparison =
fct_relevel(
comparison,
c("Control_vs_PD", "Control_vs_PDD", "Control_vs_DLB",
"PD_vs_DLB", "PDD_vs_DLB", "PD_vs_PDD")
)
)
gprofiler_df_top5 %>%
ggplot(aes(
x = comparison,
y = fct_relevel(
term_name,
gprofiler_df_top5 %>%
dplyr::arrange(desc(comparison), direction_of_effect, desc(term_name)) %>%
.[["term_name"]] %>%
unique()
),
size = precision,
colour = -log10(p_value)
)) +
geom_point() +
geom_point(shape = 1, colour = "black") +
labs(x = "Comparison", y = "") +
facet_grid(rows = vars(direction_of_effect), scales = "free_y", space = "free_y") +
scale_colour_viridis_c(name = "-log10(p-value)", option = "cividis") +
theme_rhr +
theme(
legend.position = "right",
legend.key.size = unit(0.35, "cm")
)
# Read in PD genes
PD_genes <-
setNames(vector(mode = "list", length = 2),
c("mendelian", "sporadic"))
PD_genes$mendelian <-
read_delim(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/20191128_GE_ParkinsonDiseaseComplexParkinsonism_greengenes.tsv"
),
delim = "\t"
) %>%
dplyr::rename(Gene = `Gene Symbol`)
PD_genes$sporadic <-
read_excel(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/TableS6_Complete_summary_statistics_for_QTL_Mendelian_randomization.xlsx"
)
) %>%
dplyr::filter(`Pass Bonferroni` == "pass")
# Note that the following genes in the mendelian list are also present within the sporadic MR list
PD_genes$mendelian$Gene[PD_genes$mendelian$Gene %in% PD_genes$sporadic$Gene]
## [1] "GCH1" "GRN" "LRRK2" "MAPT" "SNCA"
# Differentially expressed genes overlapping PD genes
PD_genes %>%
lapply(., function(x){
significant_genes %>%
dplyr::filter(hgnc_symbol %in% x$Gene) %>%
.[["hgnc_symbol"]] %>%
unique() %>%
sort()
})
## $mendelian
## [1] "GBA" "PINK1"
##
## $sporadic
## [1] "CCAR2" "CUEDC2" "FCGR2A" "GPR65" "HLA-DQB1" "ITGA8" "MMRN1"
significant_genes %>%
dplyr::filter(hgnc_symbol %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>%
dplyr::arrange(hgnc_symbol) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
# Using likelihood ratio test (i.e. goodness of fit)
# Number of DEgenes much higher than with Wald test
# But this also because we are analysing all levels of a factor at once, similar to ANOVA.
# P-values determined by difference in deviance between full and reduced model (not log2 fold changes)
# See: https://hbctraining.github.io/DGE_workshop/lessons/08_DGE_LRT.html
dds_LRT <- DESeq(filter_dds, test = "LRT", reduced = ~Sex + RIN + AoD)
saveRDS(
dds_LRT,
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_SexRINAoD_LRT.Rds"
)
)
dds_LRT <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_SexRINAoD_LRT.Rds"
)
)
# Design formula
design(dds_LRT)
## ~Sex + RIN + AoD + Disease_Group
res_LRT <- results(dds_LRT)
sig_res_LRT <- res_LRT %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble() %>%
dplyr::filter(padj < 0.05)
# Proportion of genes that overlap between pairwise comparisons and LRT
(c(significant_genes$gene %in% sig_res_LRT$gene)[c(significant_genes$gene %in% sig_res_LRT$gene) == TRUE] %>% length())/nrow(significant_genes)
## [1] 0.4525641
degPatterns
function from the 'DEGreport' package to determine sets of genes that exhibit similar expression patterns across sample groups. The degPatterns
tool uses a hierarchical clustering approach based on pair-wise correlations, then cuts the hierarchical tree to generate groups of genes with similar expression profiles. The tool cuts the tree in a way to optimize the diversity of the clusters, such that the variability inter-cluster > the variability intra-cluster.# Identify gene clusters exhibiting particular patterns across samples, using the `degPatterns` function from the 'DEGreport' package to show gene clusters across disease groups
vsd <- DESeq2::vst(dds_LRT, blind = FALSE)
vsd_mat <- assay(vsd)
# Filter only for significant genes
cluster_rlog <- vsd_mat[sig_res_LRT$gene,]
clusters <- DEGreport::degPatterns(ma = cluster_rlog, metadata = colData(dds_LRT), time = "Disease_Group", col=NULL, minc = 14)
# saveRDS(clusters, file.path(path_to_results, "Salmon_quant/bulk_tissue/DEGreport_LRT_SexRINAoD_clusters.Rds"))
## [1] "Number of unique genes to search: 425"
## [1] "Number of matches found:425"
all_genes <- results$Control_vs_DLB$tibble_all_genes$gene
gprofiler_clusters <- setNames(clusters_df %>%
dplyr::group_split(cluster),
clusters_df %>%
dplyr::mutate(cluster = str_c("cluster_", cluster)) %>%
dplyr::arrange(cluster) %>%
.[["cluster"]] %>%
unique()
) %>%
lapply(., function(x){x %>% .[["genes"]] %>% unique()})
gprofiler_clusters_results <- gprofiler_clusters %>%
lapply(., function(x){
y <- x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"))
y$result %>%
as_tibble()
})
gprofiler_clusters_results %>%
qdapTools::list_df2df(col = "list_name") %>%
dplyr::select(list_name, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000) %>%
dplyr::arrange(list_name, source, p_value) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
gprofiler_clusters_results_top10 <-
gprofiler_clusters_results %>%
qdapTools::list_df2df(col1 = "cluster") %>%
dplyr::filter(term_size <= 2000, term_size >= 20) %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n = 10, wt = -p_value)
gprofiler_clusters_results_top10 %>%
ggplot(aes(
x = cluster,
y = fct_relevel(
term_name,
gprofiler_clusters_results_top10 %>%
dplyr::arrange(desc(cluster), desc(term_name)) %>%
.[["term_name"]]
),
size = precision,
colour = -log10(p_value)
)) +
geom_point() +
geom_point(shape = 1, colour = "black") +
labs(x = "Cluster", y = "") +
scale_colour_viridis_c(name = "-log10(p-value)", option = "cividis") +
theme_rhr +
theme(
legend.position = "right",
legend.key.size = unit(0.35, "cm")
)
# Transform data and perform PCA to extract relevant PC axes
vsd <- vst(object = filter_dds,
# not blind to experimental design, as recommended in https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#blind-dispersion-estimation
blind = FALSE
)
pca_vsd <- prcomp(t(assay(vsd)))
# Extract coordinates of individuals on principle components 1-4
PC_axes <- pca_vsd$x[, str_c("PC", 1:4)] %>%
as.data.frame()
# Check that rows in PC axes match order of rows in colData
all(rownames(colData(filter_dds)) == rownames(PC_axes)) # TRUE
# Include PC axes in filter_dds colData & design
colData(filter_dds) <- cbind(colData(filter_dds), PC_axes)
design(filter_dds) <- ~ PC1 + PC2 + PC3 + PC4 + Disease_Group
# Differential expression
filter_dds <- estimateSizeFactors(filter_dds)
filter_dds <- estimateDispersions(filter_dds)
filter_dds <- nbinomWaldTest(filter_dds, maxit=1000)
saveRDS(
filter_dds,
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_DEG.Rds"
)
)
## ~PC1 + PC2 + PC3 + PC4 + Disease_Group
comparison[1]_vs_comparison[2]
, where comparison[1]
is the baseline/reference value and comparison[2]
is in reference to the baseline value i.e. you will get log2FC(comparison[2]/comparison[1])results <- extract_results_dds(dds = filter_dds, sample_info = sample_info, group_column_name = "Disease_Group")
# Save results as .Rds
saveRDS(
results,
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DEG_lists_design_PCaxes.Rds"
)
)
results <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DEG_lists_design_PCaxes.Rds"
)
)
significant_genes <- results %>%
lapply(., function(x){
x$tibble_significant_genes %>%
dplyr::mutate(direction_of_effect = ifelse(log2FoldChange < 0, "downregulated", "upregulated"))
}) %>%
qdapTools::list_df2df(., col = "Comparison") %>%
biomart_df(columnToFilter = "gene",
mart = 38,
ensembl_version = "jul2019.archive.ensembl.org",
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filter = "ensembl_gene_id") %>%
dplyr::select(Comparison, direction_of_effect, gene, hgnc_symbol, everything())
## [1] "Number of unique genes to search: 54"
## [1] "Number of matches found:54"
PC_count <- significant_genes %>%
dplyr::group_by(Comparison, direction_of_effect) %>%
summarise(n = n()) %>%
spread(key = direction_of_effect, value = n)
PC_count
significant_genes %>%
dplyr::arrange(Comparison) %>%
dplyr::select(Comparison, direction_of_effect, gene, hgnc_symbol, log2FoldChange_shrunk, lfcSE_shrunk, padj) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
# all tested genes as background
all_genes <- results$Control_vs_DLB$tibble_all_genes$gene
# Run once
gprofiler_results_list <- lapply(genes_list, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
ordered_query = TRUE,
evcodes = TRUE)})
saveRDS(
gprofiler_results_list,
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/gprofiler/PD_tissue_polyA_DEG_design_PCaxes_gprofiler.Rds"
)
)
# Differentially expressed genes overlapping PD genes
PD_genes %>%
lapply(., function(x){
significant_genes %>%
dplyr::filter(hgnc_symbol %in% x$Gene) %>%
.[["hgnc_symbol"]] %>%
unique() %>%
unlist() %>%
sort()
})
## $mendelian
## character(0)
##
## $sporadic
## character(0)
degPatterns
function from the 'DEGreport' package to determine sets of genes that exhibit similar expression patterns across sample groups.# Using likelihood ratio test (i.e. goodness of fit)
# Number of DEgenes much higher than with Wald test
# But this also because we are analysing all levels of a factor at once, similar to ANOVA.
# P-values determined by difference in deviance between full and reduced model (not log2 fold changes)
# See: https://hbctraining.github.io/DGE_workshop/lessons/08_DGE_LRT.html
dds_LRT <- DESeq(filter_dds, test = "LRT", reduced = ~PC1 + PC2 + PC3 + PC4)
saveRDS(
dds_LRT,
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_LRT.Rds"
)
)
dds_LRT <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_LRT.Rds"
)
)
# Design formula
design(dds_LRT)
## ~PC1 + PC2 + PC3 + PC4 + Disease_Group
res_LRT <- results(dds_LRT)
sig_res_LRT <- res_LRT %>%
as_tibble(rownames = "gene") %>%
dplyr::filter(padj < 0.05)
# Identify gene clusters exhibiting particular patterns across samples, using the `degPatterns` function from the 'DEGreport' package to show gene clusters across disease groups
vsd <- DESeq2::vst(dds_LRT, blind = FALSE)
vsd_mat <- assay(vsd)
# Filter only for significant genes
cluster_rlog <- vsd_mat[sig_res_LRT$gene,]
clusters <- DEGreport::degPatterns(cluster_rlog, metadata = colData(dds_LRT), time = "Disease_Group", col=NULL, minc = 14)
# saveRDS(clusters, file.path(path_to_results, "Salmon_quant/bulk_tissue/DEGreport_LRT_PCaxes_clusters.Rds"))
## [1] "Number of unique genes to search: 35"
## [1] "Number of matches found:35"
gprofiler_cluster <- clusters_df %>%
.[["genes"]] %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"))
gprofiler_cluster$result
## NULL
Correcting by PC axes 1-4 greatly reduces the number of differentially expressed genes, and likewise pathways, suggesting that many of the differential genes and pathways identified when only corrected by AoD, Sex, RIN relate to the changes observed in cell-type proportions.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] UpSetR_1.4.0 qdapTools_1.3.5
## [3] readxl_1.3.1 tximport_1.14.2
## [5] forcats_0.5.1 stringr_1.4.0
## [7] dplyr_1.0.2 purrr_0.3.4
## [9] readr_1.4.0 tidyr_1.1.1
## [11] tibble_3.0.3 ggplot2_3.3.2
## [13] tidyverse_1.3.0 ensembldb_2.10.2
## [15] AnnotationFilter_1.10.0 GenomicFeatures_1.38.2
## [17] AnnotationDbi_1.48.0 gprofiler2_0.1.9
## [19] DT_0.15 DEGreport_1.22.0
## [21] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [23] DelayedArray_0.12.3 BiocParallel_1.20.1
## [25] matrixStats_0.56.0 Biobase_2.46.0
## [27] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [29] IRanges_2.20.2 S4Vectors_0.24.4
## [31] BiocGenerics_0.32.0 clusterProfiler_3.14.3
## [33] biomaRt_2.42.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 RSQLite_2.2.0
## [3] htmlwidgets_1.5.3 grid_3.6.1
## [5] munsell_0.5.0 codetools_0.2-16
## [7] chron_2.3-56 withr_2.2.0
## [9] colorspace_2.0-0 GOSemSim_2.17.1
## [11] highr_0.8 knitr_1.29
## [13] rstudioapi_0.13 DOSE_3.12.0
## [15] labeling_0.4.2 lasso2_1.2-20
## [17] urltools_1.7.3 GenomeInfoDbData_1.2.2
## [19] mnormt_2.0.2 polyclip_1.10-0
## [21] bit64_4.0.2 farver_2.0.3
## [23] rprojroot_2.0.2 vctrs_0.3.2
## [25] generics_0.0.2 xfun_0.16
## [27] BiocFileCache_1.10.2 R6_2.5.0
## [29] doParallel_1.0.15 clue_0.3-57
## [31] graphlayouts_0.7.0 locfit_1.5-9.4
## [33] bitops_1.0-6 cachem_1.0.3
## [35] reshape_0.8.8 fgsea_1.12.0
## [37] gridGraphics_0.5-0 assertthat_0.2.1
## [39] scales_1.1.1 ggraph_2.0.3
## [41] nnet_7.3-12 enrichplot_1.6.1
## [43] gtable_0.3.0 Cairo_1.5-12.2
## [45] tidygraph_1.2.0 rlang_0.4.7
## [47] genefilter_1.68.0 GlobalOptions_0.1.2
## [49] splines_3.6.1 rtracklayer_1.46.0
## [51] lazyeval_0.2.2 broom_0.7.0
## [53] europepmc_0.4 checkmate_2.0.0
## [55] modelr_0.1.8 BiocManager_1.30.10
## [57] yaml_2.2.1 reshape2_1.4.4
## [59] crosstalk_1.1.0.1 backports_1.1.8
## [61] qvalue_2.18.0 Hmisc_4.4-1
## [63] tools_3.6.1 bookdown_0.21
## [65] psych_2.0.7 logging_0.10-108
## [67] ggplotify_0.0.5 ellipsis_0.3.1
## [69] RColorBrewer_1.1-2 ggdendro_0.1.21
## [71] ggridges_0.5.2 Rcpp_1.0.5
## [73] plyr_1.8.6 base64enc_0.1-3
## [75] progress_1.2.2 zlibbioc_1.32.0
## [77] RCurl_1.98-1.2 prettyunits_1.1.1
## [79] rpart_4.1-15 openssl_1.4.2
## [81] GetoptLong_1.0.2 viridis_0.5.1
## [83] cowplot_1.0.0 haven_2.3.1
## [85] ggrepel_0.8.2 cluster_2.1.0
## [87] here_1.0.0 fs_1.5.0
## [89] magrittr_2.0.1 data.table_1.13.0
## [91] DO.db_2.9 circlize_0.4.10
## [93] reprex_2.0.0 triebeard_0.3.0
## [95] tmvnsim_1.0-2 ProtGenerics_1.18.0
## [97] hms_1.0.0 evaluate_0.14
## [99] xtable_1.8-4 XML_3.99-0.3
## [101] jpeg_0.1-8.1 gridExtra_2.3
## [103] shape_1.4.4 compiler_3.6.1
## [105] crayon_1.4.1 htmltools_0.5.1.1
## [107] mgcv_1.8-29 Formula_1.2-4
## [109] geneplotter_1.64.0 lubridate_1.7.9
## [111] DBI_1.1.1 tweenr_1.0.1
## [113] dbplyr_1.4.4 ComplexHeatmap_2.7.7
## [115] MASS_7.3-51.4 rappdirs_0.3.1
## [117] Matrix_1.2-17 cli_2.2.0.9000
## [119] igraph_1.2.5 pkgconfig_2.0.3
## [121] rvcheck_0.1.8 GenomicAlignments_1.22.1
## [123] foreign_0.8-72 plotly_4.9.2.1
## [125] xml2_1.3.2 foreach_1.5.0
## [127] annotate_1.64.0 XVector_0.26.0
## [129] rvest_0.3.6 digest_0.6.27
## [131] ConsensusClusterPlus_1.50.0 Biostrings_2.54.0
## [133] cellranger_1.1.0 rmarkdown_2.5
## [135] fastmatch_1.1-0 htmlTable_2.0.1
## [137] edgeR_3.28.1 curl_4.3
## [139] Rsamtools_2.2.3 rjson_0.2.20
## [141] lifecycle_0.2.0 nlme_3.1-141
## [143] jsonlite_1.7.1 viridisLite_0.3.0
## [145] askpass_1.1 limma_3.42.2
## [147] pillar_1.4.6 lattice_0.20-38
## [149] Nozzle.R1_1.1-1 fastmap_1.0.1
## [151] httr_1.4.2 survival_3.2-11
## [153] GO.db_3.10.0 glue_1.4.2
## [155] png_0.1-7 iterators_1.0.12
## [157] bit_4.0.4 ggforce_0.3.2
## [159] stringi_1.5.3 blob_1.2.1
## [161] latticeExtra_0.6-29 memoise_2.0.0