Aims: (i) to quantify intron excision ratios across samples; (ii) to perform differential splicing analyses across disease groups; (iii) to perform pathway enrichment analyses on groups of differentially spliced genes.
source(here::here("R", "file_paths.R"))
leafcutter_dir <- file.path(path_to_results, "leafcutter/")
leafviz_dir <- "/home/rreynolds/packages/leafcutter/leafviz/"
markergene_dir <- "/home/rreynolds/projects/MarkerGenes/"
PCaxes <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_DEG.Rds"
)
) %>%
colData() %>%
as_tibble() %>%
dplyr::select(sample_id, PC1, PC2, PC3, PC4)
sample_info <-
read_excel(
path =
file.path(
path_to_raw_data,
"sample_details/20201229_MasterFile_SampleInfo.xlsx"
),
sheet = "SampleInfo", skip = 1
) %>%
dplyr::filter(Sample_Type == "Tissue section" & sent_to_bulk_seq == "yes") %>%
dplyr::rename(sample_id = CaseNo,
RIN = RINe_bulkRNA_Tapestation) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD","DLB"))) %>%
dplyr::select(sample_id, Disease_Group, Sex, AoO, AoD, DD, PMI, aSN, TAU, 'thal AB', 'aCG aSN score', Genetics, RIN) %>%
dplyr::inner_join(PCaxes)
leafcutter
quantifies intron excision ratios building a splicing graph with all split reads that have a shared donor or acceptor site sitting within one clusterleafcutter
operates in 3 steps:
leafcutter
. These include:
RNAseqProcessing::convert_STAR_SJ_to_junc(sj_dir_path = file.path(path_to_bulk_seq_data, "STAR/"),
output_path = str_c(leafcutter_dir, "SJ_formatting/"),
filter_out_blacklist_regions = TRUE,
path_to_ENCODE_blacklist = file.path(path_to_raw_data, "misc/hg38-blacklist.v2.bed")
python /tools/leafcutter/clustering/leafcutter_cluster.py \
-j /home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/SJ_formatting/list_juncfiles.txt \
-r /home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/ \
-o tissue_polyA_test_diseasegroups \
-l 1000000 \
-m 30 \
-p 0.001 \
-s True
Rscript /tools/leafcutter/scripts/gtf_to_exons.R \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz
# Load per individual intron counts for column names
sample_names <- fread(
str_c(leafcutter_dir, "intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz")
) %>%
dplyr::select(-V1) %>%
colnames()
# Create master df of sample info, with grouping variable (Disease_Group) and confounders (RIN, Sex, AoD)
master <-
tibble(lc_sample_name = sample_names,
sample_name = sample_names %>%
str_replace("_.*", "")) %>%
inner_join(sample_info, by = c("sample_name" = "sample_id")) %>%
dplyr::select(lc_sample_name, Disease_Group, RIN, Sex, AoD) %>%
arrange(Disease_Group)
RNAseqProcessing::create_group_files_multi_pairwisecomp(df = master,
group_column_name = "Disease_Group",
output_path = str_c(leafcutter_dir, "diff_splicing/group_files/"))
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafcutter_ds.log&
# Load per individual intron counts for column names
sample_names <-
fread(str_c(leafcutter_dir, "intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz")) %>%
dplyr::select(-V1) %>%
colnames()
# Create master df of sample info, with grouping variable (Disease_Group) and confounders (PC axes 1-4)
master <-
tibble(lc_sample_name = sample_names,
sample_name = sample_names %>%
str_replace("_.*", "")) %>%
inner_join(sample_info, by = c("sample_name" = "sample_id")) %>%
dplyr::select(lc_sample_name, Disease_Group, PC1, PC2, PC3, PC4) %>%
arrange(Disease_Group)
RNAseqProcessing::create_group_files_multi_pairwisecomp(df = master,
group_column_name = "Disease_Group",
output_path = str_c(leafcutter_dir, "diff_splicing_PCaxes/group_files/"))
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafcutter_ds_PCaxes.log&
/tools/leafcutter/leafviz/gtf2leafcutter.pl \
-o /data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz
prepare_results.R
, which performs this formatting, albeit for only one pairwise comparison. To format the results of multiple pairwise comparisons requires looping across the various pairwise comparisons and running the prepare_results.R
for each individual pairwise comparison. This is what the leafviz_multi_pairwise.R
script used below does.leafviz_multi_pairwise.R
script assumes use of the leafcutter_ds_multi_pairwise.R
script and the create_group_files_multi_pairwisecomp()
function. This ensures that all files needed are named consistently. That is, the (i) _cluster_signficance.txt
, (ii) _effect_sizes.txt
and (iii) _group_file.txt
for an individual pairwise comparison all have the same prefix.nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafviz_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/ \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/ \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
--output_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/leafviz/ \
--group_file_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/group_files/ \
--FDR=0.05 \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafviz_prepare_results.log&
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafviz_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/ \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/ \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
--output_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/leafviz_PCaxes/ \
--group_file_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/group_files/ \
--FDR=0.05 \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafviz_PCaxes_prepare_results.log&
RNAseqProcessing::run_leafviz(leafviz_dir = leafviz_dir,
results_filename = file.path(leafcutter_dir, "leafviz_PCaxes/PDD_vs_DLB.Rda"))
Note on naming convention: results named comparison[1]_vs_comparison[2]
, where comparison[1]
is the baseline/reference value and comparison[2]
is in reference to the baseline value.
clusters <- list.files(file.path(leafcutter_dir, "diff_splicing/"), pattern = "cluster_", recursive = T, full.names = T)
intron <- list.files(file.path(leafcutter_dir, "diff_splicing/"), pattern = "effect_", recursive = T, full.names = T)
for(i in 1:length(clusters)){
comparison <- clusters[i] %>%
str_replace("/.*/", "") %>%
str_replace("_cluster_significance.txt", "")
lc <- read_delim(clusters[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
lc_all <- lc
} else{
lc_all <- lc_all %>%
bind_rows(lc)
}
}
for(i in 1:length(intron)){
comparison <- intron[i] %>%
str_replace("/.*/", "") %>%
str_replace("_effect_sizes.txt", "")
int <- read_delim(intron[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
int_all <- int
} else{
int_all <- int_all %>%
bind_rows(int)
}
}
# Filter for significant clusters
# Add direction of effect
lc_filtered <- lc_all %>%
# filter for successful tests and remove clusters overlapping multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>%
dplyr::inner_join(int_all %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>%
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change"))
# Save results
write_csv(lc_filtered,
path = file.path(leafcutter_dir, "diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.csv"))
saveRDS(setNames(list(lc_all, int_all, lc_filtered),
c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
file.path(leafcutter_dir, "diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.Rds"))
Figure 4.1: Histogram of intron numbers per cluster in various comparisons. Dashed red line indicates median intron number within each comparison. X-axis limited to show only clusters with up to 40 introns (max. within the significant differentially spliced clusters).
# Run once
all_genes <- results_covar$cluster_significance %>%
# remove clusters that overlap multiple genes
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
genes_list <- setNames(results_covar$significant_clusters_0.05_filter %>%
group_split(comparison),
results_covar$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>%
dplyr::distinct(genes, p.adjust, .keep_all = TRUE) %>%
dplyr::group_by(genes) %>%
dplyr::top_n(n = 1, wt = -p.adjust) %>%
# Order by p-value
dplyr::arrange(p.adjust) %>%
.[["genes"]]
})
upset(fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")
Figure 4.2: Overlap between pairwise comparisons. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.
gom.self <- newGOM(genes_list, genes_list, genome.size = length(all_genes))
All.jaccard <- getMatrix(gom.self, "Jaccard")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(All.jaccard, method="color", col=col(100), cl.lim = c(0,1),
type="lower",
# addCoef.col = "black", number.cex = 0.75, # Add coefficient of correlation
tl.col="black", tl.srt=45, tl.cex = 0.75, mar=c(0,0,1,0), #Text label color and rotation
p.mat = All.jaccard, sig.level = (0.05), insig = "p-value", number.cex = 0.75,
diag = TRUE)
Figure 4.3: Jaccard index between diffentially spliced genes identified in pairwise comparisons. Calculated by dividing the size of the intersection between two gene lists by the union of the two. Intersections with a Jaccard index > 0.05 (i.e. greater than 5% overlap) are highlighted within the plot, with Jaccard indices provided.
TRUE
for the statement upregulated > 0 & downregulated > 0
.results_covar$significant_clusters_0.05_filter %>%
dplyr::select(comparison, genes, direction_of_effect) %>%
dplyr::group_by(comparison, genes, direction_of_effect) %>%
dplyr::summarise(n = n()) %>%
tidyr::spread(key = direction_of_effect, value = n) %>%
# Create new column with ifelse determining whether number of splicing events that are both up and down > 0 for each gene
dplyr::mutate(up_and_down = ifelse(upregulated > 0 & downregulated > 0, TRUE, FALSE)) %>%
.[["up_and_down"]] %>%
# Determine wheter all logical values in up_and_down column = TRUE
all(na.rm = T)
## [1] TRUE
# Run once
gprofiler_results_list <- lapply(genes_list, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
saveRDS(gprofiler_results_list, file.path(leafcutter_dir, "gprofiler/gprofiler_leafcutter_ds_SexRINAoD.Rds"))
gprofiler_results_list <- readRDS(file.path(leafcutter_dir, "gprofiler/gprofiler_leafcutter_ds_SexRINAoD.Rds"))
# Added some filtering to remove broad terms
gprofiler_df <- gprofiler_results_list %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000) %>%
dplyr::arrange(comparison, source, p_value)
gprofiler_df %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n_enriched_pathways = n())
gprofiler_df %>%
dplyr::arrange(comparison, source, p_value) %>%
dplyr::select(-evidence_codes, -intersection) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
Note on naming convention: results named comparison[1]_vs_comparison[2]
, where comparison[1]
is the baseline/reference value and comparison[2]
is in reference to the baseline value.
clusters <- list.files(file.path(leafcutter_dir, "diff_splicing_PCaxes/"), pattern = "cluster_significance", recursive = T, full.names = T)
intron <- list.files(file.path(leafcutter_dir, "diff_splicing_PCaxes/"), pattern = "effect_", recursive = T, full.names = T)
for(i in 1:length(clusters)){
comparison <- clusters[i] %>%
str_replace("/.*/", "") %>%
str_replace("_cluster_significance.txt", "")
lc <- read_delim(clusters[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
lc_all <- lc
} else{
lc_all <- lc_all %>%
bind_rows(lc)
}
}
for(i in 1:length(intron)){
comparison <- intron[i] %>%
str_replace("/.*/", "") %>%
str_replace("_effect_sizes.txt", "")
int <- read_delim(intron[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
int_all <- int
} else{
int_all <- int_all %>%
bind_rows(int)
}
}
# Filter for significant clusters
# Add direction of effect
lc_filtered_PC <- lc_all %>%
# filter for successful tests and remove clusters overlapping multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>%
dplyr::inner_join(int_all %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>%
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change"))
# Save results
write_csv(lc_filtered_PC,
path = file.path(leafcutter_dir, "diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.csv"))
saveRDS(setNames(list(lc_all, int_all, lc_filtered_PC),
c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
file.path(leafcutter_dir, "diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.Rds"))
Figure 5.1: Histogram of intron numbers per cluster in various comparisons. Dashed red line indicates median intron number within each comparison. X-axis limited to show only clusters with up to 40 introns (max. within the significant differentially spliced clusters).
all_genes <- results_PC$cluster_significance %>%
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
# Extract differentially spliced genes
genes_list_PC <- setNames(results_PC$significant_clusters_0.05_filter %>%
group_split(comparison),
results_PC$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>%
dplyr::distinct(genes, p.adjust, .keep_all = TRUE) %>%
dplyr::group_by(genes) %>%
dplyr::top_n(n = 1, wt = -p.adjust) %>%
# Order by p-value
dplyr::arrange(p.adjust) %>%
.[["genes"]]
})
# Change names on genes_list_PC to denote correction method
names(genes_list_PC) <- str_c(names(genes_list_PC), "_PCaxes")
# Create master list with genes from both correction methods
master_list <- c(genes_list, genes_list_PC)
# Run once
gprofiler_results_list <- lapply(genes_list_PC, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcode = TRUE)})
saveRDS(gprofiler_results_list, file.path(leafcutter_dir, "gprofiler/gprofiler_leafcutter_ds_PCaxes.Rds"))
Figure 5.2: Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.
gprofiler_term_list <- c(setNames(gprofiler_df %>%
group_split(comparison),
gprofiler_df %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
lapply(., function(x){x %>% .[["term_id"]] %>% unique()}),
setNames(gprofiler_df_PC %>%
group_split(comparison),
gprofiler_df_PC %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
lapply(., function(x){x %>% .[["term_id"]] %>% unique()}))
upset(fromList(gprofiler_term_list), sets = names(gprofiler_term_list), keep.order = TRUE, order.by = "freq")
Figure 6.1: Overlap between gprofiler output of pairwise comparisons. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.
# Load internal specificity matrices
ctd_files <-
list.files(
file.path(
path_to_results,
"snRNA/specificity_matrices/2020_March/"
),
pattern = "ctd",
full.names = T)
ctd_list <- vector(mode = "list", length = length(ctd_files))
for(i in 1:length(ctd_files)){
# Load ctd
ctd_list[[i]] <- readRDS(ctd_files[i])
# Extract file name
title <-
ctd_files[i] %>%
str_replace(".*/", "") %>%
str_replace("\\..*", "") %>%
str_replace(".*_", "")
# Name list
names(ctd_list)[i] <- title
}
# Load external datasets
load(file.path(markergene_dir, "specificity_matrices/AIBS2018_MTG.rda"))
load(file.path(markergene_dir, "specificity_matrices/Habib2017_DroNc_Human.rda"))
# To save repeatedly querying biomaRt we have a stored dataset containing all the human
# orthologs of MGI genes, mouse_to_human_homologs.
load(file.path(markergene_dir, "data/mouse_to_human_orthologs.rda"))
m2h <- mouse_to_human_orthologs %>%
dplyr::filter(mmusculus_homolog_orthology_type == "ortholog_one2one") %>%
dplyr::select(hgnc_symbol, mmusculus_homolog_associated_gene_name) %>%
dplyr::rename(MGI_symbol = mmusculus_homolog_associated_gene_name) %>%
dplyr::filter(hgnc_symbol != "") %>%
dplyr::distinct(hgnc_symbol, MGI_symbol)
# Function to loop across gene lists of n length
ewce_top_n <- function(leafcutter_results, top_n){
for(i in 1:length(top_n)){
genes_list_ewce <- setNames(leafcutter_results$significant_clusters_0.05_filter %>%
group_split(comparison),
leafcutter_results$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>%
dplyr::distinct(comparison, genes, p.adjust) %>%
dplyr::group_by(genes) %>%
# Within gene, choose top cluster based on p.adjust
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::ungroup() %>%
# Select top genes by p.adjust
dplyr::top_n(n = top_n[i], wt = -p.adjust) %>%
.[["genes"]]
})
# Running EWCE with 10000 bootstraps
results <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list_ewce,
ctd_AIBS2018, ctd_DRONC_human,
ctd_list$Control, ctd_list$PD,
ctd_list$PDD, ctd_list$DLB,
celltypeLevel = 1,
reps = 10000,
genelistSpecies = "human",
sctSpecies = "human",
mouse_to_human = m2h) %>%
dplyr::mutate(n_genes = top_n[i])
if(i == 1){
final_ewce <- results
} else{
final_ewce <- final_ewce %>%
bind_rows(results)
}
}
return(final_ewce)
}
top_n <- c(50, 100, 250, 500)
ewce <- setNames(vector(mode = "list", length = 2),
c("covar", "PC"))
ewce$covar <- ewce_top_n(leafcutter_results = results_covar,
top_n = top_n)
ewce$PC <- ewce_top_n(leafcutter_results = results_PC,
top_n = top_n)
saveRDS(ewce, file = file.path(leafcutter_dir, "ewce/ewce_leafcutter_ds.Rds"))
Figure 7.1: EWCE results using differentially spliced genes (a) pre-deconvolution and (b) post-deconvolution and single-cell RNA-seq data from two external datasets (AIBS2018 study and the DRONC_human study) and our own internal snRNA. The x-axis denotes the size of the gene set used in EWCE i.e. whether it was the top 50, 100, 250 or 500 differentially spliced genes (as determined by FDR). The y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by the disease groups compared in the differential splicing analysis and overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. Multiple test correction was performed across all results using FDR; non-significant results (p > 0.05) were coloured grey. exPFC=glutamatergic neurons from the PFC, exCA1/3=pyramidal neurons from the Hip CA region, GABA=GABAergic interneurons, exDG=granule neurons from the Hip dentate gyrus region, ASC=astrocytes, NSC=neuronal stem cells, MG=microglia, ODC=oligodendrocytes, OPC=oligodendrocyte precursor cells, NSC=neuronal stem cells, SMC=smooth muscle cells, END= endothelial cells.
Figure 7.2: EWCE results using the top 100 differentially spliced genes (as determined by FDR) post-deconvolution and single-cell RNA-seq data from two external datasets (AIBS2018 study and the DRONC_human study) and our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. Multiple test correction was performed across all results using FDR; non-significant results (p > 0.05) were coloured grey. exPFC=glutamatergic neurons from the PFC, exCA1/3=pyramidal neurons from the Hip CA region, GABA=GABAergic interneurons, exDG=granule neurons from the Hip dentate gyrus region, ASC=astrocytes, NSC=neuronal stem cells, MG=microglia, ODC=oligodendrocytes, OPC=oligodendrocyte precursor cells, NSC=neuronal stem cells, SMC=smooth muscle cells, END= endothelial cells.
Figure 8.1: Histogram of deltaPSI of introns within differentially spliced clusters identified using (a) co-variate correction and (b) PC-axis correction. Dashed red line indicates median delta PSI within each facet, while dashed blue line indicates the |dPSI| >= 0.1 cut-off.
# Adding filter of dPSI
stringent <- leafcutter_list %>%
lapply(., function (x){
x$cluster_significance %>%
# filter for successful tests and remove clusters that overlap multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster_id"), sep = ":") %>%
dplyr::inner_join(x$intron_usage %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster_id"), sep = ":")) %>%
dplyr::filter(abs(deltapsi) >= 0.1)
})
# Number of differentially spliced clusters
stringent %>%
lapply(., function(x){
x %>%
dplyr::distinct(comparison, cluster_id, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n())
})
## $covar
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1663
## 2 Control_vs_PD 610
## 3 Control_vs_PDD 325
## 4 PDD_vs_DLB 620
## 5 PD_vs_DLB 445
## 6 PD_vs_PDD 510
##
## $PC
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1805
## 2 Control_vs_PD 313
## 3 Control_vs_PDD 365
## 4 PDD_vs_DLB 1180
## 5 PD_vs_DLB 1742
## 6 PD_vs_PDD 266
# Number of differentially spliced genes
stringent %>%
lapply(., function(x){
x %>%
dplyr::distinct(comparison, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n())
})
## $covar
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1501
## 2 Control_vs_PD 579
## 3 Control_vs_PDD 318
## 4 PDD_vs_DLB 598
## 5 PD_vs_DLB 433
## 6 PD_vs_PDD 493
##
## $PC
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1654
## 2 Control_vs_PD 305
## 3 Control_vs_PDD 355
## 4 PDD_vs_DLB 1099
## 5 PD_vs_DLB 1582
## 6 PD_vs_PDD 263
Figure 8.2: EWCE results using the top 100 differentially spliced genes (FDR < 0.05, with rank determined by absolute delta PSI) post-deconvolution and single-cell RNA-seq data from two external datasets (AIBS2018 study and the DRONC_human study) and our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. Multiple test correction was performed across all results using FDR; non-significant results (p > 0.05) were coloured grey. exPFC=glutamatergic neurons from the PFC, exCA1/3=pyramidal neurons from the Hip CA region, GABA=GABAergic interneurons, exDG=granule neurons from the Hip dentate gyrus region, ASC=astrocytes, NSC=neuronal stem cells, MG=microglia, ODC=oligodendrocytes, OPC=oligodendrocyte precursor cells, NSC=neuronal stem cells, SMC=smooth muscle cells, END= endothelial cells.
genes_list_PC <-
setNames(stringent$PC %>%
group_split(comparison),
stringent$PC %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
lapply(., function(x){
x %>%
dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>%
dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>%
dplyr::group_by(genes) %>%
# Where duplicate genes occur, select by lowest p-value first
# If still duplicate, select by highest absolute delta PSI
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::top_n(n = 1, wt = abs_deltapsi) %>%
# Order by p.adjust
dplyr::arrange(p.adjust) %>%
.[["genes"]]
})
gprofiler_PSI <- lapply(genes_list_PC, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS",
significant = TRUE,
user_threshold = 0.05,
ordered_query = FALSE,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
gprofiler_PSI_df <- gprofiler_PSI %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000)
gprofiler_PSI_df %>%
dplyr::mutate(term_name = fct_relevel(term_name,
gprofiler_PSI_df %>%
dplyr::select(-evidence_codes, -intersection) %>%
dplyr::distinct(term_name, p_value) %>%
dplyr::group_by(term_name) %>%
dplyr::top_n(n = 1, wt = p_value) %>%
.[["term_name"]] %>%
rev()),
comparison = str_replace(comparison, "_PCaxes", "") %>%
fct_relevel(., c("Control_vs_PD", "Control_vs_PDD", "Control_vs_DLB", "PD_vs_PDD", "PD_vs_DLB", "PDD_vs_DLB"))) %>%
ggplot(aes(x = comparison,
y = term_name,
size = precision,
fill = p_value)) +
geom_point(pch = 21) +
scale_fill_viridis_c(direction = -1) +
labs(x = "Comparison", y = "Dataset: Cell type") +
theme_rhr
Figure 8.3: Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.
compareCluster
: permits comparison of GO enrichments across groups. Uses hypergeometric model to perform enrichment analysis within each group (i.e. does not account for semantic similarity in testing as GProfiler does, which could be a disadvantage)library(clusterProfiler)
source(here::here("R", "biomart_df.R"))
cluster_compare_input <-
stringent$PC %>%
dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>%
dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>%
dplyr::mutate(comparison = fct_relevel(comparison,
c("Control_vs_PD", "Control_vs_PDD", "Control_vs_DLB",
"PD_vs_PDD", "PD_vs_DLB", "PDD_vs_DLB"))) %>%
dplyr::arrange(comparison) %>%
dplyr::group_by(genes) %>%
# Where duplicate genes occur, select by lowest p-value first
# If still duplicate, select by highest absolute delta PSI
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::top_n(n = 1, wt = abs_deltapsi)
ont <- c("BP", "CC", "MF")
cluster_compare_list <- setNames(vector(mode = "list", length = length(ont)),
ont)
for(i in 1:length(ont)){
cluster_compare_list[[i]] <-
cluster_compare_input %>%
clusterProfiler::compareCluster(genes ~ comparison,
data=.,
keyType = "SYMBOL",
fun="enrichGO",
ont = ont[i],
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
OrgDb='org.Hs.eg.db')
}
saveRDS(cluster_compare_list,
file.path(leafcutter_dir, "gprofiler/clusterProfiler_leafcutter_ds_PC_dPSIfilter.Rds"))
cluster_compare <- readRDS(file.path(leafcutter_dir, "gprofiler/clusterProfiler_leafcutter_ds_PC_dPSIfilter.Rds"))
cluster_compare %>%
lapply(., function(df){
df %>%
clusterProfiler::simplify(., cutoff=0.5, by="p.adjust", select_fun=min) %>%
clusterProfiler::dotplot(., showCategory = NULL, font.size = 8) +
geom_point(shape = 1,colour = "black") +
scale_size_continuous(name = "gene ratio") +
scale_colour_viridis_c(direction = -1) +
theme_rhr
})
## $BP
Figure 8.4: Gene ontology enrichments (GO) of genes across each comparison. Size of dot indicates gene ratio, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.
##
## $CC
Figure 8.5: Gene ontology enrichments (GO) of genes across each comparison. Size of dot indicates gene ratio, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.
##
## $MF
Figure 8.6: Gene ontology enrichments (GO) of genes across each comparison. Size of dot indicates gene ratio, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] UpSetR_1.4.0 rtracklayer_1.46.0
## [5] rlist_0.4.6.1 readxl_1.3.1
## [7] RNAseqProcessing_0.0.0.9000 forcats_0.5.1
## [9] stringr_1.4.0 dplyr_1.0.2
## [11] purrr_0.3.4 readr_1.4.0
## [13] tidyr_1.1.1 tibble_3.0.3
## [15] tidyverse_1.3.0 ggpubr_0.4.0
## [17] ggplot2_3.3.2 gprofiler2_0.1.9
## [19] GeneOverlap_1.22.0 DESeq2_1.26.0
## [21] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [23] BiocParallel_1.20.1 matrixStats_0.56.0
## [25] Biobase_2.46.0 GenomicRanges_1.38.0
## [27] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [29] S4Vectors_0.24.4 BiocGenerics_0.32.0
## [31] data.table_1.13.0 corrplot_0.84
## [33] clusterProfiler_3.14.3
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.1 qdapTools_1.3.5 bit64_4.0.2
## [4] knitr_1.29 rpart_4.1-15 RCurl_1.98-1.2
## [7] generics_0.0.2 callr_3.5.1 cowplot_1.0.0
## [10] usethis_1.6.3 RSQLite_2.2.0 europepmc_0.4
## [13] chron_2.3-56 bit_4.0.4 enrichplot_1.6.1
## [16] xml2_1.3.2 lubridate_1.7.9 assertthat_0.2.1
## [19] viridis_0.5.1 xfun_0.16 hms_1.0.0
## [22] evaluate_0.14 fansi_0.4.2 progress_1.2.2
## [25] caTools_1.18.0 dbplyr_1.4.4 igraph_1.2.5
## [28] DBI_1.1.1 geneplotter_1.64.0 htmlwidgets_1.5.3
## [31] ellipsis_0.3.1 crosstalk_1.1.0.1 backports_1.1.8
## [34] bookdown_0.21 annotate_1.64.0 biomaRt_2.42.1
## [37] vctrs_0.3.2 remotes_2.2.0 here_1.0.0
## [40] abind_1.4-5 cachem_1.0.3 withr_2.2.0
## [43] ggforce_0.3.2 triebeard_0.3.0 checkmate_2.0.0
## [46] GenomicAlignments_1.22.1 prettyunits_1.1.1 cluster_2.1.0
## [49] DOSE_3.12.0 lazyeval_0.2.2 crayon_1.4.1
## [52] genefilter_1.68.0 pkgconfig_2.0.3 labeling_0.4.2
## [55] tweenr_1.0.1 pkgload_1.1.0 nnet_7.3-12
## [58] devtools_2.3.2 rlang_0.4.7 lifecycle_0.2.0
## [61] BiocFileCache_1.10.2 modelr_0.1.8 cellranger_1.1.0
## [64] rprojroot_2.0.2 polyclip_1.10-0 Matrix_1.2-17
## [67] urltools_1.7.3 carData_3.0-4 reprex_1.0.0
## [70] base64enc_0.1-3 ggridges_0.5.2 processx_3.4.5
## [73] png_0.1-7 viridisLite_0.3.0 bitops_1.0-6
## [76] KernSmooth_2.23-15 Biostrings_2.54.0 blob_1.2.1
## [79] qvalue_2.18.0 jpeg_0.1-8.1 rstatix_0.6.0
## [82] gridGraphics_0.5-0 ggsignif_0.6.0 scales_1.1.1
## [85] memoise_2.0.0 magrittr_2.0.1 plyr_1.8.6
## [88] gplots_3.0.4 gdata_2.18.0 zlibbioc_1.32.0
## [91] compiler_3.6.1 RColorBrewer_1.1-2 Rsamtools_2.2.3
## [94] cli_2.2.0.9000 XVector_0.26.0 ps_1.3.4
## [97] htmlTable_2.0.1 Formula_1.2-4 MASS_7.3-51.4
## [100] tidyselect_1.1.0 stringi_1.5.3 highr_0.8
## [103] yaml_2.2.1 GOSemSim_2.17.1 askpass_1.1
## [106] locfit_1.5-9.4 latticeExtra_0.6-29 ggrepel_0.8.2
## [109] grid_3.6.1 fastmatch_1.1-0 tools_3.6.1
## [112] rio_0.5.16 rstudioapi_0.13 foreign_0.8-72
## [115] gridExtra_2.3 farver_2.0.3 HGNChelper_0.8.1
## [118] ggraph_2.0.3 digest_0.6.27 rvcheck_0.1.8
## [121] BiocManager_1.30.10 Rcpp_1.0.5 car_3.0-9
## [124] broom_0.7.0 httr_1.4.2 ggdendro_0.1.21
## [127] AnnotationDbi_1.48.0 colorspace_2.0-0 rvest_0.3.6
## [130] XML_3.99-0.3 fs_1.5.0 splines_3.6.1
## [133] graphlayouts_0.7.0 ggplotify_0.0.5 plotly_4.9.2.1
## [136] sessioninfo_1.1.1 xtable_1.8-4 jsonlite_1.7.1
## [139] tidygraph_1.2.0 testthat_2.3.2 R6_2.5.0
## [142] Hmisc_4.4-1 pillar_1.4.6 htmltools_0.5.1.1
## [145] glue_1.4.2 fastmap_1.0.1 DT_0.15
## [148] fgsea_1.12.0 pkgbuild_1.1.0 utf8_1.1.4
## [151] lattice_0.20-38 curl_4.3 gtools_3.8.2
## [154] zip_2.1.0 GO.db_3.10.0 openxlsx_4.2.3
## [157] openssl_1.4.2 survival_3.2-7 limma_3.42.2
## [160] rmarkdown_2.5 desc_1.2.0 munsell_0.5.0
## [163] DO.db_2.9 GenomeInfoDbData_1.2.2 haven_2.3.1
## [166] reshape2_1.4.4 gtable_0.3.0