Aims: (i) to quantify intron excision ratios across samples; (ii) to perform differential splicing analyses across disease groups; (iii) to perform pathway enrichment analyses on groups of differentially spliced genes.
source(here::here("R", "file_paths.R"))
leafcutter_dir <- file.path(path_to_results, "leafcutter/")
leafviz_dir <- "/home/rreynolds/packages/leafcutter/leafviz/"
markergene_dir <- "/home/rreynolds/projects/MarkerGenes/"
PCaxes <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_DEG.Rds"
)
) %>%
colData() %>%
as_tibble() %>%
dplyr::select(sample_id, PC1, PC2, PC3, PC4)
sample_info <-
read_excel(
path =
file.path(
path_to_raw_data,
"sample_details/20201229_MasterFile_SampleInfo.xlsx"
),
sheet = "SampleInfo", skip = 1
) %>%
dplyr::filter(Sample_Type == "Tissue section" & sent_to_bulk_seq == "yes") %>%
dplyr::rename(sample_id = CaseNo,
RIN = RINe_bulkRNA_Tapestation) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD","DLB"))) %>%
dplyr::select(sample_id, Disease_Group, Sex, AoO, AoD, DD, PMI, aSN, TAU, 'thal AB', 'aCG aSN score', Genetics, RIN) %>%
dplyr::inner_join(PCaxes)
leafcutter
quantifies intron excision ratios building a splicing graph with all split reads that have a shared donor or acceptor site sitting within one clusterleafcutter
operates in 3 steps:
leafcutter
. These include:
RNAseqProcessing::convert_STAR_SJ_to_junc(sj_dir_path = file.path(path_to_bulk_seq_data, "STAR/"),
output_path = str_c(leafcutter_dir, "SJ_formatting/"),
filter_out_blacklist_regions = TRUE,
path_to_ENCODE_blacklist = file.path(path_to_raw_data, "misc/hg38-blacklist.v2.bed")
python /tools/leafcutter/clustering/leafcutter_cluster.py \
-j /home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/SJ_formatting/list_juncfiles.txt \
-r /home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/ \
-o tissue_polyA_test_diseasegroups \
-l 1000000 \
-m 30 \
-p 0.001 \
-s True
Rscript /tools/leafcutter/scripts/gtf_to_exons.R \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz
# Load per individual intron counts for column names
sample_names <- fread(
str_c(leafcutter_dir, "intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz")
) %>%
dplyr::select(-V1) %>%
colnames()
# Create master df of sample info, with grouping variable (Disease_Group) and confounders (RIN, Sex, AoD)
master <-
tibble(lc_sample_name = sample_names,
sample_name = sample_names %>%
str_replace("_.*", "")) %>%
inner_join(sample_info, by = c("sample_name" = "sample_id")) %>%
dplyr::select(lc_sample_name, Disease_Group, RIN, Sex, AoD) %>%
arrange(Disease_Group)
RNAseqProcessing::create_group_files_multi_pairwisecomp(df = master,
group_column_name = "Disease_Group",
output_path = str_c(leafcutter_dir, "diff_splicing/group_files/"))
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafcutter_ds.log&
# Load per individual intron counts for column names
sample_names <-
fread(str_c(leafcutter_dir, "intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz")) %>%
dplyr::select(-V1) %>%
colnames()
# Create master df of sample info, with grouping variable (Disease_Group) and confounders (PC axes 1-4)
master <-
tibble(lc_sample_name = sample_names,
sample_name = sample_names %>%
str_replace("_.*", "")) %>%
inner_join(sample_info, by = c("sample_name" = "sample_id")) %>%
dplyr::select(lc_sample_name, Disease_Group, PC1, PC2, PC3, PC4) %>%
arrange(Disease_Group)
RNAseqProcessing::create_group_files_multi_pairwisecomp(df = master,
group_column_name = "Disease_Group",
output_path = str_c(leafcutter_dir, "diff_splicing_PCaxes/group_files/"))
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafcutter_ds_PCaxes.log&
/tools/leafcutter/leafviz/gtf2leafcutter.pl \
-o /data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz
prepare_results.R
, which performs this formatting, albeit for only one pairwise comparison. To format the results of multiple pairwise comparisons requires looping across the various pairwise comparisons and running the prepare_results.R
for each individual pairwise comparison. This is what the leafviz_multi_pairwise.R
script used below does.leafviz_multi_pairwise.R
script assumes use of the leafcutter_ds_multi_pairwise.R
script and the create_group_files_multi_pairwisecomp()
function. This ensures that all files needed are named consistently. That is, the (i) _cluster_signficance.txt
, (ii) _effect_sizes.txt
and (iii) _group_file.txt
for an individual pairwise comparison all have the same prefix.nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafviz_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/ \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/ \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
--output_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/leafviz/ \
--group_file_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing/group_files/ \
--FDR=0.05 \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafviz_prepare_results.log&
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafviz_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/ \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/ \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
--output_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/leafviz_PCaxes/ \
--group_file_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/leafcutter/diff_splicing_PCaxes/group_files/ \
--FDR=0.05 \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_leafviz_PCaxes_prepare_results.log&
RNAseqProcessing::run_leafviz(leafviz_dir = leafviz_dir,
results_filename = file.path(leafcutter_dir, "leafviz_PCaxes/PDD_vs_DLB.Rda"))
Note on naming convention: results named comparison[1]_vs_comparison[2]
, where comparison[1]
is the baseline/reference value and comparison[2]
is in reference to the baseline value.
clusters <- list.files(file.path(leafcutter_dir, "diff_splicing/"), pattern = "cluster_", recursive = T, full.names = T)
intron <- list.files(file.path(leafcutter_dir, "diff_splicing/"), pattern = "effect_", recursive = T, full.names = T)
for(i in 1:length(clusters)){
comparison <- clusters[i] %>%
str_replace("/.*/", "") %>%
str_replace("_cluster_significance.txt", "")
lc <- read_delim(clusters[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
lc_all <- lc
} else{
lc_all <- lc_all %>%
bind_rows(lc)
}
}
for(i in 1:length(intron)){
comparison <- intron[i] %>%
str_replace("/.*/", "") %>%
str_replace("_effect_sizes.txt", "")
int <- read_delim(intron[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
int_all <- int
} else{
int_all <- int_all %>%
bind_rows(int)
}
}
# Filter for significant clusters
# Add direction of effect
lc_filtered <- lc_all %>%
# filter for successful tests and remove clusters overlapping multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>%
dplyr::inner_join(int_all %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>%
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change"))
# Save results
write_csv(lc_filtered,
path = file.path(leafcutter_dir, "diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.csv"))
saveRDS(setNames(list(lc_all, int_all, lc_filtered),
c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
file.path(leafcutter_dir, "diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.Rds"))
# Run once
all_genes <- results_covar$cluster_significance %>%
# remove clusters that overlap multiple genes
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
genes_list <- setNames(results_covar$significant_clusters_0.05_filter %>%
group_split(comparison),
results_covar$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>%
dplyr::distinct(genes, p.adjust, .keep_all = TRUE) %>%
dplyr::group_by(genes) %>%
dplyr::top_n(n = 1, wt = -p.adjust) %>%
# Order by p-value
dplyr::arrange(p.adjust) %>%
.[["genes"]]
})
upset(fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")
gom.self <- newGOM(genes_list, genes_list, genome.size = length(all_genes))
All.jaccard <- getMatrix(gom.self, "Jaccard")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(All.jaccard, method="color", col=col(100), cl.lim = c(0,1),
type="lower",
# addCoef.col = "black", number.cex = 0.75, # Add coefficient of correlation
tl.col="black", tl.srt=45, tl.cex = 0.75, mar=c(0,0,1,0), #Text label color and rotation
p.mat = All.jaccard, sig.level = (0.05), insig = "p-value", number.cex = 0.75,
diag = TRUE)
TRUE
for the statement upregulated > 0 & downregulated > 0
.results_covar$significant_clusters_0.05_filter %>%
dplyr::select(comparison, genes, direction_of_effect) %>%
dplyr::group_by(comparison, genes, direction_of_effect) %>%
dplyr::summarise(n = n()) %>%
tidyr::spread(key = direction_of_effect, value = n) %>%
# Create new column with ifelse determining whether number of splicing events that are both up and down > 0 for each gene
dplyr::mutate(up_and_down = ifelse(upregulated > 0 & downregulated > 0, TRUE, FALSE)) %>%
.[["up_and_down"]] %>%
# Determine wheter all logical values in up_and_down column = TRUE
all(na.rm = T)
## [1] TRUE
# Run once
gprofiler_results_list <- lapply(genes_list, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
saveRDS(gprofiler_results_list, file.path(leafcutter_dir, "gprofiler/gprofiler_leafcutter_ds_SexRINAoD.Rds"))
gprofiler_results_list <- readRDS(file.path(leafcutter_dir, "gprofiler/gprofiler_leafcutter_ds_SexRINAoD.Rds"))
# Added some filtering to remove broad terms
gprofiler_df <- gprofiler_results_list %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000) %>%
dplyr::arrange(comparison, source, p_value)
gprofiler_df %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n_enriched_pathways = n())
gprofiler_df %>%
dplyr::arrange(comparison, source, p_value) %>%
dplyr::select(-evidence_codes, -intersection) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
Note on naming convention: results named comparison[1]_vs_comparison[2]
, where comparison[1]
is the baseline/reference value and comparison[2]
is in reference to the baseline value.
clusters <- list.files(file.path(leafcutter_dir, "diff_splicing_PCaxes/"), pattern = "cluster_significance", recursive = T, full.names = T)
intron <- list.files(file.path(leafcutter_dir, "diff_splicing_PCaxes/"), pattern = "effect_", recursive = T, full.names = T)
for(i in 1:length(clusters)){
comparison <- clusters[i] %>%
str_replace("/.*/", "") %>%
str_replace("_cluster_significance.txt", "")
lc <- read_delim(clusters[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
lc_all <- lc
} else{
lc_all <- lc_all %>%
bind_rows(lc)
}
}
for(i in 1:length(intron)){
comparison <- intron[i] %>%
str_replace("/.*/", "") %>%
str_replace("_effect_sizes.txt", "")
int <- read_delim(intron[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
int_all <- int
} else{
int_all <- int_all %>%
bind_rows(int)
}
}
# Filter for significant clusters
# Add direction of effect
lc_filtered_PC <- lc_all %>%
# filter for successful tests and remove clusters overlapping multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>%
dplyr::inner_join(int_all %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>%
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change"))
# Save results
write_csv(lc_filtered_PC,
path = file.path(leafcutter_dir, "diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.csv"))
saveRDS(setNames(list(lc_all, int_all, lc_filtered_PC),
c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
file.path(leafcutter_dir, "diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.Rds"))
all_genes <- results_PC$cluster_significance %>%
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
# Extract differentially spliced genes
genes_list_PC <- setNames(results_PC$significant_clusters_0.05_filter %>%
group_split(comparison),
results_PC$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>%
dplyr::distinct(genes, p.adjust, .keep_all = TRUE) %>%
dplyr::group_by(genes) %>%
dplyr::top_n(n = 1, wt = -p.adjust) %>%
# Order by p-value
dplyr::arrange(p.adjust) %>%
.[["genes"]]
})
# Change names on genes_list_PC to denote correction method
names(genes_list_PC) <- str_c(names(genes_list_PC), "_PCaxes")
# Create master list with genes from both correction methods
master_list <- c(genes_list, genes_list_PC)
# Run once
gprofiler_results_list <- lapply(genes_list_PC, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcode = TRUE)})
saveRDS(gprofiler_results_list, file.path(leafcutter_dir, "gprofiler/gprofiler_leafcutter_ds_PCaxes.Rds"))
gprofiler_term_list <- c(setNames(gprofiler_df %>%
group_split(comparison),
gprofiler_df %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
lapply(., function(x){x %>% .[["term_id"]] %>% unique()}),
setNames(gprofiler_df_PC %>%
group_split(comparison),
gprofiler_df_PC %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
lapply(., function(x){x %>% .[["term_id"]] %>% unique()}))
upset(fromList(gprofiler_term_list), sets = names(gprofiler_term_list), keep.order = TRUE, order.by = "freq")
# Load internal specificity matrices
ctd_files <-
list.files(
file.path(
path_to_results,
"snRNA/specificity_matrices/2020_March/"
),
pattern = "ctd",
full.names = T)
ctd_list <- vector(mode = "list", length = length(ctd_files))
for(i in 1:length(ctd_files)){
# Load ctd
ctd_list[[i]] <- readRDS(ctd_files[i])
# Extract file name
title <-
ctd_files[i] %>%
str_replace(".*/", "") %>%
str_replace("\\..*", "") %>%
str_replace(".*_", "")
# Name list
names(ctd_list)[i] <- title
}
# Load external datasets
load(file.path(markergene_dir, "specificity_matrices/AIBS2018_MTG.rda"))
load(file.path(markergene_dir, "specificity_matrices/Habib2017_DroNc_Human.rda"))
# To save repeatedly querying biomaRt we have a stored dataset containing all the human
# orthologs of MGI genes, mouse_to_human_homologs.
load(file.path(markergene_dir, "data/mouse_to_human_orthologs.rda"))
m2h <- mouse_to_human_orthologs %>%
dplyr::filter(mmusculus_homolog_orthology_type == "ortholog_one2one") %>%
dplyr::select(hgnc_symbol, mmusculus_homolog_associated_gene_name) %>%
dplyr::rename(MGI_symbol = mmusculus_homolog_associated_gene_name) %>%
dplyr::filter(hgnc_symbol != "") %>%
dplyr::distinct(hgnc_symbol, MGI_symbol)
# Function to loop across gene lists of n length
ewce_top_n <- function(leafcutter_results, top_n){
for(i in 1:length(top_n)){
genes_list_ewce <- setNames(leafcutter_results$significant_clusters_0.05_filter %>%
group_split(comparison),
leafcutter_results$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>%
dplyr::distinct(comparison, genes, p.adjust) %>%
dplyr::group_by(genes) %>%
# Within gene, choose top cluster based on p.adjust
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::ungroup() %>%
# Select top genes by p.adjust
dplyr::top_n(n = top_n[i], wt = -p.adjust) %>%
.[["genes"]]
})
# Running EWCE with 10000 bootstraps
results <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list_ewce,
ctd_AIBS2018, ctd_DRONC_human,
ctd_list$Control, ctd_list$PD,
ctd_list$PDD, ctd_list$DLB,
celltypeLevel = 1,
reps = 10000,
genelistSpecies = "human",
sctSpecies = "human",
mouse_to_human = m2h) %>%
dplyr::mutate(n_genes = top_n[i])
if(i == 1){
final_ewce <- results
} else{
final_ewce <- final_ewce %>%
bind_rows(results)
}
}
return(final_ewce)
}
top_n <- c(50, 100, 250, 500)
ewce <- setNames(vector(mode = "list", length = 2),
c("covar", "PC"))
ewce$covar <- ewce_top_n(leafcutter_results = results_covar,
top_n = top_n)
ewce$PC <- ewce_top_n(leafcutter_results = results_PC,
top_n = top_n)
saveRDS(ewce, file = file.path(leafcutter_dir, "ewce/ewce_leafcutter_ds.Rds"))
# Adding filter of dPSI
stringent <- leafcutter_list %>%
lapply(., function (x){
x$cluster_significance %>%
# filter for successful tests and remove clusters that overlap multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster_id"), sep = ":") %>%
dplyr::inner_join(x$intron_usage %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster_id"), sep = ":")) %>%
dplyr::filter(abs(deltapsi) >= 0.1)
})
# Number of differentially spliced clusters
stringent %>%
lapply(., function(x){
x %>%
dplyr::distinct(comparison, cluster_id, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n())
})
## $covar
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1663
## 2 Control_vs_PD 610
## 3 Control_vs_PDD 325
## 4 PDD_vs_DLB 620
## 5 PD_vs_DLB 445
## 6 PD_vs_PDD 510
##
## $PC
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1805
## 2 Control_vs_PD 313
## 3 Control_vs_PDD 365
## 4 PDD_vs_DLB 1180
## 5 PD_vs_DLB 1742
## 6 PD_vs_PDD 266
# Number of differentially spliced genes
stringent %>%
lapply(., function(x){
x %>%
dplyr::distinct(comparison, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n())
})
## $covar
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1501
## 2 Control_vs_PD 579
## 3 Control_vs_PDD 318
## 4 PDD_vs_DLB 598
## 5 PD_vs_DLB 433
## 6 PD_vs_PDD 493
##
## $PC
## # A tibble: 6 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_DLB 1654
## 2 Control_vs_PD 305
## 3 Control_vs_PDD 355
## 4 PDD_vs_DLB 1099
## 5 PD_vs_DLB 1582
## 6 PD_vs_PDD 263
genes_list_PC <-
setNames(stringent$PC %>%
group_split(comparison),
stringent$PC %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
lapply(., function(x){
x %>%
dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>%
dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>%
dplyr::group_by(genes) %>%
# Where duplicate genes occur, select by lowest p-value first
# If still duplicate, select by highest absolute delta PSI
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::top_n(n = 1, wt = abs_deltapsi) %>%
# Order by p.adjust
dplyr::arrange(p.adjust) %>%
.[["genes"]]
})
gprofiler_PSI <- lapply(genes_list_PC, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS",
significant = TRUE,
user_threshold = 0.05,
ordered_query = FALSE,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
gprofiler_PSI_df <- gprofiler_PSI %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000)
gprofiler_PSI_df %>%
dplyr::mutate(term_name = fct_relevel(term_name,
gprofiler_PSI_df %>%
dplyr::select(-evidence_codes, -intersection) %>%
dplyr::distinct(term_name, p_value) %>%
dplyr::group_by(term_name) %>%
dplyr::top_n(n = 1, wt = p_value) %>%
.[["term_name"]] %>%
rev()),
comparison = str_replace(comparison, "_PCaxes", "") %>%
fct_relevel(., c("Control_vs_PD", "Control_vs_PDD", "Control_vs_DLB", "PD_vs_PDD", "PD_vs_DLB", "PDD_vs_DLB"))) %>%
ggplot(aes(x = comparison,
y = term_name,
size = precision,
fill = p_value)) +
geom_point(pch = 21) +
scale_fill_viridis_c(direction = -1) +
labs(x = "Comparison", y = "Dataset: Cell type") +
theme_rhr
compareCluster
: permits comparison of GO enrichments across groups. Uses hypergeometric model to perform enrichment analysis within each group (i.e. does not account for semantic similarity in testing as GProfiler does, which could be a disadvantage)library(clusterProfiler)
source(here::here("R", "biomart_df.R"))
cluster_compare_input <-
stringent$PC %>%
dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>%
dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>%
dplyr::mutate(comparison = fct_relevel(comparison,
c("Control_vs_PD", "Control_vs_PDD", "Control_vs_DLB",
"PD_vs_PDD", "PD_vs_DLB", "PDD_vs_DLB"))) %>%
dplyr::arrange(comparison) %>%
dplyr::group_by(genes) %>%
# Where duplicate genes occur, select by lowest p-value first
# If still duplicate, select by highest absolute delta PSI
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::top_n(n = 1, wt = abs_deltapsi)
ont <- c("BP", "CC", "MF")
cluster_compare_list <- setNames(vector(mode = "list", length = length(ont)),
ont)
for(i in 1:length(ont)){
cluster_compare_list[[i]] <-
cluster_compare_input %>%
clusterProfiler::compareCluster(genes ~ comparison,
data=.,
keyType = "SYMBOL",
fun="enrichGO",
ont = ont[i],
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
OrgDb='org.Hs.eg.db')
}
saveRDS(cluster_compare_list,
file.path(leafcutter_dir, "gprofiler/clusterProfiler_leafcutter_ds_PC_dPSIfilter.Rds"))
cluster_compare <- readRDS(file.path(leafcutter_dir, "gprofiler/clusterProfiler_leafcutter_ds_PC_dPSIfilter.Rds"))
cluster_compare %>%
lapply(., function(df){
df %>%
clusterProfiler::simplify(., cutoff=0.5, by="p.adjust", select_fun=min) %>%
clusterProfiler::dotplot(., showCategory = NULL, font.size = 8) +
geom_point(shape = 1,colour = "black") +
scale_size_continuous(name = "gene ratio") +
scale_colour_viridis_c(direction = -1) +
theme_rhr
})
## $BP
##
## $CC
##
## $MF
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] UpSetR_1.4.0 rtracklayer_1.46.0
## [5] rlist_0.4.6.1 readxl_1.3.1
## [7] RNAseqProcessing_0.0.0.9000 forcats_0.5.1
## [9] stringr_1.4.0 dplyr_1.0.2
## [11] purrr_0.3.4 readr_1.4.0
## [13] tidyr_1.1.1 tibble_3.0.3
## [15] tidyverse_1.3.0 ggpubr_0.4.0
## [17] ggplot2_3.3.2 gprofiler2_0.1.9
## [19] GeneOverlap_1.22.0 DESeq2_1.26.0
## [21] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [23] BiocParallel_1.20.1 matrixStats_0.56.0
## [25] Biobase_2.46.0 GenomicRanges_1.38.0
## [27] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [29] S4Vectors_0.24.4 BiocGenerics_0.32.0
## [31] data.table_1.13.0 corrplot_0.84
## [33] clusterProfiler_3.14.3
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.1 qdapTools_1.3.5 bit64_4.0.2
## [4] knitr_1.29 rpart_4.1-15 RCurl_1.98-1.2
## [7] generics_0.0.2 callr_3.5.1 cowplot_1.0.0
## [10] usethis_1.6.3 RSQLite_2.2.0 europepmc_0.4
## [13] chron_2.3-56 bit_4.0.4 enrichplot_1.6.1
## [16] xml2_1.3.2 lubridate_1.7.9 assertthat_0.2.1
## [19] viridis_0.5.1 xfun_0.16 hms_1.0.0
## [22] evaluate_0.14 fansi_0.4.2 progress_1.2.2
## [25] caTools_1.18.0 dbplyr_1.4.4 igraph_1.2.5
## [28] DBI_1.1.1 geneplotter_1.64.0 htmlwidgets_1.5.3
## [31] ellipsis_0.3.1 crosstalk_1.1.0.1 backports_1.1.8
## [34] bookdown_0.21 annotate_1.64.0 biomaRt_2.42.1
## [37] vctrs_0.3.2 remotes_2.2.0 here_1.0.0
## [40] abind_1.4-5 cachem_1.0.3 withr_2.2.0
## [43] ggforce_0.3.2 triebeard_0.3.0 checkmate_2.0.0
## [46] GenomicAlignments_1.22.1 prettyunits_1.1.1 cluster_2.1.0
## [49] DOSE_3.12.0 lazyeval_0.2.2 crayon_1.4.1
## [52] genefilter_1.68.0 pkgconfig_2.0.3 labeling_0.4.2
## [55] tweenr_1.0.1 pkgload_1.1.0 nnet_7.3-12
## [58] devtools_2.3.2 rlang_0.4.7 lifecycle_0.2.0
## [61] BiocFileCache_1.10.2 modelr_0.1.8 cellranger_1.1.0
## [64] rprojroot_2.0.2 polyclip_1.10-0 Matrix_1.2-17
## [67] urltools_1.7.3 carData_3.0-4 reprex_1.0.0
## [70] base64enc_0.1-3 ggridges_0.5.2 processx_3.4.5
## [73] png_0.1-7 viridisLite_0.3.0 bitops_1.0-6
## [76] KernSmooth_2.23-15 Biostrings_2.54.0 blob_1.2.1
## [79] qvalue_2.18.0 jpeg_0.1-8.1 rstatix_0.6.0
## [82] gridGraphics_0.5-0 ggsignif_0.6.0 scales_1.1.1
## [85] memoise_2.0.0 magrittr_2.0.1 plyr_1.8.6
## [88] gplots_3.0.4 gdata_2.18.0 zlibbioc_1.32.0
## [91] compiler_3.6.1 RColorBrewer_1.1-2 Rsamtools_2.2.3
## [94] cli_2.2.0.9000 XVector_0.26.0 ps_1.3.4
## [97] htmlTable_2.0.1 Formula_1.2-4 MASS_7.3-51.4
## [100] tidyselect_1.1.0 stringi_1.5.3 highr_0.8
## [103] yaml_2.2.1 GOSemSim_2.17.1 askpass_1.1
## [106] locfit_1.5-9.4 latticeExtra_0.6-29 ggrepel_0.8.2
## [109] grid_3.6.1 fastmatch_1.1-0 tools_3.6.1
## [112] rio_0.5.16 rstudioapi_0.13 foreign_0.8-72
## [115] gridExtra_2.3 farver_2.0.3 HGNChelper_0.8.1
## [118] ggraph_2.0.3 digest_0.6.27 rvcheck_0.1.8
## [121] BiocManager_1.30.10 Rcpp_1.0.5 car_3.0-9
## [124] broom_0.7.0 httr_1.4.2 ggdendro_0.1.21
## [127] AnnotationDbi_1.48.0 colorspace_2.0-0 rvest_0.3.6
## [130] XML_3.99-0.3 fs_1.5.0 splines_3.6.1
## [133] graphlayouts_0.7.0 ggplotify_0.0.5 plotly_4.9.2.1
## [136] sessioninfo_1.1.1 xtable_1.8-4 jsonlite_1.7.1
## [139] tidygraph_1.2.0 testthat_2.3.2 R6_2.5.0
## [142] Hmisc_4.4-1 pillar_1.4.6 htmltools_0.5.1.1
## [145] glue_1.4.2 fastmap_1.0.1 DT_0.15
## [148] fgsea_1.12.0 pkgbuild_1.1.0 utf8_1.1.4
## [151] lattice_0.20-38 curl_4.3 gtools_3.8.2
## [154] zip_2.1.0 GO.db_3.10.0 openxlsx_4.2.3
## [157] openssl_1.4.2 survival_3.2-7 limma_3.42.2
## [160] rmarkdown_2.5 desc_1.2.0 munsell_0.5.0
## [163] DO.db_2.9 GenomeInfoDbData_1.2.2 haven_2.3.1
## [166] reshape2_1.4.4 gtable_0.3.0