Aims: to determine overlap of junctions and clusters derived from our own data and SRP058181
source(here::here("R", "file_paths.R"))
source(here::here("R", "convert_leafcutter.R"))
clu <-
setNames(
list(
fread(
file.path(
path_to_results,
"leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz"
)
),
fread(
file.path(
path_to_results,
"SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz"
)
)
),
c("own", "SRP058181")
) %>%
lapply(., function(x){
x %>%
dplyr::rename(intron = V1) %>%
convert_leafcutter(leafcutter_results = .,
use_strand = TRUE) %>%
dasper::annotate_junc_ref(junc_metadata = .,
gtf = path_to_ref_gtf)
})
saveRDS(
clu$own,
file =
file.path(
path_to_results,
"leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
)
)
saveRDS(
clu$SRP058181,
file =
file.path(
path_to_results,
"SRP058181/leafcutter/intron_clustering/SRP058181_all_clusters_gtfannotated.Rds"
)
)
# Import annotated clusters
clu <-
setNames(
list(
readRDS(
file.path(
path_to_results,
"leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
)
),
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/intron_clustering/SRP058181_all_clusters_gtfannotated.Rds"
)
)
),
c("own", "SRP058181")
)
# Check that majority of clu are annotated
clu %>%
lapply(., function(x){
x %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n()) %>%
dplyr::ungroup() %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::arrange(-prop)
})
## $own
## # A tibble: 7 x 3
## junc_cat n prop
## <chr> <int> <dbl>
## 1 annotated 96622 0.634
## 2 novel_acceptor 18186 0.119
## 3 novel_donor 12927 0.0849
## 4 none 10988 0.0721
## 5 novel_exon_skip 9255 0.0608
## 6 ambig_gene 2903 0.0191
## 7 novel_combo 1417 0.00930
##
## $SRP058181
## # A tibble: 7 x 3
## junc_cat n prop
## <chr> <int> <dbl>
## 1 annotated 87133 0.676
## 2 novel_acceptor 13479 0.105
## 3 novel_donor 11717 0.0910
## 4 novel_exon_skip 7924 0.0615
## 5 none 4820 0.0374
## 6 ambig_gene 2544 0.0198
## 7 novel_combo 1183 0.00918
Final analysis included in paper: cluster-level overlaps using differentially spliced clusters and the "exact match" definition (most stringent, but also makes the most sense)
# Find junctions that overlap
overlapped_junctions <-
GenomicRanges::findOverlaps(query = clu$SRP058181,
subject = clu$own,
ignore.strand = F,
type = "equal")
# Absolute numbers
print(str_c("Total number of junctions found in own data: ", length(clu$own)))
## [1] "Total number of junctions found in own data: 152298"
print(str_c("Total number of junctions found in SRP058181: ", length(clu$SRP058181)))
## [1] "Total number of junctions found in SRP058181: 128800"
print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))
## [1] "Number of intersecting junctions: 107703"
# Percentage overlap from smaller SRP058181 dataset into larger
print(str_c("Percentage overlap from smaller SRP058181 dataset into larger: ",
c((overlapped_junctions %>% queryHits() %>% unique() %>% length())/(clu$SRP058181 %>% length()) * 100)))
## [1] "Percentage overlap from smaller SRP058181 dataset into larger: 83.6203416149068"
# Percentage overlap from larger own data into smaller SRP058181
print(str_c("Percentage overlap from larger own data into smaller SRP058181: ",
c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(clu$own %>% length()) * 100)))
## [1] "Percentage overlap from larger own data into smaller SRP058181: 70.7185911830753"
plot <-
clu$SRP058181[queryHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(clu$SRP058181)) %>%
dplyr::mutate(overlap = "SRP058181 into own data") %>%
dplyr::bind_rows(clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(clu$own)) %>%
dplyr::mutate(overlap = "Own data into SRP058181")) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>%
# ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat, alpha = overlap)) +
ggplot(aes(x = junc_cat, y = prop_overlap, alpha = overlap)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
# facet_wrap(vars(overlap)) +
labs(x = "", y = "Proportion of overlapping junctions") +
# scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
scale_alpha_manual(name = "Direction of overlap",
values = c(1, 0.5)) +
guides(fill = FALSE,
alpha=guide_legend(nrow=2,byrow=TRUE)) +
theme_rhr +
theme(legend.position = "bottom")
plot
# ggsave(plot,
# filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/Departmental/SRP058181_junction_overlap.tiff",
# device = "tiff",
# height = 100,
# width = 100,
# dpi = 200,
# units = "mm")
# Import leafcutter lists
leafcutter_list <-
setNames(
list(
readRDS(
file.path(
path_to_results,
"leafcutter/diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.Rds"
)
),
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
)
)
),
c("own", "SRP058181")
)
# Import differentially spliced clusters (no dPSI filter, yet)
ds <-
leafcutter_list %>%
lapply(., function(x){
x$significant_clusters_0.05_filter %>%
dplyr::mutate(chr = chr %>%
str_replace(., "chr", ""),
cluster_id = str_c(chr, ":", cluster)) %>%
dplyr::select(comparison, chr, cluster, cluster_id, everything()) %>%
# Filtering for only those comparisons that are comparable between the two datasets
# I.e. Control_vs_PD, Control_vs_PDD, PD_vs_PDD
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"))
})
# Filter by differentially spliced clusters
ds_clu <-
setNames(list(clu$own[elementMetadata(clu$own)[, "cluster_id"] %in%
c(ds$own$cluster_id %>% unique())],
clu$SRP058181[elementMetadata(clu$SRP058181)[, "cluster_id"] %in%
c(ds$SRP058181$cluster_id %>% unique())]),
c("own", "SRP058181"))
# Find junctions that overlap
overlapped_junctions <-
GenomicRanges::findOverlaps(query = ds_clu$SRP058181,
subject = ds_clu$own,
ignore.strand = F,
type = "equal")
# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))
## [1] "Total number of junctions within ds clusters in own data: 15876"
print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))
## [1] "Total number of junctions within ds clusters in SRP058181: 2992"
print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))
## [1] "Number of intersecting junctions: 726"
# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: ",
c((overlapped_junctions %>% queryHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: 24.2647058823529"
print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ",
c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 4.572940287226"
ds_clu$SRP058181[queryHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$SRP058181)) %>%
dplyr::mutate(overlap = "SRP058181 into own data") %>%
dplyr::bind_rows(ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$own)) %>%
dplyr::mutate(overlap = "Own data into SRP058181")) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>%
# ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat, alpha = overlap)) +
ggplot(aes(x = junc_cat, y = prop_overlap, alpha = overlap)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
# facet_wrap(vars(overlap)) +
labs(x = "", y = "Proportion of overlapping junctions") +
# scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
scale_alpha_manual(name = "Direction of overlap",
values = c(1, 0.5)) +
guides(fill = FALSE,
alpha=guide_legend(nrow=2,byrow=TRUE)) +
theme_rhr +
theme(legend.position = "bottom")
# Number of genes
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
length()
## [1] 104
# Gene names
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
sort()
## [1] "ABCD4" "ACAA1" "ACBD4" "ACSBG1" "ACTR8"
## [6] "ACYP2" "ADAM15" "AGRN" "AKAP8L" "AL139260.1"
## [11] "ANK2" "AP3S1" "ARMC10" "ART3" "ASPH"
## [16] "BCAS1" "BRSK2" "C1orf61" "CAMK2B" "CAPN10"
## [21] "CDK5RAP1" "CEP290" "CERS4" "CHKA" "CIRBP"
## [26] "CLK4" "COMMD4" "CSAD" "CSPG5" "DAZAP1"
## [31] "DHX30" "DOCK10" "DPH7" "DST" "EIF2D"
## [36] "ELOVL2" "ENY2" "EPB41L2" "EPB41L3" "ETNPPL"
## [41] "FAM168A" "FASTK" "FBRSL1" "FBXO7" "FYN"
## [46] "GDPD5" "GGT5" "GPBP1" "IL17RB" "ILK"
## [51] "IQSEC1" "KMT2E" "LIMCH1" "LUC7L3" "MARK2"
## [56] "MBP" "METTL26" "MOK" "MRNIP" "MTX2"
## [61] "MVD" "MYL6" "MYO6" "NPL" "NSFL1C"
## [66] "PCM1" "PCNX2" "PICALM" "PIP5K1C" "PPIE"
## [71] "PPP1R3F" "PREPL" "PRMT7" "PRPF38B" "PTPN12"
## [76] "R3HDM1" "R3HDM2" "RBM3" "RBM39" "RHOT1"
## [81] "RPAIN" "SBDSP1" "SEPT4" "SEPT8" "SERPINH1"
## [86] "SFXN5" "SHTN1" "SLC25A23" "SLC25A27" "SLC30A3"
## [91] "SMARCD3" "SUZ12P1" "SYNGAP1" "TAZ" "TESPA1"
## [96] "TIAL1" "TJP1" "TMEM144" "TMEM161B-AS1" "TMEM165"
## [101] "TPD52L1" "TRIM33" "TSPOAP1" "UQCRB"
# Read in PD genes
PD_genes <-
setNames(vector(mode = "list", length = 2),
c("mendelian", "sporadic"))
PD_genes$mendelian <-
read_delim(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/20191128_GE_ParkinsonDiseaseComplexParkinsonism_greengenes.tsv"
),
delim = "\t"
) %>%
dplyr::rename(Gene = `Gene Symbol`)
PD_genes$sporadic <-
read_excel(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/TableS6_Complete_summary_statistics_for_QTL_Mendelian_randomization.xlsx"
)
) %>%
dplyr::filter(`Pass Bonferroni` == "pass")
# Check for overlaps
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::filter(gene_name_junc %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
sort()
## [1] "FBRSL1" "FBXO7"
# Construct contingency table of differential splicing
# Extract cluster_significance from corresponding leafcutter lists
clu_significance <-
leafcutter_list%>%
lapply(., function(x){
x$cluster_significance %>%
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"),
!str_detect(genes, ",")) %>%
dplyr::mutate(cluster_id = str_replace(cluster, "chr", "")) %>%
dplyr::select(comparison, cluster_id, everything(), -cluster)
})
cont_table <- setNames(vector(mode = "list", length = length(clu)),
names(clu))
for(i in 1:length(clu)){
cont_table[[i]] <- mcols(clu[[i]])[c("cluster_id", "gene_name_junc")] %>%
as_tibble() %>%
dplyr::filter(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
dplyr::filter(status == "Success") %>%
.[["cluster_id"]] %>%
unique())) %>%
tidyr::unnest(gene_name_junc) %>%
dplyr::distinct(cluster_id, gene_name_junc) %>%
dplyr::group_by(cluster_id) %>%
dplyr::filter(n() <= 1) %>%
dplyr::mutate(ds = case_when(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
dplyr::filter(p.adjust < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ "yes",
TRUE~ "no"),
PD_gene = case_when(gene_name_junc %in%
unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene)) ~ "yes",
TRUE ~ "no")) %>%
with(., table(ds, PD_gene, dnn = c("Differentially spliced?", "Overlapping PD gene?")))
}
cont_table
## $own
## Overlapping PD gene?
## Differentially spliced? no yes
## no 29336 398
## yes 2867 50
##
## $SRP058181
## Overlapping PD gene?
## Differentially spliced? no yes
## no 23551 342
## yes 521 6
# Fisher's exact test
cont_table %>%
lapply(., function(x) x %>% fisher.test())
## $own
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.09563
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9357161 1.7329065
## sample estimates:
## odds ratio
## 1.285453
##
##
## $SRP058181
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.7112
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.2876699 1.7558886
## sample estimates:
## odds ratio
## 0.7930489
ds_overlap <- ds$own %>%
dplyr::filter(cluster_id %in% c(ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["cluster_id"]] %>%
unique())) %>%
dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect) %>%
dplyr::full_join(ds$SRP058181 %>%
dplyr::filter(cluster_id %in% c(ds_clu$SRP058181[queryHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["cluster_id"]] %>%
unique())) %>%
dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect),
by = c("comparison", "chr", "start", "end"),
suffix = c(".own", ".SRP058181")) %>%
dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE,
direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
TRUE ~ NA)) %>%
dplyr::filter(!is.na(same_direction_of_effect))
ds_overlap %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
ds_overlap %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n(),
same_direction_of_effect = sum(same_direction_of_effect)) %>%
dplyr::mutate(prop = (same_direction_of_effect/n) * 100)
ds_overlap %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>%
tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>%
dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>%
ggplot(aes(x = abs(dPSI), fill = dataset)) +
geom_density(alpha = 0.5) +
labs(x = expression("Absolute "*Delta*"PSI")) +
theme_rhr
a <- ds_overlap %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
b <- ds_overlap %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
ggarrange(a, b,
ncol = 1,
labels = c("a", "b"))
# Correlation of junctions within ds clusters from same comparison
ds_overlap %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 5366852, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5863913
# Correlation of junctions within ds clusters from same comparison and with same direction of effect
ds_overlap %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 291220, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9283554
This section is simply looking to see that junctions found within differentially spliced clusters from one dataset, also validate across in the other dataset (without needing to be differentially spliced in the other, too).
# Find junctions that overlap
overlapped_junctions <-
GenomicRanges::findOverlaps(query = clu$own,
subject = ds_clu$SRP058181,
ignore.strand = F,
type = "equal")
# Absolute numbers
print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))
## [1] "Total number of junctions within ds clusters in SRP058181: 2992"
print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))
## [1] "Number of intersecting junctions: 2678"
# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: ",
c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: 89.5053475935829"
ds_clu$SRP058181[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$SRP058181)) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>%
# ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) +
ggplot(aes(x = junc_cat, y = prop_overlap)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
# facet_wrap(vars(overlap)) +
labs(x = "", y = "Proportion of overlapping junctions") +
# scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
guides(fill = FALSE,
alpha=guide_legend(nrow=2,byrow=TRUE)) +
theme_rhr +
theme(legend.position = "bottom")
# Find junctions that overlap
overlapped_junctions <-
GenomicRanges::findOverlaps(query = clu$SRP058181,
subject = ds_clu$own,
ignore.strand = F,
type = "equal")
# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))
## [1] "Total number of junctions within ds clusters in own data: 15876"
print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))
## [1] "Number of intersecting junctions: 12493"
# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ",
c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 78.6911060720584"
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$own)) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>%
# ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) +
ggplot(aes(x = junc_cat, y = prop_overlap)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
# facet_wrap(vars(overlap)) +
labs(x = "", y = "Proportion of overlapping junctions") +
scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
guides(fill = FALSE,
alpha=guide_legend(nrow=2,byrow=TRUE)) +
theme_rhr +
theme(legend.position = "bottom")
# Import leafcutter lists
leafcutter_list <-
setNames(
list(
readRDS(
file.path(
path_to_results,
"leafcutter/diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.Rds")
),
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
)
)
),
c("own", "SRP058181")
)
# Import differentially spliced clusters (no dPSI filter, yet)
ds <-
leafcutter_list %>%
lapply(., function(x){
x$significant_clusters_0.05_filter %>%
dplyr::mutate(chr = chr %>%
str_replace(., "chr", ""),
cluster_id = str_c(chr, ":", cluster)) %>%
dplyr::select(comparison, chr, cluster, cluster_id, everything()) %>%
# Filtering for only those comparisons that are comparable between the two datasets
# I.e. Control_vs_PD, Control_vs_PDD, PD_vs_PDD
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"))
})
# Filter by differentially spliced clusters
ds_clu <-
setNames(list(clu$own[elementMetadata(clu$own)[, "cluster_id"] %in%
c(ds$own$cluster_id %>% unique())],
clu$SRP058181[elementMetadata(clu$SRP058181)[, "cluster_id"] %in%
c(ds$SRP058181$cluster_id %>% unique())]),
c("own", "SRP058181"))
# Find junctions that overlap
overlapped_junctions <-
GenomicRanges::findOverlaps(query = ds_clu$SRP058181,
subject = ds_clu$own,
ignore.strand = F,
type = "equal")
# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))
## [1] "Total number of junctions within ds clusters in own data: 12755"
print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))
## [1] "Total number of junctions within ds clusters in SRP058181: 6793"
print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))
## [1] "Number of intersecting junctions: 868"
# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: ",
c((overlapped_junctions %>% queryHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: 12.7778595613131"
print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ",
c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 6.80517444139553"
ds_clu$SRP058181[queryHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$SRP058181)) %>%
dplyr::mutate(overlap = "SRP058181 into own data") %>%
dplyr::bind_rows(ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$own)) %>%
dplyr::mutate(overlap = "Own data into SRP058181")) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>%
# ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat, alpha = overlap)) +
ggplot(aes(x = junc_cat, y = prop_overlap, alpha = overlap)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
# facet_wrap(vars(overlap)) +
labs(x = "", y = "Proportion of overlapping junctions") +
# scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
scale_alpha_manual(name = "Direction of overlap",
values = c(1, 0.5)) +
guides(fill = FALSE,
alpha=guide_legend(nrow=2,byrow=TRUE)) +
theme_rhr +
theme(legend.position = "bottom")
# Number of genes
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
length()
## [1] 156
# Gene names
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
sort()
## [1] "ABHD11" "AC002470.2" "ACTR8" "ADRA1B" "AIFM3"
## [6] "AKAP8" "AKAP8L" "AMPD2" "APIP" "ARAP1"
## [11] "ARAP2" "ARHGAP21" "ARHGAP9" "ARHGEF1" "ASIC3"
## [16] "ASPA" "ASPDH" "ASPH" "ATP1A2" "BCAS1"
## [21] "BRWD1" "BTAF1" "C1orf61" "C20orf27" "CAPN2"
## [26] "CARMIL3" "CARNS1" "CAST" "CCDC74A" "CCNL1"
## [31] "CD22" "CDC37" "CDK18" "CLIP2" "CLK4"
## [36] "CNOT3" "COQ9" "CPT1C" "CREBBP" "CSDE1"
## [41] "CTSZ" "CYP2J2" "DBI" "DBNDD1" "DCAF16"
## [46] "DLGAP1-AS4" "DMXL1" "DPH7" "DPY19L2P1" "DRAM2"
## [51] "DTX3" "EIF2D" "ERCC1" "F3" "FAAH"
## [56] "FAM13B" "FAM221A" "FBXO7" "FLAD1" "GABBR1"
## [61] "GGT5" "GORASP1" "HAUS2" "HHATL" "IFT140"
## [66] "IQSEC2" "IZUMO4" "KIAA1755" "L3MBTL2" "LINC00299"
## [71] "LMO3" "LPIN1" "LUC7L3" "MAST1" "MBNL2"
## [76] "MFSD12" "MIA3" "MORN5" "MRPL43" "MRPL51"
## [81] "MSTO1" "MTDH" "MTMR12" "MVD" "NAALAD2"
## [86] "NAP1L4" "NCALD" "NDRG2" "NDUFA6-DT" "NDUFAF6"
## [91] "NELFA" "NETO1" "NSUN5P2" "ODF2" "OGFOD3"
## [96] "OSBPL1A" "P4HA2" "PCYT2" "PIN4" "PLCH2"
## [101] "PLEKHA5" "PLXNB1" "POMT2" "POT1" "PRKAG1"
## [106] "PRKAG2" "PRR14" "PRRC2B" "PRRC2C" "PSMD13"
## [111] "PTPRZ1" "RANGAP1" "RAPGEF1" "RASGRF1" "RASSF8-AS1"
## [116] "RBM25" "RBMX" "RERGL" "RFC5" "RGS20"
## [121] "RHOJ" "RNF212" "ROCK2" "ROGDI" "RPAIN"
## [126] "RPUSD3" "RUFY3" "SCMH1" "SEZ6L2" "SHC4"
## [131] "SLCO1C1" "SNCB" "SNHG29" "SNHG8" "SNX32"
## [136] "SON" "SPATS2L" "SPINDOC" "SPRYD7" "SPSB3"
## [141] "SRCIN1" "SRPK2" "STMN4" "STYXL1" "TMEM167A"
## [146] "TMEM176B" "TMEM63A" "TMEM98" "TSPAN17" "UBA3"
## [151] "UIMC1" "URB1" "VEZT" "XPO1" "ZNF530"
## [156] "ZNF76"
# Check for overlaps
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::filter(gene_name_junc %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
sort()
## [1] "DCAF16" "FBXO7"
# Construct contingency table of differential splicing
# Extract cluster_significance from corresponding leafcutter lists
clu_significance <-
leafcutter_list%>%
lapply(., function(x){
x$cluster_significance %>%
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"),
!str_detect(genes, ",")) %>%
dplyr::mutate(cluster_id = str_replace(cluster, "chr", "")) %>%
dplyr::select(comparison, cluster_id, everything(), -cluster)
})
cont_table <- setNames(vector(mode = "list", length = length(clu)),
names(clu))
for(i in 1:length(clu)){
cont_table[[i]] <- mcols(clu[[i]])[c("cluster_id", "gene_name_junc")] %>%
as_tibble() %>%
dplyr::filter(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
dplyr::filter(status == "Success") %>%
.[["cluster_id"]] %>%
unique())) %>%
tidyr::unnest(gene_name_junc) %>%
dplyr::distinct(cluster_id, gene_name_junc) %>%
dplyr::group_by(cluster_id) %>%
dplyr::filter(n() <= 1) %>%
dplyr::mutate(ds = case_when(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
dplyr::filter(p.adjust < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ "yes",
TRUE~ "no"),
PD_gene = case_when(gene_name_junc %in%
unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene)) ~ "yes",
TRUE ~ "no")) %>%
with(., table(ds, PD_gene, dnn = c("Differentially spliced?", "Overlapping PD gene?")))
}
cont_table
## $own
## Overlapping PD gene?
## Differentially spliced? no yes
## no 29746 404
## yes 2457 44
##
## $SRP058181
## Overlapping PD gene?
## Differentially spliced? no yes
## no 22548 327
## yes 1524 21
# Fisher's exact test
cont_table %>%
lapply(., function(x) x %>% fisher.test())
## $own
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.08873
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9405449 1.8082126
## sample estimates:
## odds ratio
## 1.318531
##
##
## $SRP058181
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.9118
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.5784634 1.4824421
## sample estimates:
## odds ratio
## 0.9501564
ds_overlap <- ds$own %>%
dplyr::filter(cluster_id %in% c(ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["cluster_id"]] %>%
unique())) %>%
dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect) %>%
dplyr::full_join(ds$SRP058181 %>%
dplyr::filter(cluster_id %in% c(ds_clu$SRP058181[queryHits(overlapped_junctions)] %>%
as.data.frame() %>%
.[["cluster_id"]] %>%
unique())) %>%
dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect),
by = c("comparison", "chr", "start", "end"),
suffix = c(".own", ".SRP058181")) %>%
dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE,
direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
TRUE ~ NA)) %>%
dplyr::filter(!is.na(same_direction_of_effect))
ds_overlap %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
ds_overlap %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n(),
same_direction_of_effect = sum(same_direction_of_effect)) %>%
dplyr::mutate(prop = (same_direction_of_effect/n) * 100)
ds_overlap %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>%
tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>%
dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>%
ggplot(aes(x = abs(dPSI), fill = dataset)) +
geom_density(alpha = 0.5) +
labs(x = expression("Absolute "*Delta*"PSI")) +
theme_rhr
a <- ds_overlap %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
b <- ds_overlap %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
ggarrange(a, b,
ncol = 1,
labels = c("a", "b"))
# Correlation of junctions within ds clusters from same comparison
ds_overlap %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 4254074, p-value = 0.02033
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1381891
# Correlation of junctions within ds clusters from same comparison and with same direction of effect
ds_overlap %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 24930, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9112306
This section is simply looking to see that junctions found within differentially spliced clusters from one dataset, also validate across in the other dataset (without needing to be differentially spliced in the other, too).
# Find junctions that overlap
overlapped_junctions <-
GenomicRanges::findOverlaps(query = clu$own,
subject = ds_clu$SRP058181,
ignore.strand = F,
type = "equal")
# Absolute numbers
print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))
## [1] "Total number of junctions within ds clusters in SRP058181: 6793"
print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))
## [1] "Number of intersecting junctions: 5952"
# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: ",
c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: 87.6196084204328"
ds_clu$SRP058181[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$SRP058181)) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>%
# ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) +
ggplot(aes(x = junc_cat, y = prop_overlap)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
# facet_wrap(vars(overlap)) +
labs(x = "", y = "Proportion of overlapping junctions") +
# scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
guides(fill = FALSE,
alpha=guide_legend(nrow=2,byrow=TRUE)) +
theme_rhr +
theme(legend.position = "bottom")
# Find junctions that overlap
overlapped_junctions <-
GenomicRanges::findOverlaps(query = clu$SRP058181,
subject = ds_clu$own,
ignore.strand = F,
type = "equal")
# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))
## [1] "Total number of junctions within ds clusters in own data: 12755"
print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))
## [1] "Number of intersecting junctions: 9993"
# Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181
print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ",
c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))
## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 78.3457467659741"
ds_clu$own[subjectHits(overlapped_junctions)] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(n = n(),
prop_overlap = n/length(ds_clu$own)) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>%
# ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) +
ggplot(aes(x = junc_cat, y = prop_overlap)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
# facet_wrap(vars(overlap)) +
labs(x = "", y = "Proportion of overlapping junctions") +
scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
guides(fill = FALSE,
alpha=guide_legend(nrow=2,byrow=TRUE)) +
theme_rhr +
theme(legend.position = "bottom")
# Find overlapping junctions/clusters
overlapped_junctions <-
GenomicRanges::findOverlaps(query = clu$SRP058181,
subject = clu$own,
ignore.strand = F,
type = "equal")
# Filter by cluster names from overlap
clu_overlap <- clu$own[subjectHits(overlapped_junctions), "cluster_id"] %>%
as_tibble() %>%
dplyr::inner_join(clu$SRP058181[queryHits(overlapped_junctions), "cluster_id"] %>%
as_tibble(),
by = c("seqnames", "start", "end", "strand"),
suffix = c(".own", ".SRP058181"))
# Remove clusters from own that overlap more than one cluster in SRP058181
# I.e. ensure no ambiguous cluster mapping
ambig_clu <- clu_overlap %>%
dplyr::distinct(cluster_id.own, cluster_id.SRP058181) %>%
dplyr::filter(!is.na(cluster_id.SRP058181)) %>%
dplyr::group_by(cluster_id.own) %>%
dplyr::filter(n() > 1)
# Extract non-ambiguous overlapping clusters
overlapping_clusters <- clu_overlap %>%
dplyr::filter(!cluster_id.own %in% unique(ambig_clu$cluster_id.own)) %>%
.[["cluster_id.own"]] %>%
unique()
# Absolute numbers
print(str_c("Number of clusters in own data: ", length(clu$own$cluster_id %>% unique())))
## [1] "Number of clusters in own data: 43544"
print(str_c("Number of clusters in SRP058181: ", length(clu$SRP058181$cluster_id %>% unique())))
## [1] "Number of clusters in SRP058181: 37021"
print(str_c("Number of overlapping clusters between both datasets: ", length(overlapping_clusters)))
## [1] "Number of overlapping clusters between both datasets: 31825"
# Use intron usage, as unlike clu (leafcutter cluster definitions), this will only include successfully tested clusters
# "Successful" test means cluster passed necessary minimum filters for differential splicing (i.e. min_samples_per_intron, min_samples_per_group, min_coverage)
intron_usage <-
leafcutter_list %>%
lapply(., function(x){
x$intron_usage %>%
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")) %>%
tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster"), sep = ":") %>%
tidyr::separate(col = cluster, into = c("clu", "id", "strand"), sep = "_", remove = FALSE) %>%
dplyr::mutate(chr = str_replace(chr, "chr", ""),
cluster_id = str_c(chr, ":", cluster))
})
# Loop through clusters to determine proportion overlap
# Run in parallel
cl <- makeCluster(20)
# Register clusters
registerDoParallel(cl)
getDoParWorkers()
clu_prop_overlap <- foreach(i = 1:length(overlapping_clusters),
.verbose = TRUE,
.packages = c("tidyverse", "stringr")) %dopar% {
# Determine number of junctions in own cluster
n_junctions.own <- intron_usage$own %>%
dplyr::filter(cluster_id == overlapping_clusters[i]) %>%
dplyr::distinct(chr, start, end, strand) %>%
nrow()
# Determine number of junctions in overlap
n_overlap <- clu_overlap %>%
dplyr::filter(cluster_id.own == overlapping_clusters[i],
!is.na(cluster_id.SRP058181)) %>%
nrow()
# Determine number of junctions in SRP058181 cluster
cluster_id.SRP058181 <- clu_overlap %>%
dplyr::filter(cluster_id.own == overlapping_clusters[i],
!is.na(cluster_id.SRP058181)) %>%
.[["cluster_id.SRP058181"]] %>%
unique()
n_junctions.SRP058181 <- intron_usage$SRP058181 %>%
dplyr::filter(cluster_id == cluster_id.SRP058181) %>%
dplyr::distinct(chr, start, end, strand) %>%
nrow()
df <- tibble(cluster_id.own = overlapping_clusters[i],
n_junctions.own = n_junctions.own,
cluster_id.SRP058181 = cluster_id.SRP058181,
n_junctions.SRP058181 = n_junctions.SRP058181,
n_overlap = n_overlap,
prop_overlap.own = (n_overlap/n_junctions.own)*100,
prop_overlap.SRP058181 = (n_overlap/n_junctions.SRP058181)*100)
df
}
# Stop cluster
stopCluster(cl = cl)
clu_prop_overlap <- clu_prop_overlap %>%
qdapTools::list_df2df() %>%
dplyr::select(-X1)
saveRDS(
clu_prop_overlap,
file.path(
path_to_results,
"leafcutter/diff_splicing_PCaxes/cluster_overlap_with_SRP058181.Rds"
)
)
clu_prop_overlap <-
readRDS(
file.path(
path_to_results,
"leafcutter/diff_splicing_PCaxes/cluster_overlap_with_SRP058181.Rds"
)
)
clu_prop_overlap %>%
tidyr::gather(key = "overlap", value = "prop_overlap", contains("prop_overlap")) %>%
dplyr::mutate(overlap = case_when(overlap == "prop_overlap.own" ~ "own --> SRP058181",
overlap == "prop_overlap.SRP058181" ~ "SRP058181 --> own")) %>%
ggplot(aes(x = prop_overlap, fill = overlap)) +
geom_histogram(binwidth = 5, alpha = 0.5, colour = "black") +
facet_wrap(~overlap) +
labs(x = "Proportion overlap of cluster from discovery dataset into validation dataset") +
theme_rhr +
theme(legend.position = "none")
cluster_significance
dataframe for both datasets, which contains the necessary information re. cluster-level differential splicing.# Extract cluster_significance from corresponding leafcutter lists
clu_significance <-
leafcutter_list%>%
lapply(., function(x){
x$cluster_significance %>%
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"),
!str_detect(genes, ",")) %>%
dplyr::mutate(cluster_id = str_replace(cluster, "chr", "")) %>%
dplyr::select(comparison, cluster_id, everything(), -cluster)
})
# Filter cluster overlap proportions to determine clusters that "exactly" match
clu_prop_overlap_exact <- clu_prop_overlap %>%
dplyr::filter(prop_overlap.own != Inf,
prop_overlap.own >= 100,
prop_overlap.own == prop_overlap.SRP058181)
# Filter clu_overlap to include only overlapping clusters that "exactly" match
clu_overlap_exact <- clu_overlap %>%
dplyr::distinct(cluster_id.own, cluster_id.SRP058181) %>%
dplyr::filter(cluster_id.own %in% clu_prop_overlap_exact$cluster_id.own)
print(str_c("Number of clusters that overlap exactly between both datasets: ", length(clu_overlap_exact$cluster_id.own %>% unique())))
## [1] "Number of clusters that overlap exactly between both datasets: 13433"
# Construct contingency table of differential splicing
cont_table <- clu_overlap_exact %>%
dplyr::mutate(ds.own = fct_relevel(case_when(cluster_id.own %in% c(clu_significance$own %>%
dplyr::filter(p.adjust < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE")),
ds.SRP058181 = fct_relevel(case_when(cluster_id.SRP058181 %in% c(clu_significance$SRP058181 %>%
dplyr::filter(p.adjust < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE"))) %>%
with(., table(ds.SRP058181, ds.own, dnn = c("Differentially spliced in SRP058181?", "Differentially spliced in own data?")))
cont_table
## Differentially spliced in own data?
## Differentially spliced in SRP058181? TRUE FALSE
## TRUE 64 696
## FALSE 772 11901
# Fisher's exact test
cont_table %>%
fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.01312
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.068660 1.853993
## sample estimates:
## odds ratio
## 1.417495
# Construct contingency table of differential splicing
cont_table <- clu_overlap_exact %>%
dplyr::mutate(ds.own = fct_relevel(case_when(cluster_id.own %in% c(clu_significance$own %>%
dplyr::filter(p.adjust < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE")),
ds.SRP058181 = fct_relevel(case_when(cluster_id.SRP058181 %in% c(clu_significance$SRP058181 %>%
dplyr::filter(p < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE"))) %>%
with(., table(ds.SRP058181, ds.own, dnn = c("Differentially spliced in SRP058181?", "Differentially spliced in own data?")))
cont_table
## Differentially spliced in own data?
## Differentially spliced in SRP058181? TRUE FALSE
## TRUE 304 4020
## FALSE 532 8577
# Fisher's exact test
cont_table %>%
fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.008332
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.050127 1.413516
## sample estimates:
## odds ratio
## 1.219146
# Find overlapping junctions/clusters
overlapped_junctions <-
GenomicRanges::findOverlaps(query = clu$SRP058181,
subject = ds_clu$own,
ignore.strand = F,
type = "equal")
# Filter by cluster names from overlap
clu_overlap_ds <-
ds_clu$own[subjectHits(overlapped_junctions), "cluster_id"] %>%
as_tibble() %>%
dplyr::inner_join(clu$SRP058181[queryHits(overlapped_junctions), "cluster_id"] %>%
as_tibble(),
by = c("seqnames", "start", "end", "strand"),
suffix = c(".own", ".SRP058181"))
# Filter generated clu_prop_overlap for cluster ids from clu_overlap_ds
# clu_prop_overlap already filtered for ambiguous clusters
# Generate geom_density
clu_prop_overlap %>%
dplyr::filter(cluster_id.own %in% clu_overlap_ds$cluster_id.own) %>%
tidyr::gather(key = "overlap", value = "prop_overlap", contains("prop_overlap")) %>%
dplyr::mutate(overlap = case_when(overlap == "prop_overlap.own" ~ "ds own --> all SRP058181",
overlap == "prop_overlap.SRP058181" ~ "all SRP058181 --> ds own")) %>%
ggplot(aes(x = prop_overlap, fill = overlap)) +
geom_histogram(binwidth = 5, alpha = 0.5, colour = "black") +
facet_wrap(~overlap) +
labs(x = "Proportion overlap of cluster from discovery dataset into validation dataset") +
theme_rhr +
theme(legend.position = "none")
overlapping_clusters
) with clusters from own data that:
cluster_significance
dataframe for those clusters from overlapping_clusters
. Then, for each pairwise comparison, perform FDR-correction using the smaller set of overlapping clusters from SRP058181.intron_usage
) with cluster-level data (cluster_significance
) and filter for only those cluster ids in overlapping_clusters
. Once this is done, merge output from each dataset (ensuring to filter SRP058181 for only those clusters that are now significant following FDR-correction).same_direction_of_effect
. This column evaluates whether junctions that were found differentially spliced in the same comparison across one or both datasets share the same direction of effect.# Filter for overlapping clusters with 100% overlap from own data into SRP058181
# AND that are differentially spliced in own data
overlapping_clusters <-
clu_prop_overlap %>%
dplyr::filter(prop_overlap.own != Inf,
prop_overlap.own >= 100,
cluster_id.own %in% clu_overlap_ds$cluster_id.own
)
print(str_c("Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: ",
clu_significance$SRP058181 %>%
dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181,
status == "Success",
p.adjust < 0.05) %>%
dplyr::distinct(cluster_id) %>%
nrow()))
## [1] "Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: 88"
# Correct each comparison using fdr on smaller set
clu_significance_fdr <- clu_significance
clu_significance_fdr$SRP058181 <-
clu_significance$SRP058181 %>%
dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, status == "Success") %>%
dplyr::group_by(comparison) %>%
dplyr::mutate(p.adjust = p.adjust(p, method = "fdr"))
print(str_c("Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: ",
clu_significance_fdr$SRP058181 %>%
dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181,
status == "Success",
p.adjust < 0.05) %>%
dplyr::distinct(cluster_id) %>%
nrow()))
## [1] "Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: 116"
# Create list with dataframes from each dataset containing merged intron usage and cluster significance only for overlapping_clusters
clu_val <- vector(mode = "list", length = 2)
for(i in 1:length(clu_val)){
# Get name of each dataset
name <- names(intron_usage)[i]
# Column name for cluster ids in each dataset
col_name <- str_c("cluster_id.", name)
# Join intron usage (i.e. junction-level data) with cluster significance (cluster-level significance)
# And filter for only those clusters found within overlapping_clusters
clu_val[[i]] <- intron_usage[[name]] %>%
# Filter for appropriate comparisons
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")) %>%
dplyr::inner_join(clu_significance_fdr[[name]]) %>%
dplyr::select(chr, start, end, comparison, cluster_id, loglr, p.adjust, df, Control, PD, PDD, deltapsi, genes) %>%
# Add direction of effect
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change")) %>%
dplyr::filter(cluster_id %in% c(overlapping_clusters %>%
.[[col_name]]))
names(clu_val)[i] <- name
}
# Join lists together
# Add additional column evaluating for those junctions shared between comparisons whether they share direction of effect
# Remove any junctions that are not shared between comparisons
ds_val <- clu_val$own %>%
full_join(clu_val$SRP058181 %>%
dplyr::filter(p.adjust < 0.05),
by = c("comparison", "chr", "start", "end", "genes"),
suffix = c(".own", ".SRP058181")) %>%
dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE,
direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
TRUE ~ NA))
cont_table <-
ds_val %>%
dplyr::mutate(same_direction_of_effect = fct_relevel(as.factor(same_direction_of_effect),
levels = c("TRUE", "FALSE")),
ds_both = fct_relevel(case_when(p.adjust.own < 0.05 & p.adjust.SRP058181 < 0.05~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE"))) %>%
dplyr::filter(!is.na(same_direction_of_effect)) %>%
with(., table(same_direction_of_effect, ds_both,
dnn = c("Same direction of effect?", "Differentially spliced (same comparison) in both datasets?")))
cont_table
## Differentially spliced (same comparison) in both datasets?
## Same direction of effect? TRUE FALSE
## TRUE 93 168
## FALSE 120 187
cont_table %>%
fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.434
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.603794 1.231333
## sample estimates:
## odds ratio
## 0.8628749
a <- ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>%
tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>%
dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>%
ggplot(aes(x = abs(dPSI), fill = dataset)) +
geom_density(alpha = 0.5) +
coord_cartesian(ylim = c(0,40)) +
labs(x = expression("Absolute "*Delta*"PSI")) +
theme_rhr
b <- ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>%
tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>%
dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>%
ggplot(aes(x = abs(dPSI), fill = dataset)) +
geom_density(alpha = 0.5) +
coord_cartesian(ylim = c(0,40)) +
labs(x = expression("Absolute "*Delta*"PSI")) +
theme_rhr
ggarrange(a, b,
labels = c("a", "b"),
common.legend = TRUE,
legend = "bottom")
a <- ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
b <- ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
ggarrange(a, b,
ncol = 1,
labels = c("a", "b"))
# Correlation of junctions with same direction of effect within ds clusters
ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 385454, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8699206
# Correlation of all junctions within ds clusters that contain some junctions with same direction of effect
ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 20826034, p-value = 0.1909
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.0580042
ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
dplyr::group_by(cluster_id.own) %>%
dplyr::summarise(n = n(),
same_direction_of_effect = sum(same_direction_of_effect, na.rm = TRUE),
prop_own = same_direction_of_effect/n) %>%
ggplot(aes(x = prop_own)) +
geom_histogram(binwidth = 0.01) +
labs(x = "Proportion of junctions within a shared cluster that share the same direction of effect") +
coord_cartesian(xlim = c(0,1.05)) +
theme_rhr
Gene lists derived by using genes overlapping "validated" clusters that are differentially spliced in both datasets that contain junctions that share the same direction of effect.
# Extract genes
genes_list <- setNames(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE,
p.adjust.own < 0.05,
p.adjust.SRP058181 < 0.05
) %>%
group_split(comparison),
ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE,
p.adjust.own < 0.05,
p.adjust.SRP058181 < 0.05
) %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){
x %>%
.[["genes"]] %>%
unique()
})
lapply(genes_list, length)
## $Control_vs_PD
## [1] 3
##
## $Control_vs_PDD
## [1] 15
##
## $PD_vs_PDD
## [1] 10
# Background list
all_genes <- leafcutter_list$own$cluster_significance %>%
# remove clusters that overlap multiple genes
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
UpSetR::upset(data = fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")
genes_list
## $Control_vs_PD
## [1] "ARHGAP9" "KIAA1755" "TPMT"
##
## $Control_vs_PDD
## [1] "NAP1L4" "RAPGEF3" "PRKAG1" "RBM25" "POMT2" "RASGRF1" "RHOT1"
## [8] "AP3D1" "ARHGEF1" "RPS9" "ADORA1" "BCAS1" "LPIN1" "ACTR8"
## [15] "RUFY3"
##
## $PD_vs_PDD
## [1] "MVD" "DLGAP1-AS4" "AMPD2" "SCMH1" "CTSZ"
## [6] "CCNL1" "ARAP2" "CAST" "ABHD11" "RGS20"
gprofiler <- lapply(genes_list, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "fdr", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
gprofiler %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000) %>%
ggplot(aes(x = comparison,
y = term_name,
size = precision,
fill = p_value)) +
geom_point(pch = 21) +
scale_fill_viridis_c(direction = -1) +
labs(x = "Comparison", y = "Dataset: Cell type") +
theme_rhr
# Load internal specificity matrices
ctd_files <-
list.files(
file.path(
path_to_results,
"snRNA/specificity_matrices/2020_March/"
),
pattern = "ctd",
full.names = T)
ctd_list <- vector(mode = "list", length = length(ctd_files))
for(i in 1:length(ctd_files)){
# Load ctd
ctd_list[[i]] <- readRDS(ctd_files[i])
# Extract file name
title <-
ctd_files[i] %>%
str_replace(".*/", "") %>%
str_replace("\\..*", "") %>%
str_replace(".*_", "")
# Name list
names(ctd_list)[i] <- title
}
# Run EWCE
# Cannot run with Control_vs_PD, as list is too small for EWCE
ewce <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list[2:3],
ctd_list$Control, ctd_list$PD,
ctd_list$PDD, ctd_list$DLB,
celltypeLevel = 1,
reps = 100000,
genelistSpecies = "human",
sctSpecies = "human",
mouse_to_human = NULL)
saveRDS(
ewce,
file =
file.path(
path_to_results,
"leafcutter/ewce/ewce_leafcutter_ds_validated_celltype.Rds"
)
)
ewce <-
readRDS(
file.path(
path_to_results,
"leafcutter/ewce/ewce_leafcutter_ds_validated_celltype.Rds"
)
)
plot <- ewce %>%
dplyr::mutate(FDR.p = p.adjust(p, method="BH"),
Class = case_when(
CellType %in% c("Astrocyte", "ASC", "Astro") ~ "Astrocyte",
CellType %in% c("MG", "Microglia") ~ "Microglia",
CellType %in% c("END", "Endo", "Endothelial cell", "Vascular") ~ "Vascular",
CellType %in% c("ODC", "Oligo", "Oligodendrocyte") ~ "Oligodendrocyte",
CellType %in% c("exCA", "Excitatory", "exDG", "exPFC", "Glutamatergic") ~ "Excitatory \n neuron",
CellType %in% c("GABA", "GABAergic", "Inhibitory") ~ "Inhibitory \n neuron",
CellType == "OPC" ~ "OPC",
CellType == "NSC" ~ "NSC"),
Study = str_replace(Study, "list\\$", ""),
Study = fct_relevel(Study,
c("Control", "PD", "PDD", "DLB")),
joint_name = str_c(Study, CellType, sep = ":"),
sd_from_mean = ifelse(p < 0.05, sd_from_mean, NA)) %>%
dplyr::arrange(Class, Study) %>%
dplyr::mutate_at(vars(joint_name), list(~ factor(., levels = unique(.)))) %>%
ggplot(aes(x = GeneSet, y = forcats::fct_rev(joint_name))) +
geom_tile(aes(fill = sd_from_mean), colour = "black") +
facet_grid(rows = vars(Class), scales = "free", space = "free_y") +
scale_fill_viridis_c(na.value = "grey") +
labs(x = "", y = "Dataset: Cell type") +
theme_rhr +
theme(panel.grid = element_blank(),
strip.text.y = element_text(size = 8, angle = 0))
plot
# Filter generated clu_prop_overlap for cluster ids from clu_overlap_ds
# clu_prop_overlap already filtered for ambiguous clusters
# Generate geom_density
clu_prop_overlap_exact %>%
dplyr::filter(cluster_id.own %in% clu_overlap_ds$cluster_id.own) %>%
tidyr::gather(key = "overlap", value = "prop_overlap", contains("prop_overlap")) %>%
dplyr::mutate(overlap = case_when(overlap == "prop_overlap.own" ~ "ds own --> all SRP058181",
overlap == "prop_overlap.SRP058181" ~ "all SRP058181 --> ds own")) %>%
ggplot(aes(x = prop_overlap, fill = overlap)) +
geom_histogram(binwidth = 5, alpha = 0.5, colour = "black") +
facet_wrap(~overlap) +
labs(x = "Proportion overlap of cluster from discovery dataset into validation dataset") +
theme_rhr +
theme(legend.position = "none")
overlapping_clusters
) with clusters from own data that:
cluster_significance
dataframe for those clusters from overlapping_clusters
. Then, for each pairwise comparison, perform FDR-correction using the smaller set of overlapping clusters from SRP058181.intron_usage
) with cluster-level data (cluster_significance
) and filter for only those cluster ids in overlapping_clusters
. Once this is done, merge output from each dataset (ensuring to filter SRP058181 for only those clusters that are now significant following FDR-correction).same_direction_of_effect
. This column evaluates whether junctions that were found differentially spliced in the same comparison across one or both datasets share the same direction of effect.# Filter for overlapping clusters with 100% overlap from own data into SRP058181
# AND that are differentially spliced in own data
overlapping_clusters <-
clu_prop_overlap_exact %>%
dplyr::filter(cluster_id.own %in% clu_overlap_ds$cluster_id.own)
print(str_c("Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: ",
clu_significance$SRP058181 %>%
dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181,
status == "Success",
p.adjust < 0.05) %>%
dplyr::ungroup() %>%
dplyr::distinct(cluster_id) %>%
nrow()))
## [1] "Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: 64"
# Correct each comparison using fdr on smaller set
clu_significance_fdr <- clu_significance
clu_significance_fdr$SRP058181 <-
clu_significance$SRP058181 %>%
dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, status == "Success") %>%
dplyr::group_by(comparison) %>%
dplyr::mutate(p.adjust = p.adjust(p, method = "fdr"))
print(str_c("Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: ",
clu_significance_fdr$SRP058181 %>%
dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181,
status == "Success",
p.adjust < 0.05) %>%
dplyr::ungroup() %>%
dplyr::distinct(cluster_id) %>%
nrow()))
## [1] "Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: 69"
# Create list with dataframes from each dataset containing merged intron usage and cluster significance only for overlapping_clusters
clu_val <- vector(mode = "list", length = 2)
for(i in 1:length(clu_val)){
# Get name of each dataset
name <- names(intron_usage)[i]
# Column name for cluster ids in each dataset
col_name <- str_c("cluster_id.", name)
# Join intron usage (i.e. junction-level data) with cluster significance (cluster-level significance)
# And filter for only those clusters found within overlapping_clusters
clu_val[[i]] <- intron_usage[[name]] %>%
# Filter for appropriate comparisons
dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")) %>%
# dplyr::inner_join(clu_significance_fdr[[name]]) %>%
dplyr::inner_join(clu_significance_fdr[[name]]) %>%
dplyr::select(chr, start, end, comparison, cluster_id, loglr, p.adjust, df, Control, PD, PDD, deltapsi, genes) %>%
# Add direction of effect
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change")) %>%
dplyr::filter(cluster_id %in% c(overlapping_clusters %>%
.[[col_name]]))
names(clu_val)[i] <- name
}
# Join lists together
# Add additional column evaluating for those junctions shared between comparisons whether they share direction of effect
# Remove any junctions that are not shared between comparisons
ds_val <- clu_val$own %>%
full_join(clu_val$SRP058181 %>%
dplyr::filter(p.adjust < 0.05),
by = c("comparison", "chr", "start", "end", "genes"),
suffix = c(".own", ".SRP058181")) %>%
dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE,
direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
TRUE ~ NA))
cont_table <-
ds_val %>%
dplyr::mutate(same_direction_of_effect = fct_relevel(as.factor(same_direction_of_effect),
levels = c("TRUE", "FALSE")),
ds_both = fct_relevel(case_when(p.adjust.own < 0.05 & p.adjust.SRP058181 < 0.05~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE"))) %>%
dplyr::filter(!is.na(same_direction_of_effect)) %>%
with(., table(same_direction_of_effect, ds_both,
dnn = c("Same direction of effect?", "Differentially spliced (same comparison) in both datasets?")))
cont_table
## Differentially spliced (same comparison) in both datasets?
## Same direction of effect? TRUE FALSE
## TRUE 49 114
## FALSE 67 111
cont_table %>%
fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.1697
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4413735 1.1461547
## sample estimates:
## odds ratio
## 0.7128169
a <- ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>%
tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>%
dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>%
ggplot(aes(x = abs(dPSI), fill = dataset)) +
geom_density(alpha = 0.5) +
coord_cartesian(ylim = c(0,40)) +
labs(x = expression("Absolute "*Delta*"PSI")) +
theme_rhr
b <- ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>%
tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>%
dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>%
ggplot(aes(x = abs(dPSI), fill = dataset)) +
geom_density(alpha = 0.5) +
coord_cartesian(ylim = c(0,40)) +
labs(x = expression("Absolute "*Delta*"PSI")) +
theme_rhr
ggarrange(a, b,
labels = c("a", "b"),
common.legend = TRUE,
legend = "bottom")
a <- ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
b <- ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
theme_rhr
ggarrange(a, b,
ncol = 1,
labels = c("a", "b"))
# Correlation of junctions with same direction of effect within ds clusters
ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 94434, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8691622
# Correlation of all junctions within ds clusters that contain some junctions with same direction of effect
ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")
##
## Spearman's rank correlation rho
##
## data: deltapsi.own and deltapsi.SRP058181
## S = 3825230, p-value = 0.009352
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1499394
ds_val %>%
dplyr::filter(cluster_id.own %in% c(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE) %>%
.[["cluster_id.own"]])) %>%
dplyr::group_by(cluster_id.own) %>%
dplyr::summarise(n = n(),
same_direction_of_effect = sum(same_direction_of_effect, na.rm = TRUE),
prop_own = same_direction_of_effect/n) %>%
ggplot(aes(x = prop_own)) +
geom_histogram(binwidth = 0.01) +
labs(x = "Proportion of junctions within a shared cluster that share the same direction of effect") +
coord_cartesian(xlim = c(0,1.05)) +
theme_rhr
clu_significance_fdr$SRP058181 <- clu_significance_fdr$SRP058181 %>%
dplyr::bind_rows(clu_significance$SRP058181 %>%
dplyr::anti_join(clu_significance_fdr$SRP058181,
by = c("comparison", "cluster_id")))
# Construct contingency table of differential splicing
clu_overlap_exact_ds <- clu_overlap_exact %>%
dplyr::mutate(ds.own = fct_relevel(case_when(cluster_id.own %in% c(clu_significance_fdr$own %>%
dplyr::filter(p.adjust < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE")),
ds.SRP058181 = fct_relevel(case_when(cluster_id.SRP058181 %in% c(clu_significance_fdr$SRP058181 %>%
dplyr::filter(p.adjust < 0.05) %>%
.[["cluster_id"]] %>%
unique()) ~ TRUE,
TRUE~ FALSE) %>% as.factor(),
levels = c("TRUE", "FALSE")))
cont_table <- clu_overlap_exact_ds %>%
with(., table(ds.SRP058181, ds.own, dnn = c("Differentially spliced in SRP058181?", "Differentially spliced in own data?")))
cont_table
## Differentially spliced in own data?
## Differentially spliced in SRP058181? TRUE FALSE
## TRUE 69 696
## FALSE 767 11901
# saveRDS(clu_overlap_exact_ds, file.path(path_to_results, "leafcutter/diff_splicing_PCaxes/cluster_overlap_exact_ds_with_SRP058181.Rds"))
# Fisher's exact test
cont_table %>%
fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.001956
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.170652 1.995428
## sample estimates:
## odds ratio
## 1.538183
Gene lists derived by using genes overlapping exactly-matching "validated" clusters that are differentially spliced in both datasets that contain junctions that share the same direction of effect.
# Extract genes
genes_list <- setNames(ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE,
p.adjust.own < 0.05,
p.adjust.SRP058181 < 0.05
) %>%
group_split(comparison),
ds_val %>%
dplyr::filter(same_direction_of_effect == TRUE,
p.adjust.own < 0.05,
p.adjust.SRP058181 < 0.05
) %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){
x %>%
.[["genes"]] %>%
unique()
})
lapply(genes_list, length)
## $Control_vs_PD
## [1] 2
##
## $Control_vs_PDD
## [1] 8
##
## $PD_vs_PDD
## [1] 5
# Background list
all_genes <- leafcutter_list$own$cluster_significance %>%
# remove clusters that overlap multiple genes
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
UpSetR::upset(data = fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")
genes_list
## $Control_vs_PD
## [1] "ARHGAP9" "TPMT"
##
## $Control_vs_PDD
## [1] "PRKAG1" "RBM25" "POMT2" "RASGRF1" "ARHGEF1" "BCAS1" "LPIN1"
## [8] "ACTR8"
##
## $PD_vs_PDD
## [1] "DLGAP1-AS4" "SCMH1" "CCNL1" "ARAP2" "CAST"
gprofiler <- lapply(genes_list, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "fdr", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
gprofiler %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000) %>%
ggplot(aes(x = comparison,
y = term_name,
size = precision,
fill = p_value)) +
geom_point(pch = 21) +
scale_fill_viridis_c(direction = -1) +
labs(x = "Comparison", y = "Dataset: Cell type") +
theme_rhr
# Also, try running with clusterProfiler, which has a number of useful functionalities:
# `compareCluster`: permits comparison of GO enrichments across groups.
# Uses hypergeometric model to perform enrichment analysis within each group (i.e. does not account for semantic similarity in testing as GProfiler does, which could be a disadvantage)
# Allows semantic trimming of terms.
library(clusterProfiler)
ont <- c("BP", "CC", "MF")
cluster_compare_GO <- setNames(vector(mode = "list", length = length(ont)),
ont)
for(i in 1:length(ont)){
cluster_compare_GO[[i]] <-
genes_list %>%
qdapTools::list2df(col1 = "genes", col2 = "comparison") %>%
clusterProfiler::compareCluster(genes ~ comparison,
data=.,
keyType = "SYMBOL",
fun="enrichGO",
ont = ont[i],
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
OrgDb='org.Hs.eg.db')
}
# saveRDS(cluster_compare_GO, file.path(path_to_results, "leafcutter/gprofiler/clusterProfiler_leafcutter_overlap_exact_ds_with_SRP058181.Rds"))
cluster_compare_GO <-
readRDS(
file.path(path_to_results, "leafcutter/gprofiler/clusterProfiler_leafcutter_overlap_exact_ds_with_SRP058181.Rds")
)
cluster_compare_GO %>%
lapply(., function(df){
df %>%
clusterProfiler::simplify(., cutoff=0.4, by="p.adjust", select_fun=min) %>%
clusterProfiler::dotplot(., showCategory = NULL, font.size = 8) +
geom_point(shape = 1,colour = "black") +
scale_size_continuous(name = "precision") +
scale_colour_viridis_c(direction = -1) +
theme_rhr
})
## $BP
##
## $CC
##
## $MF
# Load internal specificity matrices
ctd_files <-
list.files(
file.path(
path_to_results,
"snRNA/specificity_matrices/2020_March/"
),
pattern = "ctd",
full.names = T)
ctd_list <- vector(mode = "list", length = length(ctd_files))
for(i in 1:length(ctd_files)){
# Load ctd
ctd_list[[i]] <- readRDS(ctd_files[i])
# Extract file name
title <-
ctd_files[i] %>%
str_replace(".*/", "") %>%
str_replace("\\..*", "") %>%
str_replace(".*_", "")
# Name list
names(ctd_list)[i] <- title
}
# Run EWCE
# Cannot run with Control_vs_PD, as list is too small for EWCE
ewce <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list[2:3],
ctd_list$Control, ctd_list$PD,
ctd_list$PDD, ctd_list$DLB,
celltypeLevel = 1,
reps = 100000,
genelistSpecies = "human",
sctSpecies = "human",
mouse_to_human = NULL)
saveRDS(
ewce,
file =
file.path(
path_to_results,
"leafcutter/ewce/ewce_leafcutter_ds_exact_validated_celltype.Rds"
)
)
ewce <-
readRDS(
file.path(
path_to_results,
"leafcutter/ewce/ewce_leafcutter_ds_exact_validated_celltype.Rds"
)
)
plot <- ewce %>%
dplyr::filter(!Study %in% c("AIBS2018", "DRONC_human")) %>%
dplyr::mutate(FDR.p = p.adjust(p, method="BH"),
Class = case_when(
CellType %in% c("Astrocyte", "ASC", "Astro") ~ "Astrocyte",
CellType %in% c("MG", "Microglia") ~ "Microglia",
CellType %in% c("END", "Endo", "Endothelial cell", "Vascular") ~ "Vascular",
CellType %in% c("ODC", "Oligo", "Oligodendrocyte") ~ "Oligodendrocyte",
CellType %in% c("exCA", "Excitatory", "exDG", "exPFC", "Glutamatergic") ~ "Excitatory \n neuron",
CellType %in% c("GABA", "GABAergic", "Inhibitory") ~ "Inhibitory \n neuron",
CellType == "OPC" ~ "OPC",
CellType == "NSC" ~ "NSC"),
Study = str_replace(Study, "list\\$", ""),
Study = fct_relevel(Study,
c("Control", "PD", "PDD", "DLB")),
joint_name = str_c(Study, CellType, sep = ":"),
sd_from_mean = ifelse(p <= 0.05, sd_from_mean, NA)) %>%
dplyr::arrange(Class, Study) %>%
dplyr::mutate_at(vars(joint_name), list(~ factor(., levels = unique(.)))) %>%
ggplot(aes(x = GeneSet, y = forcats::fct_rev(joint_name))) +
geom_tile(aes(fill = sd_from_mean), colour = "black") +
facet_grid(rows = vars(Class), scales = "free", space = "free_y") +
scale_fill_viridis_c(na.value = "grey") +
labs(x = "", y = "Dataset: Cell type") +
theme_rhr +
theme(panel.grid = element_blank(),
strip.text.y = element_text(size = 8, angle = 0))
plot
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2 dasper_0.0.0.9000
## [4] testthat_2.3.2 UpSetR_1.4.0 rtracklayer_1.46.0
## [7] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1 IRanges_2.20.2
## [10] S4Vectors_0.24.4 BiocGenerics_0.32.0 readxl_1.3.1
## [13] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.2
## [16] purrr_0.3.4 readr_1.4.0 tidyr_1.1.1
## [19] tibble_3.0.3 tidyverse_1.3.0 ggsci_2.9
## [22] ggpubr_0.4.0 ggforce_0.3.2 ggplot2_3.3.2
## [25] gprofiler2_0.1.9 doParallel_1.0.15 iterators_1.0.12
## [28] foreach_1.5.0 devtools_2.3.2 usethis_1.6.3
## [31] data.table_1.13.0 clusterProfiler_3.14.3
##
## loaded via a namespace (and not attached):
## [1] utf8_1.1.4 tidyselect_1.1.0
## [3] RSQLite_2.2.0 AnnotationDbi_1.48.0
## [5] htmlwidgets_1.5.3 grid_3.6.1
## [7] BiocParallel_1.20.1 munsell_0.5.0
## [9] codetools_0.2-16 chron_2.3-56
## [11] DT_0.15 withr_2.2.0
## [13] colorspace_2.0-0 GOSemSim_2.17.1
## [15] Biobase_2.46.0 highr_0.8
## [17] knitr_1.29 rstudioapi_0.13
## [19] ggsignif_0.6.0 DOSE_3.12.0
## [21] labeling_0.4.2 urltools_1.7.3
## [23] GenomeInfoDbData_1.2.2 polyclip_1.10-0
## [25] bit64_4.0.2 farver_2.0.3
## [27] rprojroot_2.0.2 vctrs_0.3.2
## [29] generics_0.0.2 xfun_0.16
## [31] BiocFileCache_1.10.2 R6_2.5.0
## [33] graphlayouts_0.7.0 bitops_1.0-6
## [35] cachem_1.0.3 fgsea_1.12.0
## [37] gridGraphics_0.5-0 DelayedArray_0.12.3
## [39] assertthat_0.2.1 scales_1.1.1
## [41] ggraph_2.0.3 enrichplot_1.6.1
## [43] gtable_0.3.0 processx_3.4.5
## [45] tidygraph_1.2.0 rlang_0.4.7
## [47] splines_3.6.1 rstatix_0.6.0
## [49] lazyeval_0.2.2 broom_0.7.0
## [51] europepmc_0.4 BiocManager_1.30.10
## [53] yaml_2.2.1 reshape2_1.4.4
## [55] abind_1.4-5 modelr_0.1.8
## [57] crosstalk_1.1.0.1 backports_1.1.8
## [59] qvalue_2.18.0 tools_3.6.1
## [61] bookdown_0.21 qdapTools_1.3.5
## [63] ggplotify_0.0.5 ellipsis_0.3.1
## [65] RColorBrewer_1.1-2 ggdendro_0.1.21
## [67] sessioninfo_1.1.1 ggridges_0.5.2
## [69] Rcpp_1.0.5 plyr_1.8.6
## [71] progress_1.2.2 zlibbioc_1.32.0
## [73] RCurl_1.98-1.2 ps_1.3.4
## [75] prettyunits_1.1.1 openssl_1.4.2
## [77] viridis_0.5.1 cowplot_1.0.0
## [79] SummarizedExperiment_1.16.1 haven_2.3.1
## [81] ggrepel_0.8.2 here_1.0.0
## [83] fs_1.5.0 magrittr_2.0.1
## [85] DO.db_2.9 openxlsx_4.2.3
## [87] triebeard_0.3.0 reprex_1.0.0
## [89] matrixStats_0.56.0 pkgload_1.1.0
## [91] hms_1.0.0 evaluate_0.14
## [93] XML_3.99-0.3 rio_0.5.16
## [95] gridExtra_2.3 biomaRt_2.42.1
## [97] compiler_3.6.1 crayon_1.4.1
## [99] htmltools_0.5.1.1 mgcv_1.8-29
## [101] lubridate_1.7.9 DBI_1.1.1
## [103] tweenr_1.0.1 dbplyr_1.4.4
## [105] rappdirs_0.3.1 MASS_7.3-51.4
## [107] refGenome_1.7.7 Matrix_1.2-17
## [109] car_3.0-9 cli_2.2.0.9000
## [111] igraph_1.2.5 doBy_4.6.7
## [113] pkgconfig_2.0.3 rvcheck_0.1.8
## [115] GenomicAlignments_1.22.1 foreign_0.8-72
## [117] plotly_4.9.2.1 xml2_1.3.2
## [119] XVector_0.26.0 rvest_0.3.6
## [121] callr_3.5.1 digest_0.6.27
## [123] Biostrings_2.54.0 HGNChelper_0.8.1
## [125] rmarkdown_2.5 cellranger_1.1.0
## [127] fastmatch_1.1-0 Deriv_4.0
## [129] curl_4.3 Rsamtools_2.2.3
## [131] nlme_3.1-141 lifecycle_0.2.0
## [133] jsonlite_1.7.1 carData_3.0-4
## [135] fansi_0.4.2 limma_3.42.2
## [137] askpass_1.1 desc_1.2.0
## [139] viridisLite_0.3.0 pillar_1.4.6
## [141] lattice_0.20-38 fastmap_1.0.1
## [143] httr_1.4.2 pkgbuild_1.1.0
## [145] GO.db_3.10.0 glue_1.4.2
## [147] remotes_2.2.0 zip_2.1.0
## [149] bit_4.0.4 stringi_1.5.3
## [151] blob_1.2.1 memoise_2.0.0