Aims: to determine overlap of junctions and clusters derived from our own data and SRP058181

1 File paths for workflow

source(here::here("R", "file_paths.R"))

2 Importing cluster definitions

source(here::here("R", "convert_leafcutter.R"))

clu <- 
  setNames(
    list(
      fread(
        file.path(
          path_to_results,
          "leafcutter/intron_clustering/tissue_polyA_test_diseasegroups_perind_numers.counts.gz"
        )
      ), 
      fread(
        file.path(
          path_to_results,
          "SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz"
        )
      )
    ),
    c("own", "SRP058181")
  ) %>% 
  lapply(., function(x){
    x %>% 
      dplyr::rename(intron = V1) %>% 
      convert_leafcutter(leafcutter_results = ., 
                         use_strand = TRUE) %>%
      dasper::annotate_junc_ref(junc_metadata = .,
                                gtf = path_to_ref_gtf)
  })

saveRDS(
  clu$own, 
  file = 
    file.path(
      path_to_results,
      "leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
    )
)
saveRDS(
  clu$SRP058181, 
  file = 
    file.path(
      path_to_results,
      "SRP058181/leafcutter/intron_clustering/SRP058181_all_clusters_gtfannotated.Rds"
    )
)

Run quick QC check that majority of clusters are annotated.

# Import annotated clusters
clu <- 
  setNames(
    list(
      readRDS(
        file.path(
          path_to_results, 
          "leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
        )
      ), 
      readRDS(
        file.path(
          path_to_results,
          "SRP058181/leafcutter/intron_clustering/SRP058181_all_clusters_gtfannotated.Rds"
        )
      )
    ),
    c("own", "SRP058181")
  )

# Check that majority of clu are annotated
clu %>%
  lapply(., function(x){
    
    x %>% 
      as.data.frame() %>% 
      dplyr::group_by(junc_cat) %>% 
      dplyr::summarise(n = n()) %>% 
      dplyr::ungroup() %>% 
      dplyr::mutate(prop = n/sum(n)) %>% 
      dplyr::arrange(-prop)
    
  })

## $own
## # A tibble: 7 x 3
##   junc_cat            n    prop
##   <chr>           <int>   <dbl>
## 1 annotated       96622 0.634  
## 2 novel_acceptor  18186 0.119  
## 3 novel_donor     12927 0.0849 
## 4 none            10988 0.0721 
## 5 novel_exon_skip  9255 0.0608 
## 6 ambig_gene       2903 0.0191 
## 7 novel_combo      1417 0.00930
## 
## $SRP058181
## # A tibble: 7 x 3
##   junc_cat            n    prop
##   <chr>           <int>   <dbl>
## 1 annotated       87133 0.676  
## 2 novel_acceptor  13479 0.105  
## 3 novel_donor     11717 0.0910 
## 4 novel_exon_skip  7924 0.0615 
## 5 none             4820 0.0374 
## 6 ambig_gene       2544 0.0198 
## 7 novel_combo      1183 0.00918

3 Conceptual overview of the analysis

Final analysis included in paper: cluster-level overlaps using differentially spliced clusters and the "exact match" definition (most stringent, but also makes the most sense)

4 Junction-level overlaps: all junctions

4.1 Intersection

# Find junctions that overlap 
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = clu$SRP058181,
                              subject = clu$own,
                              ignore.strand = F, 
                              type = "equal")

# Absolute numbers
print(str_c("Total number of junctions found in own data: ", length(clu$own)))

## [1] "Total number of junctions found in own data: 152298"

print(str_c("Total number of junctions found in SRP058181: ", length(clu$SRP058181)))

## [1] "Total number of junctions found in SRP058181: 128800"

print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))

## [1] "Number of intersecting junctions: 107703"

# Percentage overlap from smaller SRP058181 dataset into larger
print(str_c("Percentage overlap from smaller SRP058181 dataset into larger: ", 
            c((overlapped_junctions %>% queryHits() %>% unique() %>% length())/(clu$SRP058181 %>% length()) * 100)))

## [1] "Percentage overlap from smaller SRP058181 dataset into larger: 83.6203416149068"

# Percentage overlap from larger own data into smaller SRP058181
print(str_c("Percentage overlap from larger own data into smaller SRP058181: ", 
            c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(clu$own %>% length()) * 100)))

## [1] "Percentage overlap from larger own data into smaller SRP058181: 70.7185911830753"

As expected, overlap of smaller dataset into larger dataset is greater than the opposite away around.

4.2 Intersections by annotation type

plot <- 
  clu$SRP058181[queryHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(n = n(),
                   prop_overlap = n/length(clu$SRP058181)) %>% 
  dplyr::mutate(overlap = "SRP058181 into own data") %>% 
  dplyr::bind_rows(clu$own[subjectHits(overlapped_junctions)] %>% 
                     as.data.frame() %>% 
                     dplyr::group_by(junc_cat) %>% 
                     dplyr::summarise(n = n(),
                                      prop_overlap = n/length(clu$own)) %>% 
                     dplyr::mutate(overlap = "Own data into SRP058181")) %>% 
  dplyr::filter(junc_cat != "ambig_gene") %>% 
  dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>% 
  # ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat, alpha = overlap)) + 
  ggplot(aes(x = junc_cat, y = prop_overlap, alpha = overlap)) + 
  geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
  # facet_wrap(vars(overlap)) +
  labs(x = "", y = "Proportion of overlapping junctions") +
  # scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
  scale_alpha_manual(name = "Direction of overlap",
                     values = c(1, 0.5)) +
  guides(fill = FALSE,
         alpha=guide_legend(nrow=2,byrow=TRUE)) +
  theme_rhr +
  theme(legend.position = "bottom")

plot

Overlap of junctions within Leafcutter-defined intron clusters from our own dataset into the recount dataset, SRP058181, and vice-versa.

Figure 4.1: Overlap of junctions within Leafcutter-defined intron clusters from our own dataset into the recount dataset, SRP058181, and vice-versa.

# ggsave(plot, 
#        filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/Departmental/SRP058181_junction_overlap.tiff",
#        device = "tiff",
#        height = 100,
#        width = 100,
#        dpi = 200,
#        units = "mm")

As expected, overlap of annotated is the highest compared to the other annotation types.

5 Junction-level overlaps: junctions in differentially spliced clusters

Analysis run using:
1. Covariate-corrected data
2. Covariate and cell-type-corrected data

5.1 Covariate-corrected data

Data corrected for AoD, RIN and sex, in the case of our own data. No correction for sex applied to SRP058181, as all subjects were male.

# Import leafcutter lists
leafcutter_list <-   
  setNames(
    list(
      readRDS(
        file.path(
          path_to_results, 
          "leafcutter/diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.Rds"
        )
      ), 
      readRDS(
        file.path(
          path_to_results,
          "SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
        )
      )
    ),
    c("own", "SRP058181")
  )

# Import differentially spliced clusters (no dPSI filter, yet)
ds <- 
  leafcutter_list %>%
  lapply(., function(x){
    
    x$significant_clusters_0.05_filter %>% 
      dplyr::mutate(chr = chr %>% 
                      str_replace(., "chr", ""),
                    cluster_id = str_c(chr, ":", cluster)) %>% 
      dplyr::select(comparison, chr, cluster, cluster_id, everything()) %>% 
      # Filtering for only those comparisons that are comparable between the two datasets
      # I.e. Control_vs_PD, Control_vs_PDD, PD_vs_PDD
      dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"))
    
  })

# Filter by differentially spliced clusters
ds_clu <- 
  setNames(list(clu$own[elementMetadata(clu$own)[, "cluster_id"] %in% 
                          c(ds$own$cluster_id %>% unique())],
                clu$SRP058181[elementMetadata(clu$SRP058181)[, "cluster_id"] %in% 
                                c(ds$SRP058181$cluster_id %>% unique())]),
           c("own", "SRP058181"))

5.1.1 Stringent intersection: junction in clusters differentially spliced in both datasets

# Find junctions that overlap 
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = ds_clu$SRP058181,
                              subject = ds_clu$own,
                              ignore.strand = F, 
                              type = "equal")

# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))

## [1] "Total number of junctions within ds clusters in own data: 15876"

print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))

## [1] "Total number of junctions within ds clusters in SRP058181: 2992"

print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))

## [1] "Number of intersecting junctions: 726"

# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: ", 
            c((overlapped_junctions %>% queryHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: 24.2647058823529"

print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ", 
            c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 4.572940287226"

ds_clu$SRP058181[queryHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(n = n(),
                   prop_overlap = n/length(ds_clu$SRP058181)) %>% 
  dplyr::mutate(overlap = "SRP058181 into own data") %>% 
  dplyr::bind_rows(ds_clu$own[subjectHits(overlapped_junctions)] %>% 
                     as.data.frame() %>% 
                     dplyr::group_by(junc_cat) %>% 
                     dplyr::summarise(n = n(),
                                      prop_overlap = n/length(ds_clu$own)) %>% 
                     dplyr::mutate(overlap = "Own data into SRP058181")) %>% 
  dplyr::filter(junc_cat != "ambig_gene") %>% 
  dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>% 
  # ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat, alpha = overlap)) + 
  ggplot(aes(x = junc_cat, y = prop_overlap, alpha = overlap)) + 
  geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
  # facet_wrap(vars(overlap)) +
  labs(x = "", y = "Proportion of overlapping junctions") +
  # scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
  scale_alpha_manual(name = "Direction of overlap",
                     values = c(1, 0.5)) +
  guides(fill = FALSE,
         alpha=guide_legend(nrow=2,byrow=TRUE)) +
  theme_rhr +
  theme(legend.position = "bottom")

Overlap of junctions within differentially spliced intron clusters from our own dataset with junctions within differentially spliced intron clusters from the recount dataset, SRP058181, and vice-versa.

Figure 5.1: Overlap of junctions within differentially spliced intron clusters from our own dataset with junctions within differentially spliced intron clusters from the recount dataset, SRP058181, and vice-versa.

Validation of differential splicing is very low across both datasets, but particularly when looking at the overlap between our own data and SRP058181.
This is in large part driven by the difference in the total number of junctions found within differentially spliced introns in each dataset (own data: 15876; SRP058181: 2992; fold change difference = own/SRP058181 = 5.3061497; overlapping junctions: 726).
By comparison, the total number of junctions in each is not so different (own data: 152298; SRP058181: 128800; fold change difference = own/SRP058181 = 1.1824379).

5.1.1.1 Overlap with genes

So how many genes and what genes does this amount to?

# Number of genes
ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  .[["gene_name_junc"]] %>% 
  unlist() %>% 
  unique() %>% 
  length()

## [1] 104

# Gene names
ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  .[["gene_name_junc"]] %>% 
  unlist() %>% 
  unique() %>% 
  sort()

##   [1] "ABCD4"        "ACAA1"        "ACBD4"        "ACSBG1"       "ACTR8"       
##   [6] "ACYP2"        "ADAM15"       "AGRN"         "AKAP8L"       "AL139260.1"  
##  [11] "ANK2"         "AP3S1"        "ARMC10"       "ART3"         "ASPH"        
##  [16] "BCAS1"        "BRSK2"        "C1orf61"      "CAMK2B"       "CAPN10"      
##  [21] "CDK5RAP1"     "CEP290"       "CERS4"        "CHKA"         "CIRBP"       
##  [26] "CLK4"         "COMMD4"       "CSAD"         "CSPG5"        "DAZAP1"      
##  [31] "DHX30"        "DOCK10"       "DPH7"         "DST"          "EIF2D"       
##  [36] "ELOVL2"       "ENY2"         "EPB41L2"      "EPB41L3"      "ETNPPL"      
##  [41] "FAM168A"      "FASTK"        "FBRSL1"       "FBXO7"        "FYN"         
##  [46] "GDPD5"        "GGT5"         "GPBP1"        "IL17RB"       "ILK"         
##  [51] "IQSEC1"       "KMT2E"        "LIMCH1"       "LUC7L3"       "MARK2"       
##  [56] "MBP"          "METTL26"      "MOK"          "MRNIP"        "MTX2"        
##  [61] "MVD"          "MYL6"         "MYO6"         "NPL"          "NSFL1C"      
##  [66] "PCM1"         "PCNX2"        "PICALM"       "PIP5K1C"      "PPIE"        
##  [71] "PPP1R3F"      "PREPL"        "PRMT7"        "PRPF38B"      "PTPN12"      
##  [76] "R3HDM1"       "R3HDM2"       "RBM3"         "RBM39"        "RHOT1"       
##  [81] "RPAIN"        "SBDSP1"       "SEPT4"        "SEPT8"        "SERPINH1"    
##  [86] "SFXN5"        "SHTN1"        "SLC25A23"     "SLC25A27"     "SLC30A3"     
##  [91] "SMARCD3"      "SUZ12P1"      "SYNGAP1"      "TAZ"          "TESPA1"      
##  [96] "TIAL1"        "TJP1"         "TMEM144"      "TMEM161B-AS1" "TMEM165"     
## [101] "TPD52L1"      "TRIM33"       "TSPOAP1"      "UQCRB"

5.1.1.1.1 Overlap with PD genes

And do any of these genes overlap PD-implicated genes?

# Read in PD genes
PD_genes <- 
  setNames(vector(mode = "list", length = 2),
                     c("mendelian", "sporadic"))

PD_genes$mendelian <- 
  read_delim(
    file.path(
      path_to_raw_data,
      "misc/PD_risk_genes/20191128_GE_ParkinsonDiseaseComplexParkinsonism_greengenes.tsv"
    ),
    delim = "\t"
  ) %>% 
  dplyr::rename(Gene = `Gene Symbol`)
PD_genes$sporadic <- 
  read_excel(
    file.path(
      path_to_raw_data,
      "misc/PD_risk_genes/TableS6_Complete_summary_statistics_for_QTL_Mendelian_randomization.xlsx"
    )
  ) %>% 
  dplyr::filter(`Pass Bonferroni` == "pass")

# Check for overlaps
ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::filter(gene_name_junc %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>% 
  .[["gene_name_junc"]] %>% 
  unlist() %>% 
  unique() %>% 
  sort()

## [1] "FBRSL1" "FBXO7"

5.1.1.1.2 Baseline overlap with PD genes

As a baseline, it's worth establishing the level of overlap in each dataset independently.
Can do this at a cluster level by:
- Constructing contingency table using clusters that were successfully tested of:
  1. Whether cluster is differentially spliced.
  2. Whether cluster overlaps a PD implicated gene.
Differential splicing in this case includes:
- Clusters found differentially spliced in any comparison (Control_vs_PD, Control_vs_PDD or PD_vs_PDD) that pass FDR < 0.05.

# Construct contingency table of differential splicing
# Extract cluster_significance from corresponding leafcutter lists
clu_significance <- 
  leafcutter_list%>%
  lapply(., function(x){
    
    x$cluster_significance %>% 
      dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"),
                    !str_detect(genes, ",")) %>% 
      dplyr::mutate(cluster_id = str_replace(cluster, "chr", "")) %>% 
      dplyr::select(comparison, cluster_id, everything(), -cluster)
    
  })

cont_table <- setNames(vector(mode = "list", length = length(clu)),
                       names(clu))

for(i in 1:length(clu)){
  
  cont_table[[i]] <- mcols(clu[[i]])[c("cluster_id", "gene_name_junc")] %>% 
    as_tibble() %>% 
    dplyr::filter(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
                                                     dplyr::filter(status == "Success") %>%
                                                     .[["cluster_id"]] %>%
                                                     unique())) %>% 
    tidyr::unnest(gene_name_junc) %>% 
    dplyr::distinct(cluster_id, gene_name_junc) %>% 
    dplyr::group_by(cluster_id) %>% 
    dplyr::filter(n() <= 1) %>% 
    dplyr::mutate(ds = case_when(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
                                                     dplyr::filter(p.adjust < 0.05) %>%
                                                     .[["cluster_id"]] %>%
                                                     unique()) ~ "yes",
                                 TRUE~ "no"),
                  PD_gene = case_when(gene_name_junc %in% 
                                        unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene)) ~ "yes",
                                      TRUE ~ "no")) %>% 
    with(., table(ds, PD_gene, dnn = c("Differentially spliced?", "Overlapping PD gene?")))
  
}

cont_table

## $own
##                        Overlapping PD gene?
## Differentially spliced?    no   yes
##                     no  29336   398
##                     yes  2867    50
## 
## $SRP058181
##                        Overlapping PD gene?
## Differentially spliced?    no   yes
##                     no  23551   342
##                     yes   521     6

# Fisher's exact test
cont_table %>% 
  lapply(., function(x) x %>% fisher.test())

## $own
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.09563
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9357161 1.7329065
## sample estimates:
## odds ratio 
##   1.285453 
## 
## 
## $SRP058181
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.7112
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2876699 1.7558886
## sample estimates:
## odds ratio 
##  0.7930489

With the Fisher's exact test we are attempting to answer the question: If there was no association between differential splicing of a cluster and whether the cluster overlaps a PD gene, what is the chance that random sampling would result in the association observed?
For our own covariate-corrected data, the p-value < 0.05, suggesting that the chances of observing this contingency table, and by extension the overlap between differential splicing and PD genes, by random chance is very small. Arguably, this is driven by the high overlap of no/no.
For the covariate-corrected SRP058181, no evidence of association.

5.1.1.2 Correlation of dPSI estimates in junctions found within differentially spliced clusters across both datasets

ds_overlap <- ds$own %>% 
  dplyr::filter(cluster_id %in% c(ds_clu$own[subjectHits(overlapped_junctions)] %>% 
                                    as.data.frame() %>% 
                                    .[["cluster_id"]] %>% 
                                    unique())) %>% 
  dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect) %>% 
  dplyr::full_join(ds$SRP058181 %>% 
                     dplyr::filter(cluster_id %in% c(ds_clu$SRP058181[queryHits(overlapped_junctions)] %>% 
                                                       as.data.frame() %>% 
                                                       .[["cluster_id"]] %>% 
                                                       unique())) %>% 
                     dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect),
                   by = c("comparison", "chr", "start", "end"),
                   suffix = c(".own", ".SRP058181")) %>% 
  dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE, 
                                                     direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
                                                     TRUE ~ NA)) %>% 
  dplyr::filter(!is.na(same_direction_of_effect))

ds_overlap %>% 
  DT::datatable(rownames = FALSE,
                options = list(scrollX = TRUE),
                class = 'white-space: nowrap')

What proportion of these overlapping junctions found within the same comparison share the same direction of effect?

ds_overlap %>% 
  dplyr::group_by(comparison) %>% 
  dplyr::summarise(n = n(),
                   same_direction_of_effect = sum(same_direction_of_effect)) %>% 
  dplyr::mutate(prop = (same_direction_of_effect/n) * 100)

Across pairwise comparisons, between 47-72% of junctions share the same direction of effect.
What is the range of dPSI across these shared junctions?

ds_overlap %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>% 
  tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>% 
  dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>% 
  ggplot(aes(x = abs(dPSI), fill = dataset)) + 
  geom_density(alpha = 0.5) +
  labs(x = expression("Absolute "*Delta*"PSI")) +
  theme_rhr

Density plot of absolute delta PSI values for those junctions that (i) overlap between our own data and SRP05181, (ii) are within clusters that are differentially spliced in both datasets across the same comparisons, and (iii) share the same direction of effect.

Figure 5.2: Density plot of absolute delta PSI values for those junctions that (i) overlap between our own data and SRP05181, (ii) are within clusters that are differentially spliced in both datasets across the same comparisons, and (iii) share the same direction of effect.

And how well do dPSI estimates correlate?

a <- ds_overlap %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

b <- ds_overlap %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

ggarrange(a, b,
          ncol = 1,
          labels = c("a", "b"))

Scatterplot of absolute delta PSI values for those junctions that (a) overlap between our own data and SRP0518181 and are within clusters that are differentially spliced in both datasets across the same comparisons and (b) share the same direction of effect.

Figure 5.3: Scatterplot of absolute delta PSI values for those junctions that (a) overlap between our own data and SRP0518181 and are within clusters that are differentially spliced in both datasets across the same comparisons and (b) share the same direction of effect.

# Correlation of junctions within ds clusters from same comparison
ds_overlap %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 5366852, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5863913

# Correlation of junctions within ds clusters from same comparison and with same direction of effect
ds_overlap %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 291220, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9283554

From two plots above, two observations can be made. Junctions overlapping clusters that are differentially spliced across both datasets, in the same comparison, and with the same direction of effect:
1. Mostly have low dPSI values.
2. But they are postively correlated.

5.1.2 Leniant intersection: junctions in clusters differentially spliced in one dataset

This section is simply looking to see that junctions found within differentially spliced clusters from one dataset, also validate across in the other dataset (without needing to be differentially spliced in the other, too).

5.1.2.1 SRP058181 into own data

# Find junctions that overlap 
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = clu$own,
                              subject = ds_clu$SRP058181,
                              ignore.strand = F, 
                              type = "equal")

# Absolute numbers
print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))

## [1] "Total number of junctions within ds clusters in SRP058181: 2992"

print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))

## [1] "Number of intersecting junctions: 2678"

# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: ",
            c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: 89.5053475935829"

ds_clu$SRP058181[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(n = n(),
                   prop_overlap = n/length(ds_clu$SRP058181)) %>% 
  dplyr::filter(junc_cat != "ambig_gene") %>% 
  dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>% 
  # ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) + 
  ggplot(aes(x = junc_cat, y = prop_overlap)) + 
  geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
  # facet_wrap(vars(overlap)) +
  labs(x = "", y = "Proportion of overlapping junctions") +
  # scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
  guides(fill = FALSE,
         alpha=guide_legend(nrow=2,byrow=TRUE)) +
  theme_rhr +
  theme(legend.position = "bottom")

Figure 5.4: Overlap of junctions found within differentially spliced clusters from SRP058181 into own data.

5.1.2.2 Own data into SRP058181

# Find junctions that overlap 
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = clu$SRP058181,
                              subject = ds_clu$own,
                              ignore.strand = F, 
                              type = "equal")

# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))

## [1] "Total number of junctions within ds clusters in own data: 15876"

print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))

## [1] "Number of intersecting junctions: 12493"

# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ",
            c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 78.6911060720584"

ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(n = n(),
                   prop_overlap = n/length(ds_clu$own)) %>% 
  dplyr::filter(junc_cat != "ambig_gene") %>% 
  dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>% 
  # ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) + 
  ggplot(aes(x = junc_cat, y = prop_overlap)) + 
  geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
  # facet_wrap(vars(overlap)) +
  labs(x = "", y = "Proportion of overlapping junctions") +
  scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
  guides(fill = FALSE,
         alpha=guide_legend(nrow=2,byrow=TRUE)) +
  theme_rhr +
  theme(legend.position = "bottom")

Figure 5.5: Overlap of junctions found within differentially spliced clusters from our own dataset into the recount dataset, SRP058181.

5.1.2.3 Summary

Unsurprisingly, we see greater overlap of junctions found within differentially spliced clusters from SRP058181 into our own dataset than the reverse situation.
This is expected, given that we have greater sequencing depth to call junctions in our own data. However, still good to see that 78% of junctions found within differentially spliced clusters in our own data validate in the other dataset.

5.2 Covariate and cell-type-corrected data

For our data, this was achieved by incorportating PC axes 1-4 (which correlated with AoD, RIN and cell-type proportions) into the model for differential splicing.
For SRP058181, cell-type proportions were included as additional variables in the model for differential splicing.

# Import leafcutter lists
leafcutter_list <-   
  setNames(
    list(
      readRDS(
        file.path(
          path_to_results, 
          "leafcutter/diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.Rds")
      ), 
      readRDS(
        file.path(
          path_to_results, 
          "SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
        )
      )
    ),
    c("own", "SRP058181")
  )

# Import differentially spliced clusters (no dPSI filter, yet)
ds <- 
  leafcutter_list %>%
  lapply(., function(x){
    
    x$significant_clusters_0.05_filter %>% 
      dplyr::mutate(chr = chr %>% 
                      str_replace(., "chr", ""),
                    cluster_id = str_c(chr, ":", cluster)) %>% 
      dplyr::select(comparison, chr, cluster, cluster_id, everything()) %>% 
      # Filtering for only those comparisons that are comparable between the two datasets
      # I.e. Control_vs_PD, Control_vs_PDD, PD_vs_PDD
      dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"))
    
  })

# Filter by differentially spliced clusters
ds_clu <- 
  setNames(list(clu$own[elementMetadata(clu$own)[, "cluster_id"] %in% 
                          c(ds$own$cluster_id %>% unique())],
                clu$SRP058181[elementMetadata(clu$SRP058181)[, "cluster_id"] %in% 
                                c(ds$SRP058181$cluster_id %>% unique())]),
           c("own", "SRP058181"))

5.2.1 Stringent: differentially spliced in both

# Find junctions that overlap 
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = ds_clu$SRP058181,
                              subject = ds_clu$own,
                              ignore.strand = F, 
                              type = "equal")

# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))

## [1] "Total number of junctions within ds clusters in own data: 12755"

print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))

## [1] "Total number of junctions within ds clusters in SRP058181: 6793"

print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))

## [1] "Number of intersecting junctions: 868"

# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: ", 
            c((overlapped_junctions %>% queryHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced cluster from SRP058181 into own data: 12.7778595613131"

print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ", 
            c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 6.80517444139553"

ds_clu$SRP058181[queryHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(n = n(),
                   prop_overlap = n/length(ds_clu$SRP058181)) %>% 
  dplyr::mutate(overlap = "SRP058181 into own data") %>% 
  dplyr::bind_rows(ds_clu$own[subjectHits(overlapped_junctions)] %>% 
                     as.data.frame() %>% 
                     dplyr::group_by(junc_cat) %>% 
                     dplyr::summarise(n = n(),
                                      prop_overlap = n/length(ds_clu$own)) %>% 
                     dplyr::mutate(overlap = "Own data into SRP058181")) %>% 
  dplyr::filter(junc_cat != "ambig_gene") %>% 
  dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>% 
  # ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat, alpha = overlap)) + 
  ggplot(aes(x = junc_cat, y = prop_overlap, alpha = overlap)) + 
  geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
  # facet_wrap(vars(overlap)) +
  labs(x = "", y = "Proportion of overlapping junctions") +
  # scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
  scale_alpha_manual(name = "Direction of overlap",
                     values = c(1, 0.5)) +
  guides(fill = FALSE,
         alpha=guide_legend(nrow=2,byrow=TRUE)) +
  theme_rhr +
  theme(legend.position = "bottom")

Figure 5.6: Overlap of junctions within differentially spliced intron clusters from our own dataset with junctions within differentially spliced intron clusters from the recount dataset, SRP058181, and vice-versa.

While overlap of SRP058181 into our own data has reduced in comparison to when this was done with data corrected only for AoD, RIN (and sex), overlap of our own data into the smaller SRP058181 has increased.
This is in large part driven by changes difference in the total number of junctions found within differentially spliced introns in each dataset (own data: 12755; SRP058181: 6793; fold change difference = own/SRP058181 = 1.8776682). Compared to data that was only corrected for AoD, RIN (and sex), where the fold change difference between the two datasets was > 5, the difference between the two has decreased significantly.
Furthermore, the number of overlapping junctions once data is cell-type corrected is higher (n = 868 junctions) compared to when data is corrected only by AoD and RIN (n = 701). This is clear from the next section, too, where the number of overlapping genes is higher for cell-type-corrected data.

5.2.1.1 Overlap with genes

So how many genes and what genes does this amount to?

# Number of genes
ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  .[["gene_name_junc"]] %>% 
  unlist() %>% 
  unique() %>% 
  length()

## [1] 156

# Gene names
ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  .[["gene_name_junc"]] %>% 
  unlist() %>% 
  unique() %>% 
  sort()

##   [1] "ABHD11"     "AC002470.2" "ACTR8"      "ADRA1B"     "AIFM3"     
##   [6] "AKAP8"      "AKAP8L"     "AMPD2"      "APIP"       "ARAP1"     
##  [11] "ARAP2"      "ARHGAP21"   "ARHGAP9"    "ARHGEF1"    "ASIC3"     
##  [16] "ASPA"       "ASPDH"      "ASPH"       "ATP1A2"     "BCAS1"     
##  [21] "BRWD1"      "BTAF1"      "C1orf61"    "C20orf27"   "CAPN2"     
##  [26] "CARMIL3"    "CARNS1"     "CAST"       "CCDC74A"    "CCNL1"     
##  [31] "CD22"       "CDC37"      "CDK18"      "CLIP2"      "CLK4"      
##  [36] "CNOT3"      "COQ9"       "CPT1C"      "CREBBP"     "CSDE1"     
##  [41] "CTSZ"       "CYP2J2"     "DBI"        "DBNDD1"     "DCAF16"    
##  [46] "DLGAP1-AS4" "DMXL1"      "DPH7"       "DPY19L2P1"  "DRAM2"     
##  [51] "DTX3"       "EIF2D"      "ERCC1"      "F3"         "FAAH"      
##  [56] "FAM13B"     "FAM221A"    "FBXO7"      "FLAD1"      "GABBR1"    
##  [61] "GGT5"       "GORASP1"    "HAUS2"      "HHATL"      "IFT140"    
##  [66] "IQSEC2"     "IZUMO4"     "KIAA1755"   "L3MBTL2"    "LINC00299" 
##  [71] "LMO3"       "LPIN1"      "LUC7L3"     "MAST1"      "MBNL2"     
##  [76] "MFSD12"     "MIA3"       "MORN5"      "MRPL43"     "MRPL51"    
##  [81] "MSTO1"      "MTDH"       "MTMR12"     "MVD"        "NAALAD2"   
##  [86] "NAP1L4"     "NCALD"      "NDRG2"      "NDUFA6-DT"  "NDUFAF6"   
##  [91] "NELFA"      "NETO1"      "NSUN5P2"    "ODF2"       "OGFOD3"    
##  [96] "OSBPL1A"    "P4HA2"      "PCYT2"      "PIN4"       "PLCH2"     
## [101] "PLEKHA5"    "PLXNB1"     "POMT2"      "POT1"       "PRKAG1"    
## [106] "PRKAG2"     "PRR14"      "PRRC2B"     "PRRC2C"     "PSMD13"    
## [111] "PTPRZ1"     "RANGAP1"    "RAPGEF1"    "RASGRF1"    "RASSF8-AS1"
## [116] "RBM25"      "RBMX"       "RERGL"      "RFC5"       "RGS20"     
## [121] "RHOJ"       "RNF212"     "ROCK2"      "ROGDI"      "RPAIN"     
## [126] "RPUSD3"     "RUFY3"      "SCMH1"      "SEZ6L2"     "SHC4"      
## [131] "SLCO1C1"    "SNCB"       "SNHG29"     "SNHG8"      "SNX32"     
## [136] "SON"        "SPATS2L"    "SPINDOC"    "SPRYD7"     "SPSB3"     
## [141] "SRCIN1"     "SRPK2"      "STMN4"      "STYXL1"     "TMEM167A"  
## [146] "TMEM176B"   "TMEM63A"    "TMEM98"     "TSPAN17"    "UBA3"      
## [151] "UIMC1"      "URB1"       "VEZT"       "XPO1"       "ZNF530"    
## [156] "ZNF76"

5.2.1.1.1 Overlap with PD genes

And do any of these genes overlap PD-implicated genes?

# Check for overlaps
ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::filter(gene_name_junc %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>% 
  .[["gene_name_junc"]] %>% 
  unlist() %>% 
  unique() %>% 
  sort()

## [1] "DCAF16" "FBXO7"

Worth noting that FBXO7 was also highlighted in the data only corrected for AoD, RIN (and sex).

5.2.1.1.2 Baseline overlap with PD genes

As a baseline, it's worth establishing the level of overlap in each dataset independently.
Can do this at a cluster level by:
- Constructing contingency table using clusters that were successfully tested of:
  1. Whether cluster is differentially spliced.
  2. Whether cluster overlaps a PD implicated gene.
Differential splicing in this case includes:
- Clusters found differentially spliced in any comparison (Control_vs_PD, Control_vs_PDD or PD_vs_PDD) that pass FDR < 0.05.

# Construct contingency table of differential splicing
# Extract cluster_significance from corresponding leafcutter lists
clu_significance <- 
  leafcutter_list%>%
  lapply(., function(x){
    
    x$cluster_significance %>% 
      dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"),
                    !str_detect(genes, ",")) %>% 
      dplyr::mutate(cluster_id = str_replace(cluster, "chr", "")) %>% 
      dplyr::select(comparison, cluster_id, everything(), -cluster)
    
  })

cont_table <- setNames(vector(mode = "list", length = length(clu)),
                       names(clu))

for(i in 1:length(clu)){
  
  cont_table[[i]] <- mcols(clu[[i]])[c("cluster_id", "gene_name_junc")] %>% 
    as_tibble() %>% 
    dplyr::filter(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
                                                     dplyr::filter(status == "Success") %>%
                                                     .[["cluster_id"]] %>%
                                                     unique())) %>% 
    tidyr::unnest(gene_name_junc) %>% 
    dplyr::distinct(cluster_id, gene_name_junc) %>% 
    dplyr::group_by(cluster_id) %>% 
    dplyr::filter(n() <= 1) %>% 
    dplyr::mutate(ds = case_when(cluster_id %in% c(clu_significance[[names(clu[i])]] %>%
                                                     dplyr::filter(p.adjust < 0.05) %>%
                                                     .[["cluster_id"]] %>%
                                                     unique()) ~ "yes",
                                 TRUE~ "no"),
                  PD_gene = case_when(gene_name_junc %in% 
                                        unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene)) ~ "yes",
                                      TRUE ~ "no")) %>% 
    with(., table(ds, PD_gene, dnn = c("Differentially spliced?", "Overlapping PD gene?")))
  
}

cont_table

## $own
##                        Overlapping PD gene?
## Differentially spliced?    no   yes
##                     no  29746   404
##                     yes  2457    44
## 
## $SRP058181
##                        Overlapping PD gene?
## Differentially spliced?    no   yes
##                     no  22548   327
##                     yes  1524    21

# Fisher's exact test
cont_table %>% 
  lapply(., function(x) x %>% fisher.test())

## $own
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.08873
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9405449 1.8082126
## sample estimates:
## odds ratio 
##   1.318531 
## 
## 
## $SRP058181
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.9118
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.5784634 1.4824421
## sample estimates:
## odds ratio 
##  0.9501564

With the Fisher's exact test we are attempting to answer the question: If there was no association between differential splicing of a cluster and whether the cluster overlaps a PD gene, what is the chance that random sampling would result in the association observed?
For both datasets, following correction for cell-type, there is no evidence of an assoication between differential splicing of a cluster and whether the cluster overlaps a PD-associated gene.

5.2.1.2 dPSI estimates in junctions found within differentially spliced clusters across both datasets

ds_overlap <- ds$own %>% 
  dplyr::filter(cluster_id %in% c(ds_clu$own[subjectHits(overlapped_junctions)] %>% 
                                    as.data.frame() %>% 
                                    .[["cluster_id"]] %>% 
                                    unique())) %>% 
  dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect) %>% 
  dplyr::full_join(ds$SRP058181 %>% 
                     dplyr::filter(cluster_id %in% c(ds_clu$SRP058181[queryHits(overlapped_junctions)] %>% 
                                                       as.data.frame() %>% 
                                                       .[["cluster_id"]] %>% 
                                                       unique())) %>% 
                     dplyr::select(chr, start, end, comparison, cluster_id, loglr, df, Control, PD, PDD, deltapsi, direction_of_effect),
                   by = c("comparison", "chr", "start", "end"),
                   suffix = c(".own", ".SRP058181")) %>% 
  dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE, 
                                                     direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
                                                     TRUE ~ NA)) %>% 
  dplyr::filter(!is.na(same_direction_of_effect))

ds_overlap %>% 
  DT::datatable(rownames = FALSE,
                options = list(scrollX = TRUE),
                class = 'white-space: nowrap')

What proportion of these overlapping junctions found within the same comparison share the same direction of effect?

ds_overlap %>% 
  dplyr::group_by(comparison) %>% 
  dplyr::summarise(n = n(),
                   same_direction_of_effect = sum(same_direction_of_effect)) %>% 
  dplyr::mutate(prop = (same_direction_of_effect/n) * 100)

Despite having a higher number of overlapping junctions when correcting for cell-type compared to just AoD/RIN (868 vs 701)), the number (and proportion) of junctions that actually overlap across the same comparison and share the same direction of effect is lower in cell-type-corrected data.
What is the range of dPSI across these shared junctions?

ds_overlap %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>% 
  tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>% 
  dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>% 
  ggplot(aes(x = abs(dPSI), fill = dataset)) + 
  geom_density(alpha = 0.5) +
  labs(x = expression("Absolute "*Delta*"PSI")) +
  theme_rhr

Density plot of absolute delta PSI values for those junctions that (i) overlap between our own data and SRP0518181, (ii) are within clusters that are differentially spliced in both datasets across the same comparisons, and (iii) share the same direction of effect.

Figure 5.7: Density plot of absolute delta PSI values for those junctions that (i) overlap between our own data and SRP0518181, (ii) are within clusters that are differentially spliced in both datasets across the same comparisons, and (iii) share the same direction of effect.

And how well do dPSI estimates correlate?

a <- ds_overlap %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

b <- ds_overlap %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.025, 0.025), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

ggarrange(a, b,
          ncol = 1,
          labels = c("a", "b"))

Figure 5.8: Scatterplot of absolute delta PSI values for those junctions that (a) overlap between our own data and SRP0518181 and are within clusters that are differentially spliced in both datasets across the same comparisons and (b) share the same direction of effect.

# Correlation of junctions within ds clusters from same comparison
ds_overlap %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 4254074, p-value = 0.02033
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1381891

# Correlation of junctions within ds clusters from same comparison and with same direction of effect
ds_overlap %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 24930, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9112306

From two plots above, following observations can be made.
- Junctions overlapping clusters that are differentially spliced across both datasets, in the same comparison, and with the same direction of effect:
  - Mostly have low dPSI values, although SRP058181 junctions appear to have a higher density of junctions with dPSI > 0.1.
  - Positively correlated.
- If using only junctions overlapping the same comparison, it is clear that there is no real correlation between junctions -- in large part driven by the sheer number of junctions that share a comparison, but not the same direction of effect.

5.2.2 Leniant: differentially spliced in one dataset

5.2.2.1 SRP058181 into own data

# Find junctions that overlap 
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = clu$own,
                              subject = ds_clu$SRP058181,
                              ignore.strand = F, 
                              type = "equal")

# Absolute numbers
print(str_c("Total number of junctions within ds clusters in SRP058181: ", length(ds_clu$SRP058181)))

## [1] "Total number of junctions within ds clusters in SRP058181: 6793"

print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))

## [1] "Number of intersecting junctions: 5952"

# Percentage overlaps
print(str_c("Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: ",
            c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$SRP058181 %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced clusters from SRP058181 into own data: 87.6196084204328"

ds_clu$SRP058181[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(n = n(),
                   prop_overlap = n/length(ds_clu$SRP058181)) %>% 
  dplyr::filter(junc_cat != "ambig_gene") %>% 
  dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>% 
  # ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) + 
  ggplot(aes(x = junc_cat, y = prop_overlap)) + 
  geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
  # facet_wrap(vars(overlap)) +
  labs(x = "", y = "Proportion of overlapping junctions") +
  # scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
  guides(fill = FALSE,
         alpha=guide_legend(nrow=2,byrow=TRUE)) +
  theme_rhr +
  theme(legend.position = "bottom")

Figure 5.9: Overlap of junctions found within differentially spliced clusters from SRP058181 into own data.

5.2.2.2 Own data into SRP058181

# Find junctions that overlap 
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = clu$SRP058181,
                              subject = ds_clu$own,
                              ignore.strand = F, 
                              type = "equal")

# Absolute numbers
print(str_c("Total number of junctions within ds clusters in own data: ", length(ds_clu$own)))

## [1] "Total number of junctions within ds clusters in own data: 12755"

print(str_c("Number of intersecting junctions: ", c(overlapped_junctions %>% subjectHits() %>% unique() %>% length())))

## [1] "Number of intersecting junctions: 9993"

# Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181
print(str_c("Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: ",
            c((overlapped_junctions %>% subjectHits() %>% unique() %>% length())/(ds_clu$own %>% length()) * 100)))

## [1] "Percentage overlap of junctions within differentially spliced cluster from own data into smaller SRP058181: 78.3457467659741"

ds_clu$own[subjectHits(overlapped_junctions)] %>% 
  as.data.frame() %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(n = n(),
                   prop_overlap = n/length(ds_clu$own)) %>% 
  dplyr::filter(junc_cat != "ambig_gene") %>% 
  dplyr::mutate(junc_cat = junc_cat %>% factor(levels = c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))) %>% 
  # ggplot(aes(x = junc_cat, y = prop_overlap, fill = junc_cat)) + 
  ggplot(aes(x = junc_cat, y = prop_overlap)) + 
  geom_col(position = position_dodge2(preserve = "single", padding = 0), colour = "black") +
  # facet_wrap(vars(overlap)) +
  labs(x = "", y = "Proportion of overlapping junctions") +
  scale_fill_manual(values = c(pal_npg(palette = c("nrc"), alpha = 1)(10)[c(4,2,1,3)], "#808080")) +
  guides(fill = FALSE,
         alpha=guide_legend(nrow=2,byrow=TRUE)) +
  theme_rhr +
  theme(legend.position = "bottom")

Figure 5.10: Overlap of junctions found within differentially spliced clusters from our own dataset into the recount dataset, SRP058181.

5.2.2.3 Summary

Unsurprisingly, we see greater overlap of junctions found within differentially spliced clusters from SRP058181 into our own dataset than the reverse situation.
This is expected, given that we have greater sequencing depth to call junctions in our own data.
Proportions are also very similar to what we see with data corrected only for AoD and RIN.

6 Cluster-level overlaps: all clusters

6.1 Discovery-validation matches

Definition of match: 100% of junctions in a cluster from the discovery dataset also found in a cluster from the validation dataset (which may/may not contain an equal number of junctions).
Strategy:
1. Look for overlapping junctions between two datasets.
2. Use overlapping junctions to extract the cluster ids from our own data.
3. Remove ambiguous clusters i.e. clusters from our own data that overlap more than one cluster from SRP058181.
4. Within clusters from own data that overlap a cluster in SRP058181, determine proportion of junctions from total number of junctions in own cluster that overlap. Do the same in the opposite direction i.e. within clusters from SRP058181 that overlap a cluster in own data, determine proportion of junctions from total number of junctions in SRP058181 junction that overlap.
5. Plot histogram.

# Find overlapping junctions/clusters
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = clu$SRP058181,
                              subject = clu$own,
                              ignore.strand = F, 
                              type = "equal")

# Filter by cluster names from overlap
clu_overlap <- clu$own[subjectHits(overlapped_junctions), "cluster_id"] %>% 
  as_tibble() %>% 
  dplyr::inner_join(clu$SRP058181[queryHits(overlapped_junctions), "cluster_id"] %>% 
                      as_tibble(),
                    by = c("seqnames", "start", "end", "strand"),
                    suffix = c(".own", ".SRP058181"))

# Remove clusters from own that overlap more than one cluster in SRP058181
# I.e. ensure no ambiguous cluster mapping
ambig_clu <- clu_overlap %>% 
  dplyr::distinct(cluster_id.own, cluster_id.SRP058181) %>% 
  dplyr::filter(!is.na(cluster_id.SRP058181)) %>% 
  dplyr::group_by(cluster_id.own) %>% 
  dplyr::filter(n() > 1)

# Extract non-ambiguous overlapping clusters
overlapping_clusters <- clu_overlap %>% 
  dplyr::filter(!cluster_id.own %in% unique(ambig_clu$cluster_id.own)) %>% 
  .[["cluster_id.own"]] %>% 
  unique()

# Absolute numbers
print(str_c("Number of clusters in own data: ", length(clu$own$cluster_id %>% unique())))

## [1] "Number of clusters in own data: 43544"

print(str_c("Number of clusters in SRP058181: ", length(clu$SRP058181$cluster_id %>% unique())))

## [1] "Number of clusters in SRP058181: 37021"

print(str_c("Number of overlapping clusters between both datasets: ", length(overlapping_clusters)))

## [1] "Number of overlapping clusters between both datasets: 31825"

# Use intron usage, as unlike clu (leafcutter cluster definitions), this will only include successfully tested clusters
# "Successful" test means cluster passed necessary minimum filters for differential splicing (i.e. min_samples_per_intron, min_samples_per_group, min_coverage)
intron_usage <- 
  leafcutter_list %>%
  lapply(., function(x){
    
    x$intron_usage %>% 
      dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")) %>% 
      tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster"), sep = ":") %>% 
      tidyr::separate(col = cluster, into = c("clu", "id", "strand"), sep = "_", remove = FALSE) %>% 
      dplyr::mutate(chr = str_replace(chr, "chr", ""),
                    cluster_id = str_c(chr, ":", cluster))
    
    })

# Loop through clusters to determine proportion overlap
# Run in parallel
cl <- makeCluster(20)

# Register clusters
registerDoParallel(cl)
getDoParWorkers()

clu_prop_overlap <- foreach(i = 1:length(overlapping_clusters),
                                 .verbose = TRUE,
                                 .packages = c("tidyverse", "stringr")) %dopar% {
                                   
                                   # Determine number of junctions in own cluster
                                   n_junctions.own <- intron_usage$own %>% 
                                     dplyr::filter(cluster_id == overlapping_clusters[i]) %>% 
                                     dplyr::distinct(chr, start, end, strand) %>% 
                                     nrow()
                                   
                                   # Determine number of junctions in overlap
                                   n_overlap <- clu_overlap %>% 
                                     dplyr::filter(cluster_id.own == overlapping_clusters[i],
                                                   !is.na(cluster_id.SRP058181)) %>% 
                                     nrow()
                                   
                                   # Determine number of junctions in SRP058181 cluster
                                   cluster_id.SRP058181 <- clu_overlap %>% 
                                     dplyr::filter(cluster_id.own == overlapping_clusters[i],
                                                   !is.na(cluster_id.SRP058181)) %>% 
                                     .[["cluster_id.SRP058181"]] %>% 
                                     unique()
                                   
                                   n_junctions.SRP058181 <- intron_usage$SRP058181 %>% 
                                     dplyr::filter(cluster_id == cluster_id.SRP058181) %>% 
                                     dplyr::distinct(chr, start, end, strand) %>% 
                                     nrow()
                                   
                                   df <- tibble(cluster_id.own = overlapping_clusters[i],
                                                n_junctions.own = n_junctions.own,
                                                cluster_id.SRP058181 = cluster_id.SRP058181,
                                                n_junctions.SRP058181 = n_junctions.SRP058181,
                                                n_overlap = n_overlap,
                                                prop_overlap.own = (n_overlap/n_junctions.own)*100,
                                                prop_overlap.SRP058181 = (n_overlap/n_junctions.SRP058181)*100)
                                   
                                   
                                   df
                                 }

# Stop cluster
stopCluster(cl = cl)

clu_prop_overlap <- clu_prop_overlap %>% 
  qdapTools::list_df2df() %>% 
  dplyr::select(-X1)

saveRDS(
  clu_prop_overlap, 
  file.path(
    path_to_results,
    "leafcutter/diff_splicing_PCaxes/cluster_overlap_with_SRP058181.Rds"
  )
)

clu_prop_overlap <- 
  readRDS(
    file.path(
      path_to_results,
      "leafcutter/diff_splicing_PCaxes/cluster_overlap_with_SRP058181.Rds"
    )
  )

clu_prop_overlap %>% 
  tidyr::gather(key = "overlap", value = "prop_overlap", contains("prop_overlap")) %>% 
  dplyr::mutate(overlap = case_when(overlap == "prop_overlap.own" ~ "own --> SRP058181",
                                    overlap == "prop_overlap.SRP058181" ~ "SRP058181 --> own")) %>% 
  ggplot(aes(x = prop_overlap, fill = overlap)) +
  geom_histogram(binwidth = 5, alpha = 0.5, colour = "black") +
  facet_wrap(~overlap) +
  labs(x = "Proportion overlap of cluster from discovery dataset into validation dataset") +
  theme_rhr +
  theme(legend.position = "none")

Figure 6.1: Histogram of the proportion overlap between all clusters derived from own data that are also found in SRP058181, and vice versa.

Own --> SRP058181
- Found that 63.4% of clusters overlapping between own data and SRP058181 share 100% of their junctions, as defined by their cluster definition in our own data.
SRP058181 --> Own
- Found that 57% of clusters overlapping between own data and SRP058181 share 100% of their junctions, as defined by their cluster definition in SRP058181.

6.2 Exact match

Definition of match: clusters that intersect between datasets contain the same number of junctions, with matching donor and acceptor positions.
Strategy:
1. Use calculated cluster overlap proportions. Filter for rows where proportion overlap is 100% irrespective of which dataset was used to define the cluster.
2. Construct contigency table of differentially spliced/not differentially spliced for exact matches. To do this requires extracting cluster_significance dataframe for both datasets, which contains the necessary information re. cluster-level differential splicing.
3. Fisher's exact test.

6.2.1 Number of "exactly" matching clusters

# Extract cluster_significance from corresponding leafcutter lists
clu_significance <- 
  leafcutter_list%>%
  lapply(., function(x){
    
    x$cluster_significance %>% 
      dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"),
                    !str_detect(genes, ",")) %>% 
      dplyr::mutate(cluster_id = str_replace(cluster, "chr", "")) %>% 
      dplyr::select(comparison, cluster_id, everything(), -cluster)
    
  })

# Filter cluster overlap proportions to determine clusters that "exactly" match
clu_prop_overlap_exact <- clu_prop_overlap %>% 
  dplyr::filter(prop_overlap.own != Inf,
                prop_overlap.own >= 100, 
                prop_overlap.own == prop_overlap.SRP058181)

# Filter clu_overlap to include only overlapping clusters that "exactly" match
clu_overlap_exact <- clu_overlap %>% 
  dplyr::distinct(cluster_id.own, cluster_id.SRP058181) %>% 
  dplyr::filter(cluster_id.own %in% clu_prop_overlap_exact$cluster_id.own)

print(str_c("Number of clusters that overlap exactly between both datasets: ", length(clu_overlap_exact$cluster_id.own %>% unique())))

## [1] "Number of clusters that overlap exactly between both datasets: 13433"

6.2.2 Contingency table and fisher's exact test

Differential splicing in this case includes:
- Clusters found differentially spliced in any comparison (Control_vs_PD, Control_vs_PDD or PD_vs_PDD)
- Clusters that pass FDR < 0.05 in both datasets (i.e. this is quite stringent)

# Construct contingency table of differential splicing
cont_table <- clu_overlap_exact %>% 
  dplyr::mutate(ds.own = fct_relevel(case_when(cluster_id.own %in% c(clu_significance$own %>%
                                                                       dplyr::filter(p.adjust < 0.05) %>%
                                                                       .[["cluster_id"]] %>%
                                                                       unique()) ~ TRUE,
                                               TRUE~ FALSE) %>% as.factor(),
                                     levels = c("TRUE", "FALSE")),
                ds.SRP058181 = fct_relevel(case_when(cluster_id.SRP058181 %in% c(clu_significance$SRP058181 %>%
                                                                                   dplyr::filter(p.adjust < 0.05) %>%
                                                                                   .[["cluster_id"]] %>%
                                                                                   unique()) ~ TRUE,
                                                     TRUE~ FALSE) %>% as.factor(),
                                           levels = c("TRUE", "FALSE"))) %>% 
  with(., table(ds.SRP058181, ds.own, dnn = c("Differentially spliced in SRP058181?", "Differentially spliced in own data?")))

cont_table

##                                     Differentially spliced in own data?
## Differentially spliced in SRP058181?  TRUE FALSE
##                                TRUE     64   696
##                                FALSE   772 11901

# Fisher's exact test
cont_table %>% 
  fisher.test()

## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.01312
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.068660 1.853993
## sample estimates:
## odds ratio 
##   1.417495

With the Fisher's exact test we are attempting to answer the question: If there was no association between differential splicing in our own data and differential splicing in SRP058181, what is the chance that random sampling would result in the association observed?
Thus, significant p-value suggests to us that chances of observing this contingency table, and by extension the overlap between datasets, by random chance is very small.
Interpretation of odds ratio:
- OR = odds(differentially spliced in SRP|differentially spliced in own) / odds(differentially spliced in SRP|not differentially spliced in own)
- The odds of a cluster being differentially spliced in SRP058181 conditional upon the cluster being differentially spliced in our own data is 1.42 times that for a cluster that is not differentially spliced in our own data.

6.2.3 Contingency table and fisher's exact test (using nominal p-value in SRP058181)

Differential splicing in this case includes:
- Clusters found differentially spliced in any comparison (Control_vs_PD, Control_vs_PDD or PD_vs_PDD)
- Clusters that pass FDR < 0.05 in bulk-tissue RNA-seq and p < 0.05 in SRP058181

# Construct contingency table of differential splicing
cont_table <- clu_overlap_exact %>% 
  dplyr::mutate(ds.own = fct_relevel(case_when(cluster_id.own %in% c(clu_significance$own %>%
                                                                       dplyr::filter(p.adjust < 0.05) %>%
                                                                       .[["cluster_id"]] %>%
                                                                       unique()) ~ TRUE,
                                               TRUE~ FALSE) %>% as.factor(),
                                     levels = c("TRUE", "FALSE")),
                ds.SRP058181 = fct_relevel(case_when(cluster_id.SRP058181 %in% c(clu_significance$SRP058181 %>%
                                                                                   dplyr::filter(p < 0.05) %>%
                                                                                   .[["cluster_id"]] %>%
                                                                                   unique()) ~ TRUE,
                                                     TRUE~ FALSE) %>% as.factor(),
                                           levels = c("TRUE", "FALSE"))) %>% 
  with(., table(ds.SRP058181, ds.own, dnn = c("Differentially spliced in SRP058181?", "Differentially spliced in own data?")))

cont_table

##                                     Differentially spliced in own data?
## Differentially spliced in SRP058181? TRUE FALSE
##                                TRUE   304  4020
##                                FALSE  532  8577

# Fisher's exact test
cont_table %>% 
  fisher.test()

## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.008332
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.050127 1.413516
## sample estimates:
## odds ratio 
##   1.219146

With the Fisher's exact test we are attempting to answer the question: If there was no association between differential splicing in our own data and differential splicing in SRP058181, what is the chance that random sampling would result in the association observed?
Thus, significant p-value suggests to us that chances of observing this contingency table, and by extension the overlap between datasets, by random chance is very small.
Interpretation of odds ratio:
- OR = odds(differentially spliced in SRP|differentially spliced in own) / odds(differentially spliced in SRP|not differentially spliced in own)
- The odds of a cluster being differentially spliced in SRP058181 conditional upon the cluster being differentially spliced in our own data is 1.22 times that for a cluster that is not differentially spliced in our own data.

7 Cluster-level overlaps: differentially spliced clusters

7.1 Discovery-validation matches

Overall strategy:
1. Find the set of intersecting junctions between both datasets --> these can be used to define which clusters from our own data could be detected in SRP058181.
2. Validation will depend on X % of junctions found within a cluster from our own data being found in a cluster from SRP058181.
3. Those clusters that "validate" and are found differentially spliced in our own data can then be tested for differential splicing in SRP058181 i.e. adjust p-value within comparison to reflect smaller number of tests.
Only cell-type-corrected data used.

7.1.1 Intersecting clusters

Strategy:
1. Look for overlapping junctions between two datasets, using:
  1. Clusters differentially spliced in our own data.
  2. All clusters from SRP058181.
2. Use overlapping junctions to extract the cluster ids from our own data.
3. Remove ambiguous clusters i.e. clusters from our own data that overlap more than one cluster from SRP058181.
4. Within clusters from own data that overlap a cluster in SRP058181, determine proportion of junctions from total number of junctions in own cluster that overlap. Plot histogram.

# Find overlapping junctions/clusters
overlapped_junctions <- 
  GenomicRanges::findOverlaps(query = clu$SRP058181,
                              subject = ds_clu$own,
                              ignore.strand = F, 
                              type = "equal")

# Filter by cluster names from overlap
clu_overlap_ds <- 
  ds_clu$own[subjectHits(overlapped_junctions), "cluster_id"] %>% 
  as_tibble() %>% 
  dplyr::inner_join(clu$SRP058181[queryHits(overlapped_junctions), "cluster_id"] %>% 
              as_tibble(),
              by = c("seqnames", "start", "end", "strand"),
              suffix = c(".own", ".SRP058181"))

# Filter generated clu_prop_overlap for cluster ids from clu_overlap_ds
# clu_prop_overlap already filtered for ambiguous clusters
# Generate geom_density
clu_prop_overlap %>% 
  dplyr::filter(cluster_id.own %in% clu_overlap_ds$cluster_id.own) %>% 
  tidyr::gather(key = "overlap", value = "prop_overlap", contains("prop_overlap")) %>% 
  dplyr::mutate(overlap = case_when(overlap == "prop_overlap.own" ~ "ds own --> all SRP058181",
                                    overlap == "prop_overlap.SRP058181" ~ "all SRP058181 --> ds own")) %>% 
  ggplot(aes(x = prop_overlap, fill = overlap)) +
  geom_histogram(binwidth = 5, alpha = 0.5, colour = "black") +
  facet_wrap(~overlap) +
  labs(x = "Proportion overlap of cluster from discovery dataset into validation dataset") +
  theme_rhr +
  theme(legend.position = "none")

Figure 7.1: Histogram of the proportion overlap between differentially spliced clusters derived from own data that are also found in SRP058181.

Differentially spliced own --> all SRP058181
- Found that 60.2% clusters overlapping between own data (filtered to include only differentially spliced clusters) and SRP058181 share 100% of their junctions, as defined by their cluster definition in our own data.
All SRP058181 --> differentially spliced own
- Found that 62.8% of clusters overlapping between own data (filtered to include only differentially spliced clusters) and SRP058181 share 100% of their junctions, as defined by their cluster definition in SRP058181.
This is similar to what we saw when using all clusters.
Given that proportion is so large, will use those clusters with 100% overlap.

7.1.2 FDR-correction of intersecting SRP058181 clusters

Strategy:
1. Create dataframe (overlapping_clusters) with clusters from own data that:
  1. Have a 100% overlap i.e. all junctions within cluster were found within a cluster from SRP058181.
  2. AND were found differentially spliced.
2. For SRP058181, filter cluster_significance dataframe for those clusters from overlapping_clusters. Then, for each pairwise comparison, perform FDR-correction using the smaller set of overlapping clusters from SRP058181.
3. For both datasets, merge junction-level data (intron_usage) with cluster-level data (cluster_significance) and filter for only those cluster ids in overlapping_clusters. Once this is done, merge output from each dataset (ensuring to filter SRP058181 for only those clusters that are now significant following FDR-correction).
4. Perform a full join of the two lists by: comparison, chromosome, start and end site of junction, overlapping genes. By joining using comparison, we ensure that we are only keeping those junctions that were found differentially spliced in the SAME comparison in one or both datasets.
5. Add a column same_direction_of_effect. This column evaluates whether junctions that were found differentially spliced in the same comparison across one or both datasets share the same direction of effect.

# Filter for overlapping clusters with 100% overlap from own data into SRP058181
# AND that are differentially spliced in own data
overlapping_clusters <- 
  clu_prop_overlap %>% 
  dplyr::filter(prop_overlap.own != Inf,
                prop_overlap.own >= 100,
                cluster_id.own %in% clu_overlap_ds$cluster_id.own
                )

print(str_c("Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: ",
            clu_significance$SRP058181 %>% 
              dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, 
                            status == "Success",
                            p.adjust < 0.05) %>% 
              dplyr::distinct(cluster_id) %>% 
              nrow()))

## [1] "Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: 88"

# Correct each comparison using fdr on smaller set
clu_significance_fdr <- clu_significance

clu_significance_fdr$SRP058181 <-
  clu_significance$SRP058181 %>% 
  dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, status == "Success") %>% 
  dplyr::group_by(comparison) %>% 
  dplyr::mutate(p.adjust = p.adjust(p, method = "fdr"))

print(str_c("Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: ",
            clu_significance_fdr$SRP058181 %>% 
              dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, 
                            status == "Success",
                            p.adjust < 0.05) %>% 
              dplyr::distinct(cluster_id) %>% 
              nrow()))

## [1] "Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: 116"

# Create list with dataframes from each dataset containing merged intron usage and cluster significance only for overlapping_clusters
clu_val <- vector(mode = "list", length = 2)

for(i in 1:length(clu_val)){
  
  # Get name of each dataset
  name <- names(intron_usage)[i]
  
  # Column name for cluster ids in each dataset
  col_name <- str_c("cluster_id.", name)
  
  # Join intron usage (i.e. junction-level data) with cluster significance (cluster-level significance)
  # And filter for only those clusters found within overlapping_clusters
  clu_val[[i]] <- intron_usage[[name]] %>% 
    # Filter for appropriate comparisons
    dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")) %>% 
    dplyr::inner_join(clu_significance_fdr[[name]]) %>% 
    dplyr::select(chr, start, end, comparison, cluster_id, loglr, p.adjust, df, Control, PD, PDD, deltapsi, genes) %>% 
    # Add direction of effect
    dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
                                                  deltapsi < 0 ~ "downregulated",
                                                  deltapsi == 0 ~ "no_change")) %>% 
    dplyr::filter(cluster_id %in% c(overlapping_clusters %>% 
                                                  .[[col_name]])) 
  names(clu_val)[i] <- name
  
}

# Join lists together
# Add additional column evaluating for those junctions shared between comparisons whether they share direction of effect 
# Remove any junctions that are not shared between comparisons
ds_val <- clu_val$own %>% 
  full_join(clu_val$SRP058181 %>% 
              dplyr::filter(p.adjust < 0.05), 
            by = c("comparison", "chr", "start", "end", "genes"),
            suffix = c(".own", ".SRP058181")) %>% 
  dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE, 
                                                          direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
                                                    TRUE ~ NA))

7.1.3 Correlating dPSI estimates

If we now return to the junction level... from the set of junctions that are (i) found differentially spliced in one or both datasets and (ii) appear in the same comparison in both datasets, what proportion of these share the same direction of effect? And would we expect to see this by random chance?
Strategy:
- Construct contingency tables, with:
  1. Differentially spliced in both data sets?
  2. Same direction of effect (conditional on same comparison)?
- Run Fisher's exact test.

cont_table <- 
  ds_val %>% 
  dplyr::mutate(same_direction_of_effect = fct_relevel(as.factor(same_direction_of_effect),
                                                       levels = c("TRUE", "FALSE")),
                ds_both = fct_relevel(case_when(p.adjust.own < 0.05 & p.adjust.SRP058181 < 0.05~ TRUE,
                                                TRUE~ FALSE) %>% as.factor(),
                                      levels = c("TRUE", "FALSE"))) %>% 
  dplyr::filter(!is.na(same_direction_of_effect)) %>% 
  with(., table(same_direction_of_effect, ds_both,  
                dnn = c("Same direction of effect?", "Differentially spliced (same comparison) in both datasets?")))

cont_table

##                          Differentially spliced (same comparison) in both datasets?
## Same direction of effect? TRUE FALSE
##                     TRUE    93   168
##                     FALSE  120   187

cont_table %>% 
  fisher.test()

## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.434
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.603794 1.231333
## sample estimates:
## odds ratio 
##  0.8628749

No association between whether junctions within clusters found differentially spliced in both datasets across the same comparison also share the same direction of effect.

a <- ds_val %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>% 
  tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>% 
  dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>% 
  ggplot(aes(x = abs(dPSI), fill = dataset)) + 
  geom_density(alpha = 0.5) +
  coord_cartesian(ylim = c(0,40)) +
  labs(x = expression("Absolute "*Delta*"PSI")) +
  theme_rhr

b <- ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>% 
  tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>% 
  dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>% 
  ggplot(aes(x = abs(dPSI), fill = dataset)) + 
  geom_density(alpha = 0.5) +
  coord_cartesian(ylim = c(0,40)) +
  labs(x = expression("Absolute "*Delta*"PSI")) +
  theme_rhr

ggarrange(a, b,
          labels = c("a", "b"),
          common.legend = TRUE, 
          legend = "bottom")

Density plot of absolute delta PSI values for (a) those junctions that (i) overlap between our own data and SRP0518181, (ii) are within clusters that are differentially spliced in *one or both* datasets across the same comparisons, and (iii) share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

Figure 7.2: Density plot of absolute delta PSI values for (a) those junctions that (i) overlap between our own data and SRP0518181, (ii) are within clusters that are differentially spliced in one or both datasets across the same comparisons, and (iii) share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

And how well do dPSI estimates correlate?

a <- ds_val %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

b <- ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

ggarrange(a, b,
          ncol = 1,
          labels = c("a", "b"))

Scatterplot of absolute delta PSI values for those junctions that (a) overlap between our own data and SRP0518181, are within clusters that are differentially spliced in *one or both* datasets across the same comparisons and share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

Figure 7.3: Scatterplot of absolute delta PSI values for those junctions that (a) overlap between our own data and SRP0518181, are within clusters that are differentially spliced in one or both datasets across the same comparisons and share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

# Correlation of junctions with same direction of effect within ds clusters 
ds_val %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 385454, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8699206

# Correlation of all junctions within ds clusters that contain some junctions with same direction of effect
ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 20826034, p-value = 0.1909
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.0580042

Unsurprisingly, dPSI correlates well (and positively) when looking at junctions with the same direction of effect within ds clusters found across both datasets. However, if we seed from these junctions back to the clusters they originate from and check correlation across all junctions within a cluster, the correlation of dPSI is far from obvious.
This is in large part because junctions found to share same direction of effect typically make up < 75% of the larger cluster they originate from, as seen from the histogram below. Thus, looking at this from a whole cluster perspective will add noise to the comparison.

ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  dplyr::group_by(cluster_id.own) %>% 
  dplyr::summarise(n = n(),
                   same_direction_of_effect = sum(same_direction_of_effect, na.rm = TRUE),
                   prop_own = same_direction_of_effect/n) %>% 
  ggplot(aes(x = prop_own)) + 
  geom_histogram(binwidth = 0.01) +
  labs(x = "Proportion of junctions within a shared cluster that share the same direction of effect") +
  coord_cartesian(xlim = c(0,1.05)) +
  theme_rhr

Figure 7.4: Histogram of shared clusters and the proportion of junctions within each cluster that share the same direction of effect.

7.1.4 Pathway enrichment analysis

Gene lists derived by using genes overlapping "validated" clusters that are differentially spliced in both datasets that contain junctions that share the same direction of effect.

7.1.4.1 Overlap between gene lists

Gene lists are quite small:

# Extract genes
genes_list <- setNames(ds_val %>% 
                              dplyr::filter(same_direction_of_effect == TRUE,
                                            p.adjust.own < 0.05,
                                            p.adjust.SRP058181 < 0.05
                                            ) %>% 
                              group_split(comparison),
                            ds_val %>% 
                              dplyr::filter(same_direction_of_effect == TRUE,
                                            p.adjust.own < 0.05,
                                            p.adjust.SRP058181 < 0.05
                                            ) %>% 
                              .[["comparison"]] %>% 
                              unique() %>% 
                              sort()) %>% 
  # For each dataframe, extract the gene column and remove duplicate genes
  lapply(., function(x){
    
    x %>% 
      .[["genes"]] %>% 
      unique()
  })

lapply(genes_list, length)

## $Control_vs_PD
## [1] 3
## 
## $Control_vs_PDD
## [1] 15
## 
## $PD_vs_PDD
## [1] 10

Thus, worth checking overlap between them.

# Background list
all_genes <- leafcutter_list$own$cluster_significance %>% 
  # remove clusters that overlap multiple genes
  dplyr::filter(!str_detect(genes, ",")) %>% 
  .[["genes"]] %>% 
  unique() 

UpSetR::upset(data = fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")

Overlap between genes lists derived from clusters found differentially spliced in the same comparison and with the same direction of effect in our own data & SRP058181. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

Figure 7.5: Overlap between genes lists derived from clusters found differentially spliced in the same comparison and with the same direction of effect in our own data & SRP058181. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

No overlaps.
Genes in each list include:

genes_list

## $Control_vs_PD
## [1] "ARHGAP9"  "KIAA1755" "TPMT"    
## 
## $Control_vs_PDD
##  [1] "NAP1L4"  "RAPGEF3" "PRKAG1"  "RBM25"   "POMT2"   "RASGRF1" "RHOT1"  
##  [8] "AP3D1"   "ARHGEF1" "RPS9"    "ADORA1"  "BCAS1"   "LPIN1"   "ACTR8"  
## [15] "RUFY3"  
## 
## $PD_vs_PDD
##  [1] "MVD"        "DLGAP1-AS4" "AMPD2"      "SCMH1"      "CTSZ"      
##  [6] "CCNL1"      "ARAP2"      "CAST"       "ABHD11"     "RGS20"

7.1.4.2 Pathway results

FDR correction (as opposed to gSCS) used.
Terms filtered for size (i.e. term size > 20 or <= 2000)

gprofiler <- lapply(genes_list, function(x){
  x %>% 
    gost(., organism = "hsapiens",
       correction_method= "fdr", significant = TRUE,
       user_threshold = 0.05,
       custom_bg= all_genes,
       sources = c("GO", "REAC", "KEGG", "OMIM"),
       evcodes = TRUE)})

gprofiler %>% 
  lapply(., function(x){
    x$result %>% 
      as_tibble()
  }) %>% 
  qdapTools::list_df2df(., col = "comparison") %>% 
  dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>% 
  dplyr::filter(term_size > 20 & term_size <= 2000) %>%
  ggplot(aes(x = comparison, 
             y = term_name, 
             size = precision, 
             fill = p_value)) +
  geom_point(pch = 21) +    
  scale_fill_viridis_c(direction = -1) +
  labs(x = "Comparison", y = "Dataset: Cell type") +
  theme_rhr

Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

Figure 7.6: Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

7.1.5 EWCE

Same gene list as above i.e. genes overlapping "validated" clusters that are differentially spliced in both datasets and contain junctions that are differentially spliced across the same comparison and with the same direction of effect.

# Load internal specificity matrices
ctd_files <- 
    list.files(
    file.path(
      path_to_results,
      "snRNA/specificity_matrices/2020_March/"
    ), 
    pattern = "ctd", 
    full.names = T)
    

ctd_list <- vector(mode = "list", length = length(ctd_files))

for(i in 1:length(ctd_files)){
  
  # Load ctd
  ctd_list[[i]] <- readRDS(ctd_files[i])
  
  # Extract file name
  title <- 
    ctd_files[i] %>% 
    str_replace(".*/", "") %>% 
    str_replace("\\..*", "") %>% 
    str_replace(".*_", "")
  
  # Name list
  names(ctd_list)[i] <- title
 
}

# Run EWCE
# Cannot run with Control_vs_PD, as list is too small for EWCE
ewce <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list[2:3],
                                         ctd_list$Control, ctd_list$PD,
                                         ctd_list$PDD, ctd_list$DLB,
                                         celltypeLevel = 1, 
                                         reps = 100000, 
                                         genelistSpecies = "human", 
                                         sctSpecies = "human",
                                         mouse_to_human = NULL)

saveRDS(
  ewce, 
  file = 
    file.path(
      path_to_results, 
      "leafcutter/ewce/ewce_leafcutter_ds_validated_celltype.Rds"
    )
)

ewce <- 
  readRDS(
    file.path(
      path_to_results, 
      "leafcutter/ewce/ewce_leafcutter_ds_validated_celltype.Rds"
    )
  )

plot <- ewce %>%
  dplyr::mutate(FDR.p = p.adjust(p, method="BH"),
                Class = case_when(
                  CellType %in% c("Astrocyte", "ASC", "Astro") ~ "Astrocyte",
                  CellType %in% c("MG", "Microglia") ~ "Microglia",
                  CellType %in% c("END", "Endo", "Endothelial cell", "Vascular") ~ "Vascular",
                  CellType %in% c("ODC", "Oligo", "Oligodendrocyte") ~ "Oligodendrocyte",
                  CellType %in% c("exCA", "Excitatory", "exDG", "exPFC", "Glutamatergic") ~ "Excitatory \n neuron",
                  CellType %in% c("GABA", "GABAergic", "Inhibitory") ~ "Inhibitory \n neuron",
                  CellType == "OPC" ~ "OPC",
                  CellType == "NSC" ~ "NSC"),
                Study = str_replace(Study, "list\\$", ""),
                Study = fct_relevel(Study, 
                                    c("Control", "PD", "PDD", "DLB")),
                joint_name = str_c(Study, CellType, sep = ":"),
                sd_from_mean = ifelse(p < 0.05, sd_from_mean, NA)) %>% 
  dplyr::arrange(Class, Study) %>% 
  dplyr::mutate_at(vars(joint_name), list(~ factor(., levels = unique(.)))) %>% 
  ggplot(aes(x = GeneSet, y = forcats::fct_rev(joint_name))) +
  geom_tile(aes(fill = sd_from_mean), colour = "black") +    
  facet_grid(rows = vars(Class), scales = "free", space = "free_y") +
  scale_fill_viridis_c(na.value = "grey") +
  labs(x = "", y = "Dataset: Cell type") +
  theme_rhr + 
  theme(panel.grid = element_blank(),
        strip.text.y = element_text(size = 8, angle = 0))

plot

EWCE results using only genes overlapping clusters validated in SRP058181 and single-cell RNA-seq data from our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. No cell types passed multiple test correction performed across all results using FDR. Results with uncorrected p > 0.05 were coloured grey.

Figure 7.7: EWCE results using only genes overlapping clusters validated in SRP058181 and single-cell RNA-seq data from our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. No cell types passed multiple test correction performed across all results using FDR. Results with uncorrected p > 0.05 were coloured grey.

Highlighted cell types include (with a comparison to enrichments seen for our own data wherein top 100 differentially spliced genes filtered by dPSI >= 0.1 were used; see section 8.2 in Leafcutter.html:
- Oligodendrocytes for Control_vs_PDD. Similar to what we saw using top 100 differentially spliced genes.
- And if we allow for nominal results (as shown below), excitatory neurons also highlighted for PD_vs_PDD. Similar to what we saw using top 100 differentially spliced genes, and also in the same specificity matrices (i.e. PDD and DLB).

7.2 Exact match

Overall strategy:
1. Find the set of intersecting junctions between both datasets --> these can be used to define which clusters from our own data could be detected in SRP058181.
  - Definition of match: clusters that intersect between datasets contain the same number of junctions, with matching donor and acceptor positions.
2. Validation will depend on the match between clusters in the two datasets being exact.
3. Those clusters that "validate" and are found differentially spliced in our own data can then be tested for differential splicing in SRP058181 i.e. adjust p-value within comparison to reflect smaller number of tests.
Only cell-type-corrected data used.

7.2.1 Intersecting clusters

This has already been done in the previous section. All that now requires doing is filtering the output of this to include only clusters that overlap exactly between the two datasets.

# Filter generated clu_prop_overlap for cluster ids from clu_overlap_ds
# clu_prop_overlap already filtered for ambiguous clusters
# Generate geom_density
clu_prop_overlap_exact %>% 
  dplyr::filter(cluster_id.own %in% clu_overlap_ds$cluster_id.own) %>% 
  tidyr::gather(key = "overlap", value = "prop_overlap", contains("prop_overlap")) %>% 
  dplyr::mutate(overlap = case_when(overlap == "prop_overlap.own" ~ "ds own --> all SRP058181",
                                    overlap == "prop_overlap.SRP058181" ~ "all SRP058181 --> ds own")) %>% 
  ggplot(aes(x = prop_overlap, fill = overlap)) +
  geom_histogram(binwidth = 5, alpha = 0.5, colour = "black") +
  facet_wrap(~overlap) +
  labs(x = "Proportion overlap of cluster from discovery dataset into validation dataset") +
  theme_rhr +
  theme(legend.position = "none")

Histogram of the proportion overlap between *differentially spliced* clusters derived from own data that exactly match clusters that are also found in SRP058181.

Figure 7.8: Histogram of the proportion overlap between differentially spliced clusters derived from own data that exactly match clusters that are also found in SRP058181.

7.2.2 FDR-correction of intersecting SRP058181 clusters

Strategy:
1. Create dataframe (overlapping_clusters) with clusters from own data that:
  1. Have a 100% overlap i.e. all junctions within cluster were found within a cluster from SRP058181.
  2. AND were found differentially spliced.
2. For SRP058181, filter cluster_significance dataframe for those clusters from overlapping_clusters. Then, for each pairwise comparison, perform FDR-correction using the smaller set of overlapping clusters from SRP058181.
3. For both datasets, merge junction-level data (intron_usage) with cluster-level data (cluster_significance) and filter for only those cluster ids in overlapping_clusters. Once this is done, merge output from each dataset (ensuring to filter SRP058181 for only those clusters that are now significant following FDR-correction).
4. Perform a full join of the two lists by: comparison, chromosome, start and end site of junction, overlapping genes. By joining using comparison, we ensure that we are only keeping those junctions that were found differentially spliced in the SAME comparison in one or both datasets.
5. Add a column same_direction_of_effect. This column evaluates whether junctions that were found differentially spliced in the same comparison across one or both datasets share the same direction of effect.

# Filter for overlapping clusters with 100% overlap from own data into SRP058181
# AND that are differentially spliced in own data
overlapping_clusters <- 
  clu_prop_overlap_exact %>% 
  dplyr::filter(cluster_id.own %in% clu_overlap_ds$cluster_id.own)

print(str_c("Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: ",
            clu_significance$SRP058181 %>% 
              dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, 
                            status == "Success",
                            p.adjust < 0.05) %>%
              dplyr::ungroup() %>% 
              dplyr::distinct(cluster_id) %>% 
              nrow()))

## [1] "Number of overlapping clusters differentially spliced in SRP058181, prior to FDR-correction within smaller set: 64"

# Correct each comparison using fdr on smaller set
clu_significance_fdr <- clu_significance

clu_significance_fdr$SRP058181 <-
  clu_significance$SRP058181 %>% 
  dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, status == "Success") %>% 
  dplyr::group_by(comparison) %>% 
  dplyr::mutate(p.adjust = p.adjust(p, method = "fdr"))

print(str_c("Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: ",
            clu_significance_fdr$SRP058181 %>% 
              dplyr::filter(cluster_id %in% overlapping_clusters$cluster_id.SRP058181, 
                            status == "Success",
                            p.adjust < 0.05) %>% 
              dplyr::ungroup() %>% 
              dplyr::distinct(cluster_id) %>% 
              nrow()))

## [1] "Number of overlapping clusters differentially spliced in SRP058181, following FDR-correction within smaller set: 69"

# Create list with dataframes from each dataset containing merged intron usage and cluster significance only for overlapping_clusters
clu_val <- vector(mode = "list", length = 2)

for(i in 1:length(clu_val)){
  
  # Get name of each dataset
  name <- names(intron_usage)[i]
  
  # Column name for cluster ids in each dataset
  col_name <- str_c("cluster_id.", name)
  
  # Join intron usage (i.e. junction-level data) with cluster significance (cluster-level significance)
  # And filter for only those clusters found within overlapping_clusters
  clu_val[[i]] <- intron_usage[[name]] %>% 
    # Filter for appropriate comparisons
    dplyr::filter(comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")) %>% 
    # dplyr::inner_join(clu_significance_fdr[[name]]) %>% 
        dplyr::inner_join(clu_significance_fdr[[name]]) %>% 
    dplyr::select(chr, start, end, comparison, cluster_id, loglr, p.adjust, df, Control, PD, PDD, deltapsi, genes) %>% 
    # Add direction of effect
    dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
                                                  deltapsi < 0 ~ "downregulated",
                                                  deltapsi == 0 ~ "no_change")) %>% 
    dplyr::filter(cluster_id %in% c(overlapping_clusters %>% 
                                                  .[[col_name]])) 
  names(clu_val)[i] <- name
  
}

# Join lists together
# Add additional column evaluating for those junctions shared between comparisons whether they share direction of effect 
# Remove any junctions that are not shared between comparisons
ds_val <- clu_val$own %>% 
  full_join(clu_val$SRP058181 %>% 
              dplyr::filter(p.adjust < 0.05), 
            by = c("comparison", "chr", "start", "end", "genes"),
            suffix = c(".own", ".SRP058181")) %>% 
  dplyr::mutate(same_direction_of_effect = case_when(direction_of_effect.own == direction_of_effect.SRP058181 ~ TRUE, 
                                                          direction_of_effect.own != direction_of_effect.SRP058181 ~ FALSE,
                                                    TRUE ~ NA))

7.2.3 Correlating dPSI estimates

If we now return to the junction level... from the set of junctions that are (i) found differentially spliced in one or both datasets and (ii) appear in the same comparison in both datasets, what proportion of these share the same direction of effect? And would we expect to see this by random chance?
Strategy:
- Construct contingency tables, with:
  1. Differentially spliced in both data sets?
  2. Same direction of effect (conditional on same comparison)?
- Run Fisher's exact test.

cont_table <- 
  ds_val %>% 
  dplyr::mutate(same_direction_of_effect = fct_relevel(as.factor(same_direction_of_effect),
                                                       levels = c("TRUE", "FALSE")),
                ds_both = fct_relevel(case_when(p.adjust.own < 0.05 & p.adjust.SRP058181 < 0.05~ TRUE,
                                                TRUE~ FALSE) %>% as.factor(),
                                      levels = c("TRUE", "FALSE"))) %>% 
  dplyr::filter(!is.na(same_direction_of_effect)) %>% 
  with(., table(same_direction_of_effect, ds_both,  
                dnn = c("Same direction of effect?", "Differentially spliced (same comparison) in both datasets?")))

cont_table

##                          Differentially spliced (same comparison) in both datasets?
## Same direction of effect? TRUE FALSE
##                     TRUE    49   114
##                     FALSE   67   111

cont_table %>% 
  fisher.test()

## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.1697
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4413735 1.1461547
## sample estimates:
## odds ratio 
##  0.7128169

No association between whether junctions within exactly-matching clusters found differentially spliced in both datasets across the same comparison also share the same direction of effect.

a <- ds_val %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>% 
  tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>% 
  dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>% 
  ggplot(aes(x = abs(dPSI), fill = dataset)) + 
  geom_density(alpha = 0.5) +
  coord_cartesian(ylim = c(0,40)) +
  labs(x = expression("Absolute "*Delta*"PSI")) +
  theme_rhr

b <- ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  dplyr::select(chr, start, end, contains("comparison"), contains("deltapsi")) %>% 
  tidyr::gather(key = dataset, value = dPSI, deltapsi.own:deltapsi.SRP058181) %>% 
  dplyr::mutate(dataset = str_replace(dataset, ".*\\.", "")) %>% 
  ggplot(aes(x = abs(dPSI), fill = dataset)) + 
  geom_density(alpha = 0.5) +
  coord_cartesian(ylim = c(0,40)) +
  labs(x = expression("Absolute "*Delta*"PSI")) +
  theme_rhr

ggarrange(a, b,
          labels = c("a", "b"),
          common.legend = TRUE, 
          legend = "bottom")

Density plot of absolute delta PSI values for (a) those junctions that (i) overlap between our own data and SRP0518181, (ii) are within *exactly-matching* clusters that are differentially spliced in *one or both* datasets across the same comparisons, and (iii) share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

Figure 7.9: Density plot of absolute delta PSI values for (a) those junctions that (i) overlap between our own data and SRP0518181, (ii) are within exactly-matching clusters that are differentially spliced in one or both datasets across the same comparisons, and (iii) share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

a <- ds_val %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

b <- ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  ggplot(aes(x = deltapsi.own, y = deltapsi.SRP058181)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm") +
  labs(x = expression(Delta*"PSI in own data"), y = expression(Delta*"PSI in SRP058181")) +
  facet_zoom(xlim = c(-0.05, 0.05), ylim = c(-0.025, 0.025), zoom.size = 1) +
  theme_rhr

ggarrange(a, b,
          ncol = 1,
          labels = c("a", "b"))

Scatterplot of absolute delta PSI values for those junctions that (a) overlap between our own data and SRP0518181, are within *exactly-matching* clusters that are differentially spliced in *one or both* datasets across the same comparisons and share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

Figure 7.10: Scatterplot of absolute delta PSI values for those junctions that (a) overlap between our own data and SRP0518181, are within exactly-matching clusters that are differentially spliced in one or both datasets across the same comparisons and share the same direction of effect and (b) all junctions found within 'validated' clusters that contain junctions shared as described in (a).

# Correlation of junctions with same direction of effect within ds clusters 
ds_val %>% 
  dplyr::filter(same_direction_of_effect == TRUE) %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 94434, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8691622

# Correlation of all junctions within ds clusters that contain some junctions with same direction of effect
ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  cor.test(~ deltapsi.own + deltapsi.SRP058181, data = ., method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  deltapsi.own and deltapsi.SRP058181
## S = 3825230, p-value = 0.009352
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1499394

Unsurprisingly, dPSI correlates well (and positively) when looking at junctions with the same direction of effect within ds clusters found across both datasets.
However, as before when we seed from these junctions back to the clusters they originate from and check correlation across all junctions within a cluster, the correlation of dPSI while improved using exactly-matching clusters, it is still weak.
Again, this is in large part because junctions found to share same direction of effect typically make up < 75% of the larger cluster they originate from, as seen from the histogram below. Thus, looking at this from a whole cluster perspective will add noise to the comparison.

ds_val %>% 
  dplyr::filter(cluster_id.own %in% c(ds_val %>% 
                                        dplyr::filter(same_direction_of_effect == TRUE) %>% 
                                        .[["cluster_id.own"]])) %>% 
  dplyr::group_by(cluster_id.own) %>% 
  dplyr::summarise(n = n(),
                   same_direction_of_effect = sum(same_direction_of_effect, na.rm = TRUE),
                   prop_own = same_direction_of_effect/n) %>% 
  ggplot(aes(x = prop_own)) + 
  geom_histogram(binwidth = 0.01) +
  labs(x = "Proportion of junctions within a shared cluster that share the same direction of effect") +
  coord_cartesian(xlim = c(0,1.05)) +
  theme_rhr

Figure 7.11: Histogram of shared clusters and the proportion of junctions within each cluster that share the same direction of effect.

7.2.4 Contingency table and fisher's exact test

Differential splicing in this case includes:
- Clusters found differentially spliced in any comparison (Control_vs_PD, Control_vs_PDD or PD_vs_PDD)
- Clusters that pass FDR < 0.05 in both datasets, following multiple testing of SRP058181 using smaller set of clusters that validated between both datasets

clu_significance_fdr$SRP058181 <- clu_significance_fdr$SRP058181 %>% 
              dplyr::bind_rows(clu_significance$SRP058181 %>% 
                                 dplyr::anti_join(clu_significance_fdr$SRP058181, 
                                                  by = c("comparison", "cluster_id")))

# Construct contingency table of differential splicing
clu_overlap_exact_ds <- clu_overlap_exact %>% 
  dplyr::mutate(ds.own = fct_relevel(case_when(cluster_id.own %in% c(clu_significance_fdr$own %>%
                                                                       dplyr::filter(p.adjust < 0.05) %>%
                                                                       .[["cluster_id"]] %>%
                                                                       unique()) ~ TRUE,
                                               TRUE~ FALSE) %>% as.factor(),
                                     levels = c("TRUE", "FALSE")),
                ds.SRP058181 = fct_relevel(case_when(cluster_id.SRP058181 %in% c(clu_significance_fdr$SRP058181 %>%
                                                                                   dplyr::filter(p.adjust < 0.05) %>%
                                                                                   .[["cluster_id"]] %>%
                                                                                   unique()) ~ TRUE,
                                                     TRUE~ FALSE) %>% as.factor(),
                                           levels = c("TRUE", "FALSE")))

cont_table <- clu_overlap_exact_ds %>%
  with(., table(ds.SRP058181, ds.own, dnn = c("Differentially spliced in SRP058181?", "Differentially spliced in own data?")))

cont_table

##                                     Differentially spliced in own data?
## Differentially spliced in SRP058181?  TRUE FALSE
##                                TRUE     69   696
##                                FALSE   767 11901

# saveRDS(clu_overlap_exact_ds, file.path(path_to_results, "leafcutter/diff_splicing_PCaxes/cluster_overlap_exact_ds_with_SRP058181.Rds"))


# Fisher's exact test
cont_table %>% 
  fisher.test()

## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.001956
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.170652 1.995428
## sample estimates:
## odds ratio 
##   1.538183

7.2.5 Pathway enrichment analysis

Gene lists derived by using genes overlapping exactly-matching "validated" clusters that are differentially spliced in both datasets that contain junctions that share the same direction of effect.

7.2.5.1 Overlap between gene lists

Gene lists are very small (and arguably not even worth running gene ontology enrichment on them):

# Extract genes
genes_list <- setNames(ds_val %>% 
                              dplyr::filter(same_direction_of_effect == TRUE,
                                            p.adjust.own < 0.05,
                                            p.adjust.SRP058181 < 0.05
                                            ) %>% 
                              group_split(comparison),
                            ds_val %>% 
                              dplyr::filter(same_direction_of_effect == TRUE,
                                            p.adjust.own < 0.05,
                                            p.adjust.SRP058181 < 0.05
                                            ) %>% 
                              .[["comparison"]] %>% 
                              unique() %>% 
                              sort()) %>% 
  # For each dataframe, extract the gene column and remove duplicate genes
  lapply(., function(x){
    
    x %>% 
      .[["genes"]] %>% 
      unique()
  })

lapply(genes_list, length)

## $Control_vs_PD
## [1] 2
## 
## $Control_vs_PDD
## [1] 8
## 
## $PD_vs_PDD
## [1] 5

Thus, worth checking overlap between them.

# Background list
all_genes <- leafcutter_list$own$cluster_significance %>% 
  # remove clusters that overlap multiple genes
  dplyr::filter(!str_detect(genes, ",")) %>% 
  .[["genes"]] %>% 
  unique() 

UpSetR::upset(data = fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")

Figure 7.12: Overlap between genes lists derived from clusters found differentially spliced in the same comparison and with the same direction of effect in our own data & SRP058181. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

No overlaps.
Genes in each list include:

genes_list

## $Control_vs_PD
## [1] "ARHGAP9" "TPMT"   
## 
## $Control_vs_PDD
## [1] "PRKAG1"  "RBM25"   "POMT2"   "RASGRF1" "ARHGEF1" "BCAS1"   "LPIN1"  
## [8] "ACTR8"  
## 
## $PD_vs_PDD
## [1] "DLGAP1-AS4" "SCMH1"      "CCNL1"      "ARAP2"      "CAST"

7.2.5.2 Pathway results

FDR correction (as opposed to gSCS) used.
Terms filtered for size (i.e. term size > 20 or <= 2000)

gprofiler <- lapply(genes_list, function(x){
  x %>% 
    gost(., organism = "hsapiens",
       correction_method= "fdr", significant = TRUE,
       user_threshold = 0.05,
       custom_bg= all_genes,
       sources = c("GO", "REAC", "KEGG", "OMIM"),
       evcodes = TRUE)})

gprofiler %>% 
  lapply(., function(x){
    x$result %>% 
      as_tibble()
  }) %>% 
  qdapTools::list_df2df(., col = "comparison") %>% 
  dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>% 
  dplyr::filter(term_size > 20 & term_size <= 2000) %>%
  ggplot(aes(x = comparison, 
             y = term_name, 
             size = precision, 
             fill = p_value)) +
  geom_point(pch = 21) +    
  scale_fill_viridis_c(direction = -1) +
  labs(x = "Comparison", y = "Dataset: Cell type") +
  theme_rhr

Figure 7.13: Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

And with clusterProfiler (not filtered for term size, but terms clustered by semantic similarity)

# Also, try running with clusterProfiler, which has a number of useful functionalities:
# `compareCluster`: permits comparison of GO enrichments across groups. 
# Uses hypergeometric model to perform enrichment analysis within each group (i.e. does not account for semantic similarity in testing as GProfiler does, which could be a disadvantage)
# Allows semantic trimming of terms.

library(clusterProfiler)

ont <- c("BP", "CC", "MF")
cluster_compare_GO <- setNames(vector(mode = "list", length = length(ont)),
                             ont)

for(i in 1:length(ont)){
  
  cluster_compare_GO[[i]] <-   
    genes_list %>% 
    qdapTools::list2df(col1 = "genes", col2 = "comparison") %>%  
    clusterProfiler::compareCluster(genes ~ comparison, 
                                    data=., 
                                    keyType = "SYMBOL",
                                    fun="enrichGO", 
                                    ont = ont[i],
                                    pAdjustMethod = "BH",
                                    qvalueCutoff = 0.05,
                                    OrgDb='org.Hs.eg.db')
  
}

# saveRDS(cluster_compare_GO, file.path(path_to_results, "leafcutter/gprofiler/clusterProfiler_leafcutter_overlap_exact_ds_with_SRP058181.Rds"))

cluster_compare_GO <- 
  readRDS(
    file.path(path_to_results, "leafcutter/gprofiler/clusterProfiler_leafcutter_overlap_exact_ds_with_SRP058181.Rds")
  )

cluster_compare_GO %>% 
  lapply(., function(df){
    
    df %>% 
      clusterProfiler::simplify(., cutoff=0.4, by="p.adjust", select_fun=min) %>% 
      clusterProfiler::dotplot(., showCategory = NULL, font.size = 8) +
      geom_point(shape = 1,colour = "black") +
      scale_size_continuous(name = "precision") +
      scale_colour_viridis_c(direction = -1) +
      theme_rhr
    
  })

## $BP

## 
## $CC

## 
## $MF

7.2.6 EWCE

Same gene list as above i.e. genes overlapping exactly-matching "validated" clusters that are differentially spliced in both datasets and contain junctions that are differentially spliced across the same comparison and with the same direction of effect.

# Load internal specificity matrices
ctd_files <- 
    list.files(
    file.path(
      path_to_results,
      "snRNA/specificity_matrices/2020_March/"
    ), 
    pattern = "ctd", 
    full.names = T)
    

ctd_list <- vector(mode = "list", length = length(ctd_files))

for(i in 1:length(ctd_files)){
  
  # Load ctd
  ctd_list[[i]] <- readRDS(ctd_files[i])
  
  # Extract file name
  title <- 
    ctd_files[i] %>% 
    str_replace(".*/", "") %>% 
    str_replace("\\..*", "") %>% 
    str_replace(".*_", "")
  
  # Name list
  names(ctd_list)[i] <- title
 
}

# Run EWCE
# Cannot run with Control_vs_PD, as list is too small for EWCE
ewce <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list[2:3],
                                         ctd_list$Control, ctd_list$PD,
                                         ctd_list$PDD, ctd_list$DLB,
                                         celltypeLevel = 1, 
                                         reps = 100000, 
                                         genelistSpecies = "human", 
                                         sctSpecies = "human",
                                         mouse_to_human = NULL)

saveRDS(
  ewce, 
  file = 
    file.path(
      path_to_results, 
      "leafcutter/ewce/ewce_leafcutter_ds_exact_validated_celltype.Rds"
      )
)

As no results passed FDR < 0.05, used unadjusted p-value < 0.05.

ewce <- 
  readRDS(
    file.path(
      path_to_results, 
      "leafcutter/ewce/ewce_leafcutter_ds_exact_validated_celltype.Rds"
    )
  )

plot <- ewce %>%
  dplyr::filter(!Study %in% c("AIBS2018", "DRONC_human")) %>%
  dplyr::mutate(FDR.p = p.adjust(p, method="BH"),
                Class = case_when(
                  CellType %in% c("Astrocyte", "ASC", "Astro") ~ "Astrocyte",
                  CellType %in% c("MG", "Microglia") ~ "Microglia",
                  CellType %in% c("END", "Endo", "Endothelial cell", "Vascular") ~ "Vascular",
                  CellType %in% c("ODC", "Oligo", "Oligodendrocyte") ~ "Oligodendrocyte",
                  CellType %in% c("exCA", "Excitatory", "exDG", "exPFC", "Glutamatergic") ~ "Excitatory \n neuron",
                  CellType %in% c("GABA", "GABAergic", "Inhibitory") ~ "Inhibitory \n neuron",
                  CellType == "OPC" ~ "OPC",
                  CellType == "NSC" ~ "NSC"),
                Study = str_replace(Study, "list\\$", ""),
                Study = fct_relevel(Study, 
                                    c("Control", "PD", "PDD", "DLB")),
                joint_name = str_c(Study, CellType, sep = ":"),
                sd_from_mean = ifelse(p <= 0.05, sd_from_mean, NA)) %>% 
  dplyr::arrange(Class, Study) %>% 
  dplyr::mutate_at(vars(joint_name), list(~ factor(., levels = unique(.)))) %>% 
  ggplot(aes(x = GeneSet, y = forcats::fct_rev(joint_name))) +
  geom_tile(aes(fill = sd_from_mean), colour = "black") +    
  facet_grid(rows = vars(Class), scales = "free", space = "free_y") +
  scale_fill_viridis_c(na.value = "grey") +
  labs(x = "", y = "Dataset: Cell type") +
  theme_rhr + 
  theme(panel.grid = element_blank(),
        strip.text.y = element_text(size = 8, angle = 0))

plot

EWCE results using only genes overlapping *exactly-matching* clusters validated in SRP058181 and single-cell RNA-seq data from our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. Non-significant results (p > 0.05) were coloured grey.

Figure 7.14: EWCE results using only genes overlapping exactly-matching clusters validated in SRP058181 and single-cell RNA-seq data from our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. Non-significant results (p > 0.05) were coloured grey.

Using exactly-matching clusters and allowing for nominal enrichments, we observe an enrichment of PD_vs_PDD genes in excitatory neurons. This is similar to what we observed for our own data wherein top 100 differentially spliced genes filtered by dPSI >= 0.1 were used; see Leafcutter.html.

8 Session info

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2            dasper_0.0.0.9000     
##  [4] testthat_2.3.2         UpSetR_1.4.0           rtracklayer_1.46.0    
##  [7] GenomicRanges_1.38.0   GenomeInfoDb_1.22.1    IRanges_2.20.2        
## [10] S4Vectors_0.24.4       BiocGenerics_0.32.0    readxl_1.3.1          
## [13] forcats_0.5.1          stringr_1.4.0          dplyr_1.0.2           
## [16] purrr_0.3.4            readr_1.4.0            tidyr_1.1.1           
## [19] tibble_3.0.3           tidyverse_1.3.0        ggsci_2.9             
## [22] ggpubr_0.4.0           ggforce_0.3.2          ggplot2_3.3.2         
## [25] gprofiler2_0.1.9       doParallel_1.0.15      iterators_1.0.12      
## [28] foreach_1.5.0          devtools_2.3.2         usethis_1.6.3         
## [31] data.table_1.13.0      clusterProfiler_3.14.3
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.1.4                  tidyselect_1.1.0           
##   [3] RSQLite_2.2.0               AnnotationDbi_1.48.0       
##   [5] htmlwidgets_1.5.3           grid_3.6.1                 
##   [7] BiocParallel_1.20.1         munsell_0.5.0              
##   [9] codetools_0.2-16            chron_2.3-56               
##  [11] DT_0.15                     withr_2.2.0                
##  [13] colorspace_2.0-0            GOSemSim_2.17.1            
##  [15] Biobase_2.46.0              highr_0.8                  
##  [17] knitr_1.29                  rstudioapi_0.13            
##  [19] ggsignif_0.6.0              DOSE_3.12.0                
##  [21] labeling_0.4.2              urltools_1.7.3             
##  [23] GenomeInfoDbData_1.2.2      polyclip_1.10-0            
##  [25] bit64_4.0.2                 farver_2.0.3               
##  [27] rprojroot_2.0.2             vctrs_0.3.2                
##  [29] generics_0.0.2              xfun_0.16                  
##  [31] BiocFileCache_1.10.2        R6_2.5.0                   
##  [33] graphlayouts_0.7.0          bitops_1.0-6               
##  [35] cachem_1.0.3                fgsea_1.12.0               
##  [37] gridGraphics_0.5-0          DelayedArray_0.12.3        
##  [39] assertthat_0.2.1            scales_1.1.1               
##  [41] ggraph_2.0.3                enrichplot_1.6.1           
##  [43] gtable_0.3.0                processx_3.4.5             
##  [45] tidygraph_1.2.0             rlang_0.4.7                
##  [47] splines_3.6.1               rstatix_0.6.0              
##  [49] lazyeval_0.2.2              broom_0.7.0                
##  [51] europepmc_0.4               BiocManager_1.30.10        
##  [53] yaml_2.2.1                  reshape2_1.4.4             
##  [55] abind_1.4-5                 modelr_0.1.8               
##  [57] crosstalk_1.1.0.1           backports_1.1.8            
##  [59] qvalue_2.18.0               tools_3.6.1                
##  [61] bookdown_0.21               qdapTools_1.3.5            
##  [63] ggplotify_0.0.5             ellipsis_0.3.1             
##  [65] RColorBrewer_1.1-2          ggdendro_0.1.21            
##  [67] sessioninfo_1.1.1           ggridges_0.5.2             
##  [69] Rcpp_1.0.5                  plyr_1.8.6                 
##  [71] progress_1.2.2              zlibbioc_1.32.0            
##  [73] RCurl_1.98-1.2              ps_1.3.4                   
##  [75] prettyunits_1.1.1           openssl_1.4.2              
##  [77] viridis_0.5.1               cowplot_1.0.0              
##  [79] SummarizedExperiment_1.16.1 haven_2.3.1                
##  [81] ggrepel_0.8.2               here_1.0.0                 
##  [83] fs_1.5.0                    magrittr_2.0.1             
##  [85] DO.db_2.9                   openxlsx_4.2.3             
##  [87] triebeard_0.3.0             reprex_1.0.0               
##  [89] matrixStats_0.56.0          pkgload_1.1.0              
##  [91] hms_1.0.0                   evaluate_0.14              
##  [93] XML_3.99-0.3                rio_0.5.16                 
##  [95] gridExtra_2.3               biomaRt_2.42.1             
##  [97] compiler_3.6.1              crayon_1.4.1               
##  [99] htmltools_0.5.1.1           mgcv_1.8-29                
## [101] lubridate_1.7.9             DBI_1.1.1                  
## [103] tweenr_1.0.1                dbplyr_1.4.4               
## [105] rappdirs_0.3.1              MASS_7.3-51.4              
## [107] refGenome_1.7.7             Matrix_1.2-17              
## [109] car_3.0-9                   cli_2.2.0.9000             
## [111] igraph_1.2.5                doBy_4.6.7                 
## [113] pkgconfig_2.0.3             rvcheck_0.1.8              
## [115] GenomicAlignments_1.22.1    foreign_0.8-72             
## [117] plotly_4.9.2.1              xml2_1.3.2                 
## [119] XVector_0.26.0              rvest_0.3.6                
## [121] callr_3.5.1                 digest_0.6.27              
## [123] Biostrings_2.54.0           HGNChelper_0.8.1           
## [125] rmarkdown_2.5               cellranger_1.1.0           
## [127] fastmatch_1.1-0             Deriv_4.0                  
## [129] curl_4.3                    Rsamtools_2.2.3            
## [131] nlme_3.1-141                lifecycle_0.2.0            
## [133] jsonlite_1.7.1              carData_3.0-4              
## [135] fansi_0.4.2                 limma_3.42.2               
## [137] askpass_1.1                 desc_1.2.0                 
## [139] viridisLite_0.3.0           pillar_1.4.6               
## [141] lattice_0.20-38             fastmap_1.0.1              
## [143] httr_1.4.2                  pkgbuild_1.1.0             
## [145] GO.db_3.10.0                glue_1.4.2                 
## [147] remotes_2.2.0               zip_2.1.0                  
## [149] bit_4.0.4                   stringi_1.5.3              
## [151] blob_1.2.1                  memoise_2.0.0

Comparing junction and cluster definitions between own data and SRP058181

Regina H. Reynolds