Aims: to annotate introns within leafcutter clusters by .gtf exon definitions; to identify and visualise diffential splicing events that overlap between comparisons and to assign relevance to differential splicing events.
source(here::here("R", "file_paths.R"))annotate_junc_ref() from David's dasper package. This function has since been deprecated and replaced by junction_annot. Both functions perform the same task, but junction_annot has been made to run faster.annotate_junc_ref() takes as input the junction co-ordinates and annotates the start and end position based on precise overlap with a known exon boundary. Then using reference annotation, infers the strand of junction and categorises junctions in "annotated", "novel_acceptor", "novel_donor", "novel_combo", novel_exon_skip", "ambig_gene" & "none".annotated <- setNames(vector(mode = "list", 2), 
                      c("covar", "PC"))
annotated <- leafcutter_list %>% 
  lapply(., function(x){
    
    # Filter defined introns by those successfully tested
    x$intron_usage %>% 
      tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":", remove = F) %>% 
      dplyr::mutate(cluster_id = str_c(chr, ":", cluster_id)) %>% 
      dplyr::filter(cluster_id %in% c(x$cluster_significance %>% 
                                        dplyr::filter(status == "Success") %>% 
                                        .[["cluster"]] %>% 
                                        unique())) %>% 
      dplyr::select(-chr, -start, -end) %>% 
      # Convert leafcutter output to input for annotate_junc_ref
      convert_leafcutter(leafcutter_results = .,
                         use_strand = TRUE) %>% 
      # Annotate introns
      dasper::annotate_junc_ref(junc_metadata = .,
                                gtf = path_to_ref_gtf)
    
    
  })
saveRDS(
  annotated$covar, 
  file = 
    file.path(path_to_results,
              "leafcutter/diff_splicing/leafcutter_ds_SexRINAoD_gtfannotated.Rds"
    )
)
saveRDS(
  annotated$PC, 
  file = 
    file.path(path_to_results,
              "leafcutter/diff_splicing_PCaxes/leafcutter_ds_PCaxes_gtfannotated.Rds"
    )
)## [1] "covar"
## # A tibble: 7 x 4
##   junc_cat           all significant proportion
##   <chr>            <int>       <int>      <dbl>
## 1 annotated       477838       28265    0.0592 
## 2 novel_acceptor   87873        4910    0.0559 
## 3 novel_donor      58225        3746    0.0643 
## 4 novel_exon_skip  46870        3623    0.0773 
## 5 none             31621         583    0.0184 
## 6 ambig_gene       14078          53    0.00376
## 7 novel_combo       6638         716    0.108  
## [1] "PC"
## # A tibble: 7 x 4
##   junc_cat           all significant proportion
##   <chr>            <int>       <int>      <dbl>
## 1 annotated       477818       29985    0.0628 
## 2 novel_acceptor   87868        5791    0.0659 
## 3 novel_donor      58222        4290    0.0737 
## 4 novel_exon_skip  46870        3721    0.0794 
## 5 none             31619         799    0.0253 
## 6 ambig_gene       14077          41    0.00291
## 7 novel_combo       6637         699    0.105# Significant ds events
for(i in 1:length(annotated)){
  
  print(names(annotated)[i])
  
  annotated[[i]] %>% 
    as.data.frame() %>%
    dplyr::group_by(junc_cat) %>% 
    dplyr::summarise(all = n()) %>% 
    dplyr::inner_join(annotated[[i]] %>% 
                        as.data.frame() %>% 
                        dplyr::inner_join(leafcutter_list[[i]]$significant_clusters_0.05_filter %>% 
                                            dplyr::mutate(cluster_id = str_c(chr, ":", cluster)) %>% 
                                            # Add dPSI filter
                                            dplyr::filter(abs(deltapsi) >= 0.1) %>% 
                                            dplyr::distinct(comparison, cluster_id)
                        ) %>% 
                        dplyr::group_by(junc_cat) %>% 
                        dplyr::summarise(significant = n())) %>% 
    dplyr::mutate(proportion = significant/all) %>% 
    dplyr::arrange(-all) %>% 
    print()
  
}## [1] "covar"
## # A tibble: 7 x 4
##   junc_cat           all significant proportion
##   <chr>            <int>       <int>      <dbl>
## 1 annotated       477838       14312    0.0300 
## 2 novel_acceptor   87873        2265    0.0258 
## 3 novel_donor      58225        1868    0.0321 
## 4 novel_exon_skip  46870        1534    0.0327 
## 5 none             31621         379    0.0120 
## 6 ambig_gene       14078          28    0.00199
## 7 novel_combo       6638         398    0.0600 
## [1] "PC"
## # A tibble: 7 x 4
##   junc_cat           all significant proportion
##   <chr>            <int>       <int>      <dbl>
## 1 annotated       477818       17856    0.0374 
## 2 novel_acceptor   87868        3111    0.0354 
## 3 novel_donor      58222        2373    0.0408 
## 4 novel_exon_skip  46870        1950    0.0416 
## 5 none             31619         491    0.0155 
## 6 ambig_gene       14077          21    0.00149
## 7 novel_combo       6637         431    0.0649For simplicity, from this point on only results corrected with PC axes will be used.
leafcutter_ds.R annotates intron clusters correctly using the inputted exon file (Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz).# All leafcutter-annotated genes also found in annotated lists?
(leafcutter_list$PC$significant_clusters_0.05_filter$genes %>% unique()) %in% (annotated$PC$gene_name_junc %>% unlist() %>% unique()) %>% 
  all()## [1] TRUEAim: to identify and visualise diffential splicing events that overlap between comparisons
# All successfully tested introns
success <- annotated$PC %>% 
  as.data.frame() %>% 
  dplyr::inner_join(leafcutter_list$PC$cluster_significance %>% 
                      dplyr::filter(status == "Success") %>% 
                      dplyr::distinct(comparison, cluster) %>% 
                      dplyr::rename(cluster_id = cluster))
# Significant ds events (|dPSI| >= 0.1)
significant <- annotated$PC %>% 
                      as.data.frame() %>% 
                      dplyr::inner_join(leafcutter_list$PC$significant_clusters_0.05_filter %>% 
                                            dplyr::mutate(cluster_id = str_c(chr, ":", cluster)) %>% 
                                            # Add dPSI filter
                                            dplyr::filter(abs(deltapsi) >= 0.1) %>% 
                                            dplyr::distinct(comparison, cluster_id)
                        )
# Initiate lists
intron_list <- list(success, significant)
plot_list <- vector(mode = "list", 2)
for(i in 1:length(intron_list)){
  
  count <- intron_list[[i]] %>% 
    dplyr::filter(!junc_cat == "ambig_gene") %>% 
    dplyr::group_by(comparison, junc_cat) %>% 
    dplyr::summarise(n = n()) %>% 
    dplyr::mutate(prop = n/sum(n)) %>% 
    dplyr::ungroup()
  
  plot_list[[i]] <- count %>% 
    dplyr::mutate(junc_cat = junc_cat %>% factor(levels = rev(c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))),
                comparison = comparison %>% factor(levels = count %>% 
                                                                   dplyr::filter(junc_cat == "annotated") %>% 
                                                                   dplyr::arrange(desc(prop)) %>% 
                                                                   .[["comparison"]])) %>% 
  ggplot(aes(x = comparison, y = prop, fill = junc_cat), colour = "black") +
  geom_col() +
  labs(x = "") +
  scale_fill_manual(name = "Acceptor/donor annotation", 
                    values = rev(c("#3C5488", "#E64B35", "#00A087", "#4DBBD5", "#7E6148", "grey"))) +
  theme_rhr
  
}
ggarrange(plotlist = plot_list,
          labels = c("a", "b"),
          common.legend = TRUE, legend = "top")Figure 3.1: Proportion of (a) all successfully tested introns and (b) the subset of differentially spliced introns coloured by annotation type. Comparisons ordered by proportion of annotated introns.
# saveRDS(intron_list, here::here("workflows", "figures", "intron_list.Rds"))n_unique_int <- significant %>% 
  dplyr::filter(!junc_cat == "ambig_gene") %>% 
  dplyr::group_by_at(vars(!!!c("seqnames", "start", "end", "cluster_id"))) %>%
  dplyr::filter(n() == 1) %>%
  nrow()
n_total_int <- significant %>% 
  dplyr::filter(!junc_cat == "ambig_gene") %>%
  dplyr::distinct(seqnames, start, end, cluster_id) %>% 
  nrow()
(n_unique_int/n_total_int) *100## [1] 78.91089Figure 3.2: Figure: Number of differentially spliced introns across overlaps.
Figure 3.3: Figure: Proportion of differentially spliced introns per overlap across annotation types and comparisons. Proportions were calculated per comparison, by dividing the number of distinct events in each annotation type by the total number across a comparison.
cluster_annot_count <- significant %>% 
  dplyr::group_by(comparison, cluster_id) %>% 
  dplyr::summarise(introns_all_annotated = all(junc_in_ref)) %>% 
  dplyr::group_by(comparison, introns_all_annotated) %>% 
  dplyr::summarise(n = n()) %>% 
  dplyr::mutate(prop = n/sum(n)) %>% 
  dplyr::ungroup()
cluster_annot_count %>% 
  dplyr::mutate(comparison = factor(comparison, cluster_annot_count %>% 
                                      dplyr::filter(introns_all_annotated == TRUE) %>% 
                                      dplyr::arrange(-prop) %>% 
                                      .[["comparison"]])) %>% 
  ggplot(aes(x = comparison, y = prop, fill = introns_all_annotated)) +
  labs(y = "Proportion of clusters") +
  geom_col() +
  theme_rhrFigure 3.4: Proportion of differentially spliced clusters containing introns that are all fully annotated. Comparisons are ordered by the proportion of TRUE values.
Aim: to assign relevance to differential splicing events
# Read in PD genes
PD_genes <- 
  setNames(vector(mode = "list", length = 2),
                     c("mendelian", "sporadic"))
PD_genes$mendelian <- 
  read_delim(
    file.path(
      path_to_raw_data,
      "misc/PD_risk_genes/20191128_GE_ParkinsonDiseaseComplexParkinsonism_greengenes.tsv"
    ),
    delim = "\t"
  ) %>% 
  dplyr::rename(Gene = `Gene Symbol`)
PD_genes$sporadic <- 
  read_excel(
    file.path(
      path_to_raw_data,
      "misc/PD_risk_genes/TableS6_Complete_summary_statistics_for_QTL_Mendelian_randomization.xlsx"
    )
  ) %>% 
  dplyr::filter(`Pass Bonferroni` == "pass")
# Note that the following genes in the mendelian list are also present within the sporadic MR list
PD_genes$mendelian$Gene[PD_genes$mendelian$Gene %in% PD_genes$sporadic$Gene]## [1] "GCH1"  "GRN"   "LRRK2" "MAPT"  "SNCA"# Differentially spliced genes overlapping PD genes
PD_genes %>% 
  lapply(., function(x){
    
    significant %>%  
      dplyr::filter(gene_name_junc %in% x$Gene) %>% 
      .[["gene_name_junc"]] %>% 
      unlist() %>% 
      unique() %>% 
      sort()
    
  })## $mendelian
##  [1] "ATXN1"    "ATXN2"    "ATXN3"    "C19orf12" "DCTN1"    "DNAJC6"  
##  [7] "FBXO7"    "LYST"     "PLA2G6"   "SPG11"    "SYNJ1"    "VPS13A"  
## 
## $sporadic
##  [1] "AMPD3"    "ARIH2"    "BST1"     "CAMK2D"   "CYLD"     "DCAF16"  
##  [7] "DGKQ"     "EIF3C"    "ELOVL7"   "FBRSL1"   "FDFT1"    "GAK"     
## [13] "GBAP1"    "GIN1"     "GPNMB"    "IGSF9B"   "IP6K2"    "ITGA8"   
## [19] "ITPKB"    "MCCC1"    "NAGLU"    "NEK1"     "PAM"      "PGS1"    
## [25] "PPIP5K2"  "QRICH1"   "RPS6KL1"  "SH2B1"    "STX4"     "SULT1A1" 
## [31] "TMEM175"  "ZNF192P1" "ZNF514"leafcutter_list$PC$significant_clusters_0.05_filter %>% 
  dplyr::filter(genes %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>% 
  dplyr::distinct(genes, comparison, cluster) %>% 
  dplyr::arrange(genes) %>% 
  DT::datatable(rownames = FALSE,
                options = list(scrollX = TRUE),
                class = 'white-space: nowrap')Figure 4.1: Figure: Number of differentially spliced introns overlapping (a) genes implicated in mendelian and (b) sporafic forms of PD. Differentially spliced introns have been grouped by the number of comparisons they overlap.
plot_list <- vector(mode = "list", length = 2)
for(i in 1:length(PD_genes)){
  
  count <- significant %>% 
    dplyr::filter(gene_name_junc %in% PD_genes[[i]]$Gene) %>% 
    dplyr::group_by(comparison, cluster_id) %>% 
    dplyr::summarise(introns_all_annotated = all(junc_in_ref)) %>% 
    dplyr::group_by(comparison, introns_all_annotated) %>% 
    dplyr::summarise(n = n()) %>% 
    dplyr::mutate(prop = n/sum(n)) %>% 
    dplyr::ungroup()
  
  plot_list[[i]] <- count %>% 
    # dplyr::mutate(comparison = factor(comparison, count %>% 
    #                                   dplyr::filter(introns_all_annotated == TRUE) %>% 
    #                                   dplyr::arrange(-prop) %>% 
    #                                   .[["comparison"]])) %>% 
    ggplot(aes(x = comparison, y = n, fill = introns_all_annotated)) +
    labs(y = "Number of genes") +
    geom_col() +
    theme_rhr
  
}
ggarrange(plotlist = plot_list, 
          labels = c("a", "b"),
          common.legend = TRUE,
          legend = "top")Figure 4.2: Number of (a) genes implicated in mendelian PD and (b) sporadic PD that contain only annotated introns.
# Load uniprot features
source(file = here::here("R", "extract_uniprot_features.R"))
uniprot <-
  create_granges_uniprot(
    tracks = c("Domain_and_Sites", "Molecule_Processing", "PTM", "Structural_Features", "Topology"),
    by = "category",
    ncores = 12
  )# Load all annotated introns (annotated in here::here("workflows", "validation", cluster_validation_btwn_datasets.Rmd")
clusters <- 
  readRDS(
  file.path(
    path_to_results, "leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
    )
  )
# Proportion of DS introns that overlap a protein feature
uniprot_features <-
  overlap_granges_with_uniprot(
    junc_metadata = clusters[mcols(clusters)[, "cluster_id"] %in% unique(intron_list[[2]]$cluster_id %>% str_remove("chr"))],
    uniprot = uniprot
  )
uniprot_overlap_df <-
  intron_list[[2]] %>%
  dplyr::rename(intron_chr = seqnames, intron_start = start, intron_end = end) %>%
  dplyr::mutate(cluster_id = str_remove(cluster_id, "chr")) %>%
  dplyr::left_join(uniprot_features %>%
    dplyr::select(contains("feature"), contains("intron"), cluster_id) %>%
    dplyr::ungroup() %>%
    # # Remove chains as these typically signify an entire protein, as opposed to specific sections of the protein
    # dplyr::filter(feature_category != "chain") %>%
    dplyr::group_by(intron_chr, intron_start, intron_end, cluster_id) %>%
    dplyr::summarise(
      feature_category = str_c(feature_category, collapse = " / "),
      feature_name = str_c(feature_name, collapse = " / ")
    ),
  by = c("intron_chr", "intron_start", "intron_end", "cluster_id")
  ) %>%
  dplyr::mutate(overlapping_protein_feature = case_when(
    is.na(feature_name) ~ FALSE,
    TRUE ~ TRUE
  )) %>%
  dplyr::group_by(comparison, junc_cat, overlapping_protein_feature) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(junc_cat != "ambig_gene") %>%
  dplyr::mutate(prop = n / sum(n) * 100) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(junc_cat = junc_cat %>%
    str_replace("novel_combo", "novel_combination") %>%
    str_replace_all("_", " ") %>%
    str_to_sentence() %>%
    fct_relevel(., c("Annotated", "Novel exon skip", "Novel combination", "Novel acceptor", "Novel donor", "None")))
uniprot_overlap_df %>%
  dplyr::filter(overlapping_protein_feature == TRUE) %>%
  ggplot(aes(x = fct_reorder(.f = junc_cat, .x = prop, .fun = median, .desc = T), y = prop, fill = junc_cat)) +
  geom_boxplot(outlier.alpha = 0) +
  ggbeeswarm::geom_beeswarm(priority = "density", shape = 21, alpha = 0.5) +
  labs(x = "", y = "Proportion of DS introns (within a cateogory)\noverlapping a UniProt protein feature (% per comparison)", fill = "Splicing event") +
  scale_fill_manual(values = c("#3C5488", "#4DBBD5", "#F3E361", "#E64B35", "#00A087", "grey")) +
  guides(fill = guide_legend(nrow = 1)) +
  coord_cartesian(ylim = c(0, 100)) +
  theme_rhr +
  theme(
    legend.position = "none",
    legend.key.size = unit(0.4, "cm"),
    axis.text.x = element_text(angle = 45, vjust = 1)
  )Figure 4.3: Number of differentially spliced introns that overlap at least one UniProt protein feature as a proportion of the total number of differentially spliced introns within a category of splicing event. UniProt protein features included domains and sites, molecule processing, post-translational modifications, structural features and topological features. Proportions were calculated separately for each pairwise comparison.
sessionInfo()## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dasper_0.0.0.9000    testthat_2.3.2       rtracklayer_1.46.0  
##  [4] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1  IRanges_2.20.2      
##  [7] S4Vectors_0.24.4     BiocGenerics_0.32.0  readxl_1.3.1        
## [10] forcats_0.5.1        stringr_1.4.0        dplyr_1.0.2         
## [13] purrr_0.3.4          readr_1.4.0          tidyr_1.1.1         
## [16] tibble_3.0.3         tidyverse_1.3.0      ggsci_2.9           
## [19] ggpubr_0.4.0         ggplot2_3.3.2       
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.8             Hmisc_4.4-1                
##   [3] qdapTools_1.3.5             BiocFileCache_1.10.2       
##   [5] lazyeval_0.2.2              splines_3.6.1              
##   [7] BiocParallel_1.20.1         crosstalk_1.1.0.1          
##   [9] usethis_1.6.3               digest_0.6.27              
##  [11] ensembldb_2.10.2            htmltools_0.5.1.1          
##  [13] fansi_0.4.2                 checkmate_2.0.0            
##  [15] magrittr_2.0.1              memoise_2.0.0              
##  [17] BSgenome_1.54.0             cluster_2.1.0              
##  [19] maser_1.4.0                 openxlsx_4.2.3             
##  [21] remotes_2.2.0               Biostrings_2.54.0          
##  [23] modelr_0.1.8                matrixStats_0.56.0         
##  [25] askpass_1.1                 prettyunits_1.1.1          
##  [27] jpeg_0.1-8.1                colorspace_2.0-0           
##  [29] rappdirs_0.3.1              blob_1.2.1                 
##  [31] rvest_0.3.6                 haven_2.3.1                
##  [33] xfun_0.16                   callr_3.5.1                
##  [35] crayon_1.4.1                RCurl_1.98-1.2             
##  [37] jsonlite_1.7.1              VariantAnnotation_1.32.0   
##  [39] survival_3.2-7              glue_1.4.2                 
##  [41] gtable_0.3.0                zlibbioc_1.32.0            
##  [43] XVector_0.26.0              DelayedArray_0.12.3        
##  [45] car_3.0-9                   pkgbuild_1.1.0             
##  [47] abind_1.4-5                 scales_1.1.1               
##  [49] DBI_1.1.1                   rstatix_0.6.0              
##  [51] Rcpp_1.0.5                  htmlTable_2.0.1            
##  [53] progress_1.2.2              foreign_0.8-72             
##  [55] bit_4.0.4                   Formula_1.2-4              
##  [57] DT_0.15                     htmlwidgets_1.5.3          
##  [59] httr_1.4.2                  RColorBrewer_1.1-2         
##  [61] ellipsis_0.3.1              pkgconfig_2.0.3            
##  [63] XML_3.99-0.3                farver_2.0.3               
##  [65] nnet_7.3-12                 Gviz_1.30.3                
##  [67] dbplyr_1.4.4                utf8_1.1.4                 
##  [69] here_1.0.0                  tidyselect_1.1.0           
##  [71] labeling_0.4.2              rlang_0.4.7                
##  [73] AnnotationDbi_1.48.0        munsell_0.5.0              
##  [75] cellranger_1.1.0            tools_3.6.1                
##  [77] cachem_1.0.3                cli_2.2.0.9000             
##  [79] generics_0.0.2              RSQLite_2.2.0              
##  [81] devtools_2.3.2              broom_0.7.0                
##  [83] evaluate_0.14               fastmap_1.0.1              
##  [85] yaml_2.2.1                  processx_3.4.5             
##  [87] knitr_1.29                  bit64_4.0.2                
##  [89] fs_1.5.0                    zip_2.1.0                  
##  [91] AnnotationFilter_1.10.0     refGenome_1.7.7            
##  [93] xml2_1.3.2                  biomaRt_2.42.1             
##  [95] doBy_4.6.7                  compiler_3.6.1             
##  [97] rstudioapi_0.13             beeswarm_0.2.3             
##  [99] png_0.1-7                   curl_4.3                   
## [101] ggsignif_0.6.0              reprex_1.0.0               
## [103] stringi_1.5.3               highr_0.8                  
## [105] ps_1.3.4                    GenomicFeatures_1.38.2     
## [107] desc_1.2.0                  lattice_0.20-38            
## [109] ProtGenerics_1.18.0         Matrix_1.2-17              
## [111] vctrs_0.3.2                 pillar_1.4.6               
## [113] lifecycle_0.2.0             data.table_1.13.0          
## [115] cowplot_1.0.0               bitops_1.0-6               
## [117] latticeExtra_0.6-29         R6_2.5.0                   
## [119] bookdown_0.21               gridExtra_2.3              
## [121] rio_0.5.16                  vipor_0.4.5                
## [123] sessioninfo_1.1.1           dichromat_2.0-0            
## [125] MASS_7.3-51.4               assertthat_0.2.1           
## [127] pkgload_1.1.0               chron_2.3-56               
## [129] SummarizedExperiment_1.16.1 openssl_1.4.2              
## [131] rprojroot_2.0.2             withr_2.2.0                
## [133] GenomicAlignments_1.22.1    Rsamtools_2.2.3            
## [135] Deriv_4.0                   GenomeInfoDbData_1.2.2     
## [137] hms_1.0.0                   rpart_4.1-15               
## [139] grid_3.6.1                  rmarkdown_2.5              
## [141] carData_3.0-4               biovizBase_1.34.1          
## [143] base64enc_0.1-3             Biobase_2.46.0             
## [145] lubridate_1.7.9             ggbeeswarm_0.6.0