Aims: to annotate introns within leafcutter clusters by .gtf exon definitions; to identify and visualise diffential splicing events that overlap between comparisons and to assign relevance to differential splicing events.
source(here::here("R", "file_paths.R"))
annotate_junc_ref()
from David's dasper package. This function has since been deprecated and replaced by junction_annot
. Both functions perform the same task, but junction_annot
has been made to run faster.annotate_junc_ref()
takes as input the junction co-ordinates and annotates the start and end position based on precise overlap with a known exon boundary. Then using reference annotation, infers the strand of junction and categorises junctions in "annotated", "novel_acceptor", "novel_donor", "novel_combo", novel_exon_skip", "ambig_gene" & "none".annotated <- setNames(vector(mode = "list", 2),
c("covar", "PC"))
annotated <- leafcutter_list %>%
lapply(., function(x){
# Filter defined introns by those successfully tested
x$intron_usage %>%
tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":", remove = F) %>%
dplyr::mutate(cluster_id = str_c(chr, ":", cluster_id)) %>%
dplyr::filter(cluster_id %in% c(x$cluster_significance %>%
dplyr::filter(status == "Success") %>%
.[["cluster"]] %>%
unique())) %>%
dplyr::select(-chr, -start, -end) %>%
# Convert leafcutter output to input for annotate_junc_ref
convert_leafcutter(leafcutter_results = .,
use_strand = TRUE) %>%
# Annotate introns
dasper::annotate_junc_ref(junc_metadata = .,
gtf = path_to_ref_gtf)
})
saveRDS(
annotated$covar,
file =
file.path(path_to_results,
"leafcutter/diff_splicing/leafcutter_ds_SexRINAoD_gtfannotated.Rds"
)
)
saveRDS(
annotated$PC,
file =
file.path(path_to_results,
"leafcutter/diff_splicing_PCaxes/leafcutter_ds_PCaxes_gtfannotated.Rds"
)
)
## [1] "covar"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477838 28265 0.0592
## 2 novel_acceptor 87873 4910 0.0559
## 3 novel_donor 58225 3746 0.0643
## 4 novel_exon_skip 46870 3623 0.0773
## 5 none 31621 583 0.0184
## 6 ambig_gene 14078 53 0.00376
## 7 novel_combo 6638 716 0.108
## [1] "PC"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477818 29985 0.0628
## 2 novel_acceptor 87868 5791 0.0659
## 3 novel_donor 58222 4290 0.0737
## 4 novel_exon_skip 46870 3721 0.0794
## 5 none 31619 799 0.0253
## 6 ambig_gene 14077 41 0.00291
## 7 novel_combo 6637 699 0.105
# Significant ds events
for(i in 1:length(annotated)){
print(names(annotated)[i])
annotated[[i]] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(all = n()) %>%
dplyr::inner_join(annotated[[i]] %>%
as.data.frame() %>%
dplyr::inner_join(leafcutter_list[[i]]$significant_clusters_0.05_filter %>%
dplyr::mutate(cluster_id = str_c(chr, ":", cluster)) %>%
# Add dPSI filter
dplyr::filter(abs(deltapsi) >= 0.1) %>%
dplyr::distinct(comparison, cluster_id)
) %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(significant = n())) %>%
dplyr::mutate(proportion = significant/all) %>%
dplyr::arrange(-all) %>%
print()
}
## [1] "covar"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477838 14312 0.0300
## 2 novel_acceptor 87873 2265 0.0258
## 3 novel_donor 58225 1868 0.0321
## 4 novel_exon_skip 46870 1534 0.0327
## 5 none 31621 379 0.0120
## 6 ambig_gene 14078 28 0.00199
## 7 novel_combo 6638 398 0.0600
## [1] "PC"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477818 17856 0.0374
## 2 novel_acceptor 87868 3111 0.0354
## 3 novel_donor 58222 2373 0.0408
## 4 novel_exon_skip 46870 1950 0.0416
## 5 none 31619 491 0.0155
## 6 ambig_gene 14077 21 0.00149
## 7 novel_combo 6637 431 0.0649
For simplicity, from this point on only results corrected with PC axes will be used.
leafcutter_ds.R
annotates intron clusters correctly using the inputted exon file (Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz
).# All leafcutter-annotated genes also found in annotated lists?
(leafcutter_list$PC$significant_clusters_0.05_filter$genes %>% unique()) %in% (annotated$PC$gene_name_junc %>% unlist() %>% unique()) %>%
all()
## [1] TRUE
Aim: to identify and visualise diffential splicing events that overlap between comparisons
# All successfully tested introns
success <- annotated$PC %>%
as.data.frame() %>%
dplyr::inner_join(leafcutter_list$PC$cluster_significance %>%
dplyr::filter(status == "Success") %>%
dplyr::distinct(comparison, cluster) %>%
dplyr::rename(cluster_id = cluster))
# Significant ds events (|dPSI| >= 0.1)
significant <- annotated$PC %>%
as.data.frame() %>%
dplyr::inner_join(leafcutter_list$PC$significant_clusters_0.05_filter %>%
dplyr::mutate(cluster_id = str_c(chr, ":", cluster)) %>%
# Add dPSI filter
dplyr::filter(abs(deltapsi) >= 0.1) %>%
dplyr::distinct(comparison, cluster_id)
)
# Initiate lists
intron_list <- list(success, significant)
plot_list <- vector(mode = "list", 2)
for(i in 1:length(intron_list)){
count <- intron_list[[i]] %>%
dplyr::filter(!junc_cat == "ambig_gene") %>%
dplyr::group_by(comparison, junc_cat) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::ungroup()
plot_list[[i]] <- count %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = rev(c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))),
comparison = comparison %>% factor(levels = count %>%
dplyr::filter(junc_cat == "annotated") %>%
dplyr::arrange(desc(prop)) %>%
.[["comparison"]])) %>%
ggplot(aes(x = comparison, y = prop, fill = junc_cat), colour = "black") +
geom_col() +
labs(x = "") +
scale_fill_manual(name = "Acceptor/donor annotation",
values = rev(c("#3C5488", "#E64B35", "#00A087", "#4DBBD5", "#7E6148", "grey"))) +
theme_rhr
}
ggarrange(plotlist = plot_list,
labels = c("a", "b"),
common.legend = TRUE, legend = "top")
Figure 3.1: Proportion of (a) all successfully tested introns and (b) the subset of differentially spliced introns coloured by annotation type. Comparisons ordered by proportion of annotated introns.
# saveRDS(intron_list, here::here("workflows", "figures", "intron_list.Rds"))
n_unique_int <- significant %>%
dplyr::filter(!junc_cat == "ambig_gene") %>%
dplyr::group_by_at(vars(!!!c("seqnames", "start", "end", "cluster_id"))) %>%
dplyr::filter(n() == 1) %>%
nrow()
n_total_int <- significant %>%
dplyr::filter(!junc_cat == "ambig_gene") %>%
dplyr::distinct(seqnames, start, end, cluster_id) %>%
nrow()
(n_unique_int/n_total_int) *100
## [1] 78.91089
Figure 3.2: Figure: Number of differentially spliced introns across overlaps.
Figure 3.3: Figure: Proportion of differentially spliced introns per overlap across annotation types and comparisons. Proportions were calculated per comparison, by dividing the number of distinct events in each annotation type by the total number across a comparison.
cluster_annot_count <- significant %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(introns_all_annotated = all(junc_in_ref)) %>%
dplyr::group_by(comparison, introns_all_annotated) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::ungroup()
cluster_annot_count %>%
dplyr::mutate(comparison = factor(comparison, cluster_annot_count %>%
dplyr::filter(introns_all_annotated == TRUE) %>%
dplyr::arrange(-prop) %>%
.[["comparison"]])) %>%
ggplot(aes(x = comparison, y = prop, fill = introns_all_annotated)) +
labs(y = "Proportion of clusters") +
geom_col() +
theme_rhr
Figure 3.4: Proportion of differentially spliced clusters containing introns that are all fully annotated. Comparisons are ordered by the proportion of TRUE values.
Aim: to assign relevance to differential splicing events
# Read in PD genes
PD_genes <-
setNames(vector(mode = "list", length = 2),
c("mendelian", "sporadic"))
PD_genes$mendelian <-
read_delim(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/20191128_GE_ParkinsonDiseaseComplexParkinsonism_greengenes.tsv"
),
delim = "\t"
) %>%
dplyr::rename(Gene = `Gene Symbol`)
PD_genes$sporadic <-
read_excel(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/TableS6_Complete_summary_statistics_for_QTL_Mendelian_randomization.xlsx"
)
) %>%
dplyr::filter(`Pass Bonferroni` == "pass")
# Note that the following genes in the mendelian list are also present within the sporadic MR list
PD_genes$mendelian$Gene[PD_genes$mendelian$Gene %in% PD_genes$sporadic$Gene]
## [1] "GCH1" "GRN" "LRRK2" "MAPT" "SNCA"
# Differentially spliced genes overlapping PD genes
PD_genes %>%
lapply(., function(x){
significant %>%
dplyr::filter(gene_name_junc %in% x$Gene) %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
sort()
})
## $mendelian
## [1] "ATXN1" "ATXN2" "ATXN3" "C19orf12" "DCTN1" "DNAJC6"
## [7] "FBXO7" "LYST" "PLA2G6" "SPG11" "SYNJ1" "VPS13A"
##
## $sporadic
## [1] "AMPD3" "ARIH2" "BST1" "CAMK2D" "CYLD" "DCAF16"
## [7] "DGKQ" "EIF3C" "ELOVL7" "FBRSL1" "FDFT1" "GAK"
## [13] "GBAP1" "GIN1" "GPNMB" "IGSF9B" "IP6K2" "ITGA8"
## [19] "ITPKB" "MCCC1" "NAGLU" "NEK1" "PAM" "PGS1"
## [25] "PPIP5K2" "QRICH1" "RPS6KL1" "SH2B1" "STX4" "SULT1A1"
## [31] "TMEM175" "ZNF192P1" "ZNF514"
leafcutter_list$PC$significant_clusters_0.05_filter %>%
dplyr::filter(genes %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>%
dplyr::distinct(genes, comparison, cluster) %>%
dplyr::arrange(genes) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
Figure 4.1: Figure: Number of differentially spliced introns overlapping (a) genes implicated in mendelian and (b) sporafic forms of PD. Differentially spliced introns have been grouped by the number of comparisons they overlap.
plot_list <- vector(mode = "list", length = 2)
for(i in 1:length(PD_genes)){
count <- significant %>%
dplyr::filter(gene_name_junc %in% PD_genes[[i]]$Gene) %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(introns_all_annotated = all(junc_in_ref)) %>%
dplyr::group_by(comparison, introns_all_annotated) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::ungroup()
plot_list[[i]] <- count %>%
# dplyr::mutate(comparison = factor(comparison, count %>%
# dplyr::filter(introns_all_annotated == TRUE) %>%
# dplyr::arrange(-prop) %>%
# .[["comparison"]])) %>%
ggplot(aes(x = comparison, y = n, fill = introns_all_annotated)) +
labs(y = "Number of genes") +
geom_col() +
theme_rhr
}
ggarrange(plotlist = plot_list,
labels = c("a", "b"),
common.legend = TRUE,
legend = "top")
Figure 4.2: Number of (a) genes implicated in mendelian PD and (b) sporadic PD that contain only annotated introns.
# Load uniprot features
source(file = here::here("R", "extract_uniprot_features.R"))
uniprot <-
create_granges_uniprot(
tracks = c("Domain_and_Sites", "Molecule_Processing", "PTM", "Structural_Features", "Topology"),
by = "category",
ncores = 12
)
# Load all annotated introns (annotated in here::here("workflows", "validation", cluster_validation_btwn_datasets.Rmd")
clusters <-
readRDS(
file.path(
path_to_results, "leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
)
)
# Proportion of DS introns that overlap a protein feature
uniprot_features <-
overlap_granges_with_uniprot(
junc_metadata = clusters[mcols(clusters)[, "cluster_id"] %in% unique(intron_list[[2]]$cluster_id %>% str_remove("chr"))],
uniprot = uniprot
)
uniprot_overlap_df <-
intron_list[[2]] %>%
dplyr::rename(intron_chr = seqnames, intron_start = start, intron_end = end) %>%
dplyr::mutate(cluster_id = str_remove(cluster_id, "chr")) %>%
dplyr::left_join(uniprot_features %>%
dplyr::select(contains("feature"), contains("intron"), cluster_id) %>%
dplyr::ungroup() %>%
# # Remove chains as these typically signify an entire protein, as opposed to specific sections of the protein
# dplyr::filter(feature_category != "chain") %>%
dplyr::group_by(intron_chr, intron_start, intron_end, cluster_id) %>%
dplyr::summarise(
feature_category = str_c(feature_category, collapse = " / "),
feature_name = str_c(feature_name, collapse = " / ")
),
by = c("intron_chr", "intron_start", "intron_end", "cluster_id")
) %>%
dplyr::mutate(overlapping_protein_feature = case_when(
is.na(feature_name) ~ FALSE,
TRUE ~ TRUE
)) %>%
dplyr::group_by(comparison, junc_cat, overlapping_protein_feature) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(prop = n / sum(n) * 100) %>%
dplyr::ungroup() %>%
dplyr::mutate(junc_cat = junc_cat %>%
str_replace("novel_combo", "novel_combination") %>%
str_replace_all("_", " ") %>%
str_to_sentence() %>%
fct_relevel(., c("Annotated", "Novel exon skip", "Novel combination", "Novel acceptor", "Novel donor", "None")))
uniprot_overlap_df %>%
dplyr::filter(overlapping_protein_feature == TRUE) %>%
ggplot(aes(x = fct_reorder(.f = junc_cat, .x = prop, .fun = median, .desc = T), y = prop, fill = junc_cat)) +
geom_boxplot(outlier.alpha = 0) +
ggbeeswarm::geom_beeswarm(priority = "density", shape = 21, alpha = 0.5) +
labs(x = "", y = "Proportion of DS introns (within a cateogory)\noverlapping a UniProt protein feature (% per comparison)", fill = "Splicing event") +
scale_fill_manual(values = c("#3C5488", "#4DBBD5", "#F3E361", "#E64B35", "#00A087", "grey")) +
guides(fill = guide_legend(nrow = 1)) +
coord_cartesian(ylim = c(0, 100)) +
theme_rhr +
theme(
legend.position = "none",
legend.key.size = unit(0.4, "cm"),
axis.text.x = element_text(angle = 45, vjust = 1)
)
Figure 4.3: Number of differentially spliced introns that overlap at least one UniProt protein feature as a proportion of the total number of differentially spliced introns within a category of splicing event. UniProt protein features included domains and sites, molecule processing, post-translational modifications, structural features and topological features. Proportions were calculated separately for each pairwise comparison.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dasper_0.0.0.9000 testthat_2.3.2 rtracklayer_1.46.0
## [4] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1 IRanges_2.20.2
## [7] S4Vectors_0.24.4 BiocGenerics_0.32.0 readxl_1.3.1
## [10] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.2
## [13] purrr_0.3.4 readr_1.4.0 tidyr_1.1.1
## [16] tibble_3.0.3 tidyverse_1.3.0 ggsci_2.9
## [19] ggpubr_0.4.0 ggplot2_3.3.2
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.8 Hmisc_4.4-1
## [3] qdapTools_1.3.5 BiocFileCache_1.10.2
## [5] lazyeval_0.2.2 splines_3.6.1
## [7] BiocParallel_1.20.1 crosstalk_1.1.0.1
## [9] usethis_1.6.3 digest_0.6.27
## [11] ensembldb_2.10.2 htmltools_0.5.1.1
## [13] fansi_0.4.2 checkmate_2.0.0
## [15] magrittr_2.0.1 memoise_2.0.0
## [17] BSgenome_1.54.0 cluster_2.1.0
## [19] maser_1.4.0 openxlsx_4.2.3
## [21] remotes_2.2.0 Biostrings_2.54.0
## [23] modelr_0.1.8 matrixStats_0.56.0
## [25] askpass_1.1 prettyunits_1.1.1
## [27] jpeg_0.1-8.1 colorspace_2.0-0
## [29] rappdirs_0.3.1 blob_1.2.1
## [31] rvest_0.3.6 haven_2.3.1
## [33] xfun_0.16 callr_3.5.1
## [35] crayon_1.4.1 RCurl_1.98-1.2
## [37] jsonlite_1.7.1 VariantAnnotation_1.32.0
## [39] survival_3.2-7 glue_1.4.2
## [41] gtable_0.3.0 zlibbioc_1.32.0
## [43] XVector_0.26.0 DelayedArray_0.12.3
## [45] car_3.0-9 pkgbuild_1.1.0
## [47] abind_1.4-5 scales_1.1.1
## [49] DBI_1.1.1 rstatix_0.6.0
## [51] Rcpp_1.0.5 htmlTable_2.0.1
## [53] progress_1.2.2 foreign_0.8-72
## [55] bit_4.0.4 Formula_1.2-4
## [57] DT_0.15 htmlwidgets_1.5.3
## [59] httr_1.4.2 RColorBrewer_1.1-2
## [61] ellipsis_0.3.1 pkgconfig_2.0.3
## [63] XML_3.99-0.3 farver_2.0.3
## [65] nnet_7.3-12 Gviz_1.30.3
## [67] dbplyr_1.4.4 utf8_1.1.4
## [69] here_1.0.0 tidyselect_1.1.0
## [71] labeling_0.4.2 rlang_0.4.7
## [73] AnnotationDbi_1.48.0 munsell_0.5.0
## [75] cellranger_1.1.0 tools_3.6.1
## [77] cachem_1.0.3 cli_2.2.0.9000
## [79] generics_0.0.2 RSQLite_2.2.0
## [81] devtools_2.3.2 broom_0.7.0
## [83] evaluate_0.14 fastmap_1.0.1
## [85] yaml_2.2.1 processx_3.4.5
## [87] knitr_1.29 bit64_4.0.2
## [89] fs_1.5.0 zip_2.1.0
## [91] AnnotationFilter_1.10.0 refGenome_1.7.7
## [93] xml2_1.3.2 biomaRt_2.42.1
## [95] doBy_4.6.7 compiler_3.6.1
## [97] rstudioapi_0.13 beeswarm_0.2.3
## [99] png_0.1-7 curl_4.3
## [101] ggsignif_0.6.0 reprex_1.0.0
## [103] stringi_1.5.3 highr_0.8
## [105] ps_1.3.4 GenomicFeatures_1.38.2
## [107] desc_1.2.0 lattice_0.20-38
## [109] ProtGenerics_1.18.0 Matrix_1.2-17
## [111] vctrs_0.3.2 pillar_1.4.6
## [113] lifecycle_0.2.0 data.table_1.13.0
## [115] cowplot_1.0.0 bitops_1.0-6
## [117] latticeExtra_0.6-29 R6_2.5.0
## [119] bookdown_0.21 gridExtra_2.3
## [121] rio_0.5.16 vipor_0.4.5
## [123] sessioninfo_1.1.1 dichromat_2.0-0
## [125] MASS_7.3-51.4 assertthat_0.2.1
## [127] pkgload_1.1.0 chron_2.3-56
## [129] SummarizedExperiment_1.16.1 openssl_1.4.2
## [131] rprojroot_2.0.2 withr_2.2.0
## [133] GenomicAlignments_1.22.1 Rsamtools_2.2.3
## [135] Deriv_4.0 GenomeInfoDbData_1.2.2
## [137] hms_1.0.0 rpart_4.1-15
## [139] grid_3.6.1 rmarkdown_2.5
## [141] carData_3.0-4 biovizBase_1.34.1
## [143] base64enc_0.1-3 Biobase_2.46.0
## [145] lubridate_1.7.9 ggbeeswarm_0.6.0