Aims: to annotate introns within leafcutter clusters by .gtf exon definitions; to identify and visualise diffential splicing events that overlap between comparisons and to assign relevance to differential splicing events.
source(here::here("R", "file_paths.R"))
annotate_junc_ref()
from David's dasper package. This function has since been deprecated and replaced by junction_annot
. Both functions perform the same task, but junction_annot
has been made to run faster.annotate_junc_ref()
takes as input the junction co-ordinates and annotates the start and end position based on precise overlap with a known exon boundary. Then using reference annotation, infers the strand of junction and categorises junctions in "annotated", "novel_acceptor", "novel_donor", "novel_combo", novel_exon_skip", "ambig_gene" & "none".annotated <- setNames(vector(mode = "list", 2),
c("covar", "PC"))
annotated <- leafcutter_list %>%
lapply(., function(x){
# Filter defined introns by those successfully tested
x$intron_usage %>%
tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":", remove = F) %>%
dplyr::mutate(cluster_id = str_c(chr, ":", cluster_id)) %>%
dplyr::filter(cluster_id %in% c(x$cluster_significance %>%
dplyr::filter(status == "Success") %>%
.[["cluster"]] %>%
unique())) %>%
dplyr::select(-chr, -start, -end) %>%
# Convert leafcutter output to input for annotate_junc_ref
convert_leafcutter(leafcutter_results = .,
use_strand = TRUE) %>%
# Annotate introns
dasper::annotate_junc_ref(junc_metadata = .,
gtf = path_to_ref_gtf)
})
saveRDS(
annotated$covar,
file =
file.path(path_to_results,
"leafcutter/diff_splicing/leafcutter_ds_SexRINAoD_gtfannotated.Rds"
)
)
saveRDS(
annotated$PC,
file =
file.path(path_to_results,
"leafcutter/diff_splicing_PCaxes/leafcutter_ds_PCaxes_gtfannotated.Rds"
)
)
## [1] "covar"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477838 28265 0.0592
## 2 novel_acceptor 87873 4910 0.0559
## 3 novel_donor 58225 3746 0.0643
## 4 novel_exon_skip 46870 3623 0.0773
## 5 none 31621 583 0.0184
## 6 ambig_gene 14078 53 0.00376
## 7 novel_combo 6638 716 0.108
## [1] "PC"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477818 29985 0.0628
## 2 novel_acceptor 87868 5791 0.0659
## 3 novel_donor 58222 4290 0.0737
## 4 novel_exon_skip 46870 3721 0.0794
## 5 none 31619 799 0.0253
## 6 ambig_gene 14077 41 0.00291
## 7 novel_combo 6637 699 0.105
# Significant ds events
for(i in 1:length(annotated)){
print(names(annotated)[i])
annotated[[i]] %>%
as.data.frame() %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(all = n()) %>%
dplyr::inner_join(annotated[[i]] %>%
as.data.frame() %>%
dplyr::inner_join(leafcutter_list[[i]]$significant_clusters_0.05_filter %>%
dplyr::mutate(cluster_id = str_c(chr, ":", cluster)) %>%
# Add dPSI filter
dplyr::filter(abs(deltapsi) >= 0.1) %>%
dplyr::distinct(comparison, cluster_id)
) %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(significant = n())) %>%
dplyr::mutate(proportion = significant/all) %>%
dplyr::arrange(-all) %>%
print()
}
## [1] "covar"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477838 14312 0.0300
## 2 novel_acceptor 87873 2265 0.0258
## 3 novel_donor 58225 1868 0.0321
## 4 novel_exon_skip 46870 1534 0.0327
## 5 none 31621 379 0.0120
## 6 ambig_gene 14078 28 0.00199
## 7 novel_combo 6638 398 0.0600
## [1] "PC"
## # A tibble: 7 x 4
## junc_cat all significant proportion
## <chr> <int> <int> <dbl>
## 1 annotated 477818 17856 0.0374
## 2 novel_acceptor 87868 3111 0.0354
## 3 novel_donor 58222 2373 0.0408
## 4 novel_exon_skip 46870 1950 0.0416
## 5 none 31619 491 0.0155
## 6 ambig_gene 14077 21 0.00149
## 7 novel_combo 6637 431 0.0649
For simplicity, from this point on only results corrected with PC axes will be used.
leafcutter_ds.R
annotates intron clusters correctly using the inputted exon file (Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz
).# All leafcutter-annotated genes also found in annotated lists?
(leafcutter_list$PC$significant_clusters_0.05_filter$genes %>% unique()) %in% (annotated$PC$gene_name_junc %>% unlist() %>% unique()) %>%
all()
## [1] TRUE
Aim: to identify and visualise diffential splicing events that overlap between comparisons
# All successfully tested introns
success <- annotated$PC %>%
as.data.frame() %>%
dplyr::inner_join(leafcutter_list$PC$cluster_significance %>%
dplyr::filter(status == "Success") %>%
dplyr::distinct(comparison, cluster) %>%
dplyr::rename(cluster_id = cluster))
# Significant ds events (|dPSI| >= 0.1)
significant <- annotated$PC %>%
as.data.frame() %>%
dplyr::inner_join(leafcutter_list$PC$significant_clusters_0.05_filter %>%
dplyr::mutate(cluster_id = str_c(chr, ":", cluster)) %>%
# Add dPSI filter
dplyr::filter(abs(deltapsi) >= 0.1) %>%
dplyr::distinct(comparison, cluster_id)
)
# Initiate lists
intron_list <- list(success, significant)
plot_list <- vector(mode = "list", 2)
for(i in 1:length(intron_list)){
count <- intron_list[[i]] %>%
dplyr::filter(!junc_cat == "ambig_gene") %>%
dplyr::group_by(comparison, junc_cat) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::ungroup()
plot_list[[i]] <- count %>%
dplyr::mutate(junc_cat = junc_cat %>% factor(levels = rev(c("annotated", "novel_acceptor", "novel_donor", "novel_exon_skip", "novel_combo", "none"))),
comparison = comparison %>% factor(levels = count %>%
dplyr::filter(junc_cat == "annotated") %>%
dplyr::arrange(desc(prop)) %>%
.[["comparison"]])) %>%
ggplot(aes(x = comparison, y = prop, fill = junc_cat), colour = "black") +
geom_col() +
labs(x = "") +
scale_fill_manual(name = "Acceptor/donor annotation",
values = rev(c("#3C5488", "#E64B35", "#00A087", "#4DBBD5", "#7E6148", "grey"))) +
theme_rhr
}
ggarrange(plotlist = plot_list,
labels = c("a", "b"),
common.legend = TRUE, legend = "top")
# saveRDS(intron_list, here::here("workflows", "figures", "intron_list.Rds"))
n_unique_int <- significant %>%
dplyr::filter(!junc_cat == "ambig_gene") %>%
dplyr::group_by_at(vars(!!!c("seqnames", "start", "end", "cluster_id"))) %>%
dplyr::filter(n() == 1) %>%
nrow()
n_total_int <- significant %>%
dplyr::filter(!junc_cat == "ambig_gene") %>%
dplyr::distinct(seqnames, start, end, cluster_id) %>%
nrow()
(n_unique_int/n_total_int) *100
## [1] 78.91089
cluster_annot_count <- significant %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(introns_all_annotated = all(junc_in_ref)) %>%
dplyr::group_by(comparison, introns_all_annotated) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::ungroup()
cluster_annot_count %>%
dplyr::mutate(comparison = factor(comparison, cluster_annot_count %>%
dplyr::filter(introns_all_annotated == TRUE) %>%
dplyr::arrange(-prop) %>%
.[["comparison"]])) %>%
ggplot(aes(x = comparison, y = prop, fill = introns_all_annotated)) +
labs(y = "Proportion of clusters") +
geom_col() +
theme_rhr
Aim: to assign relevance to differential splicing events
# Read in PD genes
PD_genes <-
setNames(vector(mode = "list", length = 2),
c("mendelian", "sporadic"))
PD_genes$mendelian <-
read_delim(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/20191128_GE_ParkinsonDiseaseComplexParkinsonism_greengenes.tsv"
),
delim = "\t"
) %>%
dplyr::rename(Gene = `Gene Symbol`)
PD_genes$sporadic <-
read_excel(
file.path(
path_to_raw_data,
"misc/PD_risk_genes/TableS6_Complete_summary_statistics_for_QTL_Mendelian_randomization.xlsx"
)
) %>%
dplyr::filter(`Pass Bonferroni` == "pass")
# Note that the following genes in the mendelian list are also present within the sporadic MR list
PD_genes$mendelian$Gene[PD_genes$mendelian$Gene %in% PD_genes$sporadic$Gene]
## [1] "GCH1" "GRN" "LRRK2" "MAPT" "SNCA"
# Differentially spliced genes overlapping PD genes
PD_genes %>%
lapply(., function(x){
significant %>%
dplyr::filter(gene_name_junc %in% x$Gene) %>%
.[["gene_name_junc"]] %>%
unlist() %>%
unique() %>%
sort()
})
## $mendelian
## [1] "ATXN1" "ATXN2" "ATXN3" "C19orf12" "DCTN1" "DNAJC6"
## [7] "FBXO7" "LYST" "PLA2G6" "SPG11" "SYNJ1" "VPS13A"
##
## $sporadic
## [1] "AMPD3" "ARIH2" "BST1" "CAMK2D" "CYLD" "DCAF16"
## [7] "DGKQ" "EIF3C" "ELOVL7" "FBRSL1" "FDFT1" "GAK"
## [13] "GBAP1" "GIN1" "GPNMB" "IGSF9B" "IP6K2" "ITGA8"
## [19] "ITPKB" "MCCC1" "NAGLU" "NEK1" "PAM" "PGS1"
## [25] "PPIP5K2" "QRICH1" "RPS6KL1" "SH2B1" "STX4" "SULT1A1"
## [31] "TMEM175" "ZNF192P1" "ZNF514"
leafcutter_list$PC$significant_clusters_0.05_filter %>%
dplyr::filter(genes %in% unique(c(PD_genes$mendelian$Gene, PD_genes$sporadic$Gene))) %>%
dplyr::distinct(genes, comparison, cluster) %>%
dplyr::arrange(genes) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
plot_list <- vector(mode = "list", length = 2)
for(i in 1:length(PD_genes)){
count <- significant %>%
dplyr::filter(gene_name_junc %in% PD_genes[[i]]$Gene) %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(introns_all_annotated = all(junc_in_ref)) %>%
dplyr::group_by(comparison, introns_all_annotated) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n/sum(n)) %>%
dplyr::ungroup()
plot_list[[i]] <- count %>%
# dplyr::mutate(comparison = factor(comparison, count %>%
# dplyr::filter(introns_all_annotated == TRUE) %>%
# dplyr::arrange(-prop) %>%
# .[["comparison"]])) %>%
ggplot(aes(x = comparison, y = n, fill = introns_all_annotated)) +
labs(y = "Number of genes") +
geom_col() +
theme_rhr
}
ggarrange(plotlist = plot_list,
labels = c("a", "b"),
common.legend = TRUE,
legend = "top")
# Load uniprot features
source(file = here::here("R", "extract_uniprot_features.R"))
uniprot <-
create_granges_uniprot(
tracks = c("Domain_and_Sites", "Molecule_Processing", "PTM", "Structural_Features", "Topology"),
by = "category",
ncores = 12
)
# Load all annotated introns (annotated in here::here("workflows", "validation", cluster_validation_btwn_datasets.Rmd")
clusters <-
readRDS(
file.path(
path_to_results, "leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
)
)
# Proportion of DS introns that overlap a protein feature
uniprot_features <-
overlap_granges_with_uniprot(
junc_metadata = clusters[mcols(clusters)[, "cluster_id"] %in% unique(intron_list[[2]]$cluster_id %>% str_remove("chr"))],
uniprot = uniprot
)
uniprot_overlap_df <-
intron_list[[2]] %>%
dplyr::rename(intron_chr = seqnames, intron_start = start, intron_end = end) %>%
dplyr::mutate(cluster_id = str_remove(cluster_id, "chr")) %>%
dplyr::left_join(uniprot_features %>%
dplyr::select(contains("feature"), contains("intron"), cluster_id) %>%
dplyr::ungroup() %>%
# # Remove chains as these typically signify an entire protein, as opposed to specific sections of the protein
# dplyr::filter(feature_category != "chain") %>%
dplyr::group_by(intron_chr, intron_start, intron_end, cluster_id) %>%
dplyr::summarise(
feature_category = str_c(feature_category, collapse = " / "),
feature_name = str_c(feature_name, collapse = " / ")
),
by = c("intron_chr", "intron_start", "intron_end", "cluster_id")
) %>%
dplyr::mutate(overlapping_protein_feature = case_when(
is.na(feature_name) ~ FALSE,
TRUE ~ TRUE
)) %>%
dplyr::group_by(comparison, junc_cat, overlapping_protein_feature) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(prop = n / sum(n) * 100) %>%
dplyr::ungroup() %>%
dplyr::mutate(junc_cat = junc_cat %>%
str_replace("novel_combo", "novel_combination") %>%
str_replace_all("_", " ") %>%
str_to_sentence() %>%
fct_relevel(., c("Annotated", "Novel exon skip", "Novel combination", "Novel acceptor", "Novel donor", "None")))
uniprot_overlap_df %>%
dplyr::filter(overlapping_protein_feature == TRUE) %>%
ggplot(aes(x = fct_reorder(.f = junc_cat, .x = prop, .fun = median, .desc = T), y = prop, fill = junc_cat)) +
geom_boxplot(outlier.alpha = 0) +
ggbeeswarm::geom_beeswarm(priority = "density", shape = 21, alpha = 0.5) +
labs(x = "", y = "Proportion of DS introns (within a cateogory)\noverlapping a UniProt protein feature (% per comparison)", fill = "Splicing event") +
scale_fill_manual(values = c("#3C5488", "#4DBBD5", "#F3E361", "#E64B35", "#00A087", "grey")) +
guides(fill = guide_legend(nrow = 1)) +
coord_cartesian(ylim = c(0, 100)) +
theme_rhr +
theme(
legend.position = "none",
legend.key.size = unit(0.4, "cm"),
axis.text.x = element_text(angle = 45, vjust = 1)
)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dasper_0.0.0.9000 testthat_2.3.2 rtracklayer_1.46.0
## [4] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1 IRanges_2.20.2
## [7] S4Vectors_0.24.4 BiocGenerics_0.32.0 readxl_1.3.1
## [10] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.2
## [13] purrr_0.3.4 readr_1.4.0 tidyr_1.1.1
## [16] tibble_3.0.3 tidyverse_1.3.0 ggsci_2.9
## [19] ggpubr_0.4.0 ggplot2_3.3.2
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.8 Hmisc_4.4-1
## [3] qdapTools_1.3.5 BiocFileCache_1.10.2
## [5] lazyeval_0.2.2 splines_3.6.1
## [7] BiocParallel_1.20.1 crosstalk_1.1.0.1
## [9] usethis_1.6.3 digest_0.6.27
## [11] ensembldb_2.10.2 htmltools_0.5.1.1
## [13] fansi_0.4.2 checkmate_2.0.0
## [15] magrittr_2.0.1 memoise_2.0.0
## [17] BSgenome_1.54.0 cluster_2.1.0
## [19] maser_1.4.0 openxlsx_4.2.3
## [21] remotes_2.2.0 Biostrings_2.54.0
## [23] modelr_0.1.8 matrixStats_0.56.0
## [25] askpass_1.1 prettyunits_1.1.1
## [27] jpeg_0.1-8.1 colorspace_2.0-0
## [29] rappdirs_0.3.1 blob_1.2.1
## [31] rvest_0.3.6 haven_2.3.1
## [33] xfun_0.16 callr_3.5.1
## [35] crayon_1.4.1 RCurl_1.98-1.2
## [37] jsonlite_1.7.1 VariantAnnotation_1.32.0
## [39] survival_3.2-7 glue_1.4.2
## [41] gtable_0.3.0 zlibbioc_1.32.0
## [43] XVector_0.26.0 DelayedArray_0.12.3
## [45] car_3.0-9 pkgbuild_1.1.0
## [47] abind_1.4-5 scales_1.1.1
## [49] DBI_1.1.1 rstatix_0.6.0
## [51] Rcpp_1.0.5 htmlTable_2.0.1
## [53] progress_1.2.2 foreign_0.8-72
## [55] bit_4.0.4 Formula_1.2-4
## [57] DT_0.15 htmlwidgets_1.5.3
## [59] httr_1.4.2 RColorBrewer_1.1-2
## [61] ellipsis_0.3.1 pkgconfig_2.0.3
## [63] XML_3.99-0.3 farver_2.0.3
## [65] nnet_7.3-12 Gviz_1.30.3
## [67] dbplyr_1.4.4 utf8_1.1.4
## [69] here_1.0.0 tidyselect_1.1.0
## [71] labeling_0.4.2 rlang_0.4.7
## [73] AnnotationDbi_1.48.0 munsell_0.5.0
## [75] cellranger_1.1.0 tools_3.6.1
## [77] cachem_1.0.3 cli_2.2.0.9000
## [79] generics_0.0.2 RSQLite_2.2.0
## [81] devtools_2.3.2 broom_0.7.0
## [83] evaluate_0.14 fastmap_1.0.1
## [85] yaml_2.2.1 processx_3.4.5
## [87] knitr_1.29 bit64_4.0.2
## [89] fs_1.5.0 zip_2.1.0
## [91] AnnotationFilter_1.10.0 refGenome_1.7.7
## [93] xml2_1.3.2 biomaRt_2.42.1
## [95] doBy_4.6.7 compiler_3.6.1
## [97] rstudioapi_0.13 beeswarm_0.2.3
## [99] png_0.1-7 curl_4.3
## [101] ggsignif_0.6.0 reprex_1.0.0
## [103] stringi_1.5.3 highr_0.8
## [105] ps_1.3.4 GenomicFeatures_1.38.2
## [107] desc_1.2.0 lattice_0.20-38
## [109] ProtGenerics_1.18.0 Matrix_1.2-17
## [111] vctrs_0.3.2 pillar_1.4.6
## [113] lifecycle_0.2.0 data.table_1.13.0
## [115] cowplot_1.0.0 bitops_1.0-6
## [117] latticeExtra_0.6-29 R6_2.5.0
## [119] bookdown_0.21 gridExtra_2.3
## [121] rio_0.5.16 vipor_0.4.5
## [123] sessioninfo_1.1.1 dichromat_2.0-0
## [125] MASS_7.3-51.4 assertthat_0.2.1
## [127] pkgload_1.1.0 chron_2.3-56
## [129] SummarizedExperiment_1.16.1 openssl_1.4.2
## [131] rprojroot_2.0.2 withr_2.2.0
## [133] GenomicAlignments_1.22.1 Rsamtools_2.2.3
## [135] Deriv_4.0 GenomeInfoDbData_1.2.2
## [137] hms_1.0.0 rpart_4.1-15
## [139] grid_3.6.1 rmarkdown_2.5
## [141] carData_3.0-4 biovizBase_1.34.1
## [143] base64enc_0.1-3 Biobase_2.46.0
## [145] lubridate_1.7.9 ggbeeswarm_0.6.0