Aim: determine whether the proportion of introns targeted by major/minor spliceosome differs between differentially spliced and non-differentially spliced introns across pairwise comparisons of disease to control
U2 <-
read_delim(file = "https://introndb.lerner.ccf.org/static/bed/GRCh38_U2.bed",
delim = "\t",
col_names = c("chr", "start", "stop", "intron_id", "score", "strand"),
col_types=
list(col_character(),
col_double(),
col_double(),
col_character(),
col_double(),
col_character())) %>%
dplyr::mutate(intron_type = "U2")
print(str_c("Number of U2 introns: ", nrow(U2)))
## [1] "Number of U2 introns: 221420"
U12 <-
read_delim(file = "https://introndb.lerner.ccf.org/static/bed/GRCh38_U12.bed",
delim = "\t",
col_names = c("chr", "start", "stop", "intron_id", "score", "strand"),
col_types =
list(col_character(),
col_double(),
col_double(),
col_character(),
col_double(),
col_character())) %>%
dplyr::mutate(intron_type = "U12")
print(str_c("Number of U12 introns: ", nrow(U2)))
## [1] "Number of U12 introns: 221420"
spliceosome_gr <-
U2 %>%
dplyr::bind_rows(U12) %>%
makeGRangesFromDataFrame(
keep.extra.columns = TRUE,
seqnames.field = "chr",
start.field = "start",
end.field = "stop",
strand.field = "strand"
) %>%
GenomeInfoDb::keepSeqlevels(c(1:22, "X", "Y"),
pruning.mode = "coarse")
# List with 2 dataframes: (1) all successfully tested and annotated introns and (2) significant and annotated introns
# Significance: FDR < 0.05, cluster contains >= 1 intron with |dPSI| >= 0.1
intron_list <-
readRDS(here::here("docs", "figures", "intron_list.Rds"))
# Add a column to dataframe with ds events signifying significance
# Left join this to the dataframe of all successfully tested events
# Replace NAs with FALSE
intron_ds <-
intron_list[[1]] %>%
dplyr::left_join(
intron_list[[2]] %>%
dplyr::mutate(diff_spliced = TRUE)
) %>%
dplyr::mutate(
diff_spliced = case_when(is.na(diff_spliced) ~ FALSE,
TRUE ~ diff_spliced)
)
## Joining, by = c("seqnames", "start", "end", "width", "strand", "comparison", "cluster_id", "logef", "Control", "DLB", "deltapsi", "PDD", "PD", "strand_start", "gene_name_start", "gene_id_start", "transcript_id_start", "exon_id_start", "strand_end", "gene_name_end", "gene_id_end", "transcript_id_end", "exon_id_end", "gene_name_junc", "gene_id_junc", "junc_in_ref", "junc_cat")
intron_ds_gr <-
intron_ds %>%
dplyr::select(-width) %>%
makeGRangesFromDataFrame(
keep.extra.columns = TRUE,
seqnames.field = "seqnames",
start.field = "start",
end.field = "end",
strand.field = "strand"
)
# Looking only for exactly overlapping introns
overlap <-
GenomicRanges::findOverlaps(
intron_ds_gr,
spliceosome_gr,
type = "equal"
) %>%
as_tibble()
overlap_df <-
intron_ds_gr %>%
as_tibble() %>%
dplyr::left_join(
intron_ds_gr %>%
as_tibble() %>%
dplyr::slice(overlap$queryHits) %>%
dplyr::bind_cols(
spliceosome_gr %>%
as_tibble() %>%
dplyr::select(intron_id, score, intron_type) %>%
dplyr::rename_with(
~ stringr::str_c("ioad", .x, sep = "_")
) %>%
dplyr::slice(overlap$subjectHits)
)
)
## Joining, by = c("seqnames", "start", "end", "width", "strand", "comparison", "cluster_id", "logef", "Control", "DLB", "deltapsi", "PDD", "PD", "strand_start", "gene_name_start", "gene_id_start", "transcript_id_start", "exon_id_start", "strand_end", "gene_name_end", "gene_id_end", "transcript_id_end", "exon_id_end", "gene_name_junc", "gene_id_junc", "junc_in_ref", "junc_cat", "diff_spliced")
overlap_df %>%
dplyr::filter(!junc_cat %in% c("ambig_gene")) %>%
dplyr::distinct(
comparison,
seqnames,
start,
end,
junc_cat,
ioad_intron_type
) %>%
ggplot(
aes(x = ioad_intron_type)
) +
geom_bar() +
geom_text(
stat='count',
aes(label=..count..),
vjust=-1
) +
labs(
x = "Intron type",
y = "Number of introns"
) +
facet_wrap(vars(comparison)) +
expand_limits(y = c(0, 100000)) +
theme_bw()
comparisons <- overlap_df$comparison %>% unique()
fig_list <-
vector(mode = "list",
length = length(comparisons))
for(i in 1:length(comparisons)){
fig_list[[i]] <-
overlap_df %>%
dplyr::filter(
!is.na(ioad_intron_type),
comparison == comparisons[i]
) %>%
ggplot(
aes(
x = diff_spliced,
fill = ioad_intron_type
)
) +
geom_bar(position = "fill", colour = "black") +
labs(
x = "Differentially spliced?",
y = "Proportion of introns annotated\nto each intron type (%)",
title = comparisons[i] %>%
stringr::str_replace_all("_", " ")
) +
scale_y_continuous(labels = scales::percent) +
ggforce::facet_zoom(ylim = c(0.99,1), zoom.size = 1) +
theme_bw()
}
ggpubr::ggarrange(
plotlist = fig_list,
ncol = 1,
common.legend = T
)
overlap_df %>%
dplyr::filter(!is.na(ioad_intron_type),
comparison %in% c("Control_vs_DLB", "PD_vs_DLB")) %>%
dplyr::select(comparison, diff_spliced, ioad_intron_type) %>%
dplyr::group_by(comparison) %>%
summarise(
p.value= chisq.test(diff_spliced, ioad_intron_type)$p.value,
method = chisq.test(diff_spliced, ioad_intron_type)$method
)
## `summarise()` ungrouping output (override with `.groups` argument)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] forcats_0.5.1 dplyr_1.0.2 purrr_0.3.4
## [4] readr_1.4.0 tidyr_1.1.1 tibble_3.0.3
## [7] tidyverse_1.3.0 stringr_1.4.0 GenomicRanges_1.38.0
## [10] GenomeInfoDb_1.22.1 IRanges_2.20.2 S4Vectors_0.24.4
## [13] BiocGenerics_0.32.0 ggforce_0.3.2 ggplot2_3.3.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 fs_1.5.0 lubridate_1.7.9
## [4] httr_1.4.2 rprojroot_2.0.2 tools_3.6.1
## [7] backports_1.1.8 R6_2.5.0 DBI_1.1.1
## [10] colorspace_2.0-0 withr_2.2.0 gridExtra_2.3
## [13] tidyselect_1.1.0 curl_4.3 compiler_3.6.1
## [16] cli_2.2.0.9000 rvest_0.3.6 xml2_1.3.2
## [19] labeling_0.4.2 bookdown_0.21 scales_1.1.1
## [22] digest_0.6.27 foreign_0.8-72 rmarkdown_2.5
## [25] rio_0.5.16 XVector_0.26.0 pkgconfig_2.0.3
## [28] htmltools_0.5.1.1 dbplyr_1.4.4 highr_0.8
## [31] rlang_0.4.7 readxl_1.3.1 rstudioapi_0.13
## [34] farver_2.0.3 generics_0.0.2 jsonlite_1.7.1
## [37] zip_2.1.0 car_3.0-9 RCurl_1.98-1.2
## [40] magrittr_2.0.1 GenomeInfoDbData_1.2.2 Rcpp_1.0.5
## [43] munsell_0.5.0 abind_1.4-5 lifecycle_0.2.0
## [46] stringi_1.5.3 yaml_2.2.1 carData_3.0-4
## [49] MASS_7.3-51.4 zlibbioc_1.32.0 grid_3.6.1
## [52] blob_1.2.1 crayon_1.4.1 cowplot_1.0.0
## [55] haven_2.3.1 hms_1.0.0 knitr_1.29
## [58] pillar_1.4.6 ggpubr_0.4.0 ggsignif_0.6.0
## [61] reprex_1.0.0 glue_1.4.2 evaluate_0.14
## [64] data.table_1.13.0 modelr_0.1.8 vctrs_0.3.2
## [67] tweenr_1.0.1 cellranger_1.1.0 gtable_0.3.0
## [70] polyclip_1.10-0 assertthat_0.2.1 xfun_0.16
## [73] openxlsx_4.2.3 broom_0.7.0 rstatix_0.6.0
## [76] ellipsis_0.3.1 here_1.0.0