Aims: to annotate STAR-outputted junctions by .gtf exon definitions; to remove background noise; to determine the proportion of annotated/partially annotated/unannotated junctions in each group; and to compare this with leafcutter defined introns
RangedSummarizedExperiment
:
source(here::here("R", "file_paths.R"))
sample_info <-
read_excel(path =
file.path(
path_to_raw_data,
"sample_details/20201229_MasterFile_SampleInfo.xlsx"
),
sheet = "SampleInfo",
skip = 1) %>%
dplyr::filter(Sample_Type == "Tissue section" & sent_to_bulk_seq == "yes") %>%
dplyr::rename(sample_id = CaseNo,
RIN = RINe_bulkRNA_Tapestation) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD","DLB")),
Sex = as.factor(Sex)) %>%
dplyr::select(sample_id, Disease_Group, Sex, AoO, AoD, DD, PMI, aSN, TAU, 'thal AB', 'aCG aSN score', Genetics, RIN) %>%
dplyr::inner_join(
read_delim(
file.path(
path_to_results,
"deconvolution/scaden/2020Feb/scaden_summary_results.txt"
),
delim = "\t") %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")),
Celltype = str_c(Celltype, "_proportions")) %>%
dplyr::filter(cells == 1000 & samples == 1000) %>%
dplyr::select(-source, -cells, -samples, -Disease_Group, -Celltype_class) %>%
tidyr::spread(key = Celltype, value = cell_type_proportion)
)
merged_junctions.SJ.out.tab
, which contains only junction coordinates and not counts, etc.create_rse_jx()
. As a part of this, rowRanges were annotated using .gtf annotations, with the annotate_junc_ref()
from David's dasper package. This function has since been deprecated and replaced by junction_annot
. Both functions perform the same task, but junction_annot
has been made to run faster. annotate_junc_ref()
takes as input the junction co-ordinates and annotates the start and end position based on precise overlap with a known exon boundary. Then using reference annotation, infers the strand of junction and categorises junctions in "annotated", "novel_acceptor", "novel_donor", "novel_combo", novel_exon_skip", "ambig_gene" & "none".source(here::here("R", "load_sj_df.R"))
source(here::here("R", "create_rse_jx.R"))
sj_df <-
load_sj_df(
file.path(
path_to_bulk_seq,
"STAR"
)
) %>%
dplyr::filter(Sample != "merged_junctions.SJ.out.tab") %>%
dplyr::mutate(Sample = str_replace(Sample, "_.*", ""))
rse_jx <- create_rse_jx(sj_df = sj_df,
sample_info = sample_info,
sample_id_col = "sample_id")
saveRDS(rse_jx,
file =
file.path(
path_to_results,
"STAR/STAR_rse_jx.rds")
)
.\SoniaRuiz\20200107_Defining_junctions_length.docx
):
rse_jx <-
readRDS(
file =
file.path(
path_to_results,
"STAR/STAR_rse_jx.rds"
)
)
# Load encode blacklist (https://github.com/Boyle-Lab/Blacklist/tree/master/lists)
ENCODE_blacklist_hg38 <- rtracklayer::import("/data/references/ENCODE_blacklist_v2/hg38-blacklist.v2.bed")
ENCODE_blacklist_hg38 <-
ENCODE_blacklist_hg38 %>%
as.data.frame() %>%
dplyr::mutate(chr = str_replace(seqnames, "chr", "")) %>%
dplyr::select(-seqnames) %>%
makeGRangesFromDataFrame(.,
keep.extra.columns = TRUE,
seqnames.field = "chr",
start.field = "start",
end.field = "end",
ignore.strand = FALSE)
# Find junctions that overlap with ENCODE blacklist regions
overlapped_junctions <-
GenomicRanges::findOverlaps(query = ENCODE_blacklist_hg38,
subject = rowRanges(rse_jx),
ignore.strand = F)
ids <- subjectHits(overlapped_junctions)
# Remove junctions overlapping blacklist regions
rse_jx <- rse_jx[-ids,]
# Find junctions with width < 25
width_25 <- which(c(rowRanges(rse_jx) %>% width()) < 25)
rse_jx <- rse_jx[-width_25,]
# Total number of junctions removed
length(ids) + length(width_25) # 19640 + 6620 = 26260
saveRDS(
rse_jx,
file =
file.path(
path_to_results,
"STAR/STAR_rse_jx_filtered.rds"
)
)
dds <-
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD.Rds"
)
)
rse_jx <-
readRDS(
file.path(
path_to_results,
"STAR/STAR_rse_jx_filtered.rds"
)
)
library_size <-
dds %>%
assay() %>% # Access count data
as_tibble() %>%
dplyr::mutate(gene_id = rownames(assay(dds))) %>%
gather(key = sample_id, value = counts, -gene_id) %>%
group_by(sample_id) %>%
dplyr::summarise(mapped_read_depth = sum(counts))
filtered_counts <-
assay(rse_jx) %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "junction_id") %>%
tidyr::gather(key = "sample_id", value = "count", -junction_id) %>%
dplyr::filter(count > 0)
annot_ref_df <- rowRanges(rse_jx)[c(filtered_counts %>% .[["junction_id"]]), "junc_cat"] %>%
mcols() %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "junction_id") %>%
dplyr::inner_join(filtered_counts) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::group_by(sample_id, junc_cat) %>%
dplyr::summarise(n_junc = n(),
count = sum(count)) %>%
dplyr::inner_join(library_size) %>%
dplyr::mutate(total_junc = sum(n_junc),
total_count = sum(count),
prop_junc = n_junc/total_junc,
prop_count = count/total_count,
prop_mapped_read = count/mapped_read_depth)
# Calculate difference between acceptor and donor for all metrics
# Add disease groups and re-level disease groups + junc_cat
annot_ref_df <-
annot_ref_df %>%
tidyr::gather(key = "metric", value = "value", -junc_cat, -sample_id) %>%
tidyr::spread(key = junc_cat, value = value) %>%
dplyr::mutate(diff_acceptor_donor = novel_acceptor - novel_donor) %>%
tidyr::gather(key = "junc_cat", value = "value", -sample_id, -metric) %>%
tidyr::spread(key = metric, value = value) %>%
dplyr::inner_join(colData(rse_jx) %>%
as.data.frame() %>%
rownames_to_column(var = "sample_id") %>%
dplyr::select(sample_id, Disease_Group)) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")),
junc_cat = fct_relevel(junc_cat,
c("annotated", "novel_acceptor", "novel_donor", "diff_acceptor_donor", "novel_exon_skip", "novel_combo", "none")))
plot_list <- vector(mode = "list", length = 2)
# Plot junction proportions by disease group
plot_list[[1]] <-
annot_ref_df %>%
ggplot(aes(x = junc_cat, y = prop_junc, fill = Disease_Group)) +
geom_boxplot(colour = "black") +
labs(x = "", y = "Proportion of junctions") +
ylim(c(0,round(max(annot_ref_df$prop_count), digits = 2))) +
scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
theme_rhr
# Plot count proportions by disease group
plot_list[[2]] <-
annot_ref_df %>%
ggplot(aes(x = junc_cat, y = prop_count, fill = Disease_Group)) +
geom_boxplot(colour = "black") +
facet_wrap(vars(junc_cat), scale = "free") +
labs(x = "", y = "Proportion of counts") +
scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
theme_rhr +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
ggarrange(plotlist = plot_list,
labels = c("a", "b"),
common.legend = T,
nrow = 2)
# Assign junc_cat to annotation classes i.e annotated/partially annotated/unannotated
annot_ref_df <-
annot_ref_df %>%
dplyr::mutate(annot_class = ifelse(junc_cat == "annotated", "annotated",
ifelse(junc_cat %in% c("none"), "unannotated", "partially_annotated")) %>%
as.factor())
plot_list <- vector(mode = "list", length = 2)
# Plot junction proportions by disease group
plot_list[[1]] <-
annot_ref_df %>%
dplyr::group_by(Disease_Group, sample_id, annot_class) %>%
dplyr::summarise(prop_junc = sum(prop_junc)) %>%
ggplot(aes(x = annot_class, y = prop_junc, fill = Disease_Group)) +
geom_boxplot(colour = "black") +
ylim(c(0,round(max(annot_ref_df$prop_count), digits = 2))) +
labs(x = "", y = "Proportion of junctions") +
scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
theme_rhr
# Plot count proportions by disease group
plot_list[[2]] <-
annot_ref_df %>%
dplyr::group_by(Disease_Group, sample_id, annot_class) %>%
dplyr::summarise(prop_count = sum(prop_count)) %>%
dplyr::ungroup() %>%
ggplot(aes(x = annot_class, y = prop_count, fill = Disease_Group)) +
geom_boxplot(colour = "black") +
facet_zoom(ylim = c(0,0.015), zoom.size = 1) +
labs(x = "", y = "Proportion of counts") +
scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
theme_rhr
ggarrange(plotlist = plot_list,
labels = c("a", "b"),
common.legend = T,
nrow = 2)
# Load PC axes and join to annot_ref_df
annot_ref_df <-
annot_ref_df %>%
dplyr::inner_join(
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_DEG.Rds"
)
) %>%
colData() %>%
as_tibble() %>%
dplyr::select(sample_id, PC1, PC2, PC3, PC4))
# Model: prop ~ Disease_Group
annot_ref_df %>%
tidyr::gather(key = prop_type, value = prop, starts_with("prop")) %>%
dplyr::group_by(prop_type, junc_cat) %>%
do(lm(prop ~ Disease_Group, data = .) %>%
broom::tidy()) %>%
# dplyr::filter(p.value < 0.05) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
# Model: prop ~ Disease_Group + PC1 + PC2 + PC3 + PC4
annot_ref_df %>%
tidyr::gather(key = prop_type, value = prop, starts_with("prop")) %>%
dplyr::group_by(prop_type, junc_cat) %>%
do(lm(prop ~ Disease_Group + PC1 + PC2 + PC3 + PC4, data = .) %>%
broom::tidy()) %>%
# dplyr::filter(p.value < 0.05) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
drop1()
function, which essentially tests how dropping each variable from the full model changes the fit. If a variable contributes significantly to the fit of a model, we would expect a full model and one where the term is dropped to be significantly different -- this is represented by the p.value
column in the table below.for(i in 1:length(unique(annot_ref_df$junc_cat))){
df <- drop1(lm(prop_junc ~ Disease_Group + PC1 + PC2 + PC3 + PC4,
data = annot_ref_df %>%
dplyr::filter(junc_cat == unique(annot_ref_df$junc_cat)[i])),
scope = ~., test = "F") %>%
tidy() %>%
dplyr::mutate(junc_cat = unique(annot_ref_df$junc_cat)[i]) %>%
dplyr::select(junc_cat, term, p.value, everything())
if(i == 1){
anova_df <- df
} else{
anova_df <- anova_df %>%
bind_rows(df)
}
}
anova_df %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
for(i in 1:length(unique(annot_ref_df$junc_cat))){
df <- drop1(lm(prop_count ~ Disease_Group + PC1 + PC2 + PC3 + PC4,
data = annot_ref_df %>%
dplyr::filter(junc_cat == unique(annot_ref_df$junc_cat)[i])),
scope = ~., test = "F") %>%
tidy() %>%
dplyr::mutate(junc_cat = unique(annot_ref_df$junc_cat)[i]) %>%
dplyr::select(junc_cat, term, p.value, everything())
if(i == 1){
anova_df <- df
} else{
anova_df <- anova_df %>%
bind_rows(df)
}
}
anova_df %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
clu <-
readRDS(
file.path(
path_to_results,
"leafcutter/intron_clustering/tissue_polyA_all_clusters_gtfannotated.Rds"
)
)
# Calculate proportions
annot_leafcutter <- clu %>%
as_tibble() %>%
dplyr::mutate(cluster_id_full = str_c(cluster_id, ":", start, "-", end)) %>%
dplyr::select(contains("cluster_id"), contains("leafcutter"), junc_cat) %>%
tidyr::gather(key = "sample_id", value = "count", -cluster_id, -cluster_id_full, -junc_cat) %>%
dplyr::mutate(sample_id = str_remove(sample_id, "_.*")) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::group_by(sample_id, junc_cat) %>%
dplyr::summarise(n_junc = n(),
count = sum(count)) %>%
dplyr::inner_join(library_size) %>%
dplyr::mutate(total_junc = sum(n_junc),
total_count = sum(count),
prop_junc = n_junc/total_junc,
prop_count = count/total_count,
prop_mapped_read = count/mapped_read_depth)
plot_list <- vector(mode = "list", length = 2)
# Calculate difference between acceptor and donor for all metrics
# Add disease groups and re-level disease groups + junc_cat
annot_leafcutter <-
annot_leafcutter %>%
tidyr::gather(key = "metric", value = "value", -junc_cat, -sample_id) %>%
tidyr::spread(key = junc_cat, value = value) %>%
dplyr::mutate(diff_acceptor_donor = novel_acceptor - novel_donor,
ratio_acceptor_donor = novel_acceptor/novel_donor) %>%
tidyr::gather(key = "junc_cat", value = "value", -sample_id, -metric) %>%
tidyr::spread(key = metric, value = value) %>%
dplyr::inner_join(colData(rse_jx) %>%
as.data.frame() %>%
rownames_to_column(var = "sample_id") %>%
dplyr::select(sample_id, Disease_Group)) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")),
junc_cat = fct_relevel(junc_cat,
c("annotated", "novel_acceptor", "novel_donor", "diff_acceptor_donor", "ratio_acceptor_donor", "novel_exon_skip", "novel_combo", "none")))
plot_list <- vector(mode = "list", length = 2)
# Plot count proportions by disease group
plot_list[[1]] <- annot_leafcutter %>%
ggplot(aes(x = junc_cat, y = prop_count, fill = Disease_Group)) +
geom_boxplot(colour = "black") +
facet_wrap(vars(junc_cat), scales = "free") +
labs(x = "", y = "Proportion of total junction counts") +
scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
theme_rhr +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
plot_list[[2]] <- annot_leafcutter %>%
ggplot(aes(x = junc_cat, y = prop_mapped_read, fill = Disease_Group)) +
geom_boxplot(colour = "black") +
facet_wrap(vars(junc_cat), scales = "free") +
labs(x = "", y = "Proportion of total mapped reads") +
scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
theme_rhr +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
ggarrange(plotlist = plot_list,
labels = c("a", "b"),
common.legend = T,
nrow = 2)
# Load PC axes and join to annot_ref_df
annot_leafcutter <-
annot_leafcutter %>%
dplyr::inner_join(
readRDS(
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_DEG.Rds"
)
) %>%
colData() %>%
as_tibble())
# Model: prop ~ Disease_Group
annot_leafcutter %>%
dplyr::select(-prop_junc) %>%
tidyr::gather(key = prop_type, value = prop, starts_with("prop")) %>%
dplyr::group_by(prop_type, junc_cat) %>%
do(lm(prop ~ Disease_Group, data = .) %>%
broom::tidy()) %>%
# dplyr::filter(p.value < 0.05) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
# Model: prop ~ Disease_Group + Sex + RIN + AoD
annot_leafcutter %>%
dplyr::select(-prop_junc) %>%
tidyr::gather(key = prop_type, value = prop, starts_with("prop")) %>%
dplyr::group_by(prop_type, junc_cat) %>%
do(lm(prop ~ Disease_Group + Sex + RIN + AoD, data = .) %>%
broom::tidy()) %>%
# dplyr::filter(p.value < 0.05) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
# Model: prop ~ Disease_Group + PC1 + PC2 + PC3 + PC4
annot_leafcutter %>%
dplyr::select(-prop_junc) %>%
tidyr::gather(key = prop_type, value = prop, starts_with("prop")) %>%
dplyr::group_by(prop_type, junc_cat) %>%
do(lm(prop ~ Disease_Group + PC1 + PC2 + PC3 + PC4, data = .) %>%
broom::tidy()) %>%
# dplyr::filter(p.value < 0.05) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
rse_jx %>%
rowRanges() %>%
as_tibble() %>%
dplyr::select(junction_id, width, junc_cat) %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(
median_intron_length = median(width)
)
rse_jx %>%
rowRanges() %>%
as_tibble() %>%
dplyr::select(junction_id, width, junc_cat) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(
junc_cat =
fct_relevel(
junc_cat,
c("annotated", "novel_acceptor", "novel_donor", "diff_acceptor_donor", "novel_exon_skip", "novel_combo", "none")
)
) %>%
ggplot(aes(x = junc_cat, y = width)) +
geom_boxplot() +
coord_cartesian(ylim = c(0,50000)) +
labs(x = "Junction category", y = "Intron length") +
theme_rhr
clu %>%
as_tibble() %>%
dplyr::select(seqnames, start, end, cluster_id, width, junc_cat) %>%
dplyr::group_by(junc_cat) %>%
dplyr::summarise(
median_intron_length = median(width)
)
clu %>%
as_tibble() %>%
dplyr::select(seqnames, start, end, cluster_id, width, junc_cat) %>%
dplyr::filter(junc_cat != "ambig_gene") %>%
dplyr::mutate(
junc_cat =
fct_relevel(
junc_cat,
c("annotated", "novel_acceptor", "novel_donor", "diff_acceptor_donor", "novel_exon_skip", "novel_combo", "none")
)
) %>%
ggplot(aes(x = junc_cat, y = width)) +
geom_boxplot() +
coord_cartesian(ylim = c(0,50000)) +
labs(x = "Junction category", y = "Intron length") +
theme_rhr