source(here::here("R", "load_sj_df.R"))
source(here::here("R", "get_fastp_QC_df.R"))
source(here::here("R", "file_paths.R"))
path_to_fastp <- file.path(path_to_bulk_seq_data, "QC/fastp/")
path_to_sj <- file.path(path_to_bulk_seq_data, "STAR/SJ_out_1pass")
# Load QC metrics and SJ files
QC_tissue <-
get_fastp_QC_df(fastp_json_dir_paths = path_to_fastp) %>%
dplyr::select(SampleID, reads_unknown_insert_size_percent, everything())
sj_df_tissue <-
load_sj_df(sj_dir_path = path_to_sj)
# Dataframe of unique junctions with respect to genomic location
unique_junc_tiss <- sj_df_tissue %>%
tidyr::unite(col = "junction", chr, intron_start, intron_end, sep = ":", remove = FALSE) %>%
dplyr::group_by(junction) %>%
dplyr::summarise(unique_reads_junction_across_all_samples = sum(unique_reads_junction)) %>%
inner_join(sj_df_tissue %>%
dplyr::select(-unique_reads_junction, -multi_map_reads_junction, -max_splice_alignment_overhang, -Sample) %>%
tidyr::unite(col = "junction", chr, intron_start, intron_end, sep = ":", remove = FALSE) %>%
distinct(junction, .keep_all = TRUE)) %>%
dplyr::select(junction, chr, intron_start, intron_end, everything())
sj_df_tissue %>%
dplyr::mutate(in_annotation = ifelse(in_annotation == 1, TRUE, FALSE)) %>%
dplyr::group_by(in_annotation) %>%
dplyr::summarise(min_unique_reads_junction = min(unique_reads_junction),
max_unique_reads_junction = max(unique_reads_junction))
ggplot(data = sj_df_tissue %>%
dplyr::mutate(in_annotation = ifelse(in_annotation == 1, TRUE, FALSE)),
aes(x = in_annotation, y = unique_reads_junction, fill = in_annotation)) +
geom_boxplot(outlier.shape = NA) +
ylim(0,100) +
labs(y = "Number of junctions reads") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 8),
axis.title = element_text(size = 8, face = "bold"),
strip.text = element_text(size = 8, face = "bold"))
Figure 3.1: Distribution of reads across unannotated (false) and annotated (true) junctions across all bulk tissue samples. Y-axis limited to 100 and outliers hidden for illustration purposes.
# Relationship between junctions and read depth
summary_sj <- sj_df_tissue %>%
dplyr::mutate(in_annotation = ifelse(in_annotation == 1, TRUE, FALSE)) %>%
dplyr::group_by(Sample, in_annotation) %>%
dplyr::summarise(n_junction_reads = sum(unique_reads_junction)) %>%
dplyr::mutate(sample_shortened = str_replace(Sample, "_.*", "")) %>%
inner_join(QC_tissue %>% dplyr::select(SampleID, after_filtering_total_reads), by = c("sample_shortened" = "SampleID")) %>%
dplyr::select(-sample_shortened)
ggplot(data = summary_sj) +
geom_point(aes(x = after_filtering_total_reads, y = n_junction_reads, colour = in_annotation)) +
facet_wrap(~in_annotation, scales = "free_y") +
labs(x = "Total number of reads after QC", y = "Total number of junction reads") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 8),
axis.title = element_text(size = 8, face = "bold"),
strip.text = element_text(size = 8, face = "bold"))
Figure 3.2: Figure: Comparison of total number of reads passing QC vs. total number of junction reads (coloured by whether or not they are in annotation). Each individual point represents one bulk tissue sample.
# Percentage of unique reads mapping to unique junctions in relation to total number of unique reads across all junctions
unique_junc_tiss %>%
dplyr::mutate(total_unique_reads_junction = sum(unique_reads_junction_across_all_samples),
percentage_of_total_unique_reads_junction = (unique_reads_junction_across_all_samples/total_unique_reads_junction) * 100) %>%
dplyr::select(-total_unique_reads_junction) %>%
arrange(desc(percentage_of_total_unique_reads_junction))
nohup Rscript \
/home/rreynolds/packages/RNAseqProcessing/alignment/STAR_splice_junction_merge.R \
/data/RNAseq_PD/tissue_polyA_samples/STAR/SJ_out_1pass \
-o /data/RNAseq_PD/tissue_polyA_samples/STAR/ \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/PD_tissue_polyA_SJ_filtering.log&