Aims: to (i) quantify intron excision ratios across samples; (ii) perform differential splicing analyses across disease groups; and (iii) replicate findings from own study with those from this study, SRP058181.
# File paths
source(here::here("R", "file_paths.R"))
leafviz_dir <- "/home/rreynolds/packages/leafcutter/leafviz/"
markergene_dir <- "/home/rreynolds/projects/MarkerGenes/"
# Functions
source(here::here("R", "biomart_df.R"))
# Files
load(file.path(path_to_recount, "counts/rse_gene.Rdata"))
sample_info <-
readRDS(
file.path(
path_to_results,
"SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
)
) %>%
colData() %>%
as_tibble()
As the primary aim was validation, Leafcutter was performed using the same parameters as those used in analysis of own samples.
load(file.path(path_to_recount, "junctions/rse_jx.Rdata"))
ENCODE_blacklist_hg38 <- rtracklayer::import(file.path(path_to_raw_data, "misc/hg38-blacklist.v2.bed"))
recount_ids <- rse_jx %>%
assay() %>%
colnames()
# Remove SRR2015746 (C0061), due to uncertainty re. sex
recount_ids <- recount_ids[!str_detect(recount_ids,"SRR2015746")]
junction_counts <- rse_jx %>%
# Need to extract row names, as they are unnamed in count.tsv and assay(rse_jx)
rowRanges() %>%
as_tibble() %>%
dplyr::select(seqnames, start, end, strand, junction_id) %>%
# Bind columns from assay(rse_jx)
bind_cols(rse_jx %>%
assay() %>%
as_tibble()) %>%
# Remove unwanted chromosomes builds
dplyr::filter(seqnames %in% c(str_c("chr", 1:22), "chrX", "chrY", "chrM")) %>%
dplyr::mutate(unknown = ".")
# Remove junctions that overlap with ENCODE blacklist regions
overlapped_junctions <- GenomicRanges::findOverlaps(query = ENCODE_blacklist_hg38,
subject = junction_counts %>%
dplyr::select(seqnames, start, end, strand) %>%
GenomicRanges::makeGRangesFromDataFrame(),
ignore.strand = F)
indexes <- subjectHits(overlapped_junctions)
junction_counts <- junction_counts[-indexes, ] %>%
dplyr::mutate(seqnames = str_replace(seqnames, "chr", ""))
output_path <- file.path(path_to_results, "SRP058181/leafcutter/SJ_formatting/")
for(i in 1:length(recount_ids)){
id <- recount_ids[i]
print(str_c(i, ": ", id))
filtered <- junction_counts %>%
dplyr::select(seqnames, start, end, unknown, one_of(id), strand)
colnames(filtered) <- ifelse(colnames(filtered) == id, "samp_id", colnames(filtered))
filtered <-
filtered %>%
dplyr::filter(samp_id !=0)
write_delim(filtered %>%
mutate(start = as.character(as.integer(start)),
end = as.character(as.integer(end)),
samp_id = as.character(as.integer(samp_id))),
str_c(output_path, id, "_SJ_leafcutter.junc"),
delim = "\t",
col_names = F)
}
# write a .txt file with each .junc file
junc_df <- tibble(junc_file_name = list.files(path = output_path, pattern = "_SJ_leafcutter.junc", full.names = TRUE))
write_delim(junc_df,
path = str_c(output_path, "/list_juncfiles.txt"),
delim = "\t",
col_names = F)
python /tools/leafcutter/clustering/leafcutter_cluster.py \
-j /home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/SJ_formatting/list_juncfiles.txt \
-r /home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/ \
-o test_diseasegroups \
-l 1000000 \
-m 30 \
-p 0.001 \
-s True
Rscript /tools/leafcutter/scripts/gtf_to_exons.R \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz
# Load per individual intron counts for column names
sample_names <-
fread(
file.path(
path_to_results,
"SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz"
)
) %>%
dplyr::select(-V1) %>%
colnames()
# Create master df of sample info, with grouping variable (Disease_Group) and confounders (RIN and AoD)
master <-
tibble(lc_sample_name = sample_names,
sample_name = sample_names %>%
str_replace("_.*", "")) %>%
inner_join(sample_info, by = c("sample_name" = "recount_id")) %>%
dplyr::select(lc_sample_name, Disease_Group, RIN, Age_at_death) %>%
arrange(Disease_Group)
RNAseqProcessing::create_group_files_multi_pairwisecomp(
df = master,
group_column_name = "Disease_Group",
output_path = file.path(path_to_results, "SRP058181/leafcutter/diff_splicing/group_files/")
)
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/SRP058181_leafcutter_ds.log&
# Load per individual intron counts for column names
sample_names <-
fread(
file.path(
path_to_results,
"SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz"
)
) %>%
dplyr::select(-V1) %>%
colnames()
# Create master df of sample info, with grouping variable (Disease_Group) and confounders (RIN, AoD, PMI, cell-type proportions)
master <-
tibble(lc_sample_name = sample_names,
sample_name = sample_names %>%
str_replace("_.*", "")) %>%
inner_join(sample_info, by = c("sample_name" = "recount_id")) %>%
dplyr::select(lc_sample_name, Disease_Group, RIN, Age_at_death, PMI, contains("_proportion")) %>%
arrange(Disease_Group)
RNAseqProcessing::create_group_files_multi_pairwisecomp(
df = master,
group_column_name = "Disease_Group",
output_path = file.path(path_to_results, "SRP058181/leafcutter/diff_splicing_celltype/group_files/")
)
nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing_celltype/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing_celltype/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/SRP058181_leafcutter_ds_celltype.log&
/tools/leafcutter/leafviz/gtf2leafcutter.pl \
-o /data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz
prepare_results.R
, which performs this formatting, albeit for only one pairwise comparison. To format the results of multiple pairwise comparisons requires looping across the various pairwise comparisons and running the prepare_results.R
for each individual pairwise comparison. This is what the leafviz_multi_pairwise.R
script used below does.leafviz_multi_pairwise.R
script assumes use of the leafcutter_ds_multi_pairwise.R
script and the create_group_files_multi_pairwisecomp()
function. This ensures that all files needed are named consistently. That is, the (i) _cluster_signficance.txt
, (ii) _effect_sizes.txt
and (iii) _group_file.txt
for an individual pairwise comparison all have the same prefix.nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafviz_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/ \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/ \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
--output_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/leafviz/ \
--group_file_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/group_files/ \
--FDR=0.05 \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/SRP058181_leafviz_prepare_results.log&
RNAseqProcessing::run_leafviz(
leafviz_dir = leafviz_dir,
results_filename = file.path(path_to_results, "SRP058181/leafcutter/leafviz/Control_vs_PD.Rda")
)
clusters <-
list.files(
file.path(path_to_results, "SRP058181/leafcutter/diff_splicing/"),
pattern = "cluster_",
recursive = T,
full.names = T
)
intron <-
list.files(
file.path(path_to_results, "SRP058181/leafcutter/diff_splicing/"),
pattern = "effect_",
recursive = T,
full.names = T
)
for(i in 1:length(clusters)){
comparison <- clusters[i] %>%
str_replace("/.*/", "") %>%
str_replace("_cluster_significance.txt", "")
lc <- read_delim(clusters[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
lc_all <- lc
} else{
lc_all <- lc_all %>%
bind_rows(lc)
}
}
for(i in 1:length(intron)){
comparison <- intron[i] %>%
str_replace("/.*/", "") %>%
str_replace("_effect_sizes.txt", "")
int <- read_delim(intron[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
int_all <- int
} else{
int_all <- int_all %>%
bind_rows(int)
}
}
# Filter for significant clusters
# Add direction of effect
lc_filtered <- lc_all %>%
# filter for successful tests and remove clusters overlapping multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>%
dplyr::inner_join(int_all %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>%
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change"))
# Save results
write_delim(
lc_filtered,
path =
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.txt"
),
delim = "\t"
)
saveRDS(
setNames(list(lc_all, int_all, lc_filtered),
c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
)
)
results_covar <-
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
)
)
# Summary of differential splicing run
summary <- results_covar$cluster_significance %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n()) %>%
tidyr::spread(key = comparison, value = n) %>%
dplyr::mutate(status = "Total clusters across all samples (read >= 30)") %>%
dplyr::select(status, everything()) %>%
dplyr::bind_rows(results_covar$cluster_significance %>%
dplyr::group_by(comparison, status) %>%
dplyr::summarise(n = n()) %>%
tidyr::spread(key = comparison, value = n)) %>%
dplyr::bind_rows(results_covar$significant_clusters_0.05_filter %>%
dplyr::group_by(comparison) %>%
summarise(n = n()) %>%
tidyr::spread(key = comparison, value = n) %>%
dplyr::mutate(status = "Differential intron excision, p.adjust < 0.05") %>%
dplyr::select(status, everything())) %>%
dplyr::mutate(status = str_replace(status, "Success", "Successfully tested"))
# Add propotions
summary <-
summary %>%
dplyr::bind_rows(
(((summary %>%
dplyr::filter(status == "Successfully tested") %>%
dplyr::select(-status))/(summary %>%
dplyr::filter(status == "Total clusters across all samples (read >= 30)") %>%
dplyr::select(-status))) *100) %>%
dplyr::mutate(status = "Successfully tested/Total clusters (%)")
) %>%
dplyr::bind_rows(
(((summary %>%
dplyr::filter(status == "Differential intron excision, p.adjust < 0.05") %>%
dplyr::select(-status))/(summary %>%
dplyr::filter(status == "Successfully tested") %>%
dplyr::select(-status))) *100) %>%
dplyr::mutate(status = "Differentially spliced/Successfully tested (%)")
)
summary %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results_covar$intron_usage %>%
tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(n_intron = n()) %>%
dplyr::inner_join(results_covar$intron_usage %>%
tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(n_intron = n()) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(median = median(n_intron))) %>%
ggplot(aes(x = n_intron)) +
stat_count(color = "black", alpha = 0.5) +
geom_vline(aes(xintercept=median), color="red",
linetype="dashed") +
facet_wrap(vars(comparison)) +
coord_cartesian(xlim = c(0,30)) +
theme_rhr
# Run once
all_genes <- results_covar$cluster_significance %>%
# remove clusters that overlap multiple genes
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
genes_list <- setNames(results_covar$significant_clusters_0.05_filter %>%
group_split(comparison),
results_covar$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>% .[["genes"]] %>% unique()})
upset(fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")
gom.self <- newGOM(genes_list, genes_list, genome.size = length(all_genes))
All.jaccard <- getMatrix(gom.self, "Jaccard")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(All.jaccard, method="color", col=col(100), cl.lim = c(0,1),
type="lower",
# addCoef.col = "black", number.cex = 0.75, # Add coefficient of correlation
tl.col="black", tl.srt=45, tl.cex = 0.75, mar=c(0,0,1,0), #Text label color and rotation
p.mat = All.jaccard, sig.level = (0.05), insig = "p-value", number.cex = 0.75,
diag = TRUE)
# Run once
gprofiler_results_list <- lapply(genes_list, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
saveRDS(
gprofiler_results_list,
file.path(
path_to_results,
"SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_RINAoD.Rds"
)
)
gprofiler_results_list <-
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_RINAoD.Rds"
)
)
# Added some filtering to remove broad terms
gprofiler_df <- gprofiler_results_list %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000) %>%
dplyr::arrange(comparison, source, p_value)
gprofiler_df %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n_enriched_pathways = n())
gprofiler_df %>%
ggplot(aes(x = comparison,
y = fct_rev(term_name),
size = precision,
fill = p_value)) +
geom_point(pch = 21) +
scale_fill_viridis_c(direction = -1) +
labs(x = "Comparison", y = "Dataset: Cell type") +
theme_rhr
clusters <-
list.files(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing_celltype/"
),
pattern = "cluster_",
recursive = T,
full.names = T
)
intron <-
list.files(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing_celltype/"
),
pattern = "effect_",
recursive = T,
full.names = T
)
for(i in 1:length(clusters)){
comparison <- clusters[i] %>%
str_replace("/.*/", "") %>%
str_replace("_cluster_significance.txt", "")
lc <- read_delim(clusters[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
lc_all <- lc
} else{
lc_all <- lc_all %>%
bind_rows(lc)
}
}
for(i in 1:length(intron)){
comparison <- intron[i] %>%
str_replace("/.*/", "") %>%
str_replace("_effect_sizes.txt", "")
int <- read_delim(intron[i], delim = "\t") %>%
dplyr::mutate(comparison = comparison) %>%
dplyr::select(comparison, everything())
if(i == 1){
int_all <- int
} else{
int_all <- int_all %>%
bind_rows(int)
}
}
# Filter for significant clusters
# Add direction of effect
lc_filtered <- lc_all %>%
# filter for successful tests and remove clusters overlapping multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>%
dplyr::inner_join(int_all %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>%
dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
deltapsi < 0 ~ "downregulated",
deltapsi == 0 ~ "no_change"))
# Save results
write_delim(
lc_filtered,
path =
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.txt"
),
delim = "\t"
)
saveRDS(
setNames(list(lc_all, int_all, lc_filtered),
c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
)
)
results_ct <- readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
)
)
# Summary of differential splicing run
summary <- results_ct$cluster_significance %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n()) %>%
tidyr::spread(key = comparison, value = n) %>%
dplyr::mutate(status = "Total clusters across all samples (read >= 30)") %>%
dplyr::select(status, everything()) %>%
dplyr::bind_rows(results_ct$cluster_significance %>%
dplyr::group_by(comparison, status) %>%
dplyr::summarise(n = n()) %>%
tidyr::spread(key = comparison, value = n)) %>%
dplyr::bind_rows(results_ct$significant_clusters_0.05_filter %>%
dplyr::group_by(comparison) %>%
summarise(n = n()) %>%
tidyr::spread(key = comparison, value = n) %>%
dplyr::mutate(status = "Differential intron excision, p.adjust < 0.05") %>%
dplyr::select(status, everything())) %>%
dplyr::mutate(status = str_replace(status, "Success", "Successfully tested"))
# Add propotions
summary <- summary %>%
dplyr::bind_rows((((summary %>%
dplyr::filter(status == "Successfully tested") %>%
dplyr::select(-status))/(summary %>%
dplyr::filter(status == "Total clusters across all samples (read >= 30)") %>%
dplyr::select(-status))) *100) %>%
dplyr::mutate(status = "Successfully tested/Total clusters (%)")) %>%
dplyr::bind_rows((((summary %>%
dplyr::filter(status == "Differential intron excision, p.adjust < 0.05") %>%
dplyr::select(-status))/(summary %>%
dplyr::filter(status == "Successfully tested") %>%
dplyr::select(-status))) *100) %>%
dplyr::mutate(status = "Differentially spliced/Successfully tested (%)"))
summary %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results_ct$intron_usage %>%
tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(n_intron = n()) %>%
dplyr::inner_join(results_ct$intron_usage %>%
tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>%
dplyr::group_by(comparison, cluster_id) %>%
dplyr::summarise(n_intron = n()) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(median = median(n_intron))) %>%
ggplot(aes(x = n_intron)) +
stat_count(color = "black", alpha = 0.5) +
geom_vline(aes(xintercept=median), color="red",
linetype="dashed") +
facet_wrap(vars(comparison)) +
coord_cartesian(xlim = c(0,30)) +
theme_rhr
# Run once
all_genes <- results_ct$cluster_significance %>%
# remove clusters that overlap multiple genes
dplyr::filter(!str_detect(genes, ",")) %>%
.[["genes"]] %>%
unique()
genes_list_ct <- setNames(results_ct$significant_clusters_0.05_filter %>%
group_split(comparison),
results_ct$significant_clusters_0.05_filter %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
# For each dataframe, extract the gene column and remove duplicate genes
lapply(., function(x){x %>% .[["genes"]] %>% unique()})
upset(fromList(genes_list_ct), sets = names(genes_list_ct), keep.order = TRUE, nintersects = 25, order.by = "freq")
# Run once
gprofiler_results_list_ct <- lapply(genes_list_ct, function(x){
x %>%
gost(., organism = "hsapiens",
correction_method= "gSCS", significant = TRUE,
user_threshold = 0.05,
custom_bg= all_genes,
sources = c("GO", "REAC", "KEGG", "OMIM"),
evcodes = TRUE)})
saveRDS(
gprofiler_results_list_ct,
file.path(
path_to_results,
"SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_celltype.Rds"
)
)
gprofiler_results_list_ct <-
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_celltype.Rds"
)
)
# Added some filtering to remove broad terms
gprofiler_df <- gprofiler_results_list_ct %>%
lapply(., function(x){
x$result %>%
as_tibble()
}) %>%
qdapTools::list_df2df(., col = "comparison") %>%
dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>%
dplyr::filter(term_size > 20 & term_size <= 2000) %>%
dplyr::arrange(comparison, source, p_value)
gprofiler_df %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n_enriched_pathways = n())
gprofiler_df %>%
ggplot(aes(x = comparison,
y = fct_rev(term_name),
size = precision,
fill = p_value)) +
geom_point(pch = 21) +
scale_fill_viridis_c(direction = -1) +
labs(x = "Comparison", y = "Dataset: Cell type") +
theme_rhr
# Change names on genes_list_ct to denote correction method
names(genes_list_ct) <- str_c(names(genes_list_ct), "_ct")
# Create master list with genes from both correction methods
master_list <- c(genes_list, genes_list_ct)
upset(fromList(master_list), sets = names(master_list), keep.order = TRUE, nintersects = 30, order.by = "freq")
lc_filtered_genes <- results_covar$significant_clusters_0.05_filter %>%
dplyr::select(comparison, genes) %>%
dplyr::distinct(comparison, genes)
lc_filtered_ct_genes <- results_ct$significant_clusters_0.05_filter %>%
dplyr::select(comparison, genes) %>%
dplyr::distinct(comparison, genes)
lc_filtered_genes %>%
dplyr::group_by(comparison) %>%
dplyr::distinct(genes) %>%
dplyr::summarise(n_significant_spliced_genes = n()) %>%
dplyr::bind_rows(lc_filtered_ct_genes %>%
dplyr::group_by(comparison) %>%
dplyr::distinct(genes) %>%
dplyr::summarise(n_significant_spliced_genes = n()) %>%
dplyr::ungroup() %>%
dplyr::mutate(comparison = str_c(comparison, "_ct"))) %>%
dplyr::mutate(unique_significant_spliced_genes = c(53, 52, 8, 80, 394, 424),
proportion_unique = round(unique_significant_spliced_genes/n_significant_spliced_genes, 2)) %>%
# dplyr::select(-unique_significant_spliced_genes) %>%
arrange(desc(n_significant_spliced_genes))
leafcutter_list <-
setNames(
list(
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
)
),
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
)
)
),
c("covar", "ct")
)
plot_list <- leafcutter_list %>%
lapply(., function(x){
x$significant_clusters_0.05_filter %>%
dplyr::filter(direction_of_effect != "no_change") %>%
dplyr::mutate(xintercept = ifelse(direction_of_effect == "upregulated", 0.1, -0.1)) %>%
dplyr::inner_join(x$significant_clusters_0.05_filter %>%
dplyr::group_by(comparison, direction_of_effect) %>%
dplyr::summarise(median = median(deltapsi))) %>%
ggplot(aes(x = deltapsi)) +
geom_histogram(color = "black", binwidth = 0.01, alpha = 0.5) +
geom_vline(aes(xintercept = median), color="red",
linetype="dashed") +
geom_vline(aes(xintercept = xintercept), color="#00BFC4",
linetype="dashed") +
facet_grid(rows = vars(comparison), cols = vars(direction_of_effect), scales = "free_x") +
coord_cartesian(ylim = c(0,450)) +
theme_rhr
})
ggarrange(plotlist = plot_list,
ncol = 2,
labels = c("a", "b"))
leafcutter_list %>%
lapply(., function(x){
x$significant_clusters_0.05_filter %>%
dplyr::filter(direction_of_effect != "no_change") %>%
dplyr::group_by(comparison, direction_of_effect) %>%
dplyr::summarise(median = median(deltapsi))
})
## $covar
## # A tibble: 6 x 3
## # Groups: comparison [3]
## comparison direction_of_effect median
## <chr> <chr> <dbl>
## 1 Control_vs_PD downregulated -0.0121
## 2 Control_vs_PD upregulated 0.0127
## 3 Control_vs_PDD downregulated -0.0207
## 4 Control_vs_PDD upregulated 0.0159
## 5 PD_vs_PDD downregulated -0.0160
## 6 PD_vs_PDD upregulated 0.0152
##
## $ct
## # A tibble: 6 x 3
## # Groups: comparison [3]
## comparison direction_of_effect median
## <chr> <chr> <dbl>
## 1 Control_vs_PD downregulated -0.0174
## 2 Control_vs_PD upregulated 0.0133
## 3 Control_vs_PDD downregulated -0.0228
## 4 Control_vs_PDD upregulated 0.0206
## 5 PD_vs_PDD downregulated -0.0152
## 6 PD_vs_PDD upregulated 0.0170
# Adding filter of dPSI
stringent <- leafcutter_list %>%
lapply(., function (x){
x$cluster_significance %>%
# filter for successful tests and remove clusters that overlap multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster_id"), sep = ":") %>%
dplyr::inner_join(x$intron_usage %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster_id"), sep = ":")) %>%
dplyr::filter(abs(deltapsi) >= 0.1)
})
# Number of differentially spliced clusters
stringent %>%
lapply(., function(x){
x %>%
dplyr::distinct(comparison, cluster_id, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n())
})
## $covar
## # A tibble: 3 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_PD 57
## 2 Control_vs_PDD 132
## 3 PD_vs_PDD 16
##
## $ct
## # A tibble: 3 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_PD 45
## 2 Control_vs_PDD 278
## 3 PD_vs_PDD 158
# Number of differentially spliced genes
stringent %>%
lapply(., function(x){
x %>%
dplyr::distinct(comparison, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n = n())
})
## $covar
## # A tibble: 3 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_PD 57
## 2 Control_vs_PDD 131
## 3 PD_vs_PDD 16
##
## $ct
## # A tibble: 3 x 2
## comparison n
## <chr> <int>
## 1 Control_vs_PD 45
## 2 Control_vs_PDD 273
## 3 PD_vs_PDD 157
# Load internal specificity matrices
ctd_files <-
list.files(
file.path(
path_to_results,
"snRNA/specificity_matrices/2020_March/"
),
pattern = "ctd",
full.names = T
)
ctd_list <- vector(mode = "list", length = length(ctd_files))
for(i in 1:length(ctd_files)){
# Load ctd
ctd_list[[i]] <- readRDS(ctd_files[i])
# Extract file name
title <-
ctd_files[i] %>%
str_replace(".*/", "") %>%
str_replace("\\..*", "") %>%
str_replace(".*_", "")
# Name list
names(ctd_list)[i] <- title
}
# Load external datasets
load(file.path(markergene_dir, "specificity_matrices/AIBS2018_MTG.rda"))
load(file.path(markergene_dir, "specificity_matrices/Habib2017_DroNc_Human.rda")
# ewce lists
genes_list_ewce <-
setNames(stringent$ct %>%
group_split(comparison),
stringent$ct %>%
.[["comparison"]] %>%
unique() %>%
sort()) %>%
lapply(., function(x){
x %>%
dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>%
dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>%
dplyr::group_by(genes) %>%
# Where duplicate genes occur, select by lowest p-value first
# If still duplicate, select by highest absolute delta PSI
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::top_n(n = 1, wt = abs_deltapsi) %>%
dplyr::ungroup() %>%
dplyr::top_n(n = 100, wt = abs_deltapsi) %>%
.[["genes"]]
})
ewce_dPSI <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list_ewce,
ctd_AIBS2018, ctd_DRONC_human,
ctd_list$Control, ctd_list$PD,
ctd_list$PDD, ctd_list$DLB,
celltypeLevel = 1,
reps = 100000,
genelistSpecies = "human",
sctSpecies = "human")
saveRDS(
ewce_dPSI,
file =
file.path(
path_to_results,
"SRP058181/leafcutter/ewce/ewce_leafcutter_ds_celltype_dPSIfilter.Rds"
)
)
ewce_dPSI <-
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/ewce/ewce_leafcutter_ds_celltype_dPSIfilter.Rds"
)
)
plot <-
ewce_dPSI %>%
dplyr::mutate(FDR.p = p.adjust(p, method="BH"),
Class = case_when(
CellType %in% c("Astrocyte", "ASC", "Astro") ~ "Astrocyte",
CellType %in% c("MG", "Microglia") ~ "Microglia",
CellType %in% c("END", "Endo", "Endothelial cell", "Vascular") ~ "Vascular",
CellType %in% c("ODC", "Oligo", "Oligodendrocyte") ~ "Oligodendrocyte",
CellType %in% c("exCA", "Excitatory", "exDG", "exPFC", "Glutamatergic") ~ "Excitatory \n neuron",
CellType %in% c("GABA", "GABAergic", "Inhibitory") ~ "Inhibitory \n neuron",
CellType == "OPC" ~ "OPC",
CellType == "NSC" ~ "NSC"),
Study = str_replace(Study, "DRONC_human", "DRONC"),
Study = str_replace(Study, "list\\$", ""),
Study = fct_relevel(Study,
c("AIBS2018", "DRONC", "Control", "PD", "PDD", "DLB")),
comparison_type = ifelse(str_detect(GeneSet, "Control"), "Ref: control", "Ref: disease"),
joint_name = str_c(Study, CellType, sep = ":"),
GeneSet = fct_relevel(GeneSet,
c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")),
sd_from_mean = ifelse(FDR.p <= 0.1, sd_from_mean, NA)) %>%
dplyr::arrange(Class, Study) %>%
dplyr::mutate_at(vars(joint_name), list(~ factor(., levels = unique(.)))) %>%
ggplot(aes(x = GeneSet, y = forcats::fct_rev(joint_name))) +
geom_tile(aes(fill = sd_from_mean), colour = "black") +
facet_grid(cols = vars(comparison_type), rows = vars(Class), scales = "free", space = "free_y") +
scale_fill_viridis_c(na.value = "grey") +
labs(x = "", y = "Dataset: Cell type") +
theme_rhr +
theme(panel.grid = element_blank(),
strip.text.y = element_text(size = 8, angle = 0))
plot
compareCluster
: permits comparison of GO enrichments across groups. Uses hypergeometric model to perform enrichment analysis within each group (i.e. does not account for semantic similarity in testing as GProfiler does, which could be a disadvantage)cluster_compare_input <-
stringent$ct %>%
dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>%
dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>%
dplyr::group_by(genes) %>%
# Where duplicate genes occur, select by lowest p-value first
# If still duplicate, select by highest absolute delta PSI
dplyr::top_n(n = 1, wt = -p.adjust) %>%
dplyr::top_n(n = 1, wt = abs_deltapsi)
ont <- c("BP", "CC", "MF")
cluster_compare_list <- setNames(vector(mode = "list", length = length(ont)),
ont)
for(i in 1:length(ont)){
cluster_compare_list[[i]] <-
cluster_compare_input %>%
clusterProfiler::compareCluster(genes ~ comparison,
data=.,
keyType = "SYMBOL",
fun="enrichGO",
ont = ont[i],
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
OrgDb='org.Hs.eg.db')
}
saveRDS(
cluster_compare_list,
file.path(
path_to_results,
"SRP058181/leafcutter/gprofiler/clusterProfiler_leafcutter_ds_celltype_dPSIfilter.Rds"
)
)
cluster_compare <-
readRDS(
file.path(
path_to_results,
"SRP058181/leafcutter/gprofiler/clusterProfiler_leafcutter_ds_celltype_dPSIfilter.Rds"
)
)
cluster_compare %>%
purrr::discard(is.null) %>%
lapply(., function(df){
df %>%
clusterProfiler::dotplot(., showCategory = NULL, font.size = 8) +
geom_point(shape = 1,colour = "black") +
scale_size_continuous(name = "gene ratio") +
scale_colour_viridis_c(direction = -1) +
theme_rhr
})
## $BP
leafcutter_list_own <-
setNames(
list(
readRDS(
file.path(
path_to_results,
"leafcutter/diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.Rds"
)
),
readRDS(
file.path(
path_to_results,
"leafcutter/diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.Rds"
)
)
),
c("covar", "PC")
)
# Adding filter of dPSI
stringent_own <- leafcutter_list_own %>%
lapply(., function (x){
x$cluster_significance %>%
# filter for successful tests and remove clusters that overlap multiple genes
dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
tidyr::separate(cluster, into = c("chr", "cluster_id"), sep = ":") %>%
dplyr::inner_join(x$intron_usage %>%
tidyr::separate(intron, into = c("chr", "start", "end", "cluster_id"), sep = ":")) %>%
dplyr::filter(abs(deltapsi) >= 0.1, comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD"))
})
stringent$ct %>%
dplyr::distinct(comparison, genes) %>%
dplyr::inner_join(stringent_own$PC %>%
dplyr::distinct(comparison, genes)) %>%
dplyr::arrange(comparison, genes)
stringent$ct %>%
dplyr::distinct(comparison, genes) %>%
dplyr::inner_join(stringent_own$PC %>%
dplyr::distinct(comparison, genes)) %>%
dplyr::arrange(comparison, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n_intersect = n()) %>%
dplyr::inner_join(stringent$ct %>%
dplyr::distinct(comparison, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n_SRP058181 = n())) %>%
dplyr::inner_join(stringent_own$PC %>%
dplyr::distinct(comparison, genes) %>%
dplyr::group_by(comparison) %>%
dplyr::summarise(n_own = n())) %>%
dplyr::mutate(n_SRP058181_unique = n_SRP058181 - n_intersect,
n_own_unique = n_own - n_intersect)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] UpSetR_1.4.0 rtracklayer_1.46.0
## [5] readxl_1.3.1 RNAseqProcessing_0.0.0.9000
## [7] forcats_0.5.1 stringr_1.4.0
## [9] dplyr_1.0.2 purrr_0.3.4
## [11] readr_1.4.0 tidyr_1.1.1
## [13] tibble_3.0.3 tidyverse_1.3.0
## [15] ggpubr_0.4.0 ggplot2_3.3.2
## [17] gprofiler2_0.1.9 GeneOverlap_1.22.0
## [19] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [21] DelayedArray_0.12.3 BiocParallel_1.20.1
## [23] matrixStats_0.56.0 Biobase_2.46.0
## [25] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [27] IRanges_2.20.2 S4Vectors_0.24.4
## [29] BiocGenerics_0.32.0 data.table_1.13.0
## [31] corrplot_0.84 clusterProfiler_3.14.3
##
## loaded via a namespace (and not attached):
## [1] utf8_1.1.4 tidyselect_1.1.0 RSQLite_2.2.0
## [4] AnnotationDbi_1.48.0 htmlwidgets_1.5.3 grid_3.6.1
## [7] devtools_2.3.2 munsell_0.5.0 chron_2.3-56
## [10] DT_0.15 withr_2.2.0 colorspace_2.0-0
## [13] GOSemSim_2.17.1 highr_0.8 knitr_1.29
## [16] rstudioapi_0.13 ggsignif_0.6.0 DOSE_3.12.0
## [19] labeling_0.4.2 urltools_1.7.3 GenomeInfoDbData_1.2.2
## [22] polyclip_1.10-0 bit64_4.0.2 farver_2.0.3
## [25] rprojroot_2.0.2 vctrs_0.3.2 generics_0.0.2
## [28] xfun_0.16 BiocFileCache_1.10.2 R6_2.5.0
## [31] graphlayouts_0.7.0 locfit_1.5-9.4 bitops_1.0-6
## [34] cachem_1.0.3 fgsea_1.12.0 gridGraphics_0.5-0
## [37] assertthat_0.2.1 scales_1.1.1 ggraph_2.0.3
## [40] nnet_7.3-12 enrichplot_1.6.1 gtable_0.3.0
## [43] processx_3.4.5 tidygraph_1.2.0 rlang_0.4.7
## [46] genefilter_1.68.0 splines_3.6.1 rstatix_0.6.0
## [49] lazyeval_0.2.2 broom_0.7.0 europepmc_0.4
## [52] checkmate_2.0.0 BiocManager_1.30.10 yaml_2.2.1
## [55] reshape2_1.4.4 abind_1.4-5 modelr_0.1.8
## [58] crosstalk_1.1.0.1 backports_1.1.8 qvalue_2.18.0
## [61] Hmisc_4.4-1 usethis_1.6.3 tools_3.6.1
## [64] bookdown_0.21 qdapTools_1.3.5 ggplotify_0.0.5
## [67] ellipsis_0.3.1 gplots_3.0.4 RColorBrewer_1.1-2
## [70] ggdendro_0.1.21 sessioninfo_1.1.1 ggridges_0.5.2
## [73] Rcpp_1.0.5 plyr_1.8.6 base64enc_0.1-3
## [76] progress_1.2.2 zlibbioc_1.32.0 RCurl_1.98-1.2
## [79] ps_1.3.4 prettyunits_1.1.1 openssl_1.4.2
## [82] rpart_4.1-15 viridis_0.5.1 cowplot_1.0.0
## [85] haven_2.3.1 ggrepel_0.8.2 cluster_2.1.0
## [88] here_1.0.0 fs_1.5.0 magrittr_2.0.1
## [91] DO.db_2.9 openxlsx_4.2.3 triebeard_0.3.0
## [94] reprex_1.0.0 pkgload_1.1.0 hms_1.0.0
## [97] evaluate_0.14 xtable_1.8-4 XML_3.99-0.3
## [100] rio_0.5.16 jpeg_0.1-8.1 gridExtra_2.3
## [103] biomaRt_2.42.1 testthat_2.3.2 compiler_3.6.1
## [106] KernSmooth_2.23-15 crayon_1.4.1 htmltools_0.5.1.1
## [109] Formula_1.2-4 geneplotter_1.64.0 lubridate_1.7.9
## [112] DBI_1.1.1 tweenr_1.0.1 dbplyr_1.4.4
## [115] rappdirs_0.3.1 MASS_7.3-51.4 Matrix_1.2-17
## [118] car_3.0-9 cli_2.2.0.9000 gdata_2.18.0
## [121] igraph_1.2.5 pkgconfig_2.0.3 rvcheck_0.1.8
## [124] GenomicAlignments_1.22.1 foreign_0.8-72 plotly_4.9.2.1
## [127] xml2_1.3.2 annotate_1.64.0 XVector_0.26.0
## [130] rvest_0.3.6 callr_3.5.1 digest_0.6.27
## [133] Biostrings_2.54.0 HGNChelper_0.8.1 rmarkdown_2.5
## [136] cellranger_1.1.0 fastmatch_1.1-0 htmlTable_2.0.1
## [139] curl_4.3 Rsamtools_2.2.3 gtools_3.8.2
## [142] lifecycle_0.2.0 jsonlite_1.7.1 carData_3.0-4
## [145] fansi_0.4.2 limma_3.42.2 askpass_1.1
## [148] desc_1.2.0 viridisLite_0.3.0 pillar_1.4.6
## [151] lattice_0.20-38 pkgbuild_1.1.0 fastmap_1.0.1
## [154] httr_1.4.2 survival_3.2-7 GO.db_3.10.0
## [157] remotes_2.2.0 glue_1.4.2 zip_2.1.0
## [160] png_0.1-7 bit_4.0.4 ggforce_0.3.2
## [163] stringi_1.5.3 blob_1.2.1 latticeExtra_0.6-29
## [166] caTools_1.18.0 memoise_2.0.0