Aims: to (i) quantify intron excision ratios across samples; (ii) perform differential splicing analyses across disease groups; and (iii) replicate findings from own study with those from this study, SRP058181.

1 File paths for workflow

# File paths
source(here::here("R", "file_paths.R"))
leafviz_dir <- "/home/rreynolds/packages/leafcutter/leafviz/"
markergene_dir <- "/home/rreynolds/projects/MarkerGenes/"

# Functions
source(here::here("R", "biomart_df.R"))

# Files
load(file.path(path_to_recount, "counts/rse_gene.Rdata"))
sample_info <- 
  readRDS(
    file.path(
      path_to_results,
      "SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
      )
    ) %>% 
  colData() %>% 
  as_tibble()

2 Running the code

As the primary aim was validation, Leafcutter was performed using the same parameters as those used in analysis of own samples.

2.1 Generating .junc input files

2.1.1 Convert recount junction files to Leafcutter-friendly .junc files

For each sample, need .junc file with columns: chr, intron_start, intron_end, unknown, unique_reads_junction, strand
Removed sample C0061 from analysis due to uncertainty re. sex.

load(file.path(path_to_recount, "junctions/rse_jx.Rdata"))
ENCODE_blacklist_hg38 <- rtracklayer::import(file.path(path_to_raw_data, "misc/hg38-blacklist.v2.bed"))

recount_ids <- rse_jx %>% 
  assay() %>% 
  colnames()

# Remove SRR2015746 (C0061), due to uncertainty re. sex
recount_ids <- recount_ids[!str_detect(recount_ids,"SRR2015746")] 

junction_counts <- rse_jx %>% 
  # Need to extract row names, as they are unnamed in count.tsv and assay(rse_jx)
  rowRanges() %>% 
  as_tibble() %>% 
  dplyr::select(seqnames, start, end, strand, junction_id) %>% 
  # Bind columns from assay(rse_jx)
  bind_cols(rse_jx %>% 
              assay() %>% 
              as_tibble()) %>% 
  # Remove unwanted chromosomes builds
  dplyr::filter(seqnames %in% c(str_c("chr", 1:22), "chrX", "chrY", "chrM")) %>% 
  dplyr::mutate(unknown = ".")

# Remove junctions that overlap with ENCODE blacklist regions
overlapped_junctions <- GenomicRanges::findOverlaps(query = ENCODE_blacklist_hg38,
                                                    subject = junction_counts %>% 
                                                      dplyr::select(seqnames, start, end, strand) %>% 
                                                      GenomicRanges::makeGRangesFromDataFrame(),
                                                    ignore.strand = F)

indexes <- subjectHits(overlapped_junctions)
junction_counts <- junction_counts[-indexes, ] %>%
  dplyr::mutate(seqnames = str_replace(seqnames, "chr", ""))
  
output_path <- file.path(path_to_results, "SRP058181/leafcutter/SJ_formatting/")

for(i in 1:length(recount_ids)){
  
  id <- recount_ids[i] 
  
  print(str_c(i, ": ", id))
  
  filtered <- junction_counts %>% 
    dplyr::select(seqnames, start, end, unknown, one_of(id), strand) 
  
  colnames(filtered) <- ifelse(colnames(filtered) == id, "samp_id", colnames(filtered))
  
  filtered <- 
    filtered %>% 
    dplyr::filter(samp_id !=0)
  
  write_delim(filtered %>% 
                mutate(start = as.character(as.integer(start)), 
                       end = as.character(as.integer(end)), 
                       samp_id = as.character(as.integer(samp_id))),
              str_c(output_path, id, "_SJ_leafcutter.junc"), 
              delim = "\t", 
              col_names = F)
  
  
}
  
# write a .txt file with each .junc file
junc_df <- tibble(junc_file_name = list.files(path = output_path, pattern = "_SJ_leafcutter.junc", full.names = TRUE))

write_delim(junc_df,
            path = str_c(output_path, "/list_juncfiles.txt"),
            delim = "\t",
            col_names = F)

2.2 Cluster introns

Here, the leafcutter script has various inputs:
1. -j - text file with all junction files to be processed
2. -r - write to directory (default ./)
3. -o - output prefix (default leafcutter)
4. -l - maximum intron length in bp (default 100,000bp). Set to 1,000,000, which is what was used in the STAR alignment.
5. -m - minimum reads in a cluster (default 30 reads across all samples). Kept this at the default value. In theory, this would be the equivalent of detecting 1-2 reads for the same cluster across each of the 24 samples. Thus, it should still be possible to detect clusters with low read counts.
6. -p - minimum fraction of reads in a cluster that support a junction (default 0.001)
7. -s - use strand info (default=False)

python /tools/leafcutter/clustering/leafcutter_cluster.py \
-j /home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/SJ_formatting/list_juncfiles.txt \
-r /home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/ \
-o test_diseasegroups \
-l 1000000 \ 
-m 30 \
-p 0.001 \
-s True

2.3 Differential splicing analysis - corrected for AoD and RIN

Here leafcutter allows an input of an exon file generated from a .gtf, which annotates clusters according to which gene they correspond to. The code for generating this file is below:

Rscript /tools/leafcutter/scripts/gtf_to_exons.R \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz

Another input is a splitting the samples by groups. Within this file, it is also possible to include confounders. In this analysis, RIN and age of death were added as potential confounders (sex is irrelevant as all of the subjects were male). File was generated using code below:

# Load per individual intron counts for column names
sample_names <- 
  fread(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz"
    )
  ) %>% 
  dplyr::select(-V1) %>% 
  colnames()

# Create master df of sample info, with grouping variable (Disease_Group) and confounders (RIN and AoD)
master <- 
  tibble(lc_sample_name = sample_names,
         sample_name = sample_names %>% 
           str_replace("_.*", "")) %>% 
  inner_join(sample_info, by = c("sample_name" = "recount_id")) %>% 
  dplyr::select(lc_sample_name, Disease_Group, RIN, Age_at_death) %>% 
  arrange(Disease_Group)

RNAseqProcessing::create_group_files_multi_pairwisecomp(
  df = master, 
  group_column_name = "Disease_Group", 
  output_path = file.path(path_to_results, "SRP058181/leafcutter/diff_splicing/group_files/")
)

Currently, only pairwise comparisons are possible, thus will have to run the various combinations of pairwise comparison, and perform FDR correction once all results are compiled.
The leafcutter differential splicing script parameters:
- -o - The prefix for the two output files
- -s - Don’t test clusters with more introns than this [default = Inf]
- -i - Ignore introns used (i.e. at least one supporting read) in fewer than n samples [default = 5]
- -g - Require this many samples in each group to have at least min_coverage reads [default = 3]
- -c - Require min_samples_per_group samples in each group to have at least this many reads [default = 20]
- -t - Maximum time (in seconds) allowed for a single optimization run [default = 30]
- -p - Number of threads to use [default = 1]
- -e - File defining known exons, example in data/gencode19_exons.txt.gz. Columns should be chr, start, end, strand, gene_name. Optional, only just to label the clusters.[default = NULL]

nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/SRP058181_leafcutter_ds.log&

2.4 Differential splicing analysis - corrected for AoD, PMI, RIN and cell-type proportions

The only thing that requires changing here is the confounders used to create the group files, in addition to the location of the group files (so as not overwrite the previously created group files).

# Load per individual intron counts for column names
sample_names <- 
  fread(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz"
    )
  ) %>% 
  dplyr::select(-V1) %>% 
  colnames()

# Create master df of sample info, with grouping variable (Disease_Group) and confounders (RIN, AoD, PMI, cell-type proportions)
master <- 
  tibble(lc_sample_name = sample_names,
         sample_name = sample_names %>% 
           str_replace("_.*", "")) %>% 
  inner_join(sample_info, by = c("sample_name" = "recount_id")) %>% 
  dplyr::select(lc_sample_name, Disease_Group, RIN, Age_at_death, PMI, contains("_proportion")) %>% 
  arrange(Disease_Group)

RNAseqProcessing::create_group_files_multi_pairwisecomp(
  df = master, 
  group_column_name = "Disease_Group", 
  output_path = file.path(path_to_results, "SRP058181/leafcutter/diff_splicing_celltype/group_files/")
)

Likewise, need to change the paths given in the nohup command.

nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafcutter_ds_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing_celltype/group_files/ \
--output_prefix=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing_celltype/ \
--max_cluster_size=Inf \
--min_samples_per_intron=5 \
--min_samples_per_group=3 \
--min_coverage=20 \
--timeout=30 \
--num_threads=15 \
--exon_file=/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97_LC_exon_file.txt.gz \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/SRP058181_leafcutter_ds_celltype.log&

2.5 Leafviz

2.5.1 Generate the annotation files

Processes a given GTF file to produce exons, introns and splice sites.

/tools/leafcutter/leafviz/gtf2leafcutter.pl \
-o /data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
/data/references/ensembl/gtf_gff3/v97/Homo_sapiens.GRCh38.97.gtf.gz

2.5.2 Prepare results for use with Leafviz

For the shiny application, Leafviz, to run with the results of a differential splicing analysis, this requires that the results of the differential splicing have been formatted for use.
LeafCutter provides a script, prepare_results.R, which performs this formatting, albeit for only one pairwise comparison. To format the results of multiple pairwise comparisons requires looping across the various pairwise comparisons and running the prepare_results.R for each individual pairwise comparison. This is what the leafviz_multi_pairwise.R script used below does.
Note: Using the leafviz_multi_pairwise.R script assumes use of the leafcutter_ds_multi_pairwise.R script and the create_group_files_multi_pairwisecomp() function. This ensures that all files needed are named consistently. That is, the (i) _cluster_signficance.txt, (ii) _effect_sizes.txt and (iii) _group_file.txt for an individual pairwise comparison all have the same prefix.

nohup Rscript /home/rreynolds/packages/RNAseqProcessing/analysis/leafviz_multi_pairwise.R \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/intron_clustering/test_diseasegroups_perind_numers.counts.gz \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/ \
/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/ \
/data/references/ensembl/gtf_gff3/v97/leafcutter/Homo_sapiens.GRCh38.97 \
--output_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/leafviz/ \
--group_file_dir=/home/rreynolds/projects/Aim2_PDsequencing_wd/results/SRP058181/leafcutter/diff_splicing/group_files/ \
--FDR=0.05 \
&>/home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/nohup_logs/SRP058181_leafviz_prepare_results.log&

2.5.3 Running Leafviz

The function has to be run separately for each of the pairwise comparisons.

RNAseqProcessing::run_leafviz(
  leafviz_dir = leafviz_dir, 
  results_filename = file.path(path_to_results, "SRP058181/leafcutter/leafviz/Control_vs_PD.Rda")
)

3 Results - covariate correction

3.1 Format

LeafCutter outputs 2 result files. The results come tested cluster by cluster rather than intron by intron. The two files list:
1. The significance of a cluster i.e. whether there was differential intron excision between the two tested groups. Columns include:
  1. cluster: the cluster id
  2. Status: whether this cluster was a) successfully tested b) not tested for some reason (e.g. too many introns) c) there was an error during testing - this should be rare.
  3. loglr: log likelihood ratio between the null model (no difference between the groups) and alternative (there is a difference)
  4. df: degrees of freedom, equal to the number of introns in the cluster minus one (assuming two groups)
  5. p: the resulting (unadjusted!) p-value under the asymptotic Chi-squared distribution. We just use p.adjust( ..., method="fdr") in R to control FDR based on these.
2. The effect sizes for every intron. Columns include:
  1. intron: this has the form chromosome:intron_start:intron_end:cluster_id
  2. log effect size (as fitted by LeafCutter).
  3. Fitted usage proportion in condition 1.
  4. Fitted usage proportion in condition 2.
  5. DeltaPSI: the difference in usage proprotion (condition 2 - condition 1). Note that in general this will have the same sign as the log effect size but in some cases the sign may change as a result of larger changes for other introns in the cluster.
As multiple pairwise comparisons were made, multiple test correction should reflect this. Thus, FDR correction was applied across all comparisons made.

clusters <- 
  list.files(
    file.path(path_to_results, "SRP058181/leafcutter/diff_splicing/"),
    pattern = "cluster_", 
    recursive = T, 
    full.names = T
  )
intron <- 
  list.files(
    file.path(path_to_results, "SRP058181/leafcutter/diff_splicing/"), 
    pattern = "effect_", 
    recursive = T, 
    full.names = T
  ) 

for(i in 1:length(clusters)){
  
  comparison <- clusters[i] %>% 
                    str_replace("/.*/", "") %>% 
                    str_replace("_cluster_significance.txt", "")
  
  lc <- read_delim(clusters[i], delim = "\t") %>% 
    
    dplyr::mutate(comparison = comparison) %>% 
    dplyr::select(comparison, everything())
  
  if(i == 1){
    
    lc_all <- lc

  } else{
    
    lc_all <- lc_all %>% 
      bind_rows(lc)
    
  }
  
}

for(i in 1:length(intron)){
  
  comparison <- intron[i] %>% 
                    str_replace("/.*/", "") %>% 
                    str_replace("_effect_sizes.txt", "")
  
  int <- read_delim(intron[i], delim = "\t") %>% 
    dplyr::mutate(comparison = comparison) %>% 
    dplyr::select(comparison, everything())
  
  if(i == 1){
    
    int_all <- int
    
  } else{
    
    int_all <- int_all %>% 
      bind_rows(int)
    
  }
  
}

# Filter for significant clusters
# Add direction of effect
lc_filtered <- lc_all %>% 
  # filter for successful tests and remove clusters overlapping multiple genes
  dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
  tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>% 
  dplyr::inner_join(int_all %>% 
                      tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>% 
  dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
                                                deltapsi < 0 ~ "downregulated",
                                                deltapsi == 0 ~ "no_change"))


# Save results
write_delim(
  lc_filtered,
  path = 
    file.path(
      path_to_results, 
      "SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.txt"
      ),
  delim = "\t"
)
saveRDS(
  setNames(list(lc_all, int_all, lc_filtered),
           c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
  file.path(
    path_to_results,
    "SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
  )
)

3.2 Summary of results

results_covar <- 
  readRDS(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
    )
  ) 

# Summary of differential splicing run
summary <- results_covar$cluster_significance %>% 
  dplyr::group_by(comparison) %>% 
  dplyr::summarise(n = n()) %>%
  tidyr::spread(key = comparison, value = n) %>% 
  dplyr::mutate(status = "Total clusters across all samples (read >= 30)") %>% 
  dplyr::select(status, everything()) %>% 
  dplyr::bind_rows(results_covar$cluster_significance %>% 
                     dplyr::group_by(comparison, status) %>% 
                     dplyr::summarise(n = n()) %>% 
                     tidyr::spread(key = comparison, value = n)) %>% 
  dplyr::bind_rows(results_covar$significant_clusters_0.05_filter %>% 
                     dplyr::group_by(comparison) %>% 
                     summarise(n = n()) %>% 
                     tidyr::spread(key = comparison, value = n) %>% 
                     dplyr::mutate(status = "Differential intron excision, p.adjust < 0.05") %>% 
                     dplyr::select(status, everything())) %>% 
  dplyr::mutate(status = str_replace(status, "Success", "Successfully tested"))

# Add propotions
summary <- 
  summary %>% 
  dplyr::bind_rows(
    (((summary %>% 
         dplyr::filter(status == "Successfully tested") %>% 
         dplyr::select(-status))/(summary %>% 
                                    dplyr::filter(status == "Total clusters across all samples (read >= 30)") %>% 
                                    dplyr::select(-status))) *100) %>% 
      dplyr::mutate(status = "Successfully tested/Total clusters (%)")
  ) %>% 
  dplyr::bind_rows(
    (((summary %>% 
         dplyr::filter(status == "Differential intron excision, p.adjust < 0.05") %>% 
         dplyr::select(-status))/(summary %>% 
                                    dplyr::filter(status == "Successfully tested") %>% 
                                    dplyr::select(-status))) *100) %>% 
      dplyr::mutate(status = "Differentially spliced/Successfully tested (%)")
  )

summary %>% 
  DT::datatable(rownames = FALSE,
                options = list(scrollX = TRUE),
                class = 'white-space: nowrap')

results_covar$intron_usage %>% 
  tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>% 
  dplyr::group_by(comparison, cluster_id) %>% 
  dplyr::summarise(n_intron = n()) %>% 
  dplyr::inner_join(results_covar$intron_usage %>% 
                      tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>% 
                      dplyr::group_by(comparison, cluster_id) %>% 
                      dplyr::summarise(n_intron = n()) %>% 
                      dplyr::group_by(comparison) %>% 
                      dplyr::summarise(median = median(n_intron))) %>% 
  ggplot(aes(x = n_intron)) + 
  stat_count(color = "black", alpha = 0.5) +
  geom_vline(aes(xintercept=median), color="red",
             linetype="dashed") +
  facet_wrap(vars(comparison)) + 
  coord_cartesian(xlim = c(0,30)) + 
  theme_rhr

Histogram of intron numbers per cluster in various comparisons. Dashed red line indicates median intron number within each comparison.

Figure 3.1: Histogram of intron numbers per cluster in various comparisons. Dashed red line indicates median intron number within each comparison.

3.3 Pathway enrichment analysis

Using a .gtf file, intron clusters can be annotated to genes. Thus, differential excision events can be run through g:Profiler using the genes to which the these events were annotated.
Prior to running this analysis, worth knowing just how many differentially spliced genes overlap between the pairwise comparisons and likewise how many can be considered "unique" to a pairwise comparison.

# Run once
all_genes <- results_covar$cluster_significance %>% 
  # remove clusters that overlap multiple genes
  dplyr::filter(!str_detect(genes, ",")) %>% 
  .[["genes"]] %>% 
  unique() 

genes_list <- setNames(results_covar$significant_clusters_0.05_filter %>% 
                           group_split(comparison),
                         results_covar$significant_clusters_0.05_filter %>% 
                           .[["comparison"]] %>% 
                           unique() %>% 
                           sort()) %>% 
  # For each dataframe, extract the gene column and remove duplicate genes
  lapply(., function(x){x %>% .[["genes"]] %>% unique()})

upset(fromList(genes_list), sets = names(genes_list), keep.order = TRUE, nintersects = 25, order.by = "freq")

Overlap between pairwise comparisons. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

Figure 3.2: Overlap between pairwise comparisons. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

gom.self <- newGOM(genes_list, genes_list, genome.size = length(all_genes))
All.jaccard <- getMatrix(gom.self, "Jaccard")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(All.jaccard, method="color", col=col(100), cl.lim = c(0,1), 
          type="lower", 
          # addCoef.col = "black", number.cex = 0.75, # Add coefficient of correlation
          tl.col="black", tl.srt=45, tl.cex = 0.75, mar=c(0,0,1,0), #Text label color and rotation
          p.mat = All.jaccard, sig.level = (0.05), insig = "p-value", number.cex = 0.75,  
          diag = TRUE)

Jaccard index between diffentially spliced genes identified in pairwise comparisons. Calculated by dividing the size of the intersection between two gene lists by the union of the two. Intersections with a Jaccard index > 0.05 (i.e. greater than 5% overlap) are highlighted within the plot, with Jaccard indices provided.

Figure 3.3: Jaccard index between diffentially spliced genes identified in pairwise comparisons. Calculated by dividing the size of the intersection between two gene lists by the union of the two. Intersections with a Jaccard index > 0.05 (i.e. greater than 5% overlap) are highlighted within the plot, with Jaccard indices provided.

# Run once
gprofiler_results_list <- lapply(genes_list, function(x){
  x %>% 
    gost(., organism = "hsapiens",
       correction_method= "gSCS", significant = TRUE,
       user_threshold = 0.05,
       custom_bg= all_genes,
       sources = c("GO", "REAC", "KEGG", "OMIM"),
       evcodes = TRUE)})

saveRDS(
  gprofiler_results_list, 
  file.path(
    path_to_results,
    "SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_RINAoD.Rds"
    )
  )

Some filtering of results occurred to remove broad terms i.e. only term_sizes >20 and <= 2000 are included in table below. Same threshold set during pathway analysis for differential gene expression.
Background list used includes all genes tested in the analysis amounting to: 12782 genes

gprofiler_results_list <- 
  readRDS(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_RINAoD.Rds"
    )
  )

# Added some filtering to remove broad terms
gprofiler_df <- gprofiler_results_list %>% 
  lapply(., function(x){
    x$result %>% 
      as_tibble()
  }) %>% 
  qdapTools::list_df2df(., col = "comparison") %>% 
  dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>% 
  dplyr::filter(term_size > 20 & term_size <= 2000) %>%
  dplyr::arrange(comparison, source, p_value)

gprofiler_df %>%
  dplyr::group_by(comparison) %>%
  dplyr::summarise(n_enriched_pathways = n())

gprofiler_df %>% 
  ggplot(aes(x = comparison, 
             y = fct_rev(term_name), 
             size = precision, 
             fill = p_value)) +
  geom_point(pch = 21) +    
  scale_fill_viridis_c(direction = -1) +
  labs(x = "Comparison", y = "Dataset: Cell type") +
  theme_rhr

Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

Figure 3.4: Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

4 Results - correction with AoD, PMI, RIN, and cell-type proportions

4.1 Format

clusters <- 
  list.files(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/diff_splicing_celltype/"
    ), 
    pattern = "cluster_", 
    recursive = T, 
    full.names = T
    )
intron <- 
  list.files(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/diff_splicing_celltype/"
    ), 
    pattern = "effect_", 
    recursive = T, 
    full.names = T
    ) 

for(i in 1:length(clusters)){
  
  comparison <- clusters[i] %>% 
                    str_replace("/.*/", "") %>% 
                    str_replace("_cluster_significance.txt", "")
  
  lc <- read_delim(clusters[i], delim = "\t") %>% 
    
    dplyr::mutate(comparison = comparison) %>% 
    dplyr::select(comparison, everything())
  
  if(i == 1){
    
    lc_all <- lc

  } else{
    
    lc_all <- lc_all %>% 
      bind_rows(lc)
    
  }
  
}

for(i in 1:length(intron)){
  
  comparison <- intron[i] %>% 
                    str_replace("/.*/", "") %>% 
                    str_replace("_effect_sizes.txt", "")
  
  int <- read_delim(intron[i], delim = "\t") %>% 
    dplyr::mutate(comparison = comparison) %>% 
    dplyr::select(comparison, everything())
  
  if(i == 1){
    
    int_all <- int
    
  } else{
    
    int_all <- int_all %>% 
      bind_rows(int)
    
  }
  
}

# Filter for significant clusters
# Add direction of effect
lc_filtered <- lc_all %>% 
  # filter for successful tests and remove clusters overlapping multiple genes
  dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
  tidyr::separate(cluster, into = c("chr", "cluster"), sep = ":") %>% 
  dplyr::inner_join(int_all %>% 
                      tidyr::separate(intron, into = c("chr", "start", "end", "cluster"), sep = ":")) %>% 
  dplyr::mutate(direction_of_effect = case_when(deltapsi > 0 ~ "upregulated",
                                                deltapsi < 0 ~ "downregulated",
                                                deltapsi == 0 ~ "no_change"))


# Save results
write_delim(
  lc_filtered,
  path = 
    file.path(
      path_to_results, 
      "SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.txt"
    ),
  delim = "\t"
)
saveRDS(
  setNames(list(lc_all, int_all, lc_filtered),
           c("cluster_significance", "intron_usage", "significant_clusters_0.05_filter")),
  file.path(
    path_to_results,
    "SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
  )
)

4.2 Summary of results

results_ct <- readRDS(
  file.path(
    path_to_results,
    "SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
  )
) 

# Summary of differential splicing run
summary <- results_ct$cluster_significance %>% 
  dplyr::group_by(comparison) %>% 
  dplyr::summarise(n = n()) %>%
  tidyr::spread(key = comparison, value = n) %>% 
  dplyr::mutate(status = "Total clusters across all samples (read >= 30)") %>% 
  dplyr::select(status, everything()) %>% 
  dplyr::bind_rows(results_ct$cluster_significance %>% 
                     dplyr::group_by(comparison, status) %>% 
                     dplyr::summarise(n = n()) %>% 
                     tidyr::spread(key = comparison, value = n)) %>% 
  dplyr::bind_rows(results_ct$significant_clusters_0.05_filter %>% 
                     dplyr::group_by(comparison) %>% 
                     summarise(n = n()) %>% 
                     tidyr::spread(key = comparison, value = n) %>% 
                     dplyr::mutate(status = "Differential intron excision, p.adjust < 0.05") %>% 
                     dplyr::select(status, everything())) %>% 
  dplyr::mutate(status = str_replace(status, "Success", "Successfully tested"))

# Add propotions
summary <- summary %>% 
  dplyr::bind_rows((((summary %>% 
                       dplyr::filter(status == "Successfully tested") %>% 
                       dplyr::select(-status))/(summary %>% 
                                                  dplyr::filter(status == "Total clusters across all samples (read >= 30)") %>% 
                                                  dplyr::select(-status))) *100) %>% 
                     dplyr::mutate(status = "Successfully tested/Total clusters (%)")) %>% 
  dplyr::bind_rows((((summary %>% 
                       dplyr::filter(status == "Differential intron excision, p.adjust < 0.05") %>% 
                       dplyr::select(-status))/(summary %>% 
                                                  dplyr::filter(status == "Successfully tested") %>% 
                                                  dplyr::select(-status))) *100) %>% 
                     dplyr::mutate(status = "Differentially spliced/Successfully tested (%)"))

summary %>% 
  DT::datatable(rownames = FALSE,
                options = list(scrollX = TRUE),
                class = 'white-space: nowrap')

results_ct$intron_usage %>% 
  tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>% 
  dplyr::group_by(comparison, cluster_id) %>% 
  dplyr::summarise(n_intron = n()) %>% 
  dplyr::inner_join(results_ct$intron_usage %>% 
                      tidyr::separate(col = intron, into = c("chr", "start", "end", "cluster_id"), sep = ":") %>% 
                      dplyr::group_by(comparison, cluster_id) %>% 
                      dplyr::summarise(n_intron = n()) %>% 
                      dplyr::group_by(comparison) %>% 
                      dplyr::summarise(median = median(n_intron))) %>% 
  ggplot(aes(x = n_intron)) + 
  stat_count(color = "black", alpha = 0.5) +
  geom_vline(aes(xintercept=median), color="red",
             linetype="dashed") +
  facet_wrap(vars(comparison)) + 
  coord_cartesian(xlim = c(0,30)) + 
  theme_rhr

Figure 4.1: Histogram of intron numbers per cluster in various comparisons. Dashed red line indicates median intron number within each comparison.

4.3 Pathway enrichment analysis

# Run once
all_genes <- results_ct$cluster_significance %>% 
  # remove clusters that overlap multiple genes
  dplyr::filter(!str_detect(genes, ",")) %>% 
  .[["genes"]] %>% 
  unique() 

genes_list_ct <- setNames(results_ct$significant_clusters_0.05_filter %>% 
                           group_split(comparison),
                         results_ct$significant_clusters_0.05_filter %>% 
                           .[["comparison"]] %>% 
                           unique() %>% 
                           sort()) %>% 
  # For each dataframe, extract the gene column and remove duplicate genes
  lapply(., function(x){x %>% .[["genes"]] %>% unique()})

upset(fromList(genes_list_ct), sets = names(genes_list_ct), keep.order = TRUE, nintersects = 25, order.by = "freq")

Figure 4.2: Overlap between pairwise comparisons. In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

# Run once
gprofiler_results_list_ct <- lapply(genes_list_ct, function(x){
  x %>% 
    gost(., organism = "hsapiens",
       correction_method= "gSCS", significant = TRUE,
       user_threshold = 0.05,
       custom_bg= all_genes,
       sources = c("GO", "REAC", "KEGG", "OMIM"),
       evcodes = TRUE)})

saveRDS(
  gprofiler_results_list_ct, 
  file.path(
    path_to_results,
    "SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_celltype.Rds"
  )
)

Some filtering of results occurred to remove broad terms i.e. only term_sizes >20 and <= 2000 are included in table below. Same threshold set during pathway analysis for differential gene expression.
Background list used includes all genes tested in the analysis amounting to: 12782 genes

gprofiler_results_list_ct <- 
  readRDS(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/gprofiler/gprofiler_leafcutter_ds_celltype.Rds"
    )
  )

# Added some filtering to remove broad terms
gprofiler_df <- gprofiler_results_list_ct %>% 
  lapply(., function(x){
    x$result %>% 
      as_tibble()
  }) %>% 
  qdapTools::list_df2df(., col = "comparison") %>% 
  dplyr::select(comparison, term_name, source, everything(), -query, -significant) %>% 
  dplyr::filter(term_size > 20 & term_size <= 2000) %>%
  dplyr::arrange(comparison, source, p_value)

gprofiler_df %>%
  dplyr::group_by(comparison) %>%
  dplyr::summarise(n_enriched_pathways = n())

gprofiler_df %>% 
  ggplot(aes(x = comparison, 
             y = fct_rev(term_name), 
             size = precision, 
             fill = p_value)) +
  geom_point(pch = 21) +    
  scale_fill_viridis_c(direction = -1) +
  labs(x = "Comparison", y = "Dataset: Cell type") +
  theme_rhr

Figure 4.3: Gene ontology enrichments (GO, KEGG, REACTOME) of genes across each comparison. Size of dot indicates precision, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

In spite of identifying more differential intron excision events and differentially spliced genes, fewer pathways found to be enriched.
Also, very non-specific terms identified, even in spite of filtering for term size.
Notably, there is some overlap to gprofiler terms we saw highlighted in cell-type-corrected leafcutter analyses of our own data (albeit not in the same comparisons), including:
- Plasma membrane bounded cell project: highlighted for Control_vs_PD in our own data and in Control_vs_PDD in this recount dataset
- Regulation of cellular component organisation: highlighted for Control_vs_DLB and PD_vs_DLB in our own data and Control_vs_PDD in this recount dataset

5 Overlap between correction methods

For all comparisons corrected with cell type (denoted with "_ct" after the comparison) the number of differentially spliced genes has increased (significantly so in some cases e.g. Control_vs_PDD).
Proportion of unique genes is relatively similar between comparisons (i.e. low), except for cell-type-corrected comparisons of Control_vs_PDD and PD_vs_PDD, which makes sense given these are the lists that see the biggest increase in differentially spliced genes.

# Change names on genes_list_ct to denote correction method
names(genes_list_ct) <- str_c(names(genes_list_ct), "_ct")
# Create master list with genes from both correction methods
master_list <- c(genes_list, genes_list_ct)

upset(fromList(master_list), sets = names(master_list), keep.order = TRUE, nintersects = 30, order.by = "freq")

Overlap between pairwise comparisons from covariate-corrected and cell-type-corrected data (latter denoted with '_ct' following comparison name). In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

Figure 5.1: Overlap between pairwise comparisons from covariate-corrected and cell-type-corrected data (latter denoted with '_ct' following comparison name). In the matrix (lower half of panel), rows represent the pairwise comparisons and the columns represent their intersections, with a single black filled circle representing those genes that were not found to be part of an intersection, while black filled circles connected by a vertical line represent genes that intersect across panels. The size of each intersection is shown as a bar chart above the matrix (upper half of panel), while the size of each gene set is shown to the left of the matrix.

lc_filtered_genes <- results_covar$significant_clusters_0.05_filter %>%
  dplyr::select(comparison, genes) %>%
  dplyr::distinct(comparison, genes)

lc_filtered_ct_genes <- results_ct$significant_clusters_0.05_filter %>%
  dplyr::select(comparison, genes) %>%
  dplyr::distinct(comparison, genes) 
  
lc_filtered_genes %>% 
  dplyr::group_by(comparison) %>%
  dplyr::distinct(genes) %>% 
  dplyr::summarise(n_significant_spliced_genes = n()) %>% 
  dplyr::bind_rows(lc_filtered_ct_genes %>%
                     dplyr::group_by(comparison) %>% 
                     dplyr::distinct(genes) %>% 
                     dplyr::summarise(n_significant_spliced_genes = n()) %>% 
                     dplyr::ungroup() %>% 
                     dplyr::mutate(comparison = str_c(comparison, "_ct"))) %>% 
  dplyr::mutate(unique_significant_spliced_genes = c(53, 52, 8, 80, 394, 424),
                proportion_unique = round(unique_significant_spliced_genes/n_significant_spliced_genes, 2)) %>% 
  # dplyr::select(-unique_significant_spliced_genes) %>% 
  arrange(desc(n_significant_spliced_genes))

6 Filtering by |dPSI| >= 0.1

6.1 Filtering

As with own data, will run using a filter for those events that are less likely to be false positives.
Prior to running this, it is worthwhile knowing the distribution of dPSI, and what effect applying the filter of |dPSI| >= 0.1 might have.

leafcutter_list <- 
  setNames(
    list(
      readRDS(
        file.path(
          path_to_results, 
          "SRP058181/leafcutter/diff_splicing/allcomparisons_leafcutter_ds_RINAoD.Rds"
        )
      ), 
      readRDS(
        file.path(
          path_to_results,
          "SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"
        )
      )
    ),
    c("covar", "ct")
  )

plot_list <- leafcutter_list %>% 
  lapply(., function(x){
    
    x$significant_clusters_0.05_filter %>% 
      dplyr::filter(direction_of_effect != "no_change") %>% 
      dplyr::mutate(xintercept = ifelse(direction_of_effect == "upregulated", 0.1, -0.1)) %>% 
      dplyr::inner_join(x$significant_clusters_0.05_filter %>% 
                          dplyr::group_by(comparison, direction_of_effect) %>% 
                          dplyr::summarise(median = median(deltapsi))) %>% 
      ggplot(aes(x = deltapsi)) + 
      geom_histogram(color = "black", binwidth = 0.01, alpha = 0.5) +
      geom_vline(aes(xintercept = median), color="red",
                 linetype="dashed") +
      geom_vline(aes(xintercept = xintercept), color="#00BFC4",
                 linetype="dashed") +
      facet_grid(rows = vars(comparison), cols = vars(direction_of_effect), scales = "free_x") + 
      coord_cartesian(ylim = c(0,450)) +
      theme_rhr
    
  })

ggarrange(plotlist = plot_list,
          ncol = 2, 
          labels = c("a", "b"))

Histogram of deltaPSI of introns within differentially spliced clusters identified using (a) co-variate correction and (b) cell-type correction. Dashed red line indicates median delta PSI within each facet, while dashed blue line indicates the |dPSI| >= 0.1 cut-off.

Figure 6.1: Histogram of deltaPSI of introns within differentially spliced clusters identified using (a) co-variate correction and (b) cell-type correction. Dashed red line indicates median delta PSI within each facet, while dashed blue line indicates the |dPSI| >= 0.1 cut-off.

leafcutter_list %>% 
  lapply(., function(x){
    
    x$significant_clusters_0.05_filter %>% 
      dplyr::filter(direction_of_effect != "no_change") %>% 
      dplyr::group_by(comparison, direction_of_effect) %>% 
      dplyr::summarise(median = median(deltapsi))
    
  })

## $covar
## # A tibble: 6 x 3
## # Groups:   comparison [3]
##   comparison     direction_of_effect  median
##   <chr>          <chr>                 <dbl>
## 1 Control_vs_PD  downregulated       -0.0121
## 2 Control_vs_PD  upregulated          0.0127
## 3 Control_vs_PDD downregulated       -0.0207
## 4 Control_vs_PDD upregulated          0.0159
## 5 PD_vs_PDD      downregulated       -0.0160
## 6 PD_vs_PDD      upregulated          0.0152
## 
## $ct
## # A tibble: 6 x 3
## # Groups:   comparison [3]
##   comparison     direction_of_effect  median
##   <chr>          <chr>                 <dbl>
## 1 Control_vs_PD  downregulated       -0.0174
## 2 Control_vs_PD  upregulated          0.0133
## 3 Control_vs_PDD downregulated       -0.0228
## 4 Control_vs_PDD upregulated          0.0206
## 5 PD_vs_PDD      downregulated       -0.0152
## 6 PD_vs_PDD      upregulated          0.0170

It is worth noting from this that correction with cell-type proportions does not particularly affect the median dPSI.
With this in mind, let's now apply the filter of |dPSI| >= 0.1.

# Adding filter of dPSI
stringent <- leafcutter_list %>% 
  lapply(., function (x){
    
    x$cluster_significance %>% 
      # filter for successful tests and remove clusters that overlap multiple genes
      dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
      tidyr::separate(cluster, into = c("chr", "cluster_id"), sep = ":") %>% 
      dplyr::inner_join(x$intron_usage %>% 
                          tidyr::separate(intron, into = c("chr", "start", "end", "cluster_id"), sep = ":")) %>% 
      dplyr::filter(abs(deltapsi) >= 0.1) 

    
  })

# Number of differentially spliced clusters
stringent %>% 
  lapply(., function(x){
    
    x %>% 
      dplyr::distinct(comparison, cluster_id, genes) %>% 
      dplyr::group_by(comparison) %>% 
      dplyr::summarise(n = n())
    
  })

## $covar
## # A tibble: 3 x 2
##   comparison         n
##   <chr>          <int>
## 1 Control_vs_PD     57
## 2 Control_vs_PDD   132
## 3 PD_vs_PDD         16
## 
## $ct
## # A tibble: 3 x 2
##   comparison         n
##   <chr>          <int>
## 1 Control_vs_PD     45
## 2 Control_vs_PDD   278
## 3 PD_vs_PDD        158

# Number of differentially spliced genes
stringent %>% 
  lapply(., function(x){
    
    x %>% 
      dplyr::distinct(comparison, genes) %>% 
      dplyr::group_by(comparison) %>% 
      dplyr::summarise(n = n())
    
  })

## $covar
## # A tibble: 3 x 2
##   comparison         n
##   <chr>          <int>
## 1 Control_vs_PD     57
## 2 Control_vs_PDD   131
## 3 PD_vs_PDD         16
## 
## $ct
## # A tibble: 3 x 2
##   comparison         n
##   <chr>          <int>
## 1 Control_vs_PD     45
## 2 Control_vs_PDD   273
## 3 PD_vs_PDD        157

6.2 EWCE

As performed with own analyses, will cap gene lists at 100 for EWCE.

# Load internal specificity matrices
ctd_files <- 
  list.files(
    file.path(
      path_to_results,
      "snRNA/specificity_matrices/2020_March/"
    ), 
    pattern = "ctd", 
    full.names = T
  )
    

ctd_list <- vector(mode = "list", length = length(ctd_files))

for(i in 1:length(ctd_files)){
  
  # Load ctd
  ctd_list[[i]] <- readRDS(ctd_files[i])
  
  # Extract file name
  title <- 
    ctd_files[i] %>% 
    str_replace(".*/", "") %>% 
    str_replace("\\..*", "") %>% 
    str_replace(".*_", "")
  
  # Name list
  names(ctd_list)[i] <- title
 
}

# Load external datasets
load(file.path(markergene_dir, "specificity_matrices/AIBS2018_MTG.rda"))
load(file.path(markergene_dir, "specificity_matrices/Habib2017_DroNc_Human.rda")

# ewce lists
genes_list_ewce <- 
  setNames(stringent$ct %>% 
             group_split(comparison),
           stringent$ct %>% 
             .[["comparison"]] %>% 
             unique() %>% 
             sort()) %>% 
  lapply(., function(x){
    
    x %>% 
      dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>% 
      dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>% 
      dplyr::group_by(genes) %>%
      # Where duplicate genes occur, select by lowest p-value first
      # If still duplicate, select by highest absolute delta PSI
      dplyr::top_n(n = 1, wt = -p.adjust) %>% 
      dplyr::top_n(n = 1, wt = abs_deltapsi) %>% 
      dplyr::ungroup() %>% 
      dplyr::top_n(n = 100, wt = abs_deltapsi) %>%
      .[["genes"]]
    
  })

ewce_dPSI <- MarkerGenes::run_ewce_controlled(list_of_genes = genes_list_ewce,
                                            ctd_AIBS2018, ctd_DRONC_human, 
                                            ctd_list$Control, ctd_list$PD,
                                            ctd_list$PDD, ctd_list$DLB,
                                            celltypeLevel = 1, 
                                            reps = 100000, 
                                            genelistSpecies = "human", 
                                            sctSpecies = "human")


saveRDS(
  ewce_dPSI, 
  file = 
    file.path(
      path_to_results,
      "SRP058181/leafcutter/ewce/ewce_leafcutter_ds_celltype_dPSIfilter.Rds"
    )
  )

ewce_dPSI <- 
  readRDS(
    file.path(
      path_to_results,
      "SRP058181/leafcutter/ewce/ewce_leafcutter_ds_celltype_dPSIfilter.Rds"
    )
  )

plot <- 
  ewce_dPSI %>%
  dplyr::mutate(FDR.p = p.adjust(p, method="BH"),
                Class = case_when(
                  CellType %in% c("Astrocyte", "ASC", "Astro") ~ "Astrocyte",
                  CellType %in% c("MG", "Microglia") ~ "Microglia",
                  CellType %in% c("END", "Endo", "Endothelial cell", "Vascular") ~ "Vascular",
                  CellType %in% c("ODC", "Oligo", "Oligodendrocyte") ~ "Oligodendrocyte",
                  CellType %in% c("exCA", "Excitatory", "exDG", "exPFC", "Glutamatergic") ~ "Excitatory \n neuron",
                  CellType %in% c("GABA", "GABAergic", "Inhibitory") ~ "Inhibitory \n neuron",
                  CellType == "OPC" ~ "OPC",
                  CellType == "NSC" ~ "NSC"),
                Study = str_replace(Study, "DRONC_human", "DRONC"),
                Study = str_replace(Study, "list\\$", ""),
                Study = fct_relevel(Study, 
                                    c("AIBS2018", "DRONC", "Control", "PD", "PDD", "DLB")),
                comparison_type = ifelse(str_detect(GeneSet, "Control"), "Ref: control", "Ref: disease"),
                joint_name = str_c(Study, CellType, sep = ":"),
                GeneSet = fct_relevel(GeneSet,
                                      c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")),
                sd_from_mean = ifelse(FDR.p <= 0.1, sd_from_mean, NA)) %>% 
  dplyr::arrange(Class, Study) %>% 
  dplyr::mutate_at(vars(joint_name), list(~ factor(., levels = unique(.)))) %>% 
  ggplot(aes(x = GeneSet, y = forcats::fct_rev(joint_name))) +
  geom_tile(aes(fill = sd_from_mean), colour = "black") +    
  facet_grid(cols = vars(comparison_type), rows = vars(Class), scales = "free", space = "free_y") +
  scale_fill_viridis_c(na.value = "grey") +
  labs(x = "", y = "Dataset: Cell type") +
  theme_rhr + 
  theme(panel.grid = element_blank(),
        strip.text.y = element_text(size = 8, angle = 0))

plot

EWCE results using the top 100 differentially spliced genes (FDR < 0.05, with rank determined by absolute delta PSI) post-deconvolution and single-cell RNA-seq data from two external datasets (AIBS2018 study and the DRONC_human study) and our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. Multiple test correction was performed across all results using FDR; non-significant results (FDR > 0.1) were coloured grey. exPFC=glutamatergic neurons from the PFC, exCA1/3=pyramidal neurons from the Hip CA region, GABA=GABAergic interneurons, exDG=granule neurons from the Hip dentate gyrus region, ASC=astrocytes, NSC=neuronal stem cells, MG=microglia, ODC=oligodendrocytes, OPC=oligodendrocyte precursor cells, NSC=neuronal stem cells, SMC=smooth muscle cells, END= endothelial cells.

Figure 6.2: EWCE results using the top 100 differentially spliced genes (FDR < 0.05, with rank determined by absolute delta PSI) post-deconvolution and single-cell RNA-seq data from two external datasets (AIBS2018 study and the DRONC_human study) and our own internal snRNA. The x-axis denotes the disease groups compared in the differential splicing analysis, while the y-axis denotes the cell type and from which specificity matrix it is derived. Enrichments are grouped by overall cell-type classes. Standard deviations from the mean denotes the distance (in standard deviations) of the target list from the mean of the bootstrapped samples. Multiple test correction was performed across all results using FDR; non-significant results (FDR > 0.1) were coloured grey. exPFC=glutamatergic neurons from the PFC, exCA1/3=pyramidal neurons from the Hip CA region, GABA=GABAergic interneurons, exDG=granule neurons from the Hip dentate gyrus region, ASC=astrocytes, NSC=neuronal stem cells, MG=microglia, ODC=oligodendrocytes, OPC=oligodendrocyte precursor cells, NSC=neuronal stem cells, SMC=smooth muscle cells, END= endothelial cells.

When FDR < 0.05 used:
- Control_vs_PD: no enrichment of differentially spliced genes in any cell type -- in contrast to our own data, where we see an enrichment in oligos and astrocytes. This may also be the result of a smaller gene list i.e. 45 genes vs. 100 genes in our own data.
- Control_vs_PDD: enrichment of differentially spliced genes in astrocytes/OPCs -- in own data, only seen in oligodendrocytes.
When FDR < 0.1 used (as in the figure above):
- Control_vs_PD: we now see an enrichment in oligos (but not astrocytes) in the recount data, as in own data.
- Control_vs_PDD: we now see an enrichment in oligos, as in own data.

6.3 ClusterProfiler

Also, try running with clusterProfiler, which has a number of useful functionalities:
- compareCluster: permits comparison of GO enrichments across groups. Uses hypergeometric model to perform enrichment analysis within each group (i.e. does not account for semantic similarity in testing as GProfiler does, which could be a disadvantage)
- Allows semantic trimming of terms.
- A number of useful visualisations.

cluster_compare_input <- 
  stringent$ct %>% 
  dplyr::mutate(abs_deltapsi = abs(deltapsi)) %>% 
  dplyr::distinct(comparison, genes, abs_deltapsi, p.adjust) %>% 
  dplyr::group_by(genes) %>%
  # Where duplicate genes occur, select by lowest p-value first
  # If still duplicate, select by highest absolute delta PSI
  dplyr::top_n(n = 1, wt = -p.adjust) %>% 
  dplyr::top_n(n = 1, wt = abs_deltapsi)

ont <- c("BP", "CC", "MF")
cluster_compare_list <- setNames(vector(mode = "list", length = length(ont)),
                             ont)

for(i in 1:length(ont)){
  
  cluster_compare_list[[i]] <-   
    cluster_compare_input %>% 
    clusterProfiler::compareCluster(genes ~ comparison, 
                                    data=., 
                                    keyType = "SYMBOL",
                                    fun="enrichGO", 
                                    ont = ont[i],
                                    pAdjustMethod = "BH",
                                    qvalueCutoff = 0.05,
                                    OrgDb='org.Hs.eg.db')
  
}


saveRDS(
  cluster_compare_list,
  file.path(
    path_to_results,
    "SRP058181/leafcutter/gprofiler/clusterProfiler_leafcutter_ds_celltype_dPSIfilter.Rds"
  )
)

cluster_compare <- 
    readRDS(
      file.path(
        path_to_results,
        "SRP058181/leafcutter/gprofiler/clusterProfiler_leafcutter_ds_celltype_dPSIfilter.Rds"
      )
    )

cluster_compare %>% 
  purrr::discard(is.null) %>% 
  lapply(., function(df){
    
    df %>% 
      clusterProfiler::dotplot(., showCategory = NULL, font.size = 8) +
      geom_point(shape = 1,colour = "black") +
      scale_size_continuous(name = "gene ratio") +
      scale_colour_viridis_c(direction = -1) +
      theme_rhr
    
  })

## $BP

Gene ontology enrichments of genes across each comparison. Size of dot indicates gene ratio, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

Figure 6.3: Gene ontology enrichments of genes across each comparison. Size of dot indicates gene ratio, which is the proportion of genes in the input list that are annotated to the function (defined as intersection_size/query_size). Fill of dot indicates FDR-adjusted p-value.

6.4 Gene-level overlaps with own bulk-tissue RNA-sequencing

Using stringent filter in both datasets.

leafcutter_list_own <- 
  setNames(
    list(
      readRDS(
        file.path(
          path_to_results,
          "leafcutter/diff_splicing/allcomparisons_leafcutter_ds_SexRINAoD.Rds"
        )
      ), 
      readRDS(
        file.path(
          path_to_results,
          "leafcutter/diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.Rds"
        )
      )
    ),
    c("covar", "PC")
  )


# Adding filter of dPSI
stringent_own <- leafcutter_list_own %>% 
  lapply(., function (x){
    
    x$cluster_significance %>% 
      # filter for successful tests and remove clusters that overlap multiple genes
      dplyr::filter(status == "Success", p.adjust < 0.05, !str_detect(genes, ",")) %>%
      tidyr::separate(cluster, into = c("chr", "cluster_id"), sep = ":") %>% 
      dplyr::inner_join(x$intron_usage %>% 
                          tidyr::separate(intron, into = c("chr", "start", "end", "cluster_id"), sep = ":")) %>% 
      dplyr::filter(abs(deltapsi) >= 0.1, comparison %in% c("Control_vs_PD", "Control_vs_PDD", "PD_vs_PDD")) 

    
  })

stringent$ct %>% 
  dplyr::distinct(comparison, genes) %>% 
  dplyr::inner_join(stringent_own$PC %>% 
                      dplyr::distinct(comparison, genes)) %>% 
  dplyr::arrange(comparison, genes)

stringent$ct %>% 
  dplyr::distinct(comparison, genes) %>% 
  dplyr::inner_join(stringent_own$PC %>% 
                      dplyr::distinct(comparison, genes)) %>% 
  dplyr::arrange(comparison, genes) %>% 
  dplyr::group_by(comparison) %>% 
  dplyr::summarise(n_intersect = n()) %>% 
  dplyr::inner_join(stringent$ct %>% 
                      dplyr::distinct(comparison, genes) %>% 
                      dplyr::group_by(comparison) %>% 
                      dplyr::summarise(n_SRP058181 = n())) %>% 
  dplyr::inner_join(stringent_own$PC %>% 
                      dplyr::distinct(comparison, genes) %>% 
                      dplyr::group_by(comparison) %>% 
                      dplyr::summarise(n_own = n())) %>% 
  dplyr::mutate(n_SRP058181_unique = n_SRP058181 - n_intersect,
                n_own_unique = n_own - n_intersect)

7 Session info

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MarkerGenes_0.0.0.9000      EWCE_0.99.2                
##  [3] UpSetR_1.4.0                rtracklayer_1.46.0         
##  [5] readxl_1.3.1                RNAseqProcessing_0.0.0.9000
##  [7] forcats_0.5.1               stringr_1.4.0              
##  [9] dplyr_1.0.2                 purrr_0.3.4                
## [11] readr_1.4.0                 tidyr_1.1.1                
## [13] tibble_3.0.3                tidyverse_1.3.0            
## [15] ggpubr_0.4.0                ggplot2_3.3.2              
## [17] gprofiler2_0.1.9            GeneOverlap_1.22.0         
## [19] DESeq2_1.26.0               SummarizedExperiment_1.16.1
## [21] DelayedArray_0.12.3         BiocParallel_1.20.1        
## [23] matrixStats_0.56.0          Biobase_2.46.0             
## [25] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [27] IRanges_2.20.2              S4Vectors_0.24.4           
## [29] BiocGenerics_0.32.0         data.table_1.13.0          
## [31] corrplot_0.84               clusterProfiler_3.14.3     
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.1.4               tidyselect_1.1.0         RSQLite_2.2.0           
##   [4] AnnotationDbi_1.48.0     htmlwidgets_1.5.3        grid_3.6.1              
##   [7] devtools_2.3.2           munsell_0.5.0            chron_2.3-56            
##  [10] DT_0.15                  withr_2.2.0              colorspace_2.0-0        
##  [13] GOSemSim_2.17.1          highr_0.8                knitr_1.29              
##  [16] rstudioapi_0.13          ggsignif_0.6.0           DOSE_3.12.0             
##  [19] labeling_0.4.2           urltools_1.7.3           GenomeInfoDbData_1.2.2  
##  [22] polyclip_1.10-0          bit64_4.0.2              farver_2.0.3            
##  [25] rprojroot_2.0.2          vctrs_0.3.2              generics_0.0.2          
##  [28] xfun_0.16                BiocFileCache_1.10.2     R6_2.5.0                
##  [31] graphlayouts_0.7.0       locfit_1.5-9.4           bitops_1.0-6            
##  [34] cachem_1.0.3             fgsea_1.12.0             gridGraphics_0.5-0      
##  [37] assertthat_0.2.1         scales_1.1.1             ggraph_2.0.3            
##  [40] nnet_7.3-12              enrichplot_1.6.1         gtable_0.3.0            
##  [43] processx_3.4.5           tidygraph_1.2.0          rlang_0.4.7             
##  [46] genefilter_1.68.0        splines_3.6.1            rstatix_0.6.0           
##  [49] lazyeval_0.2.2           broom_0.7.0              europepmc_0.4           
##  [52] checkmate_2.0.0          BiocManager_1.30.10      yaml_2.2.1              
##  [55] reshape2_1.4.4           abind_1.4-5              modelr_0.1.8            
##  [58] crosstalk_1.1.0.1        backports_1.1.8          qvalue_2.18.0           
##  [61] Hmisc_4.4-1              usethis_1.6.3            tools_3.6.1             
##  [64] bookdown_0.21            qdapTools_1.3.5          ggplotify_0.0.5         
##  [67] ellipsis_0.3.1           gplots_3.0.4             RColorBrewer_1.1-2      
##  [70] ggdendro_0.1.21          sessioninfo_1.1.1        ggridges_0.5.2          
##  [73] Rcpp_1.0.5               plyr_1.8.6               base64enc_0.1-3         
##  [76] progress_1.2.2           zlibbioc_1.32.0          RCurl_1.98-1.2          
##  [79] ps_1.3.4                 prettyunits_1.1.1        openssl_1.4.2           
##  [82] rpart_4.1-15             viridis_0.5.1            cowplot_1.0.0           
##  [85] haven_2.3.1              ggrepel_0.8.2            cluster_2.1.0           
##  [88] here_1.0.0               fs_1.5.0                 magrittr_2.0.1          
##  [91] DO.db_2.9                openxlsx_4.2.3           triebeard_0.3.0         
##  [94] reprex_1.0.0             pkgload_1.1.0            hms_1.0.0               
##  [97] evaluate_0.14            xtable_1.8-4             XML_3.99-0.3            
## [100] rio_0.5.16               jpeg_0.1-8.1             gridExtra_2.3           
## [103] biomaRt_2.42.1           testthat_2.3.2           compiler_3.6.1          
## [106] KernSmooth_2.23-15       crayon_1.4.1             htmltools_0.5.1.1       
## [109] Formula_1.2-4            geneplotter_1.64.0       lubridate_1.7.9         
## [112] DBI_1.1.1                tweenr_1.0.1             dbplyr_1.4.4            
## [115] rappdirs_0.3.1           MASS_7.3-51.4            Matrix_1.2-17           
## [118] car_3.0-9                cli_2.2.0.9000           gdata_2.18.0            
## [121] igraph_1.2.5             pkgconfig_2.0.3          rvcheck_0.1.8           
## [124] GenomicAlignments_1.22.1 foreign_0.8-72           plotly_4.9.2.1          
## [127] xml2_1.3.2               annotate_1.64.0          XVector_0.26.0          
## [130] rvest_0.3.6              callr_3.5.1              digest_0.6.27           
## [133] Biostrings_2.54.0        HGNChelper_0.8.1         rmarkdown_2.5           
## [136] cellranger_1.1.0         fastmatch_1.1-0          htmlTable_2.0.1         
## [139] curl_4.3                 Rsamtools_2.2.3          gtools_3.8.2            
## [142] lifecycle_0.2.0          jsonlite_1.7.1           carData_3.0-4           
## [145] fansi_0.4.2              limma_3.42.2             askpass_1.1             
## [148] desc_1.2.0               viridisLite_0.3.0        pillar_1.4.6            
## [151] lattice_0.20-38          pkgbuild_1.1.0           fastmap_1.0.1           
## [154] httr_1.4.2               survival_3.2-7           GO.db_3.10.0            
## [157] remotes_2.2.0            glue_1.4.2               zip_2.1.0               
## [160] png_0.1-7                bit_4.0.4                ggforce_0.3.2           
## [163] stringi_1.5.3            blob_1.2.1               latticeExtra_0.6-29     
## [166] caTools_1.18.0           memoise_2.0.0

SRP058181: LeafCutter

Regina H. Reynolds