library(cowplot) # For alignment of plots using axes
library(clusterProfiler) # For pathway analyses with bulk DEG/DS
library(ComplexHeatmap) # For summary heatmap
library(DESeq2) # For working with bulk DE results
library(DT) # R interface to the JavaScript library DataTables
library(factoextra) # For extracting PCA variables
library(forcats) # For factor reordering
library(ggpubr) # For arranging plots in one panel
library(ggmosaic) # For contingency plot
library(ggh4x) # For customised ggplot2 facets
library(janitor) # To clean column names
library(readxl) # For loading excel workbooks
library(rutils) # For go_reduce()
library(tidyverse) # For data manipulation
library(stringr) # For string manipulation
library(qdapTools) # For list manipulation

source(here::here("docs", "figures", "colour_schemes_and_themes.R"))
source(here::here("docs", "figures", "plotting_functions.R"))

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

1 Dependencies

# Dataframes and lists
ct_class # Useful dt for ordering of cell types

colour_schemes_list

## $cell_class
##          group colour_hex
## 1    Astrocyte  #3C5488FF
## 2       Immune  #4DBBD5FF
## 3        Oligo  #E64B35FF
## 4     Vascular  #8491B4FF
## 5       Neuron  #00A087FF
## 6 Unclassified  #B09C85FF
## 
## $cell_type
##               group colour_hex
## 1         Astrocyte  #3C5488FF
## 2         Microglia  #4DBBD5FF
## 3   Oligodendrocyte  #E64B35FF
## 4               OPC  #F39B7FFF
## 5          Vascular  #8491B4FF
## 6 Excitatory neuron  #00A087FF
## 7 Inhibitory neuron  #91D1C2FF
## 8      Unclassified  #B09C85FF
## 
## $disease_greens
##     group colour_hex
## 1 Control    #868686
## 2      PD    #c4ebe3
## 3     PDD    #73cab9
## 4     DLB    #19967d
## 
## $disease_discrete
##     group colour_hex
## 1 Control  #868686E5
## 2      PD  #A73030E5
## 3     PDD  #0073C2E5
## 4     DLB  #EFC000E5

# Palette ordered by cell type
palette <-
  ct_class %>%
  dplyr::arrange(class) %>%
  dplyr::inner_join(colour_schemes_list$cell_type, by = c("class" = "group")) %>%
  .[["colour_hex"]] %>%
  as.character()

palette

## [1] "#00A087FF" "#91D1C2FF" "#3C5488FF" "#E64B35FF" "#F39B7FFF" "#4DBBD5FF"
## [7] "#8491B4FF"

# Disease re-factoring
fct_disease <- c("Control vs PD", "Control vs PDD", "Control vs DLB", "PD vs PDD", "PD vs DLB", "PDD vs DLB")

# File paths
source(here::here("R", "file_paths.R"))

path_for_figures <- file.path(path_to_wd, "paper_draft/figures")
path_for_tables <- file.path(path_to_wd, "paper_draft/tables")

1.1 Functions

Plotting functions sourced from plotting_functions.R

1.2 Single-nucleus data from seurat objects

Data for single-nucleus marker gene expression and SNCA expression were generated by sourcing the following script snRNAseq_get_marker_and_pd_gene_expr.R

source(here::here("misc_scripts", "snRNAseq_get_marker_and_pd_gene_expr.R"))

2 Sample demographics

2.1 Loading data

# Load sample info, filter for sequenced samples and select necessary columns
sample_info <-
  read_excel(
    path = file.path(path_to_raw_data, "sample_details/20201229_MasterFile_SampleInfo.xlsx"),
    sheet = "SampleInfo",
    skip = 1
  )

# Load LP consensus staging
lp_consensus <-
  read_excel(
    path = file.path(path_to_raw_data, "sample_details/20210613_acta_case_regrading.xlsx"), 
    na =  c("n/a")
  ) %>% 
  janitor::clean_names() %>% 
  dplyr::select(-x2) %>% 
  dplyr::rename_with(.fn = ~str_c("lpc_", .x)) %>% 
  dplyr::filter(!is.na(lpc_case_no))

sample_info <-
  sample_info %>%
  dplyr::filter(sent_to_bulk_seq == "yes" | CaseNo %in% c("C036", "C048", "PD060", "PD501")) %>%
  dplyr::filter(Sample_Type %in% c("Tissue section", NA)) %>%
  dplyr::left_join(
    lp_consensus,
    by = c("CaseNo" = "lpc_case_no")
  ) %>% 
  dplyr::select(
    sample_id = CaseNo, 
    Disease_Group, Sex, AoO, AoD, DD, PMI, aSN, TAU, 
    `thal AB`, `aCG aSN score`, starts_with("lpc"), CSF_pH, `brain_weight_(g)`, 
    RIN = RINe_bulkRNA_Tapestation, 
    ng_per_ul = `ng/ul_bulkRNA_Tapestation`, 
    ng = ng_bulkRNA_Tapestation, 
    sent_to_bulk_seq) %>%
  dplyr::mutate(
    Disease_Group = fct_relevel(
      Disease_Group,
      c("Control", "PD", "PDD", "DLB")
    ),
    sent_to_bulk_seq = case_when(
      is.na(sent_to_bulk_seq) ~ "no",
      TRUE ~ sent_to_bulk_seq
    )
  ) 

sample_info %>%
  DT::datatable(
    rownames = FALSE,
    options = list(scrollX = TRUE),
    class = "white-space: nowrap"
  )

2.2 Numbers quoted in text

# Sex count
sample_info %>% 
  dplyr::group_by(Disease_Group, Sex) %>% 
  dplyr::count()

# Median disease duration
sample_info %>% 
  dplyr::group_by(Disease_Group) %>% 
  dplyr::summarise(
    median_disease_duration = median(DD)
  )

2.3 Supplementary figure(s)

supp_fig <-
  sample_info %>%
  plot_sample_info()

supp_fig

2.4 Export files

ggsave(
  "suppfigure_sample_info.png",
  supp_fig,
  device = "png",
  path = path_for_figures,
  width = 175,
  height = 210,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

3 Single-nucleus RNA-sequencing metrics

3.1 Loading data

snrna_qc <-
  readxl::read_xlsx(
    path = file.path(path_to_results, "snRNA/", 
                 "sequencing_qc/", "PD_snRNA-Seq_Master_Table_.xlsx"),
    skip = 1
  ) %>% 
  dplyr::rename_with(
    ~ str_replace(.x, "Cells", "Nuclei") %>% 
      str_replace("Cell", "Nucleus"),
    .cols = everything()
  ) %>% 
  dplyr::rename(
    sample_id = SampleID,
    disease_group = Condition
  ) %>% 
  dplyr::mutate(
    disease_group = 
      fct_relevel(disease_group,
                  "Control", "PD", "PDD", "DLB")
  )

marker_genes_data <- 
  readRDS(
    file = file.path(path_to_results, "snRNA/seurat_objects/norm_expr_marker_genes.rds")
  )

3.2 Supplementary figure(s)

fig_list <- vector(mode = "list", length = 2)

fig_list[[1]] <- 
  snrna_qc %>% 
  plot_snrna_metrics()

fig_list[[2]] <-
  marker_genes_data$expression_data %>% 
  dplyr::mutate(
    cell_type = as.character(cell_type)
  ) %>% 
  plot_snrna_marker_genes(
    .,
    cols = marker_genes_data$marker_genes$gene,
    marker_genes_df = marker_genes_data$marker_genes,
    ct_class = ct_class,
    palette = palette
  )

supp_fig <- 
  ggarrange(
  ggarrange(fig_list[[1]],
            geom_blank +
              theme_minimal(),
            ncol = 1,
            heights = c(4,1),
            labels = letters[1:2]),
  fig_list[[2]], 
  widths = c(2.5,1),
  labels = c("", "c")
  )

supp_fig

3.3 Export files

ggsave(
  "suppfigure_snrna_seq_metrics.png",
  supp_fig,
  device = "png",
  path = path_for_figures,
  width = 190,
  height = 210,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

4 Bulk RNA-sequencing metrics

4.1 Loading data

# Load multiqc data and process
multiqc <- process_multiqc_metrics(filepath_to_multiqc = "/data/RNAseq_PD/tissue_polyA_samples/QC/multiqc/PD_tissue_polyA_full_report_data/")

# DESeq2 objects
dds <- readRDS(file.path(path_to_results, "Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD.Rds"))
filter_dds <- readRDS(file.path(path_to_results, "Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_filtered_design_PCaxes_DEG.Rds"))

# PCA of vst-transformed counts
filter_vsd <- DESeq2::vst(filter_dds)
pca_vsd <- prcomp(assay(filter_vsd) %>% t())

# Get number of genes detected pre and post filtering
genes_detected <-
  dds %>%
  assay() %>% # Access count data
  as_tibble() %>%
  dplyr::mutate(gene_id = rownames(assay(dds))) %>%
  tidyr::gather(key = sample_id, value = counts, -gene_id) %>%
  dplyr::filter(counts > 0) %>%
  dplyr::distinct(sample_id, gene_id) %>%
  dplyr::group_by(sample_id) %>%
  dplyr::summarise(n_genes_detected = n()) %>%
  dplyr::inner_join(
    filter_dds %>%
      assay() %>% # Access count data
      as_tibble() %>%
      dplyr::mutate(gene_id = rownames(assay(filter_dds))) %>%
      tidyr::gather(key = sample_id, value = counts, -gene_id) %>%
      dplyr::filter(counts > 0) %>%
      dplyr::distinct(sample_id, gene_id) %>%
      dplyr::group_by(sample_id) %>%
      dplyr::summarise(n_genes_detected_after_filtering = n())
  )

genes_detected

4.2 Numbers quoted in text

# Number of genes in filtered DESeq2 object
length(filter_dds)

## [1] 28692

# Average number of genes post-filtering across all samples
mean(genes_detected$n_genes_detected_after_filtering) %>%
  round(., digits = 1)

## [1] 27802

# Total variance account for by PC1-4
tibble(
  PC = c(1:length(pca_vsd$sdev)),
  sd = pca_vsd$sdev,
  var = pca_vsd$sdev^2,
  percent_var = var / sum(var) * 100
) %>%
  dplyr::filter(PC %in% c(1:4)) %>%
  dplyr::summarise(sum_var = sum(percent_var))

4.3 Supplementary figure(s)

a <- plot_multiqc_metrics(multiqc_processed = multiqc, sample_info = sample_info)

b <- plot_genes_detected(genes_detected = genes_detected, sample_info = sample_info)

supp_fig <-
  ggarrange(a,
    ggarrange(b[[1]], b[[2]],
      widths = c(1, 1.5),
      labels = c("b", "c"),
      align = "hv"
    ),
    nrow = 2,
    heights = c(2.5, 1),
    labels = c("a", "")
  )

supp_fig

4.4 Supplementary table(s)

# Sample info, bulk-RNAseq metrics, and PC axes 1-4
supp_table <-
  sample_info %>%
  dplyr::rename(disease_group = Disease_Group) %>%
  dplyr::left_join(
    multiqc %>%
      dplyr::mutate(tool_metric = str_c(tool, ":", metric)) %>%
      dplyr::select(sample_id, tool_metric, value) %>%
      tidyr::spread(key = "tool_metric", value = "value") %>%
      # Inner join bulk-RNAseq metrics
      dplyr::inner_join(
        genes_detected %>%
          dplyr::select(sample_id, contains("genes"))
      ) %>%
      # Inner join PC axes
      dplyr::inner_join(
        colData(filter_dds) %>%
          as.data.frame() %>%
          dplyr::select(sample_id, PC1:PC4)
      )
  ) %>%
  dplyr::arrange(disease_group, sample_id)

4.5 Export files

ggsave(
  "suppfigure_bulk_rnaseq_metrics.png",
  supp_fig,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 200,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

write_csv(
  supp_table,
  file = file.path(path_for_tables, "/", "sample_info_bulk_seq_metrics.csv"),
  na = ""
)

5 Conos and deconvolution

5.1 Loading data

# Load conos umap embedding
# Attach cell type labels
# Ensure cell type labelling consistent with ct_class
conos <- 
  readRDS(file.path(path_to_results, 
                "snRNA/conos_embedding/Conembedding.rds")) %>% 
  as_tibble(rownames = "cell_id") %>% 
  dplyr::rename(UMAP1 = V1,
                UMAP2 = V2) %>% 
  dplyr::inner_join(
    read_csv(file.path(path_to_results, 
                   "snRNA/conos_embedding/All.Samples.CellType.Labels.csv")) %>% 
      dplyr::rename(cell_id = X1,
                    CellType = CellTypes) %>% 
      dplyr::select(-Colours) %>% 
      dplyr::mutate(
        CellType = 
          case_when(CellType == "Astrocyte" ~ "Astro",
                    CellType == "Oligodendrocyte" ~ "Oligo",
                    CellType == "Excitatory.neuron" ~ "Excitatory",
                    CellType == "Inhibitory.neuron" ~ "Inhibitory",
                    TRUE ~ CellType)
      ),
    by = c("cell_id"))
  
# Load snRNA ct proportions
snrna_proportions <-
  read_delim(
    file.path(path_to_results, "deconvolution/scaden/2020Feb/snRNA_proportions.txt"),
    delim = "\t"
  ) %>%
  tidyr::gather(key = "Celltype", 
                value = "cell_type_proportion", 
                -sample_id)

# Bulk deconvolution data
bulk <- readRDS(file.path(path_to_results, "deconvolution/scaden/2020Feb/scaden_summary_results.Rds"))

# Calculate cell-type proportions relative to control
bulk_rel_ctrl <-
  setNames(
    object = bulk$ranks %>%
      dplyr::ungroup() %>%
      dplyr::group_split(source),
    nm = c(bulk$ranks %>%
      dplyr::ungroup() %>%
      .[["source"]] %>%
      unique() %>%
      sort())
  ) %>%
  lapply(., function(df) {
    df %>%
      dplyr::select(-rank_within_sample, -cells, -samples, -source) %>%
      tidyr::spread(key = Disease_Group, value = cell_type_proportion) %>%
      dplyr::group_by(Celltype) %>%
      dplyr::mutate(median_control = median(Control, na.rm = T)) %>%
      dplyr::ungroup() %>%
      # Calculate fold change from reference, controls, by dividing each column by median_control
      dplyr::mutate_at(
        .vars = vars("Control", "PD", "PDD", "DLB"), 
        .funs = function(x) {
          x / .[["median_control"]]
        }) %>%
      dplyr::select(-median_control) %>%
      tidyr::gather(
        key = "Disease_Group",
        value = "cell_type_proportion_relative_to_control",
        -sample_id, -Celltype, -Celltype_class
      ) %>%
      dplyr::filter(Celltype != "Unclassified") %>%
      dplyr::inner_join(ct_class, by = c("Celltype" = "CellType")) %>%
      dplyr::mutate(
        Disease_Group = fct_relevel(
          Disease_Group,
          c("Control", "PD", "PDD", "DLB")
        ))
  })

# Run wilcoxon test for bulk data
# Only comparisons to control
bulk_wilcox <-
  bulk$ranks %>%
  dplyr::filter(source == "Scaden deconvolution", cells == 1000, samples == 1000) %>%
  dplyr::group_by(Celltype) %>%
  dplyr::do(
    pairwise.wilcox.test(
      x = .$cell_type_proportion, 
      g = .$Disease_Group, 
      p.adjust.method = "none", 
      paired = FALSE) %>%
    broom::tidy()
    ) %>%
  dplyr::filter(str_detect(group2, "Control")) %>%
  dplyr::mutate(
    fdr = p.adjust(p.value, method = "BH"),
    comparison = str_c(group2, " vs ", group1) %>%
      fct_relevel(
        .,
        "Control vs PD", "Control vs PDD", "Control vs DLB"
      )
  ) %>%
  dplyr::inner_join(ct_class, by = c("Celltype" = "CellType"))

bulk_wilcox %>%
  dplyr::select(Celltype:fdr) %>%
  dplyr::filter(fdr < 0.1)

# SRP058181 deconvolution data
SRP <- readRDS(file.path(path_to_results, "deconvolution/scaden/SRP058181/scaden_summary_results.Rds"))

# Join SRP058181 and bulk data
SRP_bulk <-
  SRP$ranks %>%
  dplyr::select(sample_id, Celltype, cell_type_proportion, Celltype_class, Disease_Group) %>%
  dplyr::mutate(dataset = "SRP058181") %>%
  dplyr::bind_rows(
    bulk$ranks %>%
      dplyr::filter(
        source == "Scaden deconvolution",
        cells == 1000, samples == 1000,
        Disease_Group != "DLB"
      ) %>%
      dplyr::select(
        sample_id, Celltype, cell_type_proportion,
        Celltype_class, Disease_Group
      ) %>%
      dplyr::mutate(
        dataset = "In-house dataset",
        Disease_Group = factor(Disease_Group, ordered = F)
      )
  ) %>%
  dplyr::inner_join(ct_class,
    by = c("Celltype" = "CellType")
  ) %>%
  dplyr::mutate(Disease_Group = fct_relevel(
    Disease_Group,
    c("Control", "PD", "PDD")
  ))

# Run wilcoxon test for SRP058181
SRP_wilcox <-
  SRP_bulk %>%
  dplyr::filter(str_detect(dataset, "SRP")) %>%
  dplyr::group_by(Celltype) %>%
  dplyr::do(
    pairwise.wilcox.test(
      x = .$cell_type_proportion, 
      g = .$Disease_Group, 
      p.adjust.method = "none", 
      paired = FALSE) %>%
    broom::tidy()
    ) %>%
  dplyr::filter(str_detect(group2, "Control")) %>%
  dplyr::mutate(
    fdr = p.adjust(p.value, method = "BH"),
    comparison = str_c(group2, " vs ", group1) %>%
      fct_relevel(
        .,
        "Control vs PD", "Control vs PDD"
      ),
    dataset = "SRP058181"
  ) %>%
  dplyr::inner_join(ct_class, by = c("Celltype" = "CellType")) %>%
  dplyr::bind_rows(
    bulk_wilcox %>%
      dplyr::filter(!str_detect(comparison, "DLB")) %>%
      dplyr::mutate(dataset = "Bulk-tissue RNA-seq")
  )

SRP_wilcox %>%
  dplyr::select(dataset, Celltype:fdr) %>%
  dplyr::filter(fdr < 0.1)

5.2 Numbers quoted in text

# Overall correlation of snRNA proportions with scaden deconvolution
bulk$ranks %>%
  dplyr::select(sample_id, Celltype, cell_type_proportion, source) %>%
  tidyr::spread(key = source, value = cell_type_proportion) %>%
  dplyr::ungroup() %>%
  dplyr::summarise(
    estimate = cor.test(`Scaden deconvolution`, `snRNA-seq`, method = "spearman")$estimate,
    p.value = cor.test(`Scaden deconvolution`, `snRNA-seq`, method = "spearman")$p.value
  )

5.3 Main text figure(s)

fig_list <- vector(mode = "list", length = 3)

fig_list[[1]] <- 
  conos %>% 
  dplyr::inner_join(ct_class) %>% 
  ggplot(
    aes(
      x = UMAP1,
      y = UMAP2,
      colour = class
    )
  ) +
  geom_point(size = 0.1, alpha = 0.05) +
  scale_colour_manual(values = palette) +
  guides(colour = 
           guide_legend(
             title = "Cell type",
             override.aes = list(size = 1,
                                 alpha = 1))
         ) +
  theme_rhr + 
  theme(
    legend.position = "right", 
    legend.key.size = unit(0.35, "cm")
  )

fig_list[[2]] <-
  bulk_rel_ctrl$`Scaden deconvolution` %>%
  dplyr::rename(cell_type_proportion = cell_type_proportion_relative_to_control) %>%
  plot_cell_prop(
    y_lab = "Cell-type proportion relative to controls",
    y_lim = c(0, 8),
    fill_var = c("Disease_Group"),
    fill_var_title = "Disease group",
    fill_palette = as.character(colour_schemes_list$disease_greens$colour_hex)
  )

fig_list[[3]] <-
  bulk_wilcox %>%
  test_cell_prop()

fig <-
    cowplot::plot_grid(
      plotlist = fig_list,
      ncol = 1,
      align = "v", axis = "lr",
      rel_heights = c(0.6, 1, 0.65),
      labels = c("a", "b", "")
    )
  

fig

5.4 Supplementary figure(s)

scatterplot <-
  bulk$ranks %>%
  plot_cell_prop_scatterplot(., ct_class = ct_class, ncol = 4)

cell_prop_plots <-
  snrna_proportions %>%
  plot_snrna_cell_prop(., sample_info = sample_info, ct_class = ct_class, palette = palette)

supp_fig_1 <-
  ggarrange(
    cowplot::plot_grid(
      plotlist = cell_prop_plots,
      labels = c("a", "b"),
      align = "h", axis = "bt",
      rel_widths = c(1, 1.5)
    ),
    scatterplot,
    nrow = 2,
    labels = c("", "c"),
    heights = c(1, 1.5)
    )

supp_fig_1

supp_list <- vector(mode = "list", length = 2)

supp_list[[1]] <-
  SRP_bulk %>%
  plot_cell_prop(
    y_lab = "Cell-type proportion (%)",
    y_lim = c(0, 100),
    fill_var = c("Disease_Group"),
    fill_var_title = "Disease group",
    fill_palette = as.character(colour_schemes_list$disease_greens$colour_hex)
  ) +
  facet_grid(cols = vars(dataset))

supp_list[[2]] <-
  SRP_wilcox %>%
  test_cell_prop() +
  facet_grid(cols = vars(dataset)) +
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )

supp_fig_2 <-
  cowplot::plot_grid(
    plotlist = supp_list,
    ncol = 1,
    align = "v", axis = "lr",
    rel_heights = c(1, 0.5)
  )

supp_fig_2

5.5 Supplementary table(s)

supp_table_1 <- 
  setNames(
    vector(mode = "list", length = 3),
    c("Column_descriptions", "Cell_type_proportions", "Wilcoxon_test")
  )

supp_table_1[[2]] <-
    bulk$ranks %>%
  dplyr::filter(source == "Scaden deconvolution") %>%
  dplyr::mutate(
    dataset = "Own",
    Disease_Group = as.character(Disease_Group)
  ) %>%
  dplyr::select(dataset, sample_id, disease_group = Disease_Group, source, Celltype, cell_type_proportion) %>%
  dplyr::bind_rows(
    snrna_proportions %>%
      dplyr::inner_join(
        sample_info %>%
          dplyr::select(sample_id, disease_group = Disease_Group)
      ) %>%
      dplyr::mutate(
        dataset = "Own",
        source = "snRNA-seq"
      )
  ) %>%
  dplyr::bind_rows(
    SRP$ranks %>%
      dplyr::mutate(dataset = "recount2: SRP058181") %>%
      dplyr::select(dataset, sample_id, disease_group = Disease_Group, source, Celltype, cell_type_proportion)
  ) %>%
  tidyr::spread(key = "Celltype", "cell_type_proportion") %>%
  dplyr::arrange(dataset, source, disease_group)

supp_table_1[[3]] <-
  bulk_wilcox %>%
  dplyr::mutate(dataset = "Own") %>%
  dplyr::bind_rows(
    SRP_wilcox %>%
      dplyr::mutate(dataset = "recount2: SRP058181")
  ) %>%
  dplyr::select(dataset, comparison, cell_type = Celltype, p.value, fdr)


supp_table_1[[1]] <-
  tibble(
    Sheet = c(
      rep(names(supp_table_1)[2], length(supp_table_1[[2]])),
      rep(names(supp_table_1)[3], length(supp_table_1[[3]]))
      ),
    `Column name` = c(
      colnames(supp_table_1[[2]]),
        colnames(supp_table_1[[3]])
        ),
    Description = c(
      "Dataset from which proportions are derived from i.e. our own data or from the replication dataset from recount2",
      "Sample ID",
      "Disease state of individual from which sample derived",
      "Whether proportion is derived from single-nucleus labelling or Scaden deconvolution predictions",
      "Proportion of astrocytes within a sample (%)",
      "Proportion of excitatory neurons within a sample (%)",
      "Proportion of inhibitory neurons within a sample (%)",
      "Proportion of microglia within a sample (%)",
      "Proportion of oligodendrocytes within a sample (%)",
      "Proportion of OPCs within a sample (%)",
      "Proportion of vascular cells within a sample (%)",
      "Dataset from which proportions are derived from i.e. our own data or from the replication dataset from recount2",
      "Pairwise comparison",
      "Cell type",
      "P-value of Wilcoxon rank sum test",
      "FDR adjustment for multiple tests (n tests = 3 comparisons per cell type and dataset)"
    )
  )

5.6 Export files

ggsave(
  "figure_cellular_diversity.tiff",
  fig,
  device = "tiff",
  path = path_for_figures,
  width = 130,
  height = 160,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_cellular_diversity_correlations.png",
  supp_fig_1,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 180,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_cellular_diversity_validation.png",
  supp_fig_2,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 200,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

openxlsx::write.xlsx(supp_table_1,
  file = file.path(path_for_tables, "Supplementary_Table_2_Cell_type_proportions.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

6 Single-nucleus DEG and pathways (general)

6.1 Loading DEGs

# File paths
DEG_files <-
  list.files(
    file.path(path_to_results, "snRNA/differential_gene_expr/2021_Mar"),
    pattern = "_Allfiles2",
    full.names = T
  )

# Thresholds
fc_threshold <- log2(1.5)
fdr_threshold <- 0.05

# Load across .Rds
DEG_list <-
  setNames(
    object = DEG_files %>%
      lapply(., function(file) {
        file %>%
          readRDS() %>%
          lapply(., function(df) {
            df %>%
              dplyr::rename(gene = primerid)
          })
      }),
    nm = DEG_files %>%
      str_replace(".*/", "") %>%
      str_replace("_Allfiles2.*", "")
  )

# Create dataframe of DEGs
DEG_df <-
  DEG_list %>%
  lapply(., function(DEG) {
    DEG %>%
      qdapTools::list_df2df(col1 = "cell_type")
  }) %>%
  qdapTools::list_df2df(col1 = "comparison") %>%
  # Change naming convention to reflect our convention
  tidyr::separate(comparison, into = c("comparison_2", "comparison_1"), sep = "_") %>%
  dplyr::mutate(
    comparison_1 = str_replace(comparison_1, pattern = "C", "Control"),
    comparison = str_c(comparison_1, "_vs_", comparison_2),
    # Add direction of effect
    direction_of_effect = ifelse(coef > 0, "up", "down"),
    cell_type = case_when(
      cell_type == "Astrocytes" ~ "Astro",
      cell_type == "Oligodendrocytes" ~ "Oligo",
      TRUE ~ cell_type
    )
  ) %>%
  dplyr::select(comparison, everything(), -comparison_1, -comparison_2) %>%
  dplyr::filter(fdr < fdr_threshold, abs(coef) > fc_threshold) %>%
  as_tibble() %>%
  dplyr::inner_join(ct_class, by = c("cell_type" = "CellType"))

# Dataframe filtered significant DEGs
DEG_df_signif <- 
  DEG_df %>% 
  dplyr::filter(fdr < fdr_threshold, abs(coef) > fc_threshold)

6.2 Loading pathways

# File paths
pathway_files <-
  list.files(
    file.path(path_to_results, "snRNA/pathways/webgestalt/"),
    pattern = "enrichment_results",
    recursive = T,
    full.names = T
  )

# Filter for up/down-regulated gene sets
pathway_files <- pathway_files[str_detect(pathway_files, str_c("DownRegulated", "UpRegulated", sep = "|"))]

# Load across multiple files
pathway_list <-
  setNames(
    object = pathway_files %>%
      lapply(., function(file) {
        file %>%
          read_delim(delim = "\t")
      }),
    nm = pathway_files %>%
      str_remove(".*/") %>%
      str_remove("enrichment_results_") %>%
      str_remove(".txt")
  )

# Create dataframe of pathways
snrna_pathway_df <-
  pathway_list %>%
  qdapTools::list_df2df(col1 = "list_name") %>%
  as_tibble() %>%
  # Tidy tibble
  tidyr::separate(
    col = list_name, 
    into = c("go_type", "cell_type", "comparison_2", "comparison_1", "direction_of_effect")
    ) %>%
  dplyr::rename(
    go_id = geneSet
  ) %>% 
  dplyr::mutate(
    comparison_1 = case_when(
      comparison_1 == "C" ~ "Control",
      TRUE ~ comparison_1
    ),
    comparison = str_c(comparison_1, "_vs_", comparison_2),
    direction_of_effect = case_when(
      direction_of_effect == "UpRegulated" ~ "up",
      direction_of_effect == "DownRegulated" ~ "down",
      TRUE ~ direction_of_effect
    ),
    cell_type = case_when(
      cell_type == "Astrocytes" ~ "Astro",
      cell_type == "Oligodendrocytes" ~ "Oligo",
      TRUE ~ cell_type
    ),
    description = description %>% str_to_sentence()
  ) %>%
  dplyr::inner_join(ct_class, by = c("cell_type" = "CellType")) %>%
  dplyr::select(comparison, cell_type, class, 
                direction_of_effect, everything(), 
                -comparison_1, -comparison_2, -overlapId)  %>%
  # Remove pathways with size <= 20 or >= 2000
  dplyr::filter(size >= 20, size <= 2000) %>% 
  # Remove pathways found enriched in up- and down-regulated gene sets from same cell type
  dplyr::group_by(comparison, class, go_id) %>%
  dplyr::filter(n() < 2) %>%
  dplyr::ungroup()

# Reduce GO terms by calculating semantic similarity ("Wang"), which is used to create a distance matrix, defined as (1-simMatrix).
# The terms are then hierarchically clustered using complete linkage, and the tree is cut at the desired threshold
# Term with the highest score is used to represent each group.
pathway_GO_sim_df <-
  rutils::go_reduce(
    pathway_df =
      snrna_pathway_df,
    threshold = 0.9
  )

## [1] "Reducing sub-ontology: BP"
## [1] "Reducing sub-ontology: CC"
## [1] "Reducing sub-ontology: MF"

6.3 Numbers quoted in text

# Total number of DEG
DEG_df_signif %>% 
  nrow()

## [1] 30135

# Total number of unique DEG
DEG_df_signif %>%
  .[["gene"]] %>%
  unique() %>%
  length()

## [1] 9242

# Total number of unique DEG per comparison
DEG_df_signif %>% 
  dplyr::distinct(comparison, gene) %>% 
  dplyr::count(comparison, name = "n_unique_genes")

# Ratio of n genes in each cell type compared to inhibitory neurons
DEG_df_signif %>% 
  dplyr::distinct(comparison, cell_type, gene) %>% 
  dplyr::count(comparison, cell_type, name = "n_genes") %>% 
  dplyr::inner_join(
    DEG_df_signif %>% 
      dplyr::filter(cell_type == "Inhibitory") %>% 
      dplyr::distinct(comparison, gene) %>% 
      dplyr::count(comparison, name = "n_genes_inhib")
  ) %>% 
  dplyr::mutate(
    n_genes_divided_by_n_genes_inhib = n_genes/n_genes_inhib
  ) %>% 
  dplyr::arrange(cell_type, -n_genes_divided_by_n_genes_inhib)

# For each comparison & direction of effect, the number of DEGs detected in <= 1 cell type as % of the total number of unique DEGs detected in that comparison & direction of effect
DEG_df_signif %>% 
  dplyr::distinct(comparison, cell_type, gene, direction_of_effect) %>% 
  dplyr::group_by(comparison, direction_of_effect, gene) %>% 
  dplyr::filter(n() <= 1) %>% 
  dplyr::ungroup() %>% 
  dplyr::count(comparison, direction_of_effect, name = "n_genes_unique") %>% 
  dplyr::inner_join(
    DEG_df_signif %>% 
      dplyr::distinct(comparison, gene, direction_of_effect) %>% 
      dplyr::count(comparison, direction_of_effect, name = "n_genes")
  ) %>% 
  dplyr::mutate(percent_unique = (n_genes_unique/n_genes) * 100) %>% 
  dplyr::arrange(direction_of_effect, -percent_unique)

# Number of unique enriched pathways in down-reg and up-reg DE gene sets (across case-control or between-disease comparisons)
snrna_pathway_df %>% 
  dplyr::mutate(
    comparison_type =
      case_when(str_detect(comparison, "Control") ~ "ref_control",
                TRUE ~ "ref_disease")
  ) %>% 
  dplyr::distinct(comparison_type, direction_of_effect, go_id) %>%
  dplyr::count(comparison_type, direction_of_effect, name = "n_pathways")

# Number of unique reduced pathways in down-reg and up-reg DE gene sets (across case-control or between-disease comparisons)
pathway_GO_sim_df %>% 
  dplyr::mutate(
    comparison_type =
      case_when(str_detect(comparison, "Control") ~ "ref_control",
                TRUE ~ "ref_disease")
  ) %>% 
  dplyr::distinct(comparison_type, direction_of_effect, parent_id) %>%
  dplyr::count(comparison_type, direction_of_effect, name = "n_pathways")

# In what cell types in all pairwise comparisons are there more up-reg than down-reg DEGs?
DEG_df_signif %>% 
  dplyr::distinct(comparison, cell_type, gene, direction_of_effect) %>% 
  dplyr::count(comparison, cell_type, direction_of_effect, name = "n_genes") %>% 
  dplyr::group_by(comparison, cell_type) %>% 
  dplyr::mutate(rank = rank(-n_genes)) %>% 
  dplyr::filter(
    direction_of_effect == "up" & rank == 1
    )

6.4 Main text figure(s)

# Get gene ids for upregulated genes
DEG_ids_up <-
  generate_DEG_dummy_df(
    DEG_df = DEG_df,
    fdr_threshold = fdr_threshold,
    fc_threshold = fc_threshold,
    DEG_direction = "up"
  )

# Get gene ids for downregulated genes
DEG_ids_down <-
  generate_DEG_dummy_df(
    DEG_df = DEG_df,
    fdr_threshold = fdr_threshold,
    fc_threshold = fc_threshold,
    DEG_direction = "down"
  )

# Add 1/0 to indicate DEG
DEG_df <-
  DEG_df %>%
  dplyr::mutate(DEG = case_when(
    abs(coef) > fc_threshold & fdr < fdr_threshold ~ 1,
    TRUE ~ 0
  ))

a <-
  plot_snrna_summary_table(
    DEG_df_filtered = DEG_df_signif %>%
      dplyr::filter(str_detect(comparison, "Control")),
    ct_class = ct_class,
    fct_levels = c("Control vs PD", "Control vs PDD", "Control vs DLB")
  )

b_1 <-
  DEG_df %>%
  dplyr::filter(str_detect(comparison, "Control"), direction_of_effect == "down") %>%
  dplyr::right_join(
    DEG_ids_down %>%
      dplyr::filter(str_detect(comparison, "Control"))
  ) %>%
  dplyr::mutate(comparison = fct_relevel(
    comparison %>% str_replace_all("_", " "),
    c("Control vs PD", "Control vs PDD", "Control vs DLB")
  )) %>%
  plot_snrna_gene_overlaps(
    ylab = "Down-reg DEG (no or yes)",
    palette = palette
  )


b_2 <-
  DEG_df %>%
  dplyr::filter(str_detect(comparison, "Control"), direction_of_effect == "up") %>%
  dplyr::right_join(
    DEG_ids_up %>%
      dplyr::filter(str_detect(comparison, "Control")),
    by = c("comparison", "gene", "class")
  ) %>%
  dplyr::mutate(
    DEG = case_when(
      is.na(DEG) ~ 0,
      TRUE ~ DEG
    ),
    comparison = fct_relevel(
      comparison %>% str_replace_all("_", " "),
      c("Control vs PD", "Control vs PDD", "Control vs DLB")
    )
  ) %>%
  plot_snrna_gene_overlaps(
    ylab = "Up-reg DEG (no or yes)",
    palette = palette
  )

gsea_dummy_df <-
  generate_dummy_pathway_df(
    df = pathway_GO_sim_df %>%
      dplyr::filter(str_detect(comparison, "Control")),
    pathway_col = "parent_term",
    cell_type_col = "class",
    direction_col = "direction_of_effect",
    comparison_col = "comparison"
  ) %>%
  dplyr::rename(
    parent_term = pathway,
    class = cell_type
  )

c <- plot_snrna_gsea_pathways(
  pathway_GO_sim_df = pathway_GO_sim_df %>%
    dplyr::filter(str_detect(comparison, "Control")),
  gsea_dummy_df = gsea_dummy_df,
  fct_levels = c("Control vs PD", "Control vs PDD", "Control vs DLB")
) +
  theme(legend.position = "bottom")

fig <-
  ggpubr::ggarrange(ggpubr::ggarrange(a,
    cowplot::plot_grid(b_1, b_2,
      align = "v", axis = "lr",
      nrow = 2
    ),
    ncol = 2,
    labels = c("a", "b"),
    widths = c(1, 3)
  ),
  c,
  labels = c("", "c"),
  ncol = 1,
  heights = c(1, 1.5)
  )


fig

6.5 Supplementary figure(s)

a <-
  plot_snrna_summary_table(
    DEG_df_filtered = DEG_df_signif %>% 
      dplyr::filter(!str_detect(comparison, "Control")),
    ct_class = ct_class,
    fct_levels = c("PD vs PDD", "PD vs DLB", "PDD vs DLB")
  )

b_1 <-
  DEG_df %>%
  dplyr::filter(!str_detect(comparison, "Control"), direction_of_effect == "down") %>%
  dplyr::right_join(
    DEG_ids_down %>%
      dplyr::filter(!str_detect(comparison, "Control"))
  ) %>%
  dplyr::mutate(
    comparison = 
      fct_relevel(
        comparison %>% str_replace_all("_", " "),
        c("PD vs PDD", "PD vs DLB", "PDD vs DLB")
      )
    ) %>%
  plot_snrna_gene_overlaps(
    ylab = "Down-reg DEG (no or yes)",
    palette = palette
  )

b_2 <-
  DEG_df %>%
  dplyr::filter(!str_detect(comparison, "Control"), direction_of_effect == "up") %>%
  dplyr::right_join(
    DEG_ids_up %>%
      dplyr::filter(!str_detect(comparison, "Control")),
    by = c("comparison", "gene", "class")
  ) %>%
  dplyr::mutate(
    DEG = case_when(
      is.na(DEG) ~ 0,
      TRUE ~ DEG
    ),
    comparison = fct_relevel(
      comparison %>% str_replace_all("_", " "),
      c("PD vs PDD", "PD vs DLB", "PDD vs DLB")
    )
  ) %>%
  plot_snrna_gene_overlaps(
    ylab = "Up-reg DEG (no or yes)",
    palette = palette
  )



gsea_dummy_df <-
  generate_dummy_pathway_df(
    df = pathway_GO_sim_df %>%
      dplyr::filter(!str_detect(comparison, "Control")),
    pathway_col = "parent_term",
    cell_type_col = "class",
    direction_col = "direction_of_effect",
    comparison_col = "comparison"
  ) %>%
  dplyr::rename(
    parent_term = pathway,
    class = cell_type
  )

c <-
  plot_snrna_gsea_pathways(
    pathway_GO_sim_df = pathway_GO_sim_df %>%
      dplyr::filter(!str_detect(comparison, "Control")),
    gsea_dummy_df = gsea_dummy_df,
    fct_levels = c("PD vs PDD", "PD vs DLB", "PDD vs DLB")
  ) +
  theme(legend.position = "bottom")

supp_fig_1 <-
  ggpubr::ggarrange(
    ggpubr::ggarrange(a,
    cowplot::plot_grid(b_1, b_2,
      align = "v", axis = "lr",
      nrow = 2
    ),
    ncol = 2,
    labels = c("a", "b"),
    widths = c(1, 3)
  ),
  c,
  labels = c("", "c"),
  ncol = 1,
  heights = c(1, 1.5)
  )

supp_fig_1

fig_list <- vector(mode = "list", length = 2)
deg_pal <- alpha(RColorBrewer::brewer.pal(n = 11, name = "RdYlBu")[c(9,3)], alpha = 0.7)

# Number of cell types in which pathways were enriched across comparisons/direction of effect
fig_list[[1]] <-
  pathway_GO_sim_df %>% 
  dplyr::mutate(
    comparison =
        str_replace_all(comparison, "_", " ") %>% 
        fct_relevel(fct_disease),
    direction_of_effect = case_when(
        direction_of_effect == "down" ~ "Down-reg DEG",
        direction_of_effect == "up" ~ "Up-reg DEG")
  ) %>% 
  dplyr::distinct(comparison, direction_of_effect, cell_type, parent_id) %>% 
  dplyr::group_by(comparison, direction_of_effect, parent_id) %>% 
  dplyr::count() %>% 
  ggplot(
    aes(
      x = comparison,
      y = n,
      fill = direction_of_effect
    )
  ) +
  geom_boxplot(outlier.size = 0.5) +
  labs(
    x = "", 
    y = "Number of cell types per reduced GO term enrichment"
  ) +
  scale_fill_manual(name = "", values = deg_pal) +
  expand_limits(y = 0) +
  theme_rhr +
    theme(legend.key.size = unit(0.35, "cm"))

# Number of pathways enriched in each cell type across comparisons/direction of effect
data_to_plot <- 
  pathway_GO_sim_df %>% 
  dplyr::distinct(comparison, direction_of_effect, class, parent_id) %>% 
  dplyr::count(comparison, direction_of_effect, class) %>% 
  dplyr::mutate(
    comparison =
      str_replace_all(comparison, "_", " ") %>% 
      fct_relevel(fct_disease),
    direction_of_effect = case_when(
      direction_of_effect == "down" ~ "Down-reg DEG",
      direction_of_effect == "up" ~ "Up-reg DEG"
    ),
    label_ct= 
      case_when(class %in% c("Excitatory neuron", "Inhibitory neuron") ~ "black",
                TRUE ~ "white")
  )

fig_list[[2]] <-
 data_to_plot %>% 
  ggplot(
    aes(
      x = 
        MarkerGenes::reorder_within(
          x = class, 
          by = n, 
          within = comparison, 
          fun = sum, 
          desc = T
        ),
      y = n,
      fill = direction_of_effect,
      colour = label_ct
    )
  ) +
  geom_col(size = 0.25) +
  MarkerGenes::scale_x_reordered() +
  facet_wrap(vars(comparison), scales = "free_x", ncol = 3) +
  labs(
    x = "",
    y = "Number of enriched reduced GO terms"
  ) +
  scale_colour_manual(values = unique(data_to_plot$label_ct)) +
  scale_fill_manual(values = deg_pal) +
  guides(colour = F) +
  theme_rhr

supp_fig_2 <- 
  ggpubr::ggarrange(
  plotlist = fig_list,
  labels = letters[1:2],
  widths = c(1,1.5),
  common.legend = T
)

supp_fig_2

6.6 Supplementary table(s)

supp_table_1 <- 
  setNames(
    vector(mode = "list", length = 2),
    c("Column_descriptions", "Results")
  )

supp_table_1[[2]] <-
  DEG_df %>%
  dplyr::select(
    comparison, cell_type = class, 
    hgnc_symbol = gene, `Pr(>Chisq)` = `Pr..Chisq.`, 
    coef, ci_high = ci.hi, ci_low = ci.lo, fdr
  )

supp_table_1[[1]] <-
  tibble(
    `Column name` = colnames(supp_table_1[[2]]),
    Description = c(
      "Pairwise comparison in question. Results extracted with the name group[1]_vs_group[2], where group[1] is the baseline/reference value and group[2] is in reference to the baseline value",
      "Cell type",
      "Gene HGNC symbol",
      "Likelihood ratio test p-value",
      "Log2 fold change",
      "Upper bound of confidence interval",
      "Lower bound of confidence interval",
      "FDR-adjusted p-value (Pr(>Chisq))"
    )
  )

supp_table_2 <- 
  setNames(
    vector(mode = "list", length = 2),
    c("Column_descriptions", "Results")
  )

supp_table_2[[2]] <-
  pathway_GO_sim_df %>%
  dplyr::select(comparison,
    cell_type = class, direction_of_effect, go_type, 
    parent_term = parent_id, parent_description = parent_term, 
    child_to_parent_sim_score = parent_sim_score,
    child_term = go_id, child_description = description, 
    link, size, overlap, expect, enrichmentRatio, pValue, FDR, 
    overlapping_genes = userId
  ) %>%
  dplyr::arrange(comparison, cell_type, direction_of_effect, 
                 parent_description, child_description)

supp_table_2[[1]] <-
  tibble(
    `Column name` = colnames(supp_table_2[[2]]),
    Description = c(
      "Pairwise comparison from which the set of differentially expressed genes run in pathway analysis was derived",
      "Cell type from which the set of differentially expressed genes run in pathway analysis was derived",
      "Direction of effect of the set of differentially expressed genes run in pathway analysis (down = differentially expressed genes with a negative log2(fold change); up = differentially expressed genes with a positive log2(fold change))",
      "Domain of Gene Ontology from which child and parent term are derived (BP = biological process; CC = cellular component; MF = molecular function).",
      "The representative term used after child terms were reduced for redundancy",
      "Description of the parent GO term",
      "Child to parent semantic similarity score",
      "ID of the enriched GO term",
      "Description of the enriched GO term",
      "Web link to the child term",
      "The number of genes in the GO term",
      "The number of mapped input genes that are annotated in the gene set",
      "Expected number of input genes that are annotated to the gene set",
      "Enrichment ratio = overlap/expect",
      "P-value from hypergeometric test of overrepresentation",
      "FDR-corrected p-value",
      "The gene IDs found overlapping between the tested set of differentially expressed genes and the child GO term"
    )
  )

6.7 Export files

ggsave(
  "figure_snRNA_DEG_general.tiff",
  fig,
  device = "tiff",
  path = path_for_figures,
  width = 150,
  height = 180,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_snRNA_DEG_general_disease.png",
  supp_fig_1,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 180,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_snRNA_DEG_general_pathways.png",
  supp_fig_2,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 100,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

openxlsx::write.xlsx(supp_table_1,
  file = file.path(path_for_tables, "Supplementary_Table_5_Single_nucleus_DE_genes.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

openxlsx::write.xlsx(supp_table_2,
  file = file.path(path_for_tables, "Supplementary_Table_6_Single_nucleus_GO_pathway_enrichments.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

7 Single-nucleus DEG and pathways (PD-specific)

7.1 Loading data

# File paths
pathway_files <-
  list.files(file.path(path_to_results, "snRNA/pathways/pd_related_pathways_results/"),
    pattern = "enrichment_results",
    recursive = T,
    full.names = T
  )

# Filter for up/down-regulated gene sets
pathway_files <- pathway_files[str_detect(pathway_files, str_c("DownRegulated", "UpRegulated", sep = "|"))]

# Load across multiple workbooks and sheets
pathway_list <-
  setNames(
    object = pathway_files %>%
      lapply(., function(file) {
        file %>%
          read_delim(delim = "\t")
      }),
    nm = pathway_files %>%
      str_remove(".*/") %>%
      str_remove(".*_PATHWAYS_") %>%
      str_remove(".txt")
  )

# Create dataframe of pathways
pd_pathways_df <-
  pathway_list %>%
  qdapTools::list_df2df(col1 = "list_name") %>%
  as_tibble() %>%
  # Tidy tibble
  tidyr::separate(col = list_name, into = c("cell_type", "comparison_2", "comparison_1", "direction_of_effect")) %>%
  dplyr::mutate(
    comparison_1 = case_when(
      comparison_1 == "C" ~ "Control",
      TRUE ~ comparison_1
    ),
    comparison = str_c(comparison_1, " vs ", comparison_2) %>%
      fct_relevel(
        .,
        fct_disease
      ),
    direction_of_effect = case_when(
      direction_of_effect == "UpRegulated" ~ "up",
      direction_of_effect == "DownRegulated" ~ "down",
      TRUE ~ direction_of_effect
    ),
    cell_type = case_when(
      cell_type == "Astrocytes" ~ "Astro",
      cell_type == "Oligodendrocytes" ~ "Oligo",
      TRUE ~ cell_type
    ),
    pathway = case_when(
      str_detect(geneSet, "KEGG") ~ geneSet %>%
      str_split(pattern = "_") %>%
      lapply(., function(strings) {
        strings[-1] %>%
          str_c(collapse = " ")
      }) %>%
      unlist() %>%
      str_replace_all("-", " ") %>% 
      str_to_sentence(),
      TRUE ~ geneSet %>% 
        str_replace_all("-", " ") %>% 
        str_to_sentence()
      )
  ) %>%
  dplyr::inner_join(ct_class,
    by = c("cell_type" = "CellType")
  ) %>%
  # Need to join pathway groups
  dplyr::inner_join(
    read_csv(file.path(path_to_results, "snRNA/pathways/pathway_groups.csv")) %>% 
      dplyr::mutate(
        pathway = pathway %>% 
          str_remove(".*: ") %>% 
          str_to_sentence(),
        pathway_group = pathway_group %>% 
          fct_relevel(
            .,
            c(
              "Immune system", "Metabolism", "Neuronal transmission across chemical synapses", "Programmed cell death",
              "Signal transduction", "Vesicle mediated transport", "Miscellaneous"
            )
          ),
        pathway_group_wrapped = str_replace_all(pathway_group, "\\.", " ") %>% str_to_sentence() %>% str_wrap(., width = 15)
      )
  ) %>%
  dplyr::select(comparison, cell_type, class, direction_of_effect, pathway, everything(), -comparison_1, -comparison_2, -overlapId, -geneSet)

# Load PD Genes
pd_genes <- 
  read_delim(
    file.path(path_to_raw_data, "/misc/PD_risk_genes/Blauwendraat2019_PDgenes.txt"),
  delim = "\t"
)

# Load PD gene snRNA expression
pd_genes_data <-
  read_delim(
    file.path(path_to_results, "snRNA/seurat_objects/norm_expr_pd_genes.txt"),
    delim = "\t"
  )

# Load SNCA umap embedding
snca_umap <- 
  readRDS(
    file.path(path_to_results, "snRNA/seurat_objects/plot_data_umap_snca.rds")
  )

names(snca_umap) <- c("neuron_umap", "snca_umap")

7.2 Numbers quoted in text

# Number of PD genes
pd_genes$Gene %>% 
  length()

## [1] 21

# Number of PD genes that are DE in case-control comparisons
DEG_df_signif %>%
  dplyr::filter(str_detect(comparison, "Control"), 
                gene %in% pd_genes$Gene) %>% 
  dplyr::pull(gene) %>% 
  unique() %>% 
  length()

## [1] 13

# Number of  PD genes that are DE in a cell type across any case-control comparison 
DEG_df_signif %>%
  dplyr::filter(str_detect(comparison, "Control"), 
                gene %in% pd_genes$Gene) %>% 
  dplyr::distinct(cell_type, gene) %>% 
  dplyr::count(cell_type)

# Fold change and p-value range of SNCA in Control vs PD
DEG_df_signif %>%
  dplyr::filter(comparison == "Control_vs_PD", 
                gene == "SNCA") %>% 
  dplyr::mutate(
    across(
      .cols = c("coef", "ci.hi", "ci.lo", "fdr"),
      .fn = ~signif(.x, digits = 2))
    ) %>% 
  dplyr::select(comparison, cell_type, gene, coef, contains("ci"), fdr)

# Number of excitatory and inhibitory nuclei
nrow(snca_umap$neuron_umap)

## [1] 102293

# Proportion of nuclei expressing SNCA across disease groups
# And range of non-zero SNCA expression
pd_genes_data %>% 
  dplyr::filter(SNCA != 0) %>% 
  dplyr::group_by(disease_group, cell_type) %>% 
  dplyr::summarise(n_nuclei_nonzero = n(),
                   min_expr_nonzero = min(SNCA),
                   max_expr = max(SNCA),
                   range_expr = max_expr - min_expr_nonzero) %>% 
  dplyr::inner_join(pd_genes_data %>% 
                      dplyr::group_by(disease_group, cell_type) %>% 
                      dplyr::summarise(n_nuclei_total = n())) %>% 
  dplyr::mutate(
    percent_nuclei_expr_snca = 
      (n_nuclei_nonzero/n_nuclei_total) * 100
    ) %>%
  dplyr::mutate(
    dplyr::across(contains("expr"), ~ round(.x, digits = 2))
  ) %>% 
  dplyr::arrange(disease_group, -percent_nuclei_expr_snca)

# Number of pathways per cell type in case-control comparisons
pd_pathways_df %>% 
  dplyr::filter(str_detect(comparison, "Control")) %>% 
    dplyr::distinct(class, pathway) %>% 
    dplyr::count(class)

7.3 Main text figure(s)

fig_list <- vector(mode = "list", length = 5)

fig_list[[1]] <-
  plot_deg_fc(
    DEG_df = DEG_df_signif %>%
      dplyr::filter(str_detect(comparison, "Control"), gene %in% pd_genes$Gene),
    fct_levels = c("Control vs PD", "Control vs PDD", "Control vs DLB")
  )

fig_list[[2]] <-
  snca_umap$neuron_umap %>% 
  ggplot(
    aes(
      x = UMAP_1,
      y = UMAP_2,
      colour = cell_type
    )
  ) +
  geom_point(size = 0.01, alpha = 0.05) +
  coord_fixed(ratio = 1,
              xlim = c(-10,10),
                  ylim = c(-10,10)) +
  labs(
    x = "UMAP 1",
    y = "UMAP 2",
    colour = ""
    ) +
  scale_colour_manual(
    values = 
      colour_schemes_list$cell_type %>% 
      dplyr::filter(str_detect(group, "neuron")) %>% 
      .[["colour_hex"]] %>% 
      as.character(), 
    labels = 
      c("Excitatory" = "Excitatory neuron",
        "Inhibitory" = "Inhibitory neuron")
  ) +
  theme_rhr +
  guides(
    colour = guide_legend(
    override.aes = list(alpha = 1, size=1)
    )
    ) +
  theme(axis.text.x = element_text(size = 5, angle = 0),
        legend.position = "top",
        plot.margin = margin(t = -1, r = -1, b = -1, l = -1, unit = "pt")
        )

fig_list[[3]] <-
  snca_umap$snca_umap %>% 
  dplyr::arrange(SNCA) %>% 
  ggplot(
    aes(
      x = UMAP_1,
      y = UMAP_2,
      colour = SNCA
    )
  ) +
  geom_point(size = 0.01, alpha = 0.05) +
  coord_fixed(ratio = 1,
              xlim = c(-10,10),
                  ylim = c(-10,10)) +
  labs(x = "UMAP 1",
       y = "UMAP 2",
       title = "",
       colour = "SNCA expression"
  ) +
  scale_color_gradientn(
    colors = c('lightgrey', 'blue'),
    guide = "colorbar",
    limits = c(0,
               max(snca_umap$snca_umap$SNCA) %>% round())
  ) +
  theme_rhr +
  theme(
    axis.text.x = element_text(size = 5, angle = 0),
    legend.key.size = unit(0.2, "cm"),
    legend.position = "top",
    plot.margin = margin(t = -1, r = -1, b = 0, l = -1, unit = "cm")
  )

fig_list[[4]] <- 
  pd_genes_data %>% 
  dplyr::select(-(PRKN:LRP10)) %>% 
  dplyr::filter(
    cell_type %in% c("Excitatory")
  ) %>% 
  plot_snrna_ridge_expr(
    cols = c("SNCA"),
    ct_class = ct_class
  )


fig_list[[5]] <-
  pd_pathways_df %>%
  dplyr::filter(str_detect(comparison, "Control")) %>%
  dplyr::mutate(
    direction_of_effect = 
      case_when(
        direction_of_effect == "down" ~ "Down-reg DEG",
        direction_of_effect == "up" ~ "Up-reg DEG"
      )
    ) %>%
  plot_n_snrna_pd_pathways()


fig <- 
  cowplot::plot_grid(
    cowplot::plot_grid(
      cowplot::plot_grid(
        plotlist = fig_list[c(1,4)],
        ncol = 1,
        align = "v",
        axis = "lr",
        labels = c("a", "c")
      ),
      cowplot::plot_grid(
        plotlist = fig_list[2:3],
        ncol = 1,
        align = "hv",
        labels = c("b")
      ),
      ncol = 2,
      rel_widths = c(1,1.25)
    ),
    fig_list[[5]],
    labels = c("", "d"),
    ncol = 1,
    rel_heights = c(2,1)
  )

fig

7.4 Supplementary figure(s)

supp_fig_1 <-
  pd_genes_data %>% 
  dplyr::mutate(
    disease_group =
      fct_relevel(disease_group,
                  c("Control", "PD", "PDD", "DLB"))
  ) %>% 
  dplyr::select(-(PRKN:LRP10)) %>% 
  plot_snrna_cdf_expr(
    cols = c("SNCA"),
    ct_class = ct_class,
    palette = palette  
    )

supp_fig_1

dummy_pathway_df <-
  generate_dummy_pathway_df(
    df = pd_pathways_df,
    pathway_col = "pathway",
    cell_type_col = "class",
    direction_col = "direction_of_effect",
    comparison_col = "comparison"
  ) %>%
  dplyr::rename(class = cell_type) %>%
  dplyr::inner_join(
    pd_pathways_df %>%
      dplyr::distinct(pathway, pathway_group)
  )

supp_fig_2 <-
  pd_pathways_df %>%
  dplyr::right_join(dummy_pathway_df) %>%
  dplyr::filter(str_detect(comparison, "Control")) %>%
  dplyr::mutate(direction_of_effect = case_when(
    direction_of_effect == "down" ~ "Down-reg\nDEG",
    direction_of_effect == "up" ~ "Up-reg\nDEG"
  )) %>%
  plot_snrna_pd_pathways(ct_class = ct_class, pathway_group = TRUE)

supp_fig_2

7.5 Supplementary table(s)

supp_table <- 
  setNames(
    vector(mode = "list", length = 2),
    c("Column_descriptions", "Results")
  )

supp_table[[2]] <-
  pd_pathways_df %>%
  dplyr::select(
    comparison, cell_type = class, direction_of_effect, 
    pathway_group, pathway, link, size, 
    overlap, expect, enrichmentRatio, 
    pValue, FDR, overlapping_genes = userId
  ) %>%
  dplyr::arrange(
    comparison, cell_type, 
    direction_of_effect, pathway_group, pathway
  )

supp_table[[1]] <-
  tibble(
    `Column name` = colnames(supp_table[[2]]),
    Description = c(
      "Pairwise comparison from which the set of differentially expressed genes run in pathway analysis was derived",
      "Cell type from which the set of differentially expressed genes run in pathway analysis was derived",
      "Direction of effect of the set of differentially expressed genes run in pathway analysis (down = differentially expressed genes with a negative log2(fold change); up = differentially expressed genes with a positive log2(fold change))",
      "Group of the pathway, as grouped by Bandres-Ciga et al. (PMID: 32601912)",
      "Description of the enriched pathway",
      "Web link to the child term",
      "The number of genes in the pathway",
      "The number of mapped input genes that are annotated in the gene set",
      "Expected number of input genes that are annotated to the gene set",
      "Enrichment ratio = overlap/expect",
      "P-value from hypergeometric test of overrepresentation",
      "FDR-corrected p-value",
      "The gene IDs found overlapping between the tested set of differentially expressed genes and the child GO term"
    )
  )

7.6 Export files

ggsave(
  "figure_snRNA_DEG_PD_specific.tiff",
  fig,
  device = "tiff",
  path = path_for_figures,
  width = 150,
  height = 210,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_snRNA_DEG_PD_specific_SNCA.png",
  supp_fig_1,
  device = "png",
  path = path_for_figures,
  width = 140,
  height = 160,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_snRNA_DEG_PD_specific.png",
  supp_fig_2,
  device = "png",
  path = path_for_figures,
  width = 170,
  height = 170,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

openxlsx::write.xlsx(supp_table,
  file = file.path(path_for_tables, "Supplementary_Table_7_Single_nucleus_pd_pathway_enrichments.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

8 Bulk-tissue differential expression

8.1 Loading data

list_vsd_corrected <- readRDS(here::here("docs", "figures", "list_vsd_correction_strategies.Rds"))

pc_pathways <-
  readRDS(file.path(path_to_results, "Salmon_quant/", "bulk_tissue/", "clusterprofiler/", "clusterProfiler_PC1_PCcorrected.Rds")) %>%
  lapply(., function(df) {

    # Extraction with @result requires post-filtering for significant terms
    df@result %>%
      dplyr::filter(p.adjust < 0.05) %>%
      tidyr::separate(col = "BgRatio", into = c("term_size", "background_size"), sep = "/") %>%
      tidyr::separate(col = "GeneRatio", into = c("num", "denom"), sep = "/") %>%
      dplyr::mutate(
        GeneRatio = as.numeric(num) / as.numeric(denom),
        term_size = as.integer(term_size),
        background_size = as.integer(background_size)
      ) %>%
      dplyr::filter(term_size >= 20, term_size <= 2000) %>%
      dplyr::rename(go_id = ID) %>%
      dplyr::select(-num, -denom)
  }) %>%
  qdapTools::list_df2df(col1 = "go_type")

# Reduce cluster compare terms using rutils::go_reduce()
pc_pathways_reduced <-
  pc_pathways %>%
  rutils::go_reduce(.,
    threshold = 0.9
  )

## [1] "Reducing sub-ontology: BP"
## [1] "Reducing sub-ontology: CC"
## [1] "Reducing sub-ontology: MF"

bulk_deg <- 
  extract_results_dds(
    dds = filter_dds, 
    sample_info = colData(filter_dds), 
    group_column_name = "Disease_Group"
    )

##                              V2              V1 
## "Disease_Group"            "PD"       "Control" 
##                              V2              V1 
## "Disease_Group"           "PDD"       "Control" 
##                              V2              V1 
## "Disease_Group"           "DLB"       "Control" 
##                              V2              V1 
## "Disease_Group"           "PDD"            "PD" 
##                              V2              V1 
## "Disease_Group"           "DLB"            "PD" 
##                              V2              V1 
## "Disease_Group"           "DLB"           "PDD"

8.2 Numbers quoted in text

source(here::here("R", "biomart_df.R"))

# Total number of DE genes
bulk_deg %>%
  dplyr::filter(padj < 0.05) %>%
  nrow()

## [1] 60

# Total unique DE genes
bulk_deg %>%
  dplyr::filter(padj < 0.05) %>%
  .[["gene"]] %>%
  unique() %>%
  length()

## [1] 53

# Overlap with snRNA - DEG
bulk_deg %>%
  dplyr::filter(padj < 0.05) %>%
  biomart_df(
    columnToFilter = "gene",
    mart = 38,
    ensembl_version = "jul2019.archive.ensembl.org",
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filter = "ensembl_gene_id"
  ) %>%
  dplyr::inner_join(
    DEG_df %>%
      dplyr::filter(abs(coef) > fc_threshold, fdr < 0.05),
    by = c("comparison", "hgnc_symbol" = "gene")
  ) %>%
  dplyr::select(comparison,
    ensembl_id = gene, hgnc_symbol,
    bulk_log2FC = shrunk_log2FC, bulk_pvalue = pvalue,
    bulk_FDR = padj, cell_type,
    snrna_logFC = coef, snrna_FDR = fdr
  )

## [1] "Number of unique genes to search: 53"
## [1] "Number of matches found:53"

8.3 Supplementary figure(s)

fig_list <- vector(mode = "list", length = 4)
titles <- c("Uncorrected", "Corrected by AoD, Sex, RIN", "Corrected by PC axes 1-4")

for (i in 1:length(list_vsd_corrected)) {
  fig_list[[i]] <-
    factoextra::fviz_pca_ind(list_vsd_corrected[[i]],
      geom.ind = "point", # show points only (nbut not "text")
      col.ind = colData(filter_vsd)[, c("Disease_Group")], # color by groups
      addEllipses = TRUE, ellipse.type = "confidence", # Confidence ellipses
      palette = as.character(colour_schemes_list$disease_greens$colour_hex),
      mean.point = FALSE,
      legend.title = "Disease group",
      title = titles[i]
    ) +
    theme_rhr +
    theme(plot.title = element_text(size = 7))
}

fig_list[[4]] <-
  plot_cprofiler_reduced_barplot(
    pathway_results = pc_pathways_reduced %>%
      dplyr::mutate(parent_term = str_wrap(parent_term, width = 20))
  ) +
  theme(legend.position = "right")


supp_fig <-
  ggarrange(ggarrange(
    plotlist = fig_list[1:3],
    labels = letters[1:3],
    ncol = 1,
    common.legend = T,
    legend = "bottom"
  ),
  fig_list[[4]],
  labels = c("", "d"),
  ncol = 2,
  widths = c(1.5, 1)
  )

supp_fig

8.4 Supplementary table(s)

supp_table_1 <- 
  setNames(
    vector(mode = "list", length = 2),
    c("Column_descriptions", "Genes")
  )

supp_table_1[[2]] <-
  bulk_deg %>%
  dplyr::filter(padj < 0.05) %>%
  biomart_df(
    columnToFilter = "gene",
    mart = 38,
    ensembl_version = "jul2019.archive.ensembl.org",
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filter = "ensembl_gene_id"
  ) %>%
  dplyr::select(comparison, ensembl_id = gene, hgnc_symbol, 
                baseMean, log2FoldChange, lfcSE, shrunk_log2FC, 
                shrunk_lfcSE, stat, pvalue, padj)

supp_table_1[[1]] <-
  tibble(
    `Column name` = colnames(supp_table_1[[2]]),
    Description = c(
      "Pairwise comparison in question. Results extracted with the name group[1]_vs_group[2], where group[1] is the baseline/reference value and group[2] is in reference to the baseline value.",
      "Gene ensembl id",
      "Gene HGNC symbol",
      "Mean of normalized counts for all samples",
      "Log2 fold change (using DESeq2 maximum likelihood estimation). Reference = group[1] i.e. in 'Control_vs_PD' the reference is the control group.",
      "Log2 fold change standard error",
      "Shrinkage of effect size (i.e. log2 FC) using the normal prior distribution in DESeq2",
      "Shrinkage of effect size standard error (i.e. lfcSE) using the normal prior distribution in DESeq2",
      "Test statistic (test used = Wald test)",
      "P-value",
      "FDR-adjusted p-value"
    )
  )

supp_table_2 <- 
  setNames(
    vector(mode = "list", length = 3),
    c("Column_descriptions", "Genes", "Pathway_enrichments")
  )

supp_table_2[[2]] <-
  pc_top_genes %>% 
  dplyr::select(ensembl_id, hgnc_symbol, PC1) %>% 
  dplyr::arrange(-PC1)

supp_table_2[[3]] <-
  pc_pathways_reduced %>%
  dplyr::select(
    go_type,
    parent_term = parent_id, parent_description = parent_term, 
    child_to_parent_sim_score = parent_sim_score,
    child_term = go_id, child_description = Description, 
    GeneRatio, term_size, overlap = Count, pvalue, FDR = p.adjust, 
    overlapping_genes = geneID
  ) %>%
  dplyr::arrange(go_type, parent_term, -child_to_parent_sim_score)


supp_table_2[[1]] <-
  tibble(
    Sheet = c(
      rep(names(supp_table_2)[2], length(supp_table_2[[2]])),
      rep(names(supp_table_2)[3], length(supp_table_2[[3]]))
    ),
    `Column name` = c(
      colnames(supp_table_2[[2]]),
      colnames(supp_table_2[[3]])
    ),
    Description = c(
      "Ensembl gene ID",
      "Gene HGNC symbol",
      "Contribution of genes to gene expression PC axis 1 (following correction for cell-type proportions and experimental covariates). Extracted using factoextra::get_pca_var().",
      "Domain of Gene Ontology from which child and parent term are derived (BP = biological process; CC = cellular component; MF = molecular function)",
      "ID of the parent GO term",
      "Description of the parent GO term",
      "Child to parent semantic similarity score",
      "ID of the enriched GO term",
      "Description of the enriched GO term",
      "Proportion of genes in the input list that are annotated to the GO term (i.e. intersection/input list)",
      "The number of genes in the GO term",
      "Intersection size",
      "P-value of enrichment using a hypergeometric test",
      "FDR-corrected p-value",
      "The gene IDs found overlapping between the tested set of differentially expressed genes and the child GO term"
    )
  )

8.5 Export files

ggsave(
  "suppfigure_pca_correction_strategies.png",
  supp_fig,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 140,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

openxlsx::write.xlsx(supp_table_1,
  file = file.path(path_for_tables, "Supplementary_Table_3_Bulk_tissue_DE_genes.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

openxlsx::write.xlsx(supp_table_2,
  file = file.path(path_for_tables, "Supplementary_Table_4_Genes_contributing_to_bulk_tissue_PC1.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

9 Bulk-tissue differential splicing

9.1 Loading data

# Load summary data
summary <-
  read_delim(file.path(path_to_results, "leafcutter/diff_splicing_PCaxes/summary.txt"), 
             delim = "\t") %>%
  dplyr::filter(status %in% c(
    "Successfully tested",
    "Differentially spliced clusters, p.adjust < 0.05",
    "Differentially spliced clusters, p.adjust < 0.05, |dPSI| >= 0.1"
  )) %>%
  dplyr::mutate(status = status %>%
    str_replace("p.adjust", "FDR") %>%
    fct_relevel(., c(
      "Successfully tested",
      "Differentially spliced clusters, FDR < 0.05",
      "Differentially spliced clusters, FDR < 0.05, |dPSI| >= 0.1"
    ))) %>%
  tidyr::gather(key = "comparison", value = "number_of_clusters", -status) %>%
  dplyr::mutate(
    comparison = str_replace_all(comparison, "_", " "),
    comparison_type = case_when(
      str_detect(comparison, "Control") ~ "Ref: control",
      TRUE ~ "Ref: disease"
    )
  )

# Load EWCE data
ewce <-
  setNames(
    object = c(
      list(readRDS(file.path(path_to_results, "leafcutter/ewce/ewce_leafcutter_ds_PC_dPSIfilter.Rds"))),
      list(readRDS(file.path(path_to_results, "leafcutter/ewce/ewce_leafcutter_ds_exact_validated_celltype.Rds")))
    ),
    nm = c("dPSI", "replication")
  ) %>%
  lapply(., function(df) {
    df %>%
      dplyr::filter(!Study %in% c("AIBS2018", "DRONC_human")) %>%
      dplyr::mutate(
        FDR.p = p.adjust(p, method = "BH"),
        Study = str_replace(Study, "list\\$", ""),
        Study = fct_relevel(
          Study,
          c("Control", "PD", "PDD", "DLB")
        ),
        comparison_type = case_when(
          str_detect(GeneSet, "Control") ~ "Ref: control",
          TRUE ~ "Ref: disease"
        ),
        GeneSet = fct_relevel(
          GeneSet %>% str_replace_all(., "_", " "),
          fct_disease
        )
      )
  })

# Load clusterProfiler data
cluster_compare <-
  setNames(
    object = c(
      list(readRDS(file.path(path_to_results, "leafcutter/gprofiler/clusterProfiler_leafcutter_ds_PC_dPSIfilter.Rds"))),
      list(readRDS(file.path(path_to_results, "leafcutter/gprofiler/clusterProfiler_leafcutter_overlap_exact_ds_with_SRP058181.Rds")))
    ),
    nm = c("Own", "recount2: SRP058181")
  ) %>%
  lapply(., function(list) {
    list %>%
      lapply(., function(df) {

        # Get length of input gene list
        n_input <- df@geneClusters %>%
          lapply(length) %>%
          qdapTools::list2df(col1 = "n_genes", col2 = "comparison")

        df %>%
          fortify(., showCategory = NULL, split = NULL) %>%
          tidyr::separate(col = "BgRatio", into = c("term_size", "background_size"), sep = "/") %>%
          dplyr::mutate(
            term_size = as.integer(term_size),
            background_size = as.integer(background_size)
          ) %>%
          dplyr::filter(term_size >= 20, term_size <= 2000) %>%
          dplyr::rename(go_id = ID) %>%
          dplyr::inner_join(n_input)
      }) %>%
      qdapTools::list_df2df(col1 = "go_type")
  }) %>%
  qdapTools::list_df2df(col1 = "dataset")

# Reduce cluster compare terms using rutils::go_reduce()
# Note: reducing redundancy using all terms identified from in-house data and external data
# This is to ensure similar parent terms across both datasets and allow for easier comparison
cluster_compare_reduced <-
  cluster_compare %>%
  rutils::go_reduce(.,
    threshold = 0.9
  )

## [1] "Reducing sub-ontology: BP"
## [1] "Reducing sub-ontology: CC"
## [1] "Reducing sub-ontology: MF"

# Load dataframe of exactly overlapping intron clusters from bulk and SRP058181
# Used to construct contigency table
cont_table <- readRDS(file.path(path_to_results, "leafcutter/diff_splicing_PCaxes/cluster_overlap_exact_ds_with_SRP058181.Rds"))

# Load leafcutter results lists form bulk and SRP058181
leafcutter_list <-
  setNames(
    object = list(
      readRDS(file.path(path_to_results, "leafcutter/diff_splicing_PCaxes/allcomparisons_leafcutter_ds_PCaxes.Rds")),
      readRDS(file.path(path_to_results, "SRP058181/leafcutter/diff_splicing_celltype/allcomparisons_leafcutter_ds_celltype.Rds"))
    ),
    nm = c("in-house", "SRP058181")
  )

# List with 2 dataframes: (1) successfully tested and annotated introns and (2) significant and annotated introns
intron_list <- readRDS(here::here("docs", "figures", "intron_list.Rds"))

For pathway summary figure, need to combine all pathway files into one list.

pathway_summary <-
  setNames(
    object =
      list(
        snrna_pathway_df %>%
          # Remove pathways found enriched in up- and down-regulated gene sets from same cell type
          dplyr::group_by(comparison, class, go_id) %>%
          dplyr::filter(n() < 2) %>%
          # Collapse across up and down-reg genes
          dplyr::distinct(comparison, class, go_type, go_id, description),
        pc_pathways %>%
          dplyr::rename(description = Description) %>%
          dplyr::mutate(
            comparison = "PC1",
            class = NA
          ),
        cluster_compare %>%
          dplyr::rename(description = Description) %>%
          # Collapse across own/validated dataset
          dplyr::distinct(comparison, go_type, go_id, description) %>% 
          dplyr::mutate(
            class = NA
          )
      ),
    nm = c("snrna_deg", "bulk_pc", "bulk_ds")
  ) %>% 
  # Standardise across dataframes
  lapply(., function(df){
    
    df %>% 
      dplyr::mutate(
        description = str_to_lower(description)
      ) %>% 
      dplyr::distinct(comparison, class, go_type, go_id, description) %>% 
      as_tibble()
    
  })

9.2 Numbers quoted in text

# Number of unique clusters
intron_list[[2]] %>%
  .[["cluster_id"]] %>%
  unique() %>%
  length()

## [1] 4656

# Number of unique genes
intron_list[[2]] %>%
  .[["gene_name_junc"]] %>%
  unique() %>%
  length()

## [1] 3751

# Proportion of each type of event
intron_list[[2]] %>%
  dplyr::filter(!junc_cat %in% c("ambig_gene")) %>%
  dplyr::group_by(comparison, junc_cat) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::group_by(comparison) %>%
  dplyr::mutate(prop = n / sum(n)) %>%
  dplyr::arrange(junc_cat, -prop)

# Proportion of partially annotated ds introns
intron_list[[2]] %>%
  dplyr::filter(!junc_cat %in% c("ambig_gene")) %>%
  dplyr::group_by(comparison, junc_cat) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::group_by(comparison) %>%
  dplyr::mutate(prop = n / sum(n)) %>%
  dplyr::filter(str_detect(junc_cat, "novel")) %>%
  dplyr::summarise(sum = sum(prop))

# Proportion of unannotated ds introns
intron_list[[2]] %>%
  dplyr::filter(!junc_cat %in% c("ambig_gene")) %>%
  dplyr::group_by(comparison, junc_cat) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::group_by(comparison) %>%
  dplyr::mutate(prop = n / sum(n)) %>%
  dplyr::filter(str_detect(junc_cat, "none")) %>%
  dplyr::summarise(sum = sum(prop))

# Chisq test
cont_table %>%
  with(., table(ds.SRP058181, ds.own, dnn = c("Differentially spliced in SRP058181?", "Differentially spliced in own data?"))) %>%
  fisher.test()

## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.001956
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.170652 1.995428
## sample estimates:
## odds ratio 
##   1.538183

# Number of ds genes across own/SRP058181
cont_table %>% 
  dplyr::group_by(ds.own, ds.SRP058181) %>% 
  dplyr::summarise(n = n())

# Proportion of bulk analysis pathways overlapping another analysis
pathway_count <- 
  pathway_summary %>%
  qdapTools::list_df2df(col1 = "analysis") %>%
  dplyr::group_by(go_id, description) %>%
  dplyr::summarise(
    n = n()
  ) %>%
  dplyr::mutate(
    present_in_bulk_ds =
      case_when(go_id %in% c(pathway_summary$bulk_ds %>%
                               .[["go_id"]] %>%
                               unique()) ~ TRUE,
                TRUE ~ FALSE),
    present_in_bulk_pc =
      case_when(go_id %in% pathway_summary$bulk_pc$go_id ~ TRUE,
                TRUE ~ FALSE),
    present_in_snrna =
      case_when(go_id %in% c(pathway_summary$snrna %>%
                               .[["go_id"]] %>%
                               unique()) ~ TRUE,
                TRUE ~ FALSE),
    sum_present = sum(present_in_bulk_ds, present_in_bulk_pc, present_in_snrna)
  ) %>%
  dplyr::filter(
    present_in_bulk_ds == TRUE | present_in_bulk_pc == TRUE
  )

print(
  str_c(
    "Number of bulk DS pathways that are shared: ", 
    pathway_count %>% 
      dplyr::filter(present_in_bulk_ds == TRUE, sum_present > 1) %>% 
      nrow()
  )
)

## [1] "Number of bulk DS pathways that are shared: 39"

print(
  str_c(
    "Total number of bulk DS pathways: ", 
    pathway_count %>% 
      dplyr::filter(present_in_bulk_ds == TRUE) %>% 
      nrow()
  )
)

## [1] "Total number of bulk DS pathways: 157"

print(
  str_c(
    "Number of bulk PC pathways that are shared: ", 
    pathway_count %>% 
      dplyr::filter(present_in_bulk_pc == TRUE, sum_present > 1) %>% 
      nrow()
  )
)

## [1] "Number of bulk PC pathways that are shared: 18"

print(
  str_c(
    "Total number of bulk DS pathways:", 
    pathway_count %>% 
      dplyr::filter(present_in_bulk_pc == TRUE) %>% 
      nrow()
  )
)

## [1] "Total number of bulk DS pathways:71"

9.2.1 Check frequency of events in GTEx anterior cingulate cortex

# Convert to granges object
query_gr <- 
  intron_list[[2]] %>%
  dplyr::filter(!junc_cat %in% c("ambig_gene")) %>%
  dplyr::distinct(seqnames, start, end, strand, cluster_id) %>%
  GenomicRanges::makeGRangesFromDataFrame()

# Load frontal cortex data
load("/data/recount/GTEx_SRP012682/gtex_split_read_table_annotated_rda/brain_anteriorcingulatecortex_ba24_split_read_table_annotated.rda")
gtex_split_read_table_annotated_only_junc_coverage <- 
  gtex_split_read_table_annotated_only_junc_coverage %>%
  GenomicRanges::makeGRangesFromDataFrame(.,
    keep.extra.columns = T,
    seqnames.field = "chr",
    start.field = "start",
    end.field = "stop",
    strand.field = "strand"
  )

# Find precisely overlapping junctions in gtex data
ids <- GenomicRanges::findOverlaps(query_gr, gtex_split_read_table_annotated_only_junc_coverage, type = "equal")
gtex_split_read_table_annotated_only_junc_coverage <- gtex_split_read_table_annotated_only_junc_coverage[subjectHits(ids)]

# For each junction, calculate frequency of samples with count > 0 (represented by countsSamples in GTEx granges)
# Total number of samples in ACC = 99
gtex_acc_freq <- 
  gtex_split_read_table_annotated_only_junc_coverage[, "countsSamples"] %>%
  as_tibble() %>%
  dplyr::mutate(frequency = countsSamples / 99 * 100)

# Join to original ds intron list
# Replace NAs with 0
ds_intron_w_gtex_freq <-
  intron_list[[2]] %>%
  dplyr::filter(!junc_cat %in% c("ambig_gene")) %>%
  dplyr::left_join(gtex_acc_freq) %>%
  dplyr::mutate(frequency = case_when(
    is.na(frequency) ~ 0,
    TRUE ~ frequency
  )) %>%
  dplyr::mutate(
    comparison = comparison %>%
      str_replace_all("_", " ") %>%
      fct_relevel(., fct_disease),
    junc_cat = junc_cat %>%
      str_replace("novel_combo", "novel_combination") %>%
      str_replace_all("_", " ") %>%
      str_to_sentence() %>%
      fct_relevel(., c("Annotated", "Novel exon skip", "Novel combination", "Novel acceptor", "Novel donor", "None"))
  )

# Proportion of DS introns undetected in GTEx
ds_intron_w_gtex_freq %>%
  dplyr::mutate(
    gtex_detected = case_when(
      frequency == 0 ~ FALSE,
      frequency > 0 ~ TRUE
    )) %>%
  dplyr::group_by(comparison, gtex_detected) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::group_by(comparison) %>%
  dplyr::mutate(prop = n / sum(n) * 100) %>%
  dplyr::filter(gtex_detected == FALSE) %>%
  dplyr::arrange(prop)

# Min/max proportion of DS introns undetected across all junction categories/comparisons
ds_intron_w_gtex_freq %>%
  dplyr::mutate(
    gtex_detected = case_when(
      frequency == 0 ~ FALSE,
      frequency > 0 ~ TRUE
    )
  ) %>%
  dplyr::group_by(comparison, junc_cat, gtex_detected) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::group_by(comparison, junc_cat) %>%
  dplyr::mutate(prop = n / sum(n) * 100) %>%
  dplyr::filter(gtex_detected == FALSE) %>% 
  dplyr::ungroup() %>% dplyr::summarise(min = min(prop), max = max(prop))

# Median detection rate across broader junction categories
ds_intron_w_gtex_freq %>%
  dplyr::mutate(
    junc_cat =
      case_when(str_detect(junc_cat, "Novel") ~ "Partial_anno",
                TRUE ~ as.character(junc_cat))
  ) %>% 
  dplyr::group_by(junc_cat) %>% 
  dplyr::summarise(median = median(frequency))

9.3 Main text figure(s)

fig <-
  ewce$dPSI %>%
  dplyr::mutate(
    p_text = case_when(
      FDR.p < 0.001 ~ "***",
      FDR.p < 0.01 ~ "**",
      FDR.p < 0.05 ~ "*"
    ),
    sd_from_mean = ifelse(FDR.p <= 0.05, sd_from_mean, NA)
  ) %>%
  plot_ewce(ct_class = ct_class, rotate = TRUE, cividis_palette = TRUE) +
  geom_text(aes(label = p_text, colour = p_text), size = 2.5) +
  scale_colour_manual(values = c("white", "black", "black")) +
  guides(colour = F) +
  theme(
    legend.key.size = unit(0.35, "cm")
  )

fig

plot_pathway_summary_heatmap(
  pathway_summary_list = pathway_summary,
  comparison_pattern = "Control_vs_PD|Control_vs_PDD|Control_vs_DLB|PC",
  output_dir = path_for_figures,
  output_name = "figure_differential_splicing_summary", 
  figure_height = 110
  )

9.4 Supplementary figure(s)

supp_list <- vector(mode = "list", length = 2)

supp_list[[1]] <-
  ggplot(
    summary,
    aes(
      x = fct_relevel(comparison,
                      fct_disease),
      y = number_of_clusters
    )
  ) +
  geom_col(position = position_dodge(), colour = "black") +
  facet_wrap(
    vars(status),
    scales = "free_y",
    labeller = labeller(status = setNames(
      object = summary$status %>%
        as.character() %>%
        unique() %>%
        str_replace("Successfully ", "Successfully\n") %>%
        str_replace("Differentially ", "Differentially\n") %>%
        str_replace_all(", ", "\n"),
      nm = summary$status %>%
        as.character() %>%
        unique()
    ))
  ) +
  labs(x = "", y = "Number of clusters") +
  theme_rhr

supp_list[[2]] <-
  plot_cprofiler_reduced(
    pathway_results = cluster_compare_reduced %>%
      dplyr::filter(dataset == "Own",
                    qvalue < 0.05),
    fct_levels = c("Control vs DLB", "PD vs PDD", "PD vs DLB", "PDD vs DLB")
  ) +
  theme(legend.position = "right")

supp_fig_1 <-
  cowplot::plot_grid(
    plotlist = supp_list,
    nrow = 2,
    rel_heights = c(1, 2),
    labels = letters[1:2]
  )

supp_fig_1

supp_fig_2 <-
  cont_table %>%
  dplyr::mutate(both = case_when(ds.own == TRUE & ds.SRP058181 == TRUE ~ TRUE,
                                 TRUE ~ FALSE)) %>%
  ggplot() +
  ggmosaic::geom_mosaic(aes(x = ggmosaic::product(ds.SRP058181, ds.own), fill = ds.own), na.rm = TRUE) +
  scale_fill_manual(values = c("#00BFC4", "#F8766D")) +
  labs(
    x = "Differentially spliced\nin bulk-tissue RNA-seq?",
    y = "Differentially spliced\nin SRP058181?",
    fill = "Differentially spliced\nin bulk-tissue RNA-seq?"
  ) +
  theme_rhr +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )

supp_fig_2 <- 
  supp_fig_2 +
  geom_text(
    data = ggplot_build(supp_fig_2)$data[[1]],
    aes(x = (xmin + xmax) / 2, y = (ymin + ymax) / 2, label = .wt), size = 2
  )

supp_fig_2

supp_list <- vector(mode = "list", length = 2)

supp_list[[1]] <-
  ewce$replication %>%
  dplyr::mutate(
    p_text = case_when(p < 0.05 ~ signif(p, digits = 2)),
    p_colour = case_when(
      FDR.p < 0.1 ~ "**",
      p < 0.05 ~ "*"
    ),
    sd_from_mean = ifelse(p < 0.05, sd_from_mean, NA)
  ) %>%
  plot_ewce(., ct_class, cividis_palette = TRUE) +
  geom_text(aes(label = p_text, colour = p_colour), size = 2) +
  scale_colour_manual(values = c("white", "black")) +
  guides(colour = F) +
  theme(
    legend.position = "top",
    legend.key.size = unit(0.6, "cm")
  )

supp_list[[2]] <-
  plot_cprofiler_reduced(
    pathway_results = cluster_compare_reduced %>%
      dplyr::filter(dataset == "recount2: SRP058181") %>%
      dplyr::mutate(parent_term = str_wrap(parent_term, 20)),
    fct_levels = c("Control vs PD", "Control vs PDD")
  ) +
  theme(
    legend.key.size = unit(0.6, "cm"),
    legend.justification = "centre",
    legend.position = "top",
    legend.box = "vertical"
  )

supp_fig_3 <-
  cowplot::plot_grid(
    plotlist = supp_list,
    ncol = 2,
    # axis = "bt",
    # align = "h",
    rel_widths = c(1, 1.5),
    labels = c("a", "b")
  )

supp_fig_3

supp_fig_list <- vector(mode = "list", length = 4)

supp_fig_list[[1]] <- 
  ggplot() +
  geom_blank() +
  theme_minimal()

supp_fig_list[[2]] <-
  plot_intron_annot_prop(intron_annotated = intron_list[[2]], 
                         novel_combo = TRUE) +
  theme(legend.position = "none")

# What proportion of partially annotated events not detected in gtex?
supp_fig_list[[3]] <-
  ds_intron_w_gtex_freq %>%
  dplyr::mutate(
    gtex_detected = case_when(
      frequency == 0 ~ FALSE,
      frequency > 0 ~ TRUE
    )) %>%
  dplyr::group_by(comparison, junc_cat, gtex_detected) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::group_by(comparison, junc_cat) %>%
  dplyr::mutate(prop = n / sum(n) * 100) %>%
  dplyr::filter(gtex_detected == FALSE) %>%
  ggplot(
    aes(
      x = fct_reorder(.f = junc_cat, .x = prop, .fun = median, .desc = T),
      y = prop, fill = junc_cat
    )
  ) +
  geom_boxplot(fatten = 1.5, outlier.alpha = 0) +
  ggbeeswarm::geom_beeswarm(priority = "density", shape = 21, alpha = 0.5) +
  expand_limits(y = 0) +
  labs(x = "", y = "Proportion of DS introns\n(within a category) not detected\nin GTEx BA24samples\n(% per comparison)") +
  scale_fill_manual(
    name = "Splicing event",
    values = c("#3C5488", "#4DBBD5", "#F3E361", "#E64B35", "#00A087", "grey")
  ) +
  theme_rhr +
  theme(legend.position = "none")

# Plot distribution of GTEx frequencies
supp_fig_list[[4]] <-
  ds_intron_w_gtex_freq %>%
  ggplot(
    aes(
      x = fct_reorder(.f = junc_cat, .x = frequency, .fun = median, .desc = T),
      y = frequency, fill = junc_cat
    )
  ) +
  geom_violin() +
  geom_boxplot(fill = "white", width = 0.1, outlier.alpha = 0) +
  # facet_wrap(vars(comparison)) +
  labs(x = "", y = "Proportion of GTEx\nBA24 samples where\nDS intron detected\n(% per intron)", fill = "Splicing event") +
  scale_fill_manual(values = c("#3C5488", "#4DBBD5", "#F3E361", "#E64B35", "#00A087", "grey")) +
  guides(fill = guide_legend(nrow = 1)) +
  theme_rhr +
  theme(
    legend.position = "none",
    legend.key.size = unit(0.4, "cm"),
    axis.text.x = element_text(angle = 45, vjust = 1)
  )

supp_fig_4 <-
  ggarrange(
    supp_fig_list[[1]],
    ggarrange(
      plotlist = supp_fig_list[c(2:3)],
      labels = letters[2:3]
    ),
    supp_fig_list[[4]],
    ncol = 1,
    labels = c("a", "", "d"),
    heights = c(2, 1.25, 1)
  )

supp_fig_4

plot_pathway_summary_heatmap(
  pathway_summary_list = pathway_summary,
  comparison_pattern = "PD_vs_PDD|PD_vs_DLB|PDD_vs_DLB|PC",
  output_dir = path_for_figures,
  output_name = "suppfigure_differential_splicing_summary",
  column_title_rotation = 90,
  figure_height = 140
  )

## png 
##   2

9.5 Supplementary table(s)

supp_table_1 <- 
  setNames(
    vector(mode = "list", length = 2),
    c("Column_descriptions", "Results")
  )

supp_table_1[[2]] <-
  leafcutter_list$`in-house`$cluster_significance %>%
  dplyr::select(comparison, cluster, status, loglr, df, p, p.adjust) %>%
  dplyr::inner_join(
    intron_list[[2]] %>%
      dplyr::select(
        comparison, cluster_id,
        chr = seqnames, start, end, strand, logef, 
        deltapsi, Control, PD, PDD, DLB,
        gene_name_junc, gene_id_junc, junc_in_ref, junc_cat
      ) %>%
      tidyr::unnest(gene_name_junc, gene_id_junc) %>%
      dplyr::group_by(
        comparison, cluster_id, chr, start, end, strand, 
        logef, deltapsi, Control, PD, PDD, DLB,
        junc_in_ref, junc_cat
      ) %>%
      # Unlist list components of columns
      dplyr::summarise(
        gene_name_junc = toString(gene_name_junc),
        gene_id_junc = toString(gene_id_junc)
      ),
    by = c("comparison", "cluster" = "cluster_id")
  ) %>%
  dplyr::mutate(comparison = comparison %>%
    fct_relevel(
      .,
      fct_disease
    )) %>%
  dplyr::arrange(comparison, p.adjust)

supp_table_1[[1]] <-
  tibble(
    `Column name` = colnames(supp_table_1[[2]]),
    Description = c(
      "Pairwise comparison in question. Results extracted with the name group[1]_vs_group[2], where group[1] is the baseline/reference value and group[2] is in reference to the baseline value.",
      "Cluster ID, assigned by Leafcutter during intron clustering",
      "Whether this cluster was a) successfully tested b) not tested for some reason (e.g. too many introns) c) there was an error during testing (this is rare)",
      "Log likelihood ratio between the null model (no difference between the groups) and alternative (there is a difference)",
      "Degrees of freedom, equal to the number of introns in the cluster minus one",
      "The resulting unadjusted p-value under the asymptotic Chi-squared distribution",
      "FDR-adjusted p-value",
      "Chromosome on which intron is located",
      "Start co-ordinate of the intron",
      "End co-ordinate of the intron. To ensure optimal mapping to reference annotation this is the equivalent of the intron end outputted by Leafcutter - 1 bp",
      "Strand on which intron is located",
      "Log effect size (as fitted by LeafCutter)",
      "Delta percent-spliced-in (PSI), calculated by taking the difference between the PSI of group[2] - group[1]",
      "Fitted usage proportion in the control group in the relevant comparison",
      "Fitted usage proportion in the PD group in the relevant comparison",
      "Fitted usage proportion in the PDD group in the relevant comparison",
      "Fitted usage proportion in the DLB group in the relevant comparison",
      "Logical value designating whether junction and the intron it represents is present in reference annotation",
      "Junction category assigned",
      "HGNC symbol of the gene the junction overlaps",
      "Ensembl ID of the gene the junction overlaps"
    )
  )

supp_table_2 <- 
  setNames(
    vector(mode = "list", length = 3),
    c("Column_descriptions", "DS", "DS_replication")
  )

supp_table_2[2:3] <-
  ewce %>%
  lapply(., function(df) {
    df %>%
      dplyr::inner_join(ct_class) %>%
      dplyr::select(comparison_type, GeneSet,
                    specificity_matrix = Study, cell_type = class,
                    p, fold_change, sd_from_mean, FDR = FDR.p
      ) %>%
      dplyr::arrange(comparison_type, GeneSet, specificity_matrix)
  })

supp_table_2[[1]] <-
  tibble(
    `Column name` = colnames(supp_table_2[[2]]),
    Description = c(
      "Whether pairwise comparisons involve comparisons of diseased individuals to control individuals (Ref: control) or to other diseased individuals (Ref: disease)",
      "Set of genes run in EWCE analysis",
      "Disease group used to generate specificity values for EWCE analysis",
      "Cell type",
      "Probability of cellular enrichment",
      "Expression in the target gene list divided by the mean level of expression in the bootstrap samples",
      "The distance (in standard deviations) of the target list from the mean of the bootstrap sample",
      "FDR-adjusted p-value"
    )
  )

supp_table_3 <- 
  setNames(
    vector(mode = "list", length = 2),
    c("Column_descriptions", "Results")
  )

supp_table_3[[2]] <-
  cluster_compare_reduced %>%
  dplyr::select(
    dataset, comparison,
    n_input = n_genes, go_type,
    parent_term = parent_id, parent_description = parent_term,
    child_to_parent_sim_score = parent_sim_score,
    child_term = go_id, child_description = Description,
    GeneRatio, term_size, overlap = Count, pvalue,
    FDR = p.adjust, overlapping_genes = geneID
  ) %>%
  dplyr::arrange(dataset, comparison, go_type, parent_term, -child_to_parent_sim_score)


supp_table_3[[1]] <-
  tibble(
    `Column name` = colnames(supp_table_3[[2]]),
    Description = c(
      "Dataset on which differential splicing was performed i.e. our own data or from the replication dataset from recount2",
      "Set of differentially spliced genes run in pathway analysis",
      "Size of input gene list",
      "Domain of Gene Ontology from which child and parent term are derived (BP = biological process; CC = cellular component; MF = molecular function)",
      "ID of the parent GO term",
      "Description of the parent GO term",
      "Child to parent semantic similarity score",
      "ID of the enriched GO term",
      "Description of the enriched GO term",
      "Proportion of genes in the input list that are annotated to the GO term (i.e. intersection/input list)",
      "The number of genes in the GO term",
      "Intersection size",
      "P-value of enrichment using a hypergeometric test",
      "FDR-corrected p-value",
      "The gene IDs found overlapping between the tested set of differentially expressed genes and the child GO term"
    )
  )

9.6 Export files

openxlsx::write.xlsx(supp_table_1,
  file = file.path(path_for_tables, "Supplementary_Table_9_Bulk_tissue_DS_introns.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

openxlsx::write.xlsx(supp_table_2,
  file = file.path(path_for_tables, "Supplementary_Table_10_Bulk_tissue_DS_EWCE.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

openxlsx::write.xlsx(supp_table_3,
  file = file.path(path_for_tables, "Supplementary_Table_11_Bulk_tissue_DS_pathways.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)


ggsave(
  "figure_differential_splicing.tiff",
  fig,
  device = "tiff",
  path = path_for_figures,
  width = 150,
  height = 60,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_differential_splicing.png",
  supp_fig_1,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 150,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_differential_splicing_cont_table.png",
  supp_fig_2,
  device = "png",
  path = path_for_figures,
  width = 80,
  height = 80,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_differential_splicing_validation.png",
  supp_fig_3,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 170,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_differential_splicing_gtex.png",
  supp_fig_4,
  device = "png",
  path = path_for_figures,
  width = 150,
  height = 210,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

10 RBP enrichment analysis

10.1 Loading data

ame_rbp <- 
  readRDS(
    file.path(path_to_results, "rbp_analysis/rbp_enrichment_fasta/ame_summary.Rds")
    )

attract_db <- 
  read.table(
    file.path(path_to_raw_data, "rbp_analysis/attract_db/ATtRACT_db.txt"), 
    header = T, sep = "\t"
    ) %>% 
  dplyr::filter(Organism == "Homo_sapiens", Score == "1.000000**") %>% 
  dplyr::mutate(
    Gene_name = as.character(Gene_name)
  )

ame_results <- 
  S4Vectors::Filter(nrow, ame_rbp) %>% 
  qdapTools::list_df2df(col1 = "list_name") %>% 
  tidyr::separate(list_name, into = c("comparison", "intron_annotation_type", NA), sep = ":", remove = F) %>% 
  tidyr::separate(motif_ID, into = c("hgnc_symbol", "ensembl_id", "matrix_id"), sep = ":", remove = T) %>% 
  dplyr::filter(intron_annotation_type == "prox_intron",
                motif_DB == "pwm_acc_RBP") %>% 
  dplyr::mutate(list_name = list_name %>% 
                  str_replace("query", "ds") %>% 
                  str_replace("control", "nonds"),
                comparison = fct_relevel(comparison,
                                         fct_disease),
                intron_annotation_type = intron_annotation_type %>% 
                  str_replace("_", " "),
                # Replace Ts (DNA) with Us (RNA)
                consensus = str_replace_all(consensus, "T", "U"))

# Add on alternative RBPs that might bind consensus motifs identified
ame_results_w_alt_rbps <-
  ame_results %>%
  dplyr::inner_join(
    attract_db %>% 
      dplyr::select(Gene_name, consensus = Motif) %>% 
      dplyr::distinct(),
    by = c("consensus")
  ) %>% 
  dplyr::mutate(
    same_gene = 
      case_when(Gene_name == hgnc_symbol ~ TRUE,
                TRUE ~ FALSE),
    Gene_name = 
      case_when(same_gene == FALSE ~ Gene_name)
  ) %>% 
  dplyr::ungroup()

motifs_w_multiple_rbps <- 
  ame_results_w_alt_rbps %>% 
  dplyr::distinct(Gene_name, consensus) %>%
  dplyr::filter(!is.na(Gene_name), n() > 1)

ame_results_w_alt_rbps <- 
  ame_results_w_alt_rbps %>% 
  dplyr::filter(!consensus %in% c(motifs_w_multiple_rbps$consensus)) %>% 
  dplyr::bind_rows(
    ame_results_w_alt_rbps %>% 
      dplyr::filter(
        consensus %in% c(motifs_w_multiple_rbps$consensus),
        !is.na(Gene_name)
      ) %>% 
      dplyr::group_by(across(c(-Gene_name))) %>% 
      dplyr::summarise(
        alternative_rbp = str_c(Gene_name, collapse = ",")
      ) %>% 
      dplyr::mutate(
        alternative_rbp =
          case_when(hgnc_symbol == "HNRNPC" ~ str_c(alternative_rbp, ",RBM25"),
                    TRUE ~ alternative_rbp)
      )
  )

10.2 Numbers mentioned in text

# Intron cluster overlapping RBM25
rbm25_overlap <-
  cont_table %>%
  dplyr::filter(str_detect(cluster_id.own, "clu_26788_+"))

rbm25_overlap

# P-values for RBM25 clusters in each dataset
leafcutter_list$`in-house`$significant_clusters_0.05_filter %>% 
  dplyr::filter(
    cluster == rbm25_overlap$cluster_id.own %>% str_remove(".*:"),
    comparison == "Control_vs_PDD"
  ) %>% 
  dplyr::select(-PD, -DLB)

leafcutter_list$SRP058181$significant_clusters_0.05_filter %>% 
  dplyr::filter(
    cluster == rbm25_overlap$cluster_id.SRP058181 %>% str_remove(".*:"),
    comparison == "Control_vs_PDD"
  ) %>% 
  dplyr::select(-PD)

# AME results
ame_results_w_alt_rbps %>% 
  dplyr::filter(bonferroni_adj_p_val < 0.05)

10.3 Supplementary table(s)

supp_table <- 
  setNames(
    vector(mode = "list", length = 2),
    c("Column_descriptions", "Results")
  )

# Add on alternative RBPs that might bind consensus motifs identified
supp_table[[2]] <-
  ame_results_w_alt_rbps %>%
  dplyr::select(
    comparison, rbp_hgnc_symbol = hgnc_symbol, ensembl_id, 
    consensus, contains("p_val"), alternative_rbp
  )

supp_table[[1]] <-
  tibble(
    `Column name` = colnames(supp_table[[2]]),
    Description = c(
      "Set of differentially spliced genes run in pathway analysis",
      "HGNC symbol of RNA-binding protein",
      "Gene ensembl ID",
      "RNA-binding protein consensus sequence",
      "The optimal enrichment p-value of the motif according a Fisher's exact test",
      "The optimal enrichment p-value of the motif according to a Fisher's exact test, adjusted for multiple tests using a Bonferroni correction",
      "Alternative human RNA-binding proteins (denoted by HGNC symbols) in the ATtRACT database that have a quality score of 1 or implicated from previous literature"
    )
  )

10.4 Export files

openxlsx::write.xlsx(
  supp_table,
  file = file.path(path_for_tables, "Supplementary_Table_12_RBP_binding_motifs.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

11 LDSC/MAGMA output

11.1 Loading data

11.1.1 H-MAGMA

load_magma_csv <- 
  function(file_dir) {
    file_df <- tibble(
      file_paths = list.files(file_dir, full.names = T, pattern = "csv"),
      file_name = list.files(file_dir, full.names = F, pattern = "csv")
    )
    
    for (i in 1:nrow(file_df)) {
      file <- read_delim(file_df$file_paths[i] %>% as.character(), delim = " ", col_names = F, skip = 1)
      
      colnames(file) <- c("X1", "VARIABLE", "TYPE", "NGENES", "BETA", "BETA_STD", "SE", "P", "FDR")
      
      file <- file %>%
        dplyr::select(-X1, -TYPE) %>%
        dplyr::mutate(file_name = file_df$file_name[i])
      
      if (i == 1) {
        magma <- file
      } else {
        magma <- magma %>%
          bind_rows(file)
      }
    }
    
    return(magma)
  }

load_magma_gse <- 
  function(file_dir) {
    file_df <- tibble(
      file_paths = list.files(file_dir, full.names = T, pattern = "gse.out.txt"),
      file_name = list.files(file_dir, full.names = F, pattern = "gse.out.txt")
    )
    
    for (i in 1:nrow(file_df)) {
      file <- read_delim(file_df$file_paths[i] %>% as.character(), delim = " ", col_names = F, skip = 1)
      
      colnames(file) <- c("VARIABLE", "TYPE", "NGENES", "BETA", "BETA_STD", "SE", "P", "FDR")
      
      file <- file %>%
        dplyr::select(-TYPE) %>%
        dplyr::mutate(file_name = file_df$file_name[i],
                      VARIABLE = 
                        case_when(VARIABLE == "Astrocytes" ~ "Astro",
                                  VARIABLE == "Oligodendrocytes" ~ "Oligo",
                                  TRUE ~ VARIABLE))
      
      if (i == 1) {
        magma <- file
      } else {
        magma <- magma %>%
          bind_rows(file)
      }
    }
    
    return(magma)
  }

hmagma <-
  setNames(
    object = list(
      load_magma_gse(file.path(path_to_results, "hmagma/DEG/")) %>% 
        dplyr::filter(str_detect(file_name, "_C_")),
      load_magma_csv(file.path(path_to_results, "hmagma/top10/"))
    ),
    nm = c("deg", "top10")
  ) %>%
  lapply(., function(df) {
    df %>%
      dplyr::mutate(
        group = case_when(
          str_detect(file_name, "PD_C") ~ "Control_vs_PD",
          str_detect(file_name, "PDD_C") ~ "Control_vs_PDD",
          str_detect(file_name, "DLB_C") ~ "Control_vs_DLB",
          str_detect(file_name, "TOP10") & str_detect(file_name, "\\.C\\.") ~ "Control",
          str_detect(file_name, "TOP10") & str_detect(file_name, "\\.PD\\.") ~ "PD",
          str_detect(file_name, "TOP10") & str_detect(file_name, "\\.PDD\\.") ~ "PDD",
          str_detect(file_name, "TOP10") & str_detect(file_name, "\\.DLB\\.") ~ "DLB",
          TRUE ~ file_name
        ),
        direction_of_effect = case_when(
          str_detect(file_name, "DownRegulated") ~ "down",
          str_detect(file_name, "UpRegulated") ~ "up"
        ),
        GWAS = case_when(
          str_detect(file_name, "^PD2019") ~ "PD2019_ex23andMe",
          str_detect(file_name, "PD2018_AOO") ~ "PD2018_AOO",
          str_detect(file_name, "AD2019") ~ "AD",
          TRUE ~ file_name
        ),
        VARIABLE = case_when(
          VARIABLE == "Ex" ~ "Excitatory",
          VARIABLE == "In" ~ "Inhibitory",
          VARIABLE == "EndoPer" ~ "Vascular",
          TRUE ~ VARIABLE
        )
      ) %>%
      dplyr::select(-file_name) %>%
      dplyr::inner_join(ct_class, by = c("VARIABLE" = "CellType"))
  })

11.1.2 sLDSC

ldsc <-
  setNames(
    object = list(
      read_delim(
        file = file.path(path_to_results, "ldsc/sldsc_celltype_DEG.txt"),
        delim = "\t"
      ) %>%
        dplyr::filter(
          str_detect(comparison, "Control"),
          GWAS %in% c("AD2019", "PD2019.meta5.ex23andMe.hg38", "PD2018.AOO.hg38")
        ) %>%
        dplyr::mutate(GWAS = case_when(
          GWAS == "AD2019" ~ "AD",
          GWAS == "PD2018.AOO.hg38" ~ "PD2018_AOO",
          GWAS == "PD2019.meta5.ex23andMe.hg38" ~ "PD2019_ex23andMe"
        )) %>%
        dplyr::inner_join(ct_class, by = c("cell_type" = "CellType")) %>%
        dplyr::rename(group = comparison),
      read_delim(
        file = file.path(path_to_results, "ldsc/sldsc_celltype_deciles.txt"),
        delim = "\t"
      ) %>%
        dplyr::filter(decile == 10, 
                      GWAS %in% c("AD2019", "PD2018.AOO.hg38", "PD2019.meta5.ex23andMe.hg38")) %>%
        dplyr::mutate(
          GWAS = case_when(
            GWAS == "AD2019" ~ "AD",
            GWAS == "PD2018.AOO.hg38" ~ "PD2018_AOO",
            GWAS == "PD2019.meta5.ex23andMe.hg38" ~ "PD2019_ex23andMe"),
          direction_of_effect = NA
        ) %>%
        dplyr::inner_join(ct_class, by = c("cell_type" = "CellType")) %>%
        dplyr::rename(group = Disease_Group)
    ),
    nm = c("deg", "top10")
  )

11.2 Main figure

facet_labels <-
  setNames(
    object = c("AD", "PD - age of onset", "PD - risk"),
    nm = unique(ldsc$deg$GWAS)
  )

a <-
  process_magma_ldsc(
    ldsc_results = ldsc$top10,
    magma_results = hmagma$top10,
    fct_levels = c("Control", "PD", "PDD", "DLB")
  ) %>%
  plot_magma_ldsc(facet_labels = facet_labels)

b <-
  process_magma_ldsc(
    ldsc_results = ldsc$deg %>% 
      dplyr::filter(is.na(direction_of_effect)),
    magma_results = hmagma$deg %>% 
      dplyr::filter(is.na(direction_of_effect)),
    fct_levels = c("Control vs PD", "Control vs PDD", "Control vs DLB")
  ) %>%
  plot_magma_ldsc(facet_labels = facet_labels)

fig <-
  ggpubr::ggarrange(a, b,
    nrow = 2,
    labels = letters[1:2],
    common.legend = T,
    legend = "right"
  )

fig

11.3 Supplementary figure(s)

supp_fig <- 
  process_magma_ldsc(
    ldsc_results = ldsc$deg %>% 
      dplyr::filter(!is.na(direction_of_effect)),
    magma_results = hmagma$deg %>% 
      dplyr::filter(!is.na(direction_of_effect)),
    fct_levels = c("Control vs PD", "Control vs PDD", "Control vs DLB")
  ) %>%
  plot_magma_ldsc(
    facet_by_direction = T,
    facet_labels = facet_labels
  ) 

supp_fig

11.4 Supplementary table(s)

supp_table <- 
  setNames(
    vector(mode = "list", length = 4),
    c("Column_descriptions", "HMAGMA", "SLDSC", "Joint")
  )

supp_table[[2]] <-
  hmagma %>%
  lapply(., function(df) {
    df %>%
      dplyr::arrange(GWAS, group, VARIABLE, direction_of_effect) %>%
      dplyr::select(GWAS, group, class, direction_of_effect, NGENES:P) %>%
      dplyr::rename(cell_type = class)
  }) %>%
  qdapTools::list_df2df(col1 = "gene_list") %>%
  dplyr::mutate(group = str_replace_all(group, "_", " "))

supp_table[[3]] <-
  ldsc %>%
  lapply(., function(df) {
    df %>%
      dplyr::arrange(GWAS, group, cell_type, direction_of_effect) %>%
      dplyr::select(GWAS, group, class, direction_of_effect, Prop._SNPs:`Coefficient_z-score`, Z_score_P) %>%
      dplyr::rename(cell_type = class)
  }) %>%
  qdapTools::list_df2df(col1 = "gene_list") %>%
  dplyr::mutate(group = str_replace_all(group, "_", " "))

supp_table[[4]] <-
  process_magma_ldsc(
    ldsc_results = ldsc$top10,
    magma_results = hmagma$top10,
    fct_levels = c("Control", "PD", "PDD", "DLB")
  ) %>%
  dplyr::mutate(gene_list = "top10") %>%
  dplyr::bind_rows(
    process_magma_ldsc(
      ldsc_results = ldsc$deg,
      magma_results = hmagma$deg,
      fct_levels = c("Control vs PD", "Control vs PDD", "Control vs DLB")
    ) %>%
      dplyr::mutate(gene_list = "deg")
  ) %>%
  dplyr::select(gene_list, GWAS, group, class, direction_of_effect, contains("magma"), contains("ldsc"), p_category) %>%
  dplyr::rename(cell_type = class) %>%
  dplyr::arrange(gene_list, GWAS, group)

supp_table[[1]] <-
  tibble(
    Sheet = c(
      rep(names(supp_table)[2], length(supp_table[[2]])),
      rep(names(supp_table)[3], length(supp_table[[3]])),
      rep(names(supp_table)[4], length(supp_table[[4]]))
    ),
    `Column name` = c(
      colnames(supp_table[[2]]),
      colnames(supp_table[[3]]),
      colnames(supp_table[[4]])
    ),
    Description = c(
      "Whether the input list was derived from (i) the top 10% most cell-type-specific genes or (ii) cell-type-specific differentially expressed genes",
      "GWAS",
      "For top 10% lists, the disease group from which specificity was derived. For differentially expressed genes, the pairwise comparisons from which input genes were derived",
      "Cell type",
      "For differentially expressed genes, the direction of effect of the set of differentially expressed genes run ('down' = differentially expressed genes with a negative log2(fold change); 'up' = differentially expressed genes with a positive log2(fold change); blank cell = all differentially expressed genes)",
      "Number of genes in input list",
      "The regression coefficient of association between cell-type and GWAS",
      "The semi-standardised regression coefficient, corresponding to the predicted change in Z-value given a change of one standard deviation in the predictor gene set/gene covariate",
      "The standard error of the regression coefficient",
      "P-value of association",
      "Whether the input list was derived from (i) the top 10% most cell-type-specific genes or (ii) cell-type-specific differentially expressed genes",
      "GWAS",
      "For top 10% lists, the disease group from which specificity was derived. For differentially expressed genes, the pairwise comparisons from which input genes were derived",
      "Cell type",
      "For differentially expressed genes, the direction of effect of the set of differentially expressed genes run ('down' = differentially expressed genes with a negative log2(fold change); 'up' = differentially expressed genes with a positive log2(fold change); blank cell = all differentially expressed genes)",
      "Proportion of SNPs accounted for by annotation within the baseline set of SNPs",
      "Proportion of heritability accounted for by the annotation",
      "Jackknife standard errors for the proportion of heritability. Block jacknife over SNPs with 200 equally sized blocks of adjacent SNPs.",
      "Enrichment = (Proportion of heritability)/(Proportion of SNPs)",
      "Standard error of enrichment",
      "P-value of total enrichment",
      "Regression co-efficient i.e. contribution of annotation after controlling for all other categories in the model",
      "Standard error of coefficient. Estimated using the covariance matrix for coefficient estimates.",
      "Z-score for significance of coefficient",
      "P-value for coefficient computed from z-score",
      "Whether the input list was derived from (i) the top 10% most cell-type-specific genes or (ii) cell-type-specific differentially expressed genes",
      "GWAS",
      "For top 10% lists, the disease group from which specificity was derived. For differentially expressed genes, the pairwise comparisons from which input genes were derived",
      "Cell type",
      "For differentially expressed genes, the direction of effect of the set of differentially expressed genes run ('down' = differentially expressed genes with a negative log2(fold change); 'up' = differentially expressed genes with a positive log2(fold change); blank cell = all differentially expressed genes)",
      "MAGMA p-value",
      "FDR-adjusted MAGMA p-value (adjusted by number of cell types)",
      "sLDSC coefficient p-value",
      "FDR-adjusted sLDSC coefficient p-value (adjusted by number of cell types)",
      "P-value category"
    )
  )

11.5 Export files

openxlsx::write.xlsx(
  supp_table,
  file = file.path(path_for_tables, "Supplementary_Table_8_HMAGMA_LDSC.xlsx"),
  row.names = FALSE,
  headerStyle = openxlsx::createStyle(textDecoration = "BOLD"),
  firstRow = TRUE,
  append = TRUE
)

ggsave(
  "figure_ldsc_hmagma.tiff",
  fig,
  device = "tiff",
  path = path_for_figures,
  width = 130,
  height = 120,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

ggsave(
  "suppfigure_ldsc_hmagma.png",
  supp_fig,
  device = "png",
  path = path_for_figures,
  width = 130,
  height = 120,
  units = "mm",
  dpi = 600,
  limitsize = TRUE
)

12 Session info

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  stats4    grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] biomaRt_2.42.1              org.Hs.eg.db_3.10.0        
##  [3] AnnotationDbi_1.48.0        scales_1.1.1               
##  [5] ggsci_2.9                   qdapTools_1.3.5            
##  [7] stringr_1.4.0               dplyr_1.0.2                
##  [9] purrr_0.3.4                 readr_1.4.0                
## [11] tidyr_1.1.1                 tibble_3.0.3               
## [13] tidyverse_1.3.0             rutils_0.99.2              
## [15] readxl_1.3.1                janitor_2.1.0              
## [17] ggh4x_0.1.0.9000            ggmosaic_0.3.2             
## [19] ggpubr_0.4.0                forcats_0.5.1              
## [21] factoextra_1.0.7            ggplot2_3.3.2              
## [23] DT_0.15                     DESeq2_1.26.0              
## [25] SummarizedExperiment_1.16.1 DelayedArray_0.12.3        
## [27] BiocParallel_1.20.1         matrixStats_0.56.0         
## [29] Biobase_2.46.0              GenomicRanges_1.38.0       
## [31] GenomeInfoDb_1.22.1         IRanges_2.20.2             
## [33] S4Vectors_0.24.4            BiocGenerics_0.32.0        
## [35] ComplexHeatmap_2.7.7        clusterProfiler_3.14.3     
## [37] cowplot_1.0.0              
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.1         bit64_4.0.2            knitr_1.29            
##   [4] data.table_1.13.0      rpart_4.1-15           RCurl_1.98-1.2        
##   [7] doParallel_1.0.15      generics_0.0.2         RSQLite_2.2.0         
##  [10] europepmc_0.4          chron_2.3-56           bit_4.0.4             
##  [13] enrichplot_1.6.1       xml2_1.3.2             lubridate_1.7.9       
##  [16] httpuv_1.5.4           assertthat_0.2.1       viridis_0.5.1         
##  [19] xfun_0.16              hms_1.0.0              evaluate_0.14         
##  [22] promises_1.1.1         progress_1.2.2         dbplyr_1.4.4          
##  [25] igraph_1.2.5           DBI_1.1.1              geneplotter_1.64.0    
##  [28] htmlwidgets_1.5.3      ellipsis_0.3.1         crosstalk_1.1.0.1     
##  [31] backports_1.1.8        annotate_1.64.0        gridBase_0.4-7        
##  [34] vctrs_0.3.2            here_1.0.0             Cairo_1.5-12.2        
##  [37] abind_1.4-5            cachem_1.0.3           withr_2.2.0           
##  [40] ggforce_0.3.2          triebeard_0.3.0        treemap_2.4-2         
##  [43] checkmate_2.0.0        prettyunits_1.1.1      cluster_2.1.0         
##  [46] DOSE_3.12.0            lazyeval_0.2.2         crayon_1.4.1          
##  [49] genefilter_1.68.0      pkgconfig_2.0.3        slam_0.1-47           
##  [52] labeling_0.4.2         tweenr_1.0.1           vipor_0.4.5           
##  [55] wordcloud_2.6          rrvgo_1.1.4            nnet_7.3-12           
##  [58] rlang_0.4.7            lifecycle_0.2.0        BiocFileCache_1.10.2  
##  [61] productplots_0.1.1     modelr_0.1.8           cellranger_1.1.0      
##  [64] rprojroot_2.0.2        polyclip_1.10-0        Matrix_1.2-17         
##  [67] urltools_1.7.3         carData_3.0-4          reprex_2.0.0          
##  [70] base64enc_0.1-3        beeswarm_0.2.3         ggridges_0.5.2        
##  [73] GlobalOptions_0.1.2    pheatmap_1.0.12        png_0.1-7             
##  [76] viridisLite_0.3.0      rjson_0.2.20           bitops_1.0-6          
##  [79] blob_1.2.1             shape_1.4.4            qvalue_2.18.0         
##  [82] jpeg_0.1-8.1           rstatix_0.6.0          gridGraphics_0.5-0    
##  [85] ggsignif_0.6.0         memoise_2.0.0          magrittr_2.0.1        
##  [88] plyr_1.8.6             zlibbioc_1.32.0        compiler_3.6.1        
##  [91] RColorBrewer_1.1-2     clue_0.3-57            snakecase_0.11.0      
##  [94] cli_2.2.0.9000         XVector_0.26.0         htmlTable_2.0.1       
##  [97] Formula_1.2-4          MASS_7.3-51.4          tidyselect_1.1.0      
## [100] stringi_1.5.3          yaml_2.2.1             GOSemSim_2.17.1       
## [103] askpass_1.1            locfit_1.5-9.4         latticeExtra_0.6-29   
## [106] ggrepel_0.8.2          fastmatch_1.1-0        tools_3.6.1           
## [109] rio_0.5.16             circlize_0.4.10        rstudioapi_0.13       
## [112] foreach_1.5.0          foreign_0.8-72         gridExtra_2.3         
## [115] farver_2.0.3           ggraph_2.0.3           digest_0.6.27         
## [118] rvcheck_0.1.8          BiocManager_1.30.10    shiny_1.5.0           
## [121] Rcpp_1.0.5             car_3.0-9              broom_0.7.0           
## [124] later_1.1.0.1          httr_1.4.2             colorspace_2.0-0      
## [127] rvest_0.3.6            XML_3.99-0.3           fs_1.5.0              
## [130] MarkerGenes_0.0.0.9000 splines_3.6.1          graphlayouts_0.7.0    
## [133] ggplotify_0.0.5        plotly_4.9.2.1         xtable_1.8-4          
## [136] jsonlite_1.7.1         tidygraph_1.2.0        NLP_0.2-1             
## [139] R6_2.5.0               tm_0.7-7               Hmisc_4.4-1           
## [142] pillar_1.4.6           htmltools_0.5.1.1      mime_0.9              
## [145] glue_1.4.2             fastmap_1.0.1          codetools_0.2-16      
## [148] fgsea_1.12.0           lattice_0.20-38        curl_4.3              
## [151] ggbeeswarm_0.6.0       zip_2.1.0              GO.db_3.10.0          
## [154] openxlsx_4.2.3         openssl_1.4.2          survival_3.2-11       
## [157] rmarkdown_2.5          munsell_0.5.0          DO.db_2.9             
## [160] GetoptLong_1.0.2       GenomeInfoDbData_1.2.2 iterators_1.0.12      
## [163] haven_2.3.1            reshape2_1.4.4         gtable_0.3.0

Manuscript figures

Regina H. Reynolds

1 Dependencies

1.1 Functions

1.2 Single-nucleus data from seurat objects

2 Sample demographics

2.1 Loading data

2.2 Numbers quoted in text

2.3 Supplementary figure(s)

2.4 Export files

3 Single-nucleus RNA-sequencing metrics

3.1 Loading data

3.2 Supplementary figure(s)

3.3 Export files

4 Bulk RNA-sequencing metrics

4.1 Loading data

4.2 Numbers quoted in text

4.3 Supplementary figure(s)

4.4 Supplementary table(s)

4.5 Export files

5 Conos and deconvolution

5.1 Loading data

5.2 Numbers quoted in text

5.3 Main text figure(s)

5.4 Supplementary figure(s)

5.5 Supplementary table(s)

5.6 Export files

6 Single-nucleus DEG and pathways (general)

6.1 Loading DEGs

6.2 Loading pathways

6.3 Numbers quoted in text

6.4 Main text figure(s)

6.5 Supplementary figure(s)

6.6 Supplementary table(s)

6.7 Export files

7 Single-nucleus DEG and pathways (PD-specific)

7.1 Loading data

7.2 Numbers quoted in text

7.3 Main text figure(s)

7.4 Supplementary figure(s)

7.5 Supplementary table(s)

7.6 Export files

8 Bulk-tissue differential expression

8.1 Loading data

8.2 Numbers quoted in text

8.3 Supplementary figure(s)

8.4 Supplementary table(s)

8.5 Export files

9 Bulk-tissue differential splicing

9.1 Loading data

9.2 Numbers quoted in text

9.2.1 Check frequency of events in GTEx anterior cingulate cortex

9.3 Main text figure(s)

9.4 Supplementary figure(s)

9.5 Supplementary table(s)

9.6 Export files

10 RBP enrichment analysis

10.1 Loading data

10.2 Numbers mentioned in text

10.3 Supplementary table(s)

10.4 Export files

11 LDSC/MAGMA output

11.1 Loading data

11.1.1 H-MAGMA

11.1.2 sLDSC

11.2 Main figure

11.3 Supplementary figure(s)

11.4 Supplementary table(s)

11.5 Export files

12 Session info