Aim: to use the cell deconvolution algorithm, Scaden, to approximate the cell type composition of a given bulk tissue sample.
source(here::here("R", "file_paths.R"))
path_to_results <- file.path(path_to_wd, "results")
path_to_raw_data <- file.path(path_to_wd, "raw_data")
sample_info <-
read_excel(path =
file.path(
path_to_raw_data,
"sample_details/20201229_MasterFile_SampleInfo.xlsx"
),
sheet = "SampleInfo", skip = 1) %>%
dplyr::filter(Sample_Type == "Tissue section" & sent_to_bulk_seq == "yes") %>%
dplyr::select(CaseNo, Disease_Group, Sex, AoO, AoD, DD, PMI, aSN, TAU, 'thal AB', 'aCG aSN score', Genetics, RINe_bulkRNA_Tapestation) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD","DLB"))) %>%
dplyr::rename(sample_id = CaseNo)
Overview of training data and cell type deconvolution with Scaden. A: Artificial bulk samples are generated by subsampling random cells from a scRNA-seq datasets and merging their expression profiles. B: Model training and parameter optimization on simulated tissue RNA-seq data by comparing cell fraction predictions to ground-truth cell composition. C: Cell deconvolution of real tissue RNA-seq data using Scaden.
Steps include:
scaden process
command, as this command looks for intersection of genes between training and prediction data.source(here::here("misc_scripts", "snRNAseq_transform_logcounts_to_raw.R"))
scaden process
a smaller function within this command (get_signature_genes
) is used to find the common genes between the training and prediction data. To do this it first computes the variance of each genes across samples and only includes those with a variance > 0.1 (line 80, in scaden/scaden/model/functions.py
: keep = data.var(axis=1) > var_cutoff
). Thus, important to scale bulk RNA-seq data to ensure variance isn't too small. Scaled by 1E6.source(here::here("R", "biomart_df.R"))
dds <-
readRDS(
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD.Rds"
)
)
# Library size matrix
lib_matrix <- colSums(counts(dds)) %>%
as.matrix() %>%
t()
lib_matrix <- lib_matrix[rep(1, times = nrow(counts(dds))),]
# Divide raw counts by library size and scale with 1E6
norm_counts <- (counts(dds)/lib_matrix) * 1000000
# Processing for final scaden format
# Convert ensembl IDs to gene symbols
norm_counts <- norm_counts %>%
as.data.frame() %>%
rownames_to_column(var = "gene") %>%
biomart_df(columnToFilter = "gene", mart = 38, attributes = c("ensembl_gene_id", "hgnc_symbol"), filter = "ensembl_gene_id") %>%
dplyr::select(gene, hgnc_symbol, everything())
# Filter out rows where ensembl ID has no corresponding gene symbol
# Remove duplicate hgnc_symbols (i.e. multiple ensembl ids mapping to same hgnc_symbol)
norm_counts <- norm_counts %>%
dplyr::filter(hgnc_symbol != "") %>%
dplyr::filter(!(duplicated(hgnc_symbol) | duplicated(hgnc_symbol, fromLast = TRUE))) %>%
dplyr::select(-gene)
# Save for scaden
write_delim(format(norm_counts, scientific = FALSE),
path =
file.path(
path_to_raw_data,
"deconvolution/scaden/bulkRNAseq_norm_counts.txt"
),
delim = "\t")
# initiate docker
docker run --name scaden_rreynolds -it kevinmenden/scaden
# to exit
exit
# to re-connect to scaden_rreynolds container and get a bash shell in the container
docker exec -it scaden_rreynolds bash
#---Generating training data----------------------------------------
# required installation of anndata module (not within container)
pip install anndata
# copy files into container from server
docker cp /home/rreynolds/projects/Aim2_PDsequencing_wd/raw_data/deconvolution/scaden/2019Oct scaden_rreynolds:/home/data/PD_seq/
# move files within docker to directory samples/
cd /home/data/PD_seq/scaden/
mkdir samples
mv *.txt samples/
# make folder for simulated samples
mkdir /home/data/PD_seq/scaden/cells.100_samples.1000/samples_simulated
# bulk simulation
python /opt/conda/pkgs/scaden-0.9.0-py_0/site-packages/scaden/preprocessing/bulk_simulation.py --cells 100 --samples 1000 --data /home/data/PD_seq/scaden/samples/ --out /home/data/PD_seq/scaden/cells.100_samples.1000/samples_simulated/
# Create h5ad file
python /opt/conda/pkgs/scaden-0.9.0-py_0/site-packages/scaden/preprocessing/create_h5ad_file.py --data /home/data/PD_seq/scaden/cells.100_samples.1000/samples_simulated/ --out /home/data/PD_seq/scaden/cells.100_samples.1000/pdseq_bulksim.h5ad
#---pre-processing of data----------------------------------------
scaden process /home/data/PD_seq/scaden/cells.100_samples.1000/pdseq_bulksim.h5ad /home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt --processed_path /home/data/PD_seq/scaden/cells.100_samples.1000/processed_pdseq_bulksim.h5ad
mkdir scaden_model_5000steps
scaden train /home/data/PD_seq/scaden/cells.100_samples.1000/processed_pdseq_bulksim.h5ad --model_dir /home/data/PD_seq/scaden/cells.100_samples.1000/scaden_model_5000steps --steps 5000
scaden predict /home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt --model_dir /home/data/PD_seq/scaden/cells.100_samples.1000/scaden_model_5000steps --outname /home/data/PD_seq/scaden/cells.100_samples.1000/scaden_predictions_cells.100:samples.1000.txt
# exit docker
exit
# copy output to server
docker cp scaden_rreynolds:/home/data/PD_seq/scaden/cells.100_samples.1000/scaden_predictions_cells.100:samples.1000.txt /home/rreynolds/projects/Aim2_PDsequencing_wd/results/deconvolution/scaden/2019Oct/scaden_predictions_cells.100:samples.1000.txt
#----Loading results----
# Source functions
source(here::here("R", "calculate_snRNA_proportions.R"))
source(here::here("R", "scaden_rankings_and_calculate_ratios.R"))
# Calculate cell type proportions from cell type assignments
sn_proportions <-
calculate_snRNA_proportions(
path_to_snRNAseq_celltype_files =
file.path(
path_to_raw_data,
"deconvolution/scaden/2019Oct"
)
)
# Create dataframe with cell type classes
cell_type_class <-
data.frame(Celltype = sn_proportions %>%
dplyr::select(-sample_id) %>%
colnames() %>%
sort(),
Celltype_class = factor(c("Astrocyte", "Vascular", "Neuron", "Neuron",
"Immune", "Oligo", "Oligo", "Vascular",
"Unclassified"),
levels=c("Astrocyte", "Immune", "Oligo", "Vascular",
"Neuron", "Unclassified")))
# Read in scaden predictions, determine within-sample rankings of cell-types by proportions, and calculate glia-to-neuron and non-neuron-to-neuron ratios
results <-
scaden_rankings_and_calculate_ratios(prediction_dir =
file.path(
path_to_results,
"deconvolution/scaden/2019Oct"
),
unclassified_cell_type = "Unclassified",
sample_info = sample_info %>%
dplyr::select(sample_id, Disease_Group),
snRNAseq_proportions = sn_proportions,
cell_type_class = cell_type_class,
glia = vars(Astro, Microglia, Oligo, OPC),
nonneuron = vars(Astro, Endo, Microglia, Oligo, OPC, Per),
neuron = vars(Excitatory, Inhibitory),
astrocyte = vars(Astro),
oligos = vars(Oligo, OPC),
microglia = vars(Microglia))
# # Export cell-type proportions
# write_delim(results$ranks %>%
# dplyr::select(-rank_within_sample),
# "/home/rreynolds/projects/Aim2_PDsequencing_wd/results/deconvolution/scaden/2019Oct/scaden_summary_results.txt", delim = "\t")
data_to_plot <-
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 |
source == "snRNA-seq") %>%
dplyr::select(sample_id, Celltype, cell_type_proportion, source) %>%
tidyr::spread(key = source, value = cell_type_proportion) %>%
dplyr::filter(Celltype != "Unclassified")
data_to_plot %>%
ggplot(aes(x = `Scaden deconvolution`, y = `snRNA-seq`)) +
geom_point() +
facet_wrap(~ Celltype, scales = "free") +
labs(x = "Scaden cell-type proportions", y = "snRNA-seq cell-type proportions") +
theme_rhr +
theme(axis.text.x = element_text(angle = 0))
Figure 4.1: Scatterplot of cell-type proportions derived from snRNA-seq labelling or from scaden predictions across cell types.
# Overall correlation
print("Overall correlation")
## [1] "Overall correlation"
cor.test(data_to_plot$`Scaden deconvolution`,
data_to_plot$`snRNA-seq`,
method = "spearman")
##
## Spearman's rank correlation rho
##
## data: data_to_plot$`Scaden deconvolution` and data_to_plot$`snRNA-seq`
## S = 523352, p-value = 1.225e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.414071
# Correlation of rankings for each cell type between scaden predictions and snRNA-seq proportions
print("Correlation by cell type")
## [1] "Correlation by cell type"
data_to_plot %>%
dplyr::filter(Celltype != "Unclassified") %>%
tidyr::nest(-Celltype) %>%
dplyr::mutate(test = purrr::map(data, ~ cor.test(.x$`Scaden deconvolution`, .x$`snRNA-seq`, method = "spearman")),
tidied = map(test, broom::tidy)) %>%
tidyr::unnest(tidied, .drop = TRUE) %>%
dplyr::select(-data, -test) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 | source == "snRNA-seq") %>%
ggplot(aes(x = fct_relevel(Celltype,
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000) %>%
arrange(Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()),
y = cell_type_proportion,
fill = Celltype_class)) +
geom_boxplot() +
facet_grid(Disease_Group ~ source) +
ylim(0,100) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "", y = "Cell type proportion", fill = "Cell-type class") +
theme_rhr
Figure 4.2: Cell-type proportions derived from scaden deconvolution predictions or single-nuclear cell-type assignments grouped by disease status. Scaden neural networks were trained using simulated bulk RNA-sequencing profiles generated from each individual's single-nuclear RNA-sequencing data; 1000 simulated bulk samples were prepared per subject, using 100 nuclei per sample. Once trained, Scaden was used to predict cell-type proportions for bulk RNA-sequencing data derived from the same individuals. Single-nuclear cell-type proportions were calculated per individual, by dividing the number of cells assigned to a cell type by the total number of cells. Cell-type proportions are summarised by cell type in each disease group. Cell types are coloured by their overarching cell-type class.
plot <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 1000 & samples == 1000 & Disease_Group == "Control" & Celltype != "Unclassified") %>%
ggplot(aes(x = fct_relevel(Celltype,
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000) %>%
arrange(Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()),
y = cell_type_proportion,
fill = Celltype_class)) +
geom_boxplot() +
# ylim(0,100) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "", y = "Cell type proportion", fill = "Cell-type class") +
theme_rhr
# ggsave(plot,
# filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/Departmental/control_deconvolution_prop.tiff",
# height = 120,
# width = 100,
# dpi = 200,
# units = "mm")
# # Median by cell type
# medians <- results$ranks %>%
# dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 | source == "snRNA-seq") %>%
# dplyr::group_by(source, Disease_Group, Celltype) %>%
# dplyr::summarise(median_scaden = median(cell_type_proportion, na.rm = TRUE)) %>%
# dplyr::arrange(Celltype)
results$ratios %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 | source == "snRNA-seq") %>%
ggplot(aes(x = Disease_Group, y = value)) +
geom_boxplot(aes(fill = ratio)) +
geom_hline(yintercept = 1.5, linetype = 2, colour = "black") +
facet_wrap(~ source) +
labs(x = "", y = "Value of ratio", fill = "Type of ratio") +
coord_cartesian(ylim = c(0,20)) +
theme_rhr
Figure 4.3: Ratio of glia-to-neurons and non-neurons-to-neurons across disease groups, as calculated using cell-type proportions derived from scaden deconvolution predictions or single-nuclear cell-type assignments. Scaden neural networks were trained using simulated bulk RNA-sequencing profiles generated from each individual's single-nuclear RNA-sequencing data; 1000 simulated bulk samples were prepared per subject, using 100 nuclei per sample.
plot <-
results$ratios %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 1000 & samples == 1000 & ratio == "glia_neuron_ratio" & Disease_Group == "Control") %>%
ggplot(aes(x = Disease_Group, y = value)) +
geom_boxplot() +
geom_hline(yintercept = 1.5, linetype = 2, colour = "black") +
labs(x = "", y = "Glia:neuron ratio") +
scale_y_continuous(limits = c(0,10)) +
# coord_cartesian(ylim = c(0,20)) +
theme_rhr
# ggsave(plot,
# filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/Departmental/glia_neuron_ratio.tiff",
# device = "tiff",
# height = 120,
# width = 30,
# dpi = 200,
# units = "mm")
# # Median values
# ratios %>%
# dplyr::group_by(source, Disease_Group, ratio) %>%
# dplyr::summarise(median = median(value))
Figure 5.1: Number of (a) nuclei per sample or (b) reads per nucleus grouped by disease.
# Copy scaden_script.sh to docker container
docker cp /home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/misc_scripts/scaden_script.sh scaden_rreynolds:/home/scripts/scaden_script.sh
# Run script for each sample scenario after navigating to docker and re-initiating
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=500 --n_simulated_samples=1000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=1000 --n_simulated_samples=1000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=2000 --n_simulated_samples=1000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=1000 --n_simulated_samples=2000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=1000 --n_simulated_samples=4000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
# # Plots showing raw cell-type proportions
# a <- results$ranks %>%
# dplyr::mutate(cells = fct_relevel(cells,
# c("100", "500", "1000", "2000"))) %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & samples == "1000") %>%
# ggplot(aes(x = cells,
# y = cell_type_proportion,
# fill = Celltype_class)) +
# geom_boxplot() +
# facet_grid(cols = vars(Disease_Group),
# rows = vars(fct_relevel(Celltype,
# results$ranks %>%
# dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
# arrange(source, cells_samples, Celltype_class, Celltype) %>%
# .[["Celltype"]] %>%
# unique() %>%
# as.character())),
# scales = "free_y") +
# # ylim(0,100) +
# scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
# labs(x = "Number of cells used per simulated sample", y = "Cell type proportion", fill = "Cell-type class") +
# theme_rhr
#
# b <- results$ranks %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000") %>%
# ggplot(aes(x = samples,
# y = cell_type_proportion,
# fill = Celltype_class)) +
# geom_boxplot() +
# facet_grid(cols = vars(Disease_Group),
# rows = vars(fct_relevel(Celltype,
# results$ranks %>%
# dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
# arrange(source, cells_samples, Celltype_class, Celltype) %>%
# .[["Celltype"]] %>%
# unique() %>%
# as.character())),
# scales = "free_y") +
# # ylim(0,100) +
# scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
# labs(x = "Number of bulk simulated samples used per individual", y = "Cell type proportion", fill = "Cell-type class") +
# theme_rhr
a <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & samples == "1000") %>%
dplyr::select(-rank_within_sample) %>%
# Spread columns and group by Disease & Celltype to calculate median for 100 cells
tidyr::spread(key = cells, value = cell_type_proportion) %>%
dplyr::group_by(Disease_Group, Celltype) %>%
dplyr::mutate(median_100 = median(`100`)) %>%
dplyr::ungroup() %>%
# Calculate fold change from reference, 100 cells, by dividing each column by median_100
dplyr::mutate_at(.vars = vars(`100`, `500`, `1000`, `2000`), .funs = function(x){x/.[["median_100"]]}) %>%
dplyr::select(-median_100) %>%
tidyr::gather(key = "cells", value = "cell_type_proportion_relative_to_100_cells", -sample_id, -Celltype, -source, -samples, -Celltype_class, -Disease_Group) %>%
ggplot(aes(x = fct_relevel(cells,
rev(c("100", "500", "1000", "2000"))),
y = cell_type_proportion_relative_to_100_cells,
fill = Celltype_class)) +
geom_boxplot() +
geom_hline(aes(yintercept = 1), colour = "black", linetype = "dashed", size = 0.75) +
facet_grid(cols = vars(Disease_Group),
rows = vars(fct_relevel(Celltype,
results$ranks %>%
dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
arrange(source, cells_samples, Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()))) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "Number of cells used per simulated sample", y = "Cell-type proportion relative to 100 cells", fill = "Cell-type class") +
theme_rhr_coord_flip +
coord_flip()
b <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000") %>%
dplyr::select(-rank_within_sample) %>%
# Spread columns and group by Disease & Celltype to calculate median for 1000 samples
tidyr::spread(key = samples, value = cell_type_proportion) %>%
dplyr::group_by(Disease_Group, Celltype) %>%
dplyr::mutate(median_1000 = median(`1000`)) %>%
dplyr::ungroup() %>%
# Calculate fold change from reference, 1000 samples, by dividing each column by median_1000
dplyr::mutate_at(.vars = vars(`1000`, `2000`, `4000`), .funs = function(x){x/.[["median_1000"]]}) %>%
dplyr::select(-median_1000) %>%
tidyr::gather(key = "samples", value = "cell_type_proportion_relative_to_1000_samples", -sample_id, -Celltype, -source, -cells, -Celltype_class, -Disease_Group) %>%
ggplot(aes(x = fct_relevel(samples,
rev(c("1000", "2000", "4000"))),
y = cell_type_proportion_relative_to_1000_samples,
fill = Celltype_class)) +
geom_boxplot() +
geom_hline(aes(yintercept = 1), colour = "black", linetype = "dashed", size = 0.75) +
facet_grid(cols = vars(Disease_Group),
rows = vars(fct_relevel(Celltype,
results$ranks %>%
dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
arrange(source, cells_samples, Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()))) +
ylim(0,9) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
scale_colour_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "Number of bulk simulated samples used per individual", y = "Cell-type proportion relative to 1000 samples", fill = "Cell-type class") +
theme_rhr_coord_flip +
coord_flip()
ggarrange(a, b,
labels = c("a", "b"),
nrow = 2,
common.legend = TRUE)
Figure 5.2: Cell-type proportions as predicted by Scaden using (a) an increasing number of cells per simulated sample or (b) an increasing number of bulk simulated samples per subject. Scaden was run with (a) 100, 500, 1000 or 2000 cells to generate 1000 bulk simulated samples per subject or (b) 1000 cells to generate 1000, 2000 or 4000 bulk simulated samples per subject. Cell-type proportions are grouped by cell type and disease status, displayed relative to the median of (a) 100 cells or (b) 1000 samples (median indicated with black dashed line) and coloured by their overarching cell-type class.
results$ratios %>%
dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples) %>%
tidyr::replace_na(., "snRNA-seq") %>%
fct_relevel(.,
c("cells.100_samples.1000", "cells.500_samples.1000", "cells.1000_samples.1000",
"cells.2000_samples.1000", "cells.1000_samples.2000", "cells.1000_samples.4000", "snRNA-seq"))) %>%
dplyr::filter(Disease_Group == "Control") %>%
ggplot(aes(x = ratio, y = value)) +
geom_boxplot(aes(fill = ratio), width = 0.4) +
geom_hline(yintercept = 1.5, linetype = 2, colour = "black") +
facet_wrap(vars(cells_samples)) +
labs(x = "", y = "Value of ratio", fill = "Type of ratio") +
coord_cartesian(ylim = c(0,15)) +
theme_rhr
Figure 5.3: Ratio of glia-to-neurons and non-neurons-to-neurons across controls when using variations of scaden parameters i.e. cell numbers used or the number of bulk simulated samples generated.
# # Median values
# ratios %>%
# dplyr::group_by(source, Disease_Group, ratio) %>%
# dplyr::summarise(median = median(value))
source(here::here("R", "scoring_deconvolution_performance.R"))
scoring_deconvolution_performance(results_list = results %>%
lapply(., function(x){x %>% dplyr::filter(Disease_Group == "Control" & source == "Scaden deconvolution")}),
true_GNR = 1.5,
deconvolution_variables = vars(cells, samples),
glia = c("Astro", "Microglia", "Oligo"),
oligo = c("Oligo"),
astrocyte = c("Astro"),
microglia = c("Microglia"),
use_ranges = TRUE) %>%
dplyr::arrange(overall_rank, GNR_rank) %>%
dplyr::select(cells, samples, overall_rank, overall_score, everything()) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
overall_rank
: final ranking of deconvolution instances based on overall_score
i.e. highest overall score receives rank of 1overall_score
: overall score across the five scoring parameters (GNR_score, *_glial_proportion and glia ranking_score). For each scoring parameter, award score of 1 to the deconvolution instance(s) with the highest rank (where 1 = best).GNR_rank
: ranking of instances from best (equivalent to rank of 1) to worst by the deviation of its median glia-to-neuron ratio from the true ratio.*_glial_proportion
: ranking of each instance from best (equivalent to rank of 1) to worst based on the proportion of samples within a deconvolution instance that have a proportion of the glial cell-type in question that falls within expected ranges.*_glial_proportion_deviation_from_mean
: ranking of each instance from best (equivalent to rank 1) to worst by the deviation of its median astro/oligo/microglial glial proportion from the expected mean across 3 studies.glia_ranking_score
: ranking of deconvolution instance form best (equivalent to rank of 1) to worst by the proportion of samples wherein oligodendrocytes make up the biggest glial cell-type population, followed by astrocytes and then microglia.plot <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
dplyr::select(-rank_within_sample, -cells, -samples, -source) %>%
tidyr::spread(key = Disease_Group, value = cell_type_proportion) %>%
dplyr::group_by(Celltype) %>%
dplyr::mutate(median_control = median(Control, na.rm = T)) %>%
dplyr::ungroup() %>%
# Calculate fold change from reference, controls, by dividing each column by median_control
dplyr::mutate_at(.vars = vars("Control", "PD", "PDD", "DLB"), .funs = function(x){x/.[["median_control"]]}) %>%
dplyr::select(-median_control) %>%
tidyr::gather(key = "Disease_Group", value = "cell_type_proportion_relative_to_control", -sample_id, -Celltype, -Celltype_class) %>%
# dplyr::filter(Celltype == "Excitatory") %>%
ggplot(aes(x = Celltype,
y = cell_type_proportion_relative_to_control,
fill = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")))) +
geom_boxplot() +
# scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
scale_fill_manual(values = c("#868686", "#c4ebe3", "#73cab9", "#19967d")) +
coord_cartesian(ylim = c(0,8)) +
labs(x = "Cell type", y = "Cell type proportion relative to controls", fill = "Disease group") +
theme_rhr
plot
Figure 6.1: Cell-type proportions derived from scaden deconvolution. Scaden neural networks were trained using simulated bulk RNA-sequencing profiles generated from each individual's single-nuclear RNA-sequencing data; 1000 simulated bulk samples were prepared per subject, using 1000 nuclei per sample. Once trained, Scaden was used to predict cell-type proportions for bulk RNA-sequencing data derived from the same individuals. Cell-type proportions are grouped by cell type and disease status and displayed relative to the median of controls (within a cell type).
# ggsave(plot,
# filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/IPDGC_presentation/scaden_deconvolution_exct.tiff",
# device = "tiff",
# dpi = 200,
# width = 50,
# height = 120,
# units = "mm")
# # Median by cell type
# medians <- results$ranks %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
# dplyr::group_by(source, Disease_Group, Celltype) %>%
# dplyr::summarise(median_scaden = median(cell_type_proportion, na.rm = TRUE)) %>%
# dplyr::arrange(Celltype)
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
dplyr::group_by(Celltype) %>%
dplyr::do(pairwise.wilcox.test(x= .$cell_type_proportion, g = .$Disease_Group, p.adjust.method = "none", paired = FALSE) %>%
broom::tidy()) %>%
dplyr::mutate(fdr = p.adjust(p.value, method = "BH")) %>%
dplyr::arrange(fdr) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
dplyr::mutate(combined = str_c(cells, ":", samples)) %>%
dplyr::group_by(combined, Celltype) %>%
dplyr::do(pairwise.wilcox.test(x= .$cell_type_proportion, g = .$Disease_Group, p.adjust.method = "none", paired = FALSE) %>%
broom::tidy()) %>%
dplyr::group_by(combined) %>%
dplyr::mutate(fdr = p.adjust(p.value, method = "BH"),
rank = rank(p.value, ties.method = "average"),
comparison = str_c(group2, ":", group1)) %>%
ggplot(aes(x = reorder_within(x = comparison, by = rank, within = Celltype, fun = median, desc = F), y = rank)) +
geom_boxplot() +
facet_wrap(vars(Celltype), scales = "free_x") +
scale_x_reordered() +
labs(x = "", y = "Rank") +
theme_rhr +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
Figure 6.2: Ranking of p-values derived from Wilcoxon rank sum tests run for each combination of cells x samples (total of 6 different combinations - 100,500,1000 and 2000 cells run with 1000 samples and 1000 cells run with 2000 and 4000 samples). Lower rank = lower p-value.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] forcats_0.5.1 stringr_1.4.0
## [5] dplyr_1.0.2 purrr_0.3.4
## [7] readr_1.4.0 tidyr_1.1.1
## [9] tibble_3.0.3 tidyverse_1.3.0
## [11] readxl_1.3.1 here_1.0.0
## [13] ggpubr_0.4.0 ggplot2_3.3.2
## [15] ggsci_2.9 DESeq2_1.26.0
## [17] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [19] BiocParallel_1.20.1 matrixStats_0.56.0
## [21] Biobase_2.46.0 GenomicRanges_1.38.0
## [23] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [25] S4Vectors_0.24.4 BiocGenerics_0.32.0
## [27] DT_0.15 DescTools_0.99.37
## [29] biomaRt_2.42.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.8 Hmisc_4.4-1 qdapTools_1.3.5
## [4] BiocFileCache_1.10.2 plyr_1.8.6 splines_3.6.1
## [7] crosstalk_1.1.0.1 usethis_1.6.3 digest_0.6.27
## [10] htmltools_0.5.1.1 magrittr_2.0.1 checkmate_2.0.0
## [13] memoise_2.0.0 cluster_2.1.0 limma_3.42.2
## [16] remotes_2.2.0 openxlsx_4.2.3 annotate_1.64.0
## [19] modelr_0.1.8 askpass_1.1 prettyunits_1.1.1
## [22] jpeg_0.1-8.1 colorspace_2.0-0 blob_1.2.1
## [25] rvest_0.3.6 rappdirs_0.3.1 haven_2.3.1
## [28] xfun_0.16 callr_3.5.1 crayon_1.4.1
## [31] RCurl_1.98-1.2 jsonlite_1.7.1 genefilter_1.68.0
## [34] Exact_2.0 survival_3.2-7 glue_1.4.2
## [37] gtable_0.3.0 zlibbioc_1.32.0 XVector_0.26.0
## [40] HGNChelper_0.8.1 car_3.0-9 pkgbuild_1.1.0
## [43] abind_1.4-5 scales_1.1.1 mvtnorm_1.1-1
## [46] DBI_1.1.1 rstatix_0.6.0 Rcpp_1.0.5
## [49] xtable_1.8-4 progress_1.2.2 htmlTable_2.0.1
## [52] foreign_0.8-72 bit_4.0.4 Formula_1.2-4
## [55] htmlwidgets_1.5.3 httr_1.4.2 RColorBrewer_1.1-2
## [58] ellipsis_0.3.1 farver_2.0.3 pkgconfig_2.0.3
## [61] XML_3.99-0.3 nnet_7.3-12 dbplyr_1.4.4
## [64] locfit_1.5-9.4 labeling_0.4.2 reshape2_1.4.4
## [67] tidyselect_1.1.0 rlang_0.4.7 AnnotationDbi_1.48.0
## [70] munsell_0.5.0 cellranger_1.1.0 tools_3.6.1
## [73] cachem_1.0.3 cli_2.2.0.9000 generics_0.0.2
## [76] RSQLite_2.2.0 devtools_2.3.2 broom_0.7.0
## [79] ggdendro_0.1.21 evaluate_0.14 fastmap_1.0.1
## [82] yaml_2.2.1 processx_3.4.5 knitr_1.29
## [85] bit64_4.0.2 fs_1.5.0 zip_2.1.0
## [88] xml2_1.3.2 compiler_3.6.1 rstudioapi_0.13
## [91] curl_4.3 png_0.1-7 testthat_2.3.2
## [94] ggsignif_0.6.0 reprex_1.0.0 geneplotter_1.64.0
## [97] stringi_1.5.3 highr_0.8 ps_1.3.4
## [100] desc_1.2.0 lattice_0.20-38 Matrix_1.2-17
## [103] vctrs_0.3.2 pillar_1.4.6 lifecycle_0.2.0
## [106] cowplot_1.0.0 data.table_1.13.0 bitops_1.0-6
## [109] R6_2.5.0 latticeExtra_0.6-29 bookdown_0.21
## [112] gridExtra_2.3 rio_0.5.16 sessioninfo_1.1.1
## [115] boot_1.3-23 MASS_7.3-51.4 assertthat_0.2.1
## [118] pkgload_1.1.0 chron_2.3-56 openssl_1.4.2
## [121] rprojroot_2.0.2 withr_2.2.0 GenomeInfoDbData_1.2.2
## [124] expm_0.999-5 hms_1.0.0 grid_3.6.1
## [127] rpart_4.1-15 rmarkdown_2.5 carData_3.0-4
## [130] lubridate_1.7.9 base64enc_0.1-3