Aim: to use the cell deconvolution algorithm, Scaden, to approximate the cell type composition of a given bulk tissue sample.
source(here::here("R", "file_paths.R"))
path_to_results <- file.path(path_to_wd, "results")
path_to_raw_data <- file.path(path_to_wd, "raw_data")
sample_info <-
read_excel(path =
file.path(
path_to_raw_data,
"sample_details/20201229_MasterFile_SampleInfo.xlsx"
),
sheet = "SampleInfo", skip = 1) %>%
dplyr::filter(Sample_Type == "Tissue section" & sent_to_bulk_seq == "yes") %>%
dplyr::select(CaseNo, Disease_Group, Sex, AoO, AoD, DD, PMI, aSN, TAU, 'thal AB', 'aCG aSN score', Genetics, RINe_bulkRNA_Tapestation) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD","DLB"))) %>%
dplyr::rename(sample_id = CaseNo)
Steps include:
scaden process
command, as this command looks for intersection of genes between training and prediction data.source(here::here("misc_scripts", "snRNAseq_transform_logcounts_to_raw.R"))
scaden process
a smaller function within this command (get_signature_genes
) is used to find the common genes between the training and prediction data. To do this it first computes the variance of each genes across samples and only includes those with a variance > 0.1 (line 80, in scaden/scaden/model/functions.py
: keep = data.var(axis=1) > var_cutoff
). Thus, important to scale bulk RNA-seq data to ensure variance isn't too small. Scaled by 1E6.source(here::here("R", "biomart_df.R"))
dds <-
readRDS(
file =
file.path(
path_to_results,
"Salmon_quant/bulk_tissue/PD_tissue_polyA_DESeqDataSet_design_SexRINAoD.Rds"
)
)
# Library size matrix
lib_matrix <- colSums(counts(dds)) %>%
as.matrix() %>%
t()
lib_matrix <- lib_matrix[rep(1, times = nrow(counts(dds))),]
# Divide raw counts by library size and scale with 1E6
norm_counts <- (counts(dds)/lib_matrix) * 1000000
# Processing for final scaden format
# Convert ensembl IDs to gene symbols
norm_counts <- norm_counts %>%
as.data.frame() %>%
rownames_to_column(var = "gene") %>%
biomart_df(columnToFilter = "gene", mart = 38, attributes = c("ensembl_gene_id", "hgnc_symbol"), filter = "ensembl_gene_id") %>%
dplyr::select(gene, hgnc_symbol, everything())
# Filter out rows where ensembl ID has no corresponding gene symbol
# Remove duplicate hgnc_symbols (i.e. multiple ensembl ids mapping to same hgnc_symbol)
norm_counts <- norm_counts %>%
dplyr::filter(hgnc_symbol != "") %>%
dplyr::filter(!(duplicated(hgnc_symbol) | duplicated(hgnc_symbol, fromLast = TRUE))) %>%
dplyr::select(-gene)
# Save for scaden
write_delim(format(norm_counts, scientific = FALSE),
path =
file.path(
path_to_raw_data,
"deconvolution/scaden/bulkRNAseq_norm_counts.txt"
),
delim = "\t")
# initiate docker
docker run --name scaden_rreynolds -it kevinmenden/scaden
# to exit
exit
# to re-connect to scaden_rreynolds container and get a bash shell in the container
docker exec -it scaden_rreynolds bash
#---Generating training data----------------------------------------
# required installation of anndata module (not within container)
pip install anndata
# copy files into container from server
docker cp /home/rreynolds/projects/Aim2_PDsequencing_wd/raw_data/deconvolution/scaden/2019Oct scaden_rreynolds:/home/data/PD_seq/
# move files within docker to directory samples/
cd /home/data/PD_seq/scaden/
mkdir samples
mv *.txt samples/
# make folder for simulated samples
mkdir /home/data/PD_seq/scaden/cells.100_samples.1000/samples_simulated
# bulk simulation
python /opt/conda/pkgs/scaden-0.9.0-py_0/site-packages/scaden/preprocessing/bulk_simulation.py --cells 100 --samples 1000 --data /home/data/PD_seq/scaden/samples/ --out /home/data/PD_seq/scaden/cells.100_samples.1000/samples_simulated/
# Create h5ad file
python /opt/conda/pkgs/scaden-0.9.0-py_0/site-packages/scaden/preprocessing/create_h5ad_file.py --data /home/data/PD_seq/scaden/cells.100_samples.1000/samples_simulated/ --out /home/data/PD_seq/scaden/cells.100_samples.1000/pdseq_bulksim.h5ad
#---pre-processing of data----------------------------------------
scaden process /home/data/PD_seq/scaden/cells.100_samples.1000/pdseq_bulksim.h5ad /home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt --processed_path /home/data/PD_seq/scaden/cells.100_samples.1000/processed_pdseq_bulksim.h5ad
mkdir scaden_model_5000steps
scaden train /home/data/PD_seq/scaden/cells.100_samples.1000/processed_pdseq_bulksim.h5ad --model_dir /home/data/PD_seq/scaden/cells.100_samples.1000/scaden_model_5000steps --steps 5000
scaden predict /home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt --model_dir /home/data/PD_seq/scaden/cells.100_samples.1000/scaden_model_5000steps --outname /home/data/PD_seq/scaden/cells.100_samples.1000/scaden_predictions_cells.100:samples.1000.txt
# exit docker
exit
# copy output to server
docker cp scaden_rreynolds:/home/data/PD_seq/scaden/cells.100_samples.1000/scaden_predictions_cells.100:samples.1000.txt /home/rreynolds/projects/Aim2_PDsequencing_wd/results/deconvolution/scaden/2019Oct/scaden_predictions_cells.100:samples.1000.txt
#----Loading results----
# Source functions
source(here::here("R", "calculate_snRNA_proportions.R"))
source(here::here("R", "scaden_rankings_and_calculate_ratios.R"))
# Calculate cell type proportions from cell type assignments
sn_proportions <-
calculate_snRNA_proportions(
path_to_snRNAseq_celltype_files =
file.path(
path_to_raw_data,
"deconvolution/scaden/2019Oct"
)
)
# Create dataframe with cell type classes
cell_type_class <-
data.frame(Celltype = sn_proportions %>%
dplyr::select(-sample_id) %>%
colnames() %>%
sort(),
Celltype_class = factor(c("Astrocyte", "Vascular", "Neuron", "Neuron",
"Immune", "Oligo", "Oligo", "Vascular",
"Unclassified"),
levels=c("Astrocyte", "Immune", "Oligo", "Vascular",
"Neuron", "Unclassified")))
# Read in scaden predictions, determine within-sample rankings of cell-types by proportions, and calculate glia-to-neuron and non-neuron-to-neuron ratios
results <-
scaden_rankings_and_calculate_ratios(prediction_dir =
file.path(
path_to_results,
"deconvolution/scaden/2019Oct"
),
unclassified_cell_type = "Unclassified",
sample_info = sample_info %>%
dplyr::select(sample_id, Disease_Group),
snRNAseq_proportions = sn_proportions,
cell_type_class = cell_type_class,
glia = vars(Astro, Microglia, Oligo, OPC),
nonneuron = vars(Astro, Endo, Microglia, Oligo, OPC, Per),
neuron = vars(Excitatory, Inhibitory),
astrocyte = vars(Astro),
oligos = vars(Oligo, OPC),
microglia = vars(Microglia))
# # Export cell-type proportions
# write_delim(results$ranks %>%
# dplyr::select(-rank_within_sample),
# "/home/rreynolds/projects/Aim2_PDsequencing_wd/results/deconvolution/scaden/2019Oct/scaden_summary_results.txt", delim = "\t")
data_to_plot <-
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 |
source == "snRNA-seq") %>%
dplyr::select(sample_id, Celltype, cell_type_proportion, source) %>%
tidyr::spread(key = source, value = cell_type_proportion) %>%
dplyr::filter(Celltype != "Unclassified")
data_to_plot %>%
ggplot(aes(x = `Scaden deconvolution`, y = `snRNA-seq`)) +
geom_point() +
facet_wrap(~ Celltype, scales = "free") +
labs(x = "Scaden cell-type proportions", y = "snRNA-seq cell-type proportions") +
theme_rhr +
theme(axis.text.x = element_text(angle = 0))
# Overall correlation
print("Overall correlation")
## [1] "Overall correlation"
cor.test(data_to_plot$`Scaden deconvolution`,
data_to_plot$`snRNA-seq`,
method = "spearman")
##
## Spearman's rank correlation rho
##
## data: data_to_plot$`Scaden deconvolution` and data_to_plot$`snRNA-seq`
## S = 523352, p-value = 1.225e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.414071
# Correlation of rankings for each cell type between scaden predictions and snRNA-seq proportions
print("Correlation by cell type")
## [1] "Correlation by cell type"
data_to_plot %>%
dplyr::filter(Celltype != "Unclassified") %>%
tidyr::nest(-Celltype) %>%
dplyr::mutate(test = purrr::map(data, ~ cor.test(.x$`Scaden deconvolution`, .x$`snRNA-seq`, method = "spearman")),
tidied = map(test, broom::tidy)) %>%
tidyr::unnest(tidied, .drop = TRUE) %>%
dplyr::select(-data, -test) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 | source == "snRNA-seq") %>%
ggplot(aes(x = fct_relevel(Celltype,
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000) %>%
arrange(Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()),
y = cell_type_proportion,
fill = Celltype_class)) +
geom_boxplot() +
facet_grid(Disease_Group ~ source) +
ylim(0,100) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "", y = "Cell type proportion", fill = "Cell-type class") +
theme_rhr
plot <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 1000 & samples == 1000 & Disease_Group == "Control" & Celltype != "Unclassified") %>%
ggplot(aes(x = fct_relevel(Celltype,
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000) %>%
arrange(Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()),
y = cell_type_proportion,
fill = Celltype_class)) +
geom_boxplot() +
# ylim(0,100) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "", y = "Cell type proportion", fill = "Cell-type class") +
theme_rhr
# ggsave(plot,
# filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/Departmental/control_deconvolution_prop.tiff",
# height = 120,
# width = 100,
# dpi = 200,
# units = "mm")
# # Median by cell type
# medians <- results$ranks %>%
# dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 | source == "snRNA-seq") %>%
# dplyr::group_by(source, Disease_Group, Celltype) %>%
# dplyr::summarise(median_scaden = median(cell_type_proportion, na.rm = TRUE)) %>%
# dplyr::arrange(Celltype)
results$ratios %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 100 & samples == 1000 | source == "snRNA-seq") %>%
ggplot(aes(x = Disease_Group, y = value)) +
geom_boxplot(aes(fill = ratio)) +
geom_hline(yintercept = 1.5, linetype = 2, colour = "black") +
facet_wrap(~ source) +
labs(x = "", y = "Value of ratio", fill = "Type of ratio") +
coord_cartesian(ylim = c(0,20)) +
theme_rhr
plot <-
results$ratios %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 1000 & samples == 1000 & ratio == "glia_neuron_ratio" & Disease_Group == "Control") %>%
ggplot(aes(x = Disease_Group, y = value)) +
geom_boxplot() +
geom_hline(yintercept = 1.5, linetype = 2, colour = "black") +
labs(x = "", y = "Glia:neuron ratio") +
scale_y_continuous(limits = c(0,10)) +
# coord_cartesian(ylim = c(0,20)) +
theme_rhr
# ggsave(plot,
# filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/Departmental/glia_neuron_ratio.tiff",
# device = "tiff",
# height = 120,
# width = 30,
# dpi = 200,
# units = "mm")
# # Median values
# ratios %>%
# dplyr::group_by(source, Disease_Group, ratio) %>%
# dplyr::summarise(median = median(value))
# Copy scaden_script.sh to docker container
docker cp /home/rreynolds/projects/Aim2_PDsequencing_wd/LBD-seq-bulk-analyses/misc_scripts/scaden_script.sh scaden_rreynolds:/home/scripts/scaden_script.sh
# Run script for each sample scenario after navigating to docker and re-initiating
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=500 --n_simulated_samples=1000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=1000 --n_simulated_samples=1000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=2000 --n_simulated_samples=1000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=1000 --n_simulated_samples=2000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden/samples/ --n_cells=1000 --n_simulated_samples=4000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
# # Plots showing raw cell-type proportions
# a <- results$ranks %>%
# dplyr::mutate(cells = fct_relevel(cells,
# c("100", "500", "1000", "2000"))) %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & samples == "1000") %>%
# ggplot(aes(x = cells,
# y = cell_type_proportion,
# fill = Celltype_class)) +
# geom_boxplot() +
# facet_grid(cols = vars(Disease_Group),
# rows = vars(fct_relevel(Celltype,
# results$ranks %>%
# dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
# arrange(source, cells_samples, Celltype_class, Celltype) %>%
# .[["Celltype"]] %>%
# unique() %>%
# as.character())),
# scales = "free_y") +
# # ylim(0,100) +
# scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
# labs(x = "Number of cells used per simulated sample", y = "Cell type proportion", fill = "Cell-type class") +
# theme_rhr
#
# b <- results$ranks %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000") %>%
# ggplot(aes(x = samples,
# y = cell_type_proportion,
# fill = Celltype_class)) +
# geom_boxplot() +
# facet_grid(cols = vars(Disease_Group),
# rows = vars(fct_relevel(Celltype,
# results$ranks %>%
# dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
# arrange(source, cells_samples, Celltype_class, Celltype) %>%
# .[["Celltype"]] %>%
# unique() %>%
# as.character())),
# scales = "free_y") +
# # ylim(0,100) +
# scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
# labs(x = "Number of bulk simulated samples used per individual", y = "Cell type proportion", fill = "Cell-type class") +
# theme_rhr
a <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & samples == "1000") %>%
dplyr::select(-rank_within_sample) %>%
# Spread columns and group by Disease & Celltype to calculate median for 100 cells
tidyr::spread(key = cells, value = cell_type_proportion) %>%
dplyr::group_by(Disease_Group, Celltype) %>%
dplyr::mutate(median_100 = median(`100`)) %>%
dplyr::ungroup() %>%
# Calculate fold change from reference, 100 cells, by dividing each column by median_100
dplyr::mutate_at(.vars = vars(`100`, `500`, `1000`, `2000`), .funs = function(x){x/.[["median_100"]]}) %>%
dplyr::select(-median_100) %>%
tidyr::gather(key = "cells", value = "cell_type_proportion_relative_to_100_cells", -sample_id, -Celltype, -source, -samples, -Celltype_class, -Disease_Group) %>%
ggplot(aes(x = fct_relevel(cells,
rev(c("100", "500", "1000", "2000"))),
y = cell_type_proportion_relative_to_100_cells,
fill = Celltype_class)) +
geom_boxplot() +
geom_hline(aes(yintercept = 1), colour = "black", linetype = "dashed", size = 0.75) +
facet_grid(cols = vars(Disease_Group),
rows = vars(fct_relevel(Celltype,
results$ranks %>%
dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
arrange(source, cells_samples, Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()))) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "Number of cells used per simulated sample", y = "Cell-type proportion relative to 100 cells", fill = "Cell-type class") +
theme_rhr_coord_flip +
coord_flip()
b <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000") %>%
dplyr::select(-rank_within_sample) %>%
# Spread columns and group by Disease & Celltype to calculate median for 1000 samples
tidyr::spread(key = samples, value = cell_type_proportion) %>%
dplyr::group_by(Disease_Group, Celltype) %>%
dplyr::mutate(median_1000 = median(`1000`)) %>%
dplyr::ungroup() %>%
# Calculate fold change from reference, 1000 samples, by dividing each column by median_1000
dplyr::mutate_at(.vars = vars(`1000`, `2000`, `4000`), .funs = function(x){x/.[["median_1000"]]}) %>%
dplyr::select(-median_1000) %>%
tidyr::gather(key = "samples", value = "cell_type_proportion_relative_to_1000_samples", -sample_id, -Celltype, -source, -cells, -Celltype_class, -Disease_Group) %>%
ggplot(aes(x = fct_relevel(samples,
rev(c("1000", "2000", "4000"))),
y = cell_type_proportion_relative_to_1000_samples,
fill = Celltype_class)) +
geom_boxplot() +
geom_hline(aes(yintercept = 1), colour = "black", linetype = "dashed", size = 0.75) +
facet_grid(cols = vars(Disease_Group),
rows = vars(fct_relevel(Celltype,
results$ranks %>%
dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples)) %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
arrange(source, cells_samples, Celltype_class, Celltype) %>%
.[["Celltype"]] %>%
unique() %>%
as.character()))) +
ylim(0,9) +
scale_fill_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
scale_colour_manual(values = pal_npg("nrc", alpha = 1)(10)[c(4,2,1,6,3,10)]) +
labs(x = "Number of bulk simulated samples used per individual", y = "Cell-type proportion relative to 1000 samples", fill = "Cell-type class") +
theme_rhr_coord_flip +
coord_flip()
ggarrange(a, b,
labels = c("a", "b"),
nrow = 2,
common.legend = TRUE)
results$ratios %>%
dplyr::mutate(cells_samples = str_c("cells.", cells, "_samples.", samples) %>%
tidyr::replace_na(., "snRNA-seq") %>%
fct_relevel(.,
c("cells.100_samples.1000", "cells.500_samples.1000", "cells.1000_samples.1000",
"cells.2000_samples.1000", "cells.1000_samples.2000", "cells.1000_samples.4000", "snRNA-seq"))) %>%
dplyr::filter(Disease_Group == "Control") %>%
ggplot(aes(x = ratio, y = value)) +
geom_boxplot(aes(fill = ratio), width = 0.4) +
geom_hline(yintercept = 1.5, linetype = 2, colour = "black") +
facet_wrap(vars(cells_samples)) +
labs(x = "", y = "Value of ratio", fill = "Type of ratio") +
coord_cartesian(ylim = c(0,15)) +
theme_rhr
# # Median values
# ratios %>%
# dplyr::group_by(source, Disease_Group, ratio) %>%
# dplyr::summarise(median = median(value))
source(here::here("R", "scoring_deconvolution_performance.R"))
scoring_deconvolution_performance(results_list = results %>%
lapply(., function(x){x %>% dplyr::filter(Disease_Group == "Control" & source == "Scaden deconvolution")}),
true_GNR = 1.5,
deconvolution_variables = vars(cells, samples),
glia = c("Astro", "Microglia", "Oligo"),
oligo = c("Oligo"),
astrocyte = c("Astro"),
microglia = c("Microglia"),
use_ranges = TRUE) %>%
dplyr::arrange(overall_rank, GNR_rank) %>%
dplyr::select(cells, samples, overall_rank, overall_score, everything()) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
overall_rank
: final ranking of deconvolution instances based on overall_score
i.e. highest overall score receives rank of 1overall_score
: overall score across the five scoring parameters (GNR_score, *_glial_proportion and glia ranking_score). For each scoring parameter, award score of 1 to the deconvolution instance(s) with the highest rank (where 1 = best).GNR_rank
: ranking of instances from best (equivalent to rank of 1) to worst by the deviation of its median glia-to-neuron ratio from the true ratio.*_glial_proportion
: ranking of each instance from best (equivalent to rank of 1) to worst based on the proportion of samples within a deconvolution instance that have a proportion of the glial cell-type in question that falls within expected ranges.*_glial_proportion_deviation_from_mean
: ranking of each instance from best (equivalent to rank 1) to worst by the deviation of its median astro/oligo/microglial glial proportion from the expected mean across 3 studies.glia_ranking_score
: ranking of deconvolution instance form best (equivalent to rank of 1) to worst by the proportion of samples wherein oligodendrocytes make up the biggest glial cell-type population, followed by astrocytes and then microglia.plot <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
dplyr::select(-rank_within_sample, -cells, -samples, -source) %>%
tidyr::spread(key = Disease_Group, value = cell_type_proportion) %>%
dplyr::group_by(Celltype) %>%
dplyr::mutate(median_control = median(Control, na.rm = T)) %>%
dplyr::ungroup() %>%
# Calculate fold change from reference, controls, by dividing each column by median_control
dplyr::mutate_at(.vars = vars("Control", "PD", "PDD", "DLB"), .funs = function(x){x/.[["median_control"]]}) %>%
dplyr::select(-median_control) %>%
tidyr::gather(key = "Disease_Group", value = "cell_type_proportion_relative_to_control", -sample_id, -Celltype, -Celltype_class) %>%
# dplyr::filter(Celltype == "Excitatory") %>%
ggplot(aes(x = Celltype,
y = cell_type_proportion_relative_to_control,
fill = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")))) +
geom_boxplot() +
# scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
scale_fill_manual(values = c("#868686", "#c4ebe3", "#73cab9", "#19967d")) +
coord_cartesian(ylim = c(0,8)) +
labs(x = "Cell type", y = "Cell type proportion relative to controls", fill = "Disease group") +
theme_rhr
plot
# ggsave(plot,
# filename = "/home/rreynolds/projects/Aim2_PDsequencing_wd/figures/IPDGC_presentation/scaden_deconvolution_exct.tiff",
# device = "tiff",
# dpi = 200,
# width = 50,
# height = 120,
# units = "mm")
# # Median by cell type
# medians <- results$ranks %>%
# dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
# dplyr::group_by(source, Disease_Group, Celltype) %>%
# dplyr::summarise(median_scaden = median(cell_type_proportion, na.rm = TRUE)) %>%
# dplyr::arrange(Celltype)
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
dplyr::group_by(Celltype) %>%
dplyr::do(pairwise.wilcox.test(x= .$cell_type_proportion, g = .$Disease_Group, p.adjust.method = "none", paired = FALSE) %>%
broom::tidy()) %>%
dplyr::mutate(fdr = p.adjust(p.value, method = "BH")) %>%
dplyr::arrange(fdr) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified") %>%
dplyr::mutate(combined = str_c(cells, ":", samples)) %>%
dplyr::group_by(combined, Celltype) %>%
dplyr::do(pairwise.wilcox.test(x= .$cell_type_proportion, g = .$Disease_Group, p.adjust.method = "none", paired = FALSE) %>%
broom::tidy()) %>%
dplyr::group_by(combined) %>%
dplyr::mutate(fdr = p.adjust(p.value, method = "BH"),
rank = rank(p.value, ties.method = "average"),
comparison = str_c(group2, ":", group1)) %>%
ggplot(aes(x = reorder_within(x = comparison, by = rank, within = Celltype, fun = median, desc = F), y = rank)) +
geom_boxplot() +
facet_wrap(vars(Celltype), scales = "free_x") +
scale_x_reordered() +
labs(x = "", y = "Rank") +
theme_rhr +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MarkerGenes_0.0.0.9000 EWCE_0.99.2
## [3] forcats_0.5.1 stringr_1.4.0
## [5] dplyr_1.0.2 purrr_0.3.4
## [7] readr_1.4.0 tidyr_1.1.1
## [9] tibble_3.0.3 tidyverse_1.3.0
## [11] readxl_1.3.1 here_1.0.0
## [13] ggpubr_0.4.0 ggplot2_3.3.2
## [15] ggsci_2.9 DESeq2_1.26.0
## [17] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [19] BiocParallel_1.20.1 matrixStats_0.56.0
## [21] Biobase_2.46.0 GenomicRanges_1.38.0
## [23] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [25] S4Vectors_0.24.4 BiocGenerics_0.32.0
## [27] DT_0.15 DescTools_0.99.37
## [29] biomaRt_2.42.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.8 Hmisc_4.4-1 qdapTools_1.3.5
## [4] BiocFileCache_1.10.2 plyr_1.8.6 splines_3.6.1
## [7] crosstalk_1.1.0.1 usethis_1.6.3 digest_0.6.27
## [10] htmltools_0.5.1.1 magrittr_2.0.1 checkmate_2.0.0
## [13] memoise_2.0.0 cluster_2.1.0 limma_3.42.2
## [16] remotes_2.2.0 openxlsx_4.2.3 annotate_1.64.0
## [19] modelr_0.1.8 askpass_1.1 prettyunits_1.1.1
## [22] jpeg_0.1-8.1 colorspace_2.0-0 blob_1.2.1
## [25] rvest_0.3.6 rappdirs_0.3.1 haven_2.3.1
## [28] xfun_0.16 callr_3.5.1 crayon_1.4.1
## [31] RCurl_1.98-1.2 jsonlite_1.7.1 genefilter_1.68.0
## [34] Exact_2.0 survival_3.2-7 glue_1.4.2
## [37] gtable_0.3.0 zlibbioc_1.32.0 XVector_0.26.0
## [40] HGNChelper_0.8.1 car_3.0-9 pkgbuild_1.1.0
## [43] abind_1.4-5 scales_1.1.1 mvtnorm_1.1-1
## [46] DBI_1.1.1 rstatix_0.6.0 Rcpp_1.0.5
## [49] xtable_1.8-4 progress_1.2.2 htmlTable_2.0.1
## [52] foreign_0.8-72 bit_4.0.4 Formula_1.2-4
## [55] htmlwidgets_1.5.3 httr_1.4.2 RColorBrewer_1.1-2
## [58] ellipsis_0.3.1 farver_2.0.3 pkgconfig_2.0.3
## [61] XML_3.99-0.3 nnet_7.3-12 dbplyr_1.4.4
## [64] locfit_1.5-9.4 labeling_0.4.2 reshape2_1.4.4
## [67] tidyselect_1.1.0 rlang_0.4.7 AnnotationDbi_1.48.0
## [70] munsell_0.5.0 cellranger_1.1.0 tools_3.6.1
## [73] cachem_1.0.3 cli_2.2.0.9000 generics_0.0.2
## [76] RSQLite_2.2.0 devtools_2.3.2 broom_0.7.0
## [79] ggdendro_0.1.21 evaluate_0.14 fastmap_1.0.1
## [82] yaml_2.2.1 processx_3.4.5 knitr_1.29
## [85] bit64_4.0.2 fs_1.5.0 zip_2.1.0
## [88] xml2_1.3.2 compiler_3.6.1 rstudioapi_0.13
## [91] curl_4.3 png_0.1-7 testthat_2.3.2
## [94] ggsignif_0.6.0 reprex_1.0.0 geneplotter_1.64.0
## [97] stringi_1.5.3 highr_0.8 ps_1.3.4
## [100] desc_1.2.0 lattice_0.20-38 Matrix_1.2-17
## [103] vctrs_0.3.2 pillar_1.4.6 lifecycle_0.2.0
## [106] cowplot_1.0.0 data.table_1.13.0 bitops_1.0-6
## [109] R6_2.5.0 latticeExtra_0.6-29 bookdown_0.21
## [112] gridExtra_2.3 rio_0.5.16 sessioninfo_1.1.1
## [115] boot_1.3-23 MASS_7.3-51.4 assertthat_0.2.1
## [118] pkgload_1.1.0 chron_2.3-56 openssl_1.4.2
## [121] rprojroot_2.0.2 withr_2.2.0 GenomeInfoDbData_1.2.2
## [124] expm_0.999-5 hms_1.0.0 grid_3.6.1
## [127] rpart_4.1-15 rmarkdown_2.5 carData_3.0-4
## [130] lubridate_1.7.9 base64enc_0.1-3