Aim: re-run deconvolution with Scaden, using combination of endothelial cell and pericytes as one cell type i.e. vascular cell.
source(here::here("R", "file_paths.R"))
path_to_scaden_results <- file.path(path_to_results, "deconvolution/scaden/")
sample_info <-
read_excel(path =
file.path(
path_to_raw_data,
"sample_details/20201229_MasterFile_SampleInfo.xlsx"
),
sheet = "SampleInfo", skip = 1) %>%
dplyr::filter(Sample_Type == "Tissue section" & sent_to_bulk_seq == "yes") %>%
dplyr::select(CaseNo, Disease_Group, Sex, AoO, AoD, DD, PMI, aSN, TAU, 'thal AB', 'aCG aSN score', Genetics, RINe_bulkRNA_Tapestation) %>%
dplyr::mutate(Disease_Group = fct_relevel(Disease_Group,
c("Control", "PD", "PDD","DLB"))) %>%
dplyr::rename(sample_id = CaseNo)
knitr::include_graphics(
here::here("docs", "deconvolution", "EndoPer_2020_02_05T13_55_14_495Z.png")
)
Steps include:
# Read in new cell type labels
ct <-
read_delim(file =
file.path(path_to_raw_data,
"Cell.Type.Annotations.For.All.Samples.csv"),
delim = " ",
col_names = c("ID", "Celltype"),
skip = 1)
# Read in old celltype file
# Create dataframe of scaden-formatted files
old_path <- file.path(path_to_raw_data, "deconvolution/scaden/2019Oct/")
sn_df <- data.frame(
file_path = list.files(path = old_path, full.names = T, include.dirs = F),
files = list.files(path = old_path, include.dirs = F),
sample_name = list.files(path = old_path, include.dirs = F) %>%
str_replace("_.*", ""),
file_type = list.files(path = old_path, include.dirs = F) %>%
str_replace(".txt", "") %>%
str_remove("^.*?_")) %>%
dplyr::filter(file_type %in% c("norm_counts_all", "celltypes"))
# Extract sample names for loop
samples <- sn_df %>% .[["sample_name"]] %>% as.character() %>% unique()
# Loop to load in files
for (i in 1:length(samples)) {
celltype <- read_delim(file = sn_df %>%
dplyr::filter(sample_name == samples[i], file_type == "celltypes") %>%
.[["file_path"]] %>%
as.character(),
delim = "\t") %>%
dplyr::mutate(sample_name = samples[i])
if(i == 1){
master_df <- celltype
}else{
master_df <- master_df %>%
dplyr::bind_rows(celltype)
}
}
# Calculate difference in numbers assigned to each cell type between old and new labels
ct %>%
tidyr::separate(ID, into = c("sample_name", "disease_group", "barcode"), sep = "_") %>%
dplyr::group_by(sample_name, Celltype) %>%
dplyr::summarise(n_new_labels = n()) %>%
dplyr::filter(sample_name %in% samples) %>%
dplyr::full_join(master_df %>%
dplyr::group_by(sample_name, Celltype) %>%
dplyr::summarise(n_old_labels = n())) %>%
dplyr::mutate(diff = n_new_labels - n_old_labels)
# copy files into container from server
docker cp /home/rreynolds/projects/Aim2_PDsequencing_wd/raw_data/deconvolution/scaden/2019Oct/ scaden_rreynolds:/home/data/PD_seq/scaden_2020Feb/
docker cp /home/rreynolds/projects/Aim2_PDsequencing_wd/raw_data/deconvolution/scaden/2020Feb/ scaden_rreynolds:/home/data/PD_seq/scaden_2020Feb/
# Re-initiate docker
docker exec -it scaden_rreynolds bash
# move files within docker to directory samples/
cd /home/data/PD_seq/scaden_2020Feb/
mkdir samples
cd /home/data/PD_seq/scaden_2020Feb/2019Oct
mv *.txt /home/data/PD_seq/scaden_2020Feb/samples/
cd /home/data/PD_seq/scaden_2020Feb/2020Feb/
mv *.txt /home/data/PD_seq/scaden_2020Feb/samples/
# Run script
bash /home/scripts/scaden_script.sh --sample_dir=/home/data/PD_seq/scaden_2020Feb/samples/ --n_cells=1000 --n_simulated_samples=1000 --n_training_steps=5000 --pred_path=/home/data/PD_seq/scaden/samples/bulkRNAseq_norm_counts.txt
# Copy docker output to server
docker cp scaden_rreynolds:/home/data/PD_seq/scaden_2020Feb/cells.1000_samples.1000/scaden_predictions_cells.1000:samples.1000.txt /home/rreynolds/projects/Aim2_PDsequencing_wd/results/deconvolution/scaden/2020Feb/scaden_predictions_cells.1000:samples.1000.txt
## Check number of genes in common between training data and bulk data (see dataset.dims in .h5ad object X)
library(hdf5r)
# docker cp scaden_rreynolds:/home/data/PD_seq/scaden_2020Feb/cells.1000_samples.1000/processed_pdseq_bulksim.h5ad /home/rreynolds/downloads/processed_pdseq_bulksim.h5ad
file.h5 <- H5File$new("/home/rreynolds/downloads/processed_pdseq_bulksim.h5ad", mode="r+")
file.h5$ls(recursive=TRUE) # 13191 x 24000
file.h5$close_all()
#----Loading results----
# Source functions
source(here::here("R", "scaden_rankings_and_calculate_ratios.R"))
# Calculate cell type proportions from cell type assignments
sn_proportions <-
ct %>%
dplyr::mutate(Celltype = case_when(Celltype %in% c("Endo", "Per") ~ "Vascular",
TRUE ~ Celltype)) %>%
tidyr::separate(ID, into = c("sample_id", "disease_group", "ID"), sep = "_") %>%
dplyr::mutate(sample_id = case_when(sample_id == "C36" ~ "C036",
sample_id == "C48" ~ "C048",
TRUE ~ sample_id)) %>%
dplyr::group_by(Celltype, sample_id) %>%
dplyr::summarise(n = n()) %>%
dplyr::group_by(sample_id) %>%
dplyr::mutate(snRNA_proportions = n/sum(n)) %>%
dplyr::ungroup() %>%
dplyr::select(-n) %>%
tidyr::spread(key = Celltype, value = snRNA_proportions)
# Create dataframe with cell type classes
cell_type_class <- data.frame(Celltype = sn_proportions %>%
dplyr::select(-sample_id) %>%
colnames() %>%
sort(),
Celltype_class = factor(c("Astrocyte", "Neuron", "Neuron",
"Immune", "Oligo", "Oligo", "Vascular"),
levels=c("Astrocyte", "Immune", "Oligo", "Vascular",
"Neuron")))
# Read in scaden predictions, determine within-sample rankings of cell-types by proportions, and calculate glia-to-neuron and non-neuron-to-neuron ratios
results <- scaden_rankings_and_calculate_ratios(prediction_dir = file.path(path_to_scaden_results, "2020Feb"),
unclassified_cell_type = "Unclassified",
sample_info = sample_info %>%
dplyr::select(sample_id, Disease_Group),
snRNAseq_proportions = sn_proportions,
cell_type_class = cell_type_class,
glia = vars(Astro, Microglia, Oligo, OPC),
nonneuron = vars(Astro, Vascular, Microglia, Oligo, OPC),
neuron = vars(Excitatory, Inhibitory),
astrocyte = vars(Astro),
oligos = vars(Oligo, OPC),
microglia = vars(Microglia))
# # Export cell-type proportions
# write_delim(sn_proportions, file.path(path_to_scaden_results, "2020Feb/snRNA_proportions.txt"), delim = "\t")
#
# write_delim(results$ranks %>%
# dplyr::select(-rank_within_sample),
# file.path(path_to_scaden_results, "2020Feb/scaden_summary_results.txt"), delim = "\t")
# saveRDS(results, file.path(path_to_scaden_results, "2020Feb/scaden_summary_results.Rds"))
# Read in old results
old_results <- read_delim(file = file.path(path_to_scaden_results, "2019Oct/scaden_summary_results.txt"),
delim = "\t")
# Merge endo and per
old_results <-
old_results %>%
dplyr::mutate(Celltype = case_when(Celltype %in% c("Endo", "Per") ~ "Vascular",
TRUE ~ Celltype)) %>%
dplyr::group_by(sample_id, Celltype, source, cells, samples, .drop = F) %>%
dplyr::summarise(cell_type_proportion = sum(cell_type_proportion)) %>%
dplyr::arrange(sample_id, cells, samples, -cell_type_proportion)
# Plot
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution") %>%
dplyr::select(-cells, -samples) %>%
dplyr::inner_join(old_results %>%
dplyr::filter(cells == 1000, samples == 1000, Celltype != "Unclassified") %>%
dplyr::ungroup() %>%
dplyr::select(-cells, -samples) %>%
dplyr::rename(cell_type_proportion_old = cell_type_proportion),
by = c("sample_id", "Celltype", "source")) %>%
ggplot(aes(x = cell_type_proportion, y = cell_type_proportion_old)) +
geom_point() +
labs(x = "New scaden predictions", y = "Old scaden predictions") +
theme_rhr
# Correlation of proportions for each cell type between new and old scaden predictions
cor.test(results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 1000 & samples == 1000) %>%
dplyr::arrange(sample_id, Celltype) %>%
.[["cell_type_proportion"]],
old_results %>%
dplyr::filter(cells == 1000, samples == 1000, Celltype != "Unclassified") %>%
dplyr::arrange(sample_id, Celltype) %>%
.[["cell_type_proportion"]],
method = "spearman")
##
## Spearman's rank correlation rho
##
## data: results$ranks %>% dplyr::filter(source == "Scaden deconvolution" & and old_results %>% dplyr::filter(cells == 1000, samples == 1000, cells == 1000 & samples == 1000) %>% dplyr::arrange(sample_id, and Celltype != "Unclassified") %>% dplyr::arrange(sample_id, Celltype) %>% .[["cell_type_proportion"]] and Celltype) %>% .[["cell_type_proportion"]]
## S = 17478, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9778828
data_to_plot <-
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & cells == 1000 & samples == 1000 |
source == "snRNA-seq") %>%
dplyr::select(sample_id, Celltype, cell_type_proportion, source) %>%
tidyr::spread(key = source, value = cell_type_proportion) %>%
dplyr::filter(Celltype != "Unclassified")
data_to_plot %>%
ggplot(aes(x = `Scaden deconvolution`, y = `snRNA-seq`)) +
geom_point() +
ggpubr::stat_cor(method = "spearman", label.x.npc = "centre", label.y.npc = "top", size = 2) +
facet_wrap(~ Celltype, scales = "free") +
labs(x = "Scaden cell-type proportions", "snRNA-seq cell-type proportions") +
theme_rhr +
theme(axis.text.x = element_text(angle = 0))
# Overall correlation
print("Overall correlation")
## [1] "Overall correlation"
cor.test(data_to_plot$`Scaden deconvolution`,
data_to_plot$`snRNA-seq`,
method = "spearman")
##
## Spearman's rank correlation rho
##
## data: data_to_plot$`Scaden deconvolution` and data_to_plot$`snRNA-seq`
## S = 589400, p-value = 0.0009174
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2541544
# Correlation of rankings for each cell type between scaden predictions and snRNA-seq proportions
print("Correlation by cell type")
## [1] "Correlation by cell type"
data_to_plot %>%
dplyr::filter(Celltype != "Unclassified") %>%
tidyr::nest(-Celltype) %>%
dplyr::mutate(test = purrr::map(data, ~ cor.test(.x$`Scaden deconvolution`, .x$`snRNA-seq`, method = "spearman")),
tidied = map(test, broom::tidy)) %>%
tidyr::unnest(tidied, .drop = TRUE) %>%
dplyr::select(-data, -test) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
# Median values
results$ratios %>%
dplyr::group_by(source, Disease_Group, ratio) %>%
dplyr::summarise(median = median(value))
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution") %>%
dplyr::select(-rank_within_sample, -cells, -samples, -source) %>%
tidyr::spread(key = Disease_Group, value = cell_type_proportion) %>%
dplyr::group_by(Celltype) %>%
dplyr::mutate(median_control = median(Control, na.rm = T)) %>%
dplyr::ungroup() %>%
# Calculate fold change from reference, controls, by dividing each column by median_control
dplyr::mutate_at(.vars = vars("Control", "PD", "PDD", "DLB"), .funs = function(x){x/.[["median_control"]]}) %>%
dplyr::select(-median_control) %>%
tidyr::gather(key = "Disease_Group", value = "cell_type_proportion_relative_to_control", -sample_id, -Celltype, -Celltype_class) %>%
# dplyr::filter(Celltype == "Excitatory") %>%
ggplot(aes(x = Celltype,
y = cell_type_proportion_relative_to_control,
fill = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")))) +
geom_boxplot() +
# scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
scale_fill_manual(values = c("#868686", "#c4ebe3", "#73cab9", "#19967d")) +
coord_cartesian(ylim = c(0,8)) +
labs(x = "Cell type", y = "Cell type proportion relative to controls", fill = "Disease group") +
theme_rhr
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
dplyr::group_by(Celltype) %>%
dplyr::do(kruskal.test(x= .$cell_type_proportion, g = .$Disease_Group) %>%
broom::tidy()) %>%
dplyr::arrange(p.value) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results$ranks %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
dplyr::group_by(Celltype) %>%
dplyr::do(pairwise.wilcox.test(x= .$cell_type_proportion, g = .$Disease_Group, p.adjust.method = "none", paired = FALSE) %>%
broom::tidy()) %>%
dplyr::mutate(fdr = p.adjust(p.value, method = "BH")) %>%
dplyr::arrange(fdr) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results_lm <- results$ranks %>%
dplyr::inner_join(sample_info %>%
dplyr::select(sample_id, RIN = RINe_bulkRNA_Tapestation)) %>%
dplyr::filter(source == "Scaden deconvolution" & Celltype!= "Unclassified" & cells == "1000" & samples == "1000") %>%
dplyr::group_by(Celltype) %>%
dplyr::do(lm(cell_type_proportion ~ Disease_Group + RIN, data = .) %>%
broom::tidy()) %>%
dplyr::mutate(fdr = p.adjust(p.value, method = "BH"))
results_lm %>%
dplyr::arrange(Celltype) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
results_lm %>%
dplyr::rename(model = Celltype) %>%
dotwhisker::small_multiple(dot_args = list(colour = "#888888", size = 0.5)) +
geom_point(data = subset(results_lm, fdr < 0.1 & term != "(Intercept)"), aes(x = Celltype, y = estimate), colour = "black", pch = 19, size = 3) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(x = "", y = "Coefficient estimate") +
coord_flip() +
theme_rhr +
theme(panel.grid.major.y = element_blank(),
legend.position = "none")
a <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution") %>%
dplyr::select(-rank_within_sample, -cells, -samples, -source, -Celltype_class) %>%
dplyr::mutate(median_neuron = results$ranks %>%
dplyr::filter(source == "Scaden deconvolution", Celltype == "Excitatory", Disease_Group == "Control") %>%
dplyr::group_by(Celltype) %>%
dplyr::summarise(median_neurons = median(cell_type_proportion)) %>%
.[["median_neurons"]],
relative_cell_type_proportion = cell_type_proportion/median_neuron) %>%
ggplot(aes(x = Celltype,
y = relative_cell_type_proportion,
fill = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")))) +
geom_boxplot() +
# scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
scale_fill_manual(values = c("#868686", "#c4ebe3", "#73cab9", "#19967d")) +
labs(x = "Cell type", y = "Cell type proportion relative to\n median of excitatory neurons in controls", fill = "Disease group") +
theme_rhr
b <- results$ranks %>%
dplyr::filter(source == "Scaden deconvolution") %>%
ggplot(aes(x = Celltype,
y = cell_type_proportion,
fill = fct_relevel(Disease_Group,
c("Control", "PD", "PDD", "DLB")))) +
geom_boxplot() +
# scale_fill_manual(values = pal_jco("default", alpha = 0.9)(10)[c(3,9,1,2)]) +
scale_fill_manual(values = c("#868686", "#c4ebe3", "#73cab9", "#19967d")) +
labs(x = "Cell type", y = "Cell type proportion", fill = "Disease group") +
theme_rhr
ggarrange(a, b,
nrow = 2,
labels = c("a", "b"),
common.legend = T)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] forcats_0.5.1 stringr_1.4.0
## [3] dplyr_1.0.2 purrr_0.3.4
## [5] readr_1.4.0 tidyr_1.1.1
## [7] tibble_3.0.3 tidyverse_1.3.0
## [9] readxl_1.3.1 ggpubr_0.4.0
## [11] ggsci_2.9 DESeq2_1.26.0
## [13] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [15] BiocParallel_1.20.1 matrixStats_0.56.0
## [17] Biobase_2.46.0 GenomicRanges_1.38.0
## [19] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [21] S4Vectors_0.24.4 BiocGenerics_0.32.0
## [23] DT_0.15 dotwhisker_0.5.0
## [25] ggplot2_3.3.2 DescTools_0.99.37
## [27] biomaRt_2.42.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.8 Hmisc_4.4-1 BiocFileCache_1.10.2
## [4] splines_3.6.1 crosstalk_1.1.0.1 digest_0.6.27
## [7] htmltools_0.5.1.1 magrittr_2.0.1 checkmate_2.0.0
## [10] memoise_2.0.0 cluster_2.1.0 openxlsx_4.2.3
## [13] annotate_1.64.0 modelr_0.1.8 askpass_1.1
## [16] prettyunits_1.1.1 jpeg_0.1-8.1 colorspace_2.0-0
## [19] blob_1.2.1 rvest_0.3.6 rappdirs_0.3.1
## [22] haven_2.3.1 xfun_0.16 crayon_1.4.1
## [25] RCurl_1.98-1.2 jsonlite_1.7.1 genefilter_1.68.0
## [28] Exact_2.0 survival_3.2-7 glue_1.4.2
## [31] gtable_0.3.0 zlibbioc_1.32.0 XVector_0.26.0
## [34] car_3.0-9 abind_1.4-5 scales_1.1.1
## [37] mvtnorm_1.1-1 DBI_1.1.1 rstatix_0.6.0
## [40] Rcpp_1.0.5 xtable_1.8-4 progress_1.2.2
## [43] htmlTable_2.0.1 ggstance_0.3.4 foreign_0.8-72
## [46] bit_4.0.4 Formula_1.2-4 htmlwidgets_1.5.3
## [49] httr_1.4.2 RColorBrewer_1.1-2 ellipsis_0.3.1
## [52] pkgconfig_2.0.3 XML_3.99-0.3 farver_2.0.3
## [55] nnet_7.3-12 dbplyr_1.4.4 locfit_1.5-9.4
## [58] here_1.0.0 tidyselect_1.1.0 labeling_0.4.2
## [61] rlang_0.4.7 AnnotationDbi_1.48.0 munsell_0.5.0
## [64] cellranger_1.1.0 tools_3.6.1 cachem_1.0.3
## [67] cli_2.2.0.9000 generics_0.0.2 RSQLite_2.2.0
## [70] broom_0.7.0 evaluate_0.14 fastmap_1.0.1
## [73] yaml_2.2.1 knitr_1.29 bit64_4.0.2
## [76] fs_1.5.0 zip_2.1.0 xml2_1.3.2
## [79] compiler_3.6.1 rstudioapi_0.13 curl_4.3
## [82] png_0.1-7 ggsignif_0.6.0 reprex_1.0.0
## [85] geneplotter_1.64.0 stringi_1.5.3 highr_0.8
## [88] lattice_0.20-38 Matrix_1.2-17 vctrs_0.3.2
## [91] pillar_1.4.6 lifecycle_0.2.0 cowplot_1.0.0
## [94] data.table_1.13.0 bitops_1.0-6 R6_2.5.0
## [97] latticeExtra_0.6-29 bookdown_0.21 gridExtra_2.3
## [100] rio_0.5.16 boot_1.3-23 MASS_7.3-51.4
## [103] assertthat_0.2.1 openssl_1.4.2 rprojroot_2.0.2
## [106] withr_2.2.0 GenomeInfoDbData_1.2.2 expm_0.999-5
## [109] hms_1.0.0 grid_3.6.1 rpart_4.1-15
## [112] rmarkdown_2.5 carData_3.0-4 lubridate_1.7.9
## [115] base64enc_0.1-3