Aim: deconvolution of SRP058181 dataset using Scaden and our own single-nucleus RNA-sequencing data.
source(here::here("R", "file_paths.R"))
source(here::here("R", "biomart_df.R"))
source(here::here("R", "scaden_rankings_and_calculate_ratios.R"))
sample_info <-
readRDS(
file =
file.path(
path_to_results,
"SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
)
) %>%
colData() %>%
as_tibble() %>%
# Remove SRR2015746 (C0061), due to uncertainty re. sex
dplyr::filter(recount_id != "SRR2015746")
dds <-
readRDS(
file =
file.path(
path_to_results,
"SRP058181/gene_level_quant/SRP058181_DESeqDataSet.Rds"
)
)
# Remove SRR2015746 (C0061), due to uncertainty re. sex
dds <- dds[, dds$recount_id != "SRR2015746"]
# Library size matrix
lib_matrix <- colSums(counts(dds)) %>%
as.matrix() %>%
t()
lib_matrix <- lib_matrix[rep(1, times = nrow(counts(dds))),]
# Divide raw counts by library size and scale with 1E6 to get counts per million
norm_counts <- (counts(dds)/lib_matrix) * 1000000
# Processing for final scaden format
# Remove ".*" following each ensembl ID
# Convert ensembl IDs to gene symbols
norm_counts <- norm_counts %>%
as.data.frame() %>%
rownames_to_column(var = "gene") %>%
dplyr::mutate(gene = str_replace(gene, "\\..*", "")) %>%
# According to recount2, study aligned to hg38
biomart_df(
columnToFilter = "gene",
mart = 38,
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filter = "ensembl_gene_id"
) %>%
dplyr::select(gene, hgnc_symbol, everything())
# Filter out rows where ensembl ID has no corresponding gene symbol
# Remove duplicate hgnc_symbols (i.e. multiple ensembl ids mapping to same hgnc_symbol)
norm_counts <- norm_counts %>%
dplyr::filter(hgnc_symbol != "") %>%
dplyr::filter(!(duplicated(hgnc_symbol) | duplicated(hgnc_symbol, fromLast = TRUE))) %>%
dplyr::select(-gene)
# Save for scaden
write_delim(
format(norm_counts, scientific = FALSE),
path =
file.path(
path_to_raw_data,
"deconvolution/scaden/SRP058181_norm_counts.txt"
),
delim = "\t"
)
# copy files into container from server
docker cp /home/rreynolds/projects/Aim2_PDsequencing_wd/raw_data/deconvolution/scaden/SRP058181_norm_counts.txt scaden_rreynolds:/home/data/SRP058181/
# Re-initiate docker
docker exec -it scaden_rreynolds bash
# Make directory
mkdir /home/data/SRP058181/cells.1000_samples.1000/
# Run scaden
scaden process /home/data/PD_seq/scaden_2020Feb/cells.1000_samples.1000/pdseq_bulksim.h5ad /home/data/SRP058181/SRP058181_norm_counts.txt --processed_path /home/data/SRP058181/cells.1000_samples.1000/processed_pdseq_bulksim.h5ad
scaden train /home/data/SRP058181/cells.1000_samples.1000/processed_pdseq_bulksim.h5ad --model_dir /home/data/SRP058181/cells.1000_samples.1000/scaden_model_5000_steps --steps 5000
scaden predict /home/data/SRP058181/SRP058181_norm_counts.txt --model_dir /home/data/SRP058181/cells.1000_samples.1000/scaden_model_5000_steps --outname /home/data/SRP058181/cells.1000_samples.1000/scaden_predictions_cells.1000:samples.1000.txt
# Copy docker output to server
docker cp scaden_rreynolds:/home/data/SRP058181/cells.1000_samples.1000/scaden_predictions_cells.1000:samples.1000.txt /home/rreynolds/projects/Aim2_PDsequencing_wd/results/deconvolution/scaden/SRP058181/scaden_predictions_cells.1000:samples.1000.txt
## Check number of genes in common between training data and bulk data (see dataset.dims in .h5ad object X)
library(hdf5r)
# docker cp scaden_rreynolds:/home/data/SRP058181/cells.1000_samples.1000/processed_pdseq_bulksim.h5ad /home/rreynolds/downloads/SRP058181_processed_pdseq_bulksim.h5ad
file.h5 <- H5File$new("/home/rreynolds/downloads/SRP058181_processed_pdseq_bulksim.h5ad", mode="r+")
file.h5$ls(recursive=TRUE) # 14094 x 24000
file.h5$close_all()
results$ranks %>%
dplyr::filter(Celltype!= "Unclassified") %>%
dplyr::group_by(Celltype) %>%
dplyr::do(pairwise.wilcox.test(x= .$cell_type_proportion, g = .$Disease_Group, p.adjust.method = "none", paired = FALSE) %>%
broom::tidy()) %>%
dplyr::mutate(fdr = p.adjust(p.value, method = "BH")) %>%
dplyr::arrange(fdr) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] forcats_0.5.1 stringr_1.4.0
## [3] dplyr_1.0.2 purrr_0.3.4
## [5] readr_1.4.0 tidyr_1.1.1
## [7] tibble_3.0.3 tidyverse_1.3.0
## [9] readxl_1.3.1 ggpubr_0.4.0
## [11] ggsci_2.9 factoextra_1.0.7
## [13] ggplot2_3.3.2 DESeq2_1.26.0
## [15] SummarizedExperiment_1.16.1 DelayedArray_0.12.3
## [17] BiocParallel_1.20.1 matrixStats_0.56.0
## [19] Biobase_2.46.0 GenomicRanges_1.38.0
## [21] GenomeInfoDb_1.22.1 IRanges_2.20.2
## [23] S4Vectors_0.24.4 BiocGenerics_0.32.0
## [25] DT_0.15 DescTools_0.99.37
## [27] biomaRt_2.42.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.8 Hmisc_4.4-1 BiocFileCache_1.10.2
## [4] splines_3.6.1 crosstalk_1.1.0.1 digest_0.6.27
## [7] htmltools_0.5.1.1 magrittr_2.0.1 checkmate_2.0.0
## [10] memoise_2.0.0 cluster_2.1.0 openxlsx_4.2.3
## [13] annotate_1.64.0 modelr_0.1.8 askpass_1.1
## [16] prettyunits_1.1.1 jpeg_0.1-8.1 colorspace_2.0-0
## [19] blob_1.2.1 rvest_0.3.6 rappdirs_0.3.1
## [22] ggrepel_0.8.2 haven_2.3.1 xfun_0.16
## [25] crayon_1.4.1 RCurl_1.98-1.2 jsonlite_1.7.1
## [28] genefilter_1.68.0 Exact_2.0 survival_3.2-7
## [31] glue_1.4.2 gtable_0.3.0 zlibbioc_1.32.0
## [34] XVector_0.26.0 car_3.0-9 abind_1.4-5
## [37] scales_1.1.1 mvtnorm_1.1-1 DBI_1.1.1
## [40] rstatix_0.6.0 Rcpp_1.0.5 xtable_1.8-4
## [43] progress_1.2.2 htmlTable_2.0.1 foreign_0.8-72
## [46] bit_4.0.4 Formula_1.2-4 htmlwidgets_1.5.3
## [49] httr_1.4.2 RColorBrewer_1.1-2 ellipsis_0.3.1
## [52] pkgconfig_2.0.3 XML_3.99-0.3 farver_2.0.3
## [55] nnet_7.3-12 dbplyr_1.4.4 locfit_1.5-9.4
## [58] here_1.0.0 tidyselect_1.1.0 labeling_0.4.2
## [61] rlang_0.4.7 AnnotationDbi_1.48.0 munsell_0.5.0
## [64] cellranger_1.1.0 tools_3.6.1 cachem_1.0.3
## [67] cli_2.2.0.9000 generics_0.0.2 RSQLite_2.2.0
## [70] broom_0.7.0 evaluate_0.14 fastmap_1.0.1
## [73] yaml_2.2.1 knitr_1.29 bit64_4.0.2
## [76] fs_1.5.0 zip_2.1.0 xml2_1.3.2
## [79] compiler_3.6.1 rstudioapi_0.13 curl_4.3
## [82] png_0.1-7 ggsignif_0.6.0 reprex_1.0.0
## [85] geneplotter_1.64.0 stringi_1.5.3 highr_0.8
## [88] lattice_0.20-38 Matrix_1.2-17 vctrs_0.3.2
## [91] pillar_1.4.6 lifecycle_0.2.0 data.table_1.13.0
## [94] bitops_1.0-6 R6_2.5.0 latticeExtra_0.6-29
## [97] bookdown_0.21 gridExtra_2.3 rio_0.5.16
## [100] boot_1.3-23 MASS_7.3-51.4 assertthat_0.2.1
## [103] openssl_1.4.2 rprojroot_2.0.2 withr_2.2.0
## [106] GenomeInfoDbData_1.2.2 expm_0.999-5 hms_1.0.0
## [109] grid_3.6.1 rpart_4.1-15 rmarkdown_2.5
## [112] carData_3.0-4 lubridate_1.7.9 base64enc_0.1-3