source(here::here("R", "file_paths.R"))
# NOTE(review): the two rsync calls below are shell commands, not R; they are
# meant to be run from a terminal rather than sourced by R.
# Nuclear samples
# Copy the contents of the PR0199 directory to the nuclear_totalRNA_samples
# folder on the server over ssh. rsync runs in archive mode (-a), verbosely
# (-v), with compression (-z) and progress reporting; tee echoes the rsync
# output while also saving it to a dated log file.
rsync -avz --progress -e ssh /drives/d/PR0199/ username@mr_server:/home/rreynolds/data/PD_bulkRNAseq/nuclear_totalRNA_samples/ | tee /home/rhrey/Desktop/20190606_bulkNuc_data_transfer.txt
# Tissue samples
# Same transfer for the PR0198 directory, sent to the tissue_polyA_samples
# folder and logged to its own file.
rsync -avz --progress -e ssh /drives/d/PR0198/ username@mr_server:/home/rreynolds/data/PD_bulkRNAseq/tissue_polyA_samples/ | tee /home/rhrey/Desktop/20190606_tissue_data_transfer.txt
ftp
./data/RNAseq_PD/
. Files can be found within their respective project folders tissue_polyA_samples/raw_data
and nuclear_totalRNA_samples/raw_data
UMI
folder within the raw_data
folders.
source(here::here("R", "md5_checksum.R"))
# Check the integrity of the transferred FASTQ files by comparing them against
# the md5 sums listed in the md5sums_davros.txt file of each batch.

# All FASTQ files below the project data directory. The dots are escaped and
# the pattern is anchored so that only names ending in ".fastq.gz" are listed.
file_paths <- list.files(
  path = "/data/RNAseq_PD",
  full.names = TRUE,
  pattern = "\\.fastq\\.gz$",
  recursive = TRUE
)

# Read both checksum lists (space-delimited, no header row): the first column
# is taken as the md5 sum and the second as the file name.
original_md5 <-
  c(
    "/data/RNAseq_PD/nuclear_totalRNA_samples/raw_data/md5sums_davros.txt",
    "/data/RNAseq_PD/tissue_polyA_samples/raw_data/md5sums_davros.txt"
  ) %>%
  lapply(function(md5_file) {
    read_delim(file = md5_file, delim = " ", col_names = FALSE) %>%
      dplyr::mutate(original_md5 = X1, file_name = X2) %>%
      dplyr::select(-X1, -X2)
  }) %>%
  bind_rows() %>%
  dplyr::mutate(
    # Greedy match: drops everything up to and including the last "/", so only
    # the bare file name remains (no further slash-stripping is needed).
    file_name = str_replace(file_name, ".*/", ""),
    # Remove the first space left in the file name, if any.
    file_name = str_replace(file_name, " ", "")
  )

# md5_checksum() is not defined in this part of the file; presumably it comes
# from R/md5_checksum.R -- confirm its return value there.
md5 <- md5_checksum(file_paths, original_md5, column_to_join_by = "file_name")

# The output location is passed positionally because the argument is named
# `path` in older readr releases and `file` in newer ones.
write_csv(md5, "/home/rreynolds/projects/Aim2_PDsequencing_wd/results/md5_check.csv")
# Sequencing files
# Recursively list every file under QC/fastp whose name ends in ".fastq.gz".
# The dots are escaped and the pattern is anchored so that it is matched as a
# literal file extension rather than as a loose regular expression.
file_paths <- list.files(
  path = file.path(path_to_bulk_seq_data, "QC/fastp"),
  full.names = TRUE,
  pattern = "\\.fastq\\.gz$",
  recursive = TRUE
)
# Filter out "Undetermined" files and extract unique file names
samples_received <-
  file_paths %>%
  .[!str_detect(., "Undetermined")] %>%
  # basename() drops the directory for absolute and relative paths alike; the
  # previous "/.*/" regex only did so when the path started with "/".
  basename() %>%
  # Drop everything from the first "-T" onwards.
  str_remove("-T.*") %>%
  # Drop the leading run of alphanumeric characters and the "_" following it.
  str_remove("^[:alnum:]*") %>%
  str_remove("^_") %>%
  unique()
# Does this match sample names sent to sequencing?
# The first column of the tab-delimited, header-less sample sheet holds the
# names of the samples that were sent.
samples_sent <-
  file.path(path_to_raw_data, "sample_details/SamplesSentToSequencing.txt") %>%
  read_delim(delim = "\t", col_names = FALSE) %>%
  dplyr::pull(X1)

# For every sent sample whose name does not contain "BulkNuc", report whether
# it is among the samples received.
samples_sent %>%
  .[!str_detect(., "BulkNuc")] %>%
  is.element(samples_received)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE