Aim: preprocess GWASs for use with LAVA and LDSC
To run this .Rmd requires packages that are not available from CRAN/Bioconductor. These include colochelpR
and rutils
. If these are not already installed, run the code chunk below.
devtools::install_github("RHReynolds/colochelpR")
devtools::install_github("RHReynolds/rutils")
.lava.gz
in the commmon GWAS folder (for lab re-use). This will also make it easier to find their file paths later, using the list.files()
function.file_path <-
file.path(
gwas_dir,
"AD/AD_all_sum_stats/IGAP_stage_1.txt"
)
# Read in first few rows to check data format
fread(file_path, nrows = 6)
AD <-
fread(file_path) %>%
dplyr::mutate(
P = as.numeric(Pvalue),
N = 17008 + 37154
) %>%
dplyr::select(
CHR = Chromosome,
BP = Position,
SNP = MarkerName,
A1 = Effect_allele,
A2 = Non_Effect_allele,
BETA = Beta,
SE,
P,
N
)
fwrite(AD,
file =
stringr::str_c(
gwas_dir,
"AD/AD2013",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"AD2019/AD_sumstats_Jansenetal_2019sept.txt.gz"
)
# Read in first few rows to check data format
fread(file_path, nrows = 6)
Neff
to one of the accepted LAVA inputs.AD <-
fread(file_path) %>%
dplyr::select(CHR, BP, SNP, A1, A2, MAF = EAF, BETA, SE, Z, P, N = Neff)
fwrite(AD,
file =
stringr::str_c(
gwas_dir,
"AD2019/AD_sumstats_Jansenetal_2019sept",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"AD2019_Kunkle/Kunkle_etal_Stage1_results.txt.gz"
)
# Read in first few rows to check data format
fread(file_path, nrows = 6)
AD <-
fread(file_path) %>%
dplyr::mutate(
N = 21982 + 41944,
Pvalue = as.numeric(Pvalue)
) %>%
dplyr::select(
CHR = Chromosome,
BP = Position,
SNP = MarkerName,
A1 = Effect_allele,
A2 = Non_Effect_allele,
BETA = Beta,
SE,
P = Pvalue,
N
)
fwrite(AD,
file =
stringr::str_c(
gwas_dir,
"AD2019_Kunkle/AD2019.Kunkle",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"AD2021/PGCALZ2sumstatsExcluding23andMe.txt.gz"
)
# Read in first few rows to check data format
fread(file_path, nrows = 6)
AD <-
fread(file_path) %>%
dplyr::select(
CHR = chr,
BP = PosGRCh37,
A1 = testedAllele,
A2 = otherAllele,
Z = z,
P = p,
N
) %>%
dplyr::mutate(CHR = as.factor(CHR)) %>%
dplyr::filter(Z != "Inf") %>%
colochelpR::convert_loc_to_rs(
df = .,
dbsnp_144
)
# Function colochelpR::convert_loc_to_rs does not remove CHR:BP associated with more than one rs id
# Nor does it remove CHR:BP locations that do not have associated rs ids (left as NA)
# Thus, will remove NAs and remove duplicated CHR:BP
AD <-
AD %>%
dplyr::mutate(CHR_BP = stringr::str_c(CHR, ":", BP)) %>%
dplyr::group_by(CHR_BP) %>%
dplyr::filter(!any(row_number() > 1)) %>%
dplyr::ungroup() %>%
dplyr::select(-CHR_BP) %>%
tidyr::drop_na(SNP)
fwrite(AD,
file =
stringr::str_c(
gwas_dir,
"AD2021/AD2021",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"LBD2020/LBD2020.txt"
)
fread(file_path, nrows = 6)
rutils::liftover_coord()
.LBD <-
fread(file_path) %>%
dplyr::select(CHR = CHROM, BP = POS, A1, A2, MAF = A1_FREQ, BETA, SE, Z = Z_STAT, P, N = OBS_CT) %>%
dplyr::mutate(CHR = as.factor(CHR)) %>%
rutils::liftover_coord(
df = .,
path_to_chain = "/data/liftover/hg38/hg38ToHg19.over.chain"
) %>%
colochelpR::convert_loc_to_rs(
df = .,
dbsnp_144
)
# Function colochelpR::convert_loc_to_rs does not remove CHR:BP associated with more than one rs id
# Nor does it remove CHR:BP locations that do not have associated rs ids (left as NA)
# Thus, will remove NAs and remove duplicated CHR:BP
LBD <-
LBD %>%
dplyr::mutate(CHR_BP = stringr::str_c(CHR, ":", BP)) %>%
dplyr::group_by(CHR_BP) %>%
dplyr::filter(!any(row_number() > 1)) %>%
dplyr::ungroup() %>%
dplyr::select(-CHR_BP) %>%
tidyr::drop_na(SNP)
fwrite(LBD,
stringr::str_c(
gwas_dir,
"LBD2020/LBD2020_hg19_rsids",
lava_ext
),
sep = "\t"
)
n_original <- R.utils::countLines("/data/LDScore/GWAS/LBD2020/LBD2020.txt")
n_liftover <- R.utils::countLines("/data/LDScore/GWAS/LBD2020/LBD2020_hg19_rsids.lava.gz")
print(str_c("Number of lines in original file: ", n_original))
## [1] "Number of lines in original file: 7843555"
print(str_c("Number of lines in modified file: ", n_liftover))
## [1] "Number of lines in modified file: 7219240"
file_path <-
file.path(
gwas_dir,
"PD2011/PD-2011_Meta.txt"
)
fread(file_path, nrows = 6)
PD <-
fread(file_path) %>%
dplyr::select(
CHR, BP,
CHR_BP = MARKER,
A1 = Allele1,
A2 = Allele2,
MAF = Freq1,
BETA = Effect,
SE = StdErr,
P = meta_P,
) %>%
dplyr::mutate(
CHR = as.factor(CHR),
A1 = stringr::str_to_upper(A1),
A2 = stringr::str_to_upper(A2),
N = 5333 + 12019
) %>%
rutils::liftover_coord(
df = .,
path_to_chain = "/data/liftover/hg18/hg18ToHg19.over.chain"
) %>%
colochelpR::convert_loc_to_rs(
df = .,
dbsnp_144
)
# Function colochelpR::convert_loc_to_rs does not remove CHR:BP associated with more than one rs id
# Nor does it remove CHR:BP locations that do not have associated rs ids (left as NA)
# Thus, will remove NAs and remove duplicated CHR:BP
PD <-
PD %>%
dplyr::group_by(CHR_BP) %>%
dplyr::filter(!any(row_number() > 1)) %>%
dplyr::ungroup() %>%
dplyr::select(-CHR_BP) %>%
tidyr::drop_na(SNP)
fwrite(
PD,
stringr::str_c(
gwas_dir,
"PD2011/PD2011",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"PD2019_meta5_ex23andMe/PD2019_ex23andMe_hg19.txt.gz"
)
fread(file_path, nrows = 6)
PD <-
fread(file_path) %>%
dplyr::mutate(N = N_cases + N_controls) %>%
dplyr::select(CHR, BP, SNP, A1, A2, MAF = freq, BETA = b, SE = se, P = p, N)
fwrite(
PD,
stringr::str_c(
gwas_dir,
"PD2019_meta5_ex23andMe/PD2019_ex23andMe_hg19",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"PD2019_meta5_ex23andMe_exUKBB/META_no_UKBB_no_231.tbl.gz"
)
fread(file_path, nrows = 6)
Direction
column together with cohort sample numbers
+
, -
or ?
) reflect the order of the cohorts meta-analysed, which is as follows:
+
or -
indicate the variant direction of effect within a cohort; ?
indicate that that variant was not present in that dataset# Have to navigate to root project folder for script to work (as it uses here package)
cd /home/rreynolds/misc_projects/neurodegen-psych-local-corr
nohup Rscript \
/home/rreynolds/misc_projects/neurodegen-psych-local-corr/scripts/00_generate_pd_n.R \
&>/home/rreynolds/misc_projects/neurodegen-psych-local-corr/logs/00_generate_pd_n.log&
HetISq
> 80% (I^2 statistic measures heterogeneity on scale of 0-100%)munge_sumstats.py
removes any SNPs that have N < (90th percentile/1.5), unless another minimum is specified (in the filtered data, this accounts for 65.4 of the data).
--n-min 13655
N < median(N). This equates to 13655, as compared to NA (N < (90th percentile/1.5)) or 1.3846510^{4} (N < (90th percentile/2)), and accounts for 1.3 of the data.# Filtering by MAF, I^2 and number of cohorts where variant is present
PD_filtered <-
fread(file_path) %>%
dplyr::mutate(
n_cohort_present =
stringr::str_count(Direction, c("\\+|-")),
) %>%
dplyr::filter(
Freq1 >= 1/100,
HetISq <= 80,
n_cohort_present >= (2/3 * 13)
)
# Tidy and rename columns for use with LAVA
# Add RS ID
PD <-
PD_filtered %>%
dplyr::select(
CHR_BP = MarkerName,
A1 = Allele1,
A2 = Allele2,
MAF = Freq1,
BETA = Effect,
SE = StdErr,
P = `P-value`
) %>%
dplyr::inner_join(
PD_N %>%
dplyr::select(
CHR_BP = MarkerName,
N
)
) %>%
tidyr::separate(
col = CHR_BP,
sep = ":",
into = c("CHR", "BP"),
remove = FALSE
) %>%
dplyr::mutate(
CHR = as.factor(CHR),
A1 = stringr::str_to_upper(A1),
A2 = stringr::str_to_upper(A2),
) %>%
colochelpR::convert_loc_to_rs(
df = .,
dbsnp_144
)
# Function colochelpR::convert_loc_to_rs does not remove CHR:BP associated with more than one rs id
# Nor does it remove CHR:BP locations that do not have associated rs ids (left as NA)
# Thus, will remove NAs and remove duplicated CHR:BP
PD <-
PD %>%
dplyr::group_by(CHR_BP) %>%
dplyr::filter(!any(row_number() > 1)) %>%
dplyr::ungroup() %>%
dplyr::select(-CHR_BP) %>%
tidyr::drop_na(SNP)
fwrite(
PD,
stringr::str_c(
gwas_dir,
"PD2019_meta5_ex23andMe_exUKBB/PD2019.ex23andMe.exUKBB",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"BIP2021/pgc-bip2021-all.vcf.tsv.gz"
)
# First 72 lines are written text, therefore must remove these to format summary statistics
fread(file_path, skip = 72, nrow = 6)
BIP <-
fread(file_path, skip = 72) %>%
dplyr::mutate(
MAF = (FCAS + FCON)/2,
N = NCAS + NCON
) %>%
dplyr::select(
CHR = `#CHROM`,
BP = POS,
SNP = ID,
A1, A2, MAF, BETA, SE,
P = PVAL,
N
)
fwrite(
BIP,
stringr::str_c(
gwas_dir,
"BIP2021/BIP2021",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"MDD2019_ex23andMe/PGC_UKB_depression_genome-wide.txt.gz"
)
fread(file_path, nrow = 6)
MDD <-
fread(file_path) %>%
dplyr::mutate(
A1 = stringr::str_to_upper(A1),
A2 = stringr::str_to_upper(A2),
N = 170756 + 329443
) %>%
colochelpR::convert_rs_to_loc(
SNP_column = "MarkerName",
dbSNP = dbsnp_144
) %>%
tidyr::separate(
col = "loc",
into = c("CHR", "BP")
) %>%
dplyr::select(
CHR, BP,
SNP = MarkerName,
A1, A2,
MAF = Freq,
logOdds = LogOR,
SE = StdErrLogOR,
P,
N
)
fwrite(
MDD,
stringr::str_c(
gwas_dir,
"MDD2019_ex23andMe/MDD2019",
lava_ext
),
sep = "\t"
)
file_path <-
file.path(
gwas_dir,
"SCZ2018/RS_clozuk_pgc2.meta.sumstats.txt"
)
fread(file_path, nrow = 6)
SCZ <-
fread(file_path) %>%
dplyr::mutate(
N = 40675 + 64643
) %>%
dplyr::select(
CHR, BP, SNP, A1, A2,
MAF = Freq.A1,
OR, SE, P, N
)
fwrite(
SCZ,
stringr::str_c(
gwas_dir,
"SCZ2018/SCZ2018",
lava_ext
),
sep = "\t"
)
Show/hide
## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
## setting value
## version R version 4.0.5 (2021-03-31)
## os Ubuntu 16.04.7 LTS
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_GB.UTF-8
## ctype en_GB.UTF-8
## tz Europe/London
## date 2022-03-08
##
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
## ! package * version date lib source
## P assertthat 0.2.1 2019-03-21 [?] CRAN (R 4.0.5)
## P backports 1.3.0 2021-10-27 [?] CRAN (R 4.0.5)
## P Biobase 2.50.0 2020-10-27 [?] Bioconductor
## P BiocGenerics * 0.36.1 2021-04-16 [?] Bioconductor
## P BiocParallel 1.24.1 2020-11-06 [?] Bioconductor
## P Biostrings * 2.58.0 2020-10-27 [?] Bioconductor
## P bitops 1.0-7 2021-04-24 [?] CRAN (R 4.0.5)
## P broom 0.7.10 2021-10-31 [?] CRAN (R 4.0.5)
## P BSgenome * 1.58.0 2020-10-27 [?] Bioconductor
## P bslib 0.3.1 2021-10-06 [?] CRAN (R 4.0.5)
## P cellranger 1.1.0 2016-07-27 [?] CRAN (R 4.0.5)
## P cli 3.1.0 2021-10-27 [?] CRAN (R 4.0.5)
## P colochelpR * 0.99.1 2021-07-05 [?] Github (RHReynolds/colochelpR@e571b0e)
## P colorspace 2.0-2 2021-06-24 [?] CRAN (R 4.0.5)
## P crayon 1.4.2 2021-10-29 [?] CRAN (R 4.0.5)
## P data.table * 1.14.2 2021-09-27 [?] CRAN (R 4.0.5)
## P DBI 1.1.1 2021-01-15 [?] CRAN (R 4.0.5)
## P dbplyr 2.1.1 2021-04-06 [?] CRAN (R 4.0.5)
## P DelayedArray 0.16.3 2021-03-24 [?] Bioconductor
## P digest 0.6.29 2021-12-01 [?] CRAN (R 4.0.5)
## P dplyr * 1.0.7 2021-06-18 [?] CRAN (R 4.0.5)
## P ellipsis 0.3.2 2021-04-29 [?] CRAN (R 4.0.5)
## P evaluate 0.14 2019-05-28 [?] CRAN (R 4.0.5)
## P fansi 0.5.0 2021-05-25 [?] CRAN (R 4.0.5)
## P farver 2.1.0 2021-02-28 [?] CRAN (R 4.0.5)
## P fastmap 1.1.0 2021-01-25 [?] CRAN (R 4.0.5)
## P forcats * 0.5.1 2021-01-27 [?] CRAN (R 4.0.5)
## P fs 1.5.1 2021-11-30 [?] CRAN (R 4.0.5)
## P generics 0.1.1 2021-10-25 [?] CRAN (R 4.0.5)
## P GenomeInfoDb * 1.26.7 2021-04-08 [?] Bioconductor
## P GenomeInfoDbData 1.2.4 2021-07-03 [?] Bioconductor
## P GenomicAlignments 1.26.0 2020-10-27 [?] Bioconductor
## P GenomicRanges * 1.42.0 2020-10-27 [?] Bioconductor
## P ggplot2 * 3.3.5 2021-06-25 [?] CRAN (R 4.0.5)
## P glue 1.5.1 2021-11-30 [?] CRAN (R 4.0.5)
## P gtable 0.3.0 2019-03-25 [?] CRAN (R 4.0.5)
## P haven 2.4.3 2021-08-04 [?] CRAN (R 4.0.5)
## P here * 1.0.1 2020-12-13 [?] CRAN (R 4.0.5)
## P highr 0.9 2021-04-16 [?] CRAN (R 4.0.5)
## P hms 1.1.1 2021-09-26 [?] CRAN (R 4.0.5)
## P htmltools 0.5.2 2021-08-25 [?] CRAN (R 4.0.5)
## P httr 1.4.2 2020-07-20 [?] CRAN (R 4.0.5)
## P IRanges * 2.24.1 2020-12-12 [?] Bioconductor
## P jquerylib 0.1.4 2021-04-26 [?] CRAN (R 4.0.5)
## P jsonlite 1.7.2 2020-12-09 [?] CRAN (R 4.0.5)
## P knitr 1.36 2021-09-29 [?] CRAN (R 4.0.5)
## P labeling 0.4.2 2020-10-20 [?] CRAN (R 4.0.5)
## P lattice 0.20-44 2021-05-02 [?] CRAN (R 4.0.5)
## P lifecycle 1.0.1 2021-09-24 [?] CRAN (R 4.0.5)
## P lubridate 1.8.0 2021-10-07 [?] CRAN (R 4.0.5)
## P magrittr 2.0.1 2020-11-17 [?] CRAN (R 4.0.5)
## P Matrix 1.3-4 2021-06-01 [?] CRAN (R 4.0.5)
## P MatrixGenerics 1.2.1 2021-01-30 [?] Bioconductor
## P matrixStats 0.61.0 2021-09-17 [?] CRAN (R 4.0.5)
## P modelr 0.1.8 2020-05-19 [?] CRAN (R 4.0.5)
## P munsell 0.5.0 2018-06-12 [?] CRAN (R 4.0.5)
## P pillar 1.6.4 2021-10-18 [?] CRAN (R 4.0.5)
## P pkgconfig 2.0.3 2019-09-22 [?] CRAN (R 4.0.5)
## P purrr * 0.3.4 2020-04-17 [?] CRAN (R 4.0.5)
## P R.methodsS3 1.8.1 2020-08-26 [?] CRAN (R 4.0.5)
## P R.oo 1.24.0 2020-08-26 [?] CRAN (R 4.0.5)
## P R.utils 2.11.0 2021-09-26 [?] CRAN (R 4.0.5)
## P R6 2.5.1 2021-08-19 [?] CRAN (R 4.0.5)
## P Rcpp 1.0.7 2021-07-07 [?] CRAN (R 4.0.5)
## P RCurl 1.98-1.5 2021-09-17 [?] CRAN (R 4.0.5)
## P readr * 2.0.2 2021-09-27 [?] CRAN (R 4.0.5)
## P readxl 1.3.1 2019-03-13 [?] CRAN (R 4.0.5)
## P reprex 2.0.0 2021-04-02 [?] CRAN (R 4.0.5)
## P rlang 0.4.12 2021-10-18 [?] CRAN (R 4.0.5)
## P rmarkdown 2.11 2021-09-14 [?] CRAN (R 4.0.5)
## P rprojroot 2.0.2 2020-11-15 [?] CRAN (R 4.0.5)
## P Rsamtools 2.6.0 2020-10-27 [?] Bioconductor
## P rstudioapi 0.13 2020-11-12 [?] CRAN (R 4.0.5)
## P rtracklayer * 1.50.0 2020-10-27 [?] Bioconductor
## P rutils * 0.99.2 2022-02-17 [?] Github (RHReynolds/rutils@e0e65c5)
## P rvest 1.0.1 2021-07-26 [?] CRAN (R 4.0.5)
## P S4Vectors * 0.28.1 2020-12-09 [?] Bioconductor
## P sass 0.4.0 2021-05-12 [?] CRAN (R 4.0.5)
## P scales 1.1.1 2020-05-11 [?] CRAN (R 4.0.5)
## P sessioninfo * 1.1.1 2018-11-05 [?] CRAN (R 4.0.5)
## P SNPlocs.Hsapiens.dbSNP144.GRCh37 * 0.99.20 2021-09-22 [?] Bioconductor
## P stringi 1.7.6 2021-11-29 [?] CRAN (R 4.0.5)
## P stringr * 1.4.0 2019-02-10 [?] CRAN (R 4.0.5)
## P SummarizedExperiment 1.20.0 2020-10-27 [?] Bioconductor
## P tibble * 3.1.6 2021-11-07 [?] CRAN (R 4.0.5)
## P tidyr * 1.1.4 2021-09-27 [?] CRAN (R 4.0.5)
## P tidyselect 1.1.1 2021-04-30 [?] CRAN (R 4.0.5)
## P tidyverse * 1.3.1 2021-04-15 [?] CRAN (R 4.0.5)
## P tzdb 0.2.0 2021-10-27 [?] CRAN (R 4.0.5)
## P utf8 1.2.2 2021-07-24 [?] CRAN (R 4.0.5)
## P vctrs 0.3.8 2021-04-29 [?] CRAN (R 4.0.5)
## P withr 2.4.3 2021-11-30 [?] CRAN (R 4.0.5)
## P xfun 0.27 2021-10-18 [?] CRAN (R 4.0.5)
## P XML 3.99-0.8 2021-09-17 [?] CRAN (R 4.0.5)
## P xml2 1.3.2 2020-04-23 [?] CRAN (R 4.0.5)
## P XVector * 0.30.0 2020-10-27 [?] Bioconductor
## P yaml 2.2.1 2020-02-01 [?] CRAN (R 4.0.5)
## P zlibbioc 1.36.0 2020-10-27 [?] Bioconductor
##
## [1] /home/rreynolds/misc_projects/neurodegen-psych-local-corr/renv/library/R-4.0/x86_64-pc-linux-gnu
## [2] /opt/R/4.0.5/lib/R/library
##
## P ── Loaded and on-disk path mismatch.