Aim: determine whether RBD GWAS shares any common genetic causal variants with tissue-level eQTL datasets. If there are shared common genetic causal variants this will permit linking the variants within the GWAS locus to regulation of a gene's expression.
beta
) and standard error of the regression coefficient (se
) or p-value and MAF. We will be using beta
and se
(according to the original coloc paper, these are preferred over p-value and MAF).source(here::here("R", "tidy_GWAS.R"))
RBD <- fread("/data/LDScore/GWAS/RBD2020/RBD2020.txt")
# Tidy RBD GWAS
RBD_tidy <-
RBD %>%
tidy_GWAS()
# Get varbeta and check that the necessary columns for coloc are present and are the correct type (num, character)
RBD_tidy_w_varbeta <-
RBD_tidy %>%
colochelpR::get_varbeta() %>%
colochelpR::check_coloc_data_format(beta_or_pval = "beta", check_maf = F)
# Save and gzip output
write_delim(RBD_tidy_w_varbeta, path = here::here("data", "RBD_tidy_varbeta.txt"), delim = "\t")
R.utils::gzip(filename = here::here("data", "RBD_tidy_varbeta.txt"),
destname = here::here("data", "RBD_tidy_varbeta.txt.gz"),
remove = T)
beta == 0
, as this produces NaNs in coloc output.
beta == 0
.# Deal with multiallelic SNPs
# Find duplicates and choose by highest MAF
duplicates_top_maf <-
RBD_tidy_w_varbeta %>%
dplyr::filter(duplicated(SNP) | duplicated(SNP, fromLast = TRUE)) %>%
dplyr::group_by(SNP) %>%
dplyr::top_n(1, maf)
# Remove any remaining duplicates which do not differentiate on beta/p-value and maf
duplicates_top_maf <-
duplicates_top_maf %>%
dplyr::anti_join(duplicates_top_maf %>%
dplyr::filter(duplicated(SNP) | duplicated(SNP, fromLast = TRUE)))
# Above steps results in removal of all duplicate SNPs
# That is all SNP duplicates are duplicates (and not multiallelic SNPs)
nrow(duplicates_top_maf)
# Thus simply use dplyr::distinct() to find all unique entries
RBD_tidy_w_varbeta <-
RBD_tidy_w_varbeta %>%
dplyr::distinct(across(everything()))
# Finally omit any SNPs with beta = 0, as this produces NaNs in coloc output otherwise
RBD_tidy_w_varbeta <-
RBD_tidy_w_varbeta %>%
dplyr::filter(beta != 0)
# Save tidied output
write_delim(RBD_tidy_w_varbeta, path = here::here("data", "RBD_tidy_varbeta_noduplicates.txt"), delim = "\t")
R.utils::gzip(filename = here::here("data", "RBD_tidy_varbeta_noduplicates.txt"),
destname = here::here("data", "RBD_tidy_varbeta_noduplicates.txt.gz"),
remove = T)
colochelpR::get_genes_within_1Mb_of_signif_SNPs()
function. Note that we use GRCh37 gene definitions from ensembl, as the RBD GWAS has been mapped to GRCh37.# Extract all genes within +/- 1Mb of significant RBD hits
ensembl_gene_ids_overlapping_1Mb_window_hit <-
RBD_tidy_w_varbeta %>%
tidyr::separate(col = SNP, into = c("CHR", "BP"), sep = ":", remove = FALSE) %>%
dplyr::mutate(CHR = as.integer(CHR),
BP = as.integer(BP)) %>%
colochelpR::get_genes_within_1Mb_of_signif_SNPs(pvalue_column = "p.value",
CHR_column = "CHR",
BP_column = "BP",
mart = 37)
# Number of genes within +/- 1Mb of all significant hits in RBD GWAS
ensembl_gene_ids_overlapping_1Mb_window_hit <-
tibble(ensembl_id = ensembl_gene_ids_overlapping_1Mb_window_hit) %>%
colochelpR::biomart_df(.,
columnToFilter = "ensembl_id",
attributes = c("ensembl_gene_id", "hgnc_symbol",
"chromosome_name", "start_position", "end_position"),
filter = "ensembl_gene_id",
mart = 37)
saveRDS(ensembl_gene_ids_overlapping_1Mb_window_hit,
here::here("data", "ensembl_gene_ids_overlapping_1Mb_window_hit.Rds"))
MafDb.1Kgenomes.phase3.hs37d5
, which contains MAFs for a number of populations.mafdb <- MafDb.1Kgenomes.phase3.hs37d5
populations(mafdb)
## [1] "AF" "AFR_AF" "AMR_AF" "EAS_AF" "EUR_AF" "SAS_AF"
example_df <- data.frame(rs_id = c("rs12921634", "rs1476958", "rs56189750"))
# Example code
mafs <- GenomicScores::gscores(x = mafdb, ranges = example_df$rs_id %>% as.character(), pop = "EUR_AF")
mafs <-
mafs %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "SNP") %>%
dplyr::rename(maf = EUR_AF) %>%
dplyr::select(SNP, maf)
example_df <- example_df %>%
inner_join(mafs, by = c("rs_id" = "SNP"))
coloc_RBD_eQTLGen.R
nohup Rscript /home/rreynolds/misc_projects/RBD-GWAS-analysis/scripts/coloc_RBD_eQTLGen.R &>/home/rreynolds/misc_projects/RBD-GWAS-analysis/logs/coloc_RBD_eQTLGen.log&
fread("/data/psychencode/DER-08a_hg19_eQTL.significant.txt") %>%
colnames()
## [1] "gene_id" "gene_chr" "gene_start"
## [4] "gene_end" "strand" "number_of_SNPs_tested"
## [7] "SNP_distance_to_TSS" "SNP_id" "SNP_chr"
## [10] "SNP_start" "SNP_end" "nominal_pval"
## [13] "regression_slope" "top_SNP" "FDR"
coloc_RBD_psychencode.R
nohup Rscript /home/rreynolds/misc_projects/RBD-GWAS-analysis/scripts/coloc_RBD_psychencode.R &>/home/rreynolds/misc_projects/RBD-GWAS-analysis/logs/coloc_RBD_psychencode.log&
colochelpR::tidy_eQTL_psychencode()
.results_dir <-
tibble(dir = list.files(here::here("results"), recursive = T, full.names = T, pattern = ".rda") %>%
str_replace(., "/[^/]*$", "") %>%
str_replace(., "/[^/]*$", ""),
file_path = list.files(here::here("results"), recursive = T, full.names = T, pattern = ".rda"),
file_name = list.files(here::here("results"), recursive = T, full.names = F, pattern = ".rda") %>%
str_replace(., ".rda", "")) %>%
tidyr::separate(file_name, into = c("dataset", "prior_type", "gene"), sep = "/", remove = F)
# Split into liberal/robust results
dataset_dir <- setNames(results_dir %>% dplyr::group_split(prior_type),
c(results_dir %>% .[["prior_type"]] %>% unique() %>% sort()))
# Priors df
priors_df <- tibble(prior_type = c("liberal", "robust"),
p12 = c(1e-05, 5e-06))
all_results <- setNames(vector(mode = "list", length = length(dataset_dir)),
names(dataset_dir))
for(i in 1:length(dataset_dir)){
priors_df_filtered <-
priors_df %>%
dplyr::filter(prior_type == names(dataset_dir[i]))
dataset_file_paths <- dataset_dir[[i]]
dataset_names <- dataset_file_paths$dataset %>% unique()
results_list <- vector(mode = "list", length = length(dataset_names))
for(j in 1:length(dataset_names)){
dataset <- dataset_names[j]
print(dataset)
dir_to_load <-
dataset_file_paths %>%
dplyr::filter(dataset == dataset_names[j]) %>%
.[["dir"]] %>%
unique()
dir_to_load <- str_c(dir_to_load, "/", priors_df_filtered$prior_type)
for(k in 1:length(dir_to_load)){
print(dir_to_load[k])
if(dataset %in% c("eQTLGen", "psychencode")){
results <-
colochelpR::merge_coloc_summaries(dir_to_load[k], add_signif_SNP = F, recursive = T, pattern = ".rda") %>%
dplyr::select(GWAS_1, gene_2, everything(), -eQTL_dataset_2)
}
results <-
results %>%
dplyr::mutate(dataset = dataset,
p12 = priors_df_filtered$p12) %>%
dplyr::select(GWAS_1, dataset, gene_2, everything())
if(k == 1){
results_list[[j]] <- results
} else{
results_list[[j]] <-
results_list[[j]] %>%
dplyr::bind_rows(results)
}
}
}
all_results[[i]] <- results_list
}
results <-
all_results %>%
lapply(., function(x){
x %>%
qdapTools::list_df2df() %>%
biomart_df(columnToFilter = "gene_2",
mart = 37,
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filter = c("ensembl_gene_id")) %>%
dplyr::select(GWAS_1, dataset, gene_2, hgnc_symbol, everything(), -X1)
})
saveRDS(results, file = here::here("results", "coloc_summary.Rds"))
results <- readRDS(here::here("results", "coloc_summary.Rds"))
coloc <-
results %>%
qdapTools::list_df2df(., col1 = "prior_type") %>%
dplyr::mutate(hgnc_symbol = case_when(gene_2 == "ENSG00000247775" ~ "SNCA-AS1",
TRUE ~ hgnc_symbol)) %>%
dplyr::inner_join(results %>%
qdapTools::list_df2df(., col1 = "prior_type") %>%
dplyr::filter(prior_type == "liberal", PP.H4.abf >= 0.75) %>%
dplyr::distinct(dataset, gene_2))
coloc %>%
dplyr::arrange(dataset, prior_type, hgnc_symbol) %>%
dplyr::select(prior_type, p12, GWAS_1, dataset, gene_2, hgnc_symbol, everything()) %>%
DT::datatable(rownames = FALSE,
options = list(scrollX = TRUE),
class = 'white-space: nowrap')
coloc %>%
ggplot(aes(x = hgnc_symbol, y = PP.H4.abf, alpha = prior_type)) +
geom_col(position = position_dodge2(), colour = "black") +
geom_hline(yintercept = 0.75, linetype = "dashed", colour = "#63ACBE") +
geom_hline(yintercept = 0.9, linetype = "dashed", colour = "#EE442F") +
labs(x = "Gene", y = "Posterior probability of H4") +
facet_grid(cols = vars(dataset), scales = "free_x", space = "free_x") +
theme_rhr
Figure: Plot of the posterior probability of H4 (association to both traits, shared causal variant) using a liberal and robust prior probability for any random SNP in the region of interest associating with both traits. Blue and red dashed lines indicate a posterior probability of H4 equal to 0.75 and 0.90, respectively. Liberal, p12 = 1e-05; robust, p12 = 5e-06.
# Extract coloc hits
coloc_ens <-
results %>%
lapply(., function(x){x %>%
dplyr::filter(PP.H4.abf > 0.75)}) %>%
qdapTools::list_df2df() %>%
dplyr::filter(dataset == "eQTLGen") %>%
dplyr::distinct(hgnc_symbol, gene_2) %>%
dplyr::arrange(hgnc_symbol)
coloc_ens
# Loading relevant files
results_dir <-
tibble(dir = list.files(here::here("results"), recursive = T, full.names = T, pattern = ".rda") %>%
str_replace(., "/[^/]*$", "") %>%
str_replace(., "/[^/]*$", ""),
file_path = list.files(here::here("results"), recursive = T, full.names = T, pattern = ".rda"),
file_name = list.files(here::here("results"), recursive = T, full.names = F, pattern = ".rda") %>%
str_replace(., ".rda", "")) %>%
tidyr::separate(file_name, into = c("dataset", "prior_type", "gene"), sep = "/", remove = F) %>%
dplyr::mutate(gene = str_split(gene, pattern = "_") %>%
lapply(., function(x) x[str_detect(x, "ENSG")]) %>% unlist())
coloc_files <-
results_dir %>%
dplyr::inner_join(coloc_ens, by = c("gene" = "gene_2")) %>%
dplyr::filter(prior_type == "robust", dataset == "eQTLGen") %>%
dplyr::arrange(hgnc_symbol)
coloc_list <- vector(mode = "list", length = nrow(coloc_files))
for(i in 1:length(coloc_list)){
load(as.character(coloc_files$file_path[i]))
print(str_c(coloc_files$hgnc_symbol[i], " - ", coloc_files$gene[i]))
coloc::sensitivity(coloc_results_annotated, "H4 >= 0.75", plot.manhattans = T)
}
## [1] "MMRN1 - ENSG00000138722"
results %>%
qdapTools::list_df2df(col1 = "prior_type") %>%
dplyr::filter(dataset == "eQTLGen", hgnc_symbol == "SNCA")
# Extract overlaps between GWAS and eQTL datasets and join both datasets
# Only do this for colocalisations of interest
eQTL_path <- "/data/eQTLGen/2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz"
eQTL <- fread(eQTL_path)
# Tidy eQTL data
eQTL_tidy_filtered <-
eQTL %>%
dplyr::filter(Gene %in% c(coloc_ens$gene_2, "ENSG00000145335", "ENSG00000138760")) %>%
colochelpR::tidy_eQTL_eQTLGen() %>%
colochelpR::check_coloc_data_format(beta_or_pval = "pval", check_maf = F) %>%
dplyr::distinct(gene, SNP, .keep_all = TRUE)
# Find overlapping regions between GWAS and eQTLGen
eQTL_RBD_GWAS_joined <-
setNames(eQTL_tidy_filtered %>%
group_split(gene),
eQTL_tidy_filtered %>%
.[["gene"]] %>%
unique() %>%
sort()) %>%
lapply(., function(eQTL_gene_df){
colochelpR::join_coloc_datasets(df1 = RBD_tidy_w_varbeta %>%
dplyr::filter(!duplicated(SNP)),
df2 = eQTL_gene_df,
harmonise = F)
})
# Save as Rds
saveRDS(eQTL_RBD_GWAS_joined, here::here("results", "eQTLGen", "RBD_eQTLGen_overlap.Rds"))
eQTL_RBD_GWAS_joined <- readRDS(here::here("results", "eQTLGen", "RBD_eQTLGen_overlap.Rds"))
# Format data for plotting
eQTL_RBD_GWAS_to_plot <-
eQTL_RBD_GWAS_joined %>%
qdapTools::list_df2df() %>%
dplyr::select(SNP, gene = gene_2, pvalue_gwas = p.value_1, pvalue_eqtl = p.value_2) %>%
dplyr::inner_join(results$liberal %>%
dplyr::filter(dataset == "eQTLGen") %>%
dplyr::select(gene = gene_2, hgnc_symbol)) %>%
tidyr::gather(key = "Dataset", value = "p.value", -SNP, -gene, -hgnc_symbol) %>%
tidyr::separate(col = "SNP", into = c("chr", "pos")) %>%
dplyr::mutate(pos_mb = as.numeric(pos)/1000000,
p.value = as.numeric(p.value),
log_pval = -log10(p.value))
# P-value correlations
# Spread data for scatter plot
eQTL_RBD_GWAS_corr <-
eQTL_RBD_GWAS_to_plot %>%
dplyr::select(-p.value) %>%
tidyr::spread(key = Dataset, value = log_pval)
eQTL_RBD_GWAS_corr %>%
dplyr::filter(gene %in% c(coloc_ens$gene_2, "ENSG00000145335")) %>%
ggplot(aes(x = pvalue_eqtl, y = pvalue_gwas)) +
geom_point(size = 1, alpha = 0.5) +
geom_hline(yintercept = -log10(5*10^-8), linetype = "dashed") +
geom_vline(xintercept = -log10(5*10^-8), linetype = "dashed") +
ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho", colour = "black", size = 4) +
facet_wrap(vars(hgnc_symbol), nrow = 2, scales = "free") +
labs(x = "-log10(eQTL p-value)", y = "-log10(GWAS p-value)") +
theme_pubr()
Figure: Scatter plot of GWAS and eQTL p-values in the regions surrounding MMRN1 and SNCA. Spearman's rho (R) and associated p-value (p) are displayed in the plot. Dashed lines indicated genome-wide significance.
coloc_ens <-
results %>%
lapply(., function(x){x %>%
dplyr::filter(PP.H4.abf > 0.75)}) %>%
qdapTools::list_df2df() %>%
dplyr::filter(dataset == "psychencode") %>%
dplyr::distinct(hgnc_symbol, gene_2) %>%
dplyr::mutate(hgnc_symbol = case_when(gene_2 == "ENSG00000247775" ~ "SNCA-AS1",
TRUE ~ hgnc_symbol)) %>%
dplyr::arrange(hgnc_symbol)
coloc_ens
coloc_files <-
results_dir %>%
dplyr::inner_join(coloc_ens, by = c("gene" = "gene_2")) %>%
dplyr::filter(prior_type == "robust", dataset == "psychencode") %>%
dplyr::arrange(hgnc_symbol)
coloc_list <- vector(mode = "list", length = nrow(coloc_files))
for(i in 1:length(coloc_list)){
load(as.character(coloc_files$file_path[i]))
print(str_c(coloc_files$hgnc_symbol[i], " - ", coloc_files$gene[i]))
coloc::sensitivity(coloc_results_annotated, "H4 >= 0.75", plot.manhattans = T)
}
## [1] "DLG5 - ENSG00000151208"
## [1] "SNCA-AS1 - ENSG00000247775"
results %>%
qdapTools::list_df2df(col1 = "prior_type") %>%
dplyr::filter(dataset == "psychencode", hgnc_symbol %in% c("MMRN1", "SNCA"))
# Extract overlaps between GWAS and eQTL datasets and join both datasets
eQTL_path <- "/data/psychencode/Full_hg19_cis-eQTL.txt.gz"
eQTL <- fread(eQTL_path)
SNP_info <- fread("/data/psychencode/SNP_Information_Table_with_Alleles.txt")
# Assign column names using another psychencode-derived file
colnames(eQTL) <- colnames(fread("/data/psychencode/DER-08a_hg19_eQTL.significant.txt"))[!str_detect(colnames(fread("/data/psychencode/DER-08a_hg19_eQTL.significant.txt")), "FDR")]
# Filter eQTLs for gene, tidy eQTLs
eQTL_tidy_filtered <-
eQTL %>%
colochelpR::tidy_eQTL_psychencode(., psychencode_SNP_info = SNP_info, add_colnames = F) %>%
dplyr::filter(gene %in% c(coloc_ens$gene_2, "ENSG00000145335", "ENSG00000138722", "ENSG00000138760")) %>%
colochelpR::check_coloc_data_format(beta_or_pval = "pval", check_maf = F) %>%
dplyr::distinct(gene, SNP, .keep_all = TRUE)
# Find overlapping regions between GWAS and eQTLGen
eQTL_RBD_GWAS_joined <-
setNames(eQTL_tidy_filtered %>%
group_split(gene),
eQTL_tidy_filtered %>%
.[["gene"]] %>%
unique() %>%
sort()) %>%
lapply(., function(eQTL_gene_df){
colochelpR::join_coloc_datasets(df1 = RBD_tidy_w_varbeta %>%
dplyr::filter(!duplicated(SNP)),
df2 = eQTL_gene_df,
harmonise = F)
})
# Save as Rds
saveRDS(eQTL_RBD_GWAS_joined, here::here("results", "psychencode", "RBD_psychencode_overlap.Rds"))
# Shared SNPs between GWAS/eQTL
eQTL_RBD_GWAS_joined <- readRDS(here::here("results", "psychencode", "RBD_psychencode_overlap.Rds"))
# Format data for plotting
eQTL_RBD_GWAS_to_plot <-
eQTL_RBD_GWAS_joined %>%
qdapTools::list_df2df() %>%
dplyr::select(SNP, gene = gene_2, pvalue_gwas = p.value_1, pvalue_eqtl = p.value_2) %>%
dplyr::inner_join(results$liberal %>%
dplyr::filter(dataset == "psychencode") %>%
dplyr::select(gene = gene_2, hgnc_symbol)) %>%
tidyr::gather(key = "Dataset", value = "p.value", -SNP, -gene, -hgnc_symbol) %>%
tidyr::separate(col = "SNP", into = c("chr", "pos")) %>%
dplyr::mutate(pos_mb = as.numeric(pos)/1000000,
p.value = as.numeric(p.value),
log_pval = -log10(p.value),
hgnc_symbol = case_when(gene == "ENSG00000247775" ~ "SNCA-AS1",
TRUE ~ hgnc_symbol))
# Spread data for scatter plot
eQTL_RBD_GWAS_corr <-
eQTL_RBD_GWAS_to_plot %>%
dplyr::select(-p.value) %>%
tidyr::spread(key = Dataset, value = log_pval)
eQTL_RBD_GWAS_corr %>%
dplyr::filter(gene %in% c("ENSG00000145335", "ENSG00000138722", "ENSG00000247775")) %>%
ggplot(aes(x = pvalue_eqtl, y = pvalue_gwas)) +
geom_point(size = 1, alpha = 0.5) +
geom_hline(yintercept = -log10(5*10^-8), linetype = "dashed") +
geom_vline(xintercept = -log10(5*10^-8), linetype = "dashed") +
ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho", colour = "black", size = 4) +
facet_wrap(vars(hgnc_symbol), nrow = 2, scales = "free") +
labs(x = "-log10(eQTL p-value)", y = "-log10(GWAS p-value)") +
theme_pubr()
Figure: Scatter plot of GWAS and eQTL p-values in the regions surrounding MMRN1, SNCA and SNCA-AS1. Spearman's rho (R) and associated p-value (p) are displayed in the plot. Dashed lines indicate genome-wide significance.
As we have betas for both the GWAS and the Psychencode dataset, we can make some initial attempt to determine what the direction of effect at the locus might be.
SNP_info <- fread("/data/psychencode/SNP_Information_Table_with_Alleles.txt")
SNP_info %>% dplyr::filter(PEC_id == "16:11417802")
same_effect_and_alt_allele
will be assigned the value TRUE.directionality_check <-
RBD_tidy_w_varbeta %>%
dplyr::select(SNP, Al1, Al2) %>%
dplyr::filter(SNP %in% eQTL_RBD_GWAS_joined$ENSG00000247775$SNP) %>%
dplyr::inner_join(SNP_info %>%
dplyr::filter(PEC_id %in% eQTL_RBD_GWAS_joined$ENSG00000247775$SNP) %>%
dplyr::select(SNP = PEC_id, psychencode_ALT = ALT, psychencode_REF = REF)) %>%
dplyr::mutate(same_effect_and_alt_allele = case_when(Al1 == psychencode_ALT & Al2 == psychencode_REF ~ TRUE,
TRUE ~ FALSE))
directionality_check %>%
dplyr::group_by(same_effect_and_alt_allele) %>%
dplyr::count()
effect_and_alt_swapped
:
directionality_check <-
directionality_check %>%
dplyr::mutate(effect_and_alt_swapped = case_when(same_effect_and_alt_allele == FALSE & Al1 == psychencode_REF & Al2 == psychencode_ALT ~ TRUE,
TRUE ~ FALSE))
directionality_check %>%
dplyr::filter(same_effect_and_alt_allele == FALSE) %>%
dplyr::group_by(effect_and_alt_swapped) %>%
dplyr::count()
SNPs_to_exclude <- directionality_check %>%
dplyr::filter(same_effect_and_alt_allele == FALSE, effect_and_alt_swapped == FALSE) %>%
.[["SNP"]]
SNPs_to_swap <- directionality_check %>%
dplyr::filter(same_effect_and_alt_allele == FALSE, effect_and_alt_swapped == TRUE) %>%
.[["SNP"]]
harmonised_alleles <-
eQTL_RBD_GWAS_joined$ENSG00000247775 %>%
dplyr::filter(!SNP %in% SNPs_to_exclude) %>%
dplyr::mutate(beta_2 = case_when(SNP %in% SNPs_to_swap ~ (-beta_2),
TRUE ~ beta_2),
Al1_2 = case_when(SNP %in% SNPs_to_swap ~ Al1_1,
TRUE ~ Al1_2),
Al2_2 = case_when(SNP %in% SNPs_to_swap ~ Al2_1,
TRUE ~ Al2_2))
# saveRDS(harmonised_alleles, here::here("results", "psychencode", "RBD_SNCAAS1_beta_harmonised.Rds"))
data_to_plot <-
harmonised_alleles %>%
tidyr::gather(key = "statistic", value = "value", beta_1, p.value_1, beta_2, p.value_2) %>%
dplyr::select(SNP, statistic, value) %>%
dplyr::mutate(dataset = case_when(str_detect(statistic, "_1") ~ "RBD GWAS",
str_detect(statistic, "_2") ~ "PsychENCODE eQTL"),
statistic = str_replace(statistic, "_.*", "")) %>%
tidyr::spread(key = statistic, value = value) %>%
dplyr::mutate(significant_GWAS_SNP = case_when(SNP == "4:90755909" ~ TRUE,
TRUE ~ FALSE))
data_to_plot %>%
ggplot(aes(x = beta, y = -log10(p.value))) +
geom_point(data = subset(data_to_plot, significant_GWAS_SNP == FALSE), colour = "#888888",
alpha = 0.5) +
geom_point(data = subset(data_to_plot, significant_GWAS_SNP == TRUE), size = 2, colour = "#EE442F",
alpha = 1) +
geom_hline(yintercept = -log10(5*10^-8), linetype = "dashed") +
facet_wrap(~ dataset) +
labs(x = "beta", y = "-log10(p-value)") +
theme_pubr()
Figure: Scatter plot of beta vs -log10(p-value) in eQTL and GWAS datasets. Top GWAS SNP is highlighted with a red fill in both datasets.
harmonised_alleles %>%
dplyr::mutate(genome_wide_signif = case_when(p.value_1 < 5e-8 | p.value_2 < 5e-8 ~ TRUE,
TRUE ~ FALSE)) %>%
tidyr::gather(key = "statistic", value = "value", beta_1, p.value_1, beta_2, p.value_2) %>%
dplyr::select(SNP, statistic, value, genome_wide_signif) %>%
dplyr::mutate(dataset = case_when(str_detect(statistic, "_1") ~ "RBD GWAS",
str_detect(statistic, "_2") ~ "PsychENCODE eQTL"),
statistic = str_replace(statistic, "_.*", "")) %>%
dplyr::filter(statistic == "beta") %>%
dplyr::select(-statistic) %>%
tidyr::spread(key = "dataset", value = "value") %>%
ggplot(aes(x = `PsychENCODE eQTL`, y = `RBD GWAS`)) +
geom_point(aes(colour = genome_wide_signif), alpha = 0.5) +
# geom_density2d(colour = "black", alpha = 0.8) +
geom_smooth(method = "lm", level = 0.99, colour = "black", fill = "#EE442F") +
labs(x = "beta - pscyhENCODE eQTL", y = "beta - RBD") +
scale_colour_manual(values = c("#888888", "#EE442F")) +
theme_pubr()
Figure: Plot of beta coefficients in RBD GWAS and PsychENCODE for shared SNPs. Shared SNPs that are genome-wide significant in either the LBD GWAS or in Psychencode are highlighted with a red fill. The black line represents a linear model fitted for the two beta variables, with the 99% confidence interval indicated with a red fill.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] forcats_0.5.0 stringr_1.4.0
## [3] dplyr_1.0.2 purrr_0.3.4
## [5] readr_1.3.1 tidyr_1.1.1
## [7] tibble_3.0.3 tidyverse_1.3.0
## [9] MafDb.1Kgenomes.phase3.hs37d5_3.10.0 GenomicScores_1.10.0
## [11] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [13] IRanges_2.20.2 S4Vectors_0.24.4
## [15] BiocGenerics_0.32.0 here_1.0.0
## [17] ggpubr_0.4.0 ggplot2_3.3.2
## [19] data.table_1.13.0 colochelpR_0.99.0
## [21] coloc_4.0-4
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8
## [3] AnnotationHub_2.18.0 qdapTools_1.3.5
## [5] BiocFileCache_1.10.2 splines_3.6.1
## [7] crosstalk_1.1.0.1 BiocParallel_1.20.1
## [9] inline_0.3.15 digest_0.6.25
## [11] htmltools_0.5.1.1 viridis_0.5.1
## [13] magrittr_1.5 memoise_1.1.0
## [15] BSgenome_1.54.0 openxlsx_4.2.3
## [17] Biostrings_2.54.0 modelr_0.1.8
## [19] matrixStats_0.56.0 R.utils_2.9.2
## [21] colorspace_1.4-1 blob_1.2.1
## [23] rvest_0.3.6 rappdirs_0.3.1
## [25] rrcov_1.5-5 haven_2.3.1
## [27] xfun_0.16 crayon_1.3.4
## [29] RCurl_1.98-1.2 jsonlite_1.7.1
## [31] survival_3.1-12 glue_1.4.2
## [33] gtable_0.3.0 zlibbioc_1.32.0
## [35] XVector_0.26.0 DelayedArray_0.12.3
## [37] car_3.0-9 DEoptimR_1.0-8
## [39] abind_1.4-5 scales_1.1.1
## [41] mvtnorm_1.1-1 DBI_1.1.0
## [43] rstatix_0.6.0 Rcpp_1.0.5
## [45] viridisLite_0.3.0 xtable_1.8-4
## [47] foreign_0.8-72 bit_4.0.4
## [49] DT_0.15 htmlwidgets_1.5.1
## [51] httr_1.4.2 ellipsis_0.3.1
## [53] farver_2.0.3 pkgconfig_2.0.3
## [55] XML_3.99-0.3 R.methodsS3_1.8.0
## [57] dbplyr_1.4.4 labeling_0.3
## [59] tidyselect_1.1.0 rlang_0.4.7
## [61] later_1.1.0.1 AnnotationDbi_1.48.0
## [63] munsell_0.5.0 BiocVersion_3.10.1
## [65] cellranger_1.1.0 tools_3.6.1
## [67] cli_2.2.0.9000 generics_0.0.2
## [69] RSQLite_2.2.0 broom_0.7.0
## [71] evaluate_0.14 fastmap_1.0.1
## [73] yaml_2.2.1 knitr_1.29
## [75] bit64_4.0.2 fs_1.5.0
## [77] zip_2.1.0 robustbase_0.93-6
## [79] nlme_3.1-141 mime_0.9
## [81] R.oo_1.23.0 leaps_3.1
## [83] xml2_1.3.2 compiler_3.6.1
## [85] rstudioapi_0.11 curl_4.3
## [87] interactiveDisplayBase_1.24.0 ggsignif_0.6.0
## [89] reprex_0.3.0 pcaPP_1.9-73
## [91] stringi_1.4.6 highr_0.8
## [93] lattice_0.20-38 Matrix_1.2-17
## [95] vctrs_0.3.2 pillar_1.4.6
## [97] lifecycle_0.2.0 BiocManager_1.30.10
## [99] snpStats_1.36.0 bitops_1.0-6
## [101] httpuv_1.5.4 rtracklayer_1.46.0
## [103] R6_2.4.1 promises_1.1.1
## [105] gridExtra_2.3 rio_0.5.16
## [107] assertthat_0.2.1 chron_2.3-56
## [109] SummarizedExperiment_1.16.1 BMA_3.18.12
## [111] rprojroot_2.0.2 withr_2.2.0
## [113] GenomicAlignments_1.22.1 Rsamtools_2.2.3
## [115] GenomeInfoDbData_1.2.2 mgcv_1.8-29
## [117] hms_0.5.3 grid_3.6.1
## [119] rmarkdown_2.5 carData_3.0-4
## [121] Biobase_2.46.0 shiny_1.5.0
## [123] lubridate_1.7.9