Use distance from the lead significant SNP to estimate independent loci in GWAS summary statistics. Uses -log10(p) derived from BETA/SE, so does not need P as input.
Usage
get_loci(
gwas,
detect_headers = TRUE,
snp_col = "SNP",
chr_col = "CHR",
pos_col = "BP",
maf_col = "MAF",
beta_col = "BETA",
se_col = "SE",
stat_col = "NA",
p_col = "NA",
neglog10p_col = "NA",
use_pvalue = FALSE,
n_bases = 5e+05,
p_threshold = 5e-08,
single_hla_locus = FALSE,
hla_pos = c(2.5e+07, 3.4e+07),
ld_clump = FALSE,
ld_clump_r2 = 0.01,
ld_clump_local = TRUE,
ld_plink_bin = "plink",
ld_bfile =
"/indy/ukbiobank/data_14631/genetics/imputed_500k/5k_eur/ukb_imp_v3.qc_sub.5k_eur",
verbose = FALSE,
exclude_hla = lifecycle::deprecated(),
get_ld_indep = lifecycle::deprecated(),
ld_pruning_r2 = lifecycle::deprecated()
)
Arguments
- gwas
A data.frame. Contains the GWAS summary statistics.
- detect_headers
Logical. Default=TRUE. Search input headers to see if the input is from BOLT-LMM, SAIGE, REGENIE, or the GWAS Catalog (the user therefore doesn't need to provide the column names). If BOLT-LMM then automatically use p-value instead of SE.
- snp_col
A string. Default="SNP". The RSID/variantID column name.
- chr_col
A string. Default="CHR". The chromosome column name.
- pos_col
A string. Default="BP". The base pair/position column name.
- maf_col
A string. Default="MAF". The MAF/minor-allele-frequency column name.
- beta_col
A string. Default="BETA". The BETA column name.
- se_col
A string. Default="SE". The SE column name.
- stat_col
A string. Default="NA". The test statistic column name. (Only required if not providing beta+se, or neglog10p)
- p_col
A string. Default="NA". The p-value column name. (Only required if `use_pvalue==TRUE` i.e., using the provided p-value, e.g., P_BOLT_LMM)
- neglog10p_col
A string. Default="NA". The -log10 p-value column name. (Only required if not providing beta+se, or stat)
- use_pvalue
Logical. Default=FALSE. Use the provided p-value (in `p_col`) rather than computing from the test statistic? Useful for BOLT-LMM output
- n_bases
An integer. Default=5e5. The distance (in bases) between two significant variants beyond which they are defined as belonging to separate loci.
- p_threshold
A number. Default=5e-8. P-value threshold for statistical significance
- single_hla_locus
Logical. Default=FALSE. Treat HLA as one continuous locus. [Previously called `exclude_hla`]
- hla_pos
A numeric vector of length 2. Default=c(25e6, 34e6). The HLA region on chromosome 6 to treat as one continuous locus if `single_hla_locus==TRUE`
- ld_clump
Logical. Default=FALSE. Use Plink LD clumping to identify independent SNPs - see ieugwasr::ld_clump() docs. [Previously called `get_ld_indep`]
- ld_clump_r2
Numeric. Default=0.01. Pruning threshold for LD. [Previously called `ld_pruning_r2`]
- ld_clump_local
Logical. Default=TRUE. If clumping using local installation (rather than IEU API) - see ieugwasr::ld_clump() docs
- ld_plink_bin
A string. Default="plink". Path to Plink v1.90 binary
- ld_bfile
A string. The default points to a reference panel of 5,000 random unrelated UK Biobank Europeans on the author's server :) so you will need to provide a path to appropriate BIM/BED reference panel files on your own server.
- verbose
Logical. Default=FALSE. Be verbose
Examples
# distance-based loci
gwas_loci = get_loci(gwas_example)
#>
#> Locus size (bases) = 5e+05
#> P-value threshold = 5e-08
#>
#> N variants = 319732
#> N variants p<threshold = 4132
#> N loci = 15
head(gwas_loci)
#> SNP CHR BP A1 A2 MAF BETA SE P
#> 57882 rs12046439 1 107536799 T C 0.248 0.00997159 0.00170546 5.01e-09
#> 57900 rs143849791 1 107537916 CATG C 0.325 0.01283200 0.00164361 5.85e-15
#> 57922 rs113329442 1 107539252 A G 0.330 0.01109240 0.00149706 1.27e-13
#> 57987 rs3861909 1 107544176 G A 0.327 0.01187220 0.00150837 3.52e-15
#> 58025 rs17496332 1 107546375 A G 0.331 0.01110260 0.00148844 8.70e-14
#> 58091 rs2878349 1 107549245 G A 0.327 0.01182020 0.00149200 2.33e-15
#> locus lead
#> 57882 1 FALSE
#> 57900 1 FALSE
#> 57922 1 FALSE
#> 57987 1 FALSE
#> 58025 1 FALSE
#> 58091 1 FALSE
head(gwas_loci[ gwas_loci$lead==TRUE , ])
#> SNP CHR BP A1 A2 MAF BETA SE P
#> 58431 rs111232683 1 107566149 G C 0.34300 0.01352040 0.00161401 5.43e-17
#> 75873 rs114254196 1 108635400 C T 0.00848 -0.04481140 0.00818473 4.38e-08
#> 83709 rs115292790 1 109310728 G A 0.01360 -0.05639270 0.00608890 2.01e-20
#> 91666 rs12740374 1 109817590 G T 0.21900 -0.14822800 0.00166391 4.73e-305
#> 98740 rs140266316 1 110326545 G A 0.01630 -0.05770880 0.00597770 4.73e-22
#> 120461 rs657801 1 111736389 T C 0.31500 0.00905412 0.00150713 1.88e-09
#> locus lead
#> 58431 1 TRUE
#> 75873 2 TRUE
#> 83709 3 TRUE
#> 91666 4 TRUE
#> 98740 5 TRUE
#> 120461 6 TRUE
# clump loci using Plink LD pruning
gwas_loci = get_loci(gwas_example, ld_clump=TRUE)
#>
#> Locus size (bases) = 5e+05
#> P-value threshold = 5e-08
#>
#> ** Performing LD clumping. Can take a few minutes
#> ** Local Plink installation will be called -- output appears in the terminal screen
#> N variants = 319732
#> N variants p<threshold = 4132
#> N loci = 15
#> N independent variants (LD R2 threshold 0.01) = 153
head(gwas_loci)
#> SNP CHR BP A1 A2 MAF BETA SE P
#> 57882 rs12046439 1 107536799 T C 0.248 0.00997159 0.00170546 5.01e-09
#> 57900 rs143849791 1 107537916 CATG C 0.325 0.01283200 0.00164361 5.85e-15
#> 57922 rs113329442 1 107539252 A G 0.330 0.01109240 0.00149706 1.27e-13
#> 57987 rs3861909 1 107544176 G A 0.327 0.01187220 0.00150837 3.52e-15
#> 58025 rs17496332 1 107546375 A G 0.331 0.01110260 0.00148844 8.70e-14
#> 58091 rs2878349 1 107549245 G A 0.327 0.01182020 0.00149200 2.33e-15
#> locus lead lead_dist lead_ld
#> 57882 1 FALSE FALSE FALSE
#> 57900 1 FALSE FALSE FALSE
#> 57922 1 FALSE FALSE FALSE
#> 57987 1 FALSE FALSE FALSE
#> 58025 1 FALSE FALSE FALSE
#> 58091 1 FALSE FALSE FALSE
head(gwas_loci[ gwas_loci$lead==TRUE , ])
#> SNP CHR BP A1 A2 MAF BETA SE P
#> 58431 rs111232683 1 107566149 G C 0.34300 0.0135204 0.00161401 5.43e-17
#> 75873 rs114254196 1 108635400 C T 0.00848 -0.0448114 0.00818473 4.38e-08
#> 79333 rs140300970 1 109020060 A T 0.02240 -0.0278210 0.00496721 2.13e-08
#> 81599 rs148503795 1 109166178 C G 0.01050 -0.0423267 0.00709770 2.47e-09
#> 81618 rs74896173 1 109167705 T C 0.00914 -0.0429793 0.00766819 2.08e-08
#> 82661 rs111751551 1 109242056 G A 0.01010 -0.0505560 0.00697787 4.32e-13
#> locus lead lead_dist lead_ld
#> 58431 1 TRUE TRUE TRUE
#> 75873 2 TRUE TRUE TRUE
#> 79333 3 TRUE FALSE TRUE
#> 81599 3 TRUE FALSE TRUE
#> 81618 3 TRUE FALSE TRUE
#> 82661 3 TRUE FALSE TRUE