Skip to contents

Use distance from lead significant SNP to estimate independent loci in GWAS summary stats. Uses -log10(p) derived from BETA/SE so does not need P as input.

Usage

get_loci(
  gwas,
  detect_headers = TRUE,
  snp_col = "SNP",
  chr_col = "CHR",
  pos_col = "BP",
  maf_col = "MAF",
  beta_col = "BETA",
  se_col = "SE",
  stat_col = "NA",
  p_col = "NA",
  neglog10p_col = "NA",
  use_pvalue = FALSE,
  n_bases = 5e+05,
  p_threshold = 5e-08,
  single_hla_locus = FALSE,
  hla_pos = c(2.5e+07, 3.4e+07),
  ld_clump = FALSE,
  ld_clump_r2 = 0.01,
  ld_clump_local = TRUE,
  ld_plink_bin = "plink",
  ld_bfile =
    "/indy/ukbiobank/data_14631/genetics/imputed_500k/5k_eur/ukb_imp_v3.qc_sub.5k_eur",
  verbose = FALSE,
  exclude_hla = lifecycle::deprecated(),
  get_ld_indep = lifecycle::deprecated(),
  ld_pruning_r2 = lifecycle::deprecated()
)

Arguments

gwas

A data.frame. Contains the GWAS summary statistics.

detect_headers

Logical. Default=TRUE. Search input headers to see if the input is from BOLT-LMM, SAIGE, REGENIE, or GWAS CATALOG (the user therefore doesn't need to provide the column names). If BOLT-LMM then automatically use p-value instead of SE.

snp_col

A string. Default="SNP". The RSID/variantID column name.

chr_col

A string. Default="CHR". The chromosome column name.

pos_col

A string. Default="BP". The base pair/position column name.

maf_col

A string. Default="MAF". The MAF/minor-allele-frequency column name.

beta_col

A string. Default="BETA". The BETA column name.

se_col

A string. Default="SE". The SE column name.

stat_col

A string. Default="NA". The test statistic column name. (Only required if not providing beta+se, or neglog10p)

p_col

A string. Default="NA". The p-value column name. (Only required if `use_pvalue==TRUE` i.e., using the provided p-value, e.g., P_BOLT_LMM)

neglog10p_col

A string. Default="NA". The -log10 p-value column name. (Only required if not providing beta+se, or stat)

use_pvalue

Logical. Default=FALSE. Use the provided p-value (in `p_col`) rather than computing from the test statistic? Useful for BOLT-LMM output

n_bases

An integer. Default=5e5. The distance between two significant loci, beyond which they are defined as being separate loci.

p_threshold

A number. Default=5e-8. P-value threshold for statistical significance

single_hla_locus

Logical. Default=FALSE. Treat HLA as one continuous locus. [Previously called `exclude_hla`]

hla_pos

A numeric vector of length 2. Default=c(25e6, 34e6). The HLA region on chromosome 6 to treat as one continuous locus if `single_hla_locus==TRUE`

ld_clump

Logical. Default=FALSE. Use Plink LD clumping to identify independent SNPs - see ieugwasr::ld_clump() docs. [Previously called `get_ld_indep`]

ld_clump_r2

Numeric. Default=0.01. Pruning threshold for LD. [Previously called `ld_pruning_r2`]

ld_clump_local

Logical. Default=TRUE. Clump using a local Plink installation (rather than the IEU API) - see ieugwasr::ld_clump() docs

ld_plink_bin

A string. Default="plink". Path to Plink v1.90 binary

ld_bfile

A string. Default is the path to a reference panel of 5,000 random unrelated UK Biobank Europeans on my server :) You need to provide a path to appropriate BIM/BED reference panel files on your server

verbose

Logical. Default=FALSE. Be verbose

Value

Returns a data frame of significant SNPs with locus annotation

Author

Luke Pilling

Examples


# distance-based loci
gwas_loci = get_loci(gwas_example)
#> 
#> Locus size (bases) = 5e+05
#> P-value threshold = 5e-08
#> 
#> N variants = 319732
#> N variants p<threshold = 4132
#> N loci = 15

head(gwas_loci)
#>               SNP CHR        BP   A1 A2   MAF       BETA         SE        P
#> 57882  rs12046439   1 107536799    T  C 0.248 0.00997159 0.00170546 5.01e-09
#> 57900 rs143849791   1 107537916 CATG  C 0.325 0.01283200 0.00164361 5.85e-15
#> 57922 rs113329442   1 107539252    A  G 0.330 0.01109240 0.00149706 1.27e-13
#> 57987   rs3861909   1 107544176    G  A 0.327 0.01187220 0.00150837 3.52e-15
#> 58025  rs17496332   1 107546375    A  G 0.331 0.01110260 0.00148844 8.70e-14
#> 58091   rs2878349   1 107549245    G  A 0.327 0.01182020 0.00149200 2.33e-15
#>       locus  lead
#> 57882     1 FALSE
#> 57900     1 FALSE
#> 57922     1 FALSE
#> 57987     1 FALSE
#> 58025     1 FALSE
#> 58091     1 FALSE

head(gwas_loci[ gwas_loci$lead==TRUE , ])
#>                SNP CHR        BP A1 A2     MAF        BETA         SE         P
#> 58431  rs111232683   1 107566149  G  C 0.34300  0.01352040 0.00161401  5.43e-17
#> 75873  rs114254196   1 108635400  C  T 0.00848 -0.04481140 0.00818473  4.38e-08
#> 83709  rs115292790   1 109310728  G  A 0.01360 -0.05639270 0.00608890  2.01e-20
#> 91666   rs12740374   1 109817590  G  T 0.21900 -0.14822800 0.00166391 4.73e-305
#> 98740  rs140266316   1 110326545  G  A 0.01630 -0.05770880 0.00597770  4.73e-22
#> 120461    rs657801   1 111736389  T  C 0.31500  0.00905412 0.00150713  1.88e-09
#>        locus lead
#> 58431      1 TRUE
#> 75873      2 TRUE
#> 83709      3 TRUE
#> 91666      4 TRUE
#> 98740      5 TRUE
#> 120461     6 TRUE

# clump loci using Plink LD pruning
gwas_loci = get_loci(gwas_example, ld_clump=TRUE)
#> 
#> Locus size (bases) = 5e+05
#> P-value threshold = 5e-08
#> 
#> ** Performing LD clumping. Can take a few minutes
#> ** Local Plink installation will be called -- output appears in the terminal screen
#> N variants = 319732
#> N variants p<threshold = 4132
#> N loci = 15
#> N independent variants (LD R2 threshold 0.01) = 153

head(gwas_loci)
#>               SNP CHR        BP   A1 A2   MAF       BETA         SE        P
#> 57882  rs12046439   1 107536799    T  C 0.248 0.00997159 0.00170546 5.01e-09
#> 57900 rs143849791   1 107537916 CATG  C 0.325 0.01283200 0.00164361 5.85e-15
#> 57922 rs113329442   1 107539252    A  G 0.330 0.01109240 0.00149706 1.27e-13
#> 57987   rs3861909   1 107544176    G  A 0.327 0.01187220 0.00150837 3.52e-15
#> 58025  rs17496332   1 107546375    A  G 0.331 0.01110260 0.00148844 8.70e-14
#> 58091   rs2878349   1 107549245    G  A 0.327 0.01182020 0.00149200 2.33e-15
#>       locus  lead lead_dist lead_ld
#> 57882     1 FALSE     FALSE   FALSE
#> 57900     1 FALSE     FALSE   FALSE
#> 57922     1 FALSE     FALSE   FALSE
#> 57987     1 FALSE     FALSE   FALSE
#> 58025     1 FALSE     FALSE   FALSE
#> 58091     1 FALSE     FALSE   FALSE

head(gwas_loci[ gwas_loci$lead==TRUE , ])
#>               SNP CHR        BP A1 A2     MAF       BETA         SE        P
#> 58431 rs111232683   1 107566149  G  C 0.34300  0.0135204 0.00161401 5.43e-17
#> 75873 rs114254196   1 108635400  C  T 0.00848 -0.0448114 0.00818473 4.38e-08
#> 79333 rs140300970   1 109020060  A  T 0.02240 -0.0278210 0.00496721 2.13e-08
#> 81599 rs148503795   1 109166178  C  G 0.01050 -0.0423267 0.00709770 2.47e-09
#> 81618  rs74896173   1 109167705  T  C 0.00914 -0.0429793 0.00766819 2.08e-08
#> 82661 rs111751551   1 109242056  G  A 0.01010 -0.0505560 0.00697787 4.32e-13
#>       locus lead lead_dist lead_ld
#> 58431     1 TRUE      TRUE    TRUE
#> 75873     2 TRUE      TRUE    TRUE
#> 79333     3 TRUE     FALSE    TRUE
#> 81599     3 TRUE     FALSE    TRUE
#> 81618     3 TRUE     FALSE    TRUE
#> 82661     3 TRUE     FALSE    TRUE