SNP Filtering

Setup

library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
theme_set(theme_bw())
if (interactive()) here::here("R") %>% setwd

This workflow goes through multiple steps involved in processing the stacks output:

Beginning with the VCF, this will be converted to a GDS object for easier interaction. As this is a slow conversion process, this will only be performed once. The resulting object will be used to prune SNPs in Linkage Disequilibrium. Notably, this VCF is not available in this github repo as it is ~92Mb and too large. Likewise the prepared GDS object is 86.6Mb, which is also too large for storage on github.

VCF Conversion to GDS

gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
makeGds <- !file.exists(gdsPath)
if (makeGds) {
    dir.create(dirname(gdsPath))
    file.path("..", "5_stacks", "vcf", "populations.snps.vcf.gz") %>%
        seqVCF2GDS(gdsPath, reference = "OryCun2.0")
}
gdsFile <- seqOpen(gdsPath, readonly = FALSE)

Sample information and subsequent population information was then extracted and added to the GDS file.

sampleID <- seqGetData(gdsFile, "sample.id") %>%
    as.data.frame(stringsAsFactors = FALSE) %>%
    set_names("Sample") %>%
    mutate(Population = case_when(
        grepl("gc", Sample) ~ 1996L,
        grepl("ora", Sample) ~ 2012L,
        !grepl("(gc|ora)", Sample) ~ 2010L
    ),
    Location = case_when(
        Population == 1996 ~ "Gum Creek",
        Population == 2012 ~ "Oraparinna",
        Population == 2010 ~ "Turretfield"
    )) %>%
    as_tibble()

if (makeGds) {
  add.gdsn(gdsFile, "sample.annotation", sampleID, replace = TRUE)
  seqClose(gdsFile)
  gdsFile <- seqOpen(gdsPath, readonly = FALSE)
}

SNP Data Summary

chromosomes <- paste0("chr", c(1:21, "X"))
scaffolds <- seqGetData(gdsFile, "chromosome") %>%
    unique() %>%
    setdiff(chromosomes)

snpSummary <- tibble(
    variant.id = seqGetData(gdsFile, "variant.id") ,
    chromosome = seqGetData(gdsFile, "chromosome"),
    position = seqGetData(gdsFile, "position")
) %>%
    cbind(seqGetData(gdsFile, "annotation/format/DP")$data %>% 
              t %>%
              set_colnames(sampleID$Sample)) %>%
    as_tibble()
mnDist <- snpSummary %>% 
  dplyr::filter(chromosome %in% 1:21) %>% 
  split(f = .$chromosome) %>% 
  lapply(function(x){diff(x$position)}) %>% 
  unlist() %>% 
  mean() %>% 
  divide_by(1000) %>% 
  round(2)
mnDiffLoc <- snpSummary %>% 
  dplyr::filter(chromosome %in% 1:21) %>% 
  split(f = .$chromosome) %>%
  lapply(function(x){diff(x$position) %>% .[. > 100]}) %>%
  unlist() %>% 
  mean() %>% 
  divide_by(1000) %>% 
  round(2)

From this initial summary, 146,763 unique SNPs were identified by the Stacks pipeline. The mean distance between all autosomal SNPs was calculated to be 21.43kb, however, many of these were within the same locus. Excluding SNPs within the same locus (i.e. <100nt apart), the mean distance between SNPs became 48.63kb.

Checking the call rate across all samples, 5x1996 samples with SNP call rates < 50% were identified. This value was not exceeded by any samples in the 2010 or 2012 populations. No SNPs were found being unique to Turretfield, which is as expected by the settings given to stacks

snpSummary %>%
    mutate_at(sampleID$Sample, list(is.na)) %>%
    summarise_at(sampleID$Sample, list(sum)) %>%
    gather(key = "Sample", value = "Missing") %>%
    mutate(Called = nrow(snpSummary) - Missing) %>%
    gather(key = "Type", value = "SNPs", -Sample) %>%
    left_join(sampleID) %>%
    ggplot(aes(Sample, SNPs / nrow(snpSummary), fill = Type)) +
    geom_bar(stat = "identity") +
    geom_hline(yintercept = 0.5, linetype = 2) +
    facet_wrap(~Population, scales = "free") +
    coord_flip() +
    labs(y = "Percent Called") +
    scale_y_continuous(expand = expand_scale(0, 0), labels = percent) +
    scale_fill_manual(values = c("green", "red"))

Summary of SNP call rate in all samples

LD Pruning

set.seed(1523)
minMaf <- 0.05
missingRate <- 0.25
minCor <- 0.4
snpset <- gdsFile %>%
    snpgdsLDpruning(
        sample.id = grep("(gc|ora)", seqGetData(., "sample.id"), value = TRUE),
        autosome.only = FALSE,
        maf = minMaf,
        missing.rate = missingRate,
        method = "corr",
        ld.threshold = minCor
        ) %>%
    .[setdiff(names(.), "chrX")]
seqResetFilter(gdsFile)

SNPs were pruned for Linkage Disequilibrium using only samples from the 1996 and 2012 populations and the following criteria:

Minor Allele Frequency > 5%
Missing Data in < 25% of samples
Correlation of > 40% indicating LD

keepSNPs <- c("variant.id", "position", "chromosome") %>%
    sapply(function(x){
        seqGetData(gdsFile, x)
    }, simplify = FALSE) %>%
    as_tibble() %>%
    dplyr::filter(variant.id %in% unlist(snpset))

keepSNPs %>%
    dplyr::filter(chromosome %in% c(1:22, "X")) %>%
    group_by(chromosome) %>%
    tally() %>%
    mutate(chromosome = factor(chromosome, levels = 1:21)) %>%
    droplevels() %>%
    arrange(chromosome) %>%
    ggplot(aes(chromosome, n)) +
    geom_bar(stat = "identity") +
    labs(x = "Primary Chromosomes",
         y = "Number of SNPs retained for initial analysis") +
    scale_x_discrete()

Summary of SNPs per chromosome after filtering for LD.

This set of SNPs was then exported as an R object for downstream use.

write_rds(keepSNPs, "keepSNPsAfterLDPruning.RDS")

SessionInfo

R version 3.6.2 (2019-12-12)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.3.0), magrittr(v.1.5), scales(v.1.1.0), pander(v.0.6.3), SNPRelate(v.1.20.1), SeqArray(v.1.26.2) and gdsfmt(v.1.22.0)

loaded via a namespace (and not attached): Rcpp(v.1.0.3), lubridate(v.1.7.4), lattice(v.0.20-38), Biostrings(v.2.54.0), assertthat(v.0.2.1), zeallot(v.0.1.0), digest(v.0.6.23), R6(v.2.4.1), GenomeInfoDb(v.1.22.0), cellranger(v.1.1.0), backports(v.1.1.5), reprex(v.0.3.0), stats4(v.3.6.2), evaluate(v.0.14), highr(v.0.8), httr(v.1.4.1), pillar(v.1.4.3), zlibbioc(v.1.32.0), rlang(v.0.4.2), lazyeval(v.0.2.2), readxl(v.1.3.1), rstudioapi(v.0.10), S4Vectors(v.0.24.3), rmarkdown(v.2.1), labeling(v.0.3), RCurl(v.1.98-1.1), munsell(v.0.5.0), broom(v.0.5.3), compiler(v.3.6.2), modelr(v.0.1.5), xfun(v.0.12), pkgconfig(v.2.0.3), BiocGenerics(v.0.32.0), htmltools(v.0.4.0), tidyselect(v.0.2.5), GenomeInfoDbData(v.1.2.2), IRanges(v.2.20.2), fansi(v.0.4.1), withr(v.2.1.2), crayon(v.1.3.4), dbplyr(v.1.4.2), bitops(v.1.0-6), grid(v.3.6.2), nlme(v.3.1-143), jsonlite(v.1.6), gtable(v.0.3.0), lifecycle(v.0.1.0), DBI(v.1.1.0), cli(v.2.0.1), stringi(v.1.4.5), farver(v.2.0.3), XVector(v.0.26.0), fs(v.1.3.1), xml2(v.1.2.2), ellipsis(v.0.3.0), vctrs(v.0.2.1), generics(v.0.0.2), tools(v.3.6.2), glue(v.1.3.1), hms(v.0.5.3), parallel(v.3.6.2), yaml(v.2.2.0), colorspace(v.1.4-1), GenomicRanges(v.1.38.0), rvest(v.0.3.5), knitr(v.1.27) and haven(v.2.2.0)