library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
theme_set(theme_bw())
if (interactive()) here::here("R") %>% setwd
This workflow goes through multiple steps involved in processing the stacks output:
gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
makeGds <- !file.exists(gdsPath)
if (makeGds) {
dir.create(dirname(gdsPath))
file.path("..", "5_stacks", "vcf", "populations.snps.vcf.gz") %>%
seqVCF2GDS(gdsPath, reference = "OryCun2.0")
}
gdsFile <- seqOpen(gdsPath, readonly = FALSE)
Sample information and subsequent population information was then extracted and added to the GDS file.
sampleID <- seqGetData(gdsFile, "sample.id") %>%
as.data.frame(stringsAsFactors = FALSE) %>%
set_names("Sample") %>%
mutate(Population = case_when(
grepl("gc", Sample) ~ 1996L,
grepl("ora", Sample) ~ 2012L,
!grepl("(gc|ora)", Sample) ~ 2010L
),
Location = case_when(
Population == 1996 ~ "Gum Creek",
Population == 2012 ~ "Oraparinna",
Population == 2010 ~ "Turretfield"
)) %>%
as_tibble()
if (makeGds) {
add.gdsn(gdsFile, "sample.annotation", sampleID, replace = TRUE)
seqClose(gdsFile)
gdsFile <- seqOpen(gdsPath, readonly = FALSE)
}
chromosomes <- paste0("chr", c(1:21, "X"))
scaffolds <- seqGetData(gdsFile, "chromosome") %>%
unique() %>%
setdiff(chromosomes)
snpSummary <- tibble(
variant.id = seqGetData(gdsFile, "variant.id") ,
chromosome = seqGetData(gdsFile, "chromosome"),
position = seqGetData(gdsFile, "position")
) %>%
cbind(seqGetData(gdsFile, "annotation/format/DP")$data %>%
t %>%
set_colnames(sampleID$Sample)) %>%
as_tibble()
mnDist <- snpSummary %>%
dplyr::filter(chromosome %in% 1:21) %>%
split(f = .$chromosome) %>%
lapply(function(x){diff(x$position)}) %>%
unlist() %>%
mean() %>%
divide_by(1000) %>%
round(2)
mnDiffLoc <- snpSummary %>%
dplyr::filter(chromosome %in% 1:21) %>%
split(f = .$chromosome) %>%
lapply(function(x){diff(x$position) %>% .[. > 100]}) %>%
unlist() %>%
mean() %>%
divide_by(1000) %>%
round(2)
From this initial summary, 146,763 unique SNPs were identified by the Stacks pipeline. The mean distance between all autosomal SNPs was calculated to be 21.43kb, however, many of these were within the same locus. Excluding SNPs within the same locus (i.e. <100nt apart), the mean distance between SNPs became 48.63kb.
Checking the call rate across all samples, 5x1996 samples with SNP call rates < 50% were identified. This value was not exceeded by any samples in the 2010 or 2012 populations. No SNPs were found being unique to Turretfield, which is as expected by the settings given to stacks
snpSummary %>%
mutate_at(sampleID$Sample, list(is.na)) %>%
summarise_at(sampleID$Sample, list(sum)) %>%
gather(key = "Sample", value = "Missing") %>%
mutate(Called = nrow(snpSummary) - Missing) %>%
gather(key = "Type", value = "SNPs", -Sample) %>%
left_join(sampleID) %>%
ggplot(aes(Sample, SNPs / nrow(snpSummary), fill = Type)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = 0.5, linetype = 2) +
facet_wrap(~Population, scales = "free") +
coord_flip() +
labs(y = "Percent Called") +
scale_y_continuous(expand = expand_scale(0, 0), labels = percent) +
scale_fill_manual(values = c("green", "red"))
Summary of SNP call rate in all samples
set.seed(1523)
minMaf <- 0.05
missingRate <- 0.25
minCor <- 0.4
snpset <- gdsFile %>%
snpgdsLDpruning(
sample.id = grep("(gc|ora)", seqGetData(., "sample.id"), value = TRUE),
autosome.only = FALSE,
maf = minMaf,
missing.rate = missingRate,
method = "corr",
ld.threshold = minCor
) %>%
.[setdiff(names(.), "chrX")]
seqResetFilter(gdsFile)
SNPs were pruned for Linkage Disequilibrium using only samples from the 1996 and 2012 populations and the following criteria:
keepSNPs <- c("variant.id", "position", "chromosome") %>%
sapply(function(x){
seqGetData(gdsFile, x)
}, simplify = FALSE) %>%
as_tibble() %>%
dplyr::filter(variant.id %in% unlist(snpset))
keepSNPs %>%
dplyr::filter(chromosome %in% c(1:22, "X")) %>%
group_by(chromosome) %>%
tally() %>%
mutate(chromosome = factor(chromosome, levels = 1:21)) %>%
droplevels() %>%
arrange(chromosome) %>%
ggplot(aes(chromosome, n)) +
geom_bar(stat = "identity") +
labs(x = "Primary Chromosomes",
y = "Number of SNPs retained for initial analysis") +
scale_x_discrete()
Summary of SNPs per chromosome after filtering for LD.
This set of SNPs was then exported as an R object for downstream use.
write_rds(keepSNPs, "keepSNPsAfterLDPruning.RDS")
R version 3.6.2 (2019-12-12)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.3.0), magrittr(v.1.5), scales(v.1.1.0), pander(v.0.6.3), SNPRelate(v.1.20.1), SeqArray(v.1.26.2) and gdsfmt(v.1.22.0)
loaded via a namespace (and not attached): Rcpp(v.1.0.3), lubridate(v.1.7.4), lattice(v.0.20-38), Biostrings(v.2.54.0), assertthat(v.0.2.1), zeallot(v.0.1.0), digest(v.0.6.23), R6(v.2.4.1), GenomeInfoDb(v.1.22.0), cellranger(v.1.1.0), backports(v.1.1.5), reprex(v.0.3.0), stats4(v.3.6.2), evaluate(v.0.14), highr(v.0.8), httr(v.1.4.1), pillar(v.1.4.3), zlibbioc(v.1.32.0), rlang(v.0.4.2), lazyeval(v.0.2.2), readxl(v.1.3.1), rstudioapi(v.0.10), S4Vectors(v.0.24.3), rmarkdown(v.2.1), labeling(v.0.3), RCurl(v.1.98-1.1), munsell(v.0.5.0), broom(v.0.5.3), compiler(v.3.6.2), modelr(v.0.1.5), xfun(v.0.12), pkgconfig(v.2.0.3), BiocGenerics(v.0.32.0), htmltools(v.0.4.0), tidyselect(v.0.2.5), GenomeInfoDbData(v.1.2.2), IRanges(v.2.20.2), fansi(v.0.4.1), withr(v.2.1.2), crayon(v.1.3.4), dbplyr(v.1.4.2), bitops(v.1.0-6), grid(v.3.6.2), nlme(v.3.1-143), jsonlite(v.1.6), gtable(v.0.3.0), lifecycle(v.0.1.0), DBI(v.1.1.0), cli(v.2.0.1), stringi(v.1.4.5), farver(v.2.0.3), XVector(v.0.26.0), fs(v.1.3.1), xml2(v.1.2.2), ellipsis(v.0.3.0), vctrs(v.0.2.1), generics(v.0.0.2), tools(v.3.6.2), glue(v.1.3.1), hms(v.0.5.3), parallel(v.3.6.2), yaml(v.2.2.0), colorspace(v.1.4-1), GenomicRanges(v.1.38.0), rvest(v.0.3.5), knitr(v.1.27) and haven(v.2.2.0)