library(tidyverse)
library(magrittr)
library(rtracklayer)
library(GenomicRanges)
library(parallel)
library(reshape2)
library(pander)
library(scales)
library(stringr)
library(qqman)
nCores <- min(12, detectCores() - 1)
This supplementary analysis uses the FLK model as outlined in Bonhomme et al, Detecting Selection in Population Trees: The Lewontin and Krakauer Test Extended. This test is robust to genetic drift, and is able to detect signatures of selection. However as this is a linear evolution dataset within one population as opposed to a series of approximately parallel selection events occurring within a series of populations, the assumptions of the FLK model may be less than satisfied. Hence this is presented as supplementary information.
A third population was included as an outlier group, representing a population separated by a large geographic distance. 35 samples were taken from Schwensow et al.
Whilst aligned to the RefSeq/NCBI genome build, for compatibility with Ensembl gene models, a mapping object was generated from the RefSeq Assembly Report
ncbiReport <- file.path("..", "data", "GCF_000003625.3_OryCun2.0_assembly_report.txt") %>%
read_delim(delim = "\t", col_names = FALSE, na = "na", comment = "#") %>%
set_colnames(c("Sequence-Name", "Sequence-Role", "Assigned-Molecule", "Assigned-Molecule-Location/Type", "GenBank-Accn", "Relationship", "RefSeq-Accn", "Assembly-Unit", "Sequence-Length", "UCSC-style-name")) %>%
filter(!is.na(`RefSeq-Accn`))
ensUsesGB <- grepl("scaffold",ncbiReport$`Sequence-Role`)
ncbiReport$Ensembl <- ncbiReport$`Sequence-Name`
ncbiReport$Ensembl[ensUsesGB] <- gsub("\\.[0-9]$", "", ncbiReport$`GenBank-Accn`[ensUsesGB])
ncbi2Ens <- with(ncbiReport, data.frame(Chr = Ensembl,
Length = `Sequence-Length`,
row.names = `RefSeq-Accn`))
Gene information was obtained from Ensembl Build 84 and loaded
ensGff <- file.path("..", "data", "Oryctolagus_cuniculus.OryCun2.0.84.gff3.gz")
ensGenes <- import.gff3(ensGff, feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE)
The same stacks output as in 02_snpFiltering was loaded, retaining data from the third population (Turretfield).
allData <- file.path("..", "data", "batch_3.sumstats.tsv.gz") %>%
gzfile() %>%
read_delim(delim = "\t", col_names = TRUE, skip = 3) %>%
dplyr::select(-contains("Batch"), -contains("Pi"), -contains("Fis")) %>%
mutate(RefSeq = gsub("^gi\\|.+\\|ref\\|(.+)\\|$", "\\1", Chr),
Chr = as.character(ncbi2Ens[RefSeq, "Chr"]),
snpID = paste(`Locus ID`, Col, sep="_"),
Private = as.logical(Private)) %>%
filter(Chr != "X",
Col > 4) %>%
dplyr::select(Chr, BP, `Locus ID`, Col, snpID,
`Pop ID`, contains("Nuc"), N, P,
contains("Obs"), contains("Exp"), Private)
allSnpsGR <- allData %>%
distinct(snpID, .keep_all = TRUE) %>%
makeGRangesFromDataFrame(ignore.strand = TRUE,
keep.extra.columns = TRUE,
seqinfo = seqinfo(ensGenes),
seqnames.field = "Chr",
start.field = "BP",
end.field = "BP") %>%
sort()
mcols(allSnpsGR) <- DataFrame(snpID = allSnpsGR$snpID)
The script FLK.R
was obtained from the QGSP and loaded into R
source("FLK.R")
hitsIn100kb <- findOverlaps(
allSnpsGR %>%
resize(width = 200001,fix = "center") %>%
trim(),
ensGenes
)
neutLoci <- allSnpsGR %>%
magrittr::extract(setdiff(seq_along(.), queryHits(hitsIn100kb))) %>%
mcols() %>%
extract2("snpID")
neutData <- allData %>%
filter(snpID %in% neutLoci) %>%
distinct(Chr, BP, `Pop ID`, .keep_all = TRUE)
As a broad definition of neutral loci, the set of SNPs > 100kb from any known gene were selected. After removal of duplicate loci, this gave a set of 26,097 SNPs.
Neutral SNPs were again checked to ensure that the P
allele was the same as used in the 1996 population.
neutData <- neutData %>%
split(f = .$snpID) %>%
mclapply(function(x){
if (length(unique(x$`P Nuc`)) != 3){
pop2Change <- which(x$`P Nuc` != x$`P Nuc`[1])
x$`P Nuc`[pop2Change] <- x$`P Nuc`[1]
x$`Q Nuc`[pop2Change] <- x$`Q Nuc`[1]
x$P[pop2Change] <- 1 - x$P[pop2Change]
}
x
}, mc.cores = nCores) %>%
bind_rows() %>%
arrange(Chr, BP, `Pop ID`)
N <- allData %>%
group_by(`Pop ID`) %>%
summarise(maxN = max(N)) %>%
mutate(minN = round(0.95*maxN))
The list of Neutral SNPs were then restricted to those identified in >95% of individuals within all populations.
neutMatrix <- neutData %>%
left_join(N) %>%
group_by(snpID) %>%
filter(all(N > minN)) %>%
mutate(`Pop ID` = c("Gum Creek (1996)", "Oraparinna (2012)", "Turretfield (2010)")[`Pop ID`]) %>%
dcast(snpID~`Pop ID`, value.var = "P") %>%
filter(`Gum Creek (1996)` != 1) %>%
column_to_rownames("snpID") %>%
as.matrix() %>%
t()
The above gave a final list of 1171 neutral SNPs for estimation of Reynolds Distance as the first step towards calculating the co-ancestry matrix \(\mathcal{F}_{ij}\).
reynDist <- reynolds(neutMatrix)
 | Gum Creek (1996) | Oraparinna (2012) | Turretfield (2010) |
---|---|---|---|
Gum Creek (1996) | 0 | 0.01062 | 0.08543 |
Oraparinna (2012) | 0.01062 | 0 | 0.09209 |
Turretfield (2010) | 0.08543 | 0.09209 | 0 |
F_ij <- Fij(neutMatrix, "Turretfield (2010)", reynDist)
regionSnps <- file.path("..", "results", "regionSNPs.txt") %>% readLines()
testData <- file.path("..", "data", "filteredSNPs.tsv.gz") %>%
gzfile() %>%
read_delim(delim = "\t") %>%
filter(!snpID %in% regionSnps)
The set of 18,393 SNPs previously obtained for testing after all filtering steps were then tested using the FLK model. (1943 SNPs previously identified as showing a regional effect in the 2012 population were removed from FLK analysis.) P-values obtained under FLK were adjusted using Bonferroni’s method to obtain a set of high-confidence SNPs, then using Benjamini-Hochberg’s FDR to provide a larger set for testing of GO enrichment.
flkResults <- testData %>%
dplyr::select(snpID, `Pop ID`, P) %>%
mutate(`Pop ID` = c("Gum Creek (1996)", "Oraparinna (2012)")[`Pop ID`]) %>%
acast(`Pop ID` ~ snpID ) %>%
FLK(Fij = F_ij) %>%
rownames_to_column("snpID") %>%
as_tibble() %>%
dplyr::select(snpID, Ht, contains("F.LK")) %>%
mutate(FDR = p.adjust(F.LK.p.val, method = "fdr"),
P_bonf = p.adjust(F.LK.p.val, method = "bonferroni")) %>%
arrange(F.LK.p.val)
flkResults <- testData %>%
dplyr::select(snpID, Chr, BP, `Pop ID`, `P Nuc`, `Q Nuc`, P) %>%
mutate(`Pop ID` = c("Gum Creek (1996)", "Oraparinna (2012)")[`Pop ID`]) %>%
split(f = .$snpID) %>%
lapply(mutate, `Q Nuc` = `Q Nuc`[1]) %>%
bind_rows() %>%
mutate(SNP = paste(`P Nuc`, `Q Nuc`, sep = "/")) %>%
dcast(snpID + Chr + BP + SNP ~ `Pop ID`, value.var = "P") %>%
right_join(flkResults) %>%
as_data_frame() %>%
arrange(F.LK.p.val)
A total of 6 SNPs retained significance after the Bonferroni adjustment with a total of 48 being considered in the larger set of SNPs with an FDR of 0.05.
flkResults %>%
write_tsv( file.path("..", "results", "flkResults.tsv"))
R version 3.4.4 (2018-03-15)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ape(v.5.1), bindrcpp(v.0.2.2), qqman(v.0.1.4), scales(v.0.5.0), pander(v.0.6.1), reshape2(v.1.4.3), rtracklayer(v.1.38.3), GenomicRanges(v.1.30.3), GenomeInfoDb(v.1.14.0), IRanges(v.2.12.0), S4Vectors(v.0.16.0), BiocGenerics(v.0.24.0), magrittr(v.1.5), forcats(v.0.3.0), stringr(v.1.3.1), dplyr(v.0.7.5), purrr(v.0.2.5), readr(v.1.1.1), tidyr(v.0.8.1), tibble(v.1.4.2), ggplot2(v.2.2.1) and tidyverse(v.1.2.1)
loaded via a namespace (and not attached): Biobase(v.2.38.0), httr(v.1.3.1), jsonlite(v.1.5), modelr(v.0.1.2), assertthat(v.0.2.0), highr(v.0.6), GenomeInfoDbData(v.1.0.0), cellranger(v.1.1.0), Rsamtools(v.1.30.0), yaml(v.2.1.19), pillar(v.1.2.3), backports(v.1.1.2), lattice(v.0.20-35), glue(v.1.2.0), digest(v.0.6.15), XVector(v.0.18.0), rvest(v.0.3.2), colorspace(v.1.3-2), htmltools(v.0.3.6), Matrix(v.1.2-14), plyr(v.1.8.4), psych(v.1.8.4), XML(v.3.98-1.11), pkgconfig(v.2.0.1), broom(v.0.4.4), haven(v.1.1.1), calibrate(v.1.7.2), zlibbioc(v.1.24.0), BiocParallel(v.1.12.0), SummarizedExperiment(v.1.8.1), lazyeval(v.0.2.1), cli(v.1.0.0), mnormt(v.1.5-5), crayon(v.1.3.4), readxl(v.1.1.0), evaluate(v.0.10.1), nlme(v.3.1-137), xml2(v.1.2.0), foreign(v.0.8-70), tools(v.3.4.4), hms(v.0.4.2), matrixStats(v.0.53.1), munsell(v.0.4.3), DelayedArray(v.0.4.1), Biostrings(v.2.46.0), compiler(v.3.4.4), rlang(v.0.2.1), grid(v.3.4.4), RCurl(v.1.95-4.10), rstudioapi(v.0.7), labeling(v.0.3), bitops(v.1.0-6), rmarkdown(v.1.9), gtable(v.0.2.0), R6(v.2.2.2), GenomicAlignments(v.1.14.2), lubridate(v.1.7.4), knitr(v.1.20), bindr(v.0.1.1), rprojroot(v.1.3-2), stringi(v.1.2.2), Rcpp(v.0.12.17) and tidyselect(v.0.2.4)