library(pander)
library(tidyverse)
library(scales)
library(ggrepel)
library(parallel)
if (interactive()) setwd(here::here("R"))
theme_set(theme_bw())
panderOptions("table.split.table", Inf)
panderOptions("big.mark", ",")
panderOptions("table.style", "rmarkdown")
logit <- binomial()$linkfun
inv.logit <- binomial()$linkinv
w <- 169/25.4
h <- 169/25.4
This analysis takes the filtered SNPs from under analysis and simulates genetic drift under no selective pressure, in order to compare detected changes in allele frequencies to the range of values predicted by the drift model.
This analysis uses a custom set of scripts collected as an R package and available from https://github.com/steveped/driftSim. This is able to be installed as below.
remotes::install_github("steveped/driftSim")
library(driftSim)
nGen <- 16
nSim <- 5000
litterSize <- 20
migRate <- 0.15
alpha <- 0.001
Ne <- c(`1996` = 222, `2012` = 116)
pSurv <- 0.1
f0 <- seq(0.05, 0.5, by = 0.05)
nPops <- c(4, 8)
sigma <- c(0.5, 0.8)
As decided the population parameters were defined as:
Parameter | Value | Comment |
---|---|---|
Ne1996 | 222 | Effective Population Size in 1996 |
Ne2012 | 116 | Effective Population Size in 2012 |
p | 0.1 | Probability of survival for each generation |
g | 16 | The number of generations between 1996 and 2012 |
n | 4, 8 | The number of neighbouring populations |
l | 20 | The annual litter size |
r | 0.15 | The migration rate |
f0 | 0.05 to 0.5 | The starting allele frequency in the main population, increased in steps of 0.05 |
σ | 0.5, 0.8 | The variability around f0 for neighbouring populations |
No selective advantage was specified for any allele.
The simulation parameters were defined as:
Parameter | Value | Comment |
---|---|---|
nsim | 5000 | The number of simulations at each starting frequency f0 |
α | 0.001 | The significance level for plotting confidence bands |
To summarise the above:
nSamp <- 1e06
x_0 <- runif(nSamp, 0.5, 0.95)
y_0 <- sigma %>%
lapply(function(sigma){
inv.logit(rnorm(nSamp, logit(x_0), sigma))
})
expand.grid(f0 = f0, sigma = sigma) %>%
apply(
MARGIN = 1,
FUN = function(x){
tibble(
f0 = x[1],
sigma = x[2],
samples = rnorm(1000, logit(x[1]), sd = x[2])
) %>%
mutate(samples = inv.logit(samples))
}
) %>%
bind_rows() %>%
mutate(sigma = paste("sigma ==", sigma)) %>%
ggplot(aes(x = f0, y = samples)) +
geom_boxplot(aes(group= as.factor(f0))) +
facet_wrap(~sigma, scales = "free", labeller = "label_parsed") +
scale_y_continuous(breaks = seq(0, 1, by = 0.2)) +
labs(
x = "Initial Value in Main Population",
y = "Simulated Initial Values In Other Populations"
)
The effect of adding variability to derive initial frequencies in neighbouring populations. Half of the simulated initial values will be contained by the box for each starting frequency, with the remaining half being outside of these bounds.
In order to express these values for σ in terms of correlations, 1,000,000 random values were simulated for a starting frequency (f0) anywhere between 0.5 and 0.95. 1,000,000 neighbouring population starting values were randomly sampled around this using the values of σ = 0.5 and 0.8, with the random sampling taking place on the logit scale.
tibble(
Sigma = sigma,
Correlation = vapply(y_0, cor, y = x_0, numeric(1))
) %>%
pander(
digits = 2,
caption = paste("Approximate correlations between starting allele frequencies in two populations, for the chosen values of $\\sigma$")
)
Sigma | Correlation |
---|---|
0.5 | 0.81 |
0.8 | 0.65 |
allParam <- expand.grid(
f0 = f0,
mig = migRate,
n = nPops,
sd = sigma
) %>%
mutate(
N0 = round(Ne["1996"], 0),
Nt = round(Ne["2012"], 0),
t = nGen,
surv = pSurv,
litter = litterSize
) %>%
split(f = factor(1:nrow(.)))
allSim <- allParam %>%
mclapply(
function(x){
x <- as.list(x)
replicate(nSim, do.call(simDrift, args = x)$ft)
},
mc.cores = 20
)
As expected the mean allele frequency after simulation of drift remained approximately equal to the initial starting frequency, however the variability of the final frequency was heavily affected by the number of neighbouring populations and their similarity to the to main population of interest.
bind_rows(allParam) %>%
mutate(
n_t = vapply(allSim, mean, numeric(1)),
sd_t = vapply(allSim, sd, numeric(1)),
n = as.factor(n),
sd = as.factor(sd)
) %>%
dplyr::select(f0, nPops = n, sd_0 = sd, n_t, sd_t) %>%
gather("variable", "value", ends_with("_t")) %>%
ggplot(aes(f0, value, shape = nPops, colour = sd_0, linetype = nPops)) +
geom_point( size = 2.5) +
geom_smooth(method = "lm", formula = y~poly(x,2), se = FALSE, size = 0.5) +
facet_wrap(
~variable,
ncol = 2,
scales = "free",
labeller = as_labeller(
c(n_t = "bar(f[t])", sd_t = "SD(f[t])"), default = label_parsed
)
) +
labs(
x = "Initial Value (f0)",
y = "After Simulations",
shape = "Number of Neighbouring Populations (n)",
linetype = "Number of Neighbouring Populations (n)",
colour = expression(
paste("Initial Variability in Neighbouring Populations (", sigma, ")")
)
) +
scale_colour_manual(values = c(rgb(0.1,0.1,0.8), rgb(0.9,0,0))) +
theme(legend.position = "bottom")
Mean and standard devation of allele frequencies (\(f_t\)) after simulating drift. Polynomial lines of best fit are shown for each value.
allSim99 <- allSim %>%
lapply(quantile, probs = c(alpha/2, 1-alpha/2)) %>%
lapply(as.list) %>%
lapply(as.data.frame) %>%
bind_rows() %>%
set_names(c("lwr", "upr"))
driftIntervals <- allParam %>%
bind_rows() %>%
dplyr::select(f0, n, sd) %>%
as_tibble() %>%
cbind(allSim99)
driftIntervals %>%
mutate(
n = as.factor(n),
sd = as.factor(sd)
) %>%
dplyr::select(f0, nPops = n, sd_0 = sd, lwr, upr) %>%
ggplot(aes(f0, shape = nPops, colour = sd_0)) +
geom_point(aes(y = lwr), size = 2) +
geom_smooth(
aes(y = lwr, linetype = nPops),
se = FALSE, method = "lm", formula = y~poly(x,2),
size = 0.5
) +
geom_point(aes(y = upr), size = 2) +
geom_smooth(
aes(y = upr, linetype = nPops),
se = FALSE, method = "lm", formula = y~poly(x,2),
size = 0.5
) +
scale_linetype_manual(values = c(1, 2)) +
scale_colour_manual(values = c(rgb(0.1,0.1,0.8), rgb(0.9,0,0))) +
labs(
x = "Initial Value (f0)",
y = paste(percent(1 -alpha), "Simulated Prediction Interval"),
shape = "Number of Neighbouring Populations (n)",
linetype = "Number of Neighbouring Populations (n)",
colour = expression(
paste("Initial Variability in Neighbouring Populations (", sigma, ")")
)
) +
theme(legend.position = "bottom")
Expected 100% intervals for individual allele frequencies under each set of criteria used in generation of simulations.
genotypeResults <- read_tsv("genotypeResults.tsv")
alleleResults <- read_tsv("alleleResults.tsv")
sigSnps <- c(
filter(alleleResults, FDR < 0.05)$snpID,
filter(genotypeResults, FDR < 0.05)$snpID
) %>%
unique
bothSnps <- intersect(
filter(alleleResults, FDR < 0.05)$snpID,
filter(genotypeResults, FDR < 0.05)$snpID
)
pdf(file.path("..", "figures", "Figure4.pdf"), width = w, height = 0.8*h)
alleleResults %>%
mutate(
Model = case_when(
FDR < 0.05 & snpID %in% filter(genotypeResults, FDR < 0.05)$snpID ~ "Both Models",
FDR < 0.05 ~ "Allele Model",
snpID %in% filter(genotypeResults, FDR < 0.05)$snpID ~ "Genotype Model",
!snpID %in% filter(genotypeResults, FDR < 0.05)$snpID ~ "Not Significant"
),
Model = as.factor(Model),
Model = relevel(Model, ref = "Both Models")
) %>%
ggplot() +
geom_point(
aes(x = MAF_1996, y = MAF_2012),
data = . %>% filter(is.na(Model)),
alpha = 0.4
) +
geom_point(
aes(x = MAF_1996, y = MAF_2012, colour = Model),
data = . %>% filter(!is.na(Model))
) +
geom_smooth(
aes(x = f0, y = lwr, linetype = as.factor(sd)),
data = filter(driftIntervals, n == 4),
method = "lm", formula = y ~ poly(x, 2),
fullrange = TRUE, se = FALSE,
colour = rgb(0.9,0,0), size = 0.8
) +
geom_smooth(
aes(x = f0, y = lwr, linetype = as.factor(sd)),
data = filter(driftIntervals, n == 8),
method = "lm", formula = y ~ poly(x, 2),
fullrange = TRUE, se = FALSE,
colour = rgb(0.1,0.1,0.8), size = 0.8
) +
geom_smooth(
aes(x = f0, y = upr, linetype = as.factor(sd)),
data = filter(driftIntervals, n == 4),
method = "lm", formula = y ~ poly(x, 2),
fullrange = FALSE, se = FALSE,
colour = rgb(0.9,0,0), size = 0.8
) +
geom_smooth(
aes(x = f0, y = upr, linetype = as.factor(sd)),
data = filter(driftIntervals, n == 8),
method = "lm", formula = y ~ poly(x, 2),
fullrange = FALSE, se = FALSE,
colour = rgb(0.1,0.1,0.8), size = 0.8
) +
geom_label_repel(
aes(x = MAF_1996, y = MAF_2012, label = snpID, colour = Model),
data = . %>% filter(Model == "Both Models"),
size = 2,
show.legend = FALSE
) +
scale_colour_viridis_d(option = "B") +
labs(
x = expression(f[1996]),
y = expression(f[2012]),
colour = "Significance",
linetype = expression(sigma)
) +
# guides(linetype = FALSE) +
theme(
legend.position = "bottom",
axis.text = element_text(size = 12),
axis.title = element_text(size= 14)
)
dev.off()
## png
## 2
Allele frequencies in the two populations. SNPs considered as significant under either the Full Genotype or Allele Frequency models are highlighted. SNPs considered significant under both models are labelled. 100% prediction intervals are indicated by the bands for differing simulation parameters. Intervals generated by 4 neighbouring populations are shown in red, whilst intervals generated by 8 neighbouring populations are shown in blue. Neighbouring populations simulated with the closest similarity to the main population are shown with solid lines (σ = 0.5), whilst those with less similarity are shown as dashed lines (σ = 0.8). The two points which clearly fall within all bands were found significant under the full genotype model and show differences in heterozygosity rather than in the allele frequencies themselves. The inner blue bands represent the expected scenario which most closely resembles the ecological system under investigation, with a continuous population of rabbits (i.e. a large number of neighbouring populations), with highly correlated allele frequencies.
save.image("06_GeneticDrift.RData")
pander(sessionInfo())
R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: driftSim(v.0.0.1), ggrepel(v.0.8.1), scales(v.1.1.0), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.3.0) and pander(v.0.6.3)
loaded via a namespace (and not attached): tidyselect(v.0.2.5), xfun(v.0.11), remotes(v.2.1.0), haven(v.2.2.0), lattice(v.0.20-38), colorspace(v.1.4-1), vctrs(v.0.2.0), generics(v.0.0.2), viridisLite(v.0.3.0), htmltools(v.0.4.0), yaml(v.2.2.0), rlang(v.0.4.2), pillar(v.1.4.2), glue(v.1.3.1), withr(v.2.1.2), DBI(v.1.1.0), dbplyr(v.1.4.2), modelr(v.0.1.5), readxl(v.1.3.1), lifecycle(v.0.1.0), munsell(v.0.5.0), gtable(v.0.3.0), cellranger(v.1.1.0), rvest(v.0.3.5), evaluate(v.0.14), labeling(v.0.3), knitr(v.1.26), curl(v.4.3), highr(v.0.8), broom(v.0.5.2), Rcpp(v.1.0.3), backports(v.1.1.5), jsonlite(v.1.6), farver(v.2.0.3), fs(v.1.3.1), png(v.0.1-7), hms(v.0.5.2), digest(v.0.6.23), stringi(v.1.4.5), grid(v.3.6.1), cli(v.1.1.0), tools(v.3.6.1), magrittr(v.1.5), lazyeval(v.0.2.2), crayon(v.1.3.4), pkgconfig(v.2.0.3), zeallot(v.0.1.0), ellipsis(v.0.3.0), xml2(v.1.2.2), reprex(v.0.3.0), lubridate(v.1.7.4), assertthat(v.0.2.1), rmarkdown(v.1.17), httr(v.1.4.1), rstudioapi(v.0.10), R6(v.2.4.1), nlme(v.3.1-143) and compiler(v.3.6.1)