Analysis of SNPs

library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
library(readxl)
library(sp)
library(ggmap)
library(rgdal)
library(ggsn)
library(parallel)
library(qqman)
library(ggrepel)
library(plyranges)
library(rtracklayer)
library(AnnotationDbi)
library(GO.db)

if (interactive()) here::here("R") %>% setwd()
theme_set(theme_bw())
panderOptions("missing", "")
mc <- min(12, detectCores() - 1)
alpha <- 0.05
maxKb <- 40

tidyP <- function(p, m = 0.001){
  x <- rep("", length(p))
  x[p < m] <- sprintf("%.2e", p[p < m])
  x[p >= m] <- sprintf("%.3f", p[p >= m])
  x
}

keepSNPs <- readRDS("keepSNPsAfterLDPruning.RDS")

w <- 169/25.4
h <- 169/25.4

This workflow immediately follows 03_SNPFiltering and performs an analysis on the 18886 SNPs retained after filtering. Analysis performed were:

To assess the rate of missing values by retained SNP and sample
Run a PCA analysis using SNPs & samples
Analyse using comparisons between genotypes and allele frequencies, as well as using Bayescan and FLK

gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
gdsFile <- seqOpen(gdsPath, readonly = TRUE)

sampleID <- tibble(
    Sample = seqGetData(gdsFile, "sample.annotation/Sample"),
    Population = seqGetData(gdsFile, "sample.annotation/Population"),
    Location = seqGetData(gdsFile, "sample.annotation/Location")
)
popSizes <- sampleID %>% 
    group_by(Population) %>%
    summarise(n = dplyr::n())

After loading the data from the previous section, an additional filtering step was performed, where SNPs were required to have a non-zero MAF in the initial 1996 population.

snpIn1996 <- genotypes %>% 
    filter(Population == 1996) %>% 
    group_by(variant.id) %>% 
    summarise(maf = mean(Genotype) / 2) %>%
    filter(maf > 0)
genotypes %<>% 
    right_join(snpIn1996)

This additional filtering step restricted analysis to 18,878 SNPs instead of the initial 18,886 SNPs.
The mean distance between SNPs in the final dataset was 139.25kb

seqClose(gdsFile)

In addition to the above, the set of genes corresponding the Ensembl Release 96 were loaded.

ensGenes <- file.path(
  "..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz"
) %>%
  import.gff3(feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE) %>%
  .[,c("gene_id", "Name", "description")]

snpsGR <- makeGRangesFromDataFrame(
    df = snps, 
    keep.extra.columns = TRUE, 
    ignore.strand = TRUE, 
    seqinfo = seqinfo(ensGenes), 
    seqnames.field = "chromosome", 
    start.field = "position", 
    end.field = "position"
)

The set of SNPs under investigation was also defined as a GRanges object. This was intersected with the set of genes to connect SNPs to genes within 40 kb.

snp2Gene <- suppressWarnings(
  snpsGR %>% 
    resize(width = 2*1000*maxKb - 1, fix = "center") %>%
    trim() %>%
    join_overlap_inner(ensGenes) 
)

A set of SNPs within exonic regions was also defined in order to detect any SNPs within coding regions.

ensExons <- file.path(
  "..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz"
) %>%
  import.gff3(feature.type = "exon", sequenceRegionsAsSeqinfo = TRUE) %>%
  join_overlap_inner_within(
    ensGenes, maxgap = 0, minoverlap = 0, suffix = c(".exon", "")
  ) %>%
  join_overlap_inner(snpsGR)
ensExons <- ensExons[,setdiff(colnames(mcols(ensExons)), "Name.exon")]

PCA

lowCall <- c("gc2901", "gc2776", "gc2731", "gc2727", "gc2686")

snp4PCA <- genotypes %>% 
    filter(!Sample %in% lowCall) %>%
    group_by(variant.id, Population) %>% 
    summarise(n = dplyr::n()) %>% 
    spread(Population, n) %>%
    mutate(N = `1996` + `2010` + `2012`) %>% 
    ungroup() %>%
    filter(N > 0.95*(sum(popSizes$n) - length(lowCall)))
pca <- genotypes %>%
    filter(variant.id %in% snp4PCA$variant.id) %>%
    dplyr::select(variant.id, Sample, Genotype) %>%
    spread(Sample, Genotype) %>%
    as.data.frame() %>%
    column_to_rownames("variant.id") %>%
    as.matrix() %>%
    apply(2, function(x){
        x[is.na(x)] <- mean(x, na.rm = TRUE)
        x
    }) %>%
    t() %>%
    .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
    prcomp( center = TRUE)

As noted in the previous section sample samples gc2901, gc2776, gc2731, gc2727 and gc2686 had a SNP identification rate $< 50$% and as such these were marked as potential outliers. Ignoring these samples, and restricting data to SNPs identified in $>95$% of all samples, a preliminary PCA was performed This amounted to 7,767 of the possible 18,878 SNPs for analysis using PCA. Missing values were specified as the mean MAF over all populations combined.

Given the initially observed structure, in which samples from the 2012 population are separating from the other samples which group with the 1996 population, the collection points for the 2012 samples as investigated.

sampleID %<>%
    left_join(
        file.path("..", "external", "GPS_Locations.xlsx") %>%
            read_excel() %>%
            dplyr::select(Sample, ends_with("tude")) %>%
            mutate(Sample = gsub("[Oo][Rr][Aa] ([0-9ABC]*)", "ora\\1", Sample))
    )

fs <- 14
ps <- 3
popCols <- c(rgb(0, 0, 0, 0.7), 
             rgb(0.8, 0.2, 0.2, 0.9), 
             rgb(0.2, 0.8, 0.2, 0.9),
             rgb(0.5, 0.5, 1, 0.7))
pca4Plot <- pca$x %>%
    as.data.frame() %>%
    rownames_to_column("Sample") %>%
    as_tibble() %>%
    dplyr::select(Sample, PC1, PC2, PC3) %>%
    left_join(sampleID) %>%
    mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
    group_by(Cluster) %>%
    mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(
      Population = case_when(
        Population == 1996 ~ "1996",
        Population == 2010 ~ "Outgroup (Turretfield)",
        maxY == max(maxY) ~ "2012 (Outer)",
        maxY != max(maxY) ~ "2012 (Central)"
      )
    ) %>%
  left_join(
    genotypes %>% 
      filter(variant.id %in% snp4PCA$variant.id, !Sample %in% lowCall) %>% 
      group_by(Sample) %>% 
      tally() %>% 
      mutate(imputationRate = 1 - n / nrow(snp4PCA)
      )
  ) 
pca4Plot %>%
    ggplot(aes(PC1, PC2, colour = Population, size = imputationRate)) +
    geom_point(alpha = 0.6) +
    labs(
      x = paste0("PC1 (", percent(summary(pca)$importance[2,"PC1"]), ")"),
      y = paste0("PC2 (", percent(summary(pca)$importance[2,"PC2"]), ")"),
      size = "Imputation Rate"
    ) +
  scale_colour_manual(values = popCols)

PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.

loc <- c(
  range(sampleID$Longitude, na.rm = TRUE) %>% mean,
  range(sampleID$Latitude, na.rm = TRUE) %>% mean
)
saPoly <- readRDS(file.path("..", "external", "saPoly.RDS"))
roads <- readRDS(file.path("..", "external", "roads.RDS"))
gc <- SpatialPoints(cbind(x = 138.655972, y = -31.200305))
proj4string(gc) <- "+proj=longlat +ellps=GRS80 +no_defs"
xBreaks <- seq(138.65, 138.8, by = 0.05)
xLabs <- parse(text = paste(xBreaks, "*degree ~ E", sep = ""))
yBreaks <- seq(-31.2, -31.32, by = -0.04)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
leftN <- tibble(
  x = c(138.7965, 138.8, 138.8) - 0.01,
  y = c(-31.2, -31.198, -31.193)
)
rightN <- tibble(
  x = c(138.8, 138.8, 138.8035) - 0.01,
  y = c(-31.193, -31.198, -31.2)
)

ggMap <- get_map(loc, zoom = 12, maptype = "terrain", color = "bw")
zoomPlot <- ggmap(ggMap, extent = "normal", maprange = FALSE) +
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE == "UNSE"), 
        linetype = 2, size = 0.3) + 
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE != "UNSE"), 
        linetype = 1, size = 0.4) +
    geom_label(x = 138.74, y = -31.29, label = "Flinders Ranges NP", alpha = 0.4) +
    geom_label(x = 138.72, y = -31.22, label = "Gum Creek", alpha = 0.4) +
    geom_point(aes(x, y), data = as.data.frame(gc), shape = 3, size = 3) +
    geom_text(aes(x, y), label = "HS", data = as.data.frame(gc), nudge_y = 0.005) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "HD"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.3) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "PARK"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.2) +
  geom_point(
    data= filter(pca4Plot, grepl("2012", Population)),
    aes(Longitude, Latitude, colour = Population),
    size = 0.9*ps) +
  geom_polygon(
    aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4
  ) +
  geom_polygon(
    aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4
  ) +
  geom_text(
    x = 138.79, y = -31.19, label = "N", colour = "grey10", size = 4
  ) +
  scale_colour_manual(values = popCols[2:3]) +
  coord_cartesian(
    xlim = c(138.618, 138.81),
    ylim = c(-31.335, -31.18),
    expand = 0
  ) +
  scale_x_continuous(breaks = xBreaks, labels = xLabs) +
  scale_y_continuous(breaks = yBreaks, labels = yLabs) +
  guides(colour = FALSE) +
  ggsn::scalebar(
    x.min = 138.618,
    x.max = 138.81,
    y.min = -31.335, 
    y.max = -31.18,
    transform = TRUE,
    dist = 2, 
    dist_unit = "km",
    model = 'GRS80',
    height = 0.012, 
    st.size = 4,
    location = 'bottomright',
    anchor = c(x = 138.8, y = -31.328)) +
  labs(x = "Longitude", y = "Latitude") +
  theme(
    text = element_text(size = fs),
    plot.margin = unit(c(1, 1, 1, 1), "mm")
  )

ausPolygon <- readRDS(file.path("..", "external", "ausPolygon.RDS"))
ausPts <- SpatialPoints(cbind(x = loc[1], y = loc[2]))
proj4string(ausPts) <- proj4string(ausPolygon)
ausPlot <- ggplot() +
  geom_polygon(
    data = ausPolygon, 
    aes(long, lat, group = group),
    fill = "white", colour = "black"
  ) + 
  geom_point(data = as.data.frame(ausPts), aes(x, y), size = 1.5) +
  theme_void() +
  theme(plot.background = element_rect(fill = "white", colour = "black"))

convert \
  -density 600 \
  -size 1920 \
  ../figures/Figure1.pdf \
  ../figures/Figure1.png

knitr::include_graphics(file.path("..", "figures", "Figure1.png"))

Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and k-means clustering.

zoomLoc <- c(138.753, -31.242)
xBreaks <- seq(138.74, 138.76, by = 0.01)
xLabs <- parse(text = paste(xBreaks, "*degree ~ W", sep = ""))
yBreaks <- seq(-31.235, -31.25, by = -0.005)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
central <- rbind(
  x = c(138.749, 138.755),
  y = c(-31.2365, -31.2495)
) %>%
  set_colnames(c("min", "max"))
leftN <- tibble(
  x = c(138.7595, 138.76, 138.76),
  y = c(-31.234, -31.2335, -31.2325)
)
rightN <- tibble(
  x = c(138.76, 138.76, 138.7605),
  y = c(-31.2325, -31.2335, -31.234)
)
get_map(zoomLoc, zoom = 15, maptype = "terrain", source = "google", color = "bw") %>%
  ggmap() +
  geom_jitter(
    data = filter(pca4Plot, grepl("2012", Population)), 
    aes(Longitude, Latitude, colour = Population), 
    size = 3, width = 0.0005, height = 0) +
  geom_rect(
    xmin = central["x", "min"],
    xmax = central["x", "max"],
    ymin = central["y", "min"],
    ymax = central["y", "max"],
    fill = "red", alpha = 0.01, colour = "black") +
  geom_polygon(
    aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4
  ) +
  geom_polygon(
    aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4
  ) +
  geom_text(
    x = 138.76, y = -31.232, label = "N", colour = "grey10", size = 5
  ) +
  scale_colour_manual(values = popCols[2:3]) +
  scale_x_continuous(breaks = xBreaks, labels = xLabs) +
  scale_y_continuous(breaks = yBreaks, labels = yLabs) +
  theme_bw() +
  guides(colour = FALSE) +
  labs(x = "Longitude", y = "Latitude") +
  coord_cartesian(
    xlim = c(138.74, 138.762),
    ylim = c(-31.253, -31.231),
    expand = 0
  ) +
  ggsn::scalebar(
    x.min = 138.74, x.max = 138.762, 
    y.min = -31.253, y.max = -31.231,
    transform = TRUE,
    dist = 0.25, dist_unit = "km",, model = 'GRS80', 
    height = 0.012, st.size = 4,
    location = 'bottomright',
    anchor = c(x = 138.761, y = -31.252)
    )

Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.

Region Analysis

Removal of SNPs Associated with Collection Region

The structure observed within the 2012 population in the PCA could possibly be explained by recent migration into this region. As the samples collected in the outer regions appeared very similar to the 1996 population in the above plots, this would possibly indicate migration is a very recent event as the genetic influence of this has not spread through the wider area. Although this may be due to other factors such as sampling bias, this structure was addressed by identifying SNPs which showed an association with the sub-populations identified by PCA analysis. In this way, any candidate SNPs obtained below will be less impacted by this structure, and will be more reflective of the intended variable under study, i.e. selection over time, as opposed to any internal structure of the 2012 population which may indicate migration into the breeding population.

oraRegions <- pca4Plot %>% 
  filter(grepl("2012", Population)) %>% 
  rowwise() %>%
  mutate(
    yCentral = cut(Latitude, breaks = central["y",], include.lowest = TRUE), 
    xCentral = cut(Longitude, breaks = central["x",], include.lowest = TRUE),
    Central = (is.na(yCentral) + is.na(xCentral)) == 0
  ) %>%
  dplyr::select(Sample, Central)

Testing for Structure in 2012

This model tests:
H₀: No association between genotypes and collection region
H_A: An association exists between genotypes and collection region

regionResults <- genotypes %>%
    filter(Population == 2012) %>%
    split(f = .$variant.id) %>%
    mclapply(
      function(x){
        ft <- list(p.value = NA)
        if (length(unique(x$Genotype)) > 1) {
          ft <- x %>%
            left_join(oraRegions) %>%
            group_by(Genotype, Central) %>%
            tally() %>%
            spread(Genotype, n, fill = 0) %>%
            column_to_rownames("Central") %>%
            fisher.test()
        }
        x %>%
          distinct(variant.id, snpID, chromosome, position) %>%
          mutate(p = ft$p.value)
      }, 
      mc.cores = mc
    ) %>%
  bind_rows() %>%
  filter(!is.na(p)) %>%
  arrange(p)

A total of 1682 SNPs were detected as showing a significant association between genotype and the collection region. Under H₀, the number expected using α = 0.05 would be 943, and as this number was approximately double that expected, this was taken as evidence of this being a genuine point of concern for this dataset.

Notably, Type II errors were of principle concern in this instance, and as such every SNP with p < 0.05 in the above test was excluded from downstream analysis.

regionSNPs <- filter(regionResults, p < 0.05)
saveRDS(regionSNPs, "regionSNPs.RDS")

Under this additional filtering step, the original set of 18784 SNPs will be reduced to 17,102 for testing by genotype and allele frequency.

Verification Of Removal

In order to verify that the removal of the above SNPs removed the undesired population structure from the 2012 population, the above PCA was repeated, excluding the SNPs marked for removal. The previous structure noted in the data was no longer evident, and as such, these SNPs were marked for removal during analysis by genotype and allele frequency.

pcaPost <- genotypes %>%
  filter(
    variant.id %in% snp4PCA$variant.id,
    !variant.id %in% regionSNPs$variant.id
  ) %>%
  dplyr::select(variant.id, Sample, Genotype) %>%
  spread(Sample, Genotype) %>%
  as.data.frame() %>%
  column_to_rownames("variant.id") %>%
  as.matrix() %>%
  apply(2, function(x){
    x[is.na(x)] <- mean(x, na.rm = TRUE)
    x
  }) %>%
  t() %>%
  .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
  prcomp( center = TRUE) 
pcaPost4Plot <- pcaPost$x %>%
  as.data.frame() %>%
  rownames_to_column("Sample") %>%
  as_tibble() %>%
  dplyr::select(Sample, PC1, PC2, PC3) %>%
  left_join(sampleID) %>%
  mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
  group_by(Cluster) %>%
  mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(
    Population = case_when(
      Population == 1996 ~ "1996",
      Population == 2010 ~ "Outgroup (Turretfield)",
      maxY == max(maxY) ~ "2012 (Outer)",
      maxY != max(maxY) ~ "2012 (Central)"
    )
  ) %>%
  left_join(
    genotypes %>% 
      filter(variant.id %in% snp4PCA$variant.id, !Sample %in% lowCall) %>% 
      group_by(Sample) %>% 
      tally() %>% 
      mutate(imputationRate = 1 - n / nrow(snp4PCA))
  )

convert \
-density 600 \
-size 1920 \
../figures/Figure2.pdf \
../figures/Figure2.png

knitr::include_graphics(file.path("..", "figures", "Figure2.png"))

Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs

SNP Analysis

Genotype Frequency Model

This model tests:
H₀: No association between genotypes and populations
H_A: An association exists between genotypes and populations

genotypeResults <- genotypes %>%
  filter(
    Population != 2010,
    !variant.id %in% regionSNPs$variant.id
  ) %>%
  group_by(variant.id, snpID, Population, Genotype) %>%
  tally() %>%
  ungroup() %>%
  split(f = .$variant.id) %>%
  mclapply(function(x){
    ft <- list(p.value = NA)
    if (length(unique(x$Genotype)) > 1) {
      ft <- x %>%
        spread(Genotype, n, fill = 0) %>%
        column_to_rownames("Population") %>%
        dplyr::select(-variant.id, -snpID) %>%
        fisher.test()
    }
    x %>% 
      distinct(variant.id) %>%
      mutate(p = ft$p.value)
  },
  mc.cores = mc
  ) %>%
  bind_rows() %>%
  filter(!is.na(p)) %>%
  mutate(FDR = p.adjust(p, "fdr"), adjP = p.adjust(p, "bonferroni")) %>%
  arrange(p) %>%
  left_join(
    genotypes %>%
      distinct(variant.id, snpID, chromosome, position)
  ) %>%
  dplyr::select(variant.id, snpID, chromosome, position, everything())

genoSNPs <- genotypeResults %>%
  dplyr::filter(FDR < alpha) %>%
  .[["snpID"]]

snpsGR %>% 
  subset(snpID %in% genoSNPs) %>% 
  join_nearest(ensGenes) %>%
  as.data.frame() %>%
  left_join(
    ensGenes %>% as.data.frame(),
    by = c("Name", "gene_id", "seqnames", "description"),
    suffix = c("_snp", "_gene")
  ) %>%
  as_tibble() %>%
  dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene) %>%
  mutate(dist2Gene = case_when(
    start_snp >= start_gene & start_snp <= end_gene ~ 0L,
    start_snp < start_gene ~ start_gene - start_snp,
    start_snp > end_gene ~ start_snp - end_gene
  ),
  Name = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ Name
  ),
    gene_id = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ gene_id
  ),
  inExon = snpID %in% ensExons$snpID,
  dist2Gene = case_when(
    Name == "" ~ "",
    inExon ~ "CDS",
    dist2Gene == 0 & !inExon ~ "Intronic",
    dist2Gene > 0 ~ comma(dist2Gene)
  ),
  seqnames = as.character(seqnames)
  ) %>%
  dplyr::select(
    snpID,
    chromosome = seqnames,
    position = start_snp,
    gene_id, Name, dist2Gene
  ) %>%
  bind_rows(
    subset(snpsGR, snpID %in% setdiff(genoSNPs, .$snpID)) %>%
      as.data.frame() %>%
      mutate(seqnames = as.character(seqnames)) %>%
      dplyr::select(
        snpID,
        chromosome = seqnames,
        position = start
      )
  ) %>%
  left_join(genotypeResults) %>%
  dplyr::select(-variant.id, FDR) %>%
  arrange(p) %>%
  mutate(
    position = comma(position),
    p = tidyP(p),
    FDR = tidyP(FDR),
    adjP = tidyP(adjP)
    ) %>%
  pander(
    justify = "llrlllrrr",
    style = "rmarkdown",
    big.mark = ",",
    split.tables = Inf,
    caption = paste(
      "*Candidate SNPs to an FDR of", percent_format(1)(alpha), "when analysing by genotype.",
      "The nearest gene to each SNP (within", paste0(maxKb, "kb"), ") is indicated.",
      "Bonferroni-adjusted p-values are also given in the final column.*")
  )

*Candidate SNPs to an FDR of 5% when analysing by genotype. The nearest gene to each SNP (within 40kb ) is indicated. Bonferroni-adjusted p-values are also given in the final column.*
snpID	chromosome	position	gene_id	Name	dist2Gene	p	FDR	adjP
3391201_92	GL018705	1,937,359	ENSOCUG00000016428	RCBTB2	37,689	6.10e-08	9.04e-04	0.001
686773_29	3	90,851,512	ENSOCUG00000009616	CRISPLD1	CDS	1.05e-07	9.04e-04	0.002
916731_91	4	89,602,973	ENSOCUG00000006126	RIC8B	Intronic	1.87e-07	9.84e-04	0.003
836950_151	4	35,111,855	ENSOCUG00000012144	TMPRSS12	5,745	2.59e-07	9.84e-04	0.004
321186_117	2	27,995,273				2.87e-07	9.84e-04	0.005
2227006_109	13	129,650,867	ENSOCUG00000022823	ID3	16,426	4.42e-07	0.001	0.008
3040789_114	19	30,689,426	ENSOCUG00000010440	MSI2	Intronic	4.84e-07	0.001	0.008
4165193_106	GL019119	147,630				4.90e-07	0.001	0.008
4223149_99	GL019332	5,745				6.12e-07	0.001	0.011
1089925_112	7	35,584,375				9.80e-07	0.002	0.017
464226_128	2	125,603,458				1.01e-06	0.002	0.017
3374849_109	GL018704	1,294,009				1.39e-06	0.002	0.024
3695415_108	GL018751	737,318	ENSOCUG00000003672	TRAF3	Intronic	1.84e-06	0.002	0.032
2099634_108	13	45,086,624				2.03e-06	0.002	0.035
1902686_98	12	60,113,885	ENSOCUG00000001871		12,820	2.09e-06	0.002	0.036
644062_1	3	56,309,981	ENSOCUG00000004827	SFXN1	Intronic	3.51e-06	0.004	0.060
4149102_107	GL019077	34,265	ENSOCUG00000024268	IFT140	Intronic	4.82e-06	0.005	0.083
87917_119	1	64,635,286	ENSOCUG00000007758	CEP78	Intronic	5.49e-06	0.005	0.094
2689075_49	16	56,527,308	ENSOCUG00000000984	ESRRG	Intronic	5.62e-06	0.005	0.096
4010189_98	GL018907	283,766	ENSOCUG00000008870	BRWD1	Intronic	5.93e-06	0.005	0.102
4044777_119	GL018933	204,058	ENSOCUG00000005859	FNBP1	Intronic	7.75e-06	0.006	0.133
4146664_90	GL019084	74,038	ENSOCUG00000010071	GNB1	Intronic	1.84e-05	0.014	0.316
3016258_99	19	19,178,148	ENSOCUG00000015254	SEZ6	Intronic	2.17e-05	0.016	0.372
4176850_23	GL019154	883	ENSOCUG00000002484	MTHFSD	828	2.18e-05	0.016	0.374
4098854_101	GL018985	129,495	ENSOCUG00000006396		Intronic	3.09e-05	0.021	0.530
2465104_19	14	157,742,593				3.67e-05	0.023	0.629
3032107_92	19	26,020,372	ENSOCUG00000024664	LHX1	20,607	3.71e-05	0.023	0.637
2906428_79	18	25,311,176	ENSOCUG00000011533	RHOBTB1	24	3.76e-05	0.023	0.646
3829050_109	GL018791	957,960	ENSOCUG00000007849	FHAD1	Intronic	4.01e-05	0.023	0.687
1009106_1	6	9,037,073	ENSOCUG00000027083	CLEC19A	15,180	4.15e-05	0.023	0.712
482617_100	2	139,723,776	ENSOCUG00000001127	RHOQ	8,346	4.34e-05	0.023	0.744
3378006_1	GL018704	2,271,135	ENSOCUG00000006255	AGO3	9,610	4.45e-05	0.023	0.764
2580149_42	15	87,473,844	ENSOCUG00000001910	ADGRL3	Intronic	4.52e-05	0.023	0.775
1685466_9	10	42,335,759				6.43e-05	0.032	1.000
4136275_162	GL019049	178,272	ENSOCUG00000026644		Intronic	9.11e-05	0.045	1.000
2028473_73	13	2,979,383				9.62e-05	0.045	1.000
1839042_17	12	15,448,280				9.72e-05	0.045	1.000
4208471_113	GL019274	63,350	ENSOCUG00000015189		37,542	1.01e-04	0.045	1.000

Under the full genotype model:

15 genotypes were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05 (using the Bonferroni adjustment)
38 genotypes were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05
For the most highly ranked SNP (3391201_92), the minor allele has been completely lost in the 2012 population

convert \
-density 600 \
-size 1920 \
../figures/FigureS3a.pdf \
../figures/FigureS3a.png

knitr::include_graphics(file.path("..", "figures", "FigureS3a.png"))

$*Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with a Bonferroni-adjusted p-value < 0,05 additionally shown in green*$

Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with a Bonferroni-adjusted p-value < 0,05 additionally shown in green

genotypeResults %>% 
  filter(FDR < alpha) %>% 
  left_join(
    filter(
      genotypes, 
      snpID %in% .$snpID, 
      Population %in% c(1996, 2012)
    )
  ) %>% 
  mutate(
    Genotype = c("AA", "AB", "BB")[Genotype + 1],
    Genotype = as.factor(Genotype),
    Population = as.factor(Population),
    snpID = case_when(
      adjP < alpha ~ paste0(snpID, "*"),
      adjP >= alpha ~ as.character(snpID)
    ),
    snpID = factor(snpID, levels = unique(snpID))
  ) %>%
  group_by(snpID, Population, Genotype) %>% 
  tally() %>%
  ungroup() %>% 
  group_by(snpID, Population) %>%
  mutate(freq = n / sum(n)) %>%
  ggplot(aes(Genotype, freq, fill = Population)) + 
  geom_bar(
    stat = "identity",
    position = position_dodge2(preserve = "single")
  ) + 
  facet_wrap(~snpID, drop = FALSE, ncol = 7) + 
  scale_fill_manual(values = c("black", "blue")) +
  labs(y = "Frequency") + 
  theme(
    legend.direction = "horizontal", 
    legend.position = c(0.75, 0.05)
  )

Variants detected as significant to an FDR of 5%. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance, with some variants becoming exclusively heterozygous in the 2012 population.

write_tsv(genotypeResults, "genotypeResults.tsv")

Allele Frequency Model

This model tests:
H₀: No association between allele frequencies and populations
H_A: An association exists between allele frequencies and populations

alleleResults <- genotypes %>%
  filter(
    Population != 2010,
    !variant.id %in% regionSNPs$variant.id
  ) %>%
  group_by(snpID, Population) %>%
  summarise(
    P = sum(2 - Genotype),
    Q = sum(Genotype)
  ) %>%
  ungroup() %>%
  split(f = .$snpID) %>%
  mclapply(function(x){
    m <- as.matrix(x[c("P", "Q")])
    ft <- list(p.value = NA) 
    if (length(m) == 4) {
      ft <- fisher.test(m)
    }
    x %>%
      mutate(MAF = Q / (P + Q)) %>%
      dplyr::select(snpID, Population, MAF) %>%
      spread(Population, MAF) %>%
      mutate(p = ft$p.value)
  },mc.cores = mc) %>%
  bind_rows() %>%
  filter(!is.na(p)) %>%
  dplyr::rename(
    MAF_1996 = `1996`,
    MAF_2012 = `2012`
  ) %>%
  mutate(
    FDR = p.adjust(p, "fdr"),
    adjP = p.adjust(p, "bonferroni"),
    # Make the MAF is relative to the 1996 population not the reference
    MAF_2012 = case_when(
      MAF_1996 > 0.5 ~ 1 - MAF_2012,
      MAF_1996 <= 0.5 ~ MAF_2012
    ),
    MAF_1996 = case_when(
      MAF_1996 > 0.5 ~ 1 - MAF_1996,
      MAF_1996 <= 0.5 ~ MAF_1996
    )
  ) %>%
  arrange(p) %>%
  left_join(
    genotypes %>% distinct(snpID, chromosome, position)
  ) %>%
  dplyr::select(snpID, chromosome, position, everything())

alleleSnps <- alleleResults %>%
  dplyr::filter(FDR < alpha) %>%
  .[["snpID"]]

snpsGR %>% 
  subset(snpID %in% alleleSnps) %>% 
  join_nearest(ensGenes) %>%
  as.data.frame() %>%
  left_join(
    ensGenes %>% as.data.frame(),
    by = c("Name", "gene_id", "seqnames", "description"),
    suffix = c("_snp", "_gene")
  ) %>%
  as_tibble() %>%
  dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene) %>%
  mutate(dist2Gene = case_when(
    start_snp >= start_gene & start_snp <= end_gene ~ 0L,
    start_snp < start_gene ~ start_gene - start_snp,
    start_snp > end_gene ~ start_snp - end_gene
  ),
  Name = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ Name
  ),
    gene_id = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ gene_id
  ),
  inExon = snpID %in% ensExons$snpID,
  dist2Gene = case_when(
    Name == "" ~ "",
    inExon ~ "CDS",
    dist2Gene == 0 & !inExon ~ "Intronic",
    dist2Gene > 0 ~ comma(dist2Gene)
  ),
  seqnames = as.character(seqnames)
  ) %>%
  dplyr::select(
    snpID,
    chromosome = seqnames,
    position = start_snp,
    gene_id, Name, dist2Gene
  ) %>%
  left_join(alleleResults) %>%
  dplyr::select(
    snpID, chromosome, position, starts_with("MAF"),
    gene_id, Name, dist2Gene, p, FDR, adjP
  ) %>%
  arrange(p) %>%
  mutate(
    position = comma(position),
    MAF_1996 = sprintf("%.3f", MAF_1996),
    MAF_2012 = sprintf("%.3f", MAF_2012),
    p = tidyP(p),
    FDR = tidyP(FDR),
    adjP = tidyP(adjP)
  ) %>%
  pander(
    justify = "llrrrlllrrr",
    style = "rmarkdown",
     big.mark = ",",
    split.tables = Inf,
      caption = paste(
        "*SNPs considered as significant when analysing by genotype using an FDR cutoff of", 
        percent_format(1)(alpha),
        "Minor allele frequencies are reported based on frequencies in the 1996 population.",
        "Any genes within", paste0(maxKb, "kb"), "are also shown.",
        "Bonferroni-adjusted p-values are given in the final column.*"
      )
  )

SNPs considered as significant when analysing by genotype using an FDR cutoff of 5% Minor allele frequencies are reported based on frequencies in the 1996 population. Any genes within 40kb are also shown. Bonferroni-adjusted p-values are given in the final column.
snpID	chromosome	position	MAF_1996	MAF_2012	gene_id	Name	dist2Gene	p	FDR	adjP
3391201_92	GL018705	1,937,359	0.225	0.010	ENSOCUG00000016428	RCBTB2	37,689	3.44e-07	0.006	0.006
916731_91	4	89,602,973	0.196	0.000	ENSOCUG00000006126	RIC8B	Intronic	6.88e-07	0.006	0.012
3040789_114	19	30,689,426	0.186	0.000	ENSOCUG00000010440	MSI2	Intronic	1.53e-06	0.008	0.026
321186_117	2	27,995,273	0.250	0.012				1.87e-06	0.008	0.032
1009106_1	6	9,037,073	0.443	0.128	ENSOCUG00000027083	CLEC19A	15,180	2.52e-06	0.008	0.043
836950_151	4	35,111,855	0.291	0.041	ENSOCUG00000012144	TMPRSS12	5,745	3.09e-06	0.008	0.053
464226_128	2	125,603,458	0.191	0.000				3.11e-06	0.008	0.053
2227006_109	13	129,650,867	0.271	0.033	ENSOCUG00000022823	ID3	16,426	4.17e-06	0.008	0.072
1902686_98	12	60,113,885	0.163	0.000	ENSOCUG00000001871		12,820	4.75e-06	0.008	0.082
3374849_109	GL018704	1,294,009	0.207	0.010				4.87e-06	0.008	0.084
4044777_119	GL018933	204,058	0.018	0.202	ENSOCUG00000005859	FNBP1	Intronic	7.75e-06	0.011	0.133
1089925_112	7	35,584,375	0.284	0.043				9.72e-06	0.013	0.167
3695415_108	GL018751	737,318	0.261	0.033	ENSOCUG00000003672	TRAF3	Intronic	1.28e-05	0.014	0.219
2099634_108	13	45,086,624	0.261	0.034				1.41e-05	0.014	0.243
87917_119	1	64,635,286	0.186	0.000	ENSOCUG00000007758	CEP78	Intronic	1.43e-05	0.014	0.246
4010189_98	GL018907	283,766	0.193	0.010	ENSOCUG00000008870	BRWD1	Intronic	1.63e-05	0.015	0.280
2689075_49	16	56,527,308	0.058	0.304	ENSOCUG00000000984	ESRRG	Intronic	1.65e-05	0.015	0.284
4149102_107	GL019077	34,265	0.223	0.029	ENSOCUG00000024268	IFT140	Intronic	2.28e-05	0.020	0.392
1997286_93	12	141,844,290	0.089	0.356	ENSOCUG00000004829	ESR1	Intronic	2.44e-05	0.020	0.419
2906428_79	18	25,311,176	0.173	0.443	ENSOCUG00000011533	RHOBTB1	24	2.54e-05	0.020	0.437
2405359_66	14	97,813,263	0.021	0.202	ENSOCUG00000005835	STXBP5L	Intronic	4.48e-05	0.033	0.770
3000704_65	19	13,055,154	0.032	0.234				5.42e-05	0.038	0.931
4146664_90	GL019084	74,038	0.196	0.020	ENSOCUG00000010071	GNB1	Intronic	5.49e-05	0.038	0.944
4098854_101	GL018985	129,495	0.170	0.010	ENSOCUG00000006396		Intronic	6.69e-05	0.043	1.000
4098838_14	GL018985	123,328	0.364	0.127	ENSOCUG00000006396		Intronic	6.80e-05	0.043	1.000
1882774_17	12	40,509,779	0.321	0.594				7.29e-05	0.043	1.000
1045185_75	7	2,702,919	0.214	0.038	ENSOCUG00000027861		3,276	7.81e-05	0.043	1.000
3016258_99	19	19,178,148	0.220	0.033	ENSOCUG00000015254	SEZ6	Intronic	7.86e-05	0.043	1.000
2028473_73	13	2,979,383	0.018	0.173				8.05e-05	0.043	1.000
3829050_109	GL018791	957,960	0.163	0.010	ENSOCUG00000007849	FHAD1	Intronic	8.09e-05	0.043	1.000
2580149_42	15	87,473,844	0.351	0.115	ENSOCUG00000001910	ADGRL3	Intronic	8.42e-05	0.043	1.000
2527760_104	15	43,156,117	0.153	0.406				8.59e-05	0.043	1.000
3999128_75	GL018883	283,912	0.365	0.125	ENSOCUG00000016318	SERPINA6	Intronic	8.68e-05	0.043	1.000
4176850_23	GL019154	883	0.234	0.041	ENSOCUG00000002484	MTHFSD	828	8.92e-05	0.043	1.000
686773_29	3	90,851,512	0.438	0.167	ENSOCUG00000009616	CRISPLD1	CDS	9.73e-05	0.045	1.000
3032107_92	19	26,020,372	0.194	0.021	ENSOCUG00000024664	LHX1	20,607	1.02e-04	0.046	1.000

Under this model:

5 SNP alleles were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05.
38 SNP alleles were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05

convert \
-density 600 \
-size 1920 \
../figures/FigureS3b.pdf \
../figures/FigureS3b.png

knitr::include_graphics(file.path("..", "figures", "FigureS3b.png"))

Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.

alleleResults %>%
    mutate(Sig = FDR < alpha) %>%
    ggplot(aes(MAF_1996, MAF_2012, colour = Sig, label = snpID)) +
    geom_point() +
    geom_text_repel(data = . %>% filter(adjP < alpha)) +
    # geom_text_repel(data = . %>%filter(FDR < alpha & MAF_2012 > MAF_1996)) +
    scale_colour_manual(values = c(rgb(0.5, 0.5, 0.5, 0.6), "red")) +
    labs(x = "MAF (1996)", y = "MAF (2012)") +
    theme(legend.position = "none")

Comparison of minor allele frequencies in both populations. SNPs with an FDR-adjusted p-value < 0.05 are coloured red, whilst those with a Bonferroni-adjusted p-value are additionally labelled.

alleleResults %>% 
  filter(FDR < alpha) %>% 
  gather(Population, B, contains("MAF")) %>%
  mutate(
    Population = str_remove(Population, "MAF_"),
    A = 1 - B,
    Population = as.factor(Population),
    snpID = case_when(
      adjP < alpha ~ paste0(snpID, "*"),
      adjP >= alpha ~ as.character(snpID)
    ),
    snpID = factor(snpID, levels = unique(snpID))
  ) %>% 
  gather(Allele, Freq, A, B) %>%
  ggplot(aes(Allele, y = Freq, fill = Population)) + 
  geom_bar(
    position = position_dodge2(preserve = "single"),
    stat = "identity"
  ) + 
  facet_wrap(~snpID, drop = FALSE, ncol = 7) + 
  scale_fill_manual(values = c("black", "blue")) +
  labs(y = "Frequency") + 
  theme(
    legend.direction = "horizontal", 
    legend.position = c(0.75, 0.05)
  )

Variants detected as significant to an FDR of 5% under the Allele Frequency model. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance.

write_tsv(alleleResults, "alleleResults.tsv")

Analysis Using the FLK model

This was performed separately and no unique SNPs were detected in addition to any detected above.

Export of Data for Bayescan

A VCF was required with only the 1996 and 2012 populations, and restricted to the candidate SNPs after pruning for linkage disequilibrium and detection of the allele in the 1996 population.

var2VCF <- genotypes %>% 
  distinct(variant.id, snpID) %>%
  filter(snpID %in% filter(regionResults, p > 0.05)$snpID) %>% 
  .[["variant.id"]]

gdsFile <- seqOpen(gdsPath, readonly = TRUE)
seqSetFilter(
  gdsFile, 
  variant.id = var2VCF,
  sample.id = sampleID %>% 
    filter(Population %in% c(1996, 2012)) %>% 
    .[["Sample"]]
)
seqGDS2VCF(gdsFile, "../5_stacks/vcf/filtered.vcf.gz")
seqResetFilter(gdsFile)
seqClose(gdsFile)

Inclusion of Bayescan Results

After running Bayescan, results were imported. Unfortunately, these results had IDs which were taken from the line in the above VCF and need to be converted back to IDs compatible with previous analysis.

bayRes <- file.path("..", "external", "BayescanOut.txt.gz") %>%
  gzfile() %>%
  read_delim(delim = " ", col_names = FALSE, skip = 1, col_types = "innnnn-") %>%
  set_colnames(c("ID", "prob","log10.PO.","qval","alpha","fst")) %>%
  mutate(variant.id = var2VCF[ID])

bayRes %>%
  dplyr::select(variant.id, qval, fst) %>%
  dplyr::filter(qval < alpha) %>%
  left_join(
    genotypes %>% 
      distinct(snpID, .keep_all = TRUE) %>% 
      dplyr::select(variant.id, chromosome, position, snpID)
  ) %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    ignore.strand = TRUE,
    seqinfo = seqinfo(snp2Gene), 
    seqnames.field = "chromosome",
    start.field = "position",
    end.field = "position"
  ) %>%
  join_nearest(ensGenes) %>%
  as.data.frame() %>%
  left_join(
    ensGenes %>% as.data.frame(),
    by = c("Name", "gene_id", "seqnames", "description"),
    suffix = c("_snp", "_gene")
  ) %>%
  as_tibble() %>%
  dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene, qval, fst) %>%
  mutate(dist2Gene = case_when(
    start_snp >= start_gene & start_snp <= end_gene ~ 0L,
    start_snp < start_gene ~ start_gene - start_snp,
    start_snp > end_gene ~ start_snp - end_gene
  ),
  Name = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ Name
  ),
    gene_id = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ gene_id
  ),
  inExon = snpID %in% ensExons$snpID,
  dist2Gene = case_when(
    Name == "" ~ "",
    inExon ~ "CDS",
    dist2Gene == 0 & !inExon ~ "Intronic",
    dist2Gene > 0 ~ comma(dist2Gene)
  ),
  seqnames = as.character(seqnames)
  ) %>%
  dplyr::select(
    snpID, 
    chromosome = seqnames, position = start_snp, 
    gene_id, Name, dist2Gene, fst, qval) %>%
  # Replace the one dropped by `join_nearest()`
  bind_rows(
    bayRes %>%
      dplyr::select(variant.id, qval, fst) %>%
      dplyr::filter(qval < alpha) %>%
      left_join(
        genotypes %>% 
          distinct(snpID, .keep_all = TRUE) %>% 
          dplyr::select(variant.id, chromosome, position, snpID)
      ) %>%
      mutate(
        gene_id = "",
        Name = "",
        dist2Gene = ""
      ) %>%
      dplyr::select(-variant.id)
  ) %>%
  distinct(snpID, .keep_all = TRUE) %>%
  mutate(position = comma(position)) %>%
  arrange(qval) %>%
  mutate(
    fst = tidyP(fst),
    qval = tidyP(qval)
  ) %>%
  pander(
    justify = "llrllrrr",
    style = "rmarkdown",
     big.mark = ",",
    split.tables = Inf,
      caption = paste(
        "*Results from Bayescan with a q-value <", paste0(alpha, "."),
        "The nearest gene within", paste0(maxKb, "kb"), "is also given.",
        "One SNP detected by Bayescan was located on a scaffold with no genes (GL019119).*"
      )
  )

*Results from Bayescan with a q-value < 0.05. The nearest gene within 40kb is also given. One SNP detected by Bayescan was located on a scaffold with no genes (GL019119).*
snpID	chromosome	position	gene_id	Name	dist2Gene	fst	qval
3391201_92	GL018705	1,937,359	ENSOCUG00000016428	RCBTB2	37,689	0.043	0.003
321186_117	2	27,995,273				0.042	0.008
1009106_1	6	9,037,073	ENSOCUG00000027083	CLEC19A	15,180	0.036	0.013
836950_151	4	35,111,855	ENSOCUG00000012144	TMPRSS12	5,745	0.035	0.018
2227006_109	13	129,650,867	ENSOCUG00000022823	ID3	16,426	0.035	0.023
3374849_109	GL018704	1,294,009				0.037	0.026
4044777_119	GL018933	204,058	ENSOCUG00000005859	FNBP1	Intronic	0.033	0.031
4165193_106	GL019119	147,630				0.034	0.034
3695415_108	GL018751	737,318	ENSOCUG00000003672	TRAF3	Intronic	0.033	0.038
4010189_98	GL018907	283,766	ENSOCUG00000008870	BRWD1	Intronic	0.033	0.044
1089925_112	7	35,584,375				0.032	0.049

grid.newpage()
list(
  Genotype = dplyr::filter(genotypeResults, FDR < alpha)$variant.id,
  Allele = alleleResults %>%
    left_join(snps) %>%
    dplyr::filter(FDR < alpha) %>%
    .[["variant.id"]],
  Bayescan = dplyr::filter(bayRes, qval < 0.05)$variant.id
) %>% 
 VennDiagram::venn.diagram(
   filename = NULL,
   cex = 2,
   cat.cex = 2
   ) %>%
  grid.draw()

Summary of results from all three methods, using an FDR of 0.05 for each approach. All SNPs detected under Bayescan were identified using Fisher’s Exact Test for both allele and genotype analysis

sigSnps <- c(
  filter(alleleResults, FDR < alpha)$snpID,
  filter(genotypeResults, FDR < alpha)$snpID
) %>%
  unique()

suppressWarnings(
  snpsGR %>%
    subset(snpID %in% sigSnps) %>%
    resize(width = 1000*maxKb*2, fix = "center") %>%
    trim() %>%
    find_overlaps(ensGenes) %>%
    as.data.frame() %>%
    as_tibble() %>%
    left_join(snps, by = c("variant.id", "snpID", "Col")) %>%
    dplyr::select(snpID, gene_id, Name, chromosome, position) %>%
    left_join(
      as.data.frame(ensGenes), by = c("gene_id", "Name")
    ) %>%
    mutate(
      distance = case_when(
        position < start ~ comma(start - position, accuracy = 1),
        position > end ~ comma(position - end, accuracy = 1),
        snpID %in% ensExons$snpID ~ "Exonic",
        position >= start & position <= end ~ "Intronic"
      ),
      model = ifelse(
        snpID %in% filter(alleleResults, FDR < alpha)$snpID, "A", ""
        ),
      model = ifelse(
         snpID %in% dplyr::filter(snps, variant.id %in% dplyr::filter(bayRes, qval < 0.05)$variant.id)$snpID,
         paste0(model, "B"), model
      ),
      model = ifelse(
        snpID %in% filter(genotypeResults, FDR < alpha)$snpID,
        paste0(model, "G"), model
      ),
      width = comma(width, accuracy = 1)
    ) %>%
    unite(location, chromosome, position, sep = ":") %>%
    dplyr::select(
      `SNP Location` = location, 
      `SNP ID` = snpID, 
      `Gene ID` = gene_id, 
      `Gene Name` = Name, 
      `Gene Width` = width, 
      `Distance` = distance, 
      Models = model
    )
) %>% 
  pander(
    justify = "llllrll",
    style = "rmarkdown",
    split.tables = Inf,
    # big.mark = ",",
    caption = paste0(
      "*All putatively significant SNPs with genes located within ", maxKb, "kb. ",
      "The distance from SNP to gene is indicated, unless the SNP is intronic/exonic. ",
      "The model(s) which identified the SNP are indicated in the final column ",
      "as (A) the Allele Model; (B) Bayescan and (G) the Genotype Model.*"
    )
  )

All putatively significant SNPs with genes located within 40kb. The distance from SNP to gene is indicated, unless the SNP is intronic/exonic. The model(s) which identified the SNP are indicated in the final column as (A) the Allele Model; (B) Bayescan and (G) the Genotype Model.
SNP Location	SNP ID	Gene ID	Gene Name	Gene Width	Distance	Models
1:64635286	87917_119	ENSOCUG00000007758	CEP78	51,885	Intronic	AG
2:139723776	482617_100	ENSOCUG00000001127	RHOQ	31,670	8,346	G
2:139723776	482617_100	ENSOCUG00000021835	ATP6V1E2	681	10,081	G
2:139723776	482617_100	ENSOCUG00000028184	TMEM247	4,917	37,653	G
3:56309981	644062_1	ENSOCUG00000004827	SFXN1	26,726	Intronic	G
3:90851512	686773_29	ENSOCUG00000009616	CRISPLD1	51,813	Exonic	AG
4:35111855	836950_151	ENSOCUG00000012144	TMPRSS12	40,859	5,745	ABG
4:89602973	916731_91	ENSOCUG00000006126	RIC8B	123,296	Intronic	AG
6:9037073	1009106_1	ENSOCUG00000027083	CLEC19A	20,430	15,180	ABG
7:2702919	1045185_75	ENSOCUG00000007975	ZNF777	32,875	12,875	A
7:2702919	1045185_75	ENSOCUG00000027861		1,123	3,276	A
12:60113885	1902686_98	ENSOCUG00000016906		103,853	13,295	AG
12:60113885	1902686_98	ENSOCUG00000001871		1,594	12,820	AG
12:60113885	1902686_98	ENSOCUG00000021694		1,144	18,149	AG
12:60113885	1902686_98	ENSOCUG00000023798		2,496	35,410	AG
12:141844290	1997286_93	ENSOCUG00000004829	ESR1	474,528	Intronic	A
13:129650867	2227006_109	ENSOCUG00000022823	ID3	528	16,426	ABG
14:97813263	2405359_66	ENSOCUG00000005835	STXBP5L	418,595	Intronic	A
15:87473844	2580149_42	ENSOCUG00000001910	ADGRL3	571,686	Intronic	AG
16:56527308	2689075_49	ENSOCUG00000000984	ESRRG	238,366	Intronic	AG
18:25311176	2906428_79	ENSOCUG00000011533	RHOBTB1	116,987	24	AG
19:19178148	3016258_99	ENSOCUG00000015254	SEZ6	46,097	Intronic	AG
19:26020372	3032107_92	ENSOCUG00000014434	AATF	127,766	30,904	AG
19:26020372	3032107_92	ENSOCUG00000024664	LHX1	6,982	20,607	AG
19:30689426	3040789_114	ENSOCUG00000010440	MSI2	396,928	Intronic	AG
GL018704:2271135	3378006_1	ENSOCUG00000011042	COL8A2	3,187	34,195	G
GL018704:2271135	3378006_1	ENSOCUG00000011038	ADPRHL2	5,946	24,425	G
GL018704:2271135	3378006_1	ENSOCUG00000029722	TEKT2	3,671	20,166	G
GL018704:2271135	3378006_1	ENSOCUG00000027798		1,002	16,486	G
GL018704:2271135	3378006_1	ENSOCUG00000006255	AGO3	128,900	9,610	G
GL018705:1937359	3391201_92	ENSOCUG00000016428	RCBTB2	37,081	37,689	ABG
GL018751:737318	3695415_108	ENSOCUG00000010323	CDC42BPB	107,021	5,698	ABG
GL018751:737318	3695415_108	ENSOCUG00000003672	TRAF3	48,133	Intronic	ABG
GL018791:957960	3829050_109	ENSOCUG00000007849	FHAD1	123,752	Intronic	AG
GL018883:283912	3999128_75	ENSOCUG00000016273	PPP4R4	96,327	21,034	A
GL018883:283912	3999128_75	ENSOCUG00000016318	SERPINA6	21,391	Intronic	A
GL018907:283766	4010189_98	ENSOCUG00000008870	BRWD1	121,750	Intronic	ABG
GL018933:204058	4044777_119	ENSOCUG00000005859	FNBP1	73,115	Intronic	ABG
GL018985:123328	4098838_14	ENSOCUG00000006403	CLDN5	657	14,238	A
GL018985:123328	4098838_14	ENSOCUG00000006396		38,624	Intronic	A
GL018985:123328	4098838_14	ENSOCUG00000025709	UFD1	18,611	28,184	A
GL018985:129495	4098854_101	ENSOCUG00000006403	CLDN5	657	20,405	AG
GL018985:129495	4098854_101	ENSOCUG00000006396		38,624	Intronic	AG
GL018985:129495	4098854_101	ENSOCUG00000025709	UFD1	18,611	22,017	AG
GL019049:178272	4136275_162	ENSOCUG00000009045	ASS1	36,901	36,520	G
GL019049:178272	4136275_162	ENSOCUG00000017433		8,159	11,078	G
GL019049:178272	4136275_162	ENSOCUG00000026644		29,389	Intronic	G
GL019084:74038	4146664_90	ENSOCUG00000010071	GNB1	40,037	Intronic	AG
GL019084:74038	4146664_90	ENSOCUG00000021568	NADK	14,270	12,587	AG
GL019084:74038	4146664_90	ENSOCUG00000007721		37,538	30,580	AG
GL019077:34265	4149102_107	ENSOCUG00000027299	PTX4	3,616	19,359	AG
GL019077:34265	4149102_107	ENSOCUG00000000521	TELO2	11,447	6,323	AG
GL019077:34265	4149102_107	ENSOCUG00000024268	IFT140	81,557	Intronic	AG
GL019077:34265	4149102_107	ENSOCUG00000001136	TMEM204	19,558	13,406	AG
GL019154:883	4176850_23	ENSOCUG00000002484	MTHFSD	13,513	828	AG
GL019154:883	4176850_23	ENSOCUG00000001701	FOXF1	2,195	19,026	AG
GL019274:63350	4208471_113	ENSOCUG00000015189		321	37,542	G

Enrichment testing

fullPaths <- list(
  toTable(GOBPANCESTOR) %>% set_names(c("go_id", "ancestor")),
  toTable(GOCCANCESTOR) %>% set_names(c("go_id", "ancestor")),
  toTable(GOMFANCESTOR) %>% set_names(c("go_id", "ancestor"))
) %>% 
  bind_rows() %>%
  dplyr::filter(!ancestor %in% c("all", "GO:0008150", "GO:0003674", "GO:0005575")) %>%
  as_tibble() %>%
  bind_rows(
    tibble(
      go_id = unique(.$go_id),
      ancestor = unique(.$go_id)
    )
  ) %>%
  arrange(go_id, ancestor)

goSummaries <- url("https://uofabioinformaticshub.github.io/summaries2GO/data/goSummaries.RDS") %>%
  readRDS() %>%
  dplyr::rename(go_id = id)

GO Analysis By Gene

For the initial GO analysis, any gene within 40kb of a SNP considered as significant were tested for any enrichment of biological characteristics.

allSnpsGR <- snps %>%
  dplyr::select(chromosome, position, snpID) %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    seqnames.field = "chromosome", 
    start.field = "position", 
    end.field = "position", 
    seqinfo = seqinfo(ensGenes)) %>%
  join_overlap_inner(
    ensGenes,
    maxgap = maxKb*1000
  )

sigSnpGR <- allSnpsGR %>%
  subset(snpID %in% sigSnps)

nGenes <- allSnpsGR$gene_id %>%
  unique() %>%
  length()
nSigGenes <- sigSnpGR$gene_id %>%
  unique() %>%
  length()

10821 genes were identified as being within 40kb of a SNP. For the 32 candidate SNPs within this distance of a gene, 54 genes were identified, and these were analysed for enrichment of any GO terms.

Mappings from genes to GO terms were downloaded manually from the biomart server at www.ensembl.org.

bm <- file.path("..", "external", "gene2GO_biomart.tsv.gz") %>%
  gzfile() %>%
  read_tsv() %>%
  dplyr::filter(!is.na(`GO domain`)) %>%
  dplyr::rename(gene_id = `Gene stable ID`,
         ontology = `GO domain`,
         go_id = `GO term accession`,
         go_term = `GO term name`) %>%
  dplyr::filter(gene_id %in% allSnpsGR$gene_id) %>%
  left_join(fullPaths) %>%
  dplyr::select(gene_id, go_id = ancestor) %>%
  distinct(gene_id, go_id) %>%
  dplyr::filter(!is.na(go_id))

Complete paths of GO terms back to ontology roots were obtained for each gene. The minimum number of steps back to the ontology roots were also obtained for each GO ID.

goRes <- bm %>%
  mutate(inSig = gene_id %in% sigSnpGR$gene_id) %>%
  group_by(go_id) %>%
  summarise(N = dplyr::n(), 
            nSig = sum(inSig)) %>%
  left_join(goSummaries) %>%
  filter(nSig > 0, shortest_path > 2) %>%
  distinct(go_id, .keep_all = TRUE) %>%
  split(f = .$go_id) %>%
  mclapply(function(x){
    ft <- matrix(
      c(nSigGenes - x$nSig, x$nSig, nGenes - nSigGenes - x$N, x$N),
      nrow = 2, byrow = TRUE) %>%
      fisher.test()
    mutate(x, p = ft$p.value)
  }, mc.cores = mc) %>%
  bind_rows() %>%
  mutate(Expected = nSigGenes * N / nGenes,
         FDR = p.adjust(p, "fdr"),
         adjP = p.adjust(p, "bonferroni"),
         term = Term(go_id),
         ontology = Ontology(go_id)) %>%
  dplyr::select(go_id, term, ontology, N, nSig, Expected, p, adjP, FDR) %>%
  arrange(p)

goRes %>%
  dplyr::filter(FDR == min(FDR), nSig > 1) %>%
  mutate(
    Term = Term(go_id),
    Genes  = vapply(go_id, function(x){
      subset(sigSnpGR, gene_id %in% I(
        dplyr::filter(bm, go_id == x) %>%
          .[["gene_id"]]))$Name %>%
        sort() %>%
        unique() %>%
        paste(collapse = ", ")
    }, character(1))
  ) %>%
  dplyr::select(ID = go_id, Ontology = ontology, Term, Total = N, Sig = nSig, Expected, p, FDR, Genes) %>%
  mutate(
    ID = gsub(":", "\\\\:", ID),
    Expected = tidyP(Expected),
    p = tidyP(p),
    FDR = round(FDR, 3)
  ) %>%
  distinct(Ontology, Genes, .keep_all = TRUE) %>%
  pander(
    split.tables = Inf, style = "rmarkdown", 
    justify = "lllrrrrrl",
    caption = paste(
      "*Most highly ranked GO terms.", 
      "Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown.",
      "Overall, this corresponds to an FDR of", 
      percent(max(.$FDR)), ".*"
    )
  )

*Most highly ranked GO terms. Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown. Overall, this corresponds to an FDR of 22% .*
ID	Ontology	Term	Total	Sig	Expected	p	FDR	Genes
GO:0005496	MF	steroid binding	34	3	0.170	8.10e-04	0.216	ESR1, ESRRG, SERPINA6
GO:0060068	BP	vagina development	7	2	0.035	8.60e-04	0.216	ESR1, LHX1
GO:0001655	BP	urogenital system development	146	5	0.729	9.03e-04	0.216	ESR1, FOXF1, ID3, IFT140, LHX1
GO:0072189	BP	ureter development	10	2	0.050	0.002	0.216	FOXF1, LHX1
GO:0002221	BP	pattern recognition receptor signaling pathway	51	3	0.255	0.002	0.216	ESR1, TRAF3, UFD1
GO:0060041	BP	retina development in camera-type eye	56	3	0.279	0.003	0.216	GNB1, IFT140, LHX1
GO:0043010	BP	camera-type eye development	122	4	0.609	0.004	0.216	COL8A2, GNB1, IFT140, LHX1
GO:0021680	BP	cerebellar Purkinje cell layer development	16	2	0.080	0.004	0.216	LHX1, SEZ6
GO:0072001	BP	renal system development	132	4	0.659	0.005	0.216	FOXF1, ID3, IFT140, LHX1
GO:0003727	MF	single-stranded RNA binding	19	2	0.095	0.005	0.216	AGO3, MSI2
GO:0031050	BP	dsRNA processing	19	2	0.095	0.005	0.216	AGO3, ESR1
GO:0004879	MF	nuclear receptor activity	20	2	0.100	0.005	0.216	ESR1, ESRRG
GO:0050688	BP	regulation of defense response to virus	20	2	0.100	0.005	0.216	TRAF3, UFD1
GO:0001750	CC	photoreceptor outer segment	23	2	0.115	0.007	0.216	GNB1, IFT140
GO:0048701	BP	embryonic cranial skeleton morphogenesis	26	2	0.130	0.009	0.216	IFT140, LHX1
GO:0030522	BP	intracellular receptor signaling pathway	85	3	0.424	0.010	0.216	ESR1, ESRRG, UFD1

Meta Analysis With Previous Paper

The results in the Turretfield population from the paper Resistance to RHD virus in wild Australia rabbits: Comparison of susceptible and resistant individuals using a genomewide approach were then compared to these results. In this previous analysis, results were generated by using Fisher’s Exact Test to test genotypes for association with RHDV susceptibility or resistance.

tfGR <- url("https://raw.githubusercontent.com/hdetering/orycun/master/results/filteredSNPScores.csv") %>% 
  read_csv() %>%
  distinct(min_snp, .keep_all = TRUE) %>%
  dplyr::select(chromosome = chr_name, pos, snpID = min_snp, p, FDR) %>%
  mutate(chromosome = gsub("^chr", "", chromosome)) %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    ignore.strand = TRUE, 
    seqinfo = seqinfo(ensGenes),
    seqnames.field = "chromosome", 
    start.field = "pos", 
    end.field = "pos") %>%
  sort()

Of the 9,229 unique SNPs contained in the previous dataset, only 1,431 were also identified in the current dataset. A meta-analysis was performed on these SNPs using the results from both separate analyses which used Fisher’s Exact Test on genotypes. In this approach, Fisher’s method was used to combine p-values, and resultant p-values were drawn from a $\chi^2$ distribution with 4 degrees of freedom (i.e. $2k$).

combinedResults <- tfGR %>%
  find_overlaps(allSnpsGR) %>%
  mcols() %>%
  as.data.frame() %>%
  as_tibble() %>%
  dplyr::select(snpID = snpID.y, tf.p = p) %>%
  distinct(snpID, .keep_all = TRUE) %>%
  left_join(
    genotypeResults %>%
      dplyr::select(snpID, chromosome, position, geno.p = p)) %>% 
  mutate(
    X = -2*(log(tf.p) + log(geno.p)), 
    p = pchisq(X, 4, lower.tail = FALSE), 
    adjP = p.adjust(p, "bonferroni"),
    FDR = p.adjust(p, "fdr")) %>% 
  arrange(p)

Using Bonferroni’s adjustment to control the FWER at 0.05 gave 2 candidate SNPs, whilst using the more relaxed criterion of an FDR < 0.05 gave 10 candidate SNPs.

combinedResults %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(
    snpID,
    Chromosome = chromosome, 
    Position = position,
    p, adjP, FDR
  ) %>%
  mutate(
    p = tidyP(p),
    adjP = tidyP(adjP),
    FDR = tidyP(FDR)
  ) %>%
  pander(
    justify = "lrrrrr",
    caption = "*Significant SNPs from the meta-analysis using the previous published results from a separate population, using an FDR < 0.05 as the sole criterion for significance.*"
    )

*Significant SNPs from the meta-analysis using the previous published results from a separate population, using an FDR < 0.05 as the sole criterion for significance.*
snpID	Chromosome	Position	p	adjP	FDR
644062_1	3	56309981	1.67e-06	0.002	0.002
3378006_1	GL018704	2271135	2.35e-05	0.031	0.015
3941015_20	GL018847	216450	5.32e-05	0.069	0.023
213788_4	1	146274492	8.65e-05	0.113	0.025
3118382_68	20	14614986	9.50e-05	0.124	0.025
3108287_38	20	10514947	1.41e-04	0.183	0.031
3156991_0	21	6185579	1.79e-04	0.234	0.033
2425370_49	14	114833115	2.08e-04	0.272	0.034
3904662_77	GL018828	225354	3.00e-04	0.392	0.043
3970912_82	GL018864	1306	3.32e-04	0.433	0.043

allSnpsGR %>% 
  subset(snpID %in% dplyr::filter(combinedResults, FDR < 0.05)$snpID) %>%
  subset(!is.na(Name)) %>% 
  as.data.frame() %>%
  dplyr::select(
    snpID, 
    Chromosome = seqnames,
    BP = start, 
    ID = gene_id,
    Name,
    Description = description
  ) %>%
  mutate(Description = str_remove(Description, " \\[Source.+")) %>%
  pander(
    style = "rmarkdown", split.table = Inf, justify = "lrrlll",
    caption = paste0(
      "*Genes within ", maxKb, "kb of SNPs identified in the meta-analysis.*"
    )
  )

*Genes within 40kb of SNPs identified in the meta-analysis.*
snpID	Chromosome	BP	ID	Name	Description
213788_4	1	146274492	ENSOCUG00000000568	HBB2	hemoglobin, beta
213788_4	1	146274492	ENSOCUG00000021180	HBG2	hemoglobin, gamma G
213788_4	1	146274492	ENSOCUG00000027533	HBG1	hemoglobin, gamma A
213788_4	1	146274492	ENSOCUG00000026912	OR51I1	olfactory receptor family 51 subfamily I member 1
213788_4	1	146274492	ENSOCUG00000025404	OLFR64_1	MOR 5’beta3
644062_1	3	56309981	ENSOCUG00000004827	SFXN1	sideroflexin 1
2425370_49	14	114833115	ENSOCUG00000011289	ALCAM	activated leukocyte cell adhesion molecule
3108287_38	20	10514947	ENSOCUG00000008859	KCNH5	potassium voltage-gated channel subfamily H member 5
3118382_68	20	14614986	ENSOCUG00000008913	RPS6KA5	ribosomal protein S6 kinase A5
3156991_0	21	6185579	ENSOCUG00000005961	KDM2B	lysine demethylase 2B
3156991_0	21	6185579	ENSOCUG00000005957	RNF34	ring finger protein 34
3378006_1	GL018704	2271135	ENSOCUG00000011042	COL8A2	collagen type VIII alpha 2 chain
3378006_1	GL018704	2271135	ENSOCUG00000011038	ADPRHL2	ADP-ribosylhydrolase like 2
3378006_1	GL018704	2271135	ENSOCUG00000029722	TEKT2	tektin 2
3378006_1	GL018704	2271135	ENSOCUG00000006255	AGO3	argonaute RISC catalytic component 3
3904662_77	GL018828	225354	ENSOCUG00000027620	PAQR4	progestin and adipoQ receptor family member 4
3904662_77	GL018828	225354	ENSOCUG00000017181	PKMYT1	protein kinase, membrane associated tyrosine/threonine 1
3904662_77	GL018828	225354	ENSOCUG00000027985	TNFRSF12A	TNF receptor superfamily member 12A
3904662_77	GL018828	225354	ENSOCUG00000017197	CLDN6	claudin 6
3941015_20	GL018847	216450	ENSOCUG00000012298	ODF2	outer dense fiber of sperm tails 2
3941015_20	GL018847	216450	ENSOCUG00000008855	CERCAM	cerebral endothelial cell adhesion molecule
3970912_82	GL018864	1306	ENSOCUG00000021601	SCARB1	scavenger receptor class B member 1

combinedResults %>%
  mutate(Sig = FDR < 0.05) %>%
  ggplot(aes(-log10(geno.p), -log10(tf.p), colour = Sig)) +
  geom_point() +
  geom_abline(slope =1, intercept = 0, linetype = 2, colour = "blue") +
  geom_text_repel(aes(label = snpID),
                  data = . %>% filter(Sig)) +
  scale_color_manual(values = c("black", "red")) +
  labs(x = expression(paste(-log[10], "p (Genotype Analysis)")),
       y = expression(paste(-log[10], "p (Previous Analysis)"))) +
  theme(legend.position = "none")

Comparison of p-values for SNPs detected in both analyses, with those considered as significant after meta-analysis shown in red.

combinedResults %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(snpID) %>%
  left_join(genotypes) %>%
  filter(Population != 2010,
         !variant.id %in% regionSNPs$variant.id) %>%
  mutate(Genotype = as.factor(Genotype),
         Population = as.factor(Population)) %>%
    ggplot(aes(Genotype, fill = Population)) + 
    geom_bar(position = position_dodge2(preserve = "single")) + 
    facet_wrap(~snpID, drop = FALSE) + 
    scale_fill_manual(values = c("black", "blue")) +
    labs(y = "Count") + 
    theme(
        legend.direction = "horizontal", 
        legend.position = c(0.75, 0.05)
    )

Allele distributions in the 1996 and 2012 populations for SNPs identified under the meta-analysis.

GO Enrichment

This list was also used to test for GO enrichment.

sigSnpGR <- allSnpsGR %>%
  subset(snpID %in% filter(combinedResults, FDR < 0.05)$snpID)
nGenes <- allSnpsGR %>%
  subset(snpID %in% combinedResults$snpID) %>%
  unique() %>%
  length()
nSigGenes <- sigSnpGR$gene_id %>%
  unique() %>%
  length()

combinedGoRes <- bm %>%
  dplyr::filter(
    gene_id %in% subset(allSnpsGR, snpID %in% combinedResults$snpID)$gene_id) %>%
  mutate(inSig = gene_id %in% sigSnpGR$gene_id) %>%
  group_by(go_id) %>%
  summarise(N = dplyr::n(), 
            nSig = sum(inSig)) %>%
  left_join(goSummaries) %>%
  filter(nSig > 0, shortest_path > 2) %>%
  distinct(go_id, .keep_all = TRUE) %>%
  split(f = .$go_id) %>%
  mclapply(function(x){
    ft <- matrix(
      c(nSigGenes - x$nSig, x$nSig, nGenes - nSigGenes - x$N, x$N),
      nrow = 2, byrow = TRUE) %>%
      fisher.test()
    mutate(x, p = ft$p.value)
  }, mc.cores = mc) %>%
  bind_rows() %>%
  mutate(Expected = nSigGenes * N / nGenes,
         FDR = p.adjust(p, "fdr"),
         adjP = p.adjust(p, "bonferroni"),
         term = Term(go_id),
         ontology = Ontology(go_id)) %>%
  dplyr::select(go_id, term, ontology, shortest_path, N, nSig, Expected, p, adjP, FDR) %>%
  arrange(p)

Inspection of the results revealed an enrichment for terms related to haemoglobin, however, as these genes are co-located this was revealed to be due to a single SNP with multiple haemoglobin genes within 40kb and can be disregarded.

combinedGoRes %>%
  dplyr::filter(
    nSig > Expected,
    FDR < 0.05
  ) %>%
  mutate(Term = Term(go_id),
         Genes  = vapply(go_id, function(x){
           subset(sigSnpGR, gene_id %in% I(
             dplyr::filter(bm, go_id == x) %>%
               .[["gene_id"]]))$Name %>%
             sort() %>%
             unique() %>%
             paste(collapse = ", ")
         }, character(1)),
         nSnps = vapply(go_id, function(x){
           subset(sigSnpGR, gene_id %in% I(
             dplyr::filter(bm, go_id == x) %>%
               .[["gene_id"]]))$snpID %>%
             unique() %>%
             length()
           }, integer(1))) %>%
  dplyr::select(ID = go_id, Ontology = ontology, Term, Total = N, Sig = nSig, nSnps, Expected, p, FDR, Genes) %>%
  mutate(ID = gsub(":", "\\\\:", ID),
         Expected = round(Expected, 3),
         p = round(p, 4),
         FDR = round(FDR, 3)) %>%
  distinct(Ontology, Genes, .keep_all = TRUE) %>%
  pander(
    split.tables = Inf, style = "rmarkdown", 
    justify = "lllrrrrrrl",
    caption = paste(
      "Most highly ranked GO terms for combined results.", 
      "Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown.",
      "Overall, this corresponds to an FDR of", 
      percent(max(.$FDR)), ".*"
    )
  )

Most highly ranked GO terms for combined results. Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown. Overall, this corresponds to an FDR of 4% .*
ID	Ontology	Term	Total	Sig	nSnps	Expected	p	FDR	Genes
GO:0015669	BP	gas transport	4	3	1	0.075	2e-04	0.034	HBB2, HBG1, HBG2
GO:0019825	MF	oxygen binding	4	3	1	0.075	2e-04	0.034	HBB2, HBG1, HBG2
GO:0015893	BP	drug transport	14	4	2	0.264	3e-04	0.037	HBB2, HBG1, HBG2, SFXN1

Export All Tested SNPs

snpsGR %>%
    resize(width = 1000*maxKb*2, fix = "center") %>%
    trim() %>%
    find_overlaps(ensGenes) %>%
    as.data.frame() %>%
    as_tibble() %>%
    left_join(snps, by = c("variant.id", "snpID", "Col")) %>%
    dplyr::select(snpID, gene_id, Name, chromosome, position) %>%
    left_join(
        as.data.frame(ensGenes), by = c("gene_id", "Name")
    ) %>%
    mutate(
        distance = case_when(
            position < start ~ comma(start - position, accuracy = 1),
            position > end ~ comma(position - end, accuracy = 1),
            snpID %in% ensExons$snpID ~ "Exonic",
            position >= start & position <= end ~ "Intronic"
        ),
        model = ifelse(
            snpID %in% filter(alleleResults, FDR < alpha)$snpID, "A", ""
        ),
        model = ifelse(
            snpID %in% dplyr::filter(snps, variant.id %in% dplyr::filter(bayRes, qval < 0.05)$variant.id)$snpID,
            paste0(model, "B"), model
        ),
        model = ifelse(
            snpID %in% filter(genotypeResults, FDR < alpha)$snpID,
            paste0(model, "G"), model
        ),
        width = comma(width, accuracy = 1)
    ) %>%
    unite(location, chromosome, position, sep = ":") %>%
    dplyr::select(
        snpID, location, gene_id, Name, distance
    ) %>%
    left_join(
        genotypeResults %>% dplyr::select(snpID, genotype_p = p)
    ) %>%
    left_join(
        alleleResults %>% dplyr::select(snpID, allele_p = p)
    ) %>%
    write_tsv(here::here("R", "allTestedSnps.tsv"))

All tested SNPs, along with genes within 40kb and p-values was then exported, and can be downloaded from here

Session Information

R version 3.6.2 (2019-12-12)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, parallel, grid, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: GO.db(v.3.10.0), AnnotationDbi(v.1.48.0), Biobase(v.2.46.0), rtracklayer(v.1.46.0), plyranges(v.1.6.7), GenomicRanges(v.1.38.0), GenomeInfoDb(v.1.22.0), IRanges(v.2.20.2), S4Vectors(v.0.24.3), BiocGenerics(v.0.32.0), ggrepel(v.0.8.1), qqman(v.0.1.4), ggsn(v.0.5.0), rgdal(v.1.4-8), ggmap(v.3.0.0), sp(v.1.3-2), readxl(v.1.3.1), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.4), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.2), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.3.0), magrittr(v.1.5), scales(v.1.1.0), pander(v.0.6.3), SNPRelate(v.1.20.1), SeqArray(v.1.26.2) and gdsfmt(v.1.22.0)

loaded via a namespace (and not attached): colorspace(v.1.4-1), rjson(v.0.2.20), ellipsis(v.0.3.0), class(v.7.3-15), rprojroot(v.1.3-2), futile.logger(v.1.4.3), XVector(v.0.26.0), fs(v.1.3.1), rstudioapi(v.0.10), farver(v.2.0.3), bit64(v.0.9-7), fansi(v.0.4.1), lubridate(v.1.7.4), xml2(v.1.2.2), knitr(v.1.27), jsonlite(v.1.6), Rsamtools(v.2.2.1), broom(v.0.5.4), dbplyr(v.1.4.2), png(v.0.1-7), compiler(v.3.6.2), httr(v.1.4.1), backports(v.1.1.5), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), cli(v.2.0.1), formatR(v.1.7), htmltools(v.0.4.0), tools(v.3.6.2), gtable(v.0.3.0), glue(v.1.3.1), GenomeInfoDbData(v.1.2.2), Rcpp(v.1.0.3), cellranger(v.1.1.0), vctrs(v.0.2.2), Biostrings(v.2.54.0), nlme(v.3.1-143), xfun(v.0.12), rvest(v.0.3.5), lifecycle(v.0.1.0), XML(v.3.99-0.3), zlibbioc(v.1.32.0), MASS(v.7.3-51.5), hms(v.0.5.3), SummarizedExperiment(v.1.16.1), lambda.r(v.1.2.4), yaml(v.2.2.1), memoise(v.1.1.0), calibrate(v.1.7.5), stringi(v.1.4.5), RSQLite(v.2.2.0), highr(v.0.8), maptools(v.0.9-9), e1071(v.1.7-3), BiocParallel(v.1.20.1), RgoogleMaps(v.1.4.5.2), rlang(v.0.4.4), pkgconfig(v.2.0.3), bitops(v.1.0-6), matrixStats(v.0.55.0), evaluate(v.0.14), lattice(v.0.20-38), sf(v.0.8-1), labeling(v.0.3), GenomicAlignments(v.1.22.1), bit(v.1.1-15.1), tidyselect(v.1.0.0), here(v.0.1), plyr(v.1.8.5), R6(v.2.4.1), generics(v.0.0.2), DelayedArray(v.0.12.2), DBI(v.1.1.0), pillar(v.1.4.3), haven(v.2.2.0), foreign(v.0.8-75), withr(v.2.1.2), units(v.0.6-5), RCurl(v.1.98-1.1), modelr(v.0.1.5), crayon(v.1.3.4), futile.options(v.1.0.1), KernSmooth(v.2.23-16), rmarkdown(v.2.1), jpeg(v.0.1-8.1), blob(v.1.2.1), reprex(v.0.3.0), digest(v.0.6.23), classInt(v.0.4-2), VennDiagram(v.1.6.20) and munsell(v.0.5.0)