library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
library(readxl)
library(sp)
library(ggmap)
library(rgdal)
library(ggsn)
library(parallel)
library(qqman)
library(ggrepel)
library(plyranges)
library(rtracklayer)
library(AnnotationDbi)
library(GO.db)
if (interactive()) here::here("R") %>% setwd()
theme_set(theme_bw())
panderOptions("missing", "")
mc <- min(12, detectCores() - 1)
alpha <- 0.05
maxKb <- 40
tidyP <- function(p, m = 0.001){
  x <- rep("", length(p))
  x[p < m] <- sprintf("%.2e", p[p < m])
  x[p >= m] <- sprintf("%.3f", p[p >= m])
  x
}
keepSNPs <- readRDS("keepSNPsAfterLDPruning.RDS")
w <- 169/25.4
h <- 169/25.4

This workflow immediately follows 03_SNPFiltering and performs an analysis on the 18886 SNPs retained after filtering. Analysis performed were:

  • To assess the rate of missing values by retained SNP and sample
  • Run a PCA analysis using SNPs & samples
  • Analyse using comparisons between genotypes and allele frequencies, as well as using Bayescan and FLK
gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
gdsFile <- seqOpen(gdsPath, readonly = TRUE)
sampleID <- tibble(
    Sample = seqGetData(gdsFile, "sample.annotation/Sample"),
    Population = seqGetData(gdsFile, "sample.annotation/Population"),
    Location = seqGetData(gdsFile, "sample.annotation/Location")
)
popSizes <- sampleID %>% 
    group_by(Population) %>%
    summarise(n = dplyr::n())
  • After loading the data from the previous section, an additional filtering step was performed, where SNPs were required to have a non-zero MAF in the initial 1996 population.
snpIn1996 <- genotypes %>% 
    filter(Population == 1996) %>% 
    group_by(variant.id) %>% 
    summarise(maf = mean(Genotype) / 2) %>%
    filter(maf > 0)
genotypes %<>% 
    right_join(snpIn1996)
  • This additional filtering step restricted analysis to 18,878 SNPs instead of the initial 18,886 SNPs.
  • The mean distance between SNPs in the final dataset was 139.25kb
seqClose(gdsFile)

In addition to the above, the set of genes corresponding the Ensembl Release 96 were loaded.

ensGenes <- file.path(
  "..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz"
) %>%
  import.gff3(feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE) %>%
  .[,c("gene_id", "Name", "description")]
snpsGR <- makeGRangesFromDataFrame(
    df = snps, 
    keep.extra.columns = TRUE, 
    ignore.strand = TRUE, 
    seqinfo = seqinfo(ensGenes), 
    seqnames.field = "chromosome", 
    start.field = "position", 
    end.field = "position"
)

The set of SNPs under investigation was also defined as a GRanges object. This was intersected with the set of genes to connect SNPs to genes within 40 kb.

snp2Gene <- suppressWarnings(
  snpsGR %>% 
    resize(width = 2*1000*maxKb - 1, fix = "center") %>%
    trim() %>%
    join_overlap_inner(ensGenes) 
)

A set of SNPs within exonic regions was also defined in order to detect any SNPs within coding regions.

ensExons <- file.path(
  "..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz"
) %>%
  import.gff3(feature.type = "exon", sequenceRegionsAsSeqinfo = TRUE) %>%
  join_overlap_inner_within(
    ensGenes, maxgap = 0, minoverlap = 0, suffix = c(".exon", "")
  ) %>%
  join_overlap_inner(snpsGR)
ensExons <- ensExons[,setdiff(colnames(mcols(ensExons)), "Name.exon")]

PCA

lowCall <- c("gc2901", "gc2776", "gc2731", "gc2727", "gc2686")
snp4PCA <- genotypes %>% 
    filter(!Sample %in% lowCall) %>%
    group_by(variant.id, Population) %>% 
    summarise(n = dplyr::n()) %>% 
    spread(Population, n) %>%
    mutate(N = `1996` + `2010` + `2012`) %>% 
    ungroup() %>%
    filter(N > 0.95*(sum(popSizes$n) - length(lowCall)))
pca <- genotypes %>%
    filter(variant.id %in% snp4PCA$variant.id) %>%
    dplyr::select(variant.id, Sample, Genotype) %>%
    spread(Sample, Genotype) %>%
    as.data.frame() %>%
    column_to_rownames("variant.id") %>%
    as.matrix() %>%
    apply(2, function(x){
        x[is.na(x)] <- mean(x, na.rm = TRUE)
        x
    }) %>%
    t() %>%
    .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
    prcomp( center = TRUE)

As noted in the previous section sample samples gc2901, gc2776, gc2731, gc2727 and gc2686 had a SNP identification rate \(< 50\)% and as such these were marked as potential outliers. Ignoring these samples, and restricting data to SNPs identified in \(>95\)% of all samples, a preliminary PCA was performed This amounted to 7,767 of the possible 18,878 SNPs for analysis using PCA. Missing values were specified as the mean MAF over all populations combined.

Given the initially observed structure, in which samples from the 2012 population are separating from the other samples which group with the 1996 population, the collection points for the 2012 samples as investigated.

sampleID %<>%
    left_join(
        file.path("..", "external", "GPS_Locations.xlsx") %>%
            read_excel() %>%
            dplyr::select(Sample, ends_with("tude")) %>%
            mutate(Sample = gsub("[Oo][Rr][Aa] ([0-9ABC]*)", "ora\\1", Sample))
    )
fs <- 14
ps <- 3
popCols <- c(rgb(0, 0, 0, 0.7), 
             rgb(0.8, 0.2, 0.2, 0.9), 
             rgb(0.2, 0.8, 0.2, 0.9),
             rgb(0.5, 0.5, 1, 0.7))
pca4Plot <- pca$x %>%
    as.data.frame() %>%
    rownames_to_column("Sample") %>%
    as_tibble() %>%
    dplyr::select(Sample, PC1, PC2, PC3) %>%
    left_join(sampleID) %>%
    mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
    group_by(Cluster) %>%
    mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(
      Population = case_when(
        Population == 1996 ~ "1996",
        Population == 2010 ~ "Outgroup (Turretfield)",
        maxY == max(maxY) ~ "2012 (Outer)",
        maxY != max(maxY) ~ "2012 (Central)"
      )
    ) %>%
  left_join(
    genotypes %>% 
      filter(variant.id %in% snp4PCA$variant.id, !Sample %in% lowCall) %>% 
      group_by(Sample) %>% 
      tally() %>% 
      mutate(imputationRate = 1 - n / nrow(snp4PCA)
      )
  ) 
pca4Plot %>%
    ggplot(aes(PC1, PC2, colour = Population, size = imputationRate)) +
    geom_point(alpha = 0.6) +
    labs(
      x = paste0("PC1 (", percent(summary(pca)$importance[2,"PC1"]), ")"),
      y = paste0("PC2 (", percent(summary(pca)$importance[2,"PC2"]), ")"),
      size = "Imputation Rate"
    ) +
  scale_colour_manual(values = popCols)
*PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.*

PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.

loc <- c(
  range(sampleID$Longitude, na.rm = TRUE) %>% mean,
  range(sampleID$Latitude, na.rm = TRUE) %>% mean
)
saPoly <- readRDS(file.path("..", "external", "saPoly.RDS"))
roads <- readRDS(file.path("..", "external", "roads.RDS"))
gc <- SpatialPoints(cbind(x = 138.655972, y = -31.200305))
proj4string(gc) <- "+proj=longlat +ellps=GRS80 +no_defs"
xBreaks <- seq(138.65, 138.8, by = 0.05)
xLabs <- parse(text = paste(xBreaks, "*degree ~ E", sep = ""))
yBreaks <- seq(-31.2, -31.32, by = -0.04)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
leftN <- tibble(
  x = c(138.7965, 138.8, 138.8) - 0.01,
  y = c(-31.2, -31.198, -31.193)
)
rightN <- tibble(
  x = c(138.8, 138.8, 138.8035) - 0.01,
  y = c(-31.193, -31.198, -31.2)
)
ggMap <- get_map(loc, zoom = 12, maptype = "terrain", color = "bw")
zoomPlot <- ggmap(ggMap, extent = "normal", maprange = FALSE) +
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE == "UNSE"), 
        linetype = 2, size = 0.3) + 
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE != "UNSE"), 
        linetype = 1, size = 0.4) +
    geom_label(x = 138.74, y = -31.29, label = "Flinders Ranges NP", alpha = 0.4) +
    geom_label(x = 138.72, y = -31.22, label = "Gum Creek", alpha = 0.4) +
    geom_point(aes(x, y), data = as.data.frame(gc), shape = 3, size = 3) +
    geom_text(aes(x, y), label = "HS", data = as.data.frame(gc), nudge_y = 0.005) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "HD"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.3) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "PARK"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.2) +
  geom_point(
    data= filter(pca4Plot, grepl("2012", Population)),
    aes(Longitude, Latitude, colour = Population),
    size = 0.9*ps) +
  geom_polygon(
    aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4
  ) +
  geom_polygon(
    aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4
  ) +
  geom_text(
    x = 138.79, y = -31.19, label = "N", colour = "grey10", size = 4
  ) +
  scale_colour_manual(values = popCols[2:3]) +
  coord_cartesian(
    xlim = c(138.618, 138.81),
    ylim = c(-31.335, -31.18),
    expand = 0
  ) +
  scale_x_continuous(breaks = xBreaks, labels = xLabs) +
  scale_y_continuous(breaks = yBreaks, labels = yLabs) +
  guides(colour = FALSE) +
  ggsn::scalebar(
    x.min = 138.618,
    x.max = 138.81,
    y.min = -31.335, 
    y.max = -31.18,
    transform = TRUE,
    dist = 2, 
    dist_unit = "km",
    model = 'GRS80',
    height = 0.012, 
    st.size = 4,
    location = 'bottomright',
    anchor = c(x = 138.8, y = -31.328)) +
  labs(x = "Longitude", y = "Latitude") +
  theme(
    text = element_text(size = fs),
    plot.margin = unit(c(1, 1, 1, 1), "mm")
  )
ausPolygon <- readRDS(file.path("..", "external", "ausPolygon.RDS"))
ausPts <- SpatialPoints(cbind(x = loc[1], y = loc[2]))
proj4string(ausPts) <- proj4string(ausPolygon)
ausPlot <- ggplot() +
  geom_polygon(
    data = ausPolygon, 
    aes(long, lat, group = group),
    fill = "white", colour = "black"
  ) + 
  geom_point(data = as.data.frame(ausPts), aes(x, y), size = 1.5) +
  theme_void() +
  theme(plot.background = element_rect(fill = "white", colour = "black"))
convert \
  -density 600 \
  -size 1920 \
  ../figures/Figure1.pdf \
  ../figures/Figure1.png
knitr::include_graphics(file.path("..", "figures", "Figure1.png"))
*Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and *k*-means clustering.*

Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and k-means clustering.

zoomLoc <- c(138.753, -31.242)
xBreaks <- seq(138.74, 138.76, by = 0.01)
xLabs <- parse(text = paste(xBreaks, "*degree ~ W", sep = ""))
yBreaks <- seq(-31.235, -31.25, by = -0.005)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
central <- rbind(
  x = c(138.749, 138.755),
  y = c(-31.2365, -31.2495)
) %>%
  set_colnames(c("min", "max"))
leftN <- tibble(
  x = c(138.7595, 138.76, 138.76),
  y = c(-31.234, -31.2335, -31.2325)
)
rightN <- tibble(
  x = c(138.76, 138.76, 138.7605),
  y = c(-31.2325, -31.2335, -31.234)
)
get_map(zoomLoc, zoom = 15, maptype = "terrain", source = "google", color = "bw") %>%
  ggmap() +
  geom_jitter(
    data = filter(pca4Plot, grepl("2012", Population)), 
    aes(Longitude, Latitude, colour = Population), 
    size = 3, width = 0.0005, height = 0) +
  geom_rect(
    xmin = central["x", "min"],
    xmax = central["x", "max"],
    ymin = central["y", "min"],
    ymax = central["y", "max"],
    fill = "red", alpha = 0.01, colour = "black") +
  geom_polygon(
    aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4
  ) +
  geom_polygon(
    aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4
  ) +
  geom_text(
    x = 138.76, y = -31.232, label = "N", colour = "grey10", size = 5
  ) +
  scale_colour_manual(values = popCols[2:3]) +
  scale_x_continuous(breaks = xBreaks, labels = xLabs) +
  scale_y_continuous(breaks = yBreaks, labels = yLabs) +
  theme_bw() +
  guides(colour = FALSE) +
  labs(x = "Longitude", y = "Latitude") +
  coord_cartesian(
    xlim = c(138.74, 138.762),
    ylim = c(-31.253, -31.231),
    expand = 0
  ) +
  ggsn::scalebar(
    x.min = 138.74, x.max = 138.762, 
    y.min = -31.253, y.max = -31.231,
    transform = TRUE,
    dist = 0.25, dist_unit = "km",, model = 'GRS80', 
    height = 0.012, st.size = 4,
    location = 'bottomright',
    anchor = c(x = 138.761, y = -31.252)
    )
*Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.*

Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.

Region Analysis

Removal of SNPs Associated with Collection Region

The structure observed within the 2012 population in the PCA could possibly be explained by recent migration into this region. As the samples collected in the outer regions appeared very similar to the 1996 population in the above plots, this would possibly indicate migration is a very recent event as the genetic influence of this has not spread through the wider area. Although this may be due to other factors such as sampling bias, this structure was addressed by identifying SNPs which showed an association with the sub-populations identified by PCA analysis. In this way, any candidate SNPs obtained below will be less impacted by this structure, and will be more reflective of the intended variable under study, i.e. selection over time, as opposed to any internal structure of the 2012 population which may indicate migration into the breeding population.

oraRegions <- pca4Plot %>% 
  filter(grepl("2012", Population)) %>% 
  rowwise() %>%
  mutate(
    yCentral = cut(Latitude, breaks = central["y",], include.lowest = TRUE), 
    xCentral = cut(Longitude, breaks = central["x",], include.lowest = TRUE),
    Central = (is.na(yCentral) + is.na(xCentral)) == 0
  ) %>%
  dplyr::select(Sample, Central) 

Testing for Structure in 2012

This model tests:
H0: No association between genotypes and collection region
HA: An association exists between genotypes and collection region

regionResults <- genotypes %>%
    filter(Population == 2012) %>%
    split(f = .$variant.id) %>%
    mclapply(
      function(x){
        ft <- list(p.value = NA)
        if (length(unique(x$Genotype)) > 1) {
          ft <- x %>%
            left_join(oraRegions) %>%
            group_by(Genotype, Central) %>%
            tally() %>%
            spread(Genotype, n, fill = 0) %>%
            column_to_rownames("Central") %>%
            fisher.test()
        }
        x %>%
          distinct(variant.id, snpID, chromosome, position) %>%
          mutate(p = ft$p.value)
      }, 
      mc.cores = mc
    ) %>%
  bind_rows() %>%
  filter(!is.na(p)) %>%
  arrange(p)

A total of 1682 SNPs were detected as showing a significant association between genotype and the collection region. Under H0, the number expected using α = 0.05 would be 943, and as this number was approximately double that expected, this was taken as evidence of this being a genuine point of concern for this dataset.

Notably, Type II errors were of principle concern in this instance, and as such every SNP with p < 0.05 in the above test was excluded from downstream analysis.

regionSNPs <- filter(regionResults, p < 0.05)
saveRDS(regionSNPs, "regionSNPs.RDS")

Under this additional filtering step, the original set of 18784 SNPs will be reduced to 17,102 for testing by genotype and allele frequency.

Verification Of Removal

In order to verify that the removal of the above SNPs removed the undesired population structure from the 2012 population, the above PCA was repeated, excluding the SNPs marked for removal. The previous structure noted in the data was no longer evident, and as such, these SNPs were marked for removal during analysis by genotype and allele frequency.

pcaPost <- genotypes %>%
  filter(
    variant.id %in% snp4PCA$variant.id,
    !variant.id %in% regionSNPs$variant.id
  ) %>%
  dplyr::select(variant.id, Sample, Genotype) %>%
  spread(Sample, Genotype) %>%
  as.data.frame() %>%
  column_to_rownames("variant.id") %>%
  as.matrix() %>%
  apply(2, function(x){
    x[is.na(x)] <- mean(x, na.rm = TRUE)
    x
  }) %>%
  t() %>%
  .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
  prcomp( center = TRUE) 
pcaPost4Plot <- pcaPost$x %>%
  as.data.frame() %>%
  rownames_to_column("Sample") %>%
  as_tibble() %>%
  dplyr::select(Sample, PC1, PC2, PC3) %>%
  left_join(sampleID) %>%
  mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
  group_by(Cluster) %>%
  mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(
    Population = case_when(
      Population == 1996 ~ "1996",
      Population == 2010 ~ "Outgroup (Turretfield)",
      maxY == max(maxY) ~ "2012 (Outer)",
      maxY != max(maxY) ~ "2012 (Central)"
    )
  ) %>%
  left_join(
    genotypes %>% 
      filter(variant.id %in% snp4PCA$variant.id, !Sample %in% lowCall) %>% 
      group_by(Sample) %>% 
      tally() %>% 
      mutate(imputationRate = 1 - n / nrow(snp4PCA))
  ) 
convert \
-density 600 \
-size 1920 \
../figures/Figure2.pdf \
../figures/Figure2.png
knitr::include_graphics(file.path("..", "figures", "Figure2.png"))
*Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs*

Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs

SNP Analysis

Genotype Frequency Model

This model tests:
H0: No association between genotypes and populations
HA: An association exists between genotypes and populations

genotypeResults <- genotypes %>%
  filter(
    Population != 2010,
    !variant.id %in% regionSNPs$variant.id
  ) %>%
  group_by(variant.id, snpID, Population, Genotype) %>%
  tally() %>%
  ungroup() %>%
  split(f = .$variant.id) %>%
  mclapply(function(x){
    ft <- list(p.value = NA)
    if (length(unique(x$Genotype)) > 1) {
      ft <- x %>%
        spread(Genotype, n, fill = 0) %>%
        column_to_rownames("Population") %>%
        dplyr::select(-variant.id, -snpID) %>%
        fisher.test()
    }
    x %>% 
      distinct(variant.id) %>%
      mutate(p = ft$p.value)
  },
  mc.cores = mc
  ) %>%
  bind_rows() %>%
  filter(!is.na(p)) %>%
  mutate(FDR = p.adjust(p, "fdr"), adjP = p.adjust(p, "bonferroni")) %>%
  arrange(p) %>%
  left_join(
    genotypes %>%
      distinct(variant.id, snpID, chromosome, position)
  ) %>%
  dplyr::select(variant.id, snpID, chromosome, position, everything())
genoSNPs <- genotypeResults %>%
  dplyr::filter(FDR < alpha) %>%
  .[["snpID"]]
snpsGR %>% 
  subset(snpID %in% genoSNPs) %>% 
  join_nearest(ensGenes) %>%
  as.data.frame() %>%
  left_join(
    ensGenes %>% as.data.frame(),
    by = c("Name", "gene_id", "seqnames", "description"),
    suffix = c("_snp", "_gene")
  ) %>%
  as_tibble() %>%
  dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene) %>%
  mutate(dist2Gene = case_when(
    start_snp >= start_gene & start_snp <= end_gene ~ 0L,
    start_snp < start_gene ~ start_gene - start_snp,
    start_snp > end_gene ~ start_snp - end_gene
  ),
  Name = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ Name
  ),
    gene_id = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ gene_id
  ),
  inExon = snpID %in% ensExons$snpID,
  dist2Gene = case_when(
    Name == "" ~ "",
    inExon ~ "CDS",
    dist2Gene == 0 & !inExon ~ "Intronic",
    dist2Gene > 0 ~ comma(dist2Gene)
  ),
  seqnames = as.character(seqnames)
  ) %>%
  dplyr::select(
    snpID,
    chromosome = seqnames,
    position = start_snp,
    gene_id, Name, dist2Gene
  ) %>%
  bind_rows(
    subset(snpsGR, snpID %in% setdiff(genoSNPs, .$snpID)) %>%
      as.data.frame() %>%
      mutate(seqnames = as.character(seqnames)) %>%
      dplyr::select(
        snpID,
        chromosome = seqnames,
        position = start
      )
  ) %>%
  left_join(genotypeResults) %>%
  dplyr::select(-variant.id, FDR) %>%
  arrange(p) %>%
  mutate(
    position = comma(position),
    p = tidyP(p),
    FDR = tidyP(FDR),
    adjP = tidyP(adjP)
    ) %>%
  pander(
    justify = "llrlllrrr",
    style = "rmarkdown",
    big.mark = ",",
    split.tables = Inf,
    caption = paste(
      "*Candidate SNPs to an FDR of", percent_format(1)(alpha), "when analysing by genotype.",
      "The nearest gene to each SNP (within", paste0(maxKb, "kb"), ") is indicated.",
      "Bonferroni-adjusted p-values are also given in the final column.*")
  )
Candidate SNPs to an FDR of 5% when analysing by genotype. The nearest gene to each SNP (within 40kb ) is indicated. Bonferroni-adjusted p-values are also given in the final column.
snpID chromosome position gene_id Name dist2Gene p FDR adjP
3391201_92 GL018705 1,937,359 ENSOCUG00000016428 RCBTB2 37,689 6.10e-08 9.04e-04 0.001
686773_29 3 90,851,512 ENSOCUG00000009616 CRISPLD1 CDS 1.05e-07 9.04e-04 0.002
916731_91 4 89,602,973 ENSOCUG00000006126 RIC8B Intronic 1.87e-07 9.84e-04 0.003
836950_151 4 35,111,855 ENSOCUG00000012144 TMPRSS12 5,745 2.59e-07 9.84e-04 0.004
321186_117 2 27,995,273 2.87e-07 9.84e-04 0.005
2227006_109 13 129,650,867 ENSOCUG00000022823 ID3 16,426 4.42e-07 0.001 0.008
3040789_114 19 30,689,426 ENSOCUG00000010440 MSI2 Intronic 4.84e-07 0.001 0.008
4165193_106 GL019119 147,630 4.90e-07 0.001 0.008
4223149_99 GL019332 5,745 6.12e-07 0.001 0.011
1089925_112 7 35,584,375 9.80e-07 0.002 0.017
464226_128 2 125,603,458 1.01e-06 0.002 0.017
3374849_109 GL018704 1,294,009 1.39e-06 0.002 0.024
3695415_108 GL018751 737,318 ENSOCUG00000003672 TRAF3 Intronic 1.84e-06 0.002 0.032
2099634_108 13 45,086,624 2.03e-06 0.002 0.035
1902686_98 12 60,113,885 ENSOCUG00000001871 12,820 2.09e-06 0.002 0.036
644062_1 3 56,309,981 ENSOCUG00000004827 SFXN1 Intronic 3.51e-06 0.004 0.060
4149102_107 GL019077 34,265 ENSOCUG00000024268 IFT140 Intronic 4.82e-06 0.005 0.083
87917_119 1 64,635,286 ENSOCUG00000007758 CEP78 Intronic 5.49e-06 0.005 0.094
2689075_49 16 56,527,308 ENSOCUG00000000984 ESRRG Intronic 5.62e-06 0.005 0.096
4010189_98 GL018907 283,766 ENSOCUG00000008870 BRWD1 Intronic 5.93e-06 0.005 0.102
4044777_119 GL018933 204,058 ENSOCUG00000005859 FNBP1 Intronic 7.75e-06 0.006 0.133
4146664_90 GL019084 74,038 ENSOCUG00000010071 GNB1 Intronic 1.84e-05 0.014 0.316
3016258_99 19 19,178,148 ENSOCUG00000015254 SEZ6 Intronic 2.17e-05 0.016 0.372
4176850_23 GL019154 883 ENSOCUG00000002484 MTHFSD 828 2.18e-05 0.016 0.374
4098854_101 GL018985 129,495 ENSOCUG00000006396 Intronic 3.09e-05 0.021 0.530
2465104_19 14 157,742,593 3.67e-05 0.023 0.629
3032107_92 19 26,020,372 ENSOCUG00000024664 LHX1 20,607 3.71e-05 0.023 0.637
2906428_79 18 25,311,176 ENSOCUG00000011533 RHOBTB1 24 3.76e-05 0.023 0.646
3829050_109 GL018791 957,960 ENSOCUG00000007849 FHAD1 Intronic 4.01e-05 0.023 0.687
1009106_1 6 9,037,073 ENSOCUG00000027083 CLEC19A 15,180 4.15e-05 0.023 0.712
482617_100 2 139,723,776 ENSOCUG00000001127 RHOQ 8,346 4.34e-05 0.023 0.744
3378006_1 GL018704 2,271,135 ENSOCUG00000006255 AGO3 9,610 4.45e-05 0.023 0.764
2580149_42 15 87,473,844 ENSOCUG00000001910 ADGRL3 Intronic 4.52e-05 0.023 0.775
1685466_9 10 42,335,759 6.43e-05 0.032 1.000
4136275_162 GL019049 178,272 ENSOCUG00000026644 Intronic 9.11e-05 0.045 1.000
2028473_73 13 2,979,383 9.62e-05 0.045 1.000
1839042_17 12 15,448,280 9.72e-05 0.045 1.000
4208471_113 GL019274 63,350 ENSOCUG00000015189 37,542 1.01e-04 0.045 1.000

Under the full genotype model:

  • 15 genotypes were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05 (using the Bonferroni adjustment)
  • 38 genotypes were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05
  • For the most highly ranked SNP (3391201_92), the minor allele has been completely lost in the 2012 population
convert \
-density 600 \
-size 1920 \
../figures/FigureS3a.pdf \
../figures/FigureS3a.png
knitr::include_graphics(file.path("..", "figures", "FigureS3a.png"))
*Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with a Bonferroni-adjusted p-value < 0,05 additionally shown in green*

Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with a Bonferroni-adjusted p-value < 0,05 additionally shown in green

genotypeResults %>% 
  filter(FDR < alpha) %>% 
  left_join(
    filter(
      genotypes, 
      snpID %in% .$snpID, 
      Population %in% c(1996, 2012)
    )
  ) %>% 
  mutate(
    Genotype = c("AA", "AB", "BB")[Genotype + 1],
    Genotype = as.factor(Genotype),
    Population = as.factor(Population),
    snpID = case_when(
      adjP < alpha ~ paste0(snpID, "*"),
      adjP >= alpha ~ as.character(snpID)
    ),
    snpID = factor(snpID, levels = unique(snpID))
  ) %>%
  group_by(snpID, Population, Genotype) %>% 
  tally() %>%
  ungroup() %>% 
  group_by(snpID, Population) %>%
  mutate(freq = n / sum(n)) %>%
  ggplot(aes(Genotype, freq, fill = Population)) + 
  geom_bar(
    stat = "identity",
    position = position_dodge2(preserve = "single")
  ) + 
  facet_wrap(~snpID, drop = FALSE, ncol = 7) + 
  scale_fill_manual(values = c("black", "blue")) +
  labs(y = "Frequency") + 
  theme(
    legend.direction = "horizontal", 
    legend.position = c(0.75, 0.05)
  )
*Variants detected as significant to an FDR of 5%. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance, with some variants becoming exclusively heterozygous in the 2012 population.*

Variants detected as significant to an FDR of 5%. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance, with some variants becoming exclusively heterozygous in the 2012 population.

write_tsv(genotypeResults, "genotypeResults.tsv")

Allele Frequency Model

This model tests:
H0: No association between allele frequencies and populations
HA: An association exists between allele frequencies and populations

alleleResults <- genotypes %>%
  filter(
    Population != 2010,
    !variant.id %in% regionSNPs$variant.id
  ) %>%
  group_by(snpID, Population) %>%
  summarise(
    P = sum(2 - Genotype),
    Q = sum(Genotype)
  ) %>%
  ungroup() %>%
  split(f = .$snpID) %>%
  mclapply(function(x){
    m <- as.matrix(x[c("P", "Q")])
    ft <- list(p.value = NA) 
    if (length(m) == 4) {
      ft <- fisher.test(m)
    }
    x %>%
      mutate(MAF = Q / (P + Q)) %>%
      dplyr::select(snpID, Population, MAF) %>%
      spread(Population, MAF) %>%
      mutate(p = ft$p.value)
  },mc.cores = mc) %>%
  bind_rows() %>%
  filter(!is.na(p)) %>%
  dplyr::rename(
    MAF_1996 = `1996`,
    MAF_2012 = `2012`
  ) %>%
  mutate(
    FDR = p.adjust(p, "fdr"),
    adjP = p.adjust(p, "bonferroni"),
    # Make the MAF is relative to the 1996 population not the reference
    MAF_2012 = case_when(
      MAF_1996 > 0.5 ~ 1 - MAF_2012,
      MAF_1996 <= 0.5 ~ MAF_2012
    ),
    MAF_1996 = case_when(
      MAF_1996 > 0.5 ~ 1 - MAF_1996,
      MAF_1996 <= 0.5 ~ MAF_1996
    )
  ) %>%
  arrange(p) %>%
  left_join(
    genotypes %>% distinct(snpID, chromosome, position)
  ) %>%
  dplyr::select(snpID, chromosome, position, everything())
alleleSnps <- alleleResults %>%
  dplyr::filter(FDR < alpha) %>%
  .[["snpID"]]
snpsGR %>% 
  subset(snpID %in% alleleSnps) %>% 
  join_nearest(ensGenes) %>%
  as.data.frame() %>%
  left_join(
    ensGenes %>% as.data.frame(),
    by = c("Name", "gene_id", "seqnames", "description"),
    suffix = c("_snp", "_gene")
  ) %>%
  as_tibble() %>%
  dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene) %>%
  mutate(dist2Gene = case_when(
    start_snp >= start_gene & start_snp <= end_gene ~ 0L,
    start_snp < start_gene ~ start_gene - start_snp,
    start_snp > end_gene ~ start_snp - end_gene
  ),
  Name = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ Name
  ),
    gene_id = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ gene_id
  ),
  inExon = snpID %in% ensExons$snpID,
  dist2Gene = case_when(
    Name == "" ~ "",
    inExon ~ "CDS",
    dist2Gene == 0 & !inExon ~ "Intronic",
    dist2Gene > 0 ~ comma(dist2Gene)
  ),
  seqnames = as.character(seqnames)
  ) %>%
  dplyr::select(
    snpID,
    chromosome = seqnames,
    position = start_snp,
    gene_id, Name, dist2Gene
  ) %>%
  left_join(alleleResults) %>%
  dplyr::select(
    snpID, chromosome, position, starts_with("MAF"),
    gene_id, Name, dist2Gene, p, FDR, adjP
  ) %>%
  arrange(p) %>%
  mutate(
    position = comma(position),
    MAF_1996 = sprintf("%.3f", MAF_1996),
    MAF_2012 = sprintf("%.3f", MAF_2012),
    p = tidyP(p),
    FDR = tidyP(FDR),
    adjP = tidyP(adjP)
  ) %>%
  pander(
    justify = "llrrrlllrrr",
    style = "rmarkdown",
     big.mark = ",",
    split.tables = Inf,
      caption = paste(
        "*SNPs considered as significant when analysing by genotype using an FDR cutoff of", 
        percent_format(1)(alpha),
        "Minor allele frequencies are reported based on frequencies in the 1996 population.",
        "Any genes within", paste0(maxKb, "kb"), "are also shown.",
        "Bonferroni-adjusted p-values are given in the final column.*"
      )
  )
SNPs considered as significant when analysing by genotype using an FDR cutoff of 5% Minor allele frequencies are reported based on frequencies in the 1996 population. Any genes within 40kb are also shown. Bonferroni-adjusted p-values are given in the final column.
snpID chromosome position MAF_1996 MAF_2012 gene_id Name dist2Gene p FDR adjP
3391201_92 GL018705 1,937,359 0.225 0.010 ENSOCUG00000016428 RCBTB2 37,689 3.44e-07 0.006 0.006
916731_91 4 89,602,973 0.196 0.000 ENSOCUG00000006126 RIC8B Intronic 6.88e-07 0.006 0.012
3040789_114 19 30,689,426 0.186 0.000 ENSOCUG00000010440 MSI2 Intronic 1.53e-06 0.008 0.026
321186_117 2 27,995,273 0.250 0.012 1.87e-06 0.008 0.032
1009106_1 6 9,037,073 0.443 0.128 ENSOCUG00000027083 CLEC19A 15,180 2.52e-06 0.008 0.043
836950_151 4 35,111,855 0.291 0.041 ENSOCUG00000012144 TMPRSS12 5,745 3.09e-06 0.008 0.053
464226_128 2 125,603,458 0.191 0.000 3.11e-06 0.008 0.053
2227006_109 13 129,650,867 0.271 0.033 ENSOCUG00000022823 ID3 16,426 4.17e-06 0.008 0.072
1902686_98 12 60,113,885 0.163 0.000 ENSOCUG00000001871 12,820 4.75e-06 0.008 0.082
3374849_109 GL018704 1,294,009 0.207 0.010 4.87e-06 0.008 0.084
4044777_119 GL018933 204,058 0.018 0.202 ENSOCUG00000005859 FNBP1 Intronic 7.75e-06 0.011 0.133
1089925_112 7 35,584,375 0.284 0.043 9.72e-06 0.013 0.167
3695415_108 GL018751 737,318 0.261 0.033 ENSOCUG00000003672 TRAF3 Intronic 1.28e-05 0.014 0.219
2099634_108 13 45,086,624 0.261 0.034 1.41e-05 0.014 0.243
87917_119 1 64,635,286 0.186 0.000 ENSOCUG00000007758 CEP78 Intronic 1.43e-05 0.014 0.246
4010189_98 GL018907 283,766 0.193 0.010 ENSOCUG00000008870 BRWD1 Intronic 1.63e-05 0.015 0.280
2689075_49 16 56,527,308 0.058 0.304 ENSOCUG00000000984 ESRRG Intronic 1.65e-05 0.015 0.284
4149102_107 GL019077 34,265 0.223 0.029 ENSOCUG00000024268 IFT140 Intronic 2.28e-05 0.020 0.392
1997286_93 12 141,844,290 0.089 0.356 ENSOCUG00000004829 ESR1 Intronic 2.44e-05 0.020 0.419
2906428_79 18 25,311,176 0.173 0.443 ENSOCUG00000011533 RHOBTB1 24 2.54e-05 0.020 0.437
2405359_66 14 97,813,263 0.021 0.202 ENSOCUG00000005835 STXBP5L Intronic 4.48e-05 0.033 0.770
3000704_65 19 13,055,154 0.032 0.234 5.42e-05 0.038 0.931
4146664_90 GL019084 74,038 0.196 0.020 ENSOCUG00000010071 GNB1 Intronic 5.49e-05 0.038 0.944
4098854_101 GL018985 129,495 0.170 0.010 ENSOCUG00000006396 Intronic 6.69e-05 0.043 1.000
4098838_14 GL018985 123,328 0.364 0.127 ENSOCUG00000006396 Intronic 6.80e-05 0.043 1.000
1882774_17 12 40,509,779 0.321 0.594 7.29e-05 0.043 1.000
1045185_75 7 2,702,919 0.214 0.038 ENSOCUG00000027861 3,276 7.81e-05 0.043 1.000
3016258_99 19 19,178,148 0.220 0.033 ENSOCUG00000015254 SEZ6 Intronic 7.86e-05 0.043 1.000
2028473_73 13 2,979,383 0.018 0.173 8.05e-05 0.043 1.000
3829050_109 GL018791 957,960 0.163 0.010 ENSOCUG00000007849 FHAD1 Intronic 8.09e-05 0.043 1.000
2580149_42 15 87,473,844 0.351 0.115 ENSOCUG00000001910 ADGRL3 Intronic 8.42e-05 0.043 1.000
2527760_104 15 43,156,117 0.153 0.406 8.59e-05 0.043 1.000
3999128_75 GL018883 283,912 0.365 0.125 ENSOCUG00000016318 SERPINA6 Intronic 8.68e-05 0.043 1.000
4176850_23 GL019154 883 0.234 0.041 ENSOCUG00000002484 MTHFSD 828 8.92e-05 0.043 1.000
686773_29 3 90,851,512 0.438 0.167 ENSOCUG00000009616 CRISPLD1 CDS 9.73e-05 0.045 1.000
3032107_92 19 26,020,372 0.194 0.021 ENSOCUG00000024664 LHX1 20,607 1.02e-04 0.046 1.000

Under this model:

  • 5 SNP alleles were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05.
  • 38 SNP alleles were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05
convert \
-density 600 \
-size 1920 \
../figures/FigureS3b.pdf \
../figures/FigureS3b.png
knitr::include_graphics(file.path("..", "figures", "FigureS3b.png"))
Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.

Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.

alleleResults %>%
    mutate(Sig = FDR < alpha) %>%
    ggplot(aes(MAF_1996, MAF_2012, colour = Sig, label = snpID)) +
    geom_point() +
    geom_text_repel(data = . %>% filter(adjP < alpha)) +
    # geom_text_repel(data = . %>%filter(FDR < alpha & MAF_2012 > MAF_1996)) +
    scale_colour_manual(values = c(rgb(0.5, 0.5, 0.5, 0.6), "red")) +
    labs(x = "MAF (1996)", y = "MAF (2012)") +
    theme(legend.position = "none")
*Comparison of minor allele frequencies in both populations. SNPs with an FDR-adjusted p-value < 0.05 are coloured red, whilst those with a Bonferroni-adjusted p-value are additionally labelled.*

Comparison of minor allele frequencies in both populations. SNPs with an FDR-adjusted p-value < 0.05 are coloured red, whilst those with a Bonferroni-adjusted p-value are additionally labelled.

alleleResults %>% 
  filter(FDR < alpha) %>% 
  gather(Population, B, contains("MAF")) %>%
  mutate(
    Population = str_remove(Population, "MAF_"),
    A = 1 - B,
    Population = as.factor(Population),
    snpID = case_when(
      adjP < alpha ~ paste0(snpID, "*"),
      adjP >= alpha ~ as.character(snpID)
    ),
    snpID = factor(snpID, levels = unique(snpID))
  ) %>% 
  gather(Allele, Freq, A, B) %>%
  ggplot(aes(Allele, y = Freq, fill = Population)) + 
  geom_bar(
    position = position_dodge2(preserve = "single"),
    stat = "identity"
  ) + 
  facet_wrap(~snpID, drop = FALSE, ncol = 7) + 
  scale_fill_manual(values = c("black", "blue")) +
  labs(y = "Frequency") + 
  theme(
    legend.direction = "horizontal", 
    legend.position = c(0.75, 0.05)
  )
*Variants detected as significant to an FDR of 5% under the Allele Frequency model. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance.*

Variants detected as significant to an FDR of 5% under the Allele Frequency model. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance.

write_tsv(alleleResults, "alleleResults.tsv")

Analysis Using the FLK model

This was performed separately and no unique SNPs were detected in addition to any detected above.

Export of Data for Bayescan

A VCF was required with only the 1996 and 2012 populations, and restricted to the candidate SNPs after pruning for linkage disequilibrium and detection of the allele in the 1996 population.

var2VCF <- genotypes %>% 
  distinct(variant.id, snpID) %>%
  filter(snpID %in% filter(regionResults, p > 0.05)$snpID) %>% 
  .[["variant.id"]]
gdsFile <- seqOpen(gdsPath, readonly = TRUE)
seqSetFilter(
  gdsFile, 
  variant.id = var2VCF,
  sample.id = sampleID %>% 
    filter(Population %in% c(1996, 2012)) %>% 
    .[["Sample"]]
)
seqGDS2VCF(gdsFile, "../5_stacks/vcf/filtered.vcf.gz")
seqResetFilter(gdsFile)
seqClose(gdsFile)

Inclusion of Bayescan Results

After running Bayescan, results were imported. Unfortunately, these results had IDs which were taken from the line in the above VCF and need to be converted back to IDs compatible with previous analysis.

bayRes <- file.path("..", "external", "BayescanOut.txt.gz") %>%
  gzfile() %>%
  read_delim(delim = " ", col_names = FALSE, skip = 1, col_types = "innnnn-") %>%
  set_colnames(c("ID", "prob","log10.PO.","qval","alpha","fst")) %>%
  mutate(variant.id = var2VCF[ID])
bayRes %>%
  dplyr::select(variant.id, qval, fst) %>%
  dplyr::filter(qval < alpha) %>%
  left_join(
    genotypes %>% 
      distinct(snpID, .keep_all = TRUE) %>% 
      dplyr::select(variant.id, chromosome, position, snpID)
  ) %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    ignore.strand = TRUE,
    seqinfo = seqinfo(snp2Gene), 
    seqnames.field = "chromosome",
    start.field = "position",
    end.field = "position"
  ) %>%
  join_nearest(ensGenes) %>%
  as.data.frame() %>%
  left_join(
    ensGenes %>% as.data.frame(),
    by = c("Name", "gene_id", "seqnames", "description"),
    suffix = c("_snp", "_gene")
  ) %>%
  as_tibble() %>%
  dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene, qval, fst) %>%
  mutate(dist2Gene = case_when(
    start_snp >= start_gene & start_snp <= end_gene ~ 0L,
    start_snp < start_gene ~ start_gene - start_snp,
    start_snp > end_gene ~ start_snp - end_gene
  ),
  Name = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ Name
  ),
    gene_id = case_when(
    dist2Gene > maxKb*1000 ~ "",
    dist2Gene < maxKb*1000 ~ gene_id
  ),
  inExon = snpID %in% ensExons$snpID,
  dist2Gene = case_when(
    Name == "" ~ "",
    inExon ~ "CDS",
    dist2Gene == 0 & !inExon ~ "Intronic",
    dist2Gene > 0 ~ comma(dist2Gene)
  ),
  seqnames = as.character(seqnames)
  ) %>%
  dplyr::select(
    snpID, 
    chromosome = seqnames, position = start_snp, 
    gene_id, Name, dist2Gene, fst, qval) %>%
  # Replace the one dropped by `join_nearest()`
  bind_rows(
    bayRes %>%
      dplyr::select(variant.id, qval, fst) %>%
      dplyr::filter(qval < alpha) %>%
      left_join(
        genotypes %>% 
          distinct(snpID, .keep_all = TRUE) %>% 
          dplyr::select(variant.id, chromosome, position, snpID)
      ) %>%
      mutate(
        gene_id = "",
        Name = "",
        dist2Gene = ""
      ) %>%
      dplyr::select(-variant.id)
  ) %>%
  distinct(snpID, .keep_all = TRUE) %>%
  mutate(position = comma(position)) %>%
  arrange(qval) %>%
  mutate(
    fst = tidyP(fst),
    qval = tidyP(qval)
  ) %>%
  pander(
    justify = "llrllrrr",
    style = "rmarkdown",
     big.mark = ",",
    split.tables = Inf,
      caption = paste(
        "*Results from Bayescan with a q-value <", paste0(alpha, "."),
        "The nearest gene within", paste0(maxKb, "kb"), "is also given.",
        "One SNP detected by Bayescan was located on a scaffold with no genes (GL019119).*"
      )
  )
Results from Bayescan with a q-value < 0.05. The nearest gene within 40kb is also given. One SNP detected by Bayescan was located on a scaffold with no genes (GL019119).
snpID chromosome position gene_id Name dist2Gene fst qval
3391201_92 GL018705 1,937,359 ENSOCUG00000016428 RCBTB2 37,689 0.043 0.003
321186_117 2 27,995,273 0.042 0.008
1009106_1 6 9,037,073 ENSOCUG00000027083 CLEC19A 15,180 0.036 0.013
836950_151 4 35,111,855 ENSOCUG00000012144 TMPRSS12 5,745 0.035 0.018
2227006_109 13 129,650,867 ENSOCUG00000022823 ID3 16,426 0.035 0.023
3374849_109 GL018704 1,294,009 0.037 0.026
4044777_119 GL018933 204,058 ENSOCUG00000005859 FNBP1 Intronic 0.033 0.031
4165193_106 GL019119 147,630 0.034 0.034
3695415_108 GL018751 737,318 ENSOCUG00000003672 TRAF3 Intronic 0.033 0.038
4010189_98 GL018907 283,766 ENSOCUG00000008870 BRWD1 Intronic 0.033 0.044
1089925_112 7 35,584,375 0.032 0.049
grid.newpage()
list(
  Genotype = dplyr::filter(genotypeResults, FDR < alpha)$variant.id,
  Allele = alleleResults %>%
    left_join(snps) %>%
    dplyr::filter(FDR < alpha) %>%
    .[["variant.id"]],
  Bayescan = dplyr::filter(bayRes, qval < 0.05)$variant.id
) %>% 
 VennDiagram::venn.diagram(
   filename = NULL,
   cex = 2,
   cat.cex = 2
   ) %>%
  grid.draw()
*Summary of results from all three methods, using an FDR of 0.05 for each approach. All SNPs detected under Bayescan were identified using Fisher's Exact Test for both allele and genotype analysis*

Summary of results from all three methods, using an FDR of 0.05 for each approach. All SNPs detected under Bayescan were identified using Fisher’s Exact Test for both allele and genotype analysis

sigSnps <- c(
  filter(alleleResults, FDR < alpha)$snpID,
  filter(genotypeResults, FDR < alpha)$snpID
) %>%
  unique()
suppressWarnings(
  snpsGR %>%
    subset(snpID %in% sigSnps) %>%
    resize(width = 1000*maxKb*2, fix = "center") %>%
    trim() %>%
    find_overlaps(ensGenes) %>%
    as.data.frame() %>%
    as_tibble() %>%
    left_join(snps, by = c("variant.id", "snpID", "Col")) %>%
    dplyr::select(snpID, gene_id, Name, chromosome, position) %>%
    left_join(
      as.data.frame(ensGenes), by = c("gene_id", "Name")
    ) %>%
    mutate(
      distance = case_when(
        position < start ~ comma(start - position, accuracy = 1),
        position > end ~ comma(position - end, accuracy = 1),
        snpID %in% ensExons$snpID ~ "Exonic",
        position >= start & position <= end ~ "Intronic"
      ),
      model = ifelse(
        snpID %in% filter(alleleResults, FDR < alpha)$snpID, "A", ""
        ),
      model = ifelse(
         snpID %in% dplyr::filter(snps, variant.id %in% dplyr::filter(bayRes, qval < 0.05)$variant.id)$snpID,
         paste0(model, "B"), model
      ),
      model = ifelse(
        snpID %in% filter(genotypeResults, FDR < alpha)$snpID,
        paste0(model, "G"), model
      ),
      width = comma(width, accuracy = 1)
    ) %>%
    unite(location, chromosome, position, sep = ":") %>%
    dplyr::select(
      `SNP Location` = location, 
      `SNP ID` = snpID, 
      `Gene ID` = gene_id, 
      `Gene Name` = Name, 
      `Gene Width` = width, 
      `Distance` = distance, 
      Models = model
    )
) %>% 
  pander(
    justify = "llllrll",
    style = "rmarkdown",
    split.tables = Inf,
    # big.mark = ",",
    caption = paste0(
      "*All putatively significant SNPs with genes located within ", maxKb, "kb. ",
      "The distance from SNP to gene is indicated, unless the SNP is intronic/exonic. ",
      "The model(s) which identified the SNP are indicated in the final column ",
      "as (A) the Allele Model; (B) Bayescan and (G) the Genotype Model.*"
    )
  )
All putatively significant SNPs with genes located within 40kb. The distance from SNP to gene is indicated, unless the SNP is intronic/exonic. The model(s) which identified the SNP are indicated in the final column as (A) the Allele Model; (B) Bayescan and (G) the Genotype Model.
SNP Location SNP ID Gene ID Gene Name Gene Width Distance Models
1:64635286 87917_119 ENSOCUG00000007758 CEP78 51,885 Intronic AG
2:139723776 482617_100 ENSOCUG00000001127 RHOQ 31,670 8,346 G
2:139723776 482617_100 ENSOCUG00000021835 ATP6V1E2 681 10,081 G
2:139723776 482617_100 ENSOCUG00000028184 TMEM247 4,917 37,653 G
3:56309981 644062_1 ENSOCUG00000004827 SFXN1 26,726 Intronic G
3:90851512 686773_29 ENSOCUG00000009616 CRISPLD1 51,813 Exonic AG
4:35111855 836950_151 ENSOCUG00000012144 TMPRSS12 40,859 5,745 ABG
4:89602973 916731_91 ENSOCUG00000006126 RIC8B 123,296 Intronic AG
6:9037073 1009106_1 ENSOCUG00000027083 CLEC19A 20,430 15,180 ABG
7:2702919 1045185_75 ENSOCUG00000007975 ZNF777 32,875 12,875 A
7:2702919 1045185_75 ENSOCUG00000027861 1,123 3,276 A
12:60113885 1902686_98 ENSOCUG00000016906 103,853 13,295 AG
12:60113885 1902686_98 ENSOCUG00000001871 1,594 12,820 AG
12:60113885 1902686_98 ENSOCUG00000021694 1,144 18,149 AG
12:60113885 1902686_98 ENSOCUG00000023798 2,496 35,410 AG
12:141844290 1997286_93 ENSOCUG00000004829 ESR1 474,528 Intronic A
13:129650867 2227006_109 ENSOCUG00000022823 ID3 528 16,426 ABG
14:97813263 2405359_66 ENSOCUG00000005835 STXBP5L 418,595 Intronic A
15:87473844 2580149_42 ENSOCUG00000001910 ADGRL3 571,686 Intronic AG
16:56527308 2689075_49 ENSOCUG00000000984 ESRRG 238,366 Intronic AG
18:25311176 2906428_79 ENSOCUG00000011533 RHOBTB1 116,987 24 AG
19:19178148 3016258_99 ENSOCUG00000015254 SEZ6 46,097 Intronic AG
19:26020372 3032107_92 ENSOCUG00000014434 AATF 127,766 30,904 AG
19:26020372 3032107_92 ENSOCUG00000024664 LHX1 6,982 20,607 AG
19:30689426 3040789_114 ENSOCUG00000010440 MSI2 396,928 Intronic AG
GL018704:2271135 3378006_1 ENSOCUG00000011042 COL8A2 3,187 34,195 G
GL018704:2271135 3378006_1 ENSOCUG00000011038 ADPRHL2 5,946 24,425 G
GL018704:2271135 3378006_1 ENSOCUG00000029722 TEKT2 3,671 20,166 G
GL018704:2271135 3378006_1 ENSOCUG00000027798 1,002 16,486 G
GL018704:2271135 3378006_1 ENSOCUG00000006255 AGO3 128,900 9,610 G
GL018705:1937359 3391201_92 ENSOCUG00000016428 RCBTB2 37,081 37,689 ABG
GL018751:737318 3695415_108 ENSOCUG00000010323 CDC42BPB 107,021 5,698 ABG
GL018751:737318 3695415_108 ENSOCUG00000003672 TRAF3 48,133 Intronic ABG
GL018791:957960 3829050_109 ENSOCUG00000007849 FHAD1 123,752 Intronic AG
GL018883:283912 3999128_75 ENSOCUG00000016273 PPP4R4 96,327 21,034 A
GL018883:283912 3999128_75 ENSOCUG00000016318 SERPINA6 21,391 Intronic A
GL018907:283766 4010189_98 ENSOCUG00000008870 BRWD1 121,750 Intronic ABG
GL018933:204058 4044777_119 ENSOCUG00000005859 FNBP1 73,115 Intronic ABG
GL018985:123328 4098838_14 ENSOCUG00000006403 CLDN5 657 14,238 A
GL018985:123328 4098838_14 ENSOCUG00000006396 38,624 Intronic A
GL018985:123328 4098838_14 ENSOCUG00000025709 UFD1 18,611 28,184 A
GL018985:129495 4098854_101 ENSOCUG00000006403 CLDN5 657 20,405 AG
GL018985:129495 4098854_101 ENSOCUG00000006396 38,624 Intronic AG
GL018985:129495 4098854_101 ENSOCUG00000025709 UFD1 18,611 22,017 AG
GL019049:178272 4136275_162 ENSOCUG00000009045 ASS1 36,901 36,520 G
GL019049:178272 4136275_162 ENSOCUG00000017433 8,159 11,078 G
GL019049:178272 4136275_162 ENSOCUG00000026644 29,389 Intronic G
GL019084:74038 4146664_90 ENSOCUG00000010071 GNB1 40,037 Intronic AG
GL019084:74038 4146664_90 ENSOCUG00000021568 NADK 14,270 12,587 AG
GL019084:74038 4146664_90 ENSOCUG00000007721 37,538 30,580 AG
GL019077:34265 4149102_107 ENSOCUG00000027299 PTX4 3,616 19,359 AG
GL019077:34265 4149102_107 ENSOCUG00000000521 TELO2 11,447 6,323 AG
GL019077:34265 4149102_107 ENSOCUG00000024268 IFT140 81,557 Intronic AG
GL019077:34265 4149102_107 ENSOCUG00000001136 TMEM204 19,558 13,406 AG
GL019154:883 4176850_23 ENSOCUG00000002484 MTHFSD 13,513 828 AG
GL019154:883 4176850_23 ENSOCUG00000001701 FOXF1 2,195 19,026 AG
GL019274:63350 4208471_113 ENSOCUG00000015189 321 37,542 G

Enrichment testing

fullPaths <- list(
  toTable(GOBPANCESTOR) %>% set_names(c("go_id", "ancestor")),
  toTable(GOCCANCESTOR) %>% set_names(c("go_id", "ancestor")),
  toTable(GOMFANCESTOR) %>% set_names(c("go_id", "ancestor"))
) %>% 
  bind_rows() %>%
  dplyr::filter(!ancestor %in% c("all", "GO:0008150", "GO:0003674", "GO:0005575")) %>%
  as_tibble() %>%
  bind_rows(
    tibble(
      go_id = unique(.$go_id),
      ancestor = unique(.$go_id)
    )
  ) %>%
  arrange(go_id, ancestor)
goSummaries <- url("https://uofabioinformaticshub.github.io/summaries2GO/data/goSummaries.RDS") %>%
  readRDS() %>%
  dplyr::rename(go_id = id)

GO Analysis By Gene

For the initial GO analysis, any gene within 40kb of a SNP considered as significant were tested for any enrichment of biological characteristics.

allSnpsGR <- snps %>%
  dplyr::select(chromosome, position, snpID) %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    seqnames.field = "chromosome", 
    start.field = "position", 
    end.field = "position", 
    seqinfo = seqinfo(ensGenes)) %>%
  join_overlap_inner(
    ensGenes,
    maxgap = maxKb*1000
  )
sigSnpGR <- allSnpsGR %>%
  subset(snpID %in% sigSnps)
nGenes <- allSnpsGR$gene_id %>%
  unique() %>%
  length()
nSigGenes <- sigSnpGR$gene_id %>%
  unique() %>%
  length()

10821 genes were identified as being within 40kb of a SNP. For the 32 candidate SNPs within this distance of a gene, 54 genes were identified, and these were analysed for enrichment of any GO terms.

Mappings from genes to GO terms were downloaded manually from the biomart server at www.ensembl.org.

bm <- file.path("..", "external", "gene2GO_biomart.tsv.gz") %>%
  gzfile() %>%
  read_tsv() %>%
  dplyr::filter(!is.na(`GO domain`)) %>%
  dplyr::rename(gene_id = `Gene stable ID`,
         ontology = `GO domain`,
         go_id = `GO term accession`,
         go_term = `GO term name`) %>%
  dplyr::filter(gene_id %in% allSnpsGR$gene_id) %>%
  left_join(fullPaths) %>%
  dplyr::select(gene_id, go_id = ancestor) %>%
  distinct(gene_id, go_id) %>%
  dplyr::filter(!is.na(go_id))

Complete paths of GO terms back to ontology roots were obtained for each gene. The minimum number of steps back to the ontology roots were also obtained for each GO ID.

goRes <- bm %>%
  mutate(inSig = gene_id %in% sigSnpGR$gene_id) %>%
  group_by(go_id) %>%
  summarise(N = dplyr::n(), 
            nSig = sum(inSig)) %>%
  left_join(goSummaries) %>%
  filter(nSig > 0, shortest_path > 2) %>%
  distinct(go_id, .keep_all = TRUE) %>%
  split(f = .$go_id) %>%
  mclapply(function(x){
    ft <- matrix(
      c(nSigGenes - x$nSig, x$nSig, nGenes - nSigGenes - x$N, x$N),
      nrow = 2, byrow = TRUE) %>%
      fisher.test()
    mutate(x, p = ft$p.value)
  }, mc.cores = mc) %>%
  bind_rows() %>%
  mutate(Expected = nSigGenes * N / nGenes,
         FDR = p.adjust(p, "fdr"),
         adjP = p.adjust(p, "bonferroni"),
         term = Term(go_id),
         ontology = Ontology(go_id)) %>%
  dplyr::select(go_id, term, ontology, N, nSig, Expected, p, adjP, FDR) %>%
  arrange(p)
goRes %>%
  dplyr::filter(FDR == min(FDR), nSig > 1) %>%
  mutate(
    Term = Term(go_id),
    Genes  = vapply(go_id, function(x){
      subset(sigSnpGR, gene_id %in% I(
        dplyr::filter(bm, go_id == x) %>%
          .[["gene_id"]]))$Name %>%
        sort() %>%
        unique() %>%
        paste(collapse = ", ")
    }, character(1))
  ) %>%
  dplyr::select(ID = go_id, Ontology = ontology, Term, Total = N, Sig = nSig, Expected, p, FDR, Genes) %>%
  mutate(
    ID = gsub(":", "\\\\:", ID),
    Expected = tidyP(Expected),
    p = tidyP(p),
    FDR = round(FDR, 3)
  ) %>%
  distinct(Ontology, Genes, .keep_all = TRUE) %>%
  pander(
    split.tables = Inf, style = "rmarkdown", 
    justify = "lllrrrrrl",
    caption = paste(
      "*Most highly ranked GO terms.", 
      "Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown.",
      "Overall, this corresponds to an FDR of", 
      percent(max(.$FDR)), ".*"
    )
  )
Most highly ranked GO terms. Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown. Overall, this corresponds to an FDR of 22% .
ID Ontology Term Total Sig Expected p FDR Genes
GO:0005496 MF steroid binding 34 3 0.170 8.10e-04 0.216 ESR1, ESRRG, SERPINA6
GO:0060068 BP vagina development 7 2 0.035 8.60e-04 0.216 ESR1, LHX1
GO:0001655 BP urogenital system development 146 5 0.729 9.03e-04 0.216 ESR1, FOXF1, ID3, IFT140, LHX1
GO:0072189 BP ureter development 10 2 0.050 0.002 0.216 FOXF1, LHX1
GO:0002221 BP pattern recognition receptor signaling pathway 51 3 0.255 0.002 0.216 ESR1, TRAF3, UFD1
GO:0060041 BP retina development in camera-type eye 56 3 0.279 0.003 0.216 GNB1, IFT140, LHX1
GO:0043010 BP camera-type eye development 122 4 0.609 0.004 0.216 COL8A2, GNB1, IFT140, LHX1
GO:0021680 BP cerebellar Purkinje cell layer development 16 2 0.080 0.004 0.216 LHX1, SEZ6
GO:0072001 BP renal system development 132 4 0.659 0.005 0.216 FOXF1, ID3, IFT140, LHX1
GO:0003727 MF single-stranded RNA binding 19 2 0.095 0.005 0.216 AGO3, MSI2
GO:0031050 BP dsRNA processing 19 2 0.095 0.005 0.216 AGO3, ESR1
GO:0004879 MF nuclear receptor activity 20 2 0.100 0.005 0.216 ESR1, ESRRG
GO:0050688 BP regulation of defense response to virus 20 2 0.100 0.005 0.216 TRAF3, UFD1
GO:0001750 CC photoreceptor outer segment 23 2 0.115 0.007 0.216 GNB1, IFT140
GO:0048701 BP embryonic cranial skeleton morphogenesis 26 2 0.130 0.009 0.216 IFT140, LHX1
GO:0030522 BP intracellular receptor signaling pathway 85 3 0.424 0.010 0.216 ESR1, ESRRG, UFD1

Meta Analysis With Previous Paper

The results in the Turretfield population from the paper Resistance to RHD virus in wild Australia rabbits: Comparison of susceptible and resistant individuals using a genomewide approach were then compared to these results. In this previous analysis, results were generated by using Fisher’s Exact Test to test genotypes for association with RHDV susceptibility or resistance.

tfGR <- url("https://raw.githubusercontent.com/hdetering/orycun/master/results/filteredSNPScores.csv") %>% 
  read_csv() %>%
  distinct(min_snp, .keep_all = TRUE) %>%
  dplyr::select(chromosome = chr_name, pos, snpID = min_snp, p, FDR) %>%
  mutate(chromosome = gsub("^chr", "", chromosome)) %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    ignore.strand = TRUE, 
    seqinfo = seqinfo(ensGenes),
    seqnames.field = "chromosome", 
    start.field = "pos", 
    end.field = "pos") %>%
  sort()

Of the 9,229 unique SNPs contained in the previous dataset, only 1,431 were also identified in the current dataset. A meta-analysis was performed on these SNPs using the results from both separate analyses which used Fisher’s Exact Test on genotypes. In this approach, Fisher’s method was used to combine p-values, and resultant p-values were drawn from a \(\chi^2\) distribution with 4 degrees of freedom (i.e. \(2k\)).

combinedResults <- tfGR %>%
  find_overlaps(allSnpsGR) %>%
  mcols() %>%
  as.data.frame() %>%
  as_tibble() %>%
  dplyr::select(snpID = snpID.y, tf.p = p) %>%
  distinct(snpID, .keep_all = TRUE) %>%
  left_join(
    genotypeResults %>%
      dplyr::select(snpID, chromosome, position, geno.p = p)) %>% 
  mutate(
    X = -2*(log(tf.p) + log(geno.p)), 
    p = pchisq(X, 4, lower.tail = FALSE), 
    adjP = p.adjust(p, "bonferroni"),
    FDR = p.adjust(p, "fdr")) %>% 
  arrange(p) 

Using Bonferroni’s adjustment to control the FWER at 0.05 gave 2 candidate SNPs, whilst using the more relaxed criterion of an FDR < 0.05 gave 10 candidate SNPs.

combinedResults %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(
    snpID,
    Chromosome = chromosome, 
    Position = position,
    p, adjP, FDR
  ) %>%
  mutate(
    p = tidyP(p),
    adjP = tidyP(adjP),
    FDR = tidyP(FDR)
  ) %>%
  pander(
    justify = "lrrrrr",
    caption = "*Significant SNPs from the meta-analysis using the previous published results from a separate population, using an FDR < 0.05 as the sole criterion for significance.*"
    )
Significant SNPs from the meta-analysis using the previous published results from a separate population, using an FDR < 0.05 as the sole criterion for significance.
snpID Chromosome Position p adjP FDR
644062_1 3 56309981 1.67e-06 0.002 0.002
3378006_1 GL018704 2271135 2.35e-05 0.031 0.015
3941015_20 GL018847 216450 5.32e-05 0.069 0.023
213788_4 1 146274492 8.65e-05 0.113 0.025
3118382_68 20 14614986 9.50e-05 0.124 0.025
3108287_38 20 10514947 1.41e-04 0.183 0.031
3156991_0 21 6185579 1.79e-04 0.234 0.033
2425370_49 14 114833115 2.08e-04 0.272 0.034
3904662_77 GL018828 225354 3.00e-04 0.392 0.043
3970912_82 GL018864 1306 3.32e-04 0.433 0.043
allSnpsGR %>% 
  subset(snpID %in% dplyr::filter(combinedResults, FDR < 0.05)$snpID) %>%
  subset(!is.na(Name)) %>% 
  as.data.frame() %>%
  dplyr::select(
    snpID, 
    Chromosome = seqnames,
    BP = start, 
    ID = gene_id,
    Name,
    Description = description
  ) %>%
  mutate(Description = str_remove(Description, " \\[Source.+")) %>%
  pander(
    style = "rmarkdown", split.table = Inf, justify = "lrrlll",
    caption = paste0(
      "*Genes within ", maxKb, "kb of SNPs identified in the meta-analysis.*"
    )
  )
Genes within 40kb of SNPs identified in the meta-analysis.
snpID Chromosome BP ID Name Description
213788_4 1 146274492 ENSOCUG00000000568 HBB2 hemoglobin, beta
213788_4 1 146274492 ENSOCUG00000021180 HBG2 hemoglobin, gamma G
213788_4 1 146274492 ENSOCUG00000027533 HBG1 hemoglobin, gamma A
213788_4 1 146274492 ENSOCUG00000026912 OR51I1 olfactory receptor family 51 subfamily I member 1
213788_4 1 146274492 ENSOCUG00000025404 OLFR64_1 MOR 5’beta3
644062_1 3 56309981 ENSOCUG00000004827 SFXN1 sideroflexin 1
2425370_49 14 114833115 ENSOCUG00000011289 ALCAM activated leukocyte cell adhesion molecule
3108287_38 20 10514947 ENSOCUG00000008859 KCNH5 potassium voltage-gated channel subfamily H member 5
3118382_68 20 14614986 ENSOCUG00000008913 RPS6KA5 ribosomal protein S6 kinase A5
3156991_0 21 6185579 ENSOCUG00000005961 KDM2B lysine demethylase 2B
3156991_0 21 6185579 ENSOCUG00000005957 RNF34 ring finger protein 34
3378006_1 GL018704 2271135 ENSOCUG00000011042 COL8A2 collagen type VIII alpha 2 chain
3378006_1 GL018704 2271135 ENSOCUG00000011038 ADPRHL2 ADP-ribosylhydrolase like 2
3378006_1 GL018704 2271135 ENSOCUG00000029722 TEKT2 tektin 2
3378006_1 GL018704 2271135 ENSOCUG00000006255 AGO3 argonaute RISC catalytic component 3
3904662_77 GL018828 225354 ENSOCUG00000027620 PAQR4 progestin and adipoQ receptor family member 4
3904662_77 GL018828 225354 ENSOCUG00000017181 PKMYT1 protein kinase, membrane associated tyrosine/threonine 1
3904662_77 GL018828 225354 ENSOCUG00000027985 TNFRSF12A TNF receptor superfamily member 12A
3904662_77 GL018828 225354 ENSOCUG00000017197 CLDN6 claudin 6
3941015_20 GL018847 216450 ENSOCUG00000012298 ODF2 outer dense fiber of sperm tails 2
3941015_20 GL018847 216450 ENSOCUG00000008855 CERCAM cerebral endothelial cell adhesion molecule
3970912_82 GL018864 1306 ENSOCUG00000021601 SCARB1 scavenger receptor class B member 1
combinedResults %>%
  mutate(Sig = FDR < 0.05) %>%
  ggplot(aes(-log10(geno.p), -log10(tf.p), colour = Sig)) +
  geom_point() +
  geom_abline(slope =1, intercept = 0, linetype = 2, colour = "blue") +
  geom_text_repel(aes(label = snpID),
                  data = . %>% filter(Sig)) +
  scale_color_manual(values = c("black", "red")) +
  labs(x = expression(paste(-log[10], "p (Genotype Analysis)")),
       y = expression(paste(-log[10], "p (Previous Analysis)"))) +
  theme(legend.position = "none")
*Comparison of p-values for SNPs detected in both analyses, with those considered as significant after meta-analysis shown in red.*

Comparison of p-values for SNPs detected in both analyses, with those considered as significant after meta-analysis shown in red.

combinedResults %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(snpID) %>%
  left_join(genotypes) %>%
  filter(Population != 2010,
         !variant.id %in% regionSNPs$variant.id) %>%
  mutate(Genotype = as.factor(Genotype),
         Population = as.factor(Population)) %>%
    ggplot(aes(Genotype, fill = Population)) + 
    geom_bar(position = position_dodge2(preserve = "single")) + 
    facet_wrap(~snpID, drop = FALSE) + 
    scale_fill_manual(values = c("black", "blue")) +
    labs(y = "Count") + 
    theme(
        legend.direction = "horizontal", 
        legend.position = c(0.75, 0.05)
    )
*Allele distributions in the 1996 and 2012 populations for SNPs identified under the meta-analysis.*

Allele distributions in the 1996 and 2012 populations for SNPs identified under the meta-analysis.

GO Enrichment

This list was also used to test for GO enrichment.

sigSnpGR <- allSnpsGR %>%
  subset(snpID %in% filter(combinedResults, FDR < 0.05)$snpID)
nGenes <- allSnpsGR %>%
  subset(snpID %in% combinedResults$snpID) %>%
  unique() %>%
  length()
nSigGenes <- sigSnpGR$gene_id %>%
  unique() %>%
  length()
combinedGoRes <- bm %>%
  dplyr::filter(
    gene_id %in% subset(allSnpsGR, snpID %in% combinedResults$snpID)$gene_id) %>%
  mutate(inSig = gene_id %in% sigSnpGR$gene_id) %>%
  group_by(go_id) %>%
  summarise(N = dplyr::n(), 
            nSig = sum(inSig)) %>%
  left_join(goSummaries) %>%
  filter(nSig > 0, shortest_path > 2) %>%
  distinct(go_id, .keep_all = TRUE) %>%
  split(f = .$go_id) %>%
  mclapply(function(x){
    ft <- matrix(
      c(nSigGenes - x$nSig, x$nSig, nGenes - nSigGenes - x$N, x$N),
      nrow = 2, byrow = TRUE) %>%
      fisher.test()
    mutate(x, p = ft$p.value)
  }, mc.cores = mc) %>%
  bind_rows() %>%
  mutate(Expected = nSigGenes * N / nGenes,
         FDR = p.adjust(p, "fdr"),
         adjP = p.adjust(p, "bonferroni"),
         term = Term(go_id),
         ontology = Ontology(go_id)) %>%
  dplyr::select(go_id, term, ontology, shortest_path, N, nSig, Expected, p, adjP, FDR) %>%
  arrange(p)

Inspection of the results revealed an enrichment for terms related to haemoglobin, however, as these genes are co-located this was revealed to be due to a single SNP with multiple haemoglobin genes within 40kb and can be disregarded.

combinedGoRes %>%
  dplyr::filter(
    nSig > Expected,
    FDR < 0.05
  ) %>%
  mutate(Term = Term(go_id),
         Genes  = vapply(go_id, function(x){
           subset(sigSnpGR, gene_id %in% I(
             dplyr::filter(bm, go_id == x) %>%
               .[["gene_id"]]))$Name %>%
             sort() %>%
             unique() %>%
             paste(collapse = ", ")
         }, character(1)),
         nSnps = vapply(go_id, function(x){
           subset(sigSnpGR, gene_id %in% I(
             dplyr::filter(bm, go_id == x) %>%
               .[["gene_id"]]))$snpID %>%
             unique() %>%
             length()
           }, integer(1))) %>%
  dplyr::select(ID = go_id, Ontology = ontology, Term, Total = N, Sig = nSig, nSnps, Expected, p, FDR, Genes) %>%
  mutate(ID = gsub(":", "\\\\:", ID),
         Expected = round(Expected, 3),
         p = round(p, 4),
         FDR = round(FDR, 3)) %>%
  distinct(Ontology, Genes, .keep_all = TRUE) %>%
  pander(
    split.tables = Inf, style = "rmarkdown", 
    justify = "lllrrrrrrl",
    caption = paste(
      "Most highly ranked GO terms for combined results.", 
      "Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown.",
      "Overall, this corresponds to an FDR of", 
      percent(max(.$FDR)), ".*"
    )
  )
Most highly ranked GO terms for combined results. Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown. Overall, this corresponds to an FDR of 4% .*
ID Ontology Term Total Sig nSnps Expected p FDR Genes
GO:0015669 BP gas transport 4 3 1 0.075 2e-04 0.034 HBB2, HBG1, HBG2
GO:0019825 MF oxygen binding 4 3 1 0.075 2e-04 0.034 HBB2, HBG1, HBG2
GO:0015893 BP drug transport 14 4 2 0.264 3e-04 0.037 HBB2, HBG1, HBG2, SFXN1

Export All Tested SNPs

snpsGR %>%
    resize(width = 1000*maxKb*2, fix = "center") %>%
    trim() %>%
    find_overlaps(ensGenes) %>%
    as.data.frame() %>%
    as_tibble() %>%
    left_join(snps, by = c("variant.id", "snpID", "Col")) %>%
    dplyr::select(snpID, gene_id, Name, chromosome, position) %>%
    left_join(
        as.data.frame(ensGenes), by = c("gene_id", "Name")
    ) %>%
    mutate(
        distance = case_when(
            position < start ~ comma(start - position, accuracy = 1),
            position > end ~ comma(position - end, accuracy = 1),
            snpID %in% ensExons$snpID ~ "Exonic",
            position >= start & position <= end ~ "Intronic"
        ),
        model = ifelse(
            snpID %in% filter(alleleResults, FDR < alpha)$snpID, "A", ""
        ),
        model = ifelse(
            snpID %in% dplyr::filter(snps, variant.id %in% dplyr::filter(bayRes, qval < 0.05)$variant.id)$snpID,
            paste0(model, "B"), model
        ),
        model = ifelse(
            snpID %in% filter(genotypeResults, FDR < alpha)$snpID,
            paste0(model, "G"), model
        ),
        width = comma(width, accuracy = 1)
    ) %>%
    unite(location, chromosome, position, sep = ":") %>%
    dplyr::select(
        snpID, location, gene_id, Name, distance
    ) %>%
    left_join(
        genotypeResults %>% dplyr::select(snpID, genotype_p = p)
    ) %>%
    left_join(
        alleleResults %>% dplyr::select(snpID, allele_p = p)
    ) %>%
    write_tsv(here::here("R", "allTestedSnps.tsv"))

All tested SNPs, along with genes within 40kb and p-values was then exported, and can be downloaded from here

Session Information

R version 3.6.2 (2019-12-12)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, parallel, grid, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: GO.db(v.3.10.0), AnnotationDbi(v.1.48.0), Biobase(v.2.46.0), rtracklayer(v.1.46.0), plyranges(v.1.6.7), GenomicRanges(v.1.38.0), GenomeInfoDb(v.1.22.0), IRanges(v.2.20.2), S4Vectors(v.0.24.3), BiocGenerics(v.0.32.0), ggrepel(v.0.8.1), qqman(v.0.1.4), ggsn(v.0.5.0), rgdal(v.1.4-8), ggmap(v.3.0.0), sp(v.1.3-2), readxl(v.1.3.1), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.4), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.2), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.3.0), magrittr(v.1.5), scales(v.1.1.0), pander(v.0.6.3), SNPRelate(v.1.20.1), SeqArray(v.1.26.2) and gdsfmt(v.1.22.0)

loaded via a namespace (and not attached): colorspace(v.1.4-1), rjson(v.0.2.20), ellipsis(v.0.3.0), class(v.7.3-15), rprojroot(v.1.3-2), futile.logger(v.1.4.3), XVector(v.0.26.0), fs(v.1.3.1), rstudioapi(v.0.10), farver(v.2.0.3), bit64(v.0.9-7), fansi(v.0.4.1), lubridate(v.1.7.4), xml2(v.1.2.2), knitr(v.1.27), jsonlite(v.1.6), Rsamtools(v.2.2.1), broom(v.0.5.4), dbplyr(v.1.4.2), png(v.0.1-7), compiler(v.3.6.2), httr(v.1.4.1), backports(v.1.1.5), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), cli(v.2.0.1), formatR(v.1.7), htmltools(v.0.4.0), tools(v.3.6.2), gtable(v.0.3.0), glue(v.1.3.1), GenomeInfoDbData(v.1.2.2), Rcpp(v.1.0.3), cellranger(v.1.1.0), vctrs(v.0.2.2), Biostrings(v.2.54.0), nlme(v.3.1-143), xfun(v.0.12), rvest(v.0.3.5), lifecycle(v.0.1.0), XML(v.3.99-0.3), zlibbioc(v.1.32.0), MASS(v.7.3-51.5), hms(v.0.5.3), SummarizedExperiment(v.1.16.1), lambda.r(v.1.2.4), yaml(v.2.2.1), memoise(v.1.1.0), calibrate(v.1.7.5), stringi(v.1.4.5), RSQLite(v.2.2.0), highr(v.0.8), maptools(v.0.9-9), e1071(v.1.7-3), BiocParallel(v.1.20.1), RgoogleMaps(v.1.4.5.2), rlang(v.0.4.4), pkgconfig(v.2.0.3), bitops(v.1.0-6), matrixStats(v.0.55.0), evaluate(v.0.14), lattice(v.0.20-38), sf(v.0.8-1), labeling(v.0.3), GenomicAlignments(v.1.22.1), bit(v.1.1-15.1), tidyselect(v.1.0.0), here(v.0.1), plyr(v.1.8.5), R6(v.2.4.1), generics(v.0.0.2), DelayedArray(v.0.12.2), DBI(v.1.1.0), pillar(v.1.4.3), haven(v.2.2.0), foreign(v.0.8-75), withr(v.2.1.2), units(v.0.6-5), RCurl(v.1.98-1.1), modelr(v.0.1.5), crayon(v.1.3.4), futile.options(v.1.0.1), KernSmooth(v.2.23-16), rmarkdown(v.2.1), jpeg(v.0.1-8.1), blob(v.1.2.1), reprex(v.0.3.0), digest(v.0.6.23), classInt(v.0.4-2), VennDiagram(v.1.6.20) and munsell(v.0.5.0)