library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
library(readxl)
library(sp)
library(ggmap)
library(rgdal)
library(ggsn)
library(parallel)
library(qqman)
library(ggrepel)
library(plyranges)
library(rtracklayer)
library(AnnotationDbi)
library(GO.db)
if (interactive()) here::here("R") %>% setwd()
theme_set(theme_bw())
panderOptions("missing", "")
mc <- min(12, detectCores() - 1)
alpha <- 0.05
maxKb <- 40
tidyP <- function(p, m = 0.001){
x <- rep("", length(p))
x[p < m] <- sprintf("%.2e", p[p < m])
x[p >= m] <- sprintf("%.3f", p[p >= m])
x
}
keepSNPs <- readRDS("keepSNPsAfterLDPruning.RDS")
w <- 169/25.4
h <- 169/25.4
This workflow immediately follows 03_SNPFiltering
and performs an analysis on the 18886 SNPs retained after filtering. Analysis performed were:
gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
gdsFile <- seqOpen(gdsPath, readonly = TRUE)
sampleID <- tibble(
Sample = seqGetData(gdsFile, "sample.annotation/Sample"),
Population = seqGetData(gdsFile, "sample.annotation/Population"),
Location = seqGetData(gdsFile, "sample.annotation/Location")
)
popSizes <- sampleID %>%
group_by(Population) %>%
summarise(n = dplyr::n())
snpIn1996 <- genotypes %>%
filter(Population == 1996) %>%
group_by(variant.id) %>%
summarise(maf = mean(Genotype) / 2) %>%
filter(maf > 0)
genotypes %<>%
right_join(snpIn1996)
seqClose(gdsFile)
In addition to the above, the set of genes corresponding the Ensembl Release 96 were loaded.
ensGenes <- file.path(
"..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz"
) %>%
import.gff3(feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE) %>%
.[,c("gene_id", "Name", "description")]
snpsGR <- makeGRangesFromDataFrame(
df = snps,
keep.extra.columns = TRUE,
ignore.strand = TRUE,
seqinfo = seqinfo(ensGenes),
seqnames.field = "chromosome",
start.field = "position",
end.field = "position"
)
The set of SNPs under investigation was also defined as a GRanges object. This was intersected with the set of genes to connect SNPs to genes within 40 kb.
snp2Gene <- suppressWarnings(
snpsGR %>%
resize(width = 2*1000*maxKb - 1, fix = "center") %>%
trim() %>%
join_overlap_inner(ensGenes)
)
A set of SNPs within exonic regions was also defined in order to detect any SNPs within coding regions.
ensExons <- file.path(
"..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz"
) %>%
import.gff3(feature.type = "exon", sequenceRegionsAsSeqinfo = TRUE) %>%
join_overlap_inner_within(
ensGenes, maxgap = 0, minoverlap = 0, suffix = c(".exon", "")
) %>%
join_overlap_inner(snpsGR)
ensExons <- ensExons[,setdiff(colnames(mcols(ensExons)), "Name.exon")]
lowCall <- c("gc2901", "gc2776", "gc2731", "gc2727", "gc2686")
snp4PCA <- genotypes %>%
filter(!Sample %in% lowCall) %>%
group_by(variant.id, Population) %>%
summarise(n = dplyr::n()) %>%
spread(Population, n) %>%
mutate(N = `1996` + `2010` + `2012`) %>%
ungroup() %>%
filter(N > 0.95*(sum(popSizes$n) - length(lowCall)))
pca <- genotypes %>%
filter(variant.id %in% snp4PCA$variant.id) %>%
dplyr::select(variant.id, Sample, Genotype) %>%
spread(Sample, Genotype) %>%
as.data.frame() %>%
column_to_rownames("variant.id") %>%
as.matrix() %>%
apply(2, function(x){
x[is.na(x)] <- mean(x, na.rm = TRUE)
x
}) %>%
t() %>%
.[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
prcomp( center = TRUE)
As noted in the previous section sample samples gc2901, gc2776, gc2731, gc2727 and gc2686 had a SNP identification rate \(< 50\)% and as such these were marked as potential outliers. Ignoring these samples, and restricting data to SNPs identified in \(>95\)% of all samples, a preliminary PCA was performed This amounted to 7,767 of the possible 18,878 SNPs for analysis using PCA. Missing values were specified as the mean MAF over all populations combined.
Given the initially observed structure, in which samples from the 2012 population are separating from the other samples which group with the 1996 population, the collection points for the 2012 samples as investigated.
sampleID %<>%
left_join(
file.path("..", "external", "GPS_Locations.xlsx") %>%
read_excel() %>%
dplyr::select(Sample, ends_with("tude")) %>%
mutate(Sample = gsub("[Oo][Rr][Aa] ([0-9ABC]*)", "ora\\1", Sample))
)
fs <- 14
ps <- 3
popCols <- c(rgb(0, 0, 0, 0.7),
rgb(0.8, 0.2, 0.2, 0.9),
rgb(0.2, 0.8, 0.2, 0.9),
rgb(0.5, 0.5, 1, 0.7))
pca4Plot <- pca$x %>%
as.data.frame() %>%
rownames_to_column("Sample") %>%
as_tibble() %>%
dplyr::select(Sample, PC1, PC2, PC3) %>%
left_join(sampleID) %>%
mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
group_by(Cluster) %>%
mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
ungroup() %>%
mutate(
Population = case_when(
Population == 1996 ~ "1996",
Population == 2010 ~ "Outgroup (Turretfield)",
maxY == max(maxY) ~ "2012 (Outer)",
maxY != max(maxY) ~ "2012 (Central)"
)
) %>%
left_join(
genotypes %>%
filter(variant.id %in% snp4PCA$variant.id, !Sample %in% lowCall) %>%
group_by(Sample) %>%
tally() %>%
mutate(imputationRate = 1 - n / nrow(snp4PCA)
)
)
pca4Plot %>%
ggplot(aes(PC1, PC2, colour = Population, size = imputationRate)) +
geom_point(alpha = 0.6) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance[2,"PC1"]), ")"),
y = paste0("PC2 (", percent(summary(pca)$importance[2,"PC2"]), ")"),
size = "Imputation Rate"
) +
scale_colour_manual(values = popCols)
PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.
loc <- c(
range(sampleID$Longitude, na.rm = TRUE) %>% mean,
range(sampleID$Latitude, na.rm = TRUE) %>% mean
)
saPoly <- readRDS(file.path("..", "external", "saPoly.RDS"))
roads <- readRDS(file.path("..", "external", "roads.RDS"))
gc <- SpatialPoints(cbind(x = 138.655972, y = -31.200305))
proj4string(gc) <- "+proj=longlat +ellps=GRS80 +no_defs"
xBreaks <- seq(138.65, 138.8, by = 0.05)
xLabs <- parse(text = paste(xBreaks, "*degree ~ E", sep = ""))
yBreaks <- seq(-31.2, -31.32, by = -0.04)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
leftN <- tibble(
x = c(138.7965, 138.8, 138.8) - 0.01,
y = c(-31.2, -31.198, -31.193)
)
rightN <- tibble(
x = c(138.8, 138.8, 138.8035) - 0.01,
y = c(-31.193, -31.198, -31.2)
)
ggMap <- get_map(loc, zoom = 12, maptype = "terrain", color = "bw")
zoomPlot <- ggmap(ggMap, extent = "normal", maprange = FALSE) +
geom_path(
aes(long, lat, group = group),
data = subset(roads, SURFACE == "UNSE"),
linetype = 2, size = 0.3) +
geom_path(
aes(long, lat, group = group),
data = subset(roads, SURFACE != "UNSE"),
linetype = 1, size = 0.4) +
geom_label(x = 138.74, y = -31.29, label = "Flinders Ranges NP", alpha = 0.4) +
geom_label(x = 138.72, y = -31.22, label = "Gum Creek", alpha = 0.4) +
geom_point(aes(x, y), data = as.data.frame(gc), shape = 3, size = 3) +
geom_text(aes(x, y), label = "HS", data = as.data.frame(gc), nudge_y = 0.005) +
geom_polygon(
data = subset(saPoly, F_CODE == "HD"),
aes(long, lat, group = group),
fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.3) +
geom_polygon(
data = subset(saPoly, F_CODE == "PARK"),
aes(long, lat, group = group),
fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.2) +
geom_point(
data= filter(pca4Plot, grepl("2012", Population)),
aes(Longitude, Latitude, colour = Population),
size = 0.9*ps) +
geom_polygon(
aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4
) +
geom_polygon(
aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4
) +
geom_text(
x = 138.79, y = -31.19, label = "N", colour = "grey10", size = 4
) +
scale_colour_manual(values = popCols[2:3]) +
coord_cartesian(
xlim = c(138.618, 138.81),
ylim = c(-31.335, -31.18),
expand = 0
) +
scale_x_continuous(breaks = xBreaks, labels = xLabs) +
scale_y_continuous(breaks = yBreaks, labels = yLabs) +
guides(colour = FALSE) +
ggsn::scalebar(
x.min = 138.618,
x.max = 138.81,
y.min = -31.335,
y.max = -31.18,
transform = TRUE,
dist = 2,
dist_unit = "km",
model = 'GRS80',
height = 0.012,
st.size = 4,
location = 'bottomright',
anchor = c(x = 138.8, y = -31.328)) +
labs(x = "Longitude", y = "Latitude") +
theme(
text = element_text(size = fs),
plot.margin = unit(c(1, 1, 1, 1), "mm")
)
ausPolygon <- readRDS(file.path("..", "external", "ausPolygon.RDS"))
ausPts <- SpatialPoints(cbind(x = loc[1], y = loc[2]))
proj4string(ausPts) <- proj4string(ausPolygon)
ausPlot <- ggplot() +
geom_polygon(
data = ausPolygon,
aes(long, lat, group = group),
fill = "white", colour = "black"
) +
geom_point(data = as.data.frame(ausPts), aes(x, y), size = 1.5) +
theme_void() +
theme(plot.background = element_rect(fill = "white", colour = "black"))
convert \
-density 600 \
-size 1920 \
../figures/Figure1.pdf \
../figures/Figure1.png
knitr::include_graphics(file.path("..", "figures", "Figure1.png"))
Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and k-means clustering.
zoomLoc <- c(138.753, -31.242)
xBreaks <- seq(138.74, 138.76, by = 0.01)
xLabs <- parse(text = paste(xBreaks, "*degree ~ W", sep = ""))
yBreaks <- seq(-31.235, -31.25, by = -0.005)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
central <- rbind(
x = c(138.749, 138.755),
y = c(-31.2365, -31.2495)
) %>%
set_colnames(c("min", "max"))
leftN <- tibble(
x = c(138.7595, 138.76, 138.76),
y = c(-31.234, -31.2335, -31.2325)
)
rightN <- tibble(
x = c(138.76, 138.76, 138.7605),
y = c(-31.2325, -31.2335, -31.234)
)
get_map(zoomLoc, zoom = 15, maptype = "terrain", source = "google", color = "bw") %>%
ggmap() +
geom_jitter(
data = filter(pca4Plot, grepl("2012", Population)),
aes(Longitude, Latitude, colour = Population),
size = 3, width = 0.0005, height = 0) +
geom_rect(
xmin = central["x", "min"],
xmax = central["x", "max"],
ymin = central["y", "min"],
ymax = central["y", "max"],
fill = "red", alpha = 0.01, colour = "black") +
geom_polygon(
aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4
) +
geom_polygon(
aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4
) +
geom_text(
x = 138.76, y = -31.232, label = "N", colour = "grey10", size = 5
) +
scale_colour_manual(values = popCols[2:3]) +
scale_x_continuous(breaks = xBreaks, labels = xLabs) +
scale_y_continuous(breaks = yBreaks, labels = yLabs) +
theme_bw() +
guides(colour = FALSE) +
labs(x = "Longitude", y = "Latitude") +
coord_cartesian(
xlim = c(138.74, 138.762),
ylim = c(-31.253, -31.231),
expand = 0
) +
ggsn::scalebar(
x.min = 138.74, x.max = 138.762,
y.min = -31.253, y.max = -31.231,
transform = TRUE,
dist = 0.25, dist_unit = "km",, model = 'GRS80',
height = 0.012, st.size = 4,
location = 'bottomright',
anchor = c(x = 138.761, y = -31.252)
)
Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.
The structure observed within the 2012 population in the PCA could possibly be explained by recent migration into this region. As the samples collected in the outer regions appeared very similar to the 1996 population in the above plots, this would possibly indicate migration is a very recent event as the genetic influence of this has not spread through the wider area. Although this may be due to other factors such as sampling bias, this structure was addressed by identifying SNPs which showed an association with the sub-populations identified by PCA analysis. In this way, any candidate SNPs obtained below will be less impacted by this structure, and will be more reflective of the intended variable under study, i.e. selection over time, as opposed to any internal structure of the 2012 population which may indicate migration into the breeding population.
oraRegions <- pca4Plot %>%
filter(grepl("2012", Population)) %>%
rowwise() %>%
mutate(
yCentral = cut(Latitude, breaks = central["y",], include.lowest = TRUE),
xCentral = cut(Longitude, breaks = central["x",], include.lowest = TRUE),
Central = (is.na(yCentral) + is.na(xCentral)) == 0
) %>%
dplyr::select(Sample, Central)
This model tests:
H0: No association between genotypes and collection region
HA: An association exists between genotypes and collection region
regionResults <- genotypes %>%
filter(Population == 2012) %>%
split(f = .$variant.id) %>%
mclapply(
function(x){
ft <- list(p.value = NA)
if (length(unique(x$Genotype)) > 1) {
ft <- x %>%
left_join(oraRegions) %>%
group_by(Genotype, Central) %>%
tally() %>%
spread(Genotype, n, fill = 0) %>%
column_to_rownames("Central") %>%
fisher.test()
}
x %>%
distinct(variant.id, snpID, chromosome, position) %>%
mutate(p = ft$p.value)
},
mc.cores = mc
) %>%
bind_rows() %>%
filter(!is.na(p)) %>%
arrange(p)
A total of 1682 SNPs were detected as showing a significant association between genotype and the collection region. Under H0, the number expected using α = 0.05 would be 943, and as this number was approximately double that expected, this was taken as evidence of this being a genuine point of concern for this dataset.
Notably, Type II errors were of principle concern in this instance, and as such every SNP with p < 0.05 in the above test was excluded from downstream analysis.
regionSNPs <- filter(regionResults, p < 0.05)
saveRDS(regionSNPs, "regionSNPs.RDS")
Under this additional filtering step, the original set of 18784 SNPs will be reduced to 17,102 for testing by genotype and allele frequency.
In order to verify that the removal of the above SNPs removed the undesired population structure from the 2012 population, the above PCA was repeated, excluding the SNPs marked for removal. The previous structure noted in the data was no longer evident, and as such, these SNPs were marked for removal during analysis by genotype and allele frequency.
pcaPost <- genotypes %>%
filter(
variant.id %in% snp4PCA$variant.id,
!variant.id %in% regionSNPs$variant.id
) %>%
dplyr::select(variant.id, Sample, Genotype) %>%
spread(Sample, Genotype) %>%
as.data.frame() %>%
column_to_rownames("variant.id") %>%
as.matrix() %>%
apply(2, function(x){
x[is.na(x)] <- mean(x, na.rm = TRUE)
x
}) %>%
t() %>%
.[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
prcomp( center = TRUE)
pcaPost4Plot <- pcaPost$x %>%
as.data.frame() %>%
rownames_to_column("Sample") %>%
as_tibble() %>%
dplyr::select(Sample, PC1, PC2, PC3) %>%
left_join(sampleID) %>%
mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
group_by(Cluster) %>%
mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
ungroup() %>%
mutate(
Population = case_when(
Population == 1996 ~ "1996",
Population == 2010 ~ "Outgroup (Turretfield)",
maxY == max(maxY) ~ "2012 (Outer)",
maxY != max(maxY) ~ "2012 (Central)"
)
) %>%
left_join(
genotypes %>%
filter(variant.id %in% snp4PCA$variant.id, !Sample %in% lowCall) %>%
group_by(Sample) %>%
tally() %>%
mutate(imputationRate = 1 - n / nrow(snp4PCA))
)
convert \
-density 600 \
-size 1920 \
../figures/Figure2.pdf \
../figures/Figure2.png
knitr::include_graphics(file.path("..", "figures", "Figure2.png"))
Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs
This model tests:
H0: No association between genotypes and populations
HA: An association exists between genotypes and populations
genotypeResults <- genotypes %>%
filter(
Population != 2010,
!variant.id %in% regionSNPs$variant.id
) %>%
group_by(variant.id, snpID, Population, Genotype) %>%
tally() %>%
ungroup() %>%
split(f = .$variant.id) %>%
mclapply(function(x){
ft <- list(p.value = NA)
if (length(unique(x$Genotype)) > 1) {
ft <- x %>%
spread(Genotype, n, fill = 0) %>%
column_to_rownames("Population") %>%
dplyr::select(-variant.id, -snpID) %>%
fisher.test()
}
x %>%
distinct(variant.id) %>%
mutate(p = ft$p.value)
},
mc.cores = mc
) %>%
bind_rows() %>%
filter(!is.na(p)) %>%
mutate(FDR = p.adjust(p, "fdr"), adjP = p.adjust(p, "bonferroni")) %>%
arrange(p) %>%
left_join(
genotypes %>%
distinct(variant.id, snpID, chromosome, position)
) %>%
dplyr::select(variant.id, snpID, chromosome, position, everything())
genoSNPs <- genotypeResults %>%
dplyr::filter(FDR < alpha) %>%
.[["snpID"]]
snpsGR %>%
subset(snpID %in% genoSNPs) %>%
join_nearest(ensGenes) %>%
as.data.frame() %>%
left_join(
ensGenes %>% as.data.frame(),
by = c("Name", "gene_id", "seqnames", "description"),
suffix = c("_snp", "_gene")
) %>%
as_tibble() %>%
dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene) %>%
mutate(dist2Gene = case_when(
start_snp >= start_gene & start_snp <= end_gene ~ 0L,
start_snp < start_gene ~ start_gene - start_snp,
start_snp > end_gene ~ start_snp - end_gene
),
Name = case_when(
dist2Gene > maxKb*1000 ~ "",
dist2Gene < maxKb*1000 ~ Name
),
gene_id = case_when(
dist2Gene > maxKb*1000 ~ "",
dist2Gene < maxKb*1000 ~ gene_id
),
inExon = snpID %in% ensExons$snpID,
dist2Gene = case_when(
Name == "" ~ "",
inExon ~ "CDS",
dist2Gene == 0 & !inExon ~ "Intronic",
dist2Gene > 0 ~ comma(dist2Gene)
),
seqnames = as.character(seqnames)
) %>%
dplyr::select(
snpID,
chromosome = seqnames,
position = start_snp,
gene_id, Name, dist2Gene
) %>%
bind_rows(
subset(snpsGR, snpID %in% setdiff(genoSNPs, .$snpID)) %>%
as.data.frame() %>%
mutate(seqnames = as.character(seqnames)) %>%
dplyr::select(
snpID,
chromosome = seqnames,
position = start
)
) %>%
left_join(genotypeResults) %>%
dplyr::select(-variant.id, FDR) %>%
arrange(p) %>%
mutate(
position = comma(position),
p = tidyP(p),
FDR = tidyP(FDR),
adjP = tidyP(adjP)
) %>%
pander(
justify = "llrlllrrr",
style = "rmarkdown",
big.mark = ",",
split.tables = Inf,
caption = paste(
"*Candidate SNPs to an FDR of", percent_format(1)(alpha), "when analysing by genotype.",
"The nearest gene to each SNP (within", paste0(maxKb, "kb"), ") is indicated.",
"Bonferroni-adjusted p-values are also given in the final column.*")
)
snpID | chromosome | position | gene_id | Name | dist2Gene | p | FDR | adjP |
---|---|---|---|---|---|---|---|---|
3391201_92 | GL018705 | 1,937,359 | ENSOCUG00000016428 | RCBTB2 | 37,689 | 6.10e-08 | 9.04e-04 | 0.001 |
686773_29 | 3 | 90,851,512 | ENSOCUG00000009616 | CRISPLD1 | CDS | 1.05e-07 | 9.04e-04 | 0.002 |
916731_91 | 4 | 89,602,973 | ENSOCUG00000006126 | RIC8B | Intronic | 1.87e-07 | 9.84e-04 | 0.003 |
836950_151 | 4 | 35,111,855 | ENSOCUG00000012144 | TMPRSS12 | 5,745 | 2.59e-07 | 9.84e-04 | 0.004 |
321186_117 | 2 | 27,995,273 | 2.87e-07 | 9.84e-04 | 0.005 | |||
2227006_109 | 13 | 129,650,867 | ENSOCUG00000022823 | ID3 | 16,426 | 4.42e-07 | 0.001 | 0.008 |
3040789_114 | 19 | 30,689,426 | ENSOCUG00000010440 | MSI2 | Intronic | 4.84e-07 | 0.001 | 0.008 |
4165193_106 | GL019119 | 147,630 | 4.90e-07 | 0.001 | 0.008 | |||
4223149_99 | GL019332 | 5,745 | 6.12e-07 | 0.001 | 0.011 | |||
1089925_112 | 7 | 35,584,375 | 9.80e-07 | 0.002 | 0.017 | |||
464226_128 | 2 | 125,603,458 | 1.01e-06 | 0.002 | 0.017 | |||
3374849_109 | GL018704 | 1,294,009 | 1.39e-06 | 0.002 | 0.024 | |||
3695415_108 | GL018751 | 737,318 | ENSOCUG00000003672 | TRAF3 | Intronic | 1.84e-06 | 0.002 | 0.032 |
2099634_108 | 13 | 45,086,624 | 2.03e-06 | 0.002 | 0.035 | |||
1902686_98 | 12 | 60,113,885 | ENSOCUG00000001871 | 12,820 | 2.09e-06 | 0.002 | 0.036 | |
644062_1 | 3 | 56,309,981 | ENSOCUG00000004827 | SFXN1 | Intronic | 3.51e-06 | 0.004 | 0.060 |
4149102_107 | GL019077 | 34,265 | ENSOCUG00000024268 | IFT140 | Intronic | 4.82e-06 | 0.005 | 0.083 |
87917_119 | 1 | 64,635,286 | ENSOCUG00000007758 | CEP78 | Intronic | 5.49e-06 | 0.005 | 0.094 |
2689075_49 | 16 | 56,527,308 | ENSOCUG00000000984 | ESRRG | Intronic | 5.62e-06 | 0.005 | 0.096 |
4010189_98 | GL018907 | 283,766 | ENSOCUG00000008870 | BRWD1 | Intronic | 5.93e-06 | 0.005 | 0.102 |
4044777_119 | GL018933 | 204,058 | ENSOCUG00000005859 | FNBP1 | Intronic | 7.75e-06 | 0.006 | 0.133 |
4146664_90 | GL019084 | 74,038 | ENSOCUG00000010071 | GNB1 | Intronic | 1.84e-05 | 0.014 | 0.316 |
3016258_99 | 19 | 19,178,148 | ENSOCUG00000015254 | SEZ6 | Intronic | 2.17e-05 | 0.016 | 0.372 |
4176850_23 | GL019154 | 883 | ENSOCUG00000002484 | MTHFSD | 828 | 2.18e-05 | 0.016 | 0.374 |
4098854_101 | GL018985 | 129,495 | ENSOCUG00000006396 | Intronic | 3.09e-05 | 0.021 | 0.530 | |
2465104_19 | 14 | 157,742,593 | 3.67e-05 | 0.023 | 0.629 | |||
3032107_92 | 19 | 26,020,372 | ENSOCUG00000024664 | LHX1 | 20,607 | 3.71e-05 | 0.023 | 0.637 |
2906428_79 | 18 | 25,311,176 | ENSOCUG00000011533 | RHOBTB1 | 24 | 3.76e-05 | 0.023 | 0.646 |
3829050_109 | GL018791 | 957,960 | ENSOCUG00000007849 | FHAD1 | Intronic | 4.01e-05 | 0.023 | 0.687 |
1009106_1 | 6 | 9,037,073 | ENSOCUG00000027083 | CLEC19A | 15,180 | 4.15e-05 | 0.023 | 0.712 |
482617_100 | 2 | 139,723,776 | ENSOCUG00000001127 | RHOQ | 8,346 | 4.34e-05 | 0.023 | 0.744 |
3378006_1 | GL018704 | 2,271,135 | ENSOCUG00000006255 | AGO3 | 9,610 | 4.45e-05 | 0.023 | 0.764 |
2580149_42 | 15 | 87,473,844 | ENSOCUG00000001910 | ADGRL3 | Intronic | 4.52e-05 | 0.023 | 0.775 |
1685466_9 | 10 | 42,335,759 | 6.43e-05 | 0.032 | 1.000 | |||
4136275_162 | GL019049 | 178,272 | ENSOCUG00000026644 | Intronic | 9.11e-05 | 0.045 | 1.000 | |
2028473_73 | 13 | 2,979,383 | 9.62e-05 | 0.045 | 1.000 | |||
1839042_17 | 12 | 15,448,280 | 9.72e-05 | 0.045 | 1.000 | |||
4208471_113 | GL019274 | 63,350 | ENSOCUG00000015189 | 37,542 | 1.01e-04 | 0.045 | 1.000 |
Under the full genotype model:
convert \
-density 600 \
-size 1920 \
../figures/FigureS3a.pdf \
../figures/FigureS3a.png
knitr::include_graphics(file.path("..", "figures", "FigureS3a.png"))
Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with a Bonferroni-adjusted p-value < 0,05 additionally shown in green
genotypeResults %>%
filter(FDR < alpha) %>%
left_join(
filter(
genotypes,
snpID %in% .$snpID,
Population %in% c(1996, 2012)
)
) %>%
mutate(
Genotype = c("AA", "AB", "BB")[Genotype + 1],
Genotype = as.factor(Genotype),
Population = as.factor(Population),
snpID = case_when(
adjP < alpha ~ paste0(snpID, "*"),
adjP >= alpha ~ as.character(snpID)
),
snpID = factor(snpID, levels = unique(snpID))
) %>%
group_by(snpID, Population, Genotype) %>%
tally() %>%
ungroup() %>%
group_by(snpID, Population) %>%
mutate(freq = n / sum(n)) %>%
ggplot(aes(Genotype, freq, fill = Population)) +
geom_bar(
stat = "identity",
position = position_dodge2(preserve = "single")
) +
facet_wrap(~snpID, drop = FALSE, ncol = 7) +
scale_fill_manual(values = c("black", "blue")) +
labs(y = "Frequency") +
theme(
legend.direction = "horizontal",
legend.position = c(0.75, 0.05)
)
Variants detected as significant to an FDR of 5%. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance, with some variants becoming exclusively heterozygous in the 2012 population.
write_tsv(genotypeResults, "genotypeResults.tsv")
This model tests:
H0: No association between allele frequencies and populations
HA: An association exists between allele frequencies and populations
alleleResults <- genotypes %>%
filter(
Population != 2010,
!variant.id %in% regionSNPs$variant.id
) %>%
group_by(snpID, Population) %>%
summarise(
P = sum(2 - Genotype),
Q = sum(Genotype)
) %>%
ungroup() %>%
split(f = .$snpID) %>%
mclapply(function(x){
m <- as.matrix(x[c("P", "Q")])
ft <- list(p.value = NA)
if (length(m) == 4) {
ft <- fisher.test(m)
}
x %>%
mutate(MAF = Q / (P + Q)) %>%
dplyr::select(snpID, Population, MAF) %>%
spread(Population, MAF) %>%
mutate(p = ft$p.value)
},mc.cores = mc) %>%
bind_rows() %>%
filter(!is.na(p)) %>%
dplyr::rename(
MAF_1996 = `1996`,
MAF_2012 = `2012`
) %>%
mutate(
FDR = p.adjust(p, "fdr"),
adjP = p.adjust(p, "bonferroni"),
# Make the MAF is relative to the 1996 population not the reference
MAF_2012 = case_when(
MAF_1996 > 0.5 ~ 1 - MAF_2012,
MAF_1996 <= 0.5 ~ MAF_2012
),
MAF_1996 = case_when(
MAF_1996 > 0.5 ~ 1 - MAF_1996,
MAF_1996 <= 0.5 ~ MAF_1996
)
) %>%
arrange(p) %>%
left_join(
genotypes %>% distinct(snpID, chromosome, position)
) %>%
dplyr::select(snpID, chromosome, position, everything())
alleleSnps <- alleleResults %>%
dplyr::filter(FDR < alpha) %>%
.[["snpID"]]
snpsGR %>%
subset(snpID %in% alleleSnps) %>%
join_nearest(ensGenes) %>%
as.data.frame() %>%
left_join(
ensGenes %>% as.data.frame(),
by = c("Name", "gene_id", "seqnames", "description"),
suffix = c("_snp", "_gene")
) %>%
as_tibble() %>%
dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene) %>%
mutate(dist2Gene = case_when(
start_snp >= start_gene & start_snp <= end_gene ~ 0L,
start_snp < start_gene ~ start_gene - start_snp,
start_snp > end_gene ~ start_snp - end_gene
),
Name = case_when(
dist2Gene > maxKb*1000 ~ "",
dist2Gene < maxKb*1000 ~ Name
),
gene_id = case_when(
dist2Gene > maxKb*1000 ~ "",
dist2Gene < maxKb*1000 ~ gene_id
),
inExon = snpID %in% ensExons$snpID,
dist2Gene = case_when(
Name == "" ~ "",
inExon ~ "CDS",
dist2Gene == 0 & !inExon ~ "Intronic",
dist2Gene > 0 ~ comma(dist2Gene)
),
seqnames = as.character(seqnames)
) %>%
dplyr::select(
snpID,
chromosome = seqnames,
position = start_snp,
gene_id, Name, dist2Gene
) %>%
left_join(alleleResults) %>%
dplyr::select(
snpID, chromosome, position, starts_with("MAF"),
gene_id, Name, dist2Gene, p, FDR, adjP
) %>%
arrange(p) %>%
mutate(
position = comma(position),
MAF_1996 = sprintf("%.3f", MAF_1996),
MAF_2012 = sprintf("%.3f", MAF_2012),
p = tidyP(p),
FDR = tidyP(FDR),
adjP = tidyP(adjP)
) %>%
pander(
justify = "llrrrlllrrr",
style = "rmarkdown",
big.mark = ",",
split.tables = Inf,
caption = paste(
"*SNPs considered as significant when analysing by genotype using an FDR cutoff of",
percent_format(1)(alpha),
"Minor allele frequencies are reported based on frequencies in the 1996 population.",
"Any genes within", paste0(maxKb, "kb"), "are also shown.",
"Bonferroni-adjusted p-values are given in the final column.*"
)
)
snpID | chromosome | position | MAF_1996 | MAF_2012 | gene_id | Name | dist2Gene | p | FDR | adjP |
---|---|---|---|---|---|---|---|---|---|---|
3391201_92 | GL018705 | 1,937,359 | 0.225 | 0.010 | ENSOCUG00000016428 | RCBTB2 | 37,689 | 3.44e-07 | 0.006 | 0.006 |
916731_91 | 4 | 89,602,973 | 0.196 | 0.000 | ENSOCUG00000006126 | RIC8B | Intronic | 6.88e-07 | 0.006 | 0.012 |
3040789_114 | 19 | 30,689,426 | 0.186 | 0.000 | ENSOCUG00000010440 | MSI2 | Intronic | 1.53e-06 | 0.008 | 0.026 |
321186_117 | 2 | 27,995,273 | 0.250 | 0.012 | 1.87e-06 | 0.008 | 0.032 | |||
1009106_1 | 6 | 9,037,073 | 0.443 | 0.128 | ENSOCUG00000027083 | CLEC19A | 15,180 | 2.52e-06 | 0.008 | 0.043 |
836950_151 | 4 | 35,111,855 | 0.291 | 0.041 | ENSOCUG00000012144 | TMPRSS12 | 5,745 | 3.09e-06 | 0.008 | 0.053 |
464226_128 | 2 | 125,603,458 | 0.191 | 0.000 | 3.11e-06 | 0.008 | 0.053 | |||
2227006_109 | 13 | 129,650,867 | 0.271 | 0.033 | ENSOCUG00000022823 | ID3 | 16,426 | 4.17e-06 | 0.008 | 0.072 |
1902686_98 | 12 | 60,113,885 | 0.163 | 0.000 | ENSOCUG00000001871 | 12,820 | 4.75e-06 | 0.008 | 0.082 | |
3374849_109 | GL018704 | 1,294,009 | 0.207 | 0.010 | 4.87e-06 | 0.008 | 0.084 | |||
4044777_119 | GL018933 | 204,058 | 0.018 | 0.202 | ENSOCUG00000005859 | FNBP1 | Intronic | 7.75e-06 | 0.011 | 0.133 |
1089925_112 | 7 | 35,584,375 | 0.284 | 0.043 | 9.72e-06 | 0.013 | 0.167 | |||
3695415_108 | GL018751 | 737,318 | 0.261 | 0.033 | ENSOCUG00000003672 | TRAF3 | Intronic | 1.28e-05 | 0.014 | 0.219 |
2099634_108 | 13 | 45,086,624 | 0.261 | 0.034 | 1.41e-05 | 0.014 | 0.243 | |||
87917_119 | 1 | 64,635,286 | 0.186 | 0.000 | ENSOCUG00000007758 | CEP78 | Intronic | 1.43e-05 | 0.014 | 0.246 |
4010189_98 | GL018907 | 283,766 | 0.193 | 0.010 | ENSOCUG00000008870 | BRWD1 | Intronic | 1.63e-05 | 0.015 | 0.280 |
2689075_49 | 16 | 56,527,308 | 0.058 | 0.304 | ENSOCUG00000000984 | ESRRG | Intronic | 1.65e-05 | 0.015 | 0.284 |
4149102_107 | GL019077 | 34,265 | 0.223 | 0.029 | ENSOCUG00000024268 | IFT140 | Intronic | 2.28e-05 | 0.020 | 0.392 |
1997286_93 | 12 | 141,844,290 | 0.089 | 0.356 | ENSOCUG00000004829 | ESR1 | Intronic | 2.44e-05 | 0.020 | 0.419 |
2906428_79 | 18 | 25,311,176 | 0.173 | 0.443 | ENSOCUG00000011533 | RHOBTB1 | 24 | 2.54e-05 | 0.020 | 0.437 |
2405359_66 | 14 | 97,813,263 | 0.021 | 0.202 | ENSOCUG00000005835 | STXBP5L | Intronic | 4.48e-05 | 0.033 | 0.770 |
3000704_65 | 19 | 13,055,154 | 0.032 | 0.234 | 5.42e-05 | 0.038 | 0.931 | |||
4146664_90 | GL019084 | 74,038 | 0.196 | 0.020 | ENSOCUG00000010071 | GNB1 | Intronic | 5.49e-05 | 0.038 | 0.944 |
4098854_101 | GL018985 | 129,495 | 0.170 | 0.010 | ENSOCUG00000006396 | Intronic | 6.69e-05 | 0.043 | 1.000 | |
4098838_14 | GL018985 | 123,328 | 0.364 | 0.127 | ENSOCUG00000006396 | Intronic | 6.80e-05 | 0.043 | 1.000 | |
1882774_17 | 12 | 40,509,779 | 0.321 | 0.594 | 7.29e-05 | 0.043 | 1.000 | |||
1045185_75 | 7 | 2,702,919 | 0.214 | 0.038 | ENSOCUG00000027861 | 3,276 | 7.81e-05 | 0.043 | 1.000 | |
3016258_99 | 19 | 19,178,148 | 0.220 | 0.033 | ENSOCUG00000015254 | SEZ6 | Intronic | 7.86e-05 | 0.043 | 1.000 |
2028473_73 | 13 | 2,979,383 | 0.018 | 0.173 | 8.05e-05 | 0.043 | 1.000 | |||
3829050_109 | GL018791 | 957,960 | 0.163 | 0.010 | ENSOCUG00000007849 | FHAD1 | Intronic | 8.09e-05 | 0.043 | 1.000 |
2580149_42 | 15 | 87,473,844 | 0.351 | 0.115 | ENSOCUG00000001910 | ADGRL3 | Intronic | 8.42e-05 | 0.043 | 1.000 |
2527760_104 | 15 | 43,156,117 | 0.153 | 0.406 | 8.59e-05 | 0.043 | 1.000 | |||
3999128_75 | GL018883 | 283,912 | 0.365 | 0.125 | ENSOCUG00000016318 | SERPINA6 | Intronic | 8.68e-05 | 0.043 | 1.000 |
4176850_23 | GL019154 | 883 | 0.234 | 0.041 | ENSOCUG00000002484 | MTHFSD | 828 | 8.92e-05 | 0.043 | 1.000 |
686773_29 | 3 | 90,851,512 | 0.438 | 0.167 | ENSOCUG00000009616 | CRISPLD1 | CDS | 9.73e-05 | 0.045 | 1.000 |
3032107_92 | 19 | 26,020,372 | 0.194 | 0.021 | ENSOCUG00000024664 | LHX1 | 20,607 | 1.02e-04 | 0.046 | 1.000 |
Under this model:
convert \
-density 600 \
-size 1920 \
../figures/FigureS3b.pdf \
../figures/FigureS3b.png
knitr::include_graphics(file.path("..", "figures", "FigureS3b.png"))
Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.
alleleResults %>%
mutate(Sig = FDR < alpha) %>%
ggplot(aes(MAF_1996, MAF_2012, colour = Sig, label = snpID)) +
geom_point() +
geom_text_repel(data = . %>% filter(adjP < alpha)) +
# geom_text_repel(data = . %>%filter(FDR < alpha & MAF_2012 > MAF_1996)) +
scale_colour_manual(values = c(rgb(0.5, 0.5, 0.5, 0.6), "red")) +
labs(x = "MAF (1996)", y = "MAF (2012)") +
theme(legend.position = "none")
Comparison of minor allele frequencies in both populations. SNPs with an FDR-adjusted p-value < 0.05 are coloured red, whilst those with a Bonferroni-adjusted p-value are additionally labelled.
alleleResults %>%
filter(FDR < alpha) %>%
gather(Population, B, contains("MAF")) %>%
mutate(
Population = str_remove(Population, "MAF_"),
A = 1 - B,
Population = as.factor(Population),
snpID = case_when(
adjP < alpha ~ paste0(snpID, "*"),
adjP >= alpha ~ as.character(snpID)
),
snpID = factor(snpID, levels = unique(snpID))
) %>%
gather(Allele, Freq, A, B) %>%
ggplot(aes(Allele, y = Freq, fill = Population)) +
geom_bar(
position = position_dodge2(preserve = "single"),
stat = "identity"
) +
facet_wrap(~snpID, drop = FALSE, ncol = 7) +
scale_fill_manual(values = c("black", "blue")) +
labs(y = "Frequency") +
theme(
legend.direction = "horizontal",
legend.position = c(0.75, 0.05)
)
Variants detected as significant to an FDR of 5% under the Allele Frequency model. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance.
write_tsv(alleleResults, "alleleResults.tsv")
This was performed separately and no unique SNPs were detected in addition to any detected above.
A VCF was required with only the 1996 and 2012 populations, and restricted to the candidate SNPs after pruning for linkage disequilibrium and detection of the allele in the 1996 population.
var2VCF <- genotypes %>%
distinct(variant.id, snpID) %>%
filter(snpID %in% filter(regionResults, p > 0.05)$snpID) %>%
.[["variant.id"]]
gdsFile <- seqOpen(gdsPath, readonly = TRUE)
seqSetFilter(
gdsFile,
variant.id = var2VCF,
sample.id = sampleID %>%
filter(Population %in% c(1996, 2012)) %>%
.[["Sample"]]
)
seqGDS2VCF(gdsFile, "../5_stacks/vcf/filtered.vcf.gz")
seqResetFilter(gdsFile)
seqClose(gdsFile)
After running Bayescan, results were imported. Unfortunately, these results had IDs which were taken from the line in the above VCF and need to be converted back to IDs compatible with previous analysis.
bayRes <- file.path("..", "external", "BayescanOut.txt.gz") %>%
gzfile() %>%
read_delim(delim = " ", col_names = FALSE, skip = 1, col_types = "innnnn-") %>%
set_colnames(c("ID", "prob","log10.PO.","qval","alpha","fst")) %>%
mutate(variant.id = var2VCF[ID])
bayRes %>%
dplyr::select(variant.id, qval, fst) %>%
dplyr::filter(qval < alpha) %>%
left_join(
genotypes %>%
distinct(snpID, .keep_all = TRUE) %>%
dplyr::select(variant.id, chromosome, position, snpID)
) %>%
makeGRangesFromDataFrame(
keep.extra.columns = TRUE,
ignore.strand = TRUE,
seqinfo = seqinfo(snp2Gene),
seqnames.field = "chromosome",
start.field = "position",
end.field = "position"
) %>%
join_nearest(ensGenes) %>%
as.data.frame() %>%
left_join(
ensGenes %>% as.data.frame(),
by = c("Name", "gene_id", "seqnames", "description"),
suffix = c("_snp", "_gene")
) %>%
as_tibble() %>%
dplyr::select(seqnames, start_snp, variant.id, snpID, gene_id, Name, start_gene, end_gene, qval, fst) %>%
mutate(dist2Gene = case_when(
start_snp >= start_gene & start_snp <= end_gene ~ 0L,
start_snp < start_gene ~ start_gene - start_snp,
start_snp > end_gene ~ start_snp - end_gene
),
Name = case_when(
dist2Gene > maxKb*1000 ~ "",
dist2Gene < maxKb*1000 ~ Name
),
gene_id = case_when(
dist2Gene > maxKb*1000 ~ "",
dist2Gene < maxKb*1000 ~ gene_id
),
inExon = snpID %in% ensExons$snpID,
dist2Gene = case_when(
Name == "" ~ "",
inExon ~ "CDS",
dist2Gene == 0 & !inExon ~ "Intronic",
dist2Gene > 0 ~ comma(dist2Gene)
),
seqnames = as.character(seqnames)
) %>%
dplyr::select(
snpID,
chromosome = seqnames, position = start_snp,
gene_id, Name, dist2Gene, fst, qval) %>%
# Replace the one dropped by `join_nearest()`
bind_rows(
bayRes %>%
dplyr::select(variant.id, qval, fst) %>%
dplyr::filter(qval < alpha) %>%
left_join(
genotypes %>%
distinct(snpID, .keep_all = TRUE) %>%
dplyr::select(variant.id, chromosome, position, snpID)
) %>%
mutate(
gene_id = "",
Name = "",
dist2Gene = ""
) %>%
dplyr::select(-variant.id)
) %>%
distinct(snpID, .keep_all = TRUE) %>%
mutate(position = comma(position)) %>%
arrange(qval) %>%
mutate(
fst = tidyP(fst),
qval = tidyP(qval)
) %>%
pander(
justify = "llrllrrr",
style = "rmarkdown",
big.mark = ",",
split.tables = Inf,
caption = paste(
"*Results from Bayescan with a q-value <", paste0(alpha, "."),
"The nearest gene within", paste0(maxKb, "kb"), "is also given.",
"One SNP detected by Bayescan was located on a scaffold with no genes (GL019119).*"
)
)
snpID | chromosome | position | gene_id | Name | dist2Gene | fst | qval |
---|---|---|---|---|---|---|---|
3391201_92 | GL018705 | 1,937,359 | ENSOCUG00000016428 | RCBTB2 | 37,689 | 0.043 | 0.003 |
321186_117 | 2 | 27,995,273 | 0.042 | 0.008 | |||
1009106_1 | 6 | 9,037,073 | ENSOCUG00000027083 | CLEC19A | 15,180 | 0.036 | 0.013 |
836950_151 | 4 | 35,111,855 | ENSOCUG00000012144 | TMPRSS12 | 5,745 | 0.035 | 0.018 |
2227006_109 | 13 | 129,650,867 | ENSOCUG00000022823 | ID3 | 16,426 | 0.035 | 0.023 |
3374849_109 | GL018704 | 1,294,009 | 0.037 | 0.026 | |||
4044777_119 | GL018933 | 204,058 | ENSOCUG00000005859 | FNBP1 | Intronic | 0.033 | 0.031 |
4165193_106 | GL019119 | 147,630 | 0.034 | 0.034 | |||
3695415_108 | GL018751 | 737,318 | ENSOCUG00000003672 | TRAF3 | Intronic | 0.033 | 0.038 |
4010189_98 | GL018907 | 283,766 | ENSOCUG00000008870 | BRWD1 | Intronic | 0.033 | 0.044 |
1089925_112 | 7 | 35,584,375 | 0.032 | 0.049 |
grid.newpage()
list(
Genotype = dplyr::filter(genotypeResults, FDR < alpha)$variant.id,
Allele = alleleResults %>%
left_join(snps) %>%
dplyr::filter(FDR < alpha) %>%
.[["variant.id"]],
Bayescan = dplyr::filter(bayRes, qval < 0.05)$variant.id
) %>%
VennDiagram::venn.diagram(
filename = NULL,
cex = 2,
cat.cex = 2
) %>%
grid.draw()
Summary of results from all three methods, using an FDR of 0.05 for each approach. All SNPs detected under Bayescan were identified using Fisher’s Exact Test for both allele and genotype analysis
sigSnps <- c(
filter(alleleResults, FDR < alpha)$snpID,
filter(genotypeResults, FDR < alpha)$snpID
) %>%
unique()
suppressWarnings(
snpsGR %>%
subset(snpID %in% sigSnps) %>%
resize(width = 1000*maxKb*2, fix = "center") %>%
trim() %>%
find_overlaps(ensGenes) %>%
as.data.frame() %>%
as_tibble() %>%
left_join(snps, by = c("variant.id", "snpID", "Col")) %>%
dplyr::select(snpID, gene_id, Name, chromosome, position) %>%
left_join(
as.data.frame(ensGenes), by = c("gene_id", "Name")
) %>%
mutate(
distance = case_when(
position < start ~ comma(start - position, accuracy = 1),
position > end ~ comma(position - end, accuracy = 1),
snpID %in% ensExons$snpID ~ "Exonic",
position >= start & position <= end ~ "Intronic"
),
model = ifelse(
snpID %in% filter(alleleResults, FDR < alpha)$snpID, "A", ""
),
model = ifelse(
snpID %in% dplyr::filter(snps, variant.id %in% dplyr::filter(bayRes, qval < 0.05)$variant.id)$snpID,
paste0(model, "B"), model
),
model = ifelse(
snpID %in% filter(genotypeResults, FDR < alpha)$snpID,
paste0(model, "G"), model
),
width = comma(width, accuracy = 1)
) %>%
unite(location, chromosome, position, sep = ":") %>%
dplyr::select(
`SNP Location` = location,
`SNP ID` = snpID,
`Gene ID` = gene_id,
`Gene Name` = Name,
`Gene Width` = width,
`Distance` = distance,
Models = model
)
) %>%
pander(
justify = "llllrll",
style = "rmarkdown",
split.tables = Inf,
# big.mark = ",",
caption = paste0(
"*All putatively significant SNPs with genes located within ", maxKb, "kb. ",
"The distance from SNP to gene is indicated, unless the SNP is intronic/exonic. ",
"The model(s) which identified the SNP are indicated in the final column ",
"as (A) the Allele Model; (B) Bayescan and (G) the Genotype Model.*"
)
)
SNP Location | SNP ID | Gene ID | Gene Name | Gene Width | Distance | Models |
---|---|---|---|---|---|---|
1:64635286 | 87917_119 | ENSOCUG00000007758 | CEP78 | 51,885 | Intronic | AG |
2:139723776 | 482617_100 | ENSOCUG00000001127 | RHOQ | 31,670 | 8,346 | G |
2:139723776 | 482617_100 | ENSOCUG00000021835 | ATP6V1E2 | 681 | 10,081 | G |
2:139723776 | 482617_100 | ENSOCUG00000028184 | TMEM247 | 4,917 | 37,653 | G |
3:56309981 | 644062_1 | ENSOCUG00000004827 | SFXN1 | 26,726 | Intronic | G |
3:90851512 | 686773_29 | ENSOCUG00000009616 | CRISPLD1 | 51,813 | Exonic | AG |
4:35111855 | 836950_151 | ENSOCUG00000012144 | TMPRSS12 | 40,859 | 5,745 | ABG |
4:89602973 | 916731_91 | ENSOCUG00000006126 | RIC8B | 123,296 | Intronic | AG |
6:9037073 | 1009106_1 | ENSOCUG00000027083 | CLEC19A | 20,430 | 15,180 | ABG |
7:2702919 | 1045185_75 | ENSOCUG00000007975 | ZNF777 | 32,875 | 12,875 | A |
7:2702919 | 1045185_75 | ENSOCUG00000027861 | 1,123 | 3,276 | A | |
12:60113885 | 1902686_98 | ENSOCUG00000016906 | 103,853 | 13,295 | AG | |
12:60113885 | 1902686_98 | ENSOCUG00000001871 | 1,594 | 12,820 | AG | |
12:60113885 | 1902686_98 | ENSOCUG00000021694 | 1,144 | 18,149 | AG | |
12:60113885 | 1902686_98 | ENSOCUG00000023798 | 2,496 | 35,410 | AG | |
12:141844290 | 1997286_93 | ENSOCUG00000004829 | ESR1 | 474,528 | Intronic | A |
13:129650867 | 2227006_109 | ENSOCUG00000022823 | ID3 | 528 | 16,426 | ABG |
14:97813263 | 2405359_66 | ENSOCUG00000005835 | STXBP5L | 418,595 | Intronic | A |
15:87473844 | 2580149_42 | ENSOCUG00000001910 | ADGRL3 | 571,686 | Intronic | AG |
16:56527308 | 2689075_49 | ENSOCUG00000000984 | ESRRG | 238,366 | Intronic | AG |
18:25311176 | 2906428_79 | ENSOCUG00000011533 | RHOBTB1 | 116,987 | 24 | AG |
19:19178148 | 3016258_99 | ENSOCUG00000015254 | SEZ6 | 46,097 | Intronic | AG |
19:26020372 | 3032107_92 | ENSOCUG00000014434 | AATF | 127,766 | 30,904 | AG |
19:26020372 | 3032107_92 | ENSOCUG00000024664 | LHX1 | 6,982 | 20,607 | AG |
19:30689426 | 3040789_114 | ENSOCUG00000010440 | MSI2 | 396,928 | Intronic | AG |
GL018704:2271135 | 3378006_1 | ENSOCUG00000011042 | COL8A2 | 3,187 | 34,195 | G |
GL018704:2271135 | 3378006_1 | ENSOCUG00000011038 | ADPRHL2 | 5,946 | 24,425 | G |
GL018704:2271135 | 3378006_1 | ENSOCUG00000029722 | TEKT2 | 3,671 | 20,166 | G |
GL018704:2271135 | 3378006_1 | ENSOCUG00000027798 | 1,002 | 16,486 | G | |
GL018704:2271135 | 3378006_1 | ENSOCUG00000006255 | AGO3 | 128,900 | 9,610 | G |
GL018705:1937359 | 3391201_92 | ENSOCUG00000016428 | RCBTB2 | 37,081 | 37,689 | ABG |
GL018751:737318 | 3695415_108 | ENSOCUG00000010323 | CDC42BPB | 107,021 | 5,698 | ABG |
GL018751:737318 | 3695415_108 | ENSOCUG00000003672 | TRAF3 | 48,133 | Intronic | ABG |
GL018791:957960 | 3829050_109 | ENSOCUG00000007849 | FHAD1 | 123,752 | Intronic | AG |
GL018883:283912 | 3999128_75 | ENSOCUG00000016273 | PPP4R4 | 96,327 | 21,034 | A |
GL018883:283912 | 3999128_75 | ENSOCUG00000016318 | SERPINA6 | 21,391 | Intronic | A |
GL018907:283766 | 4010189_98 | ENSOCUG00000008870 | BRWD1 | 121,750 | Intronic | ABG |
GL018933:204058 | 4044777_119 | ENSOCUG00000005859 | FNBP1 | 73,115 | Intronic | ABG |
GL018985:123328 | 4098838_14 | ENSOCUG00000006403 | CLDN5 | 657 | 14,238 | A |
GL018985:123328 | 4098838_14 | ENSOCUG00000006396 | 38,624 | Intronic | A | |
GL018985:123328 | 4098838_14 | ENSOCUG00000025709 | UFD1 | 18,611 | 28,184 | A |
GL018985:129495 | 4098854_101 | ENSOCUG00000006403 | CLDN5 | 657 | 20,405 | AG |
GL018985:129495 | 4098854_101 | ENSOCUG00000006396 | 38,624 | Intronic | AG | |
GL018985:129495 | 4098854_101 | ENSOCUG00000025709 | UFD1 | 18,611 | 22,017 | AG |
GL019049:178272 | 4136275_162 | ENSOCUG00000009045 | ASS1 | 36,901 | 36,520 | G |
GL019049:178272 | 4136275_162 | ENSOCUG00000017433 | 8,159 | 11,078 | G | |
GL019049:178272 | 4136275_162 | ENSOCUG00000026644 | 29,389 | Intronic | G | |
GL019084:74038 | 4146664_90 | ENSOCUG00000010071 | GNB1 | 40,037 | Intronic | AG |
GL019084:74038 | 4146664_90 | ENSOCUG00000021568 | NADK | 14,270 | 12,587 | AG |
GL019084:74038 | 4146664_90 | ENSOCUG00000007721 | 37,538 | 30,580 | AG | |
GL019077:34265 | 4149102_107 | ENSOCUG00000027299 | PTX4 | 3,616 | 19,359 | AG |
GL019077:34265 | 4149102_107 | ENSOCUG00000000521 | TELO2 | 11,447 | 6,323 | AG |
GL019077:34265 | 4149102_107 | ENSOCUG00000024268 | IFT140 | 81,557 | Intronic | AG |
GL019077:34265 | 4149102_107 | ENSOCUG00000001136 | TMEM204 | 19,558 | 13,406 | AG |
GL019154:883 | 4176850_23 | ENSOCUG00000002484 | MTHFSD | 13,513 | 828 | AG |
GL019154:883 | 4176850_23 | ENSOCUG00000001701 | FOXF1 | 2,195 | 19,026 | AG |
GL019274:63350 | 4208471_113 | ENSOCUG00000015189 | 321 | 37,542 | G |
fullPaths <- list(
toTable(GOBPANCESTOR) %>% set_names(c("go_id", "ancestor")),
toTable(GOCCANCESTOR) %>% set_names(c("go_id", "ancestor")),
toTable(GOMFANCESTOR) %>% set_names(c("go_id", "ancestor"))
) %>%
bind_rows() %>%
dplyr::filter(!ancestor %in% c("all", "GO:0008150", "GO:0003674", "GO:0005575")) %>%
as_tibble() %>%
bind_rows(
tibble(
go_id = unique(.$go_id),
ancestor = unique(.$go_id)
)
) %>%
arrange(go_id, ancestor)
goSummaries <- url("https://uofabioinformaticshub.github.io/summaries2GO/data/goSummaries.RDS") %>%
readRDS() %>%
dplyr::rename(go_id = id)
For the initial GO analysis, any gene within 40kb of a SNP considered as significant were tested for any enrichment of biological characteristics.
allSnpsGR <- snps %>%
dplyr::select(chromosome, position, snpID) %>%
makeGRangesFromDataFrame(
keep.extra.columns = TRUE,
seqnames.field = "chromosome",
start.field = "position",
end.field = "position",
seqinfo = seqinfo(ensGenes)) %>%
join_overlap_inner(
ensGenes,
maxgap = maxKb*1000
)
sigSnpGR <- allSnpsGR %>%
subset(snpID %in% sigSnps)
nGenes <- allSnpsGR$gene_id %>%
unique() %>%
length()
nSigGenes <- sigSnpGR$gene_id %>%
unique() %>%
length()
10821 genes were identified as being within 40kb of a SNP. For the 32 candidate SNPs within this distance of a gene, 54 genes were identified, and these were analysed for enrichment of any GO terms.
Mappings from genes to GO terms were downloaded manually from the biomart server at www.ensembl.org.
bm <- file.path("..", "external", "gene2GO_biomart.tsv.gz") %>%
gzfile() %>%
read_tsv() %>%
dplyr::filter(!is.na(`GO domain`)) %>%
dplyr::rename(gene_id = `Gene stable ID`,
ontology = `GO domain`,
go_id = `GO term accession`,
go_term = `GO term name`) %>%
dplyr::filter(gene_id %in% allSnpsGR$gene_id) %>%
left_join(fullPaths) %>%
dplyr::select(gene_id, go_id = ancestor) %>%
distinct(gene_id, go_id) %>%
dplyr::filter(!is.na(go_id))
Complete paths of GO terms back to ontology roots were obtained for each gene. The minimum number of steps back to the ontology roots were also obtained for each GO ID.
goRes <- bm %>%
mutate(inSig = gene_id %in% sigSnpGR$gene_id) %>%
group_by(go_id) %>%
summarise(N = dplyr::n(),
nSig = sum(inSig)) %>%
left_join(goSummaries) %>%
filter(nSig > 0, shortest_path > 2) %>%
distinct(go_id, .keep_all = TRUE) %>%
split(f = .$go_id) %>%
mclapply(function(x){
ft <- matrix(
c(nSigGenes - x$nSig, x$nSig, nGenes - nSigGenes - x$N, x$N),
nrow = 2, byrow = TRUE) %>%
fisher.test()
mutate(x, p = ft$p.value)
}, mc.cores = mc) %>%
bind_rows() %>%
mutate(Expected = nSigGenes * N / nGenes,
FDR = p.adjust(p, "fdr"),
adjP = p.adjust(p, "bonferroni"),
term = Term(go_id),
ontology = Ontology(go_id)) %>%
dplyr::select(go_id, term, ontology, N, nSig, Expected, p, adjP, FDR) %>%
arrange(p)
goRes %>%
dplyr::filter(FDR == min(FDR), nSig > 1) %>%
mutate(
Term = Term(go_id),
Genes = vapply(go_id, function(x){
subset(sigSnpGR, gene_id %in% I(
dplyr::filter(bm, go_id == x) %>%
.[["gene_id"]]))$Name %>%
sort() %>%
unique() %>%
paste(collapse = ", ")
}, character(1))
) %>%
dplyr::select(ID = go_id, Ontology = ontology, Term, Total = N, Sig = nSig, Expected, p, FDR, Genes) %>%
mutate(
ID = gsub(":", "\\\\:", ID),
Expected = tidyP(Expected),
p = tidyP(p),
FDR = round(FDR, 3)
) %>%
distinct(Ontology, Genes, .keep_all = TRUE) %>%
pander(
split.tables = Inf, style = "rmarkdown",
justify = "lllrrrrrl",
caption = paste(
"*Most highly ranked GO terms.",
"Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown.",
"Overall, this corresponds to an FDR of",
percent(max(.$FDR)), ".*"
)
)
ID | Ontology | Term | Total | Sig | Expected | p | FDR | Genes |
---|---|---|---|---|---|---|---|---|
GO:0005496 | MF | steroid binding | 34 | 3 | 0.170 | 8.10e-04 | 0.216 | ESR1, ESRRG, SERPINA6 |
GO:0060068 | BP | vagina development | 7 | 2 | 0.035 | 8.60e-04 | 0.216 | ESR1, LHX1 |
GO:0001655 | BP | urogenital system development | 146 | 5 | 0.729 | 9.03e-04 | 0.216 | ESR1, FOXF1, ID3, IFT140, LHX1 |
GO:0072189 | BP | ureter development | 10 | 2 | 0.050 | 0.002 | 0.216 | FOXF1, LHX1 |
GO:0002221 | BP | pattern recognition receptor signaling pathway | 51 | 3 | 0.255 | 0.002 | 0.216 | ESR1, TRAF3, UFD1 |
GO:0060041 | BP | retina development in camera-type eye | 56 | 3 | 0.279 | 0.003 | 0.216 | GNB1, IFT140, LHX1 |
GO:0043010 | BP | camera-type eye development | 122 | 4 | 0.609 | 0.004 | 0.216 | COL8A2, GNB1, IFT140, LHX1 |
GO:0021680 | BP | cerebellar Purkinje cell layer development | 16 | 2 | 0.080 | 0.004 | 0.216 | LHX1, SEZ6 |
GO:0072001 | BP | renal system development | 132 | 4 | 0.659 | 0.005 | 0.216 | FOXF1, ID3, IFT140, LHX1 |
GO:0003727 | MF | single-stranded RNA binding | 19 | 2 | 0.095 | 0.005 | 0.216 | AGO3, MSI2 |
GO:0031050 | BP | dsRNA processing | 19 | 2 | 0.095 | 0.005 | 0.216 | AGO3, ESR1 |
GO:0004879 | MF | nuclear receptor activity | 20 | 2 | 0.100 | 0.005 | 0.216 | ESR1, ESRRG |
GO:0050688 | BP | regulation of defense response to virus | 20 | 2 | 0.100 | 0.005 | 0.216 | TRAF3, UFD1 |
GO:0001750 | CC | photoreceptor outer segment | 23 | 2 | 0.115 | 0.007 | 0.216 | GNB1, IFT140 |
GO:0048701 | BP | embryonic cranial skeleton morphogenesis | 26 | 2 | 0.130 | 0.009 | 0.216 | IFT140, LHX1 |
GO:0030522 | BP | intracellular receptor signaling pathway | 85 | 3 | 0.424 | 0.010 | 0.216 | ESR1, ESRRG, UFD1 |
The results in the Turretfield population from the paper Resistance to RHD virus in wild Australia rabbits: Comparison of susceptible and resistant individuals using a genomewide approach were then compared to these results. In this previous analysis, results were generated by using Fisher’s Exact Test to test genotypes for association with RHDV susceptibility or resistance.
tfGR <- url("https://raw.githubusercontent.com/hdetering/orycun/master/results/filteredSNPScores.csv") %>%
read_csv() %>%
distinct(min_snp, .keep_all = TRUE) %>%
dplyr::select(chromosome = chr_name, pos, snpID = min_snp, p, FDR) %>%
mutate(chromosome = gsub("^chr", "", chromosome)) %>%
makeGRangesFromDataFrame(
keep.extra.columns = TRUE,
ignore.strand = TRUE,
seqinfo = seqinfo(ensGenes),
seqnames.field = "chromosome",
start.field = "pos",
end.field = "pos") %>%
sort()
Of the 9,229 unique SNPs contained in the previous dataset, only 1,431 were also identified in the current dataset. A meta-analysis was performed on these SNPs using the results from both separate analyses which used Fisher’s Exact Test on genotypes. In this approach, Fisher’s method was used to combine p-values, and resultant p-values were drawn from a \(\chi^2\) distribution with 4 degrees of freedom (i.e. \(2k\)).
combinedResults <- tfGR %>%
find_overlaps(allSnpsGR) %>%
mcols() %>%
as.data.frame() %>%
as_tibble() %>%
dplyr::select(snpID = snpID.y, tf.p = p) %>%
distinct(snpID, .keep_all = TRUE) %>%
left_join(
genotypeResults %>%
dplyr::select(snpID, chromosome, position, geno.p = p)) %>%
mutate(
X = -2*(log(tf.p) + log(geno.p)),
p = pchisq(X, 4, lower.tail = FALSE),
adjP = p.adjust(p, "bonferroni"),
FDR = p.adjust(p, "fdr")) %>%
arrange(p)
Using Bonferroni’s adjustment to control the FWER at 0.05 gave 2 candidate SNPs, whilst using the more relaxed criterion of an FDR < 0.05 gave 10 candidate SNPs.
combinedResults %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(
snpID,
Chromosome = chromosome,
Position = position,
p, adjP, FDR
) %>%
mutate(
p = tidyP(p),
adjP = tidyP(adjP),
FDR = tidyP(FDR)
) %>%
pander(
justify = "lrrrrr",
caption = "*Significant SNPs from the meta-analysis using the previous published results from a separate population, using an FDR < 0.05 as the sole criterion for significance.*"
)
snpID | Chromosome | Position | p | adjP | FDR |
---|---|---|---|---|---|
644062_1 | 3 | 56309981 | 1.67e-06 | 0.002 | 0.002 |
3378006_1 | GL018704 | 2271135 | 2.35e-05 | 0.031 | 0.015 |
3941015_20 | GL018847 | 216450 | 5.32e-05 | 0.069 | 0.023 |
213788_4 | 1 | 146274492 | 8.65e-05 | 0.113 | 0.025 |
3118382_68 | 20 | 14614986 | 9.50e-05 | 0.124 | 0.025 |
3108287_38 | 20 | 10514947 | 1.41e-04 | 0.183 | 0.031 |
3156991_0 | 21 | 6185579 | 1.79e-04 | 0.234 | 0.033 |
2425370_49 | 14 | 114833115 | 2.08e-04 | 0.272 | 0.034 |
3904662_77 | GL018828 | 225354 | 3.00e-04 | 0.392 | 0.043 |
3970912_82 | GL018864 | 1306 | 3.32e-04 | 0.433 | 0.043 |
allSnpsGR %>%
subset(snpID %in% dplyr::filter(combinedResults, FDR < 0.05)$snpID) %>%
subset(!is.na(Name)) %>%
as.data.frame() %>%
dplyr::select(
snpID,
Chromosome = seqnames,
BP = start,
ID = gene_id,
Name,
Description = description
) %>%
mutate(Description = str_remove(Description, " \\[Source.+")) %>%
pander(
style = "rmarkdown", split.table = Inf, justify = "lrrlll",
caption = paste0(
"*Genes within ", maxKb, "kb of SNPs identified in the meta-analysis.*"
)
)
snpID | Chromosome | BP | ID | Name | Description |
---|---|---|---|---|---|
213788_4 | 1 | 146274492 | ENSOCUG00000000568 | HBB2 | hemoglobin, beta |
213788_4 | 1 | 146274492 | ENSOCUG00000021180 | HBG2 | hemoglobin, gamma G |
213788_4 | 1 | 146274492 | ENSOCUG00000027533 | HBG1 | hemoglobin, gamma A |
213788_4 | 1 | 146274492 | ENSOCUG00000026912 | OR51I1 | olfactory receptor family 51 subfamily I member 1 |
213788_4 | 1 | 146274492 | ENSOCUG00000025404 | OLFR64_1 | MOR 5’beta3 |
644062_1 | 3 | 56309981 | ENSOCUG00000004827 | SFXN1 | sideroflexin 1 |
2425370_49 | 14 | 114833115 | ENSOCUG00000011289 | ALCAM | activated leukocyte cell adhesion molecule |
3108287_38 | 20 | 10514947 | ENSOCUG00000008859 | KCNH5 | potassium voltage-gated channel subfamily H member 5 |
3118382_68 | 20 | 14614986 | ENSOCUG00000008913 | RPS6KA5 | ribosomal protein S6 kinase A5 |
3156991_0 | 21 | 6185579 | ENSOCUG00000005961 | KDM2B | lysine demethylase 2B |
3156991_0 | 21 | 6185579 | ENSOCUG00000005957 | RNF34 | ring finger protein 34 |
3378006_1 | GL018704 | 2271135 | ENSOCUG00000011042 | COL8A2 | collagen type VIII alpha 2 chain |
3378006_1 | GL018704 | 2271135 | ENSOCUG00000011038 | ADPRHL2 | ADP-ribosylhydrolase like 2 |
3378006_1 | GL018704 | 2271135 | ENSOCUG00000029722 | TEKT2 | tektin 2 |
3378006_1 | GL018704 | 2271135 | ENSOCUG00000006255 | AGO3 | argonaute RISC catalytic component 3 |
3904662_77 | GL018828 | 225354 | ENSOCUG00000027620 | PAQR4 | progestin and adipoQ receptor family member 4 |
3904662_77 | GL018828 | 225354 | ENSOCUG00000017181 | PKMYT1 | protein kinase, membrane associated tyrosine/threonine 1 |
3904662_77 | GL018828 | 225354 | ENSOCUG00000027985 | TNFRSF12A | TNF receptor superfamily member 12A |
3904662_77 | GL018828 | 225354 | ENSOCUG00000017197 | CLDN6 | claudin 6 |
3941015_20 | GL018847 | 216450 | ENSOCUG00000012298 | ODF2 | outer dense fiber of sperm tails 2 |
3941015_20 | GL018847 | 216450 | ENSOCUG00000008855 | CERCAM | cerebral endothelial cell adhesion molecule |
3970912_82 | GL018864 | 1306 | ENSOCUG00000021601 | SCARB1 | scavenger receptor class B member 1 |
combinedResults %>%
mutate(Sig = FDR < 0.05) %>%
ggplot(aes(-log10(geno.p), -log10(tf.p), colour = Sig)) +
geom_point() +
geom_abline(slope =1, intercept = 0, linetype = 2, colour = "blue") +
geom_text_repel(aes(label = snpID),
data = . %>% filter(Sig)) +
scale_color_manual(values = c("black", "red")) +
labs(x = expression(paste(-log[10], "p (Genotype Analysis)")),
y = expression(paste(-log[10], "p (Previous Analysis)"))) +
theme(legend.position = "none")
Comparison of p-values for SNPs detected in both analyses, with those considered as significant after meta-analysis shown in red.
combinedResults %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(snpID) %>%
left_join(genotypes) %>%
filter(Population != 2010,
!variant.id %in% regionSNPs$variant.id) %>%
mutate(Genotype = as.factor(Genotype),
Population = as.factor(Population)) %>%
ggplot(aes(Genotype, fill = Population)) +
geom_bar(position = position_dodge2(preserve = "single")) +
facet_wrap(~snpID, drop = FALSE) +
scale_fill_manual(values = c("black", "blue")) +
labs(y = "Count") +
theme(
legend.direction = "horizontal",
legend.position = c(0.75, 0.05)
)
Allele distributions in the 1996 and 2012 populations for SNPs identified under the meta-analysis.
This list was also used to test for GO enrichment.
sigSnpGR <- allSnpsGR %>%
subset(snpID %in% filter(combinedResults, FDR < 0.05)$snpID)
nGenes <- allSnpsGR %>%
subset(snpID %in% combinedResults$snpID) %>%
unique() %>%
length()
nSigGenes <- sigSnpGR$gene_id %>%
unique() %>%
length()
combinedGoRes <- bm %>%
dplyr::filter(
gene_id %in% subset(allSnpsGR, snpID %in% combinedResults$snpID)$gene_id) %>%
mutate(inSig = gene_id %in% sigSnpGR$gene_id) %>%
group_by(go_id) %>%
summarise(N = dplyr::n(),
nSig = sum(inSig)) %>%
left_join(goSummaries) %>%
filter(nSig > 0, shortest_path > 2) %>%
distinct(go_id, .keep_all = TRUE) %>%
split(f = .$go_id) %>%
mclapply(function(x){
ft <- matrix(
c(nSigGenes - x$nSig, x$nSig, nGenes - nSigGenes - x$N, x$N),
nrow = 2, byrow = TRUE) %>%
fisher.test()
mutate(x, p = ft$p.value)
}, mc.cores = mc) %>%
bind_rows() %>%
mutate(Expected = nSigGenes * N / nGenes,
FDR = p.adjust(p, "fdr"),
adjP = p.adjust(p, "bonferroni"),
term = Term(go_id),
ontology = Ontology(go_id)) %>%
dplyr::select(go_id, term, ontology, shortest_path, N, nSig, Expected, p, adjP, FDR) %>%
arrange(p)
Inspection of the results revealed an enrichment for terms related to haemoglobin, however, as these genes are co-located this was revealed to be due to a single SNP with multiple haemoglobin genes within 40kb and can be disregarded.
combinedGoRes %>%
dplyr::filter(
nSig > Expected,
FDR < 0.05
) %>%
mutate(Term = Term(go_id),
Genes = vapply(go_id, function(x){
subset(sigSnpGR, gene_id %in% I(
dplyr::filter(bm, go_id == x) %>%
.[["gene_id"]]))$Name %>%
sort() %>%
unique() %>%
paste(collapse = ", ")
}, character(1)),
nSnps = vapply(go_id, function(x){
subset(sigSnpGR, gene_id %in% I(
dplyr::filter(bm, go_id == x) %>%
.[["gene_id"]]))$snpID %>%
unique() %>%
length()
}, integer(1))) %>%
dplyr::select(ID = go_id, Ontology = ontology, Term, Total = N, Sig = nSig, nSnps, Expected, p, FDR, Genes) %>%
mutate(ID = gsub(":", "\\\\:", ID),
Expected = round(Expected, 3),
p = round(p, 4),
FDR = round(FDR, 3)) %>%
distinct(Ontology, Genes, .keep_all = TRUE) %>%
pander(
split.tables = Inf, style = "rmarkdown",
justify = "lllrrrrrrl",
caption = paste(
"Most highly ranked GO terms for combined results.",
"Where multiple terms resulted from the identical combination of genes, only the most highly ranked terms is shown.",
"Overall, this corresponds to an FDR of",
percent(max(.$FDR)), ".*"
)
)
ID | Ontology | Term | Total | Sig | nSnps | Expected | p | FDR | Genes |
---|---|---|---|---|---|---|---|---|---|
GO:0015669 | BP | gas transport | 4 | 3 | 1 | 0.075 | 2e-04 | 0.034 | HBB2, HBG1, HBG2 |
GO:0019825 | MF | oxygen binding | 4 | 3 | 1 | 0.075 | 2e-04 | 0.034 | HBB2, HBG1, HBG2 |
GO:0015893 | BP | drug transport | 14 | 4 | 2 | 0.264 | 3e-04 | 0.037 | HBB2, HBG1, HBG2, SFXN1 |
snpsGR %>%
resize(width = 1000*maxKb*2, fix = "center") %>%
trim() %>%
find_overlaps(ensGenes) %>%
as.data.frame() %>%
as_tibble() %>%
left_join(snps, by = c("variant.id", "snpID", "Col")) %>%
dplyr::select(snpID, gene_id, Name, chromosome, position) %>%
left_join(
as.data.frame(ensGenes), by = c("gene_id", "Name")
) %>%
mutate(
distance = case_when(
position < start ~ comma(start - position, accuracy = 1),
position > end ~ comma(position - end, accuracy = 1),
snpID %in% ensExons$snpID ~ "Exonic",
position >= start & position <= end ~ "Intronic"
),
model = ifelse(
snpID %in% filter(alleleResults, FDR < alpha)$snpID, "A", ""
),
model = ifelse(
snpID %in% dplyr::filter(snps, variant.id %in% dplyr::filter(bayRes, qval < 0.05)$variant.id)$snpID,
paste0(model, "B"), model
),
model = ifelse(
snpID %in% filter(genotypeResults, FDR < alpha)$snpID,
paste0(model, "G"), model
),
width = comma(width, accuracy = 1)
) %>%
unite(location, chromosome, position, sep = ":") %>%
dplyr::select(
snpID, location, gene_id, Name, distance
) %>%
left_join(
genotypeResults %>% dplyr::select(snpID, genotype_p = p)
) %>%
left_join(
alleleResults %>% dplyr::select(snpID, allele_p = p)
) %>%
write_tsv(here::here("R", "allTestedSnps.tsv"))
All tested SNPs, along with genes within 40kb and p-values was then exported, and can be downloaded from here
R version 3.6.2 (2019-12-12)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats4, parallel, grid, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: GO.db(v.3.10.0), AnnotationDbi(v.1.48.0), Biobase(v.2.46.0), rtracklayer(v.1.46.0), plyranges(v.1.6.7), GenomicRanges(v.1.38.0), GenomeInfoDb(v.1.22.0), IRanges(v.2.20.2), S4Vectors(v.0.24.3), BiocGenerics(v.0.32.0), ggrepel(v.0.8.1), qqman(v.0.1.4), ggsn(v.0.5.0), rgdal(v.1.4-8), ggmap(v.3.0.0), sp(v.1.3-2), readxl(v.1.3.1), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.4), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.2), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.3.0), magrittr(v.1.5), scales(v.1.1.0), pander(v.0.6.3), SNPRelate(v.1.20.1), SeqArray(v.1.26.2) and gdsfmt(v.1.22.0)
loaded via a namespace (and not attached): colorspace(v.1.4-1), rjson(v.0.2.20), ellipsis(v.0.3.0), class(v.7.3-15), rprojroot(v.1.3-2), futile.logger(v.1.4.3), XVector(v.0.26.0), fs(v.1.3.1), rstudioapi(v.0.10), farver(v.2.0.3), bit64(v.0.9-7), fansi(v.0.4.1), lubridate(v.1.7.4), xml2(v.1.2.2), knitr(v.1.27), jsonlite(v.1.6), Rsamtools(v.2.2.1), broom(v.0.5.4), dbplyr(v.1.4.2), png(v.0.1-7), compiler(v.3.6.2), httr(v.1.4.1), backports(v.1.1.5), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), cli(v.2.0.1), formatR(v.1.7), htmltools(v.0.4.0), tools(v.3.6.2), gtable(v.0.3.0), glue(v.1.3.1), GenomeInfoDbData(v.1.2.2), Rcpp(v.1.0.3), cellranger(v.1.1.0), vctrs(v.0.2.2), Biostrings(v.2.54.0), nlme(v.3.1-143), xfun(v.0.12), rvest(v.0.3.5), lifecycle(v.0.1.0), XML(v.3.99-0.3), zlibbioc(v.1.32.0), MASS(v.7.3-51.5), hms(v.0.5.3), SummarizedExperiment(v.1.16.1), lambda.r(v.1.2.4), yaml(v.2.2.1), memoise(v.1.1.0), calibrate(v.1.7.5), stringi(v.1.4.5), RSQLite(v.2.2.0), highr(v.0.8), maptools(v.0.9-9), e1071(v.1.7-3), BiocParallel(v.1.20.1), RgoogleMaps(v.1.4.5.2), rlang(v.0.4.4), pkgconfig(v.2.0.3), bitops(v.1.0-6), matrixStats(v.0.55.0), evaluate(v.0.14), lattice(v.0.20-38), sf(v.0.8-1), labeling(v.0.3), GenomicAlignments(v.1.22.1), bit(v.1.1-15.1), tidyselect(v.1.0.0), here(v.0.1), plyr(v.1.8.5), R6(v.2.4.1), generics(v.0.0.2), DelayedArray(v.0.12.2), DBI(v.1.1.0), pillar(v.1.4.3), haven(v.2.2.0), foreign(v.0.8-75), withr(v.2.1.2), units(v.0.6-5), RCurl(v.1.98-1.1), modelr(v.0.1.5), crayon(v.1.3.4), futile.options(v.1.0.1), KernSmooth(v.2.23-16), rmarkdown(v.2.1), jpeg(v.0.1-8.1), blob(v.1.2.1), reprex(v.0.3.0), digest(v.0.6.23), classInt(v.0.4-2), VennDiagram(v.1.6.20) and munsell(v.0.5.0)