library(tidyverse)
library(rtracklayer)
library(GenomicRanges)
library(biomaRt)
library(GO.db)
library(parallel)
library(magrittr)
library(pander)
library(scales)
library(reshape2)
library(stringr)
nCores <- min(12, detectCores() - 1)
theme_set(theme_bw())
evalLongChunks <- FALSE
This analysis takes the SNPs found to be associated with the two populations and determines which genes are nearby. An analysis of the GO terms associated with these SNPS is also undertaken.
This file covers three main areas
GenomicRanges
were formed defining all genes and exons according to the gene models in Build 84 of Ensembl.
gffFile <- file.path("..", "data", "Oryctolagus_cuniculus.OryCun2.0.84.gff3.gz")
ensGenes <- import.gff(gffFile, feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE) %>%
sortSeqlevels()
ensExons <- import.gff(gffFile, feature.type = "exon", sequenceRegionsAsSeqinfo = TRUE) %>%
sortSeqlevels()
rootGO <- list(
BP = "GO:0008150",
CC = "GO:0005575",
MF = "GO:0003674"
)
mart <- useEnsembl("ensembl", dataset ="ocuniculus_gene_ensembl", version = "84")
ens2Go <- getBM(attributes = c("ensembl_gene_id","go_id", "namespace_1003"),
filters = "ensembl_gene_id",
values = ensGenes$gene_id,
mart = mart) %>%
filter(go_id != "") %>%
dplyr::rename(ontology = namespace_1003) %>%
mutate(ontology = c(biological_process = "BP",
cellular_component = "CC",
molecular_function = "MF")[ontology])
ens2Go %<>% split(f = .$ontology)
ens2Go$BP %<>% filter(go_id %in% get(rootGO$BP, GOBPOFFSPRING))
ens2Go$CC %<>% filter(go_id %in% get(rootGO$CC, GOCCOFFSPRING))
ens2Go$MF %<>% filter(go_id %in% get(rootGO$MF, GOMFOFFSPRING))
ens2Go %<>% bind_rows()
gzOut <- file.path("..", "data", "ens2GO.biomaRt.84.tsv.gz") %>% gzfile("w")
write_delim(ens2Go, gzOut, delim ="\t")
close(gzOut)
ens2Go <- file.path("..", "data","ens2GO.biomaRt.84.tsv.gz") %>%
gzfile() %>%
read_tsv()
biomaRt
back to the root nodes.allGOAncestor <- list(
BP = filter(ens2Go, ontology == "BP") %>%
extract2("go_id") %>%
unique() %>%
sapply(get, GOBPANCESTOR, simplify = FALSE),
CC = filter(ens2Go, ontology == "CC") %>%
extract2("go_id") %>%
unique() %>%
sapply(get, GOCCANCESTOR, simplify = FALSE),
MF = filter(ens2Go, ontology == "MF") %>%
extract2("go_id") %>%
unique() %>%
sapply(get, GOMFANCESTOR, simplify = FALSE)
)
ALLGO2ENS <- ens2Go %>%
split(f = .$ensembl_gene_id) %>%
mclapply(function(x){
g <- unique(x$ensembl_gene_id)
BP <- allGOAncestor$BP[filter(x, ontology == "BP")$go_id] %>%
unlist() %>%
c(filter(x, ontology == "BP")$go_id) %>%
unique()
CC <- allGOAncestor$CC[filter(x, ontology == "CC")$go_id] %>%
unlist() %>%
c(filter(x, ontology == "CC")$go_id) %>%
unique()
MF <- allGOAncestor$MF[filter(x, ontology == "MF")$go_id] %>%
unlist() %>%
c(filter(x, ontology == "MF")$go_id) %>%
unique()
data_frame(ensembl_gene_id = g, go_id = c(BP, CC, MF))
},mc.cores = nCores) %>%
bind_rows() %>%
filter(go_id != "all") %>%
mutate(ontology = Ontology(go_id))
gzOut <- file.path("..", "data", "ALLGO2ENS.tsv.gz") %>% gzfile("w")
write_delim(ALLGO2ENS, gzOut, delim = "\t")
close(gzOut)
ALLGO2ENS <- file.path("..", "data", "ALLGO2ENS.tsv.gz") %>%
gzfile() %>%
read_tsv()
singleGeneGO <- ALLGO2ENS %>%
group_by(go_id) %>%
tally %>%
filter(n == 1) %>%
extract2("go_id")
GO Terms were then defined in terms of their minimum distance to the Root Nodes, down to those requiring up to 3 steps.
firstLevelGO <- list(
BP = c(rootGO$BP, get(rootGO$BP, GOBPCHILDREN)),
CC = c(rootGO$CC, get(rootGO$CC, GOCCCHILDREN)),
MF = c(rootGO$MF, get(rootGO$MF, GOMFCHILDREN))
) %>%
lapply(unique)
secondLevelGO <- list(
BP = c(firstLevelGO$BP, lapply(firstLevelGO$BP, get, GOBPCHILDREN)),
CC = c(firstLevelGO$CC, lapply(firstLevelGO$CC, get, GOCCCHILDREN)),
MF = c(firstLevelGO$MF, lapply(firstLevelGO$MF, get, GOMFCHILDREN))
) %>%
lapply(unlist) %>%
lapply(unique) %>%
lapply(function(x){x[!is.na(x)]})
thirdLevelGO <- list(
BP = c(secondLevelGO$BP, lapply(secondLevelGO$BP, get, GOBPCHILDREN)),
CC = c(secondLevelGO$CC, lapply(secondLevelGO$CC, get, GOCCCHILDREN)),
MF = c(secondLevelGO$MF, lapply(secondLevelGO$MF, get, GOMFCHILDREN))
) %>%
mclapply(unlist, mc.cores = nCores) %>%
mclapply(unique, mc.cores = nCores) %>%
mclapply(function(x){x[!is.na(x)]}, mc.cores = nCores)
resultFiles <- file.path("..", "results", c("flkResults.tsv",
"alleleResults.tsv",
"genotypeResults.tsv")) %>%
set_names(c("flk", "alleles", "genotypes"))
results <- resultFiles %>%
lapply(read_tsv)
resultsGR <- full_join(
results$flk %>%
dplyr::select(snpID, Chr, BP, SNP, `Gum Creek (1996)`, `Oraparinna (2012)`,
FLK_p = F.LK.p.val, FLK_FDR = FDR),
results$alleles %>%
dplyr::select(snpID, Chr, BP, alleles_p = p, alleles_FDR = FDR)
) %>%
full_join(
results$genotypes %>%
dplyr::select(snpID, Chr, BP, genotypes_p = p, genotypes_FDR = FDR)
) %>%
makeGRangesFromDataFrame(
keep.extra.columns = TRUE,
ignore.strand = TRUE,
seqnames.field = "Chr",
start.field = "BP",
end.field = "BP",
seqinfo = seqinfo(ensGenes)) %>%
sort()
resultsGR$Sig <- with(resultsGR, FLK_FDR < 0.05 | genotypes_FDR < 0.1)
snpInGenes <- resultsGR %>%
subset(Sig) %>%
findOverlaps(ensGenes)
snpInExons <- resultsGR %>%
subset(Sig) %>%
findOverlaps(ensExons)
A total of 38 significant SNPs were found to be within the start and end points of genes. 5 of these were within the coding regions of these genes.
Gene ID | Name | Chr | BP | snpID |
---|---|---|---|---|
ENSOCUG00000003383 | MGAT4A | 2 | 93,561,889 | 214772_69 |
ENSOCUG00000017054 | TEX33 | 4 | 84,940,214 | 167107_60 |
ENSOCUG00000017054 | TEX33 | 4 | 84,940,235 | 167108_14 |
ENSOCUG00000005399 | EPO | 6 | 27,103,573 | 157156_58* |
ENSOCUG00000005399 | EPO | 6 | 27,103,576 | 157157_45* |
ENSOCUG00000001407 | MAGI2 | 7 | 38,250,131 | 151791_54 |
ENSOCUG00000011060 | MYO1B | 7 | 131,862,327 | 147965_18 |
ENSOCUG00000005605 | LDLRAD4 | 9 | 48,543,229 | 134595_50 |
ENSOCUG00000024842 | CTIF | 9 | 89,862,459 | 211772_50 |
ENSOCUG00000024842 | CTIF | 9 | 89,862,481 | 211772_28 |
ENSOCUG00000024842 | CTIF | 9 | 89,862,497 | 211772_12 |
ENSOCUG00000015851 | DYM | 9 | 90,037,571 | 137949_40 |
ENSOCUG00000015851 | DYM | 9 | 90,037,630 | 137951_16 |
ENSOCUG00000000804 | SCRN1 | 10 | 14,373,457 | 127156_20 |
ENSOCUG00000000804 | SCRN1 | 10 | 14,373,501 | 127157_45 |
ENSOCUG00000012252 | DPP6 | 13 | 15,938,245 | 107422_35 |
ENSOCUG00000007988 | MUC4 | 14 | 92,793,012 | 101831_18* |
ENSOCUG00000016518 | KIAA1217 | 16 | 735,716 | 87819_88 |
ENSOCUG00000007461 | NAV1 | 16 | 69,850,541 | 87407_11 |
ENSOCUG00000015494 | NIN | 17 | 69,838,525 | 80772_39 |
ENSOCUG00000011533 | RHOBTB1 | 18 | 25,311,139 | 72765_47* |
ENSOCUG00000007069 | FGFR2 | 18 | 68,517,919 | 208110_71 |
ENSOCUG00000007069 | FGFR2 | 18 | 68,517,942 | 208111_21 |
ENSOCUG00000008478 | STXBP4 | 19 | 32,825,478 | 68946_78 |
ENSOCUG00000001207 | ESRRB | 20 | 28,859,654 | 65474_28 |
ENSOCUG00000001722 | KSR2 | 21 | 14,643,101 | 62998_77 |
ENSOCUG00000021564 | GL018704 | 4,853,505 | 53831_42* | |
ENSOCUG00000009435 | XKR5 | GL018713 | 365,938 | 50206_34 |
ENSOCUG00000000596 | PLA2G16 | GL018717 | 314,346 | 48659_40 |
ENSOCUG00000017106 | KAZN | GL018739 | 75,851 | 41475_63 |
ENSOCUG00000017031 | USP46 | GL018754 | 1,365,061 | 37345_16 |
ENSOCUG00000029175 | GL018758 | 1,220,835 | 36566_45 | |
ENSOCUG00000029175 | GL018758 | 1,220,869 | 36567_12 | |
ENSOCUG00000010092 | CPNE9 | GL018802 | 439,482 | 204810_79 |
ENSOCUG00000005387 | XRCC1 | GL018881 | 24,194 | 21896_47 |
ENSOCUG00000005387 | XRCC1 | GL018881 | 24,230 | 21896_11 |
ENSOCUG00000027400 | PLAUR | GL018881 | 88,327 | 233206_29 |
ENSOCUG00000029496 | UNC93A | GL019406 | 5,672 | 5908_8 |
MYO1B
and USP46
are known interacting proteins in human biologyallGenesIn20kb <- subsetByOverlaps(
ensGenes,
resultsGR %>%
resize(width = 40001,fix = "center") %>%
trim()
)
sigGenesIn20kb <- subsetByOverlaps(
ensGenes,
resultsGR %>%
subset(Sig)%>%
resize(width = 40001,fix = "center") %>%
trim()
)
As a an initial conservative approach, the set of genes within 20kb of a SNP was investigated. This produced a list of 7407 genes in total, of which 65 overlapped SNPs of interest.
go2Test20kb <- ALLGO2ENS %>%
filter(ensembl_gene_id %in% sigGenesIn20kb$gene_id,
!go_id %in% unlist(thirdLevelGO)) %>%
extract2("go_id") %>%
unique %>%
setdiff(singleGeneGO)
A set of 767 GO terms mapping to more than one gene, and with at least 4 steps back to the ontology roots, were assigned to the genes within 20kb of the 65 candidate SNPs. These were then tested for enrichment using the set of genes within 20kb of the remaining 18328 SNPs
Due to the closely related nature of some SNPs in this dataset the possibility of two SNPs being within 20kb of the same genes was extremely high, and this approach would ensure that each gene only appears once despite the possibility of mapping to multiple SNPs within the dataset.
Fisher’s Exact Test was then used to test for enrichment by counting the genes mapped to each GO term within the set of significant, and non-significant SNPs, and comparing to the genes not mapped to each GO term in both sets of SNPs. All p values were then adjusted using Benjamini-Hochberg’s FDR.
nSigGenes <- length(sigGenesIn20kb)
nNotSigGenes <- length(allGenesIn20kb) - nSigGenes
goResults20kb <- go2Test20kb %>%
lapply(function(x){
# Form a matrix of the counts
mat <- filter(ALLGO2ENS, go_id == x) %>%
mutate(Sig = ensembl_gene_id %in% sigGenesIn20kb$gene_id) %>%
acast(Sig~., fun.aggregate = length, value.var = "Sig")
# Ensure it is a matrix, then set row/colnames
if (length(mat) == 1) mat <- c(0, mat)
mat <- cbind(mat, withoutGO = c(nNotSigGenes, nSigGenes) - mat)
colnames(mat)[1] <- "withGO"
rownames(mat) <- c("notNearSNP", "nearSNP")
# Fisher's exact test
ft <- fisher.test(mat)
data_frame(go_id = x,
expected = nSigGenes * mat["notNearSNP", "withGO"] / nNotSigGenes,
observed = mat["nearSNP", "withGO"],
p = ft$p.value)
}) %>%
bind_rows() %>%
mutate(FDR = p.adjust(p, "fdr")) %>%
arrange(p)
A total of 3 GO terms were considered as enriched using the criteria of an FDR-adjusted p-value < 0.05 and with observed numbers greater than that predicted by the ratio in the non-significant SNP genes.
GO ID | Term | Ont | Observed | Expected | p | FDR |
---|---|---|---|---|---|---|
GO:0010359 | regulation of anion channel activity | BP | 2 | 0.008853 | 0.0002262 | 0.007544 |
GO:0004792 | thiosulfate sulfurtransferase activity | MF | 2 | 0.008853 | 0.0002262 | 0.007544 |
GO:0071294 | cellular response to zinc ion | BP | 2 | 0.02656 | 0.0007455 | 0.01906 |
allGenesIn40kb <- subsetByOverlaps(
ensGenes,
resultsGR %>%
resize(width = 80001,fix = "center") %>%
trim()
)
sigGenesIn40kb <- subsetByOverlaps(
ensGenes,
resultsGR %>%
subset(Sig)%>%
resize(width = 80001,fix = "center") %>%
trim()
)
The same approach was then repeated for genes within 40kb of each SNP. This produced a list of 9688 genes in total, of which 96 overlapped SNPs of interest.
go2Test40kb <- ALLGO2ENS %>%
filter(ensembl_gene_id %in% sigGenesIn40kb$gene_id,
!go_id %in% unlist(thirdLevelGO)) %>%
extract2("go_id") %>%
unique %>%
setdiff(singleGeneGO)
A set of 982 GO terms mapping to more than one gene, and with at least 4 steps back to the ontology roots were assigned to the genes within 40kb of the 65 candidate SNPs. These were then tested for enrichment using the set of genes within 40kb of the remaining 18328 SNPs
nSigGenes <- length(sigGenesIn40kb)
nNotSigGenes <- length(allGenesIn40kb) - nSigGenes
goResults40kb <- go2Test40kb %>%
lapply(function(x){
# Form a matrix of the counts
mat <- filter(ALLGO2ENS, go_id == x) %>%
mutate(Sig = ensembl_gene_id %in% sigGenesIn40kb$gene_id) %>%
acast(Sig~., fun.aggregate = length, value.var = "Sig")
# Ensure it is a matrix, then set row/colnames
if (length(mat) == 1) mat <- c(0, mat)
mat <- cbind(mat, withoutGO = c(nNotSigGenes, nSigGenes) - mat)
colnames(mat)[1] <- "withGO"
rownames(mat) <- c("notNearSNP", "nearSNP")
# Fisher's exact test
ft <- fisher.test(mat)
data_frame(go_id = x,
expected = nSigGenes * mat["notNearSNP", "withGO"] / nNotSigGenes,
observed = mat["nearSNP", "withGO"],
p = ft$p.value)
}) %>%
bind_rows() %>%
mutate(FDR = p.adjust(p, "fdr")) %>%
arrange(p)
A total of 6 GO terms were considered as enriched using the criteria of an FDR-adjusted p-value < 0.05 and with observed numbers greater than that predicted by the ratio in the non-significant SNP genes.
GO ID | Term | Ont | Observed | Expected | p | FDR |
---|---|---|---|---|---|---|
GO:0071294 | cellular response to zinc ion | BP | 3 | 0.02002 | 9.296e-06 | 0.001186 |
GO:0010043 | response to zinc ion | BP | 3 | 0.06005 | 7.586e-05 | 0.003167 |
GO:0048471 | perinuclear region of cytoplasm | CC | 9 | 3.033 | 0.003688 | 0.05132 |
Similar sets of terms were detected at both 20kb and 40kb. The appearance of terms connected to the Zinc ion may be of note as the link between zinc and clearance of HCV has recently been established, via the IFN-γ pathway
The genes associated with each of these GO terms is given below:
GO ID | Term | Gene ID | Name |
---|---|---|---|
GO:0071294 | cellular response to zinc ion | ENSOCUG00000021126 | MT-2A |
ENSOCUG00000021209 | MT-2D | ||
ENSOCUG00000029235 | MT-1A | ||
GO:0010043 | response to zinc ion | ENSOCUG00000021126 | MT-2A |
ENSOCUG00000021209 | MT-2D | ||
ENSOCUG00000029235 | MT-1A | ||
GO:0048471 | perinuclear region of cytoplasm | ENSOCUG00000000596 | PLA2G16 |
ENSOCUG00000001407 | MAGI2 | ||
ENSOCUG00000010088 | MTMR14 | ||
ENSOCUG00000011060 | MYO1B | ||
ENSOCUG00000017056 | PKN2 | ||
ENSOCUG00000021126 | MT-2A | ||
ENSOCUG00000021209 | MT-2D | ||
ENSOCUG00000024842 | CTIF | ||
ENSOCUG00000029235 | MT-1A |
The set of genes within 40kb of each SNP of interest were then exported.
hitsIn40kb <- findOverlaps(
resultsGR %>%
subset(Sig)%>%
resize(width = 80001,fix = "center") %>%
trim(),
ensGenes
)
tsvOut <- file.path("..", "results", "GenesIn40kb.tsv")
list(
resultsGR %>%
subset(Sig) %>%
extract(queryHits(hitsIn40kb)) %>%
as.data.frame() %>%
dplyr::select(snpID, Chr = seqnames, BP = start, genotypes_p, FLK_p) ,
ensGenes %>%
extract(subjectHits(hitsIn40kb)) %>%
as.data.frame() %>%
dplyr::select(GeneStart = start, GeneEnd = end, strand, NearGene = gene_id, GeneName = Name)
) %>%
bind_cols %>%
as_data_frame() %>%
rowwise()%>%
mutate(LocusID = gsub("([0-9]*)_[0-9]*", "\\1", snpID),
dist2Gene = if_else(BP > GeneStart && BP < GeneEnd, 0L, min(abs(c(BP - GeneStart, BP - GeneEnd)))),
GeneWidth = abs(GeneStart - GeneEnd)) %>%
dplyr::select(LocusID, snpID, Chr, BP, NearGene, GeneName, GeneStrand = strand, GeneStart, GeneEnd, GeneWidth, dist2Gene, genotypes_p, FLK_p) %>%
mutate(minP = min(genotypes_p, FLK_p)) %>%
arrange(minP) %>%
dplyr::select(-minP) %>%
as.data.frame() %>%
write_tsv(tsvOut)
pander(sessionInfo())
R version 3.4.4 (2018-03-15)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: bindrcpp(v.0.2.2), reshape2(v.1.4.3), scales(v.0.5.0), pander(v.0.6.1), magrittr(v.1.5), GO.db(v.3.5.0), AnnotationDbi(v.1.40.0), Biobase(v.2.38.0), biomaRt(v.2.34.2), rtracklayer(v.1.38.3), GenomicRanges(v.1.30.3), GenomeInfoDb(v.1.14.0), IRanges(v.2.12.0), S4Vectors(v.0.16.0), BiocGenerics(v.0.24.0), forcats(v.0.3.0), stringr(v.1.3.1), dplyr(v.0.7.5), purrr(v.0.2.5), readr(v.1.1.1), tidyr(v.0.8.1), tibble(v.1.4.2), ggplot2(v.2.2.1) and tidyverse(v.1.2.1)
loaded via a namespace (and not attached): nlme(v.3.1-137), bitops(v.1.0-6), matrixStats(v.0.53.1), lubridate(v.1.7.4), bit64(v.0.9-7), progress(v.1.1.2), httr(v.1.3.1), rprojroot(v.1.3-2), tools(v.3.4.4), backports(v.1.1.2), R6(v.2.2.2), DBI(v.1.0.0), lazyeval(v.0.2.1), colorspace(v.1.3-2), tidyselect(v.0.2.4), prettyunits(v.1.0.2), mnormt(v.1.5-5), bit(v.1.1-14), compiler(v.3.4.4), cli(v.1.0.0), rvest(v.0.3.2), xml2(v.1.2.0), DelayedArray(v.0.4.1), psych(v.1.8.4), digest(v.0.6.15), Rsamtools(v.1.30.0), foreign(v.0.8-70), rmarkdown(v.1.9), XVector(v.0.18.0), pkgconfig(v.2.0.1), htmltools(v.0.3.6), rlang(v.0.2.1), readxl(v.1.1.0), rstudioapi(v.0.7), RSQLite(v.2.1.1), bindr(v.0.1.1), jsonlite(v.1.5), BiocParallel(v.1.12.0), RCurl(v.1.95-4.10), GenomeInfoDbData(v.1.0.0), Matrix(v.1.2-14), Rcpp(v.0.12.17), munsell(v.0.4.3), stringi(v.1.2.2), yaml(v.2.1.19), SummarizedExperiment(v.1.8.1), zlibbioc(v.1.24.0), plyr(v.1.8.4), grid(v.3.4.4), blob(v.1.1.1), crayon(v.1.3.4), lattice(v.0.20-35), Biostrings(v.2.46.0), haven(v.1.1.1), hms(v.0.4.2), knitr(v.1.20), pillar(v.1.2.3), XML(v.3.98-1.11), glue(v.1.2.0), evaluate(v.0.10.1), modelr(v.0.1.2), cellranger(v.1.1.0), gtable(v.0.2.0), assertthat(v.0.2.0), broom(v.0.4.4), GenomicAlignments(v.1.14.2) and memoise(v.1.1.0)