library(tidyverse)
library(rtracklayer)
library(GenomicRanges)
library(biomaRt)
library(GO.db)
library(parallel)
library(magrittr)
library(pander)
library(scales)
library(reshape2)
library(stringr)

nCores <- min(12, detectCores() - 1)
theme_set(theme_bw())
evalLongChunks <- FALSE

Introduction

This analysis takes the SNPs found to be associated with the two populations and determines which genes are nearby. An analysis of the GO terms associated with these SNPS is also undertaken.

Outline of analysis

This file covers three main areas

Preparation of the complete mappings from Ensembl gene IDs to GO terms
Finding SNPs within and near genes
Identification of enriched GO terms using Fisher’s Exact Test on the sets of genes within 2kb and 40kb of the significant SNPs

Data Setup

Genes

GenomicRanges were formed defining all genes and exons according to the gene models in Build 84 of Ensembl.

gffFile <- file.path("..", "data", "Oryctolagus_cuniculus.OryCun2.0.84.gff3.gz") 
ensGenes <- import.gff(gffFile, feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE) %>%
  sortSeqlevels() 
ensExons <-  import.gff(gffFile, feature.type = "exon", sequenceRegionsAsSeqinfo = TRUE) %>%
  sortSeqlevels()

GO Terms

Mapping Genes to GO Terms

The set of terms defining the root nodes was initially formed

rootGO <- list(
  BP = "GO:0008150",
  CC = "GO:0005575",
  MF = "GO:0003674"
  )

The complete set of GO IDs mapping to these genes was then obtained from biomaRt.

mart <- useEnsembl("ensembl", dataset ="ocuniculus_gene_ensembl", version = "84")
ens2Go <- getBM(attributes = c("ensembl_gene_id","go_id", "namespace_1003"), 
                filters = "ensembl_gene_id", 
                values = ensGenes$gene_id, 
                mart = mart) %>%
  filter(go_id != "") %>% 
  dplyr::rename(ontology = namespace_1003) %>%
  mutate(ontology = c(biological_process = "BP", 
                      cellular_component = "CC", 
                      molecular_function = "MF")[ontology])

GO Terms in the biomaRt results which failed to map to the database were removed.

ens2Go %<>% split(f = .$ontology)
ens2Go$BP %<>% filter(go_id %in% get(rootGO$BP, GOBPOFFSPRING))
ens2Go$CC %<>% filter(go_id %in% get(rootGO$CC, GOCCOFFSPRING))
ens2Go$MF %<>% filter(go_id %in% get(rootGO$MF, GOMFOFFSPRING))
ens2Go %<>% bind_rows()

The file was then saved to disk and became the reference database for subsequent analyses.

gzOut <- file.path("..", "data", "ens2GO.biomaRt.84.tsv.gz") %>% gzfile("w")
write_delim(ens2Go, gzOut, delim ="\t")
close(gzOut)

ens2Go <- file.path("..", "data","ens2GO.biomaRt.84.tsv.gz") %>%
  gzfile() %>%
  read_tsv()

Building the Complete Set of Mappings

Paths were formed mapping all GO Terms obtained from biomaRt back to the root nodes.

allGOAncestor <- list(
  BP = filter(ens2Go, ontology == "BP") %>%
    extract2("go_id") %>%
    unique() %>%
    sapply(get, GOBPANCESTOR, simplify = FALSE),
  CC = filter(ens2Go, ontology == "CC") %>%
    extract2("go_id") %>%
    unique() %>%
    sapply(get, GOCCANCESTOR, simplify = FALSE),
  MF = filter(ens2Go, ontology == "MF") %>%
    extract2("go_id") %>%
    unique() %>%
    sapply(get, GOMFANCESTOR, simplify = FALSE)
)

Each gene was then mapped to the complete set of GO terms and saved in an external table.

ALLGO2ENS <- ens2Go %>%
  split(f = .$ensembl_gene_id) %>%
  mclapply(function(x){
    g <- unique(x$ensembl_gene_id)
    BP <- allGOAncestor$BP[filter(x, ontology == "BP")$go_id] %>%
      unlist() %>%
      c(filter(x, ontology == "BP")$go_id) %>%      
      unique()
    CC <- allGOAncestor$CC[filter(x, ontology == "CC")$go_id] %>%
      unlist() %>%
      c(filter(x, ontology == "CC")$go_id) %>%      
      unique()
    MF <- allGOAncestor$MF[filter(x, ontology == "MF")$go_id] %>%
      unlist() %>%
      c(filter(x, ontology == "MF")$go_id) %>%
      unique() 
    data_frame(ensembl_gene_id = g, go_id = c(BP, CC, MF))
  },mc.cores = nCores) %>%
  bind_rows() %>%
  filter(go_id != "all") %>%
  mutate(ontology = Ontology(go_id))

gzOut <- file.path("..", "data", "ALLGO2ENS.tsv.gz") %>% gzfile("w")
write_delim(ALLGO2ENS, gzOut, delim = "\t")
close(gzOut)

ALLGO2ENS <- file.path("..", "data", "ALLGO2ENS.tsv.gz") %>%
  gzfile() %>%
  read_tsv()
singleGeneGO <- ALLGO2ENS %>% 
  group_by(go_id) %>% 
  tally %>% 
  filter(n == 1) %>%
  extract2("go_id")

This object contained all 1,264,162 possible gene to GO mappings for this build of Ensembl.
After building the database once, this object then became the reference object for all downstream analysis.
A subset of 4237 GO terms which mapped to only a single gene were also noted for exclusion from testing, as no significant results would be possible from these terms.

Defining GO Term Levels

GO Terms were then defined in terms of their minimum distance to the Root Nodes, down to those requiring up to 3 steps.

firstLevelGO <- list(
  BP = c(rootGO$BP, get(rootGO$BP, GOBPCHILDREN)),
  CC = c(rootGO$CC, get(rootGO$CC, GOCCCHILDREN)),
  MF = c(rootGO$MF, get(rootGO$MF, GOMFCHILDREN))
) %>%
  lapply(unique)
secondLevelGO <- list(
  BP = c(firstLevelGO$BP, lapply(firstLevelGO$BP, get, GOBPCHILDREN)),
  CC = c(firstLevelGO$CC, lapply(firstLevelGO$CC, get, GOCCCHILDREN)),
  MF = c(firstLevelGO$MF, lapply(firstLevelGO$MF, get, GOMFCHILDREN))
) %>%
  lapply(unlist) %>%
  lapply(unique) %>%
  lapply(function(x){x[!is.na(x)]})
thirdLevelGO <- list(
  BP = c(secondLevelGO$BP, lapply(secondLevelGO$BP, get, GOBPCHILDREN)),
  CC = c(secondLevelGO$CC, lapply(secondLevelGO$CC, get, GOCCCHILDREN)),
  MF = c(secondLevelGO$MF, lapply(secondLevelGO$MF, get, GOMFCHILDREN))
) %>%
  mclapply(unlist, mc.cores = nCores) %>%
  mclapply(unique, mc.cores = nCores) %>%
  mclapply(function(x){x[!is.na(x)]}, mc.cores = nCores)

SNPs

resultFiles <- file.path("..", "results", c("flkResults.tsv", 
                                            "alleleResults.tsv",
                                            "genotypeResults.tsv")) %>%
  set_names(c("flk", "alleles", "genotypes"))
results <- resultFiles %>%
  lapply(read_tsv)

resultsGR <- full_join(
  results$flk %>%
    dplyr::select(snpID, Chr, BP, SNP, `Gum Creek (1996)`, `Oraparinna (2012)`,
                  FLK_p = F.LK.p.val, FLK_FDR = FDR),
  results$alleles %>%
    dplyr::select(snpID, Chr, BP, alleles_p = p, alleles_FDR = FDR)
) %>%
  full_join(
  results$genotypes %>%
    dplyr::select(snpID, Chr, BP, genotypes_p = p, genotypes_FDR = FDR)
  ) %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE, 
    ignore.strand = TRUE,
    seqnames.field = "Chr", 
    start.field = "BP", 
    end.field = "BP", 
    seqinfo = seqinfo(ensGenes)) %>%
  sort()

resultsGR$Sig <- with(resultsGR, FLK_FDR < 0.05 | genotypes_FDR < 0.1)

Relationship of SNPs to Genes

SNPs within genes

snpInGenes <- resultsGR %>% 
  subset(Sig) %>%
  findOverlaps(ensGenes)
snpInExons <- resultsGR %>% 
  subset(Sig) %>%
  findOverlaps(ensExons)

A total of 38 significant SNPs were found to be within the start and end points of genes. 5 of these were within the coding regions of these genes.

Genes with significant SNPs located between their start and end positions. SNPs within exons are indicated with an additional asterisk.
Gene ID	Name	Chr	BP	snpID
ENSOCUG00000003383	MGAT4A	2	93,561,889	214772_69
ENSOCUG00000017054	TEX33	4	84,940,214	167107_60
ENSOCUG00000017054	TEX33	4	84,940,235	167108_14
ENSOCUG00000005399	EPO	6	27,103,573	157156_58*
ENSOCUG00000005399	EPO	6	27,103,576	157157_45*
ENSOCUG00000001407	MAGI2	7	38,250,131	151791_54
ENSOCUG00000011060	MYO1B	7	131,862,327	147965_18
ENSOCUG00000005605	LDLRAD4	9	48,543,229	134595_50
ENSOCUG00000024842	CTIF	9	89,862,459	211772_50
ENSOCUG00000024842	CTIF	9	89,862,481	211772_28
ENSOCUG00000024842	CTIF	9	89,862,497	211772_12
ENSOCUG00000015851	DYM	9	90,037,571	137949_40
ENSOCUG00000015851	DYM	9	90,037,630	137951_16
ENSOCUG00000000804	SCRN1	10	14,373,457	127156_20
ENSOCUG00000000804	SCRN1	10	14,373,501	127157_45
ENSOCUG00000012252	DPP6	13	15,938,245	107422_35
ENSOCUG00000007988	MUC4	14	92,793,012	101831_18*
ENSOCUG00000016518	KIAA1217	16	735,716	87819_88
ENSOCUG00000007461	NAV1	16	69,850,541	87407_11
ENSOCUG00000015494	NIN	17	69,838,525	80772_39
ENSOCUG00000011533	RHOBTB1	18	25,311,139	72765_47*
ENSOCUG00000007069	FGFR2	18	68,517,919	208110_71
ENSOCUG00000007069	FGFR2	18	68,517,942	208111_21
ENSOCUG00000008478	STXBP4	19	32,825,478	68946_78
ENSOCUG00000001207	ESRRB	20	28,859,654	65474_28
ENSOCUG00000001722	KSR2	21	14,643,101	62998_77
ENSOCUG00000021564		GL018704	4,853,505	53831_42*
ENSOCUG00000009435	XKR5	GL018713	365,938	50206_34
ENSOCUG00000000596	PLA2G16	GL018717	314,346	48659_40
ENSOCUG00000017106	KAZN	GL018739	75,851	41475_63
ENSOCUG00000017031	USP46	GL018754	1,365,061	37345_16
ENSOCUG00000029175		GL018758	1,220,835	36566_45
ENSOCUG00000029175		GL018758	1,220,869	36567_12
ENSOCUG00000010092	CPNE9	GL018802	439,482	204810_79
ENSOCUG00000005387	XRCC1	GL018881	24,194	21896_47
ENSOCUG00000005387	XRCC1	GL018881	24,230	21896_11
ENSOCUG00000027400	PLAUR	GL018881	88,327	233206_29
ENSOCUG00000029496	UNC93A	GL019406	5,672	5908_8

It was also noted that proteins produced from MYO1B and USP46are known interacting proteins in human biology

Genes within 20kb

allGenesIn20kb <- subsetByOverlaps(
  ensGenes, 
  resultsGR %>%
    resize(width = 40001,fix = "center") %>%
    trim()
    )
sigGenesIn20kb <- subsetByOverlaps(
  ensGenes, 
  resultsGR %>%
    subset(Sig)%>%
    resize(width = 40001,fix = "center") %>%
    trim()
    )

As a an initial conservative approach, the set of genes within 20kb of a SNP was investigated. This produced a list of 7407 genes in total, of which 65 overlapped SNPs of interest.

go2Test20kb <- ALLGO2ENS %>%
  filter(ensembl_gene_id %in% sigGenesIn20kb$gene_id,
         !go_id %in% unlist(thirdLevelGO)) %>%
  extract2("go_id") %>%
  unique %>%
  setdiff(singleGeneGO)

A set of 767 GO terms mapping to more than one gene, and with at least 4 steps back to the ontology roots, were assigned to the genes within 20kb of the 65 candidate SNPs. These were then tested for enrichment using the set of genes within 20kb of the remaining 18328 SNPs

Due to the closely related nature of some SNPs in this dataset the possibility of two SNPs being within 20kb of the same genes was extremely high, and this approach would ensure that each gene only appears once despite the possibility of mapping to multiple SNPs within the dataset.

Fisher’s Exact Test was then used to test for enrichment by counting the genes mapped to each GO term within the set of significant, and non-significant SNPs, and comparing to the genes not mapped to each GO term in both sets of SNPs. All p values were then adjusted using Benjamini-Hochberg’s FDR.

nSigGenes <- length(sigGenesIn20kb)
nNotSigGenes <- length(allGenesIn20kb) - nSigGenes
goResults20kb <- go2Test20kb %>%
  lapply(function(x){
    # Form a matrix of the counts
    mat <- filter(ALLGO2ENS, go_id == x) %>%
      mutate(Sig = ensembl_gene_id %in% sigGenesIn20kb$gene_id) %>%
      acast(Sig~., fun.aggregate = length, value.var = "Sig")
    # Ensure it is a matrix, then set row/colnames
    if (length(mat) == 1) mat <- c(0, mat)
    mat <- cbind(mat, withoutGO = c(nNotSigGenes, nSigGenes) - mat)
    colnames(mat)[1] <- "withGO"
    rownames(mat) <- c("notNearSNP", "nearSNP")
    # Fisher's exact test
    ft <- fisher.test(mat)
    data_frame(go_id = x,
               expected = nSigGenes * mat["notNearSNP", "withGO"] / nNotSigGenes,
               observed = mat["nearSNP", "withGO"],
               p = ft$p.value)
  }) %>%
  bind_rows() %>%
  mutate(FDR = p.adjust(p, "fdr")) %>%
  arrange(p)

A total of 3 GO terms were considered as enriched using the criteria of an FDR-adjusted p-value < 0.05 and with observed numbers greater than that predicted by the ratio in the non-significant SNP genes.

Gene Ontologies considered as enriched amongst the set of 65 genes within 20kb of the significant SNPs. The number of genes matching each term is given in the ‘Observed’ column.
GO ID	Term	Ont	Observed	Expected	p	FDR
GO:0010359	regulation of anion channel activity	BP	2	0.008853	0.0002262	0.007544
GO:0004792	thiosulfate sulfurtransferase activity	MF	2	0.008853	0.0002262	0.007544
GO:0071294	cellular response to zinc ion	BP	2	0.02656	0.0007455	0.01906

Genes within 40kb

allGenesIn40kb <- subsetByOverlaps(
  ensGenes, 
  resultsGR %>%
    resize(width = 80001,fix = "center") %>%
    trim()
    )
sigGenesIn40kb <- subsetByOverlaps(
  ensGenes, 
  resultsGR %>%
    subset(Sig)%>%
    resize(width = 80001,fix = "center") %>%
    trim()
    )

The same approach was then repeated for genes within 40kb of each SNP. This produced a list of 9688 genes in total, of which 96 overlapped SNPs of interest.

go2Test40kb <- ALLGO2ENS %>%
  filter(ensembl_gene_id %in% sigGenesIn40kb$gene_id,
         !go_id %in% unlist(thirdLevelGO)) %>%
  extract2("go_id") %>%
  unique %>%
  setdiff(singleGeneGO)

A set of 982 GO terms mapping to more than one gene, and with at least 4 steps back to the ontology roots were assigned to the genes within 40kb of the 65 candidate SNPs. These were then tested for enrichment using the set of genes within 40kb of the remaining 18328 SNPs

nSigGenes <- length(sigGenesIn40kb)
nNotSigGenes <- length(allGenesIn40kb) - nSigGenes
goResults40kb <- go2Test40kb %>%
  lapply(function(x){
    # Form a matrix of the counts
    mat <- filter(ALLGO2ENS, go_id == x) %>%
      mutate(Sig = ensembl_gene_id %in% sigGenesIn40kb$gene_id) %>%
      acast(Sig~., fun.aggregate = length, value.var = "Sig")
    # Ensure it is a matrix, then set row/colnames
    if (length(mat) == 1) mat <- c(0, mat)
    mat <- cbind(mat, withoutGO = c(nNotSigGenes, nSigGenes) - mat)
    colnames(mat)[1] <- "withGO"
    rownames(mat) <- c("notNearSNP", "nearSNP")
    # Fisher's exact test
    ft <- fisher.test(mat)
    data_frame(go_id = x,
               expected = nSigGenes * mat["notNearSNP", "withGO"] / nNotSigGenes,
               observed = mat["nearSNP", "withGO"],
               p = ft$p.value)
  }) %>%
  bind_rows() %>%
  mutate(FDR = p.adjust(p, "fdr")) %>%
  arrange(p)

A total of 6 GO terms were considered as enriched using the criteria of an FDR-adjusted p-value < 0.05 and with observed numbers greater than that predicted by the ratio in the non-significant SNP genes.

Gene Ontologies considered as enriched amongst the set of 96 genes within 40kb of the significant SNPs. The number of genes matching each term is given in the ‘Observed’ column.
GO ID	Term	Ont	Observed	Expected	p	FDR
GO:0071294	cellular response to zinc ion	BP	3	0.02002	9.296e-06	0.001186
GO:0010043	response to zinc ion	BP	3	0.06005	7.586e-05	0.003167
GO:0048471	perinuclear region of cytoplasm	CC	9	3.033	0.003688	0.05132

Similar sets of terms were detected at both 20kb and 40kb. The appearance of terms connected to the Zinc ion may be of note as the link between zinc and clearance of HCV has recently been established, via the IFN-γ pathway

The genes associated with each of these GO terms is given below:

Genes within 40kb of significant SNPs which are associated wth each enriched GO term
GO ID	Term	Gene ID	Name
GO:0071294	cellular response to zinc ion	ENSOCUG00000021126	MT-2A
		ENSOCUG00000021209	MT-2D
		ENSOCUG00000029235	MT-1A
GO:0010043	response to zinc ion	ENSOCUG00000021126	MT-2A
		ENSOCUG00000021209	MT-2D
		ENSOCUG00000029235	MT-1A
GO:0048471	perinuclear region of cytoplasm	ENSOCUG00000000596	PLA2G16
		ENSOCUG00000001407	MAGI2
		ENSOCUG00000010088	MTMR14
		ENSOCUG00000011060	MYO1B
		ENSOCUG00000017056	PKN2
		ENSOCUG00000021126	MT-2A
		ENSOCUG00000021209	MT-2D
		ENSOCUG00000024842	CTIF
		ENSOCUG00000029235	MT-1A

Export of Results

The set of genes within 40kb of each SNP of interest were then exported.

hitsIn40kb <- findOverlaps(
  resultsGR %>%
    subset(Sig)%>%
    resize(width = 80001,fix = "center") %>%
    trim(),
    ensGenes
    )

tsvOut <- file.path("..", "results", "GenesIn40kb.tsv")
list(
  resultsGR %>%
    subset(Sig) %>%
    extract(queryHits(hitsIn40kb)) %>%
    as.data.frame() %>%
    dplyr::select(snpID, Chr = seqnames, BP = start, genotypes_p,  FLK_p) ,
  ensGenes %>%
    extract(subjectHits(hitsIn40kb)) %>%
    as.data.frame() %>%
    dplyr::select(GeneStart = start, GeneEnd = end, strand, NearGene = gene_id, GeneName = Name) 
) %>%
  bind_cols %>%
  as_data_frame() %>%
    rowwise()%>%
  mutate(LocusID = gsub("([0-9]*)_[0-9]*", "\\1", snpID),
         dist2Gene = if_else(BP > GeneStart && BP < GeneEnd, 0L, min(abs(c(BP - GeneStart, BP - GeneEnd)))),
         GeneWidth = abs(GeneStart - GeneEnd)) %>%
  dplyr::select(LocusID, snpID, Chr, BP, NearGene, GeneName, GeneStrand = strand, GeneStart, GeneEnd, GeneWidth, dist2Gene, genotypes_p, FLK_p) %>%
  mutate(minP = min(genotypes_p, FLK_p)) %>%
  arrange(minP) %>%
  dplyr::select(-minP) %>%
  as.data.frame() %>%
  write_tsv(tsvOut)

pander(sessionInfo())

R version 3.4.4 (2018-03-15)

**Platform:** x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: bindrcpp(v.0.2.2), reshape2(v.1.4.3), scales(v.0.5.0), pander(v.0.6.1), magrittr(v.1.5), GO.db(v.3.5.0), AnnotationDbi(v.1.40.0), Biobase(v.2.38.0), biomaRt(v.2.34.2), rtracklayer(v.1.38.3), GenomicRanges(v.1.30.3), GenomeInfoDb(v.1.14.0), IRanges(v.2.12.0), S4Vectors(v.0.16.0), BiocGenerics(v.0.24.0), forcats(v.0.3.0), stringr(v.1.3.1), dplyr(v.0.7.5), purrr(v.0.2.5), readr(v.1.1.1), tidyr(v.0.8.1), tibble(v.1.4.2), ggplot2(v.2.2.1) and tidyverse(v.1.2.1)

loaded via a namespace (and not attached): nlme(v.3.1-137), bitops(v.1.0-6), matrixStats(v.0.53.1), lubridate(v.1.7.4), bit64(v.0.9-7), progress(v.1.1.2), httr(v.1.3.1), rprojroot(v.1.3-2), tools(v.3.4.4), backports(v.1.1.2), R6(v.2.2.2), DBI(v.1.0.0), lazyeval(v.0.2.1), colorspace(v.1.3-2), tidyselect(v.0.2.4), prettyunits(v.1.0.2), mnormt(v.1.5-5), bit(v.1.1-14), compiler(v.3.4.4), cli(v.1.0.0), rvest(v.0.3.2), xml2(v.1.2.0), DelayedArray(v.0.4.1), psych(v.1.8.4), digest(v.0.6.15), Rsamtools(v.1.30.0), foreign(v.0.8-70), rmarkdown(v.1.9), XVector(v.0.18.0), pkgconfig(v.2.0.1), htmltools(v.0.3.6), rlang(v.0.2.1), readxl(v.1.1.0), rstudioapi(v.0.7), RSQLite(v.2.1.1), bindr(v.0.1.1), jsonlite(v.1.5), BiocParallel(v.1.12.0), RCurl(v.1.95-4.10), GenomeInfoDbData(v.1.0.0), Matrix(v.1.2-14), Rcpp(v.0.12.17), munsell(v.0.4.3), stringi(v.1.2.2), yaml(v.2.1.19), SummarizedExperiment(v.1.8.1), zlibbioc(v.1.24.0), plyr(v.1.8.4), grid(v.3.4.4), blob(v.1.1.1), crayon(v.1.3.4), lattice(v.0.20-35), Biostrings(v.2.46.0), haven(v.1.1.1), hms(v.0.4.2), knitr(v.1.20), pillar(v.1.2.3), XML(v.3.98-1.11), glue(v.1.2.0), evaluate(v.0.10.1), modelr(v.0.1.2), cellranger(v.1.1.0), gtable(v.0.2.0), assertthat(v.0.2.0), broom(v.0.4.4), GenomicAlignments(v.1.14.2) and memoise(v.1.1.0)

Analysis Of Genes Associated With Significant SNPs

Steve Pederson

11 June, 2018