Linkage Analysis

library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
library(parallel)
library(plyranges)
library(rtracklayer)
library(ggrepel)

if (interactive()) here::here("R") %>% setwd()
theme_set(theme_bw())
panderOptions('big.mark', ",")
panderOptions('table.style', "rmarkdown")
panderOptions('table.split.table', Inf)
panderOptions("missing", "")
mc <- min(12, detectCores() - 1)

Linkage Analysis

gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
gdsFile <- seqOpen(gdsPath, readonly = TRUE)

sampleID <- tibble(
  Sample = seqGetData(gdsFile, "sample.annotation/Sample"),
  Population = seqGetData(gdsFile, "sample.annotation/Population"),
  Location = seqGetData(gdsFile, "sample.annotation/Location")
) %>%
  filter(Population != 2010)

Autosomal SNP IDs for the 1996 and 2012 samples were loaded. To then define the SNPs used for estimation of Linkage Decay, select the 10,000 autosomal SNPs identified in the highest number of individuals. Here this was set as SNPs identified in >95% of individuals (i.e. > 104). For simpler plotting and estimation of decay, only SNPs < 1Mb apart were retained.

A function was then defined to fit Hill & Weir’s linkage decay equation using a non-linear fit.

hwDecay <- function(d, r2, n = nrow(sampleID), k = 1e3){
  st <- c(C = 0.1)
  fit <- nls(
    r2 ~ ((10 + C*d) / ((2 + C*d)*(11 + C*d))) *
      (1 +((3 + C*d)*(12 + 12*C*d + (C*d)^2)) / (n*(2 + C*d)*(11 + C*d))),
    start = st,
    control = nls.control(maxiter = 100)
  )
  rho <- summary(fit)$parameters[1]
  x <- seq(0, max(d), length.out = k)
  y <- ((10 + rho*x)/((2 + rho*x)*(11 + rho*x))) *
    (1 + ((3 + rho*x)*(12 + 12*rho*x + (rho*x)^2)) / (n*(2 + rho*x)*(11 + rho*x)))
  list(line = tibble(x = x, y = y),
       fit = fit, 
       rho = rho)
}

fitDecay <- allLD %>%
  mutate(r2 = LD^2) %>%
  with(hwDecay(bp, r2))

set.seed(1764)
allLD %>%
  sample_n(n2plot) %>%
  ggplot(aes(bp / 1e3, LD^2)) +
  geom_point(alpha = 0.1) +
  geom_line(
    aes(x / 1e3, y), 
    data = fitDecay$line,
    colour = "green"
  ) +
  geom_vline(xintercept = 40, colour = "green", linetype = 2) +
  geom_vline(
    xintercept = dplyr::filter(fitDecay$line, y < 0.5*max(y))$x[1]/1e3,
    colour = "red", 
    linetype = 2
  ) + 
  geom_label(
    x = dplyr::filter(fitDecay$line, y < 0.5*max(y))$x[1]/1e3,
    y = 0.95,
    label = paste(
      round(dplyr::filter(fitDecay$line, y < 0.5*max(y))$x[1]/1e3, 0),
      "Kb"
    ),
    colour = "red"
  ) +
  labs(x = "Distance (Kb)", y = expression(r^2)) +
  scale_x_continuous(
    expand = expand_scale(0.02), 
    breaks = c(40, seq(0, 1e3, by = 200)), 
    minor_breaks = seq(0, 1e3, by = 100)
  ) +
  theme(
    axis.title.y = element_text(angle = 0, hjust = 0.5, vjust = 0.5),
    text = element_text(size = 14)
  )

Linkage decay for distances up to 1Mb. The Hill-Weir decay line is shown in green as is the distance 40kb. The red dashed line indicates the distances where linkage falls to half of the maximal linkage. 100,000 randomly chosen pairs from the complete set of 695,584 pairwise comparisons are shown.

Analysis of Significant SNPs

d <- 4e4

As the set of SNPs under analysis had been linkage pruned in order to only include one representative SNP, any SNPs within 40kb of the initially analysed SNPs were explored for evidence of linkage. As phasing was not possible outside of a given Stacks Locus, correlations between genotypes were used as a proxy for linkage. Correlations between were calculated for each time-point separately as they may not be expected to be the same at each time point, given the selective bottleneck applied by RHDV.

alleleResults <- read_tsv("alleleResults.tsv")
genotypeResults <- read_tsv("genotypeResults.tsv")

ensGenes <- file.path(
  "..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz"
) %>%
  import.gff3(feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE) %>%
  .[,c("gene_id", "Name", "description")]

snpsGR <- makeGRangesFromDataFrame(
  df = snps, 
  keep.extra.columns = TRUE, 
  ignore.strand = TRUE, 
  seqinfo = seqinfo(ensGenes), 
  seqnames.field = "chromosome", 
  start.field = "position", 
  end.field = "position"
)

ensExons <- file.path("..", "external", "Oryctolagus_cuniculus.OryCun2.0.96.gff3.gz") %>%
    import.gff3(feature.type = "exon", sequenceRegionsAsSeqinfo = TRUE) %>%
  join_overlap_inner_within(
    ensGenes, maxgap = 0, minoverlap = 0, suffix = c(".exon", "")
  ) %>%
  join_overlap_inner(snpsGR)
ensExons <- ensExons[,setdiff(colnames(mcols(ensExons)), "Name.exon")]

sigSNPs <- snpsGR %>%
  subset(
    snpID %in% (
      list(
        allele = alleleResults,
        genotype = genotypeResults
      ) %>%
        lapply(dplyr::filter, FDR < 0.05) %>%
        lapply(extract2, "snpID") %>%
        unlist() %>%
        unique()
    )
  ) 
sigRanges <- sigSNPs %>%
  resize(width = 2*d + 1, fix = "center") %>%
  trim()

snpsForCor <- snpsGR %>%
  filter_by_overlaps(sigRanges) %>%
  join_nearest(sigSNPs) %>%
  split(f = .$snpID.y) %>%
  .[sigSNPs$snpID]

countN <- function(x){sum(!is.na(x))}
samples <- list(
  ora = dplyr::filter(sampleID, Population == 2012)$Sample,
  gc = dplyr::filter(sampleID, Population == 1996)$Sample
)
snpN <- snpsForCor %>%
  lapply(function(x){
    seqSetFilter(
      gdsFile, 
      sample.id = unlist(samples), 
      variant.id = mcols(x)$variant.id.x
      )
    gt <- seqGetData(gdsFile, "genotype")
    seqResetFilter(gdsFile)
    apply(gt, MARGIN = 3, colSums) %>%
      set_colnames(mcols(x)$snpID.x) %>%
      set_rownames(unlist(samples)) %>%
      as.data.frame() %>%
      rownames_to_column("Sample") %>%
      left_join(sampleID) %>%
      as_tibble() %>%
      group_by(Population) %>%
      summarise_at(vars(contains("_")), countN) %>%
      gather("snpID", "N", -Population) %>%
      spread("Population", "N") %>%
      dplyr::rename(N_1996 = `1996`, N_2012 = `2012`)
  }) %>%
  bind_rows()

cors <- snpsForCor %>%
  lapply(function(x){
    df1 <- snpgdsLDMat(
      gdsFile,
      sample.id = samples$gc,
      snp.id = x$variant.id.x,
      slide = -1,
      method = "corr"
    )$LD %>%
      set_rownames(x$snpID.x) %>%
      set_colnames(x$snpID.x) %>%
      as.data.frame() %>%
      rownames_to_column("snp1") %>%
      gather(key = "snp2", value = "corr1996", -snp1) %>%
      dplyr::filter(
        snp1 %in% sigSNPs$snpID,
        snp1 != snp2
      )
    df2 <- snpgdsLDMat(
      gdsFile,
      sample.id = samples$ora,
      snp.id = x$variant.id.x,
      slide = -1,
      method = "corr"
    )$LD %>%
      set_rownames(x$snpID.x) %>%
      set_colnames(x$snpID.x) %>%
      as.data.frame() %>%
      rownames_to_column("snp1") %>%
      gather(key = "snp2", value = "corr2012", -snp1) %>%
      dplyr::filter(
        snp1 %in% sigSNPs$snpID,
        snp1 != snp2
      )
    left_join(df1, df2)
  }) %>%
  bind_rows() %>%
  as_tibble() %>%
  dplyr::filter(
    !is.nan(corr2012),
    !is.nan(corr1996)
  ) %>%
  mutate(
    LD2_1996 = corr1996^2,
    LD2_2012 = corr2012^2
  ) %>%
  left_join(snps, by = c("snp1" = "snpID")) %>%
  dplyr::select(
    starts_with("snp"), 
    starts_with("corr"),
    starts_with("LD"),
    variant.id, chromosome, 
    pos1 = position, 
    Locus1 = `Locus ID`
  ) %>%
  left_join(
    snps, 
    by = c("snp2" = "snpID", "chromosome" = "chromosome")
  ) %>%
  dplyr::select(
    chromosome,
    starts_with("snp"), 
    starts_with("corr"),
    starts_with("LD"),    
    Locus1, Locus2 = `Locus ID`,
    pos1, pos2 = position
  ) %>%
  dplyr::mutate(
    dist = pos2 - pos1,
    model = case_when(
      snp1 %in% dplyr::filter(alleleResults, FDR < 0.05)$snpID &
        snp1 %in% dplyr::filter(genotypeResults, FDR < 0.05)$snpID ~ "Both",
      snp1 %in% dplyr::filter(alleleResults, FDR < 0.05)$snpID ~ "Allele",
      snp1 %in% dplyr::filter(genotypeResults, FDR < 0.05)$snpID ~ "Genotype" 
    ),
    diff = LD2_2012 - LD2_1996
  )

In this analysis, linkage between each significant SNP and those within 40kb was investigated. Under no selective pressure, linkage would be expected to gradually decrease over the passage of time through recombination, as seen below, where correlations (as indicated by the blue regression line) are trending towards zero. SNPs in strong physical linkage before selective pressure from RHDV tended to remain in strong linkage if within the same locus (~100bp).

labFun <- function(x){1 / x}
cors %>% 
  mutate(`Same Locus` = Locus1 == Locus2) %>%
  dplyr::filter(
    snp1 %in% dplyr::filter(snpN, N_1996 >= 40, N_2012 >= 37)$snpID,
    snp2 %in% dplyr::filter(snpN, N_1996 >= 40, N_2012 >= 37)$snpID    
  ) %>%
  ggplot(aes(corr1996^2, corr2012^2)) +
  geom_point(
    aes(size = 1 / abs(dist), colour = `Same Locus`), 
    alpha = 0.5
  ) +
  geom_abline(slope = 1) + 
  geom_smooth(method = "lm", se = FALSE) +
  scale_size_continuous(
    labels = labFun, 
    breaks = c(1/d, 10^seq(-4, 0)), 
    trans = "sqrt"
  ) + 
  scale_colour_manual(values = c("black", "red")) +
  labs(
    x = expression(paste(r[1996]^2)),
    y = expression(paste(r[2012]^2)),
    size = "Distance (bp)"
  )

Correlations between pairs of SNPs located within 100kb of the significant SNPs detected in the previous stages. The regression line is shown in blue, with y = x shown in black for reference. SNPs within the same locus are shown in red. Only SNPs identified in >70% of both populations are shown.

No impact of distance between two SNPs was noted on changes in linkage, however a clear break in linkage was identified visually for one pair of SNPs. It is worth noting that transformations of correlations using Fisher’s Z transformation, are not viable here for testing changes in correlation. These are sample estimates of a true population parameter, and values at the boundary points (i.e. \(\rho = 1\)) are very common. These values are unable to be transformed to any value beside \(\infty\), and as such statistical tests cannot be performed.

cors %>% 
  ggplot(aes(abs(dist)/1000, diff)) +
  geom_point(aes(alpha = LD2_1996)) +
  geom_smooth(method = "lm") +
  geom_text(
    aes(label = label),
    data = . %>% 
      dplyr::filter(diff == min(diff)) %>%
      mutate(label = paste(snp1, snp2, sep = "\n"))
  ) +
  labs(
    x = "Distance (kb)",
    y = expression(paste(Delta, r^2))
  )

Change in linkage between SNP pairs shown by distance. One SNP pair which was found in 100% linkage for the 1996 population appeared to shown near zero linkage in 2012.

One pair of SNPs (2028473_73 & 2028554_103) was noticed as showing a clear break in linkage. Manual inspection revealed a further SNP (2028643_21; 13:3054236) showing a very similar decrease in linkage (r²) from 1 to 0.1361, at a distance of 74.8kb. Several other unlinked SNPs also appeared in this region, however this observation suggests than an ancestral haplotype from 1996 may have been selected against. The nearest gene to this region is COP1 (13:3090539-3302219), and the human protein ortholog (Q8NHY2) is predicted to be located in numerous cellular locations, including the Golgi Apparatus.

cors %>% 
  mutate(diff = LD2_2012 - LD2_1996) %>% 
  dplyr::filter(diff == min(diff)) %>%
  dplyr::select(
    Chromosome = chromosome, 
    Position = pos1,
    snp1, snp2, 
    Distance = dist,
    `r~1996~^2^` = LD2_1996, 
    `r~2012~^2^` = LD2_2012
  ) %>%
  pander(
    justify = "lrrrlll",
    caption = "The one SNP pair noted with a clear change in linkage"
  )

The one SNP pair noted with a clear change in linkage
Chromosome	Position	snp1	snp2	Distance	r₁₉₉₆²	r₂₀₁₂²
13	2,979,383	2028473_73	2028554_103	38,147	1	0.01481

Session Info

seqClose(gdsFile)

sessionInfo() %>% pander()

R version 3.6.1 (2019-07-05)

**Platform:** x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: ggrepel(v.0.8.1), rtracklayer(v.1.46.0), plyranges(v.1.6.3), GenomicRanges(v.1.38.0), GenomeInfoDb(v.1.22.0), IRanges(v.2.20.0), S4Vectors(v.0.24.2), BiocGenerics(v.0.32.0), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tibble(v.2.1.3), ggplot2(v.3.2.1), tidyverse(v.1.3.0), magrittr(v.1.5), scales(v.1.1.0), pander(v.0.6.3), SNPRelate(v.1.20.1), SeqArray(v.1.26.1) and gdsfmt(v.1.22.0)

loaded via a namespace (and not attached): Biobase(v.2.46.0), httr(v.1.4.1), jsonlite(v.1.6), modelr(v.0.1.5), assertthat(v.0.2.1), highr(v.0.8), GenomeInfoDbData(v.1.2.2), cellranger(v.1.1.0), Rsamtools(v.2.2.1), yaml(v.2.2.0), pillar(v.1.4.2), backports(v.1.1.5), lattice(v.0.20-38), glue(v.1.3.1), digest(v.0.6.23), XVector(v.0.26.0), rvest(v.0.3.5), colorspace(v.1.4-1), htmltools(v.0.4.0), Matrix(v.1.2-18), XML(v.3.98-1.20), pkgconfig(v.2.0.3), broom(v.0.5.2), haven(v.2.2.0), zlibbioc(v.1.32.0), BiocParallel(v.1.20.0), farver(v.2.0.3), generics(v.0.0.2), ellipsis(v.0.3.0), withr(v.2.1.2), SummarizedExperiment(v.1.16.0), lazyeval(v.0.2.2), cli(v.1.1.0), crayon(v.1.3.4), readxl(v.1.3.1), evaluate(v.0.14), fs(v.1.3.1), nlme(v.3.1-143), xml2(v.1.2.2), tools(v.3.6.1), hms(v.0.5.2), lifecycle(v.0.1.0), matrixStats(v.0.55.0), munsell(v.0.5.0), reprex(v.0.3.0), DelayedArray(v.0.12.0), Biostrings(v.2.54.0), compiler(v.3.6.1), rlang(v.0.4.2), grid(v.3.6.1), RCurl(v.1.95-4.13), rstudioapi(v.0.10), labeling(v.0.3), bitops(v.1.0-6), rmarkdown(v.1.17), gtable(v.0.3.0), DBI(v.1.1.0), R6(v.2.4.1), GenomicAlignments(v.1.22.1), lubridate(v.1.7.4), knitr(v.1.26), zeallot(v.0.1.0), stringi(v.1.4.5), Rcpp(v.1.0.3), vctrs(v.0.2.0), dbplyr(v.1.4.2), tidyselect(v.0.2.5) and xfun(v.0.11)

Linkage Analysis

Steve Pederson

27 January, 2020