Preliminary Setup

First we load the necessary libraries, along with a set of utility functions.

library(here)
library(stringr)
library(glue)
library(magrittr)
library(openxlsx)
library(SummarizedExperiment)
library(dplyr)
library(edgeR)
library(limma)
library(csaw)
library(sva)
library(ggplot2)
library(scales)
library(GGally)
library(ggalt)
library(ggthemes)
library(splines)
library(reshape2)
library(assertthat)
library(ggfortify)
library(broom)
library(ks)
library(RColorBrewer)
library(Rtsne)
library(variancePartition)

library(BSgenome.Hsapiens.UCSC.hg38)

library(doParallel)
ncores <- getOption("mc.cores", default=parallel::detectCores(logical = FALSE))
options(mc.cores=ncores)
registerDoParallel(cores=ncores)
library(BiocParallel)
register(DoparParam())

options(future.globals.maxSize=4 * 1024^3)
library(future)
plan(multicore)

source(here("scripts/utilities.R"))

# Required in order to use DGEList objects with future
length.DGEList <- function(x) {
    length(unclass(x))
}

We also set the random seed for reproducibility:

set.seed(1986)

Data Loading and Preprocessing

First we load the consensus peaks called from the reads pooled from all samples. This consensus peak set is not biased toward or against any sample or condition, and therefore the peak significance is expected to be independent of any differential binding in that peak.

peakfile <- here(
    "peak_calls", "epic_hg38.analysisSet",
    str_c(params$histone_mark, "_condition.ALL_donor.ALL"),
    "peaks_noBL_IDR.narrowPeak")
allpeaks <- {
    read.narrowPeak(peakfile) %>% as("GRanges") %>%
        assign_into(seqinfo(.), seqinfo(BSgenome.Hsapiens.UCSC.hg38)[seqlevels(.)]) %>%
        setNames(.$name)
}

Now we’ll load the ChIP-seq read count data set from RDS files containing SummarizedExperiment objects, and modify them to use the sample names as column names. We also ensure that the column order is identical between the two objects. Lastly, we filter out any windows with fewer than one count per sample. This is a very mild filtering criterion, but it often eliminates many windows, greatly easing the subsequent computational burden of computing the real filtering threshold.

sexpfile <-
    here("saved_data",
         glue_data(params, "csaw-counts_{window_size}-windows_{fragment_length}-reads_{histone_mark}.RDS"))
bigbin.sexpfile <-
    here("saved_data",
         glue_data(params, "csaw-counts_{bigbin_size}-bigbins_{histone_mark}.RDS"))
bigbin.sexp <- readRDS(bigbin.sexpfile)
full.sexp <- readRDS(sexpfile)
colnames(full.sexp) <- colData(full.sexp)$SampleName
colnames(bigbin.sexp) <- colData(bigbin.sexp)$SampleName
# Ensure identical column order
bigbin.sexp %<>% .[,colnames(full.sexp)]
assert_that(all(colnames(full.sexp) == colnames(bigbin.sexp)))
[1] TRUE
sexp <- full.sexp %>% .[rowSums(assay(.)) >= ncol(.),]
# Exepected number of counts per read, based on overlapping multiple windows.
# NOTE: Assumes windows exactly tile the genome (no overlaps, no gaps).
colData(sexp)$CountDupFactor <- (colData(sexp)$ext - 1) / median(width(rowRanges(sexp))) + 1

We extract the sample metadata from the SummarizedExperiment. We set all factors to use a sum-to-zero variant of the treatment-contrast coding, which will ease the subtraction of batch effects later.

sample.table <- colData(sexp) %>%
    as.data.frame %>% autoFactorize %>%
    mutate(days_after_activation=time_point %>% str_extract("\\d+$") %>% as.numeric(),
           time_point=factor(days_after_activation) %>% `levels<-`(glue("D{levels(.)}")),
           group=interaction(cell_type, time_point, sep="")) %>%
    autoFactorize %>%
    set_rownames(colnames(sexp))
for (i in names(sample.table)) {
    if (is.factor(sample.table[[i]]) && nlevels(sample.table[[i]]) > 1) {
        sample.table[[i]] %<>% C(code_control_named(levels(.)))
    }
}

Peak and Window Filtering

We begin by selecting only peaks with an IDR value of 0.05 or less.

idr.threshold <- 0.05
genome.size <- seqlengths(seqinfo(allpeaks)) %>% as.numeric %>% sum
peaks <- allpeaks[allpeaks$qValue >= -log10(idr.threshold)]
pct.covered <- width(peaks) %>% sum %>% divide_by(genome.size) %>% multiply_by(100)
mean.pct.reads <- sexp %>% subsetByOverlaps(peaks) %>% assay("counts") %>%
    colSums %>% divide_by(colData(sexp) %$% {totals * CountDupFactor}) %>% multiply_by(100) %>%
    mean
message(glue("Selected {length(peaks)} peaks at an IDR threshold of {format(idr.threshold, digits=3)}, with an average width of {round(mean(width(peaks)))} nucleotides and covering a total of {format(pct.covered, digits=3)}%% of the genome, containing on average {format(mean.pct.reads, digits=3)}%% of reads"))
Selected 18139 peaks at an IDR threshold of 0.05, with an average width of 18968 nucleotides and covering a total of 11.1%% of the genome, containing on average 22.6%% of reads

Now we need a strategy to filter out uninformative windows representing background regions of the genome where no specific binding is observed. First, we examine the overall distribution of average logCPM values, as well as the average logCPM distribution within called peaks:

a %<-% aveLogCPM(asDGEList(sexp), prior.count = 2)
peak.overlap <- overlapsAny(sexp, peaks)
a.peaks <- a[peak.overlap]
adata <- data.frame(logCPM=a, PeakOverlap=peak.overlap)
p <- list(
    Histogram=ggplot(adata) +
        aes(x=logCPM, fill=PeakOverlap) +
        geom_histogram(aes(y=100*(..count..)/sum(..count..)), binwidth=0.1, boundary=0) +
        xlab("Average logCPM") + ylab("Percent of windows in bin") +
        coord_cartesian(xlim=quantile(a, c(0, 0.999)), ylim=c(0,3)) +
        labs(title="Histogram of average window logCPM values",
             subtitle="Colored by peak overlap"),
    Violin=ggplot(adata) +
        aes(x=PeakOverlap, y=logCPM) +
        geom_violin(aes(fill=PeakOverlap), scale = "area") +
        geom_boxplot(width = 0.07, fill = "grey", alpha=0.75, outlier.alpha = 0) +
        scale_fill_hue(guide="none") +
        coord_cartesian(ylim=quantile(a, c(0, 0.999))) +
        labs(title="Violin plot of average window logCPM values",
             subtitle="Grouped by peak overlap"))
ggprint(p)

summary(lm(logCPM ~ PeakOverlap, data=adata))

Call:
lm(formula = logCPM ~ PeakOverlap, data = adata)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.4361 -0.2769 -0.0544  0.2238  5.2597 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.6439690  0.0001738   -9459   <2e-16 ***
PeakOverlapTRUE  0.7118700  0.0004895    1454   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3846 on 5601886 degrees of freedom
Multiple R-squared:  0.2741,    Adjusted R-squared:  0.2741 
F-statistic: 2.115e+06 on 1 and 5601886 DF,  p-value: < 2.2e-16

From the linear model and violin plot, we can see that peaks are clearly significantly enriched for high-abundance windows relative to background regions. However, the histogram shows that a purely count-based filter is not desirable, because any threshold still leaves substantial high-count windows that do no overlap peaks.This is because the peaks represent only a small fraction of the genome and the random variation in background coverage depth is at least as large as the difference between peaks and unbound regions. Hence, we will simply select all windows that overlap called peaks. For this purpose, we will select a larger set of peaks using a more relaxed IDR threshold in order to maximize the probability of including peaks that are present in only one or a few conditions and are therefore more weakly represented in the all-sample consensus. The trade-off of including more false positive peaks is acceptable here, since false positive peaks are not expected to show evidence of differential binding.

idr.threshold <- 0.2
peaks <- allpeaks[allpeaks$qValue >= -log10(idr.threshold)]
pct.covered <- width(peaks) %>% sum %>% divide_by(genome.size) %>% multiply_by(100)
mean.pct.reads <- sexp %>% subsetByOverlaps(peaks) %>% assay("counts") %>%
    colSums %>% divide_by(colData(sexp) %$% {totals * CountDupFactor}) %>% multiply_by(100) %>%
    mean
message(glue("Selected {length(peaks)} peaks at an IDR threshold of {format(idr.threshold, digits=3)}, with an average width of {round(mean(width(peaks)))} nucleotides and covering a total of {format(pct.covered, digits=3)}%% of the genome, containing on average {format(mean.pct.reads, digits=3)}%% of reads"))
Selected 24278 peaks at an IDR threshold of 0.2, with an average width of 15710 nucleotides and covering a total of 12.4%% of the genome, containing on average 24.6%% of reads
sexp %<>% subsetByOverlaps(peaks)

Lastly, we plot the resulting aveLogCPM distribution.

a <- aveLogCPM(asDGEList(sexp), prior.count = 2)
p <- ggplot(data.frame(logCPM=a)) +
    aes(x=logCPM) +
    geom_histogram(aes(y=100*(..count..)/sum(..count..)), binwidth=0.1, boundary=0) +
    xlab("Average logCPM") + ylab("Percent of windows in bin") +
    coord_cartesian(xlim=quantile(a, c(0, 0.999)))
ggprint(p)