post-map-atac.Rmd

---
title: "Untitled"
author: "Sam Buckberry"
date: "20/10/2021"
output: html_document
---

```{r setup, include=FALSE}
# Show the source code of every chunk in the rendered document by default
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(GenomicRanges)
library(GenomicAlignments)
library(GenomicFeatures)
library(edgeR)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(rtracklayer)
library(MACSr)

library(UpSetR)
library(stringr)
library(magrittr)
library(ggplot2)
library(ggrepel)
```

### Post-alignment processing
- Plot insert size histograms
- Create GRanges object of Tn5 insertion sites
- Remove chrM and blacklist genomic regions
- Create bigwig files
- Call ATAC-seq peaks
- Calculate reads in peaks

First, list the ATAC-seq BAM files
```{r, cache=TRUE}
# Collect the paths of all BAM files in the alignment directory.
# The dot is escaped so that only a literal ".bam" extension matches; an
# unescaped "." is a regex wildcard and would also accept e.g. "sample_xbam".
bam_list <- list.files("aligned_data/",
                       pattern = "\\.bam$",
                       full.names = TRUE)
```

And now index the bam files
```{r, cache=TRUE}
# Build an index for each BAM file in turn
lapply(X = bam_list, FUN = indexBam)
```

And create sample IDs based on shortened file names
```{r, cache=TRUE}
# Derive short sample IDs by stripping the pipeline suffix and the cell-type
# tag from each BAM file name. fixed() makes both patterns literal strings,
# so the "." in ".bam" is no longer treated as a regex wildcard.
ids <- basename(bam_list) %>%
    str_remove_all(fixed("_filt_fastp_chr22_dedup.bam")) %>%
    str_remove_all(fixed("-Effector_CD4pos"))
```

Plot insert sizes for each library. The `plot_insert_sizes` function here will extract the mapping-determined insert sizes from the BAM files. 
```{r, cache=TRUE}
# Plot the distribution of mapping-derived insert sizes for one BAM file.
#
# Arguments:
#   bam_file   Path to a paired-end BAM file.
#   log_scale  Single TRUE/FALSE; if TRUE the y axis is drawn on a log10 scale.
#   title      Plot title.
#
# Returns a ggplot line plot of counts per absolute insert size (< 1000 bp).
# Stops if the BAM file is not paired-end or log_scale is not a single
# non-missing logical value.
plot_insert_sizes <- function(bam_file, log_scale=FALSE, title="") {

    # Checks
    stopifnot(testPairedEndBam(file = bam_file))

    # is.logical() replaces the class(x) == "logical" comparison, and NA is
    # rejected here because it cannot be used as an if() condition below
    stopifnot(is.logical(log_scale),
              length(log_scale) == 1,
              !is.na(log_scale))

    # Setup the filtering parameters for the BAM file
    param <- ScanBamParam(what=c('isize'),
                          flag = scanBamFlag(isSecondaryAlignment = FALSE,
                                             isDuplicate = FALSE,
                                             isUnmappedQuery=FALSE,
                                             isNotPassingQualityControls = FALSE))

    # Get the insert sizes from the bam file and tabulate
    # NOTE(review): no first-mate filter is applied, so a fragment whose two
    # mates both carry an insert size is presumably tallied twice; this
    # scales the counts but not the shape of the curve. Confirm if it matters.
    isize <- scanBam(bam_file, param=param) %>%
        unlist(use.names = FALSE) %>% abs() %>% table()

    df <- data.frame(x = as.numeric(names(isize)), y = as.numeric(isize))

    # Limit width to 1000 bases for plotting
    df <- df[df$x < 1000, ]

    # Build the plot once; the log-scale variant only adds a y scale and
    # replaces the y axis label
    gg_insert <- ggplot(df, aes(x = x, y = y)) +
        geom_line() +
        xlab("Fragment length (bp)") +
        ylab("Fragment counts") +
        ggtitle(title) +
        theme_bw()

    if (log_scale) {
        gg_insert <- gg_insert +
            scale_y_log10() +
            ylab("Fragment counts (log10)")
    }

    return(gg_insert)
}
```

Plot insert sizes for first BAM file
```{r, cache=TRUE}
# Linear-scale insert size plot for the first library
plot_insert_sizes(bam_list[1], log_scale = FALSE)
```

Or on the log scale, which emphasises the mono- and di-nucleosome-spanning fragments
```{r, cache=TRUE}
# Same library, with fragment counts on a log10 axis
plot_insert_sizes(bam_list[1], log_scale = TRUE)
```

And now make a plot of insert sizes with all samples together
```{r, cache=TRUE}
# Build one insert-size plot per BAM file, titled with its sample ID.
# seq_along() replaces 1:length(), which would iterate over c(1, 0) and fail
# if bam_list were empty.
gg_insert_plots <- lapply(seq_along(bam_list), function(i) {
    plot_insert_sizes(bam_file = bam_list[i],
                      log_scale = FALSE,
                      title = ids[i])
})

# Arrange all sample plots in a single grid
cowplot::plot_grid(plotlist = gg_insert_plots)
```

Now we get the Tn5 insert positions from the BAM files and load these into a GRangesList object
```{r, cache=TRUE}

# Read a BAM file and return the Tn5 insertion site of each alignment.
#
# Arguments:
#   bam_file  Path to a BAM file.
#   yield     Number of records read from the BAM file per chunk.
#
# Returns a GRanges of width-1 ranges, one per retained alignment (paired,
# non-duplicate), with the strand of every range set to "+". An input with
# no retained alignments gives an empty GRanges.
get_tn5_position <- function(bam_file, yield=100000){

    message(Sys.time())

    # Read the BAM file in chunks to preserve memory
    message(str_c("Reading ", bam_file))

    param <- ScanBamParam(flag = scanBamFlag(isPaired = TRUE, isDuplicate = FALSE))

    bf <- Rsamtools::BamFile(file = bam_file, yieldSize = yield)
    open(bf)
    # Release the file handle even if reading fails part-way through
    on.exit(close(bf), add = TRUE)

    # Collect the chunks in a list and concatenate once at the end, rather
    # than re-copying a growing GRanges on every iteration
    chunks <- list()
    repeat {
        chunk <- GenomicAlignments::readGAlignments(bf, param = param)

        if (length(chunk) == 0L)
            break
        chunks[[length(chunks) + 1L]] <- GenomicRanges::GRanges(chunk)
    }

    gr <- if (length(chunks) > 0L) {
        do.call(c, chunks)
    } else {
        GenomicRanges::GRanges()
    }

    message("Offset alignments to Tn5 insertion sites")
    # Shift by +4 (plus strand) or -5 (minus strand), then keep only the
    # first base relative to the strand
    pos <- gr[strand(gr) == "+"] %>%
        GenomicRanges::shift(shift=4) %>%
        resize(width = 1, fix = "start")

    neg <- gr[strand(gr) == "-"] %>%
        GenomicRanges::shift(shift = -5) %>%
        resize(width = 1, fix = "start")

    # Strand carries no further meaning once positions are single bases
    shift_gr <- c(pos, neg)
    strand(shift_gr) <- "+"

    message("Done!")
    return(shift_gr)
}

# One GRanges of Tn5 insertion sites per BAM file, held in a plain list
tn5_grl <- lapply(bam_list, get_tn5_position)
```

Plot Tn5 insertions relative to transcription start sites (TSS). What we are looking for here is strong enrichment of signal at TSSs, as these are regions of the genome that are typically characterised as accessible chromatin.
```{r, cache=TRUE}
## Get TSS positions
# All annotated hg38 transcripts from the UCSC knownGene TxDb
tx <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)

# Collapse each transcript to the single base at its start
tss <- tx %>% resize(width = 1, fix = "start")

# Rename sequences to Ensembl style; strand is then reset to "+" because the
# distance calculation below ignores it
seqlevelsStyle(tss) <- "Ensembl"
strand(tss) <- "+"

## Calculate distance of Tn5 insertions from TSS
# For every range in gr1, find the nearest range in gr2 (ignoring strand)
# and return start(gr1) minus the start of that nearest range.
#
# Returns an integer vector the same length as gr1. Elements are NA where
# no nearest range was found in gr2.
calc_rel_distance <- function(gr1, gr2){

    gr2_ind <- GenomicRanges::nearest(x = gr1, subject = gr2,
                                      ignore.strand = TRUE)

    # nearest() gives NA when gr2 has nothing to match a query against.
    # Subsetting gr2 with an NA index is an error, so only the matched
    # queries are filled in and the rest stay NA.
    has_hit <- !is.na(gr2_ind)
    dists <- rep(NA_integer_, length(gr1))
    dists[has_hit] <- start(gr1)[has_hit] - start(gr2)[gr2_ind[has_hit]]

    return(dists)

}


# Distance of every Tn5 insertion to its nearest TSS, one vector per sample
all_dists <- lapply(X = tn5_grl, FUN = calc_rel_distance, gr2 = tss)

# Plot the distance to the TSS for each sample
#
# Arguments:
#   dists  Numeric vector of distances to the nearest TSS.
#   title  Plot title.
#   flank  Only distances with absolute value <= flank are plotted.
#
# Returns a ggplot line plot of the frequency of each distance.
plot_tss_dists <- function(dists, title="", flank=1000){

    # Keep insertions within the flank; missing distances are dropped here
    # instead of being carried along as NA elements
    dists <- dists[!is.na(dists) & abs(dists) <= flank]

    # Tabulate, then turn the factor of distances back into numbers
    df <- table(dists) %>% as.data.frame()
    df$dists <- as.character(df$dists) %>% as.numeric()

    gg_dists <- ggplot(df, aes(x = dists, y = Freq)) +
        geom_line() +
        ylab("Frequency") +
        xlab("Distance from TSS") +
        ggtitle(title) +
        theme_bw()

    # Return explicitly: ending on the assignment returned the plot invisibly,
    # so a direct call at the console or in a chunk displayed nothing
    return(gg_dists)
}

# One TSS-distance plot per sample. The sample ID is passed as the title so
# the panels in the grid can be told apart (they were previously untitled).
tss_dist_plots <- lapply(seq_along(all_dists), function(i) {
    plot_tss_dists(all_dists[[i]], title = ids[i])
})

cowplot::plot_grid(plotlist = tss_dist_plots)

```


Now we can use the Tn5 insertion sites to make insertion-centered bed files for peak calling, and bigwig files for browser and plotting. By making the BED files in this way, we can centre the data on the Tn5 insertion sites which are the accessible chromatin regions. Peak callers typically accept BAM files, but the read is not strictly indicative of the accessible region, only the 5' end is. This will enable more accurate peak definition. 

The function below will make the Tn5 insertion centered BED files and bigwig files from the BAM files for downstream analyses. Note that we also normalise the bigwig file by sequencing depth, so counts are represented as counts per million (CPM), allowing for comparisons between samples.

Within this function, we also filter out reads mapping to the mitochondrial genome and those mapping to blacklisted regions. For more information on blacklisted regions of the genome, see https://www.nature.com/articles/s41598-019-45839-z.  
Blacklist for hg38 was downloaded from here http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/hg38-human/

**Note:** The example data here are for chromosome 22, and hence there are no mitochondrial sequences in this test dataset. Additionally, chr22 does not have any blacklisted regions so no data will be filtered. However, this is an important step when analysing a full dataset. 
```{r, cache=TRUE}

# Convert one ATAC-seq BAM file into a depth-normalised bigwig and a gzipped
# BED file of 50 bp windows anchored at the Tn5 insertion sites.
#
# Arguments:
#   bam             Path to a single BAM file; must end in ".bam".
#   mito_ids        Sequence names treated as mitochondrial and removed.
#   blacklist       If TRUE, alignments overlapping blacklist regions are
#                   removed.
#   blacklist_path  Path to a table with chrom, start, end in columns 1-3.
#   yield           Number of records read from the BAM file per chunk.
#
# Side effects: writes "<name>_atac.bw" and "<name>_atac.bed.gz" alongside
# the input, where <name> is the BAM path without ".bam".
# Returns NULL invisibly.
process_atac_bam <- function(bam, mito_ids=c("chrM", "MT"), blacklist=TRUE,
                             blacklist_path="reference/hg38.blacklist.bed",
                             yield=100000){

    message(Sys.time())

    # Derive output paths up front. The pattern is anchored and escaped so
    # that only the trailing ".bam" is replaced. If the path does not end in
    # ".bam" the replacement would be a no-op and the outputs would be
    # written over the input file, so that case is rejected.
    stopifnot(length(bam) == 1, str_detect(bam, "\\.bam$"))
    out_bw <- str_replace(string = bam, pattern = "\\.bam$",
                          replacement = "_atac.bw")
    out_bed <- str_replace(string = bam, pattern = "\\.bam$",
                           replacement = "_atac.bed.gz")

    # Read the BAM file in chunks to preserve memory
    message(str_c("Reading ", bam))

    bf <- Rsamtools::BamFile(file = bam, yieldSize = yield)
    open(bf)
    # Release the file handle even if reading fails part-way through
    on.exit(close(bf), add = TRUE)

    # Collect chunks in a list and concatenate once, instead of re-copying a
    # growing GRanges on every iteration
    chunks <- list()
    repeat {
        chunk <- GenomicAlignments::readGAlignments(bf)
        if (length(chunk) == 0L)
            break
        chunks[[length(chunks) + 1L]] <- GenomicRanges::GRanges(chunk)
    }

    gr <- if (length(chunks) > 0L) {
        do.call(c, chunks)
    } else {
        GenomicRanges::GRanges()
    }

    message("Filtering mitochondrial sequences")
    gr <- gr[!seqnames(gr) %in% mito_ids]

    if (isTRUE(blacklist)){

        message("Filtering blacklist regions")

        # Load the blacklist data
        bl_dat <- read.table(blacklist_path)
        blacklist_gr <- GRanges(seqnames = bl_dat[ ,1],
                                ranges = IRanges(start = bl_dat[ ,2],
                                                 end = bl_dat[ ,3]))

        # Ensure blacklist and granges have same seqlevels style
        seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(gr)[1]

        # Filter the blacklist reads
        pre_len <- length(gr)
        gr <- gr[!overlapsAny(gr, blacklist_gr)]
        post_len <- length(gr)
        filtered_reads <- pre_len - post_len
        message(str_c(filtered_reads, " blacklist reads filtered"))
    }

    message("Offset alignments to Tn5 insertion sites")
    # Shift by +4 (plus strand) or -5 (minus strand), then keep a 50 bp
    # window anchored at the start relative to the strand
    pos <- gr[strand(gr) == "+"] %>%
        GenomicRanges::shift(shift=4) %>%
        resize(width = 50, fix = "start")

    neg <- gr[strand(gr) == "-"] %>%
        GenomicRanges::shift(shift = -5) %>%
        resize(width = 50, fix = "start")

    shift_gr <- c(pos, neg)
    strand(shift_gr) <- "+"

    message("Calculating coverage")
    cov <- coverage(shift_gr)

    # Normalise by sequence depth (CPM)
    message("Normalising by depth")
    lib_size <- length(shift_gr)/1e6
    cov_cpm <- cov / lib_size

    # write tn5 centered atac bigwig
    message(str_c("Writing ", out_bw))

    export.bw(object = cov_cpm, con = out_bw)

    # write the tn5 centered bed file for peak calling
    # NOTE(review): GRanges starts are written unchanged, whereas BED starts
    # are conventionally 0-based. The rest of this workflow reads these files
    # back the same way; confirm before changing the convention in one place.
    dat <- as.data.frame(shift_gr)[ ,1:3]
    dat$name <- "."
    dat$score <- "."
    dat$strand <- "+"

    message(str_c("Writing ", out_bed))
    gz1 <- gzfile(out_bed, "w")

    # Close the connection whether or not the write succeeds
    tryCatch(
        write.table(x = dat, file = gz1, quote = FALSE, sep = "\t",
                    row.names = FALSE, col.names = FALSE),
        finally = close(gz1)
    )

    message(str_c("Completed ", Sys.time()))

    invisible(NULL)
}

# Write the bigwig and BED outputs for every BAM file
lapply(X = bam_list, FUN = process_atac_bam)
```

Using the BED files we created for each sample in the last step, we now call peaks for each sample using MACS3. All outputs from the peak caller will be saved in the folder `peaks/`
```{r, cache=TRUE}

# List the bed files for peak calling
# The pattern is escaped and anchored so only names ending in the literal
# ".bed.gz" are picked up
atac_bed <- list.files(path = "aligned_data/",
                       pattern = "\\.bed\\.gz$",
                       full.names = TRUE)

# Create an output directory for the peak files
# (no warning if it already exists, e.g. when the document is re-knitted)
dir.create("peaks", showWarnings = FALSE)

# Create a function for calling peaks for all samples
#
# `x` is an index into the global `atac_bed` vector of BED paths. Peaks are
# called with MACS via MACSr::callpeak(), output goes to "peaks/", and the
# value returned by callpeak() is passed back to the caller.
call_peaks_from_bed <- function(x){

    bed_path <- atac_bed[x]

    # Output basename: the file name with its compression suffix and
    # extension removed
    out_name <- tools::file_path_sans_ext(basename(bed_path),
                                          compression = TRUE)

    # Call the peaks
    peaks <- callpeak(tfile = bed_path,
                      name = out_name,
                      outdir = "peaks/",
                      format = "BED",
                      gsize = "hs",
                      keepduplicates = "all",
                      nomodel = TRUE)

    peaks
}

# Run the peak calling function for all of the bed files
# seq_along() gives an empty sequence when no BED files were found, where
# 1:length() would iterate over c(1, 0)
pk_logs <- lapply(X = seq_along(atac_bed), FUN = call_peaks_from_bed)

```

Let's have a look at the list of peak files
```{r, cache=TRUE}
# Paths of the narrowPeak files written by the peak caller. The pattern is
# escaped and anchored so only names ending in ".narrowPeak" are listed.
peak_fls <- list.files("peaks/", pattern = "\\.narrowPeak$", full.names = TRUE)

# Print the list so it can be inspected in the rendered document
peak_fls
```


Now that we have called the peaks for each sample, let's see how they look.

First, we need to read the peak files into a GRangesList
```{r, cache=TRUE}
# Read one peak file into a GRanges object.
#
# Columns 1-3 of the file are used as sequence name, start and end. Each
# range gets a `sample` metadata column holding the file name with the
# trailing "_peaks.narrowPeak" removed.
peak_to_gr <- function(peak_file){

    dat <- read.table(peak_file)

    # NOTE(review): starts are taken as-is from the file; narrowPeak is
    # presumably BED-style (0-based start). The rest of this workflow reads
    # and writes coordinates unchanged, so confirm before adjusting here.
    gr <- GRanges(seqnames = dat$V1,
                  ranges = IRanges(start = dat$V2, end = dat$V3))

    # The previous pattern "_peaks_narrowPeak" had an underscore where the
    # file names listed in this document have a dot, so it never matched
    # and the suffix was never removed
    gr$sample <- basename(peak_file) %>% str_remove("_peaks\\.narrowPeak$")

    return(gr)
}
```


```{r, cache=TRUE}
# Read every peak file and bundle the results into a GRangesList whose
# element names are the peak file names
peak_grl <- GRangesList(lapply(X = peak_fls, FUN = peak_to_gr))
names(peak_grl) <- basename(peak_fls)
```

First, let's count how many peaks were called for each sample.
```{r, cache=TRUE}
# Number of ranges (peaks) in each element of the list, i.e. per peak file
lengths(peak_grl)
```

How wide are these peaks?
```{r, cache=TRUE}
# Widths of all peaks in element `x` of the global `peak_grl`, returned as a
# data.frame with one row per peak and the element name in column `id`
get_peak_widths <- function(x){

    data.frame(width = as.numeric(width(peak_grl[[x]])),
               id = names(peak_grl[x]))
}

# Stack the per-sample width tables into one data.frame.
# seq_along() avoids iterating over c(1, 0) when peak_grl is empty.
peak_widths <- lapply(seq_along(peak_grl), get_peak_widths) %>%
    do.call(rbind, .)

# Histogram of peak widths, one panel per sample
gg_width_hist <- ggplot(data = peak_widths, aes(x = width)) +
    geom_histogram() +
    facet_wrap(.~id) +
    xlab("Peak width") +
    theme_bw()

gg_width_hist
```

How many peaks overlap between samples? Let's make an UpSet plot to inspect the overlaps
```{r, cache=TRUE}
# Make a union peak set for comparing overlaps
union_peak_gr <- unlist(peak_grl) %>% GenomicRanges::reduce()

# Count peak overlaps with union peak set
peak_hits <- lapply(peak_grl, function(x){overlapsAny(union_peak_gr, x)})

# Create a matrix of TRUE/FALSE for peak overlaps
peak_hits <- do.call(cbind, peak_hits)

# Shorten the column names; fixed() treats the suffix as a literal string
# so the "." is not a regex wildcard
colnames(peak_hits) <- names(peak_grl) %>%
    str_remove(fixed("_filt_fastp_chr22_dedup_atac_peaks.narrowPeak"))
rownames(peak_hits) <- as.character(union_peak_gr)

# Convert the TRUE/FALSE values into 1/0
peak_hits <- peak_hits + 0

# Convert to data.frame for plotting
peak_hits <- as.data.frame(peak_hits)

# Make an UpSet plot to visualise peak overlaps between samples,
# using every sample as a set
upset(data = peak_hits, nsets = ncol(peak_hits), nintersects = 16)


```

Save the union peak set as a bed file (to inspect later)
```{r, cache=TRUE}
# Write a GRanges object to a six-column, tab-separated BED-style file.
#
# Arguments:
#   gr        GRanges to export.
#   out_path  Destination file path.
#
# Columns written: seqname, start, end, "." (name), "." (score), strand.
# NOTE(review): starts are written unchanged from the GRanges, matching how
# the rest of this workflow reads and writes BED files.
gr_to_bed <- function(gr, out_path){
        dat <- as.data.frame(gr)[ ,1:3]

        # rep() keeps these assignments valid for an empty GRanges
        dat$name <- rep(".", nrow(dat))
        dat$score <- rep(".", nrow(dat))

        # BED marks an unstranded feature with ".", whereas GRanges uses "*"
        strand_chr <- as.character(strand(gr))
        strand_chr[strand_chr == "*"] <- "."
        dat$strand <- strand_chr

        write.table(x = dat, file = out_path, quote = FALSE, sep = "\t",
                    row.names = FALSE, col.names = FALSE)
}

# The output directory is only created further down in this document, so
# make sure it exists before writing into it
dir.create("processed_data", showWarnings = FALSE)

gr_to_bed(gr = union_peak_gr, out_path = "processed_data/atac_union_peaks.bed")
```


This presence/absence calling of peaks does not give us a lot of information, as some samples may have less noise, and more power for detecting peaks. For this reason, it is a good idea to get a normalised measure of reads in peaks, and then compare samples. 

Calculate the number of reads (Tn5 insertions) in peaks for each sample using the `.bed.gz` files we used for the peak calling
```{r, cache=TRUE}

# Function to calculate read overlaps. Returns a matrix of counts
#
# Arguments:
#   bed_file_list  Character vector of tab-delimited BED file paths.
#   peak_set       GRanges of peaks to count reads in.
#
# The result has one row per peak (named by the peak's coordinates) and one
# column per BED file (named by the file's basename).
calc_read_peak_olaps <- function(bed_file_list, peak_set){

    # Import a BED file as GRanges using only its first three columns
    bed_to_gr <- function(bed_path){
        bed_df <- read.table(bed_path, sep = "\t", header = FALSE)
        GRanges(seqnames = bed_df$V1,
                ranges = IRanges(start = bed_df$V2, end = bed_df$V3))
    }

    # For one BED file, count the reads overlapping each peak
    get_olaps <- function(bed_file){
        n_olaps <- countOverlaps(query = peak_set,
                                 subject = bed_to_gr(bed_file))
        stopifnot(length(n_olaps) == length(peak_set))
        n_olaps
    }

    # One count vector per file, bound together column-wise
    count_mat <- do.call(cbind, lapply(X = bed_file_list, FUN = get_olaps))

    colnames(count_mat) <- basename(bed_file_list)
    rownames(count_mat) <- as.character(peak_set)

    count_mat
}

# Count reads from every sample's BED file in each union peak
peak_counts <- calc_read_peak_olaps(bed_file_list = atac_bed,
                                    peak_set = union_peak_gr)

# Trim down the column names
# fixed() treats the suffix as a literal string, so its dots are not
# interpreted as regex wildcards
colnames(peak_counts) <- colnames(peak_counts) %>%
    str_remove(fixed("_filt_fastp_chr22_dedup_atac.bed.gz"))

head(peak_counts)
```

Save the peak counts table
```{r, cache=TRUE}
# Create the output directory if needed; no warning when it already exists
# (e.g. when the document is re-knitted)
dir.create("processed_data", showWarnings = FALSE)

# NOTE(review): with row.names = TRUE and col.names = TRUE the header line
# has one field fewer than the data lines. Confirm this suits whatever
# reads the file.
write.table(x = peak_counts, "processed_data/atac_peak_counts.tsv",
            quote = FALSE, row.names = TRUE, col.names = TRUE, sep = "\t")
```


Calculate reads in peaks and plot
```{r, cache=TRUE}
# Total number of reads falling in union peaks, per sample
peak_count_sums <- colSums(peak_counts)

# Horizontal bar chart of reads in peaks. ggplot() + geom_col() replaces
# qplot(), which is deprecated in current ggplot2 releases.
ggplot(data.frame(sample = names(peak_count_sums),
                  reads = as.numeric(peak_count_sums)),
       aes(x = sample, y = reads)) +
    geom_col() +
    xlab("Sample") +
    ylab("Reads in peaks") +
    coord_flip() + theme_bw()
```

Calculate Fraction of reads in peaks (FRiP) and plot 
```{r, cache=TRUE}
# Count the lines in a gzip-compressed text file.
#
# Arguments:
#   bed_path  Path to the file.
#
# Returns the number of lines as a numeric value (0 for an empty file).
count_file_lines <- function(bed_path){
    con <- gzfile(bed_path, open = "r")
    # Close the connection even if reading fails
    on.exit(close(con), add = TRUE)

    # Read in blocks so the whole file never has to sit in memory at once
    counts <- 0
    repeat {
        n_read <- length(readLines(con, n = 100000L))
        if (n_read == 0L)
            break
        counts <- counts + n_read
    }
    return(counts)
}

# Library size = number of lines (reads) in each sample's BED file.
# vapply() returns a plain numeric vector directly and fails loudly if any
# file does not yield a single count.
lib_sizes <- vapply(atac_bed, count_file_lines, numeric(1), USE.NAMES = FALSE)

# Guard against silent recycling if the two vectors ever differ in length
stopifnot(length(lib_sizes) == length(peak_count_sums))

FRiP <- peak_count_sums / lib_sizes

# Horizontal bar chart of FRiP. ggplot() + geom_col() replaces qplot(),
# which is deprecated in current ggplot2 releases.
ggplot(data.frame(sample = names(FRiP), frip = as.numeric(FRiP)),
       aes(x = sample, y = frip)) +
    geom_col() +
    xlab("Sample") +
    ylab("FRiP") +
    coord_flip() + theme_bw()
```

Now let's look at how the samples compare with respect to normalised read counts in peaks. If we want to do differential accessibility analyses, these plots will further help us evaluate our data. 

Calculate length-normalised, and library size normalised peak counts for plotting
```{r, cache=TRUE}
# log-scale RPKM per peak, using each union peak's width as its length
peak_rpkm <- rpkm(y = peak_counts,
                  gene.length = width(union_peak_gr),
                  prior.count = 1,
                  log = TRUE)

```

Boxplot peak rpkm
```{r, cache=TRUE}
# One box per column (sample) of the log RPKM matrix
boxplot(peak_rpkm)
```

Let's check how the samples look in a PCA
```{r, cache=TRUE}
# Principal component plot of the samples (columns) of a numeric matrix.
#
# Arguments:
#   mat    Numeric matrix with features in rows and samples in columns.
#   dim1   Principal component drawn on the y axis.
#   dim2   Principal component drawn on the x axis.
#   scale  Passed to prcomp() as `scale.`.
#
# Returns a ggplot scatter plot with one labelled point per sample; the
# axis labels give the percentage of variance explained by each component.
plot_pca <- function(mat, dim1=1, dim2=2, scale=TRUE){

        # Remove incomplete cases (drop = FALSE keeps a matrix even if only
        # one row survives)
        mat <- mat[complete.cases(mat), , drop = FALSE]

        # Transpose
        mat <- t(mat)

        # prcomp() refuses to scale a constant column, so features with no
        # variance across samples are removed first when scaling is on
        if (isTRUE(scale)) {
                feature_var <- apply(mat, 2, stats::var)
                mat <- mat[, !is.na(feature_var) & feature_var > 0,
                           drop = FALSE]
        }

        # Calculate PC's
        pr <- prcomp(x = mat, scale.=scale)

        # Row 2 of the importance table is read as the proportion of
        # variance per component; the summary is computed once and reused
        importance <- summary(pr)$importance
        pc1 <- (importance[2, dim1] * 100) %>% round(digits = 1)
        pc2 <- (importance[2, dim2] * 100) %>% round(digits = 1)

        pc1_dat <- pr$x[ ,dim1]
        pc2_dat <- pr$x[ ,dim2]
        samples <- rownames(pr$x)

        pca_df <- data.frame(Sample=samples, PC1=pc1_dat, PC2=pc2_dat)

        # Note the axis assignment: dim2 is on x and dim1 on y.
        # The label layer inherits pca_df; the former subset() kept every
        # row and so had no effect.
        gg_pca <-  ggplot(data = pca_df,
                          mapping = aes(x = PC2, y = PC1, label=Sample)) +
                geom_point(alpha=0.8, size=4) +
                theme_linedraw() +
                theme(panel.grid = element_line(colour = 'grey')) +
                geom_text_repel(point.padding = unit(1, "lines"), size=3) +
                xlab(str_c("PC", dim2, " (", pc2, "%)")) +
                ylab(str_c("PC", dim1, " (", pc1, "%)"))
        gg_pca
}


# PCA of the samples based on log RPKM in all union peaks
atac_pca <- plot_pca(mat = peak_rpkm)
print(atac_pca)
```

Here we see some variance in the read count distributions between libraries, and no clear separation of treatment groups. 
In these cases, removing peaks with low counts and between-sample normalisation often helps. 

Filter low-count peaks and plot PCA. Here we keep peaks with more than 5 reads in more than 3 samples (i.e. in at least 4 samples)
```{r, cache=TRUE}
# Keep peaks that have more than 5 reads in more than 3 samples
keep <- rowSums(peak_counts > 5) > 3
table(keep)

# Recompute log RPKM on the retained peaks only
peak_filt_rpkm <- rpkm(y = peak_counts[keep, ],
                       gene.length = width(union_peak_gr)[keep],
                       prior.count = 1,
                       log = TRUE)

plot_pca(mat = peak_filt_rpkm)
```

Here we see that PC2 separates the treatment and control groups. 

And now with quantile normalisation
```{r, cache=TRUE}
# Quantile-normalise the filtered log RPKM values across samples, then
# repeat the PCA
peak_filt_rpkm_norm <- normalizeBetweenArrays(object = peak_filt_rpkm,
                                              method = "quantile")
plot_pca(mat = peak_filt_rpkm_norm)
```

And now, PC1 separates the treatment and control groups. Here we see the power of group-agnostic peak filtering and normalisation to try and resolve as much biological signal as possible in our data. This will aid us when it comes to deciding on how to run our differential peak analysis.  

```{r}
# Record the R version and loaded package versions for reproducibility
sessionInfo()
```