Commit fa8d7f53 authored by FranziG's avatar FranziG
Browse files

upload day2 data

parent be09b507
This diff is collapsed.
---
title: "CourseData_2"
author: "Franziska Greulich"
date: "2022-08-26"
output: html_document
---
```{css, echo=FALSE}
p {
font-size: 20px;
}
```
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Peak Overlaps and annotations\
\
### Working with multiple files\
\
List of samples:\
\
```{r, message=FALSE}
library("stringr")
library("knitr")
targets <- data.frame(FileNames= intersect(list.files(path = "./data/MACS/",pattern = "blacklist",ignore.case = TRUE), list.files(path = "./data/MACS/",pattern = ".narrowPeak",ignore.case = TRUE)))
targets$experiment <- sub("_.*","",targets$FileNames)
targets$experiment <- ifelse(grepl("Gr",targets$experiment),"ChIPseq",ifelse(grepl("P",targets$experiment),"ChIPseq",ifelse(grepl("G",targets$experiment),"Cut&Run",targets$experiment)))
targets$antibody <- ifelse(grepl("H3K27AC",toupper(targets$FileNames)),"H3K27ac",ifelse(grepl("ATAC",toupper(targets$FileNames)),"none",ifelse(grepl("H3K27ME3",toupper(targets$FileNames)),"H3K27me3",ifelse(grepl("IGG",toupper(targets$FileNames)),"IgG",ifelse(grepl("H3K4ME3",toupper(targets$FileNames)),"H3K4me3",ifelse(grepl("INPUT",toupper(targets$FileNames)),"none","NA"))))))
targets$group <- ifelse(targets$experiment=="Cut&Run",substr(targets$FileNames, 2,2), ifelse(grepl("Gr5",targets$FileNames),"5",ifelse(grepl("P1",targets$FileNames),"1",ifelse(grepl("P2",targets$FileNames),"2",ifelse(grepl("PM",targets$FileNames),"PM",ifelse(grepl("KP",targets$FileNames),"KP",str_replace(targets$FileNames, pattern = ".*_(.)_.*", replacement = "\\1")))))))
#report table
kable(targets, caption = "The data:")
```
#### 1. Read all peaks into R and remove non-classical chromosomes. (see previous excercise)\
\
```{r}
#add the path to your file names
samples <- paste("./data/MACS/",targets$FileNames,sep="")
#import peaks
peaks <- lapply(samples, function(x) read.table(x, header = FALSE, stringsAsFactors = FALSE, sep="\t"))
#assign a name to each list element
names(peaks) <- targets$FileNames
#assign column names to each data frame within a list
for (i in 1:length(peaks)){
colnames(peaks[[i]]) <- c("chr","start","end","name","score","strand","fold-change","pvalue_log10","qvalue_log10","summit")
}
#here we add a column reporting the original peak number to our metadata frame targets
targets$peakNo_unfiltered <- "NA"
for (i in 1:length(peaks)){
targets[i,]$peakNo_unfiltered <- nrow(peaks[[i]])
}
#remove non-classical chromosomes
#1. define a vector of human chromosome names
chrHs <- c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY")
#filter
peaks <- lapply(peaks, FUN = function (x) { x[x$chr%in%chrHs,] } )
```
\
In contrast to our first exercise, we have a list of peaks now. Those can be imported into a list of `GRanges` using a slightly modified version of the `narrow2GRanges` function that includes the P-value and peak score rather than the fold-change and q-value. Convert the peak set in combination with the `lapply` function from base R.\
#### 2. Convert your peak list into a `GRangesList`.\
**Note**: Don't forget to load the `GenomicRanges` package and to define the `narrow2GRanges` function.\
\
```{r, message=FALSE}
library("GenomicRanges")
#define narrow2GRanges function
narrow2GRanges <-function(peaks)
{
myrange <- GRanges(seqnames=peaks[,1],range=IRanges(start=peaks[,2], end=peaks[,3], names=paste(peaks[,1],peaks[,2],peaks[,3],sep="_")), strand="*", score=peaks[,5], pvalue=peaks[,8])
return(myrange)
}
peaks_g <- lapply(peaks,function(x) {narrow2GRanges(x)})
```
\
#### 3. Remove the ENCODE blacklisted regions and report the percent of peak overlap with blacklisted regions as well as the number of peask after filtering.\
**Note**: Remember that you work with a LIST object.\
**Note**: As we used peaks that are cleaned from blacklisted regions in our example, the percent of filtered reads is 0. You can repeat the analysis with peaks not yet filtered to see the contribution of blacklisted regions.\
\
```{r}
#define bed2GRanges function
bed2GRanges <-function(peaks)
{
myrange <- GRanges(seqnames = peaks[,1],range = IRanges(start = peaks[,2], end = peaks[,3], names = peaks[,4]), strand = "*")
return(myrange)
}
#import blacklisted regions from bed file (downloaded from https://github.com/Boyle-Lab/Blacklist/tree/master/lists) and convert to GRange object
blacklist <- read.table("./data/hg38-blacklist.v2.bed",header=FALSE,stringsAsFactors = FALSE,fill=TRUE)
blacklist_g <- bed2GRanges(blacklist)
#calculate the percent overlap of all peaks with blacklisted regions
for (i in 1:length(peaks_g)) {
print(paste(names(peaks_g)[i],"overlaps ENCODE blacklisted regions to:", round(100*length(subsetByOverlaps(granges(peaks_g[[i]]),blacklist_g,minoverlap = 1,ignore.strand=TRUE))/length(granges(peaks_g[[i]])),2),"%.",sep=" "))
}
#remove blacklisted regions
peaks_g <- endoapply(peaks_g,function(x) {subsetByOverlaps(x,blacklist_g,invert = TRUE,minoverlap = 1,ignore.strand=TRUE)})
```
\
#### 4. Remove peaks that are wider than 2 kb as those are most likely aretefcats. Report the percentage of removed peaks. Plot the aggregated number of peaks per experiment and compare the peak width between all samples (aggregated by experiment).\
**Note**: I am using the `ggplot2` package for plotting the peak width as violin plot. The advantage of the Violin plot is that you can compare the mean as well as distribution of peak width between samples.\
\
```{r, message=FALSE}
#load library ggplot2
library("ggplot2")
library("plyranges")
peaks_g <- endoapply(peaks_g,function(x) plyranges::filter(x,width<2000))
#here we add a column reporting the peak number after filtering and the percentage of filtered peaks to our metadata targets
targets$peakNo_filtered <- "NA"
targets$percent_artefacts <- "NA"
for (i in 1:length(peaks_g)){
targets[i,]$peakNo_filtered <- length(peaks_g[[i]])
targets[i,]$percent_artefacts <- round(100-100*as.numeric(targets[i,]$peakNo_filtered)/as.numeric(targets[i,]$peakNo_unfiltered),2)
}
#extract peak width distribution per sample and store in a new data frame:
peakwidth <- data.frame()
for (i in 1:length(peaks_g)) {
width <- data.frame(width(peaks_g[[i]]))
width$sample <- names(peaks_g)[i]
peakwidth <- rbind(peakwidth,width)
}
colnames(peakwidth) <- c("width","sample")
peakwidth$sample <- as.factor(peakwidth$sample)
#add additional metadata to peakwidth for plotting
peakwidth$experiment <- sub("_.*","",peakwidth$sample)
peakwidth$experiment <- ifelse(grepl("Gr",peakwidth$experiment),"ChIPseq",ifelse(grepl("P",peakwidth$experiment),"ChIPseq",ifelse(grepl("G",peakwidth$experiment),"Cut&Run",peakwidth$experiment)))
peakwidth$antibody <- ifelse(grepl("H3K27AC",toupper(peakwidth$sample)),"H3K27ac",ifelse(grepl("ATAC",toupper(peakwidth$sample)),"none",ifelse(grepl("H3K27ME3",toupper(peakwidth$sample)),"H3K27me3",ifelse(grepl("IGG",toupper(peakwidth$sample)),"IgG",ifelse(grepl("H3K4ME3",toupper(peakwidth$sample)),"H3K4me3",ifelse(grepl("INPUT",toupper(peakwidth$sample)),"none","NA"))))))
peakwidth$group <- paste(peakwidth$experiment,peakwidth$antibody,sep="_")
#define plotting order
peakwidth$group <- factor(peakwidth$group,level=c("ATAC_none","ChIPseq_none","ChIPseq_IgG","Cut&Run_IgG","CT_IgG","ChIPseq_H3K27ac","Cut&Run_H3K27ac","CT_H3K27ac","ChIPseq_H3K27me3","Cut&Run_H3K27me3","CT_H3K27me3","CT_H3K4me3"))
#plot peak width distribution for all samples as histogram
ggplot(peakwidth, aes(x=group, y = width, fill=experiment)) +
geom_violin() +
stat_summary(fun = "median",geom = "crossbar",width = 0.5,colour = "red")+
scale_color_brewer(palette="Dark2")+
ggtitle("Peak width distribution between samples")+
# ylim(0,500)+
theme_bw()+
theme(axis.text.x = element_text(angle = 90))
#add mean peakwidth per experiment to metadata targets
targets$width_median <- "NA"
for (i in 1:length(peaks_g)){
targets[i,]$width_median <- median(width(peaks_g[[i]]))
}
#show metadata
kable(targets, caption = "The data:")
```
\
### 2. Look at the similarity of occupied regions between samples
We use the Jaccard index to compute the pair-wise overlaps between all samples. This index lies between 0 (no overlap) and 1 (100% overlap). Fur this purpose, we generate a function called `jaccard` that computes the jaccard index for two peak sets.\
\
```{r, message=FALSE, warnings=FALSE}
#define a function computing the Jaccard index
jaccard <- function(set1, set2){
intersect <- GenomicRanges::intersect(set1, set2)
union <- append(GenomicRanges::intersect(set1, set2), subsetByOverlaps(set1, set2,invert=TRUE,ignore.strand=TRUE, minoverlap=10))
union <- append(union,subsetByOverlaps(set2, set1,invert=TRUE,ignore.strand=TRUE, minoverlap=10))
jaccard <- length(intersect)/length(union)
return(jaccard)
}
jaccardM = readRDS("jaccard.rds")
#compute the Jaccard indices for all samples in peaks_g and store as matrix jaccarM
#jaccardM <- endoapply(peaks_g ,function(x){endoapply(peaks_g, function(y){jaccard(x,y)})})
#jaccardM <- matrix(unlist(do.call("cbind",jaccardM)),ncol=length(peaks_g))
#colnames(jaccardM) <- sub("*_wo_blacklist.narrowPeak","",names(peaks_g))
#rownames(jaccardM) <- sub("*_wo_blacklist.narrowPeak","",names(peaks_g))
#plot jaccard index as heatmap
#load packaes required for plotting
library("gplots")
library("RColorBrewer")
#define color scale for heatmap
col <- colorRampPalette(brewer.pal(10, "Greens"))(256)
#heatmap of jaccard indices
heatmap.2(jaccardM,
scale = "none",
col = col,
trace = "none",
density.info = "none",
xlab = "peak set",
ylab ="peak set",
cexRow=.3,
cexCol=.3,
margins=c(5,5),
main="Jaccard index")
```
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment