Last updated: 2018-07-26
workflowr checks: (Click a bullet for more information) ✔ R Markdown file: up-to-date
Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.
✔ Environment: empty
Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.
✔ Seed:
set.seed(20180613)
The command set.seed(20180613)
was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.
✔ Session information: recorded
Great job! Recording the operating system, R version, and package versions is critical for reproducibility.
✔ Repository version: 962aead
wflow_publish
or wflow_git_commit
). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
Ignored files:
Ignored: .DS_Store
Ignored: .Rhistory
Ignored: .Rproj.user/
Ignored: code_RCC/.DS_Store
Ignored: data/.DS_Store
Untracked files:
Untracked: workflow_setup.R
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes.
File | Version | Author | Date | Message |
---|---|---|---|---|
html | 94153b3 | kevinlkx | 2018-07-26 | Build site. |
Rmd | 4d492e6 | kevinlkx | 2018-07-26 | compare centipede predictions for CTCF in DiffAC regions |
html | ec5a6ef | kevinlkx | 2018-07-25 | Build site. |
Rmd | c628625 | kevinlkx | 2018-07-25 | compare centipede predictions for CTCF in DiffAC regions |
library(ggplot2)
library(grid)
library(gridExtra)
suppressPackageStartupMessages(library(GenomicRanges))
library(limma)
Attaching package: 'limma'
The following object is masked from 'package:BiocGenerics':
plotMA
library(edgeR)
library(VennDiagram)
Warning: package 'VennDiagram' was built under R version 3.4.4
Loading required package: futile.logger
message <- futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
## venn diagram
plot_venn_overlaps <- function(overlaps.m, title = "", col_fill = NULL, category.names = NULL){
grid.newpage()
overlaps_venn.l <- lapply(as.data.frame(overlaps.m), function(x) which(x == 1))
if(is.null(col_fill)){
col_fill <- 1:length(overlaps_venn.l)
}
if(is.null(category.names)){
category.names <- names(x)
}
venn.plot <- venn.diagram(
x = overlaps_venn.l,
category.names = category.names,
filename = NULL,
fill = col_fill,
alpha=rep(0.5, length(overlaps_venn.l)),
cex = 1.5,
cat.fontface=4,
main=title)
grid.draw(venn.plot)
}
tf_name <- "CTCF"
pwm_name <- "CTCF_MA0139.1_1e-4"
thresh_PostPr_bound <- 0.99
flank <- 100
cat("PWM name: ", pwm_name, "\n")
PWM name: CTCF_MA0139.1_1e-4
diffAC_regions.df <- read.csv("~/Dropbox/research/ATAC_DNase/ATAC-seq_Olivia_Gray/results/DiffAC_regions/ordered_results_withcoords.csv")
cat(nrow(diffAC_regions.df), "regions in total \n")
138927 regions in total
diffAC_regions.df <- diffAC_regions.df[, c("chr", "Start", "End","GeneID", "baseMean", "Strand", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
diffAC_sig_regions.df <- diffAC_regions.df[diffAC_regions.df$padj < 0.1, ]
cat(nrow(diffAC_sig_regions.df), "significant regions \n")
2390 significant regions
hist(diffAC_regions.df$log2FoldChange, xlab = "log2FoldChange", main = "Differentially open regions (FDR < 10%)")
Version | Author | Date |
---|---|---|
ec5a6ef | kevinlkx | 2018-07-25 |
diffAC_sigH_regions.df <- diffAC_sig_regions.df[diffAC_sig_regions.df$log2FoldChange > 0, ]
cat(nrow(diffAC_sigH_regions.df), "regions differentially open in hypoxia. \n")
201 regions differentially open in hypoxia.
diffAC_sigN_regions.df <- diffAC_sig_regions.df[diffAC_sig_regions.df$log2FoldChange < 0, ]
cat(nrow(diffAC_sigN_regions.df), "regions differentially open in normoxia. \n")
2189 regions differentially open in normoxia.
diffAC_sig_regions.gr <- makeGRangesFromDataFrame(diffAC_sig_regions.df, start.field = "Start", end.field = "End", keep.extra.columns = T)
diffAC_sigH_regions.gr <- makeGRangesFromDataFrame(diffAC_sigH_regions.df, start.field = "Start", end.field = "End", keep.extra.columns = T)
diffAC_sigN_regions.gr <- makeGRangesFromDataFrame(diffAC_sigN_regions.df, start.field = "Start", end.field = "End", keep.extra.columns = T)
dir_predictions <- paste0("~/Dropbox/research/ATAC_DNase/ATAC-seq_Olivia_Gray/results/centipede_predictions/", pwm_name)
## condition: N
bam_namelist_N <- c("N1_nomito_rdup.bam", "N2_nomito_rdup.bam", "N3_nomito_rdup.bam")
site_predictions_N.l <- vector("list", 3)
names(site_predictions_N.l) <- bam_namelist_N
for(i in 1:length(bam_namelist_N)){
bam_basename <- tools::file_path_sans_ext(basename(bam_namelist_N[[i]]))
site_predictions_N.l[[i]] <- read.table(paste0(dir_predictions, "/", pwm_name, "_", bam_basename, "_predictions.txt"), header = T, stringsAsFactors = F)
}
CentPostPr_N.df <- data.frame(N1 = site_predictions_N.l[[1]]$CentPostPr,
N2 = site_predictions_N.l[[2]]$CentPostPr,
N3 = site_predictions_N.l[[3]]$CentPostPr)
CentLogRatios_N.df <- data.frame(N1 = site_predictions_N.l[[1]]$CentLogRatios,
N2 = site_predictions_N.l[[2]]$CentLogRatios,
N3 = site_predictions_N.l[[3]]$CentLogRatios)
## condition: H
bam_namelist_H <- c("H1_nomito_rdup.bam", "H2_nomito_rdup.bam", "H3_nomito_rdup.bam")
site_predictions_H.l <- vector("list", 3)
names(site_predictions_H.l) <- bam_namelist_H
for(i in 1:length(bam_namelist_H)){
bam_basename <- tools::file_path_sans_ext(basename(bam_namelist_H[[i]]))
site_predictions_H.l[[i]] <- read.table(paste0(dir_predictions, "/", pwm_name, "_", bam_basename, "_predictions.txt"), header = T, stringsAsFactors = F)
}
CentPostPr_H.df <- data.frame(H1 = site_predictions_H.l[[1]]$CentPostPr,
H2 = site_predictions_H.l[[2]]$CentPostPr,
H3 = site_predictions_H.l[[3]]$CentPostPr)
CentLogRatios_H.df <- data.frame(H1 = site_predictions_H.l[[1]]$CentLogRatios,
H2 = site_predictions_H.l[[2]]$CentLogRatios,
H3 = site_predictions_H.l[[3]]$CentLogRatios)
if(any(site_predictions_N.l[[1]]$name != site_predictions_H.l[[1]]$name)){
stop("sites not match!")
}
sites.df <- site_predictions_N.l[[1]][,1:7]
## get motif coordinates
if(sites.df[1, "end"] - sites.df[1, "start"] > flank){
sites.df$start <- sites.df$start + flank
sites.df$end <- sites.df$end - flank
}
sites.gr <- makeGRangesFromDataFrame(sites.df, start.field = "start", end.field = "end", keep.extra.columns = F)
CentPostPr.df <- cbind(CentPostPr_N.df, CentPostPr_H.df)
CentLogRatios.df <- cbind(CentLogRatios_N.df, CentLogRatios_H.df)
sites_CentPostPr.df <- cbind(sites.df, CentPostPr_N.df, CentPostPr_H.df)
sites_CentLogRatios.df <- cbind(sites.df, CentLogRatios_N.df, CentLogRatios_H.df)
overlaps_diffAC.df <- as.data.frame(findOverlaps(query = sites.gr, subject = diffAC_sig_regions.gr, type = "within", ignore.strand = T))
idx_sites_diffAC <- unique(overlaps_diffAC.df$queryHits)
cat(length(idx_sites_diffAC), "candidate motif sites differentially open in hypoxia or normoxia. \n")
412 candidate motif sites differentially open in hypoxia or normoxia.
overlaps_sigH.df <- as.data.frame(findOverlaps(query = sites.gr, subject = diffAC_sigH_regions.gr, type = "within", ignore.strand = T))
idx_sites_sigH <- unique(overlaps_sigH.df$queryHits)
cat(length(idx_sites_sigH), "candidate motif sites differentially open in hypoxia. \n")
22 candidate motif sites differentially open in hypoxia.
overlaps_sigN.df <- as.data.frame(findOverlaps(query = sites.gr, subject = diffAC_sigN_regions.gr, type = "within", ignore.strand = T))
idx_sites_sigN <- unique(overlaps_sigN.df$queryHits)
cat(length(idx_sites_sigN), "candidate motif sites differentially open in normoxia. \n")
390 candidate motif sites differentially open in normoxia.
cat("Number of bound sites that are differentially open in hypoxia: \n")
Number of bound sites that are differentially open in hypoxia:
colSums(CentPostPr.df[idx_sites_sigH, ] > thresh_PostPr_bound)
N1 N2 N3 H1 H2 H3
14 16 13 18 15 17
cat("Number of bound sites that are differentially open in normoxia: \n")
Number of bound sites that are differentially open in normoxia:
colSums(CentPostPr.df[idx_sites_sigN, ] > thresh_PostPr_bound)
N1 N2 N3 H1 H2 H3
385 372 362 176 148 227
# binding probablity
par(pty="s")
plot(rowMeans(CentPostPr_N.df), rowMeans(CentPostPr_H.df),
xlab = "N average P(Bound)", ylab = "H average P(Bound)", main = tf_name,
pch = ".", col = rgb(0,0,1,0.7))
abline(a=0, b=1, col = "darkgray")
Version | Author | Date |
---|---|---|
ec5a6ef | kevinlkx | 2018-07-25 |
# logRatios
par(mfrow = c(1,2))
par(pty="s")
plot(rowMeans(CentLogRatios_N.df), rowMeans(CentLogRatios_H.df),
xlab = "N average logRatios", ylab = "H average logRatios", main = tf_name,
pch = ".", col = rgb(0,0,1,0.7))
abline(a=0,b=1,col = "darkgray")
plot(x = (rowMeans(CentLogRatios_H.df)+rowMeans(CentLogRatios_N.df))/2,
y = rowMeans(CentLogRatios_H.df) - rowMeans(CentLogRatios_N.df),
xlab = "average logRatios", ylab = "Difference in logRatios (H - N)", main = tf_name,
pch = ".", col = rgb(0,0,1,0.7))
abline(v=0, h=0, col = "darkgray")
Version | Author | Date |
---|---|---|
ec5a6ef | kevinlkx | 2018-07-25 |
cat(length(idx_sites_sigH), "candidate motif sites differentially open in hypoxia. \n")
22 candidate motif sites differentially open in hypoxia.
# binding probablity
par(pty="s")
plot(rowMeans(CentPostPr_N.df[idx_sites_sigH,]), rowMeans(CentPostPr_H.df[idx_sites_sigH,]),
xlab = "N average P(Bound)", ylab = "H average P(Bound)", main = paste(tf_name, "bound sites"),
pch = 20, col = rgb(0,0,1,0.7))
abline(a=0, b=1, col = "darkgray")
Version | Author | Date |
---|---|---|
ec5a6ef | kevinlkx | 2018-07-25 |
# logRatios
par(mfrow = c(1,2))
par(pty="s")
plot(rowMeans(CentLogRatios_N.df[idx_sites_sigH,]), rowMeans(CentLogRatios_H.df[idx_sites_sigH,]),
xlab = "N average logRatios", ylab = "H average logRatios", main = tf_name,
pch = 20, col = rgb(0,0,1,0.7))
abline(a=0,b=1,col = "darkgray")
plot(x = (rowMeans(CentLogRatios_H.df[idx_sites_sigH,])+rowMeans(CentLogRatios_N.df[idx_sites_sigH,]))/2,
y = rowMeans(CentLogRatios_H.df[idx_sites_sigH,]) - rowMeans(CentLogRatios_N.df[idx_sites_sigH,]),
xlab = "average logRatios", ylab = "Difference in logRatios (H - N)", main = tf_name,
pch = 20, col = rgb(0,0,1,0.7))
abline(v=0, h=0, col = "darkgray")
Version | Author | Date |
---|---|---|
ec5a6ef | kevinlkx | 2018-07-25 |
cat(length(idx_sites_sigN), "candidate motif sites differentially open in normoxia \n")
390 candidate motif sites differentially open in normoxia
# binding probablity
par(pty="s")
plot(rowMeans(CentPostPr_N.df[idx_sites_sigN,]), rowMeans(CentPostPr_H.df[idx_sites_sigN,]),
xlab = "N average P(Bound)", ylab = "H average P(Bound)", main = paste(tf_name, "bound sites"),
pch = 20, col = rgb(0,0,1,0.7))
abline(a=0, b=1, col = "darkgray")
Version | Author | Date |
---|---|---|
ec5a6ef | kevinlkx | 2018-07-25 |
# logRatios
par(mfrow = c(1,2))
par(pty="s")
plot(rowMeans(CentLogRatios_N.df[idx_sites_sigN,]), rowMeans(CentLogRatios_H.df[idx_sites_sigN,]),
xlab = "N average logRatios", ylab = "H average logRatios", main = tf_name,
pch = 20, col = rgb(0,0,1,0.7))
abline(a=0,b=1,col = "darkgray")
plot(x = (rowMeans(CentLogRatios_H.df[idx_sites_sigN,])+rowMeans(CentLogRatios_N.df[idx_sites_sigN,]))/2,
y = rowMeans(CentLogRatios_H.df[idx_sites_sigN,]) - rowMeans(CentLogRatios_N.df[idx_sites_sigN,]),
xlab = "average logRatios", ylab = "Difference in logRatios (H - N)", main = tf_name,
pch = 20, col = rgb(0,0,1,0.7))
abline(v=0, h=0, col = "darkgray")
Version | Author | Date |
---|---|---|
ec5a6ef | kevinlkx | 2018-07-25 |
targets <- data.frame(bam = c(bam_namelist_N, bam_namelist_H),
label = colnames(CentLogRatios.df),
condition = rep(c("N", "H"), each = 3))
print(targets)
bam label condition
1 N1_nomito_rdup.bam N1 N
2 N2_nomito_rdup.bam N2 N
3 N3_nomito_rdup.bam N3 N
4 H1_nomito_rdup.bam H1 H
5 H2_nomito_rdup.bam H2 H
6 H3_nomito_rdup.bam H3 H
condition <- factor(targets$condition, levels = c("N", "H"))
design <- model.matrix(~0+condition)
colnames(design) <- levels(condition)
print(design)
N H
1 1 0
2 1 0
3 1 0
4 0 1
5 0 1
6 0 1
attr(,"assign")
[1] 1 1
attr(,"contrasts")
attr(,"contrasts")$condition
[1] "contr.treatment"
CentLogRatios_diffAC.df <- CentLogRatios.df[idx_sites_diffAC, ]
fit <- lmFit(CentLogRatios_diffAC.df, design)
contrasts <- makeContrasts(H-N, levels=design)
fit2 <- contrasts.fit(fit, contrasts)
fit2 <- eBayes(fit2, trend=TRUE)
num_diffbind <- summary(decideTests(fit2))
percent_diffbind <- round(num_diffbind / sum(num_diffbind) * 100, 2)
cat(num_diffbind[1], "sites differentially open in normoxia (", percent_diffbind[1], "%) \n",
num_diffbind[3], "sites differentially open in hypoxia (", percent_diffbind[3], "%) \n",
num_diffbind[2], "sites not significantly different (", percent_diffbind[2], "%) \n")
388 sites differentially open in normoxia ( 94.17 %)
3 sites differentially open in hypoxia ( 0.73 %)
21 sites not significantly different ( 5.1 %)
# volcanoplot(fit2, main="H vs. N", xlab = "Difference in logRatios (H - N)")
plot(x = fit2$coef, y = -log10(fit2$p.value),
xlab = "Difference in logRatios (H - N)", ylab = "-log10(P-value)", main= paste(tf_name, "H vs. N"),
pch = 16, cex = 0.35)
sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats4 grid stats graphics grDevices utils
[8] datasets methods base
other attached packages:
[1] VennDiagram_1.6.20 futile.logger_1.4.3 edgeR_3.20.9
[4] limma_3.34.9 GenomicRanges_1.30.3 GenomeInfoDb_1.14.0
[7] IRanges_2.12.0 S4Vectors_0.16.0 BiocGenerics_0.24.0
[10] gridExtra_2.3 ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.16 formatR_1.5 XVector_0.18.0
[4] compiler_3.4.3 pillar_1.2.2 git2r_0.21.0
[7] plyr_1.8.4 workflowr_1.1.1 futile.options_1.0.1
[10] zlibbioc_1.24.0 R.methodsS3_1.7.1 R.utils_2.6.0
[13] bitops_1.0-6 tools_3.4.3 digest_0.6.15
[16] lattice_0.20-35 evaluate_0.10.1 tibble_1.4.2
[19] gtable_0.2.0 rlang_0.2.0 yaml_2.1.18
[22] GenomeInfoDbData_1.0.0 stringr_1.3.0 knitr_1.20
[25] locfit_1.5-9.1 rprojroot_1.3-2 rmarkdown_1.9
[28] lambda.r_1.2.2 magrittr_1.5 whisker_0.3-2
[31] splines_3.4.3 backports_1.1.2 scales_0.5.0
[34] htmltools_0.3.6 colorspace_1.3-2 stringi_1.1.7
[37] RCurl_1.95-4.10 lazyeval_0.2.1 munsell_0.4.3
[40] R.oo_1.22.0
This reproducible R Markdown analysis was created with workflowr 1.1.1