# Time-stamp:
## This block is used to render pdf/html reports.
## There are a few different output formats available, a pdf and 2 html versions.
## I run the reports simply by hitting control-c control-n on the line I want to generate.
library(hpgltools)
save(list=ls(all=TRUE), file="RData")
rm(list=ls())
load("RData")
render("rnaseq_cbf5.rmd", output_format="pdf_document")
render("rnaseq_cbf5.rmd", output_format="html_document")
render("rnaseq_cbf5.rmd", 'knitrBootstrap::bootstrap_document')
knitr::opts_knit$set(progress=TRUE, verbose=TRUE, error=TRUE, stop_on_error=FALSE, fig.width=7, fig.height=7)
Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the data.
## These first 4 lines are not needed once hpgltools is installed.
source("http://bioconductor.org/biocLite.R")
biocLite("devtools")
library(devtools)
install_github("elsayed-lab/hpgltools")
library(hpgltools)
autoloads_all()
reference <- "reference/scerevisiae.gff.gz"
cbf5_expt <- create_expt(file="cbf5_samples.csv", suffix="_forward-trimmed-v0M1.count.gz", by_sample=TRUE, sep=",")
genelengths <- get_genelengths(reference)
goids <- read.delim("reference/go_trimmed.csv.gz", sep=",", header=FALSE)
##colnames(goids) <- c("ID","GO","Yeast_ID")
colnames(goids) <- c("unknown","GO","go_geneid")
annotation_info <- gff2df(reference)
annot <- annotation_info[, c("gene_id", "transcript_name")]
goids <- goids[, c("go_geneid","GO")]
goids <- merge(goids, annot, by.x="go_geneid", by.y="gene_id", all.x=TRUE)
goids_na <- !is.na(goids$transcript_name)
goids <- goids[goids_na, ]
goids <- goids[, c("transcript_name", "GO")]
colnames(goids) <- c("ID","GO")
crossref = unique(annotation_info[, c("transcript_name", "transcript_id")])
rownames(annotation_info) = make.names(annotation_info$transcript_name, unique=TRUE)
There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper 'graph_metrics()' but that takes away the chance to explain the graphs as I generate them.
full_design <- cbf5_expt$design
raw_metrics <- graph_metrics(cbf5_expt, qq=TRUE)
## [1] "There is just one batch in this data."
raw_metrics$libsize
raw_metrics$nonzero
raw_metrics$boxplot
## Warning: Removed 4008 rows containing non-finite values (stat_boxplot).
raw_metrics$corheat
raw_metrics$disheat
raw_metrics$smc
raw_metrics$pcaplot
raw_metrics$density
## Warning: Removed 4008 rows containing non-finite values (stat_density).
raw_metrics$qqlog
In most cases, raw data does not cluster very well, lets see if that is also true for the cbf5 experiment. Assuming it doesn't, lets normalize the data using the defaults (cpm, quantile, log2) and try again.
## So, normalize the data
l2qnorm <- normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE)
l2q_metrics <- graph_metrics(l2qnorm)
## [1] "There is just one batch in this data."
l2q_metrics$pcaplot
l2q_metrics$density
l2q_metrics$pcavar
## [1] 63.44 15.78 5.89 4.88 3.56 3.34 3.11
l2q_metrics$pcares
## propVar cumPropVar cond.R2
## 1 63.44 63.44 93.79
## 2 15.78 79.22 3.68
## 3 5.89 85.11 2.16
## 4 4.88 89.99 0.29
## 5 3.56 93.55 0.03
## 6 3.34 96.89 0.05
## 7 3.11 100.00 0.00
l2tmm <- normalize_expt(cbf5_expt, transform="log2", norm="tmm", convert="cpm", filter_low="pofa")
hpgl_density(l2tmm)
funkytown <- pca_information(l2qnorm, plot_pcas=TRUE)
## [1] "PC1: 63.44% variance"
## [1] "PC2: 15.78% variance"
funkytown$pca_plots$PC1_PC2$plot
## You may call the pairwise comparisons singly
##limma_analysis <- limma_pairwise(cbf5_expt, model_batch=FALSE)
##deseq_analysis <- deseq2_pairwise(cbf5_expt, model_batch=FALSE)
##edger_analysis <- edger_pairwise(cbf5_expt, model_batch=FALSE)
##basic_analysis <- basic_pairwise(cbf5_expt)
fun = all_pairwise(cbf5_expt, batches=NULL, model_batch=FALSE)
fun$comparison$comp
## wt_vs_mut
## le 0.9732847
## ld 0.9847753
## ed 0.9509408
## lb 0.9772878
## eb 0.9565188
## db 0.9716832
## limma, edgeR, DESeq2, and my stupid basic analysis all agree very much for this data.
keepers <- list("mutvwt" = c("mut","wt"))
combined <- combine_de_tables(fun, excel="excel/contrasts.xlsx", annot_df=annotation_info, keepers=keepers)
## table total limma_up limma_sigup deseq_up deseq_sigup edger_up
## 2 wt_vs_mut 6697 472 437 230 228 340
## edger_sigup basic_up basic_sigup limma_down limma_sigdown deseq_down
## 2 265 500 437 386 350 441
## deseq_sigdown edger_down edger_sigdown basic_down basic_sigdown meta_up
## 2 438 590 512 389 238 326
## meta_sigup meta_down meta_sigdown
## 2 309 472 444
## Warning: Removed 1087 rows containing non-finite values (stat_smooth).
## Warning: Removed 1087 rows containing missing values (geom_point).
## NULL
significant_summary <- extract_significant_genes(combined$data, excel="excel/significant_contrasts.xlsx")
I have 4 ontology tools we can try applying to this as well as the KEGG pathways.
## Test gprofiler
## gprofiler(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")
uppers_names <- uppers[["transcript_name"]]
profiler_go <- gProfileR::gprofiler(uppers_names, organism="scerevisiae")
profiler_kegg <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="KEGG")
profiler_reactome <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="REAC")
profiler_tf <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="TF")
profiler_mi <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="MI")
profiler_corum <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="CORUM")
kept_ontology <- subset_ontology_search(significant_summary, lengths=genelengths, goids=goids, gff=reference, gff_type="gene", universe_merge="transcript_name")
## Warning in pcls(G): initial point very close to some inequality constraints
## The minimum observed pvalue is: 0.000002
## The minimum observed pvalue is: 0.000002
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000010
## The minimum observed pvalue is: 0.000010
## The minimum observed pvalue is: 0.000901
## The minimum observed pvalue is: 0.000901
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000078
## The minimum observed pvalue is: 0.000078
##
## Building most specific GOs ..... ( 1818 GO terms found. )
##
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 2645 GO terms found. )
##
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 698 GO terms found. )
##
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 1818 GO terms found. )
##
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 2645 GO terms found. )
##
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 698 GO terms found. )
##
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## -- Classic Algorithm --
##
## the algorithm is scoring 695 nontrivial nodes
## parameters:
## test statistic: Fisher test
##
## -- Classic Algorithm --
##
## the algorithm is scoring 1928 nontrivial nodes
## parameters:
## test statistic: Fisher test
##
## -- Classic Algorithm --
##
## the algorithm is scoring 359 nontrivial nodes
## parameters:
## test statistic: Fisher test
##
## -- Classic Algorithm --
##
## the algorithm is scoring 2293 nontrivial nodes
## parameters:
## test statistic: KS tests
## score order: increasing
##
## -- Classic Algorithm --
##
## the algorithm is scoring 4691 nontrivial nodes
## parameters:
## test statistic: KS tests
## score order: increasing
##
## -- Classic Algorithm --
##
## the algorithm is scoring 915 nontrivial nodes
## parameters:
## test statistic: KS tests
## score order: increasing
##
## -- Elim Algorithm --
##
## the algorithm is scoring 2293 nontrivial nodes
## parameters:
## test statistic: Fisher test
## cutOff: 0.01
## score order: increasing
##
## Level 15: 2 nodes to be scored (0 eliminated genes)
##
## Level 14: 8 nodes to be scored (0 eliminated genes)
##
## Level 13: 7 nodes to be scored (17 eliminated genes)
##
## Level 12: 25 nodes to be scored (28 eliminated genes)
##
## Level 11: 45 nodes to be scored (40 eliminated genes)
##
## Level 10: 90 nodes to be scored (72 eliminated genes)
##
## Level 9: 145 nodes to be scored (229 eliminated genes)
##
## Level 8: 295 nodes to be scored (900 eliminated genes)
##
## Level 7: 432 nodes to be scored (1270 eliminated genes)
##
## Level 6: 647 nodes to be scored (1592 eliminated genes)
##
## Level 5: 348 nodes to be scored (2122 eliminated genes)
##
## Level 4: 180 nodes to be scored (2913 eliminated genes)
##
## Level 3: 55 nodes to be scored (3166 eliminated genes)
##
## Level 2: 13 nodes to be scored (3911 eliminated genes)
##
## Level 1: 1 nodes to be scored (4476 eliminated genes)
##
## -- Elim Algorithm --
##
## the algorithm is scoring 4691 nontrivial nodes
## parameters:
## test statistic: Fisher test
## cutOff: 0.01
## score order: increasing
##
## Level 20: 1 nodes to be scored (0 eliminated genes)
##
## Level 19: 1 nodes to be scored (0 eliminated genes)
##
## Level 18: 1 nodes to be scored (0 eliminated genes)
##
## Level 17: 4 nodes to be scored (0 eliminated genes)
##
## Level 16: 27 nodes to be scored (38 eliminated genes)
##
## Level 15: 83 nodes to be scored (38 eliminated genes)
##
## Level 14: 161 nodes to be scored (50 eliminated genes)
##
## Level 13: 240 nodes to be scored (245 eliminated genes)
##
## Level 12: 358 nodes to be scored (692 eliminated genes)
##
## Level 11: 473 nodes to be scored (1315 eliminated genes)
##
## Level 10: 597 nodes to be scored (1736 eliminated genes)
##
## Level 9: 653 nodes to be scored (2269 eliminated genes)
##
## Level 8: 589 nodes to be scored (2767 eliminated genes)
##
## Level 7: 528 nodes to be scored (3690 eliminated genes)
##
## Level 6: 459 nodes to be scored (4144 eliminated genes)
##
## Level 5: 302 nodes to be scored (4725 eliminated genes)
##
## Level 4: 145 nodes to be scored (4960 eliminated genes)
##
## Level 3: 50 nodes to be scored (5129 eliminated genes)
##
## Level 2: 18 nodes to be scored (5371 eliminated genes)
##
## Level 1: 1 nodes to be scored (5400 eliminated genes)
##
## -- Elim Algorithm --
##
## the algorithm is scoring 915 nontrivial nodes
## parameters:
## test statistic: Fisher test
## cutOff: 0.01
## score order: increasing
##
## Level 18: 1 nodes to be scored (0 eliminated genes)
##
## Level 17: 1 nodes to be scored (0 eliminated genes)
##
## Level 16: 6 nodes to be scored (0 eliminated genes)
##
## Level 15: 27 nodes to be scored (22 eliminated genes)
##
## Level 14: 59 nodes to be scored (53 eliminated genes)
##
## Level 13: 87 nodes to be scored (202 eliminated genes)
##
## Level 12: 95 nodes to be scored (416 eliminated genes)
##
## Level 11: 125 nodes to be scored (1128 eliminated genes)
##
## Level 10: 134 nodes to be scored (1773 eliminated genes)
##
## Level 9: 55 nodes to be scored (2406 eliminated genes)
##
## Level 8: 96 nodes to be scored (2566 eliminated genes)
##
## Level 7: 47 nodes to be scored (4531 eliminated genes)
##
## Level 6: 58 nodes to be scored (4871 eliminated genes)
##
## Level 5: 39 nodes to be scored (5311 eliminated genes)
##
## Level 4: 65 nodes to be scored (5687 eliminated genes)
##
## Level 3: 11 nodes to be scored (5714 eliminated genes)
##
## Level 2: 8 nodes to be scored (5780 eliminated genes)
##
## Level 1: 1 nodes to be scored (5784 eliminated genes)
##
## -- Weight Algorithm --
##
## The algorithm is scoring 695 nontrivial nodes
## parameters:
## test statistic: Fisher test : ratio
##
## Level 12: 3 nodes to be scored.
##
## Level 11: 9 nodes to be scored.
##
## Level 10: 22 nodes to be scored.
##
## Level 9: 36 nodes to be scored.
##
## Level 8: 62 nodes to be scored.
##
## Level 7: 103 nodes to be scored.
##
## Level 6: 166 nodes to be scored.
##
## Level 5: 138 nodes to be scored.
##
## Level 4: 100 nodes to be scored.
##
## Level 3: 46 nodes to be scored.
##
## Level 2: 9 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
##
## -- Weight Algorithm --
##
## The algorithm is scoring 1928 nontrivial nodes
## parameters:
## test statistic: Fisher test : ratio
##
## Level 17: 1 nodes to be scored.
##
## Level 16: 6 nodes to be scored.
##
## Level 15: 18 nodes to be scored.
##
## Level 14: 25 nodes to be scored.
##
## Level 13: 49 nodes to be scored.
##
## Level 12: 93 nodes to be scored.
##
## Level 11: 146 nodes to be scored.
##
## Level 10: 208 nodes to be scored.
##
## Level 9: 262 nodes to be scored.
##
## Level 8: 264 nodes to be scored.
##
## Level 7: 240 nodes to be scored.
##
## Level 6: 255 nodes to be scored.
##
## Level 5: 196 nodes to be scored.
##
## Level 4: 109 nodes to be scored.
##
## Level 3: 41 nodes to be scored.
##
## Level 2: 14 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
##
## -- Weight Algorithm --
##
## The algorithm is scoring 359 nontrivial nodes
## parameters:
## test statistic: Fisher test : ratio
##
## Level 16: 2 nodes to be scored.
##
## Level 15: 5 nodes to be scored.
##
## Level 14: 13 nodes to be scored.
##
## Level 13: 21 nodes to be scored.
##
## Level 12: 31 nodes to be scored.
##
## Level 11: 51 nodes to be scored.
##
## Level 10: 53 nodes to be scored.
##
## Level 9: 34 nodes to be scored.
##
## Level 8: 41 nodes to be scored.
##
## Level 7: 19 nodes to be scored.
##
## Level 6: 24 nodes to be scored.
##
## Level 5: 19 nodes to be scored.
##
## Level 4: 29 nodes to be scored.
##
## Level 3: 9 nodes to be scored.
##
## Level 2: 7 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
##
## Building most specific GOs ..... ( 1818 GO terms found. )
##
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 2645 GO terms found. )
##
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 698 GO terms found. )
##
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 1818 GO terms found. )
##
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 2645 GO terms found. )
##
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## Building most specific GOs ..... ( 698 GO terms found. )
##
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
##
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
##
## -- Classic Algorithm --
##
## the algorithm is scoring 492 nontrivial nodes
## parameters:
## test statistic: Fisher test
##
## -- Classic Algorithm --
##
## the algorithm is scoring 1420 nontrivial nodes
## parameters:
## test statistic: Fisher test
##
## -- Classic Algorithm --
##
## the algorithm is scoring 382 nontrivial nodes
## parameters:
## test statistic: Fisher test
##
## -- Classic Algorithm --
##
## the algorithm is scoring 2293 nontrivial nodes
## parameters:
## test statistic: KS tests
## score order: increasing
##
## -- Classic Algorithm --
##
## the algorithm is scoring 4691 nontrivial nodes
## parameters:
## test statistic: KS tests
## score order: increasing
##
## -- Classic Algorithm --
##
## the algorithm is scoring 915 nontrivial nodes
## parameters:
## test statistic: KS tests
## score order: increasing
##
## -- Elim Algorithm --
##
## the algorithm is scoring 2293 nontrivial nodes
## parameters:
## test statistic: Fisher test
## cutOff: 0.01
## score order: increasing
##
## Level 15: 2 nodes to be scored (0 eliminated genes)
##
## Level 14: 8 nodes to be scored (0 eliminated genes)
##
## Level 13: 7 nodes to be scored (17 eliminated genes)
##
## Level 12: 25 nodes to be scored (28 eliminated genes)
##
## Level 11: 45 nodes to be scored (28 eliminated genes)
##
## Level 10: 90 nodes to be scored (93 eliminated genes)
##
## Level 9: 145 nodes to be scored (244 eliminated genes)
##
## Level 8: 295 nodes to be scored (905 eliminated genes)
##
## Level 7: 432 nodes to be scored (1318 eliminated genes)
##
## Level 6: 647 nodes to be scored (1626 eliminated genes)
##
## Level 5: 348 nodes to be scored (2168 eliminated genes)
##
## Level 4: 180 nodes to be scored (3101 eliminated genes)
##
## Level 3: 55 nodes to be scored (3256 eliminated genes)
##
## Level 2: 13 nodes to be scored (4063 eliminated genes)
##
## Level 1: 1 nodes to be scored (4510 eliminated genes)
##
## -- Elim Algorithm --
##
## the algorithm is scoring 4691 nontrivial nodes
## parameters:
## test statistic: Fisher test
## cutOff: 0.01
## score order: increasing
##
## Level 20: 1 nodes to be scored (0 eliminated genes)
##
## Level 19: 1 nodes to be scored (0 eliminated genes)
##
## Level 18: 1 nodes to be scored (0 eliminated genes)
##
## Level 17: 4 nodes to be scored (0 eliminated genes)
##
## Level 16: 27 nodes to be scored (38 eliminated genes)
##
## Level 15: 83 nodes to be scored (38 eliminated genes)
##
## Level 14: 161 nodes to be scored (68 eliminated genes)
##
## Level 13: 240 nodes to be scored (273 eliminated genes)
##
## Level 12: 358 nodes to be scored (708 eliminated genes)
##
## Level 11: 473 nodes to be scored (1295 eliminated genes)
##
## Level 10: 597 nodes to be scored (1795 eliminated genes)
##
## Level 9: 653 nodes to be scored (2330 eliminated genes)
##
## Level 8: 589 nodes to be scored (2861 eliminated genes)
##
## Level 7: 528 nodes to be scored (3869 eliminated genes)
##
## Level 6: 459 nodes to be scored (4183 eliminated genes)
##
## Level 5: 302 nodes to be scored (4594 eliminated genes)
##
## Level 4: 145 nodes to be scored (4930 eliminated genes)
##
## Level 3: 50 nodes to be scored (5044 eliminated genes)
##
## Level 2: 18 nodes to be scored (5387 eliminated genes)
##
## Level 1: 1 nodes to be scored (5506 eliminated genes)
##
## -- Elim Algorithm --
##
## the algorithm is scoring 915 nontrivial nodes
## parameters:
## test statistic: Fisher test
## cutOff: 0.01
## score order: increasing
##
## Level 18: 1 nodes to be scored (0 eliminated genes)
##
## Level 17: 1 nodes to be scored (0 eliminated genes)
##
## Level 16: 6 nodes to be scored (0 eliminated genes)
##
## Level 15: 27 nodes to be scored (6 eliminated genes)
##
## Level 14: 59 nodes to be scored (25 eliminated genes)
##
## Level 13: 87 nodes to be scored (165 eliminated genes)
##
## Level 12: 95 nodes to be scored (506 eliminated genes)
##
## Level 11: 125 nodes to be scored (1052 eliminated genes)
##
## Level 10: 134 nodes to be scored (1713 eliminated genes)
##
## Level 9: 55 nodes to be scored (2422 eliminated genes)
##
## Level 8: 96 nodes to be scored (2503 eliminated genes)
##
## Level 7: 47 nodes to be scored (4344 eliminated genes)
##
## Level 6: 58 nodes to be scored (4941 eliminated genes)
##
## Level 5: 39 nodes to be scored (5376 eliminated genes)
##
## Level 4: 65 nodes to be scored (5741 eliminated genes)
##
## Level 3: 11 nodes to be scored (5765 eliminated genes)
##
## Level 2: 8 nodes to be scored (5765 eliminated genes)
##
## Level 1: 1 nodes to be scored (5792 eliminated genes)
##
## -- Weight Algorithm --
##
## The algorithm is scoring 492 nontrivial nodes
## parameters:
## test statistic: Fisher test : ratio
##
## Level 14: 1 nodes to be scored.
##
## Level 13: 2 nodes to be scored.
##
## Level 12: 2 nodes to be scored.
##
## Level 11: 6 nodes to be scored.
##
## Level 10: 14 nodes to be scored.
##
## Level 9: 18 nodes to be scored.
##
## Level 8: 47 nodes to be scored.
##
## Level 7: 77 nodes to be scored.
##
## Level 6: 112 nodes to be scored.
##
## Level 5: 93 nodes to be scored.
##
## Level 4: 75 nodes to be scored.
##
## Level 3: 32 nodes to be scored.
##
## Level 2: 12 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
##
## -- Weight Algorithm --
##
## The algorithm is scoring 1420 nontrivial nodes
## parameters:
## test statistic: Fisher test : ratio
##
## Level 15: 4 nodes to be scored.
##
## Level 14: 14 nodes to be scored.
##
## Level 13: 30 nodes to be scored.
##
## Level 12: 59 nodes to be scored.
##
## Level 11: 109 nodes to be scored.
##
## Level 10: 161 nodes to be scored.
##
## Level 9: 184 nodes to be scored.
##
## Level 8: 171 nodes to be scored.
##
## Level 7: 167 nodes to be scored.
##
## Level 6: 195 nodes to be scored.
##
## Level 5: 177 nodes to be scored.
##
## Level 4: 98 nodes to be scored.
##
## Level 3: 37 nodes to be scored.
##
## Level 2: 13 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
##
## -- Weight Algorithm --
##
## The algorithm is scoring 382 nontrivial nodes
## parameters:
## test statistic: Fisher test : ratio
##
## Level 16: 4 nodes to be scored.
##
## Level 15: 7 nodes to be scored.
##
## Level 14: 16 nodes to be scored.
##
## Level 13: 31 nodes to be scored.
##
## Level 12: 29 nodes to be scored.
##
## Level 11: 56 nodes to be scored.
##
## Level 10: 59 nodes to be scored.
##
## Level 9: 37 nodes to be scored.
##
## Level 8: 41 nodes to be scored.
##
## Level 7: 18 nodes to be scored.
##
## Level 6: 26 nodes to be scored.
##
## Level 5: 17 nodes to be scored.
##
## Level 4: 23 nodes to be scored.
##
## Level 3: 10 nodes to be scored.
##
## Level 2: 7 nodes to be scored.
##
## Level 1: 1 nodes to be scored.
something <- write_subset_ontologies(kept_ontology)
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
uorf_all <- read.csv("excel/Ingolia-all-uORFs.csv")
uorf_all$num <- 1
uorf_fun <- uorf_all[, c("Gene","num")]
library(plyr)
uorf_fun <- ddply(uorf_fun, 'Gene', summarize, total = sum(num))
combined_data <- combined[["data"]][["mutvwt"]]
uorf_vs_fc <- merge(combined_data, uorf_fun, by.x="row.names", by.y="Gene", all.x=TRUE)
uorf_vs_fc$total[is.na(uorf_vs_fc$total == 0)] <- 0
cor.test(x=uorf_vs_fc$limma_logfc, uorf_vs_fc$total)
##
## Pearson's product-moment correlation
##
## data: uorf_vs_fc$limma_logfc and uorf_vs_fc$total
## t = -1.3195, df = 6695, p-value = 0.1871
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.040059519 0.007829872
## sample estimates:
## cor
## -0.01612407
uorf_dist <- uorf_all[, c("Gene","uORF.end.to.CDS")]
colnames(uorf_dist) <- c("Gene","dist")
rownames(uorf_dist) <- make.names(uorf_dist$Gene, unique=TRUE)
uorfdist_vs_fc <- merge(combined_data, uorf_dist, by.x="row.names", by.y="row.names", all.x=TRUE)
uorfdist_vs_fc$dist[is.na(uorfdist_vs_fc$dist)] <- -1000
cor.test(uorfdist_vs_fc$dist, uorfdist_vs_fc$basic_logfc, na.rm=TRUE)
##
## Pearson's product-moment correlation
##
## data: uorfdist_vs_fc$dist and uorfdist_vs_fc$basic_logfc
## t = -2.4438, df = 6695, p-value = 0.01456
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.053765512 -0.005906341
## sample estimates:
## cor
## -0.02985304
tt <- uorfdist_vs_fc[, c("dist","basic_denmed")]
tt$basic_denmed <- log(tt$basic_denmed)
png(file="/tmp/test_uorf.png")
hpgl_linear_scatter(tt)$scatter + scale_x_continuous(limits=c(10,500))
## Error in eval(expr, envir, enclos): could not find function "scale_x_continuous"
dev.off()
## X11cairo
## 2
try_species = kegg_get_orgn("Saccharomyces cerevisiae")
pathview_data = all_pairwise$all_tables$wt_minus_mut
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
pathview_data = merge(pathview_data, crossref, by.x="row.names", by.y="transcript_name", all.x=TRUE)
## Error in merge(pathview_data, crossref, by.x = "row.names", by.y = "transcript_name", : object 'pathview_data' not found
rownames(pathview_data) = make.names(pathview_data$transcript_id, unique=TRUE)
## Error in make.names(pathview_data$transcript_id, unique = TRUE): object 'pathview_data' not found
## If I read the yeast xml files correctly, they all follow the traditional chromosomal location naming scheme...
pathview_result = hpgl_pathview(pathview_data, species=try_species, filenames="name")
## Error in hpgl_pathview(pathview_data, species = try_species, filenames = "name"): object 'pathview_data' not found
load("GO2ALLEG.rda")
## The first entry is GO:0003674
go_genes = GO2ALLEG[['GO:0003674']]
subtraction = all_pairwise$all_tables$wt_minus_mut
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
genes_in_go = subtraction[rownames(subtraction) %in% go_genes,]
## Error in eval(expr, envir, enclos): object 'subtraction' not found
genes_in_go = subset(genes_in_go, logFC > 1)
## Error in subset(genes_in_go, logFC > 1): object 'genes_in_go' not found
head(genes_in_go)
## Error in head(genes_in_go): object 'genes_in_go' not found
dim(genes_in_go)
## Error in eval(expr, envir, enclos): object 'genes_in_go' not found
cbf5_expt = create_expt(file="all_samples.csv")
## Error in `sampleNames<-`(`*tmp*`, value = c("AAC1", "AAC3", "AAD10", "AAD14", : number of new names (6696) should equal number of rows in AnnotatedDataFrame (7131)
merged_expt = create_expt(file="all_samples_merged.csv")
## Error in `sampleNames<-`(`*tmp*`, value = c("AAC1", "AAC3", "AAD10", "AAD14", : number of new names (6696) should equal number of rows in AnnotatedDataFrame (7131)
There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper 'graph_metrics()' but that takes away the chance to explain the graphs as I generate them.
full_design = cbf5_expt$definitions
## First make a bar plot of the library sizes in the experiment.
## Notice that the colors were auto-chosen by create_expt() and they should
## be maintained throughout this process
fis_libsize = hpgl_libsize(cbf5_expt)
fis_libsize
## Here we see that the wild type replicate 3 sample for 15 minutes has fewer non-zero genes than all its friends.
fis_nonzero = hpgl_nonzero(cbf5_expt, title="nonzero vs. cpm")
fis_nonzero
In most cases, raw data does not cluster very well, lets see if that is also true for the cbf5 experiment. Assuming it doesn't, lets normalize the data using the defaults (cpm, quantile, log2) and try again.
## Unsurprisingly, the raw data doesn't cluster well at all...
##fis_rawpca = hpgl_pca(expt=cbf5_expt, labels=cbf5_expt$condition)
##fis_rawpca = hpgl_pca(expt=cbf5_expt, labels="fancy")
fis_rawpca = hpgl_pca(cbf5_expt, title="Fun!")
## [1] "There is just one batch in this data."
fis_rawpca$plot
## So, normalize the data
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE)
normed = graph_metrics(norm_expt)
## [1] "There is just one batch in this data."
normed$pcaplot
norm_expt_svaseq = normalize_expt(cbf5_expt, transform="raw", norm="raw", convert="cpm", filter_low=TRUE, batch="svaseq")
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
hpgl_pca(norm_expt_svaseq)$plot
## [1] "There is just one batch in this data."
norm_expt_sva = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
hpgl_pca(norm_expt_sva)$plot
## [1] "There is just one batch in this data."
## And try the pca again
fis_normpca = hpgl_pca(norm_expt, labels=norm_expt$condition, title="normalized pca")
## [1] "There is just one batch in this data."
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
norm_merged = normalize_expt(merged_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
## Error in normalize_expt(merged_expt, transform = "log2", norm = "quant", : object 'merged_expt' not found
## And try the pca again
fis_normpca = hpgl_pca(norm_expt, labels=norm_expt$condition, title="normalized pca")
## [1] "There is just one batch in this data."
mer_normpca = hpgl_pca(norm_merged, labels=norm_merged$condition, title="normalized pca")
## Error in hpgl_pca(norm_merged, labels = norm_merged$condition, title = "normalized pca"): object 'norm_merged' not found
mer_normpca$plot
## Error in eval(expr, envir, enclos): object 'mer_normpca' not found
tiff(file="plots/norm_pca.tiff")
mer_normpca$plot
## Error in eval(expr, envir, enclos): object 'mer_normpca' not found
dev.off()
## X11cairo
## 2
fis_normpca$res
## propVar cumPropVar cond.R2
## 1 84.41 84.41 96.21
## 2 6.15 90.56 3.54
## 3 3.36 93.92 0.09
## 4 2.68 96.60 0.14
## 5 1.74 98.34 0.01
## 6 1.65 99.99 0.00
## 7 0.01 100.00 0.00
fis_normpca$variance
## [1] 84.41 6.15 3.36 2.68 1.74 1.65 0.01
tiff(file="plots/merged_pca.tiff")
mer_normpca$plot
## Error in eval(expr, envir, enclos): object 'mer_normpca' not found
dev.off()
## X11cairo
## 2
funkytown = pca_information(norm_merged, plot_pcas=TRUE)
## Error in pca_information(norm_merged, plot_pcas = TRUE): object 'norm_merged' not found
funkytown$pca_plots$PC1_PC2$plot
fun = graph_metrics(expt=norm_expt)
## [1] "There is just one batch in this data."
##funkytown = pca_information(expt=norm_expt, plot_pcas=TRUE)
hpgl_boxplot(norm_expt)
hpgl_density(norm_expt)
hpgl_disheat(norm_expt)
hpgl_corheat(norm_expt)
hpgl_smc(norm_expt)
hpgl_qq_all(norm_expt)
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## $logs
##
## $ratios
##
## $medians
## $medians[[1]]
## [1] 0.004694
##
## $medians[[2]]
## [1] 0.00007572
##
## $medians[[3]]
## [1] 0.003637
##
## $medians[[4]]
## [1] -0.0004534
##
## $medians[[5]]
## [1] -0.0001983
##
## $medians[[6]]
## [1] -0.000378
##
## $medians[[7]]
## [1] -0.000292
##
## $medians[[8]]
## [1] -0.0004834
hpgl_boxplot(norm_merged)
## Error in hpgl_boxplot(norm_merged): object 'norm_merged' not found
hpgl_density(norm_merged)
## Error in hpgl_density(norm_merged): object 'norm_merged' not found
hpgl_disheat(norm_merged)
## Error in hpgl_heatmap(data, colors = colors, design = design, method = method, : object 'norm_merged' not found
hpgl_corheat(norm_merged)
## Error in hpgl_heatmap(data, colors = colors, design = design, method = method, : object 'norm_merged' not found
hpgl_smc(norm_merged)
## Error in hpgl_smc(norm_merged): object 'norm_merged' not found
hpgl_qq_all(norm_merged)
## Error in hpgl_qq_all(norm_merged): object 'norm_merged' not found
##limma_expt = normalize_expt(expt=cbf5_expt, norm="quant")
cbf5_quant = normalize_expt(cbf5_expt, norm="quant", convert="cpm", filter_low=TRUE)
all_pairwise = limma_pairwise(norm_merged, batches=NULL, model_batch=FALSE)
## Error in limma_pairwise(norm_merged, batches = NULL, model_batch = FALSE): object 'norm_merged' not found
##all_pairwise = edger_pairwise(expt=cbf5_expt, batches=NULL, model_batch=FALSE)
## I have the following elements in this list:
### conditions_table : how many replicates are there of each condition
### batches_table : how many replicates are there of each batch
### conditions : a factor of all the conditions
### batches : a factor of all the batches
### model : the model of the data used for voom etc.
### fit : the result of lmfit()
### voom_result : the result of voom()
### voom_design : the design matrix fed to voom()
### identities : the strings fed to makeContrasts() which describe each sample alone
### all_pairwise : the strings describing all the subtractions fed to makeContrasts()
### contrast_string : the entire string fed to makeContrasts() including the design etc.
### pairwise_fits : the result of contrasts.fit()
### pairwise_comparisons : the result from eBayes()
### limma_result : a list of toptable()s corresponding to each pairwise comparison
names(all_pairwise$all_tables)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
summary(all_pairwise$all_tables$wt_minus_mut)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
awesome = coefficient_scatter(all_pairwise)
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
awesome$scatter
## Error in eval(expr, envir, enclos): object 'awesome' not found
fun = all_pairwise(cbf5_expt, batches=NULL, model_batch=FALSE)
fun$comparison$comp
## wt_vs_mut
## le 0.9732847
## ld 0.9847753
## ed 0.9509408
## lb 0.9772878
## eb 0.9565188
## db 0.9716832
## limma, edgeR, and DESeq2 all agree very much for this data.
names(all_pairwise$all_tables)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
summary(all_pairwise$all_tables$wt_minus_mut)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
##logfc = limma_scatter(all_pairwise, first_table=3, second_column="logFC", second_table=3, first_column="AveExpr")
##logfc$scatter
wt_mut_scatter = coefficient_scatter(all_pairwise, x="wt", y="mut")
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
wt_mut_scatter$scatter
## Error in eval(expr, envir, enclos): object 'wt_mut_scatter' not found
wt_mut_scatter$both_histogram
## Error in eval(expr, envir, enclos): object 'wt_mut_scatter' not found
wt_nmd = coefficient_scatter(all_pairwise, x="wt", y="nmd")
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
wt_nmd$scatter
## Error in eval(expr, envir, enclos): object 'wt_nmd' not found
wt_nmd$both_histogram
## Error in eval(expr, envir, enclos): object 'wt_nmd' not found
cbf_nmd = coefficient_scatter(all_pairwise, x="wt_minus_mut", y="wt_minus_nmd")
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
cbf_nmd$scatter
## Error in eval(expr, envir, enclos): object 'cbf_nmd' not found
cbf_nmd$both_histogram
## Error in eval(expr, envir, enclos): object 'cbf_nmd' not found
cbf_nmd$correlation
## Error in eval(expr, envir, enclos): object 'cbf_nmd' not found
hpgl_cor(cbf_nmd$df, method="robust")
## Error in na.action(data): object 'cbf_nmd' not found
## I think some of the differences in these sample types is being squashed by sva.
I have 4 ontology tools we can try applying to this as well as the KEGG pathways.
##crazytown = limma_ontology(all_pairwise, gene_lengths=lengths, goids=goids) ## This works now, but takes forever...
crazytown = limma_ontology(all_pairwise, gene_lengths=lengths, goids=goids, do_topgo=FALSE, do_trees=FALSE, do_cluster=FALSE)
## Error in eval(expr, envir, enclos): could not find function "limma_ontology"
##try_species = kegg_get_orgn("Saccharomyces cerevisiae")
##crazytown = hpgl_pathview(all_pairwise$limma_result$wt_minus_mut, species=try_species)
expression_table = all_pairwise$all_tables$wt_minus_mut
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
yeast_annotations = import.gff2("reference/scerevisiae.gff.gz")
## Error in eval(expr, envir, enclos): could not find function "import.gff2"
annotation_info = as.data.frame(yeast_annotations)
## Error in as.data.frame(yeast_annotations): object 'yeast_annotations' not found
rownames(annotation_info) = make.names(annotation_info$transcript_name, unique=TRUE)
annotation_plus_data = merge(annotation_info, expression_table, by.x="row.names", by.y="row.names")
## Error in merge(annotation_info, expression_table, by.x = "row.names", : object 'expression_table' not found
rownames(annotation_plus_data) = annotation_plus_data$Row.names
## Error in eval(expr, envir, enclos): object 'annotation_plus_data' not found
annotation_plus_data = annotation_plus_data[,-1]
## Error in eval(expr, envir, enclos): object 'annotation_plus_data' not found
gene_lengths = annotation_plus_data[,c("width","logFC")]
## Error in eval(expr, envir, enclos): object 'annotation_plus_data' not found
gene_lengths$ID = rownames(gene_lengths)
## Error in rownames(gene_lengths): object 'gene_lengths' not found
## I did the following to pull out the information I care about from the SGD gene association tale:
## zcat gene_association.sgd.gz | grep -v "^\!" | awk -F ' ' '{printf("%s,%s,%s\n", $3, $5, $11)}' | cut -d '|' -f1 > go_trimmed.csv
## The resulting csv file is in reference/go_trimmed.csv
go_annotation = read.csv("reference/go_trimmed.csv.gz")
colnames(go_annotation) = c("Name","GO","ID")
## mut - wt, so higher means more in cbf5 mutant.
de_genes_minus = subset(expression_table, logFC <= -1.5)
## Error in subset(expression_table, logFC <= -1.5): object 'expression_table' not found
de_genes_plus = subset(expression_table, logFC >= 1.5)
## Error in subset(expression_table, logFC >= 1.5): object 'expression_table' not found
fun_minus = simple_goseq(de_genes=de_genes_minus, lengths=gene_lengths, goids=go_annotation)
## Error in simple_goseq(de_genes = de_genes_minus, lengths = gene_lengths, : object 'gene_lengths' not found
tiff(file="plots/increased_mf_mutant.tiff", res=90)
fun_minus$mfp_plot
## Error in eval(expr, envir, enclos): object 'fun_minus' not found
dev.off()
## X11cairo
## 2
tiff(file="plots/increased_bp_mutant.tiff", res=90)
fun_minus$bpp_plot
## Error in eval(expr, envir, enclos): object 'fun_minus' not found
dev.off()
## X11cairo
## 2
fun_minus$bp_interesting
## Error in eval(expr, envir, enclos): object 'fun_minus' not found
fun_plus = simple_goseq(de_genes_plus, lengths=gene_lengths, goids=go_annotation)
## Error in simple_goseq(de_genes_plus, lengths = gene_lengths, goids = go_annotation): object 'gene_lengths' not found
## Oh wow, that is pretty cool, mess up cbf5 and RNA polI increases, that makes sense!
## The up-regulated sets have crap q-values.
tiff(file="plots/decreased_mf_mutant.tiff", res=90)
fun_plus$mfp_plot
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
dev.off()
## X11cairo
## 2
tiff(file="plots/decreased_bp_mutant.tiff", res=90)
fun_plus$bpp_plot
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
dev.off()
## X11cairo
## 2
fun_plus$mf_interesting
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
fun_plus$bp_interesting
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
high_prfdb = read.csv("processed_data/high.csv", header=FALSE)
colnames(high_prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted")
low_prfdb = read.csv("processed_data/low.csv", header=FALSE)
colnames(low_prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted")
rownames(high_prfdb) = make.names(high_prfdb$ID, unique=TRUE)
rownames(low_prfdb) = make.names(low_prfdb$ID, unique=TRUE)
high_prfdb$value = "high"
low_prfdb$value = "low"
prfdb = read.csv("processed_data/hits.csv", header=FALSE)
colnames(prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted","value")
rownames(prfdb) = make.names(prfdb$ID, unique=TRUE)
test = merge(prfdb, df, by.x="row.names", by.y="row.names")
test$delta = test$wt - test$mut
ggplot(test, aes(x=wt, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
ggplot(test, aes(x=mut, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
ggplot(test, aes(x=delta, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
summary(test)
thigh = test[as.character(test$value) == 'high',]
low = low_prfdb[,c("ID","mfe","value")]
high = high_prfdb[,c("ID","mfe","value")]
mfe_dist = rbind(low, high)
mfe_dist = mfe_dist[,c("mfe","value")]
cdf = ddply(mfe_dist, "value", summarise, rating.mean=mean(mfe, na.rm=TRUE))
multi = ggplot(mfe_dist, aes(x=mfe, fill=value)) +
geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") +
geom_density(alpha=0.5) +
geom_vline(data=cdf, aes(xintercept=rating.mean, colour=value), linetype="dashed") +
theme_bw()
test = merge(mfe_dist, df, by.x="row.names", by.y="row.names")
multi = ggplot(test, aes(x=wt, y=mut, color=value)) +
geom_point()
multi
high = subset(test, value=="high")
low = subset(test, value=="low")
summary(high$wt)
summary(low$wt)
summary(high$mut)
summary(low$mut)
low$sub = low$wt - low$mut
high$sub = high$wt - high$mut
summary(crazytown)
summary(crazytown$wt_minus_mut$up_goseq)
crazytown$wt_minus_mut$up_goseq$mfp_plot
head(crazytown$wt_minus_mut$up_goseq$mf_subset)
subtraction = all_pairwise$all_tables$wt_minus_mut
load("GO2ALLEG.rda")
## The first entry is GO:0003674
go_genes = GO2ALLEG[['GO:0003674']]
genes_in_go = subtraction[rownames(subtraction) %in% go_genes,]
genes_in_go = subset(genes_in_go, logFC > 2)
head(genes_in_go)
de_genes_plus = subset(subtraction, logFC >= 1.5)
fun = simple_goseq(de_genes_plus, lengths=gene_lengths, goids=goids)
new_mf_table = hpgltools:::gather_genes(fun, ont='MF')
new_bp_table = hpgltools:::gather_genes(fun, ont='BP')
new_bp_table