RNAseq of yeast with wt/mutant CBF5

# Time-stamp: 
## This block is used to render pdf/html reports.
## There are a few different output formats available, a pdf and 2 html versions.
## I run the reports simply by hitting control-c control-n on the line I want to generate.

library(hpgltools)

save(list=ls(all=TRUE), file="RData")
rm(list=ls())
load("RData")

render("rnaseq_cbf5.rmd", output_format="pdf_document")
render("rnaseq_cbf5.rmd", output_format="html_document")
render("rnaseq_cbf5.rmd", 'knitrBootstrap::bootstrap_document')

Todo

  1. Give a boxplot of the non-quantile normalized data distributions immediately
knitr::opts_knit$set(progress=TRUE, verbose=TRUE, error=TRUE, stop_on_error=FALSE, fig.width=7, fig.height=7)

Analyses using alignments with 0 mismatches!

Setting up

Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the data.

## These first 4 lines are not needed once hpgltools is installed.
source("http://bioconductor.org/biocLite.R")
biocLite("devtools")
library(devtools)
install_github("elsayed-lab/hpgltools")
library(hpgltools)
autoloads_all()

Reading data

reference <- "reference/scerevisiae.gff.gz"

cbf5_expt <- create_expt(file="cbf5_samples.csv", suffix="_forward-trimmed-v0M1.count.gz", by_sample=TRUE, sep=",")
## processed_data/count_tables/hpgl0564/hpgl0564_forward-trimmed-v0M1.count.gz contains 6697 rows.
## processed_data/count_tables/hpgl0565/hpgl0565_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0566/hpgl0566_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0567/hpgl0567_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0568/hpgl0568_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0569/hpgl0569_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0570/hpgl0570_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0571/hpgl0571_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
genelengths <- get_genelengths(reference)
goids <- read.delim("reference/go_trimmed.csv.gz", sep=",", header=FALSE)
##colnames(goids) <- c("ID","GO","Yeast_ID")
colnames(goids) <- c("unknown","GO","go_geneid")
annotation_info <- gff2df(reference)
annot <- annotation_info[, c("gene_id", "transcript_name")]
goids <- goids[, c("go_geneid","GO")]
goids <- merge(goids, annot, by.x="go_geneid", by.y="gene_id", all.x=TRUE)
goids_na <- !is.na(goids$transcript_name)
goids <- goids[goids_na, ]
goids <- goids[, c("transcript_name", "GO")]
colnames(goids) <- c("ID","GO")

crossref = unique(annotation_info[, c("transcript_name", "transcript_id")])
rownames(annotation_info) = make.names(annotation_info$transcript_name, unique=TRUE)

Normalizing and exploring data

There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper 'graph_metrics()' but that takes away the chance to explain the graphs as I generate them.

full_design <- cbf5_expt$design
raw_metrics <- graph_metrics(cbf5_expt, qq=TRUE)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## I am reasonably sure this should be log scaled and am setting it.
##   If this is incorrect, set scale='raw'
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Graphing a PCA plot.
## [1] "There is just one batch in this data."
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## QQ plotting!.
raw_metrics$libsize
raw_metrics$nonzero
raw_metrics$boxplot
## Warning: Removed 4008 rows containing non-finite values (stat_boxplot).
raw_metrics$corheat
raw_metrics$disheat
raw_metrics$smc
raw_metrics$pcaplot
raw_metrics$density
## Warning: Removed 4008 rows containing non-finite values (stat_density).
raw_metrics$qqlog

An initial pca plot

In most cases, raw data does not cluster very well, lets see if that is also true for the cbf5 experiment. Assuming it doesn't, lets normalize the data using the defaults (cpm, quantile, log2) and try again.

## So, normalize the data
l2qnorm <- normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(quant(cpm(low-filter(data))))
## It saves the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept in the slot called:
##  'new_expt$normalized$normalized_counts$libsize' which is copied into
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Did not recognize the filtering argument, defaulting to cbcb's.
##  Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'
l2q_metrics <- graph_metrics(l2qnorm)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Graphing a PCA plot.
## [1] "There is just one batch in this data."
## Plotting a density plot.
l2q_metrics$pcaplot
l2q_metrics$density
l2q_metrics$pcavar
## [1] 63.44 15.78  5.89  4.88  3.56  3.34  3.11
l2q_metrics$pcares
##   propVar cumPropVar cond.R2
## 1   63.44      63.44   93.79
## 2   15.78      79.22    3.68
## 3    5.89      85.11    2.16
## 4    4.88      89.99    0.29
## 5    3.56      93.55    0.03
## 6    3.34      96.89    0.05
## 7    3.11     100.00    0.00
l2tmm <- normalize_expt(cbf5_expt, transform="log2", norm="tmm", convert="cpm", filter_low="pofa")
## This function will replace the expt$expressionset slot with:
## log2(tmm(cpm(low-filter(data))))
## It saves the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept in the slot called:
##  'new_expt$normalized$normalized_counts$libsize' which is copied into
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Removing 431 low-count genes (6266 remaining).
hpgl_density(l2tmm)
funkytown <- pca_information(l2qnorm, plot_pcas=TRUE)
## The more shallow the curves in these plots, the more genes responsible for this principle component.
## [1] "PC1: 63.44% variance"
## [1] "PC2: 15.78% variance"
## The standard deviation was 0 for batch and PC1.
## The standard deviation was 0 for batch and PC2.
funkytown$pca_plots$PC1_PC2$plot

Some simple differential expression analyses

## You may call the pairwise comparisons singly
##limma_analysis <- limma_pairwise(cbf5_expt, model_batch=FALSE)
##deseq_analysis <- deseq2_pairwise(cbf5_expt, model_batch=FALSE)
##edger_analysis <- edger_pairwise(cbf5_expt, model_batch=FALSE)
##basic_analysis <- basic_pairwise(cbf5_expt)

fun = all_pairwise(cbf5_expt, batches=NULL, model_batch=FALSE)
## Starting limma pairwise comparison.
## libsize was not specified, this parameter has profound effects on limma's result.
## Using the libsize from expt$normalized$normalized_counts.
## Limma step 1/6: choosing model.
## Limma step 2/6: running voom
## The voom input was not cpm, converting now.
## The voom input was not log2, transforming now.
## limma step 3/6: running lmFit
## Limma step 4/6: making and fitting contrasts.
## As a reference, the identity is: mut = mut,
## As a reference, the identity is: wt = wt,
## Limma step 5/6: Running eBayes and topTable.
## Limma step 6/6: Writing limma outputs.
## limma step 6/6: 1/3: Printing table: mut.
## limma step 6/6: 2/3: Printing table: wt.
## limma step 6/6: 3/3: Printing table: wt_vs_mut.
## Starting DESeq2 pairwise comparisons.
## DESeq2 step 1/5: Including only condition in the deseq model.
## DESeq2 step 2/5: Estimate size factors.
## DESeq2 step 3/5: estimate Dispersions.
## DESeq2 step 4/5: nbinomWaldTest.
## DESeq2 step 5/5: 1/1: Printing table: wt_vs_mut
## Collected coefficients for: mut
## Collected coefficients for: wt
## Starting edgeR pairwise comparisons.
## EdgeR step 1/9: normalizing data.
## EdgeR step 2/9: Estimating the common dispersion.
## EdgeR step 3/9: Estimating dispersion across genes.
## EdgeR step 4/9: Estimating GLM Common dispersion.
## EdgeR step 5/9: Estimating GLM Trended dispersion.
## EdgeR step 6/9: Estimating GLM Tagged dispersion.
## EdgeR step 7/9: Running glmFit.
## EdgeR step 8/9: Making pairwise contrasts.
## As a reference, the identity is: mut = mut,
## As a reference, the identity is: wt = wt,
## EdgeR step 9/9: 1/1: Printing table: wt_vs_mut.
## Starting basic pairwise comparison.
## Basic step 1/3: Creating median and variance tables.
## Basic step 2/3: Performing comparisons.
## Basic step 2/3: 1/1: Performing log2 subtraction: wt_vs_mut
## Basic step 3/3: Creating faux DE Tables.
## Basic: Returning tables.
## 1/1: Comparing analyses: wt_vs_mut
fun$comparison$comp
##    wt_vs_mut
## le 0.9732847
## ld 0.9847753
## ed 0.9509408
## lb 0.9772878
## eb 0.9565188
## db 0.9716832
## limma, edgeR, DESeq2, and my stupid basic analysis all agree very much for this data.

keepers <- list("mutvwt" = c("mut","wt"))
combined <- combine_de_tables(fun, excel="excel/contrasts.xlsx", annot_df=annotation_info, keepers=keepers)
## Deleting the file excel/contrasts.xlsx before writing the tables.
## Working on 1/1: mutvwt
## The table is: wt_vs_mut
##       table total limma_up limma_sigup deseq_up deseq_sigup edger_up
## 2 wt_vs_mut  6697      472         437      230         228      340
##   edger_sigup basic_up basic_sigup limma_down limma_sigdown deseq_down
## 2         265      500         437        386           350        441
##   deseq_sigdown edger_down edger_sigdown basic_down basic_sigdown meta_up
## 2           438        590           512        389           238     326
##   meta_sigup meta_down meta_sigdown
## 2        309       472          444
## Converted strand to characters.
## Converted source to characters.
## Converted type to characters.
## Attempting to add a coefficient plot for mutvwt at column 52
## Warning: Removed 1087 rows containing non-finite values (stat_smooth).
## Warning: Removed 1087 rows containing missing values (geom_point).
## Writing summary information.
## Converted table to characters.
## The sheet pairwise_summary already exists.
## Attempting to add the comparison plot to pairwise_summary at row: 19 and column: 1
## NULL
## Performing final save of the workbook.
significant_summary <- extract_significant_genes(combined$data, excel="excel/significant_contrasts.xlsx")
## Writing excel data sheet 1/1
## Assuming the fold changes are on the log scale and so taking >< 0
## After (adj)p filter, the up genes table has 1904 genes.
## After (adj)p filter, the down genes table has 1065 genes.
## Assuming the fold changes are on the log scale and so taking -1 * fc
## After fold change filter, the up genes table has 437 genes.
## After fold change filter, the down genes table has 350 genes.
## Printing significant genes to the file: excel/significant_contrasts.xlsx
## Deleting the file excel/significant_contrasts.xlsx before writing the tables.
## 1/1: Writing excel data sheet up_mutvwt
## Converted strand to characters.
## Converted source to characters.
## Converted type to characters.
## 1/1: Writing excel data sheet down_mutvwt
## Converted strand to characters.
## Converted source to characters.
## Converted type to characters.
## Writing changed genes summary on last sheet.
## Converted change_counts_up to characters.
## Converted change_counts_down to characters.

Quick go attempt

I have 4 ontology tools we can try applying to this as well as the KEGG pathways.

## Test gprofiler
## gprofiler(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")
uppers_names <- uppers[["transcript_name"]]
profiler_go <- gProfileR::gprofiler(uppers_names, organism="scerevisiae")
profiler_kegg <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="KEGG")
profiler_reactome <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="REAC")
profiler_tf <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="TF")
profiler_mi <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="MI")
profiler_corum <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="CORUM")

kept_ontology <- subset_ontology_search(significant_summary, lengths=genelengths, goids=goids, gff=reference, gff_type="gene", universe_merge="transcript_name")
## Successfully found geneTable.rda.
## Using GO mapping data located in GO2EG.rda
## 1/1: Starting goseq
## Warning in pcls(G): initial point very close to some inequality constraints
## 1/1: Starting clusterprofiler
## The minimum observed pvalue is: 0.000002
## The minimum observed pvalue is: 0.000002
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000010
## The minimum observed pvalue is: 0.000010
## The minimum observed pvalue is: 0.000901
## The minimum observed pvalue is: 0.000901
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000000
## The minimum observed pvalue is: 0.000078
## The minimum observed pvalue is: 0.000078
## 1/1: Starting topgo
## 
## Building most specific GOs ..... ( 1818 GO terms found. )
## 
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 2645 GO terms found. )
## 
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 698 GO terms found. )
## 
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 1818 GO terms found. )
## 
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 2645 GO terms found. )
## 
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 698 GO terms found. )
## 
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 695 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 1928 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 359 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 2293 nontrivial nodes
##       parameters: 
##           test statistic:  KS tests 
##           score order:  increasing 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 4691 nontrivial nodes
##       parameters: 
##           test statistic:  KS tests 
##           score order:  increasing 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 915 nontrivial nodes
##       parameters: 
##           test statistic:  KS tests 
##           score order:  increasing 
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 2293 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
##           cutOff:  0.01 
##           score order:  increasing 
## 
##   Level 15:  2 nodes to be scored    (0 eliminated genes)
## 
##   Level 14:  8 nodes to be scored    (0 eliminated genes)
## 
##   Level 13:  7 nodes to be scored    (17 eliminated genes)
## 
##   Level 12:  25 nodes to be scored   (28 eliminated genes)
## 
##   Level 11:  45 nodes to be scored   (40 eliminated genes)
## 
##   Level 10:  90 nodes to be scored   (72 eliminated genes)
## 
##   Level 9:   145 nodes to be scored  (229 eliminated genes)
## 
##   Level 8:   295 nodes to be scored  (900 eliminated genes)
## 
##   Level 7:   432 nodes to be scored  (1270 eliminated genes)
## 
##   Level 6:   647 nodes to be scored  (1592 eliminated genes)
## 
##   Level 5:   348 nodes to be scored  (2122 eliminated genes)
## 
##   Level 4:   180 nodes to be scored  (2913 eliminated genes)
## 
##   Level 3:   55 nodes to be scored   (3166 eliminated genes)
## 
##   Level 2:   13 nodes to be scored   (3911 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (4476 eliminated genes)
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 4691 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
##           cutOff:  0.01 
##           score order:  increasing 
## 
##   Level 20:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 19:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 18:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 17:  4 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  27 nodes to be scored   (38 eliminated genes)
## 
##   Level 15:  83 nodes to be scored   (38 eliminated genes)
## 
##   Level 14:  161 nodes to be scored  (50 eliminated genes)
## 
##   Level 13:  240 nodes to be scored  (245 eliminated genes)
## 
##   Level 12:  358 nodes to be scored  (692 eliminated genes)
## 
##   Level 11:  473 nodes to be scored  (1315 eliminated genes)
## 
##   Level 10:  597 nodes to be scored  (1736 eliminated genes)
## 
##   Level 9:   653 nodes to be scored  (2269 eliminated genes)
## 
##   Level 8:   589 nodes to be scored  (2767 eliminated genes)
## 
##   Level 7:   528 nodes to be scored  (3690 eliminated genes)
## 
##   Level 6:   459 nodes to be scored  (4144 eliminated genes)
## 
##   Level 5:   302 nodes to be scored  (4725 eliminated genes)
## 
##   Level 4:   145 nodes to be scored  (4960 eliminated genes)
## 
##   Level 3:   50 nodes to be scored   (5129 eliminated genes)
## 
##   Level 2:   18 nodes to be scored   (5371 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (5400 eliminated genes)
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 915 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
##           cutOff:  0.01 
##           score order:  increasing 
## 
##   Level 18:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 17:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  6 nodes to be scored    (0 eliminated genes)
## 
##   Level 15:  27 nodes to be scored   (22 eliminated genes)
## 
##   Level 14:  59 nodes to be scored   (53 eliminated genes)
## 
##   Level 13:  87 nodes to be scored   (202 eliminated genes)
## 
##   Level 12:  95 nodes to be scored   (416 eliminated genes)
## 
##   Level 11:  125 nodes to be scored  (1128 eliminated genes)
## 
##   Level 10:  134 nodes to be scored  (1773 eliminated genes)
## 
##   Level 9:   55 nodes to be scored   (2406 eliminated genes)
## 
##   Level 8:   96 nodes to be scored   (2566 eliminated genes)
## 
##   Level 7:   47 nodes to be scored   (4531 eliminated genes)
## 
##   Level 6:   58 nodes to be scored   (4871 eliminated genes)
## 
##   Level 5:   39 nodes to be scored   (5311 eliminated genes)
## 
##   Level 4:   65 nodes to be scored   (5687 eliminated genes)
## 
##   Level 3:   11 nodes to be scored   (5714 eliminated genes)
## 
##   Level 2:   8 nodes to be scored    (5780 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (5784 eliminated genes)
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 695 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test : ratio 
## 
##   Level 12:  3 nodes to be scored.
## 
##   Level 11:  9 nodes to be scored.
## 
##   Level 10:  22 nodes to be scored.
## 
##   Level 9:   36 nodes to be scored.
## 
##   Level 8:   62 nodes to be scored.
## 
##   Level 7:   103 nodes to be scored.
## 
##   Level 6:   166 nodes to be scored.
## 
##   Level 5:   138 nodes to be scored.
## 
##   Level 4:   100 nodes to be scored.
## 
##   Level 3:   46 nodes to be scored.
## 
##   Level 2:   9 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 1928 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test : ratio 
## 
##   Level 17:  1 nodes to be scored.
## 
##   Level 16:  6 nodes to be scored.
## 
##   Level 15:  18 nodes to be scored.
## 
##   Level 14:  25 nodes to be scored.
## 
##   Level 13:  49 nodes to be scored.
## 
##   Level 12:  93 nodes to be scored.
## 
##   Level 11:  146 nodes to be scored.
## 
##   Level 10:  208 nodes to be scored.
## 
##   Level 9:   262 nodes to be scored.
## 
##   Level 8:   264 nodes to be scored.
## 
##   Level 7:   240 nodes to be scored.
## 
##   Level 6:   255 nodes to be scored.
## 
##   Level 5:   196 nodes to be scored.
## 
##   Level 4:   109 nodes to be scored.
## 
##   Level 3:   41 nodes to be scored.
## 
##   Level 2:   14 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 359 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test : ratio 
## 
##   Level 16:  2 nodes to be scored.
## 
##   Level 15:  5 nodes to be scored.
## 
##   Level 14:  13 nodes to be scored.
## 
##   Level 13:  21 nodes to be scored.
## 
##   Level 12:  31 nodes to be scored.
## 
##   Level 11:  51 nodes to be scored.
## 
##   Level 10:  53 nodes to be scored.
## 
##   Level 9:   34 nodes to be scored.
## 
##   Level 8:   41 nodes to be scored.
## 
##   Level 7:   19 nodes to be scored.
## 
##   Level 6:   24 nodes to be scored.
## 
##   Level 5:   19 nodes to be scored.
## 
##   Level 4:   29 nodes to be scored.
## 
##   Level 3:   9 nodes to be scored.
## 
##   Level 2:   7 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
## 
## Building most specific GOs ..... ( 1818 GO terms found. )
## 
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 2645 GO terms found. )
## 
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 698 GO terms found. )
## 
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 1818 GO terms found. )
## 
## Build GO DAG topology .......... ( 2293 GO terms and 2965 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 2645 GO terms found. )
## 
## Build GO DAG topology .......... ( 4691 GO terms and 10896 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
## Building most specific GOs ..... ( 698 GO terms found. )
## 
## Build GO DAG topology .......... ( 915 GO terms and 1934 relations. )
## 
## Annotating nodes ............... ( 6298 genes annotated to the GO terms. )
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 492 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 1420 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 382 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 2293 nontrivial nodes
##       parameters: 
##           test statistic:  KS tests 
##           score order:  increasing 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 4691 nontrivial nodes
##       parameters: 
##           test statistic:  KS tests 
##           score order:  increasing 
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 915 nontrivial nodes
##       parameters: 
##           test statistic:  KS tests 
##           score order:  increasing 
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 2293 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
##           cutOff:  0.01 
##           score order:  increasing 
## 
##   Level 15:  2 nodes to be scored    (0 eliminated genes)
## 
##   Level 14:  8 nodes to be scored    (0 eliminated genes)
## 
##   Level 13:  7 nodes to be scored    (17 eliminated genes)
## 
##   Level 12:  25 nodes to be scored   (28 eliminated genes)
## 
##   Level 11:  45 nodes to be scored   (28 eliminated genes)
## 
##   Level 10:  90 nodes to be scored   (93 eliminated genes)
## 
##   Level 9:   145 nodes to be scored  (244 eliminated genes)
## 
##   Level 8:   295 nodes to be scored  (905 eliminated genes)
## 
##   Level 7:   432 nodes to be scored  (1318 eliminated genes)
## 
##   Level 6:   647 nodes to be scored  (1626 eliminated genes)
## 
##   Level 5:   348 nodes to be scored  (2168 eliminated genes)
## 
##   Level 4:   180 nodes to be scored  (3101 eliminated genes)
## 
##   Level 3:   55 nodes to be scored   (3256 eliminated genes)
## 
##   Level 2:   13 nodes to be scored   (4063 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (4510 eliminated genes)
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 4691 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
##           cutOff:  0.01 
##           score order:  increasing 
## 
##   Level 20:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 19:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 18:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 17:  4 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  27 nodes to be scored   (38 eliminated genes)
## 
##   Level 15:  83 nodes to be scored   (38 eliminated genes)
## 
##   Level 14:  161 nodes to be scored  (68 eliminated genes)
## 
##   Level 13:  240 nodes to be scored  (273 eliminated genes)
## 
##   Level 12:  358 nodes to be scored  (708 eliminated genes)
## 
##   Level 11:  473 nodes to be scored  (1295 eliminated genes)
## 
##   Level 10:  597 nodes to be scored  (1795 eliminated genes)
## 
##   Level 9:   653 nodes to be scored  (2330 eliminated genes)
## 
##   Level 8:   589 nodes to be scored  (2861 eliminated genes)
## 
##   Level 7:   528 nodes to be scored  (3869 eliminated genes)
## 
##   Level 6:   459 nodes to be scored  (4183 eliminated genes)
## 
##   Level 5:   302 nodes to be scored  (4594 eliminated genes)
## 
##   Level 4:   145 nodes to be scored  (4930 eliminated genes)
## 
##   Level 3:   50 nodes to be scored   (5044 eliminated genes)
## 
##   Level 2:   18 nodes to be scored   (5387 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (5506 eliminated genes)
## 
##           -- Elim Algorithm -- 
## 
##       the algorithm is scoring 915 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test 
##           cutOff:  0.01 
##           score order:  increasing 
## 
##   Level 18:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 17:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  6 nodes to be scored    (0 eliminated genes)
## 
##   Level 15:  27 nodes to be scored   (6 eliminated genes)
## 
##   Level 14:  59 nodes to be scored   (25 eliminated genes)
## 
##   Level 13:  87 nodes to be scored   (165 eliminated genes)
## 
##   Level 12:  95 nodes to be scored   (506 eliminated genes)
## 
##   Level 11:  125 nodes to be scored  (1052 eliminated genes)
## 
##   Level 10:  134 nodes to be scored  (1713 eliminated genes)
## 
##   Level 9:   55 nodes to be scored   (2422 eliminated genes)
## 
##   Level 8:   96 nodes to be scored   (2503 eliminated genes)
## 
##   Level 7:   47 nodes to be scored   (4344 eliminated genes)
## 
##   Level 6:   58 nodes to be scored   (4941 eliminated genes)
## 
##   Level 5:   39 nodes to be scored   (5376 eliminated genes)
## 
##   Level 4:   65 nodes to be scored   (5741 eliminated genes)
## 
##   Level 3:   11 nodes to be scored   (5765 eliminated genes)
## 
##   Level 2:   8 nodes to be scored    (5765 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (5792 eliminated genes)
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 492 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test : ratio 
## 
##   Level 14:  1 nodes to be scored.
## 
##   Level 13:  2 nodes to be scored.
## 
##   Level 12:  2 nodes to be scored.
## 
##   Level 11:  6 nodes to be scored.
## 
##   Level 10:  14 nodes to be scored.
## 
##   Level 9:   18 nodes to be scored.
## 
##   Level 8:   47 nodes to be scored.
## 
##   Level 7:   77 nodes to be scored.
## 
##   Level 6:   112 nodes to be scored.
## 
##   Level 5:   93 nodes to be scored.
## 
##   Level 4:   75 nodes to be scored.
## 
##   Level 3:   32 nodes to be scored.
## 
##   Level 2:   12 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 1420 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test : ratio 
## 
##   Level 15:  4 nodes to be scored.
## 
##   Level 14:  14 nodes to be scored.
## 
##   Level 13:  30 nodes to be scored.
## 
##   Level 12:  59 nodes to be scored.
## 
##   Level 11:  109 nodes to be scored.
## 
##   Level 10:  161 nodes to be scored.
## 
##   Level 9:   184 nodes to be scored.
## 
##   Level 8:   171 nodes to be scored.
## 
##   Level 7:   167 nodes to be scored.
## 
##   Level 6:   195 nodes to be scored.
## 
##   Level 5:   177 nodes to be scored.
## 
##   Level 4:   98 nodes to be scored.
## 
##   Level 3:   37 nodes to be scored.
## 
##   Level 2:   13 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
## 
##           -- Weight Algorithm -- 
## 
##       The algorithm is scoring 382 nontrivial nodes
##       parameters: 
##           test statistic:  Fisher test : ratio 
## 
##   Level 16:  4 nodes to be scored.
## 
##   Level 15:  7 nodes to be scored.
## 
##   Level 14:  16 nodes to be scored.
## 
##   Level 13:  31 nodes to be scored.
## 
##   Level 12:  29 nodes to be scored.
## 
##   Level 11:  56 nodes to be scored.
## 
##   Level 10:  59 nodes to be scored.
## 
##   Level 9:   37 nodes to be scored.
## 
##   Level 8:   41 nodes to be scored.
## 
##   Level 7:   18 nodes to be scored.
## 
##   Level 6:   26 nodes to be scored.
## 
##   Level 5:   17 nodes to be scored.
## 
##   Level 4:   23 nodes to be scored.
## 
##   Level 3:   10 nodes to be scored.
## 
##   Level 2:   7 nodes to be scored.
## 
##   Level 1:   1 nodes to be scored.
## 1/1: Starting gostats
something <- write_subset_ontologies(kept_ontology)
## Renaming excel/subset_go_up-mutvwt-2016040116.xlsx to excel/subset_go_up-mutvwt-2016040116.xlsx.01.
## Renaming excel/subset_go_down-mutvwt-2016040116.xlsx to excel/subset_go_down-mutvwt-2016040116.xlsx.01.
## NULL
## Warning: Overwriting existing cell data.
## The previous line was a warning about overwriting existing data because of a link.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
## NULL
## Warning: Overwriting existing cell data.
uorf_all <- read.csv("excel/Ingolia-all-uORFs.csv")
uorf_all$num <- 1
uorf_fun <- uorf_all[, c("Gene","num")]
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:graph':
## 
##     join
## The following object is masked from 'package:IRanges':
## 
##     desc
## The following object is masked from 'package:S4Vectors':
## 
##     rename
uorf_fun <- ddply(uorf_fun, 'Gene', summarize, total = sum(num))
combined_data <- combined[["data"]][["mutvwt"]]
uorf_vs_fc <- merge(combined_data, uorf_fun, by.x="row.names", by.y="Gene", all.x=TRUE)
uorf_vs_fc$total[is.na(uorf_vs_fc$total == 0)] <- 0
cor.test(x=uorf_vs_fc$limma_logfc, uorf_vs_fc$total)
## 
##  Pearson's product-moment correlation
## 
## data:  uorf_vs_fc$limma_logfc and uorf_vs_fc$total
## t = -1.3195, df = 6695, p-value = 0.1871
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.040059519  0.007829872
## sample estimates:
##         cor 
## -0.01612407
uorf_dist <- uorf_all[, c("Gene","uORF.end.to.CDS")]
colnames(uorf_dist) <- c("Gene","dist")
rownames(uorf_dist) <- make.names(uorf_dist$Gene, unique=TRUE)
uorfdist_vs_fc <- merge(combined_data, uorf_dist, by.x="row.names", by.y="row.names", all.x=TRUE)
uorfdist_vs_fc$dist[is.na(uorfdist_vs_fc$dist)] <- -1000
cor.test(uorfdist_vs_fc$dist, uorfdist_vs_fc$basic_logfc, na.rm=TRUE)
## 
##  Pearson's product-moment correlation
## 
## data:  uorfdist_vs_fc$dist and uorfdist_vs_fc$basic_logfc
## t = -2.4438, df = 6695, p-value = 0.01456
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.053765512 -0.005906341
## sample estimates:
##         cor 
## -0.02985304
tt <- uorfdist_vs_fc[, c("dist","basic_denmed")]
tt$basic_denmed <- log(tt$basic_denmed)
png(file="/tmp/test_uorf.png")
hpgl_linear_scatter(tt)$scatter + scale_x_continuous(limits=c(10,500))
## Setting binwidth to 6.312 in order to have 500 bins.
## Error in eval(expr, envir, enclos): could not find function "scale_x_continuous"
dev.off()
## X11cairo 
##        2
try_species = kegg_get_orgn("Saccharomyces cerevisiae")
pathview_data = all_pairwise$all_tables$wt_minus_mut
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
pathview_data = merge(pathview_data, crossref, by.x="row.names", by.y="transcript_name", all.x=TRUE)
## Error in merge(pathview_data, crossref, by.x = "row.names", by.y = "transcript_name", : object 'pathview_data' not found
rownames(pathview_data) = make.names(pathview_data$transcript_id, unique=TRUE)
## Error in make.names(pathview_data$transcript_id, unique = TRUE): object 'pathview_data' not found
## If I read the yeast xml files correctly, they all follow the traditional chromosomal location naming scheme...
pathview_result = hpgl_pathview(pathview_data, species=try_species, filenames="name")
## Error in hpgl_pathview(pathview_data, species = try_species, filenames = "name"): object 'pathview_data' not found

Some playing around to get genes from go categories.

load("GO2ALLEG.rda")
## The first entry is GO:0003674
go_genes = GO2ALLEG[['GO:0003674']]

subtraction = all_pairwise$all_tables$wt_minus_mut
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
genes_in_go = subtraction[rownames(subtraction) %in% go_genes,]
## Error in eval(expr, envir, enclos): object 'subtraction' not found
genes_in_go = subset(genes_in_go, logFC > 1)
## Error in subset(genes_in_go, logFC > 1): object 'genes_in_go' not found
head(genes_in_go)
## Error in head(genes_in_go): object 'genes_in_go' not found
dim(genes_in_go)
## Error in eval(expr, envir, enclos): object 'genes_in_go' not found

Copy/pasting the 2 mismatch material here and deleting the file

RNAseq of yeast with wt/mutant CBF5 2 mismatches and including NMD rnaseq data!

Reading data

cbf5_expt = create_expt(file="all_samples.csv")
## processed_data/count_tables/hpgl0564/hpgl0564_scerevisiae.count.gz contains 7131 rows.
## processed_data/count_tables/hpgl0565/hpgl0565_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0566/hpgl0566_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0567/hpgl0567_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0568/hpgl0568_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0569/hpgl0569_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0570/hpgl0570_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0571/hpgl0571_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/wt/wt_forward-trimmed_scerevisiae.count.gz contains 6697 rows and merges to 7131 rows.
## processed_data/count_tables/upf1/upf1_forward-trimmed_scerevisiae.count.gz contains 6697 rows and merges to 7131 rows.
## processed_data/count_tables/upf2/upf2_forward-trimmed_scerevisiae.count.gz contains 6697 rows and merges to 7131 rows.
## processed_data/count_tables/upf3/upf3_forward-trimmed_scereviaiae.count.gz contains 6697 rows and merges to 7131 rows.
## Error in `sampleNames<-`(`*tmp*`, value = c("AAC1", "AAC3", "AAD10", "AAD14", : number of new names (6696) should equal number of rows in AnnotatedDataFrame (7131)
merged_expt = create_expt(file="all_samples_merged.csv")
## processed_data/count_tables/hpgl0564/hpgl0564_scerevisiae.count.gz contains 7131 rows.
## processed_data/count_tables/hpgl0565/hpgl0565_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0566/hpgl0566_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0567/hpgl0567_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0568/hpgl0568_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0569/hpgl0569_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0570/hpgl0570_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/hpgl0571/hpgl0571_scerevisiae.count.gz contains 7131 rows and merges to 7131 rows.
## processed_data/count_tables/wt/wt_forward-trimmed_scerevisiae.count.gz contains 6697 rows and merges to 7131 rows.
## processed_data/count_tables/upf1/upf1_forward-trimmed_scerevisiae.count.gz contains 6697 rows and merges to 7131 rows.
## processed_data/count_tables/upf2/upf2_forward-trimmed_scerevisiae.count.gz contains 6697 rows and merges to 7131 rows.
## processed_data/count_tables/upf3/upf3_forward-trimmed_scereviaiae.count.gz contains 6697 rows and merges to 7131 rows.
## Error in `sampleNames<-`(`*tmp*`, value = c("AAC1", "AAC3", "AAD10", "AAD14", : number of new names (6696) should equal number of rows in AnnotatedDataFrame (7131)

Normalizing and exploring data

There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper 'graph_metrics()' but that takes away the chance to explain the graphs as I generate them.

full_design = cbf5_expt$definitions
## First make a bar plot of the library sizes in the experiment.
## Notice that the colors were auto-chosen by create_expt() and they should
## be maintained throughout this process
fis_libsize = hpgl_libsize(cbf5_expt)
fis_libsize
## Here we see that the wild type replicate 3 sample for 15 minutes has fewer non-zero genes than all its friends.
fis_nonzero = hpgl_nonzero(cbf5_expt, title="nonzero vs. cpm")
fis_nonzero

An initial pca plot

In most cases, raw data does not cluster very well, lets see if that is also true for the cbf5 experiment. Assuming it doesn't, lets normalize the data using the defaults (cpm, quantile, log2) and try again.

## Unsurprisingly, the raw data doesn't cluster well at all...
##fis_rawpca = hpgl_pca(expt=cbf5_expt, labels=cbf5_expt$condition)
##fis_rawpca = hpgl_pca(expt=cbf5_expt, labels="fancy")
fis_rawpca = hpgl_pca(cbf5_expt, title="Fun!")
## [1] "There is just one batch in this data."
fis_rawpca$plot
## So, normalize the data
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(quant(cpm(low-filter(data))))
## It saves the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept in the slot called:
##  'new_expt$normalized$normalized_counts$libsize' which is copied into
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Did not recognize the filtering argument, defaulting to cbcb's.
##  Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'
normed = graph_metrics(norm_expt)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Graphing a PCA plot.
## [1] "There is just one batch in this data."
## Plotting a density plot.
normed$pcaplot
norm_expt_svaseq = normalize_expt(cbf5_expt, transform="raw", norm="raw", convert="cpm", filter_low=TRUE, batch="svaseq")
## This function will replace the expt$expressionset slot with:
## cpm(low-filter(batch-correct(data)))
## It saves the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept in the slot called:
##  'new_expt$normalized$normalized_counts$libsize' which is copied into
##  new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
##  some metrics are easier to see when the data is log2 transformed, but
##  EdgeR/DESeq don't like transformed data.
## Leaving the data unnormalized.  This is necessary for DESeq, but
##  EdgeR/limma might benefit from normalization.  Good choices include quantile,
##  size-factor, tmm, etc.
## Did not recognize the filtering argument, defaulting to cbcb's.
##  Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'
## batch_counts: Before batch correction, 3725 entries 0
## batch_counts: Using sva::svaseq for batch correction.
## Note to self:  If you feed svaseq a data frame you will get an error like:
## data %*% (Id - mod %*% blah blah requires numeric/complex arguments.
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
## The number of elements which are < 0 after batch correction is: 30
hpgl_pca(norm_expt_svaseq)$plot
## [1] "There is just one batch in this data."
norm_expt_sva = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
## This function will replace the expt$expressionset slot with:
## log2(quant(cpm(low-filter(batch-correct(data)))))
## It saves the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept in the slot called:
##  'new_expt$normalized$normalized_counts$libsize' which is copied into
##  new_expt$best_libsize
## Did not recognize the filtering argument, defaulting to cbcb's.
##  Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'
## batch_counts: Before batch correction, 3768 entries 0
## batch_counts: Using sva::fsva for batch correction.
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
## The number of elements which are < 0 after batch correction is: 1
## transform_counts: Found 1 values equal to 0, adding 0.5
##  to the matrix.
hpgl_pca(norm_expt_sva)$plot
## [1] "There is just one batch in this data."
## And try the pca again
fis_normpca = hpgl_pca(norm_expt, labels=norm_expt$condition, title="normalized pca")
## [1] "There is just one batch in this data."
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
## This function will replace the expt$expressionset slot with:
## log2(quant(cpm(low-filter(batch-correct(data)))))
## It saves the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept in the slot called:
##  'new_expt$normalized$normalized_counts$libsize' which is copied into
##  new_expt$best_libsize
## Did not recognize the filtering argument, defaulting to cbcb's.
##  Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'
## batch_counts: Before batch correction, 3768 entries 0
## batch_counts: Using sva::fsva for batch correction.
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
## The number of elements which are < 0 after batch correction is: 1
## transform_counts: Found 1 values equal to 0, adding 0.5
##  to the matrix.
norm_merged = normalize_expt(merged_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
## Error in normalize_expt(merged_expt, transform = "log2", norm = "quant", : object 'merged_expt' not found
## And try the pca again
fis_normpca = hpgl_pca(norm_expt, labels=norm_expt$condition, title="normalized pca")
## [1] "There is just one batch in this data."
mer_normpca = hpgl_pca(norm_merged, labels=norm_merged$condition, title="normalized pca")
## Error in hpgl_pca(norm_merged, labels = norm_merged$condition, title = "normalized pca"): object 'norm_merged' not found
mer_normpca$plot
## Error in eval(expr, envir, enclos): object 'mer_normpca' not found
tiff(file="plots/norm_pca.tiff")
mer_normpca$plot
## Error in eval(expr, envir, enclos): object 'mer_normpca' not found
dev.off()
## X11cairo 
##        2
fis_normpca$res
##   propVar cumPropVar cond.R2
## 1   84.41      84.41   96.21
## 2    6.15      90.56    3.54
## 3    3.36      93.92    0.09
## 4    2.68      96.60    0.14
## 5    1.74      98.34    0.01
## 6    1.65      99.99    0.00
## 7    0.01     100.00    0.00
fis_normpca$variance
## [1] 84.41  6.15  3.36  2.68  1.74  1.65  0.01
tiff(file="plots/merged_pca.tiff")
mer_normpca$plot
## Error in eval(expr, envir, enclos): object 'mer_normpca' not found
dev.off()
## X11cairo 
##        2
funkytown = pca_information(norm_merged, plot_pcas=TRUE)
## Error in pca_information(norm_merged, plot_pcas = TRUE): object 'norm_merged' not found
funkytown$pca_plots$PC1_PC2$plot
fun = graph_metrics(expt=norm_expt)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Graphing a PCA plot.
## [1] "There is just one batch in this data."
## Plotting a density plot.
##funkytown = pca_information(expt=norm_expt, plot_pcas=TRUE)

Look at the data distributions

hpgl_boxplot(norm_expt)
hpgl_density(norm_expt)
hpgl_disheat(norm_expt)
hpgl_corheat(norm_expt)
hpgl_smc(norm_expt)
hpgl_qq_all(norm_expt)
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## Warning in log(sorted_x): NaNs produced
## Warning in log(sorted_y): NaNs produced
## $logs
## 
## $ratios
## 
## $medians
## $medians[[1]]
## [1] 0.004694
## 
## $medians[[2]]
## [1] 0.00007572
## 
## $medians[[3]]
## [1] 0.003637
## 
## $medians[[4]]
## [1] -0.0004534
## 
## $medians[[5]]
## [1] -0.0001983
## 
## $medians[[6]]
## [1] -0.000378
## 
## $medians[[7]]
## [1] -0.000292
## 
## $medians[[8]]
## [1] -0.0004834
hpgl_boxplot(norm_merged)
## Error in hpgl_boxplot(norm_merged): object 'norm_merged' not found
hpgl_density(norm_merged)
## Error in hpgl_density(norm_merged): object 'norm_merged' not found
hpgl_disheat(norm_merged)
## Error in hpgl_heatmap(data, colors = colors, design = design, method = method, : object 'norm_merged' not found
hpgl_corheat(norm_merged)
## Error in hpgl_heatmap(data, colors = colors, design = design, method = method, : object 'norm_merged' not found
hpgl_smc(norm_merged)
## Error in hpgl_smc(norm_merged): object 'norm_merged' not found
hpgl_qq_all(norm_merged)
## Error in hpgl_qq_all(norm_merged): object 'norm_merged' not found

Some simple differential expression analyses

##limma_expt = normalize_expt(expt=cbf5_expt, norm="quant")
cbf5_quant = normalize_expt(cbf5_expt, norm="quant", convert="cpm", filter_low=TRUE)
## This function will replace the expt$expressionset slot with:
## quant(cpm(low-filter(data)))
## It saves the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept in the slot called:
##  'new_expt$normalized$normalized_counts$libsize' which is copied into
##  new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
##  some metrics are easier to see when the data is log2 transformed, but
##  EdgeR/DESeq don't like transformed data.
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Did not recognize the filtering argument, defaulting to cbcb's.
##  Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'
all_pairwise = limma_pairwise(norm_merged, batches=NULL, model_batch=FALSE)
## Starting limma pairwise comparison.
## Error in limma_pairwise(norm_merged, batches = NULL, model_batch = FALSE): object 'norm_merged' not found
##all_pairwise = edger_pairwise(expt=cbf5_expt, batches=NULL, model_batch=FALSE)
## I have the following elements in this list:
### conditions_table     :  how many replicates are there of each condition
### batches_table        :  how many replicates are there of each batch
### conditions           :  a factor of all the conditions
### batches              :  a factor of all the batches
### model                :  the model of the data used for voom etc.
### fit                  :  the result of lmfit()
### voom_result          :  the result of voom()
### voom_design          :  the design matrix fed to voom()
### identities           :  the strings fed to makeContrasts() which describe each sample alone
### all_pairwise         :  the strings describing all the subtractions fed to makeContrasts()
### contrast_string      :  the entire string fed to makeContrasts() including the design etc.
### pairwise_fits        :  the result of contrasts.fit()
### pairwise_comparisons :  the result from eBayes()
### limma_result         :  a list of toptable()s corresponding to each pairwise comparison

names(all_pairwise$all_tables)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
summary(all_pairwise$all_tables$wt_minus_mut)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
awesome = coefficient_scatter(all_pairwise)
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
awesome$scatter
## Error in eval(expr, envir, enclos): object 'awesome' not found
fun = all_pairwise(cbf5_expt, batches=NULL, model_batch=FALSE)
## Starting limma pairwise comparison.
## libsize was not specified, this parameter has profound effects on limma's result.
## Using the libsize from expt$normalized$normalized_counts.
## Limma step 1/6: choosing model.
## Limma step 2/6: running voom
## The voom input was not cpm, converting now.
## The voom input was not log2, transforming now.
## limma step 3/6: running lmFit
## Limma step 4/6: making and fitting contrasts.
## As a reference, the identity is: mut = mut,
## As a reference, the identity is: wt = wt,
## Limma step 5/6: Running eBayes and topTable.
## Limma step 6/6: Writing limma outputs.
## limma step 6/6: 1/3: Printing table: mut.
## limma step 6/6: 2/3: Printing table: wt.
## limma step 6/6: 3/3: Printing table: wt_vs_mut.
## Starting DESeq2 pairwise comparisons.
## DESeq2 step 1/5: Including only condition in the deseq model.
## DESeq2 step 2/5: Estimate size factors.
## DESeq2 step 3/5: estimate Dispersions.
## DESeq2 step 4/5: nbinomWaldTest.
## DESeq2 step 5/5: 1/1: Printing table: wt_vs_mut
## Collected coefficients for: mut
## Collected coefficients for: wt
## Starting edgeR pairwise comparisons.
## EdgeR step 1/9: normalizing data.
## EdgeR step 2/9: Estimating the common dispersion.
## EdgeR step 3/9: Estimating dispersion across genes.
## EdgeR step 4/9: Estimating GLM Common dispersion.
## EdgeR step 5/9: Estimating GLM Trended dispersion.
## EdgeR step 6/9: Estimating GLM Tagged dispersion.
## EdgeR step 7/9: Running glmFit.
## EdgeR step 8/9: Making pairwise contrasts.
## As a reference, the identity is: mut = mut,
## As a reference, the identity is: wt = wt,
## EdgeR step 9/9: 1/1: Printing table: wt_vs_mut.
## Starting basic pairwise comparison.
## Basic step 1/3: Creating median and variance tables.
## Basic step 2/3: Performing comparisons.
## Basic step 2/3: 1/1: Performing log2 subtraction: wt_vs_mut
## Basic step 3/3: Creating faux DE Tables.
## Basic: Returning tables.
## 1/1: Comparing analyses: wt_vs_mut
fun$comparison$comp
##    wt_vs_mut
## le 0.9732847
## ld 0.9847753
## ed 0.9509408
## lb 0.9772878
## eb 0.9565188
## db 0.9716832
## limma, edgeR, and DESeq2 all agree very much for this data.
names(all_pairwise$all_tables)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
summary(all_pairwise$all_tables$wt_minus_mut)
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
##logfc = limma_scatter(all_pairwise, first_table=3, second_column="logFC", second_table=3, first_column="AveExpr")
##logfc$scatter

wt_mut_scatter = coefficient_scatter(all_pairwise, x="wt", y="mut")
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
wt_mut_scatter$scatter
## Error in eval(expr, envir, enclos): object 'wt_mut_scatter' not found
wt_mut_scatter$both_histogram
## Error in eval(expr, envir, enclos): object 'wt_mut_scatter' not found
wt_nmd = coefficient_scatter(all_pairwise, x="wt", y="nmd")
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
wt_nmd$scatter
## Error in eval(expr, envir, enclos): object 'wt_nmd' not found
wt_nmd$both_histogram
## Error in eval(expr, envir, enclos): object 'wt_nmd' not found
cbf_nmd = coefficient_scatter(all_pairwise, x="wt_minus_mut", y="wt_minus_nmd")
## Error in eval(expr, envir, enclos): could not find function "coefficient_scatter"
cbf_nmd$scatter
## Error in eval(expr, envir, enclos): object 'cbf_nmd' not found
cbf_nmd$both_histogram
## Error in eval(expr, envir, enclos): object 'cbf_nmd' not found
cbf_nmd$correlation
## Error in eval(expr, envir, enclos): object 'cbf_nmd' not found
hpgl_cor(cbf_nmd$df, method="robust")
## Error in na.action(data): object 'cbf_nmd' not found
## I think some of the differences in these sample types is being squashed by sva.

Quick go attempt

I have 4 ontology tools we can try applying to this as well as the KEGG pathways.

##crazytown = limma_ontology(all_pairwise, gene_lengths=lengths, goids=goids) ## This works now, but takes forever...
crazytown = limma_ontology(all_pairwise, gene_lengths=lengths, goids=goids, do_topgo=FALSE, do_trees=FALSE, do_cluster=FALSE)
## Error in eval(expr, envir, enclos): could not find function "limma_ontology"
##try_species = kegg_get_orgn("Saccharomyces cerevisiae")
##crazytown = hpgl_pathview(all_pairwise$limma_result$wt_minus_mut, species=try_species)
expression_table = all_pairwise$all_tables$wt_minus_mut
## Error in all_pairwise$all_tables: object of type 'closure' is not subsettable
yeast_annotations = import.gff2("reference/scerevisiae.gff.gz")
## Error in eval(expr, envir, enclos): could not find function "import.gff2"
annotation_info = as.data.frame(yeast_annotations)
## Error in as.data.frame(yeast_annotations): object 'yeast_annotations' not found
rownames(annotation_info) = make.names(annotation_info$transcript_name, unique=TRUE)
annotation_plus_data = merge(annotation_info, expression_table, by.x="row.names", by.y="row.names")
## Error in merge(annotation_info, expression_table, by.x = "row.names", : object 'expression_table' not found
rownames(annotation_plus_data) = annotation_plus_data$Row.names
## Error in eval(expr, envir, enclos): object 'annotation_plus_data' not found
annotation_plus_data = annotation_plus_data[,-1]
## Error in eval(expr, envir, enclos): object 'annotation_plus_data' not found
gene_lengths = annotation_plus_data[,c("width","logFC")]
## Error in eval(expr, envir, enclos): object 'annotation_plus_data' not found
gene_lengths$ID = rownames(gene_lengths)
## Error in rownames(gene_lengths): object 'gene_lengths' not found
## I did the following to pull out the information I care about from the SGD gene association tale:
## zcat gene_association.sgd.gz | grep -v "^\!" | awk -F ' ' '{printf("%s,%s,%s\n", $3, $5, $11)}' | cut -d '|' -f1 > go_trimmed.csv
## The resulting csv file is in reference/go_trimmed.csv
go_annotation = read.csv("reference/go_trimmed.csv.gz")
colnames(go_annotation) = c("Name","GO","ID")

## mut - wt, so higher means more in cbf5 mutant.
de_genes_minus = subset(expression_table, logFC <= -1.5)
## Error in subset(expression_table, logFC <= -1.5): object 'expression_table' not found
de_genes_plus = subset(expression_table, logFC >= 1.5)
## Error in subset(expression_table, logFC >= 1.5): object 'expression_table' not found
fun_minus = simple_goseq(de_genes=de_genes_minus, lengths=gene_lengths, goids=go_annotation)
## simple_goseq() makes some pretty hard assumptions about the data it is fed:
## It requires 2 tables, one of GOids which must have columns (gene)ID and GO(category)
## The other table is of gene lengths with columns (gene)ID and (gene)width.
## Other columns are fine, but ignored.
## Error in simple_goseq(de_genes = de_genes_minus, lengths = gene_lengths, : object 'gene_lengths' not found
tiff(file="plots/increased_mf_mutant.tiff", res=90)
fun_minus$mfp_plot
## Error in eval(expr, envir, enclos): object 'fun_minus' not found
dev.off()
## X11cairo 
##        2
tiff(file="plots/increased_bp_mutant.tiff", res=90)
fun_minus$bpp_plot
## Error in eval(expr, envir, enclos): object 'fun_minus' not found
dev.off()
## X11cairo 
##        2
fun_minus$bp_interesting
## Error in eval(expr, envir, enclos): object 'fun_minus' not found
fun_plus = simple_goseq(de_genes_plus, lengths=gene_lengths, goids=go_annotation)
## simple_goseq() makes some pretty hard assumptions about the data it is fed:
## It requires 2 tables, one of GOids which must have columns (gene)ID and GO(category)
## The other table is of gene lengths with columns (gene)ID and (gene)width.
## Other columns are fine, but ignored.
## Error in simple_goseq(de_genes_plus, lengths = gene_lengths, goids = go_annotation): object 'gene_lengths' not found
## Oh wow, that is pretty cool, mess up cbf5 and RNA polI increases, that makes sense!
## The up-regulated sets have crap q-values.
tiff(file="plots/decreased_mf_mutant.tiff", res=90)
fun_plus$mfp_plot
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
dev.off()
## X11cairo 
##        2
tiff(file="plots/decreased_bp_mutant.tiff", res=90)
fun_plus$bpp_plot
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
dev.off()
## X11cairo 
##        2
fun_plus$mf_interesting
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
fun_plus$bp_interesting
## Error in eval(expr, envir, enclos): object 'fun_plus' not found
high_prfdb = read.csv("processed_data/high.csv", header=FALSE)
colnames(high_prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted")
low_prfdb = read.csv("processed_data/low.csv", header=FALSE)
colnames(low_prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted")
rownames(high_prfdb) = make.names(high_prfdb$ID, unique=TRUE)
rownames(low_prfdb) = make.names(low_prfdb$ID, unique=TRUE)
high_prfdb$value = "high"
low_prfdb$value = "low"

prfdb = read.csv("processed_data/hits.csv", header=FALSE)
colnames(prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted","value")
rownames(prfdb) = make.names(prfdb$ID, unique=TRUE)
test = merge(prfdb, df, by.x="row.names", by.y="row.names")
test$delta = test$wt - test$mut
ggplot(test, aes(x=wt, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
ggplot(test, aes(x=mut, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
ggplot(test, aes(x=delta, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
summary(test)
thigh = test[as.character(test$value) == 'high',]

low = low_prfdb[,c("ID","mfe","value")]
high = high_prfdb[,c("ID","mfe","value")]
mfe_dist = rbind(low, high)
mfe_dist = mfe_dist[,c("mfe","value")]
cdf = ddply(mfe_dist, "value", summarise, rating.mean=mean(mfe, na.rm=TRUE))
multi = ggplot(mfe_dist, aes(x=mfe, fill=value)) +
    geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") +
    geom_density(alpha=0.5) +
    geom_vline(data=cdf, aes(xintercept=rating.mean, colour=value), linetype="dashed") +
    theme_bw()


test = merge(mfe_dist, df, by.x="row.names", by.y="row.names")
multi = ggplot(test, aes(x=wt, y=mut, color=value)) +
    geom_point()
multi
high = subset(test, value=="high")
low = subset(test, value=="low")
summary(high$wt)
summary(low$wt)
summary(high$mut)
summary(low$mut)
low$sub = low$wt - low$mut
high$sub = high$wt - high$mut

summary(crazytown)
summary(crazytown$wt_minus_mut$up_goseq)
crazytown$wt_minus_mut$up_goseq$mfp_plot
head(crazytown$wt_minus_mut$up_goseq$mf_subset)
subtraction = all_pairwise$all_tables$wt_minus_mut

load("GO2ALLEG.rda")
## The first entry is GO:0003674
go_genes = GO2ALLEG[['GO:0003674']]

genes_in_go = subtraction[rownames(subtraction) %in% go_genes,]
genes_in_go = subset(genes_in_go, logFC > 2)
head(genes_in_go)

de_genes_plus = subset(subtraction, logFC >= 1.5)
fun = simple_goseq(de_genes_plus, lengths=gene_lengths, goids=goids)


new_mf_table = hpgltools:::gather_genes(fun, ont='MF')
new_bp_table = hpgltools:::gather_genes(fun, ont='BP')
new_bp_table