RNAseq of yeast with wt/mutant CBF5

# Time-stamp: 
## This block is used to render pdf/html reports.
## There are a few different output formats available, a pdf and 2 html versions.
## I run the reports simply by hitting control-c control-n on the line I want to generate.

library(hpgltools)
autoloads_all()
render("rnaseq_cbf5.rmd", output_format="pdf_document")
render("rnaseq_cbf5.rmd", output_format="html_document")
render("rnaseq_cbf5.rmd", 'knitrBootstrap::bootstrap_document')

Setting up

Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the data.

## These first 4 lines are not needed once hpgltools is installed.
source("http://bioconductor.org/biocLite.R")
biocLite("devtools")
library(devtools)
install_github("elsayed-lab/hpgltools")
library(hpgltools)
autoloads_all()

Reading data

cbf5_expt = create_expt(file="all_samples.csv", suffix="_forward-trimmed-v0M1.count.gz", by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

Normalizing and exploring data

There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper 'graph_metrics()' but that takes away the chance to explain the graphs as I generate them.

full_design = cbf5_expt$definitions
## First make a bar plot of the library sizes in the experiment.
## Notice that the colors were auto-chosen by create_expt() and they should
## be maintained throughout this process
fis_libsize = hpgl_libsize(expt=cbf5_expt)

## Adding log10

fis_libsize

## Here we see that the wild type replicate 3 sample for 15 minutes has fewer non-zero genes than all its friends.
fis_nonzero = hpgl_nonzero(expt=cbf5_expt, labels="boring", title="nonzero vs. cpm")
fis_nonzero

An initial pca plot

In most cases, raw data does not cluster very well, lets see if that is also true for the cbf5 experiment. Assuming it doesn't, lets normalize the data using the defaults (cpm, quantile, log2) and try again.

## Unsurprisingly, the raw data doesn't cluster well at all...
fis_rawpca = hpgl_pca(expt=cbf5_expt, labels=cbf5_expt$condition)
fis_rawpca$plot

## So, normalize the data
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE)

## [1] "This function will replace the expt$expressionset slot with the log2(quant(cpm))'d data."
## [1] "It saves the current data into a slot named: expt$backup_expressionset"
## [1] "It will also save copies of each step along the way in expt$normalized with the corresponding libsizes."
## [1] "Keep the libsizes in mind when invoking limma.  The appropriate libsize is the non-log(cpm(normalized))."
## [1] "Did not recognize the filtering argument, defaulting to cbcb's."
## [1] "Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'"

## And try the pca again
fis_normpca = hpgl_pca(expt=norm_expt, labels=norm_expt$condition, title="normalized pca")
fis_normpca$plot

fis_normpca$res

##   propVar cumPropVar cond.R2
## 1   63.44      63.44   93.79
## 2   15.78      79.22    3.68
## 3    5.89      85.11    2.16
## 4    4.88      89.99    0.29
## 5    3.56      93.55    0.03
## 6    3.34      96.89    0.05
## 7    3.11     100.00    0.00

fis_normpca$variance

## [1] 63.44 15.78  5.89  4.88  3.56  3.34  3.11

funkytown = pca_information(expt=norm_expt, plot_pcas=TRUE)

## The more shallow the curves in these plots, the more genes responsible for this principle component.

## The standard deviation was 0 for batch and PC1.
## The standard deviation was 0 for batch and PC2.

Look at the data distributions

hpgl_boxplot(expt=norm_expt)

hpgl_density_plot(expt=norm_expt)

## This plot looks neat if you do position='fill' or position='stack'

hpgl_disheat(expt=norm_expt)

hpgl_corheat(expt=norm_expt)

hpgl_smc(expt=norm_expt)

hpgl_qq_all(expt=norm_expt)

## $logs
## 
## $ratios
## 
## $medians
## $medians[[1]]
## [1] 0.1826
## 
## $medians[[2]]
## [1] 0.08932
## 
## $medians[[3]]
## [1] 0.000839
## 
## $medians[[4]]
## [1] 0.1826
## 
## $medians[[5]]
## [1] 0.1825
## 
## $medians[[6]]
## [1] 0.0007897
## 
## $medians[[7]]
## [1] 0.1826
## 
## $medians[[8]]
## [1] 0.00082

Some simple differential expression analyses

##limma_expt = normalize_expt(expt=cbf5_expt, norm="quant")
all_pairwise = limma_pairwise(expt=cbf5_expt, batches=NULL, model_batch=FALSE)

## libsize was not specified, this parameter has profound effects on limma's result.
## Using the libsize from expt$norm_libsize.

## [1] "Did not recognize the filtering argument, defaulting to cbcb's."
## [1] "Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'"

## As I understand it, the ideal input for voom is non-logged, non-converted, normalized data.
## This data is: FALSE, FALSE, and quant.
## The voom input was not cpm, converting now.
## The voom input was not log2, transforming now.

## [1] "As a reference, the identity is: mut = mut,"
## [1] "As a reference, the identity is: wt = wt,"

## Printing table: mut
## Printing table: wt
## Printing table: wt_minus_mut

##all_pairwise = edger_pairwise(expt=cbf5_expt, batches=NULL, model_batch=FALSE)
## I have the following elements in this list:
### conditions_table     :  how many replicates are there of each condition
### batches_table        :  how many replicates are there of each batch
### conditions           :  a factor of all the conditions
### batches              :  a factor of all the batches
### model                :  the model of the data used for voom etc.
### fit                  :  the result of lmfit()
### voom_result          :  the result of voom()
### voom_design          :  the design matrix fed to voom()
### identities           :  the strings fed to makeContrasts() which describe each sample alone
### all_pairwise         :  the strings describing all the subtractions fed to makeContrasts()
### contrast_string      :  the entire string fed to makeContrasts() including the design etc.
### pairwise_fits        :  the result of contrasts.fit()
### pairwise_comparisons :  the result from eBayes()
### limma_result         :  a list of toptable()s corresponding to each pairwise comparison

names(all_pairwise$limma_result)

## [1] "mut"          "wt"           "wt_minus_mut"

summary(all_pairwise$limma_result$wt_minus_mut)

##      logFC            AveExpr             t              P.Value      
##  Min.   :-4.0399   Min.   :-3.272   Min.   :-6.3671   Min.   :0.0000  
##  1st Qu.:-0.5606   1st Qu.: 4.147   1st Qu.:-0.9310   1st Qu.:0.2177  
##  Median :-0.1743   Median : 5.478   Median :-0.2917   Median :0.4842  
##  Mean   :-0.1429   Mean   : 5.226   Mean   :-0.2367   Mean   :0.4809  
##  3rd Qu.: 0.2401   3rd Qu.: 6.757   3rd Qu.: 0.3962   3rd Qu.:0.7373  
##  Max.   : 6.1797   Max.   :14.998   Max.   : 9.5995   Max.   :0.9997  
##    adj.P.Val            B              qvalue      
##  Min.   :0.0000   Min.   :-6.061   Min.   :0.0000  
##  1st Qu.:0.8675   1st Qu.:-5.963   1st Qu.:0.8208  
##  Median :0.9616   Median :-5.765   Median :0.9098  
##  Mean   :0.8868   Mean   :-5.387   Mean   :0.8390  
##  3rd Qu.:0.9820   3rd Qu.:-5.287   3rd Qu.:0.9291  
##  Max.   :0.9997   Max.   :37.467   Max.   :0.9459

logfc = limma_scatter(all_pairwise)

## No binwidth nor bins provided, setting it to 0.0384533840078004 in order to have 500 bins.

logfc$scatter

relative_hist = logfc$both_histogram$plot
relative_hist + scale_y_continuous(limits=c(0,0.25))

Quick go attempt

I have 4 ontology tools we can try applying to this as well as the KEGG pathways.

##crazytown = limma_ontology(all_pairwise)
##try_species = kegg_get_orgn("Saccharomyces cerevisiae")
##crazytown = hpgl_pathview(all_pairwise$limma_result$wt_minus_mut, species=try_species)