# Time-stamp:
## This block is used to render pdf/html reports.
## There are a few different output formats available, a pdf and 2 html versions.
## I run the reports simply by hitting control-c control-n on the line I want to generate.
library(hpgltools)
autoloads_all()
render("rnaseq_cbf5.rmd", output_format="pdf_document")
render("rnaseq_cbf5.rmd", output_format="html_document")
render("rnaseq_cbf5.rmd", 'knitrBootstrap::bootstrap_document')
Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the data.
## These first 4 lines are not needed once hpgltools is installed.
source("http://bioconductor.org/biocLite.R")
biocLite("devtools")
library(devtools)
install_github("elsayed-lab/hpgltools")
library(hpgltools)
autoloads_all()
cbf5_expt = create_expt(file="all_samples.csv", suffix="_forward-trimmed-v0M1.count.gz", by_sample=TRUE)
## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."
There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper 'graph_metrics()' but that takes away the chance to explain the graphs as I generate them.
full_design = cbf5_expt$definitions
## First make a bar plot of the library sizes in the experiment.
## Notice that the colors were auto-chosen by create_expt() and they should
## be maintained throughout this process
fis_libsize = hpgl_libsize(expt=cbf5_expt)
fis_libsize
## Here we see that the wild type replicate 3 sample for 15 minutes has fewer non-zero genes than all its friends.
fis_nonzero = hpgl_nonzero(expt=cbf5_expt, labels="boring", title="nonzero vs. cpm")
fis_nonzero
In most cases, raw data does not cluster very well, lets see if that is also true for the cbf5 experiment. Assuming it doesn't, lets normalize the data using the defaults (cpm, quantile, log2) and try again.
## Unsurprisingly, the raw data doesn't cluster well at all...
fis_rawpca = hpgl_pca(expt=cbf5_expt, labels=cbf5_expt$condition)
fis_rawpca$plot
## So, normalize the data
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE)
## [1] "This function will replace the expt$expressionset slot with the log2(quant(cpm))'d data."
## [1] "It saves the current data into a slot named: expt$backup_expressionset"
## [1] "It will also save copies of each step along the way in expt$normalized with the corresponding libsizes."
## [1] "Keep the libsizes in mind when invoking limma. The appropriate libsize is the non-log(cpm(normalized))."
## [1] "Did not recognize the filtering argument, defaulting to cbcb's."
## [1] "Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'"
## And try the pca again
fis_normpca = hpgl_pca(expt=norm_expt, labels=norm_expt$condition, title="normalized pca")
fis_normpca$plot
fis_normpca$res
## propVar cumPropVar cond.R2
## 1 63.44 63.44 93.79
## 2 15.78 79.22 3.68
## 3 5.89 85.11 2.16
## 4 4.88 89.99 0.29
## 5 3.56 93.55 0.03
## 6 3.34 96.89 0.05
## 7 3.11 100.00 0.00
fis_normpca$variance
## [1] 63.44 15.78 5.89 4.88 3.56 3.34 3.11
funkytown = pca_information(expt=norm_expt, plot_pcas=TRUE)
hpgl_boxplot(expt=norm_expt)
hpgl_density_plot(expt=norm_expt)
hpgl_disheat(expt=norm_expt)
hpgl_corheat(expt=norm_expt)
hpgl_smc(expt=norm_expt)
hpgl_qq_all(expt=norm_expt)
## $logs
##
## $ratios
##
## $medians
## $medians[[1]]
## [1] 0.1826
##
## $medians[[2]]
## [1] 0.08932
##
## $medians[[3]]
## [1] 0.000839
##
## $medians[[4]]
## [1] 0.1826
##
## $medians[[5]]
## [1] 0.1825
##
## $medians[[6]]
## [1] 0.0007897
##
## $medians[[7]]
## [1] 0.1826
##
## $medians[[8]]
## [1] 0.00082
##limma_expt = normalize_expt(expt=cbf5_expt, norm="quant")
all_pairwise = limma_pairwise(expt=cbf5_expt, batches=NULL, model_batch=FALSE)
## [1] "Did not recognize the filtering argument, defaulting to cbcb's."
## [1] "Recognized filters are: 'cv', 'kofa', 'pofa', 'cbcb'"
## [1] "As a reference, the identity is: mut = mut,"
## [1] "As a reference, the identity is: wt = wt,"
##all_pairwise = edger_pairwise(expt=cbf5_expt, batches=NULL, model_batch=FALSE)
## I have the following elements in this list:
### conditions_table : how many replicates are there of each condition
### batches_table : how many replicates are there of each batch
### conditions : a factor of all the conditions
### batches : a factor of all the batches
### model : the model of the data used for voom etc.
### fit : the result of lmfit()
### voom_result : the result of voom()
### voom_design : the design matrix fed to voom()
### identities : the strings fed to makeContrasts() which describe each sample alone
### all_pairwise : the strings describing all the subtractions fed to makeContrasts()
### contrast_string : the entire string fed to makeContrasts() including the design etc.
### pairwise_fits : the result of contrasts.fit()
### pairwise_comparisons : the result from eBayes()
### limma_result : a list of toptable()s corresponding to each pairwise comparison
names(all_pairwise$limma_result)
## [1] "mut" "wt" "wt_minus_mut"
summary(all_pairwise$limma_result$wt_minus_mut)
## logFC AveExpr t P.Value
## Min. :-4.0399 Min. :-3.272 Min. :-6.3671 Min. :0.0000
## 1st Qu.:-0.5606 1st Qu.: 4.147 1st Qu.:-0.9310 1st Qu.:0.2177
## Median :-0.1743 Median : 5.478 Median :-0.2917 Median :0.4842
## Mean :-0.1429 Mean : 5.226 Mean :-0.2367 Mean :0.4809
## 3rd Qu.: 0.2401 3rd Qu.: 6.757 3rd Qu.: 0.3962 3rd Qu.:0.7373
## Max. : 6.1797 Max. :14.998 Max. : 9.5995 Max. :0.9997
## adj.P.Val B qvalue
## Min. :0.0000 Min. :-6.061 Min. :0.0000
## 1st Qu.:0.8675 1st Qu.:-5.963 1st Qu.:0.8208
## Median :0.9616 Median :-5.765 Median :0.9098
## Mean :0.8868 Mean :-5.387 Mean :0.8390
## 3rd Qu.:0.9820 3rd Qu.:-5.287 3rd Qu.:0.9291
## Max. :0.9997 Max. :37.467 Max. :0.9459
logfc = limma_scatter(all_pairwise)
logfc$scatter
relative_hist = logfc$both_histogram$plot
relative_hist + scale_y_continuous(limits=c(0,0.25))
I have 4 ontology tools we can try applying to this as well as the KEGG pathways.
##crazytown = limma_ontology(all_pairwise)
##try_species = kegg_get_orgn("Saccharomyces cerevisiae")
##crazytown = hpgl_pathview(all_pairwise$limma_result$wt_minus_mut, species=try_species)