RNA sequencing of Saccharomyces cerevisiae with mutants in the CBF5 locus

Todo

  1. Give a boxplot of the non-quantile normalized data distributions immediately

Analyses using alignments with 0 mismatches!

Setting up

Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the data.

## These first 4 lines are not needed once hpgltools is installed.
source("http://bioconductor.org/biocLite.R")
biocLite("devtools")
library(devtools)
install_github("elsayed-lab/hpgltools")
library(hpgltools)
autoloads_all()

Reading data and annotations

There is a bit of a problem here. The count tables were generated with htseq-count and a copy of the yeast genome’s reference in gff format. The IDs provided by that method include entries like: ‘ADE5.7’ and ‘ACH1;’ which are not the canonical SGD IDs nor ENSEMBL. The annotation information I have been using is from biomart from ENSEMBL, which of course has a completely different set of IDs. Finally, I should be able to map pretty much any ID types using orgDb. Thus I create yeast_orgdb and a dataframe called yeast_idmaps.

  1. My yeast_annotations data frame has rownames with stuff like ‘NME1’, which corresponds to the rownames of yeast_idmaps.
  2. yeast_idmaps has ADE5.7 in the column ‘COMMON’ and also ‘ALIAS’.
  3. The rownames of my count table has ADE5.7
  4. I should therefore merge annotations<->idmaps by rownames and COMMON and make use rownames to merge against the count tables.
  5. Hmmm nope. The IDs in the gff are a mix of a couple columns, how frustrating!
  6. Therefore I extracted the information from the gff file directly and will map against multiple columns separately.
gff_file <- "reference/scerevisiae.gff.gz"
yeast_orgdb <- s_p(choose_orgdb("saccharomyces_cerevisiae"))$result
## I have a species name mapped to orgdb for saccharomyces_cerevisiae.
## Attached the orgDb for saccharomyces_cerevisiae with keys: ALIAS, COMMON, DESCRIPTION, ENSEMBL, ENSEMBLPROT, ENSEMBLTRANS, ENTREZID, ENZYME, EVIDENCE, EVIDENCEALL, GENENAME, GO, GOALL, INTERPRO, ONTOLOGY, ONTOLOGYALL, ORF, PATH, PFAM, PMID, REFSEQ, SGD, SMART, UNIPROT.
yeast_idmaps <- orgdb_idmap(yeast_orgdb, mapto=c("ALIAS","COMMON","ENSEMBL","ENTREZID","ORF","SGD","GENENAME"))
## Warning in if (!mapto %in% avail_keytypes) {: the condition has length > 1 and only the
## first element will be used
## 'select()' returned 1:many mapping between keys and columns
rownames(yeast_idmaps) <- make.names(yeast_idmaps$ORF, unique=TRUE)

yeast_annotations <- get_biomart_annotations("scerevisiae")
## The biomart annotations file already exists, loading from it.
yeast_annotations$ID <- rownames(yeast_annotations)

yeast_gff_annotations <- gff2df(gff_file, type="gene")
## Had a successful gff import with rtracklayer::import.gff(gff, format='gtf')
## Returning a df with 18 columns and 7050 rows.
dim(yeast_gff_annotations)
## [1] 7050   18
counts <- Biobase::exprs(cbf5_expt$expressionset)
dim(counts)
## [1] 6692    8
colnames(yeast_gff_annotations)
##  [1] "seqnames"        "start"           "end"             "width"          
##  [5] "strand"          "source"          "type"            "score"          
##  [9] "phase"           "exon_number"     "gene_id"         "ID"             
## [13] "p_id"            "protein_id"      "transcript_id"   "transcript_name"
## [17] "tss_id"          "seqedit"
test_vs_counts <- merge(yeast_gff_annotations, counts, by.x="ID", by.y="row.names")
dim(test_vs_counts)
## [1] 6457   26
## This is the best I have gotten, losing 235 genes.
colnames(yeast_idmaps)
## [1] "ORF"      "ALIAS"    "COMMON"   "ENSEMBL"  "ENTREZID" "SGD"      "GENENAME"
yeast_annot <- merge(yeast_gff_annotations, yeast_idmaps, by.x="gene_id", by.y="row.names", all.x=TRUE)
dim(yeast_annot)
## [1] 7050   25
final_annot <- merge(yeast_annot, yeast_annotations, by.x="gene_id", by.y="ID", all.x=TRUE)
rownames(final_annot) <- make.names(final_annot$ID, unique=TRUE)
dim(final_annot)
## [1] 7050   34
cbf5_expt <- create_expt(file="cbf5_samples.csv", file_suffix="_forward-trimmed-v0M1.count.gz", gene_info=final_annot, sep=",")
## processed_data/count_tables/hpgl0564/hpgl0564_forward-trimmed-v0M1.count.gz contains 6697 rows.
## processed_data/count_tables/hpgl0565/hpgl0565_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0566/hpgl0566_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0567/hpgl0567_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0568/hpgl0568_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0569/hpgl0569_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0570/hpgl0570_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
## processed_data/count_tables/hpgl0571/hpgl0571_forward-trimmed-v0M1.count.gz contains 6697 rows and merges to 6697 rows.
cbf5_norm <- s_p(normalize_expt(cbf5_expt, norm="quant", convert="cpm", transform="log2", filter_low=TRUE))$result
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(low-filter(data))))
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Performing low-count filter with option: TRUE
## Removing 562 low-count genes (6130 remaining).
## Performing batch correction at step 4.
## Applying: log2 transformation.

Normalizing and exploring data

There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper ‘graph_metrics()’ but that takes away the chance to explain the graphs as I generate them.

raw_metrics <- s_p(graph_metrics(cbf5_expt, qq=TRUE))$result
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## I am reasonably sure this should be log scaled and am setting it.
##   If this is incorrect, set scale='raw'
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## QQ plotting!
## Making plot of hpgl0564(1) vs. a sample distribution.
## Making plot of hpgl0565(2) vs. a sample distribution.
## Making plot of hpgl0566(3) vs. a sample distribution.
## Making plot of hpgl0567(4) vs. a sample distribution.
## Making plot of hpgl0568(5) vs. a sample distribution.
## Making plot of hpgl0569(6) vs. a sample distribution.
## Making plot of hpgl0570(7) vs. a sample distribution.
## Making plot of hpgl0571(8) vs. a sample distribution.
norm_metrics <- s_p(graph_metrics(cbf5_norm))$result
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Plotting a density plot.

The previous block created the various plots we might want to see. Now show them.

library(directlabels)
library(ggplot2)
raw_metrics$libsize

## 3 libraries have significantly lower read coverage.
raw_metrics$nonzero

## They therefore have fewer non-zero genes.
raw_metrics$boxplot

## And slightly lower distributions in a box plot
raw_metrics$corheat

## Wow!  Even non-normalized data splits nicely, that is unheard of!!
raw_metrics$disheat

## A little more heterogeneity is observed in the distance heatmap, so the data is not completely unbelievably perfect.
raw_metrics$smc

## Even before normalization, all the samples are better than the cutoff
raw_metrics$pcaplot

## The pca of raw data shows that it isn't perfect.
directlabels::direct.label(raw_metrics$density)

## The densities are nicely similar
raw_metrics$qqlog

## And the densities compared to mean are nicely similar.

norm_metrics$pcaplot

## A nice clean split between wt on left and mutant on right
norm_metrics$disheat

## ditto except <->
norm_metrics$corheat

Some simple differential expression analyses

We have no batch information in this data, so lets give a simple pairwise comparison a go.

## You may call the pairwise comparisons singly
##limma_analysis <- limma_pairwise(cbf5_expt, model_batch=FALSE)
##deseq_analysis <- deseq2_pairwise(cbf5_expt, model_batch=FALSE)
##edger_analysis <- edger_pairwise(cbf5_expt, model_batch=FALSE)
##basic_analysis <- basic_pairwise(cbf5_expt)
fun = s_p(all_pairwise(cbf5_norm, model_batch=FALSE))$result
## Starting limma pairwise comparison.
## libsize was not specified, this parameter has profound effects on limma's result.
## Using the libsize from expt$best_libsize.
## Limma step 1/6: choosing model.
## Limma step 2/6: running voom
## limma step 3/6: running lmFit
## Limma step 4/6: making and fitting contrasts.
## Limma step 5/6: Running eBayes and topTable.
## Limma step 6/6: Writing limma outputs.
## limma step 6/6: 1/3: Printing table: mut.
## limma step 6/6: 2/3: Printing table: wt.
## limma step 6/6: 3/3: Printing table: wt_vs_mut.
## Starting DESeq2 pairwise comparisons.
## DESeq2 demands raw data as input, reverting to the original expressionset.
## DESeq2 step 1/5: Including only condition in the deseq model.
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula
## are characters, converting to factors
## DESeq2 step 2/5: Estimate size factors.
## DESeq2 step 3/5: estimate Dispersions.
## DESeq2 step 4/5: nbinomWaldTest.
## DESeq2 step 5/5: 1/1: Printing table: wt_vs_mut
## Collected coefficients for: mut
## Collected coefficients for: wt
## Starting edgeR pairwise comparisons.
## EdgeR expects raw data as input, reverting to the original expressionset.
## EdgeR step 1/9: normalizing data.
## EdgeR step 2/9: Estimating the common dispersion.
## EdgeR step 3/9: Estimating dispersion across genes.
## EdgeR step 4/9: Estimating GLM Common dispersion.
## EdgeR step 5/9: Estimating GLM Trended dispersion.
## EdgeR step 6/9: Estimating GLM Tagged dispersion.
## EdgeR step 7/9: Running glmFit.
## EdgeR step 8/9: Making pairwise contrasts.
## EdgeR step 9/9: 1/1: Printing table: wt_vs_mut.
## Starting basic pairwise comparison.
## The data appears to have been at least transformed, leaving it alone.
## Basic step 1/3: Creating median and variance tables.
## Basic step 2/3: Performing comparisons.
## Basic step 2/3: 1/1: Performing log2 subtraction: wt_vs_mut
## Basic step 3/3: Creating faux DE Tables.
## Basic: Returning tables.
## 1/1: Comparing analyses: wt_vs_mut
fun$comparison$comp
##    wt_vs_mut
## le    0.9869
## ld    0.9858
## ed    0.9510
## lb    0.9892
## eb    0.9773
## db    0.9767
## limma, edgeR, DESeq2, and my stupid basic analysis all agree very much for this data.

keepers <- list("mutvwt" = c("mut","wt"))
combined <- combine_de_tables(fun, excel="excel/mut_vs_wt_contrasts.xlsx", keepers=keepers)
## Deleting the file excel/mut_vs_wt_contrasts.xlsx before writing the tables.
## Working on 1/1: mutvwt
## Found inverse table with wt_vs_mut
## Running create_combined_table with inverse=TRUE
## The table is: wt_vs_mut
##       table total limma_up limma_sigup deseq_up deseq_sigup edger_up edger_sigup basic_up
## 1 wt_vs_mut  6130      335         314      230         228      284         262      348
##   basic_sigup limma_down limma_sigdown deseq_down deseq_sigdown edger_down edger_sigdown
## 1         314        438           413        441           440        534           510
##   basic_down basic_sigdown meta_up meta_sigup meta_down meta_sigdown
## 1        435           377     274        267       474          459
## Converted seqnames to characters.
## Converted strandx to characters.
## Converted source to characters.
## Converted type to characters.
## Attempting to add a coefficient plot for mutvwt at column 69
## Warning: Removed 448 rows containing non-finite values (stat_smooth).
## Warning: Removed 448 rows containing missing values (geom_point).
## Writing summary information.
## The sheet pairwise_summary already exists, it will get overwritten
## Attempting to add the comparison plot to pairwise_summary at row: 19 and column: 1
## NULL
## Performing save of the workbook.
significant_summary <- extract_significant_genes(combined, excel="excel/mut_vs_wt_significant_contrasts.xlsx")
## Writing excel data sheet 1/1
## Assuming the fold changes are on the log scale and so taking >< 0
## After (adj)p filter, the up genes table has 1615 genes.
## After (adj)p filter, the down genes table has 1422 genes.
## Assuming the fold changes are on the log scale and so taking -1 * fc
## After fold change filter, the up genes table has 314 genes.
## After fold change filter, the down genes table has 413 genes.
## Printing significant genes to the file: excel/mut_vs_wt_significant_contrasts.xlsx
## Converted change_counts_up to characters.
## Converted change_counts_down to characters.
## 1/1: Writing excel data sheet up_limma_mutvwt
## Converted seqnames to characters.
## Converted strandx to characters.
## Converted source to characters.
## Converted type to characters.
## 1/1: Writing excel data sheet down_limma_mutvwt
## Converted seqnames to characters.
## Converted strandx to characters.
## Converted source to characters.
## Converted type to characters.
## Writing changed genes summary on last sheet.

Quick go attempt

I have 4 ontology tools we can try applying to this as well as the KEGG pathways.

## Test gprofiler
## gprofiler(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")
uppers_names <- uppers[["transcript_name"]]
profiler_go <- gProfileR::gprofiler(uppers_names, organism="scerevisiae")
profiler_kegg <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="KEGG")
profiler_reactome <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="REAC")
profiler_tf <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="TF")
profiler_mi <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="MI")
profiler_corum <- gProfileR::gprofiler(uppers_names, organism="scerevisiae", src_filter="CORUM")

kept_ontology <- subset_ontology_search(significant_summary, lengths=genelengths, goids=goids, gff=reference, gff_type="gene", universe_merge="transcript_name")
something <- write_subset_ontologies(kept_ontology)

uorf_all <- read.csv("excel/Ingolia-all-uORFs.csv")
uorf_all$num <- 1
uorf_fun <- uorf_all[, c("Gene","num")]
library(plyr)
uorf_fun <- ddply(uorf_fun, 'Gene', summarize, total = sum(num))
combined_data <- combined[["data"]][["mutvwt"]]
uorf_vs_fc <- merge(combined_data, uorf_fun, by.x="row.names", by.y="Gene", all.x=TRUE)
uorf_vs_fc$total[is.na(uorf_vs_fc$total == 0)] <- 0
cor.test(x=uorf_vs_fc$limma_logfc, uorf_vs_fc$total)

uorf_dist <- uorf_all[, c("Gene","uORF.end.to.CDS")]
colnames(uorf_dist) <- c("Gene","dist")
rownames(uorf_dist) <- make.names(uorf_dist$Gene, unique=TRUE)
uorfdist_vs_fc <- merge(combined_data, uorf_dist, by.x="row.names", by.y="row.names", all.x=TRUE)
uorfdist_vs_fc$dist[is.na(uorfdist_vs_fc$dist)] <- -1000
cor.test(uorfdist_vs_fc$dist, uorfdist_vs_fc$basic_logfc, na.rm=TRUE)
tt <- uorfdist_vs_fc[, c("dist","basic_denmed")]
tt$basic_denmed <- log(tt$basic_denmed)
png(file="/tmp/test_uorf.png")
hpgl_linear_scatter(tt)$scatter + scale_x_continuous(limits=c(10,500))
dev.off()

try_species = kegg_get_orgn("Saccharomyces cerevisiae")
pathview_data = all_pairwise$all_tables$wt_minus_mut
pathview_data = merge(pathview_data, crossref, by.x="row.names", by.y="transcript_name", all.x=TRUE)

rownames(pathview_data) = make.names(pathview_data$transcript_id, unique=TRUE)
## If I read the yeast xml files correctly, they all follow the traditional chromosomal location naming scheme...
pathview_result = hpgl_pathview(pathview_data, species=try_species, filenames="name")

Some playing around to get genes from go categories.

load("GO2ALLEG.rda")
## The first entry is GO:0003674
go_genes = GO2ALLEG[['GO:0003674']]

subtraction = all_pairwise$all_tables$wt_minus_mut
genes_in_go = subtraction[rownames(subtraction) %in% go_genes,]
genes_in_go = subset(genes_in_go, logFC > 1)
head(genes_in_go)
dim(genes_in_go)

Copy/pasting the 2 mismatch material here and deleting the file

RNAseq of yeast with wt/mutant CBF5 2 mismatches and including NMD rnaseq data!

Reading data

cbf5_expt = create_expt(file="all_samples.csv")
merged_expt = create_expt(file="all_samples_merged.csv")

Normalizing and exploring data

There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper ‘graph_metrics()’ but that takes away the chance to explain the graphs as I generate them.

full_design = cbf5_expt$definitions
## First make a bar plot of the library sizes in the experiment.
## Notice that the colors were auto-chosen by create_expt() and they should
## be maintained throughout this process
fis_libsize = hpgl_libsize(cbf5_expt)
fis_libsize

## Here we see that the wild type replicate 3 sample for 15 minutes has fewer non-zero genes than all its friends.
fis_nonzero = hpgl_nonzero(cbf5_expt, title="nonzero vs. cpm")
fis_nonzero

An initial pca plot

In most cases, raw data does not cluster very well, lets see if that is also true for the cbf5 experiment. Assuming it doesn’t, lets normalize the data using the defaults (cpm, quantile, log2) and try again.

## Unsurprisingly, the raw data doesn't cluster well at all...
##fis_rawpca = hpgl_pca(expt=cbf5_expt, labels=cbf5_expt$condition)
##fis_rawpca = hpgl_pca(expt=cbf5_expt, labels="fancy")
fis_rawpca = hpgl_pca(cbf5_expt, title="Fun!")
fis_rawpca$plot

## So, normalize the data
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE)
normed = graph_metrics(norm_expt)
normed$pcaplot

norm_expt_svaseq = normalize_expt(cbf5_expt, transform="raw", norm="raw", convert="cpm", filter_low=TRUE, batch="svaseq")
hpgl_pca(norm_expt_svaseq)$plot

norm_expt_sva = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
hpgl_pca(norm_expt_sva)$plot

## And try the pca again
fis_normpca = hpgl_pca(norm_expt, labels=norm_expt$condition, title="normalized pca")
norm_expt = normalize_expt(cbf5_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
norm_merged = normalize_expt(merged_expt, transform="log2", norm="quant", convert="cpm", filter_low=TRUE, batch="sva")
## And try the pca again
fis_normpca = hpgl_pca(norm_expt, labels=norm_expt$condition, title="normalized pca")
mer_normpca = hpgl_pca(norm_merged, labels=norm_merged$condition, title="normalized pca")
mer_normpca$plot
tiff(file="plots/norm_pca.tiff")
mer_normpca$plot
dev.off()
fis_normpca$res
fis_normpca$variance
tiff(file="plots/merged_pca.tiff")
mer_normpca$plot
dev.off()

funkytown = pca_information(norm_merged, plot_pcas=TRUE)
funkytown$pca_plots$PC1_PC2$plot
fun = graph_metrics(expt=norm_expt)

##funkytown = pca_information(expt=norm_expt, plot_pcas=TRUE)

Look at the data distributions

hpgl_boxplot(norm_expt)
hpgl_density(norm_expt)
hpgl_disheat(norm_expt)
hpgl_corheat(norm_expt)
hpgl_smc(norm_expt)
hpgl_qq_all(norm_expt)

hpgl_boxplot(norm_merged)
hpgl_density(norm_merged)
hpgl_disheat(norm_merged)
hpgl_corheat(norm_merged)
hpgl_smc(norm_merged)
hpgl_qq_all(norm_merged)

Some simple differential expression analyses

##limma_expt = normalize_expt(expt=cbf5_expt, norm="quant")
cbf5_quant = normalize_expt(cbf5_expt, norm="quant", convert="cpm", filter_low=TRUE)
all_pairwise = limma_pairwise(norm_merged, batches=NULL, model_batch=FALSE)
##all_pairwise = edger_pairwise(expt=cbf5_expt, batches=NULL, model_batch=FALSE)
## I have the following elements in this list:
### conditions_table     :  how many replicates are there of each condition
### batches_table        :  how many replicates are there of each batch
### conditions           :  a factor of all the conditions
### batches              :  a factor of all the batches
### model                :  the model of the data used for voom etc.
### fit                  :  the result of lmfit()
### voom_result          :  the result of voom()
### voom_design          :  the design matrix fed to voom()
### identities           :  the strings fed to makeContrasts() which describe each sample alone
### all_pairwise         :  the strings describing all the subtractions fed to makeContrasts()
### contrast_string      :  the entire string fed to makeContrasts() including the design etc.
### pairwise_fits        :  the result of contrasts.fit()
### pairwise_comparisons :  the result from eBayes()
### limma_result         :  a list of toptable()s corresponding to each pairwise comparison

names(all_pairwise$all_tables)
summary(all_pairwise$all_tables$wt_minus_mut)

awesome = coefficient_scatter(all_pairwise)
awesome$scatter

fun = all_pairwise(cbf5_expt, batches=NULL, model_batch=FALSE)
fun$comparison$comp
## limma, edgeR, and DESeq2 all agree very much for this data.
names(all_pairwise$all_tables)
summary(all_pairwise$all_tables$wt_minus_mut)

##logfc = limma_scatter(all_pairwise, first_table=3, second_column="logFC", second_table=3, first_column="AveExpr")
##logfc$scatter

wt_mut_scatter = coefficient_scatter(all_pairwise, x="wt", y="mut")
wt_mut_scatter$scatter
wt_mut_scatter$both_histogram

wt_nmd = coefficient_scatter(all_pairwise, x="wt", y="nmd")
wt_nmd$scatter
wt_nmd$both_histogram

cbf_nmd = coefficient_scatter(all_pairwise, x="wt_minus_mut", y="wt_minus_nmd")
cbf_nmd$scatter
cbf_nmd$both_histogram
cbf_nmd$correlation
hpgl_cor(cbf_nmd$df, method="robust")
## I think some of the differences in these sample types is being squashed by sva.

Quick go attempt

I have 4 ontology tools we can try applying to this as well as the KEGG pathways.

##crazytown = limma_ontology(all_pairwise, gene_lengths=lengths, goids=goids) ## This works now, but takes forever...
crazytown = limma_ontology(all_pairwise, gene_lengths=lengths, goids=goids, do_topgo=FALSE, do_trees=FALSE, do_cluster=FALSE)
##try_species = kegg_get_orgn("Saccharomyces cerevisiae")
##crazytown = hpgl_pathview(all_pairwise$limma_result$wt_minus_mut, species=try_species)
expression_table = all_pairwise$all_tables$wt_minus_mut
yeast_annotations = import.gff2("reference/scerevisiae.gff.gz")
annotation_info = as.data.frame(yeast_annotations)
rownames(annotation_info) = make.names(annotation_info$transcript_name, unique=TRUE)
annotation_plus_data = merge(annotation_info, expression_table, by.x="row.names", by.y="row.names")
rownames(annotation_plus_data) = annotation_plus_data$Row.names
annotation_plus_data = annotation_plus_data[,-1]
gene_lengths = annotation_plus_data[,c("width","logFC")]
gene_lengths$ID = rownames(gene_lengths)

## I did the following to pull out the information I care about from the SGD gene association tale:
## zcat gene_association.sgd.gz | grep -v "^\!" | awk -F ' ' '{printf("%s,%s,%s\n", $3, $5, $11)}' | cut -d '|' -f1 > go_trimmed.csv
## The resulting csv file is in reference/go_trimmed.csv
go_annotation = read.csv("reference/go_trimmed.csv.gz")
colnames(go_annotation) = c("Name","GO","ID")

## mut - wt, so higher means more in cbf5 mutant.
de_genes_minus = subset(expression_table, logFC <= -1.5)
de_genes_plus = subset(expression_table, logFC >= 1.5)
fun_minus = simple_goseq(de_genes=de_genes_minus, lengths=gene_lengths, goids=go_annotation)

tiff(file="plots/increased_mf_mutant.tiff", res=90)
fun_minus$mfp_plot
dev.off()
tiff(file="plots/increased_bp_mutant.tiff", res=90)
fun_minus$bpp_plot
dev.off()
fun_minus$bp_interesting

fun_plus = simple_goseq(de_genes_plus, lengths=gene_lengths, goids=go_annotation)
## Oh wow, that is pretty cool, mess up cbf5 and RNA polI increases, that makes sense!
## The up-regulated sets have crap q-values.
tiff(file="plots/decreased_mf_mutant.tiff", res=90)
fun_plus$mfp_plot
dev.off()
tiff(file="plots/decreased_bp_mutant.tiff", res=90)
fun_plus$bpp_plot
dev.off()
fun_plus$mf_interesting
fun_plus$bp_interesting
high_prfdb = read.csv("processed_data/high.csv", header=FALSE)
colnames(high_prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted")
low_prfdb = read.csv("processed_data/low.csv", header=FALSE)
colnames(low_prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted")
rownames(high_prfdb) = make.names(high_prfdb$ID, unique=TRUE)
rownames(low_prfdb) = make.names(low_prfdb$ID, unique=TRUE)
high_prfdb$value = "high"
low_prfdb$value = "low"

prfdb = read.csv("processed_data/hits.csv", header=FALSE)
colnames(prfdb) = c("ID","name","genome_id","mfe","second_mfe","knotted","value")
rownames(prfdb) = make.names(prfdb$ID, unique=TRUE)
test = merge(prfdb, df, by.x="row.names", by.y="row.names")
test$delta = test$wt - test$mut
ggplot(test, aes(x=wt, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
ggplot(test, aes(x=mut, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
ggplot(test, aes(x=delta, fill=value)) + geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") + geom_density(alpha=0.5)
summary(test)
thigh = test[as.character(test$value) == 'high',]

low = low_prfdb[,c("ID","mfe","value")]
high = high_prfdb[,c("ID","mfe","value")]
mfe_dist = rbind(low, high)
mfe_dist = mfe_dist[,c("mfe","value")]
cdf = ddply(mfe_dist, "value", summarise, rating.mean=mean(mfe, na.rm=TRUE))
multi = ggplot(mfe_dist, aes(x=mfe, fill=value)) +
    geom_histogram(aes(y=..density..), binwidth=0.1, alpha=0.5, position="identity") +
    geom_density(alpha=0.5) +
    geom_vline(data=cdf, aes(xintercept=rating.mean, colour=value), linetype="dashed") +
    theme_bw()


test = merge(mfe_dist, df, by.x="row.names", by.y="row.names")
multi = ggplot(test, aes(x=wt, y=mut, color=value)) +
    geom_point()
multi
high = subset(test, value=="high")
low = subset(test, value=="low")
summary(high$wt)
summary(low$wt)
summary(high$mut)
summary(low$mut)
low$sub = low$wt - low$mut
high$sub = high$wt - high$mut

summary(crazytown)
summary(crazytown$wt_minus_mut$up_goseq)
crazytown$wt_minus_mut$up_goseq$mfp_plot
head(crazytown$wt_minus_mut$up_goseq$mf_subset)
subtraction = all_pairwise$all_tables$wt_minus_mut

load("GO2ALLEG.rda")
## The first entry is GO:0003674
go_genes = GO2ALLEG[['GO:0003674']]

genes_in_go = subtraction[rownames(subtraction) %in% go_genes,]
genes_in_go = subset(genes_in_go, logFC > 2)
head(genes_in_go)

de_genes_plus = subset(subtraction, logFC >= 1.5)
fun = simple_goseq(de_genes_plus, lengths=gene_lengths, goids=goids)


new_mf_table = hpgltools:::gather_genes(fun, ont='MF')
new_bp_table = hpgltools:::gather_genes(fun, ont='BP')
new_bp_table