index.html 01_annotation.html 02_sample_estimations.html 03_differential_expression.html

1 Gene Ontology searches

Lets set up some simple data sets for goseq/clusterProfiler/topGO

I think a bunch of the material in this sheet is actually now obsolete.

1.1 Preparing to use goseq

I need a couple vectors before being able to use goseq I also need some rda files generated for clusterProfiler and topGO but I will do those later…

## goseq uses the gene lengths, clusterprofiler and topgo don't
## gene_lengths <- all_genes[,c("geneid","genesize")]
## colnames(gene_lengths) <- c("ID", "width")
colnames(all_lengths) <- c("ID", "width")
## go_ids needs to be a 2 column data frame with names 'ID', 'GO' which are
## gene names and GO names respectively.  I actually think I made my goseq() function
## smart enough that this is no longer needed, but nonetheless.

## go_ids <- read.table("reference/go/tcruzi_all_go.tab.gz", header=0, sep="\t")
## colnames(go_ids) <- c("ID","GO","ontology","term","source")
## Since there are multiple sources, I am doing a unique of the table...
## go_ids <- unique(go_ids[,c("ID","GO")])
## goid_file <- "reference/go/topgo_goids.tab.gz"

all_go <- as.data.frame(all_go)
all_go <- all_go[, c("GID", "GO")]
sig <- clbr_times_sigfilt$limma$ups$CLBr.Tryp_vs_CLBr.A96
clbr_up_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_up_a96tryp_goseq-v", ver, ".xlsx")))
clbr_up_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$downs$CLBr.Tryp_vs_CLBr.A96
clbr_down_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_down_a96tryp_goseq-v", ver, ".xlsx")))

clbr_down_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$ups$CLBr.A96_vs_CLBr.A60
clbr_up_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_up_a60a96_goseq-v", ver, ".xlsx")))

clbr_up_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$downs$CLBr.A96_vs_CLBr.A60
clbr_down_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_down_a60a96_goseq-v", ver, ".xlsx")))

clbr_down_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$ups$CLBr.A60_vs_CLBr.Tryp
clbr_up_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_up_trypa60_goseq-v", ver, ".xlsx")))

clbr_up_trypa60_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$downs$CLBr.A60_vs_CLBr.Tryp
clbr_down_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_down_trypa60_goseq-v", ver, ".xlsx")))

clbr_down_trypa60_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$ups$CL14.Tryp_vs_CL14.A96
cl14_up_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_up_a96tryp_goseq-v", ver, ".xlsx")))

cl14_up_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$downs$CL14.Tryp_vs_CL14.A96
cl14_down_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_down_a96tryp_goseq-v", ver, ".xlsx")))

cl14_down_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$ups$CL14.A96_vs_CL14.A60
cl14_up_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_up_a60a96_goseq-v", ver, ".xlsx")))

cl14_up_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$downs$CL14.A96_vs_CL14.A60
cl14_down_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_down_a60a96_goseq-v", ver, ".xlsx")))

cl14_down_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$ups$CL14.A60_vs_CL14.Tryp
cl14_up_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_up_trypa60_goseq-v", ver, ".xlsx")))

cl14_up_trypa60_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$downs$CL14.A60_vs_CL14.Tryp
cl14_down_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_down_trypa60_goseq-v", ver, ".xlsx")))

cl14_down_trypa60_goseq$pvalue_plots$bpp_plot_over

strains_up_a60_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$ups$CLBr.A60_vs_CL14.A60,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_upa60_goseq-v", ver, ".xlsx")))

strains_up_a96_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$ups$CLBr.A96_vs_CL14.A96,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_upa96_goseq-v", ver, ".xlsx")))

strains_up_tryp_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$ups$CLBr.Tryp_vs_CL14.Tryp,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_uptryp_goseq-v", ver, ".xlsx")))

strains_down_a60_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$downs$CLBr.A60_vs_CL14.A60,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_downa60_goseq-v", ver, ".xlsx")))

strains_down_a96_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$downs$CLBr.A96_vs_CL14.A96,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_downa96_goseq-v", ver, ".xlsx")))

strains_down_tryp_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$downs$CLBr.Tryp_vs_CL14.Tryp,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_downtryp_goseq-v", ver, ".xlsx")))

1.2 Compare Epimastigotes between CL14 and CLBr, high expression

One important note here, I needed to modify the data structure from readGff() in clusterProfiler before it was able to successfully generate the file ‘geneTable.rda’ Unfortunately, I was watching a movie at the time and didn’t properly write down what I did. However, I did write it out in the function myr::Gff2GeneTable() but in order to make it work one must load readGff() from clusterProfiler. I think the real solution is for me to grab a version of readGff() from clusterProfiler and make it exportable so that it will work consistently.

## Changes in epimastigotes from CLBrener to CL14
pval_thresh <- 0.0001
epi_cl14clbr_table <- all_tables$epi_cl14clbr
epi_cl14clbr_summary <- summary(epi_cl14clbr_table$logFC)
epi_cl14clbr_pvals <- epi_cl14clbr_table$adj.P.Val
names(epi_cl14clbr_pvals) <- rownames(epi_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_high <- subset(epi_cl14clbr_table, logFC >= epi_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
epi_cl14clbr_high_go <- simple_goseq(epi_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_high_go$pvalue_histogram
epi_cl14clbr_high_go$bpp_plot

summary(epi_cl14clbr_high_go)
fun <- epi_cl14clbr_high_go$godata_interesting
goseq_annotation <- goseq_table(fun)
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous
## script for how to do that (I think I do it automatically in simple_topgo()...
##epi_cl14clbr_high_trees = goseq_trees(degenes=epi_cl14clbr_high, epi_cl14clbr_high_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it
## was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
epi_cl14clbr_high_cl <- simple_clusterprofiler(epi_cl14clbr_high)
epi_cl14clbr_high_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(epi_cl14clbr_high, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
epi_cl14clbr_high_top <- simple_topgo(epi_cl14clbr_high, goids=goid_file, pvals=epi_cl14clbr_pvals)

topgo_pval <- topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees <- topgo_trees(epi_cl14clbr_high_top)
top_trees$bp_fisher_tree

1.3 Compare Epimastigotes between CL14 and CLBr, low expression

## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_low = subset(epi_cl14clbr_table, logFC <= epi_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
epi_cl14clbr_low_go = simple_goseq(de_genes=epi_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_low_go$pvalue_histogram
epi_cl14clbr_low_go$bpp_plot
##epi_cl14clbr_low_cl = simple_clusterprofiler(epi_cl14clbr_low, gff="reference/gff/clbrener_8.1_complete_genes.gff.gz")
##epi_cl14clbr_low_cl$bp_all_barplot
epi_cl14clbr_low_top = simple_topgo(epi_cl14clbr_low, goids=goid_file, pvals=epi_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(epi_cl14clbr_low_top)
top_trees$bp_fisher_tree

1.4 Compare trypo CL14/CLBr

## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_high = subset(tryp_cl14clbr_table, logFC >= tryp_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
tryp_cl14clbr_high_go = simple_goseq(tryp_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_high_go$pvalue_histogram
tryp_cl14clbr_high_go$bpp_plot
tryp_cl14clbr_high_cl = simple_clusterprofiler(tryp_cl14clbr_high)
tryp_cl14clbr_high_cl$bp_all_barplot
tryp_cl14clbr_high_top = simple_topgo(tryp_cl14clbr_high, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_high_top)
top_trees$bp_fisher_tree

1.5 Compare cl14/clbr tryp other side

## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_low = subset(tryp_cl14clbr_table, logFC <= tryp_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
tryp_cl14clbr_low_go = simple_goseq(tryp_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_low_go$pvalue_histogram
tryp_cl14clbr_low_go$bpp_plot
tryp_cl14clbr_low_cl = simple_clusterprofiler(tryp_cl14clbr_low)
tryp_cl14clbr_low_cl$bp_all_barplot
tryp_cl14clbr_low_top = simple_topgo(tryp_cl14clbr_low, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_low_top)
top_trees$bp_fisher_tree

1.6 Amastigote a96 cl14/clbr

## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = all_tables$a96_cl14clbr
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC >= a96_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = strain_comparisons$data$a96_clbr_over_cl14
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC <= a96_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree

## Ways to look
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$B.x, a96=tmp$B.y)
tt = hpgl_linear_scatter(tmp_b_t)
tt$scatter
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$t.x, a96=tmp$t.y)
tt = hpgl_scatter(tmp_b_t)
tt$scatter

2 KG

Oh yeah, I wrote a toy to wrap KEGG pathway stuff, lets try that. The KEGG abbreviation for t. cruzi is tcr

In theory, the _paths data structure returned by hpgl_pathview() should be a table of the numer of up/down genes observed in each KEGG path. However, there are some weird NAs getting returned. Effing hippies. – fixed.

kegg_list <- ""
##names(kegg_list) = rownames(epi_cl14clbr_table)
##epi_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_epi_cl14clbr", string_from="TcCLB.", string_to="")
##head(epi_cl14clbr_paths)


##a60_cl14clbr_table = all_tables$a60_cl14clbr
a60_cl14clbr_table <- strain_comparisons$data$a60_clbr_over_cl14
##kegg_list <- a60_cl14clbr_table$logFC
kegg_list <- a60_cl14clbr_table$limma_logfc
names(kegg_list) <- rownames(a60_cl14clbr_table)
a60_cl14clbr_paths <- hpgl_pathview(kegg_list, species="tcr",
                                    indir="pathview_in", outdir="pathview_a60_cl14clbr",
                                    from_list="TcCLB.", to_list="")
head(a60_cl14clbr_paths)

a96_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = a96_cl14clbr_table$logFC
names(kegg_list) = rownames(a96_cl14clbr_table)
a96_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_a96_cl14clbr", string_from="TcCLB.", string_to="")
head(a96_cl14clbr_paths)

tryp_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = tryp_cl14clbr_table$logFC
names(kegg_list) = rownames(tryp_cl14clbr_table)
tryp_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_tryp_cl14clbr", string_from="TcCLB.", string_to="")
head(tryp_cl14clbr_paths)
---
title: "RNAseq of T.cruzi CL14/CLBr: Gene Ontology."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: default
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: readable
  toc: true
  toc_float:
    collapsed: false
    smooth_scroll: false
---

<style>
  body .main-container {
    max-width: 1600px;
  }
</style>

```{r options, include=FALSE}
library("hpgltools")
tt <- sm(devtools::load_all("~/hpgltools"))
knitr::opts_knit$set(progress=TRUE,
                     verbose=TRUE,
                     width=90,
                     echo=NULL)
knitr::opts_chunk$set(error=TRUE,
                      fig.width=8,
                      fig.height=8,
                      dpi=96)
old_options <- options(error=NULL,
                       digits=4,
                       stringsAsFactors=FALSE,
                       knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
set.seed(1)
ver <- "20170810"
previous_file <- "03_differential_expression.Rmd"

tmp <- try(sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz"))))

rmd_file <- "04_gene_ontology.Rmd"
```

[index.html](index.html)
[01_annotation.html](01_annotation.html)
[02_sample_estimations.html](02_sample_estimations.html)
[03_differential_expression.html](03_differential_expression.html)

```{r rendering, include=FALSE, eval=FALSE}
rmarkdown::render(rmd_file)

rmarkdown::render(rmd_file, output_format="pdf_document", output_options=c("skip_html"))
```
# Gene Ontology searches

Lets set up some simple data sets for goseq/clusterProfiler/topGO

I think a bunch of the material in this sheet is actually now obsolete.

## Preparing to use goseq

I need a couple vectors before being able to use goseq
I also need some rda files generated for clusterProfiler and topGO
but I will do those later...

```{r goseq_preparation}
## goseq uses the gene lengths, clusterprofiler and topgo don't
## gene_lengths <- all_genes[,c("geneid","genesize")]
## colnames(gene_lengths) <- c("ID", "width")
colnames(all_lengths) <- c("ID", "width")
## go_ids needs to be a 2 column data frame with names 'ID', 'GO' which are
## gene names and GO names respectively.  I actually think I made my goseq() function
## smart enough that this is no longer needed, but nonetheless.

## go_ids <- read.table("reference/go/tcruzi_all_go.tab.gz", header=0, sep="\t")
## colnames(go_ids) <- c("ID","GO","ontology","term","source")
## Since there are multiple sources, I am doing a unique of the table...
## go_ids <- unique(go_ids[,c("ID","GO")])
## goid_file <- "reference/go/topgo_goids.tab.gz"

all_go <- as.data.frame(all_go)
all_go <- all_go[, c("GID", "GO")]
```

```{r goseq_clbr_times1}
sig <- clbr_times_sigfilt$limma$ups$CLBr.Tryp_vs_CLBr.A96
clbr_up_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_up_a96tryp_goseq-v", ver, ".xlsx")))
clbr_up_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$downs$CLBr.Tryp_vs_CLBr.A96
clbr_down_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_down_a96tryp_goseq-v", ver, ".xlsx")))
clbr_down_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$ups$CLBr.A96_vs_CLBr.A60
clbr_up_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_up_a60a96_goseq-v", ver, ".xlsx")))
clbr_up_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$downs$CLBr.A96_vs_CLBr.A60
clbr_down_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_down_a60a96_goseq-v", ver, ".xlsx")))
clbr_down_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$ups$CLBr.A60_vs_CLBr.Tryp
clbr_up_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_up_trypa60_goseq-v", ver, ".xlsx")))
clbr_up_trypa60_goseq$pvalue_plots$bpp_plot_over

sig <- clbr_times_sigfilt$limma$downs$CLBr.A60_vs_CLBr.Tryp
clbr_down_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/clbr_down_trypa60_goseq-v", ver, ".xlsx")))
clbr_down_trypa60_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$ups$CL14.Tryp_vs_CL14.A96
cl14_up_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_up_a96tryp_goseq-v", ver, ".xlsx")))
cl14_up_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$downs$CL14.Tryp_vs_CL14.A96
cl14_down_a96tryp_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_down_a96tryp_goseq-v", ver, ".xlsx")))
cl14_down_a96tryp_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$ups$CL14.A96_vs_CL14.A60
cl14_up_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_up_a60a96_goseq-v", ver, ".xlsx")))
cl14_up_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$downs$CL14.A96_vs_CL14.A60
cl14_down_a60a96_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_down_a60a96_goseq-v", ver, ".xlsx")))
cl14_down_a60a96_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$ups$CL14.A60_vs_CL14.Tryp
cl14_up_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_up_trypa60_goseq-v", ver, ".xlsx")))
cl14_up_trypa60_goseq$pvalue_plots$bpp_plot_over

sig <- cl14_times_sigfilt$limma$downs$CL14.A60_vs_CL14.Tryp
cl14_down_trypa60_goseq <- sm(simple_goseq(
  sig_genes=sig,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/cl14_down_trypa60_goseq-v", ver, ".xlsx")))
cl14_down_trypa60_goseq$pvalue_plots$bpp_plot_over

strains_up_a60_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$ups$CLBr.A60_vs_CL14.A60,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_upa60_goseq-v", ver, ".xlsx")))

strains_up_a96_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$ups$CLBr.A96_vs_CL14.A96,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_upa96_goseq-v", ver, ".xlsx")))

strains_up_tryp_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$ups$CLBr.Tryp_vs_CL14.Tryp,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_uptryp_goseq-v", ver, ".xlsx")))
                        
strains_down_a60_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$downs$CLBr.A60_vs_CL14.A60,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_downa60_goseq-v", ver, ".xlsx")))

strains_down_a96_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$downs$CLBr.A96_vs_CL14.A96,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_downa96_goseq-v", ver, ".xlsx")))

strains_down_tryp_goseq <- sm(simple_goseq(
  sig_genes=strains_sigfilt$limma$downs$CLBr.Tryp_vs_CL14.Tryp,
  go_db=all_go,
  length_db=all_lengths,
  excel=paste0("excel/strains_downtryp_goseq-v", ver, ".xlsx")))
```

## Compare Epimastigotes between CL14 and CLBr, high expression

One important note here, I needed to modify the data structure from readGff() in clusterProfiler
before it was able to successfully generate the file 'geneTable.rda'
Unfortunately, I was watching a movie at the time and didn't properly write down what I did.
However, I did write it out in the function myr::Gff2GeneTable()
but in order to make it work one must load readGff() from clusterProfiler.
I think the real solution is for me to grab a version of readGff() from clusterProfiler
and make it exportable so that it will work consistently.

```{r ontology_epi_cl14clr_high, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
pval_thresh <- 0.0001
epi_cl14clbr_table <- all_tables$epi_cl14clbr
epi_cl14clbr_summary <- summary(epi_cl14clbr_table$logFC)
epi_cl14clbr_pvals <- epi_cl14clbr_table$adj.P.Val
names(epi_cl14clbr_pvals) <- rownames(epi_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_high <- subset(epi_cl14clbr_table, logFC >= epi_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
epi_cl14clbr_high_go <- simple_goseq(epi_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_high_go$pvalue_histogram
epi_cl14clbr_high_go$bpp_plot

summary(epi_cl14clbr_high_go)
fun <- epi_cl14clbr_high_go$godata_interesting
goseq_annotation <- goseq_table(fun)
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous
## script for how to do that (I think I do it automatically in simple_topgo()...
##epi_cl14clbr_high_trees = goseq_trees(degenes=epi_cl14clbr_high, epi_cl14clbr_high_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it
## was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
epi_cl14clbr_high_cl <- simple_clusterprofiler(epi_cl14clbr_high)
epi_cl14clbr_high_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(epi_cl14clbr_high, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
epi_cl14clbr_high_top <- simple_topgo(epi_cl14clbr_high, goids=goid_file, pvals=epi_cl14clbr_pvals)

topgo_pval <- topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees <- topgo_trees(epi_cl14clbr_high_top)
top_trees$bp_fisher_tree
```

## Compare Epimastigotes between CL14 and CLBr, low expression

```{r ontology_epi_cl14clbr_low, eval=FALSE}
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_low = subset(epi_cl14clbr_table, logFC <= epi_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
epi_cl14clbr_low_go = simple_goseq(de_genes=epi_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_low_go$pvalue_histogram
epi_cl14clbr_low_go$bpp_plot
##epi_cl14clbr_low_cl = simple_clusterprofiler(epi_cl14clbr_low, gff="reference/gff/clbrener_8.1_complete_genes.gff.gz")
##epi_cl14clbr_low_cl$bp_all_barplot
epi_cl14clbr_low_top = simple_topgo(epi_cl14clbr_low, goids=goid_file, pvals=epi_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(epi_cl14clbr_low_top)
top_trees$bp_fisher_tree
```

## Compare trypo CL14/CLBr

```{r tryp_cl14clbr_high_goseq, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_high = subset(tryp_cl14clbr_table, logFC >= tryp_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
tryp_cl14clbr_high_go = simple_goseq(tryp_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_high_go$pvalue_histogram
tryp_cl14clbr_high_go$bpp_plot
tryp_cl14clbr_high_cl = simple_clusterprofiler(tryp_cl14clbr_high)
tryp_cl14clbr_high_cl$bp_all_barplot
tryp_cl14clbr_high_top = simple_topgo(tryp_cl14clbr_high, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_high_top)
top_trees$bp_fisher_tree
```

## Compare cl14/clbr tryp other side

```{r tryp_cl14clbr_low_goseq, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_low = subset(tryp_cl14clbr_table, logFC <= tryp_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
tryp_cl14clbr_low_go = simple_goseq(tryp_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_low_go$pvalue_histogram
tryp_cl14clbr_low_go$bpp_plot
tryp_cl14clbr_low_cl = simple_clusterprofiler(tryp_cl14clbr_low)
tryp_cl14clbr_low_cl$bp_all_barplot
tryp_cl14clbr_low_top = simple_topgo(tryp_cl14clbr_low, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_low_top)
top_trees$bp_fisher_tree
```

## Amastigote a96 cl14/clbr

```{r ontology_a96_cl14clbr_high, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = all_tables$a96_cl14clbr
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC >= a96_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree
```

```{r ontology_a96_cl14clbr_low, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = strain_comparisons$data$a96_clbr_over_cl14
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC <= a96_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree

## Ways to look
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$B.x, a96=tmp$B.y)
tt = hpgl_linear_scatter(tmp_b_t)
tt$scatter
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$t.x, a96=tmp$t.y)
tt = hpgl_scatter(tmp_b_t)
tt$scatter
```

# KG

Oh yeah, I wrote a toy to wrap KEGG pathway stuff, lets try that.
The KEGG abbreviation for t. cruzi is tcr

In theory, the _paths data structure returned by hpgl_pathview() should be a table
of the numer of up/down genes observed in each KEGG path.  However, there are some
weird NAs getting returned.   Effing hippies. -- fixed.

```{r kegg_fun_v1}
kegg_list <- ""
```

```{r kegg_fun_v2, eval=FALSE}
##names(kegg_list) = rownames(epi_cl14clbr_table)
##epi_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_epi_cl14clbr", string_from="TcCLB.", string_to="")
##head(epi_cl14clbr_paths)


##a60_cl14clbr_table = all_tables$a60_cl14clbr
a60_cl14clbr_table <- strain_comparisons$data$a60_clbr_over_cl14
##kegg_list <- a60_cl14clbr_table$logFC
kegg_list <- a60_cl14clbr_table$limma_logfc
names(kegg_list) <- rownames(a60_cl14clbr_table)
a60_cl14clbr_paths <- hpgl_pathview(kegg_list, species="tcr",
                                    indir="pathview_in", outdir="pathview_a60_cl14clbr",
                                    from_list="TcCLB.", to_list="")
head(a60_cl14clbr_paths)

a96_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = a96_cl14clbr_table$logFC
names(kegg_list) = rownames(a96_cl14clbr_table)
a96_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_a96_cl14clbr", string_from="TcCLB.", string_to="")
head(a96_cl14clbr_paths)

tryp_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = tryp_cl14clbr_table$logFC
names(kegg_list) = rownames(tryp_cl14clbr_table)
tryp_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_tryp_cl14clbr", string_from="TcCLB.", string_to="")
head(tryp_cl14clbr_paths)
```
