index.html annotation.html sample_estimations.html differential_expression.html

In this document, we will perform a series of differential expression analyses using the data which survived sample estimation.

1 Gene Ontology searches

Lets set up some simple data sets for goseq/clusterProfiler/topGO

I think a bunch of the material in this sheet is actually now obsolete.

1.1 Extracting ontology annotations from the TriTrypDB

I need to extract the ontology/pFAM data from the Tcruzi annotation tables.

I have a perl script ‘dump_tritryp_GO.pl’ which can do this for me.

cd $LAB/ref_data/gene_ontology/
./dump_tritryp_GO.pl $LAB/ref_data/tritryp_text/Tcruzi.txt > tcruzi_go.tab
## The actual commands were slightly different, just because I didn't feel like copy/pasting the filenames.

1.2 Preparing to use goseq

I need a couple vectors before being able to use goseq I also need some rda files generated for clusterProfiler and topGO but I will do those later…

## goseq uses the gene lengths, clusterprofiler and topgo don't
## gene_lengths <- all_genes[,c("geneid","genesize")]
## colnames(gene_lengths) <- c("ID", "width")
colnames(all_lengths) <- c("ID", "width")
## go_ids needs to be a 2 column data frame with names 'ID', 'GO' which are
## gene names and GO names respectively.  I actually think I made my goseq() function
## smart enough that this is no longer needed, but nonetheless.

## go_ids <- read.table("reference/go/tcruzi_all_go.tab.gz", header=0, sep="\t")
## colnames(go_ids) <- c("ID","GO","ontology","term","source")
## Since there are multiple sources, I am doing a unique of the table...
## go_ids <- unique(go_ids[,c("ID","GO")])
## goid_file <- "reference/go/topgo_goids.tab.gz"

colnames(all_go) <- c("ID", "GO")
## First relax the input as perl the name of this file.
clbr_times_sigfiltv2 <- sm(extract_significant_genes(clbr_times_filtered, according_to="limma", p=0.1,
                                                excel=paste0("excel/clb_times_filtered_relaxedp-v", ver, ".xlsx")))
cl14_times_sigfiltv2 <- sm(extract_significant_genes(cl14_times_filtered, according_to="limma", p=0.1,
                                                excel=paste0("excel/cl14_times_filtered_relaxedp-v", ver, ".xlsx")))
strains_sigfiltv2 <- sm(extract_significant_genes(strains_filtered, according_to="limma", p=0.1,
                                               excel=paste0("excell/strain_comparisons_filtered_relaxedp-v", ver, ".xlsx")))

clbr_up_a96tryp_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$ups$clbr_a96_to_tryp,
                                         go_db=all_go, length_db=all_lengths))

clbr_up_a96tryp_goseq$pvalue_plots$bpp_plot_over

clbr_up_a96tryp_excel <- sm(write_goseq_data(clbr_up_a96tryp_goseq,
                                             excel=paste0("excel/clbr_up_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))

clbr_down_a96tryp_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$downs$clbr_a96_to_tryp,
                                           go_db=all_go, length_db=all_lengths))

clbr_down_a96tryp_goseq$pvalue_plots$bpp_plot_over

clbr_down_a96tryp_excel <- sm(write_goseq_data(clbr_down_a96tryp_goseq,
                                               excel=paste0("excel/clbr_down_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))

clbr_up_a60a96_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$ups$clbr_a60_to_a96,
                                        go_db=all_go, length_db=all_lengths))

clbr_up_a60a96_goseq$pvalue_plots$bpp_plot_over

clbr_up_a60a96_excel <- sm(write_goseq_data(clbr_up_a60a96_goseq,
                                            excel=paste0("excel/clbr_up_a60a96_goseq_relaxedp-v", ver, ".xlsx")))

clbr_down_a60a96_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$downs$clbr_a60_to_a96,
                                          go_db=all_go, length_db=all_lengths))

clbr_down_a60a96_goseq$pvalue_plots$bpp_plot_over

clbr_down_a60a96_excel <- sm(write_goseq_data(clbr_down_a60a96_goseq,
                                              excel=paste0("excel/clbr_down_a60a96_goseq_relaxedp-v", ver, ".xlsx")))

clbr_up_trypa60_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$ups$clbr_tryp_to_a60,
                                         go_db=all_go, length_db=all_lengths))

clbr_up_trypa60_goseq$pvalue_plots$bpp_plot_over

clbr_up_trypa60_excel <- sm(write_goseq_data(clbr_up_trypa60_goseq,
                                             excel=paste0("excel/clbr_up_trypa60_goseq_relaxedp-v", ver, ".xlsx")))

clbr_down_trypa60_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$downs$clbr_tryp_to_a60,
                                           go_db=all_go, length_db=all_lengths))

clbr_down_trypa60_goseq$pvalue_plots$bpp_plot_over

clbr_down_trypa60_excel <- sm(write_goseq_data(clbr_down_trypa60_goseq,
                                               excel=paste0("excel/clbr_down_trypa60_goseq_relaxedp-v", ver, ".xlsx")))

cl14_up_a96tryp_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$ups$cl14_a96_to_tryp,
                                         go_db=all_go, length_db=all_lengths))

cl14_up_a96tryp_goseq$pvalue_plots$bpp_plot_over

cl14_up_a96tryp_excel <- sm(write_goseq_data(cl14_up_a96tryp_goseq,
                                             excel=paste0("excel/cl14_up_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))

cl14_down_a96tryp_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$downs$cl14_a96_to_tryp,
                                           go_db=all_go, length_db=all_lengths))

cl14_down_a96tryp_goseq$pvalue_plots$bpp_plot_over

cl14_down_a96tryp_excel <- sm(write_goseq_data(cl14_down_a96tryp_goseq,
                                               excel=paste0("excel/cl14_down_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))

cl14_up_a60a96_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$ups$cl14_a60_to_a96,
                                        go_db=all_go, length_db=all_lengths))

cl14_up_a60a96_goseq$pvalue_plots$bpp_plot_over

cl14_up_a60a96_excel <- sm(write_goseq_data(cl14_up_a60a96_goseq,
                                            excel=paste0("excel/cl14_up_a60a96_goseq_relaxedp-v", ver, ".xlsx")))

cl14_down_a60a96_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$downs$cl14_a60_to_a96,
                                          go_db=all_go, length_db=all_lengths))

cl14_down_a60a96_goseq$pvalue_plots$bpp_plot_over

cl14_down_a60a96_excel <- sm(write_goseq_data(cl14_down_a60a96_goseq,
                                              excel=paste0("excel/cl14_down_a60a96_goseq_relaxedp-v", ver, ".xlsx")))

cl14_up_trypa60_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$ups$cl14_tryp_to_a60,
                                         go_db=all_go, length_db=all_lengths))

cl14_up_trypa60_goseq$pvalue_plots$bpp_plot_over

cl14_up_trypa60_excel <- sm(write_goseq_data(cl14_up_trypa60_goseq,
                                             excel=paste0("excel/cl14_up_trypa60_goseq_relaxedp-v", ver, ".xlsx")))

cl14_down_trypa60_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$downs$cl14_tryp_to_a60,
                                           go_db=all_go, length_db=all_lengths))

cl14_down_trypa60_goseq$pvalue_plots$bpp_plot_over

cl14_down_trypa60_excel <- sm(write_goseq_data(cl14_down_trypa60_goseq,
                                               excel=paste0("excel/cl14_down_trypa60_goseq_relaxedp-v", ver, ".xlsx")))

strains_up_a60_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$ups$a60_clbr_over_cl14,
                                        go_db=all_go, length_db=all_lengths))

strains_up_a96_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$ups$a96_clbr_over_cl14,
                                        go_db=all_go, length_db=all_lengths))

strains_up_tryp_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$ups$tryp_clbr_over_cl14,
                                         go_db=all_go, length_db=all_lengths))

strains_up_a60_excel <- sm(write_goseq_data(strains_up_a60_goseq,
                                            excel=paste0("excel/strains_upa60_goseq_relaxedp-v", ver, ".xlsx")))

strains_up_a96_excel <- sm(write_goseq_data(strains_up_a96_goseq,
                                            excel=paste0("excel/strains_upa96_goseq_relaxedp-v", ver, ".xlsx")))

strains_up_tryp_excel <- sm(write_goseq_data(strains_up_tryp_goseq,
                                             excel=paste0("excel/strains_uptryp_goseq_relaxedp-v", ver, ".xlsx")))

strains_down_a60_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$downs$a60_clbr_over_cl14,
                                          go_db=all_go, length_db=all_lengths))

strains_down_a96_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$downs$a96_clbr_over_cl14,
                                          go_db=all_go, length_db=all_lengths))

strains_down_tryp_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$downs$tryp_clbr_over_cl14,
                                           go_db=all_go, length_db=all_lengths))

strains_down_a60_excel <- sm(write_goseq_data(strains_down_a60_goseq,
                                            excel=paste0("excel/strains_downa60_goseq_relaxedp-v", ver, ".xlsx")))

strains_down_a96_excel <- sm(write_goseq_data(strains_down_a96_goseq,
                                            excel=paste0("excel/strains_downa96_goseq_relaxedp-v", ver, ".xlsx")))

strains_down_tryp_excel <- sm(write_goseq_data(strains_down_tryp_goseq,
                                             excel=paste0("excel/strains_downtryp_goseq_relaxedp-v", ver, ".xlsx")))

1.3 Compare Epimastigotes between CL14 and CLBr, high expression

One important note here, I needed to modify the data structure from readGff() in clusterProfiler before it was able to successfully generate the file ‘geneTable.rda’ Unfortunately, I was watching a movie at the time and didn’t properly write down what I did. However, I did write it out in the function myr::Gff2GeneTable() but in order to make it work one must load readGff() from clusterProfiler. I think the real solution is for me to grab a version of readGff() from clusterProfiler and make it exportable so that it will work consistently.

## Changes in epimastigotes from CLBrener to CL14
pval_thresh <- 0.0001
epi_cl14clbr_table <- all_tables$epi_cl14clbr
epi_cl14clbr_summary <- summary(epi_cl14clbr_table$logFC)
epi_cl14clbr_pvals <- epi_cl14clbr_table$adj.P.Val
names(epi_cl14clbr_pvals) <- rownames(epi_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_high <- subset(epi_cl14clbr_table, logFC >= epi_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
epi_cl14clbr_high_go <- simple_goseq(epi_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_high_go$pvalue_histogram
epi_cl14clbr_high_go$bpp_plot

summary(epi_cl14clbr_high_go)
fun <- epi_cl14clbr_high_go$godata_interesting
goseq_annotation <- goseq_table(fun)
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##epi_cl14clbr_high_trees = goseq_trees(degenes=epi_cl14clbr_high, epi_cl14clbr_high_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
epi_cl14clbr_high_cl <- simple_clusterprofiler(epi_cl14clbr_high)
epi_cl14clbr_high_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(epi_cl14clbr_high, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
epi_cl14clbr_high_top <- simple_topgo(epi_cl14clbr_high, goids=goid_file, pvals=epi_cl14clbr_pvals)

topgo_pval <- topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees <- topgo_trees(epi_cl14clbr_high_top)
top_trees$bp_fisher_tree

1.4 Compare Epimastigotes between CL14 and CLBr, low expression

## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_low = subset(epi_cl14clbr_table, logFC <= epi_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
epi_cl14clbr_low_go = simple_goseq(de_genes=epi_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_low_go$pvalue_histogram
epi_cl14clbr_low_go$bpp_plot
##epi_cl14clbr_low_cl = simple_clusterprofiler(epi_cl14clbr_low, gff="reference/gff/clbrener_8.1_complete_genes.gff.gz")
##epi_cl14clbr_low_cl$bp_all_barplot
epi_cl14clbr_low_top = simple_topgo(epi_cl14clbr_low, goids=goid_file, pvals=epi_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(epi_cl14clbr_low_top)
top_trees$bp_fisher_tree

1.5 Compare trypo CL14/CLBr

## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_high = subset(tryp_cl14clbr_table, logFC >= tryp_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
tryp_cl14clbr_high_go = simple_goseq(tryp_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_high_go$pvalue_histogram
tryp_cl14clbr_high_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##tryp_cl14clbr_high_trees = goseq_trees(degenes=tryp_cl14clbr_high, tryp_cl14clbr_high_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
tryp_cl14clbr_high_cl = simple_clusterprofiler(tryp_cl14clbr_high)
tryp_cl14clbr_high_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(tryp_cl14clbr_high, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
tryp_cl14clbr_high_top = simple_topgo(tryp_cl14clbr_high, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_high_top)
top_trees$bp_fisher_tree

1.6 Compare cl14/clbr tryp other side

## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_low = subset(tryp_cl14clbr_table, logFC <= tryp_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
tryp_cl14clbr_low_go = simple_goseq(tryp_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_low_go$pvalue_histogram
tryp_cl14clbr_low_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##tryp_cl14clbr_low_trees = goseq_trees(degenes=tryp_cl14clbr_low, tryp_cl14clbr_low_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
tryp_cl14clbr_low_cl = simple_clusterprofiler(tryp_cl14clbr_low)
tryp_cl14clbr_low_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(tryp_cl14clbr_low, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
tryp_cl14clbr_low_top = simple_topgo(tryp_cl14clbr_low, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_low_top)
top_trees$bp_fisher_tree

1.7 Amastigote a96 cl14/clbr

## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = all_tables$a96_cl14clbr
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC >= a96_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##a96_cl14clbr_low_trees = goseq_trees(degenes=a96_cl14clbr_low, a96_cl14clbr_low_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(a96_cl14clbr_low, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = strain_comparisons$data$a96_clbr_over_cl14
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC <= a96_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##a96_cl14clbr_low_trees = goseq_trees(degenes=a96_cl14clbr_low, a96_cl14clbr_low_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
##simple_topgo(a96_cl14clbr_low, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree

## Ways to look
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$B.x, a96=tmp$B.y)
tt = hpgl_linear_scatter(tmp_b_t)
tt$scatter
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$t.x, a96=tmp$t.y)
tt = hpgl_scatter(tmp_b_t)
tt$scatter

2 KG

Oh yeah, I wrote a toy to wrap KEGG pathway stuff, lets try that. The KEGG abbreviation for t. cruzi is tcr

In theory, the _paths data structure returned by hpgl_pathview() should be a table of the numer of up/down genes observed in each KEGG path. However, there are some weird NAs getting returned. Effing hippies. – fixed.

kegg_list <- ""
##names(kegg_list) = rownames(epi_cl14clbr_table)
##epi_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_epi_cl14clbr", string_from="TcCLB.", string_to="")
##head(epi_cl14clbr_paths)


##a60_cl14clbr_table = all_tables$a60_cl14clbr
a60_cl14clbr_table <- strain_comparisons$data$a60_clbr_over_cl14
##kegg_list <- a60_cl14clbr_table$logFC
kegg_list <- a60_cl14clbr_table$limma_logfc
names(kegg_list) <- rownames(a60_cl14clbr_table)
a60_cl14clbr_paths <- hpgl_pathview(kegg_list, species="tcr",
                                    indir="pathview_in", outdir="pathview_a60_cl14clbr",
                                    from_list="TcCLB.", to_list="")
head(a60_cl14clbr_paths)

a96_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = a96_cl14clbr_table$logFC
names(kegg_list) = rownames(a96_cl14clbr_table)
a96_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_a96_cl14clbr", string_from="TcCLB.", string_to="")
head(a96_cl14clbr_paths)

tryp_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = tryp_cl14clbr_table$logFC
names(kegg_list) = rownames(tryp_cl14clbr_table)
tryp_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_tryp_cl14clbr", string_from="TcCLB.", string_to="")
head(tryp_cl14clbr_paths)
---
title: "RNAseq of T.cruzi CL14/CLBr: Gene Ontology."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: default
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: readable
  toc: true
  toc_float:
    collapsed: false
    smooth_scroll: false
---

<style>
  <!-- Document prelude revision 2016-10 -->
body .main-container {
max-width: 1600px;
}
</style>

```{r options, include=FALSE}
## These are the options I tend to favor
library("hpgltools")
tt <- sm(devtools::load_all("~/hpgltools"))
knitr::opts_knit$set(
    progress = TRUE,
    verbose = TRUE,
    width = 90,
    echo = NULL)
knitr::opts_chunk$set(
    error = TRUE,
    fig.width = 8,
    fig.height = 8,
    dpi = 96)
old_options <- options(
    error = NULL,
    digits = 4,
    stringsAsFactors = FALSE,
    knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
set.seed(1)
previous_file <- "differential_expression.Rmd"
rmd_file <- "gene_ontology_relaxed_p_input.Rmd"
ver <- "20170216"
previous_save <- paste0(gsub(pattern="\\.Rmd", replace=".rda.xz", x=previous_file))
this_save <- paste0(gsub(pattern="\\.Rmd", replace=".rda.xz", x=rmd_file))
```

[index.html](index.html) [annotation.html](annotation.html)
[sample_estimations.html](sample_estimations.html) [differential_expression.html](differential_expression.html)

```{r rendering, include=FALSE, eval=FALSE}
## This block is used to render a document from within it.
rmarkdown::render(rmd_file)

## An extra renderer for pdf output
rmarkdown::render(rmd_file, output_format="pdf_document", output_options=c("skip_html"))
## Or to save/load large Rdata files.
hpgltools:::saveme()
hpgltools:::loadme()
rm(list=ls())
```

In this document, we will perform a series of differential expression analyses using the data which survived sample estimation.

```{r loadme, include=FALSE}
tmp <- sm(loadme(filename=previous_save))
```

# Gene Ontology searches

Lets set up some simple data sets for goseq/clusterProfiler/topGO

I think a bunch of the material in this sheet is actually now obsolete.

## Extracting ontology annotations from the TriTrypDB

I need to extract the ontology/pFAM data from the Tcruzi annotation
tables.

I have a perl script 'dump_tritryp_GO.pl' which can do this for me.

```{r engine='bash', eval=FALSE}
cd $LAB/ref_data/gene_ontology/
./dump_tritryp_GO.pl $LAB/ref_data/tritryp_text/Tcruzi.txt > tcruzi_go.tab
## The actual commands were slightly different, just because I didn't feel like copy/pasting the filenames.
```

## Preparing to use goseq

I need a couple vectors before being able to use goseq
I also need some rda files generated for clusterProfiler and topGO
but I will do those later...

```{r goseq_preparation}
## goseq uses the gene lengths, clusterprofiler and topgo don't
## gene_lengths <- all_genes[,c("geneid","genesize")]
## colnames(gene_lengths) <- c("ID", "width")
colnames(all_lengths) <- c("ID", "width")
## go_ids needs to be a 2 column data frame with names 'ID', 'GO' which are
## gene names and GO names respectively.  I actually think I made my goseq() function
## smart enough that this is no longer needed, but nonetheless.

## go_ids <- read.table("reference/go/tcruzi_all_go.tab.gz", header=0, sep="\t")
## colnames(go_ids) <- c("ID","GO","ontology","term","source")
## Since there are multiple sources, I am doing a unique of the table...
## go_ids <- unique(go_ids[,c("ID","GO")])
## goid_file <- "reference/go/topgo_goids.tab.gz"

colnames(all_go) <- c("ID", "GO")
```

```{r goseq_clbr_times1}
## First relax the input as perl the name of this file.
clbr_times_sigfiltv2 <- sm(extract_significant_genes(clbr_times_filtered, according_to="limma", p=0.1,
                                                excel=paste0("excel/clb_times_filtered_relaxedp-v", ver, ".xlsx")))
cl14_times_sigfiltv2 <- sm(extract_significant_genes(cl14_times_filtered, according_to="limma", p=0.1,
                                                excel=paste0("excel/cl14_times_filtered_relaxedp-v", ver, ".xlsx")))
strains_sigfiltv2 <- sm(extract_significant_genes(strains_filtered, according_to="limma", p=0.1,
                                               excel=paste0("excell/strain_comparisons_filtered_relaxedp-v", ver, ".xlsx")))

clbr_up_a96tryp_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$ups$clbr_a96_to_tryp,
                                         go_db=all_go, length_db=all_lengths))
clbr_up_a96tryp_goseq$pvalue_plots$bpp_plot_over
clbr_up_a96tryp_excel <- sm(write_goseq_data(clbr_up_a96tryp_goseq,
                                             excel=paste0("excel/clbr_up_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))
clbr_down_a96tryp_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$downs$clbr_a96_to_tryp,
                                           go_db=all_go, length_db=all_lengths))
clbr_down_a96tryp_goseq$pvalue_plots$bpp_plot_over
clbr_down_a96tryp_excel <- sm(write_goseq_data(clbr_down_a96tryp_goseq,
                                               excel=paste0("excel/clbr_down_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))
clbr_up_a60a96_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$ups$clbr_a60_to_a96,
                                        go_db=all_go, length_db=all_lengths))
clbr_up_a60a96_goseq$pvalue_plots$bpp_plot_over
clbr_up_a60a96_excel <- sm(write_goseq_data(clbr_up_a60a96_goseq,
                                            excel=paste0("excel/clbr_up_a60a96_goseq_relaxedp-v", ver, ".xlsx")))
clbr_down_a60a96_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$downs$clbr_a60_to_a96,
                                          go_db=all_go, length_db=all_lengths))
clbr_down_a60a96_goseq$pvalue_plots$bpp_plot_over
clbr_down_a60a96_excel <- sm(write_goseq_data(clbr_down_a60a96_goseq,
                                              excel=paste0("excel/clbr_down_a60a96_goseq_relaxedp-v", ver, ".xlsx")))
clbr_up_trypa60_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$ups$clbr_tryp_to_a60,
                                         go_db=all_go, length_db=all_lengths))
clbr_up_trypa60_goseq$pvalue_plots$bpp_plot_over
clbr_up_trypa60_excel <- sm(write_goseq_data(clbr_up_trypa60_goseq,
                                             excel=paste0("excel/clbr_up_trypa60_goseq_relaxedp-v", ver, ".xlsx")))
clbr_down_trypa60_goseq <- sm(simple_goseq(sig_genes=clbr_times_sigfiltv2$limma$downs$clbr_tryp_to_a60,
                                           go_db=all_go, length_db=all_lengths))
clbr_down_trypa60_goseq$pvalue_plots$bpp_plot_over
clbr_down_trypa60_excel <- sm(write_goseq_data(clbr_down_trypa60_goseq,
                                               excel=paste0("excel/clbr_down_trypa60_goseq_relaxedp-v", ver, ".xlsx")))
cl14_up_a96tryp_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$ups$cl14_a96_to_tryp,
                                         go_db=all_go, length_db=all_lengths))
cl14_up_a96tryp_goseq$pvalue_plots$bpp_plot_over
cl14_up_a96tryp_excel <- sm(write_goseq_data(cl14_up_a96tryp_goseq,
                                             excel=paste0("excel/cl14_up_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))
cl14_down_a96tryp_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$downs$cl14_a96_to_tryp,
                                           go_db=all_go, length_db=all_lengths))
cl14_down_a96tryp_goseq$pvalue_plots$bpp_plot_over
cl14_down_a96tryp_excel <- sm(write_goseq_data(cl14_down_a96tryp_goseq,
                                               excel=paste0("excel/cl14_down_a96tryp_goseq_relaxedp-v", ver, ".xlsx")))
cl14_up_a60a96_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$ups$cl14_a60_to_a96,
                                        go_db=all_go, length_db=all_lengths))
cl14_up_a60a96_goseq$pvalue_plots$bpp_plot_over
cl14_up_a60a96_excel <- sm(write_goseq_data(cl14_up_a60a96_goseq,
                                            excel=paste0("excel/cl14_up_a60a96_goseq_relaxedp-v", ver, ".xlsx")))
cl14_down_a60a96_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$downs$cl14_a60_to_a96,
                                          go_db=all_go, length_db=all_lengths))
cl14_down_a60a96_goseq$pvalue_plots$bpp_plot_over
cl14_down_a60a96_excel <- sm(write_goseq_data(cl14_down_a60a96_goseq,
                                              excel=paste0("excel/cl14_down_a60a96_goseq_relaxedp-v", ver, ".xlsx")))
cl14_up_trypa60_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$ups$cl14_tryp_to_a60,
                                         go_db=all_go, length_db=all_lengths))
cl14_up_trypa60_goseq$pvalue_plots$bpp_plot_over
cl14_up_trypa60_excel <- sm(write_goseq_data(cl14_up_trypa60_goseq,
                                             excel=paste0("excel/cl14_up_trypa60_goseq_relaxedp-v", ver, ".xlsx")))
cl14_down_trypa60_goseq <- sm(simple_goseq(sig_genes=cl14_times_sigfiltv2$limma$downs$cl14_tryp_to_a60,
                                           go_db=all_go, length_db=all_lengths))
cl14_down_trypa60_goseq$pvalue_plots$bpp_plot_over
cl14_down_trypa60_excel <- sm(write_goseq_data(cl14_down_trypa60_goseq,
                                               excel=paste0("excel/cl14_down_trypa60_goseq_relaxedp-v", ver, ".xlsx")))

strains_up_a60_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$ups$a60_clbr_over_cl14,
                                        go_db=all_go, length_db=all_lengths))
strains_up_a96_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$ups$a96_clbr_over_cl14,
                                        go_db=all_go, length_db=all_lengths))
strains_up_tryp_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$ups$tryp_clbr_over_cl14,
                                         go_db=all_go, length_db=all_lengths))
strains_up_a60_excel <- sm(write_goseq_data(strains_up_a60_goseq,
                                            excel=paste0("excel/strains_upa60_goseq_relaxedp-v", ver, ".xlsx")))
strains_up_a96_excel <- sm(write_goseq_data(strains_up_a96_goseq,
                                            excel=paste0("excel/strains_upa96_goseq_relaxedp-v", ver, ".xlsx")))
strains_up_tryp_excel <- sm(write_goseq_data(strains_up_tryp_goseq,
                                             excel=paste0("excel/strains_uptryp_goseq_relaxedp-v", ver, ".xlsx")))
strains_down_a60_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$downs$a60_clbr_over_cl14,
                                          go_db=all_go, length_db=all_lengths))
strains_down_a96_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$downs$a96_clbr_over_cl14,
                                          go_db=all_go, length_db=all_lengths))
strains_down_tryp_goseq <- sm(simple_goseq(sig_genes=strains_sigfiltv2$limma$downs$tryp_clbr_over_cl14,
                                           go_db=all_go, length_db=all_lengths))
strains_down_a60_excel <- sm(write_goseq_data(strains_down_a60_goseq,
                                            excel=paste0("excel/strains_downa60_goseq_relaxedp-v", ver, ".xlsx")))
strains_down_a96_excel <- sm(write_goseq_data(strains_down_a96_goseq,
                                            excel=paste0("excel/strains_downa96_goseq_relaxedp-v", ver, ".xlsx")))
strains_down_tryp_excel <- sm(write_goseq_data(strains_down_tryp_goseq,
                                             excel=paste0("excel/strains_downtryp_goseq_relaxedp-v", ver, ".xlsx")))
```

## Compare Epimastigotes between CL14 and CLBr, high expression

One important note here, I needed to modify the data structure from readGff() in clusterProfiler
before it was able to successfully generate the file 'geneTable.rda'
Unfortunately, I was watching a movie at the time and didn't properly write down what I did.
However, I did write it out in the function myr::Gff2GeneTable()
but in order to make it work one must load readGff() from clusterProfiler.
I think the real solution is for me to grab a version of readGff() from clusterProfiler
and make it exportable so that it will work consistently.

```{r ontology_epi_cl14clr_high, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
pval_thresh <- 0.0001
epi_cl14clbr_table <- all_tables$epi_cl14clbr
epi_cl14clbr_summary <- summary(epi_cl14clbr_table$logFC)
epi_cl14clbr_pvals <- epi_cl14clbr_table$adj.P.Val
names(epi_cl14clbr_pvals) <- rownames(epi_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_high <- subset(epi_cl14clbr_table, logFC >= epi_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
epi_cl14clbr_high_go <- simple_goseq(epi_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_high_go$pvalue_histogram
epi_cl14clbr_high_go$bpp_plot

summary(epi_cl14clbr_high_go)
fun <- epi_cl14clbr_high_go$godata_interesting
goseq_annotation <- goseq_table(fun)
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##epi_cl14clbr_high_trees = goseq_trees(degenes=epi_cl14clbr_high, epi_cl14clbr_high_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
epi_cl14clbr_high_cl <- simple_clusterprofiler(epi_cl14clbr_high)
epi_cl14clbr_high_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(epi_cl14clbr_high, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
epi_cl14clbr_high_top <- simple_topgo(epi_cl14clbr_high, goids=goid_file, pvals=epi_cl14clbr_pvals)

topgo_pval <- topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees <- topgo_trees(epi_cl14clbr_high_top)
top_trees$bp_fisher_tree
```

## Compare Epimastigotes between CL14 and CLBr, low expression

```{r ontology_epi_cl14clbr_low, eval=FALSE}
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_low = subset(epi_cl14clbr_table, logFC <= epi_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
epi_cl14clbr_low_go = simple_goseq(de_genes=epi_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_low_go$pvalue_histogram
epi_cl14clbr_low_go$bpp_plot
##epi_cl14clbr_low_cl = simple_clusterprofiler(epi_cl14clbr_low, gff="reference/gff/clbrener_8.1_complete_genes.gff.gz")
##epi_cl14clbr_low_cl$bp_all_barplot
epi_cl14clbr_low_top = simple_topgo(epi_cl14clbr_low, goids=goid_file, pvals=epi_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(epi_cl14clbr_low_top)
top_trees$bp_fisher_tree
```

## Compare trypo CL14/CLBr

```{r tryp_cl14clbr_high_goseq, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_high = subset(tryp_cl14clbr_table, logFC >= tryp_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
tryp_cl14clbr_high_go = simple_goseq(tryp_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_high_go$pvalue_histogram
tryp_cl14clbr_high_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##tryp_cl14clbr_high_trees = goseq_trees(degenes=tryp_cl14clbr_high, tryp_cl14clbr_high_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
tryp_cl14clbr_high_cl = simple_clusterprofiler(tryp_cl14clbr_high)
tryp_cl14clbr_high_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(tryp_cl14clbr_high, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
tryp_cl14clbr_high_top = simple_topgo(tryp_cl14clbr_high, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_high_top)
top_trees$bp_fisher_tree
```

## Compare cl14/clbr tryp other side

```{r tryp_cl14clbr_low_goseq, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_low = subset(tryp_cl14clbr_table, logFC <= tryp_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
tryp_cl14clbr_low_go = simple_goseq(tryp_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_low_go$pvalue_histogram
tryp_cl14clbr_low_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##tryp_cl14clbr_low_trees = goseq_trees(degenes=tryp_cl14clbr_low, tryp_cl14clbr_low_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
tryp_cl14clbr_low_cl = simple_clusterprofiler(tryp_cl14clbr_low)
tryp_cl14clbr_low_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(tryp_cl14clbr_low, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
tryp_cl14clbr_low_top = simple_topgo(tryp_cl14clbr_low, goids=goid_file, pvals=tryp_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(tryp_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_low_top)
top_trees$bp_fisher_tree
```

## Amastigote a96 cl14/clbr

```{r ontology_a96_cl14clbr_high, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = all_tables$a96_cl14clbr
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC >= a96_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##a96_cl14clbr_low_trees = goseq_trees(degenes=a96_cl14clbr_low, a96_cl14clbr_low_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(a96_cl14clbr_low, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree
```

```{r ontology_a96_cl14clbr_low, eval=FALSE}
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = strain_comparisons$data$a96_clbr_over_cl14
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC <= a96_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous script for how to do that (I think I do it automatically in simple_topgo()...
##a96_cl14clbr_low_trees = goseq_trees(degenes=a96_cl14clbr_low, a96_cl14clbr_low_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
##simple_topgo(a96_cl14clbr_low, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)

topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree

## Ways to look
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$B.x, a96=tmp$B.y)
tt = hpgl_linear_scatter(tmp_b_t)
tt$scatter
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$t.x, a96=tmp$t.y)
tt = hpgl_scatter(tmp_b_t)
tt$scatter
```

# KG

Oh yeah, I wrote a toy to wrap KEGG pathway stuff, lets try that.
The KEGG abbreviation for t. cruzi is tcr

In theory, the _paths data structure returned by hpgl_pathview() should be a table
of the numer of up/down genes observed in each KEGG path.  However, there are some
weird NAs getting returned.   Effing hippies. -- fixed.

```{r kegg_fun_v1}
kegg_list <- ""
```

```{r kegg_fun_v2, eval=FALSE}
##names(kegg_list) = rownames(epi_cl14clbr_table)
##epi_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_epi_cl14clbr", string_from="TcCLB.", string_to="")
##head(epi_cl14clbr_paths)


##a60_cl14clbr_table = all_tables$a60_cl14clbr
a60_cl14clbr_table <- strain_comparisons$data$a60_clbr_over_cl14
##kegg_list <- a60_cl14clbr_table$logFC
kegg_list <- a60_cl14clbr_table$limma_logfc
names(kegg_list) <- rownames(a60_cl14clbr_table)
a60_cl14clbr_paths <- hpgl_pathview(kegg_list, species="tcr",
                                    indir="pathview_in", outdir="pathview_a60_cl14clbr",
                                    from_list="TcCLB.", to_list="")
head(a60_cl14clbr_paths)

a96_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = a96_cl14clbr_table$logFC
names(kegg_list) = rownames(a96_cl14clbr_table)
a96_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_a96_cl14clbr", string_from="TcCLB.", string_to="")
head(a96_cl14clbr_paths)

tryp_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = tryp_cl14clbr_table$logFC
names(kegg_list) = rownames(tryp_cl14clbr_table)
tryp_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_tryp_cl14clbr", string_from="TcCLB.", string_to="")
head(tryp_cl14clbr_paths)
```
