index.html 01_annotation.html 02_sample_estimations.html 03_differential_expression.html
Lets set up some simple data sets for goseq/clusterProfiler/topGO
I think a bunch of the material in this sheet is actually now obsolete.
I need a couple vectors before being able to use goseq I also need some rda files generated for clusterProfiler and topGO but I will do those later…
## goseq uses the gene lengths, clusterprofiler and topgo don't
## gene_lengths <- all_genes[,c("geneid","genesize")]
## colnames(gene_lengths) <- c("ID", "width")
colnames(all_lengths) <- c("ID", "width")
## go_ids needs to be a 2 column data frame with names 'ID', 'GO' which are
## gene names and GO names respectively. I actually think I made my goseq() function
## smart enough that this is no longer needed, but nonetheless.
## go_ids <- read.table("reference/go/tcruzi_all_go.tab.gz", header=0, sep="\t")
## colnames(go_ids) <- c("ID","GO","ontology","term","source")
## Since there are multiple sources, I am doing a unique of the table...
## go_ids <- unique(go_ids[,c("ID","GO")])
## goid_file <- "reference/go/topgo_goids.tab.gz"
all_go <- as.data.frame(all_go)
all_go <- all_go[, c("GID", "GO")]
sig <- clbr_times_sigfilt$limma$ups$CLBr.Tryp_vs_CLBr.A96
clbr_up_a96tryp_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/clbr_up_a96tryp_goseq-v", ver, ".xlsx")))
clbr_up_a96tryp_goseq$pvalue_plots$bpp_plot_over
sig <- clbr_times_sigfilt$limma$downs$CLBr.Tryp_vs_CLBr.A96
clbr_down_a96tryp_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/clbr_down_a96tryp_goseq-v", ver, ".xlsx")))
clbr_down_a96tryp_goseq$pvalue_plots$bpp_plot_over
sig <- clbr_times_sigfilt$limma$ups$CLBr.A96_vs_CLBr.A60
clbr_up_a60a96_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/clbr_up_a60a96_goseq-v", ver, ".xlsx")))
clbr_up_a60a96_goseq$pvalue_plots$bpp_plot_over
sig <- clbr_times_sigfilt$limma$downs$CLBr.A96_vs_CLBr.A60
clbr_down_a60a96_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/clbr_down_a60a96_goseq-v", ver, ".xlsx")))
clbr_down_a60a96_goseq$pvalue_plots$bpp_plot_over
sig <- clbr_times_sigfilt$limma$ups$CLBr.A60_vs_CLBr.Tryp
clbr_up_trypa60_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/clbr_up_trypa60_goseq-v", ver, ".xlsx")))
clbr_up_trypa60_goseq$pvalue_plots$bpp_plot_over
sig <- clbr_times_sigfilt$limma$downs$CLBr.A60_vs_CLBr.Tryp
clbr_down_trypa60_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/clbr_down_trypa60_goseq-v", ver, ".xlsx")))
clbr_down_trypa60_goseq$pvalue_plots$bpp_plot_over
sig <- cl14_times_sigfilt$limma$ups$CL14.Tryp_vs_CL14.A96
cl14_up_a96tryp_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/cl14_up_a96tryp_goseq-v", ver, ".xlsx")))
cl14_up_a96tryp_goseq$pvalue_plots$bpp_plot_over
sig <- cl14_times_sigfilt$limma$downs$CL14.Tryp_vs_CL14.A96
cl14_down_a96tryp_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/cl14_down_a96tryp_goseq-v", ver, ".xlsx")))
cl14_down_a96tryp_goseq$pvalue_plots$bpp_plot_over
sig <- cl14_times_sigfilt$limma$ups$CL14.A96_vs_CL14.A60
cl14_up_a60a96_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/cl14_up_a60a96_goseq-v", ver, ".xlsx")))
cl14_up_a60a96_goseq$pvalue_plots$bpp_plot_over
sig <- cl14_times_sigfilt$limma$downs$CL14.A96_vs_CL14.A60
cl14_down_a60a96_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/cl14_down_a60a96_goseq-v", ver, ".xlsx")))
cl14_down_a60a96_goseq$pvalue_plots$bpp_plot_over
sig <- cl14_times_sigfilt$limma$ups$CL14.A60_vs_CL14.Tryp
cl14_up_trypa60_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/cl14_up_trypa60_goseq-v", ver, ".xlsx")))
cl14_up_trypa60_goseq$pvalue_plots$bpp_plot_over
sig <- cl14_times_sigfilt$limma$downs$CL14.A60_vs_CL14.Tryp
cl14_down_trypa60_goseq <- sm(simple_goseq(
sig_genes=sig,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/cl14_down_trypa60_goseq-v", ver, ".xlsx")))
cl14_down_trypa60_goseq$pvalue_plots$bpp_plot_over
strains_up_a60_goseq <- sm(simple_goseq(
sig_genes=strains_sigfilt$limma$ups$CLBr.A60_vs_CL14.A60,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/strains_upa60_goseq-v", ver, ".xlsx")))
strains_up_a96_goseq <- sm(simple_goseq(
sig_genes=strains_sigfilt$limma$ups$CLBr.A96_vs_CL14.A96,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/strains_upa96_goseq-v", ver, ".xlsx")))
strains_up_tryp_goseq <- sm(simple_goseq(
sig_genes=strains_sigfilt$limma$ups$CLBr.Tryp_vs_CL14.Tryp,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/strains_uptryp_goseq-v", ver, ".xlsx")))
strains_down_a60_goseq <- sm(simple_goseq(
sig_genes=strains_sigfilt$limma$downs$CLBr.A60_vs_CL14.A60,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/strains_downa60_goseq-v", ver, ".xlsx")))
strains_down_a96_goseq <- sm(simple_goseq(
sig_genes=strains_sigfilt$limma$downs$CLBr.A96_vs_CL14.A96,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/strains_downa96_goseq-v", ver, ".xlsx")))
strains_down_tryp_goseq <- sm(simple_goseq(
sig_genes=strains_sigfilt$limma$downs$CLBr.Tryp_vs_CL14.Tryp,
go_db=all_go,
length_db=all_lengths,
excel=paste0("excel/strains_downtryp_goseq-v", ver, ".xlsx")))
One important note here, I needed to modify the data structure from readGff() in clusterProfiler before it was able to successfully generate the file ‘geneTable.rda’ Unfortunately, I was watching a movie at the time and didn’t properly write down what I did. However, I did write it out in the function myr::Gff2GeneTable() but in order to make it work one must load readGff() from clusterProfiler. I think the real solution is for me to grab a version of readGff() from clusterProfiler and make it exportable so that it will work consistently.
## Changes in epimastigotes from CLBrener to CL14
pval_thresh <- 0.0001
epi_cl14clbr_table <- all_tables$epi_cl14clbr
epi_cl14clbr_summary <- summary(epi_cl14clbr_table$logFC)
epi_cl14clbr_pvals <- epi_cl14clbr_table$adj.P.Val
names(epi_cl14clbr_pvals) <- rownames(epi_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_high <- subset(epi_cl14clbr_table, logFC >= epi_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
epi_cl14clbr_high_go <- simple_goseq(epi_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_high_go$pvalue_histogram
epi_cl14clbr_high_go$bpp_plot
summary(epi_cl14clbr_high_go)
fun <- epi_cl14clbr_high_go$godata_interesting
goseq_annotation <- goseq_table(fun)
## Oh crap, I cannot make trees until I generate a id2go R mapping, I will need to check my previous
## script for how to do that (I think I do it automatically in simple_topgo()...
##epi_cl14clbr_high_trees = goseq_trees(degenes=epi_cl14clbr_high, epi_cl14clbr_high_go)
## I had to hack the geneTable() function in order to make it not take infinite memory because it
## was trying to do some stupid merge of every gff column against every other column...
## Once the geneTable.rda file exists and included real information, this seems to work fine, though.
epi_cl14clbr_high_cl <- simple_clusterprofiler(epi_cl14clbr_high)
epi_cl14clbr_high_cl$bp_all_barplot
## The first time this is run in a fresh directory, it needs to be run like this:
## simple_topgo(epi_cl14clbr_high, goids_df=go_ids)
## setting goids_df will tell it to generate a table of genes->GO<-genes in the format expected.
epi_cl14clbr_high_top <- simple_topgo(epi_cl14clbr_high, goids=goid_file, pvals=epi_cl14clbr_pvals)
topgo_pval <- topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees <- topgo_trees(epi_cl14clbr_high_top)
top_trees$bp_fisher_tree
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
epi_cl14clbr_low = subset(epi_cl14clbr_table, logFC <= epi_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
epi_cl14clbr_low_go = simple_goseq(de_genes=epi_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
epi_cl14clbr_low_go$pvalue_histogram
epi_cl14clbr_low_go$bpp_plot
##epi_cl14clbr_low_cl = simple_clusterprofiler(epi_cl14clbr_low, gff="reference/gff/clbrener_8.1_complete_genes.gff.gz")
##epi_cl14clbr_low_cl$bp_all_barplot
epi_cl14clbr_low_top = simple_topgo(epi_cl14clbr_low, goids=goid_file, pvals=epi_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(epi_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(epi_cl14clbr_low_top)
top_trees$bp_fisher_tree
## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with high expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_high = subset(tryp_cl14clbr_table, logFC >= tryp_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the high genes
tryp_cl14clbr_high_go = simple_goseq(tryp_cl14clbr_high, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_high_go$pvalue_histogram
tryp_cl14clbr_high_go$bpp_plot
tryp_cl14clbr_high_cl = simple_clusterprofiler(tryp_cl14clbr_high)
tryp_cl14clbr_high_cl$bp_all_barplot
tryp_cl14clbr_high_top = simple_topgo(tryp_cl14clbr_high, goids=goid_file, pvals=tryp_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(tryp_cl14clbr_high_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_high_top)
top_trees$bp_fisher_tree
## Changes in epimastigotes from CLBrener to CL14
tryp_cl14clbr_table = all_tables$tryp_cl14clbr
tryp_cl14clbr_summary = summary(tryp_cl14clbr_table$logFC)
tryp_cl14clbr_pvals = tryp_cl14clbr_table$adj.P.Val
names(tryp_cl14clbr_pvals) = rownames(tryp_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
tryp_cl14clbr_low = subset(tryp_cl14clbr_table, logFC <= tryp_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
tryp_cl14clbr_low_go = simple_goseq(tryp_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
tryp_cl14clbr_low_go$pvalue_histogram
tryp_cl14clbr_low_go$bpp_plot
tryp_cl14clbr_low_cl = simple_clusterprofiler(tryp_cl14clbr_low)
tryp_cl14clbr_low_cl$bp_all_barplot
tryp_cl14clbr_low_top = simple_topgo(tryp_cl14clbr_low, goids=goid_file, pvals=tryp_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(tryp_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(tryp_cl14clbr_low_top)
top_trees$bp_fisher_tree
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = all_tables$a96_cl14clbr
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC >= a96_cl14clbr_summary[5] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree
## Changes in epimastigotes from CLBrener to CL14
a96_cl14clbr_table = strain_comparisons$data$a96_clbr_over_cl14
a96_cl14clbr_summary = summary(a96_cl14clbr_table$logFC)
a96_cl14clbr_pvals = a96_cl14clbr_table$adj.P.Val
names(a96_cl14clbr_pvals) = rownames(a96_cl14clbr_table)
## Extract the genes with low expression in CL14 compared to Brener and very low pvalues
a96_cl14clbr_low = subset(a96_cl14clbr_table, logFC <= a96_cl14clbr_summary[2] & adj.P.Val <= pval_thresh)
## Do a goseq comparison of the low genes
a96_cl14clbr_low_go = simple_goseq(a96_cl14clbr_low, lengths=gene_lengths, goids=go_ids)
a96_cl14clbr_low_go$pvalue_histogram
a96_cl14clbr_low_go$bpp_plot
a96_cl14clbr_low_cl = simple_clusterprofiler(a96_cl14clbr_low)
a96_cl14clbr_low_cl$bp_all_barplot
a96_cl14clbr_low_top = simple_topgo(a96_cl14clbr_low, goids=goid_file, pvals=a96_cl14clbr_pvals)
topgo_pval = topgo_pval_plot(a96_cl14clbr_low_top)
topgo_pval$BP
top_trees = topgo_trees(a96_cl14clbr_low_top)
top_trees$bp_fisher_tree
## Ways to look
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$B.x, a96=tmp$B.y)
tt = hpgl_linear_scatter(tmp_b_t)
tt$scatter
a60_cl14clbr_table = all_tables$a60_cl14clbr
tmp = merge(a60_cl14clbr_table, a96_cl14clbr_table, by="row.names")
tmp_b_t = data.frame(a60=tmp$t.x, a96=tmp$t.y)
tt = hpgl_scatter(tmp_b_t)
tt$scatter
Oh yeah, I wrote a toy to wrap KEGG pathway stuff, lets try that. The KEGG abbreviation for t. cruzi is tcr
In theory, the _paths data structure returned by hpgl_pathview() should be a table of the numer of up/down genes observed in each KEGG path. However, there are some weird NAs getting returned. Effing hippies. – fixed.
kegg_list <- ""
##names(kegg_list) = rownames(epi_cl14clbr_table)
##epi_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_epi_cl14clbr", string_from="TcCLB.", string_to="")
##head(epi_cl14clbr_paths)
##a60_cl14clbr_table = all_tables$a60_cl14clbr
a60_cl14clbr_table <- strain_comparisons$data$a60_clbr_over_cl14
##kegg_list <- a60_cl14clbr_table$logFC
kegg_list <- a60_cl14clbr_table$limma_logfc
names(kegg_list) <- rownames(a60_cl14clbr_table)
a60_cl14clbr_paths <- hpgl_pathview(kegg_list, species="tcr",
indir="pathview_in", outdir="pathview_a60_cl14clbr",
from_list="TcCLB.", to_list="")
head(a60_cl14clbr_paths)
a96_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = a96_cl14clbr_table$logFC
names(kegg_list) = rownames(a96_cl14clbr_table)
a96_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_a96_cl14clbr", string_from="TcCLB.", string_to="")
head(a96_cl14clbr_paths)
tryp_cl14clbr_table = all_tables$a96_cl14clbr
kegg_list = tryp_cl14clbr_table$logFC
names(kegg_list) = rownames(tryp_cl14clbr_table)
tryp_cl14clbr_paths = hpgl_pathview(kegg_list, species="tcr", indir="pathview_in", outdir="pathview_tryp_cl14clbr", string_from="TcCLB.", string_to="")
head(tryp_cl14clbr_paths)