goseq

## Load the libraries used below: goseq provides nullp()/goseq(), and GO.db
## (via AnnotationDbi) provides the GOTERM map and its accessors used near the end.
library(goseq)
library(GO.db)

## Get a refresher on the likely data structures to use.
head(play_data)
##              deplete replete
## LmxM.01.0010   5.193   5.059
## LmxM.01.0020   6.514   6.446
## LmxM.01.0030   6.979   7.055
## LmxM.01.0040   5.168   5.008
## LmxM.01.0050   6.191   6.007
## LmxM.01.0060   5.975   6.139
head(combat_condbatch_table)
##               logFC AveExpr      t   P.Value adj.P.Val     B
## LmxM.02.0460 -2.553   7.068 -36.93 6.235e-12 5.026e-08 16.31
## LmxM.30.1200 -2.090   5.183 -28.48 7.973e-11 1.320e-07 13.89
## LmxM.33.0070 -1.920   6.916 -28.40 8.184e-11 1.320e-07 14.77
## LmxM.30.1190  1.767   5.429  22.25 8.837e-10 5.780e-07 12.47
## LmxM.09.0060  1.744   8.812  29.52 5.604e-11 1.320e-07 14.87
## LmxM.08.0290 -1.725   6.478 -24.22 3.882e-10 3.478e-07 13.52
go_ids = read.table("reference/LmxM_go.txt", header=TRUE)
kegg_ids = read.csv("reference/KEGG_mexicana.csv", header=TRUE)
## Reorder and rename the KEGG mapping so it is a two-column (ID, category)
## data frame suitable for goseq's gene2cat argument.
kegg_ids = kegg_ids[, c(2, 1)]
colnames(kegg_ids) = c("ID", "GO")
gene_lengths = read.table("reference/LmxM_length.txt", header=TRUE)
gene_information = merge(gene_lengths, combat_condbatch_table, by.x="ID", by.y="row.names")
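## Quick sanity check that the merge behaved as expected (a sketch; output
## not reproduced here): gene_information should have one row per gene that
## appears in both the length table and the limma table.
dim(gene_lengths)
dim(combat_condbatch_table)
dim(gene_information)
head(gene_information)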

## Pull out "significant" subsets
## The following two data.frames use a fold-change cutoff (|logFC| > 0.693, roughly 1.6-fold) and are not used for the final GO analysis
up_significant = subset(gene_information, logFC > 0.6931472)
down_significant = subset(gene_information, logFC < -0.6931472)
## The next three data.frames keep the genes with adj.P.Val < 0.05 and are the ones used
p_significant = subset(gene_information, adj.P.Val < 0.05)
p_sig_up = subset(p_significant, logFC > 0.0)
p_sig_down = subset(p_significant, logFC < 0.0)
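## A quick look at how many genes fall into each subset (sketch only; the
## counts are not shown here).
sapply(list(up_fc=up_significant, down_fc=down_significant, p=p_significant,
            p_up=p_sig_up, p_down=p_sig_down), nrow)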

## Mark the genes in each subset as DE (1), then keep only the ID and DE columns
up_significant$DE = 1
up_significant = up_significant[,c(1,9)]
down_significant$DE = 1
down_significant = down_significant[,c(1,9)]
p_significant$DE = 1
p_significant = p_significant[,c(1,9)]
p_sig_up$DE = 1
p_sig_up = p_sig_up[,c(1,9)]
p_sig_down$DE = 1
p_sig_down = p_sig_down[,c(1,9)]
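## Selecting columns by position ([,c(1,9)]) works here because column 1 is ID
## and column 9 is the DE flag just added, but it is fragile if the upstream
## merge ever changes; the same selection by name, shown for one frame as a
## sketch (a no-op at this point):
p_sig_down = p_sig_down[, c("ID", "DE")]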

## Now merge them against the annotation/length information; sadly I
## don't remember the correct way to do this...
## Let's think: the goal is to have a list of all IDs and a boolean
## yes/no for DE.
## So a stupid way to handle this is to merge each subset back against
## everything, leave DE as NA where it isn't 1, then fill it back in as 0.
## The second merge is to include the subset of genes whose expression was
## essentially 0 (low counts), so I am merging back to gene_lengths.
## (A more compact version of this repeated pattern is sketched after the
## five blocks below.)
up_significant = merge(gene_information, up_significant, by="ID", all.x=TRUE)
up_significant = merge(up_significant, gene_lengths, by="ID", all.y=TRUE)
up_significant[is.na(up_significant)] = 0
rownames(up_significant) = up_significant$ID
up_significant_id_vector = as.vector(up_significant$ID)
names(up_significant_id_vector) = up_significant$ID
up_significant_width_vector = as.vector(up_significant$width.y)
names(up_significant_width_vector) = up_significant$ID
up_significant_DE_vector = as.vector(up_significant$DE)
names(up_significant_DE_vector) = up_significant$ID

down_significant = merge(gene_information, down_significant, by="ID", all.x=TRUE)
down_significant = merge(down_significant, gene_lengths, by="ID", all.y=TRUE)
down_significant[is.na(down_significant)] = 0
rownames(down_significant) = down_significant$ID
down_significant_id_vector = as.vector(down_significant$ID)
names(down_significant_id_vector) = down_significant$ID
down_significant_width_vector = as.vector(down_significant$width.y)
names(down_significant_width_vector) = down_significant$ID
down_significant_DE_vector = as.vector(down_significant$DE)
names(down_significant_DE_vector) = down_significant$ID

p_significant = merge(gene_information, p_significant, by="ID", all.x=TRUE)
p_significant = merge(p_significant, gene_lengths, by="ID", all.y=TRUE)
p_significant[is.na(p_significant)] = 0
rownames(p_significant) = p_significant$ID
p_significant_id_vector = as.vector(p_significant$ID)
names(p_significant_id_vector) = p_significant$ID
p_significant_width_vector = as.vector(p_significant$width.y)
names(p_significant_width_vector) = p_significant$ID
p_significant_DE_vector = as.vector(p_significant$DE)
names(p_significant_DE_vector) = p_significant$ID

p_sig_up = merge(gene_information, p_sig_up, by="ID", all.x=TRUE)
p_sig_up = merge(p_sig_up, gene_lengths, by="ID", all.y=TRUE)
p_sig_up[is.na(p_sig_up)] = 0
rownames(p_sig_up) = p_sig_up$ID
p_sig_up_id_vector = as.vector(p_sig_up$ID)
names(p_sig_up_id_vector) = p_sig_up$ID
p_sig_up_width_vector = as.vector(p_sig_up$width.y)
names(p_sig_up_width_vector) = p_sig_up$ID
p_sig_up_DE_vector = as.vector(p_sig_up$DE)
names(p_sig_up_DE_vector) = p_sig_up$ID

p_sig_down = merge(gene_information, p_sig_down, by="ID", all.x=TRUE)
p_sig_down = merge(p_sig_down, gene_lengths, by="ID", all.y=TRUE)
p_sig_down[is.na(p_sig_down)] = 0
rownames(p_sig_down) = p_sig_down$ID
p_sig_down_id_vector = as.vector(p_sig_down$ID)
names(p_sig_down_id_vector) = p_sig_down$ID
p_sig_down_width_vector = as.vector(p_sig_down$width.y)
names(p_sig_down_width_vector) = p_sig_down$ID
p_sig_down_DE_vector = as.vector(p_sig_down$DE)
names(p_sig_down_DE_vector) = p_sig_down$ID
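## The merge-and-fill dance above is repeated five times; a helper could do it
## once per subset. This is only a sketch of the same logic (the name
## make_goseq_vectors and its arguments are mine, not part of goseq):
make_goseq_vectors = function(subset_df, all_info, lengths) {
    ## Merge the subset back against everything, fill missing DE flags with 0.
    merged = merge(all_info, subset_df, by="ID", all.x=TRUE)
    merged = merge(merged, lengths, by="ID", all.y=TRUE)
    merged[is.na(merged)] = 0
    ## Return the named DE and length vectors that nullp() wants.
    de_vector = as.vector(merged$DE)
    names(de_vector) = merged$ID
    width_vector = as.vector(merged$width.y)
    names(width_vector) = merged$ID
    return(list(de=de_vector, width=width_vector))
}
## It would be applied to each of the original two-column (ID, DE) subsets in
## place of the five blocks above.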

Perform nullp

Ok, I am a little confused about the data structures, but I think I got them in place; let's see what happens…
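For what it is worth, nullp() just wants a named 0/1 DE vector (names are gene IDs) plus a matching bias vector, gene length here, since this genome is not natively supported. A quick structural check, as a sketch:

## Both vectors should be named by the same gene IDs, in the same order.
str(p_significant_DE_vector)
str(p_significant_width_vector)
stopifnot(identical(names(p_significant_DE_vector),
                    names(p_significant_width_vector)))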

## For the moment I think these are the most interesting to Andrew
pwf_psig = nullp(p_significant_DE_vector, bias.data=p_significant_width_vector)

[Figure (chunk perform_nullp): nullp() probability weighting function fit for all DE genes]

go_psig = goseq(pwf_psig, gene2cat = go_ids, method = 'Wallenius')
## Using manually entered categories.
## Calculating the p-values...
go_penriched = go_psig$category[p.adjust(go_psig$over_represented_pvalue, method="BH") < 0.1]
kegg_psig = goseq(pwf_psig, gene2cat = kegg_ids, method = 'Wallenius')
## Using manually entered categories.
## Calculating the p-values...
kegg_penriched = kegg_psig$category[p.adjust(kegg_psig$over_represented_pvalue, method="BH") < 0.1]
kegg_psig$bh = p.adjust(kegg_psig$over_represented_pvalue, method="BH")
write.csv(kegg_psig, file="txt/kegg_pvalue_significant_all.csv")
write.csv(kegg_penriched, file="txt/kegg_pvalue_significant_enriched.csv")
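## Only the KEGG results get exported here; if the full GO table for this
## comparison is wanted too, the same pattern applies (the file names below
## are my own suggestion, done on a copy so go_psig is left untouched).
go_psig_out = go_psig
go_psig_out$bh = p.adjust(go_psig_out$over_represented_pvalue, method="BH")
write.csv(go_psig_out, file="txt/go_pvalue_significant_all.csv")
write.csv(go_penriched, file="txt/go_pvalue_significant_enriched.csv")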

pwf_pup = nullp(p_sig_up_DE_vector, bias.data=p_sig_up_width_vector)

[Figure (chunk perform_nullp): nullp() probability weighting function fit for the up-regulated set]

go_pup = goseq(pwf_pup, gene2cat = go_ids, method = 'Wallenius')
## Using manually entered categories.
## Calculating the p-values...
go_pup_enriched = go_pup$category[p.adjust(go_pup$over_represented_pvalue, method="BH") < 0.05]
kegg_pup = goseq(pwf_pup, gene2cat = kegg_ids, method = 'Wallenius')
## Using manually entered categories.
## Calculating the p-values...
kegg_pup_enriched = kegg_pup$category[p.adjust(kegg_pup$over_represented_pvalue, method="BH") < 0.1]
write.csv(kegg_pup, file="txt/kegg_pvalue_up_significant_all.csv")
write.csv(kegg_pup_enriched, file="txt/kegg_pvalue_up_significant_enriched.csv")

pwf_pdown = nullp(p_sig_down_DE_vector, bias.data=p_sig_down_width_vector)

[Figure (chunk perform_nullp): nullp() probability weighting function fit for the down-regulated set]

go_pdown = goseq(pwf_pdown, gene2cat = go_ids, method = 'Wallenius')
## Using manually entered categories.
## Calculating the p-values...
go_pdown_enriched = go_pdown$category[p.adjust(go_pdown$over_represented_pvalue, method="BH") < 0.05]
write.csv(go_pdown, file="txt/go_pvalue_down_significant_all.csv")
write.csv(go_pdown_enriched, file="txt/go_pvalue_down_significant_enriched.csv")

kegg_pdown = goseq(pwf_pdown, gene2cat = kegg_ids, method = 'Wallenius')
## Using manually entered categories.
## Calculating the p-values...
kegg_pdown_enriched = kegg_pdown$category[p.adjust(kegg_pdown$over_represented_pvalue, method="BH") < 0.1]
write.csv(kegg_pdown, file="txt/kegg_pvalue_down_significant_all.csv")
write.csv(kegg_pdown_enriched, file="txt/kegg_pvalue_down_significant_enriched.csv")

## Andrew wanted to see which categories would be lost or gained depending on
## the FDR (BH-adjusted p-value) threshold used: 0.05 vs. 0.1
## First for the GO categories
## All DE genes together
AFtest = subset(go_psig, p.adjust(go_psig$over_represented_pvalue, method = "BH") < 0.05)
write.table(AFtest, file="txt/AFpsig_all_005.txt", quote=FALSE, sep="\t", row.names=FALSE)
AFtest = subset(go_psig, p.adjust(go_psig$over_represented_pvalue, method = "BH") < 0.1)
write.table(AFtest, file="txt/AFpsig_all_01.txt", quote=FALSE, sep="\t", row.names=FALSE)
## Only the up regulated genes
AFtestup = subset(go_pup, p.adjust(go_pup$over_represented_pvalue, method="BH") < 0.05)
write.table(AFtestup, file="txt/AFpsig_up_005.txt", quote=FALSE, sep="\t", row.names=FALSE)
AFtestup = subset(go_pup, p.adjust(go_pup$over_represented_pvalue, method="BH") < 0.1)
write.table(AFtestup, file="txt/AFpsig_up_01.txt", quote=FALSE, sep="\t", row.names=FALSE)
## Only the down regulated genes
AFtestdown = subset(go_pdown, p.adjust(go_pdown$over_represented_pvalue, method="BH") < 0.05)
write.table(AFtestdown, file="txt/AFpsig_down_005.txt", quote=FALSE, sep="\t", row.names=FALSE)
AFtestdown = subset(go_pdown, p.adjust(go_pdown$over_represented_pvalue, method="BH") < 0.1)
write.table(AFtestdown, file="txt/AFpsig_down_01.txt", quote=FALSE, sep="\t", row.names=FALSE)

## Now for the KEGG terms
## All DE genes together
AFtest = subset(kegg_psig, p.adjust(kegg_psig$over_represented_pvalue, method = "BH") < 0.05)
write.table(AFtest, file="txt/AFkeggpsig_all_005.txt", quote=FALSE, sep="\t", row.names=FALSE)
AFtest = subset(kegg_psig, p.adjust(kegg_psig$over_represented_pvalue, method = "BH") < 0.1)
write.table(AFtest, file="txt/AFkeggpsig_all_01.txt", quote=FALSE, sep="\t", row.names=FALSE)
## Only the up regulated genes
AFtestup = subset(kegg_pup, p.adjust(kegg_pup$over_represented_pvalue, method="BH") < 0.05)
write.table(AFtestup, file="txt/AFkeggpsig_up_005.txt", quote=FALSE, sep="\t", row.names=FALSE)
AFtestup = subset(kegg_pup, p.adjust(kegg_pup$over_represented_pvalue, method="BH") < 0.1)
write.table(AFtestup, file="txt/AFkeggpsig_up_01.txt", quote=FALSE, sep="\t", row.names=FALSE)
## Only the down regulated genes
AFtestdown = subset(kegg_pdown, p.adjust(kegg_pdown$over_represented_pvalue, method="BH") < 0.05)
write.table(AFtestdown, file="txt/AFkeggpsig_down_005.txt", quote=FALSE, sep="\t", row.names=FALSE)
AFtestdown = subset(kegg_pdown, p.adjust(kegg_pdown$over_represented_pvalue, method="BH") < 0.1)
write.table(AFtestdown, file="txt/AFkeggpsig_down_01.txt", quote=FALSE, sep="\t", row.names=FALSE)
## Clean up the temporary objects
rm(AFtest, AFtestup, AFtestdown)
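## The gained/lost categories can also be computed directly instead of diffing
## the exported files; a sketch using the GO results for all DE genes (the
## same idea applies to the up/down and KEGG tables).
go_psig_bh = p.adjust(go_psig$over_represented_pvalue, method="BH")
gained = setdiff(go_psig$category[go_psig_bh < 0.1],
                 go_psig$category[go_psig_bh < 0.05])
gained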

Now I have on hand a set of enriched categories and I want to write them out prettily. Previously I did this with the function 'wtf', which takes the goseq result table re-merged against the enriched categories and grabs the GOTERM data for those entries.

But damn, that is confusing.

## First pull a data frame of the enriched categories -- oh yeah, they come back as a plain character vector, which is not directly mergeable...
go_pup_df = as.data.frame(go_pup_enriched)
go_pup_df = merge(go_pup_df, go_pup, by.x="go_pup_enriched", by.y="category")
rownames(go_pup_df) = go_pup_df[,1]
go_pup_df = go_pup_df[-1]

go_pdown_df = as.data.frame(go_pdown_enriched)
go_pdown_df = merge(go_pdown_df, go_pdown, by.x="go_pdown_enriched", by.y="category")
rownames(go_pdown_df) = go_pdown_df[,1]
go_pdown_df = go_pdown_df[-1]

go_penriched_df = as.data.frame(go_penriched)
go_penriched_df = merge(go_penriched_df, go_psig, by.x="go_penriched", by.y="category")
rownames(go_penriched_df) = go_penriched_df[,1]
go_penriched_df = go_penriched_df[-1]
## Helpers to pull individual fields out of GO.db's GOTERM map for a GO ID.
getterm = function(go) {
    x = GOTERM[go]
    term = Term(x)
    return(toString(term))
}
getsyn = function(go) {
    x = GOTERM[go]
    syn = Synonym(x)
    return(toString(syn))
}
getsec = function(go) {
    x = GOTERM[go]
    sec = Secondary(x)
    return(toString(sec))
}
getdef = function(go) {
    x = GOTERM[go]
    def = Definition(x)
    return(toString(def))
}
getont = function(go) {
    x = GOTERM[go]
    ont = Ontology(x)
    return(toString(ont))
}
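## A quick check that the helpers behave; GO:0008150 is the root
## biological_process term.
getterm("GO:0008150")  ## should give "biological_process"
getont("GO:0008150")   ## should give "BP"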
## wtf: given a data frame of enriched categories (rownames = GO IDs), attach
## the GO term name, synonyms, secondary IDs, ontology, and definition, then
## write the result to csv.
wtf = function(df, filename) {
    df$names = rownames(df)
    colnames(df) = c("over_pval","under_pval","numDEinCat","numInCat","GOid")
    df$term = mapply(getterm, df$GOid)
    df$syn = mapply(getsyn, df$GOid)
    df$sec = mapply(getsec, df$GOid)
    df$ont = mapply(getont, df$GOid)
    df$def = mapply(getdef, df$GOid)
    write.csv(df, file=filename)
    return(df)
}
p_sig_up_table = wtf(go_pup_df, "txt/go_psig_up.csv")
p_sig_down_table = wtf(go_pdown_df, "txt/go_psig_down.csv")
p_sig_table = wtf(go_penriched_df, "txt/go_psig_both.csv")
