Introduction

This is a knitr version of Ramzi’s excellently done DEAnalysis.R for Andrew’s Leishmania amazonensis transcriptome.

Basically I am just wrapping the various sections in knitr R blocks and putting some explanatory text in between so that I make certain I know what is going on at each stage.

I should note that I was very skeptical of Ramzi’s pipeline/.conf file methodology, but I think that it was in fact very well done.

##rm(list = ls())
##source("http://bioconductor.org/biocLite.R")
library("preprocessCore")
library("sva")
library("gplots")
library("ggplot2")
library("RColorBrewer")
library("DESeq")
library("edgeR")
library("limma")
library("cbcbSEQ")
source("DEA_Core_Functions_BVL_V2.2_PublicationPlots.R")
setwd("~/scratch/rnaseq_amazonensis")

Configuration

First set up the configuration used by this set of tools

Design.file = "Andrew2_HPGL0199_HPGL0216.conf"
counts.dir = "counts_unstranded"
DEComparison.file = "DEComparison.conf"
name.prefix = "Andrew_project2_"
OutputDir = "Results_DEAnalysis_unstranded"
gff.file = "Lmexicana_TriTrypDB-4.2.gff"
SelFeature = "gene"
fields = c("ID","Name","description")
adjpval = .05

Start the analysis

I am skipping the DEAnalysis call in Ramzi’s script, but will instead perform the following analyses one at a time.

Design = read.table(Design.file,header=T)
DEComparisons=read.table(DEComparison.file)
nprefix=basename(Design.file)
conds = Design$Condition
Batch = Design$Batch
print("Read the first count file to fetch genes names") 
## [1] "Read the first count file to fetch genes names"
tmp = read.delim(as.character(paste(counts.dir, "/", Design[1,1],".count",sep="")),
    header=F, sep="\t", na.strings="NA", dec=".", strip.white=TRUE, comment.char = "#")

print("consolidate CountsTable from individual count files")
## [1] "consolidate CountsTable from individual count files"
CountsTable = matrix(nrow=dim(tmp)[1],ncol=dim(Design)[1])
rownames(CountsTable) = tmp[,1]
colnames(CountsTable) = Design[,1]
CountsTable[,1] = tmp[,2]
for(i in 2:dim(Design)[1]) {
    ##print(as.character(Design[i,1]))
    CountsTable[,i] = read.delim(as.character(paste(counts.dir, "/", Design[i,1],".count",sep="")),
                   header=F, sep="\t", na.strings="NA",
                   dec=".", strip.white=TRUE, comment.char = "#",)[,2]
}

Now write out the count tables

Also write out a plot of the non-zero genes observed

dir.create(paste("../",OutputDir,sep=""), showWarnings = FALSE)
setwd(paste("../",OutputDir,sep=""))
print("# write count table into a file ")
## [1] "# write count table into a file "
write.table(cbind(GeneID=rownames(CountsTable),CountsTable),paste(name.prefix,"_CountsTable.xls",sep=""),sep="\t",row.names = F)

print("# find number of expressed gene / total reads per condition")
## [1] "# find number of expressed gene / total reads per condition"
NZG_TR=data.frame(SampleID=colnames(CountsTable),NonZeroGenes=colSums(CountsTable>1),Count=colSums(CountsTable)*1e-6,Condition=Design$C,Batch=Design$B)
write.table(NZG_TR,file=paste(name.prefix,"_","NZG_CPM_data.xls",sep=""),sep="\t",row.names=F)
  
mycpmp=ggplot(NZG_TR, aes(Count,NonZeroGenes, color=Condition, shape=Batch))+
    scale_shape_identity() +      
    geom_point(stat="identity",size=5) + 
    theme(axis.text.x=element_text(angle=-90)) + 
    ## geom_text(aes(label=SampleID), size=3,vjust=-1) + 
    theme_bw()
  
##ggsave(mycpmp, file=paste(name.prefix,"_","NZG_CPM_plot.png",sep=""), dpi=600)
mycpmp

plot of chunk write_count_tables

Filter low count genes

print("Filter low count genes")
## [1] "Filter low count genes"
ntgenes = dim(CountsTable)[1]
CountsTable = CountsTable[rowSums(CountsTable) > dim(CountsTable)[2],]
nfgenes = dim(CountsTable)[1]
print(paste("Total Number Of Genes: ",ntgenes," / Number Of Genes After Filtering: ",nfgenes, " / Percentage Of Genes after filtering: ", round(100*nfgenes/ntgenes,digits=2),"%"))
## [1] "Total Number Of Genes:  8334  / Number Of Genes After Filtering:  8114  / Percentage Of Genes after filtering:  97.36 %"
### add number of gene  before and after filtering and percentage 

Plot the library size

print("Plot Library size (millions)")
## [1] "Plot Library size (millions)"
tmp = cbind(Design[,-3],Count=colSums(CountsTable)*1e-6)
mycpmp = ggplot(tmp, aes(HPGL_ID,Count, fill=Condition))+ 
    geom_bar(stat="identity", position="dodge") + 
    theme(axis.text.x=element_text(angle=-90)) + 
    theme_bw()
ggsave(mycpmp, file=paste(name.prefix,"_","Count_plot.png",sep=""), dpi=600)
## Saving 7 x 5 in image

Plot the heatmaps and PCA plot of the raw data

print("# generate HeatMap & PCA raw data")
## [1] "# generate HeatMap & PCA raw data"
plot.cluster2(CountsTable,Design,name.prefix) 
## [1] "# HeatMap Plot"
## [1] "# PCA plot"
## Saving 7 x 5 in image

plot of chunk raw_plot_pca_heatmap

print("generate table with correlation of (Condition,Batch) to Principal component")
## [1] "generate table with correlation of (Condition,Batch) to Principal component"

Normalize and replot the data

print("# generate HeatMap & PCA normalized log2cpm data")
## [1] "# generate HeatMap & PCA normalized log2cpm data"
qcounts = qNorm(CountsTable)
## convert to log(counts per million)
res = log2CPM(qcounts)
y = res$y
plot.cluster2(y,Design,paste(name.prefix,"_Norm_log2CPM_Heatmap.png",sep=""))
## [1] "# HeatMap Plot"
## [1] "# PCA plot"
## Saving 7 x 5 in image

plot of chunk normalize_replot

print("# generate PCA to Condition/Batch Correlation table raw data ") 
## [1] "# generate PCA to Condition/Batch Correlation table raw data "
cSVD=makeSVD(CountsTable)
cpcRes=pcRes(cSVD$v,cSVD$d,Design$Condition,Design$Batch)
write.table(cpcRes,paste(name.prefix,"_PCcorCondBatch_raw_Table.xls",sep=""),sep="\t",row.names = F)
print("# generate PCA to Condition/Batch Correlation table normalized log2CPM data ") 
## [1] "# generate PCA to Condition/Batch Correlation table normalized log2CPM data "
cSVD=makeSVD(y)
cpcRes=pcRes(cSVD$v,cSVD$d,Design$Condition,Design$Batch)
write.table(cpcRes,paste(name.prefix,"_PCcorCondBatch_log2CPM_Table.xls",sep=""),sep="\t",row.names = F)

Batch correction in case of major batch effect

print("Perform Batch Correction")  
CountsTable2 = batchSEQ(CountsTable,model.matrix(~0+Design$C),Design$B,Design$C)
CountsTable2 = CountsTable2$elist
colnames(CountsTable2$design) = sapply(strsplit(colnames(CountsTable2$design),split="\\$"), "[[", 2)   
colnames(CountsTable2$design) = substr(colnames(CountsTable2$design),2,nchar(colnames(CountsTable2$design)))
plot.cluster2(CountsTable2$E,Design,paste(name.prefix,"Batch_corrected"))
cSVD = makeSVD(CountsTable2)
cpcRes = pcRes(cSVD$v,cSVD$d,Design$Condition,Design$Batch)
write.table(cpcRes,paste(name.prefix,"_PCcorCondBatch_BatchCorrected_Table.xls",sep=""),sep="\t",row.names = F)

Voom invocation

CountsTable2 = voom(CountsTable, model.matrix(~0+Design$Condition), plot=TRUE)

plot of chunk voom_invocation

colnames(CountsTable2$design) = sapply(strsplit(colnames(CountsTable2$design), split="\\$"), "[[", 2)   
colnames(CountsTable2$design) = substr(colnames(CountsTable2$design), 2,
            nchar(colnames(CountsTable2$design)))

Correct batch in the model via limma

print("correct for batch in limma model")
## [1] "correct for batch in limma model"
Design$Batch = as.character(Design$Batch)
##mod = model.matrix(~0+Design$C+Design$B)  #limma
##mod = model.matrix(~0+conds+Batch)
batch_model = model.matrix(~Batch)
both_model = model.matrix(~0+conds+Batch)
CountsTable2 = voom(CountsTable, batch_model, plot=TRUE)

plot of chunk limma_batch

## My understanding is to fit the linear model now

fit = lmFit(CountsTable2)
batch_removed = residuals(fit, CountsTable2)
decomposed = makeSVD(batch_removed)
pcRes(decomposed$v, decomposed$d, conds, Batch)
##   propVar cumPropVar cond.R2 batch.R2
## 1   43.61      43.61   65.10     0.01
## 2   33.77      77.38   32.26     0.03
## 3   13.62      91.00    1.52     0.00
## 4    8.90      99.90    1.12     0.16
## 5    0.10     100.00    0.00    99.81
plot.cluster2(batch_removed ,Design, "decomposed.png")
## [1] "# HeatMap Plot"
## [1] "# PCA plot"
## Saving 7 x 5 in image

plot of chunk limma_batch

## Now fit the normalized data and used that to make the contrasts
norm_linear_model = voom(qcounts, both_model, plot=TRUE)

plot of chunk limma_batch

norm_design = norm_linear_model$design
fit = lmFit(norm_linear_model)

contrast_matrix = makeContrasts(WT_dFE_v_WT_rFE = condsWT_dFE-condsWT_rFE, levels=norm_linear_model$design)
fit = contrasts.fit(fit, contrast_matrix)
contrast_data = eBayes(fit)
replete_deplete_table = topTable(contrast_data, coef="WT_dFE_v_WT_rFE", number=nrow(norm_linear_model$E), sort.by="logFC")
write.csv(replete_deplete_table, file="replete_deplete_comparison.csv")

cpm_values = norm_linear_model$E
gene_names = rownames(replete_deplete_table)
fitted_means = rowMeans(cpm_values[gene_names,])
ma_plot_data = data.frame(avg_exp=fitted_means,
    logfc=replete_deplete_table$logFC,
    adj_pv=replete_deplete_table$adj.P.Val)
ggplot(ma_plot_data, aes(avg_exp,logfc, color=(adj_pv < 0.05))) +
    geom_hline(yintercept=c(-1,1), color="Red", size=2) +
    geom_point(stat="identity", size=3) +
    theme(axis.text.x=element_text(angle=-90)) +
    xlab("Average Count(Million Reads)") +
    ylab("log FC") + 
    theme_bw()

plot of chunk limma_batch

#colnames(mod) = substr(colnames(mod),9,nchar(colnames(mod)))
#CountsTable2 = voom(CountsTable, mod, plot=TRUE)
#print("create Contrast matrix from DEComparison file")

### This is where everything falls apart
#kont = paste("makeContrasts(",paste("ctr",1,"=", DEComparisons[1,1], "-", DEComparisons[1,2],sep=""))
#for (i in 2:nrow(DEComparisons)) {
#    kont=paste(kont,paste("ctr",i,"=", DEComparisons[i,1], "-", DEComparisons[i,2],sep=""),sep=",")
#}
#kont=paste(kont,", levels=CountsTable2$design)")
#kont
###  I think the actual call should be
##assign("matC",eval(parse(text=kont)))
## Though I would think it would be dFE-rFE and have replete as control...
#matC = makeContrasts(WT_rFE - WT_dFE, levels=c("WT_rFE","WT_dFE"))
#print("Contrast matrix created")
#print(matC)

Get the fit data

fit0 = lmFit(CountsTable2)
fit = contrasts.fit(fit0, matC)  
eb = eBayes(fit)

Perform contrasts

for (i in 1:dim(DEComparisons)[1]) {
    cond1=as.character(DEComparisons[i,1])
    cond2=as.character(DEComparisons[i,2])
    print(paste("performing DE analysis :",cond1,"Vs.",cond2,sep=" "))
    top1 = topTable(eb, coef=paste("ctr",i,sep=""), n=nrow(CountsTable2$E),sort.by="M")
    
    print("MAPlot generation")
    madata=data.frame(AvgExp=rowMeans(CountsTable2$E[top1$ID,]),LogFC=top1$logFC,AdjPVal=top1$adj.P.Val)
    mymap=ggplot(madata, aes(AvgExp,LogFC, color=(AdjPVal<adjpval))) +
        geom_hline(yintercept=c(-1,1),color="Red",size=2) +
        geom_point(stat="identity",size=3) +
        theme(axis.text.x=element_text(angle=-90)) +
        xlab("Average Count(Million Reads)") +ylab("log FC") + 
        theme_bw()
                 
    ggsave(mymap, file=paste(name.prefix,"_",cond1,"_", cond2,"_MA_plot.png",sep=""), dpi=600)
}

Plot p-value distribution and write tables

for (i in 1:dim(DEComparisons)[1]) {
    print("Plot P-value Distribution")
    png(paste(name.prefix,"_",cond1,"_", cond2,"_pval_plot.png",sep=""))
    hist(top1$P)
    dev.off()
    FC=2^top1$logFC
    FC_hr=FC
    FC_hr[which(FC_hr<1)]=-1/FC_hr[which(FC_hr<1)]  
    E1=2 * top1$AveExpr / (1+FC)
    E2=FC * E1
    res=cbind(id=top1$ID,E1,E2,AveExpr=top1$AveExpr,FC,FC_hr,top1[,c("logFC","P.Value","adj.P.Val")])
    names(res)[match("E1", names(res))] = cond2
    names(res)[match("E2", names(res))] = cond1
    print("Annotate full table")
    resAnot=annotate.data(res,anot.file,source.format,SelFeature,fields,cleandescription)
    write.table(resAnot,paste(name.prefix,"_",cond1,"_", cond2,"_res_anot.xls",sep=""),sep="\t",row.names = F)
    print("Annotate resSig table")
    resSig=res[ res$adj.P.Val < adjpval, ]
    if (dim(resSig)[1] > 0) {
        print(dim(resSig))
        resSigAanot=annotate.data(resSig,anot.file,source.format,SelFeature,fields,cleandescription)
        write.table(resSigAanot,paste(name.prefix,"_",cond1,"_", cond2,"_res_sig_anot.xls",sep=""),sep="\t",row.names = F)
      }
}