1 Repeat a previous set of analyses and see where we differ

I am going to intersperse the original comments and code, then compare with my own analyses.

1.1 Log file

lminfectome_dillonl_20150609_PCA_heatmap_samplediagnostics_limmaac_amast_CDSonly_mouse_forpublication.log

#Complete batches are named B, D, and E for historical reasons.

#Samples
#Name           condition       batch
#HPGL0165       metac (Ficoll)  B
#HPGL0167       amast4          B
#HPGL0169       amast24         B
#HPGL0171       amast48         B
#HPGL0173       amast72         B
#HPGL0230       metac (Ficoll)  D
#HPGL0232       amast4          D
#HPGL0234       amast24         D
#HPGL0236       amast48         D
#HPGL0238       amast72         D
#HPGL0329       metac (Ficoll)  E
#HPGL0389       amast4          E
#HPGL0391       amast24         E
#HPGL0393       amast48         E
#HPGL0395       amast72         E
#Counts in counts tables:
#/cbcb/nelsayed-scratch/dillonl/count_tables
#20140501_Lmajor60_167_169_171_173_191_232_234_236_238_389_391_393_395_CDSonly.count
#20140127_Lmajor60_proc_metac_096_097_098_164_165_192_193_228_229_230_324_325_326_finalset_CDSonly.count
#HPGL0329_Lmajor60_20150609_CDSonly.count
#Includes headers
tt <- sm(library(DESeq2))
tt <- sm(library(preprocessCore))
tt <- sm(library(cbcbSEQ))
tt <- sm(library(edgeR))
tt <- sm(library(RColorBrewer))
tt <- sm(library(gplots))

##Amastigote samples
countsTable <- read.table("preprocessing/dillonl/20140501_Lmajor60_167_169_171_173_191_232_234_236_238_389_391_393_395_CDSonly.count.xz", header=TRUE)
#Remove batch C (HPGL0191)
countsTable <- countsTable[,-6]
#Remove "exon_" from gene IDs
countsTable$id <- gsub("exon_", "", countsTable$id)
#Metacyclic samples
metac <- read.table("preprocessing/dillonl/20140127_Lmajor60_proc_metac_096_097_098_164_165_192_193_228_229_230_324_325_326_finalset_CDSonly.count.xz", header=TRUE)
#Restrict to HPGL0165 and HPGL0230
metac <- metac[,c(1,6,11)]
HPGL0329 <- read.table("preprocessing/dillonl/HPGL0329_Lmajor60_20150609_CDSonly.count.xz", header=FALSE)
colnames(HPGL0329) <- c("id", "HPGL0329")
#Remove "exon_" from gene IDs
HPGL0329$id <- gsub("exon_", "", HPGL0329$id)
metac <- merge(metac, HPGL0329, by="id", all=TRUE)
#Create single count table
countsTable <- merge(countsTable, metac, by="id", all=TRUE)
#Remove id as an actual column, use as row names instead
rownames(countsTable) <- countsTable$id
countsTable <- countsTable[,-1]
head(countsTable, n=3L)
##                HPGL0167 HPGL0169 HPGL0171 HPGL0173 HPGL0232 HPGL0234
## LmjF.01.0010-1      715      407      302       90      779      553
## LmjF.01.0020-1      678      428      327      105      954      658
## LmjF.01.0030-1      401      344      341      100      736      640
##                HPGL0236 HPGL0238 HPGL0389 HPGL0391 HPGL0393 HPGL0395
## LmjF.01.0010-1      314      231      291      189       90       68
## LmjF.01.0020-1      279      248      286      218      120       86
## LmjF.01.0030-1      231      203      233      177       98       80
##                HPGL0165 HPGL0230 HPGL0329
## LmjF.01.0010-1     1554     1905     1835
## LmjF.01.0020-1      480      951      980
## LmjF.01.0030-1      345     1219      953
#               HPGL0167 HPGL0169 HPGL0171 HPGL0173 HPGL0232 HPGL0234 HPGL0236 HPGL0238 HPGL0389 HPGL0391 HPGL0393 HPGL0395 HPGL0165 HPGL0230 HPGL0329
#LmjF.01.0010-1      715      407      302       90      779      553      314      231      291      189       90       68     1554     1905     1835
#LmjF.01.0020-1      678      428      327      105      954      658      279      248      286      218      120       86      480      951      980
#LmjF.01.0030-1      401      344      341      100      736      640      231      203      233      177       98       80      345     1219      953
#Reorder into batches
countsTable <- countsTable[,c(13,1:4,14,5:8,15,9:12)]
head(countsTable, n=3L)
##                HPGL0165 HPGL0167 HPGL0169 HPGL0171 HPGL0173 HPGL0230
## LmjF.01.0010-1     1554      715      407      302       90     1905
## LmjF.01.0020-1      480      678      428      327      105      951
## LmjF.01.0030-1      345      401      344      341      100     1219
##                HPGL0232 HPGL0234 HPGL0236 HPGL0238 HPGL0329 HPGL0389
## LmjF.01.0010-1      779      553      314      231     1835      291
## LmjF.01.0020-1      954      658      279      248      980      286
## LmjF.01.0030-1      736      640      231      203      953      233
##                HPGL0391 HPGL0393 HPGL0395
## LmjF.01.0010-1      189       90       68
## LmjF.01.0020-1      218      120       86
## LmjF.01.0030-1      177       98       80
#               HPGL0165 HPGL0167 HPGL0169 HPGL0171 HPGL0173 HPGL0230 HPGL0232 HPGL0234 HPGL0236 HPGL0238 HPGL0329 HPGL0389 HPGL0391 HPGL0393 HPGL0395
#LmjF.01.0010-1     1554      715      407      302       90     1905      779      553      314      231     1835      291      189       90       68
#Establish metadata for samples
sampleID <- colnames(countsTable)
condition <- rep(c("metac", "amast4", "amast24", "amast48", "amast72"), times=3)
batch <- rep(c("B", "D", "E"), each=5)
#Create condition and batch factor
condition <- factor(condition, levels=c("metac", "amast4", "amast24", "amast48", "amast72"))
batch <- factor(batch, levels=c("B", "D", "E"))
design <- data.frame(sampleID=sampleID, condition=condition, batch=batch)
design
##    sampleID condition batch
## 1  HPGL0165     metac     B
## 2  HPGL0167    amast4     B
## 3  HPGL0169   amast24     B
## 4  HPGL0171   amast48     B
## 5  HPGL0173   amast72     B
## 6  HPGL0230     metac     D
## 7  HPGL0232    amast4     D
## 8  HPGL0234   amast24     D
## 9  HPGL0236   amast48     D
## 10 HPGL0238   amast72     D
## 11 HPGL0329     metac     E
## 12 HPGL0389    amast4     E
## 13 HPGL0391   amast24     E
## 14 HPGL0393   amast48     E
## 15 HPGL0395   amast72     E
##   sampleID condition batch
#1  HPGL0165     metac     B
#2  HPGL0167    amast4     B
#3  HPGL0169   amast24     B
#4  HPGL0171   amast48     B
#5  HPGL0173   amast72     B
#6  HPGL0230     metac     D
#7  HPGL0232    amast4     D
#8  HPGL0234   amast24     D
#9  HPGL0236   amast48     D
#10 HPGL0238   amast72     D
#11 HPGL0329     metac     E
#12 HPGL0389    amast4     E
#13 HPGL0391   amast24     E
#14 HPGL0393   amast48     E
#15 HPGL0395   amast72     E

colnames(countsTable) <- paste(condition, batch, 1:length(condition), sep=".")
head(countsTable, n=3L)
##                metac.B.1 amast4.B.2 amast24.B.3 amast48.B.4 amast72.B.5
## LmjF.01.0010-1      1554        715         407         302          90
## LmjF.01.0020-1       480        678         428         327         105
## LmjF.01.0030-1       345        401         344         341         100
##                metac.D.6 amast4.D.7 amast24.D.8 amast48.D.9 amast72.D.10
## LmjF.01.0010-1      1905        779         553         314          231
## LmjF.01.0020-1       951        954         658         279          248
## LmjF.01.0030-1      1219        736         640         231          203
##                metac.E.11 amast4.E.12 amast24.E.13 amast48.E.14
## LmjF.01.0010-1       1835         291          189           90
## LmjF.01.0020-1        980         286          218          120
## LmjF.01.0030-1        953         233          177           98
##                amast72.E.15
## LmjF.01.0010-1           68
## LmjF.01.0020-1           86
## LmjF.01.0030-1           80
#               metac.B.1 amast4.B.2 amast24.B.3 amast48.B.4 amast72.B.5 metac.D.6 amast4.D.7 amast24.D.8 amast48.D.9
#LmjF.01.0010-1      1554        715         407         302          90      1905        779         553         314
#LmjF.01.0020-1       480        678         428         327         105       951        954         658         279
#LmjF.01.0030-1       345        401         344         341         100      1219        736         640         231
#               amast72.D.10 metac.E.11 amast4.E.12 amast24.E.13 amast48.E.14 amast72.E.15
#LmjF.01.0010-1          231       1835         291          189           90           68
#LmjF.01.0020-1          248        980         286          218          120           86
#LmjF.01.0030-1          203        953         233          177           98           80
#View barplot of raw counts by sample
barplot(colSums(countsTable), las=3, ylim=c(0,2.5e+07))

#Normalize raw counts using DESeq size factors
df <- data.frame(cond=(condition))
dds <- DESeqDataSetFromMatrix(countData=countsTable, colData = df, design = ~ cond)
dds <- estimateSizeFactors(dds)
ncts <- counts(dds, normalized=TRUE)
y <- log(ncts + 1)
#Determine median and quantiles of the size factor normalized, log2 counts
median(y)
## [1] 6.035
#[1] 6.035387
#compared to 5.776822 without metac samples included
quantile(y)
##     0%    25%    50%    75%   100% 
##  0.000  5.598  6.035  6.471 11.901
#       0%       25%       50%       75%      100%
# 0.000000  5.597645  6.035387  6.470512 11.900627
#compare to:
#       0%       25%       50%       75%      100%
# 0.000000  5.348148  5.776822  6.196398 11.638203
#without metac samples included
#Determine number of genes per sample with less than log2 of 2 counts (less than 4 counts per mill)
ydf <- as.data.frame(y)
colnames(ydf) <- sampleID
colnames(ydf)
##  [1] "HPGL0165" "HPGL0167" "HPGL0169" "HPGL0171" "HPGL0173" "HPGL0230"
##  [7] "HPGL0232" "HPGL0234" "HPGL0236" "HPGL0238" "HPGL0329" "HPGL0389"
## [13] "HPGL0391" "HPGL0393" "HPGL0395"
# [1] "HPGL0165" "HPGL0167" "HPGL0169" "HPGL0171" "HPGL0173" "HPGL0230" "HPGL0232" "HPGL0234" "HPGL0236" "HPGL0238"
#[11] "HPGL0329" "HPGL0389" "HPGL0391" "HPGL0393" "HPGL0395"

col.nam <- paste(condition, batch, 1:length(condition), sep=".")
#Boxplot of per sample log of size-factor-normalized counts (+ 1)
par(mar=c(10.5,4.5,2,1))
par(oma=c(0,0,0,0))
boxplot(y, names=col.nam, las=3)

#Heatmap of Pearson correlation between samples
#(delete Rowv, Colv, and dendrogram parameters if want samples to sort and show dendrogram)
#Correlation analysis (Pearson is default for cor unless othewise specified)
#Using raw counts
datCor <- cor(countsTable)
heatmap.2(datCor, Rowv=NA, Colv=NA,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          dendrogram="none",
          scale="none",
          trace="none",
          srtCol=45)

#Median pairwise correlation
#Using raw counts
corM <- matrixStats::rowMedians(cor(countsTable))
qs <- quantile(corM,p=c(1,3)/4)
iqr <- diff(qs)
outLimit <- qs[1] - 1.5 * iqr

ylim <- c(pmin(min(corM),outLimit),max(corM))
col <-  ifelse(condition=="amast4", "cornflowerblue",
        ifelse(condition=="amast24", "gold",
        ifelse(condition=="amast48", "green3",
        ifelse(condition=="amast72", "coral", "mediumorchid4"))))
plot(corM, xaxt="n", ylim=ylim, ylab="Median Pairwise Correlation", xlab="", main="", col=col, pch=16, cex=2.2)
axis(side=1,at=seq(along=corM),labels=paste(condition,batch,sep=":"),las=2)
abline(h=outLimit,lty=2)
abline(v=1:length(col.nam), lty=3, col="black")

#Plot without IQR cutoff

#Use filterCounts method to filter for low counts
#Define filterCounts function
filterCounts = function (counts, lib.size = NULL, thresh = 1, minSamples = 2) {
    cpms <- 2^log2CPM(counts, lib.size = lib.size)$y
    keep <- rowSums(cpms > thresh) >= minSamples
    counts <- counts[keep, ]
    counts
}
x <- table(condition)
dim(countsTable)
## [1] 8486   15
#[1] 8486   15

counts <- filterCounts(countsTable, thresh=1, minSamples=min(x))
dim(counts)
## [1] 8479   15
#[1] 8479   15
#Same number after filtering as when metac is not included
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Explore data for batch effects
##Transform data to log2 counts per million
x <- cbcbSEQ::log2CPM(countsSubQ)
#Compute principal components
s <- makeSVD(x$y)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
##    propVar cumPropVar cond.R2 batch.R2
## 1    41.84      41.84   97.04     0.20
## 2    16.51      58.35    1.25    91.84
## 3    11.34      69.69   70.11     6.92
## 4     7.43      77.12   58.10    23.26
## 5     5.84      82.96   47.21    29.73
## 6     4.20      87.16   15.35     2.93
## 7     2.81      89.97   13.41     1.59
## 8     2.21      92.18   12.63    17.54
## 9     1.90      94.08   17.81    10.47
## 10    1.61      95.69   26.25     0.42
## 11    1.31      97.00   18.15    12.73
## 12    1.16      98.16    3.36     0.26
## 13    0.97      99.13   17.20     0.29
## 14    0.85      99.98    2.10     1.82
#   propVar cumPropVar cond.R2 batch.R2
#1    41.84      41.84   97.04     0.20
#2    16.51      58.35    1.25    91.84
#3    11.34      69.69   70.11     6.92
#4     7.43      77.12   58.10    23.26
#5     5.84      82.96   47.21    29.73
#6     4.20      87.16   15.35     2.93
#7     2.81      89.97   13.41     1.59
#8     2.21      92.18   12.63    17.54
#9     1.90      94.08   17.81    10.47
#10    1.61      95.69   26.25     0.42
#11    1.31      97.00   18.15    12.73
#12    1.16      98.16    3.36     0.26
#13    0.97      99.13   17.20     0.29
#14    0.85      99.98    2.10     1.82
#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="B", 25, ifelse(batch=="D", 21, ifelse(batch=="E", 22, ifelse(batch=="A", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.34, y=0.45, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(counts))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

#I would like to create a PCA plot and heatmap to reflect the use of batch in the limma model
#Specify model
mod <- model.matrix(~batch)
#Use voom to determine weights for fitting the mean-variance trend for each gene
v <- voom(countsSubQ, mod)
#Fit the linear model for each gene
fit <- lmFit(v)
#Get the residual (i.e. everything except for the batch effect)
newData <- residuals(fit, v)
#Explore data for batch effects
#Compute principal components
s <- makeSVD(newData)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
##    propVar cumPropVar cond.R2 batch.R2
## 1    52.52      52.52   97.16        0
## 2    13.73      66.25   78.69        0
## 3     9.09      75.34   79.68        0
## 4     5.75      81.09   21.45        0
## 5     4.93      86.02   13.54        0
## 6     3.58      89.60   17.97        0
## 7     2.56      92.16   19.05        0
## 8     2.04      94.20   11.38        0
## 9     1.93      96.13   36.76        0
## 10    1.47      97.60    3.75        0
## 11    1.27      98.87   14.08        0
## 12    1.12      99.99    6.48        0
## 13    0.00      99.99    0.00      100
## 14    0.00      99.99    0.00      100
#   propVar cumPropVar cond.R2 batch.R2
#1    52.52      52.52   97.16        0
#2    13.73      66.25   78.69        0
#3     9.09      75.34   79.68        0
#4     5.75      81.09   21.45        0
#5     4.93      86.02   13.54        0
#6     3.58      89.60   17.97        0
#7     2.56      92.16   19.05        0
#8     2.04      94.20   11.38        0
#9     1.93      96.13   36.76        0
#10    1.47      97.60    3.75        0
#11    1.27      98.87   14.08        0
#12    1.12      99.99    6.48        0
#13    0.00      99.99    0.00      100
#14    0.00      99.99    0.00      100
#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="B", 25, ifelse(batch=="D", 21, ifelse(batch=="E", 22, ifelse(batch=="A", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.48, y=-0.24, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(newData))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

#############################
#############################
#DE analysis
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Specify model
mod = model.matrix(~0+condition+batch)
#View mean-variance trend
voom(countsSubQ, mod, plot=TRUE)

## An object of class "EList"
## $E
##                metac.B.1 amast4.B.2 amast24.B.3 amast48.B.4 amast72.B.5
## LmjF.01.0010-1     7.113      6.130       5.969       5.958       5.934
## LmjF.01.0020-1     5.599      6.048       6.043       6.087       6.166
## LmjF.01.0030-1     5.117      5.231       5.721       6.149       6.094
## LmjF.01.0040-1     5.008      5.140       5.632       5.465       5.849
## LmjF.01.0050-1     7.415      6.845       6.640       6.595       6.950
##                metac.D.6 amast4.D.7 amast24.D.8 amast48.D.9 amast72.D.10
## LmjF.01.0010-1     6.542      5.695       5.784       6.101        5.930
## LmjF.01.0020-1     5.588      6.000       6.037       5.919        6.035
## LmjF.01.0030-1     5.925      5.608       5.999       5.648        5.741
## LmjF.01.0040-1     4.559      4.875       5.109       5.483        5.450
## LmjF.01.0050-1     7.009      6.339       6.738       7.124        6.953
##                metac.E.11 amast4.E.12 amast24.E.13 amast48.E.14
## LmjF.01.0010-1      6.784       6.006        5.920        5.870
## LmjF.01.0020-1      5.930       5.982        6.130        6.316
## LmjF.01.0030-1      5.888       5.692        5.820        6.004
## LmjF.01.0040-1      5.103       5.130        5.144        5.545
## LmjF.01.0050-1      7.254       6.639        7.117        6.812
##                amast72.E.15
## LmjF.01.0010-1        5.820
## LmjF.01.0020-1        6.162
## LmjF.01.0030-1        6.060
## LmjF.01.0040-1        5.560
## LmjF.01.0050-1        6.833
## 8474 more rows ...
## 
## $weights
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11]
## [1,] 39.19 34.48 34.01 34.77 34.04 38.80 32.35 31.74 32.71 31.78 38.96
## [2,] 30.51 33.89 34.44 34.77 34.90 29.65 33.12 33.75 34.12 34.24 31.85
## [3,] 28.51 26.85 30.97 31.95 32.32 29.99 28.37 32.35 33.28 33.61 31.28
## [4,] 22.45 24.43 27.50 29.97 31.42 18.82 20.50 23.45 25.99 27.51 20.99
## [5,] 38.74 38.45 39.05 39.06 39.17 38.93 38.23 38.94 38.96 39.08 38.53
##      [,12] [,13] [,14] [,15]
## [1,] 33.09 32.54 33.43 32.59
## [2,] 34.93 35.44 35.74 35.85
## [3,] 29.70 33.50 34.35 34.63
## [4,] 22.90 25.97 28.48 29.97
## [5,] 38.61 39.11 39.13 39.21
## 8474 more rows ...
## 
## $design
##   conditionmetac conditionamast4 conditionamast24 conditionamast48
## 1              1               0                0                0
## 2              0               1                0                0
## 3              0               0                1                0
## 4              0               0                0                1
## 5              0               0                0                0
##   conditionamast72 batchD batchE
## 1                0      0      0
## 2                0      0      0
## 3                0      0      0
## 4                0      0      0
## 5                1      0      0
## 10 more rows ...
## 
## $targets
##             lib.size
## metac.B.1    7548103
## amast4.B.2   7548074
## amast24.B.3  7548098
## amast48.B.4  7548094
## amast72.B.5  7548024
## 10 more rows ...
#Use voom to transform quantile-normalized count data to log2-counts per million, estimate mean-variance relationship
#and use m-v relationship to computer appropriate observational-level weights
v <- voom(countsSubQ, mod)
#Fit a linear model for each gene using the specified design contained in v
fit <- lmFit(v)
##metac v amast4##
#eBayes finds an F-statistic from the set of t-statistics for that gene
metac.amast4.contr.mat <- makeContrasts(metac_v_amast4=conditionamast4-conditionmetac, levels=v$design)
metac.amast4.fit <- contrasts.fit(fit, metac.amast4.contr.mat)
metac.amast4.eb <- eBayes(metac.amast4.fit)
metac.amast4.topTab <- topTable(metac.amast4.eb, coef="metac_v_amast4", number=nrow(v$E))
write.csv(file="csv/lmajor_metac_vs_amast4_mmusculus.csv", x=metac.amast4.topTab)
#View the list of DE genes
head(metac.amast4.topTab, n=3L)
##                 logFC AveExpr      t   P.Value adj.P.Val     B
## LmjF.12.0940-1  2.810   10.85  14.54 4.900e-11 2.472e-07 15.40
## LmjF.12.1000-1  2.671   11.47  14.37 5.909e-11 2.472e-07 15.19
## LmjF.31.0350-1 -1.733   10.23 -13.98 9.146e-11 2.472e-07 14.76
#                   logFC  AveExpr         t      P.Value    adj.P.Val        B
#LmjF.12.0940-1  2.810295 10.85477  14.54169 4.899879e-11 2.472428e-07 15.39693
#LmjF.12.1000-1  2.671061 11.47061  14.37061 5.908732e-11 2.472428e-07 15.18740
#LmjF.31.0350-1 -1.733422 10.22653 -13.97803 9.146495e-11 2.472428e-07 14.75628
#Limit list to genes with an adjusted p value < 0.05
metac.amast4.sigGenes <- metac.amast4.topTab[metac.amast4.topTab$adj.P.Val <0.05, ]
length(metac.amast4.sigGenes$logFC)
## [1] 2962
#2962
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
metac.amast4.sigGenesFold1 <- subset(metac.amast4.sigGenes, abs(logFC) > 1)
length(metac.amast4.sigGenesFold1$logFC)
## [1] 299
#299
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
metac.amast4.sigGenesFold2 <- subset(metac.amast4.sigGenes, abs(logFC) > 2)
length(metac.amast4.sigGenesFold2$logFC)
## [1] 18
#18
metac.amast4.sigGenes <- metac.amast4.sigGenes[order(-metac.amast4.sigGenes$logFC), ]
#Make an MA plot
sel = metac.amast4.topTab$adj.P.Val < 0.05
top = metac.amast4.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="metac_amast4 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast4 v amast24##

##eBayes finds an F-statistic from the set of t-statistics for that gene
amast4.amast24.contr.mat <- makeContrasts(amast4_v_amast24=conditionamast24-conditionamast4, levels=v$design)
amast4.amast24.fit <- contrasts.fit(fit, amast4.amast24.contr.mat)
amast4.amast24.eb <- eBayes(amast4.amast24.fit)
amast4.amast24.topTab <- topTable(amast4.amast24.eb, coef="amast4_v_amast24", number=nrow(v$E))
#View the list of DE genes
head(amast4.amast24.topTab, n=3L)
##                logFC AveExpr     t   P.Value adj.P.Val     B
## LmjF.12.0920-1 2.103   8.008 14.28 6.537e-11 2.666e-07 14.23
## LmjF.12.1040-1 2.035   7.566 14.06 8.299e-11 2.666e-07 14.02
## LmjF.12.1020-1 2.148   7.649 13.85 1.053e-10 2.666e-07 13.83
#Limit list to genes with an adjusted p value < 0.05
amast4.amast24.sigGenes <- amast4.amast24.topTab[amast4.amast24.topTab$adj.P.Val <0.05, ]
length(amast4.amast24.sigGenes$logFC)
## [1] 301
#301
#281 before metac included
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast4.amast24.sigGenesFold1 <- subset(amast4.amast24.sigGenes, abs(logFC) > 1)
length(amast4.amast24.sigGenesFold1$logFC)
## [1] 34
#34
#27 before metac included
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast4.amast24.sigGenesFold2 <- subset(amast4.amast24.sigGenes, abs(logFC) > 2)
length(amast4.amast24.sigGenesFold2$logFC)
## [1] 6
#6
#1 before metac included
amast4.amast24.sigGenes <- amast4.amast24.sigGenes[order(-amast4.amast24.sigGenes$logFC), ]
#Make an MA plot
sel = amast4.amast24.topTab$adj.P.Val < 0.05
top = amast4.amast24.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast4_amast24 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast24 v amast48##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast24.amast48.contr.mat <- makeContrasts(amast24_v_amast48=conditionamast48-conditionamast24, levels=v$design)
amast24.amast48.fit <- contrasts.fit(fit, amast24.amast48.contr.mat)
amast24.amast48.eb <- eBayes(amast24.amast48.fit)
amast24.amast48.topTab <- topTable(amast24.amast48.eb, coef="amast24_v_amast48", number=nrow(v$E))
#View the list of DE genes
head(amast24.amast48.topTab, n=3L)
##                 logFC AveExpr     t   P.Value adj.P.Val     B
## LmjF.16.0500-1 0.9526   6.184 5.831 1.985e-05   0.05165 2.881
## LmjF.36.2350-1 1.6293   8.059 5.612 3.083e-05   0.05165 2.513
## LmjF.31.1700-1 0.6852   7.338 5.528 3.662e-05   0.05165 2.379
#                   logFC  AveExpr        t      P.Value  adj.P.Val        B
#LmjF.16.0500-1 0.9526423 6.184427 5.831439 1.985098e-05 0.05165267 2.881463
#LmjF.36.2350-1 1.6292766 8.058532 5.612429 3.082699e-05 0.05165267 2.513140
#LmjF.31.1700-1 0.6852255 7.338441 5.527579 3.661813e-05 0.05165267 2.379194
#Limit list to genes with an adjusted p value < 0.05
amast24.amast48.sigGenes <- amast24.amast48.topTab[amast24.amast48.topTab$adj.P.Val <0.05, ]
length(amast24.amast48.sigGenes$logFC)
## [1] 0
#0
#0 before metac included
#Make an MA plot
sel = amast24.amast48.topTab$adj.P.Val < 0.05
top = amast24.amast48.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast24_amast48 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast48 v amast72##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast48.amast72.contr.mat <- makeContrasts(amast48_v_amast72=conditionamast72-conditionamast48, levels=v$design)
amast48.amast72.fit <- contrasts.fit(fit, amast48.amast72.contr.mat)
amast48.amast72.eb <- eBayes(amast48.amast72.fit)
amast48.amast72.topTab <- topTable(amast48.amast72.eb, coef="amast48_v_amast72", number=nrow(v$E))
#View the list of DE genes
head(amast48.amast72.topTab, n=3L)
##                 logFC AveExpr     t   P.Value adj.P.Val      B
## LmjF.36.5250-1 0.8051   7.689 5.757 2.306e-05    0.1273 0.8850
## LmjF.15.0440-5 1.1439   7.165 5.625 3.004e-05    0.1273 0.8114
## LmjF.05.0240-1 0.5681   6.693 5.272 6.179e-05    0.1747 0.5072
#                   logFC  AveExpr        t      P.Value adj.P.Val         B
#LmjF.36.5250-1 0.8051206 7.688751 5.756536 2.305982e-05 0.1273414 0.8849812
#LmjF.15.0440-5 1.1439317 7.165058 5.625265 3.003690e-05 0.1273414 0.8114074
#LmjF.05.0240-1 0.5680832 6.693406 5.272282 6.179471e-05 0.1746524 0.5072436
#Limit list to genes with an adjusted p value < 0.05
amast48.amast72.sigGenes <- amast48.amast72.topTab[amast48.amast72.topTab$adj.P.Val <0.05, ]
length(amast48.amast72.sigGenes$logFC)
## [1] 0
#0
#0 before metac included
#Make an MA plot
sel = amast48.amast72.topTab$adj.P.Val < 0.05
top = amast48.amast72.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast48_amast72 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

---
title: "lminfectome 2016: RNAseq of L.major/amazonensis, repeating mouse log."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: tango
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: cosmo
  toc: true
  toc_float:
   collapsed: false
   smooth_scroll: false
---

<style>
body .main-container {
max-width: 1600px;
}
</style>

```{r options, include=FALSE}
## These are the options I tend to favor
library("hpgltools")
knitr::opts_knit$set(
    progress = TRUE,
    verbose = TRUE,
    width = 90,
    echo = TRUE)
knitr::opts_chunk$set(
    error = TRUE,
    fig.width = 8,
    fig.height = 8,
    dpi = 96)
options(
    digits = 4,
    stringsAsFactors = FALSE,
    knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
set.seed(1)
rmd_file <- "repeat_log_lmajor_mmusculus.Rmd"
```

# Repeat a previous set of analyses and see where we differ

I am going to intersperse the original comments and code, then compare with my own analyses.

## Log file

lminfectome_dillonl_20150609_PCA_heatmap_samplediagnostics_limmaac_amast_CDSonly_mouse_forpublication.log

<pre>
#Complete batches are named B, D, and E for historical reasons.

#Samples
#Name           condition       batch
#HPGL0165       metac (Ficoll)  B
#HPGL0167       amast4          B
#HPGL0169       amast24         B
#HPGL0171       amast48         B
#HPGL0173       amast72         B
#HPGL0230       metac (Ficoll)  D
#HPGL0232       amast4          D
#HPGL0234       amast24         D
#HPGL0236       amast48         D
#HPGL0238       amast72         D
#HPGL0329       metac (Ficoll)  E
#HPGL0389       amast4          E
#HPGL0391       amast24         E
#HPGL0393       amast48         E
#HPGL0395       amast72         E
#Counts in counts tables:
#/cbcb/nelsayed-scratch/dillonl/count_tables
#20140501_Lmajor60_167_169_171_173_191_232_234_236_238_389_391_393_395_CDSonly.count
#20140127_Lmajor60_proc_metac_096_097_098_164_165_192_193_228_229_230_324_325_326_finalset_CDSonly.count
#HPGL0329_Lmajor60_20150609_CDSonly.count
#Includes headers
</pre>

```{r repeat_log}
tt <- sm(library(DESeq2))
tt <- sm(library(preprocessCore))
tt <- sm(library(cbcbSEQ))
tt <- sm(library(edgeR))
tt <- sm(library(RColorBrewer))
tt <- sm(library(gplots))

##Amastigote samples
countsTable <- read.table("preprocessing/dillonl/20140501_Lmajor60_167_169_171_173_191_232_234_236_238_389_391_393_395_CDSonly.count.xz", header=TRUE)
#Remove batch C (HPGL0191)
countsTable <- countsTable[,-6]
#Remove "exon_" from gene IDs
countsTable$id <- gsub("exon_", "", countsTable$id)
#Metacyclic samples
metac <- read.table("preprocessing/dillonl/20140127_Lmajor60_proc_metac_096_097_098_164_165_192_193_228_229_230_324_325_326_finalset_CDSonly.count.xz", header=TRUE)
#Restrict to HPGL0165 and HPGL0230
metac <- metac[,c(1,6,11)]
HPGL0329 <- read.table("preprocessing/dillonl/HPGL0329_Lmajor60_20150609_CDSonly.count.xz", header=FALSE)
colnames(HPGL0329) <- c("id", "HPGL0329")
#Remove "exon_" from gene IDs
HPGL0329$id <- gsub("exon_", "", HPGL0329$id)
metac <- merge(metac, HPGL0329, by="id", all=TRUE)
#Create single count table
countsTable <- merge(countsTable, metac, by="id", all=TRUE)
#Remove id as an actual column, use as row names instead
rownames(countsTable) <- countsTable$id
countsTable <- countsTable[,-1]
head(countsTable, n=3L)
#               HPGL0167 HPGL0169 HPGL0171 HPGL0173 HPGL0232 HPGL0234 HPGL0236 HPGL0238 HPGL0389 HPGL0391 HPGL0393 HPGL0395 HPGL0165 HPGL0230 HPGL0329
#LmjF.01.0010-1      715      407      302       90      779      553      314      231      291      189       90       68     1554     1905     1835
#LmjF.01.0020-1      678      428      327      105      954      658      279      248      286      218      120       86      480      951      980
#LmjF.01.0030-1      401      344      341      100      736      640      231      203      233      177       98       80      345     1219      953
#Reorder into batches
countsTable <- countsTable[,c(13,1:4,14,5:8,15,9:12)]
head(countsTable, n=3L)
#               HPGL0165 HPGL0167 HPGL0169 HPGL0171 HPGL0173 HPGL0230 HPGL0232 HPGL0234 HPGL0236 HPGL0238 HPGL0329 HPGL0389 HPGL0391 HPGL0393 HPGL0395
#LmjF.01.0010-1     1554      715      407      302       90     1905      779      553      314      231     1835      291      189       90       68
#Establish metadata for samples
sampleID <- colnames(countsTable)
condition <- rep(c("metac", "amast4", "amast24", "amast48", "amast72"), times=3)
batch <- rep(c("B", "D", "E"), each=5)
#Create condition and batch factor
condition <- factor(condition, levels=c("metac", "amast4", "amast24", "amast48", "amast72"))
batch <- factor(batch, levels=c("B", "D", "E"))
design <- data.frame(sampleID=sampleID, condition=condition, batch=batch)
design

##   sampleID condition batch
#1  HPGL0165     metac     B
#2  HPGL0167    amast4     B
#3  HPGL0169   amast24     B
#4  HPGL0171   amast48     B
#5  HPGL0173   amast72     B
#6  HPGL0230     metac     D
#7  HPGL0232    amast4     D
#8  HPGL0234   amast24     D
#9  HPGL0236   amast48     D
#10 HPGL0238   amast72     D
#11 HPGL0329     metac     E
#12 HPGL0389    amast4     E
#13 HPGL0391   amast24     E
#14 HPGL0393   amast48     E
#15 HPGL0395   amast72     E

colnames(countsTable) <- paste(condition, batch, 1:length(condition), sep=".")
head(countsTable, n=3L)
#               metac.B.1 amast4.B.2 amast24.B.3 amast48.B.4 amast72.B.5 metac.D.6 amast4.D.7 amast24.D.8 amast48.D.9
#LmjF.01.0010-1      1554        715         407         302          90      1905        779         553         314
#LmjF.01.0020-1       480        678         428         327         105       951        954         658         279
#LmjF.01.0030-1       345        401         344         341         100      1219        736         640         231
#               amast72.D.10 metac.E.11 amast4.E.12 amast24.E.13 amast48.E.14 amast72.E.15
#LmjF.01.0010-1          231       1835         291          189           90           68
#LmjF.01.0020-1          248        980         286          218          120           86
#LmjF.01.0030-1          203        953         233          177           98           80
#View barplot of raw counts by sample
barplot(colSums(countsTable), las=3, ylim=c(0,2.5e+07))
#Normalize raw counts using DESeq size factors
df <- data.frame(cond=(condition))
dds <- DESeqDataSetFromMatrix(countData=countsTable, colData = df, design = ~ cond)
dds <- estimateSizeFactors(dds)
ncts <- counts(dds, normalized=TRUE)
y <- log(ncts + 1)
#Determine median and quantiles of the size factor normalized, log2 counts
median(y)
#[1] 6.035387
#compared to 5.776822 without metac samples included
quantile(y)
#       0%       25%       50%       75%      100%
# 0.000000  5.597645  6.035387  6.470512 11.900627
#compare to:
#       0%       25%       50%       75%      100%
# 0.000000  5.348148  5.776822  6.196398 11.638203
#without metac samples included
#Determine number of genes per sample with less than log2 of 2 counts (less than 4 counts per mill)
ydf <- as.data.frame(y)
colnames(ydf) <- sampleID
colnames(ydf)
# [1] "HPGL0165" "HPGL0167" "HPGL0169" "HPGL0171" "HPGL0173" "HPGL0230" "HPGL0232" "HPGL0234" "HPGL0236" "HPGL0238"
#[11] "HPGL0329" "HPGL0389" "HPGL0391" "HPGL0393" "HPGL0395"

col.nam <- paste(condition, batch, 1:length(condition), sep=".")
#Boxplot of per sample log of size-factor-normalized counts (+ 1)
par(mar=c(10.5,4.5,2,1))
par(oma=c(0,0,0,0))
boxplot(y, names=col.nam, las=3)
#Heatmap of Pearson correlation between samples
#(delete Rowv, Colv, and dendrogram parameters if want samples to sort and show dendrogram)
#Correlation analysis (Pearson is default for cor unless othewise specified)
#Using raw counts
datCor <- cor(countsTable)
heatmap.2(datCor, Rowv=NA, Colv=NA,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          dendrogram="none",
          scale="none",
          trace="none",
          srtCol=45)
#Median pairwise correlation
#Using raw counts
corM <- matrixStats::rowMedians(cor(countsTable))
qs <- quantile(corM,p=c(1,3)/4)
iqr <- diff(qs)
outLimit <- qs[1] - 1.5 * iqr

ylim <- c(pmin(min(corM),outLimit),max(corM))
col <-  ifelse(condition=="amast4", "cornflowerblue",
        ifelse(condition=="amast24", "gold",
        ifelse(condition=="amast48", "green3",
        ifelse(condition=="amast72", "coral", "mediumorchid4"))))
plot(corM, xaxt="n", ylim=ylim, ylab="Median Pairwise Correlation", xlab="", main="", col=col, pch=16, cex=2.2)
axis(side=1,at=seq(along=corM),labels=paste(condition,batch,sep=":"),las=2)
abline(h=outLimit,lty=2)
abline(v=1:length(col.nam), lty=3, col="black")
#Plot without IQR cutoff

#Use filterCounts method to filter for low counts
#Define filterCounts function
filterCounts = function (counts, lib.size = NULL, thresh = 1, minSamples = 2) {
    cpms <- 2^log2CPM(counts, lib.size = lib.size)$y
    keep <- rowSums(cpms > thresh) >= minSamples
    counts <- counts[keep, ]
    counts
}
x <- table(condition)
dim(countsTable)
#[1] 8486   15

counts <- filterCounts(countsTable, thresh=1, minSamples=min(x))
dim(counts)
#[1] 8479   15
#Same number after filtering as when metac is not included
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Explore data for batch effects
##Transform data to log2 counts per million
x <- cbcbSEQ::log2CPM(countsSubQ)
#Compute principal components
s <- makeSVD(x$y)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
#   propVar cumPropVar cond.R2 batch.R2
#1    41.84      41.84   97.04     0.20
#2    16.51      58.35    1.25    91.84
#3    11.34      69.69   70.11     6.92
#4     7.43      77.12   58.10    23.26
#5     5.84      82.96   47.21    29.73
#6     4.20      87.16   15.35     2.93
#7     2.81      89.97   13.41     1.59
#8     2.21      92.18   12.63    17.54
#9     1.90      94.08   17.81    10.47
#10    1.61      95.69   26.25     0.42
#11    1.31      97.00   18.15    12.73
#12    1.16      98.16    3.36     0.26
#13    0.97      99.13   17.20     0.29
#14    0.85      99.98    2.10     1.82
#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="B", 25, ifelse(batch=="D", 21, ifelse(batch=="E", 22, ifelse(batch=="A", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.34, y=0.45, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(counts))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)
#I would like to create a PCA plot and heatmap to reflect the use of batch in the limma model
#Specify model
mod <- model.matrix(~batch)
#Use voom to determine weights for fitting the mean-variance trend for each gene
v <- voom(countsSubQ, mod)
#Fit the linear model for each gene
fit <- lmFit(v)
#Get the residual (i.e. everything except for the batch effect)
newData <- residuals(fit, v)
#Explore data for batch effects
#Compute principal components
s <- makeSVD(newData)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
#   propVar cumPropVar cond.R2 batch.R2
#1    52.52      52.52   97.16        0
#2    13.73      66.25   78.69        0
#3     9.09      75.34   79.68        0
#4     5.75      81.09   21.45        0
#5     4.93      86.02   13.54        0
#6     3.58      89.60   17.97        0
#7     2.56      92.16   19.05        0
#8     2.04      94.20   11.38        0
#9     1.93      96.13   36.76        0
#10    1.47      97.60    3.75        0
#11    1.27      98.87   14.08        0
#12    1.12      99.99    6.48        0
#13    0.00      99.99    0.00      100
#14    0.00      99.99    0.00      100
#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="B", 25, ifelse(batch=="D", 21, ifelse(batch=="E", 22, ifelse(batch=="A", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.48, y=-0.24, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(newData))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

#############################
#############################
#DE analysis
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Specify model
mod = model.matrix(~0+condition+batch)
#View mean-variance trend
voom(countsSubQ, mod, plot=TRUE)
#Use voom to transform quantile-normalized count data to log2-counts per million, estimate mean-variance relationship
#and use m-v relationship to computer appropriate observational-level weights
v <- voom(countsSubQ, mod)
#Fit a linear model for each gene using the specified design contained in v
fit <- lmFit(v)
##metac v amast4##
#eBayes finds an F-statistic from the set of t-statistics for that gene
metac.amast4.contr.mat <- makeContrasts(metac_v_amast4=conditionamast4-conditionmetac, levels=v$design)
metac.amast4.fit <- contrasts.fit(fit, metac.amast4.contr.mat)
metac.amast4.eb <- eBayes(metac.amast4.fit)
metac.amast4.topTab <- topTable(metac.amast4.eb, coef="metac_v_amast4", number=nrow(v$E))
write.csv(file="csv/lmajor_metac_vs_amast4_mmusculus.csv", x=metac.amast4.topTab)
#View the list of DE genes
head(metac.amast4.topTab, n=3L)
#                   logFC  AveExpr         t      P.Value    adj.P.Val        B
#LmjF.12.0940-1  2.810295 10.85477  14.54169 4.899879e-11 2.472428e-07 15.39693
#LmjF.12.1000-1  2.671061 11.47061  14.37061 5.908732e-11 2.472428e-07 15.18740
#LmjF.31.0350-1 -1.733422 10.22653 -13.97803 9.146495e-11 2.472428e-07 14.75628
#Limit list to genes with an adjusted p value < 0.05
metac.amast4.sigGenes <- metac.amast4.topTab[metac.amast4.topTab$adj.P.Val <0.05, ]
length(metac.amast4.sigGenes$logFC)
#2962
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
metac.amast4.sigGenesFold1 <- subset(metac.amast4.sigGenes, abs(logFC) > 1)
length(metac.amast4.sigGenesFold1$logFC)
#299
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
metac.amast4.sigGenesFold2 <- subset(metac.amast4.sigGenes, abs(logFC) > 2)
length(metac.amast4.sigGenesFold2$logFC)
#18
metac.amast4.sigGenes <- metac.amast4.sigGenes[order(-metac.amast4.sigGenes$logFC), ]
#Make an MA plot
sel = metac.amast4.topTab$adj.P.Val < 0.05
top = metac.amast4.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="metac_amast4 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")
##amast4 v amast24##

##eBayes finds an F-statistic from the set of t-statistics for that gene
amast4.amast24.contr.mat <- makeContrasts(amast4_v_amast24=conditionamast24-conditionamast4, levels=v$design)
amast4.amast24.fit <- contrasts.fit(fit, amast4.amast24.contr.mat)
amast4.amast24.eb <- eBayes(amast4.amast24.fit)
amast4.amast24.topTab <- topTable(amast4.amast24.eb, coef="amast4_v_amast24", number=nrow(v$E))
#View the list of DE genes
head(amast4.amast24.topTab, n=3L)
#Limit list to genes with an adjusted p value < 0.05
amast4.amast24.sigGenes <- amast4.amast24.topTab[amast4.amast24.topTab$adj.P.Val <0.05, ]
length(amast4.amast24.sigGenes$logFC)
#301
#281 before metac included
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast4.amast24.sigGenesFold1 <- subset(amast4.amast24.sigGenes, abs(logFC) > 1)
length(amast4.amast24.sigGenesFold1$logFC)
#34
#27 before metac included
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast4.amast24.sigGenesFold2 <- subset(amast4.amast24.sigGenes, abs(logFC) > 2)
length(amast4.amast24.sigGenesFold2$logFC)
#6
#1 before metac included
amast4.amast24.sigGenes <- amast4.amast24.sigGenes[order(-amast4.amast24.sigGenes$logFC), ]
#Make an MA plot
sel = amast4.amast24.topTab$adj.P.Val < 0.05
top = amast4.amast24.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast4_amast24 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast24 v amast48##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast24.amast48.contr.mat <- makeContrasts(amast24_v_amast48=conditionamast48-conditionamast24, levels=v$design)
amast24.amast48.fit <- contrasts.fit(fit, amast24.amast48.contr.mat)
amast24.amast48.eb <- eBayes(amast24.amast48.fit)
amast24.amast48.topTab <- topTable(amast24.amast48.eb, coef="amast24_v_amast48", number=nrow(v$E))
#View the list of DE genes
head(amast24.amast48.topTab, n=3L)
#                   logFC  AveExpr        t      P.Value  adj.P.Val        B
#LmjF.16.0500-1 0.9526423 6.184427 5.831439 1.985098e-05 0.05165267 2.881463
#LmjF.36.2350-1 1.6292766 8.058532 5.612429 3.082699e-05 0.05165267 2.513140
#LmjF.31.1700-1 0.6852255 7.338441 5.527579 3.661813e-05 0.05165267 2.379194
#Limit list to genes with an adjusted p value < 0.05
amast24.amast48.sigGenes <- amast24.amast48.topTab[amast24.amast48.topTab$adj.P.Val <0.05, ]
length(amast24.amast48.sigGenes$logFC)
#0
#0 before metac included
#Make an MA plot
sel = amast24.amast48.topTab$adj.P.Val < 0.05
top = amast24.amast48.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast24_amast48 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast48 v amast72##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast48.amast72.contr.mat <- makeContrasts(amast48_v_amast72=conditionamast72-conditionamast48, levels=v$design)
amast48.amast72.fit <- contrasts.fit(fit, amast48.amast72.contr.mat)
amast48.amast72.eb <- eBayes(amast48.amast72.fit)
amast48.amast72.topTab <- topTable(amast48.amast72.eb, coef="amast48_v_amast72", number=nrow(v$E))
#View the list of DE genes
head(amast48.amast72.topTab, n=3L)
#                   logFC  AveExpr        t      P.Value adj.P.Val         B
#LmjF.36.5250-1 0.8051206 7.688751 5.756536 2.305982e-05 0.1273414 0.8849812
#LmjF.15.0440-5 1.1439317 7.165058 5.625265 3.003690e-05 0.1273414 0.8114074
#LmjF.05.0240-1 0.5680832 6.693406 5.272282 6.179471e-05 0.1746524 0.5072436
#Limit list to genes with an adjusted p value < 0.05
amast48.amast72.sigGenes <- amast48.amast72.topTab[amast48.amast72.topTab$adj.P.Val <0.05, ]
length(amast48.amast72.sigGenes$logFC)
#0
#0 before metac included
#Make an MA plot
sel = amast48.amast72.topTab$adj.P.Val < 0.05
top = amast48.amast72.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast48_amast72 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")
```
