1 Repeat a previous set of analyses and see where we differ

I am going to intersperse the original comments and code, then compare with my own analyses.

#lminfectome_dillonl_20150714_PCA_heatmap_samplediagnostics_limma_Lmajor60_metac_amast_no471_CDSonly_cecilia_forpublication.log
#Purpose: To repeat the analysis from 20150315, including the metacyclic samples in addition to the amastigote
#samples for the 5 batches for the human publication. HPGL0471 is excluded because it was found not to be an infected sample.cludes sample diagnostic, PCA, and heatmap analyses, as well as differential expression analysis to compare the o 4-hr amastigote transition and the amastigote samples across timepoints.
#This log is for parasite samples only using the count table restricted to CDS only.

#Count table generation is described in logs:
#lminfectome_dillonl_20140329_TopHat_HTSeq_Count_Table_hg19_Lmajor_HPGL0363-382_cecilia.log
#lminfectome_dillonl_20140712_TopHat_HTSeq_Count_Table_hg19_Lmajor_HPGL0396-405_cecilia.log
#lminfectome_dillonl_20141001_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0434-446_452-472_cecilia.logectome_dillonl_20141211_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0491-510_cecilia.log

#Name           condition       batch
#HPGL0364       metac (PNA)     A
#HPGL0366       amastLM4        A
#HPGL0368       amastLM24       A
#HPGL0370       amastLM48       A
#HPGL0372       amastLM72       A
#HPGL0374       metac (PNA)     B
#HPGL0376       amastLM4        B
#HPGL0380       amastLM48       B
#HPGL0382       amastLM72       B
#HPGL0397       metac (PNA)     C
#HPGL0399       amastLM4        C
#HPGL0401       amastLM24       C
#HPGL0403       amastLM48       C
#HPGL0405       amastLM72       C
#HPGL0456       metac (PNA)     E
#HPGL0459       amastLM4        E
#HPGL0463       amastLM24       E
#HPGL0467       amastLM48       E
#HPGL0494       metac (PNA)     F
#HPGL0497       amastLM4        F
#HPGL0501       amastLM24       F
#HPGL0505       amastLM48       F
#HPGL0509       amastLM72       F
#Count table is:
#20141211_Lmajor60_346_347_349_351_353_355_363_364_366_368_370_372_373_374_376_378_380_382_396_397_399_401_403_405_455-456_459_463_467_471_493-494_497_501_505_509_CDSonly.count
#Includes headers

#7-14-15

#L. major
#Create diagnostic plots
#Hector recommended that all diagnostics be done using the rawest data possible. For correlation analyses between samples,
#use countsTable data without filtering out lowly expressed genes and without normalization. Size factor normalization
#will be used to show the distribution of gene expression levels.

#This analysis uses count tables that have not been restricted to no471_CDS only.
#20141211_Lmajor60_346_347_349_351_353_355_363_364_366_368_370_372_373_374_376_378_380_382_396_397_399_401_403_405_455-456_459_463_467_471_493-494_497_501_505_509_CDSonly.count
library(cbcbSEQ)
## Loading required package: limma
## Loading required package: corpcor
## Loading required package: preprocessCore
## Loading required package: sva
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-23. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
## 
## Attaching package: 'cbcbSEQ'
## The following object is masked from 'package:hpgltools':
## 
##     pcRes
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:genefilter':
## 
##     rowSds, rowVars
library(siggenes)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## Loading required package: multtest
## 
## Attaching package: 'multtest'
## The following object is masked from 'package:gplots':
## 
##     wapply
## Loading required package: splines
library(ReportingTools)
## Error in library(ReportingTools): there is no package called 'ReportingTools'
library(hwriter)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
## 
##     space
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:nlme':
## 
##     collapse
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: DelayedArray
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
library(RColorBrewer)

#Read in counts table
countsTable <- read.table("preprocessing/dillonl/20141211_Lmajor60_346_347_349_351_353_355_363_364_366_368_370_372_373_374_376_378_380_382_396_397_399_401_403_405_455-456_459_463_467_471_493-494_497_501_505_509_CDSonly.count.xz", header=TRUE)
#Remove id as an actual column, use as row names instead
rownames(countsTable) <- countsTable$id
countsTable <- countsTable[, -1, drop=FALSE]

#Restrict to samples of interest (exclude procyclic promastigote samples and HPGL0471)
countsTable <- countsTable[,-c(1:7, 13, 19, 25, 30:31)]

#Establish metadata for samples
sampleID <- colnames(countsTable)
condition <- rep(c("metac", "amast4", "amast24", "amast48", "amast72"), times=5)
batch <- rep(c("A", "B", "C", "E", "F"), each=5)
condition <- condition[-20]
batch <- batch[-20]

condition <- factor(condition, levels=c("metac", "amast4", "amast24", "amast48", "amast72"))
batch <- factor(batch, levels=c("A", "B", "C", "E", "F"))
design <- data.frame(sampleID=sampleID, condition=condition, batch=batch)
knitr::kable(design)
sampleID condition batch
HPGL0364 metac A
HPGL0366 amast4 A
HPGL0368 amast24 A
HPGL0370 amast48 A
HPGL0372 amast72 A
HPGL0374 metac B
HPGL0376 amast4 B
HPGL0378 amast24 B
HPGL0380 amast48 B
HPGL0382 amast72 B
HPGL0397 metac C
HPGL0399 amast4 C
HPGL0401 amast24 C
HPGL0403 amast48 C
HPGL0405 amast72 C
HPGL0456 metac E
HPGL0459 amast4 E
HPGL0463 amast24 E
HPGL0467 amast48 E
HPGL0494 metac F
HPGL0497 amast4 F
HPGL0501 amast24 F
HPGL0505 amast48 F
HPGL0509 amast72 F

2 Plots!

colnames(countsTable) <- paste(condition, batch, 1:length(condition), sep=".")
#View barplot of raw counts by sample
barplot(colSums(countsTable), las=3, ylim=c(0,3e+07))

df <- data.frame(cond=(condition))
dds <- DESeqDataSetFromMatrix(countData=countsTable, colData = df, design = ~ cond)
dds <- estimateSizeFactors(dds)
ncts <- counts(dds, normalized=TRUE)
y <- log(ncts + 1)

#Determine median and quantiles of the size factor normalized, log2 counts
median(y)
## [1] 5.886
#[1] 5.886309

quantile(y)
##     0%    25%    50%    75%   100% 
##  0.000  5.436  5.886  6.357 10.833
#       0%       25%       50%       75%      100%
# 0.000000  5.435910  5.886309  6.357313 10.832889

#Determine number of genes per sample with less than log2 of 2 counts (less than 4 counts per mill)
ydf <- as.data.frame(y)
colnames(ydf) <- sampleID

col.nam <- paste(condition, batch, 1:length(condition), sep=".")
#Boxplot of per sample log of size-factor-normalized counts (+ 1)
par(mar=c(10.5,4.5,2,1))
par(oma=c(0,0,0,0))
boxplot(y, names=col.nam, las=3)

#Heatmap of Pearson correlation between samples
#(delete Rowv, Colv, and dendrogram parameters if want samples to sort and show dendrogram)
#Correlation analysis (Pearson is default for cor unless othewise specified)
#Using raw counts
datCor <- cor(countsTable)
heatmap.2(datCor, Rowv=NA, Colv=NA,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          dendrogram="none",
          scale="none",
          trace="none",
          srtCol=45)

#Using raw counts
corM <- matrixStats::rowMedians(cor(countsTable))
qs <- quantile(corM,p=c(1,3)/4)
iqr <- diff(qs)
outLimit <- qs[1] - 1.5 * iqr
ylim <- c(pmin(min(corM),outLimit),max(corM))
col <-  ifelse(condition=="amast4", "cornflowerblue",
        ifelse(condition=="amast24", "gold",
        ifelse(condition=="amast48", "green3",
        ifelse(condition=="amast72", "coral", "mediumorchid4"))))

plot(corM, xaxt="n", ylim=ylim, ylab="Median Pairwise Correlation", xlab="", main="", col=col, pch=16, cex=2.2)
axis(side=1,at=seq(along=corM),labels=paste(condition,batch,sep=":"),las=2)
abline(h=outLimit,lty=2)
abline(v=1:length(col.nam), lty=3, col="black")

#Use filterCounts method to filter for low counts
#Define filterCounts function
filterCounts = function (counts, lib.size = NULL, thresh = 1, minSamples = 2) {
    cpms <- 2^log2CPM(counts, lib.size = lib.size)$y
    keep <- rowSums(cpms > thresh) >= minSamples
    counts <- counts[keep, ]
    counts
}
x <- table(condition)
dim(countsTable)
## [1] 8486   24
#[1] 8486   24
counts <- filterCounts(countsTable, thresh=1, minSamples=min(x))
dim(counts)
## [1] 8480   24
#[1] 8480   24

#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Explore data for batch effects
#Transform data to log2 counts per million
x <- log2CPM(countsSubQ)
#Compute principal components
s <- makeSVD(x$y)

#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
##    propVar cumPropVar cond.R2 batch.R2
## 1    39.68      39.68   82.46    13.46
## 2    12.96      52.64   14.53    61.01
## 3     8.02      60.66   35.08    18.08
## 4     7.13      67.79   54.67    13.29
## 5     5.35      73.14    7.61    63.48
## 6     4.23      77.37   14.21    37.35
## 7     3.48      80.85   16.10    35.96
## 8     2.94      83.79   23.22    22.08
## 9     2.15      85.94   13.26    46.74
## 10    2.11      88.05   16.58    18.60
## 11    1.76      89.81    8.03    24.71
## 12    1.58      91.39    5.81     3.39
## 13    1.36      92.75   34.83     4.28
## 14    1.28      94.03    2.16     7.12
## 15    1.08      95.11    7.23     6.02
## 16    0.87      95.98    6.08     3.74
## 17    0.77      96.75    0.84     8.10
## 18    0.72      97.47    0.49     0.82
## 19    0.60      98.07    4.59     4.24
## 20    0.57      98.64    2.70     2.46
## 21    0.48      99.12   36.70     1.44
## 22    0.46      99.58    9.52     0.94
## 23    0.41      99.99    3.29     2.70
#   propVar cumPropVar cond.R2 batch.R2
#1    39.68      39.68   82.46    13.46
#2    12.96      52.64   14.53    61.01
#3     8.02      60.66   35.08    18.08
#4     7.13      67.79   54.67    13.29
#5     5.35      73.14    7.61    63.48
#6     4.23      77.37   14.21    37.35
#7     3.48      80.85   16.10    35.96
#8     2.94      83.79   23.22    22.08
#9     2.15      85.94   13.26    46.74
#10    2.11      88.05   16.58    18.60
#11    1.76      89.81    8.03    24.71
#12    1.58      91.39    5.81     3.39
#13    1.36      92.75   34.83     4.28
#14    1.28      94.03    2.16     7.12
#15    1.08      95.11    7.23     6.02
#16    0.87      95.98    6.08     3.74
#17    0.77      96.75    0.84     8.10
#18    0.72      97.47    0.49     0.82
#19    0.60      98.07    4.59     4.24
#20    0.57      98.64    2.70     2.46
#21    0.48      99.12   36.70     1.44
#22    0.46      99.58    9.52     0.94
#23    0.41      99.99    3.29     2.70

#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="A", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.11, y=-0.40, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(counts))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

#I would like to create a PCA plot and heatmap to reflect the use of batch in the limma model
#Specify model
mod <- model.matrix(~batch)

#Use voom to determine weights for fitting the mean-variance trend for each gene
v <- voom(countsSubQ, mod)
#Fit the linear model for each gene
fit <- lmFit(v)

#Get the residual (i.e. everything except for the batch effect)
newData <- residuals(fit, v)
#Explore data for batch effects
#Compute principal components
s <- makeSVD(newData)

#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
##    propVar cumPropVar cond.R2 batch.R2
## 1    47.04      47.04   93.02        0
## 2    10.77      57.81   56.14        0
## 3     8.74      66.55   54.42        0
## 4     7.16      73.71    7.30        0
## 5     4.45      78.16   49.96        0
## 6     3.73      81.89    4.61        0
## 7     3.11      85.00    8.66        0
## 8     2.22      87.22    1.08        0
## 9     2.10      89.32   13.03        0
## 10    1.89      91.21   14.31        0
## 11    1.63      92.84   27.11        0
## 12    1.42      94.26    1.27        0
## 13    1.13      95.39    7.26        0
## 14    0.97      96.36    0.42        0
## 15    0.92      97.28    2.73        0
## 16    0.80      98.08    4.79        0
## 17    0.68      98.76   22.51        0
## 18    0.64      99.40   19.19        0
## 19    0.58      99.98    8.19        0
## 20    0.00      99.98    0.34      100
## 21    0.00      99.98    1.23      100
## 22    0.00      99.98    2.13      100
#   propVar cumPropVar cond.R2 batch.R2
#1    47.04      47.04   93.02        0
#2    10.77      57.81   56.14        0
#3     8.74      66.55   54.42        0
#4     7.16      73.71    7.30        0
#5     4.45      78.16   49.96        0
#6     3.73      81.89    4.61        0
#7     3.11      85.00    8.66        0
#8     2.22      87.22    1.08        0
#9     2.10      89.32   13.03        0
#10    1.89      91.21   14.31        0
#11    1.63      92.84   27.11        0
#12    1.42      94.26    1.27        0
#13    1.13      95.39    7.26        0
#14    0.97      96.36    0.42        0
#15    0.92      97.28    2.73        0
#16    0.80      98.08    4.79        0
#17    0.68      98.76   22.51        0
#18    0.64      99.40   19.19        0
#19    0.58      99.98    8.19        0
#20    0.00      99.98    0.34      100
#21    0.00      99.98    1.24      100
#22    0.00      99.98    2.12      100

#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="A", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.48, y=-0.24, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(newData))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)

vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}

heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
                     RowSideColors=batch.color, key="FALSE", srtCol=45)

countsSubQ <- qNorm(counts)
#Specify model
mod = model.matrix(~0+condition+batch)
#View mean-variance trend
v <- voom(countsSubQ, mod, plot=TRUE)

#Fit a linear model for each gene using the specified design contained in v
fit <- lmFit(v)
##metac v amast4##
#eBayes finds an F-statistic from the set of t-statistics for that gene
metac.amast4.contr.mat <- makeContrasts(metac_v_amast4=conditionamast4-conditionmetac, levels=v$design)
metac.amast4.fit <- contrasts.fit(fit, metac.amast4.contr.mat)
metac.amast4.eb <- eBayes(metac.amast4.fit)
metac.amast4.topTab <- topTable(metac.amast4.eb, coef="metac_v_amast4", number=nrow(v$E))
metac.amast4.topTab[["fold_change"]] <- 2 ^ metac.amast4.topTab[["logFC"]]
write.csv(file="csv/lmajor_metac_vs_amast4.csv", x=metac.amast4.topTab)

#View the list of DE genes
head(metac.amast4.topTab, n=3L)
##                logFC AveExpr     t   P.Value adj.P.Val     B fold_change
## LmjF.19.1530-1 1.480   8.453 14.50 8.359e-14 3.599e-10 21.44       2.790
## LmjF.26.0640-1 1.822   7.266 14.49 8.488e-14 3.599e-10 21.31       3.537
## LmjF.36.2030-1 1.387   8.786 11.68 1.026e-11 1.822e-08 16.86       2.616
#                  logFC  AveExpr        t      P.Value    adj.P.Val        B
#LmjF.19.1530-1 1.480236 8.453253 14.50158 8.359353e-14 3.598949e-10 21.43701
#LmjF.26.0640-1 1.822403 7.266265 14.49185 8.488086e-14 3.598949e-10 21.30901
#LmjF.36.2030-1 1.387168 8.785515 11.67783 1.026427e-11 1.822331e-08 16.85840


#Limit list to genes with an adjusted p value < 0.05
metac.amast4.sigGenes <- metac.amast4.topTab[metac.amast4.topTab$adj.P.Val <0.05, ]
length(metac.amast4.sigGenes$logFC)
## [1] 3224
#3224

#Filter out rows with less than 2-fold change (log2 fold change of > 1)
metac.amast4.sigGenesFold1 <- subset(metac.amast4.sigGenes, abs(logFC) > 1)
length(metac.amast4.sigGenesFold1$logFC)
## [1] 304
#304

#Filter out rows with less than 4-fold change (log2 fold change of > 2)
metac.amast4.sigGenesFold2 <- subset(metac.amast4.sigGenes, abs(logFC) > 2)
length(metac.amast4.sigGenesFold2$logFC)
## [1] 12
#12

metac.amast4.sigGenes <- metac.amast4.sigGenes[order(-metac.amast4.sigGenes$logFC), ]

#Make an MA plot
sel = metac.amast4.topTab$adj.P.Val < 0.05
top = metac.amast4.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E

plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="metac_amast4 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast4 v amast24##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast4.amast24.contr.mat <- makeContrasts(amast4_v_amast24=conditionamast24-conditionamast4, levels=v$design)
amast4.amast24.fit <- contrasts.fit(fit, amast4.amast24.contr.mat)
amast4.amast24.eb <- eBayes(amast4.amast24.fit)
amast4.amast24.topTab <- topTable(amast4.amast24.eb, coef="amast4_v_amast24", number=nrow(v$E))
#View the list of DE genes
head(amast4.amast24.topTab, n=3L)
##                 logFC AveExpr     t   P.Value adj.P.Val      B
## LmjF.28.1570-1 1.5786   6.136 8.744 3.870e-09 3.282e-05 10.769
## LmjF.05.0010-1 0.9452   8.273 8.130 1.540e-08 5.636e-05  9.638
## LmjF.36.2760-1 1.3463   5.846 8.018 1.994e-08 5.636e-05  9.229
#                   logFC  AveExpr        t      P.Value    adj.P.Val         B
#LmjF.28.1570-1 1.5785593 6.135850 8.744485 3.869906e-09 3.281680e-05 10.769201
#LmjF.05.0010-1 0.9451551 8.273246 8.130366 1.539908e-08 5.635831e-05  9.638152
#LmjF.36.2760-1 1.3462634 5.845910 8.017909 1.993808e-08 5.635831e-05  9.229250

#Limit list to genes with an adjusted p value < 0.05
amast4.amast24.sigGenes <- amast4.amast24.topTab[amast4.amast24.topTab$adj.P.Val <0.05, ]
length(amast4.amast24.sigGenes$logFC)
## [1] 356
#356

#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast4.amast24.sigGenesFold1 <- subset(amast4.amast24.sigGenes, abs(logFC) > 1)
length(amast4.amast24.sigGenesFold1$logFC)
## [1] 38
#38

#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast4.amast24.sigGenesFold2 <- subset(amast4.amast24.sigGenes, abs(logFC) > 2)
length(amast4.amast24.sigGenesFold2$logFC)
## [1] 1
#1

amast4.amast24.sigGenes <- amast4.amast24.sigGenes[order(-amast4.amast24.sigGenes$logFC), ]
#Make an MA plot
sel = amast4.amast24.topTab$adj.P.Val < 0.05
top = amast4.amast24.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E


plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast4_amast24 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

dev.off()
## null device 
##           1
##amast24 v amast48##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast24.amast48.contr.mat <- makeContrasts(amast24_v_amast48=conditionamast48-conditionamast24, levels=v$design)
amast24.amast48.fit <- contrasts.fit(fit, amast24.amast48.contr.mat)
amast24.amast48.eb <- eBayes(amast24.amast48.fit)
amast24.amast48.topTab <- topTable(amast24.amast48.eb, coef="amast24_v_amast48", number=nrow(v$E))

#View the list of DE genes
head(amast24.amast48.topTab, n=3L)
##                  logFC AveExpr      t   P.Value adj.P.Val     B
## LmjF.36.2760-1 -0.9615   5.846 -5.831 4.143e-06   0.02907 3.175
## LmjF.04.0310-1 -1.0496   6.963 -5.552 8.498e-06   0.02907 2.911
## LmjF.29.2710-1  0.6017   6.277  5.478 1.028e-05   0.02907 2.662
#                    logFC  AveExpr         t      P.Value  adj.P.Val        B
#LmjF.36.2760-1 -0.9614720 5.845910 -5.831286 4.142968e-06 0.02907032 3.175088
#LmjF.04.0310-1 -1.0496414 6.963380 -5.551887 8.497593e-06 0.02907032 2.910560
#LmjF.29.2710-1  0.6016944 6.277076  5.478035 1.028431e-05 0.02907032 2.661694

#Limit list to genes with an adjusted p value < 0.05
amast24.amast48.sigGenes <- amast24.amast48.topTab[amast24.amast48.topTab$adj.P.Val <0.05, ]
length(amast24.amast48.sigGenes$logFC)
## [1] 3
#3

#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast24.amast48.sigGenesFold1 <- subset(amast24.amast48.sigGenes, abs(logFC) > 1)
length(amast24.amast48.sigGenesFold1$logFC)
## [1] 1
#1

#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast24.amast48.sigGenesFold2 <- subset(amast24.amast48.sigGenes, abs(logFC) > 2)
length(amast24.amast48.sigGenesFold2$logFC)
## [1] 0
amast24.amast48.sigGenes <- amast24.amast48.sigGenes[order(-amast24.amast48.sigGenes$logFC), ]

#Make an MA plot
sel = amast24.amast48.topTab$adj.P.Val < 0.05
top = amast24.amast48.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast24_amast48 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast48 v amast72##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast48.amast72.contr.mat <- makeContrasts(amast48_v_amast72=conditionamast72-conditionamast48, levels=v$design)
amast48.amast72.fit <- contrasts.fit(fit, amast48.amast72.contr.mat)
amast48.amast72.eb <- eBayes(amast48.amast72.fit)
amast48.amast72.topTab <- topTable(amast48.amast72.eb, coef="amast48_v_amast72", number=nrow(v$E))

#View the list of DE genes
head(amast48.amast72.topTab, n=3L)
##                  logFC AveExpr      t   P.Value adj.P.Val       B
## LmjF.23.0025-1 -0.5558   6.033 -4.173 0.0003087    0.3816 -0.2582
## LmjF.13.1500-1  0.5284   7.658  3.932 0.0005761    0.3816 -0.5304
## LmjF.22.1640-1 -0.6805   6.958 -3.882 0.0006544    0.3816 -0.6542
#                    logFC  AveExpr         t      P.Value adj.P.Val          B
#LmjF.23.0025-1 -0.5557985 6.033048 -4.172762 0.0003086924 0.3815768 -0.2582314
#LmjF.13.1500-1  0.5283509 7.658283  3.931807 0.0005760897 0.3815768 -0.5303752
#LmjF.22.1640-1 -0.6805308 6.957837 -3.882314 0.0006544461 0.3815768 -0.6542054

#Limit list to genes with an adjusted p value < 0.05
amast48.amast72.sigGenes <- amast48.amast72.topTab[amast48.amast72.topTab$adj.P.Val <0.05, ]
length(amast48.amast72.sigGenes$logFC)
## [1] 0
#0

#Make an MA plot
sel = amast48.amast72.topTab$adj.P.Val < 0.05
top = amast48.amast72.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E

plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast48_amast72 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")
if (isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  message(paste0("Saving to ", savefile))
  tmp <- sm(saveme(filename=savefile))
}
---
title: "L.major/amazonensis 2016: Repeat previous lmajor human logs."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: tango
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: cosmo
  toc: true
  toc_float:
   collapsed: false
   smooth_scroll: false
---

<style>
body .main-container {
max-width: 1600px;
}
</style>

```{r options, include=FALSE}
if (!isTRUE(get0("skip_load"))) {
  library(hpgltools)
  tt <- devtools::load_all("~/hpgltools")
  knitr::opts_knit$set(progress=TRUE,
                       verbose=TRUE,
                       width=90,
                       echo=TRUE)
  knitr::opts_chunk$set(error=TRUE,
                        fig.width=8,
                        fig.height=8,
                        dpi=96)
  old_options <- options(digits=4,
                         stringsAsFactors=FALSE,
                         knitr.duplicate.label="allow")
  ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
  ver <- "20180424"
  previous_file <- paste0("index.Rmd")

  tmp <- try(sm(loadme(filename=gsub(pattern="\\.Rmd", replace="\\.rda\\.xz", x=previous_file))))
  rmd_file <- paste0("repeat_log_lmajor_hsapiens.Rmd")
  savefile <- gsub(pattern="\\.Rmd", replace="\\.rda\\.xz", x=rmd_file)
}
```

# Repeat a previous set of analyses and see where we differ

I am going to intersperse the original comments and code, then compare with my own analyses.

<pre>
#lminfectome_dillonl_20150714_PCA_heatmap_samplediagnostics_limma_Lmajor60_metac_amast_no471_CDSonly_cecilia_forpublication.log
#Purpose: To repeat the analysis from 20150315, including the metacyclic samples in addition to the amastigote
#samples for the 5 batches for the human publication. HPGL0471 is excluded because it was found not to be an infected sample.cludes sample diagnostic, PCA, and heatmap analyses, as well as differential expression analysis to compare the o 4-hr amastigote transition and the amastigote samples across timepoints.
#This log is for parasite samples only using the count table restricted to CDS only.

#Count table generation is described in logs:
#lminfectome_dillonl_20140329_TopHat_HTSeq_Count_Table_hg19_Lmajor_HPGL0363-382_cecilia.log
#lminfectome_dillonl_20140712_TopHat_HTSeq_Count_Table_hg19_Lmajor_HPGL0396-405_cecilia.log
#lminfectome_dillonl_20141001_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0434-446_452-472_cecilia.logectome_dillonl_20141211_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0491-510_cecilia.log

#Name           condition       batch
#HPGL0364       metac (PNA)     A
#HPGL0366       amastLM4        A
#HPGL0368       amastLM24       A
#HPGL0370       amastLM48       A
#HPGL0372       amastLM72       A
#HPGL0374       metac (PNA)     B
#HPGL0376       amastLM4        B
#HPGL0380       amastLM48       B
#HPGL0382       amastLM72       B
#HPGL0397       metac (PNA)     C
#HPGL0399       amastLM4        C
#HPGL0401       amastLM24       C
#HPGL0403       amastLM48       C
#HPGL0405       amastLM72       C
#HPGL0456       metac (PNA)     E
#HPGL0459       amastLM4        E
#HPGL0463       amastLM24       E
#HPGL0467       amastLM48       E
#HPGL0494       metac (PNA)     F
#HPGL0497       amastLM4        F
#HPGL0501       amastLM24       F
#HPGL0505       amastLM48       F
#HPGL0509       amastLM72       F
#Count table is:
#20141211_Lmajor60_346_347_349_351_353_355_363_364_366_368_370_372_373_374_376_378_380_382_396_397_399_401_403_405_455-456_459_463_467_471_493-494_497_501_505_509_CDSonly.count
#Includes headers

#7-14-15

#L. major
#Create diagnostic plots
#Hector recommended that all diagnostics be done using the rawest data possible. For correlation analyses between samples,
#use countsTable data without filtering out lowly expressed genes and without normalization. Size factor normalization
#will be used to show the distribution of gene expression levels.

#This analysis uses count tables that have not been restricted to no471_CDS only.
</pre>

```{r repeat_log}
#20141211_Lmajor60_346_347_349_351_353_355_363_364_366_368_370_372_373_374_376_378_380_382_396_397_399_401_403_405_455-456_459_463_467_471_493-494_497_501_505_509_CDSonly.count
library(cbcbSEQ)
library(gplots)
library(matrixStats)
library(siggenes)
library(ReportingTools)
library(hwriter)
library(DESeq2)
library(RColorBrewer)

#Read in counts table
countsTable <- read.table("preprocessing/dillonl/20141211_Lmajor60_346_347_349_351_353_355_363_364_366_368_370_372_373_374_376_378_380_382_396_397_399_401_403_405_455-456_459_463_467_471_493-494_497_501_505_509_CDSonly.count.xz", header=TRUE)
#Remove id as an actual column, use as row names instead
rownames(countsTable) <- countsTable$id
countsTable <- countsTable[, -1, drop=FALSE]

#Restrict to samples of interest (exclude procyclic promastigote samples and HPGL0471)
countsTable <- countsTable[,-c(1:7, 13, 19, 25, 30:31)]

#Establish metadata for samples
sampleID <- colnames(countsTable)
condition <- rep(c("metac", "amast4", "amast24", "amast48", "amast72"), times=5)
batch <- rep(c("A", "B", "C", "E", "F"), each=5)
condition <- condition[-20]
batch <- batch[-20]

condition <- factor(condition, levels=c("metac", "amast4", "amast24", "amast48", "amast72"))
batch <- factor(batch, levels=c("A", "B", "C", "E", "F"))
design <- data.frame(sampleID=sampleID, condition=condition, batch=batch)
knitr::kable(design)
```

# Plots!

```{r plots}
colnames(countsTable) <- paste(condition, batch, 1:length(condition), sep=".")
#View barplot of raw counts by sample
barplot(colSums(countsTable), las=3, ylim=c(0,3e+07))

df <- data.frame(cond=(condition))
dds <- DESeqDataSetFromMatrix(countData=countsTable, colData = df, design = ~ cond)
dds <- estimateSizeFactors(dds)
ncts <- counts(dds, normalized=TRUE)
y <- log(ncts + 1)

#Determine median and quantiles of the size factor normalized, log2 counts
median(y)
#[1] 5.886309

quantile(y)
#       0%       25%       50%       75%      100%
# 0.000000  5.435910  5.886309  6.357313 10.832889

#Determine number of genes per sample with less than log2 of 2 counts (less than 4 counts per mill)
ydf <- as.data.frame(y)
colnames(ydf) <- sampleID

col.nam <- paste(condition, batch, 1:length(condition), sep=".")
#Boxplot of per sample log of size-factor-normalized counts (+ 1)
par(mar=c(10.5,4.5,2,1))
par(oma=c(0,0,0,0))
boxplot(y, names=col.nam, las=3)

#Heatmap of Pearson correlation between samples
#(delete Rowv, Colv, and dendrogram parameters if want samples to sort and show dendrogram)
#Correlation analysis (Pearson is default for cor unless othewise specified)
#Using raw counts
datCor <- cor(countsTable)
heatmap.2(datCor, Rowv=NA, Colv=NA,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          dendrogram="none",
          scale="none",
          trace="none",
          srtCol=45)


#Using raw counts
corM <- matrixStats::rowMedians(cor(countsTable))
qs <- quantile(corM,p=c(1,3)/4)
iqr <- diff(qs)
outLimit <- qs[1] - 1.5 * iqr
ylim <- c(pmin(min(corM),outLimit),max(corM))
col <-  ifelse(condition=="amast4", "cornflowerblue",
        ifelse(condition=="amast24", "gold",
        ifelse(condition=="amast48", "green3",
        ifelse(condition=="amast72", "coral", "mediumorchid4"))))

plot(corM, xaxt="n", ylim=ylim, ylab="Median Pairwise Correlation", xlab="", main="", col=col, pch=16, cex=2.2)
axis(side=1,at=seq(along=corM),labels=paste(condition,batch,sep=":"),las=2)
abline(h=outLimit,lty=2)
abline(v=1:length(col.nam), lty=3, col="black")

#Use filterCounts method to filter for low counts
#Define filterCounts function
filterCounts = function (counts, lib.size = NULL, thresh = 1, minSamples = 2) {
    cpms <- 2^log2CPM(counts, lib.size = lib.size)$y
    keep <- rowSums(cpms > thresh) >= minSamples
    counts <- counts[keep, ]
    counts
}
x <- table(condition)
dim(countsTable)
#[1] 8486   24
counts <- filterCounts(countsTable, thresh=1, minSamples=min(x))
dim(counts)
#[1] 8480   24

#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Explore data for batch effects
#Transform data to log2 counts per million
x <- log2CPM(countsSubQ)
#Compute principal components
s <- makeSVD(x$y)

#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
#   propVar cumPropVar cond.R2 batch.R2
#1    39.68      39.68   82.46    13.46
#2    12.96      52.64   14.53    61.01
#3     8.02      60.66   35.08    18.08
#4     7.13      67.79   54.67    13.29
#5     5.35      73.14    7.61    63.48
#6     4.23      77.37   14.21    37.35
#7     3.48      80.85   16.10    35.96
#8     2.94      83.79   23.22    22.08
#9     2.15      85.94   13.26    46.74
#10    2.11      88.05   16.58    18.60
#11    1.76      89.81    8.03    24.71
#12    1.58      91.39    5.81     3.39
#13    1.36      92.75   34.83     4.28
#14    1.28      94.03    2.16     7.12
#15    1.08      95.11    7.23     6.02
#16    0.87      95.98    6.08     3.74
#17    0.77      96.75    0.84     8.10
#18    0.72      97.47    0.49     0.82
#19    0.60      98.07    4.59     4.24
#20    0.57      98.64    2.70     2.46
#21    0.48      99.12   36.70     1.44
#22    0.46      99.58    9.52     0.94
#23    0.41      99.99    3.29     2.70

#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="A", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.11, y=-0.40, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(counts))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

#I would like to create a PCA plot and heatmap to reflect the use of batch in the limma model
#Specify model
mod <- model.matrix(~batch)

#Use voom to determine weights for fitting the mean-variance trend for each gene
v <- voom(countsSubQ, mod)
#Fit the linear model for each gene
fit <- lmFit(v)

#Get the residual (i.e. everything except for the batch effect)
newData <- residuals(fit, v)
#Explore data for batch effects
#Compute principal components
s <- makeSVD(newData)

#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
#   propVar cumPropVar cond.R2 batch.R2
#1    47.04      47.04   93.02        0
#2    10.77      57.81   56.14        0
#3     8.74      66.55   54.42        0
#4     7.16      73.71    7.30        0
#5     4.45      78.16   49.96        0
#6     3.73      81.89    4.61        0
#7     3.11      85.00    8.66        0
#8     2.22      87.22    1.08        0
#9     2.10      89.32   13.03        0
#10    1.89      91.21   14.31        0
#11    1.63      92.84   27.11        0
#12    1.42      94.26    1.27        0
#13    1.13      95.39    7.26        0
#14    0.97      96.36    0.42        0
#15    0.92      97.28    2.73        0
#16    0.80      98.08    4.79        0
#17    0.68      98.76   22.51        0
#18    0.64      99.40   19.19        0
#19    0.58      99.98    8.19        0
#20    0.00      99.98    0.34      100
#21    0.00      99.98    1.24      100
#22    0.00      99.98    2.12      100

#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="A", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.48, y=-0.24, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(newData))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)

vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}

heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
                     RowSideColors=batch.color, key="FALSE", srtCol=45)

countsSubQ <- qNorm(counts)
#Specify model
mod = model.matrix(~0+condition+batch)
#View mean-variance trend
v <- voom(countsSubQ, mod, plot=TRUE)
#Fit a linear model for each gene using the specified design contained in v
fit <- lmFit(v)
##metac v amast4##
#eBayes finds an F-statistic from the set of t-statistics for that gene
metac.amast4.contr.mat <- makeContrasts(metac_v_amast4=conditionamast4-conditionmetac, levels=v$design)
metac.amast4.fit <- contrasts.fit(fit, metac.amast4.contr.mat)
metac.amast4.eb <- eBayes(metac.amast4.fit)
metac.amast4.topTab <- topTable(metac.amast4.eb, coef="metac_v_amast4", number=nrow(v$E))
metac.amast4.topTab[["fold_change"]] <- 2 ^ metac.amast4.topTab[["logFC"]]
write.csv(file="csv/lmajor_metac_vs_amast4.csv", x=metac.amast4.topTab)

#View the list of DE genes
head(metac.amast4.topTab, n=3L)
#                  logFC  AveExpr        t      P.Value    adj.P.Val        B
#LmjF.19.1530-1 1.480236 8.453253 14.50158 8.359353e-14 3.598949e-10 21.43701
#LmjF.26.0640-1 1.822403 7.266265 14.49185 8.488086e-14 3.598949e-10 21.30901
#LmjF.36.2030-1 1.387168 8.785515 11.67783 1.026427e-11 1.822331e-08 16.85840


#Limit list to genes with an adjusted p value < 0.05
metac.amast4.sigGenes <- metac.amast4.topTab[metac.amast4.topTab$adj.P.Val <0.05, ]
length(metac.amast4.sigGenes$logFC)
#3224

#Filter out rows with less than 2-fold change (log2 fold change of > 1)
metac.amast4.sigGenesFold1 <- subset(metac.amast4.sigGenes, abs(logFC) > 1)
length(metac.amast4.sigGenesFold1$logFC)
#304

#Filter out rows with less than 4-fold change (log2 fold change of > 2)
metac.amast4.sigGenesFold2 <- subset(metac.amast4.sigGenes, abs(logFC) > 2)
length(metac.amast4.sigGenesFold2$logFC)
#12

metac.amast4.sigGenes <- metac.amast4.sigGenes[order(-metac.amast4.sigGenes$logFC), ]

#Make an MA plot
sel = metac.amast4.topTab$adj.P.Val < 0.05
top = metac.amast4.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E

plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="metac_amast4 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast4 v amast24##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast4.amast24.contr.mat <- makeContrasts(amast4_v_amast24=conditionamast24-conditionamast4, levels=v$design)
amast4.amast24.fit <- contrasts.fit(fit, amast4.amast24.contr.mat)
amast4.amast24.eb <- eBayes(amast4.amast24.fit)
amast4.amast24.topTab <- topTable(amast4.amast24.eb, coef="amast4_v_amast24", number=nrow(v$E))
#View the list of DE genes
head(amast4.amast24.topTab, n=3L)
#                   logFC  AveExpr        t      P.Value    adj.P.Val         B
#LmjF.28.1570-1 1.5785593 6.135850 8.744485 3.869906e-09 3.281680e-05 10.769201
#LmjF.05.0010-1 0.9451551 8.273246 8.130366 1.539908e-08 5.635831e-05  9.638152
#LmjF.36.2760-1 1.3462634 5.845910 8.017909 1.993808e-08 5.635831e-05  9.229250

#Limit list to genes with an adjusted p value < 0.05
amast4.amast24.sigGenes <- amast4.amast24.topTab[amast4.amast24.topTab$adj.P.Val <0.05, ]
length(amast4.amast24.sigGenes$logFC)
#356

#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast4.amast24.sigGenesFold1 <- subset(amast4.amast24.sigGenes, abs(logFC) > 1)
length(amast4.amast24.sigGenesFold1$logFC)
#38

#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast4.amast24.sigGenesFold2 <- subset(amast4.amast24.sigGenes, abs(logFC) > 2)
length(amast4.amast24.sigGenesFold2$logFC)
#1

amast4.amast24.sigGenes <- amast4.amast24.sigGenes[order(-amast4.amast24.sigGenes$logFC), ]
#Make an MA plot
sel = amast4.amast24.topTab$adj.P.Val < 0.05
top = amast4.amast24.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E


plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast4_amast24 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")
dev.off()

##amast24 v amast48##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast24.amast48.contr.mat <- makeContrasts(amast24_v_amast48=conditionamast48-conditionamast24, levels=v$design)
amast24.amast48.fit <- contrasts.fit(fit, amast24.amast48.contr.mat)
amast24.amast48.eb <- eBayes(amast24.amast48.fit)
amast24.amast48.topTab <- topTable(amast24.amast48.eb, coef="amast24_v_amast48", number=nrow(v$E))

#View the list of DE genes
head(amast24.amast48.topTab, n=3L)
#                    logFC  AveExpr         t      P.Value  adj.P.Val        B
#LmjF.36.2760-1 -0.9614720 5.845910 -5.831286 4.142968e-06 0.02907032 3.175088
#LmjF.04.0310-1 -1.0496414 6.963380 -5.551887 8.497593e-06 0.02907032 2.910560
#LmjF.29.2710-1  0.6016944 6.277076  5.478035 1.028431e-05 0.02907032 2.661694

#Limit list to genes with an adjusted p value < 0.05
amast24.amast48.sigGenes <- amast24.amast48.topTab[amast24.amast48.topTab$adj.P.Val <0.05, ]
length(amast24.amast48.sigGenes$logFC)
#3

#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast24.amast48.sigGenesFold1 <- subset(amast24.amast48.sigGenes, abs(logFC) > 1)
length(amast24.amast48.sigGenesFold1$logFC)
#1

#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast24.amast48.sigGenesFold2 <- subset(amast24.amast48.sigGenes, abs(logFC) > 2)
length(amast24.amast48.sigGenesFold2$logFC)

amast24.amast48.sigGenes <- amast24.amast48.sigGenes[order(-amast24.amast48.sigGenes$logFC), ]

#Make an MA plot
sel = amast24.amast48.topTab$adj.P.Val < 0.05
top = amast24.amast48.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast24_amast48 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast48 v amast72##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast48.amast72.contr.mat <- makeContrasts(amast48_v_amast72=conditionamast72-conditionamast48, levels=v$design)
amast48.amast72.fit <- contrasts.fit(fit, amast48.amast72.contr.mat)
amast48.amast72.eb <- eBayes(amast48.amast72.fit)
amast48.amast72.topTab <- topTable(amast48.amast72.eb, coef="amast48_v_amast72", number=nrow(v$E))

#View the list of DE genes
head(amast48.amast72.topTab, n=3L)
#                    logFC  AveExpr         t      P.Value adj.P.Val          B
#LmjF.23.0025-1 -0.5557985 6.033048 -4.172762 0.0003086924 0.3815768 -0.2582314
#LmjF.13.1500-1  0.5283509 7.658283  3.931807 0.0005760897 0.3815768 -0.5303752
#LmjF.22.1640-1 -0.6805308 6.957837 -3.882314 0.0006544461 0.3815768 -0.6542054

#Limit list to genes with an adjusted p value < 0.05
amast48.amast72.sigGenes <- amast48.amast72.topTab[amast48.amast72.topTab$adj.P.Val <0.05, ]
length(amast48.amast72.sigGenes$logFC)
#0

#Make an MA plot
sel = amast48.amast72.topTab$adj.P.Val < 0.05
top = amast48.amast72.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E

plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast48_amast72 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

```

```{r saveme}
if (isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  message(paste0("Saving to ", savefile))
  tmp <- sm(saveme(filename=savefile))
}
```
