index.html

1 Repeat a previous set of analyses and see where we differ

I am going to intersperse the original comments and code, then compare with my own analyses.

#metacyclic to 4-hr amastigote transition and the amastigote samples across timepoints.
#This log is for parasite samples only using the count table restricted to CDS only.

#Count table generation is described in logs:
#lminfectome_dillonl_20141001_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0434-446_452-472_cecilia.log
#lminfectome_dillonl_20141211_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0491-510_cecilia.log

#Lmexicana81 (L. amazonensis) samples
#Name           condition       batch
#HPGL0435       metac           D
#HPGL0437       amastLA4        D
#HPGL0440       amastLA24       D
#HPGL0443       amastLA48       D
#HPGL0446       amastLA72       D
#HPGL0454       metac           E
#HPGL0458       amastLA4        E
#HPGL0462       amastLA24       E
#HPGL0466       amastLA48       E
#HPGL0470       amastLA72       E
#HPGL0492       metac           F
#HPGL0496       amastLA4        F
#HPGL0500       amastLA24       F
#HPGL0504       amastLA48       F
#HPGL0508       amastLA72       F
#Count table is:
#20141211_Lmexicana81_434-435_437_440_443_446_453-454_458_462_466_470_491-492_496_500_504_508_CDSonly.count
#Includes headers

#7-28-15
#L. amazonensis
#Create diagnostic plots
#Hector recommended that all diagnostics be done using the rawest data possible. For correlation analyses between samples,
#use countsTable data without filtering out lowly expressed genes and without normalization. Size factor normalization
#will be used to show the distribution of gene expression levels.

#This analysis uses count tables that have not been restricted to _CDS only.
library(cbcbSEQ)
## Loading required package: limma
## Loading required package: corpcor
## Loading required package: preprocessCore
## Loading required package: sva
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-23. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
## 
## Attaching package: 'cbcbSEQ'
## The following object is masked from 'package:hpgltools':
## 
##     pcRes
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:genefilter':
## 
##     rowSds, rowVars
library(siggenes)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## Loading required package: multtest
## 
## Attaching package: 'multtest'
## The following object is masked from 'package:gplots':
## 
##     wapply
## Loading required package: splines
library(ReportingTools)
## Loading required package: knitr
## 
## 
library(hwriter)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
## 
##     space
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:nlme':
## 
##     collapse
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: DelayedArray
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
library(RColorBrewer)

#Read in counts table
countsTable <- read.table("preprocessing/dillonl/20141211_Lmexicana81_434-435_437_440_443_446_453-454_458_462_466_470_491-492_496_500_504_508_CDSonly.count.xz", header=TRUE)
#Remove id as an actual column, use as row names instead
rownames(countsTable) <- countsTable$id
countsTable <- countsTable[,-1]
#Restrict to samples of interest (exclude procyclic promastigote samples)
countsTable <- countsTable[,-c(1, 7, 13)]
head(countsTable, n=3L)
##                HPGL0435 HPGL0437 HPGL0440 HPGL0443 HPGL0446 HPGL0454
## LmxM.01.0010-1     1910      154       34       38       56     2255
## LmxM.01.0020-1     1103      370      119      162      277     1639
## LmxM.01.0030-1      849      211       60      100      195      789
##                HPGL0458 HPGL0462 HPGL0466 HPGL0470 HPGL0492 HPGL0496
## LmxM.01.0010-1      264      151      217      203     1512       51
## LmxM.01.0020-1     1153     1519     1315     1345     1528      153
## LmxM.01.0030-1      475      668      735      689      785       88
##                HPGL0500 HPGL0504 HPGL0508
## LmxM.01.0010-1       52       13        8
## LmxM.01.0020-1      225       62       66
## LmxM.01.0030-1      135       29       48
#               HPGL0435 HPGL0437 HPGL0440 HPGL0443 HPGL0446 HPGL0454 HPGL0458 HPGL0462 HPGL0466 HPGL0470 HPGL0492 HPGL0496 HPGL0500 HPGL0504 HPGL0508
#LmxM.01.0010-1     1910      154       34       38       56     2255      264      151      217      203     1512       51       52       13        8
#LmxM.01.0020-1     1103      370      119      162      277     1639     1153     1519     1315     1345     1528      153      225       62       66
#LmxM.01.0030-1      849      211       60      100      195      789      475      668      735      689      785       88      135       29       48
#Establish metadata for samples
sampleID <- colnames(countsTable)
condition <- rep(c("metac", "amast4", "amast24", "amast48", "amast72"), times=3)
batch <- rep(c("D", "E", "F"), each=5)
#Create condition and batch factor
condition <- factor(condition, levels=c("metac", "amast4", "amast24", "amast48", "amast72"))
batch <- factor(batch, levels=c("D", "E", "F"))
design <- data.frame(sampleID=sampleID, condition=condition, batch=batch)
design
##    sampleID condition batch
## 1  HPGL0435     metac     D
## 2  HPGL0437    amast4     D
## 3  HPGL0440   amast24     D
## 4  HPGL0443   amast48     D
## 5  HPGL0446   amast72     D
## 6  HPGL0454     metac     E
## 7  HPGL0458    amast4     E
## 8  HPGL0462   amast24     E
## 9  HPGL0466   amast48     E
## 10 HPGL0470   amast72     E
## 11 HPGL0492     metac     F
## 12 HPGL0496    amast4     F
## 13 HPGL0500   amast24     F
## 14 HPGL0504   amast48     F
## 15 HPGL0508   amast72     F
#   sampleID condition batch
#1  HPGL0435     metac     D
#2  HPGL0437    amast4     D
#3  HPGL0440   amast24     D
#4  HPGL0443   amast48     D
#5  HPGL0446   amast72     D
#6  HPGL0454     metac     E
#7  HPGL0458    amast4     E
#8  HPGL0462   amast24     E
#9  HPGL0466   amast48     E
#10 HPGL0470   amast72     E
#11 HPGL0492     metac     F
#12 HPGL0496    amast4     F
#13 HPGL0500   amast24     F
#14 HPGL0504   amast48     F
#15 HPGL0508   amast72     F
colnames(countsTable) <- paste(condition, batch, 1:length(condition), sep=".")

barplot(colSums(countsTable), las=3, ylim=c(0,3e+07))

#Normalize raw counts using DESeq size factors
df <- data.frame(cond=(condition))
dds <- DESeqDataSetFromMatrix(countData=countsTable, colData = df, design = ~ cond)
dds <- estimateSizeFactors(dds)
ncts <- counts(dds, normalized=TRUE)
y <- log(ncts + 1)

#Determine median and quantiles of the size factor normalized, log2 counts
median(y)
## [1] 5.789
#[1] 5.788727
quantile(y)
##     0%    25%    50%    75%   100% 
##  0.000  5.331  5.789  6.257 10.702
#       0%       25%       50%       75%      100%
# 0.000000  5.330752  5.788727  6.256596 10.701582
#Determine number of genes per sample with less than log2 of 2 counts (less than 4 counts per mill)
ydf <- as.data.frame(y)
colnames(ydf) <- sampleID

col.nam <- paste(condition, batch, 1:length(condition), sep=".")
#Boxplot of per sample log of size-factor-normalized counts (+ 1)
par(mar=c(10.5,4.5,2,1))
par(oma=c(0,0,0,0))
boxplot(y, names=col.nam, las=3)

#Heatmap of Pearson correlation between samples
#(delete Rowv, Colv, and dendrogram parameters if want samples to sort and show dendrogram)
#Correlation analysis (Pearson is default for cor unless othewise specified)
#Using raw counts
datCor <- cor(countsTable)
heatmap.2(datCor, Rowv=NA, Colv=NA,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          dendrogram="none",
          scale="none",
          trace="none",
          srtCol=45)

#Median pairwise correlation
#Using raw counts
corM <- matrixStats::rowMedians(cor(countsTable))
qs <- quantile(corM,p=c(1,3)/4)
iqr <- diff(qs)
outLimit <- qs[1] - 1.5 * iqr

ylim <- c(pmin(min(corM),outLimit),max(corM))
col <-  ifelse(condition=="amast4", "cornflowerblue",
        ifelse(condition=="amast24", "gold",
        ifelse(condition=="amast48", "green3",
        ifelse(condition=="amast72", "coral", "mediumorchid4"))))

plot(corM, xaxt="n", ylim=ylim, ylab="Median Pairwise Correlation", xlab="", main="", col=col, pch=16, cex=2.2)
axis(side=1,at=seq(along=corM),labels=paste(condition,batch,sep=":"),las=2)
abline(h=outLimit,lty=2)
abline(v=1:length(col.nam), lty=3, col="black")

#Plot without IQR cutoff
#Use filterCounts method to filter for low counts
#Define filterCounts function

filterCounts = function (counts, lib.size = NULL, thresh = 1, minSamples = 2) {
    cpms <- 2^log2CPM(counts, lib.size = lib.size)$y
    keep <- rowSums(cpms > thresh) >= minSamples
    counts <- counts[keep, ]
    counts
}
x <- table(condition)
dim(countsTable)
## [1] 8336   15
#[1] 8336   15
counts <- filterCounts(countsTable, thresh=1, minSamples=min(x))
dim(counts)
## [1] 8310   15
#[1] 8310   15
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Explore data for batch effects
#Transform data to log2 counts per million
x <- log2CPM(countsSubQ)
#Compute principal components
s <- makeSVD(x$y)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
##    propVar cumPropVar cond.R2 batch.R2
## 1    59.12      59.12   96.62     2.72
## 2     9.04      68.16   75.86    11.13
## 3     6.50      74.66   21.37    38.43
## 4     5.03      79.69   14.54    67.85
## 5     4.03      83.72    1.61    48.30
## 6     3.36      87.08   26.63    17.28
## 7     2.85      89.93   18.80     5.67
## 8     2.52      92.45   48.05     0.46
## 9     1.77      94.22   12.92     3.09
## 10    1.67      95.89   19.51     0.70
## 11    1.37      97.26    9.44     0.57
## 12    1.21      98.47   16.94     0.59
## 13    1.10      99.57   11.11     3.03
## 14    0.44     100.01   26.60     0.18
#   propVar cumPropVar cond.R2 batch.R2
#1    59.12      59.12   96.62     2.72
#2     9.04      68.16   75.86    11.13
#3     6.50      74.66   21.37    38.43
#4     5.03      79.69   14.54    67.85
#5     4.03      83.72    1.61    48.30
#6     3.36      87.08   26.63    17.28
#7     2.85      89.93   18.80     5.67
#8     2.52      92.45   48.05     0.46
#9     1.77      94.22   12.92     3.09
#10    1.67      95.89   19.51     0.70
#11    1.37      97.26    9.44     0.57
#12    1.21      98.47   16.94     0.59
#13    1.10      99.57   11.11     3.03
#14    0.44     100.01   26.60     0.18


#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="D", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.23, y=0.39, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)

#View as a Euclidean distance heatmap
dists <- dist(t(counts))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}

heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

dev.off()
## null device 
##           1
#I would like to create a PCA plot and heatmap to reflect the use of batch in the limma model
#Specify model
mod <- model.matrix(~batch)
#Use voom to determine weights for fitting the mean-variance trend for each gene
v <- voom(countsSubQ, mod)
#Fit the linear model for each gene
fit <- lmFit(v)
#Get the residual (i.e. everything except for the batch effect)
newData <- residuals(fit, v)
#Explore data for batch effects
#Compute principal components
s <- makeSVD(newData)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
##    propVar cumPropVar cond.R2 batch.R2
## 1    65.02      65.02   99.34        0
## 2     9.65      74.67   89.05        0
## 3     6.18      80.85   15.30        0
## 4     4.18      85.03   31.59        0
## 5     3.31      88.34   22.22        0
## 6     2.85      91.19   50.41        0
## 7     2.11      93.30   12.60        0
## 8     1.91      95.21   19.87        0
## 9     1.57      96.78    6.37        0
## 10    1.40      98.18   17.29        0
## 11    1.33      99.51   10.67        0
## 12    0.51     100.02   25.31        0
## 13    0.00     100.02    0.00      100
## 14    0.00     100.02    0.00      100
#   propVar cumPropVar cond.R2 batch.R2
#1    65.02      65.02   99.34        0
#2     9.65      74.67   89.05        0
#3     6.18      80.85   15.30        0
#4     4.18      85.03   31.59        0
#5     3.31      88.34   22.22        0
#6     2.85      91.19   50.41        0
#7     2.11      93.30   12.60        0
#8     1.91      95.21   19.87        0
#9     1.57      96.78    6.37        0
#10    1.40      98.18   17.29        0
#11    1.33      99.51   10.67        0
#12    0.51     100.02   25.31        0
#13    0.00     100.02    0.00      100
#14    0.00     100.02    0.00      100
#Plot PC1 vs. PC2, with black outlines and color fills

condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="D", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.48, y=-0.24, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)
#View as a Euclidean distance heatmap
dists <- dist(t(newData))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

#############################
#############################
#DE analysis
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Specify model
mod = model.matrix(~0+condition+batch)
#View mean-variance trend
voom(countsSubQ, mod, plot=TRUE)
## An object of class "EList"
## $E
##                metac.D.1 amast4.D.2 amast24.D.3 amast48.D.4 amast72.D.5
## LmxM.01.0010-1     6.872      5.124       4.686       4.055       4.147
## LmxM.01.0020-1     6.183      6.444       6.524       6.220       6.580
## LmxM.01.0030-1     5.842      5.599       5.498       5.496       6.041
## LmxM.01.0040-1     5.248      5.579       5.340       4.670       4.986
## LmxM.01.0050-1     6.555      6.640       6.931       6.445       6.731
##                metac.E.6 amast4.E.7 amast24.E.8 amast48.E.9 amast72.E.10
## LmxM.01.0010-1     6.862      4.549       3.743       3.998        4.062
## LmxM.01.0020-1     6.435      6.784       7.028       6.728        6.909
## LmxM.01.0030-1     5.414      5.428       5.754       5.858        5.875
## LmxM.01.0040-1     5.578      5.712       5.873       5.459        5.637
## LmxM.01.0050-1     6.487      6.679       6.877       6.861        7.005
##                metac.F.11 amast4.F.12 amast24.F.13 amast48.F.14
## LmxM.01.0010-1      6.479       4.832        4.295        4.253
## LmxM.01.0020-1      6.495       6.449        6.411        6.373
## LmxM.01.0030-1      5.568       5.630        5.681        5.313
## LmxM.01.0040-1      5.368       4.744        5.248        4.985
## LmxM.01.0050-1      6.524       6.647        6.826        6.565
##                amast72.F.15
## LmxM.01.0010-1        3.286
## LmxM.01.0020-1        6.173
## LmxM.01.0030-1        5.695
## LmxM.01.0040-1        5.261
## LmxM.01.0050-1        6.744
## 8305 more rows ...
## 
## $weights
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9]  [,10] [,11]
## [1,] 38.25 21.44 14.88 13.67 11.61 36.74 17.46 12.15 11.18  9.523 36.67
## [2,] 34.43 35.62 36.18 34.89 35.59 36.75 37.69 38.06 37.11 37.662 34.36
## [3,] 28.87 28.23 29.28 28.27 31.66 28.53 27.86 28.94 27.90 31.367 27.46
## [4,] 23.88 23.21 25.02 19.29 22.53 29.79 29.18 30.71 25.49 28.604 23.34
## [5,] 35.90 36.65 37.76 36.49 37.52 36.59 37.28 38.23 37.13 38.027 35.91
##      [,12] [,13] [,14]  [,15]
## [1,] 17.32 12.05 11.09  9.445
## [2,] 35.56 36.12 34.83 35.530
## [3,] 26.81 27.90 26.85 30.459
## [4,] 22.63 24.46 18.79 21.965
## [5,] 36.66 37.76 36.49 37.523
## 8305 more rows ...
## 
## $design
##   conditionmetac conditionamast4 conditionamast24 conditionamast48
## 1              1               0                0                0
## 2              0               1                0                0
## 3              0               0                1                0
## 4              0               0                0                1
## 5              0               0                0                0
##   conditionamast72 batchE batchF
## 1                0      0      0
## 2                0      0      0
## 3                0      0      0
## 4                0      0      0
## 5                1      0      0
## 10 more rows ...
## 
## $targets
##             lib.size
## metac.D.1    7666699
## amast4.D.2   7666679
## amast24.D.3  7666716
## amast48.D.4  7666698
## amast72.D.5  7666711
## 10 more rows ...
#Use voom to transform quantile-normalized count data to log2-counts per million, estimate mean-variance relationship
#and use m-v relationship to computer appropriate observational-level weights
v <- voom(countsSubQ, mod)
#Fit a linear model for each gene using the specified design contained in v
fit <- lmFit(v)
##metac v amast4##
#eBayes finds an F-statistic from the set of t-statistics for that gene
metac.amast4.contr.mat <- makeContrasts(metac_v_amast4=conditionamast4-conditionmetac, levels=v$design)
metac.amast4.fit <- contrasts.fit(fit, metac.amast4.contr.mat)
metac.amast4.eb <- eBayes(metac.amast4.fit)
metac.amast4.topTab <- topTable(metac.amast4.eb, coef="metac_v_amast4", number=nrow(v$E))
#View the list of DE genes
head(metac.amast4.topTab, n=3L)
##                        logFC AveExpr      t   P.Value adj.P.Val     B
## LmxM.31.1680-1        -3.044   5.086 -20.99 2.381e-15 1.978e-11 24.65
## LmxM.30.1450partial-1  2.429   9.485  18.74 2.181e-14 5.010e-11 23.02
## LmxM.29.2850-1        -2.509   6.005 -18.79 2.072e-14 5.010e-11 22.95
#                          logFC  AveExpr         t      P.Value    adj.P.Val        B
#LmxM.31.1680-1        -3.043546 5.085537 -20.98676 2.380772e-15 1.978422e-11 24.65466
#LmxM.30.1450partial-1  2.429366 9.484824  18.73963 2.181083e-14 5.009522e-11 23.02019
metac.amast4.topTab$fold_change <- 2 ^ metac.amast4.topTab$logFC
write.csv(file="csv/lamazonensis_metac_vs_amast4.csv", x=metac.amast4.topTab)

#Limit list to genes with an adjusted p value < 0.05
metac.amast4.sigGenes <- metac.amast4.topTab[metac.amast4.topTab$adj.P.Val <0.05, ]
length(metac.amast4.sigGenes$logFC)
## [1] 3896
#3896
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
metac.amast4.sigGenesFold1 <- subset(metac.amast4.sigGenes, abs(logFC) > 1)
length(metac.amast4.sigGenesFold1$logFC)
## [1] 649
#649
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
metac.amast4.sigGenesFold2 <- subset(metac.amast4.sigGenes, abs(logFC) > 2)
length(metac.amast4.sigGenesFold2$logFC)
## [1] 44
#44

metac.amast4.sigGenes <- metac.amast4.sigGenes[order(-metac.amast4.sigGenes$logFC), ]
#Make an MA plot
sel = metac.amast4.topTab$adj.P.Val < 0.05
top = metac.amast4.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="metac_amast4 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast4 v amast24##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast4.amast24.contr.mat <- makeContrasts(amast4_v_amast24=conditionamast24-conditionamast4, levels=v$design)
amast4.amast24.fit <- contrasts.fit(fit, amast4.amast24.contr.mat)
amast4.amast24.eb <- eBayes(amast4.amast24.fit)
amast4.amast24.topTab <- topTable(amast4.amast24.eb, coef="amast4_v_amast24", number=nrow(v$E))

#View the list of DE genes
head(amast4.amast24.topTab, n=3L)
##                 logFC AveExpr       t   P.Value adj.P.Val      B
## LmxM.09.0060-1  1.543   9.332  10.945 5.039e-10  3.34e-06 12.916
## LmxM.17.0890-1 -1.712   7.707 -10.658 8.040e-10  3.34e-06 12.499
## LmxM.23.1665-1 -1.647   5.517  -9.207 9.855e-09  2.73e-05  9.985
#                   logFC  AveExpr          t      P.Value    adj.P.Val         B
#LmxM.09.0060-1  1.543364 9.332460  10.944657 5.038502e-10 3.340466e-06 12.915671
#LmxM.17.0890-1 -1.712316 7.707216 -10.657815 8.039628e-10 3.340466e-06 12.499147
#LmxM.23.1665-1 -1.646699 5.517480  -9.207189 9.854612e-09 2.729727e-05  9.985394
#Limit list to genes with an adjusted p value < 0.05
amast4.amast24.sigGenes <- amast4.amast24.topTab[amast4.amast24.topTab$adj.P.Val <0.05, ]
length(amast4.amast24.sigGenes$logFC)
## [1] 577
#577
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast4.amast24.sigGenesFold1 <- subset(amast4.amast24.sigGenes, abs(logFC) > 1)
length(amast4.amast24.sigGenesFold1$logFC)
## [1] 104
#104
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast4.amast24.sigGenesFold2 <- subset(amast4.amast24.sigGenes, abs(logFC) > 2)
length(amast4.amast24.sigGenesFold2$logFC)
## [1] 1
#1
amast4.amast24.sigGenes <- amast4.amast24.sigGenes[order(-amast4.amast24.sigGenes$logFC), ]
#Make an MA plot
sel = amast4.amast24.topTab$adj.P.Val < 0.05
top = amast4.amast24.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast4_amast24 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")


##amast24 v amast48##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast24.amast48.contr.mat <- makeContrasts(amast24_v_amast48=conditionamast48-conditionamast24, levels=v$design)
amast24.amast48.fit <- contrasts.fit(fit, amast24.amast48.contr.mat)
amast24.amast48.eb <- eBayes(amast24.amast48.fit)
amast24.amast48.topTab <- topTable(amast24.amast48.eb, coef="amast24_v_amast48", number=nrow(v$E))
#View the list of DE genes
head(amast24.amast48.topTab, n=3L)
##                  logFC AveExpr      t   P.Value adj.P.Val     B
## LmxM.02.0460-1 -1.0270   6.482 -7.943 1.082e-07 0.0008988 4.933
## LmxM.30.1165-1  1.0818   5.463  7.038 6.838e-07 0.0028414 3.130
## LmxM.30.1190-1  0.8976   6.136  5.998 6.518e-06 0.0180550 2.413
#                    logFC  AveExpr         t      P.Value    adj.P.Val        B
#LmxM.02.0460-1 -1.0270442 6.481687 -7.943049 1.081563e-07 0.0008987787 4.933005
#LmxM.30.1165-1  1.0817746 5.463023  7.038413 6.838410e-07 0.0028413593 3.129606
#LmxM.30.1190-1  0.8976157 6.135557  5.998049 6.518059e-06 0.0180550222 2.413080
#Limit list to genes with an adjusted p value < 0.05
amast24.amast48.sigGenes <- amast24.amast48.topTab[amast24.amast48.topTab$adj.P.Val <0.05, ]
length(amast24.amast48.sigGenes$logFC)
## [1] 3
#3
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast24.amast48.sigGenesFold1 <- subset(amast24.amast48.sigGenes, abs(logFC) > 1)
length(amast24.amast48.sigGenesFold1$logFC)
## [1] 2
#2
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast24.amast48.sigGenesFold2 <- subset(amast24.amast48.sigGenes, abs(logFC) > 2)
length(amast24.amast48.sigGenesFold2$logFC)
## [1] 0
#0
amast24.amast48.sigGenes <- amast24.amast48.sigGenes[order(-amast24.amast48.sigGenes$logFC), ]
#Make an MA plot
sel = amast24.amast48.topTab$adj.P.Val < 0.05
top = amast24.amast48.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast24_amast48 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")


##amast48 v amast72##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast48.amast72.contr.mat <- makeContrasts(amast48_v_amast72=conditionamast72-conditionamast48, levels=v$design)
amast48.amast72.fit <- contrasts.fit(fit, amast48.amast72.contr.mat)
amast48.amast72.eb <- eBayes(amast48.amast72.fit)
amast48.amast72.topTab <- topTable(amast48.amast72.eb, coef="amast48_v_amast72", number=nrow(v$E))

#View the list of DE genes
head(amast48.amast72.topTab, n=3L)
##                     logFC AveExpr      t  P.Value adj.P.Val      B
## LmxM.02.0460-1    -0.6433   6.482 -4.585 0.000169    0.9987 -2.970
## LmxM.13.0390-1    -0.4710  12.405 -3.565 0.001878    0.9987 -3.190
## LmxM.08_29.0251-1 -0.7466   6.216 -3.701 0.001365    0.9987 -3.248
#                       logFC   AveExpr         t      P.Value adj.P.Val         B
#LmxM.02.0460-1    -0.6433210  6.481687 -4.584932 0.0001689891 0.9987343 -2.970448
#LmxM.13.0390-1    -0.4709751 12.404965 -3.565121 0.0018782419 0.9987343 -3.190347
#LmxM.08_29.0251-1 -0.7466192  6.215922 -3.700966 0.0013650425 0.9987343 -3.248311
#Limit list to genes with an adjusted p value < 0.05
amast48.amast72.sigGenes <- amast48.amast72.topTab[amast48.amast72.topTab$adj.P.Val <0.05, ]
length(amast48.amast72.sigGenes$logFC)
## [1] 0
#0
#Make an MA plot
sel = amast48.amast72.topTab$adj.P.Val < 0.05
top = amast48.amast72.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast48_amast72 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")
---
title: "L.major/amazonensis 2016:  Parasite Sample Estimation."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: tango
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: cosmo
  toc: true
  toc_float:
   collapsed: false
   smooth_scroll: false
---

<style>
body .main-container {
max-width: 1600px;
}
</style>

```{r options, include=FALSE}
## These are the options I tend to favor
library("hpgltools")
knitr::opts_knit$set(
    progress = TRUE,
    verbose = TRUE,
    width = 90,
    echo = TRUE)
knitr::opts_chunk$set(
    error = TRUE,
    fig.width = 8,
    fig.height = 8,
    dpi = 96)
options(
    digits = 4,
    stringsAsFactors = FALSE,
    knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
set.seed(1)
rmd_file <- "repeat_log_lamazonensis.Rmd"
```

[index.html](index.html)

```{r rendering, include=FALSE, eval=FALSE}
## This block is used to render a document from within it.
rmarkdown::render(rmd_file)

## An extra renderer for pdf output
rmarkdown::render(rmd_file, output_format="pdf_document", output_options=c("skip_html"))
## Or to save/load large Rdata files.
hpgltools:::saveme()
hpgltools:::loadme()
rm(list=ls())
```

# Repeat a previous set of analyses and see where we differ

I am going to intersperse the original comments and code, then compare with my own analyses.

<pre>
#metacyclic to 4-hr amastigote transition and the amastigote samples across timepoints.
#This log is for parasite samples only using the count table restricted to CDS only.

#Count table generation is described in logs:
#lminfectome_dillonl_20141001_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0434-446_452-472_cecilia.log
#lminfectome_dillonl_20141211_TopHat_HTSeq_Count_Table_hg19_Lamazonensis_Lmajor_beads_HPGL0491-510_cecilia.log

#Lmexicana81 (L. amazonensis) samples
#Name           condition       batch
#HPGL0435       metac           D
#HPGL0437       amastLA4        D
#HPGL0440       amastLA24       D
#HPGL0443       amastLA48       D
#HPGL0446       amastLA72       D
#HPGL0454       metac           E
#HPGL0458       amastLA4        E
#HPGL0462       amastLA24       E
#HPGL0466       amastLA48       E
#HPGL0470       amastLA72       E
#HPGL0492       metac           F
#HPGL0496       amastLA4        F
#HPGL0500       amastLA24       F
#HPGL0504       amastLA48       F
#HPGL0508       amastLA72       F
#Count table is:
#20141211_Lmexicana81_434-435_437_440_443_446_453-454_458_462_466_470_491-492_496_500_504_508_CDSonly.count
#Includes headers

#7-28-15
#L. amazonensis
#Create diagnostic plots
#Hector recommended that all diagnostics be done using the rawest data possible. For correlation analyses between samples,
#use countsTable data without filtering out lowly expressed genes and without normalization. Size factor normalization
#will be used to show the distribution of gene expression levels.

#This analysis uses count tables that have not been restricted to _CDS only.
</pre>

```{r repeatme}
library(cbcbSEQ)
library(gplots)
library(matrixStats)
library(siggenes)
library(ReportingTools)
library(hwriter)
library(DESeq2)
library(RColorBrewer)

#Read in counts table
countsTable <- read.table("preprocessing/dillonl/20141211_Lmexicana81_434-435_437_440_443_446_453-454_458_462_466_470_491-492_496_500_504_508_CDSonly.count.xz", header=TRUE)
#Remove id as an actual column, use as row names instead
rownames(countsTable) <- countsTable$id
countsTable <- countsTable[,-1]
#Restrict to samples of interest (exclude procyclic promastigote samples)
countsTable <- countsTable[,-c(1, 7, 13)]
head(countsTable, n=3L)
#               HPGL0435 HPGL0437 HPGL0440 HPGL0443 HPGL0446 HPGL0454 HPGL0458 HPGL0462 HPGL0466 HPGL0470 HPGL0492 HPGL0496 HPGL0500 HPGL0504 HPGL0508
#LmxM.01.0010-1     1910      154       34       38       56     2255      264      151      217      203     1512       51       52       13        8
#LmxM.01.0020-1     1103      370      119      162      277     1639     1153     1519     1315     1345     1528      153      225       62       66
#LmxM.01.0030-1      849      211       60      100      195      789      475      668      735      689      785       88      135       29       48
#Establish metadata for samples
sampleID <- colnames(countsTable)
condition <- rep(c("metac", "amast4", "amast24", "amast48", "amast72"), times=3)
batch <- rep(c("D", "E", "F"), each=5)
#Create condition and batch factor
condition <- factor(condition, levels=c("metac", "amast4", "amast24", "amast48", "amast72"))
batch <- factor(batch, levels=c("D", "E", "F"))
design <- data.frame(sampleID=sampleID, condition=condition, batch=batch)
design
#   sampleID condition batch
#1  HPGL0435     metac     D
#2  HPGL0437    amast4     D
#3  HPGL0440   amast24     D
#4  HPGL0443   amast48     D
#5  HPGL0446   amast72     D
#6  HPGL0454     metac     E
#7  HPGL0458    amast4     E
#8  HPGL0462   amast24     E
#9  HPGL0466   amast48     E
#10 HPGL0470   amast72     E
#11 HPGL0492     metac     F
#12 HPGL0496    amast4     F
#13 HPGL0500   amast24     F
#14 HPGL0504   amast48     F
#15 HPGL0508   amast72     F
colnames(countsTable) <- paste(condition, batch, 1:length(condition), sep=".")

barplot(colSums(countsTable), las=3, ylim=c(0,3e+07))

#Normalize raw counts using DESeq size factors
df <- data.frame(cond=(condition))
dds <- DESeqDataSetFromMatrix(countData=countsTable, colData = df, design = ~ cond)
dds <- estimateSizeFactors(dds)
ncts <- counts(dds, normalized=TRUE)
y <- log(ncts + 1)

#Determine median and quantiles of the size factor normalized, log2 counts
median(y)
#[1] 5.788727
quantile(y)
#       0%       25%       50%       75%      100%
# 0.000000  5.330752  5.788727  6.256596 10.701582
#Determine number of genes per sample with less than log2 of 2 counts (less than 4 counts per mill)
ydf <- as.data.frame(y)
colnames(ydf) <- sampleID

col.nam <- paste(condition, batch, 1:length(condition), sep=".")
#Boxplot of per sample log of size-factor-normalized counts (+ 1)
par(mar=c(10.5,4.5,2,1))
par(oma=c(0,0,0,0))
boxplot(y, names=col.nam, las=3)

#Heatmap of Pearson correlation between samples
#(delete Rowv, Colv, and dendrogram parameters if want samples to sort and show dendrogram)
#Correlation analysis (Pearson is default for cor unless othewise specified)
#Using raw counts
datCor <- cor(countsTable)
heatmap.2(datCor, Rowv=NA, Colv=NA,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          dendrogram="none",
          scale="none",
          trace="none",
          srtCol=45)
#Median pairwise correlation
#Using raw counts
corM <- matrixStats::rowMedians(cor(countsTable))
qs <- quantile(corM,p=c(1,3)/4)
iqr <- diff(qs)
outLimit <- qs[1] - 1.5 * iqr

ylim <- c(pmin(min(corM),outLimit),max(corM))
col <-  ifelse(condition=="amast4", "cornflowerblue",
        ifelse(condition=="amast24", "gold",
        ifelse(condition=="amast48", "green3",
        ifelse(condition=="amast72", "coral", "mediumorchid4"))))

plot(corM, xaxt="n", ylim=ylim, ylab="Median Pairwise Correlation", xlab="", main="", col=col, pch=16, cex=2.2)
axis(side=1,at=seq(along=corM),labels=paste(condition,batch,sep=":"),las=2)
abline(h=outLimit,lty=2)
abline(v=1:length(col.nam), lty=3, col="black")
#Plot without IQR cutoff
#Use filterCounts method to filter for low counts
#Define filterCounts function

filterCounts = function (counts, lib.size = NULL, thresh = 1, minSamples = 2) {
    cpms <- 2^log2CPM(counts, lib.size = lib.size)$y
    keep <- rowSums(cpms > thresh) >= minSamples
    counts <- counts[keep, ]
    counts
}
x <- table(condition)
dim(countsTable)
#[1] 8336   15
counts <- filterCounts(countsTable, thresh=1, minSamples=min(x))
dim(counts)
#[1] 8310   15
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Explore data for batch effects
#Transform data to log2 counts per million
x <- log2CPM(countsSubQ)
#Compute principal components
s <- makeSVD(x$y)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
#   propVar cumPropVar cond.R2 batch.R2
#1    59.12      59.12   96.62     2.72
#2     9.04      68.16   75.86    11.13
#3     6.50      74.66   21.37    38.43
#4     5.03      79.69   14.54    67.85
#5     4.03      83.72    1.61    48.30
#6     3.36      87.08   26.63    17.28
#7     2.85      89.93   18.80     5.67
#8     2.52      92.45   48.05     0.46
#9     1.77      94.22   12.92     3.09
#10    1.67      95.89   19.51     0.70
#11    1.37      97.26    9.44     0.57
#12    1.21      98.47   16.94     0.59
#13    1.10      99.57   11.11     3.03
#14    0.44     100.01   26.60     0.18


#Plot PC1 vs. PC2, with black outlines and color fills
#Save as eps
condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="D", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.23, y=0.39, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)
#View as a Euclidean distance heatmap
dists <- dist(t(counts))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}

heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)
dev.off()
#I would like to create a PCA plot and heatmap to reflect the use of batch in the limma model
#Specify model
mod <- model.matrix(~batch)
#Use voom to determine weights for fitting the mean-variance trend for each gene
v <- voom(countsSubQ, mod)
#Fit the linear model for each gene
fit <- lmFit(v)
#Get the residual (i.e. everything except for the batch effect)
newData <- residuals(fit, v)
#Explore data for batch effects
#Compute principal components
s <- makeSVD(newData)
#Compute variance of each PC and how they correlate with batch and condition
pcRes(s$v, s$d, condition, batch)
#   propVar cumPropVar cond.R2 batch.R2
#1    65.02      65.02   99.34        0
#2     9.65      74.67   89.05        0
#3     6.18      80.85   15.30        0
#4     4.18      85.03   31.59        0
#5     3.31      88.34   22.22        0
#6     2.85      91.19   50.41        0
#7     2.11      93.30   12.60        0
#8     1.91      95.21   19.87        0
#9     1.57      96.78    6.37        0
#10    1.40      98.18   17.29        0
#11    1.33      99.51   10.67        0
#12    0.51     100.02   25.31        0
#13    0.00     100.02    0.00      100
#14    0.00     100.02    0.00      100
#Plot PC1 vs. PC2, with black outlines and color fills

condnum <- as.numeric(condition)
plotPC(s$v,
       s$d,
       col="black",
       pch=ifelse(batch=="D", 25, ifelse(batch=="B", 21, ifelse(batch=="C", 22, ifelse(batch=="E", 24, 23)))),
       bg=ifelse(condnum==1, "mediumorchid4", ifelse(condnum==2, "cornflowerblue", ifelse(condnum==3, "gold", ifelse(condnum==4, "green3", "coral")))), cex=2.6
       )
legend(x=-0.48, y=-0.24, legend=c("metac", "amast4", "amast24", "amast48", "amast72"), pch=22, col=0,
pt.bg=c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral"), pt.cex=2.6, bty="n")
text(s$v[,1], s$v[,2], colnames(countsTable), cex=.7, pos=4)
#View as a Euclidean distance heatmap
dists <- dist(t(newData))
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat) <- with(design, paste(colnames(countsTable)))
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
vec.batch <- rainbow(nlevels(batch), start=0, end=.8)
batch.color <- rep(0, length(batch))
for (i in 1:length(batch)) {
    batch.color[i] <- vec.batch[batch[i]==levels(batch)]
}
vec.condition <- c("mediumorchid4", "cornflowerblue", "gold", "green3", "coral")
condition.color <- rep(0, length(condition))
for (i in 1:length(condition)) {
    condition.color[i] <- vec.condition[condition[i]==levels(condition)]
}
heatmap <- heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(11,11), ColSideColors=condition.color,
RowSideColors=batch.color, key="FALSE", srtCol=45)

#############################
#############################
#DE analysis
#Quantile normalize counts
countsSubQ <- qNorm(counts)
#Specify model
mod = model.matrix(~0+condition+batch)
#View mean-variance trend
voom(countsSubQ, mod, plot=TRUE)
#Use voom to transform quantile-normalized count data to log2-counts per million, estimate mean-variance relationship
#and use m-v relationship to computer appropriate observational-level weights
v <- voom(countsSubQ, mod)
#Fit a linear model for each gene using the specified design contained in v
fit <- lmFit(v)
##metac v amast4##
#eBayes finds an F-statistic from the set of t-statistics for that gene
metac.amast4.contr.mat <- makeContrasts(metac_v_amast4=conditionamast4-conditionmetac, levels=v$design)
metac.amast4.fit <- contrasts.fit(fit, metac.amast4.contr.mat)
metac.amast4.eb <- eBayes(metac.amast4.fit)
metac.amast4.topTab <- topTable(metac.amast4.eb, coef="metac_v_amast4", number=nrow(v$E))
#View the list of DE genes
head(metac.amast4.topTab, n=3L)
#                          logFC  AveExpr         t      P.Value    adj.P.Val        B
#LmxM.31.1680-1        -3.043546 5.085537 -20.98676 2.380772e-15 1.978422e-11 24.65466
#LmxM.30.1450partial-1  2.429366 9.484824  18.73963 2.181083e-14 5.009522e-11 23.02019
metac.amast4.topTab$fold_change <- 2 ^ metac.amast4.topTab$logFC
write.csv(file="csv/lamazonensis_metac_vs_amast4.csv", x=metac.amast4.topTab)

#Limit list to genes with an adjusted p value < 0.05
metac.amast4.sigGenes <- metac.amast4.topTab[metac.amast4.topTab$adj.P.Val <0.05, ]
length(metac.amast4.sigGenes$logFC)
#3896
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
metac.amast4.sigGenesFold1 <- subset(metac.amast4.sigGenes, abs(logFC) > 1)
length(metac.amast4.sigGenesFold1$logFC)
#649
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
metac.amast4.sigGenesFold2 <- subset(metac.amast4.sigGenes, abs(logFC) > 2)
length(metac.amast4.sigGenesFold2$logFC)
#44

metac.amast4.sigGenes <- metac.amast4.sigGenes[order(-metac.amast4.sigGenes$logFC), ]
#Make an MA plot
sel = metac.amast4.topTab$adj.P.Val < 0.05
top = metac.amast4.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="metac_amast4 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")

##amast4 v amast24##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast4.amast24.contr.mat <- makeContrasts(amast4_v_amast24=conditionamast24-conditionamast4, levels=v$design)
amast4.amast24.fit <- contrasts.fit(fit, amast4.amast24.contr.mat)
amast4.amast24.eb <- eBayes(amast4.amast24.fit)
amast4.amast24.topTab <- topTable(amast4.amast24.eb, coef="amast4_v_amast24", number=nrow(v$E))

#View the list of DE genes
head(amast4.amast24.topTab, n=3L)
#                   logFC  AveExpr          t      P.Value    adj.P.Val         B
#LmxM.09.0060-1  1.543364 9.332460  10.944657 5.038502e-10 3.340466e-06 12.915671
#LmxM.17.0890-1 -1.712316 7.707216 -10.657815 8.039628e-10 3.340466e-06 12.499147
#LmxM.23.1665-1 -1.646699 5.517480  -9.207189 9.854612e-09 2.729727e-05  9.985394
#Limit list to genes with an adjusted p value < 0.05
amast4.amast24.sigGenes <- amast4.amast24.topTab[amast4.amast24.topTab$adj.P.Val <0.05, ]
length(amast4.amast24.sigGenes$logFC)
#577
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast4.amast24.sigGenesFold1 <- subset(amast4.amast24.sigGenes, abs(logFC) > 1)
length(amast4.amast24.sigGenesFold1$logFC)
#104
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast4.amast24.sigGenesFold2 <- subset(amast4.amast24.sigGenes, abs(logFC) > 2)
length(amast4.amast24.sigGenesFold2$logFC)
#1
amast4.amast24.sigGenes <- amast4.amast24.sigGenes[order(-amast4.amast24.sigGenes$logFC), ]
#Make an MA plot
sel = amast4.amast24.topTab$adj.P.Val < 0.05
top = amast4.amast24.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast4_amast24 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")


##amast24 v amast48##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast24.amast48.contr.mat <- makeContrasts(amast24_v_amast48=conditionamast48-conditionamast24, levels=v$design)
amast24.amast48.fit <- contrasts.fit(fit, amast24.amast48.contr.mat)
amast24.amast48.eb <- eBayes(amast24.amast48.fit)
amast24.amast48.topTab <- topTable(amast24.amast48.eb, coef="amast24_v_amast48", number=nrow(v$E))
#View the list of DE genes
head(amast24.amast48.topTab, n=3L)
#                    logFC  AveExpr         t      P.Value    adj.P.Val        B
#LmxM.02.0460-1 -1.0270442 6.481687 -7.943049 1.081563e-07 0.0008987787 4.933005
#LmxM.30.1165-1  1.0817746 5.463023  7.038413 6.838410e-07 0.0028413593 3.129606
#LmxM.30.1190-1  0.8976157 6.135557  5.998049 6.518059e-06 0.0180550222 2.413080
#Limit list to genes with an adjusted p value < 0.05
amast24.amast48.sigGenes <- amast24.amast48.topTab[amast24.amast48.topTab$adj.P.Val <0.05, ]
length(amast24.amast48.sigGenes$logFC)
#3
#Filter out rows with less than 2-fold change (log2 fold change of > 1)
amast24.amast48.sigGenesFold1 <- subset(amast24.amast48.sigGenes, abs(logFC) > 1)
length(amast24.amast48.sigGenesFold1$logFC)
#2
#Filter out rows with less than 4-fold change (log2 fold change of > 2)
amast24.amast48.sigGenesFold2 <- subset(amast24.amast48.sigGenes, abs(logFC) > 2)
length(amast24.amast48.sigGenesFold2$logFC)
#0
amast24.amast48.sigGenes <- amast24.amast48.sigGenes[order(-amast24.amast48.sigGenes$logFC), ]
#Make an MA plot
sel = amast24.amast48.topTab$adj.P.Val < 0.05
top = amast24.amast48.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast24_amast48 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")


##amast48 v amast72##
#eBayes finds an F-statistic from the set of t-statistics for that gene
amast48.amast72.contr.mat <- makeContrasts(amast48_v_amast72=conditionamast72-conditionamast48, levels=v$design)
amast48.amast72.fit <- contrasts.fit(fit, amast48.amast72.contr.mat)
amast48.amast72.eb <- eBayes(amast48.amast72.fit)
amast48.amast72.topTab <- topTable(amast48.amast72.eb, coef="amast48_v_amast72", number=nrow(v$E))

#View the list of DE genes
head(amast48.amast72.topTab, n=3L)
#                       logFC   AveExpr         t      P.Value adj.P.Val         B
#LmxM.02.0460-1    -0.6433210  6.481687 -4.584932 0.0001689891 0.9987343 -2.970448
#LmxM.13.0390-1    -0.4709751 12.404965 -3.565121 0.0018782419 0.9987343 -3.190347
#LmxM.08_29.0251-1 -0.7466192  6.215922 -3.700966 0.0013650425 0.9987343 -3.248311
#Limit list to genes with an adjusted p value < 0.05
amast48.amast72.sigGenes <- amast48.amast72.topTab[amast48.amast72.topTab$adj.P.Val <0.05, ]
length(amast48.amast72.sigGenes$logFC)
#0
#Make an MA plot
sel = amast48.amast72.topTab$adj.P.Val < 0.05
top = amast48.amast72.topTab
sub = paste("No. of sig. genes: ", sum(sel),"/",length(sel))
cpm = v$E
plot(rowMeans(cpm[rownames(top),]), top$logFC, pch=16, cex=0.5,col="darkgrey",
        main="amast48_amast72 model batch adjusted",
        ylab="log FC", xlab="Average Expression",
        ylim=c(-3,3), sub=sub)
points(rowMeans(cpm[rownames(top),])[sel], top$logFC[sel], col="red", cex=0.5)
abline(h=c(-1,0,1), col="red")
```
