Final DE comparisons

At this point, we appear to have decided to use combat adjusted data. The data structure containing that data is linearfit_combat_cond

contrast_matrix = makeContrasts(rFE_v_dFE = wt_conditionswt_dfe - wt_conditionswt_rfe,
    levels=linearfit_combat_condbatch$design)
combat_condbatch_contrasts = contrasts.fit(linearfit_combat_condbatch, contrast_matrix)
combat_condbatch_comparison = eBayes(combat_condbatch_contrasts)
combat_condbatch_table = topTable(combat_condbatch_comparison,
    coef="rFE_v_dFE", number=nrow(linear_fit_cond_batch$E), sort.by="logFC")
write.csv(combat_condbatch_table, file="txt/combat-condbatch_adjustment_table.csv")

Now make a MA plot of the voom adjusted data – which actually was generated before

combat_ma_plot = my_ma_plot(meanvar_combat_condbatch$E, combat_condbatch_table)
combat_ma_plot

plot of chunk ma_plot

write_plots("ma_plot", combat_ma_plot)

## Warning: semi-transparency is not supported on this device: reported only
## once per page

## Cairo 
##     2

pvalue_histogram = ggplot(combat_condbatch_table, aes(x=P.Value)) +
    xlab("P value") +
    ylab("Number of genes observed") +
    geom_histogram(aes(y=..density..), binwidth=0.01, colour="black", fill="darkgray") +
    geom_density(alpha=0.2, fill="#EEEEEE") +
    geom_vline(aes(xintercept=mean(P.Value, na.rm=T)), color="red", linetype="dashed", size=1) +
    theme_bw()
pvalue_histogram

## Warning: position_stack requires constant width: output may be incorrect

plot of chunk ma_plot

write_plots("pvalue_histogram", pvalue_histogram)

## Warning: semi-transparency is not supported on this device: reported only
## once per page

## Cairo 
##     2

Play with voom output

I think the voom/limma output before performing subtraction is interesting and has some potentially neat stuff to plot. I will see what I can see.

gene_fit = linearfit_combat_condbatch ## I am lazy, I had this named as such earlier
play_data = data.frame(linearfit_combat_condbatch$coefficients[,c(1,2)])
colnames(play_data) = c("deplete","replete")
cor.test(play_data[,1], play_data[,2])

## 
##  Pearson's product-moment correlation
## 
## data:  play_data[, 1] and play_data[, 2]
## t = 721.5, df = 8060, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9920 0.9927
## sample estimates:
##    cor 
## 0.9923

require.auto("robust")
linear_model = lmrob(formula=deplete ~ replete, data=play_data, method="SMDM")
linear_model_summary = summary(linear_model)
linear_model_weights = weights(linear_model, type="robustness")
linear_model_summary

## 
## Call:
## lmrob(formula = deplete ~ replete, data = play_data, method = "SMDM")
##  \--> method = "SMDM"
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.54591 -0.07803  0.00215  0.07895  1.75475 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03464    0.00717    4.83  1.4e-06 ***
## replete      0.99508    0.00110  905.08  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 0.125 
## Multiple R-squared:  0.99,   Adjusted R-squared:  0.99 
## Convergence in 7 IRWLS iterations
## 
## Robustness weights: 
##  23 observations c(122,248,618,621,628,779,1163,2745,3220,3221,3537,3805,5080,5122,5413,5416,5417,5611,5612,6122,6382,6952,7586)
##   are outliers with |weight| <= 1.8e-06 ( < 1.2e-05); 
##  5730 weights are ~= 1. The remaining 2309 ones are summarized as
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0003  0.7390  0.9050  0.8170  0.9740  0.9990 
## Algorithmic parameters: 
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4          bb tuning.psi1 
##    -5.0e-01     1.5e+00          NA     5.0e-01     5.0e-01    -5.0e-01 
## tuning.psi2 tuning.psi3 tuning.psi4  refine.tol     rel.tol   solve.tol 
##     1.5e+00     9.5e-01          NA     1.0e-07     1.0e-07     1.0e-07 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd      numpoints 
##            200              0           1000              0             10 
## fast.s.large.n 
##           2000 
##           psi   subsampling           cov 
##         "lqq" "nonsingular"     ".vcov.w" 
## seed : int(0)

replete_intercept = coef(linear_model_summary)[1]
replete_slope = coef(linear_model_summary)[2]
replete_median = summary(play_data$replete)["Median"]
deplete_median = summary(play_data$deplete)["Median"]
replete_mad = mad(play_data$replete)
deplete_mad = mad(play_data$deplete)
replete_deplete = ggplot(play_data, aes(x=deplete, y=replete)) +
    xlab("Expression of amazonensis depleted of iron") +
    ylab("Expression of amazonensis unperturbed") +
    geom_vline(color="grey", xintercept=(deplete_median - deplete_mad), size=1.2) +
    geom_vline(color="grey", xintercept=(deplete_median + deplete_mad), size=1.2) +
    geom_vline(color="darkgrey", xintercept=deplete_median, size=1.5) +
    geom_hline(color="grey", yintercept=(replete_median - replete_mad), size=1.2) +
    geom_hline(color="grey", yintercept=(replete_median + replete_mad), size=1.2) +
    geom_hline(color="darkgrey", yintercept=replete_median, size=1.5) +
    geom_abline(colour="grey", slope=replete_slope, intercept=replete_intercept) +
    geom_point(alpha=0.5, size=2,
               colour=hsv(linear_model_weights * 9/20,
                   linear_model_weights/20 + 19/20,
                   (1.0 - linear_model_weights))) +
    theme(legend.position="none")
replete_deplete

plot of chunk play_voom

write_plots("replete_deplete_comparison", replete_deplete)

## Warning: semi-transparency is not supported on this device: reported only
## once per page

## Cairo 
##     2

It is also possible to plot histograms of the replete vs deplete data and see if there is a difference in overall expression level. There is not.

require.auto("plyr")
play_deplete = data.frame(expression=play_data[,c(1)], cond="deplete")
play_replete = data.frame(expression=play_data[,c(2)], cond="replete")
play_both = rbind(play_deplete, play_replete)
play_cdf = ddply(play_both, "cond", summarise, rating.mean=mean(expression, na.rm=TRUE))
cor.test(play_deplete$expression, play_replete$expression)

## 
##  Pearson's product-moment correlation
## 
## data:  play_deplete$expression and play_replete$expression
## t = 721.5, df = 8060, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9920 0.9927
## sample estimates:
##    cor 
## 0.9923

cor.test(play_deplete$expression, play_replete$expression, method="spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  play_deplete$expression and play_replete$expression
## S = 785224932, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##   rho 
## 0.991

t.test(play_deplete$expression, play_replete$expression)

## 
##  Welch Two Sample t-test
## 
## data:  play_deplete$expression and play_replete$expression
## t = -0.0129, df = 16122, p-value = 0.9897
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04064  0.04011
## sample estimates:
## mean of x mean of y 
##     6.384     6.385

ggplot(play_both, aes(x=expression, fill=cond)) +
    geom_histogram(aes(y=..density..), binwidth=0.05, alpha=0.4, position="identity") +
    xlab("Expression") +
    ylab("Number of genes observed") +
    geom_density(alpha=0.6) +
    geom_vline(data=play_cdf, aes(xintercept=rating.mean,  colour=cond), linetype="dashed", size=0.75)

plot of chunk expression_histogram

Check out standard deviations

gene_fit also has standard deviations by condition and batch I was thinking that if there is some systematic difference in the two data sets, then plotting a histogram of the standard deviations will show it. As the following plot shows, this is not the case – which I think is a good thing(TM).

stdev_play = as.data.frame(gene_fit$stdev.unscaled)[,c(1,2)]
colnames(stdev_play) = c("deplete","replete")
stdev_deplete = data.frame(stdev=stdev_play[,c(1)] * 100, cond="deplete")
stdev_replete = data.frame(stdev=stdev_play[,c(2)] * 100, cond="replete")
stdev_both = rbind(stdev_deplete, stdev_replete)
stdev_cdf = ddply(stdev_both, "cond", summarise, rating.mean=mean(stdev, na.rm=TRUE))
cor.test(stdev_deplete$stdev, stdev_replete$stdev)

## 
##  Pearson's product-moment correlation
## 
## data:  stdev_deplete$stdev and stdev_replete$stdev
## t = 1366, df = 8060, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9978 0.9979
## sample estimates:
##    cor 
## 0.9978

t.test(stdev_deplete$stdev, stdev_replete$stdev)

## 
##  Welch Two Sample t-test
## 
## data:  stdev_deplete$stdev and stdev_replete$stdev
## t = 0.0353, df = 16122, p-value = 0.9719
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04559  0.04726
## sample estimates:
## mean of x mean of y 
##     5.726     5.725

ggplot(stdev_both, aes(x=stdev, fill=cond)) +
    geom_histogram(aes(y=..density..), binwidth=0.05, alpha=0.4, position="identity") +
    xlab("Standard deviation") +
    ylab("Number of genes observed") +
    geom_density(alpha=0.6) +
    geom_vline(data=stdev_cdf, aes(xintercept=rating.mean,  colour=cond), linetype="dashed", size=0.75)

plot of chunk stdev_play

Write annotated table

fold_change = 2^combat_condbatch_table$logFC
fold_change_hr = fold_change
fold_change_hr[which(fold_change_hr < 1)] = -1/fold_change_hr[which(fold_change_hr < 1)]
E1=2 * combat_condbatch_table$AveExpr / (1 + fold_change)
E2=fold_change * E1
## cont1= A - B <=> B ir ref
all_genes_ids = rownames(combat_condbatch_table)

res=cbind(id=all_genes_ids,E1,E2,
    AveExpr=combat_condbatch_table$AveExpr,fold_change,fold_change_hr,
    combat_condbatch_table[,c("logFC","P.Value","adj.P.Val")])
names(res)[match("E1", names(res))] = "wt_rfe"
names(res)[match("E2", names(res))] = "wt_dfe"

resAnot = annotate_data(res,"reference/Lmexicana_TriTrypDB-4.2.gff.gz")
write.csv(resAnot, "txt/annotation_information.csv", row.names=FALSE)
subset_annotations = resAnot
subset_annotations = subset(subset_annotations, as.numeric(adj.P.Val) <= 0.05)
write.csv(subset_annotations, "txt/subset_annotation_information.csv", row.names=FALSE)

Save data

yay!