At this point, we appear to have decided to use combat adjusted data. The data structure containing that data is linearfit_combat_cond
contrast_matrix = makeContrasts(rFE_v_dFE = wt_conditionswt_dfe - wt_conditionswt_rfe,
levels=linearfit_combat_condbatch$design)
combat_condbatch_contrasts = contrasts.fit(linearfit_combat_condbatch, contrast_matrix)
combat_condbatch_comparison = eBayes(combat_condbatch_contrasts)
combat_condbatch_table = topTable(combat_condbatch_comparison,
coef="rFE_v_dFE", number=nrow(linear_fit_cond_batch$E), sort.by="logFC")
write.csv(combat_condbatch_table, file="txt/combat-condbatch_adjustment_table.csv")
Now make a MA plot of the voom adjusted data – which actually was generated before
combat_ma_plot = my_ma_plot(meanvar_combat_condbatch$E, combat_condbatch_table)
combat_ma_plot
write_plots("ma_plot", combat_ma_plot)
## Warning: semi-transparency is not supported on this device: reported only
## once per page
## Cairo
## 2
pvalue_histogram = ggplot(combat_condbatch_table, aes(x=P.Value)) +
xlab("P value") +
ylab("Number of genes observed") +
geom_histogram(aes(y=..density..), binwidth=0.01, colour="black", fill="darkgray") +
geom_density(alpha=0.2, fill="#EEEEEE") +
geom_vline(aes(xintercept=mean(P.Value, na.rm=T)), color="red", linetype="dashed", size=1) +
theme_bw()
pvalue_histogram
## Warning: position_stack requires constant width: output may be incorrect
write_plots("pvalue_histogram", pvalue_histogram)
## Warning: semi-transparency is not supported on this device: reported only
## once per page
## Cairo
## 2
I think the voom/limma output before performing subtraction is interesting and has some potentially neat stuff to plot. I will see what I can see.
gene_fit = linearfit_combat_condbatch ## I am lazy, I had this named as such earlier
play_data = data.frame(linearfit_combat_condbatch$coefficients[,c(1,2)])
colnames(play_data) = c("deplete","replete")
cor.test(play_data[,1], play_data[,2])
##
## Pearson's product-moment correlation
##
## data: play_data[, 1] and play_data[, 2]
## t = 721.5, df = 8060, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9920 0.9927
## sample estimates:
## cor
## 0.9923
require.auto("robust")
linear_model = lmrob(formula=deplete ~ replete, data=play_data, method="SMDM")
linear_model_summary = summary(linear_model)
linear_model_weights = weights(linear_model, type="robustness")
linear_model_summary
##
## Call:
## lmrob(formula = deplete ~ replete, data = play_data, method = "SMDM")
## \--> method = "SMDM"
## Residuals:
## Min 1Q Median 3Q Max
## -2.54591 -0.07803 0.00215 0.07895 1.75475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03464 0.00717 4.83 1.4e-06 ***
## replete 0.99508 0.00110 905.08 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.125
## Multiple R-squared: 0.99, Adjusted R-squared: 0.99
## Convergence in 7 IRWLS iterations
##
## Robustness weights:
## 23 observations c(122,248,618,621,628,779,1163,2745,3220,3221,3537,3805,5080,5122,5413,5416,5417,5611,5612,6122,6382,6952,7586)
## are outliers with |weight| <= 1.8e-06 ( < 1.2e-05);
## 5730 weights are ~= 1. The remaining 2309 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0003 0.7390 0.9050 0.8170 0.9740 0.9990
## Algorithmic parameters:
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4 bb tuning.psi1
## -5.0e-01 1.5e+00 NA 5.0e-01 5.0e-01 -5.0e-01
## tuning.psi2 tuning.psi3 tuning.psi4 refine.tol rel.tol solve.tol
## 1.5e+00 9.5e-01 NA 1.0e-07 1.0e-07 1.0e-07
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd numpoints
## 200 0 1000 0 10
## fast.s.large.n
## 2000
## psi subsampling cov
## "lqq" "nonsingular" ".vcov.w"
## seed : int(0)
replete_intercept = coef(linear_model_summary)[1]
replete_slope = coef(linear_model_summary)[2]
replete_median = summary(play_data$replete)["Median"]
deplete_median = summary(play_data$deplete)["Median"]
replete_mad = mad(play_data$replete)
deplete_mad = mad(play_data$deplete)
replete_deplete = ggplot(play_data, aes(x=deplete, y=replete)) +
xlab("Expression of amazonensis depleted of iron") +
ylab("Expression of amazonensis unperturbed") +
geom_vline(color="grey", xintercept=(deplete_median - deplete_mad), size=1.2) +
geom_vline(color="grey", xintercept=(deplete_median + deplete_mad), size=1.2) +
geom_vline(color="darkgrey", xintercept=deplete_median, size=1.5) +
geom_hline(color="grey", yintercept=(replete_median - replete_mad), size=1.2) +
geom_hline(color="grey", yintercept=(replete_median + replete_mad), size=1.2) +
geom_hline(color="darkgrey", yintercept=replete_median, size=1.5) +
geom_abline(colour="grey", slope=replete_slope, intercept=replete_intercept) +
geom_point(alpha=0.5, size=2,
colour=hsv(linear_model_weights * 9/20,
linear_model_weights/20 + 19/20,
(1.0 - linear_model_weights))) +
theme(legend.position="none")
replete_deplete
write_plots("replete_deplete_comparison", replete_deplete)
## Warning: semi-transparency is not supported on this device: reported only
## once per page
## Cairo
## 2
It is also possible to plot histograms of the replete vs deplete data and see if there is a difference in overall expression level. There is not.
require.auto("plyr")
play_deplete = data.frame(expression=play_data[,c(1)], cond="deplete")
play_replete = data.frame(expression=play_data[,c(2)], cond="replete")
play_both = rbind(play_deplete, play_replete)
play_cdf = ddply(play_both, "cond", summarise, rating.mean=mean(expression, na.rm=TRUE))
cor.test(play_deplete$expression, play_replete$expression)
##
## Pearson's product-moment correlation
##
## data: play_deplete$expression and play_replete$expression
## t = 721.5, df = 8060, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9920 0.9927
## sample estimates:
## cor
## 0.9923
cor.test(play_deplete$expression, play_replete$expression, method="spearman")
##
## Spearman's rank correlation rho
##
## data: play_deplete$expression and play_replete$expression
## S = 785224932, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.991
t.test(play_deplete$expression, play_replete$expression)
##
## Welch Two Sample t-test
##
## data: play_deplete$expression and play_replete$expression
## t = -0.0129, df = 16122, p-value = 0.9897
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04064 0.04011
## sample estimates:
## mean of x mean of y
## 6.384 6.385
ggplot(play_both, aes(x=expression, fill=cond)) +
geom_histogram(aes(y=..density..), binwidth=0.05, alpha=0.4, position="identity") +
xlab("Expression") +
ylab("Number of genes observed") +
geom_density(alpha=0.6) +
geom_vline(data=play_cdf, aes(xintercept=rating.mean, colour=cond), linetype="dashed", size=0.75)
gene_fit also has standard deviations by condition and batch I was thinking that if there is some systematic difference in the two data sets, then plotting a histogram of the standard deviations will show it. As the following plot shows, this is not the case – which I think is a good thing(TM).
stdev_play = as.data.frame(gene_fit$stdev.unscaled)[,c(1,2)]
colnames(stdev_play) = c("deplete","replete")
stdev_deplete = data.frame(stdev=stdev_play[,c(1)] * 100, cond="deplete")
stdev_replete = data.frame(stdev=stdev_play[,c(2)] * 100, cond="replete")
stdev_both = rbind(stdev_deplete, stdev_replete)
stdev_cdf = ddply(stdev_both, "cond", summarise, rating.mean=mean(stdev, na.rm=TRUE))
cor.test(stdev_deplete$stdev, stdev_replete$stdev)
##
## Pearson's product-moment correlation
##
## data: stdev_deplete$stdev and stdev_replete$stdev
## t = 1366, df = 8060, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9978 0.9979
## sample estimates:
## cor
## 0.9978
t.test(stdev_deplete$stdev, stdev_replete$stdev)
##
## Welch Two Sample t-test
##
## data: stdev_deplete$stdev and stdev_replete$stdev
## t = 0.0353, df = 16122, p-value = 0.9719
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04559 0.04726
## sample estimates:
## mean of x mean of y
## 5.726 5.725
ggplot(stdev_both, aes(x=stdev, fill=cond)) +
geom_histogram(aes(y=..density..), binwidth=0.05, alpha=0.4, position="identity") +
xlab("Standard deviation") +
ylab("Number of genes observed") +
geom_density(alpha=0.6) +
geom_vline(data=stdev_cdf, aes(xintercept=rating.mean, colour=cond), linetype="dashed", size=0.75)
fold_change = 2^combat_condbatch_table$logFC
fold_change_hr = fold_change
fold_change_hr[which(fold_change_hr < 1)] = -1/fold_change_hr[which(fold_change_hr < 1)]
E1=2 * combat_condbatch_table$AveExpr / (1 + fold_change)
E2=fold_change * E1
## cont1= A - B <=> B ir ref
all_genes_ids = rownames(combat_condbatch_table)
res=cbind(id=all_genes_ids,E1,E2,
AveExpr=combat_condbatch_table$AveExpr,fold_change,fold_change_hr,
combat_condbatch_table[,c("logFC","P.Value","adj.P.Val")])
names(res)[match("E1", names(res))] = "wt_rfe"
names(res)[match("E2", names(res))] = "wt_dfe"
resAnot = annotate_data(res,"reference/Lmexicana_TriTrypDB-4.2.gff.gz")
write.csv(resAnot, "txt/annotation_information.csv", row.names=FALSE)
subset_annotations = resAnot
subset_annotations = subset(subset_annotations, as.numeric(adj.P.Val) <= 0.05)
write.csv(subset_annotations, "txt/subset_annotation_information.csv", row.names=FALSE)
yay!