index.html annotation.html sample_estimations.html
In this document, we will perform a series of differential expression analyses using the data which survived sample estimation.
My original work in this context was to perform all possible contrasts using the entire data set. After discussion with Najib and Santuza, it was initially agreed that we would use a smaller subset in order to evaluate the methods for DE along with batch effects etc.
## cl_all_kexptv1 comes from sample_estimations when we removed an
## outlier sample (hpgl0490).
cl_all_filt <- sm(normalize_expt(cl_all_kexptv1, filter="cbcb"))
cl_all_de_sva <- sm(all_pairwise(cl_all_filt, model_batch="sva",
which_voom="limma",
edger_method="long"))
I am going to split these results into 3 categories: clbrener analyses over time, cl14 over time, comparisons of the two strains.
clb_times_keepers <- list(
"clbr_tryp_to_a60" = c("CLBr.A60","CLBr.Tryp"),
## "clbr_tryp_to_a96" = c("CLBr.A96","CLBr.Tryp"),
"clbr_a60_to_a96" = c("CLBr.A96","CLBr.A60"),
"clbr_a96_to_tryp" = c("CLBr.Tryp","CLBr.A96"))
cl14_times_keepers <- list(
"cl14_tryp_to_a60" = c("CL14.A60","CL14.Tryp"),
## "cl14_tryp_to_a96" = c("CL14.A96","CL14.Tryp"),
"cl14_a60_to_a96" = c("CL14.A96","CL14.A60"),
"cl14_a96_to_tryp" = c("CL14.Tryp","CL14.A96"))
strains_keepers <- list(
"a60_clbr_over_cl14" = c("CLBr.A60", "CL14.A60"),
"a96_clbr_over_cl14" = c("CLBr.A96", "CL14.A96"),
"tryp_clbr_over_cl14" = c("CLBr.Tryp", "CL14.Tryp"))
strain_comparisons <- sm(combine_de_tables(cl_all_de_sva,
excel=paste0("excel/strain_comparisons-v",
ver, ".xlsx"),
keepers=strains_keepers))
strain_only_limma <- sm(combine_de_tables(cl_all_de_sva,
excel=paste0("excel/strain_comparison_limma-v",
ver, ".xlsx"),
keepers=strains_keepers,
include_basic=FALSE,
include_deseq=FALSE,
include_edger=FALSE))
clb_compare_times <- sm(combine_de_tables(cl_all_de_sva,
excel=paste0("excel/clb_compare_times-v",
ver, ".xlsx"),
keepers=clb_times_keepers))
clb_only_limma <- sm(combine_de_tables(cl_all_de_sva,
excel=paste0("excel/clb_compare_times_limma-v",
ver, ".xlsx"),
keepers=clb_times_keepers,
include_basic=FALSE,
include_deseq=FALSE,
include_edger=FALSE))
cl14_compare_times <- sm(combine_de_tables(cl_all_de_sva,
excel=paste0("excel/cl14_compare_times-v",
ver, ".xlsx"),
keepers=cl14_times_keepers))
cl14_only_limma <- sm(combine_de_tables(cl_all_de_sva,
excel=paste0("excel/cl14_compare_times_limma-v",
ver, ".xlsx"),
keepers=cl14_times_keepers,
include_basic=FALSE,
include_deseq=FALSE,
include_edger=FALSE))
Hmm I think the goal laid out by Najib is to have some plots which show in a more concise fashion the changes over time in these specific gene families and be able to compare those differences from CL14 to CLBrener.
I messed up the semantic copy number filter with this.
## In this case, I want 'random' to be a sampling of all data
extract_families_clbr <- sm(semantic_copynumber_extract(clb_compare_times,
semantic_column="genedescription"))
extract_families_cl14 <- sm(semantic_copynumber_extract(cl14_compare_times,
semantic_column="genedescription"))
dgf_df <- data.frame(
"clbr_a60" = extract_families_clbr$data$clbr_a60_to_a96$DGF$limma_logfc,
"cl14_a60" = extract_families_cl14$data$cl14_a60_to_a96$DGF$limma_logfc,
"clbr_a96" = extract_families_clbr$data$clbr_a96_to_tryp$DGF$limma_logfc,
"cl14_a96" = extract_families_cl14$data$cl14_a96_to_tryp$DGF$limma_logfc,
"clbr_tryp" = extract_families_clbr$data$clbr_tryp_to_a60$DGF$limma_logfc,
"cl14_tryp" = extract_families_cl14$data$cl14_tryp_to_a60$DGF$limma_logfc
)
masp_df <- data.frame(
"clbr_a60" = extract_families_clbr$data$clbr_a60_to_a96$MASP$limma_logfc,
"cl14_a60" = extract_families_cl14$data$cl14_a60_to_a96$MASP$limma_logfc,
"clbr_a96" = extract_families_clbr$data$clbr_a96_to_tryp$MASP$limma_logfc,
"cl14_a96" = extract_families_cl14$data$cl14_a96_to_tryp$MASP$limma_logfc,
"clbr_tryp" = extract_families_clbr$data$clbr_tryp_to_a60$MASP$limma_logfc,
"cl14_tryp" = extract_families_cl14$data$cl14_tryp_to_a60$MASP$limma_logfc
)
mucin_df <- data.frame(
"clbr_a60" = extract_families_clbr$data$clbr_a60_to_a96$mucin$limma_logfc,
"cl14_a60" = extract_families_cl14$data$cl14_a60_to_a96$mucin$limma_logfc,
"clbr_a96" = extract_families_clbr$data$clbr_a96_to_tryp$mucin$limma_logfc,
"cl14_a96" = extract_families_cl14$data$cl14_a96_to_tryp$mucin$limma_logfc,
"clbr_tryp" = extract_families_clbr$data$clbr_tryp_to_a60$mucin$limma_logfc,
"cl14_tryp" = extract_families_cl14$data$cl14_tryp_to_a60$mucin$limma_logfc
)
sialidase_df <- data.frame(
"clbr_a60" = extract_families_clbr$data$clbr_a60_to_a96$sialidase$limma_logfc,
"cl14_a60" = extract_families_cl14$data$cl14_a60_to_a96$sialidase$limma_logfc,
"clbr_a96" = extract_families_clbr$data$clbr_a96_to_tryp$sialidase$limma_logfc,
"cl14_a96" = extract_families_cl14$data$cl14_a96_to_tryp$sialidase$limma_logfc,
"clbr_tryp" = extract_families_clbr$data$clbr_tryp_to_a60$sialidase$limma_logfc,
"cl14_tryp" = extract_families_cl14$data$cl14_tryp_to_a60$sialidase$limma_logfc
)
rhs_df <- data.frame(
"clbr_a60" = extract_families_clbr$data$clbr_a60_to_a96$RHS$limma_logfc,
"cl14_a60" = extract_families_cl14$data$cl14_a60_to_a96$RHS$limma_logfc,
"clbr_a96" = extract_families_clbr$data$clbr_a96_to_tryp$RHS$limma_logfc,
"cl14_a96" = extract_families_cl14$data$cl14_a96_to_tryp$RHS$limma_logfc,
"clbr_tryp" = extract_families_clbr$data$clbr_tryp_to_a60$RHS$limma_logfc,
"cl14_tryp" = extract_families_cl14$data$cl14_tryp_to_a60$RHS$limma_logfc
)
gp63_df <- data.frame(
"clbr_a60" = extract_families_clbr$data$clbr_a60_to_a96$GP63$limma_logfc,
"cl14_a60" = extract_families_cl14$data$cl14_a60_to_a96$GP63$limma_logfc,
"clbr_a96" = extract_families_clbr$data$clbr_a96_to_tryp$GP63$limma_logfc,
"cl14_a96" = extract_families_cl14$data$cl14_a96_to_tryp$GP63$limma_logfc,
"clbr_tryp" = extract_families_clbr$data$clbr_tryp_to_a60$GP63$limma_logfc,
"cl14_tryp" = extract_families_cl14$data$cl14_tryp_to_a60$GP63$limma_logfc
)
dgf_box <- plot_boxplot(dgf_df)
dgf_box
dgf_bean <- beanplot::beanplot(dgf_df, method="jitter")
masp_box <- plot_boxplot(masp_df)
masp_box
masp_bean <- beanplot::beanplot(masp_df, method="jitter")
mucin_box <- plot_boxplot(mucin_df)
mucin_box
mucin_bean <- beanplot::beanplot(mucin_df, method="jitter")
sialidase_box <- plot_boxplot(sialidase_df)
sialidase_box
sialidase_bean <- beanplot::beanplot(sialidase_df, method="jitter")
rhs_box <- plot_boxplot(rhs_df)
rhs_box
rhs_bean <- beanplot::beanplot(rhs_df, method="jitter")
gp_box <- plot_boxplot(gp63_df)
gp_box
gp_bean <- beanplot::beanplot(gp63_df, method="jitter")
## Compare DGF distributions across strains
cor.test(dgf_df[["clbr_tryp"]], dgf_df[["cl14_tryp"]])
##
## Pearson's product-moment correlation
##
## data: dgf_df[["clbr_tryp"]] and dgf_df[["cl14_tryp"]]
## t = 19, df = 500, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5895 0.6920
## sample estimates:
## cor
## 0.6436
t.test(dgf_df[["clbr_tryp"]], dgf_df[["cl14_tryp"]])
##
## Welch Two Sample t-test
##
## data: dgf_df[["clbr_tryp"]] and dgf_df[["cl14_tryp"]]
## t = -16, df = 990, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9319 -0.7318
## sample estimates:
## mean of x mean of y
## 0.5374 1.3692
ks.test(dgf_df[["clbr_tryp"]], dgf_df[["cl14_tryp"]])
## Warning in ks.test(dgf_df[["clbr_tryp"]], dgf_df[["cl14_tryp"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: dgf_df[["clbr_tryp"]] and dgf_df[["cl14_tryp"]]
## D = 0.44, p-value <2e-16
## alternative hypothesis: two-sided
cor.test(dgf_df[["clbr_a60"]], dgf_df[["cl14_a60"]])
##
## Pearson's product-moment correlation
##
## data: dgf_df[["clbr_a60"]] and dgf_df[["cl14_a60"]]
## t = -1.3, df = 500, p-value = 0.2
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.14585 0.02788
## sample estimates:
## cor
## -0.05943
t.test(dgf_df[["clbr_a60"]], dgf_df[["cl14_a60"]])
##
## Welch Two Sample t-test
##
## data: dgf_df[["clbr_a60"]] and dgf_df[["cl14_a60"]]
## t = -7.6, df = 740, p-value = 1e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6233 -0.3661
## sample estimates:
## mean of x mean of y
## -0.1449 0.3498
ks.test(dgf_df[["clbr_a60"]], dgf_df[["cl14_a60"]])
## Warning in ks.test(dgf_df[["clbr_a60"]], dgf_df[["cl14_a60"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: dgf_df[["clbr_a60"]] and dgf_df[["cl14_a60"]]
## D = 0.24, p-value = 3e-13
## alternative hypothesis: two-sided
cor.test(dgf_df[["clbr_a96"]], dgf_df[["cl14_a96"]])
##
## Pearson's product-moment correlation
##
## data: dgf_df[["clbr_a96"]] and dgf_df[["cl14_a96"]]
## t = 8.1, df = 500, p-value = 4e-15
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2599 0.4143
## sample estimates:
## cor
## 0.3394
t.test(dgf_df[["clbr_a96"]], dgf_df[["cl14_a96"]])
##
## Welch Two Sample t-test
##
## data: dgf_df[["clbr_a96"]] and dgf_df[["cl14_a96"]]
## t = 16, df = 610, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.166 1.487
## sample estimates:
## mean of x mean of y
## -0.3925 -1.7190
ks.test(dgf_df[["clbr_a96"]], dgf_df[["cl14_a96"]])
## Warning in ks.test(dgf_df[["clbr_a96"]], dgf_df[["cl14_a96"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: dgf_df[["clbr_a96"]] and dgf_df[["cl14_a96"]]
## D = 0.58, p-value <2e-16
## alternative hypothesis: two-sided
## Compare MASP distributions across strains
cor.test(masp_df[["clbr_tryp"]], masp_df[["cl14_tryp"]])
##
## Pearson's product-moment correlation
##
## data: masp_df[["clbr_tryp"]] and masp_df[["cl14_tryp"]]
## t = 45, df = 1300, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7565 0.7994
## sample estimates:
## cor
## 0.7789
t.test(masp_df[["clbr_tryp"]], masp_df[["cl14_tryp"]])
##
## Welch Two Sample t-test
##
## data: masp_df[["clbr_tryp"]] and masp_df[["cl14_tryp"]]
## t = -1.2, df = 2600, p-value = 0.2
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.18991 0.04508
## sample estimates:
## mean of x mean of y
## -2.523 -2.451
ks.test(masp_df[["clbr_tryp"]], masp_df[["cl14_tryp"]])
## Warning in ks.test(masp_df[["clbr_tryp"]], masp_df[["cl14_tryp"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: masp_df[["clbr_tryp"]] and masp_df[["cl14_tryp"]]
## D = 0.047, p-value = 0.1
## alternative hypothesis: two-sided
cor.test(masp_df[["clbr_a60"]], masp_df[["cl14_a60"]])
##
## Pearson's product-moment correlation
##
## data: masp_df[["clbr_a60"]] and masp_df[["cl14_a60"]]
## t = 5.4, df = 1300, p-value = 8e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.09487 0.20130
## sample estimates:
## cor
## 0.1485
t.test(masp_df[["clbr_a60"]], masp_df[["cl14_a60"]])
##
## Welch Two Sample t-test
##
## data: masp_df[["clbr_a60"]] and masp_df[["cl14_a60"]]
## t = 20, df = 2400, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.9367 1.1370
## sample estimates:
## mean of x mean of y
## 1.8801 0.8432
ks.test(masp_df[["clbr_a60"]], masp_df[["cl14_a60"]])
## Warning in ks.test(masp_df[["clbr_a60"]], masp_df[["cl14_a60"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: masp_df[["clbr_a60"]] and masp_df[["cl14_a60"]]
## D = 0.39, p-value <2e-16
## alternative hypothesis: two-sided
cor.test(masp_df[["clbr_a96"]], masp_df[["cl14_a96"]])
##
## Pearson's product-moment correlation
##
## data: masp_df[["clbr_a96"]] and masp_df[["cl14_a96"]]
## t = 13, df = 1300, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2859 0.3825
## sample estimates:
## cor
## 0.3351
t.test(masp_df[["clbr_a96"]], masp_df[["cl14_a96"]])
##
## Welch Two Sample t-test
##
## data: masp_df[["clbr_a96"]] and masp_df[["cl14_a96"]]
## t = -16, df = 2300, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.0807 -0.8481
## sample estimates:
## mean of x mean of y
## 0.6429 1.6073
ks.test(masp_df[["clbr_a96"]], masp_df[["cl14_a96"]])
## Warning in ks.test(masp_df[["clbr_a96"]], masp_df[["cl14_a96"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: masp_df[["clbr_a96"]] and masp_df[["cl14_a96"]]
## D = 0.44, p-value <2e-16
## alternative hypothesis: two-sided
## Compare MUCIN distributions across strains
cor.test(mucin_df[["clbr_tryp"]], mucin_df[["cl14_tryp"]])
##
## Pearson's product-moment correlation
##
## data: mucin_df[["clbr_tryp"]] and mucin_df[["cl14_tryp"]]
## t = 60, df = 2100, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7786 0.8100
## sample estimates:
## cor
## 0.7948
t.test(mucin_df[["clbr_tryp"]], mucin_df[["cl14_tryp"]])
##
## Welch Two Sample t-test
##
## data: mucin_df[["clbr_tryp"]] and mucin_df[["cl14_tryp"]]
## t = -3, df = 4200, p-value = 0.003
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.23422 -0.04918
## sample estimates:
## mean of x mean of y
## -2.515 -2.373
ks.test(mucin_df[["clbr_tryp"]], mucin_df[["cl14_tryp"]])
## Warning in ks.test(mucin_df[["clbr_tryp"]], mucin_df[["cl14_tryp"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: mucin_df[["clbr_tryp"]] and mucin_df[["cl14_tryp"]]
## D = 0.055, p-value = 0.003
## alternative hypothesis: two-sided
cor.test(mucin_df[["clbr_a60"]], mucin_df[["cl14_a60"]])
##
## Pearson's product-moment correlation
##
## data: mucin_df[["clbr_a60"]] and mucin_df[["cl14_a60"]]
## t = 7, df = 2100, p-value = 4e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1081 0.1915
## sample estimates:
## cor
## 0.1501
t.test(mucin_df[["clbr_a60"]], mucin_df[["cl14_a60"]])
##
## Welch Two Sample t-test
##
## data: mucin_df[["clbr_a60"]] and mucin_df[["cl14_a60"]]
## t = 24, df = 4000, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.9014 1.0585
## sample estimates:
## mean of x mean of y
## 1.7615 0.7815
ks.test(mucin_df[["clbr_a60"]], mucin_df[["cl14_a60"]])
## Warning in ks.test(mucin_df[["clbr_a60"]], mucin_df[["cl14_a60"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: mucin_df[["clbr_a60"]] and mucin_df[["cl14_a60"]]
## D = 0.35, p-value <2e-16
## alternative hypothesis: two-sided
cor.test(mucin_df[["clbr_a96"]], mucin_df[["cl14_a96"]])
##
## Pearson's product-moment correlation
##
## data: mucin_df[["clbr_a96"]] and mucin_df[["cl14_a96"]]
## t = 20, df = 2100, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3559 0.4281
## sample estimates:
## cor
## 0.3926
t.test(mucin_df[["clbr_a96"]], mucin_df[["cl14_a96"]])
##
## Welch Two Sample t-test
##
## data: mucin_df[["clbr_a96"]] and mucin_df[["cl14_a96"]]
## t = -18, df = 3800, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9305 -0.7459
## sample estimates:
## mean of x mean of y
## 0.7536 1.5918
ks.test(mucin_df[["clbr_a96"]], mucin_df[["cl14_a96"]])
## Warning in ks.test(mucin_df[["clbr_a96"]], mucin_df[["cl14_a96"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: mucin_df[["clbr_a96"]] and mucin_df[["cl14_a96"]]
## D = 0.36, p-value <2e-16
## alternative hypothesis: two-sided
## Compare SIALIDASE distributions across strains
cor.test(sialidase_df[["clbr_tryp"]], sialidase_df[["cl14_tryp"]])
##
## Pearson's product-moment correlation
##
## data: sialidase_df[["clbr_tryp"]] and sialidase_df[["cl14_tryp"]]
## t = 79, df = 1500, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8909 0.9103
## sample estimates:
## cor
## 0.901
t.test(sialidase_df[["clbr_tryp"]], sialidase_df[["cl14_tryp"]])
##
## Welch Two Sample t-test
##
## data: sialidase_df[["clbr_tryp"]] and sialidase_df[["cl14_tryp"]]
## t = -7.3, df = 2900, p-value = 5e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6938 -0.3990
## sample estimates:
## mean of x mean of y
## -2.125 -1.579
ks.test(sialidase_df[["clbr_tryp"]], sialidase_df[["cl14_tryp"]])
## Warning in ks.test(sialidase_df[["clbr_tryp"]], sialidase_df[["cl14_tryp"]]): p-value will
## be approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: sialidase_df[["clbr_tryp"]] and sialidase_df[["cl14_tryp"]]
## D = 0.11, p-value = 6e-08
## alternative hypothesis: two-sided
cor.test(sialidase_df[["clbr_a60"]], sialidase_df[["cl14_a60"]])
##
## Pearson's product-moment correlation
##
## data: sialidase_df[["clbr_a60"]] and sialidase_df[["cl14_a60"]]
## t = -0.022, df = 1500, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05185 0.05072
## sample estimates:
## cor
## -0.0005688
t.test(sialidase_df[["clbr_a60"]], sialidase_df[["cl14_a60"]])
##
## Welch Two Sample t-test
##
## data: sialidase_df[["clbr_a60"]] and sialidase_df[["cl14_a60"]]
## t = 10, df = 2800, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4233 0.6212
## sample estimates:
## mean of x mean of y
## 0.9379 0.4157
ks.test(sialidase_df[["clbr_a60"]], sialidase_df[["cl14_a60"]])
## Warning in ks.test(sialidase_df[["clbr_a60"]], sialidase_df[["cl14_a60"]]): p-value will
## be approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: sialidase_df[["clbr_a60"]] and sialidase_df[["cl14_a60"]]
## D = 0.27, p-value <2e-16
## alternative hypothesis: two-sided
cor.test(sialidase_df[["clbr_a96"]], sialidase_df[["cl14_a96"]])
##
## Pearson's product-moment correlation
##
## data: sialidase_df[["clbr_a96"]] and sialidase_df[["cl14_a96"]]
## t = 33, df = 1500, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6277 0.6859
## sample estimates:
## cor
## 0.6578
t.test(sialidase_df[["clbr_a96"]], sialidase_df[["cl14_a96"]])
##
## Welch Two Sample t-test
##
## data: sialidase_df[["clbr_a96"]] and sialidase_df[["cl14_a96"]]
## t = 0.31, df = 2300, p-value = 0.8
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1290 0.1773
## sample estimates:
## mean of x mean of y
## 1.187 1.163
ks.test(sialidase_df[["clbr_a96"]], sialidase_df[["cl14_a96"]])
## Warning in ks.test(sialidase_df[["clbr_a96"]], sialidase_df[["cl14_a96"]]): p-value will
## be approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: sialidase_df[["clbr_a96"]] and sialidase_df[["cl14_a96"]]
## D = 0.088, p-value = 3e-05
## alternative hypothesis: two-sided
## Compare RHS distributions across strains
cor.test(rhs_df[["clbr_tryp"]], rhs_df[["cl14_tryp"]])
##
## Pearson's product-moment correlation
##
## data: rhs_df[["clbr_tryp"]] and rhs_df[["cl14_tryp"]]
## t = 20, df = 710, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5416 0.6372
## sample estimates:
## cor
## 0.5915
t.test(rhs_df[["clbr_tryp"]], rhs_df[["cl14_tryp"]])
##
## Welch Two Sample t-test
##
## data: rhs_df[["clbr_tryp"]] and rhs_df[["cl14_tryp"]]
## t = -3, df = 1400, p-value = 0.003
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.19026 -0.03871
## sample estimates:
## mean of x mean of y
## -0.01416 0.10033
ks.test(rhs_df[["clbr_tryp"]], rhs_df[["cl14_tryp"]])
## Warning in ks.test(rhs_df[["clbr_tryp"]], rhs_df[["cl14_tryp"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: rhs_df[["clbr_tryp"]] and rhs_df[["cl14_tryp"]]
## D = 0.073, p-value = 0.05
## alternative hypothesis: two-sided
cor.test(rhs_df[["clbr_a60"]], rhs_df[["cl14_a60"]])
##
## Pearson's product-moment correlation
##
## data: rhs_df[["clbr_a60"]] and rhs_df[["cl14_a60"]]
## t = -2.4, df = 710, p-value = 0.02
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.16247 -0.01702
## sample estimates:
## cor
## -0.09022
t.test(rhs_df[["clbr_a60"]], rhs_df[["cl14_a60"]])
##
## Welch Two Sample t-test
##
## data: rhs_df[["clbr_a60"]] and rhs_df[["cl14_a60"]]
## t = -4.6, df = 950, p-value = 4e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3899 -0.1573
## sample estimates:
## mean of x mean of y
## 0.1558 0.4294
ks.test(rhs_df[["clbr_a60"]], rhs_df[["cl14_a60"]])
## Warning in ks.test(rhs_df[["clbr_a60"]], rhs_df[["cl14_a60"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: rhs_df[["clbr_a60"]] and rhs_df[["cl14_a60"]]
## D = 0.12, p-value = 1e-04
## alternative hypothesis: two-sided
cor.test(rhs_df[["clbr_a96"]], rhs_df[["cl14_a96"]])
##
## Pearson's product-moment correlation
##
## data: rhs_df[["clbr_a96"]] and rhs_df[["cl14_a96"]]
## t = 10, df = 710, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2976 0.4250
## sample estimates:
## cor
## 0.363
t.test(rhs_df[["clbr_a96"]], rhs_df[["cl14_a96"]])
##
## Welch Two Sample t-test
##
## data: rhs_df[["clbr_a96"]] and rhs_df[["cl14_a96"]]
## t = 5.6, df = 850, p-value = 4e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2512 0.5250
## sample estimates:
## mean of x mean of y
## -0.1416 -0.5297
ks.test(rhs_df[["clbr_a96"]], rhs_df[["cl14_a96"]])
## Warning in ks.test(rhs_df[["clbr_a96"]], rhs_df[["cl14_a96"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: rhs_df[["clbr_a96"]] and rhs_df[["cl14_a96"]]
## D = 0.18, p-value = 3e-10
## alternative hypothesis: two-sided
## Compare GP63 distributions across strains
cor.test(gp63_df[["clbr_tryp"]], gp63_df[["cl14_tryp"]])
##
## Pearson's product-moment correlation
##
## data: gp63_df[["clbr_tryp"]] and gp63_df[["cl14_tryp"]]
## t = 30, df = 330, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8195 0.8790
## sample estimates:
## cor
## 0.852
t.test(gp63_df[["clbr_tryp"]], gp63_df[["cl14_tryp"]])
##
## Welch Two Sample t-test
##
## data: gp63_df[["clbr_tryp"]] and gp63_df[["cl14_tryp"]]
## t = -2, df = 650, p-value = 0.05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.509576 -0.001369
## sample estimates:
## mean of x mean of y
## -0.9837 -0.7282
ks.test(gp63_df[["clbr_tryp"]], gp63_df[["cl14_tryp"]])
## Warning in ks.test(gp63_df[["clbr_tryp"]], gp63_df[["cl14_tryp"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: gp63_df[["clbr_tryp"]] and gp63_df[["cl14_tryp"]]
## D = 0.09, p-value = 0.1
## alternative hypothesis: two-sided
cor.test(gp63_df[["clbr_a60"]], gp63_df[["cl14_a60"]])
##
## Pearson's product-moment correlation
##
## data: gp63_df[["clbr_a60"]] and gp63_df[["cl14_a60"]]
## t = 0.68, df = 330, p-value = 0.5
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07065 0.14433
## sample estimates:
## cor
## 0.03728
t.test(gp63_df[["clbr_a60"]], gp63_df[["cl14_a60"]])
##
## Welch Two Sample t-test
##
## data: gp63_df[["clbr_a60"]] and gp63_df[["cl14_a60"]]
## t = 2.9, df = 560, p-value = 0.004
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0744 0.3888
## sample estimates:
## mean of x mean of y
## 0.4949 0.2633
ks.test(gp63_df[["clbr_a60"]], gp63_df[["cl14_a60"]])
## Warning in ks.test(gp63_df[["clbr_a60"]], gp63_df[["cl14_a60"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: gp63_df[["clbr_a60"]] and gp63_df[["cl14_a60"]]
## D = 0.16, p-value = 6e-04
## alternative hypothesis: two-sided
cor.test(gp63_df[["clbr_a96"]], gp63_df[["cl14_a96"]])
##
## Pearson's product-moment correlation
##
## data: gp63_df[["clbr_a96"]] and gp63_df[["cl14_a96"]]
## t = 17, df = 330, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6090 0.7275
## sample estimates:
## cor
## 0.6726
t.test(gp63_df[["clbr_a96"]], gp63_df[["cl14_a96"]])
##
## Welch Two Sample t-test
##
## data: gp63_df[["clbr_a96"]] and gp63_df[["cl14_a96"]]
## t = 0.22, df = 620, p-value = 0.8
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1915 0.2393
## sample estimates:
## mean of x mean of y
## 0.4888 0.4649
ks.test(gp63_df[["clbr_a96"]], gp63_df[["cl14_a96"]])
## Warning in ks.test(gp63_df[["clbr_a96"]], gp63_df[["cl14_a96"]]): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: gp63_df[["clbr_a96"]] and gp63_df[["cl14_a96"]]
## D = 0.11, p-value = 0.03
## alternative hypothesis: two-sided
strains_sig <- sm(extract_significant_genes(strain_comparisons, according_to="limma",
excel=paste0("excel/strain_comparisons_sig-v",
ver, ".xlsx")))
clbr_times_sig <- sm(extract_significant_genes(clb_compare_times, according_to="limma",
excel=paste0("excel/clb_times_sig-v",
ver, ".xlsx")))
cl14_times_sig <- sm(extract_significant_genes(cl14_compare_times, according_to="limma",
excel=paste0("excel/cl14_times_sig-v",
ver, ".xlsx")))
I want to make 100% certain that when I plot what I call the coefficient scatter plot, that the dots on the screen are in fact representative of the post-limma/deseq/edger values.
I keep a set of tables in my limma output called ‘identity_tables’ which in theory are the pre-contrast values for each gene by condition. If I am 100% correct, then I should be able to recreate the logFC from topTable() by takinng two of these and subtracting them.
test_genes_clbr_tryp <- cl_all_de_sva[["limma"]][["identity_tables"]][["CLBr.Tryp"]]
test_genes_clbr_a60 <- cl_all_de_sva[["limma"]][["identity_tables"]][["CLBr.A60"]]
test_genes_clbr_contrast <- cl_all_de_sva[["limma"]][["all_tables"]][["CLBr.Tryp_vs_CLBr.A60"]]
test_genes_clbr_tryp["TcCLB.511211.160", ]
## logFC AveExpr t P.Value adj.P.Val B qvalue
## TcCLB.511211.160 10.77 11.39 143.6 5.14e-32 4.168e-28 61.94 4.823e-30
test_genes_clbr_a60["TcCLB.511211.160", ]
## logFC AveExpr t P.Value adj.P.Val B qvalue
## TcCLB.511211.160 11.64 11.39 166.2 2.661e-33 3.072e-29 64.52 1.376e-30
test_genes_clbr_contrast["TcCLB.511211.160", ]
## logFC AveExpr t P.Value adj.P.Val B qvalue
## TcCLB.511211.160 -0.8763 11.39 -7.934 1.19e-07 1.277e-06 7.192 3.125e-07
10.77 - 11.64
## [1] -0.87
On the other hand, I also want to have a standard error / variance estimate to come along with this data. I have two likely places to acquire that:
With that in mind, I wrote a little function get_pairwise_gene_abundances() that collects these in four tables: expression_values, t_values, error_values, and stdev_values.
abundances <- get_pairwise_gene_abundances(cl_all_de_sva)
expression_written <- write_xls(data=abundances[["expression_values"]],
sheet="expression_values",
title="Values making up the contrast logFCs")
error_written <- write_xls(data=abundances[["error_values"]], start_col=18,
title="Error value obtained by dividing the expression / t-statistic",
sheet="expression_values",
wb=expression_written[["workbook"]])
## The sheet: expression_values is in expression_values.
openxlsx::saveWorkbook(wb=expression_written[["workbook"]],
file="excel/written_abundance_test.xlsx", overwrite=TRUE)
##canaries <- c("TcCLB.511511.6", "TcCLB.511511.3", "TcCLB.511727.190", "TcCLB.469785.40",
## "TcCLB.506563.10", "TcCLB.511127.10", "TcCLB.504005.6", "TcCLB.507611.300")
canaries <- c("TcCLB.511727.190", "TcCLB.469785.40",
"TcCLB.506563.10", "TcCLB.511127.10",
"TcCLB.504005.6", "TcCLB.507611.300")
## Reorder the data to match the existing figure
conditions <- c("CLBr.A60", "CLBr.A96", "CLBr.Tryp",
"CL14.A60", "CL14.A96", "CL14.Tryp")
expressions <- abundances[["expression_values"]][, conditions]
stdevs <- abundances[["stdev_values"]][, conditions]
## Error in `[.data.frame`(abundances[["stdev_values"]], , conditions): undefined columns selected
errs <- abundances[["error_values"]][, conditions]
adjp_values <- list()
library(ggplot2)
for (num in 1:length(canaries)) {
canary <- canaries[num]
point <- 2 ^ expressions[canary, ]
##stdev <- 2 ^ stdevs[canary, ]
err <- 2 ^ errs[canary, ]
my_colors <- c("blue","green","red","blue","green","red")
canary_df <- data.frame(
"id" = colnames(point),
"values" = as.numeric(point),
"colors" = my_colors,
"err" = as.numeric(err),
"plus_err" = as.numeric(err) + as.numeric(point),
"minus_err" = as.numeric(point) - as.numeric(err))
print(canary_df)
fun_barplot <- ggplot2::ggplot(data=canary_df, colour=my_colors,
aes_string(x="id", y="values")) +
ggplot2::geom_bar(stat="identity", colour="black", fill=my_colors) +
ggplot2::theme_bw() + ggplot2::ylab("Expression Value") +
ggplot2::ggtitle(canary) +
ggplot2::geom_errorbar(aes(ymax=plus_err, ymin=minus_err), width=0.3)
pdf_filename <- paste0("images/", num, "_", canary, "_expression.pdf")
pdf(file=pdf_filename)
print(fun_barplot)
dev.off()
## Now extract the adjusted p-values from the pairwise contrasts...
## test_genes_clbr_contrast <- cl_all_de_sva[["limma"]][["all_tables"]][["CLBr.Tryp_vs_CLBr.A60"]]
adjps <- c()
the_contrasts <- c("CLBr.Tryp_vs_CL14.Tryp", "CLBr.A60_vs_CL14.A60", "CLBr.A96_vs_CL14.A96")
for (contr in the_contrasts) {
adjp <- cl_all_de_sva[["limma"]][["all_tables"]][[contr]][canary, "adj.P.Val"]
adjps[[contr]] <- adjp
}
adjp_values[[canary]] <- adjps
}
## id values colors err plus_err minus_err
## 1 CLBr.A60 170.19 blue 1.034 171.22 169.16
## 2 CLBr.A96 113.77 green 1.022 114.79 112.75
## 3 CLBr.Tryp 22.58 red 1.035 23.62 21.55
## 4 CL14.A60 151.27 blue 1.038 152.31 150.23
## 5 CL14.A96 159.56 green 1.034 160.60 158.53
## 6 CL14.Tryp 32.04 red 1.040 33.08 31.00
## id values colors err plus_err minus_err
## 1 CLBr.A60 54.229 blue 1.086 55.316 53.143
## 2 CLBr.A96 23.785 green 1.047 24.832 22.739
## 3 CLBr.Tryp 7.449 red 1.076 8.525 6.373
## 4 CL14.A60 43.381 blue 1.094 44.475 42.287
## 5 CL14.A96 35.432 green 1.089 36.522 34.343
## 6 CL14.Tryp 10.520 red 1.102 11.622 9.417
## id values colors err plus_err minus_err
## 1 CLBr.A60 54.99 blue 1.087 56.07 53.90
## 2 CLBr.A96 81.91 green 1.042 82.95 80.87
## 3 CLBr.Tryp 24.81 red 1.069 25.88 23.74
## 4 CL14.A60 54.72 blue 1.087 55.81 53.63
## 5 CL14.A96 48.50 green 1.087 49.59 47.42
## 6 CL14.Tryp 19.47 red 1.095 20.56 18.37
## id values colors err plus_err minus_err
## 1 CLBr.A60 1.520 blue 1.454 2.974 0.06662
## 2 CLBr.A96 3.583 green 1.142 4.725 2.44051
## 3 CLBr.Tryp 3.345 red 1.205 4.550 2.14023
## 4 CL14.A60 1.462 blue 1.484 2.946 -0.02217
## 5 CL14.A96 1.618 green 1.498 3.116 0.12028
## 6 CL14.Tryp 1.206 red 1.477 2.683 -0.27112
## id values colors err plus_err minus_err
## 1 CLBr.A60 42.05 blue 1.167 43.22 40.88
## 2 CLBr.A96 101.13 green 1.071 102.20 100.05
## 3 CLBr.Tryp 25.90 red 1.120 27.02 24.78
## 4 CL14.A60 39.31 blue 1.159 40.47 38.16
## 5 CL14.A96 82.77 green 1.144 83.91 81.62
## 6 CL14.Tryp 44.42 red 1.168 45.58 43.25
## id values colors err plus_err minus_err
## 1 CLBr.A60 17.02 blue 1.224 18.24 15.79
## 2 CLBr.A96 21.10 green 1.086 22.18 20.01
## 3 CLBr.Tryp 20.31 red 1.127 21.44 19.18
## 4 CL14.A60 20.38 blue 1.195 21.57 19.18
## 5 CL14.A96 12.05 green 1.235 13.28 10.82
## 6 CL14.Tryp 68.40 red 1.210 69.61 67.19
## A query from Santuza, generate p-values between some of these bars.
## How do I do that when I extracted the values and variances from limma without the
## associated raw values?
## hmm...
adjp_values
## $TcCLB.511727.190
## CLBr.Tryp_vs_CL14.Tryp CLBr.A60_vs_CL14.A60 CLBr.A96_vs_CL14.A96
## 5.528e-04 5.050e-01 7.645e-05
##
## $TcCLB.469785.40
## CLBr.Tryp_vs_CL14.Tryp CLBr.A60_vs_CL14.A60 CLBr.A96_vs_CL14.A96
## 0.030550 0.439600 0.002574
##
## $TcCLB.506563.10
## CLBr.Tryp_vs_CL14.Tryp CLBr.A60_vs_CL14.A60 CLBr.A96_vs_CL14.A96
## 7.648e-02 9.923e-01 6.433e-05
##
## $TcCLB.511127.10
## CLBr.Tryp_vs_CL14.Tryp CLBr.A60_vs_CL14.A60 CLBr.A96_vs_CL14.A96
## 0.06265 0.98770 0.13790
##
## $TcCLB.504005.6
## CLBr.Tryp_vs_CL14.Tryp CLBr.A60_vs_CL14.A60 CLBr.A96_vs_CL14.A96
## 0.005835 0.916200 0.177100
##
## $TcCLB.507611.300
## CLBr.Tryp_vs_CL14.Tryp CLBr.A60_vs_CL14.A60 CLBr.A96_vs_CL14.A96
## 6.526e-06 7.914e-01 1.510e-02
clbr_bars <- sm(significant_barplots(clb_compare_times))
pdf(file="images/limma.pdf")
clbr_bars$limma
dev.off()
## png
## 2
clbr_inv <- sm(significant_barplots(clb_compare_times))
pdf(file="images/limma_invert.pdf")
clbr_inv$limma
dev.off()
## png
## 2
cl14_bars <- sm(significant_barplots(cl14_compare_times))
strain_bars <- sm(significant_barplots(strain_comparisons))
clbr_bars$limma
cl14_bars$limma
strain_bars$limma
Redo GO analyses excluding multigene family members. I believe we discussed that the best way to do this was to:
##clbr_times_filtered <- semantic_copynumber_filter(clbr_times_sig$limma, semantic_column="genedescription")
##cl14_times_filtered <- semantic_copynumber_filter(cl14_times_sig$limma, semantic_column="genedescription")
##strains_filtered <- semantic_copynumber_filter(strains_sig$limma, semantic_column="genedescription")
clbr_times_filtered <- sm(semantic_copynumber_filter(clb_compare_times,
semantic_column="genedescription"))
cl14_times_filtered <- sm(semantic_copynumber_filter(cl14_compare_times,
semantic_column="genedescription"))
strains_filtered <- sm(semantic_copynumber_filter(strain_comparisons,
semantic_column="genedescription"))
clbr_times_sigfilt <- sm(extract_significant_genes(clbr_times_filtered, according_to="limma",
excel=paste0("excel/clb_times_filtered_sig-v",
ver, ".xlsx")))
cl14_times_sigfilt <- sm(extract_significant_genes(cl14_times_filtered, according_to="limma",
excel=paste0("excel/cl14_times_filtered_sig-v",
ver, ".xlsx")))
strains_sigfilt <- sm(extract_significant_genes(strains_filtered, according_to="limma",
excel=paste0("excel/strain_comparisons_filtered_sig-v",
ver, ".xlsx")))
## Najib wants to know how many entries for each family were removed by the semantic filtering
## from the significance tables.
semantic_names <- c("mucin","sialidase","RHS","MASP","DGF","GP63")
find_numbers <- function(datum, direction="ups", table_name="clbr_tryp_to_a60") {
for (name in semantic_names) {
table <- datum[["limma"]][[direction]][[table_name]]
search <- grepl(pattern=name, x=table[["genedescription"]])
number <- sum(search)
message(paste0("The ", direction, " table ", table_name,
" was pruned for ",
name, " by ", number, " genes."))
}
}
find_numbers(clbr_times_sig)
## The ups table clbr_tryp_to_a60 was pruned for mucin by 14 genes.
## The ups table clbr_tryp_to_a60 was pruned for sialidase by 38 genes.
## The ups table clbr_tryp_to_a60 was pruned for RHS by 40 genes.
## The ups table clbr_tryp_to_a60 was pruned for MASP by 11 genes.
## The ups table clbr_tryp_to_a60 was pruned for DGF by 82 genes.
## The ups table clbr_tryp_to_a60 was pruned for GP63 by 17 genes.
find_numbers(clbr_times_sig, direction="downs")
## The downs table clbr_tryp_to_a60 was pruned for mucin by 1640 genes.
## The downs table clbr_tryp_to_a60 was pruned for sialidase by 870 genes.
## The downs table clbr_tryp_to_a60 was pruned for RHS by 38 genes.
## The downs table clbr_tryp_to_a60 was pruned for MASP by 1024 genes.
## The downs table clbr_tryp_to_a60 was pruned for DGF by 11 genes.
## The downs table clbr_tryp_to_a60 was pruned for GP63 by 108 genes.
find_numbers(clbr_times_sig, table_name="clbr_a60_to_a96")
## The ups table clbr_a60_to_a96 was pruned for mucin by 1286 genes.
## The ups table clbr_a60_to_a96 was pruned for sialidase by 511 genes.
## The ups table clbr_a60_to_a96 was pruned for RHS by 52 genes.
## The ups table clbr_a60_to_a96 was pruned for MASP by 859 genes.
## The ups table clbr_a60_to_a96 was pruned for DGF by 14 genes.
## The ups table clbr_a60_to_a96 was pruned for GP63 by 73 genes.
find_numbers(clbr_times_sig, direction="downs", table_name="clbr_a60_to_a96")
## The downs table clbr_a60_to_a96 was pruned for mucin by 43 genes.
## The downs table clbr_a60_to_a96 was pruned for sialidase by 34 genes.
## The downs table clbr_a60_to_a96 was pruned for RHS by 11 genes.
## The downs table clbr_a60_to_a96 was pruned for MASP by 27 genes.
## The downs table clbr_a60_to_a96 was pruned for DGF by 20 genes.
## The downs table clbr_a60_to_a96 was pruned for GP63 by 7 genes.
find_numbers(clbr_times_sig, table_name="clbr_a96_to_tryp")
## The ups table clbr_a96_to_tryp was pruned for mucin by 746 genes.
## The ups table clbr_a96_to_tryp was pruned for sialidase by 699 genes.
## The ups table clbr_a96_to_tryp was pruned for RHS by 17 genes.
## The ups table clbr_a96_to_tryp was pruned for MASP by 377 genes.
## The ups table clbr_a96_to_tryp was pruned for DGF by 12 genes.
## The ups table clbr_a96_to_tryp was pruned for GP63 by 68 genes.
find_numbers(clbr_times_sig, direction="downs", table_name="clbr_a96_to_tryp")
## The downs table clbr_a96_to_tryp was pruned for mucin by 112 genes.
## The downs table clbr_a96_to_tryp was pruned for sialidase by 51 genes.
## The downs table clbr_a96_to_tryp was pruned for RHS by 41 genes.
## The downs table clbr_a96_to_tryp was pruned for MASP by 63 genes.
## The downs table clbr_a96_to_tryp was pruned for DGF by 48 genes.
## The downs table clbr_a96_to_tryp was pruned for GP63 by 15 genes.
find_numbers(cl14_times_sig, table_name="cl14_tryp_to_a60")
## The ups table cl14_tryp_to_a60 was pruned for mucin by 19 genes.
## The ups table cl14_tryp_to_a60 was pruned for sialidase by 42 genes.
## The ups table cl14_tryp_to_a60 was pruned for RHS by 47 genes.
## The ups table cl14_tryp_to_a60 was pruned for MASP by 12 genes.
## The ups table cl14_tryp_to_a60 was pruned for DGF by 300 genes.
## The ups table cl14_tryp_to_a60 was pruned for GP63 by 15 genes.
find_numbers(cl14_times_sig, direction="downs", table_name="cl14_tryp_to_a60")
## The downs table cl14_tryp_to_a60 was pruned for mucin by 1612 genes.
## The downs table cl14_tryp_to_a60 was pruned for sialidase by 763 genes.
## The downs table cl14_tryp_to_a60 was pruned for RHS by 22 genes.
## The downs table cl14_tryp_to_a60 was pruned for MASP by 1042 genes.
## The downs table cl14_tryp_to_a60 was pruned for DGF by 3 genes.
## The downs table cl14_tryp_to_a60 was pruned for GP63 by 88 genes.
find_numbers(cl14_times_sig, table_name="cl14_a60_to_a96")
## The ups table cl14_a60_to_a96 was pruned for mucin by 185 genes.
## The ups table cl14_a60_to_a96 was pruned for sialidase by 130 genes.
## The ups table cl14_a60_to_a96 was pruned for RHS by 87 genes.
## The ups table cl14_a60_to_a96 was pruned for MASP by 131 genes.
## The ups table cl14_a60_to_a96 was pruned for DGF by 41 genes.
## The ups table cl14_a60_to_a96 was pruned for GP63 by 6 genes.
find_numbers(cl14_times_sig, direction="downs", table_name="cl14_a60_to_a96")
## The downs table cl14_a60_to_a96 was pruned for mucin by 9 genes.
## The downs table cl14_a60_to_a96 was pruned for sialidase by 40 genes.
## The downs table cl14_a60_to_a96 was pruned for RHS by 34 genes.
## The downs table cl14_a60_to_a96 was pruned for MASP by 5 genes.
## The downs table cl14_a60_to_a96 was pruned for DGF by 16 genes.
## The downs table cl14_a60_to_a96 was pruned for GP63 by 1 genes.
find_numbers(cl14_times_sig, table_name="cl14_a96_to_tryp")
## The ups table cl14_a96_to_tryp was pruned for mucin by 897 genes.
## The ups table cl14_a96_to_tryp was pruned for sialidase by 574 genes.
## The ups table cl14_a96_to_tryp was pruned for RHS by 43 genes.
## The ups table cl14_a96_to_tryp was pruned for MASP by 615 genes.
## The ups table cl14_a96_to_tryp was pruned for DGF by 5 genes.
## The ups table cl14_a96_to_tryp was pruned for GP63 by 49 genes.
find_numbers(cl14_times_sig, direction="downs", table_name="cl14_a96_to_tryp")
## The downs table cl14_a96_to_tryp was pruned for mucin by 61 genes.
## The downs table cl14_a96_to_tryp was pruned for sialidase by 112 genes.
## The downs table cl14_a96_to_tryp was pruned for RHS by 135 genes.
## The downs table cl14_a96_to_tryp was pruned for MASP by 41 genes.
## The downs table cl14_a96_to_tryp was pruned for DGF by 242 genes.
## The downs table cl14_a96_to_tryp was pruned for GP63 by 14 genes.
filenames <- c("mucin.txt", "masp.txt", "gp63.txt", "transsialidase.txt",
"retrotransposon.txt", "dispersed.txt")
family_colors <- c("blue", "red", "green", "purple", "brown", "yellow")
names(family_colors) <- c("mucin", "MASP", "GP63", "trans_sialidase", "rhs", "DGF")
for (num in 1:length(names(clb_times_keepers))) {
contrast_names <- names(clb_times_keepers)
contrast <- contrast_names[[num]]
for (file_num in 1:length(filenames)) {
color <- family_colors[[file_num]]
families <- names(family_colors)
family <- families[[file_num]]
file <- filenames[[file_num]]
filename <- paste0("reference/families/", file)
output <- paste0("images/", contrast, "_", family, "_ma-v", ver, ".png")
svg_output <- paste0("images/", contrast, "_", family, "_ma-v", ver, ".svg")
family <- read.csv(filename, header=FALSE, sep=":")
family <- family[[1]]
limma_plot <- extract_de_ma(clb_compare_times, type="limma",
table=contrast, family=family,
insig_color="lightgrey", sig_color="#444444",
family_color=color, label_numbers=FALSE)
the_plot <- limma_plot$plot + ylim(-10, 10) + theme(legend.position="none")
png(file=output)
plot(the_plot)
dev.off()
svg(file=svg_output)
plot(the_plot)
dev.off()
}
}
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : embedded
## nul(s) found in input
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : embedded
## nul(s) found in input
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : embedded
## nul(s) found in input
for (num in 1:length(names(cl14_times_keepers))) {
contrast_names <- names(cl14_times_keepers)
contrast <- contrast_names[[num]]
for (file_num in 1:length(filenames)) {
color <- family_colors[[file_num]]
families <- names(family_colors)
family <- families[[file_num]]
file <- filenames[[file_num]]
filename <- paste0("reference/families/", file)
output <- paste0("images/", contrast, "_", family, "_ma-v", ver, ".png")
svg_output <- paste0("images/", contrast, "_", family, "_ma-v", ver, ".svg")
family <- read.csv(filename, header=FALSE, sep=":")
family <- family[[1]]
limma_plot <- extract_de_ma(cl14_compare_times, type="limma",
table=contrast, family=family,
insig_color="lightgrey", sig_color="#444444",
family_color=color, label_numbers=FALSE)
the_plot <- limma_plot$plot + ylim(-10, 10) + theme(legend.position="none")
png(file=output)
plot(the_plot)
dev.off()
svg(file=svg_output)
plot(the_plot)
dev.off()
}
}
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : embedded
## nul(s) found in input
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : embedded
## nul(s) found in input
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, : embedded
## nul(s) found in input
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
tt <- sm(saveme(filename=this_save))
I think I am the only person who care about these analyses, so I am telling knitr to stop running them.
cl_esmer_kexpt <- sm(normalize_expt(cl_esmer_kexpt, filter=TRUE))
cl_esmer_de <- sm(all_pairwise(cl_esmer_kexpt, model_batch="sva"))
cl_nonesmer_kexpt <- sm(normalize_expt(cl_nonesmer_kexpt, filter=TRUE))
cl_nonesmer_de <- sm(all_pairwise(cl_nonesmer_kexpt, model_batch="sva"))
cl_all_kexpt <- sm(normalize_expt(cl_all_kexpt, filter=TRUE))
cl_all_de <- sm(all_pairwise(cl_all_kexpt, model_batch="sva"))
clbr_esmer_kexpt <- sm(normalize_expt(clbr_esmer_kexpt, filter=TRUE))
clbr_esmer_de <- sm(all_pairwise(clbr_esmer_kexpt, model_batch="sva"))
clbr_nonesmer_kexpt <- sm(normalize_expt(clbr_nonesmer_kexpt, filter=TRUE))
clbr_nonesmer_de <- sm(all_pairwise(clbr_nonesmer_kexpt, model_batch="sva"))
cl14_esmer_kexpt <- sm(normalize_expt(cl14_esmer_kexpt, filter=TRUE))
cl14_esmer_de <- sm(all_pairwise(cl14_esmer_kexpt, model_batch="sva"))
cl14_nonesmer_kexpt <- sm(normalize_expt(cl14_nonesmer_kexpt, filter=TRUE))
cl14_nonesmer_de <- sm(all_pairwise(cl14_nonesmer_kexpt, model_batch="sva"))
cl14_all_kexpt <- sm(normalize_expt(cl14_all_kexpt, filter=TRUE))
cl14_all_de <- sm(all_pairwise(cl14_all_kexpt, model_batch="sva"))
hs_norm <- sm(normalize_expt(hs_expt, filter=TRUE))
tt <- sm(graph_metrics(hs_norm))
hs_de <- sm(all_pairwise(hs_norm, model_batch="svaseq"))
hs_keepers <- list(
"cl14_midvsuninf" = c("CL14.A60", "Uninfected"),
"clbr_midvsuninf" = c("CLBr.A60", "Uninfected"))
hs_tables_mid <- sm(combine_de_tables(hs_de, keepers=hs_keepers,
excel=paste0("excel/hs_mid_comparisons-v",
ver, ".xlsx")))
hs_keepers <- list(
"cl14_latevsuninf" = c("CL14.A96", "Uninfected"),
"clbr_latevsuninf" = c("CLBr.A96", "Uninfected"))
hs_tables_late <- sm(combine_de_tables(hs_de, keepers=hs_keepers,
excel=paste0("excel/hs_late_comparisons-v",
ver, ".xlsx")))
hs_keepers <- list(
"cl14_latevsmid" = c("CL14.A96", "CL14.A60"),
"clbr_latevsmid" = c("CLBr.A96", "CLBr.A60"))
hs_tables_midlate <- sm(combine_de_tables(hs_de, keepers=hs_keepers,
excel=paste0("excel/hs_midlate_comparisons-v",
ver, ".xlsx")))
hs_keepers <- list(
"mid_clbrvscl14" = c("CLBr.A60", "CL14.A60"),
"late_clbrvscl14" = c("CLBr.A96", "CL14.A96"))
hs_tables_clbrcl14 <- sm(combine_de_tables(hs_de, keepers=hs_keepers,
excel=paste0("excel/hs_cl14clbr_comparisons-v",
ver, ".xlsx")))
I want to take the set of up/down genes as provided above in changed_counts and pull out of them the sets of low/single copy genes. In order to do this, I will likely use the same fasta36 scripts I have used previously with Ginger etc. I might also/instead attempt using the needleman/wunsch implementation in bioconductor.
Either way, a significant portion of the work will require extracting the gene IDs from changed_counts for each up/down group, pulling the relevant primary sequence, and performing whatever search.
Santuza would like to poke at the general gene expression in various conditions. Therefore this is calculating and printing the rpkm and raw counts to an excel workbook.
## Santuza asked for a listing of the rpkm(normalized(counts)) in order to get an idea of the highly expressed genes
## Also include clbrener epimastigotes and do average(rpkm) by sample type
## The base expression set we will use is cl_all_filt
cl_all_rpkm <- sm(normalize_expt(cl_all_filt, filter=TRUE, convert="rpkm"))
rpkm_df <- merge(Biobase::fData(cl_all_filt$expressionset),
Biobase::exprs(cl_all_rpkm[["expressionset"]]),
by.x="row.names", by.y="row.names")
rownames(rpkm_df) <- rpkm_df[["Row.names"]]
rpkm_df = rpkm_df[, -1]
median_rpkm <- median_by_factor(data=cl_all_rpkm)
rpkm_df <- merge(rpkm_df, median_rpkm, by="row.names")
rownames(rpkm_df) <- rpkm_df[["Row.names"]]
rpkm_df <- rpkm_df[, -1]
raw_counts <- as.data.frame(Biobase::exprs(cl_all_filt$expressionset))
rpkm_df <- merge(raw_counts, rpkm_df, by="row.names")
rownames(rpkm_df) <- rpkm_df[["Row.names"]]
rpkm_df <- rpkm_df[, -1]
xls_ret <- write_xls(as.data.frame(rpkm_df), sheet="raw_to_rpkm_counts",
title="Table SXXX: Raw data to RPKM(data)")
openxlsx::saveWorkbook(wb=xls_ret[["workbook"]],
file=paste0("excel/rpkm_counts_base10-v", ver, ".xlsx"), overwrite=TRUE)
I think everything from here to the end of this document is no longer useful. But I refuse to delete it.
Lets try for a convenient method of including only the contrasts Santuza is interested in. These include (but may not be limited to):
I spoke with Santuza right before leaving for the evening, this is exactly the opposite of what she wanted.
CLB(tryp/a60) CL14(tryp/a60) CLB(a60/a96) CL14(a60/a96) CLB(a96/tryp) CL14(a96/tryp) a96(CL14/CLB) a60(CL14/CLB) tryp(CL14/CLB)
I am hoping to convince Najib and Santuza that the epi samples are salvageable.
The following few comparisons are going to use the ‘all’ data set. When I do a larger set of limma contrasts, I will just use the ‘kept’ set.
The following blocks are no longer being evaluated because they remove most of the samples from the data and therefore over-estimate significances. But they do provide reasonable examples of how one might perform simple pairwise comparisons in data.
epi_subset <- expt_subset(all_qcpml2, "condition=='clbr_epi'|condition=='cl14_epi'")
epi_graphs <- graph_metrics(epi_subset)
epi_graphs$norm_pcaplot
epi_graphs$norm_disheat
epi_graphs$norm_corheat
epi_comparison <- simple_comparison(epi_subset)
epi_table <- epi_comparison$table
epi_comparison$pvalue_histogram
epi_comparison$volcano_plot
epi_comparison$ma_plot
epi_comparison$coefficient_scatter
tryp_subset <- expt_subset(all_qcpml2, "condition=='clbr_tryp'|condition=='cl14_tryp'")
tryp_graphs <- graph_metrics(expt=tryp_subset)
tryp_graphs$norm_pcaplot
tryp_graphs$norm_disheat
tryp_graphs$norm_corheat
tryp_comparison <- simple_comparison(tryp_subset)
tryp_table = tryp_comparison$table
tryp_comparison$pvalue_histogram
tryp_comparison$volcano_plot
tryp_comparison$ma_plot
tryp_comparison$coefficient_scatter
a60_subset <- expt_subset(all_qcpml2, "condition=='clbr_a60'|condition=='cl14_a60'")
a60_graphs <- graph_metrics(expt=a60_subset)
a60_graphs$norm_pcaplot
a60_graphs$norm_corheat
a60_graphs$norm_disheat
a60_comparison <- simple_comparison(a60_subset)
a60_table <- a60_comparison$table
a60_comparison$pvalue_histogram
a60_comparison$volcano_plot
a60_comparison$ma_plot
a60_comparison$coefficient_scatter
A more appropriate method for performing the various comparisons of strains across time is to put all the extant data into a single contrast. The following block does that and attempts to take into account the relative contribution of each batch and sample.
The variables which were created to hold the relative contributions of these samples were also used to perform the various subtractions. Thus, when limma is actually run, it will provide data structures of the abundances of each sample type as well as the various comparisons.
## This data structure will be used for the following playing
kept_data <- exprs(kept_qcpml2$expressionset)
## acb stands for "all_conditions_batches" which takes too long to
## type when setting up the contrasts.
acb <- paste0(kept_qcpml2$conditions, kept_qcpml2$batches)
table(acb)
## The invocation of table() keptows me to count up the contribution of
## each condition/batch combination to the whole data set.
## Doing this (as I understand it) means I do nothave to worry about
## balanced samples so much, but must be more careful to understand
## the relative contribution of each sample type to the entire data
## set.
complete_model <- model.matrix(~0 + acb)
complete_fit <- lmFit(kept_data, complete_model)
complete_voom <- hpgl_voom(kept_data, complete_model)
complete_voom$plot
complete_model
epi_cl14 <- "acbcl14_epiF"
epi_clbr <- "acbclbr_epiE"
tryp_cl14 <- "(acbcl14_trypB + acbcl14_trypD + acbcl14_trypG) / 3"
tryp_clbr <- "acbclbr_trypG"
a60_cl14 <- "(acbcl14_a60A * 2/3) + (acbcl14_a60B * 1/3)"
a60_clbr <- "acbclbr_a60A"
a96_cl14 <- "acbcl14_a96C"
a96_clbr <- "acbclbr_a96C"
epi_cl14clbr <- paste0("(",epi_cl14,")", " - ", "(",epi_clbr,")")
tryp_cl14clbr <- paste0("(",tryp_cl14,")", " - ", "(",tryp_clbr,")")
a60_cl14clbr <- paste0("(",a60_cl14,")", " - ", "(",a60_clbr,")")
a96_cl14clbr <- paste0("(",a96_cl14,")", " - ", "(",a96_clbr,")")
epitryp_cl14 <- paste0("(",tryp_cl14,")", " - ", "(",epi_cl14,")")
epitryp_clbr <- paste0("(",tryp_clbr,")", " - ", "(",epi_clbr,")")
epia60_cl14 <- paste0("(",a60_cl14,")", " - ", "(",epi_cl14,")")
epia60_clbr <- paste0("(",a60_clbr,")", " - ", "(",epi_clbr,")")
a60a96_cl14 <- paste0("(",a96_cl14,")", " - ", "(",a60_cl14,")")
a60a96_clbr <- paste0("(",a96_clbr,")", " - ", "(",a60_clbr,")")
a60tryp_cl14 <- paste0("(",tryp_cl14,")", " - ", "(",a60_cl14,")")
a60tryp_clbr <- paste0("(",tryp_clbr,")", " - ", "(",a60_clbr,")")
a96tryp_cl14 <- paste0("(",tryp_cl14,")", " - ", "(",a96_cl14,")")
a96tryp_clbr <- paste0("(",tryp_clbr,")", " - ", "(",a96_clbr,")")
## The following contrast is messed up in some as of yet unknown way.
epitryp_cl14clbr <- paste0("(",epitryp_cl14,")", " - ", "(",epitryp_clbr,")")
## So I will add some more contrasts using data which doesn't get screwed up
epia60_cl14clbr <- paste0("(",epia60_cl14,")", " - ", "(",epia60_clbr,")")
a60tryp_cl14clbr <- paste0("(",a60tryp_cl14,")", " - ", "(",a60tryp_clbr,")")
a60a96_cl14clbr <- paste0("(",a60a96_cl14,")", " - ", "(",a60a96_clbr,")")
complete_contrasts_v2 <- makeContrasts(
epi_cl14=epi_cl14,
epi_clbr=epi_clbr,
tryp_cl14=tryp_cl14,
tryp_clbr=tryp_clbr,
a60_cl14=a60_cl14,
a60_clbr=a60_clbr,
a96_cl14=a96_cl14,
a96_clbr=a96_clbr,
epi_cl14clbr=epi_cl14clbr,
tryp_cl14clbr=tryp_cl14clbr,
a60_cl14clbr=a60_cl14clbr,
a96_cl14clbr=a96_cl14clbr,
epitryp_cl14=epitryp_cl14,
epitryp_clbr=epitryp_clbr,
epia60_cl14=epia60_cl14,
epia60_clbr=epia60_clbr,
a60a96_cl14=a60a96_cl14,
a60a96_clbr=a60a96_clbr,
a60tryp_cl14=a60tryp_cl14,
a60tryp_clbr=a60tryp_clbr,
a96tryp_cl14=a96tryp_cl14,
a96tryp_clbr=a96tryp_clbr,
epitryp_cl14clbr=epitryp_cl14clbr,
epia60_cl14clbr=epia60_cl14clbr,
a60tryp_cl14clbr=a60tryp_cl14clbr,
a60a96_cl14clbr=a60a96_cl14clbr,
levels=complete_voom$design)
## This colnames() is annoyingly necessary to avoid really obnoxious contrast names.
colnames(complete_contrasts_v2) <- c("epi_cl14","epi_clbr","tryp_cl14","tryp_clbr","a60_cl14","a60_clbr","a96_cl14","a96_clbr","epi_cl14clbr","tryp_cl14clbr","a60_cl14clbr","a96_cl14clbr","epitryp_cl14","epitryp_clbr","epia60_cl14","epia60_clbr","a60tryp_cl14","a60tryp_clbr","a96tryp_cl14","a96tryp_clbr","a60a96_cl14","a60a96_clbr","epitryp_cl14clbr","epia60_cl14clbr","a60tryp_cl14clbr","a60a96_cl14clbr")
kept_fits <- contrasts.fit(complete_fit, complete_contrasts_v2)
kept_comparisons <- eBayes(kept_fits)
con <- kept_qcpml2$conditions
condition_model <- model.matrix(~0 + con)
condition_voom <- hpgl_voom(kept_data, condition_model)
condition_lmfit <- lmFit(condition_voom, condition_model)
des <- condition_voom$design
condition_contrasts <- makeContrasts(
epi_cl14=concl14_epi,
epi_clbr=conclbr_epi,
a60_cl14=concl14_a60,
a60_clbr=conclbr_a60,
a96_cl14=concl14_a96,
a96_clbr=conclbr_a96,
tryp_cl14=concl14_tryp,
tryp_clbr=conclbr_tryp,
epi_cl14clbr=concl14_epi - conclbr_epi,
tryp_cl14clbr=concl14_tryp - conclbr_tryp,
a60_cl14clbr=concl14_a60 - conclbr_a60,
a96_cl14clbr=concl14_a96 - conclbr_a96,
a60epi_clbr=conclbr_a60 - conclbr_epi,
trypepi_clbr=conclbr_tryp - conclbr_epi,
a96a60_cl14=concl14_a96 - concl14_a60,
a96a60_clbr=conclbr_a96 - conclbr_a60,
trypa96_cl14=concl14_tryp - concl14_a96,
trypa96_clbr=conclbr_tryp - conclbr_a96,
a96a60_cl14clbr=(concl14_a96-concl14_a60)-(conclbr_a96-conclbr_a60),
trypa96_cl14clbr=(concl14_tryp-concl14_a96)-(conclbr_tryp-conclbr_a96),
levels=des)
colnames(condition_contrasts) <- c("epi_cl14","epi_clbr","a60_cl14","a60_clbr","a96_cl14","a96_clbr","tryp_cl14","tryp_clbr","epi_cl14clbr","tryp_cl14clbr","a60_cl14clbr","a96_cl14clbr","a60epi_clbr","trypepi_clbr","a60a96_cl14","a60a96_clbr","a96tryp_cl14","a96_tryp_clbr","a60a96_cl14clbr","a96tryp_cl14clbr")
condition_fits <- contrasts.fit(condition_lmfit, condition_contrasts)
condition_comparisons <- eBayes(condition_fits)
condition_table <- topTable(condition_comparisons, adjust="fdr", n=nrow(kept_data))
condition_tables <- write_limma(condition_comparisons, excel=TRUE, csv=FALSE, annotation=tooltip_data)
gene_lengths <- genes[,c("ID","width")]
go_ids <- read.csv(file="reference/go/tcruzi_all_go.tab.gz", sep="\t", header=FALSE)
colnames(go_ids) <- c("ID","GOID","ontology","name","evidence","code")
go_ids <- go_ids[,c("ID","GOID")]
colnames(go_ids) <- c("ID","GO")
condition_ontology <- limma_ontology(condition_tables, gene_lengths=gene_lengths, goids=go_ids, n=200, excel=TRUE, csv=FALSE, do_trees=FALSE, do_cluster=TRUE, do_topgo=TRUE)
a60_cl14clbr_ls <- hpgl_linear_scatter(condition_table[,c("a60_cl14", "a60_clbr")], loess=TRUE)
a60_cl14clbr_ls$scatter
a60_cl14clbr_ls$both_histogram$plot
a60_cl14clbr_ls$lm_model
a60_cl14clbr_ls$lm_summary
a60_cl14clbr_ls$correlation
a96_cl14clbr_ls <- hpgl_linear_scatter(condition_table[,c("a96_cl14", "a96_clbr")], loess=TRUE)
a96_cl14clbr_ls$scatter
a96_cl14clbr_ls$both_histogram$plot
a96_cl14clbr_ls$lm_model
a96_cl14clbr_ls$lm_summary
a96_cl14clbr_ls$correlation
tryp_cl14clbr_ls <- hpgl_linear_scatter(condition_table[,c("tryp_cl14", "tryp_clbr")], loess=TRUE)
tryp_cl14clbr_ls$scatter
tryp_cl14clbr_ls$both_histogram$plot
tryp_cl14clbr_ls$lm_model
tryp_cl14clbr_ls$lm_summary
tryp_cl14clbr_ls$correlation
topTable() will get called quite a lot in order to generate the various tables of interest. write_xls() will follow to write out the results into some excel spreadsheets.
## I wrote a function 'write_limma' which takes care of making a data structure of all the tables and printing the results in excel.
## Lets use that rather than this stupid list of topTable() calls.
## In addition, I am adding q-values and other metrics to improve the output from toptable, so it is a win I think.
kept_table <- topTable(kept_comparisons, adjust="fdr", n=nrow(kept_data))
## Use the coef argument to pull out adjusted p-values for subsections of the data
all_tables <- write_limma(kept_comparisons)
The function hpgl_linear_scatter() provides an easy way to perform some simple visualizations of the contrasts
## Make some scatter plots of the input data to see if these tables seem sane.
epi_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("epi_cl14", "epi_clbr")], gvis_filename="html/epi_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
epi_cl14clbr_ls$scatter
epi_cl14clbr_ls$both_histogram$plot
epi_cl14clbr_ls$lm_model
epi_cl14clbr_ls$lm_summary
epi_cl14clbr_ls$correlation
tryp_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("tryp_cl14", "tryp_clbr")], gvis_filename="html/tryp_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
tryp_cl14clbr_ls$scatter
tryp_cl14clbr_ls$both_histogram$plot
tryp_cl14clbr_ls$lm_model
tryp_cl14clbr_ls$lm_summary
tryp_cl14clbr_ls$correlation
a60_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("a60_cl14", "a60_clbr")], gvis_filename="html/a60_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a60_cl14clbr_ls$scatter
a60_cl14clbr_ls$both_histogram$plot
a60_cl14clbr_ls$lm_model
a60_cl14clbr_ls$lm_summary
a60_cl14clbr_ls$correlation
a96_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("a96_cl14", "a96_clbr")], gvis_filename="html/a96_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a96_cl14clbr_ls$scatter
a96_cl14clbr_ls$both_histogram$plot
a96_cl14clbr_ls$lm_model
a96_cl14clbr_ls$lm_summary
a96_cl14clbr_ls$correlation
epitryp_cl14_ls <- hpgl_linear_scatter(kept_table[,c("epi_cl14", "tryp_cl14")], gvis_filename="html/epitryp_cl14.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
epitryp_cl14_ls$scatter
epitryp_cl14_ls$both_histogram$plot
epitryp_cl14_ls$lm_model
epitryp_cl14_ls$lm_summary
epitryp_cl14_ls$correlation
epitryp_clbr_ls <- hpgl_linear_scatter(kept_table[,c("epi_clbr", "tryp_clbr")], gvis_filename="html/epitryp_clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
epitryp_clbr_ls$scatter
epitryp_clbr_ls$both_histogram$plot
epitryp_clbr_ls$lm_model
epitryp_clbr_ls$lm_summary
epitryp_clbr_ls$correlation
a96tryp_clbr_ls <- hpgl_linear_scatter(kept_table[,c("a96_clbr", "tryp_clbr")], gvis_filename="html/a96tryp_clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a96tryp_clbr_ls$scatter
a96tryp_clbr_ls$both_histogram$plot
a96tryp_clbr_ls$lm_model
a96tryp_clbr_ls$lm_summary
a96tryp_clbr_ls$correlation
a96tryp_cl14_ls <- hpgl_linear_scatter(kept_table[,c("a96_cl14", "tryp_cl14")], gvis_filename="html/a96tryp_cl14.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a96tryp_cl14_ls$scatter
a96tryp_cl14_ls$both_histogram$plot
a96tryp_cl14_ls$lm_model
a96tryp_cl14_ls$lm_summary
a96tryp_cl14_ls$correlation
a60a96_cl14_ls <- hpgl_linear_scatter(kept_table[,c("a60_cl14", "a96_cl14")], gvis_filename="html/a60a96_cl14.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a60a96_cl14_ls$scatter
a60a96_cl14_ls$both_histogram$plot
a60a96_cl14_ls$lm_model
a60a96_cl14_ls$lm_summary
a60a96_cl14_ls$correlation
a60a96_clbr_ls <- hpgl_linear_scatter(kept_table[,c("a60_clbr", "a96_clbr")], gvis_filename="html/a60a96_clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a60a96_clbr_ls$scatter
a60a96_clbr_ls$both_histogram$plot
a60a96_clbr_ls$lm_model
a60a96_clbr_ls$lm_summary
a60a96_clbr_ls$correlation
## Something is messed up here
## I am going to add some more contrasts to figure out what went wrong
epitryp_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("epitryp_cl14", "epitryp_clbr")], gvis_filename="html/epitryp_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
epitryp_cl14clbr_ls$scatter
epitryp_cl14clbr_ls$both_histogram$plot
epitryp_cl14clbr_ls$lm_model
epitryp_cl14clbr_ls$lm_summary
epitryp_cl14clbr_ls$correlation
## OK, so the following two are solidly telling me that the epimastigote data is the weirdo, I think we can assume the CL14.
epia60_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("epia60_cl14", "epia60_clbr")], gvis_filename="html/epia60_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
epia60_cl14clbr_ls$scatter
epia60_cl14clbr_ls$both_histogram$plot
epia60_cl14clbr_ls$lm_model
epia60_cl14clbr_ls$lm_summary
epia60_cl14clbr_ls$correlation
a60tryp_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("a60tryp_cl14", "a60tryp_clbr")], gvis_filename="html/a60tryp_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a60tryp_cl14clbr_ls$scatter
a60tryp_cl14clbr_ls$both_histogram$plot
a60tryp_cl14clbr_ls$lm_model
a60tryp_cl14clbr_ls$lm_summary
a60tryp_cl14clbr_ls$correlation
a60a96_cl14clbr_ls <- hpgl_linear_scatter(kept_table[,c("a60a96_cl14", "a60a96_clbr")], gvis_filename="html/a60a96_cl14clbr.html", tooltip_data=tooltip_data, loess=TRUE, gvis_trendline="linear")
a60a96_cl14clbr_ls$scatter
a60a96_cl14clbr_ls$both_histogram$plot
a60a96_cl14clbr_ls$lm_model
a60a96_cl14clbr_ls$lm_summary
a60a96_cl14clbr_ls$correlation
I removed a section regarding batch effect removal from here. This section is not needed because I included its functionality in myr::graph_metrics()
The following is the text of a message I sent Kwame, which lays out my concerns in this data:
I just wanted to ping you – I poked further at my concern. I am 99% certain I set up the model matrix correctly and assigned the relative contributions of each sample in makeContrasts() correctly. The results I get when performing the contrasts look for the most part very much like what I would expect. My only concern is when I perform operations like: (x-y)-(a-b); in most instances they too look very much like what I expect, but in one instance one term(the x from my example) is consistently (~4/3 * a) which leaves an apparently linear relationship when there really should not be one. This caused me to be concerned that I misattributed the samples in makeContrasts(). Upon further examination I came to the conclusion that they were in fact correct. My second guess was (given things I have heard you/Hector/Najib say about the relationship between normalization and read depth) that either the x or a term was much higher or lower in depth than the others. I checked and this is not true (there are libsize differences, but they don’t look significant to me). My last guess is that one of the samples in x does not cluster as nicely as all the rest of the samples in the data set and this is somehow leading to a consistent effect throughout the dataset, but for the life of me I cannot find a reason why.