Start out by extracting the relevant data and querying it to see the general quality.
This first block sets the names of the samples and colors. It also makes separate data sets for:
## Reread the sample sheet because I am fiddling with other possible surrogates (like strain)
## In fact, copy it to a separate sheet because these samples are a mess
hs_infection <- subset_expt(hs_expt, subset="experimentname=='infection'")
chosen_colors <- c("#009900","#990000", "#000099")
names(chosen_colors) <- c("uninf","chr","sh")
hs_uninf <- set_expt_colors(hs_infection, colors=chosen_colors)
hs_inf <- subset_expt(hs_uninf, subset="condition!='uninf'")
uninf_newnames <- paste0(pData(hs_uninf)$label, "_", pData(hs_uninf)$donor)
hs_uninf <- set_expt_samplenames(hs_uninf, newnames=uninf_newnames)
inf_newnames <- paste0(pData(hs_inf)$label, "_", pData(hs_inf)$donor)
hs_inf <- set_expt_samplenames(hs_inf, newnames=inf_newnames)
hs_cds_infection <- subset_expt(hs_cds_expt, subset="experimentname=='infection'")
hs_cds_uninf <- set_expt_colors(hs_cds_infection, colors=chosen_colors)
hs_cds_inf <- subset_expt(hs_cds_uninf, subset="condition!='uninf'")
hs_cds_uninf <- set_expt_samplenames(hs_cds_uninf, newnames=uninf_newnames)
hs_cds_inf <- set_expt_samplenames(hs_cds_inf, newnames=inf_newnames)
##infection_model_test <- model_test(infection_human$design)
The following creates metric plots of the raw data.
hs_uninf_met <- sm(graph_metrics(hs_uninf))
hs_inf_met <- sm(graph_metrics(hs_inf))
hs_cds_uninf_met <- sm(graph_metrics(hs_cds_uninf))
hs_cds_inf_met <- sm(graph_metrics(hs_cds_inf))
Now let us visualize some of these metrics for the data set of all features including the uninfected samples.
Start with the relative library sizes. Note that this includes all feature types.
hs_uninf_met$libsize
The picture is slightly different if we only look at coding sequences.
hs_cds_uninf_met$libsize
Look at the density of counts / feature for all samples. Use density plots and boxplots to view this information.
hs_uninf_met$density
hs_uninf_met$boxplot
## Wow there is a pretty big range in counts observed in this data!!
hs_cds_uninf_met$density
hs_cds_uninf_met$boxplot
Now let us look at how the samples relate to each other via pairwise correlation heatmaps. Once again, show this first for all features, then only cds features.
hs_uninf_met$corheat
hs_cds_uninf_met$corheat
Having looked at these metrics, now let us write out the results in 4 excel workbooks, representing the same 4 data sets.
hs_uninf_data <- sm(write_expt(hs_uninf, norm="quant", violin=FALSE, convert="cpm",
transform="log2", batch="pca", filter=TRUE,
excel=paste0("excel/hs_data-infection_with_uninfected-v", ver, ".xlsx")))
hs_inf_data <- sm(write_expt(hs_inf, norm="quant", violin=FALSE, convert="cpm",
transform="log2", batch="pca", filter=TRUE,
excel=paste0("excel/hs_data-infection_no_uninfected-v", ver, ".xlsx")))
hs_cds_uninf_data <- sm(write_expt(hs_cds_uninf, norm="quant", violin=FALSE, convert="cpm",
transform="log2", batch="pca", filter=TRUE,
excel=paste0("excel/hs_cds_data-infection_with_uninfected-v", ver, ".xlsx")))
hs_cds_inf_data <- sm(write_expt(hs_cds_inf, norm="quant", violin=FALSE, convert="cpm",
transform="log2", batch="pca", filter=TRUE,
excel=paste0("excel/hs_cds_data-infection_no_uninfected-v", ver, ".xlsx")))
Now perform the ‘default’ normalization we use in the lab and look again.
hs_uninf_cqf <- sm(normalize_expt(hs_uninf, convert="cpm", filter=TRUE, norm="quant"))
hs_uninf_met <- sm(graph_metrics(hs_uninf))
hs_inf_cqf <- sm(normalize_expt(hs_inf, convert="cpm", filter=TRUE, norm="quant"))
hs_inf_cqf_met <- sm(graph_metrics(hs_inf))
Construct figure 4, this should include the following panels:
pp(file="images/figure_4a.pdf")
hs_uninf_data$raw_libsize
dev.off()
## png
## 2
pp(file="images/figure_4b.pdf")
hs_uninf_data$raw_scaled_pca
dev.off()
## png
## 2
write.csv(hs_uninf_data$raw_scaled_pca_table, file="images/figure_4b.csv")
pp(file="images/figure_4c.pdf")
hs_inf_data$raw_scaled_pca
dev.off()
## png
## 2
write.csv(hs_inf_data$raw_scaled_pca_table, file="images/figure_4c.csv")
pp(file="images/figure_4d.pdf")
hs_uninf_data$raw_tsne
dev.off()
## png
## 2
pp(file="images/figure_4e.pdf")
hs_inf_data$raw_tsne
dev.off()
## png
## 2
pp(file="images/figure_4a_cds.pdf")
hs_cds_uninf_data$raw_libsize
dev.off()
## png
## 2
pp(file="images/figure_4b_cds.pdf")
hs_cds_uninf_data$raw_scaled_pca
dev.off()
## png
## 2
write.csv(hs_cds_uninf_data$raw_scaled_pca_table, file="images/figure_4b_cds.csv")
pp(file="images/figure_4c_cds.pdf")
hs_cds_inf_data$raw_scaled_pca
dev.off()
## png
## 2
write.csv(hs_cds_inf_data$raw_scaled_pca_table, file="images/figure_4c_cds.csv")
pp(file="images/figure_4d_cds.pdf")
hs_cds_uninf_data$raw_tsne
dev.off()
## png
## 2
pp(file="images/figure_4e_cds.pdf")
hs_cds_inf_data$raw_tsne
dev.off()
## png
## 2
Now let us try a few different ways of dealing with the batch effects/surrogate variables. In each case, I will use a PCA plot to see how the method changes the sample clustering.
In this first iteration, we will log2(cpm(quant(filter()))) the data and leave the experimental parameters as the default: condition == the 6 strains, 3 chronic 3 self-healing batch == the three patients p107
## Start with the non-uninfected, no batch correction
hs_inf_lqcf <- sm(normalize_expt(hs_inf, filter=TRUE, convert="cpm",
transform="log2", norm="quant"))
hs_pca_inf_lqcf <- plot_pca(hs_inf_lqcf)
hs_pca_inf_lqcf$plot
## 3 three patients are super obvious I think
## The patients 107/108 are on the left while 110 is on the right.
knitr::kable(hs_pca_inf_lqcf$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
chr_5430_d108 | chr_5430_d108 | chr | d108 | 2 | -0.1611 | -0.3489 | #990000 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chr | d108 | 2 | -0.1370 | -0.3164 | #990000 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chr | d108 | 2 | -0.0604 | -0.2326 | #990000 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | sh | d108 | 2 | -0.2063 | -0.3506 | #000099 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | sh | d108 | 2 | -0.0563 | -0.2715 | #000099 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | sh | d108 | 2 | -0.0981 | -0.2975 | #000099 | sh_2189_d108 |
chr_5430_d110 | chr_5430_d110 | chr | d110 | 3 | 0.2928 | -0.0198 | #990000 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chr | d110 | 3 | 0.3038 | 0.0114 | #990000 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chr | d110 | 3 | 0.3359 | 0.0550 | #990000 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | sh | d110 | 3 | 0.3020 | 0.0295 | #000099 | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | sh | d110 | 3 | 0.3685 | 0.1066 | #000099 | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | sh | d110 | 3 | 0.3447 | 0.0639 | #000099 | sh_2189_d110 |
chr_5430_d107 | chr_5430_d107 | chr | d107 | 1 | -0.2482 | 0.1907 | #990000 | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chr | d107 | 1 | -0.2296 | 0.2646 | #990000 | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chr | d107 | 1 | -0.1644 | 0.2975 | #990000 | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | sh | d107 | 1 | -0.2235 | 0.2505 | #000099 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | sh | d107 | 1 | -0.1456 | 0.2711 | #000099 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | sh | d107 | 1 | -0.2169 | 0.2965 | #000099 | sh_2189_d107 |
hs_pca_inf_lqcf$variance
## [1] 50.59 22.59 5.22 4.36 3.62 2.86 1.57 1.43 1.35 1.23 0.95 0.86 0.77 0.71
## [15] 0.66 0.65 0.58
write.csv(hs_pca_inf_lqcf$pcatable, file="csv/infection_nouninfected_no_batch.csv")
## This shows clean sehumantion by patient
## Therefore we will now add patient as a surrogate variable and minimize it
hscds_inf_lqcf <- sm(normalize_expt(hs_cds_inf, filter=TRUE, convert="cpm",
transform="log2", norm="quant"))
hscds_pca_inf_lqcf <- plot_pca(hscds_inf_lqcf)
hscds_pca_inf_lqcf$plot
For the second iteration, use the same normalization, but add a combat correction in an attempt to minimize patient’s effect in the variance.
hs_inf_lqcf_cbdonor <- sm(normalize_expt(hs_inf_lqcf, batch="combat_scale"))
## Here the split is semi chronic/self-healing, but not quite
hs_pca_inf_lqcf_cbdonor <- plot_pca(hs_inf_lqcf_cbdonor)
hs_pca_inf_lqcf_cbdonor$plot
## There are 2 sh and 1 chr on the right vs. 2 chr and 1 sh on the left
knitr::kable(hs_pca_inf_lqcf_cbdonor$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
chr_5430_d108 | chr_5430_d108 | chr | d108 | 2 | -0.1777 | 0.0481 | #990000 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chr | d108 | 2 | -0.2128 | 0.0873 | #990000 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chr | d108 | 2 | 0.3086 | -0.1935 | #990000 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | sh | d108 | 2 | -0.1571 | 0.0483 | #000099 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | sh | d108 | 2 | 0.1694 | -0.2950 | #000099 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | sh | d108 | 2 | -0.0142 | 0.3201 | #000099 | sh_2189_d108 |
chr_5430_d110 | chr_5430_d110 | chr | d110 | 3 | -0.3159 | 0.0953 | #990000 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chr | d110 | 3 | -0.3219 | -0.2135 | #990000 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chr | d110 | 3 | 0.1262 | -0.5327 | #990000 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | sh | d110 | 3 | -0.0520 | 0.2060 | #000099 | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | sh | d110 | 3 | 0.4338 | 0.1549 | #000099 | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | sh | d110 | 3 | 0.2165 | 0.2696 | #000099 | sh_2189_d110 |
chr_5430_d107 | chr_5430_d107 | chr | d107 | 1 | -0.3017 | -0.1341 | #990000 | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chr | d107 | 1 | -0.1394 | -0.0908 | #990000 | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chr | d107 | 1 | 0.1983 | -0.3629 | #990000 | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | sh | d107 | 1 | -0.2310 | 0.1243 | #000099 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | sh | d107 | 1 | 0.2756 | 0.2341 | #000099 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | sh | d107 | 1 | 0.1951 | 0.2347 | #000099 | sh_2189_d107 |
hs_pca_inf_lqcf_cbdonor$variance
## [1] 22.56 16.20 12.69 11.00 5.57 4.77 4.59 4.25 3.21 2.91 2.68 2.53 2.32 2.24
## [15] 2.13 0.22 0.14
write.csv(hs_pca_inf_lqcf_cbdonor$pcatable, file="csv/infection_nouninfected_batch_patient.csv")
hs_inf_pcainfo <- pca_information(hs_inf, plot_pcas=TRUE,
expt_factors=c("condition", "batch", "pathogenstrain",
"state", "donor", "rnangul"))
## More shallow curves in these plots suggest more genes in this principle component.
Look for significant correlations between the PCs and some factors in the experimental design.
hs_inf_pcainfo$anova_fstat_heatmap
hs_inf_pcainfo$pca_plots$PC2_PC6$plot
Here we will set the batch to the humansite strains and condition to a combination of the patient and state state; then perform the pca again.
new_condition <- paste0(hs_inf$design$state, '_', hs_inf$design$donor)
hs_inf_strbatch <- set_expt_factors(hs_inf, batch="pathogenstrain", condition=new_condition)
hs_inf_lqcf_cbstr <- sm(normalize_expt(hs_inf_strbatch, transform="log2", convert="cpm",
norm="quant", filter=TRUE, batch="combat_scale"))
hs_inf_lqcf_cbstr_pca <- plot_pca(hs_inf_lqcf_cbstr)
hs_inf_lqcf_cbstr_pca$plot
## Doing that kind of sucked the variance out of the data, but it did cause the samples to split by strain quite strongly
knitr::kable(hs_inf_lqcf_cbstr_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
chr_5430_d108 | chr_5430_d108 | chronic_d108 | s5430 | 6 | -0.2389 | -0.3389 | #1B9E77 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chronic_d108 | s5397 | 5 | -0.2331 | -0.3252 | #1B9E77 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chronic_d108 | s2504 | 4 | -0.2282 | -0.3041 | #1B9E77 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | self_heal_d108 | s2272 | 3 | 0.1600 | -0.3724 | #D95F02 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | self_heal_d108 | s1022 | 1 | 0.1783 | -0.3036 | #D95F02 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | self_heal_d108 | s2189 | 2 | 0.1795 | -0.3411 | #D95F02 | sh_2189_d108 |
chr_5430_d110 | chr_5430_d110 | chronic_d110 | s5430 | 6 | -0.0390 | 0.1866 | #7570B3 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chronic_d110 | s5397 | 5 | -0.0410 | 0.1663 | #7570B3 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chronic_d110 | s2504 | 4 | -0.0520 | 0.1531 | #7570B3 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | self_heal_d110 | s2272 | 3 | 0.3676 | 0.1360 | #E7298A | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | self_heal_d110 | s1022 | 1 | 0.3540 | 0.1337 | #E7298A | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | self_heal_d110 | s2189 | 2 | 0.3670 | 0.1323 | #E7298A | sh_2189_d110 |
chr_5430_d107 | chr_5430_d107 | chronic_d107 | s5430 | 6 | -0.3328 | 0.1979 | #66A61E | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chronic_d107 | s5397 | 5 | -0.3338 | 0.2137 | #66A61E | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chronic_d107 | s2504 | 4 | -0.3277 | 0.2162 | #66A61E | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | self_heal_d107 | s2272 | 3 | 0.0812 | 0.1698 | #E6AB02 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | self_heal_d107 | s1022 | 1 | 0.0747 | 0.1251 | #E6AB02 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | self_heal_d107 | s2189 | 2 | 0.0642 | 0.1545 | #E6AB02 | sh_2189_d107 |
hs_inf_lqcf_cbstr_pca$variance
## [1] 87.37 8.22 0.88 0.69 0.56 0.53 0.43 0.35 0.27 0.24 0.23 0.21 0.01 0.00
## [15] 0.00 0.00 0.00
write.csv(hs_inf_lqcf_cbstr_pca$pcatable, file="csv/infection_nouninfected_batch_strain.csv")
Now change only the condition to self/chronic and make super-explicit the split in the samples.
hs_inf_lqcf_cbstrv2 <- set_expt_condition(hs_inf_lqcf_cbstr, fact="state")
hs_inf_lqcf_cbstrv2 <- set_expt_colors(hs_inf_lqcf_cbstrv2, colors=c("#880000","#000088"))
hs_inf_lqcf_cbstrv2_pca <- plot_pca(hs_inf_lqcf_cbstrv2)
hs_inf_lqcf_cbstrv2_pca$plot
## Thus 3 runs of chronic on the right and self-state on the left
knitr::kable(hs_inf_lqcf_cbstrv2_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
chr_5430_d108 | chr_5430_d108 | chronic | s5430 | 6 | -0.2389 | -0.3389 | #880000 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chronic | s5397 | 5 | -0.2331 | -0.3252 | #880000 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chronic | s2504 | 4 | -0.2282 | -0.3041 | #880000 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | self_heal | s2272 | 3 | 0.1600 | -0.3724 | #000088 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | self_heal | s1022 | 1 | 0.1783 | -0.3036 | #000088 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | self_heal | s2189 | 2 | 0.1795 | -0.3411 | #000088 | sh_2189_d108 |
chr_5430_d110 | chr_5430_d110 | chronic | s5430 | 6 | -0.0390 | 0.1866 | #880000 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chronic | s5397 | 5 | -0.0410 | 0.1663 | #880000 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chronic | s2504 | 4 | -0.0520 | 0.1531 | #880000 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | self_heal | s2272 | 3 | 0.3676 | 0.1360 | #000088 | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | self_heal | s1022 | 1 | 0.3540 | 0.1337 | #000088 | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | self_heal | s2189 | 2 | 0.3670 | 0.1323 | #000088 | sh_2189_d110 |
chr_5430_d107 | chr_5430_d107 | chronic | s5430 | 6 | -0.3328 | 0.1979 | #880000 | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chronic | s5397 | 5 | -0.3338 | 0.2137 | #880000 | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chronic | s2504 | 4 | -0.3277 | 0.2162 | #880000 | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | self_heal | s2272 | 3 | 0.0812 | 0.1698 | #000088 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | self_heal | s1022 | 1 | 0.0747 | 0.1251 | #000088 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | self_heal | s2189 | 2 | 0.0642 | 0.1545 | #000088 | sh_2189_d107 |
For the next few blocks we will just repeat what we did but include the uninfected samples. Ideally doing so will have ~0 effect on the positions of the sample types.
In this first example, we see why the uninfected samples were initially removed from the analyses I think.
## Start with the non-uninfected, no batch correction
hs_uninf_lqcf <- sm(normalize_expt(hs_uninf, filter=TRUE, convert="cpm",
transform="log2", norm="quant"))
hs_uninf_lqcf_pca <- plot_pca(hs_uninf_lqcf)
hs_uninf_lqcf_pca$plot
## In this case, the uninfected samples cause the p107/p108 samples to smoosh together
knitr::kable(hs_uninf_lqcf_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
uninf_d108 | uninf_d108 | uninf | d108 | 2 | -0.1192 | -0.4892 | #009900 | uninf_d108 |
chr_5430_d108 | chr_5430_d108 | chr | d108 | 2 | 0.1684 | 0.0236 | #990000 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chr | d108 | 2 | 0.1455 | 0.0326 | #990000 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chr | d108 | 2 | 0.0704 | 0.0260 | #990000 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | sh | d108 | 2 | 0.1907 | -0.0507 | #000099 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | sh | d108 | 2 | 0.0678 | 0.0404 | #000099 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | sh | d108 | 2 | 0.1270 | 0.0982 | #000099 | sh_2189_d108 |
uninf_d110 | uninf_d110 | uninf | d110 | 3 | -0.2896 | -0.4798 | #009900 | uninf_d110 |
chr_5430_d110 | chr_5430_d110 | chr | d110 | 3 | -0.2280 | 0.2430 | #990000 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chr | d110 | 3 | -0.2350 | 0.2643 | #990000 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chr | d110 | 3 | -0.3191 | 0.0913 | #990000 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | sh | d110 | 3 | -0.2469 | 0.2080 | #000099 | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | sh | d110 | 3 | -0.3272 | 0.1504 | #000099 | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | sh | d110 | 3 | -0.2999 | 0.1656 | #000099 | sh_2189_d110 |
uninf_d107 | uninf_d107 | uninf | d107 | 1 | -0.0682 | -0.5184 | #009900 | uninf_d107 |
chr_5430_d107 | chr_5430_d107 | chr | d107 | 1 | 0.2617 | 0.0056 | #990000 | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chr | d107 | 1 | 0.2621 | 0.0748 | #990000 | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chr | d107 | 1 | 0.1763 | -0.0055 | #990000 | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | sh | d107 | 1 | 0.2601 | 0.0889 | #000099 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | sh | d107 | 1 | 0.1732 | 0.0381 | #000099 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | sh | d107 | 1 | 0.2298 | -0.0072 | #000099 | sh_2189_d107 |
hs_uninf_lqcf_pca$variance
## [1] 33.49 29.97 16.66 4.29 2.63 2.42 1.73 1.32 1.07 0.86 0.81 0.70 0.64 0.59
## [15] 0.58 0.50 0.48 0.45 0.43 0.39
write.csv(hs_uninf_lqcf_pca$pcatable, file="csv/infection_withuninfected_with_batch.csv")
For the second iteration, use the same normalization, but add a combat correction in an attempt to minimize patient’s effect in the variance.
hs_uninf_lqcf_cbdonor <- sm(normalize_expt(hs_uninf_lqcf, batch="combat_scale"))
## Here the split is semi chronic/self-state, but not quite
hs_uninf_lqcf_cbdonor_pca <- plot_pca(hs_uninf_lqcf_cbdonor)
hs_uninf_lqcf_cbdonor_pca$plot
## Now we have weak sehumantion between strains, I thought for a moment it might be few snps vs. many but that is not true.
## There are 2 sh and 1 chr on the right vs. 2 chr and 1 sh on the left
knitr::kable(hs_uninf_lqcf_cbdonor_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
uninf_d108 | uninf_d108 | uninf | d108 | 2 | -0.5109 | 0.0691 | #009900 | uninf_d108 |
chr_5430_d108 | chr_5430_d108 | chr | d108 | 2 | 0.0948 | -0.1729 | #990000 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chr | d108 | 2 | 0.0967 | -0.1735 | #990000 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chr | d108 | 2 | 0.0583 | 0.2280 | #990000 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | sh | d108 | 2 | 0.0315 | -0.3142 | #000099 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | sh | d108 | 2 | 0.0758 | 0.1310 | #000099 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | sh | d108 | 2 | 0.1524 | 0.0825 | #000099 | sh_2189_d108 |
uninf_d110 | uninf_d110 | uninf | d110 | 3 | -0.5201 | -0.5099 | #009900 | uninf_d110 |
chr_5430_d110 | chr_5430_d110 | chr | d110 | 3 | 0.1648 | 0.0014 | #990000 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chr | d110 | 3 | 0.1838 | -0.0382 | #990000 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chr | d110 | 3 | 0.0033 | 0.0840 | #990000 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | sh | d110 | 3 | 0.1039 | 0.0669 | #000099 | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | sh | d110 | 3 | 0.0191 | 0.3463 | #000099 | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | sh | d110 | 3 | 0.0438 | 0.2337 | #000099 | sh_2189_d110 |
uninf_d107 | uninf_d107 | uninf | d107 | 1 | -0.5332 | 0.3405 | #009900 | uninf_d107 |
chr_5430_d107 | chr_5430_d107 | chr | d107 | 1 | 0.0888 | -0.2790 | #990000 | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chr | d107 | 1 | 0.1458 | -0.1660 | #990000 | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chr | d107 | 1 | 0.0452 | 0.0279 | #990000 | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | sh | d107 | 1 | 0.1430 | -0.1872 | #000099 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | sh | d107 | 1 | 0.0693 | 0.2363 | #000099 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | sh | d107 | 1 | 0.0438 | -0.0066 | #000099 | sh_2189_d107 |
hs_uninf_lqcf_cbdonor_pca$variance
## [1] 58.94 11.07 5.71 4.54 3.47 2.50 1.78 1.58 1.46 1.31 1.19 1.11 1.05 0.93
## [15] 0.85 0.79 0.77 0.67 0.15 0.13
write.csv(hs_uninf_lqcf_cbdonor_pca$pcatable, file="csv/infection_withuninfected_batch_patient.csv")
Including the uninfected samples and changing the condition should not much matter
## Here we will set the batch to the humansite strains and condition to a
## combination of the patient and state state; then perform the pca.
new_condition <- paste0(hs_uninf$design$state, '_', hs_uninf$design$donor)
hs_uninfv2 <- set_expt_factors(hs_uninf, condition=new_condition, batch="pathogenstrain")
hs_uninfv2_lqcf_cbstr <- sm(normalize_expt(hs_uninfv2, transform="log2", convert="cpm",
norm="quant", filter=TRUE, batch="combat_scale"))
hs_uninfv2_lqcf_cbstr_pca <- plot_pca(hs_uninfv2_lqcf_cbstr)
hs_uninfv2_lqcf_cbstr_pca$plot
## This is a surprise to me, I would have expected the uninfected to still push
## the other samples off to a side or somesuch.
knitr::kable(hs_uninfv2_lqcf_cbstr_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
uninf_d108 | uninf_d108 | uninfected_d108 | none | 1 | -0.3841 | -0.2182 | #1B9E77 | uninf_d108 |
chr_5430_d108 | chr_5430_d108 | chronic_d108 | s5430 | 7 | 0.2235 | -0.3225 | #C16610 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chronic_d108 | s5397 | 6 | 0.2205 | -0.3165 | #C16610 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chronic_d108 | s2504 | 5 | 0.2179 | -0.3007 | #C16610 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | self_heal_d108 | s2272 | 4 | -0.0450 | -0.3487 | #8D6B86 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | self_heal_d108 | s1022 | 2 | -0.0614 | -0.2824 | #8D6B86 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | self_heal_d108 | s2189 | 3 | -0.0613 | -0.3120 | #8D6B86 | sh_2189_d108 |
uninf_d110 | uninf_d110 | uninfected_d110 | none | 1 | -0.4535 | -0.0394 | #BC4399 | uninf_d110 |
chr_5430_d110 | chr_5430_d110 | chronic_d110 | s5430 | 7 | 0.0834 | 0.1983 | #A66753 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chronic_d110 | s5397 | 6 | 0.0847 | 0.1813 | #A66753 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chronic_d110 | s2504 | 5 | 0.0894 | 0.1639 | #A66753 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | self_heal_d110 | s2272 | 4 | -0.1892 | 0.1684 | #96A713 | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | self_heal_d110 | s1022 | 2 | -0.1827 | 0.1791 | #96A713 | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | self_heal_d110 | s2189 | 3 | -0.1912 | 0.1702 | #96A713 | sh_2189_d110 |
uninf_d107 | uninf_d107 | uninfected_d107 | none | 1 | -0.3103 | 0.1750 | #D59D08 | uninf_d107 |
chr_5430_d107 | chr_5430_d107 | chronic_d107 | s5430 | 7 | 0.2977 | 0.1540 | #9D7426 | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chronic_d107 | s5397 | 6 | 0.2994 | 0.1704 | #9D7426 | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chronic_d107 | s2504 | 5 | 0.2953 | 0.1784 | #9D7426 | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | self_heal_d107 | s2272 | 4 | 0.0174 | 0.1420 | #666666 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | self_heal_d107 | s1022 | 2 | 0.0204 | 0.1237 | #666666 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | self_heal_d107 | s2189 | 3 | 0.0289 | 0.1357 | #666666 | sh_2189_d107 |
hs_uninfv2_lqcf_cbstr_pca$variance
## [1] 90.94 5.44 0.79 0.58 0.35 0.34 0.29 0.27 0.20 0.18 0.16 0.15 0.14 0.13
## [15] 0.02 0.01 0.01 0.00 0.00 0.00
write.csv(hs_uninfv2_lqcf_cbstr_pca$pcatable, file="csv/infection_withuninfected_batch_strain.csv")
hs_uninfv2_lqcf_cbstr <- set_expt_condition(hs_uninfv2_lqcf_cbstr, fact="state")
hs_uninfv2_lqcf_cbstr <- set_expt_colors(hs_uninfv2_lqcf_cbstr, colors=c("#880000","#000088","#008800"))
hs_uninfv2_lqcf_cbstr_pca <- plot_pca(hs_uninfv2_lqcf_cbstr)
hs_uninfv2_lqcf_cbstr_pca$plot
knitr::kable(hs_uninfv2_lqcf_cbstr_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
uninf_d108 | uninf_d108 | uninfected | none | 1 | -0.3841 | -0.2182 | #008800 | uninf_d108 |
chr_5430_d108 | chr_5430_d108 | chronic | s5430 | 7 | 0.2235 | -0.3225 | #880000 | chr_5430_d108 |
chr_5397_d108 | chr_5397_d108 | chronic | s5397 | 6 | 0.2205 | -0.3165 | #880000 | chr_5397_d108 |
chr_2504_d108 | chr_2504_d108 | chronic | s2504 | 5 | 0.2179 | -0.3007 | #880000 | chr_2504_d108 |
sh_2272_d108 | sh_2272_d108 | self_heal | s2272 | 4 | -0.0450 | -0.3487 | #000088 | sh_2272_d108 |
sh_1022_d108 | sh_1022_d108 | self_heal | s1022 | 2 | -0.0614 | -0.2824 | #000088 | sh_1022_d108 |
sh_2189_d108 | sh_2189_d108 | self_heal | s2189 | 3 | -0.0613 | -0.3120 | #000088 | sh_2189_d108 |
uninf_d110 | uninf_d110 | uninfected | none | 1 | -0.4535 | -0.0394 | #008800 | uninf_d110 |
chr_5430_d110 | chr_5430_d110 | chronic | s5430 | 7 | 0.0834 | 0.1983 | #880000 | chr_5430_d110 |
chr_5397_d110 | chr_5397_d110 | chronic | s5397 | 6 | 0.0847 | 0.1813 | #880000 | chr_5397_d110 |
chr_2504_d110 | chr_2504_d110 | chronic | s2504 | 5 | 0.0894 | 0.1639 | #880000 | chr_2504_d110 |
sh_2272_d110 | sh_2272_d110 | self_heal | s2272 | 4 | -0.1892 | 0.1684 | #000088 | sh_2272_d110 |
sh_1022_d110 | sh_1022_d110 | self_heal | s1022 | 2 | -0.1827 | 0.1791 | #000088 | sh_1022_d110 |
sh_2189_d110 | sh_2189_d110 | self_heal | s2189 | 3 | -0.1912 | 0.1702 | #000088 | sh_2189_d110 |
uninf_d107 | uninf_d107 | uninfected | none | 1 | -0.3103 | 0.1750 | #008800 | uninf_d107 |
chr_5430_d107 | chr_5430_d107 | chronic | s5430 | 7 | 0.2977 | 0.1540 | #880000 | chr_5430_d107 |
chr_5397_d107 | chr_5397_d107 | chronic | s5397 | 6 | 0.2994 | 0.1704 | #880000 | chr_5397_d107 |
chr_2504_d107 | chr_2504_d107 | chronic | s2504 | 5 | 0.2953 | 0.1784 | #880000 | chr_2504_d107 |
sh_2272_d107 | sh_2272_d107 | self_heal | s2272 | 4 | 0.0174 | 0.1420 | #000088 | sh_2272_d107 |
sh_1022_d107 | sh_1022_d107 | self_heal | s1022 | 2 | 0.0204 | 0.1237 | #000088 | sh_1022_d107 |
sh_2189_d107 | sh_2189_d107 | self_heal | s2189 | 3 | 0.0289 | 0.1357 | #000088 | sh_2189_d107 |
As per a conversation with Maria Adelaida on skype, lets remove all samples except those for one patient, then see if some aspect of the data jumps out (strain:strain variation, for example)
single_patient <- subset_expt(hs_inf, subset="donor=='d107'")
single_patient <- set_expt_batch(single_patient, fact="state")
single_norm <- sm(normalize_expt(single_patient, transform="log2", norm="quant",
convert="cpm", filter=TRUE))
single_norm_pca <- plot_pca(single_norm)
single_norm_pca$plot
single_patient <- subset_expt(hs_inf, subset="donor=='d108'")
single_patient <- set_expt_batch(single_patient, fact="state")
single_norm <- sm(normalize_expt(single_patient, transform="log2", norm="quant",
convert="cpm", filter=TRUE))
single_norm_pca <- plot_pca(single_norm)
single_norm_pca$plot
single_patient <- subset_expt(hs_inf, subset="donor=='d110'")
single_patient <- set_expt_batch(single_patient, fact="state")
single_norm <- sm(normalize_expt(single_patient, transform="log2", norm="quant",
convert="cpm", filter=TRUE))
single_norm_pca <- plot_pca(single_norm)
single_norm_pca$plot
## hmmm so the answer is, no -- I think.
knitr::kable(single_norm_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
chr_5430_d110 | HPGL0651 | chr | chronic | 1 | -0.4369 | 0.2326 | #990000 | chr_5430_d110 |
chr_5397_d110 | HPGL0652 | chr | chronic | 1 | -0.5379 | -0.1610 | #990000 | chr_5397_d110 |
chr_2504_d110 | HPGL0653 | chr | chronic | 1 | 0.2072 | -0.8469 | #990000 | chr_2504_d110 |
sh_2272_d110 | HPGL0654 | sh | self_heal | 2 | -0.1253 | 0.2510 | #000099 | sh_2272_d110 |
sh_1022_d110 | HPGL0655 | sh | self_heal | 2 | 0.6234 | 0.2280 | #000099 | sh_1022_d110 |
sh_2189_d110 | HPGL0656 | sh | self_heal | 2 | 0.2694 | 0.2963 | #000099 | sh_2189_d110 |
In our previous discussion, Hector suggested that sample ‘HPGL0635’ is sufficiently dis-similar to its cohort samples that it might actually be a member of strain ‘2504’ rather than ‘1022’. Let us look and see what happens if that is changed.
I am going to leave out the uninfected samples to avoid the confusion they generate.
hs_lqcf_noswitch <- sm(normalize_expt(hs_inf, transform="log2", convert="cpm",
norm="quant", filter=TRUE))
plot_pca(hs_lqcf_noswitch)$plot
## This is just to note that the original color for sample 635 was orange to match strain '1022'
##switch_one <- set_expt_condition(no_uninfected, ids=c("sHPGL0635"), fact="ch2504")
switch_one <- set_expt_condition(hs_inf, ids=c("sh_1022_d108"), fact="chr")
switcher <- list("sh_1022_d108" = "pink")
switch_one <- set_expt_colors(expt=switch_one, colors=switcher, change_by="sample")
lqcf_switch <- sm(normalize_expt(switch_one, transform="log2", convert="cpm", norm="quant",
filter=TRUE, batch="combat_scale"))
plot_pca(lqcf_switch)$plot
## Note that now it is pink, matching 'ch2504'
I may be biased, but I think this suggests that the samples were not switched.
One query was to see if there is a reversal of two samples.
combined_condition <- paste0(hs_inf$design$state, '_', hs_inf$design$donor)
##with_uninfected_combined <- set_expt_factors(with_uninfected, batch="pathogenstrain", condition=combined_condition)
hs_inf_combined <- set_expt_factors(hs_inf, batch="donor", condition="state")
head(exprs(normalize_expt(hs_inf, convert="cpm", filter=TRUE)))
## This function will replace the expt$expressionset slot with:
## cpm(hpgl(data))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 37484 low-count genes (13557 remaining).
## Step 2: not normalizing the data.
## Step 3: converting the data with cpm.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
## chr_5430_d108 chr_5397_d108 chr_2504_d108 sh_2272_d108 sh_1022_d108
## ENSG00000000419 19.929 19.091 16.926 19.812 17.835
## ENSG00000000457 25.969 30.204 22.447 29.718 23.492
## ENSG00000000460 13.890 11.024 7.846 10.340 9.205
## ENSG00000000938 531.151 522.436 385.306 403.883 401.960
## ENSG00000000971 4.429 4.929 4.359 4.605 4.698
## ENSG00000001036 38.852 37.195 38.792 35.627 40.369
## sh_2189_d108 chr_5430_d110 chr_5397_d110 chr_2504_d110 sh_2272_d110
## ENSG00000000419 17.854 16.113 17.212 15.027 13.011
## ENSG00000000457 28.000 18.311 20.261 20.135 20.096
## ENSG00000000460 8.195 7.263 7.115 6.534 4.638
## ENSG00000000938 435.812 618.598 552.883 329.412 460.340
## ENSG00000000971 3.415 2.564 1.762 3.029 2.834
## ENSG00000001036 44.001 46.021 39.303 35.935 37.358
## sh_1022_d110 sh_2189_d110 chr_5430_d107 chr_5397_d107 chr_2504_d107
## ENSG00000000419 12.678 15.247 16.314 14.825 13.676
## ENSG00000000457 17.554 18.090 27.494 26.042 24.837
## ENSG00000000460 4.632 4.458 9.925 8.472 5.738
## ENSG00000000938 477.464 452.119 461.239 332.276 280.359
## ENSG00000000971 1.544 2.972 4.221 7.844 5.816
## ENSG00000001036 35.596 35.921 31.715 23.611 24.758
## sh_2272_d107 sh_1022_d107 sh_2189_d107
## ENSG00000000419 16.461 14.623 15.871
## ENSG00000000457 28.395 24.614 27.970
## ENSG00000000460 6.349 7.448 6.903
## ENSG00000000938 348.559 391.470 298.665
## ENSG00000000971 6.584 3.724 3.808
## ENSG00000001036 27.043 30.518 20.995
combined_pca1 <- sm(normalize_expt(hs_inf_combined, filter=TRUE, batch="pca", convert="cpm"))
combined_pca1 <- set_expt_colors(combined_pca1, colors=c("#880000", "#000088"))
plot_pca(sm(normalize_expt(combined_pca1, filter=TRUE, transform="log2",
convert="cpm", norm="quant")))$plot
combined_pca2 <- set_expt_factors(combined_pca1, batch="pathogenstrain", condition="state")
combined_pca2 <- set_expt_colors(combined_pca2, colors=c("#880000", "#000088"))
combined_pca3 <- sm(normalize_expt(combined_pca2, filter=TRUE, batch="pca"))
l2cq_combined_pca3 <- sm(normalize_expt(combined_pca3, filter=TRUE, transform="log2",
convert="cpm", norm="quant"))
plot_pca(l2cq_combined_pca3)$plot
donor_strain_varpart <- sm(varpart(expt=hs_inf, predictor=NULL,
factors=c("condition","pathogenstrain","donor")))
donor_strain_varpart$percent_plot
pp(file="images/varpart_donor_strain.png")
donor_strain_varpart$partition_plot
dev.off()
## png
## 2
pp(file="images/varpart_donor_strain_pct.png")
replot_varpart_percent(donor_strain_varpart, n=40)
dev.off()
## png
## 2
sorted <- donor_strain_varpart$sorted_df
The experimental design does not fully supprt interaction models, but I want to see how it looks.
test_data <- sm(normalize_expt(hs_inf_combined, convert="cpm", norm="quant", filter=TRUE))
query_model_string <- "~ condition:pathogenstrain + donor"
query_design <- hs_inf_combined[["design"]]
query_conditions <- as.factor(query_design[["condition"]])
##query_batches <- as.factor(query_design[["anotherbatch"]])
query_batches <- as.factor(x=c("a","a","a","a","a","a","b","b","b","b","b","b","c","c","c","c","c","c"))
query_strains <- as.factor(query_design[["pathogenstrain"]])
query_donors <- as.factor(query_design[["donor"]])
data_mtrx <- as.data.frame(exprs(test_data))
query_model <- model.matrix(~ 0 + query_conditions + query_donors + query_strains, data=query_design)
combined_voom <- limma::voom(counts=data_mtrx, design=query_model, normalize.method="quantile")
## Coefficients not estimable: query_strainss5430
## Warning: Partial NA coefficients for 13557 probe(s)
combined_fit <- limma::lmFit(combined_voom, query_model, robust=TRUE)
## Coefficients not estimable: query_strainss5430
## Warning: Partial NA coefficients for 13557 probe(s)
combined_contrast <- limma::makeContrasts(
chsh=query_conditionschronic-query_conditionsself_heal,
levels=query_model)
combined_cfit <- limma::contrasts.fit(combined_fit, combined_contrast)
combined_bayes <- limma::eBayes(combined_cfit, robust=TRUE)
combined_table <- limma::topTable(combined_bayes, number=nrow(combined_bayes), resort.by="logFC")
hist(combined_table$adj.P.Val)
min(combined_table$adj.P.Val)
## [1] 1.331e-08
test_ma <- plot_ma_de(table=combined_table, expr_col="AveExpr", fc_col="logFC", p_col="adj.P.Val", logfc_cutoff=0.6)
test_ma$plot
head(combined_table)
## logFC AveExpr t P.Value adj.P.Val B
## ENSG00000125144 2.700 2.2519 13.756 2.366e-11 1.604e-07 14.1458
## ENSG00000122641 2.257 4.8639 11.096 1.584e-08 5.368e-05 9.8081
## ENSG00000166923 2.247 1.3688 5.079 2.397e-04 2.405e-02 0.7976
## ENSG00000137673 2.145 4.2251 6.272 5.686e-05 9.491e-03 2.2119
## ENSG00000205362 2.076 -0.3415 10.307 3.103e-09 1.402e-05 8.8525
## ENSG00000113657 1.934 5.6923 5.621 1.689e-04 2.009e-02 1.1213
“Changes during infection hpgl0630-0636 and hpgl0650-hpgl0663”
Start out by creating the expt and poking at it to see how well/badly behaved the data is.
## Reread the sample sheet because I am fiddling with other possible surrogates (like strain)
## In fact, copy it to a separate sheet because these samples are a mess
lp_inf <- subset_expt(parasite_expt, subset="experimentname=='infection'")
chosen_colors <- c("#990000", "#000099")
names(chosen_colors) <- c("chr","sh")
lp_inf <- set_expt_colors(lp_inf, colors=chosen_colors)
##lp_inf <- expt_exclude_genes(lp_inf, column="type")
The following creates all the metric plots of the raw data.
lp_inf_metrics <- sm(graph_metrics(lp_inf))
Now visualize some relevant metrics.
## Repeat for the parasite
lp_inf_metrics$libsize
## Wow, the range of coverage is shockingly large
lp_inf_metrics$density
## But this looks much better I think
lp_inf_metrics$boxplot
lp_inf_written <- sm(write_expt(lp_inf,
excel=paste0("excel/infection_parasite_data-v", ver, ".xlsx"),
violin=TRUE))
Now perform the ‘default’ normalization we use in the lab and look again.
lp_inf_norm <- sm(normalize_expt(lp_inf, convert="cpm", filter=TRUE, norm="quant"))
lp_inf_norm_metrics <- sm(graph_metrics(lp_inf_norm))
In this section, try out some normalizations/batch corrections and see the effect in PCA plots.
Start out by taking the parasite data and doing the default normalization and see what there is to see.
lp_l2qcpm <- sm(normalize_expt(lp_inf, transform="log2", convert="cpm",
norm="quant", filter=TRUE))
lp_l2qcpm_pca <- sm(plot_pca(lp_l2qcpm))
lp_l2qcpm_pca$plot
## Though the colors don't show it well, the samples are actually split beautifully by strain, but
## clearly not by chronic/healing
knitr::kable(lp_l2qcpm_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
hpgl0631 | HPGL0631 | chr | d108 | 2 | -0.4988 | -0.1901 | #990000 | HPGL0631 |
hpgl0632 | HPGL0632 | chr | d108 | 2 | 0.1617 | -0.1427 | #990000 | HPGL0632 |
hpgl0633 | HPGL0633 | chr | d108 | 2 | -0.0137 | 0.2963 | #990000 | HPGL0633 |
hpgl0634 | HPGL0634 | sh | d108 | 2 | 0.1339 | -0.2135 | #000099 | HPGL0634 |
hpgl0635 | HPGL0635 | sh | d108 | 2 | -0.0474 | 0.3478 | #000099 | HPGL0635 |
hpgl0636 | HPGL0636 | sh | d108 | 2 | 0.1801 | -0.1622 | #000099 | HPGL0636 |
hpgl0651 | HPGL0651 | chr | d110 | 3 | -0.4560 | -0.1643 | #990000 | HPGL0651 |
hpgl0652 | HPGL0652 | chr | d110 | 3 | 0.2050 | -0.0989 | #990000 | HPGL0652 |
hpgl0653 | HPGL0653 | chr | d110 | 3 | 0.0627 | 0.3543 | #990000 | HPGL0653 |
hpgl0654 | HPGL0654 | sh | d110 | 3 | 0.1987 | -0.1988 | #000099 | HPGL0654 |
hpgl0655 | HPGL0655 | sh | d110 | 3 | -0.0084 | 0.3517 | #000099 | HPGL0655 |
hpgl0656 | HPGL0656 | sh | d110 | 3 | 0.2129 | -0.1374 | #000099 | HPGL0656 |
hpgl0658 | HPGL0658 | chr | d107 | 1 | -0.5060 | -0.1996 | #990000 | HPGL0658 |
hpgl0659 | HPGL0659 | chr | d107 | 1 | 0.1391 | -0.1387 | #990000 | HPGL0659 |
hpgl0660 | HPGL0660 | chr | d107 | 1 | -0.0029 | 0.3053 | #990000 | HPGL0660 |
hpgl0661 | HPGL0661 | sh | d107 | 1 | 0.1403 | -0.1908 | #000099 | HPGL0661 |
hpgl0662 | HPGL0662 | sh | d107 | 1 | -0.0800 | 0.3288 | #000099 | HPGL0662 |
hpgl0663 | HPGL0663 | sh | d107 | 1 | 0.1788 | -0.1472 | #000099 | HPGL0663 |
lp_l2qcpm_tsne <- plot_tsne(lp_l2qcpm)
lp_l2qcpm_tsne$plot
##tt <- plot_tsne(lp_l2qcpm)
##ttt <- plot_tsne_genes(lp_l2qcpm)
Now repeat the same thing, but let sva minimize surrogate variables.
lp_l2qcpm_normbatch <- sm(normalize_expt(lp_inf, transform="log2", convert="cpm",
norm="quant", filter=TRUE, batch="sva"))
lp_l2qcpm_normbatch_pca <- plot_pca(lp_l2qcpm_normbatch)
Now plot the result and see if things make more sense.
lp_l2qcpm_normbatch_pca$plot
Adding SVA to the normalization does not help much.
## That does nothing significant to clarify things.
knitr::kable(lp_l2qcpm_normbatch_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
hpgl0631 | HPGL0631 | chr | d108 | 2 | -0.3179 | -0.0846 | #990000 | HPGL0631 |
hpgl0632 | HPGL0632 | chr | d108 | 2 | 0.5265 | 0.0599 | #990000 | HPGL0632 |
hpgl0633 | HPGL0633 | chr | d108 | 2 | -0.1376 | 0.0246 | #990000 | HPGL0633 |
hpgl0634 | HPGL0634 | sh | d108 | 2 | -0.1523 | 0.3362 | #000099 | HPGL0634 |
hpgl0635 | HPGL0635 | sh | d108 | 2 | -0.1448 | -0.0992 | #000099 | HPGL0635 |
hpgl0636 | HPGL0636 | sh | d108 | 2 | -0.1758 | -0.1116 | #000099 | HPGL0636 |
hpgl0651 | HPGL0651 | chr | d110 | 3 | 0.0806 | 0.0665 | #990000 | HPGL0651 |
hpgl0652 | HPGL0652 | chr | d110 | 3 | -0.2465 | 0.0111 | #990000 | HPGL0652 |
hpgl0653 | HPGL0653 | chr | d110 | 3 | -0.1124 | 0.0524 | #990000 | HPGL0653 |
hpgl0654 | HPGL0654 | sh | d110 | 3 | -0.2810 | 0.0921 | #000099 | HPGL0654 |
hpgl0655 | HPGL0655 | sh | d110 | 3 | 0.2511 | 0.0681 | #000099 | HPGL0655 |
hpgl0656 | HPGL0656 | sh | d110 | 3 | 0.0701 | -0.4988 | #000099 | HPGL0656 |
hpgl0658 | HPGL0658 | chr | d107 | 1 | 0.0630 | 0.0351 | #990000 | HPGL0658 |
hpgl0659 | HPGL0659 | chr | d107 | 1 | -0.1255 | 0.2471 | #990000 | HPGL0659 |
hpgl0660 | HPGL0660 | chr | d107 | 1 | -0.1392 | 0.0171 | #990000 | HPGL0660 |
hpgl0661 | HPGL0661 | sh | d107 | 1 | 0.4089 | 0.3793 | #000099 | HPGL0661 |
hpgl0662 | HPGL0662 | sh | d107 | 1 | 0.2387 | 0.0189 | #000099 | HPGL0662 |
hpgl0663 | HPGL0663 | sh | d107 | 1 | 0.1941 | -0.6142 | #000099 | HPGL0663 |
No, not really, so lets change things by putting the ‘snp status’ as the “batch” factor and minimize it with sva/combat.
lp_infv2 <- set_expt_condition(lp_inf, fact="state")
lp_infv2 <- set_expt_batch(lp_infv2, fact="snpclade")
lp_l2qcpm_snpbatch_straincond_sva <- sm(normalize_expt(lp_infv2, norm="quant",
transform="log2",
filter=TRUE,
batch="fsva"))
lp_l2qcpm_snpbatch_straincond_pca <- plot_pca(lp_l2qcpm_snpbatch_straincond_sva)
lp_l2qcpm_snpbatch_straincond_pca$plot
SNP status does not clarify things.
## Pulling strain 5430 away from the others makes a semi-split
knitr::kable(lp_l2qcpm_snpbatch_straincond_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
hpgl0631 | HPGL0631 | chronic | red | 4 | -0.1994 | -0.1208 | #1B9E77 | HPGL0631 |
hpgl0632 | HPGL0632 | chronic | yellow | 6 | -0.1393 | 0.1087 | #1B9E77 | HPGL0632 |
hpgl0633 | HPGL0633 | chronic | blue_chronic | 1 | 0.2959 | 0.0232 | #1B9E77 | HPGL0633 |
hpgl0634 | HPGL0634 | self_heal | white | 5 | -0.2106 | 0.3701 | #7570B3 | HPGL0634 |
hpgl0635 | HPGL0635 | self_heal | blue_self | 2 | 0.3487 | 0.2949 | #7570B3 | HPGL0635 |
hpgl0636 | HPGL0636 | self_heal | pink | 3 | -0.1592 | 0.1489 | #7570B3 | HPGL0636 |
hpgl0651 | HPGL0651 | chronic | red | 4 | -0.1737 | -0.2974 | #1B9E77 | HPGL0651 |
hpgl0652 | HPGL0652 | chronic | yellow | 6 | -0.0958 | -0.3006 | #1B9E77 | HPGL0652 |
hpgl0653 | HPGL0653 | chronic | blue_chronic | 1 | 0.3521 | -0.5280 | #1B9E77 | HPGL0653 |
hpgl0654 | HPGL0654 | self_heal | white | 5 | -0.1951 | -0.1497 | #7570B3 | HPGL0654 |
hpgl0655 | HPGL0655 | self_heal | blue_self | 2 | 0.3522 | -0.0434 | #7570B3 | HPGL0655 |
hpgl0656 | HPGL0656 | self_heal | pink | 3 | -0.1332 | -0.1445 | #7570B3 | HPGL0656 |
hpgl0658 | HPGL0658 | chronic | red | 4 | -0.2090 | -0.1460 | #1B9E77 | HPGL0658 |
hpgl0659 | HPGL0659 | chronic | yellow | 6 | -0.1357 | 0.0831 | #1B9E77 | HPGL0659 |
hpgl0660 | HPGL0660 | chronic | blue_chronic | 1 | 0.3039 | -0.0353 | #1B9E77 | HPGL0660 |
hpgl0661 | HPGL0661 | self_heal | white | 5 | -0.1874 | 0.2024 | #7570B3 | HPGL0661 |
hpgl0662 | HPGL0662 | self_heal | blue_self | 2 | 0.3294 | 0.3415 | #7570B3 | HPGL0662 |
hpgl0663 | HPGL0663 | self_heal | pink | 3 | -0.1440 | 0.1928 | #7570B3 | HPGL0663 |
Ok, so let us remove the healing state with combat and see if that allows us to see a split on some other factor.
lp_inf_strain <- set_expt_condition(lp_inf, fact="pathogenstrain")
lp_inf_strain <- set_expt_batch(lp_inf_strain, fact="state")
lp_l2qcpm_strain <- sm(normalize_expt(lp_inf_strain, transform="log2", convert="cpm",
norm="quant", filter=TRUE, batch="combat_scale"))
lp_l2qcpm_strain_pca <- plot_pca(lp_l2qcpm_strain)
lp_l2qcpm_strain_pca$plot
hmm ok, I think I quit for today.
plot_tsne(lp_l2qcpm_strain)$plot
hmm ok, I think I quit for today.
## wtf!?!? how did this happen?
knitr::kable(lp_l2qcpm_strain_pca$table)
sampleid | condition | batch | batch_int | PC1 | PC2 | colors | labels | |
---|---|---|---|---|---|---|---|---|
hpgl0631 | HPGL0631 | s5430 | chronic | 1 | -0.4266 | 0.2780 | #1B9E77 | HPGL0631 |
hpgl0632 | HPGL0632 | s5397 | chronic | 1 | -0.1580 | -0.3384 | #D95F02 | HPGL0632 |
hpgl0633 | HPGL0633 | s2504 | chronic | 1 | -0.0378 | -0.0036 | #7570B3 | HPGL0633 |
hpgl0634 | HPGL0634 | s2272 | self_heal | 2 | 0.1090 | -0.0865 | #E7298A | HPGL0634 |
hpgl0635 | HPGL0635 | s1022 | self_heal | 2 | 0.2916 | 0.3432 | #66A61E | HPGL0635 |
hpgl0636 | HPGL0636 | s2189 | self_heal | 2 | 0.1751 | -0.1096 | #E6AB02 | HPGL0636 |
hpgl0651 | HPGL0651 | s5430 | chronic | 1 | -0.4043 | 0.2412 | #1B9E77 | HPGL0651 |
hpgl0652 | HPGL0652 | s5397 | chronic | 1 | -0.1295 | -0.3764 | #D95F02 | HPGL0652 |
hpgl0653 | HPGL0653 | s2504 | chronic | 1 | 0.0055 | -0.0715 | #7570B3 | HPGL0653 |
hpgl0654 | HPGL0654 | s2272 | self_heal | 2 | 0.1340 | -0.1603 | #E7298A | HPGL0654 |
hpgl0655 | HPGL0655 | s1022 | self_heal | 2 | 0.3060 | 0.2918 | #66A61E | HPGL0655 |
hpgl0656 | HPGL0656 | s2189 | self_heal | 2 | 0.1928 | -0.1450 | #E6AB02 | HPGL0656 |
hpgl0658 | HPGL0658 | s5430 | chronic | 1 | -0.4317 | 0.2803 | #1B9E77 | HPGL0658 |
hpgl0659 | HPGL0659 | s5397 | chronic | 1 | -0.1645 | -0.3141 | #D95F02 | HPGL0659 |
hpgl0660 | HPGL0660 | s2504 | chronic | 1 | -0.0336 | -0.0085 | #7570B3 | HPGL0660 |
hpgl0661 | HPGL0661 | s2272 | self_heal | 2 | 0.1228 | -0.0884 | #E7298A | HPGL0661 |
hpgl0662 | HPGL0662 | s1022 | self_heal | 2 | 0.2723 | 0.3717 | #66A61E | HPGL0662 |
hpgl0663 | HPGL0663 | s2189 | self_heal | 2 | 0.1770 | -0.1041 | #E6AB02 | HPGL0663 |
pander::pander(sessionInfo())
R version 3.4.1 (2017-06-30)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ruv(v.0.9.6) and hpgltools(v.2017.01)
loaded via a namespace (and not attached): nlme(v.3.1-131), pbkrtest(v.0.4-7), bitops(v.1.0-6), matrixStats(v.0.52.2), devtools(v.1.13.3), bit64(v.0.9-7), doParallel(v.1.0.10), RColorBrewer(v.1.1-2), rprojroot(v.1.2), GenomeInfoDb(v.1.12.2), tools(v.3.4.1), backports(v.1.1.0), R6(v.2.2.2), KernSmooth(v.2.23-15), DBI(v.0.7), lazyeval(v.0.2.0), BiocGenerics(v.0.22.0), mgcv(v.1.8-19), colorspace(v.1.3-2), withr(v.2.0.0), bit(v.1.1-12), compiler(v.3.4.1), preprocessCore(v.1.38.1), graph(v.1.54.0), Biobase(v.2.36.2), xml2(v.1.1.1), DelayedArray(v.0.2.7), rtracklayer(v.1.36.4), labeling(v.0.3), caTools(v.1.17.1), scales(v.0.5.0), genefilter(v.1.58.1), quadprog(v.1.5-5), RBGL(v.1.52.0), commonmark(v.1.2), stringr(v.1.2.0), digest(v.0.6.12), Rsamtools(v.1.28.0), minqa(v.1.2.4), colorRamps(v.2.3), variancePartition(v.1.6.0), rmarkdown(v.1.6), XVector(v.0.16.0), base64enc(v.0.1-3), htmltools(v.0.3.6), lme4(v.1.1-13), highr(v.0.6), limma(v.3.32.5), rlang(v.0.1.2), RSQLite(v.2.0), BiocInstaller(v.1.26.0), BiocParallel(v.1.10.1), gtools(v.3.5.0), RCurl(v.1.95-4.8), magrittr(v.1.5), GenomeInfoDbData(v.0.99.0), Matrix(v.1.2-11), Rcpp(v.0.12.12), munsell(v.0.4.3), S4Vectors(v.0.14.3), stringi(v.1.1.5), yaml(v.2.1.14), edgeR(v.3.18.1), MASS(v.7.3-47), SummarizedExperiment(v.1.6.3), zlibbioc(v.1.22.0), gplots(v.3.0.1), Rtsne(v.0.13), plyr(v.1.8.4), grid(v.3.4.1), blob(v.1.1.0), gdata(v.2.18.0), parallel(v.3.4.1), ggrepel(v.0.6.5), crayon(v.1.3.2), lattice(v.0.20-35), Biostrings(v.2.44.2), splines(v.3.4.1), pander(v.0.6.1), GenomicFeatures(v.1.28.4), annotate(v.1.54.0), locfit(v.1.5-9.1), knitr(v.1.17), GenomicRanges(v.1.28.4), corpcor(v.1.6.9), reshape2(v.1.4.2), codetools(v.0.2-15), biomaRt(v.2.32.1), stats4(v.3.4.1), XML(v.3.98-1.9), evaluate(v.0.10.1), data.table(v.1.10.4), nloptr(v.1.0.4), foreach(v.1.4.3), testthat(v.1.0.2), gtable(v.0.2.0), ggplot2(v.2.2.1), openxlsx(v.4.0.17), xtable(v.1.8-2), roxygen2(v.6.0.1), survival(v.2.41-3), tibble(v.1.3.4), OrganismDbi(v.1.18.0), iterators(v.1.0.8), GenomicAlignments(v.1.12.2), AnnotationDbi(v.1.38.2), memoise(v.1.1.0), IRanges(v.2.10.2), statmod(v.1.4.30), sva(v.3.24.4) and directlabels(v.2017.03.31)
message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 5394e0113e659a34b39090b8d3fec7bdda3f0377
## R> packrat::restore()
## This is hpgltools commit: Sat Aug 26 13:55:21 2017 -0400: 5394e0113e659a34b39090b8d3fec7bdda3f0377
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 02_estimation_infection-v20170820.rda.xz
tmp <- sm(saveme(filename=this_save))