1 Sample Estimation

Let us see what we can learn about our mystery samples!

1.1 Generate expressionsets

# Build an expressionset for the esmeraldo-assigned reads: sample metadata from
# the shared sample sheet, counts taken from the salmon files named in the
# "esmerfile" column (read via tximport, per the output below).
esmer <- create_expt("sample_sheets/all_samples.xlsx", file_column="esmerfile")
## Reading the sample metadata.
## The sample definitions comprises: 7 rows(samples) and 7 columns(metadata fields).
## Reading count tables.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 10473 annotations and counts.
## Bringing together the count matrix and gene information.
# Same sample sheet, but counts from the non-esmeraldo salmon quantifications
# ("nonesmerfile" column).
nonesmer <- create_expt("sample_sheets/all_samples.xlsx", file_column="nonesmerfile")
## Reading the sample metadata.
## The sample definitions comprises: 7 rows(samples) and 7 columns(metadata fields).
## Reading count tables.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 10958 annotations and counts.
## Bringing together the count matrix and gene information.
# Counts for the contigs not assigned to either haplotype ("unassignedfile");
# note the much smaller annotation match (2507 genes, per the output below).
unassigned <- create_expt("sample_sheets/all_samples.xlsx", file_column="unassignedfile")
## Reading the sample metadata.
## The sample definitions comprises: 7 rows(samples) and 7 columns(metadata fields).
## Reading count tables.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 2507 annotations and counts.
## Bringing together the count matrix and gene information.
# Whole-genome expressionset from the hisat2 count tables ("allhisatfile");
# these are plain count files read with read.table() rather than tximport.
all <- create_expt("sample_sheets/all_samples.xlsx", file_column="allhisatfile")
## Reading the sample metadata.
## The sample definitions comprises: 7 rows(samples) and 7 columns(metadata fields).
## Reading count tables.
## Reading count tables with read.table().
## /cbcb/nelsayed-scratch/atb/rnaseq/tcruzi_2019/preprocessing/hpgl1168/outputs/hisat2_tcruzi_all/forward.count.xz contains 25114 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/tcruzi_2019/preprocessing/hpgl1169/outputs/hisat2_tcruzi_all/forward.count.xz contains 25114 rows and merges to 25114 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/tcruzi_2019/preprocessing/hpgl1170/outputs/hisat2_tcruzi_all/forward.count.xz contains 25114 rows and merges to 25114 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/tcruzi_2019/preprocessing/hpgl1171/outputs/hisat2_tcruzi_all/forward.count.xz contains 25114 rows and merges to 25114 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/tcruzi_2019/preprocessing/hpgl1172/outputs/hisat2_tcruzi_all/forward.count.xz contains 25114 rows and merges to 25114 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/tcruzi_2019/preprocessing/hpgl1173/outputs/hisat2_tcruzi_all/forward.count.xz contains 25114 rows and merges to 25114 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/tcruzi_2019/preprocessing/hpgl1174/outputs/hisat2_tcruzi_all/forward.count.xz contains 25114 rows and merges to 25114 rows.
## Finished reading count tables.
## Matched 25109 annotations and counts.
## Bringing together the count matrix and gene information.
# Dump the esmer expressionset (raw/normalized reads, plots, per-factor
# medians) into a versioned xlsx workbook; `ver` is interpolated by glue.
esmer_write <- write_expt(esmer, excel=glue::glue("excel/esmer-v{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.

## Writing the raw reads.
## Graphing the raw reads.
## Writing the normalized reads.
## Line 2336 of expt.r, removed normalization of pca plotting.
## Graphing the normalized reads.

## Writing the median reads by factor.
## The factor wt has 3 rows.
## The factor unknown has 4 rows.
## Note: zip::zip() is deprecated, please use zip::zipr() instead

1.2 Fooling around

# Visualization-oriented normalization of the esmer data: cbcb low-count
# filter, quantile normalization, cpm conversion, then log2 transform
# (applied in that order, per the step output below).
esmer_norm <- normalize_expt(esmer, norm="quant", transform="log2", convert="cpm", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 493 low-count genes (9980 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 707 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
# PCA of the normalized esmer data; render the ggplot stored in $plot.
esmer_pca <- plot_pca(esmer_norm)
esmer_pca$plot

# Same filter/quant/cpm/log2 normalization, applied to the non-esmer data.
non_norm <- normalize_expt(nonesmer, norm="quant", transform="log2", convert="cpm", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 526 low-count genes (10432 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 563 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
# PCA of the normalized non-esmer data.
non_pca <- plot_pca(non_norm)
non_pca$plot

# Same normalization pipeline for the unassigned-contig counts.
unas_norm <- normalize_expt(unassigned, norm="quant", transform="log2", convert="cpm", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 0 low-count genes (2507 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 2669 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
# PCA of the normalized unassigned data.
# FIX: the original called plot_pca(non_norm), a copy/paste slip from the
# previous stanza — it re-plotted the non-esmer PCA and silently discarded
# the unas_norm object computed just above. Plot unas_norm here instead.
unas_pca <- plot_pca(unas_norm)
unas_pca$plot

# Quick differential-expression pass on the raw esmer expressionset;
# model_batch=FALSE leaves batch out of the limma/edgeR/DESeq2 models
# (all_pairwise performs its own filter/quant/cpm/log2 normalization,
# per the step output below).
quick <- all_pairwise(esmer, model_batch=FALSE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 493 low-count genes (9980 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 707 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Plotting a PCA before surrogates/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
# Merge the per-method DE results into combined per-contrast tables
# (one table here: wt_vs_unknown); no excel= argument, so nothing is written.
quick_tables <- combine_de_tables(quick)
## Writing a legend of columns.
## Working on table 1/1: wt_vs_unknown
## 20181210 a pthread error in normalize.quantiles leads me to robust.
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
# Pull the lone contrast table out of the combined results and sort its rows
# by the DESeq2 log2 fold-change, ascending — most down-regulated genes first.
test_table <- quick_tables[["data"]][[1]]
test_idx <- order(test_table[["deseq_logfc"]])
test_table <- test_table[test_idx, ]

# Filter/quant/cpm/log2 normalization of the whole-genome hisat2 counts;
# the cbcb filter is much more aggressive here (5600 genes removed,
# per the output below).
all_norm <- normalize_expt(all, norm="quant", transform="log2", convert="cpm", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 5600 low-count genes (19509 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 1163 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
# PCA of the normalized whole-genome data.
all_pca <- plot_pca(all_norm)
all_pca$plot

# Low-count filter only — the counts stay raw so that downstream DESeq/edgeR
# receive untransformed data (the output below notes DESeq requires this).
all_filt <- normalize_expt(all, filter=TRUE)
## This function will replace the expt$expressionset slot with:
## cbcb(data)
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
##  some metrics are easier to see when the data is log2 transformed, but
##  EdgeR/DESeq do not accept transformed data.
## Leaving the data unconverted.  It is often advisable to cpm/rpkm
##  the data to normalize for sampling differences, keep in mind though that rpkm
##  has some annoying biases, and voom() by default does a cpm (though hpgl_voom()
##  will try to detect this).
## Leaving the data unnormalized.  This is necessary for DESeq, but
##  EdgeR/limma might benefit from normalization.  Good choices include quantile,
##  size-factor, tmm, etc.
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 5600 low-count genes (19509 remaining).
## Step 2: not normalizing the data.
## Step 3: not converting the data.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
# Pairwise DE on the filtered whole-genome counts, again without a batch
# term in the model.
all_de <- all_pairwise(all_filt, model_batch=FALSE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It backs up the current data into a slot named:
##  expt$backup_expressionset. It will also save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
##  when invoking limma.  The appropriate libsize is the non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 0 low-count genes (19509 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 1163 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Plotting a PCA before surrogates/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
# Combine the whole-genome DE results and write them (with venn plots and
# method cross-comparisons) to an excel workbook.
all_tables <- combine_de_tables(all_de, excel="excel/all_tables.xlsx")
## Deleting the file excel/all_tables.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on table 1/1: wt_vs_unknown
## 20181210 a pthread error in normalize.quantiles leads me to robust.
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Adding venn plots for wt_vs_unknown.

## Limma expression coefficients for wt_vs_unknown; R^2: 0.954; equation: y = 0.969x + 0.155
## Edger expression coefficients for wt_vs_unknown; R^2: 0.955; equation: y = 0.951x + 0.309
## DESeq2 expression coefficients for wt_vs_unknown; R^2: 0.955; equation: y = 0.951x + 0.301
## Writing summary information.
## Attempting to add the comparison plot to pairwise_summary at row: 23 and column: 1
## Performing save of the workbook.

# End-of-document bookkeeping, skipped when the caller has set skip_load:
# print the session info, report the hpgltools git commit, and save the
# workspace image to `savefile` (sm() silences saveme()'s chatter).
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  message(paste0("Saving to ", savefile))
  tmp <- sm(saveme(filename=savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 801f62ba31ba525c6db9453255c6262fb1162d5c
## This is hpgltools commit: Thu Feb 21 14:03:11 2019 -0500: 801f62ba31ba525c6db9453255c6262fb1162d5c
## Saving to 01_annotation_v20190320.rda.xz
LS0tCnRpdGxlOiAiVC4gY3J1emkgMjAxOTAzMjA6IFNhbXBsZSBFc3RpbWF0aW9uIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiBodG1sX2RvY3VtZW50OgogIGNvZGVfZG93bmxvYWQ6IHRydWUKICBjb2RlX2ZvbGRpbmc6IHNob3cKICBmaWdfY2FwdGlvbjogdHJ1ZQogIGZpZ19oZWlnaHQ6IDcKICBmaWdfd2lkdGg6IDcKICBoaWdobGlnaHQ6IGRlZmF1bHQKICBrZWVwX21kOiBmYWxzZQogIG1vZGU6IHNlbGZjb250YWluZWQKICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICBzZWxmX2NvbnRhaW5lZDogdHJ1ZQogIHRoZW1lOiByZWFkYWJsZQogIHRvYzogdHJ1ZQogIHRvY19mbG9hdDoKICAgY29sbGFwc2VkOiBmYWxzZQogICBzbW9vdGhfc2Nyb2xsOiBmYWxzZQotLS0KCjxzdHlsZT4KICBib2R5IC5tYWluLWNvbnRhaW5lciB7CiAgICBtYXgtd2lkdGg6IDE2MDBweDsKICB9Cjwvc3R5bGU+CgpgYGB7ciBvcHRpb25zLCBpbmNsdWRlPUZBTFNFfQppZiAoIWlzVFJVRShnZXQwKCJza2lwX2xvYWQiKSkpIHsKICBsaWJyYXJ5KGhwZ2x0b29scykKICB0dCA8LSBzbShkZXZ0b29sczo6bG9hZF9hbGwoIn4vaHBnbHRvb2xzIikpCiAga25pdHI6Om9wdHNfa25pdCRzZXQocHJvZ3Jlc3M9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgICB2ZXJib3NlPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICAgd2lkdGg9OTAsCiAgICAgICAgICAgICAgICAgICAgICAgZWNobz1UUlVFKQogIGtuaXRyOjpvcHRzX2NodW5rJHNldChlcnJvcj1UUlVFLAogICAgICAgICAgICAgICAgICAgICAgICBmaWcud2lkdGg9OCwKICAgICAgICAgICAgICAgICAgICAgICAgZmlnLmhlaWdodD04LAogICAgICAgICAgICAgICAgICAgICAgICBkcGk9OTYpCiAgb2xkX29wdGlvbnMgPC0gb3B0aW9ucyhkaWdpdHM9NCwKICAgICAgICAgICAgICAgICAgICAgICAgIHN0cmluZ3NBc0ZhY3RvcnM9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgICAgICBrbml0ci5kdXBsaWNhdGUubGFiZWw9ImFsbG93IikKICBnZ3Bsb3QyOjp0aGVtZV9zZXQoZ2dwbG90Mjo6dGhlbWVfYncoYmFzZV9zaXplPTEyKSkKICB2ZXIgPC0gIjIwMTkwMzIwIgogIHByZXZpb3VzX2ZpbGUgPC0gcGFzdGUwKCJpbmRleF92IiwgdmVyLCAiLlJtZCIpCgogIHRtcCA8LSB0cnkoc20obG9hZG1lKGZpbGVuYW1lPWdzdWIocGF0dGVybj0iXFwuUm1kIiwgcmVwbGFjZT0iXFwucmRhXFwueHoiLCB4PXByZXZpb3VzX2ZpbGUpKSkpCiAgcm1kX2ZpbGUgPC0gcGFzdGUwKCIwMV9hbm5vdGF0aW9uX3YiLCB2ZXIsICIuUm1kIikKICBzYXZlZmlsZSA8LSBnc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IlxcLnJkYVxcLnh6IiwgeD1ybWRfZmlsZSkKfQpgYGAKCiMgU2FtcGxlIEVzdGltYXRpb24KCkxldCB1cyBzZWUgd2hhdCB3ZSBjYW4gbGVhcm4gYWJvdXQgb3VyIG15c3Rlcnkgc2FtcGxlcyEKCiMjIEdlbmVyYXRlIGV4cHJlc3Np
b25zZXRzCgpgYGB7ciBlc3RpbWF0ZX0KZXNtZXIgPC0gY3JlYXRlX2V4cHQoInNhbXBsZV9zaGVldHMvYWxsX3NhbXBsZXMueGxzeCIsIGZpbGVfY29sdW1uPSJlc21lcmZpbGUiKQpub25lc21lciA8LSBjcmVhdGVfZXhwdCgic2FtcGxlX3NoZWV0cy9hbGxfc2FtcGxlcy54bHN4IiwgZmlsZV9jb2x1bW49Im5vbmVzbWVyZmlsZSIpCnVuYXNzaWduZWQgPC0gY3JlYXRlX2V4cHQoInNhbXBsZV9zaGVldHMvYWxsX3NhbXBsZXMueGxzeCIsIGZpbGVfY29sdW1uPSJ1bmFzc2lnbmVkZmlsZSIpCgphbGwgPC0gY3JlYXRlX2V4cHQoInNhbXBsZV9zaGVldHMvYWxsX3NhbXBsZXMueGxzeCIsIGZpbGVfY29sdW1uPSJhbGxoaXNhdGZpbGUiKQoKZXNtZXJfd3JpdGUgPC0gd3JpdGVfZXhwdChlc21lciwgZXhjZWw9Z2x1ZTo6Z2x1ZSgiZXhjZWwvZXNtZXItdnt2ZXJ9Lnhsc3giKSkKYGBgCgojIyBGb29saW5nIGFyb3VuZAoKYGBge3IgZm9vbGluZ30KZXNtZXJfbm9ybSA8LSBub3JtYWxpemVfZXhwdChlc21lciwgbm9ybT0icXVhbnQiLCB0cmFuc2Zvcm09ImxvZzIiLCBjb252ZXJ0PSJjcG0iLCBmaWx0ZXI9VFJVRSkKZXNtZXJfcGNhIDwtIHBsb3RfcGNhKGVzbWVyX25vcm0pCmVzbWVyX3BjYSRwbG90Cgpub25fbm9ybSA8LSBub3JtYWxpemVfZXhwdChub25lc21lciwgbm9ybT0icXVhbnQiLCB0cmFuc2Zvcm09ImxvZzIiLCBjb252ZXJ0PSJjcG0iLCBmaWx0ZXI9VFJVRSkKbm9uX3BjYSA8LSBwbG90X3BjYShub25fbm9ybSkKbm9uX3BjYSRwbG90Cgp1bmFzX25vcm0gPC0gbm9ybWFsaXplX2V4cHQodW5hc3NpZ25lZCwgbm9ybT0icXVhbnQiLCB0cmFuc2Zvcm09ImxvZzIiLCBjb252ZXJ0PSJjcG0iLCBmaWx0ZXI9VFJVRSkKdW5hc19wY2EgPC0gcGxvdF9wY2Eobm9uX25vcm0pCnVuYXNfcGNhJHBsb3QKCnF1aWNrIDwtIGFsbF9wYWlyd2lzZShlc21lciwgbW9kZWxfYmF0Y2g9RkFMU0UpCnF1aWNrX3RhYmxlcyA8LSBjb21iaW5lX2RlX3RhYmxlcyhxdWljaykKCnRlc3RfdGFibGUgPC0gcXVpY2tfdGFibGVzW1siZGF0YSJdXVtbMV1dCnRlc3RfaWR4IDwtIG9yZGVyKHRlc3RfdGFibGVbWyJkZXNlcV9sb2dmYyJdXSkKdGVzdF90YWJsZSA8LSB0ZXN0X3RhYmxlW3Rlc3RfaWR4LCBdCgphbGxfbm9ybSA8LSBub3JtYWxpemVfZXhwdChhbGwsIG5vcm09InF1YW50IiwgdHJhbnNmb3JtPSJsb2cyIiwgY29udmVydD0iY3BtIiwgZmlsdGVyPVRSVUUpCmFsbF9wY2EgPC0gcGxvdF9wY2EoYWxsX25vcm0pCmFsbF9wY2EkcGxvdAoKYWxsX2ZpbHQgPC0gbm9ybWFsaXplX2V4cHQoYWxsLCBmaWx0ZXI9VFJVRSkKYWxsX2RlIDwtIGFsbF9wYWlyd2lzZShhbGxfZmlsdCwgbW9kZWxfYmF0Y2g9RkFMU0UpCmFsbF90YWJsZXMgPC0gY29tYmluZV9kZV90YWJsZXMoYWxsX2RlLCBleGNlbD0iZXhjZWwvYWxsX3RhYmxlcy54bHN4IikKYGBgCgoKYGBge3Igc2F2ZW1lfQppZiAoIWlzVFJVRShnZXQwKCJza2lwX2xvYWQiKSkpIHsKICBwYW5kZXI6OnBh
bmRlcihzZXNzaW9uSW5mbygpKQogIG1lc3NhZ2UocGFzdGUwKCJUaGlzIGlzIGhwZ2x0b29scyBjb21taXQ6ICIsIGdldF9naXRfY29tbWl0KCkpKQogIG1lc3NhZ2UocGFzdGUwKCJTYXZpbmcgdG8gIiwgc2F2ZWZpbGUpKQogIHRtcCA8LSBzbShzYXZlbWUoZmlsZW5hbWU9c2F2ZWZpbGUpKQp9CmBgYAo=