Caveat: this document uses both salmon and hisat2 quantifications; the first create_expt() below reads the salmon counts, and the hisat2 counts are loaded further down.
## Build the human expressionset from the per-sample salmon quantifications
## named in the sample sheet, collapsing transcript-level counts to genes via
## tx_gene_map (the transcript below shows tximport doing the reading).
## NOTE(review): the caveat above mentions hisat2, but file_column points at
## the salmon outputs; the hisat2 expressionset is created in a later chunk.
hs_expt <- create_expt("sample_sheets/tmrc3_samples_20191001.xlsx",
file_column="hg3891salmonfile",
gene_info=hs_annot, tx_gene_map=tx_gene_map)
## Reading the sample metadata.
## The sample definitions comprises: 17 rows(samples) and 62 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 2859 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 2859 rows and 13 columns.
## QC of the salmon expressionset: per-sample library sizes, number of
## non-zero genes, and count distributions.
libsizes <- plot_libsize(hs_expt)
## The scale difference between the smallest and largest
## libraries is > 10. Assuming a log10 scale is better, set scale=FALSE if not.
libsizes$plot
## I think samples 7,10 should be removed at minimum, probably also 9,11
nonzero <- plot_nonzero(hs_expt)
box <- plot_boxplot(hs_expt)
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 23422 zero count features.
## Dump the raw/normalized/median-by-condition tables plus plots to excel.
hs_write <- write_expt(hs_expt, excel=glue("excel/hs_written-v{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_text).
## Writing the normalized reads.
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Graphing the normalized reads.
## Writing the median reads by factor.
## The factor healthy_2hr has 2 rows.
## The factor healthy_7hr has 2 rows.
## The factor healthy_12hr has 2 rows.
## The factor neutrophil_early has 2 rows.
## The factor monocyte_early has 2 rows.
## The factor neutrophil_late has only 1 row.
## The factor monocyte_late has 2 rows.
## Warning in median_by_factor(exprs(norm_data), fact =
## norm_data[["conditions"]]): The level undefined of the factor has no
## columns.
## Note: zip::zip() is deprecated, please use zip::zipr() instead
## Keep only salmon samples with at least 100,000 total counts.
hs_valid <- subset_expt(hs_expt, coverage=100000)
## Subsetting given a minimal number of counts/sample.
## There were 13, now there are 11 samples.
## Write the filtered salmon expressionset.  NOTE(review): the hisat2 section
## below writes to this same filename, so whichever chunk runs last silently
## overwrites the other's workbook -- consider distinct names.
valid_write <- write_expt(hs_valid, excel=glue("excel/hs_valid-v{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## Warning: Removed 3 rows containing missing values (geom_text).
## Warning: Removed 3 rows containing missing values (geom_text).
## Writing the normalized reads.
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Graphing the normalized reads.
## Writing the median reads by factor.
## The factor healthy_2hr has 2 rows.
## The factor healthy_7hr has 2 rows.
## The factor healthy_12hr has 2 rows.
## The factor neutrophil_early has only 1 row.
## The factor monocyte_early has only 1 row.
## The factor neutrophil_late has only 1 row.
## The factor monocyte_late has 2 rows.
## Re-create hs_expt, this time from the hisat2 count tables (read with
## read.table() rather than tximport; no tx_gene_map is passed, presumably
## because these counts are already gene-level -- confirm).  Note this
## overwrites the salmon hs_expt created above.
hs_expt <- create_expt("sample_sheets/tmrc3_samples_20191001.xlsx",
file_column="hg3891hisatfile",
gene_info=hs_annot)
## Reading the sample metadata.
## The sample definitions comprises: 17 rows(samples) and 62 columns(metadata fields).
## Reading count tables.
## Reading count tables with read.table().
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30001/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30002/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30003/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30004/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30005/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30006/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30007/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30008/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30009/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30010/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30011/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30012/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/tmrc30013/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## Finished reading count tables.
## Matched 58243 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 58302 rows and 13 columns.
## The same QC for the hisat2 expressionset.
libsizes <- plot_libsize(hs_expt)
## The scale difference between the smallest and largest
## libraries is > 10. Assuming a log10 scale is better, set scale=FALSE if not.
libsizes$plot
## I think samples 7,10 should be removed at minimum, probably also 9,11
nonzero <- plot_nonzero(hs_expt)
nonzero$plot
box <- plot_boxplot(hs_expt)
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 494937 zero count features.
box
## This is causing segmentation faults in R
##hs_write <- write_expt(hs_expt, excel=glue("excel/hs_hisat2_written-v{ver}.xlsx"))
## Keep only hisat2 samples with at least 3 million total counts (a higher
## cutoff than the salmon pass, presumably because the hisat2 libraries are
## larger -- confirm against the libsize plots).
hs_valid <- subset_expt(hs_expt, coverage=3000000)
## Subsetting given a minimal number of counts/sample.
## There were 13, now there are 11 samples.
plot_libsize(hs_valid)$plot
## BUG FIX: the original wrote to excel/hs_valid-v{ver}.xlsx, the exact
## filename already used for the salmon-derived valid set earlier in this
## document, so one workbook silently clobbered the other.  Use a
## hisat2-specific name, matching the hs_hisat2_* convention of the
## commented-out full write above.
valid_write <- write_expt(hs_valid, excel=glue("excel/hs_hisat2_valid-v{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## Warning: Removed 3 rows containing missing values (geom_text).
## Warning: Removed 3 rows containing missing values (geom_text).
## Writing the normalized reads.
## Graphing the normalized reads.
## Error in cov.wt(z) : 'x' must contain finite values only
## Error in plot.window(...) : need finite 'ylim' values
## Error in cov.wt(z) : 'x' must contain finite values only
## Error in plot.window(...) : need finite 'ylim' values
## Error in pcload[["plot"]]: subscript out of bounds
The following comes from an email 20190830 from Maria Adelaida.
Samples WT1010 and WT1011 PBMCs from two healthy donors processed 2h, 7h and 12h after sample procurement. This is an analysis to explore the time-effect on gene expression and define steps for data analysis for patient samples considering time-dependent effects.
Samples SU1017 and SU1034 are from TMRC CL patients. m = monocyte, n = neutrophil. Samples labeled “1” were taken before treatment and those labeled “2” midway through treatment. This is exciting, because these will be our first neutrophil transcriptomes.
In an attempt to poke at these questions, I mapped the reads to hg38_91 using salmon and hisat2. It is very noteworthy that the salmon mappings are exhibiting some serious problems and should be looked into further. The hisat2 mappings are significantly more ‘normal’. Having said that, two samples remain basically unusable: tmrc30009 (1034n1) and (to a smaller degree) tmrc30007 (1017n1) have too few reads as shown above.
To address these, I added to the end of the sample sheet columns named ‘condition’, ‘batch’, ‘donor’, and ‘time’. These are filled in with shorthand values according to the above.
Before addressing the questions explicitly by subsetting the data, I want to get a look at the samples as they are.
## Use donor as the batch factor, then fully normalize (cbcb count filter ->
## quantile normalization -> cpm -> log2) with no batch correction, for the
## exploratory plots below.
hs_valid <- set_expt_batches(hs_valid, fact="donor")
all_norm <- normalize_expt(hs_valid, norm="quant", transform="log2", convert="cpm", batch=FALSE,
filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 43798 low-count genes (14504 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 698 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Global structure of all valid samples: PCA, sample-correlation heatmap,
## and plot_topn on the raw valid data (presumably the most-expressed genes
## per sample -- confirm).
plot_pca(all_norm)$plot
plot_corheat(all_norm)$plot
plot_topn(hs_valid)$plot
I interpreted question 1 as: pull out tmrc3000[1-6] and look at them.
I am not entirely certain what is meant by the hierarchical clustering request. I can see a couple of possibilities for what this means. The most similar thing I recall in the cruzi context was a post-DE visualization of some fairly specific genes.
## Question 1: restrict to the two healthy donors (d1010/d1011) and
## re-normalize that subset on its own.
hs_q1 <- subset_expt(hs_valid, subset="donor=='d1010'|donor=='d1011'")
## Using a subset expression.
## There were 11, now there are 6 samples.
q1_norm <- normalize_expt(hs_q1, norm="quant", transform="log2", convert="cpm", batch=FALSE,
filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 44614 low-count genes (13688 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 22 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
plot_pca(q1_norm)$plot
## Looks like PC1 == Time and PC2 is donor, and they have pretty much the same amount of variance
## Get a set of genes with high PC loads for PC1(time), and PC2(donor):
high_scores <- pca_highscores(q1_norm)
## Keep the two leading components: column 1 ~ time, column 2 ~ donor.
## Each entry is a "score:geneID" string (see the printout below).
time_donors <- high_scores$highest[, c(1, 2)]
time_donors
## Comp.1 Comp.2
## [1,] "28.38:ENSG00000129824" "12.22:ENSG00000073756"
## [2,] "27.5:ENSG00000067048" "10.41:ENSG00000125538"
## [3,] "26.49:ENSG00000012817" "7.791:ENSG00000112299"
## [4,] "23.26:ENSG00000179344" "7.714:ENSG00000093134"
## [5,] "22.79:ENSG00000114374" "7.691:ENSG00000168329"
## [6,] "22.46:ENSG00000206503" "7.414:ENSG00000274536"
## [7,] "22.29:ENSG00000131002" "6.381:ENSG00000171049"
## [8,] "21.33:ENSG00000196735" "6.286:ENSG00000162739"
## [9,] "20.95:ENSG00000239830" "5.962:ENSG00000174946"
## [10,] "19.9:ENSG00000183878" "5.919:ENSG00000168310"
## [11,] "19.74:ENSG00000234745" "5.903:ENSG00000170915"
## [12,] "18.95:ENSG00000099725" "5.748:ENSG00000245954"
## [13,] "18.35:ENSG00000213058" "5.722:ENSG00000254838"
## [14,] "17.88:ENSG00000067646" "5.719:ENSG00000133574"
## [15,] "17.8:ENSG00000233864" "5.69:ENSG00000163563"
## [16,] "14.98:ENSG00000198692" "5.553:ENSG00000171115"
## [17,] "14.57:ENSG00000259045" "5.54:ENSG00000277734"
## [18,] "13.28:ENSG00000204642" "5.43:ENSG00000008517"
## [19,] "11.89:ENSG00000215580" "5.388:ENSG00000213203"
## [20,] "11.17:ENSG00000112139" "5.355:ENSG00000261799"
## Column 1 should be winners vs. time and column 2 by donor.
## Keep only the Ensembl gene ID from each "score:gene" string by deleting
## everything up to and including the final colon.
time_genes <- sub(pattern="^.*:", replacement="", x=time_donors[, 1])
time_genes
## [1] "ENSG00000129824" "ENSG00000067048" "ENSG00000012817"
## [4] "ENSG00000179344" "ENSG00000114374" "ENSG00000206503"
## [7] "ENSG00000131002" "ENSG00000196735" "ENSG00000239830"
## [10] "ENSG00000183878" "ENSG00000234745" "ENSG00000099725"
## [13] "ENSG00000213058" "ENSG00000067646" "ENSG00000233864"
## [16] "ENSG00000198692" "ENSG00000259045" "ENSG00000204642"
## [19] "ENSG00000215580" "ENSG00000112139"
## Same prefix-strip for the donor (PC2) column: retain only the gene ID
## after the final colon.
donor_genes <- sub(pattern="^.*:", replacement="", x=time_donors[, 2])
donor_genes
## [1] "ENSG00000073756" "ENSG00000125538" "ENSG00000112299"
## [4] "ENSG00000093134" "ENSG00000168329" "ENSG00000274536"
## [7] "ENSG00000171049" "ENSG00000162739" "ENSG00000174946"
## [10] "ENSG00000168310" "ENSG00000170915" "ENSG00000245954"
## [13] "ENSG00000254838" "ENSG00000133574" "ENSG00000163563"
## [16] "ENSG00000171115" "ENSG00000277734" "ENSG00000008517"
## [19] "ENSG00000213203" "ENSG00000261799"
## I am thinking to do 2 DE runs, one with condition as time and one with donor as condition,
## then look at these 20 genes for each, respectively.
## Run the DE analyses twice: once with time as the condition and once with
## donor, each without a batch term in the model.
q1_time <- set_expt_conditions(hs_q1, fact="time")
q1_donor <- set_expt_conditions(hs_q1, fact="donor")
time_de <- all_pairwise(q1_time, model_batch=FALSE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 44614 low-count genes (13688 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 22 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Plotting a PCA before surrogates/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
## The same pairwise DE, now contrasting the two donors.
donor_de <- all_pairwise(q1_donor, model_batch=FALSE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 44614 low-count genes (13688 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 22 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Plotting a PCA before surrogates/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
## Combine and write the three time contrasts to excel.
time_tables <- combine_de_tables(time_de, excel=glue::glue("time_de_tables-v{ver}.xlsx"))
## Deleting the file time_de_tables-v20191001.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on table 1/3: t2hr_vs_t12hr
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Working on table 2/3: t7hr_vs_t12hr
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Working on table 3/3: t7hr_vs_t2hr
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Adding venn plots for t2hr_vs_t12hr.
## Limma expression coefficients for t2hr_vs_t12hr; R^2: 0.967; equation: y = 0.985x - 0.0364
## Edger expression coefficients for t2hr_vs_t12hr; R^2: 0.94; equation: y = 0.968x + 0.054
## DESeq2 expression coefficients for t2hr_vs_t12hr; R^2: 0.913; equation: y = 0.96x + 0.108
## Adding venn plots for t7hr_vs_t12hr.
## Limma expression coefficients for t7hr_vs_t12hr; R^2: 0.975; equation: y = 0.986x - 0.0042
## Edger expression coefficients for t7hr_vs_t12hr; R^2: 0.947; equation: y = 0.968x + 0.0741
## DESeq2 expression coefficients for t7hr_vs_t12hr; R^2: 0.928; equation: y = 0.965x + 0.0991
## Adding venn plots for t7hr_vs_t2hr.
## Limma expression coefficients for t7hr_vs_t2hr; R^2: 0.974; equation: y = 0.984x + 0.0049
## Edger expression coefficients for t7hr_vs_t2hr; R^2: 0.945; equation: y = 0.969x + 0.0932
## DESeq2 expression coefficients for t7hr_vs_t2hr; R^2: 0.925; equation: y = 0.96x + 0.138
## Writing summary information.
## Performing save of time_de_tables-v20191001.xlsx.
## Combine and write the donor contrast to excel.  BUG FIX: the original
## passed the same filename as the time contrast above
## ("time_de_tables-v{ver}.xlsx"), silently overwriting that workbook -- the
## "Deleting the file time_de_tables..." line in the transcript below records
## the clobbering run.  Write to a donor-specific file instead.
donor_tables <- combine_de_tables(donor_de, excel=glue::glue("donor_de_tables-v{ver}.xlsx"))
## Deleting the file time_de_tables-v20191001.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on table 1/1: d1011_vs_d1010
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Used Bon Ferroni corrected t test(s) between columns.
## Adding venn plots for d1011_vs_d1010.
## Limma expression coefficients for d1011_vs_d1010; R^2: 0.975; equation: y = 0.989x - 0.0403
## Edger expression coefficients for d1011_vs_d1010; R^2: 0.951; equation: y = 0.979x + 0.0198
## DESeq2 expression coefficients for d1011_vs_d1010; R^2: 0.932; equation: y = 0.944x + 0.245
## Writing summary information.
## Performing save of time_de_tables-v20191001.xlsx.
## Re-normalize q1 WITHOUT the low-count filter, presumably so that every
## high-loading gene extracted above remains in the matrix for the heatmaps
## below (a filtered matrix could drop some of them -- confirm).
q1_norm <- normalize_expt(hs_q1, transform="log2", convert="cpm", norm="quant")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(data)))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Filter is false, this should likely be set to something, good
## choices include cbcb, kofa, pofa (anything but FALSE). If you want this to
## stay FALSE, keep in mind that if other normalizations are performed, then the
## resulting libsizes are likely to be strange (potentially negative!)
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: not doing count filtering.
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 207518 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Heatmaps of normalized expression for the top PC1 (time) and PC2 (donor)
## genes across the six healthy-donor samples.
exprs_subset <- exprs(q1_norm)[time_genes, ]
plot_sample_heatmap(exprs_subset, row_label=rownames(exprs_subset))
exprs_subset <- exprs(q1_norm)[donor_genes, ]
plot_sample_heatmap(exprs_subset, row_label=rownames(exprs_subset))
## Question 2: the patient samples -- everything that is not one of the two
## healthy donors.
hs_q2 <- subset_expt(hs_valid, subset="donor!='d1010'&donor!='d1011'")
## Using a subset expression.
## There were 11, now there are 5 samples.
q2_norm <- normalize_expt(hs_q2, transform="log2", convert="cpm", norm="quant", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 46075 low-count genes (12227 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 242 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## PCA of the patient (q2) samples.
plot_pca(q2_norm)$plot
## Wrap up: print session information, report the hpgltools commit, and save
## the workspace image, unless skip_load is set (presumably to allow skipping
## this work when the state is loaded elsewhere -- confirm).
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  ## message() concatenates its arguments itself, so the paste0() wrappers
  ## used originally were redundant; output is byte-identical.
  message("This is hpgltools commit: ", get_git_commit())
  message("Saving to ", savefile)
  tmp <- sm(saveme(filename=savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset f3c1e03852c87dc60c7e72e726bb640572e695ff
## This is hpgltools commit: Thu Aug 22 15:32:44 2019 -0400: f3c1e03852c87dc60c7e72e726bb640572e695ff
## Saving to 01_annotation_v20191001.rda.xz
## Reload the saved state.
tmp <- loadme(filename=savefile)