1 Estimates!

# Use the 'infectstate' metadata factor as the experimental condition and
# 'studypmid' (presumably the PubMed ID of the source study) as the batch.
hs_expt <- set_expt_conditions(hs_expt, fact = "infectstate")
hs_expt <- set_expt_batches(hs_expt, fact = "studypmid")
# Baseline normalization with no batch adjustment requested; the log below
# reports the resulting pipeline as log2(cpm(quant(simple(data)))).
hs_norm <- normalize_expt(
  hs_expt,
  transform = "log2",
  convert = "cpm",
  norm = "quant",
  filter = "simple"
)

## This function will replace the expt$expressionset slot with:

## log2(cpm(quant(simple(data))))

## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize

## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.

## Step 1: performing count filter with option: simple

## Removing 212 low-count genes (19417 remaining).

## Step 2: normalizing the data with quant.

## Step 3: converting the data with cpm.

## Step 4: transforming the data with log2.

## transform_counts: Found 6752 values equal to 0, adding 1 to the matrix.

## Step 5: not doing batch correction.

# PCA of the normalized data that has not been batch-corrected.
hs_pca <- plot_pca(hs_norm)

## Potentially check over the experimental design, there appear to be missing values.

## Warning in plot_pca(hs_norm): There are NA values in the component data.
## This can lead to weird plotting errors.

## plot labels was not set and there are more than 100 samples, disabling it.

## Not putting labels on the plot.

# Display the plot stored in the PCA result.
hs_pca$plot

## Warning: Removed 24 rows containing missing values (geom_point).

## Warning: Removed 24 rows containing missing values (geom_point).

# Same filter/normalization/conversion/transform settings as the uncorrected
# run, with svaseq requested as the batch-adjustment step.
hs_nb <- normalize_expt(
  hs_expt,
  transform = "log2",
  convert = "cpm",
  norm = "quant",
  filter = "simple",
  batch = "svaseq"
)

## This function will replace the expt$expressionset slot with:

## log2(svaseq(cpm(quant(simple(data)))))

## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize

## Warning in normalize_expt(hs_expt, transform = "log2", convert = "cpm", :
## Quantile normalization and sva do not always play well together.

## Step 1: performing count filter with option: simple

## Removing 212 low-count genes (19417 remaining).

## Step 2: normalizing the data with quant.

## Step 3: converting the data with cpm.

## Step 4: transforming the data with log2.

## transform_counts: Found 6752 values equal to 0, adding 1 to the matrix.

## Step 5: doing batch correction with svaseq.

## Note to self:  If you get an error like 'x contains missing values' The data has too many 0's and needs a stronger low-count filter applied.

## Passing off to all_adjusters.

## batch_counts: Before batch/surrogate estimation, 1022588 entries are x>1: 51.6%.

## batch_counts: Before batch/surrogate estimation, 6752 entries are x==0: 0.341%.

## batch_counts: Before batch/surrogate estimation, 951194 entries are 0<x<1: 48.0%.

## The be method chose 10 surrogate variable(s).

## Attempting svaseq estimation with 10 surrogates.

## There are 30781 (1.55%) elements which are < 0 after batch correction.

# PCA of the svaseq-adjusted data, for comparison with the uncorrected PCA.
hs_nb_pca <- plot_pca(hs_nb)

## Potentially check over the experimental design, there appear to be missing values.

## Warning in plot_pca(hs_nb): There are NA values in the component data. This
## can lead to weird plotting errors.

## plot labels was not set and there are more than 100 samples, disabling it.

## Not putting labels on the plot.

# Display the plot stored in the svaseq-adjusted PCA result.
hs_nb_pca$plot

## Warning: Removed 24 rows containing missing values (geom_point).

## Warning: Removed 24 rows containing missing values (geom_point).

# Switch the condition factor to 'expttime' and repeat the svaseq-adjusted
# normalization; this overwrites the earlier hs_norm.
hs_expt <- set_expt_conditions(hs_expt, fact = "expttime")

# NOTE(review): no convert= argument is passed here, so the data are left
# unconverted (no cpm), unlike the earlier calls -- confirm this is intended.
hs_norm <- normalize_expt(
  hs_expt,
  transform = "log2",
  norm = "quant",
  filter = "simple",
  batch = "svaseq"
)

## This function will replace the expt$expressionset slot with:

## log2(svaseq(quant(simple(data))))

## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize

## Leaving the data unconverted.  It is often advisable to cpm/rpkm
##  the data to normalize for sampling differences, keep in mind though that rpkm
##  has some annoying biases, and voom() by default does a cpm (though hpgl_voom()
##  will try to detect this).

## Warning in normalize_expt(hs_expt, transform = "log2", norm = "quant",
## filter = "simple", : Quantile normalization and sva do not always play well
## together.

## Step 1: performing count filter with option: simple

## Removing 212 low-count genes (19417 remaining).

## Step 2: normalizing the data with quant.

## Step 3: not converting the data.

## Step 4: transforming the data with log2.

## transform_counts: Found 6752 values equal to 0, adding 1 to the matrix.

## Step 5: doing batch correction with svaseq.

## Note to self:  If you get an error like 'x contains missing values' The data has too many 0's and needs a stronger low-count filter applied.

## Passing off to all_adjusters.

## batch_counts: Before batch/surrogate estimation, 1473620 entries are x>1: 74.4%.

## batch_counts: Before batch/surrogate estimation, 6752 entries are x==0: 0.341%.

## batch_counts: Before batch/surrogate estimation, 500162 entries are 0<x<1: 25.3%.

## The be method chose 11 surrogate variable(s).

## Attempting svaseq estimation with 11 surrogates.

## There are 4732 (0.239%) elements which are < 0 after batch correction.

# PCA of the data re-normalized with 'expttime' as the condition; this
# overwrites the earlier hs_pca.
hs_pca <- plot_pca(hs_norm)

## Potentially check over the experimental design, there appear to be missing values.

## Warning in plot_pca(hs_norm): There are NA values in the component data.
## This can lead to weird plotting errors.

## plot labels was not set and there are more than 100 samples, disabling it.

## Not putting labels on the plot.

# Display the plot stored in the 'expttime' PCA result.
hs_pca$plot

## Warning: Removed 24 rows containing missing values (geom_point).

## Warning: Removed 24 rows containing missing values (geom_point).

2 Add our data

Najib asked about adding the various data provided by our work. The expressionset which contains this information lives in ‘../multiple_leishmania_2018’; more explicitly, the expressionset may be loaded via Hs_M0Lm4h.rda

# Load the in-house expressionset.
# NOTE(review): presumably this .rda provides the `expt` object used below --
# confirm, since load() overwrites same-named objects in the workspace.
load("../multiple_leishmania_2018/Hs_M0Lm4h.rda")

# Combine the downloaded samples with the in-house ones; merge_meta = TRUE
# presumably merges the two sample-metadata tables (verify in hpgltools).
all_expt <- combine_expts(hs_expt, expt, merge_meta = TRUE)

# hs_expt was last conditioned on 'expttime', so set 'infectstate' again.
all_expt <- set_expt_conditions(all_expt, fact = "infectstate")
# filter = TRUE is reported in the log below as the 'cbcb' filter, not the
# 'simple' filter used in the earlier calls.
all_norm <- normalize_expt(
  all_expt,
  filter = TRUE,
  norm = "quant",
  convert = "cpm",
  transform = "log2",
  batch = "svaseq"
)

## This function will replace the expt$expressionset slot with:

## log2(svaseq(cpm(quant(cbcb(data)))))

## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize

## Warning in normalize_expt(all_expt, filter = TRUE, norm = "quant", convert
## = "cpm", : Quantile normalization and sva do not always play well together.

## Step 1: performing count filter with option: cbcb

## Removing 0 low-count genes (19629 remaining).

## Step 2: normalizing the data with quant.

## Step 3: converting the data with cpm.

## Step 4: transforming the data with log2.

## transform_counts: Found 9876 values equal to 0, adding 1 to the matrix.

## Step 5: doing batch correction with svaseq.

## Note to self:  If you get an error like 'x contains missing values' The data has too many 0's and needs a stronger low-count filter applied.

## Passing off to all_adjusters.

## batch_counts: Before batch/surrogate estimation, 4233025 entries are x>1: 58.4%.

## batch_counts: Before batch/surrogate estimation, 9876 entries are x==0: 0.136%.

## batch_counts: Before batch/surrogate estimation, 3000200 entries are 0<x<1: 41.4%.

## The be method chose 25 surrogate variable(s).

## Attempting svaseq estimation with 25 surrogates.

## There are 211468 (2.92%) elements which are < 0 after batch correction.

# PCA of the combined (downloaded plus in-house) data after svaseq adjustment.
all_pca <- plot_pca(all_norm)

## Potentially check over the experimental design, there appear to be missing values.

## Warning in plot_pca(all_norm): There are NA values in the component data.
## This can lead to weird plotting errors.

## plot labels was not set and there are more than 100 samples, disabling it.

## Not putting labels on the plot.

# Display the plot stored in the combined-data PCA result.
all_pca$plot

## Warning: Removed 24 rows containing missing values (geom_point).

# Record the session state and save the workspace for the next document.
# NOTE(review): `rmd_file` and `ver` are not defined in the visible code;
# presumably they are set in the document's setup chunk -- confirm.
pander::pander(sessionInfo())
# message() concatenates its arguments, so no paste0() wrapper is needed.
message("This is hpgltools commit: ", get_git_commit())
# Strip only the trailing ".Rmd" extension (anchored with `$`) and spell out
# `replacement=` rather than relying on partial argument matching.
this_save <- paste0(
  sub(pattern = "\\.Rmd$", replacement = "", x = rmd_file),
  "-v", ver, ".rda.xz"
)
message("Saving to ", this_save)
tmp <- sm(saveme(filename = this_save))

LS0tCnRpdGxlOiAiRG93bmxvYWRlZCBkYXRhIHNldHMsIHNhbXBsZSBlc3RpbWF0aW9uLiIKYXV0aG9yOiAiYXRiIGFiZWxld0BnbWFpbC5jb20iCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGZpZ19jYXB0aW9uOiB0cnVlCiAgICBmaWdfaGVpZ2h0OiA3CiAgICBmaWdfd2lkdGg6IDcKICAgIGhpZ2hsaWdodDogdGFuZ28KICAgIGtlZXBfbWQ6IGZhbHNlCiAgICBtb2RlOiBzZWxmY29udGFpbmVkCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIHNlbGZfY29udGFpbmVkOiB0cnVlCiAgICB0aGVtZTogcmVhZGFibGUKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OgogICAgICBjb2xsYXBzZWQ6IGZhbHNlCiAgICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCiAgcm1kZm9ybWF0czo6cmVhZHRoZWRvd246CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGRmX3ByaW50OiBwYWdlZAogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAgd2lkdGg6IDMwMAogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIHRvY19mbG9hdDogdHJ1ZQogIEJpb2NTdHlsZTo6aHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIHRvY19mbG9hdDogdHJ1ZQotLS0KCjxzdHlsZSB0eXBlPSJ0ZXh0L2NzcyI+CmJvZHksIHRkIHsKICBmb250LXNpemU6IDE2cHg7Cn0KY29kZS5yewogIGZvbnQtc2l6ZTogMTZweDsKfQpwcmUgewogZm9udC1zaXplOiAxNnB4Cn0KPC9zdHlsZT4KCmBgYHtyIG9wdGlvbnMsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoImhwZ2x0b29scyIpCnR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgiL2RhdGEvaHBnbHRvb2xzIikKa25pdHI6Om9wdHNfa25pdCRzZXQod2lkdGg9MTIwLAogICAgICAgICAgICAgICAgICAgICBwcm9ncmVzcz1UUlVFLAogICAgICAgICAgICAgICAgICAgICB2ZXJib3NlPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KGVycm9yPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICBkcGk9OTYpCm9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycz1GQUxTRSwKICAgICAgICAgICAgICAgICAgICAgICBrbml0ci5kdXBsaWNhdGUubGFiZWw9ImFsbG93IikKZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRo
ZW1lX2J3KGJhc2Vfc2l6ZT0xMCkpCnJ1bmRhdGUgPC0gZm9ybWF0KFN5cy5EYXRlKCksIGZvcm1hdD0iJVklbSVkIikKCnZlciA8LSAiMjAxOTA3MDEiCnByZXZpb3VzX2ZpbGUgPC0gcGFzdGUwKCIwMV9hbm5vdGF0aW9uXyIsIHZlciwgIi5SbWQiKQoKdG1wIDwtIHNtKGxvYWRtZShmaWxlbmFtZT1wYXN0ZTAoZ3N1YihwYXR0ZXJuPSJcXC5SbWQiLCByZXBsYWNlPSIiLCB4PXByZXZpb3VzX2ZpbGUpLCAiLXYiLCB2ZXIsICIucmRhLnh6IikpKQpybWRfZmlsZSA8LSAiMDFfYW5ub3RhdGlvbl8yMDE5MDcwMS5SbWQiCmBgYAoKIyBFc3RpbWF0ZXMhCgpgYGB7ciBlc3RpbWF0ZX0KaHNfZXhwdCA8LSBzZXRfZXhwdF9jb25kaXRpb25zKGhzX2V4cHQsIGZhY3Q9ImluZmVjdHN0YXRlIikKaHNfZXhwdCA8LSBzZXRfZXhwdF9iYXRjaGVzKGhzX2V4cHQsIGZhY3Q9InN0dWR5cG1pZCIpCmhzX25vcm0gPC0gbm9ybWFsaXplX2V4cHQoaHNfZXhwdCwgdHJhbnNmb3JtPSJsb2cyIiwgY29udmVydD0iY3BtIiwKICAgICAgICAgICAgICAgICAgICAgICAgICBub3JtPSJxdWFudCIsIGZpbHRlcj0ic2ltcGxlIikKaHNfcGNhIDwtIHBsb3RfcGNhKGhzX25vcm0pCmhzX3BjYSRwbG90Cgpoc19uYiA8LSBub3JtYWxpemVfZXhwdChoc19leHB0LCB0cmFuc2Zvcm09ImxvZzIiLCBjb252ZXJ0PSJjcG0iLAogICAgICAgICAgICAgICAgICAgICAgICAgIG5vcm09InF1YW50IiwgZmlsdGVyPSJzaW1wbGUiLCBiYXRjaD0ic3Zhc2VxIikKaHNfbmJfcGNhIDwtIHBsb3RfcGNhKGhzX25iKQpoc19uYl9wY2EkcGxvdAoKaHNfZXhwdCA8LSBzZXRfZXhwdF9jb25kaXRpb25zKGhzX2V4cHQsIGZhY3Q9ImV4cHR0aW1lIikKCmhzX25vcm0gPC0gbm9ybWFsaXplX2V4cHQoaHNfZXhwdCwgdHJhbnNmb3JtPSJsb2cyIiwKICAgICAgICAgICAgICAgICAgICAgICAgICBub3JtPSJxdWFudCIsIGZpbHRlcj0ic2ltcGxlIiwgYmF0Y2g9InN2YXNlcSIpCmhzX3BjYSA8LSBwbG90X3BjYShoc19ub3JtKQpoc19wY2EkcGxvdApgYGAKCiMgQWRkIG91ciBkYXRhCgpOYWppYiBhc2tlZCBhYm91dCBhZGRpbmcgdGhlIHZhcmlvdXMgZGF0YSBwcm92aWRlZCBieSBvdXIgd29yay4gIFRoZQpleHByZXNzaW9uc2V0IHdoaWNoIGNvbnRhaW5zIHRoaXMgaW5mb3JtYXRpb24gbGl2ZSBpbgonLi4vbXVsdGlwbGVfbGVpc2htYW5pYV8yMDE4JywgbW9yZSBleHBsaWNpdGx5LCB0aGUgZXhwcmVzc2lvbnNldCBtYXkgYmUgbG9hZGVkCnZpYSBIc19NMExtNGgucmRhCgpgYGB7ciBhZGRfb3VyX2RhdGF9CmxvYWQoIi4uL211bHRpcGxlX2xlaXNobWFuaWFfMjAxOC9Ic19NMExtNGgucmRhIikKCmFsbF9leHB0IDwtIGNvbWJpbmVfZXhwdHMoaHNfZXhwdCwgZXhwdCwgbWVyZ2VfbWV0YT1UUlVFKQoKYWxsX2V4cHQgPC0gc2V0X2V4cHRfY29uZGl0aW9ucyhhbGxfZXhwdCwgZmFjdD0iaW5mZWN0c3RhdGUiKQphbGxfbm9ybSA8LSBub3JtYWxpemVfZXhwdChhbGxfZXhwdCwgZmlsdGVyPVRS
VUUsIG5vcm09InF1YW50IiwgY29udmVydD0iY3BtIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgdHJhbnNmb3JtPSJsb2cyIiwgYmF0Y2g9InN2YXNlcSIpCmFsbF9wY2EgPC0gcGxvdF9wY2EoYWxsX25vcm0pCmFsbF9wY2EkcGxvdApgYGAKCmBgYHtyIHNhdmVtZSwgZXZhbD1GQUxTRX0KcGFuZGVyOjpwYW5kZXIoc2Vzc2lvbkluZm8oKSkKbWVzc2FnZShwYXN0ZTAoIlRoaXMgaXMgaHBnbHRvb2xzIGNvbW1pdDogIiwgZ2V0X2dpdF9jb21taXQoKSkpCnRoaXNfc2F2ZSA8LSBwYXN0ZTAoZ3N1YihwYXR0ZXJuPSJcXC5SbWQiLCByZXBsYWNlPSIiLCB4PXJtZF9maWxlKSwgIi12IiwgdmVyLCAiLnJkYS54eiIpCm1lc3NhZ2UocGFzdGUwKCJTYXZpbmcgdG8gIiwgdGhpc19zYXZlKSkKdG1wIDwtIHNtKHNhdmVtZShmaWxlbmFtZT10aGlzX3NhdmUpKQpgYGAK

Downloaded data sets, sample estimation.

atb abelew@gmail.com

2019-07-03

1 Estimates!

2 Add our data