lm_annotations <- lm_annotv2
lm_expt <- create_expt(metadata="sample_sheets/all_samples_201804.xlsx",
gene_info=lm_annotations,
file_column="lmajorfile")
## Reading the sample metadata.
## The sample definitions comprises: 24, 22 rows, columns.
## Reading count tables.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 8062 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## amast inf12 inf14d uninf12hr uninf14d pro
##lm_expt <- set_expt_colors(expt=lm_expt, colors=c("darkgray", "gray", "darkred", "pink", "darkblue", "blue"))
lm_expt <- set_expt_colors(expt=lm_expt, colors=c("gray", "darkred", "darkblue", "pink", "blue", "darkgreen"))
There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper ‘graph_metrics()’ but that takes away the chance to explain the graphs as I generate them.
lm_expt <- exclude_genes_expt(expt=lm_expt, column="Type", method="keep", patterns=c("protein_coding"))
## Before removal, there were 8306 entries.
## Now there are 7970 entries.
## Percent kept: 97.461, 97.301, 97.577, 97.703, 97.325, 97.532, 97.684, 97.411, 97.567, 97.539, 97.377, 97.972, 96.812, 96.855, 96.507, 96.538, 97.608, 97.598, 97.650, 97.481, 97.433, 97.343, 97.705, 97.808
## Percent removed: 2.539, 2.699, 2.423, 2.297, 2.675, 2.468, 2.316, 2.589, 2.433, 2.461, 2.623, 2.028, 3.188, 3.145, 3.493, 3.462, 2.392, 2.402, 2.350, 2.519, 2.567, 2.657, 2.295, 2.192
lm_metrics <- graph_metrics(lm_expt)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## The scale difference between the smallest and largest
## libraries is > 10. Assuming a log10 scale is better, set scale=FALSE if not.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 55743 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, setting them to 0.5.
## Changed 55743 zero count features.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
lm_norm <- normalize_expt(lm_expt, filter=TRUE, norm="quant", convert="cpm", transform="log2")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(hpgl(data))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 53 low-count genes (7917 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 259 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
lm_normmetrics <- graph_metrics(lm_norm)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
Ok, before removing the uninfected samples, lets look at some metrics with them and see if any worrisome patterns jump out.
lm_metrics$legend
## pink and green are the uninfected samples
lm_metrics$libsize
## Still more reads than ideal for uninfected, but way better than before
## I feel like this will probably work?
## Lets see how they cluster before removing them and looking more seriously at the infected.
lm_normmetrics$pcaplot
## wow not much variance is in either axis, interesting.
lm_normmetrics$tsneplot
## tsne suggests 1088 is suspicious
lm_normmetrics$corheat
## It seems to me the clustering is actually beautiful, I bet the uninfected
## samples are just making it look bad.
Now lets get rid of the uninfected samples and try again and see if I am right.
lmpara_expt <- subset_expt(lm_expt, subset="parasitesp=='yes'")
## There were 24, now there are 16 samples.
para_metrics <- graph_metrics(lmpara_expt)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## The scale difference between the smallest and largest
## libraries is > 10. Assuming a log10 scale is better, set scale=FALSE if not.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 14884 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, setting them to 0.5.
## Changed 14884 zero count features.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
para_norm <- normalize_expt(lmpara_expt, transform="log2", convert="cpm",
norm="quant", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(hpgl(data))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 63 low-count genes (7907 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 245 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
para_norm_metrics <- graph_metrics(para_norm)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
paranormal_batch <- normalize_expt(lmpara_expt, transform="log2", convert="cpm",
norm="quant", filter=TRUE, batch="svaseq")
## This function will replace the expt$expressionset slot with:
## log2(svaseq(cpm(quant(hpgl(data)))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Warning in normalize_expt(lmpara_expt, transform = "log2", convert =
## "cpm", : Quantile normalization and sva do not always play well together.
## Step 1: performing count filter with option: hpgl
## Removing 63 low-count genes (7907 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 245 values equal to 0, adding 1 to the matrix.
## Step 5: doing batch correction with svaseq.
## In norm_batch, after testing logic of surrogate method/number, the
## number of surrogates is: and the method is: be.
## Note to self: If you get an error like 'x contains missing values'; I think this
## means that the data has too many 0's and needs to have a better low-count filter applied.
## batch_counts: Before batch correction, 11802 entries 0<x<1.
## batch_counts: Before batch correction, 245 entries are >= 0.
## After checking/setting the number of surrogates, it is: 7.
## batch_counts: Using sva::svaseq for batch correction.
## Note to self: If you feed svaseq a data frame you will get an error like:
## data %*% (Id - mod %*% blah blah requires numeric/complex arguments.
## The number of elements which are < 0 after batch correction is: 473
## The variable low_to_zero sets whether to change <0 values to 0 and is: FALSE
paranormal_metrics <- graph_metrics(paranormal_batch)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
para_metrics$libsize
para_metrics$topn
para_norm_metrics$corheat
## hmm interesting
para_norm_metrics$pcaplot
paranormal_metrics$pcaplot
paranormal_metrics$corheat
paranormal_metrics$disheat
Well, it looks like I was not right. I don’t think I was entirely wrong either, but I was certainly not right. Lets see if varpart can pick out anything.
paranormal_varpart <- varpart(lmpara_expt, predictor=NULL, factors=c("time", "batch"))
## Attempting mixed linear model with: ~ (1|time) + (1|batch)
## Fitting the expressionset to the model, this is slow.
## Projected run time: ~ 14 min
## Placing factor: time at the beginning of the model.
paranormal_varpart$partition_plot
## ok, so my arbitrary creation of 'batch' out of the replicate number is useless.
## But damn there is a lot of unexplained variance in this data!
lm_fun <- write_expt(lmpara_expt, excel=paste0("excel/lmparasite_samples_written-v", ver, ".xlsx"),
filter=TRUE, norm="quant", convert="raw", transform="log2")
## Writing the legend.
## Writing the raw reads.
## Graphing the raw reads.
## Writing the normalized reads.
## Graphing the normalized reads.
## Writing the median reads by factor.
## The factor pro has 4 rows.
## The factor ama has 4 rows.
## The factor pmn12hinf has 4 rows.
## The factor pmn14dinf has 4 rows.
pander::pander(sessionInfo())
R version 3.4.4 (2018-03-15)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ruv(v.0.9.7) and hpgltools(v.2018.03)
loaded via a namespace (and not attached): nlme(v.3.1-131.1), bitops(v.1.0-6), matrixStats(v.0.53.1), pbkrtest(v.0.4-7), devtools(v.1.13.5), bit64(v.0.9-7), doParallel(v.1.0.11), RColorBrewer(v.1.1-2), rprojroot(v.1.3-2), tools(v.3.4.4), backports(v.1.1.2), R6(v.2.2.2), KernSmooth(v.2.23-15), DBI(v.0.8), lazyeval(v.0.2.1), BiocGenerics(v.0.24.0), mgcv(v.1.8-23), colorspace(v.1.3-2), withr(v.2.1.2), gridExtra(v.2.3), bit(v.1.1-12), compiler(v.3.4.4), preprocessCore(v.1.40.0), Biobase(v.2.38.0), xml2(v.1.2.0), labeling(v.0.3), caTools(v.1.17.1), scales(v.0.5.0.9000), readr(v.1.1.1), genefilter(v.1.60.0), quadprog(v.1.5-5), commonmark(v.1.4), stringr(v.1.3.0), digest(v.0.6.15), minqa(v.1.2.4), rmarkdown(v.1.9), variancePartition(v.1.8.1), colorRamps(v.2.3), base64enc(v.0.1-3), pkgconfig(v.2.0.1), htmltools(v.0.3.6), lme4(v.1.1-15), limma(v.3.34.9), rlang(v.0.2.0.9001), RSQLite(v.2.0), BiocParallel(v.1.12.0), gtools(v.3.5.0), RCurl(v.1.95-4.10), magrittr(v.1.5), Matrix(v.1.2-12), Rcpp(v.0.12.16), munsell(v.0.4.3), S4Vectors(v.0.16.0), stringi(v.1.1.7), yaml(v.2.1.18), edgeR(v.3.20.9), MASS(v.7.3-49), gplots(v.3.0.1), Rtsne(v.0.13), plyr(v.1.8.4), grid(v.3.4.4), blob(v.1.1.0), parallel(v.3.4.4), gdata(v.2.18.0), ggrepel(v.0.7.0), lattice(v.0.20-35), splines(v.3.4.4), annotate(v.1.56.1), pander(v.0.6.1), hms(v.0.4.2), locfit(v.1.5-9.1), knitr(v.1.20), pillar(v.1.2.1), rjson(v.0.2.15), corpcor(v.1.6.9), reshape2(v.1.4.3), codetools(v.0.2-15), stats4(v.3.4.4), XML(v.3.98-1.10), evaluate(v.0.10.1), data.table(v.1.10.4-3), nloptr(v.1.0.4), foreach(v.1.4.4), gtable(v.0.2.0), ggplot2(v.2.2.1), openxlsx(v.4.0.17), xtable(v.1.8-2), roxygen2(v.6.0.1), survival(v.2.41-3), tibble(v.1.4.2), iterators(v.1.0.9), AnnotationDbi(v.1.40.0), memoise(v.1.1.0), IRanges(v.2.12.0), tximport(v.1.6.0), sva(v.3.26.0) and directlabels(v.2017.03.31)
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 02_sample_estimation_lmajor_201804-v20180410.rda.xz
tt <- sm(saveme(filename=this_save))