head(mm_annotv2$annotation)
## ensembl_transcript_id ensembl_gene_id version transcript_version
## ENSMUST00000000001 ENSMUST00000000001 ENSMUSG00000000001 4 4
## ENSMUST00000000003 ENSMUST00000000003 ENSMUSG00000000003 15 13
## ENSMUST00000000010 ENSMUST00000000010 ENSMUSG00000020875 9 8
## ENSMUST00000000028 ENSMUST00000000028 ENSMUSG00000000028 14 13
## ENSMUST00000000033 ENSMUST00000000033 ENSMUSG00000048583 16 11
## ENSMUST00000000049 ENSMUST00000000049 ENSMUSG00000000049 11 5
## hgnc_symbol
## ENSMUST00000000001
## ENSMUST00000000003
## ENSMUST00000000010
## ENSMUST00000000028
## ENSMUST00000000033
## ENSMUST00000000049
## description
## ENSMUST00000000001 guanine nucleotide binding protein (G protein), alpha inhibiting 3 [Source:MGI Symbol;Acc:MGI:95773]
## ENSMUST00000000003 probasin [Source:MGI Symbol;Acc:MGI:1860484]
## ENSMUST00000000010 homeobox B9 [Source:MGI Symbol;Acc:MGI:96190]
## ENSMUST00000000028 cell division cycle 45 [Source:MGI Symbol;Acc:MGI:1338073]
## ENSMUST00000000033 insulin-like growth factor 2 [Source:MGI Symbol;Acc:MGI:96434]
## ENSMUST00000000049 apolipoprotein H [Source:MGI Symbol;Acc:MGI:88058]
## gene_biotype cds_length chromosome_name strand start_position
## ENSMUST00000000001 protein_coding 1065 3 - 108107280
## ENSMUST00000000003 protein_coding 525 X - 77837901
## ENSMUST00000000010 protein_coding 753 11 + 96271457
## ENSMUST00000000028 protein_coding 1701 16 - 18780447
## ENSMUST00000000033 protein_coding 543 7 - 142650766
## ENSMUST00000000049 protein_coding 1038 11 + 108343354
## end_position
## ENSMUST00000000001 108146146
## ENSMUST00000000003 77853623
## ENSMUST00000000010 96276595
## ENSMUST00000000028 18811987
## ENSMUST00000000033 142666816
## ENSMUST00000000049 108414396
mm_expt <- create_expt(metadata="sample_sheets/all_samples_201804.xlsx",
gene_info=mm_annotv2_df,
file_column="mmusculusfile")
## Reading the sample metadata.
## The sample definitions comprises: 24, 22 rows, columns.
## Reading count tables.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 49753 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## pro ama 12hinf 12huninf 24dinf 24duninf
mm_expt <- set_expt_colors(expt=mm_expt, colors=c("gray", "darkgreen", "darkblue",
"dodgerblue", "darkred", "#EE9999"))
This file was coped from 02_sample_estimation_lmajor_201814.Rmd, so it might be a bit repetitive, but I suspect that there will be more interesting things going on in the host data. I will be sad to not make ‘paranormal’ jokes in my variable names. Also, these colors suck, lets fix that now.
There are lots of toys we have learned to use to play with with raw data and explore stuff like batch effects or non-canonical distributions or skewed counts. hpgltools provides some functionality to make this process easier. The graphs shown below and many more are generated with the wrapper ‘graph_metrics()’ but that takes away the chance to explain the graphs as I generate them.
mm_expt <- exclude_genes_expt(expt=mm_expt, column="Type", method="keep", patterns=c("protein_coding"))
## The Type column is null, doing nothing.
mm_metrics <- graph_metrics(mm_expt)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## The scale difference between the smallest and largest
## libraries is > 10. Assuming a log10 scale is better, set scale=FALSE if not.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 984972 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, setting them to 0.5.
## Changed 984972 zero count features.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
mm_norm <- normalize_expt(mm_expt, filter=TRUE, norm="quant", convert="cpm", transform="log2")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(hpgl(data))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 34796 low-count genes (21024 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 46655 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
mm_normmetrics <- graph_metrics(mm_norm)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
Ok, so this time I expect to see very few reads in the promastigote/amastigote samples.
Is this true?
mm_metrics$legend
## pink and green are the uninfected samples
mm_metrics$libsize
## Still _WAY_ more reads than ideal for promastigote/amastigote.
## I wonder if this is not an artifact of me using salmon.
mm_normmetrics$pcaplot
## It looks to me that the primary principle component is coverage?
## If it is the only factor, when I remove the pro/ama samples, then 1075
## should stick way out to the side with the other samples appearing in order:
## 1076, 1085, 1092, 1083, 1077, 1073, ...
## That does not seem likely.
Now lets get rid of the uninfected samples and try again and see if I am right.
mm_only_expt <- subset_expt(mm_expt, subset="hostp=='yes'")
## There were 24, now there are 16 samples.
mm_only_metrics <- graph_metrics(mm_only_expt)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 571038 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, setting them to 0.5.
## Changed 571038 zero count features.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
mm_only_norm <- normalize_expt(mm_only_expt, transform="log2", convert="cpm",
norm="quant", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(hpgl(data))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 35767 low-count genes (20053 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 39120 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
mm_nonly_metrics <- graph_metrics(mm_only_norm)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
mm_only_batch <- normalize_expt(mm_only_expt, transform="log2", convert="cpm",
norm="quant", filter=TRUE, batch="svaseq")
## This function will replace the expt$expressionset slot with:
## log2(svaseq(cpm(quant(hpgl(data)))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Warning in normalize_expt(mm_only_expt, transform = "log2", convert = "cpm", : Quantile
## normalization and sva do not always play well together.
## Step 1: performing count filter with option: hpgl
## Removing 35767 low-count genes (20053 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 39120 values equal to 0, adding 1 to the matrix.
## Step 5: doing batch correction with svaseq.
## In norm_batch, after testing logic of surrogate method/number, the
## number of surrogates is: and the method is: be.
## Note to self: If you get an error like 'x contains missing values'; I think this
## means that the data has too many 0's and needs to have a better low-count filter applied.
## batch_counts: Before batch correction, 49686 entries 0<x<1.
## batch_counts: Before batch correction, 39120 entries are >= 0.
## After checking/setting the number of surrogates, it is: 4.
## batch_counts: Using sva::svaseq for batch correction.
## Note to self: If you feed svaseq a data frame you will get an error like:
## data %*% (Id - mod %*% blah blah requires numeric/complex arguments.
## The number of elements which are < 0 after batch correction is: 12079
## The variable low_to_zero sets whether to change <0 values to 0 and is: FALSE
mm_bonly_metrics <- graph_metrics(mm_only_batch)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
I added a column to the sample design where bigger coverages (according to the libsize plot above get bigger numbers.
cov_test <- plot_pca(mm_only_norm, size_column="mmcoverage")
cov_test$plot
## Phew, that is good, no obvious clustering by coverage.
mm_only_metrics$libsize
mm_only_metrics$density
mm_nonly_metrics$corheat
mm_bonly_metrics$corheat
mm_bonly_metrics$pcaplot
mm_only_metrics$tsneplot
test <- pca_information(mm_only_expt,
expt_factors=c("mmcoverage", "time", "parasitesp", "condition"))
## More shallow curves in these plots suggest more genes in this principle component.
test$cor_heatmap
mm_varpart <- varpart(mm_only_expt, predictor=NULL,
factors=c("time", "parasitesp", "batch", "coveragefactor"))
## Attempting mixed linear model with: ~ (1|time) + (1|parasitesp) + (1|batch) + (1|coveragefactor)
## Fitting the expressionset to the model, this is slow.
## Projected run time: ~ 45 min
## Placing factor: time at the beginning of the model.
mm_varpart$partition_plot
I need to redo this from the perspective of genes and see if these patterns hold.
pander::pander(sessionInfo())
R version 3.4.4 (2018-03-15)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: hpgltools(v.2018.03)
loaded via a namespace (and not attached): Biobase(v.2.38.0), edgeR(v.3.20.9), bit64(v.0.9-7), splines(v.3.4.4), foreach(v.1.4.4), gtools(v.3.5.0), stats4(v.3.4.4), pander(v.0.6.1), blob(v.1.1.0), yaml(v.2.1.18), ggrepel(v.0.7.0), pillar(v.1.2.1), RSQLite(v.2.0), backports(v.1.1.2), lattice(v.0.20-35), limma(v.3.34.9), quadprog(v.1.5-5), digest(v.0.6.15), RColorBrewer(v.1.1-2), minqa(v.1.2.4), colorspace(v.1.3-2), preprocessCore(v.1.40.0), htmltools(v.0.3.6), Matrix(v.1.2-12), plyr(v.1.8.4), XML(v.3.98-1.10), pkgconfig(v.2.0.1), devtools(v.1.13.5), genefilter(v.1.60.0), xtable(v.1.8-2), corpcor(v.1.6.9), scales(v.0.5.0.9000), gdata(v.2.18.0), Rtsne(v.0.13), openxlsx(v.4.0.17), BiocParallel(v.1.12.0), lme4(v.1.1-15), annotate(v.1.56.1), tibble(v.1.4.2), mgcv(v.1.8-23), IRanges(v.2.12.0), ggplot2(v.2.2.1), withr(v.2.1.2), BiocGenerics(v.0.24.0), lazyeval(v.0.2.1), pbkrtest(v.0.4-7), survival(v.2.41-3), magrittr(v.1.5), memoise(v.1.1.0), evaluate(v.0.10.1), doParallel(v.1.0.11), nlme(v.3.1-131.1), MASS(v.7.3-49), gplots(v.3.0.1), xml2(v.1.2.0), tools(v.3.4.4), directlabels(v.2017.03.31), data.table(v.1.10.4-3), hms(v.0.4.2), matrixStats(v.0.53.1), stringr(v.1.3.0), S4Vectors(v.0.16.0), locfit(v.1.5-9.1), munsell(v.0.4.3), AnnotationDbi(v.1.40.0), colorRamps(v.2.3), compiler(v.3.4.4), caTools(v.1.17.1), rlang(v.0.2.0.9001), RCurl(v.1.95-4.10), grid(v.3.4.4), nloptr(v.1.0.4), iterators(v.1.0.9), tximport(v.1.6.0), rjson(v.0.2.15), labeling(v.0.3), bitops(v.1.0-6), base64enc(v.0.1-3), rmarkdown(v.1.9), variancePartition(v.1.8.1), gtable(v.0.2.0), codetools(v.0.2-15), DBI(v.0.8), roxygen2(v.6.0.1), reshape2(v.1.4.3), R6(v.2.2.2), knitr(v.1.20), bit(v.1.1-12), commonmark(v.1.4), rprojroot(v.1.3-2), KernSmooth(v.2.23-15), readr(v.1.1.1), stringi(v.1.1.7), sva(v.3.26.0), parallel(v.3.4.4) and Rcpp(v.0.12.16)
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 02_sample_estimation_mmusculus_201804-v20180410.rda.xz
tt <- sm(saveme(filename=this_save))