hs_annot <- load_biomart_annotations()$annotation
## The biomart annotations file already exists, loading from it.
rownames(hs_annot) <- make.names(
paste0(hs_annot[["ensembl_transcript_id"]], ".",
hs_annot[["transcript_version"]]),
unique=TRUE)
hs_tx_gene <- hs_annot[, c("ensembl_gene_id", "ensembl_transcript_id")]
hs_tx_gene[["id"]] <- rownames(hs_tx_gene)
hs_tx_gene <- hs_tx_gene[, c("id", "ensembl_gene_id")]
new_hs_annot <- hs_annot
rownames(new_hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)
lots <- create_expt("sample_sheets/many_samples.xlsx",
gene_info=new_hs_annot,
tx_gene_map=hs_tx_gene)
## Reading the sample metadata.
## The sample definitions comprises: 285 rows(samples) and 31 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 19629 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
What is the ‘signature’ of an infected macrophage? ** If one had to pick 10-20 marker genes which characterize an infected macrophage (up or down compared to uninfected), what would they be? ** If given a new sample that is unknown, what can we use to tell if it is infected or uninfected?
Can we make ‘better signatures’? ** Presumably signatures which define uninfected vs. infected.
initial <- plot_libsize(lots)
initial$plot
lots_met <- graph_metrics(lots)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 1284413 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, setting them to 0.5.
## Changed 1284413 zero count features.
## Plotting a CV plot.
## Naively calculating coefficient of variation/dispersion with respect to condition.
## Finished calculating dispersion estimates.
## Plotting the representation of the top-n genes.
## Plotting the expression of the top-n PC loaded genes.
## Printing a color to condition legend.
lots_norm <- normalize_expt(lots, norm="quant", filter=TRUE, convert="cpm",
transform="log2")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(hpgl(data))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 3762 low-count genes (15867 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 11575 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
norm_met <- graph_metrics(lots_norm)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## Plotting a CV plot.
## Naively calculating coefficient of variation/dispersion with respect to condition.
## Finished calculating dispersion estimates.
## Plotting the representation of the top-n genes.
## Plotting the expression of the top-n PC loaded genes.
## Printing a color to condition legend.
test_factors <- pca_information(lots_norm, num_components=4, plot_pcas=TRUE)
## More shallow curves in these plots suggest more genes in this principle component.
lots_pca <- plot_pca(lots_norm, plot_label=FALSE)
## Not putting labels on the plot.
lots_pca$plot
no_biopsy <- subset_expt(lots, subset="celltype!='skin'")
## There were 266, now there are 224 samples.
no_biopsy <- set_expt_conditions(no_biopsy, fact="infectstate")
macrophages <- subset_expt(lots, subset="celltype=='macrophage'")
## There were 266, now there are 203 samples.
uninf <- subset_expt(macrophages, subset="state=='uninfected'")
## There were 203, now there are 74 samples.
uninf_norm <- normalize_expt(uninf, convert="cpm", norm="quant", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## cpm(quant(hpgl(data)))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 6428 low-count genes (13201 remaining).
## Step 2: normalizing the data with quant.
## Using normalize.quantiles.robust due to a thread error in preprocessCore.
## Step 3: converting the data with cpm.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
test_uninf <- simple_xcell(uninf_norm, column="cds_length")
## The biomart annotations file already exists, loading from it.
## xCell strongly perfers rpkm values, re-normalizing now.
## This function will replace the expt$expressionset slot with:
## rpkm(data)
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Filter is false, this should likely be set to something, good
## choices include cbcb, kofa, pofa (anything but FALSE). If you want this to
## stay FALSE, keep in mind that if other normalizations are performed, then the
## resulting libsizes are likely to be strange (potentially negative!)
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: not doing count filtering.
## Step 2: not normalizing the data.
## Step 3: converting the data with rpkm.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
## Loading required namespace: xCell
test_uninf$heatmap
macrophages_norm <- normalize_expt(macrophages, convert="rpkm", column="cds_length", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## rpkm(hpgl(data))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 5595 low-count genes (14034 remaining).
## Step 2: not normalizing the data.
## Step 3: converting the data with rpkm.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
na_idx <- is.na(exprs(macrophages_norm))
macrophages_norm[na_idx] <- 0
mtrx <- exprs(macrophages_norm)
mtrx[na_idx] <- 0
Biobase::exprs(macrophages_norm$expressionset) <- mtrx
test_macrophages <- simple_xcell(macrophages, column="cds_length", filter=TRUE, label_size=0.3, width=15)
## The biomart annotations file already exists, loading from it.
## xCell strongly perfers rpkm values, re-normalizing now.
## This function will replace the expt$expressionset slot with:
## rpkm(hpgl(data))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 5595 low-count genes (14034 remaining).
## Step 2: not normalizing the data.
## Step 3: converting the data with rpkm.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
pp(file="images/macrophage_xcell_heatmap.pdf", image=test_macrophages$heatmap)
## Writing the image to: images/macrophage_xcell_heatmap.pdf and calling dev.off().
```{r poke_subsets macr_infuninf <- set_expt_conditions(macrophages, fact=“infectstate”) macr_norm <- normalize_expt(macr_infuninf, filter=TRUE, transform=“log2”, convert=“cpm”, batch=“svaseq”) macr_pca <- plot_pca(macr_norm, plot_label=FALSE) threed <- make_3d_pca(macr_pca) threed$plot
pp(file=“images/simple_macrophages.png”, image=macr_pca\(plot) macr_tsne <- plot_tsne(macr_norm, plot_label=FALSE, iterations=5000) macr_tsne\)plot
no_norm <- normalize_expt(no_biopsy, filter=TRUE, transform=“log2”, convert=“cpm”, batch=“svaseq”) no_pca <- plot_pca(no_norm, plot_label=FALSE, x_pc=1, y_pc=2, cis=FALSE) no_pca\(plot ggplt <- ggplotly(no_pca\)plot) widget <- htmlwidgets::saveWidget(as_widget(ggplt), “no_biopsy_svaseq_pca.html”)
high_cov <- subset_expt(no_biopsy, coverage=10000000) topsva <- normalize_expt(high_cov, filter=TRUE, transform=“log2”, batch=“svaseq”, num_surrogates=4) toppca <- plot_pca(topsva, plot_labels=FALSE, cis=FALSE) toppca$plot
no_batch_norm <- normalize_expt(no_biopsy, filter=TRUE, convert=“cpm”, transform=“log2”, batch=“svaseq”, num_surrogates=3) noba <- plot_pca(no_batch_norm, plot_labels=FALSE) noba$plot
no_test <- pca_information(no_norm, expt_factors=c(“condition”, “batch”, “expttime”), num_components=5, plot_pcas=TRUE) no_test$anova_neglogp_heatmap
lots_tsne <- plot_tsne(lots_norm, plot_label=FALSE) lots_tsne\(plot ##norm_met <- graph_metrics(lots_norm) ##norm_met\)pcaplot
gsva_big <- simple_gsva(lots, current_id=“ENSEMBL”) gsva_expt <- gsva_big\(expt gsva_cor <- plot_corheat(gsva_expt) pp(file="images/gsva_correlation.pdf", image=gsva_cor\)plot)
test_xcell <- simple_xcell(lots, column=“cds_length”) test_xcell$heatmap ```
if (!isTRUE(get0("skip_load"))) {
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
## message(paste0("Saving to ", savefile))
## tmp <- sm(saveme(filename=savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset b0e11455e9f02944597c1bb5027f8ecbeb14201b
## This is hpgltools commit: Fri Jan 18 13:42:11 2019 -0500: b0e11455e9f02944597c1bb5027f8ecbeb14201b