This is a rerun of a much older analysis, partially to see if it still works and partially to better document my old work.
This is an rmarkdown document which makes heavy use of the hpgltools package. The following section demonstrates how to set that up in a clean R environment.
## Use R's install.packages to install devtools.
install.packages("devtools")
## Use devtools to install hpgltools.
devtools::install_github("elsayedlab/hpgltools")
## Load hpgltools into the R environment.
library(hpgltools)
## Use hpgltools' autoloads_all() function to install the many packages used by hpgltools.
The following are some requests I have received and whether or not I think I did them.
gff_file <- "reference/mgas_hsc5.gff.gz"
hsc5_info <- load_gff_annotations(gff_file)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Returning a df with 15 columns and 1812 rows.
## The following lines are likely replaced by the above
## but will stay here for the moment
annotations_hsc5 <- rtracklayer::import.gff3(gff_file)
annotation_hsc5_info <- BiocGenerics::as.data.frame(annotations_hsc5)
rownames(annotation_hsc5_info) <- make.names(annotation_hsc5_info$locus_tag, unique=TRUE)
genes_hsc5 <- annotation_hsc5_info[annotation_hsc5_info$type=="gene",]
rownames(genes_hsc5) <- make.names(genes_hsc5$locus_tag, unique=TRUE)
gene_annotations_hsc5 <- subset(genes_hsc5, select = c("start", "end", "width", "strand", "locus_tag"))
gene_lengths <- gene_annotations_hsc5[,c("width","start")]
gene_lengths$ID <- rownames(gene_lengths)
gene_lengths <- gene_lengths[,c("ID","width")]
short_annotations_hsc5 <- gene_annotations_hsc5[,c("width", "locus_tag")]
##tooltip_data_hsc5 <- make_tooltips(annotations=hsc5_info, desc_col=c("old_locus_tag","gene"), id_col="locus_tag")
xlsx_writer <- write_xls(gene_annotations_hsc5, sheet="genes")
openxlsx::saveWorkbook(wb=xlsx_writer$workbook, file="excel/annotations.xlsx", overwrite=TRUE)
##go_entries_hsc5 = strsplit(as.character(microbes_go_hsc5$GO), split=",", perl=TRUE)
##microbes_go_oneperrow_hsc5 = data.frame(name = rep(microbes_go_hsc5$sysName, sapply(go_entries_hsc5, length)), GO = unlist(go_entries_hsc5))
##microbes_go_hsc5 = microbes_go_oneperrow_hsc5
##rm(microbes_go_oneperrow_hsc5)
##rm(go_entries_hsc5)
## These are used for gene ontology stuff...
##microbes_lengths_hsc5 = microbes_hsc5[,c("sysName", "start","stop")]
##microbes_lengths_hsc5$length = abs(microbes_hsc5$start - microbes_hsc5$stop)
##microbes_lengths_hsc5 = microbes_lengths_hsc5[,c("sysName","length")]
hsc5_expt <- create_expt("all_samples.csv")
## Reading the sample metadata.
## The sample definitions comprises: 12, 14 rows, columns.
## Reading count tables.
## /cbcb/nelsayed-scratch/atb/tnseq/spyogenes_hsc5_2016/processed_data/counts/run1/thyt0-trimmed-v0M1.count.xz contains 1817 rows.
## processed_data/counts/run1/thyt1-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run1/cdmwithasn-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run1/cdmnoasn-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run2/thyt0run2-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run2/thyt1run2-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run2/asnrun2-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run2/noasnrun2-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/thyt0-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/thyt1-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/asn-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/noasn-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## Finished reading count tables.
## Matched 1812 annotations and counts.
## Bringing together the count matrix and gene information.
head(exprs(hsc5_expt$expressionset))
## HPGL0596 HPGL0597 HPGL0598 HPGL0599 HPGL0596b HPGL0597b HPGL0598b HPGL0599b
## L897_RS00005 0 0 0 1 21 2 4 6
## L897_RS00010 0 1 2 0 12 20 9 4
## L897_RS00015 10 0 12 15 20 11 23 16
## L897_RS00020 91 24 26 57 1010 1565 2094 1521
## L897_RS00025 0 0 0 0 1 1 0 1
## L897_RS00030 3523 1707 2947 1236 25280 57446 41772 39227
## HPGL0596c HPGL0597c HPGL0598c HPGL0599c
## L897_RS00005 26 5 11 12
## L897_RS00010 19 45 25 11
## L897_RS00015 38 55 40 33
## L897_RS00020 2369 2190 1865 1772
## L897_RS00025 1 2 3 0
## L897_RS00030 51103 74479 42698 40760
norm_hsc5 <- normalize_expt(hsc5_expt, transform="log2", norm="quant", convert="cpm", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(hpgl(data))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 332 low-count genes (1480 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 12 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
plot_pca(norm_hsc5)$plot
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
batch_hsc5 <- normalize_expt(hsc5_expt, transform="log2", norm="rle", convert="cpm", batch=TRUE, filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(sva(cpm(rle(hpgl(data)))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Step 1: performing count filter with option: hpgl
## Removing 332 low-count genes (1480 remaining).
## Step 2: normalizing the data with rle.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 593 values equal to 0, adding 1 to the matrix.
## Step 5: doing batch correction with sva.
## Note to self: If you get an error like 'x contains missing values'; I think this
## means that the data has too many 0's and needs to have a better low-count filter applied.
## batch_counts: Before batch correction, 954 entries 0<x<1.
## batch_counts: Before batch correction, 593 entries are >= 0.
## Passing the batch method to get_model_adjust().
## It understands a few additional batch methods.
## Not able to discern the state of the data.
## Going to use a simplistic metric to guess if it is log scale.
## The be method chose 3 surrogate variable(s).
## Estimate type 'sva' is shorthand for 'sva_unsupervised'.
## Other sva options include: sva_supervised and svaseq.
## Attempting sva unsupervised surrogate estimation with 3 surrogates.
## The number of elements which are < 0 after batch correction is: 782
## The variable low_to_zero sets whether to change <0 values to 0 and is: FALSE
plot_pca(batch_hsc5)$plot
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Run 1 can in no way be compared to runs 2/3
hsc5_expt = create_expt("kept_samples.csv")
## Reading the sample metadata.
## The sample definitions comprises: 8, 14 rows, columns.
## Reading count tables.
## /cbcb/nelsayed-scratch/atb/tnseq/spyogenes_hsc5_2016/processed_data/counts/run2/thyt0run2-trimmed-v0M1.count.xz contains 1817 rows.
## processed_data/counts/run2/thyt1run2-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run2/asnrun2-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run2/noasnrun2-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/thyt0-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/thyt1-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/asn-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## processed_data/counts/run3/noasn-trimmed-v0M1.count.xz contains 1817 rows and merges to 1817 rows.
## Finished reading count tables.
## Matched 1812 annotations and counts.
## Bringing together the count matrix and gene information.
head(exprs(hsc5_expt$expressionset))
## HPGL0596b HPGL0597b HPGL0598b HPGL0599b HPGL0596c HPGL0597c HPGL0598c
## L897_RS00005 21 2 4 6 26 5 11
## L897_RS00010 12 20 9 4 19 45 25
## L897_RS00015 20 11 23 16 38 55 40
## L897_RS00020 1010 1565 2094 1521 2369 2190 1865
## L897_RS00025 1 1 0 1 1 2 3
## L897_RS00030 25280 57446 41772 39227 51103 74479 42698
## HPGL0599c
## L897_RS00005 12
## L897_RS00010 11
## L897_RS00015 33
## L897_RS00020 1772
## L897_RS00025 0
## L897_RS00030 40760
norm_hsc5 = normalize_expt(hsc5_expt, transform="log2", norm="tmm", convert="cpm", filter=FALSE, batch=FALSE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(tmm(data)))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Filter is false, this should likely be set to something, good
## choices include cbcb, kofa, pofa (anything but FALSE). If you want this to
## stay FALSE, keep in mind that if other normalizations are performed, then the
## resulting libsizes are likely to be strange (potentially negative!)
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: not doing count filtering.
## Step 2: normalizing the data with tmm.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 1649 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
plot_pca(norm_hsc5)$plot
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## The top two PCs have pretty much the same amount of variance between them
## Thus I think adding batch to the model will be a good thing
norm_hsc5 = normalize_expt(hsc5_expt, transform="log2", norm="tmm", convert="cpm", filter=TRUE, batch=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(sva(cpm(tmm(hpgl(data)))))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Step 1: performing count filter with option: hpgl
## Removing 346 low-count genes (1466 remaining).
## Step 2: normalizing the data with tmm.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 30 values equal to 0, adding 1 to the matrix.
## Step 5: doing batch correction with sva.
## Note to self: If you get an error like 'x contains missing values'; I think this
## means that the data has too many 0's and needs to have a better low-count filter applied.
## batch_counts: Before batch correction, 130 entries 0<x<1.
## batch_counts: Before batch correction, 30 entries are >= 0.
## Passing the batch method to get_model_adjust().
## It understands a few additional batch methods.
## Not able to discern the state of the data.
## Going to use a simplistic metric to guess if it is log scale.
## The be method chose 1 surrogate variable(s).
## Estimate type 'sva' is shorthand for 'sva_unsupervised'.
## Other sva options include: sva_supervised and svaseq.
## Attempting sva unsupervised surrogate estimation with 1 surrogates.
## The number of elements which are < 0 after batch correction is: 30
## The variable low_to_zero sets whether to change <0 values to 0 and is: FALSE
hsc5_pca <- plot_pca(norm_hsc5)$plot
pp("images/pca_two_replicates.png", image=hsc5_pca)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Very nice
unmodified_metrics <- graph_metrics(hsc5_expt)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 1649 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, setting them to 0.5.
## Changed 1649 zero count features.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
unmodified_metrics$libsize
unmodified_metrics$density
unmodified_metrics$boxplot
normalized_metrics <- graph_metrics(norm_hsc5, qq=TRUE)
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some data are negative. We are on log scale, setting them to 0.
## Changed 30 negative features.
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 30 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some data are negative. We are on log scale, setting them to 0.5.
## Changed 30 negative features.
## Plotting the representation of the top-n genes.
## Printing a color to condition legend.
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## QQ plotting!
## Making plot of HPGL0596b(1) vs. a sample distribution.
## Making plot of HPGL0597b(2) vs. a sample distribution.
## Making plot of HPGL0598b(3) vs. a sample distribution.
## Making plot of HPGL0599b(4) vs. a sample distribution.
## Making plot of HPGL0596c(5) vs. a sample distribution.
## Making plot of HPGL0597c(6) vs. a sample distribution.
## Making plot of HPGL0598c(7) vs. a sample distribution.
## Making plot of HPGL0599c(8) vs. a sample distribution.
normalized_metrics$corheat
normalized_metrics$disheat
## This last one might be new to folks reading this document
normalized_metrics$qqlog
## Each of the 8 plots comprise a comparison of the rank-ordered genes vs. the mean of all samples
## The idea is that if one or more are significantly shifted in some way, then those samples are untenably different
## than the rest. The color from light-blue->dark-blue shows the density of genes at the given normalized(cpm)
The fitness analyses we have performed are a sort of differential expression bastardization
all_de <- all_pairwise(hsc5_expt, model_batch=TRUE)
## Using limma's removeBatchEffect to visualize before/after batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses 1/6: noasn_vs_asn
## Comparing analyses 2/6: thyt0_vs_asn
## Comparing analyses 3/6: thyt1_vs_asn
## Comparing analyses 4/6: thyt0_vs_noasn
## Comparing analyses 5/6: thyt1_vs_noasn
## Comparing analyses 6/6: thyt1_vs_thyt0
## holy crap DESeq2 really doesn't agree with either limma nor edgeR!
## I wonder if this is due to low-count fintering?
low_filtered <- normalize_expt(hsc5_expt, filter=TRUE)
## This function will replace the expt$expressionset slot with:
## hpgl(data)
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unconverted. It is often advisable to cpm/rpkm
## the data to normalize for sampling differences, keep in mind though that rpkm
## has some annoying biases, and voom() by default does a cpm (though hpgl_voom()
## will try to detect this).
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 346 low-count genes (1466 remaining).
## Step 2: not normalizing the data.
## Step 3: not converting the data.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
low_de <- all_pairwise(low_filtered, model_batch=TRUE)
## Using limma's removeBatchEffect to visualize before/after batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses 1/6: noasn_vs_asn
## Comparing analyses 2/6: thyt0_vs_asn
## Comparing analyses 3/6: thyt1_vs_asn
## Comparing analyses 4/6: thyt0_vs_noasn
## Comparing analyses 5/6: thyt1_vs_noasn
## Comparing analyses 6/6: thyt1_vs_thyt0
## That is part of the difference, but not all of it
## (note that the low end of the color-spectrum is now a rho of 0.6 rather than 0.5)
## I have a suspicion that the way I am including batch in DESeq's model is either incorrect or can be changed.
initial_result <- combine_de_tables(low_de, annot_df=short_annotations_hsc5, excel="excel/initial_fitness.xlsx")
## Deleting the file excel/initial_fitness.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Working on table 1/6: noasn_vs_asn
## Working on table 2/6: thyt0_vs_asn
## Working on table 3/6: thyt1_vs_asn
## Working on table 4/6: thyt0_vs_noasn
## Working on table 5/6: thyt1_vs_noasn
## Working on table 6/6: thyt1_vs_thyt0
## Adding venn plots for noasn_vs_asn.
## Limma expression coefficients for noasn_vs_asn; R^2: 0.974; equation: y = 0.987x + 0.103
## Edger expression coefficients for noasn_vs_asn; R^2: 0.974; equation: y = 0.935x + 0.655
## DESeq2 expression coefficients for noasn_vs_asn; R^2: 0.974; equation: y = 0.935x + 0.803
## Adding venn plots for thyt0_vs_asn.
## Limma expression coefficients for thyt0_vs_asn; R^2: 0.984; equation: y = 0.987x + 0.105
## Edger expression coefficients for thyt0_vs_asn; R^2: 0.983; equation: y = 0.973x + 0.258
## DESeq2 expression coefficients for thyt0_vs_asn; R^2: 0.983; equation: y = 0.973x + 0.339
## Adding venn plots for thyt1_vs_asn.
## Limma expression coefficients for thyt1_vs_asn; R^2: 0.983; equation: y = 0.993x + 0.0572
## Edger expression coefficients for thyt1_vs_asn; R^2: 0.982; equation: y = 0.949x + 0.688
## DESeq2 expression coefficients for thyt1_vs_asn; R^2: 0.983; equation: y = 0.949x + 0.627
## Adding venn plots for thyt0_vs_noasn.
## Limma expression coefficients for thyt0_vs_noasn; R^2: 0.971; equation: y = 0.968x + 0.273
## Edger expression coefficients for thyt0_vs_noasn; R^2: 0.97; equation: y = 1.01x - 0.128
## DESeq2 expression coefficients for thyt0_vs_noasn; R^2: 0.97; equation: y = 1.01x - 0.139
## Adding venn plots for thyt1_vs_noasn.
## Limma expression coefficients for thyt1_vs_noasn; R^2: 0.969; equation: y = 0.972x + 0.248
## Edger expression coefficients for thyt1_vs_noasn; R^2: 0.969; equation: y = 0.981x + 0.263
## DESeq2 expression coefficients for thyt1_vs_noasn; R^2: 0.969; equation: y = 0.979x + 0.269
## Adding venn plots for thyt1_vs_thyt0.
## Limma expression coefficients for thyt1_vs_thyt0; R^2: 0.988; equation: y = 1x - 0.0225
## Edger expression coefficients for thyt1_vs_thyt0; R^2: 0.987; equation: y = 0.965x + 0.475
## DESeq2 expression coefficients for thyt1_vs_thyt0; R^2: 0.987; equation: y = 0.965x + 0.429
## Writing summary information.
## Attempting to add the comparison plot to pairwise_summary at row: 24 and column: 1
##
|
| | 0%
|
|============= | 17%
|
|=========================== | 33%
|
|======================================== | 50%
|
|===================================================== | 67%
|
|=================================================================== | 83%
|
|================================================================================| 100%
## Performing save of the workbook.
initial_result$comp_plot$plot
The following from Miriam: The CDS Dval is calculated by dividing the number of hits in each gene by the number of TAs. I guess plotting it against length might be interesting?
dval_hsc5 <- convert_counts(hsc5_expt, convert="cp_seq_m",
annotations="reference/mgas_hsc5.gff.gz",
genome="reference/mgas_hsc5.fasta")
## Using pattern: TA instead of length for an rpkm-ish normalization.
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Returning a df with 15 columns and 1812 rows.
dval_df <- as.data.frame(dval_hsc5$count_table)
dval_df$median <- log2(matrixStats::rowMedians(as.matrix(dval_df)) + 1)
dval_df = merge(dval_df, gene_lengths, by="row.names")
dval_df$median_vs_width = dval_df$median / dval_df$width
rownames(dval_df) <- dval_df$ID
keepers <- list(
"asn_vs_noasn" = c("asn","noasn"),
"t1_vs_t0" = c("thyt1","thyt0")
)
initial_result <- combine_de_tables(low_de,
annot_df=dval_df,
excel="excel/initial_fitness.xlsx",
keepers=keepers)
## Deleting the file excel/initial_fitness.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Working on 1/2: asn_vs_noasn
## Found inverse table with noasn_vs_asn
## Working on 2/2: t1_vs_t0
## Found table with thyt1_vs_thyt0
## Adding venn plots for asn_vs_noasn.
## Limma expression coefficients for asn_vs_noasn; R^2: 0.974; equation: y = 0.987x + 0.103
## Edger expression coefficients for asn_vs_noasn; R^2: 0.974; equation: y = 0.935x + 0.655
## DESeq2 expression coefficients for asn_vs_noasn; R^2: 0.974; equation: y = 0.935x + 0.803
## Adding venn plots for thyt1_vs_thyt0.
## Limma expression coefficients for thyt1_vs_thyt0; R^2: 0.988; equation: y = 0.98x + 0.177
## Edger expression coefficients for thyt1_vs_thyt0; R^2: 0.987; equation: y = 1.02x - 0.22
## DESeq2 expression coefficients for thyt1_vs_thyt0; R^2: 0.987; equation: y = 1.02x - 0.195
## Writing summary information.
## Attempting to add the comparison plot to pairwise_summary at row: 20 and column: 1
##
|
| | 0%
|
|======================================== | 50%
|
|================================================================================| 100%
## Performing save of the workbook.
dval_width = dval_df[,c("width","median")]
scatter <- plot_linear_scatter(dval_width)
## Used Bon Ferroni corrected t test(s) between columns.
scatter$scatter
density = as.matrix(Biobase::exprs(hsc5_expt$expressionset))
density_df = as.data.frame(density)
density_df$median = log2(matrixStats::rowMedians(as.matrix(density)) + 1)
density_df = merge(density_df, gene_lengths, by="row.names")
density_df = density_df[,c("width","median")]
scatter <- plot_linear_scatter(density_df)
## Used Bon Ferroni corrected t test(s) between columns.
scatter$scatter
xlsx_writer <- hpgltools::write_xls(dval_df, sheet="dval")
openxlsx::saveWorkbook(wb=xlsx_writer$workbook, file="excel/dval.xlsx", overwrite=TRUE)
While Yoann is here lets do a quick histogram and see if essentiality metrics make sense As we go from m1 -> m16, the sensitivity of the essentiality tool decreases and a larger number of spurious 0’s occurs.
## Testing to see if some parameters are better than others
run2_thyt0_m1 <- read.csv("essentiality/mh_ess/run2/05v1M1l20gen_thyt0_genome.bam_essentiality_m1.csv", comment.char="#", header=FALSE, sep="\t")
colnames(run2_thyt0_m1) <- c("ORF","k","n","r","s","zbar","call")
plot_histogram(run2_thyt0_m1$zbar) +
ggplot2::scale_y_continuous(limits=c(0, 6)) +
ggplot2::scale_x_continuous(limits=c(0, 1))
## Warning: Removed 95 rows containing non-finite values (stat_bin).
## Warning: Removed 95 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
run2_thyt0_m2 <- read.csv("essentiality/mh_ess/run2/05v1M1l20gen_thyt0_genome.bam_essentiality_m2.csv", comment.char="#", header=FALSE, sep="\t")
colnames(run2_thyt0_m2) <- c("ORF","k","n","r","s","zbar","call")
plot_histogram(run2_thyt0_m2$zbar) +
ggplot2::scale_y_continuous(limits=c(0, 6)) +
ggplot2::scale_x_continuous(limits=c(0, 1))
## Warning: Removed 95 rows containing non-finite values (stat_bin).
## Warning: Removed 95 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
run2_thyt0_m4 <- read.csv("essentiality/mh_ess/run2/05v1M1l20gen_thyt0_genome.bam_essentiality_m4.csv", comment.char="#", header=FALSE, sep="\t")
colnames(run2_thyt0_m4) <- c("ORF","k","n","r","s","zbar","call")
plot_histogram(run2_thyt0_m4$zbar) +
ggplot2::scale_y_continuous(limits=c(0, 6)) +
ggplot2::scale_x_continuous(limits=c(0, 1))
## Warning: Removed 95 rows containing non-finite values (stat_bin).
## Warning: Removed 95 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
## This is not good
run2_thyt0_m16 <- read.csv("essentiality/mh_ess/run2/05v1M1l20gen_thyt0_genome.bam_essentiality_m16.csv", comment.char="#", header=FALSE, sep="\t")
colnames(run2_thyt0_m16) <- c("ORF","k","n","r","s","zbar","call")
plot_histogram(run2_thyt0_m16$zbar) +
ggplot2::scale_y_continuous(limits=c(0, 6)) +
ggplot2::scale_x_continuous(limits=c(0, 1))
## Warning: Removed 95 rows containing non-finite values (stat_bin).
## Warning: Removed 95 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_bar).
Now that there is a second usable replicate, the following material is no longer valid But I hate deleting stuff
head(exprs(norm_hsc5$expressionset))
norm_hsc5$design
norm_data = exprs(norm_hsc5$expressionset)
thyt0_vs_thyt1 = norm_data[,c("HPGL0596","HPGL0597")]
noasp_vs_asp = norm_data[,c("HPGL0599","HPGL0598")]
noasp_vs_t0 = norm_data[,c("HPGL0599","HPGL0596")]
asp_vs_t0 = norm_data[,c("HPGL0598","HPGL0596")]
thy = hpgl_linear_scatter(df=thyt0_vs_thyt1, loess=TRUE, identity=TRUE)
thy$scatter
thy$correlation
asp = hpgl_linear_scatter(df=noasp_vs_asp, loess=TRUE, identity=TRUE)
asp$scatter
asp$correlation
aspt0 = hpgl_linear_scatter(df=noasp_vs_t0, loess=TRUE, identity=TRUE)
aspt0$scatter
aspt0$correlation
noasp = hpgl_linear_scatter(df=noasp_vs_t0, loess=TRUE, identity=TRUE)
noaspt0$scatter
noaspt0$correlation
tas_thyt0 = tnseq_saturation("essentiality/mh_ess/tas_05v0M1l20gen_thyt0_genome.bam.txt")
tas_thyt0
tas_thyt1 = tnseq_saturation("essentiality/mh_ess/tas_05v0M1l20gen_thyt1_genome.bam.txt")
tas_thyt1
tas_asp = tnseq_saturation("essentiality/mh_ess/tas_05v0M1l20gen_asp_genome.bam.txt")
tas_asp
tas_noasp = tnseq_saturation("essentiality/mh_ess/tas_05v0M1l20gen_noasp_genome.bam.txt")
tas_noasp
tas_thyt0v1 = tnseq_saturation("essentiality/mh_ess/tas_05v1M1l20gen_thyt0_genome.bam.txt")
tas_thyt0v1
tas_thyt1v1 = tnseq_saturation("essentiality/mh_ess/tas_05v1M1l20gen_thyt1_genome.bam.txt")
tas_thyt1v1
tas_aspv1 = tnseq_saturation("essentiality/mh_ess/tas_05v1M1l20gen_asp_genome.bam.txt")
tas_aspv1
tas_noaspv1 = tnseq_saturation("essentiality/mh_ess/tas_05v1M1l20gen_noasp_genome.bam.txt")
tas_noaspv1
Lets make a right quick plot of the library densities
merged = merge(gene_annotations_hsc5, norm_data, by="row.names")
rownames(merged) = merged$Row.names
hsc5_cfg = hpgltools:::circos_prefix('hsc5')
hsc5_kary = hpgltools:::circos_karyotype(name='hsc5', fasta="reference/mgas_hsc5.fasta")
go_table = annotation_hsc5_info
go_table = go_table[,c("start","end","strand")]
go_table$go = ""
hsc5_plus_minus = hpgltools:::circos_plus_minus(go_table, cfgout=hsc5_cfg)
hsc5_thyt0 = hpgltools:::circos_hist(merged, cfgout=hsc5_cfg, colname="HPGL0596", outer=hsc5_plus_minus, fill_color="black", color="black")
hsc5_thyt1 = hpgltools:::circos_hist(merged, cfgout=hsc5_cfg, colname="HPGL0597", outer=hsc5_thyt0, fill_color="blue", color="black")
hsc5_noasp = hpgltools:::circos_hist(merged, cfgout=hsc5_cfg, colname="HPGL0599", outer=hsc5_thyt1, fill_color="green", color="black")
hsc5_asp = hpgltools:::circos_hist(merged, cfgout=hsc5_cfg, colname="HPGL0598", outer=hsc5_noasp, fill_color="red", color="black")
hpgltools:::circos_suffix(hsc5_cfg)
hpgltools:::circos_make('hsc5')
if (!isTRUE(get0("skip_load"))) {
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 2a0661d6e37f8a3d8831eb3bbd6347c0d9c4b3b7
## R> packrat::restore()
## This is hpgltools commit: Thu Mar 29 16:59:07 2018 -0400: 2a0661d6e37f8a3d8831eb3bbd6347c0d9c4b3b7
## Saving to index-v20180329.rda.xz