One primary goal of revisiting this data is to look at some intergenic regions. I took a very simplistic route to consider this question.
I recently wrote a nifty function which in theory should fill in a sample sheet, let us see how well it works for this data.
##new_samplesheet <- gather_preprocessing_metadata("sample_sheets/all_samples.xlsx")
It turns out that function is not yet well suited to this data. Notably it tries to sanitize the date columns and does so poorly.
Also, damnit I am an idiot and redid the mapping off of my notes rather than remember that the sample sheet explicitly states that this is 5448 and not 5005. This will not make a huge difference, but I will remap as well.
Note that for the annotations, I am almost certain that the gene IDs have different lexical values between the genome downloaded from NCBI and the annotation data at microbesonline. If I recall it is something like the difference between ‘M5005_Spy_001’ and ‘M5005Spy_001’ or something similarly wacky.
as.data.frame(load_microbesonline_annotations("5005")) annot <-
## Found 1 entry.
## Streptococcus pyogenes MGAS5005Firmicutesyes2005-08-25yes101950293653
## The species being downloaded is: Streptococcus pyogenes MGAS5005
## Downloading: http://www.microbesonline.org/cgi-bin/genomeInfo.cgi?tId=293653;export=tab
rownames(annot) <- make.names(gsub(x=annot[["sysName"]], pattern="M5005_Spy_", replacement="M5005_Spy"), unique=TRUE)
"sample_sheets/all_samples.xlsx"
sample_sheet <- create_expt(sample_sheet, gene_info=annot, file_column="counttable") genome_expt <-
## Reading the sample metadata.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 20 rows(samples) and 28 columns(metadata fields).
## Matched 1926 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 1926 rows and 20 columns.
create_expt(sample_sheet, gene_info=annot, file_column="intercount") inter_expt <-
## Reading the sample metadata.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 20 rows(samples) and 28 columns(metadata fields).
## Matched 1841 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 1841 rows and 20 columns.
genome_expt %>%
genome_expt <- set_expt_conditions(fact="genotype") %>%
set_expt_batches(fact="time")
inter_expt %>%
inter_expt <- set_expt_conditions(fact="genotype") %>%
set_expt_batches(fact="time")
graph_metrics(genome_expt) cds_plots <-
## 2077 entries are 0. We are on a log scale, adding 1 to the data.
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing correlation.
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing distance.
## Changed 2077 zero count features.
## Naively calculating coefficient of variation/dispersion with respect to condition.
## Finished calculating dispersion estimates.
$libsize cds_plots
normalize_expt(genome_expt, filter=TRUE, norm="quant", convert="cpm", transform="log2") cds_norm <-
## Removing 127 low-count genes (1799 remaining).
graph_metrics(cds_norm) cds_norm_plots <-
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing correlation.
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing distance.
## Naively calculating coefficient of variation/dispersion with respect to condition.
## Finished calculating dispersion estimates.
$pc_plot cds_norm_plots
all_pairwise(genome_expt, filter=TRUE, model_batch=TRUE) cds_de <-
## Plotting a PCA before surrogate/batch inclusion.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(cds_de, excel="excel/cds_table.xlsx") cds_table <-
## Deleting the file excel/cds_table.xlsx before writing the tables.
graph_metrics(inter_expt) inter_plots <-
## 1342 entries are 0. We are on a log scale, adding 1 to the data.
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing correlation.
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing distance.
## Changed 1342 zero count features.
## Naively calculating coefficient of variation/dispersion with respect to condition.
## Finished calculating dispersion estimates.
$libsize inter_plots
normalize_expt(inter_expt, filter=TRUE, norm="quant", convert="cpm", transform="log2") inter_norm <-
## Removing 23 low-count genes (1818 remaining).
graph_metrics(inter_norm) inter_norm_plots <-
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing correlation.
## Warning in plot.window(...): "title" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "title" is not a graphical parameter
## Warning in title(...): "title" is not a graphical parameter
## Performing distance.
## Naively calculating coefficient of variation/dispersion with respect to condition.
## Finished calculating dispersion estimates.
$pc_plot inter_norm_plots
all_pairwise(inter_expt, filter=TRUE, model_batch=TRUE) inter_de <-
## Plotting a PCA before surrogate/batch inclusion.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(inter_de, excel="excel/inter_table.xlsx") inter_table <-
## Deleting the file excel/inter_table.xlsx before writing the tables.
compare_de_results(cds_table, inter_table) comp <-
## Testing method: limma.
## Adding method: limma to the set.
## Testing method: deseq.
## Adding method: deseq to the set.
## Testing method: edger.
## Adding method: edger to the set.
## Starting method limma, table wt_vs_mga.
## Starting method deseq, table wt_vs_mga.
## Starting method edger, table wt_vs_mga.
as.data.frame(comp$result)
## limma.wt_vs_mga.logfc limma.wt_vs_mga.p limma.wt_vs_mga.adjp
## 1 0.8272 0.5079 0.586
## deseq.wt_vs_mga.logfc deseq.wt_vs_mga.p deseq.wt_vs_mga.adjp
## 1 0.8973 0.559 0.6215
## edger.wt_vs_mga.logfc edger.wt_vs_mga.p edger.wt_vs_mga.adjp
## 1 0.8988 0.6019 0.6984
Similar, but definitely not the same, so there may be some interesting differences.
::pander(sessionInfo())
pandermessage(paste0("This is hpgltools commit: ", get_git_commit()))
paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
this_save <-message(paste0("Saving to ", this_save))
sm(saveme(filename=this_save)) tmp <-