This dataset contains multiple experiments.
load_gff_annotations("reference/paeruginosa_pa14.gff", id_col="gene_id") pa14_gff <-
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Returning a df with 16 columns and 11946 rows.
rownames(pa14_gff) <- pa14_gff[["gene_id"]]
## The Alias column has PA14_00010
as.data.frame(load_microbesonline_annotations("PA14")) pa14_microbes <-
## Found 1 entry.
## Pseudomonas aeruginosa UCBPP-PA14Proteobacteria2006-11-22yes105972208963
## The species being downloaded is: Pseudomonas aeruginosa UCBPP-PA14
## Downloading: http://www.microbesonline.org/cgi-bin/genomeInfo.cgi?tId=208963;export=tab
## The sysName column has PA14_0010
merge(pa14_gff, pa14_microbes, by.x="Alias", by.y="sysName")
pa14_annot <-rownames(pa14_annot) <- pa14_annot[["gene_id"]]
load_microbesonline_go(species = "PA14") pa14_go <-
## Found 1 entry.
## Pseudomonas aeruginosa UCBPP-PA14Proteobacteria2006-11-22yes105972208963
## The species being downloaded is: Pseudomonas aeruginosa UCBPP-PA14 and is being downloaded as 208963.tab.
Given the above annotations, now lets pull in the counts.
I am switching to the sheet all_samples_modified_gcd.xlsx for the moment because I added a space in the strain name for the gcd sampl
create_expt("sample_sheets/all_samples_modified_gcd.xlsx",
pa14_expt <-gene_info=pa14_annot, file_column="hisatcounttable")
## Reading the sample metadata.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 105 rows(samples) and 31 columns(metadata fields).
## Matched 5972 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 5979 rows and 105 columns.
While we are at it, lets drop the two sad samples.
plot_libsize(pa14_expt)
pa14_libsize <-$plot pa14_libsize
plot_nonzero(pa14_expt)
pa14_nonzero <-$plot pa14_nonzero
## Warning: ggrepel: 100 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
subset_expt(pa14_expt, nonzero=5500) pa14_expt <-
## The samples (and read coverage) removed when filtering 5500 non-zero genes are:
## SM040 SM048
## 43984 316632
## subset_expt(): There were 105, now there are 103 samples.
I know a priori that April is interested to see how the 4 library preparations look with respect to each other. Let us therefore create a data structure to look explicitly at that.
set_expt_conditions(pa14_expt, fact="libraryprepbatch") %>%
pa14_libprep <- set_expt_batches(fact="organisms")
normalize_expt(pa14_libprep, filter=TRUE,
pa14_lib_norm <-convert="cpm", norm="quant", transform="log2")
## Removing 6 low-count genes (5973 remaining).
## transform_counts: Found 35 values equal to 0, adding 1 to the matrix.
plot_pca(pa14_lib_norm)$plot
## plot labels was not set and there are more than 100 samples, disabling it.
plot_corheat(pa14_lib_norm)$plot
pca_information(
libprep_pca_info <-plot_pcas=TRUE,
pa14_lib_norm, expt_factors=c("libraryprepbatch", "organisms", "strains", "media", "bioreplicate"))
## plot labels was not set and there are more than 100 samples, disabling it.
$anova_f_heatmap libprep_pca_info
$pca_plots[[2]] libprep_pca_info
## Warning: ggrepel: 76 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
$pca_plots[[3]] libprep_pca_info
## Warning: ggrepel: 102 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
$pca_plots[[4]] libprep_pca_info
## Warning: ggrepel: 85 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
$pca_plots[[5]] libprep_pca_info
## Warning: ggrepel: 77 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
To my eyes they look reasonably mixed, suggesting that library prep batch is not a dominant factor in the data.
At this point, I am thinking that we should separate the data by the experiments, but I will first just show the relationships among all the data.
As far as I see, there are three factors which are of primary interest:
The last will be used as batch in the following plots.
set_expt_conditions(pa14_expt, fact="media") %>%
pa14_media <- set_expt_batches(fact="bioreplicate")
normalize_expt(pa14_media, transform="log2", convert="cpm",
pa14_media_norm <-filter=TRUE, norm="quant")
## Removing 6 low-count genes (5973 remaining).
## transform_counts: Found 35 values equal to 0, adding 1 to the matrix.
plot_pca(pa14_media_norm)$plot
## plot labels was not set and there are more than 100 samples, disabling it.
set_expt_conditions(pa14_expt, fact="strains") %>%
pa14_strains <- set_expt_batches(fact="bioreplicate")
normalize_expt(pa14_strains, transform="log2", convert="cpm",
pa14_strains_norm <-filter=TRUE, norm="quant")
## Removing 6 low-count genes (5973 remaining).
## transform_counts: Found 35 values equal to 0, adding 1 to the matrix.
plot_pca(pa14_strains_norm)$plot
## plot labels was not set and there are more than 100 samples, disabling it.
Disregarding the various experiments performed, I think we can state that media separates the data in a fashion which is more interesting than strain. Given the number of (what I assume are) closely related strains, I am thinking it might prove to be a good idea to perform my variant search tool on this data and see how well they held up with respect to the reference strain.
I have been told repeatedly that there are multiple experiments in this data, but apparently I have not paid proper attention because I cannot remember which is which, and to my eyes it is not obvious in the sample sheet.
With this in mind, I spoke with Solomon briefly and have an idea of the 3 logical groups in his data. Let us therefore separate and examine those first.
For the moment, I am going to call Solomon’s samples ‘metabolism and infection.’ I will also complicate the ‘condition’ of the data by combining the media and strain, but shortly thereafter will split that back. I think the reason why will become clear.
gsub(x=rownames(pData(pa14_expt)), pattern="^(..).*$", replacement="\\1")
initials_factor <-pData(pa14_expt)[["initials"]] <- as.factor(initials_factor)
paste0(pData(pa14_expt)[["strains"]], "_",
strain_media <-pData(pa14_expt)[["media"]])
pData(pa14_expt)[["strain_media"]] <- strain_media
## Lets set some colors
## WT: grayscale, eda: blue, edd: green, gcd: purple, pgl: red, zwf: yellow
list(
colors_by_strain <-"PA14 WT" = "#000000",
"PA14 eda" = "#0000dd",
"PA14 edd" = "#00dd00",
"PA14 gcd" = "#dd00dd",
"PA14 pgl" = "#dd0000",
"PA14 zwf" = "#dddd00")
subset_expt(pa14_expt, subset="initials=='SM'") %>%
infect_metabolism <- set_expt_conditions(fact="strains") %>%
set_expt_batches(fact="media")
## subset_expt(): There were 103, now there are 58 samples.
plot_legend(infect_metabolism)
## $colors
## condition batch colors
## sm001 PA14 WT LB #66A61E
## sm002 PA14 WT LB + 0.5 M urea #66A61E
## sm003 PA14 WT Mice Urine #66A61E
## sm004 PA14 WT PBST #66A61E
## sm005 PA14 eda Mice Urine #1B9E77
## sm006 PA14 eda PBST #1B9E77
## sm007 PA14 edd Mice Urine #D95F02
## sm008 PA14 edd PBST #D95F02
## sm009 PA14 gcd Mice Urine #7570B3
## sm010 PA14 gcd PBST #7570B3
## sm011 PA14 pgl Mice Urine #E7298A
## sm012 PA14 pgl PBST #E7298A
## sm013 PA14 zwf Mice Urine #E6AB02
## sm014 PA14 zwf PBST #E6AB02
## sm015 PA14 WT LB #66A61E
## sm016 PA14 WT LB + 0.5 M urea #66A61E
## sm017 PA14 WT Mice Urine #66A61E
## sm018 PA14 WT PBST #66A61E
## sm019 PA14 eda Mice Urine #1B9E77
## sm020 PA14 eda PBST #1B9E77
## sm021 PA14 edd Mice Urine #D95F02
## sm022 PA14 edd PBST #D95F02
## sm023 PA14 gcd Mice Urine #7570B3
## sm024 PA14 gcd PBST #7570B3
## sm025 PA14 pgl Mice Urine #E7298A
## sm026 PA14 pgl PBST #E7298A
## sm027 PA14 zwf Mice Urine #E6AB02
## sm028 PA14 zwf PBST #E6AB02
## sm029 PA14 WT LB #66A61E
## sm030 PA14 WT LB + 0.5 M urea #66A61E
## sm031 PA14 WT Mice Urine #66A61E
## sm032 PA14 WT PBST #66A61E
## sm033 PA14 eda Mice Urine #1B9E77
## sm034 PA14 eda PBST #1B9E77
## sm035 PA14 edd Mice Urine #D95F02
## sm036 PA14 edd PBST #D95F02
## sm037 PA14 gcd Mice Urine #7570B3
## sm038 PA14 gcd PBST #7570B3
## sm039 PA14 pgl Mice Urine #E7298A
## sm041 PA14 zwf Mice Urine #E6AB02
## sm042 PA14 zwf PBST #E6AB02
## sm043 PA14 WT Mice instiled #66A61E
## sm044 PA14 WT Mice instiled #66A61E
## sm045 PA14 WT Mice instiled #66A61E
## sm046 PA14 eda Mice instiled #1B9E77
## sm047 PA14 eda Mice instiled #1B9E77
## sm049 PA14 edd Mice instiled #D95F02
## sm050 PA14 edd Mice instiled #D95F02
## sm051 PA14 edd Mice instiled #D95F02
## sm052 PA14 gcd Mice instiled #7570B3
## sm053 PA14 gcd Mice instiled #7570B3
## sm054 PA14 gcd Mice instiled #7570B3
## sm055 PA14 pgl Mice instiled #E7298A
## sm056 PA14 pgl Mice instiled #E7298A
## sm057 PA14 pgl Mice instiled #E7298A
## sm058 PA14 zwf Mice instiled #E6AB02
## sm059 PA14 zwf Mice instiled #E6AB02
## sm060 PA14 zwf Mice instiled #E6AB02
##
## $plot
subset_expt(infect_metabolism,
metabolism_control <-subset="media=='LB'|media=='LB + 0.5 M urea'") %>%
set_expt_conditions(fact="media") %>%
set_expt_batches("bioreplicate")
## subset_expt(): There were 58, now there are 6 samples.
subset_expt(infect_metabolism,
metabolism_starvation <-subset="media=='PBST'|media=='Mice Urine'") %>%
set_expt_colors(colors=colors_by_strain) %>%
set_expt_conditions(fact="media") %>%
set_expt_batches(fact="strains")
## subset_expt(): There were 58, now there are 35 samples.
set_expt_conditions(metabolism_starvation, fact="strains") %>%
metabolism_starvation_strain <- set_expt_batches(fact="media")
subset_expt(infect_metabolism,
metabolism_exudate <-subset="media=='Mice instiled'")
## subset_expt(): There were 58, now there are 17 samples.
As a whole group, these samples are a bit confusing. The mouse instiled samples are prety obvious, but the other sources of variance remain a bit of a mystery to me.
normalize_expt(infect_metabolism, filter=TRUE, convert="cpm",
global_norm <-norm="quant", transform="log2") %>%
set_expt_conditions(fact="media")
## Removing 13 low-count genes (5966 remaining).
## transform_counts: Found 1 values equal to 0, adding 1 to the matrix.
plot_pca(global_norm)$plot
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
global_norm %>%
tmp <- set_expt_conditions(fact="strains")
plot_pca(tmp)$plot
## Warning: ggrepel: 48 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
This is a group of 6 samples, 3 in LB and three in LB+urea. This should therefore be the most straight forward comparison.
normalize_expt(metabolism_control, transform="log2",
mc_norm <-convert="cpm", norm="quant", filter=TRUE)
## Removing 73 low-count genes (5906 remaining).
plot_pca(mc_norm)$plot
sanitize_expt(metabolism_control)
mc_san <- all_pairwise(mc_san, model_batch=TRUE, filter=TRUE) mc_de <-
## Plotting a PCA before surrogate/batch inclusion.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(
mc_tables <-
mc_de,excel=glue::glue("excel/metabolism_control_tables-v{ver}.xlsx"),
sig_excel=glue::glue("excel/metabolism_control_sig-v{ver}.xlsx"))
hmm, is there any chance that SM001 and SM005 are flipped? If they were, then both clades would have 1 of each bioreplicate.
The second group is a little more complex, it seeks to simultaneously compare the strains (WT vs. mutants) and the environment (PBS vs. urine).
This design is complex enough that I think we need to choose colors more carefully.
normalize_expt(metabolism_starvation, filter=TRUE, norm="quant",
ms_norm <-convert="cpm", transform="log2")
## Removing 30 low-count genes (5949 remaining).
plot_pca(ms_norm)$plot
all_pairwise(metabolism_starvation, model_batch=TRUE) ms_de <-
## Plotting a PCA before surrogate/batch inclusion.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(
ms_tables <-
ms_de,excel=glue::glue("excel/metabolism_starvation_tables-v{ver}.xlsx"),
sig_excel=glue::glue("excel/metabolism_starvation_sig-v{ver}.xlsx"))
ms_tables[["significant"]][["deseq"]][["ups"]][[1]]
ms_up <- ms_tables[["significant"]][["deseq"]][["downs"]][[1]]
ms_down <-
## The go data from microbesonline is keyed by the gene name
## (e.g. dnaA), not gene ID or PA id or whatever.
pa14_annot[, c("name.x", "width")]
pa14_lengths <-colnames(pa14_lengths) <- c("ID", "width")
rownames(ms_up) <- make.names(ms_up[["namex"]], unique=TRUE)
simple_goseq(ms_up, go_db=pa14_go, length_db=pa14_lengths) ms_up_goseq <-
## Found 378 go_db genes and 717 length_db genes out of 717.
## Testing that go categories are defined.
## Removing undefined categories.
## Gathering synonyms.
## Gathering category definitions.
"pvalue_plots"]][["bpp_plot_over"]] ms_up_goseq[[
"pvalue_plots"]][["mfp_plot_over"]] ms_up_goseq[[
rownames(ms_down) <- make.names(ms_down[["namex"]], unique=TRUE)
simple_goseq(ms_down, go_db=pa14_go, length_db=pa14_lengths) ms_down_goseq <-
## Found 468 go_db genes and 672 length_db genes out of 673.
## Testing that go categories are defined.
## Removing undefined categories.
## Gathering synonyms.
## Gathering category definitions.
"pvalue_plots"]][["bpp_plot_over"]] ms_down_goseq[[
"pvalue_plots"]][["mfp_plot_over"]] ms_down_goseq[[
This time let us compare the strains and lower the variance from media.
normalize_expt(metabolism_starvation_strain, filter=TRUE, convert="cpm", norm="quant",
mss_norm <-batch="svaseq", transform="log2")
## Warning in normalize_expt(metabolism_starvation_strain, filter = TRUE, convert =
## "cpm", : Quantile normalization and sva do not always play well together.
## Removing 30 low-count genes (5949 remaining).
## batch_counts: Before batch/surrogate estimation, 0 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 1787 entries are 0<x<1: 1%.
## Setting 16 low elements to zero.
## transform_counts: Found 16 values equal to 0, adding 1 to the matrix.
plot_pca(mss_norm)$plot
subset_expt(metabolism_starvation_strain, subset="batch=='Mice Urine'") mss_urine <-
## subset_expt(): There were 35, now there are 18 samples.
list(
interesting <-"eda_vs_wt" = c("PA14eda", "PA14WT"),
"edd_vs_wt" = c("PA14edd", "PA14WT"),
"gcd_vs_wt" = c("PA14gcd", "PA14WT"),
"pgl_vs_wt" = c("PA14pgl", "PA14WT"),
"zfw_vs_wt" = c("PA14zwf", "PA14WT"))
all_pairwise(mss_urine, model_batch="svaseq", filter=TRUE) mss_urine_de <-
## batch_counts: Before batch/surrogate estimation, 29 entries are x==0: 0%.
## Plotting a PCA before surrogate/batch inclusion.
## Using svaseq to visualize before/after batch inclusion.
## Performing a test normalization with: raw
## Removing 0 low-count genes (5941 remaining).
## batch_counts: Before batch/surrogate estimation, 29 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 506 entries are 0<x<1: 0%.
## Setting 11 low elements to zero.
## transform_counts: Found 11 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(
mss_urine_table <-keepers=interesting,
mss_urine_de, excel=glue::glue("excel/metabolism_starvation_strain_tables-v{ver}.xlsx"),
sig_excel=glue::glue("excel/metabolism_starvation_strain_sig-v{ver}.xlsx"))
Comapre the strains during the instillation process
normalize_expt(metabolism_exudate, filter=TRUE, convert="cpm",
exudate_norm <-norm="quant", transform="log2")
## Removing 69 low-count genes (5910 remaining).
## transform_counts: Found 76 values equal to 0, adding 1 to the matrix.
plot_pca(exudate_norm)$plot
all_pairwise(metabolism_exudate, model_batch=TRUE, filter=TRUE) exudate_de <-
## Plotting a PCA before surrogate/batch inclusion.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(
exudate_tables <-keepers=interesting,
exudate_de, excel=glue::glue("excel/exudate_tables-v{ver}.xlsx"),
sig_excel=glue::glue("excel/exudate_sig-v{ver}.xlsx"))
create_expt("sample_sheets/all_samples_modified2.xlsx",
mm_expt <-gene_info=pa14_annot, file_column="mousetable")
## Reading the sample metadata.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 105 rows(samples) and 32 columns(metadata fields).
## Warning in create_expt("sample_sheets/all_samples_modified2.xlsx", gene_info
## = pa14_annot, : Some samples were removed when cross referencing the samples
## against the count data.
## Warning in create_expt("sample_sheets/all_samples_modified2.xlsx", gene_info =
## pa14_annot, : Even after changing the rownames in gene info, they do not match
## the count table.
## Even after changing the rownames in gene info, they do not match the count table.
## Here are the first few rownames from the count tables:
## gene:ENSMUSG00000000001, gene:ENSMUSG00000000003, gene:ENSMUSG00000000028, gene:ENSMUSG00000000037, gene:ENSMUSG00000000049, gene:ENSMUSG00000000056
## Here are the first few rownames from the gene information table:
## gene1650835, gene1650837, gene1650839, gene1650841, gene1650843, gene1650845
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Warning in create_expt("sample_sheets/all_samples_modified2.xlsx", gene_info =
## pa14_annot, : The following samples have no counts! SM029SM032SM038SM040
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 25753 rows and 35 columns.
plot_libsize(mm_expt)$plot
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 4 rows containing missing values (geom_bar).
::pander(sessionInfo())
pandermessage(paste0("This is hpgltools commit: ", get_git_commit()))
paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
this_save <-message(paste0("Saving to ", this_save))
sm(saveme(filename=this_save)) tmp <-