In my previous installment (preprocessing.Rmd), I spun up some jobs on the cluster to examine these samples. In this document I will spend some time looking at them.
Half of the samples are GAS 5448 and half are NZ131. I have only so far mapped all samples against 5448, I will do NZ131 today.
## Returning a df with 14 columns and 1821 rows.
rownames(gas5448_gff) <- make.names(gas5448_gff[["locus_tag"]], unique = TRUE)
nz131_microbes <- load_microbesonline_annotations(species = "Streptococcus pyogenes NZ131")
## Found 1 entry.
## Streptococcus pyogenes NZ131Firmicutesyes2009-04-21yes101791471876
## The species being downloaded is: Streptococcus pyogenes NZ131
## Downloading: http://www.microbesonline.org/cgi-bin/genomeInfo.cgi?tId=471876;export=tab
The various mapping etc tools finished last night, let us collect the numbers from them here. The defaults should work well for pretty much everything, so I think I need fill in only the starting sample sheet.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## Warning in dispatch_regex_search(meta, search, replace, input_file_spec, : NAs introduced by coercion
## Warning in dispatch_regex_search(meta, search, replace, input_file_spec, : NAs introduced by coercion
## Writing new metadata to: sample_sheets/all_samples_modified.xlsx
## Deleting the file sample_sheets/all_samples_modified.xlsx before writing the tables.
all_expt <- create_expt(sample_sheet[["new_meta"]], file_column = "hisat_count_table",
gene_info = gas5448_gff) %>%
subset_expt(subset="read1fastqfile!='Undetermined_S0_R1_001.fastq.gz'")
## Reading the sample metadata.
## The sample definitions comprises: 81 rows(samples) and 32 columns(metadata fields).
## Warning in create_expt(sample_sheet[["new_meta"]], file_column = "hisat_count_table", : Even after changing the rownames in gene info, they do not
## match the count table.
## Even after changing the rownames in gene info, they do not match the count table.
## Here are the first few rownames from the count tables:
## Spy49_0001, Spy49_0002, Spy49_0003, Spy49_0004, Spy49_0005, Spy49_0006
## Here are the first few rownames from the gene information table:
## SP5448_RS00005, SP5448_RS00010, SP5448_RS00015, SP5448_RS00020, SP5448_RS00025, SP5448_RS00030
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 1788 features and 81 samples.
## The samples excluded are: KMSL81.
## subset_expt(): There were 81, now there are 80 samples.
all_combined_factor <- paste0(pData(all_expt)[["strain"]], "_",
pData(all_expt)[["media"]], "_",
pData(all_expt)[["supplement"]], "_",
pData(all_expt)[["time"]])
pData(all_expt)[["combined"]] <- all_combined_factor
all_expt <- set_expt_conditions(all_expt, all_combined_factor) %>%
set_expt_batches("replicate")
## The numbers of samples by condition are:
##
## nz131_C_DMSO_t120m nz131_C_DMSO_t90m nz131_C_Heme_t120m nz131_C_Heme_t90m nz131_RPMI_DMSO_t120m nz131_RPMI_DMSO_t90m
## 4 4 4 4 4 4
## nz131_RPMI_Heme_t120m nz131_RPMI_Heme_t90m s5448_C_DMSO_t120m s5448_C_DMSO_t90m s5448_C_Heme_t120m s5448_C_Heme_t90m
## 4 4 4 4 4 4
## s5448_RPMI_DMSO_t120m s5448_RPMI_DMSO_t90m s5448_RPMI_Heme_t120m s5448_RPMI_Heme_t90m s5448AP_C_DMSO_t120m s5448AP_C_DMSO_t90m
## 4 4 4 4 4 4
## s5448AP_C_Heme_t120m s5448AP_C_Heme_t90m
## 4 4
## The number of samples by batch are:
##
## r1 r2 r3 r4
## 20 20 20 20
combined_strain_supplement_media <- paste0(pData(all_expt)[["strain"]], "_",
pData(all_expt)[["media"]], "_",
pData(all_expt)[["supplement"]])
pData(all_expt)[["str_sup_med"]] <- combined_strain_supplement_media
combined_strain_supplement <- paste0(pData(all_expt)[["strain"]], "_",
pData(all_expt)[["supplement"]])
pData(all_expt)[["str_sup"]] <- combined_strain_supplement
combined_media_supplement <- paste0(pData(all_expt)[["media"]], "_",
pData(all_expt)[["supplement"]])
pData(all_expt)[["med_sup"]] <- combined_media_supplement
combined_strain_media <- paste0(pData(all_expt)[["strain"]], "_",
pData(all_expt)[["media"]])
pData(all_expt)[["str_med"]] <- combined_strain_media
notime_factor <- paste0(pData(all_expt)[["strain"]], "_",
pData(all_expt)[["media"]], "_",
pData(all_expt)[["supplement"]])
pData(all_expt)[["notime"]] <- notime_factor
all_expt <- set_expt_conditions(all_expt, notime_factor) %>%
set_expt_batches("strain")
## The numbers of samples by condition are:
##
## nz131_C_DMSO nz131_C_Heme nz131_RPMI_DMSO nz131_RPMI_Heme s5448_C_DMSO s5448_C_Heme s5448_RPMI_DMSO s5448_RPMI_Heme s5448AP_C_DMSO
## 8 8 8 8 8 8 8 8 8
## s5448AP_C_Heme
## 8
## The number of samples by batch are:
##
## nz131 s5448 s5448AP
## 32 32 16
## A sankey plot describing the metadata of 80 samples,
## including 38 out of 0 nodes and traversing metadata factors:
## .
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## The colors used in the expressionset are: #97722D, #666666, #D8367D, #749829, #BBA90B, #C9930D, #1B9E77, #AE6D1C, #A16864, #9B58A5.
## Library sizes of 80 samples,
## ranging from 5,545,743 to 13,822,520.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## A non-zero genes plot of 80 samples.
## These samples have an average 9.375 CPM coverage and 1671 genes observed, ranging from 1621 to
## 1749.
## Warning: ggrepel: 43 unlabeled data points (too many overlaps). Consider increasing max.overlaps
all_norm <- normalize_expt(all_expt, transform = "log2", convert = "cpm",
norm = "quant", filter = TRUE)
## Removing 64 low-count genes (1724 remaining).
## transform_counts: Found 17 values equal to 0, adding 1 to the matrix.
## When the standard median metric was plotted, the values observed range
## from 0.642873338569537 to 1 with quartiles at 0.853537697220439 and 0.885246354323851.
## A heatmap of pairwise sample distances ranging from:
## 7.85911795064251 to 107.939087446247.
## A heatmap of pairwise sample correlations ranging from:
## 0.642873338569537 to 0.998106456004526.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by nz131_C_DMSO, nz131_C_Heme, nz131_RPMI_DMSO, nz131_RPMI_Heme, s5448_C_DMSO, s5448_C_Heme, s5448_RPMI_DMSO, s5448_RPMI_Heme, s5448AP_C_DMSO, s5448AP_C_Heme
## Shapes are defined by nz131, s5448, s5448AP.
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning: ggrepel: 62 unlabeled data points (too many overlaps). Consider increasing max.overlaps
I think some samples may warrant removal: KMSL60, KMSL67, KMSL18
exclude_ids <- c("KMSL60", "KMSL67", "KMSL18")
exclude_idx <- ! sampleNames(all_norm) %in% exclude_ids
exclude_samples <- sampleNames(all_norm)[exclude_idx]
exc_expt <- subset_expt(all_expt, ids = exclude_samples)
## The samples excluded are: KMSL18, KMSL60, KMSL67.
## subset_expt(): There were 80, now there are 77 samples.
Strange, I am going to assume the nz131/5448 split is the problem?
## Subsetting on samples.
## The samples excluded are: KMSL01, KMSL02, KMSL03, KMSL04, KMSL05, KMSL06, KMSL07, KMSL08, KMSL09, KMSL10, KMSL11, KMSL12, KMSL13, KMSL14, KMSL15, KMSL16, KMSL17, KMSL18, KMSL19, KMSL20, KMSL21, KMSL22, KMSL23, KMSL24, KMSL25, KMSL26, KMSL27, KMSL28, KMSL29, KMSL30, KMSL31, KMSL32, KMSL33, KMSL34, KMSL35, KMSL36, KMSL37, KMSL38, KMSL39, KMSL40, KMSL41, KMSL42, KMSL43, KMSL44, KMSL45, KMSL46, KMSL47, KMSL48.
## subset_expt(): There were 80, now there are 32 samples.
## Subsetting on samples.
## The samples excluded are: KMSL01, KMSL02, KMSL03, KMSL04, KMSL05, KMSL06, KMSL07, KMSL08, KMSL09, KMSL10, KMSL11, KMSL12, KMSL13, KMSL14, KMSL15, KMSL16, KMSL49, KMSL50, KMSL51, KMSL52, KMSL53, KMSL54, KMSL55, KMSL56, KMSL57, KMSL58, KMSL59, KMSL60, KMSL61, KMSL62, KMSL63, KMSL64, KMSL65, KMSL66, KMSL67, KMSL68, KMSL69, KMSL70, KMSL71, KMSL72, KMSL73, KMSL74, KMSL75, KMSL76, KMSL77, KMSL78, KMSL79, KMSL80.
## subset_expt(): There were 80, now there are 32 samples.
nz131_norm <- normalize_expt(nz131_expt, transform = "log2", convert = "cpm",
norm = "quant", filter = TRUE)
## Removing 79 low-count genes (1709 remaining).
## transform_counts: Found 8 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by nz131_C_DMSO, nz131_C_Heme, nz131_RPMI_DMSO, nz131_RPMI_Heme
## Shapes are defined by nz131.
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
s5448_norm <- normalize_expt(s5448_expt, transform = "log2", convert = "cpm",
norm = "quant", filter = TRUE)
## Removing 187 low-count genes (1601 remaining).
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by s5448_C_DMSO, s5448_C_Heme, s5448_RPMI_DMSO, s5448_RPMI_Heme
## Shapes are defined by s5448.
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
Note, variance partition requires a somewhat more strict filter than I use for many other tasks. Given the peculiarity of what I am seeing, I might choose to use this stricter dataset.
## Subsetting on features.
## remove_genes_expt(), before removal, there were 1775 genes, now there are 1637.
## The result of using variancePartition with the model:
## ~ strain + time + supplement + media + replicate
filt_expt <- vp[["modified_expt"]]
exc_vp <- simple_varpart(exc_expt, factors = c("strain", "time", "supplement", "media"))
## Subsetting on features.
## remove_genes_expt(), before removal, there were 1775 genes, now there are 1637.
Variance partition says that media is the winner, let us look at that first. I am guessing that the other factors will not provide interesting plots unless we go nuts and use sva.
## The numbers of samples by condition are:
##
## C RPMI
## 48 32
media_norm <- normalize_expt(media_expt, filter = TRUE, convert = "cpm",
norm = "quant", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## transform_counts: Found 17 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by C, RPMI
## Shapes are defined by nz131, s5448, s5448AP.
media_nb <- normalize_expt(media_expt, filter = TRUE, convert = "cpm",
batch = "svaseq", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## Setting 608 low elements to zero.
## transform_counts: Found 608 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by C, RPMI
## Shapes are defined by nz131, s5448, s5448AP.
## The numbers of samples by condition are:
##
## DMSO Heme
## 40 40
supplement_norm <- normalize_expt(supplement_expt, filter = TRUE, convert = "cpm",
norm = "quant", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## transform_counts: Found 17 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by DMSO, Heme
## Shapes are defined by nz131, s5448, s5448AP.
supplement_nb <- normalize_expt(supplement_expt, filter = TRUE, convert = "cpm",
batch = "svaseq", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## Setting 233 low elements to zero.
## transform_counts: Found 233 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by DMSO, Heme
## Shapes are defined by nz131, s5448, s5448AP.
Time is the big loser in this.
## The numbers of samples by condition are:
##
## t120m t90m
## 40 40
time_norm <- normalize_expt(time_expt, filter = TRUE, convert = "cpm",
norm = "quant", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## transform_counts: Found 17 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by t120m, t90m
## Shapes are defined by nz131, s5448, s5448AP.
time_nb <- normalize_expt(time_expt, filter = TRUE, convert = "cpm",
batch = "svaseq", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## Setting 209 low elements to zero.
## transform_counts: Found 209 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by t120m, t90m
## Shapes are defined by nz131, s5448, s5448AP.
med_sup_expt <- set_expt_conditions(filt_expt, fact = "med_sup") %>%
set_expt_batches(fact = "strain")
## The numbers of samples by condition are:
##
## C_DMSO C_Heme RPMI_DMSO RPMI_Heme
## 24 24 16 16
## The number of samples by batch are:
##
## nz131 s5448 s5448AP
## 32 32 16
med_sup_norm <- normalize_expt(med_sup_expt, filter = TRUE, convert = "cpm",
norm = "quant", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## transform_counts: Found 17 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by C_DMSO, C_Heme, RPMI_DMSO, RPMI_Heme
## Shapes are defined by nz131, s5448, s5448AP.
med_sup_nb <- normalize_expt(med_sup_expt, filter = TRUE, convert = "cpm",
batch = "svaseq", transform = "log2")
## Removing 64 low-count genes (1724 remaining).
## Setting 639 low elements to zero.
## transform_counts: Found 639 values equal to 0, adding 1 to the matrix.
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by C_DMSO, C_Heme, RPMI_DMSO, RPMI_Heme
## Shapes are defined by nz131, s5448, s5448AP.
Thinking… let us first just lump all samples together and see what happens?
all_supplement_keepers <- list(
"heme_vs_dmso" = c("Heme", "DMSO"))
## This adds replicate to the model because model_batch defaults to TRUE
all_supplement_de <- all_pairwise(supplement_expt, model_batch = TRUE)
##
## DMSO Heme
## 40 40
##
## nz131 s5448 s5448AP
## 32 32 16
all_supplement_table <- combine_de_tables(
all_supplement_de,
keepers = all_supplement_keepers,
excel = glue("excel/heme_vs_dmso_all_conditions_batch_table-v{ver}.xlsx"))
all_supplement_sig <- extract_significant_genes(
all_supplement_table,
excel = glue("excel/heme_vs_dmso_all_conditions_batch_sig-v{ver}.xlsx"))
all_supplement_sva_de <- all_pairwise(supplement_expt, model_batch = "svaseq")
##
## DMSO Heme
## 40 40
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Warning in all_adjusters(input, estimate_type = model_batch, surrogates = surrogates): It is highly likely that the underlying reason for this
## error is too many 0's in the dataset, please try doing a filtering of the data and retry.
## Error in density.default(x, adjust = adj): 'x' contains missing values
all_supplement_sva_table <- combine_de_tables(
all_supplement_sva_de,
keepers = all_supplement_keepers,
excel = glue("excel/heme_vs_dmso_all_conditions_sva_table-v{ver}.xlsx"))
## Error in eval(expr, envir, enclos): object 'all_supplement_sva_de' not found
all_supplement_sva_sig <- extract_significant_genes(
all_supplement_sva_table,
excel = glue("excel/heme_vs_dmso_all_conditions_sva_sig-v{ver}.xlsx"))
## Error in eval(expr, envir, enclos): object 'all_supplement_sva_table' not found
strain_supplement_keepers <- list(
"AP_C_HD" = c("s5448APCHeme", "s5448APCDMSO"),
"NZ_C_HD" = c("nz131CHeme", "nz131CDMSO"),
"G5448_C_HD" = c("s5448CHeme", "s5448CDMSO"),
"NZ_R_HD" = c("nz131RPMIHeme", "nz131RPMIDMSO"),
"G5448_R_HD" = c("s5448RPMIHeme", "s5448RPMIDMSO"))
exc_expt <- set_expt_batches(exc_expt, fact = "replicate")
## The number of samples by batch are:
##
## r1 r2 r3 r4
## 20 19 19 19
##
## nz131_C_DMSO nz131_C_Heme nz131_RPMI_DMSO nz131_RPMI_Heme s5448_C_DMSO s5448_C_Heme s5448_RPMI_DMSO s5448_RPMI_Heme s5448AP_C_DMSO
## 8 7 7 8 7 8 8 8 8
## s5448AP_C_Heme
## 8
##
## r1 r2 r3 r4
## 20 19 19 19
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: batch in model/limma.
## The primary analysis performed 10 comparisons.
exc_table <- combine_de_tables(
exc_de, keepers = strain_supplement_keepers,
excel = glue("excel/gas_heme_vs_dmso_batch-v{ver}.xlsx"))
exc_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 s5448APCHeme_vs_s5448APCDMSO 161 361 164 369 310 284
## 2 nz131CHeme_vs_nz131CDMSO 191 126 172 142 140 143
## 3 s5448CHeme_vs_s5448CDMSO 301 295 313 287 277 344
## 4 nz131RPMIHeme_vs_nz131RPMIDMSO 5 45 5 41 20 19
## 5 s5448RPMIHeme_vs_s5448RPMIDMSO 19 41 19 43 21 28
## Plot describing unique/shared genes in a differential expression table.
exc_sig <- extract_significant_genes(
exc_table,
excel = glue("excel/gas_heme_vs_dmso_batch_sig-v{ver}.xlsx"))
exc_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## AP_C_HD 310 284 164 369 161 361 199 236
## NZ_C_HD 140 143 172 142 191 126 136 143
## G5448_C_HD 277 344 313 287 301 295 268 336
## NZ_R_HD 20 19 5 41 5 45 14 17
## G5448_R_HD 21 28 19 43 19 41 20 20