In the moment of writing this text, my brain cannot recall which set of samples these are. I remember primarily that Najib wished to use these to learn more about how I handle the data and through it, to learn some of the methods himself. In terms of the experimental goals and details, that is precious little. I presume though, that the sample sheet will teach me some useful and important details.
These are a group of Cure/Fail Biopsy samples across three visits, some used an antimonial drug while others used miltefosine. All samples except one were done on the same day, but sequenced across a few iterations with a relatively wide range of sequencing depths. There are a couple of samples which have not had their read numbers/mapping rates collected into the sample sheet.
TMRC30126 and TMRC30129
I see that I ran mappings against panamensis, but those do not appear to be in the sample sheet. Do we wish to look at the parasites?
samplesheet <- "sample_sheets/wellcome_trust_samples-v202111.xlsx"
We take the annotation data from ensembl’s biomart instance. The genome which was used to map the data was hg38 revision 100. My default when using biomart is to load the data from 1 year before the current date.
hs_annot <- load_biomart_annotations(year = "2019")[["annotation"]]
## The biomart annotations file already exists, loading from it.
hs_annot[["transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE)
tx_gene_map <- hs_annot[, c("transcript", "ensembl_gene_id")]
summary(hs_annot)
## ensembl_transcript_id ensembl_gene_id version transcript_version
## Length:226754 Length:226754 Min. : 1.0 Min. : 1.00
## Class :character Class :character 1st Qu.: 6.0 1st Qu.: 1.00
## Mode :character Mode :character Median :12.0 Median : 1.00
## Mean :10.5 Mean : 3.07
## 3rd Qu.:15.0 3rd Qu.: 5.00
## Max. :26.0 Max. :17.00
##
## hgnc_symbol description gene_biotype cds_length
## Length:226754 Length:226754 Length:226754 Min. : 3
## Class :character Class :character Class :character 1st Qu.: 357
## Mode :character Mode :character Mode :character Median : 690
## Mean : 1134
## 3rd Qu.: 1440
## Max. :107976
## NA's :126871
## chromosome_name strand start_position end_position
## Length:226754 Length:226754 Min. :5.77e+02 Min. :6.47e+02
## Class :character Class :character 1st Qu.:3.11e+07 1st Qu.:3.12e+07
## Mode :character Mode :character Median :6.04e+07 Median :6.05e+07
## Mean :7.41e+07 Mean :7.42e+07
## 3rd Qu.:1.09e+08 3rd Qu.:1.09e+08
## Max. :2.49e+08 Max. :2.49e+08
##
## transcript
## Length:226754
## Class :character
## Mode :character
##
##
##
##
hs_go <- load_biomart_go()[["go"]]
## The biomart annotations file already exists, loading from it.
hs_length <- hs_annot[, c("ensembl_gene_id", "cds_length")]
colnames(hs_length) <- c("ID", "length")
wellcome_outtime <- create_expt(metadata = samplesheet, gene_info = hs_annot,
file_column = "hg38100hisatfile") %>%
set_expt_batches(fact = "drug")
## Reading the sample metadata.
## The sample definitions comprises: 36 rows(samples) and 47 columns(metadata fields).
## Matched 21433 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21481 features and 36 samples.
outcome_time <- paste0(pData(wellcome_outtime)[["clinicaloutcome"]], "_v",
pData(wellcome_outtime)[["visitnumber"]])
wellcome_outtime <- set_expt_conditions(wellcome_outtime, fact = outcome_time)
wellcome_time <- set_expt_conditions(wellcome_outtime, fact = "visitnumber")
pData(wellcome_time)[["condition"]] <- paste0("t", pData(wellcome_time)[["condition"]])
wellcome_outcome <- set_expt_conditions(wellcome_outtime, fact = "clinicaloutcome") %>%
set_expt_batches(fact = "visitnumber")
pData(wellcome_outcome)[["batch"]] <- paste0("t", pData(wellcome_outcome)[["batch"]])
wellcome_drug <- set_expt_conditions(wellcome_outtime, fact = "drug") %>%
set_expt_batches(fact = "visitnumber")
pData(wellcome_drug)[["batch"]] <- paste0("t", pData(wellcome_drug)[["batch"]])
plot_libsize(wellcome_time)$plot
plot_nonzero(wellcome_time)$plot
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_boxplot(wellcome_time)
## 174127 entries are 0. We are on a log scale, adding 1 to the data.
wellcome_time_norm <- normalize_expt(wellcome_time, norm = "quant", convert = "cpm",
filter = TRUE, transform = "log2")
## Removing 7327 low-count genes (14154 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_norm)$plot
wellcome_time_varpart <- simple_varpart(wellcome_time_norm)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
##
## Total:94 s
wellcome_time_varpart$partition_plot
wellcome_time_nb <- normalize_expt(wellcome_time, norm = "quant", convert = "cpm",
batch = "svaseq", filter = TRUE, transform = "log2")
## Warning in normalize_expt(wellcome_time, norm = "quant", convert = "cpm", :
## Quantile normalization and sva do not always play well together.
## Removing 7327 low-count genes (14154 remaining).
## Setting 591 low elements to zero.
## transform_counts: Found 591 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_nb)$plot
wellcome_time_nb_varpart <- simple_varpart(wellcome_time_nb)
##
## Total:94 s
wellcome_time_nb_varpart$partition_plot
wellcome_outcome_norm <- normalize_expt(wellcome_outcome, norm = "quant", convert = "cpm",
filter = TRUE, transform = "log2")
## Removing 7327 low-count genes (14154 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outcome_norm)$plot
wellcome_outcome_varpart <- simple_varpart(wellcome_outcome_norm)
##
## Total:95 s
wellcome_outcome_varpart$partition_plot
wellcome_outcome_nb <- normalize_expt(wellcome_outcome, convert = "cpm",
batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 7327 low-count genes (14154 remaining).
## Setting 1240 low elements to zero.
## transform_counts: Found 1240 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outcome_nb)$plot
wellcome_outcome_nb_varpart <- simple_varpart(wellcome_outcome_nb)
##
## Total:92 s
wellcome_outcome_nb_varpart$partition_plot
wellcome_drug_norm <- normalize_expt(wellcome_drug, norm = "quant", convert = "cpm",
filter = TRUE, transform = "log2")
## Removing 7327 low-count genes (14154 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_drug_norm)$plot
wellcome_drug_varpart <- simple_varpart(wellcome_drug_norm)
##
## Total:95 s
wellcome_drug_varpart$partition_plot
wellcome_drug_nb <- normalize_expt(wellcome_drug, convert = "cpm",
batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 7327 low-count genes (14154 remaining).
## Setting 1263 low elements to zero.
## transform_counts: Found 1263 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_drug_nb)$plot
wellcome_drug_nb_varpart <- simple_varpart(wellcome_drug_nb)
##
## Total:93 s
wellcome_drug_nb_varpart$partition_plot
wellcome_outtime_norm <- normalize_expt(wellcome_outtime, norm = "quant", convert = "cpm",
filter = TRUE, transform = "log2")
## Removing 7327 low-count genes (14154 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outtime_norm)$plot
wellcome_outtime_varpart <- simple_varpart(wellcome_outtime_norm)
##
## Total:97 s
wellcome_outtime_varpart$partition_plot
wellcome_outtime_nb <- normalize_expt(wellcome_outtime, norm = "quant", convert = "cpm",
batch = "svaseq", filter = TRUE, transform = "log2")
## Warning in normalize_expt(wellcome_outtime, norm = "quant", convert = "cpm", :
## Quantile normalization and sva do not always play well together.
## Removing 7327 low-count genes (14154 remaining).
## Setting 705 low elements to zero.
## transform_counts: Found 705 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outtime_nb)$plot
wellcome_outtime_nb_varpart <- simple_varpart(wellcome_outtime_nb)
##
## Total:96 s
wellcome_outtime_nb_varpart$partition_plot
TODO: Also do this without sva.
outtime_de <- all_pairwise(wellcome_outtime, model_batch = "svaseq", filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
##
## Cure_v1 Cure_v2 Cure_v3 Failure_v1 Failure_v2 Failure_v3
## 5 5 5 7 7 7
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (14154 remaining).
## Setting 1167 low elements to zero.
## transform_counts: Found 1167 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
outtime_keepers <- list(
"curev1v2" = c("Curev2", "Curev1"),
"curev1v3" = c("Curev3", "Curev1"),
"curev2v3" = c("Curev3", "Curev2"),
"failv1v2" = c("Failurev2", "Failurev1"),
"failv1v3" = c("Failurev3", "Failurev1"),
"failv2v3" = c("Failurev3", "Failurev2"))
outtime_table <- combine_de_tables(
outtime_de,
keepers = outtime_keepers,
excel = glue::glue("excel/wellcome_outtime_table_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_outtime_table_sva-v202107.xlsx before writing the tables.
outtime_sig <- extract_significant_genes(
outtime_table,
excel = glue::glue("excel/wellcome_outtime_sig_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_outtime_sig_sva-v202107.xlsx before writing the tables.
## Using p column: limma_adjp.
## Using p column: edger_adjp.
## Using p column: deseq_adjp.
## Using p column: basic_adjp.
outtime_batch_de <- all_pairwise(wellcome_outtime, model_batch = TRUE, filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
##
## Cure_v1 Cure_v2 Cure_v3 Failure_v1 Failure_v2 Failure_v3
## 5 5 5 7 7 7
## This analysis will include a batch factor in the model comprised of:
##
## Antimoniate Miltefosine
## 18 18
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
outtime_batch_table <- combine_de_tables(
outtime_batch_de,
keepers = outtime_keepers,
excel = glue::glue("excel/wellcome_outtime_table_batch-v{ver}.xlsx"))
outtime_sig <- extract_significant_genes(
outtime_batch_table,
excel = glue::glue("excel/wellcome_outtime_sig_batch-v{ver}.xlsx"))
## Using p column: limma_adjp.
## Using p column: edger_adjp.
## Using p column: deseq_adjp.
## Using p column: basic_adjp.
time_de <- all_pairwise(wellcome_time, model_batch = "svaseq", filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
##
## t1 t2 t3
## 12 12 12
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (14154 remaining).
## Setting 1125 low elements to zero.
## transform_counts: Found 1125 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
time_keepers <- list(
"v1v2" = c("t2", "t1"),
"v1v3" = c("t3", "t1"),
"v2v3" = c("t3", "t2"))
time_sva_table <- combine_de_tables(
time_de,
keepers = time_keepers,
excel = glue::glue("excel/wellcome_time_table_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_time_table_sva-v202107.xlsx before writing the tables.
time_sva_sig <- extract_significant_genes(
time_sva_table,
excel = glue::glue("excel/wellcome_time_sig_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_time_sig_sva-v202107.xlsx before writing the tables.
## Using p column: limma_adjp.
## Using p column: edger_adjp.
## Using p column: deseq_adjp.
## Using p column: ebseq_adjp.
## Using p column: basic_adjp.
## wellcome_outcome_norm <- normalize_expt(wellcome_outcome, norm = "quant", convert = "cpm",
outcome_sva_de <- all_pairwise(wellcome_outcome, model_batch = "svaseq", filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
##
## Cure Failure
## 15 21
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (14154 remaining).
## Setting 1240 low elements to zero.
## transform_counts: Found 1240 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
outcome_sva_table <- combine_de_tables(
outcome_sva_de,
excel = glue::glue("excel/wellcome_outcome_table_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_outcome_table_sva-v202107.xlsx before writing the tables.
outcome_sva_sig <- extract_significant_genes(
outcome_sva_table,
excel = glue::glue("excel/wellcome_outcome_sig_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_outcome_sig_sva-v202107.xlsx before writing the tables.
## Using p column: limma_adjp.
## Using p column: edger_adjp.
## Using p column: deseq_adjp.
## Using p column: ebseq_adjp.
## Using p column: basic_adjp.
outcome_batch_de <- all_pairwise(wellcome_outcome, model_batch = "batchseq", filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
##
## Cure Failure
## 15 21
## This analysis will include surrogate estimates from: batchseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 2
## Removing 0 low-count genes (14154 remaining).
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 6
## Setting 1245 low elements to zero.
## transform_counts: Found 1245 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
outcome_batch_table <- combine_de_tables(
outcome_batch_de,
excel = glue::glue("excel/wellcome_outcome_table_batch-v{ver}.xlsx"))
## Deleting the file excel/wellcome_outcome_table_batch-v202107.xlsx before writing the tables.
outcome_batch_sig <- extract_significant_genes(
outcome_batch_table,
excel = glue::glue("excel/wellcome_outcome_sig_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_outcome_sig_sva-v202107.xlsx before writing the tables.
## Using p column: limma_adjp.
## Using p column: edger_adjp.
## Using p column: deseq_adjp.
## Using p column: ebseq_adjp.
## Using p column: basic_adjp.