1 Poking at WT samples over two time scales

This dataset is very interesting, but lacking power to answer some of the really interesting questions it poses. There are 45 samples, 18 cure and 27 fail. There are 5 main visits, but the third has a short-term kinetics aspect which makes it absolutely fascinating. Unfortunately, the questions of cure/fail cannot be properly addressed in this short time-frame. Once I create the expressionsets, you will see what I mean…

samplesheet <- "sample_sheets/all_samples.xlsx"

2 Annotation

We take the annotation data from ensembl’s biomart instance. The genome which was used to map the data was hg38 revision 100. My default when using biomart is to load the data from 1 year before the current date.

hs_annot <- load_biomart_annotations(year = "2019")[["annotation"]]
## The biomart annotations file already exists, loading from it.
hs_annot[["transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
rownames(hs_annot) <- paste0("gene:", make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE))
tx_gene_map <- hs_annot[, c("transcript", "ensembl_gene_id")]

summary(hs_annot)
##  ensembl_transcript_id ensembl_gene_id       version     transcript_version
##  Length:227334         Length:227334      Min.   : 1.0   Min.   : 1.00     
##  Class :character      Class :character   1st Qu.: 6.0   1st Qu.: 1.00     
##  Mode  :character      Mode  :character   Median :12.0   Median : 1.00     
##                                           Mean   :10.6   Mean   : 3.08     
##                                           3rd Qu.:15.0   3rd Qu.: 5.00     
##                                           Max.   :27.0   Max.   :17.00     
##                                                                            
##  hgnc_symbol        description        gene_biotype         cds_length    
##  Length:227334      Length:227334      Length:227334      Min.   :     3  
##  Class :character   Class :character   Class :character   1st Qu.:   357  
##  Mode  :character   Mode  :character   Mode  :character   Median :   693  
##                                                           Mean   :  1139  
##                                                           3rd Qu.:  1446  
##                                                           Max.   :107976  
##                                                           NA's   :127031  
##  chromosome_name       strand          start_position      end_position     
##  Length:227334      Length:227334      Min.   :5.77e+02   Min.   :6.47e+02  
##  Class :character   Class :character   1st Qu.:3.11e+07   1st Qu.:3.12e+07  
##  Mode  :character   Mode  :character   Median :6.04e+07   Median :6.06e+07  
##                                        Mean   :7.41e+07   Mean   :7.42e+07  
##                                        3rd Qu.:1.09e+08   3rd Qu.:1.09e+08  
##                                        Max.   :2.49e+08   Max.   :2.49e+08  
##                                                                             
##   transcript       
##  Length:227334     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
hs_go <- load_biomart_go()[["go"]]
## The biomart annotations file already exists, loading from it.
hs_length <- hs_annot[, c("ensembl_gene_id", "cds_length")]
colnames(hs_length) <- c("ID", "length")
wellcome_pk <- create_expt(metadata = samplesheet, gene_info = hs_annot,
                           file_column = "hg38100hisatfile") %>%
  set_expt_conditions(fact="clinicaloutcome") %>%
  set_expt_batches(fact = "visit")
## Reading the sample metadata.
## The sample definitions comprises: 45 rows(samples) and 42 columns(metadata fields).
## Matched 21440 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21481 features and 45 samples.
wellcome_time <- set_expt_conditions(wellcome_pk, fact = "visit") %>%
  set_expt_batches(fact = "clinicaloutcome")

2.1 Show the number of various sample types

## Start with cure/fail
table(pData(wellcome_pk)[["clinicaloutcome"]])
## 
##    Cure failure 
##      26      19
## Then the global visit (e.g. there will be lots of visit 3, when the kinetics aspect happened)
table(pData(wellcome_pk)[["visit"]])
## 
## v1 v2 v3 v4 v5 
##  8  4 25  4  4
## Subdivide the draws on visit 3
table(pData(wellcome_pk)[["visithour"]])
## 
##   v1_h0   v1_h1   v2_h1   v3_h0   v3_h1 v3_h1.5   v3_h2  v3_h24   v3_h3   v3_h5 
##       4       4       4       3       4       3       3       3       3       3 
##   v3_h8   v4_h1   v5_h1 
##       3       4       4
## And consider this in the context of cure/fail
table(pData(wellcome_pk)[["visithouroutcome"]])
## 
##      Cure_v1_h0      Cure_v1_h1      Cure_v2_h1      Cure_v3_h0      Cure_v3_h1 
##               2               2               2               2               2 
##    Cure_v3_h1.5      Cure_v3_h2     Cure_v3_h24      Cure_v3_h3      Cure_v3_h5 
##               2               2               2               2               2 
##      Cure_v3_h8      Cure_v4_h1      Cure_v5_h1   failure_v1_h0   failure_v1_h1 
##               2               2               2               2               2 
##   failure_v2_h1   failure_v3_h0   failure_v3_h1 failure_v3_h1.5   failure_v3_h2 
##               2               1               2               1               1 
##  failure_v3_h24   failure_v3_h3   failure_v3_h5   failure_v3_h8   failure_v4_h1 
##               1               1               1               1               2 
##   failure_v5_h1 
##               2

3 Wellcome plots

3.1 A few global metrics

plot_libsize(wellcome_pk)$plot

plot_nonzero(wellcome_pk)$plot
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

plot_boxplot(wellcome_pk)
## 263736 entries are 0.  We are on a log scale, adding 1 to the data.

3.2 By CF

wellcome_pk_norm <- normalize_expt(wellcome_pk, norm = "quant", convert = "cpm",
                                   filter = TRUE, transform = "log2")
## Removing 8956 low-count genes (12525 remaining).
## transform_counts: Found 14 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_pk_norm, plot_labels=FALSE)$plot

wellcome_pk_varpart <- simple_varpart(wellcome_pk_norm)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## 
## Total:90 s
wellcome_pk_varpart$partition_plot

wellcome_pk_nb <- normalize_expt(wellcome_pk, convert = "cpm",
                                 filter = TRUE, transform = "log2", batch = "svaseq")
## Removing 8956 low-count genes (12525 remaining).
## Setting 249 low elements to zero.
## transform_counts: Found 249 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_pk_nb, plot_labels=FALSE)$plot

wellcome_pk_nb_varpart <- simple_varpart(wellcome_pk_nb)
## 
## Total:91 s
wellcome_pk_nb_varpart$partition_plot

3.3 Time

wellcome_time_norm <- normalize_expt(wellcome_time, norm = "quant", convert = "cpm",
                                     filter = TRUE, transform = "log2")
## Removing 8956 low-count genes (12525 remaining).
## transform_counts: Found 14 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_norm, plot_labels=FALSE)$plot

wellcome_time_nb <- normalize_expt(wellcome_time, convert = "cpm",
                                   batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 8956 low-count genes (12525 remaining).
## Setting 128 low elements to zero.
## transform_counts: Found 128 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_nb, plot_labels=FALSE)$plot

4 DE analyses

4.1 Outcome and time

TODO: Also do this without sva.

pk_sva_de <- all_pairwise(wellcome_pk, model_batch = "svaseq", filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
## 
##    Cure failure 
##      26      19
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (12525 remaining).
## Setting 249 low elements to zero.
## transform_counts: Found 249 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
pk_keepers <- list(
    "cf" = c("Cure", "failure"))
pk_sva_table <- combine_de_tables(
    pk_sva_de, keepers = pk_keepers,
    excel = glue::glue("excel/wellcome_pk_sva_table-v{ver}.xlsx"))
## Deleting the file excel/wellcome_pk_sva_table-v202209.xlsx before writing the tables.
pk_sva_sig <- extract_significant_genes(
    pk_sva_table,
    excel = glue::glue("excel/wellcome_pk_sva_sig-v{ver}.xlsx"))
## Deleting the file excel/wellcome_pk_sva_sig-v202209.xlsx before writing the tables.
LS0tCnRpdGxlOiAiV2VsbGNvbWUgS2luZXRpY3M6IDIwMjIwOSIKYXV0aG9yOiAiTmFqaWIgRWwtU2F5ZWQiCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCIKb3V0cHV0OgogaHRtbF9kb2N1bWVudDoKICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgY29kZV9mb2xkaW5nOiBzaG93CiAgZmlnX2NhcHRpb246IHRydWUKICBmaWdfaGVpZ2h0OiA3CiAgZmlnX3dpZHRoOiA3CiAgaGlnaGxpZ2h0OiBkZWZhdWx0CiAga2VlcF9tZDogZmFsc2UKICBtb2RlOiBzZWxmY29udGFpbmVkCiAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgc2VsZl9jb250YWluZWQ6IHRydWUKICB0aGVtZTogcmVhZGFibGUKICB0b2M6IHRydWUKICB0b2NfZmxvYXQ6CiAgIGNvbGxhcHNlZDogZmFsc2UKICAgc21vb3RoX3Njcm9sbDogZmFsc2UKLS0tCgo8c3R5bGU+CiAgYm9keSAubWFpbi1jb250YWluZXIgewogICAgbWF4LXdpZHRoOiAxNjAwcHg7CiAgfQo8L3N0eWxlPgoKYGBge3Igb3B0aW9ucywgaW5jbHVkZT1GQUxTRX0KbGlicmFyeShocGdsdG9vbHMpCnR0IDwtIHNtKGRldnRvb2xzOjpsb2FkX2FsbCgifi9ocGdsdG9vbHMiKSkKa25pdHI6Om9wdHNfa25pdCRzZXQocHJvZ3Jlc3M9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZT1UUlVFLAogICAgICAgICAgICAgICAgICAgICB3aWR0aD05MCwKICAgICAgICAgICAgICAgICAgICAgZWNobz1UUlVFKQprbml0cjo6b3B0c19jaHVuayRzZXQoZXJyb3I9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgIGZpZy53aWR0aD04LAogICAgICAgICAgICAgICAgICAgICAgZmlnLmhlaWdodD04LAogICAgICAgICAgICAgICAgICAgICAgZHBpPTk2KQpvbGRfb3B0aW9ucyA8LSBvcHRpb25zKGRpZ2l0cz00LAogICAgICAgICAgICAgICAgICAgICAgIHN0cmluZ3NBc0ZhY3RvcnM9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgICAga25pdHIuZHVwbGljYXRlLmxhYmVsPSJhbGxvdyIpCmdncGxvdDI6OnRoZW1lX3NldChnZ3Bsb3QyOjp0aGVtZV9idyhiYXNlX3NpemU9MTIpKQp2ZXIgPC0gIjIwMjIwOSIKcnVuZGF0ZSA8LSBmb3JtYXQoU3lzLkRhdGUoKSwgZm9ybWF0PSIlWSVtJWQiKQpgYGAKCiMgUG9raW5nIGF0IFdUIHNhbXBsZXMgb3ZlciB0d28gdGltZSBzY2FsZXMKClRoaXMgZGF0YXNldCBpcyB2ZXJ5IGludGVyZXN0aW5nLCBidXQgbGFja2luZyBwb3dlciB0byBhbnN3ZXIgc29tZSBvZgp0aGUgcmVhbGx5IGludGVyZXN0aW5nIHF1ZXN0aW9ucyBpdCBwb3Nlcy4gIFRoZXJlIGFyZSA0NSBzYW1wbGVzLCAxOApjdXJlIGFuZCAyNyBmYWlsLiAgVGhlcmUgYXJlIDUgbWFpbiB2aXNpdHMsIGJ1dCB0aGUgdGhpcmQgaGFzIGEKc2hvcnQtdGVybSBraW5ldGljcyBhc3BlY3Qgd2hpY2ggbWFrZXMgaXQgYWJzb2x1dGVseSBmYXNjaW5hdGluZy4KVW5mb3J0dW5hdGVseSwgdGhlIHF1ZXN0aW9ucyBvZiBjdXJlL2ZhaWwgY2Fubm90IGJlIHByb3Blcmx5IGFkZHJlc3NlZAppbiB0aGlzIHNob3J0IHRpbWUtZnJhbWUuICBPbmNlIEkgY3JlYXRlIHRoZSBleHByZXNzaW9uc2V0cywgeW91IHdpbGwKc2VlIHdoYXQgSSBtZWFuLi4uCgpgYGB7ciBzYW1wbGVzaGVldH0Kc2FtcGxlc2hlZXQgPC0gInNhbXBsZV9zaGVldHMvYWxsX3NhbXBsZXMueGxzeCIKYGBgCgojIEFubm90YXRpb24KCldlIHRha2UgdGhlIGFubm90YXRpb24gZGF0YSBmcm9tIGVuc2VtYmwncyBiaW9tYXJ0IGluc3RhbmNlLiAgVGhlIGdlbm9tZSB3aGljaAp3YXMgdXNlZCB0byBtYXAgdGhlIGRhdGEgd2FzIGhnMzggcmV2aXNpb24gMTAwLiAgTXkgZGVmYXVsdCB3aGVuIHVzaW5nIGJpb21hcnQgaXMKdG8gbG9hZCB0aGUgZGF0YSBmcm9tIDEgeWVhciBiZWZvcmUgdGhlIGN1cnJlbnQgZGF0ZS4KCmBgYHtyIGhzX2Fubm90fQpoc19hbm5vdCA8LSBsb2FkX2Jpb21hcnRfYW5ub3RhdGlvbnMoeWVhciA9ICIyMDE5IilbWyJhbm5vdGF0aW9uIl1dCmhzX2Fubm90W1sidHJhbnNjcmlwdCJdXSA8LSBwYXN0ZTAocm93bmFtZXMoaHNfYW5ub3QpLCAiLiIsIGhzX2Fubm90W1sidmVyc2lvbiJdXSkKcm93bmFtZXMoaHNfYW5ub3QpIDwtIHBhc3RlMCgiZ2VuZToiLCBtYWtlLm5hbWVzKGhzX2Fubm90W1siZW5zZW1ibF9nZW5lX2lkIl1dLCB1bmlxdWUgPSBUUlVFKSkKdHhfZ2VuZV9tYXAgPC0gaHNfYW5ub3RbLCBjKCJ0cmFuc2NyaXB0IiwgImVuc2VtYmxfZ2VuZV9pZCIpXQoKc3VtbWFyeShoc19hbm5vdCkKYGBgCgpgYGB7ciBoc19nb30KaHNfZ28gPC0gbG9hZF9iaW9tYXJ0X2dvKClbWyJnbyJdXQpoc19sZW5ndGggPC0gaHNfYW5ub3RbLCBjKCJlbnNlbWJsX2dlbmVfaWQiLCAiY2RzX2xlbmd0aCIpXQpjb2xuYW1lcyhoc19sZW5ndGgpIDwtIGMoIklEIiwgImxlbmd0aCIpCmBgYAoKYGBge3IgZXhwdH0Kd2VsbGNvbWVfcGsgPC0gY3JlYXRlX2V4cHQobWV0YWRhdGEgPSBzYW1wbGVzaGVldCwgZ2VuZV9pbmZvID0gaHNfYW5ub3QsCiAgICAgICAgICAgICAgICAgICAgICAgICAgIGZpbGVfY29sdW1uID0gImhnMzgxMDBoaXNhdGZpbGUiKSAlPiUKICBzZXRfZXhwdF9jb25kaXRpb25zKGZhY3Q9ImNsaW5pY2Fsb3V0Y29tZSIpICU+JQogIHNldF9leHB0X2JhdGNoZXMoZmFjdCA9ICJ2aXNpdCIpCgp3ZWxsY29tZV90aW1lIDwtIHNldF9leHB0X2NvbmRpdGlvbnMod2VsbGNvbWVfcGssIGZhY3QgPSAidmlzaXQiKSAlPiUKICBzZXRfZXhwdF9iYXRjaGVzKGZhY3QgPSAiY2xpbmljYWxvdXRjb21lIikKYGBgCgojIyBTaG93IHRoZSBudW1iZXIgb2YgdmFyaW91cyBzYW1wbGUgdHlwZXMKCmBgYHtyIHRhYmxlX3NhbXBsZXN9CiMjIFN0YXJ0IHdpdGggY3VyZS9mYWlsCnRhYmxlKHBEYXRhKHdlbGxjb21lX3BrKVtbImNsaW5pY2Fsb3V0Y29tZSJdXSkKIyMgVGhlbiB0aGUgZ2xvYmFsIHZpc2l0IChlLmcuIHRoZXJlIHdpbGwgYmUgbG90cyBvZiB2aXNpdCAzLCB3aGVuIHRoZSBraW5ldGljcyBhc3BlY3QgaGFwcGVuZWQpCnRhYmxlKHBEYXRhKHdlbGxjb21lX3BrKVtbInZpc2l0Il1dKQojIyBTdWJkaXZpZGUgdGhlIGRyYXdzIG9uIHZpc2l0IDMKdGFibGUocERhdGEod2VsbGNvbWVfcGspW1sidmlzaXRob3VyIl1dKQojIyBBbmQgY29uc2lkZXIgdGhpcyBpbiB0aGUgY29udGV4dCBvZiBjdXJlL2ZhaWwKdGFibGUocERhdGEod2VsbGNvbWVfcGspW1sidmlzaXRob3Vyb3V0Y29tZSJdXSkKYGBgCgojIFdlbGxjb21lIHBsb3RzCgojIyBBIGZldyBnbG9iYWwgbWV0cmljcwoKYGBge3IgZ2xvYmFsfQpwbG90X2xpYnNpemUod2VsbGNvbWVfcGspJHBsb3QKcGxvdF9ub256ZXJvKHdlbGxjb21lX3BrKSRwbG90CnBsb3RfYm94cGxvdCh3ZWxsY29tZV9waykKYGBgCgojIyBCeSBDRgoKYGBge3Igd2VsbGNvbWVfY2Z9CndlbGxjb21lX3BrX25vcm0gPC0gbm9ybWFsaXplX2V4cHQod2VsbGNvbWVfcGssIG5vcm0gPSAicXVhbnQiLCBjb252ZXJ0ID0gImNwbSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZmlsdGVyID0gVFJVRSwgdHJhbnNmb3JtID0gImxvZzIiKQpwbG90X3BjYSh3ZWxsY29tZV9wa19ub3JtLCBwbG90X2xhYmVscz1GQUxTRSkkcGxvdAoKd2VsbGNvbWVfcGtfdmFycGFydCA8LSBzaW1wbGVfdmFycGFydCh3ZWxsY29tZV9wa19ub3JtKQp3ZWxsY29tZV9wa192YXJwYXJ0JHBhcnRpdGlvbl9wbG90Cgp3ZWxsY29tZV9wa19uYiA8LSBub3JtYWxpemVfZXhwdCh3ZWxsY29tZV9waywgY29udmVydCA9ICJjcG0iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmaWx0ZXIgPSBUUlVFLCB0cmFuc2Zvcm0gPSAibG9nMiIsIGJhdGNoID0gInN2YXNlcSIpCnBsb3RfcGNhKHdlbGxjb21lX3BrX25iLCBwbG90X2xhYmVscz1GQUxTRSkkcGxvdAoKd2VsbGNvbWVfcGtfbmJfdmFycGFydCA8LSBzaW1wbGVfdmFycGFydCh3ZWxsY29tZV9wa19uYikKd2VsbGNvbWVfcGtfbmJfdmFycGFydCRwYXJ0aXRpb25fcGxvdApgYGAKCiMjIFRpbWUKCmBgYHtyIHdlbGxjb21lX3RpbWV9CndlbGxjb21lX3RpbWVfbm9ybSA8LSBub3JtYWxpemVfZXhwdCh3ZWxsY29tZV90aW1lLCBub3JtID0gInF1YW50IiwgY29udmVydCA9ICJjcG0iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZmlsdGVyID0gVFJVRSwgdHJhbnNmb3JtID0gImxvZzIiKQpwbG90X3BjYSh3ZWxsY29tZV90aW1lX25vcm0sIHBsb3RfbGFiZWxzPUZBTFNFKSRwbG90Cgp3ZWxsY29tZV90aW1lX25iIDwtIG5vcm1hbGl6ZV9leHB0KHdlbGxjb21lX3RpbWUsIGNvbnZlcnQgPSAiY3BtIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBiYXRjaCA9ICJzdmFzZXEiLCBmaWx0ZXIgPSBUUlVFLCB0cmFuc2Zvcm0gPSAibG9nMiIpCnBsb3RfcGNhKHdlbGxjb21lX3RpbWVfbmIsIHBsb3RfbGFiZWxzPUZBTFNFKSRwbG90CmBgYAoKIyBERSBhbmFseXNlcwoKIyMgT3V0Y29tZSBhbmQgdGltZQoKVE9ETzogQWxzbyBkbyB0aGlzIHdpdGhvdXQgc3ZhLgoKYGBge3Igb3V0dGltZV9kZXYxfQpwa19zdmFfZGUgPC0gYWxsX3BhaXJ3aXNlKHdlbGxjb21lX3BrLCBtb2RlbF9iYXRjaCA9ICJzdmFzZXEiLCBmaWx0ZXIgPSBUUlVFKQpwa19rZWVwZXJzIDwtIGxpc3QoCiAgICAiY2YiID0gYygiQ3VyZSIsICJmYWlsdXJlIikpCnBrX3N2YV90YWJsZSA8LSBjb21iaW5lX2RlX3RhYmxlcygKICAgIHBrX3N2YV9kZSwga2VlcGVycyA9IHBrX2tlZXBlcnMsCiAgICBleGNlbCA9IGdsdWU6OmdsdWUoImV4Y2VsL3dlbGxjb21lX3BrX3N2YV90YWJsZS12e3Zlcn0ueGxzeCIpKQpwa19zdmFfc2lnIDwtIGV4dHJhY3Rfc2lnbmlmaWNhbnRfZ2VuZXMoCiAgICBwa19zdmFfdGFibGUsCiAgICBleGNlbCA9IGdsdWU6OmdsdWUoImV4Y2VsL3dlbGxjb21lX3BrX3N2YV9zaWctdnt2ZXJ9Lnhsc3giKSkKYGBgCg==