This dataset is very interesting, but lacking power to answer some of the really interesting questions it poses. There are 45 samples, 18 cure and 27 fail. There are 5 main visits, but the third has a short-term kinetics aspect which makes it absolutely fascinating. Unfortunately, the questions of cure/fail cannot be properly addressed in this short time-frame. Once I create the expressionsets, you will see what I mean…
samplesheet <- "sample_sheets/all_samples.xlsx"
We take the annotation data from ensembl’s biomart instance. The genome which was used to map the data was hg38 revision 100. My default when using biomart is to load the data from 1 year before the current date.
hs_annot <- load_biomart_annotations(year = "2019")[["annotation"]]
## The biomart annotations file already exists, loading from it.
hs_annot[["transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
rownames(hs_annot) <- paste0("gene:", make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE))
tx_gene_map <- hs_annot[, c("transcript", "ensembl_gene_id")]
summary(hs_annot)
## ensembl_transcript_id ensembl_gene_id version transcript_version
## Length:227334 Length:227334 Min. : 1.0 Min. : 1.00
## Class :character Class :character 1st Qu.: 6.0 1st Qu.: 1.00
## Mode :character Mode :character Median :12.0 Median : 1.00
## Mean :10.6 Mean : 3.08
## 3rd Qu.:15.0 3rd Qu.: 5.00
## Max. :27.0 Max. :17.00
##
## hgnc_symbol description gene_biotype cds_length
## Length:227334 Length:227334 Length:227334 Min. : 3
## Class :character Class :character Class :character 1st Qu.: 357
## Mode :character Mode :character Mode :character Median : 693
## Mean : 1139
## 3rd Qu.: 1446
## Max. :107976
## NA's :127031
## chromosome_name strand start_position end_position
## Length:227334 Length:227334 Min. :5.77e+02 Min. :6.47e+02
## Class :character Class :character 1st Qu.:3.11e+07 1st Qu.:3.12e+07
## Mode :character Mode :character Median :6.04e+07 Median :6.06e+07
## Mean :7.41e+07 Mean :7.42e+07
## 3rd Qu.:1.09e+08 3rd Qu.:1.09e+08
## Max. :2.49e+08 Max. :2.49e+08
##
## transcript
## Length:227334
## Class :character
## Mode :character
##
##
##
##
hs_go <- load_biomart_go()[["go"]]
## The biomart annotations file already exists, loading from it.
hs_length <- hs_annot[, c("ensembl_gene_id", "cds_length")]
colnames(hs_length) <- c("ID", "length")
wellcome_pk <- create_expt(metadata = samplesheet, gene_info = hs_annot,
file_column = "hg38100hisatfile") %>%
set_expt_conditions(fact="clinicaloutcome") %>%
set_expt_batches(fact = "visit")
## Reading the sample metadata.
## The sample definitions comprises: 45 rows(samples) and 42 columns(metadata fields).
## Matched 21440 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21481 features and 45 samples.
wellcome_time <- set_expt_conditions(wellcome_pk, fact = "visit") %>%
set_expt_batches(fact = "clinicaloutcome")
## Start with cure/fail
table(pData(wellcome_pk)[["clinicaloutcome"]])
##
## Cure failure
## 26 19
## Then the global visit (e.g. there will be lots of visit 3, when the kinetics aspect happened)
table(pData(wellcome_pk)[["visit"]])
##
## v1 v2 v3 v4 v5
## 8 4 25 4 4
## Subdivide the draws on visit 3
table(pData(wellcome_pk)[["visithour"]])
##
## v1_h0 v1_h1 v2_h1 v3_h0 v3_h1 v3_h1.5 v3_h2 v3_h24 v3_h3 v3_h5
## 4 4 4 3 4 3 3 3 3 3
## v3_h8 v4_h1 v5_h1
## 3 4 4
## And consider this in the context of cure/fail
table(pData(wellcome_pk)[["visithouroutcome"]])
##
## Cure_v1_h0 Cure_v1_h1 Cure_v2_h1 Cure_v3_h0 Cure_v3_h1
## 2 2 2 2 2
## Cure_v3_h1.5 Cure_v3_h2 Cure_v3_h24 Cure_v3_h3 Cure_v3_h5
## 2 2 2 2 2
## Cure_v3_h8 Cure_v4_h1 Cure_v5_h1 failure_v1_h0 failure_v1_h1
## 2 2 2 2 2
## failure_v2_h1 failure_v3_h0 failure_v3_h1 failure_v3_h1.5 failure_v3_h2
## 2 1 2 1 1
## failure_v3_h24 failure_v3_h3 failure_v3_h5 failure_v3_h8 failure_v4_h1
## 1 1 1 1 2
## failure_v5_h1
## 2
plot_libsize(wellcome_pk)$plot
plot_nonzero(wellcome_pk)$plot
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_boxplot(wellcome_pk)
## 263736 entries are 0. We are on a log scale, adding 1 to the data.
wellcome_pk_norm <- normalize_expt(wellcome_pk, norm = "quant", convert = "cpm",
filter = TRUE, transform = "log2")
## Removing 8956 low-count genes (12525 remaining).
## transform_counts: Found 14 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_pk_norm, plot_labels=FALSE)$plot
wellcome_pk_varpart <- simple_varpart(wellcome_pk_norm)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
##
## Total:90 s
wellcome_pk_varpart$partition_plot
wellcome_pk_nb <- normalize_expt(wellcome_pk, convert = "cpm",
filter = TRUE, transform = "log2", batch = "svaseq")
## Removing 8956 low-count genes (12525 remaining).
## Setting 249 low elements to zero.
## transform_counts: Found 249 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_pk_nb, plot_labels=FALSE)$plot
wellcome_pk_nb_varpart <- simple_varpart(wellcome_pk_nb)
##
## Total:91 s
wellcome_pk_nb_varpart$partition_plot
wellcome_time_norm <- normalize_expt(wellcome_time, norm = "quant", convert = "cpm",
filter = TRUE, transform = "log2")
## Removing 8956 low-count genes (12525 remaining).
## transform_counts: Found 14 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_norm, plot_labels=FALSE)$plot
wellcome_time_nb <- normalize_expt(wellcome_time, convert = "cpm",
batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 8956 low-count genes (12525 remaining).
## Setting 128 low elements to zero.
## transform_counts: Found 128 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_nb, plot_labels=FALSE)$plot
TODO: Also do this without sva.
pk_sva_de <- all_pairwise(wellcome_pk, model_batch = "svaseq", filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
##
## Cure failure
## 26 19
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (12525 remaining).
## Setting 249 low elements to zero.
## transform_counts: Found 249 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
pk_keepers <- list(
"cf" = c("Cure", "failure"))
pk_sva_table <- combine_de_tables(
pk_sva_de, keepers = pk_keepers,
excel = glue::glue("excel/wellcome_pk_sva_table-v{ver}.xlsx"))
## Deleting the file excel/wellcome_pk_sva_table-v202209.xlsx before writing the tables.
pk_sva_sig <- extract_significant_genes(
pk_sva_table,
excel = glue::glue("excel/wellcome_pk_sva_sig-v{ver}.xlsx"))
## Deleting the file excel/wellcome_pk_sva_sig-v202209.xlsx before writing the tables.