1 Introduction

2 Gene annotations

hg_annot <- load_biomart_annotations()

## The biomart annotations file already exists, loading from it.

hg_df <- hg_annot[["gene_annotations"]]

3 Sample sheet annotation

We have an older revision of the sample sheet for this dataset. I added some samples from Dr. Mosser in order to compare against M1/M2 activation states. These extra samples are not likely to be the most appropriate because they are not U937 samples.

hg38_se <- create_se("sample_sheets/macrogen_samples.xlsx",
                     file_column = "hisat_hg38", gene_info = hg_df)

## Reading the sample metadata.

## Checking the state of the condition column.

## Warning in extract_metadata(metadata, id_column = id_column, condition_column = condition_column, : There
## were NA values in the condition column, setting them to 'undefined'.

## Checking the state of the batch column.

## Warning in extract_metadata(metadata, id_column = id_column, condition_column = condition_column, : There
## were NA values in the condition column, setting them to 'undefined'.

## Checking the condition factor.

## The sample definitions comprises: 54 rows(samples) and 21 columns(metadata fields).

## Warning in create_se("sample_sheets/macrogen_samples.xlsx", file_column = "hisat_hg38", : Some samples
## were removed when cross referencing the samples against the count data.

## Matched 21405 annotations and counts.

## Some annotations were lost in merging, setting them to 'undefined'.

## The final summarized experiment has 21481 rows and 21 columns.

hg38_sampletype <- set_conditions(hg38_se, fact = "sampletype")

## The numbers of samples by condition are:

## 
## Asymptomatic      Chronic      control      Healthy 
##            5            5            3            2

4 Plots

hg38_norm <- normalize(hg38_sampletype, convert = "cpm", norm = "quant",
                       transform = "log2", filter = TRUE)

## Removing 9903 low-count genes (11578 remaining).

## transform_counts: Found 26 values equal to 0, adding 1 to the matrix.

plot_libsize(hg38_sampletype)

## Library sizes of 15 samples, 
## ranging from 1,872,219 to 7,805,747.

plot_nonzero(hg38_sampletype, y_intercept = 0.65)

## The following samples have less than 13962.65 genes.

##  [1] "m1"      "m2"      "a_20179" "c_10036" "a_20187" "c_10046" "a_20132" "c_10063" "a_20133" "c_10093"
## [11] "su1160"  "a_20134" "c_10073"

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the hpgltools package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## A non-zero genes plot of 15 samples.
## These samples have an average 3.382 CPM coverage and 13550 genes observed, ranging from 12975 to
## 14728.

plot_corheat(hg38_norm)

## A heatmap of pairwise sample correlations ranging from: 
## 0.932974671519701 to 0.990409066934894.

plot_pca(hg38_norm)

## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by Asymptomatic, Chronic, control, Healthy
## Shapes are defined by undefined.

hg38_nb <- normalize(hg38_sampletype, convert = "cpm", batch = "svaseq",
                     filter = TRUE, transform = "log2")

## Removing 9903 low-count genes (11578 remaining).
## transform_counts: Found 169 values less than 0.
## transform_counts: Found 169 values equal to 0, adding 1 to the matrix.

plot_pca(hg38_nb)

## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by Asymptomatic, Chronic, control, Healthy
## Shapes are defined by undefined.

5 DE

keepers <- list(
  "chr_asy" = c("Chronic", "Asymptomatic"),
  "chr_hea" = c("Chronic", "Healthy"),
  "chr_con" = c("Chronic", "control"),
  "asy_hea" = c("Asymptomatic", "Healthy"),
  "asy_con" = c("Asymptomatic", "control"),
  "hea_con" = c("Healthy", "control"))

hg38_de <- all_pairwise(hg38_sampletype, keepers = keepers, model_fstring = "~ 0 + condition",
                        model_svs = "svaseq", filter = TRUE)

## Asymptomatic      Chronic      control      Healthy 
##            5            5            3            2

## Removing 9903 low-count genes (11578 remaining).

## Basic step 0/3: Normalizing data.

## Basic step 0/3: Converting data.

## I think this is failing? SummarizedExperiment

## Basic step 0/3: Transforming data.

## Setting 4687 entries to zero.

## This received a matrix of SVs.

## converting counts to integer mode

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## conditions
## Asymptomatic      Chronic      control      Healthy 
##            5            5            3            2

## conditions
## Asymptomatic      Chronic      control      Healthy 
##            5            5            3            2

## conditions
## Asymptomatic      Chronic      control      Healthy 
##            5            5            3            2

hg38_de

## A pairwise differential expression with results from: basic, deseq, ebseq, edger, limma, noiseq.

## This used a surrogate/batch estimate from: svaseq.

## The primary analysis performed 6 comparisons.

hg38_tables <- combine_de_tables(hg38_de, excel = "excel/hg38_tables.xlsx")

## Deleting the file excel/hg38_tables.xlsx before writing the tables.

## Looking for subscript invalid names, start of extract_keepers.

## Looking for subscript invalid names, end of extract_keepers.

hg38_tables

## A set of combined differential expression results.

##                     table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 Chronic_vs_Asymptomatic          31            41          43            73          46            28
## 2      Chronic_vs_Healthy          59            43          98            77          34            46
## 3      Chronic_vs_control         680           458         748           551         611           554
## 4 Asymptomatic_vs_Healthy         112            83         183           127          92           113
## 5 Asymptomatic_vs_control         759           492         837           575         646           626
## 6      Healthy_vs_control         427           311         488           421         414           392

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the UpSetR package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the UpSetR package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## Plot describing unique/shared genes in a differential expression table.

hg38_sig <- extract_significant_genes(hg38_tables, excel = "excel/hg38_sig.xlsx",
                                      according_to = "deseq")

## Deleting the file excel/hg38_sig.xlsx before writing the tables.

hg38_sig

## A set of genes deemed significant according to deseq.

## The parameters defining significant were:

## LFC cutoff: 1 adj P cutoff: 0.05

##                         deseq_up deseq_down
## Chronic_vs_Asymptomatic       31         41
## Chronic_vs_Healthy            59         43
## Chronic_vs_control           680        458
## Asymptomatic_vs_Healthy      112         83
## Asymptomatic_vs_control      759        492
## Healthy_vs_control           427        311

6 Repeat GSE(A) analyses

all_gp <- all_gprofiler(hg38_sig)
all_cp <- all_cprofiler(hg38_sig, hg38_tables)

## Warning in simple_clusterprofiler(sig_genes = structure(list(ensembl_transcript_id = c("ENST00000492807",
## : No genes were found between the significant genes and the universe.

## Error in testForValidKeytype(x, keytype) : 
##   'keytype' must be a a single string

## Error in `simple_cl[["kegg_universe"]]`:
## ! subscript out of bounds

pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
message(paste0("Saving to ", savefile))
tmp <- sm(saveme(filename = savefile))

tmp <- loadme(filename = savefile)

LS0tCnRpdGxlOiAiUmV2aXNpdGluZyBhIG1hY3JvZ2VuIGV4cGVyaW1lbnQuIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB6ZW5idXJuCiAgICBrZWVwX21kOiBmYWxzZQogICAgbW9kZTogc2VsZmNvbnRhaW5lZAogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBzZWxmX2NvbnRhaW5lZDogdHJ1ZQogICAgdGhlbWU6IHJlYWRhYmxlCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDoKICAgICAgY29sbGFwc2VkOiBmYWxzZQogICAgICBzbW9vdGhfc2Nyb2xsOiBmYWxzZQotLS0KCgpgYGB7ciBvcHRpb25zLCBpbmNsdWRlID0gRkFMU0V9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZm9yY2F0cykKbGlicmFyeShnbHVlKQpsaWJyYXJ5KGhwZ2x0b29scykKbGlicmFyeSh0aWR5cikKCmtuaXRyOjpvcHRzX2tuaXQkc2V0KHByb2dyZXNzID0gVFJVRSwgdmVyYm9zZSA9IFRSVUUsIHdpZHRoID0gOTAsIGVjaG8gPSBUUlVFKQprbml0cjo6b3B0c19jaHVuayRzZXQoCiAgZXJyb3IgPSBUUlVFLCBmaWcud2lkdGggPSA4LCBmaWcuaGVpZ2h0ID0gOCwgZmlnLnJldGluYSA9IDIsCiAgb3V0LndpZHRoID0gIjEwMCUiLCBkZXYgPSAicG5nIiwKICBkZXYuYXJncyA9IGxpc3QocG5nID0gbGlzdCh0eXBlID0gImNhaXJvLXBuZyIpKSkKb2xkX29wdGlvbnMgPC0gb3B0aW9ucyhkaWdpdHMgPSA0LCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UsIGtuaXRyLmR1cGxpY2F0ZS5sYWJlbCA9ICJhbGxvdyIpCmdncGxvdDI6OnRoZW1lX3NldChnZ3Bsb3QyOjp0aGVtZV9idyhiYXNlX3NpemUgPSAxMikpCnZlciA8LSBTeXMuZ2V0ZW52KCJWRVJTSU9OIikKcnVuZGF0ZSA8LSBmb3JtYXQoU3lzLkRhdGUoKSwgZm9ybWF0ID0gIiVZJW0lZCIpCnJtZF9maWxlIDwtICJwcm90X3ZzX3JuYS5SbWQiCnNhdmVmaWxlIDwtIGdzdWIocGF0dGVybiA9ICJcXC5SbWQiLCByZXBsYWNlID0gIlxcLnJkYVxcLnh6IiwgeCA9IHJtZF9maWxlKQpgYGAKCiMgSW50cm9kdWN0aW9uCgojIEdlbmUgYW5ub3RhdGlvbnMKCmBgYHtyfQpoZ19hbm5vdCA8LSBsb2FkX2Jpb21hcnRfYW5ub3RhdGlvbnMoKQpoZ19kZiA8LSBoZ19hbm5vdFtbImdlbmVfYW5ub3RhdGlvbnMiXV0KYGBgCgojIFNhbXBsZSBzaGVldCBhbm5vdGF0aW9uCgpXZSBoYXZlIGFuIG9sZGVyIHJldmlzaW9uIG9mIHRoZSBzYW1wbGUgc2hlZXQgZm9yIHRoaXMgZGF0YXNldC4gIEkKYWRkZWQgc29tZSBzYW1wbGVzIGZyb20gRHIuIE1vc3NlciBpbiBvcmRlciB0byBjb21wYXJlIGFnYWluc3QgTTEvTTIKYWN0aXZhdGlvbiBzdGF0ZXMuICBUaGVzZSBleHRyYSBzYW1wbGVzIGFyZSBub3QgbGlrZWx5IHRvIGJlIHRoZSBtb3N0CmFwcHJvcHJpYXRlIGJlY2F1c2UgdGhleSBhcmUgbm90IFU5Mzcgc2FtcGxlcy4KCmBgYHtyfQpoZzM4X3NlIDwtIGNyZWF0ZV9zZSgic2FtcGxlX3NoZWV0cy9tYWNyb2dlbl9zYW1wbGVzLnhsc3giLAogICAgICAgICAgICAgICAgICAgICBmaWxlX2NvbHVtbiA9ICJoaXNhdF9oZzM4IiwgZ2VuZV9pbmZvID0gaGdfZGYpCgpoZzM4X3NhbXBsZXR5cGUgPC0gc2V0X2NvbmRpdGlvbnMoaGczOF9zZSwgZmFjdCA9ICJzYW1wbGV0eXBlIikKYGBgCgojIFBsb3RzCgpgYGB7cn0KaGczOF9ub3JtIDwtIG5vcm1hbGl6ZShoZzM4X3NhbXBsZXR5cGUsIGNvbnZlcnQgPSAiY3BtIiwgbm9ybSA9ICJxdWFudCIsCiAgICAgICAgICAgICAgICAgICAgICAgdHJhbnNmb3JtID0gImxvZzIiLCBmaWx0ZXIgPSBUUlVFKQoKcGxvdF9saWJzaXplKGhnMzhfc2FtcGxldHlwZSkKcGxvdF9ub256ZXJvKGhnMzhfc2FtcGxldHlwZSwgeV9pbnRlcmNlcHQgPSAwLjY1KQpwbG90X2NvcmhlYXQoaGczOF9ub3JtKQpwbG90X3BjYShoZzM4X25vcm0pCgpoZzM4X25iIDwtIG5vcm1hbGl6ZShoZzM4X3NhbXBsZXR5cGUsIGNvbnZlcnQgPSAiY3BtIiwgYmF0Y2ggPSAic3Zhc2VxIiwKICAgICAgICAgICAgICAgICAgICAgZmlsdGVyID0gVFJVRSwgdHJhbnNmb3JtID0gImxvZzIiKQpwbG90X3BjYShoZzM4X25iKQpgYGAKCiMgREUKCmBgYHtyfQprZWVwZXJzIDwtIGxpc3QoCiAgImNocl9hc3kiID0gYygiQ2hyb25pYyIsICJBc3ltcHRvbWF0aWMiKSwKICAiY2hyX2hlYSIgPSBjKCJDaHJvbmljIiwgIkhlYWx0aHkiKSwKICAiY2hyX2NvbiIgPSBjKCJDaHJvbmljIiwgImNvbnRyb2wiKSwKICAiYXN5X2hlYSIgPSBjKCJBc3ltcHRvbWF0aWMiLCAiSGVhbHRoeSIpLAogICJhc3lfY29uIiA9IGMoIkFzeW1wdG9tYXRpYyIsICJjb250cm9sIiksCiAgImhlYV9jb24iID0gYygiSGVhbHRoeSIsICJjb250cm9sIikpCgpoZzM4X2RlIDwtIGFsbF9wYWlyd2lzZShoZzM4X3NhbXBsZXR5cGUsIGtlZXBlcnMgPSBrZWVwZXJzLCBtb2RlbF9mc3RyaW5nID0gIn4gMCArIGNvbmRpdGlvbiIsCiAgICAgICAgICAgICAgICAgICAgICAgIG1vZGVsX3N2cyA9ICJzdmFzZXEiLCBmaWx0ZXIgPSBUUlVFKQpoZzM4X2RlCgpoZzM4X3RhYmxlcyA8LSBjb21iaW5lX2RlX3RhYmxlcyhoZzM4X2RlLCBleGNlbCA9ICJleGNlbC9oZzM4X3RhYmxlcy54bHN4IikKaGczOF90YWJsZXMKCmhnMzhfc2lnIDwtIGV4dHJhY3Rfc2lnbmlmaWNhbnRfZ2VuZXMoaGczOF90YWJsZXMsIGV4Y2VsID0gImV4Y2VsL2hnMzhfc2lnLnhsc3giLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGFjY29yZGluZ190byA9ICJkZXNlcSIpCmhnMzhfc2lnCmBgYAoKIyBSZXBlYXQgR1NFKEEpIGFuYWx5c2VzCgpgYGB7cn0KYWxsX2dwIDwtIGFsbF9ncHJvZmlsZXIoaGczOF9zaWcpCmFsbF9jcCA8LSBhbGxfY3Byb2ZpbGVyKGhnMzhfc2lnLCBoZzM4X3RhYmxlcykKYGBgCgpgYGB7ciBzYXZlbWUsIGV2YWw9RkFMU0V9CnBhbmRlcjo6cGFuZGVyKHNlc3Npb25JbmZvKCkpCm1lc3NhZ2UocGFzdGUwKCJUaGlzIGlzIGhwZ2x0b29scyBjb21taXQ6ICIsIGdldF9naXRfY29tbWl0KCkpKQptZXNzYWdlKHBhc3RlMCgiU2F2aW5nIHRvICIsIHNhdmVmaWxlKSkKdG1wIDwtIHNtKHNhdmVtZShmaWxlbmFtZSA9IHNhdmVmaWxlKSkKYGBgCgpgYGB7ciBsb2FkbWVfYWZ0ZXIsIGV2YWw9RkFMU0V9CnRtcCA8LSBsb2FkbWUoZmlsZW5hbWUgPSBzYXZlZmlsZSkKYGBgCg==

Revisiting a macrogen experiment.

atb abelew@gmail.com

2026-04-14