1 Introduction

I want to use this document to examine our first round of persistence samples. I checked my email from Najib and did not find a sample sheet but did find an explanation of the three sample types we expect.

In preparation for this, I downloaded a new hg38 genome. Since the panamensis asembly has not significantly changed (excepting the putative long read genome which I have not yet seen), I am just using the same one.

2 Loading annotation

The hg38 genome I got is brand new (202405), so do not use the archive for a while.

## Ok, so useast.ensembl is failing today, let us use the jan2024 archive?
#hs_annot <- load_biomart_annotations(archive = FALSE, species = "hsapiens")
## Seems like the 202401 archive is a good choice, it is explicitly the hg38_111 release.
## and it is waaaaay faster (like 100x) than useast right now.
hs_annot <- load_biomart_annotations(archive = FALSE, species = "hsapiens", overwrite = TRUE,
                                     year = 2025, month = "08")
## Using mart: ENSEMBL_MART_ENSEMBL from host: useast.ensembl.org.
## Successfully connected to the hsapiens_gene_ensembl database.
## Finished downloading ensembl gene annotations.
## Finished downloading ensembl structure annotations.
## symbol columns is null, pattern matching 'symbol' and taking the first.
## Found 2 potential symbol columns, using the first:hgnc_symbol.
## Including symbols, there are 86371 vs the 533740 gene annotations.
## Not dropping haplotype chromosome annotations, set drop_haplotypes = TRUE if this is bad.
## Saving annotations to hsapiens_biomart_annotations.rda.
## Finished save().
panamensis_orgdb_idx <- grep(pattern = "^org.+panamen.+MHOM.+db$", x = rownames(installed.packages()))
panamensis_orgdb <- tail(rownames(installed.packages())[panamensis_orgdb_idx], n = 1)
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid")
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:hpgltools':
## 
##     annotation, annotation<-, conditions, conditions<-
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind, colnames, dirname,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with 'browseVignettes()'. To cite
##     Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE
## 'select()' returned 1:1 mapping between keys and columns

This is a little silly, but I am going to reload the annotations using the previous invocation to extract the annotation table without having to think. The previous block loads the orgdb for me, so I can just use that to get the fun annotations.

all_columns <- keytypes(get0(panamensis_orgdb))
annot_idx <- grep(pattern = "^ANNOT_", x = all_columns)
annot_columns <- all_columns[annot_idx]
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_APOLLO_LINK_OUT, ANNOT_APOLLO_TRANSCRIPT_DESCRIPTION, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_END_MAX, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENOMIC_SEQUENCE_LENGTH, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MAP_LOCATION, ANNOT_MCMC_LOCATION, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROB_MAP, ANNOT_PROB_MCMC, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_START_MIN, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_IDS, ANNOT_UNIPROT_LINKS
## 'select()' returned 1:1 mapping between keys and columns

3 Collect preprocessed metadata

first_spec <- make_rnaseq_spec()
input <- read_metadata("sample_sheets/persistence_hu_v2.1.3.xlsx")
## Error in read.xlsx.default(xlsxFile = file, sheet = sheet, detectDates = TRUE) : 
##   File does not exist.
## Error in read.xlsx.default(xlsxFile = file, sheet = sheet, detectDates = FALSE) : 
##   File does not exist.
## Error in read_metadata("sample_sheets/persistence_hu_v2.1.3.xlsx"): Unable to read the metadata file: sample_sheets/persistence_hu_v2.1.3.xlsx
colnames(input)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colnames': object 'input' not found
pre_meta <- gather_preprocessing_metadata(
  starting_metadata = input, id_column = "hpgl_identifier",
  specification = first_spec, new_metadata = "persistence_hu_modified.xlsx",
  basedir = "preprocessing", species = c("hg38_115", "lpanamensis_mhomcol_v68"))
## Error: object 'input' not found
head(pre_meta$new_meta)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'pre_meta' not found
summary(pre_meta$new_meta[["salmon_observed_genes"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'pre_meta' not found
hisat_idx <- grep(pattern = "^hisat", x = names(first_spec))
second_spec <- first_spec[hisat_idx]
post_meta <- gather_preprocessing_metadata(
  starting_metadata = pre_meta[["new_meta"]],
  specification = second_spec, basedir = "preprocessing/202405", species = "hg38_111",
  new_metadata = "sample_sheets/tmrc2_persistence_202405_lp_hg.xlsx")

both_meta <- gather_preprocessing_metadata(
  starting_metadata = "sample_sheets/tmrc_persistence_202405.xlsx",
  specification = first_spec,
  basedir = "preprocessing/202405", species= c("lpanamensis_v68", "hg38_111"),
  new_metadata = "sample_sheets/tmrc_persistence_202405_both.xlsx")

4 Collect gene annotations

I should have all my load_xyz_annotation functions return some of the same elements in their retlists.

lp_genes <- lp_annot[["genes"]]
hg_genes <- hs_annot[["gene_annotations"]]

5 Quick peek at the SL samples, hg38 release 111

hg_genes <- hs_annot[["annotation"]]
hg_map <- hs_annot[["gene_tx_map"]]
lp_genes <- lp_annot[["genes"]]

hu_se_salmon <- create_se(pre_meta[["new_meta"]], gene_info = hg_genes,
                          tx_gene_map = hg_map, file_column = "salmon_count_table_hg38_115")
## Error: object 'pre_meta' not found
hu_se_hisat_gene <- create_se(pre_meta[["new_meta"]], gene_info = hg_genes,
                              file_column = "hisat_count_table_hg38_115")
## Error: object 'pre_meta' not found
hu_se_salmon <- set_conditions(hu_se_salmon, fact = "sample_type") %>%
  set_batches("detectionparasiteby7sl")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se_salmon' not found
undef_7sl <- is.na(colData(hu_se)[["detectionparasiteby7sl"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
colData(hu_se)[undef_7sl, "detectionparasiteby7sl"] <- "unknown"
## Error: object 'hu_se' not found
table(colData(hu_se)[["library_type"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
table(colData(hu_se)[["detectionparasiteby7sl"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
table(colData(hu_se)[["clinical_presentation"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
sample_7sl <- paste0(colData(hu_se)[["sample_type"]], "_", colData(hu_se)[["detectionparasiteby7sl"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
colData(hu_se)[["sample_7sl"]] <- sample_7sl
## Error: object 'sample_7sl' not found
healthy_vs_scar <- gsub(x = colData(hu_se)[["condition"]], pattern = "^skin biopsy ", replacement = "")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
colData(hu_se)[["hs"]] <- healthy_vs_scar
## Error: object 'healthy_vs_scar' not found
hu_hs <- subset_se(hu_se, subset = "hs=='healthy'|hs=='scar'") %>%
  set_conditions(fact = "hs")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se' not found

6 HU Metadata

hu_mapped <- plot_metadata_factors(hu_se, column = "salmon_mapped")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
hu_mapped
## Error: object 'hu_mapped' not found
hu_observed <- plot_metadata_factors(hu_se, column = "salmon_observed_genes")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
hu_observed
## Error: object 'hu_observed' not found
hu_pct <- plot_metadata_factors(hu_se, column = "salmon_percent")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
hu_pct
## Error: object 'hu_pct' not found
hu_sankey <- plot_meta_sankey(hu_se, factors = c("detectionparasiteby7sl", "sample_type", "library_type"))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'design' in selecting a method for function 'plot_meta_sankey': object 'hu_se' not found
hu_sankey
## Error: object 'hu_sankey' not found

7 nonzero/libsize/etc

plot_legend(hu_se)
## Error: object 'hu_se' not found
plot_libsize(hu_se)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_libsize': object 'hu_se' not found
plot_nonzero(hu_se, y_intercept = 0.4)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_nonzero': object 'hu_se' not found

8 Normalize

hu_norm <- normalize(hu_se, transform = "log2", convert = "cpm", filter = TRUE, norm = "quant")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_se' not found
plot_corheat(hu_norm)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'hu_norm' not found
hu_norm_pca <- plot_pca(hu_norm)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_norm' not found
pp(file = "images/hu_pca_sampletype.png")
hu_norm_pca$plot
## Error: object 'hu_norm_pca' not found
dev.off()
## png 
##   2
hu_norm_pca
## Error: object 'hu_norm_pca' not found
hu_detected <- subset_se(hu_se, subset = "detectionparasiteby7sl!='unknown'") %>%
  set_conditions(fact = "detectionparasiteby7sl") %>%
  set_batches("sample_type")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se' not found
hu_detect_nb <- normalize(hu_detected, transform = "log2", convert = "cpm",
                          filter = TRUE, batch = "svaseq")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_detected' not found
hu_detect_pca <- plot_pca(hu_detect_nb)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_detect_nb' not found
pp(file = "images/hu_pca_detect_sva.png")
hu_detect_pca$plot
## Error: object 'hu_detect_pca' not found
dev.off()
## png 
##   2
hu_detect_pca
## Error: object 'hu_detect_pca' not found

9 Look at sample type and 7sl

hu_s7sl <- set_conditions(hu_se, fact = "sample_7sl")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se' not found
hu_nasal <- subset_se(hu_s7sl, subset = "sample_type=='Nasal Swab'")
## Error: object 'hu_s7sl' not found
hu_nasal_nb <- normalize(hu_nasal, transform = "log2", convert = "cpm",
                          batch = "svaseq", filter = TRUE)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_nasal' not found
pp(file = "images/nasal_sample_np.png")
plot_pca(hu_nasal_nb)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_nasal_nb' not found
dev.off()
## png 
##   2
hu_wbc <- subset_se(hu_s7sl, subset = "sample_type=='WBCs'")
## Error: object 'hu_s7sl' not found
hu_wbc_nb <- normalize(hu_wbc, transform = "log2", convert = "cpm",
                          batch = "svaseq", filter = TRUE)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_wbc' not found
pp(file = "images/wbc_sample_np.png")
plot_pca(hu_wbc_nb)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_wbc_nb' not found
dev.off()
## png 
##   2
short_factor <- gsub(x = as.character(colData(hu_nasal)[["condition"]]), pattern = ".*_(.*)$", replacement = "\\1")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_nasal' not found
hu_nasal <- set_conditions(hu_nasal, fact = as.factor(short_factor))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_nasal' not found
hu_nasal_np <- subset_se(hu_nasal, subset = "condition!='unknown'")
## Error: object 'hu_nasal' not found
hu_nasal_de <- all_pairwise(hu_nasal_np, filter = TRUE, force = TRUE,
                            model_fstring = "~ 0 + condition", model_svs = "svaseq")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'hu_nasal_np' not found
hu_nasal_de
## Error: object 'hu_nasal_de' not found
hu_nasal_table <- combine_de_tables(hu_nasal_de, excel = "excel/persist_table.xlsx")
## Error: object 'hu_nasal_de' not found
hu_nasal_table
## Error: object 'hu_nasal_table' not found
hu_nasal_sig <- extract_significant_genes(hu_nasal_table, excel = "excel/persist_sig.xlsx")
## Error: object 'hu_nasal_table' not found
hu_nasal_sig
## Error: object 'hu_nasal_sig' not found

10 Healthy vs Scar samples

One query from our last meeting which I forgot about until I reread my TODO notes: compare the samples marked as healthy compared to those marked as scar. These are two distantly separate skin biopsies of the same person.

hu_hs_de <- all_pairwise(hu_hs, filter = TRUE, force = TRUE,
                         model_svs = "svaseq", model_fstring = "~ 0 + condition")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'hu_hs' not found
hu_hs_table <- combine_de_tables(hu_hs_de, excel = "excel/healthy_vs_scar_table.xlsx")
## Error: object 'hu_hs_de' not found
hu_hs_table
## Error: object 'hu_hs_table' not found
hu_hs_sig <- extract_significant_genes(hu_hs_table, excel = "excel/healthy_vs_scar_sig.xlsx")
## Error: object 'hu_hs_table' not found
hu_hs_sig
## Error: object 'hu_hs_sig' not found

11 Take a peek at the kraken results

hu_kraken <- create_se(pre_meta[["new_meta"]], file_column = "kraken_matrix", handle_na = "zero")
## Error: object 'pre_meta' not found
hu_kraken <- set_conditions(hu_kraken, fact = "sample_type") %>%
  set_batches("detectionparasiteby7sl")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_kraken' not found
kraken_norm <- normalize(hu_kraken, filter = TRUE, norm = "cpm", transform = "log2")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_kraken' not found
plot_corheat(kraken_norm)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'kraken_norm' not found
plot_disheat(kraken_norm)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'kraken_norm' not found
plot_pca(kraken_norm)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'kraken_norm' not found
nasal_kraken <- subset_se(hu_kraken, subset = "condition=='Nasal Swab'")
## Error: object 'hu_kraken' not found
nasal_norm <- normalize(nasal_kraken, filter = TRUE, norm = "cpm", transform = "log2")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'nasal_kraken' not found
plot_corheat(nasal_norm)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'nasal_norm' not found
kraken_bacteria <- gsub(x = colData(hu_kraken)[["kraken_matrix"]],
                        pattern = "viral", replacement = "bacteria")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_kraken' not found
colData(hu_kraken)[["kraken_bacteria"]] <- kraken_bacteria
## Error: object 'kraken_bacteria' not found
hu_kraken_bac <- create_se(colData(hu_kraken), file_column = "kraken_bacteria", handle_na = "zero")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_kraken' not found
hu_kraken <- set_conditions(hu_kraken, fact = "sample_type") %>%
  set_batches("detectionparasiteby7sl")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_kraken' not found
---
title: "Examining human samples"
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
bibliography: atb.bib
output:
  html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: zenburn
    keep_md: false
    mode: selfcontained
    number_sections: true
    self_contained: true
    theme: readable
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: false
  rmdformats::readthedown:
    code_download: true
    code_folding: show
    df_print: paged
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: zenburn
    width: 300
    keep_md: false
    mode: selfcontained
    toc_float: true
  BiocStyle::html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: zenburn
    keep_md: false
    mode: selfcontained
    toc_float: true
---

<style type="text/css">
body, td {
  font-size: 16px;
}
code.r{
  font-size: 16px;
}
pre {
  font-size: 16px
}
body .main-container {
  max-width: 1600px;
}
</style>

```{r options, include=FALSE}
library(hpgltools)

library(ggplot2)
library(reticulate)
tt <- try(devtools::load_all("~/hpgltools"))
knitr::opts_knit$set(
  progress = TRUE, verbose = TRUE, width = 90, echo = TRUE)
knitr::opts_chunk$set(
  error = TRUE, fig.width = 8, fig.height = 8, fig.retina = 2,
  out.width = "100%", dev = "png",
  dev.args = list(png = list(type = "cairo-png")))
old_options <- options(digits = 4, stringsAsFactors = FALSE, knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size = 12))
ver <- "202405"
previous_file <- ""
ver <- format(Sys.Date(), "%Y%m%d")

##tmp <- sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz")))
rmd_file <- "human_samples_2025.Rmd"
```

# Introduction

I want to use this document to examine our first round of persistence
samples.  I checked my email from Najib and did not find a sample
sheet but did find an explanation of the three sample types we expect.

In preparation for this, I downloaded a new hg38 genome.  Since the
panamensis asembly has not significantly changed (excepting the
putative long read genome which I have not yet seen), I am just using
the same one.

# Loading annotation

The hg38 genome I got is brand new (202405), so do not use the archive
for a while.

```{r}
## Ok, so useast.ensembl is failing today, let us use the jan2024 archive?
#hs_annot <- load_biomart_annotations(archive = FALSE, species = "hsapiens")
## Seems like the 202401 archive is a good choice, it is explicitly the hg38_111 release.
## and it is waaaaay faster (like 100x) than useast right now.
hs_annot <- load_biomart_annotations(archive = FALSE, species = "hsapiens", overwrite = TRUE,
                                     year = 2025, month = "08")

panamensis_orgdb_idx <- grep(pattern = "^org.+panamen.+MHOM.+db$", x = rownames(installed.packages()))
panamensis_orgdb <- tail(rownames(installed.packages())[panamensis_orgdb_idx], n = 1)
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid")
```

This is a little silly, but I am going to reload the annotations using
the previous invocation to extract the annotation table without having
to think.  The previous block loads the orgdb for me, so I can just
use that to get the fun annotations.

```{r}
all_columns <- keytypes(get0(panamensis_orgdb))
annot_idx <- grep(pattern = "^ANNOT_", x = all_columns)
annot_columns <- all_columns[annot_idx]
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid", fields = annot_columns)
```

# Collect preprocessed metadata

```{r}
first_spec <- make_rnaseq_spec()
input <- read_metadata("sample_sheets/persistence_hu_v2.1.3.xlsx")
colnames(input)

pre_meta <- gather_preprocessing_metadata(
  starting_metadata = input, id_column = "hpgl_identifier",
  specification = first_spec, new_metadata = "persistence_hu_modified.xlsx",
  basedir = "preprocessing", species = c("hg38_115", "lpanamensis_mhomcol_v68"))
head(pre_meta$new_meta)
summary(pre_meta$new_meta[["salmon_observed_genes"]])
```


```{r, eval=FALSE}
hisat_idx <- grep(pattern = "^hisat", x = names(first_spec))
second_spec <- first_spec[hisat_idx]
post_meta <- gather_preprocessing_metadata(
  starting_metadata = pre_meta[["new_meta"]],
  specification = second_spec, basedir = "preprocessing/202405", species = "hg38_111",
  new_metadata = "sample_sheets/tmrc2_persistence_202405_lp_hg.xlsx")

both_meta <- gather_preprocessing_metadata(
  starting_metadata = "sample_sheets/tmrc_persistence_202405.xlsx",
  specification = first_spec,
  basedir = "preprocessing/202405", species= c("lpanamensis_v68", "hg38_111"),
  new_metadata = "sample_sheets/tmrc_persistence_202405_both.xlsx")
```

# Collect gene annotations

I should have all my load_xyz_annotation functions return some of the
same elements in their retlists.

```{r}
lp_genes <- lp_annot[["genes"]]
hg_genes <- hs_annot[["gene_annotations"]]
```

# Quick peek at the SL samples, hg38 release 111

```{r}
hg_genes <- hs_annot[["annotation"]]
hg_map <- hs_annot[["gene_tx_map"]]
lp_genes <- lp_annot[["genes"]]

hu_se_salmon <- create_se(pre_meta[["new_meta"]], gene_info = hg_genes,
                          tx_gene_map = hg_map, file_column = "salmon_count_table_hg38_115")
hu_se_hisat_gene <- create_se(pre_meta[["new_meta"]], gene_info = hg_genes,
                              file_column = "hisat_count_table_hg38_115")


hu_se_salmon <- set_conditions(hu_se_salmon, fact = "sample_type") %>%
  set_batches("detectionparasiteby7sl")

undef_7sl <- is.na(colData(hu_se)[["detectionparasiteby7sl"]])
colData(hu_se)[undef_7sl, "detectionparasiteby7sl"] <- "unknown"

table(colData(hu_se)[["library_type"]])
table(colData(hu_se)[["detectionparasiteby7sl"]])
table(colData(hu_se)[["clinical_presentation"]])
sample_7sl <- paste0(colData(hu_se)[["sample_type"]], "_", colData(hu_se)[["detectionparasiteby7sl"]])
colData(hu_se)[["sample_7sl"]] <- sample_7sl

healthy_vs_scar <- gsub(x = colData(hu_se)[["condition"]], pattern = "^skin biopsy ", replacement = "")
colData(hu_se)[["hs"]] <- healthy_vs_scar
hu_hs <- subset_se(hu_se, subset = "hs=='healthy'|hs=='scar'") %>%
  set_conditions(fact = "hs")
```

# HU Metadata

```{r}
hu_mapped <- plot_metadata_factors(hu_se, column = "salmon_mapped")
hu_mapped
hu_observed <- plot_metadata_factors(hu_se, column = "salmon_observed_genes")
hu_observed
hu_pct <- plot_metadata_factors(hu_se, column = "salmon_percent")
hu_pct
hu_sankey <- plot_meta_sankey(hu_se, factors = c("detectionparasiteby7sl", "sample_type", "library_type"))
hu_sankey
```

# nonzero/libsize/etc

```{r}
plot_legend(hu_se)

plot_libsize(hu_se)

plot_nonzero(hu_se, y_intercept = 0.4)
```

# Normalize

```{r}
hu_norm <- normalize(hu_se, transform = "log2", convert = "cpm", filter = TRUE, norm = "quant")
plot_corheat(hu_norm)

hu_norm_pca <- plot_pca(hu_norm)

pp(file = "images/hu_pca_sampletype.png")
hu_norm_pca$plot
dev.off()
hu_norm_pca

hu_detected <- subset_se(hu_se, subset = "detectionparasiteby7sl!='unknown'") %>%
  set_conditions(fact = "detectionparasiteby7sl") %>%
  set_batches("sample_type")
hu_detect_nb <- normalize(hu_detected, transform = "log2", convert = "cpm",
                          filter = TRUE, batch = "svaseq")
hu_detect_pca <- plot_pca(hu_detect_nb)
pp(file = "images/hu_pca_detect_sva.png")
hu_detect_pca$plot
dev.off()
hu_detect_pca
```

# Look at sample type and 7sl

```{r}
hu_s7sl <- set_conditions(hu_se, fact = "sample_7sl")

hu_nasal <- subset_se(hu_s7sl, subset = "sample_type=='Nasal Swab'")
hu_nasal_nb <- normalize(hu_nasal, transform = "log2", convert = "cpm",
                          batch = "svaseq", filter = TRUE)
pp(file = "images/nasal_sample_np.png")
plot_pca(hu_nasal_nb)
dev.off()

hu_wbc <- subset_se(hu_s7sl, subset = "sample_type=='WBCs'")
hu_wbc_nb <- normalize(hu_wbc, transform = "log2", convert = "cpm",
                          batch = "svaseq", filter = TRUE)
pp(file = "images/wbc_sample_np.png")
plot_pca(hu_wbc_nb)
dev.off()
```

```{r}
short_factor <- gsub(x = as.character(colData(hu_nasal)[["condition"]]), pattern = ".*_(.*)$", replacement = "\\1")
hu_nasal <- set_conditions(hu_nasal, fact = as.factor(short_factor))
hu_nasal_np <- subset_se(hu_nasal, subset = "condition!='unknown'")

hu_nasal_de <- all_pairwise(hu_nasal_np, filter = TRUE, force = TRUE,
                            model_fstring = "~ 0 + condition", model_svs = "svaseq")
hu_nasal_de

hu_nasal_table <- combine_de_tables(hu_nasal_de, excel = "excel/persist_table.xlsx")
hu_nasal_table

hu_nasal_sig <- extract_significant_genes(hu_nasal_table, excel = "excel/persist_sig.xlsx")
hu_nasal_sig
```

# Healthy vs Scar samples

One query from our last meeting which I forgot about until I reread my
TODO notes: compare the samples marked as healthy compared to those
marked as scar.  These are two distantly separate skin biopsies of the
same person.

```{r}
hu_hs_de <- all_pairwise(hu_hs, filter = TRUE, force = TRUE,
                         model_svs = "svaseq", model_fstring = "~ 0 + condition")
hu_hs_table <- combine_de_tables(hu_hs_de, excel = "excel/healthy_vs_scar_table.xlsx")
hu_hs_table
hu_hs_sig <- extract_significant_genes(hu_hs_table, excel = "excel/healthy_vs_scar_sig.xlsx")
hu_hs_sig
```

# Take a peek at the kraken results

```{r}
hu_kraken <- create_se(pre_meta[["new_meta"]], file_column = "kraken_matrix", handle_na = "zero")
hu_kraken <- set_conditions(hu_kraken, fact = "sample_type") %>%
  set_batches("detectionparasiteby7sl")

kraken_norm <- normalize(hu_kraken, filter = TRUE, norm = "cpm", transform = "log2")
plot_corheat(kraken_norm)
plot_disheat(kraken_norm)
plot_pca(kraken_norm)

nasal_kraken <- subset_se(hu_kraken, subset = "condition=='Nasal Swab'")
nasal_norm <- normalize(nasal_kraken, filter = TRUE, norm = "cpm", transform = "log2")
plot_corheat(nasal_norm)

kraken_bacteria <- gsub(x = colData(hu_kraken)[["kraken_matrix"]],
                        pattern = "viral", replacement = "bacteria")
colData(hu_kraken)[["kraken_bacteria"]] <- kraken_bacteria
hu_kraken_bac <- create_se(colData(hu_kraken), file_column = "kraken_bacteria", handle_na = "zero")
hu_kraken <- set_conditions(hu_kraken, fact = "sample_type") %>%
  set_batches("detectionparasiteby7sl")
```
