I want to use this document to examine our first round of persistence samples. I checked my email from Najib and did not find a sample sheet but did find an explanation of the three sample types we expect.
In preparation for this, I downloaded a new hg38 genome. Since the panamensis asembly has not significantly changed (excepting the putative long read genome which I have not yet seen), I am just using the same one.
The hg38 genome I got is brand new (202405), so do not use the archive for a while.
## Ok, so useast.ensembl is failing today, let us use the jan2024 archive?
#hs_annot <- load_biomart_annotations(archive = FALSE, species = "hsapiens")
## Seems like the 202401 archive is a good choice, it is explicitly the hg38_111 release.
## and it is waaaaay faster (like 100x) than useast right now.
hs_annot <- load_biomart_annotations(archive = FALSE, species = "hsapiens", overwrite = TRUE,
year = 2025, month = "08")## Using mart: ENSEMBL_MART_ENSEMBL from host: useast.ensembl.org.
## Successfully connected to the hsapiens_gene_ensembl database.
## Finished downloading ensembl gene annotations.
## Finished downloading ensembl structure annotations.
## symbol columns is null, pattern matching 'symbol' and taking the first.
## Found 2 potential symbol columns, using the first:hgnc_symbol.
## Including symbols, there are 86371 vs the 533740 gene annotations.
## Not dropping haplotype chromosome annotations, set drop_haplotypes = TRUE if this is bad.
## Saving annotations to hsapiens_biomart_annotations.rda.
## Finished save().
panamensis_orgdb_idx <- grep(pattern = "^org.+panamen.+MHOM.+db$", x = rownames(installed.packages()))
panamensis_orgdb <- tail(rownames(installed.packages())[panamensis_orgdb_idx], n = 1)
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid")## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:hpgltools':
##
## annotation, annotation<-, conditions, conditions<-
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind, colnames, dirname,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with 'browseVignettes()'. To cite
## Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE
## 'select()' returned 1:1 mapping between keys and columns
This is a little silly, but I am going to reload the annotations using the previous invocation to extract the annotation table without having to think. The previous block loads the orgdb for me, so I can just use that to get the fun annotations.
all_columns <- keytypes(get0(panamensis_orgdb))
annot_idx <- grep(pattern = "^ANNOT_", x = all_columns)
annot_columns <- all_columns[annot_idx]
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid", fields = annot_columns)## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_APOLLO_LINK_OUT, ANNOT_APOLLO_TRANSCRIPT_DESCRIPTION, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_END_MAX, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENOMIC_SEQUENCE_LENGTH, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MAP_LOCATION, ANNOT_MCMC_LOCATION, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROB_MAP, ANNOT_PROB_MCMC, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_START_MIN, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_IDS, ANNOT_UNIPROT_LINKS
## 'select()' returned 1:1 mapping between keys and columns
## Error in read.xlsx.default(xlsxFile = file, sheet = sheet, detectDates = TRUE) :
## File does not exist.
## Error in read.xlsx.default(xlsxFile = file, sheet = sheet, detectDates = FALSE) :
## File does not exist.
## Error in read_metadata("sample_sheets/persistence_hu_v2.1.3.xlsx"): Unable to read the metadata file: sample_sheets/persistence_hu_v2.1.3.xlsx
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colnames': object 'input' not found
pre_meta <- gather_preprocessing_metadata(
starting_metadata = input, id_column = "hpgl_identifier",
specification = first_spec, new_metadata = "persistence_hu_modified.xlsx",
basedir = "preprocessing", species = c("hg38_115", "lpanamensis_mhomcol_v68"))## Error: object 'input' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'pre_meta' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'pre_meta' not found
hisat_idx <- grep(pattern = "^hisat", x = names(first_spec))
second_spec <- first_spec[hisat_idx]
post_meta <- gather_preprocessing_metadata(
starting_metadata = pre_meta[["new_meta"]],
specification = second_spec, basedir = "preprocessing/202405", species = "hg38_111",
new_metadata = "sample_sheets/tmrc2_persistence_202405_lp_hg.xlsx")
both_meta <- gather_preprocessing_metadata(
starting_metadata = "sample_sheets/tmrc_persistence_202405.xlsx",
specification = first_spec,
basedir = "preprocessing/202405", species= c("lpanamensis_v68", "hg38_111"),
new_metadata = "sample_sheets/tmrc_persistence_202405_both.xlsx")I should have all my load_xyz_annotation functions return some of the same elements in their retlists.
hg_genes <- hs_annot[["annotation"]]
hg_map <- hs_annot[["gene_tx_map"]]
lp_genes <- lp_annot[["genes"]]
hu_se_salmon <- create_se(pre_meta[["new_meta"]], gene_info = hg_genes,
tx_gene_map = hg_map, file_column = "salmon_count_table_hg38_115")## Error: object 'pre_meta' not found
hu_se_hisat_gene <- create_se(pre_meta[["new_meta"]], gene_info = hg_genes,
file_column = "hisat_count_table_hg38_115")## Error: object 'pre_meta' not found
hu_se_salmon <- set_conditions(hu_se_salmon, fact = "sample_type") %>%
set_batches("detectionparasiteby7sl")## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se_salmon' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error: object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
sample_7sl <- paste0(colData(hu_se)[["sample_type"]], "_", colData(hu_se)[["detectionparasiteby7sl"]])## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error: object 'sample_7sl' not found
healthy_vs_scar <- gsub(x = colData(hu_se)[["condition"]], pattern = "^skin biopsy ", replacement = "")## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error: object 'healthy_vs_scar' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error: object 'hu_mapped' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error: object 'hu_observed' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_se' not found
## Error: object 'hu_pct' not found
hu_sankey <- plot_meta_sankey(hu_se, factors = c("detectionparasiteby7sl", "sample_type", "library_type"))## Error in h(simpleError(msg, call)): error in evaluating the argument 'design' in selecting a method for function 'plot_meta_sankey': object 'hu_se' not found
## Error: object 'hu_sankey' not found
## Error: object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_libsize': object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_nonzero': object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'hu_norm' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_norm' not found
## Error: object 'hu_norm_pca' not found
## png
## 2
## Error: object 'hu_norm_pca' not found
hu_detected <- subset_se(hu_se, subset = "detectionparasiteby7sl!='unknown'") %>%
set_conditions(fact = "detectionparasiteby7sl") %>%
set_batches("sample_type")## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se' not found
hu_detect_nb <- normalize(hu_detected, transform = "log2", convert = "cpm",
filter = TRUE, batch = "svaseq")## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_detected' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_detect_nb' not found
## Error: object 'hu_detect_pca' not found
## png
## 2
## Error: object 'hu_detect_pca' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_se' not found
## Error: object 'hu_s7sl' not found
hu_nasal_nb <- normalize(hu_nasal, transform = "log2", convert = "cpm",
batch = "svaseq", filter = TRUE)## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_nasal' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_nasal_nb' not found
## png
## 2
## Error: object 'hu_s7sl' not found
hu_wbc_nb <- normalize(hu_wbc, transform = "log2", convert = "cpm",
batch = "svaseq", filter = TRUE)## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_wbc' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hu_wbc_nb' not found
## png
## 2
short_factor <- gsub(x = as.character(colData(hu_nasal)[["condition"]]), pattern = ".*_(.*)$", replacement = "\\1")## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_nasal' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_nasal' not found
## Error: object 'hu_nasal' not found
hu_nasal_de <- all_pairwise(hu_nasal_np, filter = TRUE, force = TRUE,
model_fstring = "~ 0 + condition", model_svs = "svaseq")## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'hu_nasal_np' not found
## Error: object 'hu_nasal_de' not found
## Error: object 'hu_nasal_de' not found
## Error: object 'hu_nasal_table' not found
## Error: object 'hu_nasal_table' not found
## Error: object 'hu_nasal_sig' not found
One query from our last meeting which I forgot about until I reread my TODO notes: compare the samples marked as healthy compared to those marked as scar. These are two distantly separate skin biopsies of the same person.
hu_hs_de <- all_pairwise(hu_hs, filter = TRUE, force = TRUE,
model_svs = "svaseq", model_fstring = "~ 0 + condition")## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'hu_hs' not found
## Error: object 'hu_hs_de' not found
## Error: object 'hu_hs_table' not found
## Error: object 'hu_hs_table' not found
## Error: object 'hu_hs_sig' not found
## Error: object 'pre_meta' not found
hu_kraken <- set_conditions(hu_kraken, fact = "sample_type") %>%
set_batches("detectionparasiteby7sl")## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_kraken' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'hu_kraken' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'kraken_norm' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'kraken_norm' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'kraken_norm' not found
## Error: object 'hu_kraken' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'normalize': object 'nasal_kraken' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input_data' in selecting a method for function 'plot_heatmap': object 'nasal_norm' not found
kraken_bacteria <- gsub(x = colData(hu_kraken)[["kraken_matrix"]],
pattern = "viral", replacement = "bacteria")## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_kraken' not found
## Error: object 'kraken_bacteria' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hu_kraken' not found
hu_kraken <- set_conditions(hu_kraken, fact = "sample_type") %>%
set_batches("detectionparasiteby7sl")## Error in h(simpleError(msg, call)): error in evaluating the argument 'exp' in selecting a method for function 'set_batches': error in evaluating the argument 'exp' in selecting a method for function 'set_conditions': object 'hu_kraken' not found