I want to use this document to examine our first round of persistence samples. I checked my email from Najib and did not find a sample sheet but did find an explanation of the three sample types we expect.
In preparation for this, I downloaded a new hg38 genome. Since the panamensis asembly has not significantly changed (excepting the putative long read genome which I have not yet seen), I am just using the same one.
The hg38 genome I got is brand new (202405), so do not use the archive for a while.
## Ok, so useast.ensembl is failing today, let us use the jan2024 archive?
#hs_annot <- load_biomart_annotations(archive = FALSE, species = "hsapiens")
## Seems like the 202401 archive is a good choice, it is explicitly the hg38_111 release.
## and it is waaaaay faster (like 100x) than useast right now.
hs_annot <- load_biomart_annotations(archive = TRUE, species = "hsapiens",
year = 2024, month = "01")
## The biomart annotations file already exists, loading from it.
panamensis_orgdb_idx <- grep(pattern = "^org.+panamen.+MHOM.+db$", x = rownames(installed.packages()))
panamensis_orgdb <- tail(rownames(installed.packages())[panamensis_orgdb_idx], n = 1)
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid")
## Loading required package: AnnotationDbi
##
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE
## 'select()' returned 1:1 mapping between keys and columns
This is a little silly, but I am going to reload the annotations using the previous invocation to extract the annotation table without having to think. The previous block loads the orgdb for me, so I can just use that to get the fun annotations.
all_columns <- keytypes(get0(panamensis_orgdb))
annot_idx <- grep(pattern = "^ANNOT_", x = all_columns)
annot_columns <- all_columns[annot_idx]
lp_annot <- load_orgdb_annotations(panamensis_orgdb, keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_APOLLO_LINK_OUT, ANNOT_APOLLO_TRANSCRIPT_DESCRIPTION, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_END_MAX, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENOMIC_SEQUENCE_LENGTH, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MAP_LOCATION, ANNOT_MCMC_LOCATION, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROB_MAP, ANNOT_PROB_MCMC, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_START_MIN, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_IDS, ANNOT_UNIPROT_LINKS
## 'select()' returned 1:1 mapping between keys and columns
first_spec <- make_rnaseq_spec()
pre_meta <- gather_preprocessing_metadata(
starting_metadata = "sample_sheets/tmrc_persistence_202405.xlsx",
specification = first_spec,
basedir = "preprocessing/202405", species="lpanamensis_v68",
new_metadata = "sample_sheets/tmrc_persistence_202405_lpanamensis.xlsx")
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## Warning in dispatch_regex_search(meta, search, replace, input_file_spec, : NAs introduced by coercion
## Writing new metadata to: sample_sheets/tmrc_persistence_202405_lpanamensis.xlsx
## Deleting the file sample_sheets/tmrc_persistence_202405_lpanamensis.xlsx before writing the tables.
hisat_idx <- grep(pattern = "^hisat", x = names(first_spec))
second_spec <- first_spec[hisat_idx]
post_meta <- gather_preprocessing_metadata(
starting_metadata = pre_meta[["new_meta"]],
specification = second_spec, basedir = "preprocessing/202405", species = "hg38_111",
new_metadata = "sample_sheets/tmrc2_persistence_202405_lp_hg.xlsx")
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_rrna_percent_log already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : NAs introduced by coercion
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_genome_single_concordant already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_genome_multi_concordant already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_genome_single_all already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_genome_multi_all already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_unmapped already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_genome_percent_log already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_observed_genes already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_observed_mean_exprs already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_observed_median_exprs already exists, replacing it.
## Warning in gather_preprocessing_metadata(starting_metadata = pre_meta[["new_meta"]], : Column: hisat_count_table already exists, replacing it.
## Writing new metadata to: sample_sheets/tmrc2_persistence_202405_lp_hg.xlsx
## Deleting the file sample_sheets/tmrc2_persistence_202405_lp_hg.xlsx before writing the tables.
both_meta <- gather_preprocessing_metadata(
starting_metadata = "sample_sheets/tmrc_persistence_202405.xlsx",
specification = first_spec,
basedir = "preprocessing/202405", species= c("lpanamensis_v68", "hg38_111"),
new_metadata = "sample_sheets/tmrc_persistence_202405_both.xlsx")
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## Warning in dispatch_regex_search(meta, search, replace, input_file_spec, : NAs introduced by coercion
## Warning in dispatch_regex_search(meta, search, replace, input_file_spec, : NAs introduced by coercion
## Writing new metadata to: sample_sheets/tmrc_persistence_202405_both.xlsx
## Deleting the file sample_sheets/tmrc_persistence_202405_both.xlsx before writing the tables.
I should have all my load_xyz_annotation functions return some of the same elements in their retlists.
hs_all_samples <- create_expt("sample_sheets/tmrc_persistence_202405_both.xlsx",
gene_info = hg_genes,
file_column = "hisatcounttablehg38111")
## Reading the sample metadata.
## The sample definitions comprises: 21 rows(samples) and 41 columns(metadata fields).
## Matched 21557 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21557 features and 21 samples.
## Subsetting on samples.
## The samples excluded are: PRCS0001, PRCS0002, PRCS0003, PRCS0004, PRCS0005, PRHU0001, PRHU0002, PRHU0003.
## subset_expt(): There were 21, now there are 13 samples.
## Subsetting on samples.
## The samples excluded are: PRCS0001, PRCS0002, PRCS0003, PRCS0004, PRCS0005, PRSL0001, PRSL0002, PRSL0003, PRSL0004, PRSL0005, PRSL0006, PRSL0007, PRSL0008, PRSL0009, PRSL0010, PRSL0011, PRSL0012, PRSL0015.
## subset_expt(): There were 21, now there are 3 samples.
lp_all_hisat_mapped <- plot_metadata_factors(lp_all_samples,
column = "hisatgenomesingleconcordantlpanamensisv68")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_all_samples' not found
## Error in eval(expr, envir, enclos): object 'lp_all_hisat_mapped' not found
hs_all_hisat_mapped <- plot_metadata_factors(hs_all_samples,
column = "hisatgenomesingleconcordanthg38111")
## Error in geom_jitter(height = 0, width = 0.1): could not find function "geom_jitter"
## Error in eval(expr, envir, enclos): object 'hs_all_hisat_mapped' not found
lp_sl_hisat_genes <- plot_metadata_factors(lp_sl_samples,
column = "hisatobservedgeneslpanamensisv68")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_sl_samples' not found
## Error in eval(expr, envir, enclos): object 'lp_sl_hisat_genes' not found
## Error in geom_jitter(height = 0, width = 0.1): could not find function "geom_jitter"
## Error in eval(expr, envir, enclos): object 'hs_sl_hisat_genes' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_libsize': object 'lp_sl_samples' not found
## Library sizes of 13 samples,
## ranging from 2,442,896 to 7,848,190.
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_nonzero': object 'lp_sl_samples' not found
## The following samples have less than 14012.05 genes.
## [1] "PRSL0001" "PRSL0002" "PRSL0003" "PRSL0004" "PRSL0005" "PRSL0006" "PRSL0007" "PRSL0008"
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## A non-zero genes plot of 13 samples.
## These samples have an average 4.26 CPM coverage and 11943 genes observed, ranging from 7260 to
## 16565.
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_libsize': object 'lp_sl_samples' not found
## Error in eval(expr, envir, enclos): object 'post_filter' not found
lp_sl_norm <- normalize_expt(lp_sl_samples, transform = "log2", convert = "cpm",
norm = "tmm", filter = TRUE)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'expt' in selecting a method for function 'normalize_expt': object 'lp_sl_samples' not found
## Error in eval(expr, envir, enclos): object 'lp_sl_norm' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'expt_data' in selecting a method for function 'plot_heatmap': object 'lp_sl_norm' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'expt_data' in selecting a method for function 'plot_heatmap': object 'lp_sl_norm' not found
hs_sl_hisat_mapped <- plot_metadata_factors(hs_sl_samples,
column = "hisatgenomesingleconcordanthg38111")
## Error in geom_jitter(height = 0, width = 0.1): could not find function "geom_jitter"
## Error in eval(expr, envir, enclos): object 'hs_sl_hisat_mapped' not found
## Error in geom_jitter(height = 0, width = 0.1): could not find function "geom_jitter"
## Error in eval(expr, envir, enclos): object 'hs_sl_hisat_genes' not found
## Library sizes of 13 samples,
## ranging from 2,442,896 to 7,848,190.
## The following samples have less than 14012.05 genes.
## [1] "PRSL0001" "PRSL0002" "PRSL0003" "PRSL0004" "PRSL0005" "PRSL0006" "PRSL0007" "PRSL0008"
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## A non-zero genes plot of 13 samples.
## These samples have an average 4.26 CPM coverage and 11943 genes observed, ranging from 7260 to
## 16565.