TriTrypDB just released a new version, so let us generate new annotation data from it.
## Build and install the OrgDb package for each T. cruzi CL Brener entry,
## but only when get_eupath_pkgnames() does not already report it as
## installed.  These functions take _forever_ the first time around.
## NOTE(review): esmer_entry, nonesmer_entry and unas_entry are assigned by
## the get_eupath_entry() calls further down in this document, so this
## chunk presumably needs those to have been run first -- confirm.
devtools::load_all("~/scratch/git/EuPathDB")

installedp <- get_eupath_pkgnames(esmer_entry)[["orgdb_installed"]]
if (!isTRUE(installedp)) {
  ## The original note here said that setting do_orthologs="get" selects the
  ## more thorough but slower ortholog download, instead of the
  ## still-changing 'OrthologsLite' table that the eupathdb folks are
  ## suggesting.
  ## NOTE(review): the call below does not pass do_orthologs at all --
  ## confirm whether the default now does what that note describes.
  esmer_annot <- EuPathDB::make_eupath_orgdb(
    esmer_entry,
    reinstall = TRUE,
    overwrite = TRUE
  )
}

installedp <- get_eupath_pkgnames(nonesmer_entry)[["orgdb_installed"]]
if (!isTRUE(installedp)) {
  nonesmer_annot <- EuPathDB::make_eupath_orgdb(
    nonesmer_entry,
    reinstall = TRUE,
    overwrite = TRUE
  )
}

installedp <- get_eupath_pkgnames(unas_entry)[["orgdb_installed"]]
if (!isTRUE(installedp)) {
  unas_annot <- EuPathDB::make_eupath_orgdb(
    unas_entry,
    reinstall = TRUE,
    overwrite = TRUE
  )
}
In order to load these new packages, I need to know their names. Happily, I have a function for that.
library(EuPathDB)
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomeInfoDbData
## Loading required package: AnnotationHub
##
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:hpgltools':
##
## cache
## The following object is masked from 'package:Biobase':
##
## cache
##
## This is EuPathDB version 1.5.0
## Read 'EuPathDB()' to get started.
##
## Attaching package: 'EuPathDB'
## The following objects are masked from 'package:hpgltools':
##
## get_kegg_orgn, load_kegg_annotations, load_orgdb_annotations,
## load_orgdb_go, orgdb_from_ah
## Look up the three T. cruzi CL Brener entries on TriTrypDB.
## Note from the recorded output: the "Esmeraldo-like" pattern hits both the
## Esmeraldo-like and the Non-Esmeraldo-like entries, and the first hit
## (Esmeraldo-like) is the one that gets used.
esmer_entry <- EuPathDB::get_eupath_entry(
  species = "Esmeraldo-like",
  webservice = "tritrypdb"
)
## Warning: 'BiocInstaller' and 'biocLite()' are deprecated, use the 'BiocManager'
## CRAN package instead.
## Found the following hits: Trypanosoma cruzi CL Brener Esmeraldo-like, Trypanosoma cruzi CL Brener Non-Esmeraldo-like, choosing the first.
## Using: Trypanosoma cruzi CL Brener Esmeraldo-like.
nonesmer_entry <- EuPathDB::get_eupath_entry(
  species = "Brener Non",
  webservice = "tritrypdb"
)
## Found: Trypanosoma cruzi CL Brener Non-Esmeraldo-like
## NOTE(review): the trailing "$" presumably anchors the pattern so that only
## the unassigned "CL Brener" entry matches -- confirm.
unas_entry <- EuPathDB::get_eupath_entry(
  species = "CL Brener$",
  webservice = "tritrypdb"
)
## Found: Trypanosoma cruzi strain CL Brener

## Ask for the package names belonging to each entry and show the OrgDb name.
esmer_names <- get_eupath_pkgnames(esmer_entry)
esmer_names[["orgdb"]]
## org.Tcruzi.CL.Brener.Esmeraldo.like.v43.eg.db
nonesmer_names <- get_eupath_pkgnames(nonesmer_entry)
nonesmer_names[["orgdb"]]
## org.Tcruzi.CL.Brener.Non.Esmeraldo.like.v43.eg.db
unas_names <- get_eupath_pkgnames(unas_entry)
unas_names[["orgdb"]]
## org.Tcruzi.CL.Brener.v43.eg.db
For the packages I have generated and installed, use the following to generate an annotation table. Note that I prefixed the annotation column names with ‘annot_’ in order to make sure that nothing is duplicated with the GO tables, ortholog tables, etc.
## Attach the three OrgDb packages.  Their names are held in variables (just
## to save on typing), so library() has to be told to evaluate its argument
## as a character string.  The argument is spelled out in full as
## character.only rather than relying on partial matching of 'character'.
library(esmer_names$orgdb, character.only = TRUE)
## Loading required package: AnnotationDbi
##
library(nonesmer_names$orgdb, character.only = TRUE)
##
library(unas_names$orgdb, character.only = TRUE)
##
## Fetch the object that carries the same name as the Esmeraldo OrgDb
## package, then display it.
esmer_db <- get0(esmer_names[["orgdb"]])
esmer_db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | DBSCHEMA: NOSCHEMA_DB
## | ORGANISM: Trypanosoma cruzi
## | SPECIES: Trypanosoma cruzi
## | CENTRALID: GID
## | Taxonomy ID: 5693
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
##
## Please see: help('select') for usage information
## Fetch the object that carries the same name as the Non-Esmeraldo OrgDb
## package, then display it.
nonesmer_db <- get0(nonesmer_names[["orgdb"]])
nonesmer_db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | DBSCHEMA: NOSCHEMA_DB
## | ORGANISM: Trypanosoma cruzi
## | SPECIES: Trypanosoma cruzi
## | CENTRALID: GID
## | Taxonomy ID: 5693
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
##
## Please see: help('select') for usage information
## Fetch the object that carries the same name as the unassigned CL Brener
## OrgDb package, then display it.
unas_db <- get0(unas_names[["orgdb"]])
unas_db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | DBSCHEMA: NOSCHEMA_DB
## | ORGANISM: Trypanosoma cruzi strain CL Brener
## | SPECIES: Trypanosoma cruzi strain CL Brener
## | CENTRALID: GID
## | Taxonomy ID: 353153
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
##
## Please see: help('select') for usage information
Let's see what columns are available in the annotation packages.
## List every column the Esmeraldo OrgDb can return, and print the listing.
all_fields <- columns(esmer_db)
print(all_fields)
## [1] "ANNOT_BFD3_CDS"
## [2] "ANNOT_BFD3_MODEL"
## [3] "ANNOT_BFD6_CDS"
## [4] "ANNOT_BFD6_MODEL"
## [5] "ANNOT_CDS"
## [6] "ANNOT_CDS_LENGTH"
## [7] "ANNOT_CHROMOSOME"
## [8] "ANNOT_DIF_CDS"
## [9] "ANNOT_DIF_MODEL"
## [10] "ANNOT_EC_NUMBERS"
## [11] "ANNOT_EC_NUMBERS_DERIVED"
## [12] "ANNOT_EXON_COUNT"
## [13] "ANNOT_FC_BFD3_CDS"
## [14] "ANNOT_FC_BFD3_MODEL"
## [15] "ANNOT_FC_BFD6_CDS"
## [16] "ANNOT_FC_BFD6_MODEL"
## [17] "ANNOT_FC_DIF_CDS"
## [18] "ANNOT_FC_DIF_MODEL"
## [19] "ANNOT_FC_PF_CDS"
## [20] "ANNOT_FC_PF_MODEL"
## [21] "ANNOT_FIVE_PRIME_UTR_LENGTH"
## [22] "ANNOT_GENE_ENTREZ_ID"
## [23] "ANNOT_GENE_EXON_COUNT"
## [24] "ANNOT_GENE_HTS_NONCODING_SNPS"
## [25] "ANNOT_GENE_HTS_NONSYN_SYN_RATIO"
## [26] "ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS"
## [27] "ANNOT_GENE_HTS_STOP_CODON_SNPS"
## [28] "ANNOT_GENE_HTS_SYNONYMOUS_SNPS"
## [29] "ANNOT_GENE_LOCATION_TEXT"
## [30] "ANNOT_GENE_NAME"
## [31] "ANNOT_GENE_ORTHOLOG_NUMBER"
## [32] "ANNOT_GENE_ORTHOMCL_NAME"
## [33] "ANNOT_GENE_PARALOG_NUMBER"
## [34] "ANNOT_GENE_PREVIOUS_IDS"
## [35] "ANNOT_GENE_PRODUCT"
## [36] "ANNOT_GENE_SOURCE_ID"
## [37] "ANNOT_GENE_TOTAL_HTS_SNPS"
## [38] "ANNOT_GENE_TRANSCRIPT_COUNT"
## [39] "ANNOT_GENE_TYPE"
## [40] "ANNOT_GO_COMPONENT"
## [41] "ANNOT_GO_FUNCTION"
## [42] "ANNOT_GO_ID_COMPONENT"
## [43] "ANNOT_GO_ID_FUNCTION"
## [44] "ANNOT_GO_ID_PROCESS"
## [45] "ANNOT_GO_PROCESS"
## [46] "ANNOT_HAS_MISSING_TRANSCRIPTS"
## [47] "ANNOT_INTERPRO_DESCRIPTION"
## [48] "ANNOT_INTERPRO_ID"
## [49] "ANNOT_IS_PSEUDO"
## [50] "ANNOT_ISOELECTRIC_POINT"
## [51] "ANNOT_LOCATION_TEXT"
## [52] "ANNOT_MATCHED_RESULT"
## [53] "ANNOT_MOLECULAR_WEIGHT"
## [54] "ANNOT_NO_TET_CDS"
## [55] "ANNOT_NO_TET_MODEL"
## [56] "ANNOT_ORGANISM"
## [57] "ANNOT_PF_CDS"
## [58] "ANNOT_PF_MODEL"
## [59] "ANNOT_PFAM_DESCRIPTION"
## [60] "ANNOT_PFAM_ID"
## [61] "ANNOT_PIRSF_DESCRIPTION"
## [62] "ANNOT_PIRSF_ID"
## [63] "ANNOT_PREDICTED_GO_COMPONENT"
## [64] "ANNOT_PREDICTED_GO_FUNCTION"
## [65] "ANNOT_PREDICTED_GO_ID_COMPONENT"
## [66] "ANNOT_PREDICTED_GO_ID_FUNCTION"
## [67] "ANNOT_PREDICTED_GO_ID_PROCESS"
## [68] "ANNOT_PREDICTED_GO_PROCESS"
## [69] "ANNOT_PROJECT_ID"
## [70] "ANNOT_PROSITEPROFILES_DESCRIPTION"
## [71] "ANNOT_PROSITEPROFILES_ID"
## [72] "ANNOT_PROTEIN_LENGTH"
## [73] "ANNOT_PROTEIN_SEQUENCE"
## [74] "ANNOT_SEQUENCE_ID"
## [75] "ANNOT_SIGNALP_PEPTIDE"
## [76] "ANNOT_SIGNALP_SCORES"
## [77] "ANNOT_SMART_DESCRIPTION"
## [78] "ANNOT_SMART_ID"
## [79] "ANNOT_SOURCE_ID"
## [80] "ANNOT_STRAND"
## [81] "ANNOT_SUPERFAMILY_DESCRIPTION"
## [82] "ANNOT_SUPERFAMILY_ID"
## [83] "ANNOT_THREE_PRIME_UTR_LENGTH"
## [84] "ANNOT_TIGRFAM_DESCRIPTION"
## [85] "ANNOT_TIGRFAM_ID"
## [86] "ANNOT_TM_COUNT"
## [87] "ANNOT_TRANS_FOUND_PER_GENE_INTERNAL"
## [88] "ANNOT_TRANSCRIPT_INDEX_PER_GENE"
## [89] "ANNOT_TRANSCRIPT_LENGTH"
## [90] "ANNOT_TRANSCRIPT_LINK"
## [91] "ANNOT_TRANSCRIPT_PRODUCT"
## [92] "ANNOT_TRANSCRIPT_SEQUENCE"
## [93] "ANNOT_TRANSCRIPTS_FOUND_PER_GENE"
## [94] "ANNOT_UNIPROT_ID"
## [95] "ANNOT_URI"
## [96] "ANNOT_WDK_WEIGHT"
## [97] "CHR_ID"
## [98] "GENE_TYPE"
## [99] "GID"
## [100] "GO_EVIDENCE_CODE"
## [101] "GO_ID"
## [102] "GO_IS_NOT"
## [103] "GO_ONTOLOGY"
## [104] "GO_REFERENCE"
## [105] "GO_SORT_KEY"
## [106] "GO_SOURCE"
## [107] "GO_SUPPORT_FOR_EVIDENCE_CODE_ASSIGNMENT"
## [108] "GO_TERM_NAME"
## [109] "GO_TRANSCRIPT_ID_S"
## [110] "INTERPRO_DESCRIPTION"
## [111] "INTERPRO_E_VALUE"
## [112] "INTERPRO_END_MIN"
## [113] "INTERPRO_ID"
## [114] "INTERPRO_NAME"
## [115] "INTERPRO_PRIMARY_ID"
## [116] "INTERPRO_SECONDARY_ID"
## [117] "INTERPRO_START_MIN"
## [118] "INTERPRO_TRANSCRIPT_ID_S"
## [119] "KEGGREST_KEGG_GENEID"
## [120] "KEGGREST_NCBI_GENEID"
## [121] "KEGGREST_NCBI_PROTEINID"
## [122] "KEGGREST_PATHWAYS"
## [123] "KEGGREST_UNIPROTID"
## [124] "LINKOUT_DATABASE"
## [125] "LINKOUT_EXT_ID"
## [126] "LINKOUT_LINK_URL"
## [127] "LINKOUT_SOURCE_ID"
## [128] "ORTHOLOGS_GID"
## [129] "ORTHOLOGS_ORGANISM"
## [130] "ORTHOLOGS_PRODUCT"
## [131] "ORTHOLOGS_SYNTENIC"
## [132] "PATHWAY_EC_NUMBER_MATCHED_IN_PATHWAY"
## [133] "PATHWAY_EXACT_EC_NUMBER_MATCH"
## [134] "PATHWAY_EXPASY_URL"
## [135] "PATHWAY_ID"
## [136] "PATHWAY_REACTIONS_MATCHING_EC_NUMBER"
## [137] "PATHWAY_SOURCE"
## [138] "PATHWAY_SOURCE_ID"
## [139] "PUBMED_AUTHORS"
## [140] "PUBMED_DOI"
## [141] "PUBMED_ID"
## [142] "PUBMED_TITLE"
## The annotation columns to extract.  Every one of them carries the
## 'annot_' prefix, so the prefix is attached once here.
wanted_fields <- paste0(
  "annot_",
  c(
    "gene_location_text",
    "cds_length",
    "gene_name",
    "gene_product",
    "gene_type",
    "strand",
    "gene_entrez_id",
    "gene_orthomcl_name"
  )
)
## Extract the wanted annotation columns from the Esmeraldo OrgDb, keyed by
## gene ID.
esmer_annot <- load_orgdb_annotations(esmer_db, keytype = "gid",
                                      fields = wanted_fields)
## Unable to find CDSNAME, setting it to ANNOT_GENE_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_GENE_NAME, GENE_TYPE, ANNOT_GENE_LOCATION_TEXT, ANNOT_CDS_LENGTH, ANNOT_GENE_NAME, ANNOT_GENE_PRODUCT, ANNOT_GENE_TYPE, ANNOT_STRAND, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ORTHOMCL_NAME
## 'select()' returned 1:1 mapping between keys and columns
## Extract the wanted annotation columns from the Non-Esmeraldo OrgDb, keyed
## by gene ID.
nonesmer_annot <- load_orgdb_annotations(nonesmer_db, keytype = "gid",
                                         fields = wanted_fields)
## Unable to find CDSNAME, setting it to ANNOT_GENE_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_GENE_NAME, GENE_TYPE, ANNOT_GENE_LOCATION_TEXT, ANNOT_CDS_LENGTH, ANNOT_GENE_NAME, ANNOT_GENE_PRODUCT, ANNOT_GENE_TYPE, ANNOT_STRAND, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ORTHOMCL_NAME
## 'select()' returned 1:1 mapping between keys and columns
## Extract the wanted annotation columns from the unassigned CL Brener OrgDb,
## keyed by gene ID.
unas_annot <- load_orgdb_annotations(unas_db, keytype = "gid",
                                     fields = wanted_fields)
## Unable to find CDSNAME, setting it to ANNOT_GENE_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_GENE_NAME, GENE_TYPE, ANNOT_GENE_LOCATION_TEXT, ANNOT_CDS_LENGTH, ANNOT_GENE_NAME, ANNOT_GENE_PRODUCT, ANNOT_GENE_TYPE, ANNOT_STRAND, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ORTHOMCL_NAME
## 'select()' returned 1:1 mapping between keys and columns
Now combine the esmer, nonesmer, and unassigned annotations.
## Stack the gene tables: Esmeraldo first, then Non-Esmeraldo, then the
## unassigned genes.
clbr_annot <- rbind(esmer_annot[["genes"]], nonesmer_annot[["genes"]])
clbr_annot <- rbind(clbr_annot, unas_annot[["genes"]])

## Wait, did they finally combine esmer_annot/nonesmer_annot and the whole clbr?
## Check whether the first few unassigned gene IDs also appear in either
## haplotype table.
testers <- head(rownames(unas_annot[["genes"]]), n = 6L)
testers %in% rownames(nonesmer_annot[["genes"]])
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
testers %in% rownames(esmer_annot[["genes"]])
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
## No, I guess not.

clbr_annot <- extract_gene_locations(clbr_annot)

## Make two copies of the combined table whose row names are decorated:
## "exon_<id>.1" for the hisat copy and "<id>.mRNA" for the salmon copy.
## NOTE(review): presumably these match the feature IDs produced by the
## respective quantification outputs -- confirm.
hisat_clbr_annot <- clbr_annot
rownames(hisat_clbr_annot) <- paste0("exon_", rownames(clbr_annot), ".1")
salmon_clbr_annot <- clbr_annot
rownames(salmon_clbr_annot) <- paste0(rownames(clbr_annot), ".mRNA")
For the moment, I think I will just ask for the Esmeraldo -> Non-Esmeraldo orthologs.
## Query the Esmeraldo OrgDb for orthologs found in the Non-Esmeraldo
## haplotype.
orthos <- EuPathDB::extract_eupath_orthologs(db = esmer_db,
                                             query_species = "Trypanosoma cruzi CL Brener Non-Esmeraldo-like",
                                             id_column = "ORTHOLOGS_GID")
## Some columns were missing: ORTHOLOGS_COUNT
## Removing them, which may end badly.
## 'select()' returned 1:many mapping between keys and columns
## There are 48 possible species in this group.
## Found species: Trypanosoma cruzi CL Brener Non-Esmeraldo-like

## Keep just the two gene ID columns and label them by haplotype.
orthos <- orthos[, c("GID", "ORTHOLOGS_GID")]
colnames(orthos) <- c("Esmeraldo", "NonEsmeraldo")
## Record the session details and save the workspace, unless a variable named
## 'skip_load' exists and is TRUE.  get0() yields NULL when 'skip_load' is not
## defined, so by default this block runs.
## NOTE(review): 'savefile' is not assigned anywhere in the visible part of
## this document; presumably it is set in a setup chunk -- confirm.
## NOTE(review): the recorded output following this block ends with
## "error writing to connection", so the save did not complete in that run.
if (!isTRUE(get0("skip_load"))) {
## Print the R session information as a formatted report.
pander::pander(sessionInfo())
## Report which hpgltools commit is in use and where the data will be saved.
message(paste0("This is hpgltools commit: ", get_git_commit()))
message(paste0("Saving to ", savefile))
## NOTE(review): sm() presumably silences the messages emitted by saveme()
## -- confirm against hpgltools.
tmp <- sm(saveme(filename=savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 0abc58e173be7300595d30d407b7efd4e4a512d6
## This is hpgltools commit: Thu May 9 14:56:34 2019 -0400: 0abc58e173be7300595d30d407b7efd4e4a512d6
## Saving to 01_annotation_v20190513.rda.xz
## Error in save(list = ls(all.names = TRUE, envir = globalenv()), envir = globalenv(), : error writing to connection