1 Annotation version: 20180828

1.1 Genome annotation with OrgDb/TxDb/OrganismDbi

TriTrypDB has just released a new version, so let us build fresh annotation data from it.

## Building the annotation packages is extremely slow on the first run.
## NOTE(review): lm_annot is reassigned further down once the OrgDb
## annotation table is loaded, so this return value is only transient.
lm_annot <- make_eupath_organismdbi(
  species = "Leishmania major",
  reinstall = TRUE
)

In order to load these new packages, I rather need to remember their names… Happily I have a function for that.

## Look up the package names for L. major at version "37".
lm_name <- get_eupath_pkgnames("Leishmania major", version = "37")
## Starting metadata download.
## Finished metadata download.
## Found the following hits: Leishmania major strain Friedlin, Leishmania major strain LV39c5, Leishmania major strain SD 75.1, choosing the first.
lm_name[["organismdbi"]]
## [1] "eupathdb.Leishmania.major.Friedlin.v37"
lmx_name <- get_eupath_pkgnames("Leishmania mexicana", version = "37")
## Starting metadata download.
## Finished metadata download.
## Found the following hits: Leishmania mexicana MHOM/GT/2001/U1103, choosing the first.
lmx_name[["organismdbi"]]
## [1] "eupathdb.Leishmania.mexicana.MHOMGT2001U1103.v37"

For those packages I have generated/installed, use this to generate an annotation table. Oh, but I prefixed the column names with ‘annot_’ in order to make sure that nothing is duplicated with the GO tables, ortholog tables, etc. As a result, these are wrong until the new annotations are loaded.

## Just to save on typing
## Attach the L. major OrgDb package by the name stored in lm_name$orgdb.
## The argument is spelled out as character.only; the abbreviated form
## `character` only worked through partial argument matching.
library(lm_name$orgdb, character.only = TRUE)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Fetch the object whose name matches the OrgDb package name (get0()
## yields NULL instead of an error when no such object exists), then
## print it.
lm_db <- get0(lm_name[["orgdb"]])
lm_db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | DBSCHEMA: NOSCHEMA_DB
## | ORGANISM: Leishmania major strain Friedlin
## | SPECIES: Leishmania major strain Friedlin
## | CENTRALID: GID
## | Taxonomy ID: 347515
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## 
## Please see: help('select') for usage information
## Attach the L. mexicana OrgDb package by the name stored in
## lmx_name$orgdb.  The argument is spelled out as character.only; the
## abbreviated form `character` only worked through partial argument
## matching.
library(lmx_name$orgdb, character.only = TRUE)
## 
## Fetch the object named after the package and print it.
lmx_db <- get0(lmx_name$orgdb)
lmx_db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | DBSCHEMA: NOSCHEMA_DB
## | ORGANISM: Leishmania mexicana MHOM/GT/2001/U1103
## | SPECIES: Leishmania mexicana MHOM/GT/2001/U1103
## | CENTRALID: GID
## | Taxonomy ID: 929439
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## 
## Please see: help('select') for usage information
## Columns to request from the OrgDb objects; all carry the 'annot_'
## prefix described in the text above.
wanted_fields <- c(
  "annot_gene_location_text",
  "annot_cds_length",
  "annot_gene_name",
  "annot_gene_product",
  "annot_gene_type",
  "annot_strand",
  "annot_gene_entrez_id",
  "annot_gene_orthomcl_name"
)
## Pull the requested columns for L. major, keyed by "gid".
lm_annot <- load_orgdb_annotations(
  lm_db,
  keytype = "gid",
  fields = wanted_fields
)
## Unable to find CDSNAME, setting it to ANNOT_GENE_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_GENE_NAME, GENE_TYPE, ANNOT_GENE_LOCATION_TEXT, ANNOT_CDS_LENGTH, ANNOT_GENE_NAME, ANNOT_GENE_PRODUCT, ANNOT_GENE_TYPE, ANNOT_STRAND, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ORTHOMCL_NAME
## 'select()' returned 1:many mapping between keys and columns
## Keep only the gene table and pass it through extract_gene_locations().
lm_annot <- extract_gene_locations(lm_annot$genes)
## Append ".1" to every row name.
## NOTE(review): presumably so the IDs match those used elsewhere -- confirm.
rownames(lm_annot) <- paste0(rownames(lm_annot), ".1")

## Pull the same wanted_fields columns for L. mexicana, keyed by "gid".
lmx_annot <- load_orgdb_annotations(
  lmx_db,
  keytype = "gid",
  fields = wanted_fields
)
## Unable to find CDSNAME, setting it to ANNOT_GENE_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_GENE_NAME, GENE_TYPE, ANNOT_GENE_LOCATION_TEXT, ANNOT_CDS_LENGTH, ANNOT_GENE_NAME, ANNOT_GENE_PRODUCT, ANNOT_GENE_TYPE, ANNOT_STRAND, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ORTHOMCL_NAME
## 'select()' returned 1:1 mapping between keys and columns
## Keep only the gene table and pass it through extract_gene_locations().
lmx_annot <- extract_gene_locations(lmx_annot$genes)
## Append ".1" to every row name.
## NOTE(review): presumably so the IDs match those used elsewhere -- confirm.
rownames(lmx_annot) <- paste0(rownames(lmx_annot), ".1")

1.2 Extract data from the gff files

## Path of the L. major Friedlin GFF file, relative to the working directory.
lm_gff_file <- "reference/TriTrypDB-37_LmajorFriedlin.gff"
## Load the "exon" entries from the GFF; the call is wrapped in sm().
## NOTE(review): sm() presumably silences messages -- confirm in hpgltools.
lm_gff_annotations <- sm(
  load_gff_annotations(lm_gff_file, type = "exon")
)
## Unless a variable named skip_load exists and is TRUE, report the
## session details and save the session to savefile.
if (!isTRUE(get0("skip_load"))) {
  ## Record the R session and package versions in the report.
  pander::pander(sessionInfo())
  ## Note which hpgltools commit was used and where the data will go.
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  message(paste0("Saving to ", savefile))
  ## savefile is expected to have been set earlier in the document.
  tmp <- sm(saveme(filename = savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 0b63ce6abcbb822832fe4631b4916f94931d8648
## R> packrat::restore()
## This is hpgltools commit: Wed Sep 5 12:04:45 2018 -0400: 0b63ce6abcbb822832fe4631b4916f94931d8648
## Saving to 01_annotation_v20180828.rda.xz
LS0tCnRpdGxlOiAiTGVpc2htYW5pYSBzdHJhaW5zIDIwMTgwNTogQ29sbGVjdGluZyBhbm5vdGF0aW9uIGluZm9ybWF0aW9uLiIKYXV0aG9yOiAiYXRiIGFiZWxld0BnbWFpbC5jb20iCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCIKb3V0cHV0OgogaHRtbF9kb2N1bWVudDoKICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgY29kZV9mb2xkaW5nOiBzaG93CiAgZmlnX2NhcHRpb246IHRydWUKICBmaWdfaGVpZ2h0OiA3CiAgZmlnX3dpZHRoOiA3CiAgaGlnaGxpZ2h0OiBkZWZhdWx0CiAga2VlcF9tZDogZmFsc2UKICBtb2RlOiBzZWxmY29udGFpbmVkCiAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgc2VsZl9jb250YWluZWQ6IHRydWUKICB0aGVtZTogcmVhZGFibGUKICB0b2M6IHRydWUKICB0b2NfZmxvYXQ6CiAgIGNvbGxhcHNlZDogZmFsc2UKICAgc21vb3RoX3Njcm9sbDogZmFsc2UKLS0tCgo8c3R5bGU+CiAgYm9keSAubWFpbi1jb250YWluZXIgewogICAgbWF4LXdpZHRoOiAxNjAwcHg7CiAgfQo8L3N0eWxlPgoKYGBge3Igb3B0aW9ucywgaW5jbHVkZT1GQUxTRX0KaWYgKCFpc1RSVUUoZ2V0MCgic2tpcF9sb2FkIikpKSB7CiAgbGlicmFyeShocGdsdG9vbHMpCiAgdHQgPC0gc20oZGV2dG9vbHM6OmxvYWRfYWxsKCJ+L2hwZ2x0b29scyIpKQogIGtuaXRyOjpvcHRzX2tuaXQkc2V0KHByb2dyZXNzPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZT1UUlVFLAogICAgICAgICAgICAgICAgICAgICAgIHdpZHRoPTkwLAogICAgICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKICBrbml0cjo6b3B0c19jaHVuayRzZXQoZXJyb3I9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTgsCiAgICAgICAgICAgICAgICAgICAgICAgIGZpZy5oZWlnaHQ9OCwKICAgICAgICAgICAgICAgICAgICAgICAgZHBpPTk2KQogIG9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICAgICAga25pdHIuZHVwbGljYXRlLmxhYmVsPSJhbGxvdyIpCiAgZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZT0xMikpCiAgdmVyIDwtICIyMDE4MDgyOCIKICBwcmV2aW91c19maWxlIDwtIHBhc3RlMCgiaW5kZXhfdiIsIHZlciwgIi5SbWQiKQoKICB0bXAgPC0gdHJ5KHNtKGxvYWRtZShmaWxlbmFtZT1nc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IlxcLnJkYVxcLnh6IiwgeD1wcmV2aW91c19maWxlKSkpKQogIHJtZF9maWxlIDwtIHBhc3RlMCgiMDFfYW5ub3RhdGlvbl92IiwgdmVyLCAiLlJtZCIpCiAgc2F2ZWZpbGUgPC0gZ3N1YihwYXR0ZXJuPSJcXC5SbWQiLCByZXBsYWNlPSJcXC5yZGFcXC54eiIsIHg9cm1kX2ZpbGUpCn0KYGBgCgojIEFubm90YXRpb24gdmVyc2lvbjogYHIgdmVyYAoKIyMgR2Vub21lIGFubm90YXRpb24gd2l0aCBPcmdEYi9UeERiL09yZ2Fu
aXNtRGJpCgpUaGUgdHJpdHJ5cGRiIGp1c3QgcmVsZWFzZWQgYSBuZXcgdmVyc2lvbi4gIExldCB1cyBtYWtlIG5ldyBhbm5vdGF0aW9uIGRhdGEgZnJvbSBpdC4KCmBgYHtyIGNyZWF0ZV9vcmdhbmlzbXMsIGV2YWw9RkFMU0V9CiMjIFRoZXNlIGZ1bmN0aW9ucyB0YWtlIF9mb3JldmVyXyB0aGUgZmlyc3QgdGltZSBhcm91bmQuCmxtX2Fubm90IDwtIG1ha2VfZXVwYXRoX29yZ2FuaXNtZGJpKHNwZWNpZXM9IkxlaXNobWFuaWEgbWFqb3IiLCByZWluc3RhbGw9VFJVRSkKYGBgCgpJbiBvcmRlciB0byBsb2FkIHRoZXNlIG5ldyBwYWNrYWdlcywgSSByYXRoZXIgbmVlZCB0byByZW1lbWJlciB0aGVpciBuYW1lcy4uLgpIYXBwaWx5IEkgaGF2ZSBhIGZ1bmN0aW9uIGZvciB0aGF0LgoKYGBge3IgbG9hZF9hbm5vdGF0aW9uc30KbG1fbmFtZSA8LSBnZXRfZXVwYXRoX3BrZ25hbWVzKCJMZWlzaG1hbmlhIG1ham9yIiwgdmVyc2lvbj0iMzciKQpsbV9uYW1lJG9yZ2FuaXNtZGJpCgpsbXhfbmFtZSA8LSBnZXRfZXVwYXRoX3BrZ25hbWVzKCJMZWlzaG1hbmlhIG1leGljYW5hIiwgdmVyc2lvbj0iMzciKQpsbXhfbmFtZSRvcmdhbmlzbWRiaQpgYGAKCkZvciB0aG9zZSBwYWNrYWdlcyBJIGhhdmUgZ2VuZXJhdGVkL2luc3RhbGxlZCwgdXNlIHRoaXMgdG8gZ2VuZXJhdGUgYW4KYW5ub3RhdGlvbiB0YWJsZS4gT2gsIGJ1dCBJIHByZWZpeGVkIHRoZSBjb2x1bW4gbmFtZXMgd2l0aCAnYW5ub3RfJyBpbiBvcmRlciB0bwptYWtlIHN1cmUgdGhhdCBub3RoaW5nIGlzIGR1cGxpY2F0ZWQgd2l0aCB0aGUgR08gdGFibGVzLCBvcnRob2xvZyB0YWJsZXMsIGV0Yy4KQXMgYSByZXN1bHQsIHRoZXNlIGFyZSB3cm9uZyB1bnRpbCB0aGUgbmV3IGFubm90YXRpb25zIGFyZSBsb2FkZWQuCgpgYGB7ciBsb2FkX29yZ2RifQojIyBKdXN0IHRvIHNhdmUgb24gdHlwaW5nCmxpYnJhcnkobG1fbmFtZSRvcmdkYiwgY2hhcmFjdGVyPVRSVUUpCmxtX2RiIDwtIGdldDAobG1fbmFtZSRvcmdkYikKbG1fZGIKCmxpYnJhcnkobG14X25hbWUkb3JnZGIsIGNoYXJhY3Rlcj1UUlVFKQpsbXhfZGIgPC0gZ2V0MChsbXhfbmFtZSRvcmdkYikKbG14X2RiCgp3YW50ZWRfZmllbGRzIDwtIGMoImFubm90X2dlbmVfbG9jYXRpb25fdGV4dCIsCiAgICAgICAgICAgICAgICAgICAiYW5ub3RfY2RzX2xlbmd0aCIsCiAgICAgICAgICAgICAgICAgICAiYW5ub3RfZ2VuZV9uYW1lIiwKICAgICAgICAgICAgICAgICAgICJhbm5vdF9nZW5lX3Byb2R1Y3QiLAogICAgICAgICAgICAgICAgICAgImFubm90X2dlbmVfdHlwZSIsCiAgICAgICAgICAgICAgICAgICAiYW5ub3Rfc3RyYW5kIiwKICAgICAgICAgICAgICAgICAgICJhbm5vdF9nZW5lX2VudHJlel9pZCIsCiAgICAgICAgICAgICAgICAgICAiYW5ub3RfZ2VuZV9vcnRob21jbF9uYW1lIikKbG1fYW5ub3QgPC0gbG9hZF9vcmdkYl9hbm5vdGF0aW9ucyhsbV9kYiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBrZXl0eXBl
PSJnaWQiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGZpZWxkcz13YW50ZWRfZmllbGRzKQpsbV9hbm5vdCA8LSBleHRyYWN0X2dlbmVfbG9jYXRpb25zKGxtX2Fubm90JGdlbmVzKQpyb3duYW1lcyhsbV9hbm5vdCkgPC0gcGFzdGUwKHJvd25hbWVzKGxtX2Fubm90KSwgIi4xIikKCmxteF9hbm5vdCA8LSBsb2FkX29yZ2RiX2Fubm90YXRpb25zKGxteF9kYiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBrZXl0eXBlPSJnaWQiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGZpZWxkcz13YW50ZWRfZmllbGRzKQpsbXhfYW5ub3QgPC0gZXh0cmFjdF9nZW5lX2xvY2F0aW9ucyhsbXhfYW5ub3QkZ2VuZXMpCnJvd25hbWVzKGxteF9hbm5vdCkgPC0gcGFzdGUwKHJvd25hbWVzKGxteF9hbm5vdCksICIuMSIpCmBgYAoKIyMgRXh0cmFjdCBkYXRhIGZyb20gdGhlIGdmZiBmaWxlcwoKYGBge3IgZ2V0X2Zyb21fZ2ZmfQpsbV9nZmZfZmlsZSA8LSAicmVmZXJlbmNlL1RyaVRyeXBEQi0zN19MbWFqb3JGcmllZGxpbi5nZmYiCmxtX2dmZl9hbm5vdGF0aW9ucyA8LSBzbShsb2FkX2dmZl9hbm5vdGF0aW9ucyhsbV9nZmZfZmlsZSwgdHlwZT0iZXhvbiIpKQpgYGAKCmBgYHtyIHNhdmVtZX0KaWYgKCFpc1RSVUUoZ2V0MCgic2tpcF9sb2FkIikpKSB7CiAgcGFuZGVyOjpwYW5kZXIoc2Vzc2lvbkluZm8oKSkKICBtZXNzYWdlKHBhc3RlMCgiVGhpcyBpcyBocGdsdG9vbHMgY29tbWl0OiAiLCBnZXRfZ2l0X2NvbW1pdCgpKSkKICBtZXNzYWdlKHBhc3RlMCgiU2F2aW5nIHRvICIsIHNhdmVmaWxlKSkKICB0bXAgPC0gc20oc2F2ZW1lKGZpbGVuYW1lPXNhdmVmaWxlKSkKfQpgYGAK