1 Annotation version: 20171019

The following section loads the MicrobesOnline and GenBank annotations for Pseudomonas aeruginosa.

## Fetch the MicrobesOnline annotation table; taxon ID 208963 looks like the
## right one (the download message below reports Pseudomonas aeruginosa UCBPP-PA14).
paeruginosa_annotations <- load_microbesonline_annotations(id=208963)
## The species being downloaded is: Pseudomonas aeruginosa UCBPP-PA14
## Render the first few rows as a table to eyeball the columns that came back.
knitr::kable(head(paeruginosa_annotations))
locusId accession GI scaffoldId start stop strand sysName name desc COG COGFun COGDesc TIGRFam TIGRRoles GO EC ECDesc
2194572 YP_788156.1 116053721 4582 483 2027 + PA14_00010 dnaA chromosomal replication initiator protein DnaA (NCBI) COG593 L ATPase involved in DNA replication initiation TIGR00362 chromosomal replication initiator protein DnaA [dnaA] DNA metabolism:DNA replication, recombination, and repair GO:0006270,GO:0006275,GO:0003688,GO:0017111,GO:0005524
2194573 YP_788157.1 116053722 4582 2056 3159 + PA14_00020 dnaN DNA polymerase III, beta chain (NCBI) COG592 L DNA polymerase sliding clamp subunit (PCNA homolog) TIGR00663 DNA polymerase III, beta subunit [dnaN] DNA metabolism:DNA replication, recombination, and repair GO:0006260,GO:0003677,GO:0003893,GO:0008408,GO:0016449,GO:0019984,GO:0003889,GO:0003894,GO:0015999,GO:0016450,GO:0003890,GO:0003895,GO:0016000,GO:0016451,GO:0003891,GO:0016448,GO:0016452 2.7.7.7 DNA-directed DNA polymerase.
2194574 YP_788158.1 116053723 4582 3169 4278 + PA14_00030 recF DNA replication and repair protein RecF (NCBI) COG1195 L Recombinational DNA repair ATPase (RecF pathway) TIGR00611 DNA replication and repair protein RecF [recF] DNA metabolism:DNA replication, recombination, and repair GO:0006281,GO:0005694,GO:0005524,GO:0017111,GO:0003697
2194575 YP_788159.1 116053724 4582 4275 6695 + PA14_00050 gyrB DNA gyrase subunit B (NCBI) COG187 L Type IIA topoisomerase (DNA gyrase/topo II, topoisomerase IV), B subunit TIGR01059 DNA gyrase, B subunit [gyrB] DNA metabolism:DNA replication, recombination, and repair GO:0006304,GO:0006265,GO:0005694,GO:0003918,GO:0005524 5.99.1.3 DNA topoisomerase (ATP-hydrolyzing).
2194576 YP_788160.1 116053725 4582 7791 7018 - PA14_00060 PA14_00060 putative acyltransferase (NCBI) COG204 I 1-acyl-sn-glycerol-3-phosphate acyltransferase GO:0008152,GO:0003841 2.3.1.51 1-acylglycerol-3-phosphate O-acyltransferase.
2194577 YP_788161.1 116053726 4582 8339 7803 - PA14_00070 PA14_00070 putative histidinol-phosphatase (NCBI) COG241 E Histidinol phosphatase and related phosphatases TIGR01656 histidinol-phosphate phosphatase domain,TIGR01662 HAD hydrolase, family IIIA Unknown function:Enzymes of unknown specificity GO:0000105,GO:0004401 3.1.3.-
## While waiting for that to download, maybe grab a GenBank genome for it, too.
## The first candidate was accession NC_002516.2 (downloaded to reference/),
## but that record is strain PAO1 rather than PA14.
## NC_008463.1 appears to be the PA14 genome (TODO confirm).
## Keep a CSV copy of the complete MicrobesOnline annotation table.
write.csv(x=paeruginosa_annotations, file="paeruginosa_microbes_annotations.csv")
## Load the PA14 GenBank annotations, using pa14.rda as a local cache so the
## record only has to be downloaded and parsed once.
## The cache is loaded into a scratch environment so that whatever else the
## file contains cannot overwrite objects in the workspace.
pa <- new.env()
test <- try(load("pa14.rda", envir=pa))
pa14 <- NULL
## inherits() is the robust test for a failed try(); comparing class() with
## `==` breaks as soon as an object carries more than one class.
## Also fall back to downloading when the cache file loads but does not
## actually contain a pa14 object, instead of silently leaving pa14 as NULL.
if (inherits(test, "try-error") || is.null(pa$pa14)) {
    pa14 <- load_genbank_annotations(accession="NC_008463.1")
    ## save a copy of this data structure to avoid having to redownload it
    save(list=c("pa14"), file="pa14.rda")
} else {
    pa14 <- pa$pa14
}
## Drop the scratch environment on both paths so the workspace ends up the
## same whether or not the cache existed.
rm(pa)

## Reduce the MicrobesOnline table to the four columns used downstream.
pa14_annot <- paeruginosa_annotations[, c("sysName", "name", "desc", "COGDesc")]
## Flag entries whose "name" begins with the PA14_ prefix, then blank those
## names out while leaving every other column untouched.
pa_names <- grepl(x=pa14_annot[["name"]], pattern="^PA14_")
pa14_annot[["name"]][pa_names] <- ""
## Write the trimmed table next to the full export.
write.csv(pa14_annot, file="pa14_annotations.csv")

1.1 gff annotations

Unfortunately, the gene IDs in the Pseudomonas GFF file do not appear to match the ones returned by MicrobesOnline. So let us load the annotations from the GFF file and see whether the two sets can be cross-referenced.

## Read the PA14 GFF file into a data frame of features.
gff_annot <- load_gff_annotations(gff="reference/paeruginosa_pa14.gff")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Returning a df with 16 columns and 11946 rows.
## Use the Parent column as row names; make.names() turns the values into
## syntactically valid names and unique=TRUE disambiguates repeats.
rownames(gff_annot) <- make.names(gff_annot$Parent, unique=TRUE)
## Coerce the join key to character before merging.
gff_annot$Alias <- as.character(gff_annot$Alias)
## Join GFF features to the MicrobesOnline table on Alias == sysName.
## all.y=TRUE keeps every MicrobesOnline row even when no GFF feature matches it.
all_annot <- merge(gff_annot, pa14_annot, by.x="Alias", by.y="sysName", all.y=TRUE)
## Re-key the merged table by the ID column.
## NOTE(review): rows kept only because of all.y=TRUE presumably have an NA ID,
## which make.names() would turn into "NA."-style row names -- confirm.
rownames(all_annot) <- make.names(all_annot$ID, unique=TRUE)
## Print the R version, platform, locale, and package versions used for this run.
pander::pander(sessionInfo())

R version 3.5.1 (2018-07-02)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C

attached base packages: stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: hpgltools(v.2018.03)

loaded via a namespace (and not attached): Biobase(v.2.40.0), httr(v.1.3.1), bit64(v.0.9-7), foreach(v.1.4.4), assertthat(v.0.2.0), highr(v.0.7), stats4(v.3.5.1), pander(v.0.6.2), selectr(v.0.4-1), blob(v.1.1.1), GenomeInfoDbData(v.1.1.0), Rsamtools(v.1.32.2), yaml(v.2.1.19), progress(v.1.2.0), lattice(v.0.20-35), pillar(v.1.3.0), RSQLite(v.2.1.1), backports(v.1.1.2), glue(v.1.3.0), digest(v.0.6.15), GenomicRanges(v.1.32.4), XVector(v.0.20.0), rvest(v.0.3.2), colorspace(v.1.3-2), htmltools(v.0.3.6), Matrix(v.1.2-14), plyr(v.1.8.4), XML(v.3.98-1.12), pkgconfig(v.2.0.1), devtools(v.1.13.6), biomaRt(v.2.36.1), zlibbioc(v.1.26.0), purrr(v.0.2.5), scales(v.0.5.0), BiocParallel(v.1.14.2), tibble(v.1.4.2), IRanges(v.2.14.10), ggplot2(v.3.0.0), withr(v.2.1.2), SummarizedExperiment(v.1.10.1), GenomicFeatures(v.1.32.0), BiocGenerics(v.0.26.0), lazyeval(v.0.2.1), magrittr(v.1.5), crayon(v.1.3.4), memoise(v.1.1.0), evaluate(v.0.11), xml2(v.1.2.0), tools(v.3.5.1), data.table(v.1.11.4), prettyunits(v.1.0.2), hms(v.0.4.2), matrixStats(v.0.53.1), stringr(v.1.3.1), S4Vectors(v.0.18.3), munsell(v.0.5.0), DelayedArray(v.0.6.1), AnnotationDbi(v.1.42.1), bindrcpp(v.0.2.2), Biostrings(v.2.48.0), compiler(v.3.5.1), GenomeInfoDb(v.1.16.0), rlang(v.0.2.1), grid(v.3.5.1), RCurl(v.1.95-4.11), iterators(v.1.0.10), bitops(v.1.0-6), base64enc(v.0.1-3), rmarkdown(v.1.10), gtable(v.0.2.0), codetools(v.0.2-15), DBI(v.1.0.0), roxygen2(v.6.0.1), curl(v.3.2), R6(v.2.2.2), GenomicAlignments(v.1.16.0), knitr(v.1.20), dplyr(v.0.7.6), rtracklayer(v.1.40.3), bit(v.1.1-14), bindr(v.0.1.1), commonmark(v.1.5), rprojroot(v.1.3-2), stringi(v.1.2.3), parallel(v.3.5.1), Rcpp(v.0.12.17) and tidyselect(v.0.2.4)

## Report which hpgltools commit was used to build this document.
message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset c730ef178f8e57bbf3819e21cf5e6cfe879e6328
## R> packrat::restore()
## This is hpgltools commit: Fri Jul 13 17:21:39 2018 -0400: c730ef178f8e57bbf3819e21cf5e6cfe879e6328
## Build the save-file name as <rmd basename>-v<version>.rda.xz.
## The pattern is anchored with `$` so that only a trailing ".Rmd" extension is
## stripped, and `replacement` is spelled out instead of relying on partial
## argument matching of `replace`.
this_save <- paste0(gsub(pattern="\\.Rmd$", replacement="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 01_annotation-v20171019.rda.xz
## Hand the file name to saveme(); sm() wraps the call and its result is kept in tmp.
tmp <- sm(saveme(filename=this_save))
LS0tCnRpdGxlOiAiMjAxNzEwMTk6IFAuYWVydWdpbm9zYSAyMDE3OiBhbm5vdGF0aW9uIGRhdGEiCmF1dGhvcjogImF0YiBhYmVsZXdAZ21haWwuY29tIgpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiCm91dHB1dDoKIGh0bWxfZG9jdW1lbnQ6CiAgY29kZV9kb3dubG9hZDogdHJ1ZQogIGNvZGVfZm9sZGluZzogc2hvdwogIGZpZ19jYXB0aW9uOiB0cnVlCiAgZmlnX2hlaWdodDogNwogIGZpZ193aWR0aDogNwogIGhpZ2hsaWdodDogZGVmYXVsdAogIGtlZXBfbWQ6IGZhbHNlCiAgbW9kZTogc2VsZmNvbnRhaW5lZAogIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogIHNlbGZfY29udGFpbmVkOiB0cnVlCiAgdGhlbWU6IHJlYWRhYmxlCiAgdG9jOiB0cnVlCiAgdG9jX2Zsb2F0OgogICAgY29sbGFwc2VkOiBmYWxzZQogICAgc21vb3RoX3Njcm9sbDogZmFsc2UKLS0tCgo8c3R5bGU+CiAgYm9keSAubWFpbi1jb250YWluZXIgewogICAgbWF4LXdpZHRoOiAxNjAwcHg7CiAgfQo8L3N0eWxlPgoKYGBge3Igb3B0aW9ucywgaW5jbHVkZT1GQUxTRX0KbGlicmFyeShocGdsdG9vbHMpCnR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgifi9ocGdsdG9vbHMiKQprbml0cjo6b3B0c19rbml0JHNldCgKICAgICAgICAgICAgICAgICAgIHByb2dyZXNzPVRSVUUsCiAgICAgICAgICAgICAgICAgICB2ZXJib3NlPVRSVUUsCiAgICAgICAgICAgICAgICAgICB3aWR0aD05MCwKICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KAogICAgICAgICAgICAgICAgICAgIGVycm9yPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTgsCiAgICAgICAgICAgICAgICAgICAgZmlnLmhlaWdodD04LAogICAgICAgICAgICAgICAgICAgIGRwaT05NikKb2xkX29wdGlvbnMgPC0gb3B0aW9ucygKICBkaWdpdHM9NCwKICBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFLAogIGtuaXRyLmR1cGxpY2F0ZS5sYWJlbD0iYWxsb3ciKQpnZ3Bsb3QyOjp0aGVtZV9zZXQoZ2dwbG90Mjo6dGhlbWVfYncoYmFzZV9zaXplPTEwKSkKc2V0LnNlZWQoMSkKdmVyIDwtICIyMDE3MTAxOSIKcHJldmlvdXNfZmlsZSA8LSAiaW5kZXguUm1kIgoKdG1wIDwtIHRyeShzbShsb2FkbWUoZmlsZW5hbWU9cGFzdGUwKGdzdWIocGF0dGVybj0iXFwuUm1kIiwgcmVwbGFjZT0iIiwgeD1wcmV2aW91c19maWxlKSwgIi12IiwgdmVyLCAiLnJkYS54eiIpKSkpCnJtZF9maWxlIDwtICIwMV9hbm5vdGF0aW9uLlJtZCIKYGBgCgojIEFubm90YXRpb24gdmVyc2lvbjogYHIgdmVyYAoKVGhlIGZvbGxvd2luZyBzZWN0aW9uIGxvYWRzIHRoZSBtaWNyb2Jlc29ubGluZSBhbmQgZ2VuYmFuayBhbm5vdGF0aW9ucyBmb3IgUHNldWRvbW9uYXMgYWVydWdpbm9zYS4KCmBgYHtyIGFubm90YXRpb259CiMjIExvb2tzIGxpa2UgaXQgaXMgdGF4b24gSUQgMjA4OTYzCnBhZXJ1Z2lub3NhX2Fubm90YXRpb25zIDwtIGxvYWRfbWljcm9iZXNvbmxpbmVfYW5ub3RhdGlvbnMoaWQ9MjA4OTYz
KQprbml0cjo6a2FibGUoaGVhZChwYWVydWdpbm9zYV9hbm5vdGF0aW9ucykpCiMjIFdoaWxlIHdhaXRpbmcgZm9yIHRoYXQgdG8gZG93bmxvYWQsIG1heWJlIGdyYWIgYSBnZW5iYW5rIGdlbm9tZSBmb3IgaXQsIHRvby4KIyMgTG9va3MgbGlrZSB0aGUgZ2Vub21lIGlzIGFjY2Vzc2lvbiBOQ18wMDI1MTYuMiwgZG93bmxvYWRpbmcgaXQgdG8gcmVmZXJlbmNlLwojIyBjcmFwIGluIGEgaGF0LCB0aGF0IGlzIHN0cmFpbiBwYTAxLCBkaXJ0eSB0cmljazsgTkNfMDA4NDYzLjEgaXMgcGExNCBJIHRoaW5rLgp3cml0ZS5jc3YoeD1wYWVydWdpbm9zYV9hbm5vdGF0aW9ucywgZmlsZT0icGFlcnVnaW5vc2FfbWljcm9iZXNfYW5ub3RhdGlvbnMuY3N2IikKIyMgb2ggeWVhaCBJIHdyb3RlIHNvbWV0aGluZyB3aGljaCBzaG91bGQgZG93bmxvYWQgYW5kIHBhcnNlIGdlbmJhbmsgZmlsZXMuLi4KcGEgPC0gbmV3LmVudigpCnRlc3QgPC0gdHJ5KGxvYWQoInBhMTQucmRhIiwgZW52aXI9cGEpKQpwYTE0IDwtIE5VTEwKaWYgKGNsYXNzKHRlc3QpID09ICJ0cnktZXJyb3IiKSB7CiAgICBwYTE0IDwtIGxvYWRfZ2VuYmFua19hbm5vdGF0aW9ucyhhY2Nlc3Npb249Ik5DXzAwODQ2My4xIikKICAgICMjIHNhdmUgYSBjb3B5IG9mIHRoaXMgZGF0YSBzdHJ1Y3R1cmUgdG8gYXZvaWQgaGF2aW5nIHRvIHJlZG93bmxvYWQgaXQKICAgIHNhdmUobGlzdD1jKCJwYTE0IiksIGZpbGU9InBhMTQucmRhIikKfSBlbHNlIHsKICAgIHBhMTQgPC0gcGEkcGExNAogICAgcm0ocGEpCn0KCnBhMTRfYW5ub3QgPC0gcGFlcnVnaW5vc2FfYW5ub3RhdGlvbnNbLCBjKCJzeXNOYW1lIiwgIm5hbWUiLCAiZGVzYyIsICJDT0dEZXNjIildCnBhX25hbWVzIDwtIGdyZXBsKHBhdHRlcm49Il5QQTE0XyIsIHg9cGExNF9hbm5vdCRuYW1lKQpwYTE0X2Fubm90W3BhX25hbWVzLCAibmFtZSJdIDwtICIiCndyaXRlLmNzdih4PXBhMTRfYW5ub3QsIGZpbGU9InBhMTRfYW5ub3RhdGlvbnMuY3N2IikKYGBgCgojIyBnZmYgYW5ub3RhdGlvbnMKClVuZm9ydHVuYXRlbHksIGl0IGFwcGVhcnMgdGhhdCBJRHMgZnJvbSB0aGUgUHNldWRvbW9uYXMgZ2ZmIGRvIG5vdCBtYXRjaCB3aGF0IEkKZ2V0IGZyb20gbWljcm9iZXNvbmxpbmUuICBTbyBsZXQgdXMgZ2V0IHRoZW0gZnJvbSB0aGUgZ2ZmIGZpbGUgYW5kIHNlZSBpZiBJIGNhbgp4cmVmIHRoZW0uCgpgYGB7ciBnZmZfYW5ub3RhdGlvbnN9CmdmZl9hbm5vdCA8LSBsb2FkX2dmZl9hbm5vdGF0aW9ucyhnZmY9InJlZmVyZW5jZS9wYWVydWdpbm9zYV9wYTE0LmdmZiIpCnJvd25hbWVzKGdmZl9hbm5vdCkgPC0gbWFrZS5uYW1lcyhnZmZfYW5ub3QkUGFyZW50LCB1bmlxdWU9VFJVRSkKZ2ZmX2Fubm90JEFsaWFzIDwtIGFzLmNoYXJhY3RlcihnZmZfYW5ub3QkQWxpYXMpCmBgYAoKYGBge3IgbWVyZ2VkX2Fubm90YXRpb25zfQphbGxfYW5ub3QgPC0gbWVyZ2UoZ2ZmX2Fubm90LCBwYTE0X2Fubm90LCBieS54PSJBbGlhcyIsIGJ5Lnk9InN5c05h
bWUiLCBhbGwueT1UUlVFKQpyb3duYW1lcyhhbGxfYW5ub3QpIDwtIG1ha2UubmFtZXMoYWxsX2Fubm90JElELCB1bmlxdWU9VFJVRSkKYGBgCgpgYGB7ciBzYXZlbWV9CnBhbmRlcjo6cGFuZGVyKHNlc3Npb25JbmZvKCkpCm1lc3NhZ2UocGFzdGUwKCJUaGlzIGlzIGhwZ2x0b29scyBjb21taXQ6ICIsIGdldF9naXRfY29tbWl0KCkpKQp0aGlzX3NhdmUgPC0gcGFzdGUwKGdzdWIocGF0dGVybj0iXFwuUm1kIiwgcmVwbGFjZT0iIiwgeD1ybWRfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKQptZXNzYWdlKHBhc3RlMCgiU2F2aW5nIHRvICIsIHRoaXNfc2F2ZSkpCnRtcCA8LSBzbShzYXZlbWUoZmlsZW5hbWU9dGhpc19zYXZlKSkKYGBgCg==