The following section loads the MicrobesOnline and GenBank annotations for Pseudomonas aeruginosa strain PA14.
## Looks like it is taxon ID 208963
## Keep the ID in one place and pass the variable, so the value used for the
## download cannot drift from the one recorded in id.
id <- 208963
paeruginosa_annotations <- load_microbesonline_annotations(id)
## The species being downloaded is: Pseudomonas aeruginosa UCBPP-PA14
## Show the first few rows of the downloaded annotations as a table.
knitr::kable(head(paeruginosa_annotations))
locusId | accession | GI | scaffoldId | start | stop | strand | sysName | name | desc | COG | COGFun | COGDesc | TIGRFam | TIGRRoles | GO | EC | ECDesc |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2194572 | YP_788156.1 | 116053721 | 4582 | 483 | 2027 | + | PA14_00010 | dnaA | chromosomal replication initiator protein DnaA (NCBI) | COG593 | L | ATPase involved in DNA replication initiation | TIGR00362 chromosomal replication initiator protein DnaA [dnaA] | DNA metabolism:DNA replication, recombination, and repair | GO:0006270,GO:0006275,GO:0003688,GO:0017111,GO:0005524 | ||
2194573 | YP_788157.1 | 116053722 | 4582 | 2056 | 3159 | + | PA14_00020 | dnaN | DNA polymerase III, beta chain (NCBI) | COG592 | L | DNA polymerase sliding clamp subunit (PCNA homolog) | TIGR00663 DNA polymerase III, beta subunit [dnaN] | DNA metabolism:DNA replication, recombination, and repair | GO:0006260,GO:0003677,GO:0003893,GO:0008408,GO:0016449,GO:0019984,GO:0003889,GO:0003894,GO:0015999,GO:0016450,GO:0003890,GO:0003895,GO:0016000,GO:0016451,GO:0003891,GO:0016448,GO:0016452 | 2.7.7.7 | DNA-directed DNA polymerase. |
2194574 | YP_788158.1 | 116053723 | 4582 | 3169 | 4278 | + | PA14_00030 | recF | DNA replication and repair protein RecF (NCBI) | COG1195 | L | Recombinational DNA repair ATPase (RecF pathway) | TIGR00611 DNA replication and repair protein RecF [recF] | DNA metabolism:DNA replication, recombination, and repair | GO:0006281,GO:0005694,GO:0005524,GO:0017111,GO:0003697 | ||
2194575 | YP_788159.1 | 116053724 | 4582 | 4275 | 6695 | + | PA14_00050 | gyrB | DNA gyrase subunit B (NCBI) | COG187 | L | Type IIA topoisomerase (DNA gyrase/topo II, topoisomerase IV), B subunit | TIGR01059 DNA gyrase, B subunit [gyrB] | DNA metabolism:DNA replication, recombination, and repair | GO:0006304,GO:0006265,GO:0005694,GO:0003918,GO:0005524 | 5.99.1.3 | DNA topoisomerase (ATP-hydrolyzing). |
2194576 | YP_788160.1 | 116053725 | 4582 | 7791 | 7018 | - | PA14_00060 | PA14_00060 | putative acyltransferase (NCBI) | COG204 | I | 1-acyl-sn-glycerol-3-phosphate acyltransferase | | | GO:0008152,GO:0003841 | 2.3.1.51 | 1-acylglycerol-3-phosphate O-acyltransferase. |
2194577 | YP_788161.1 | 116053726 | 4582 | 8339 | 7803 | - | PA14_00070 | PA14_00070 | putative histidinol-phosphatase (NCBI) | COG241 | E | Histidinol phosphatase and related phosphatases | TIGR01656 histidinol-phosphate phosphatase domain,TIGR01662 HAD hydrolase, family IIIA | Unknown function:Enzymes of unknown specificity | GO:0000105,GO:0004401 | 3.1.3.- | |
## While waiting for that to download, maybe grab a genbank genome for it, too.
## Looks like the genome is accession NC_002516.2, downloading it to reference/
## Correction: NC_002516.2 is strain PAO1, not PA14; NC_008463.1 is PA14, I think.
## Keep a csv copy of the microbesonline annotations.
write.csv(x=paeruginosa_annotations, file="paeruginosa_microbes_annotations.csv")
## load_genbank_annotations() should download and parse genbank files.
## Reuse the cached copy in pa14.rda when it can be read and actually contains
## pa14; otherwise download the annotations again and refresh the cache.
pa <- new.env()
## load() fails when pa14.rda is missing or unreadable; try() captures that
## failure so the download branch below can run instead of stopping here.
test <- try(load("pa14.rda", envir=pa))
pa14 <- NULL
## inherits() is the reliable way to detect a failed try().  A cache file
## which loads but lacks pa14 is treated like a missing cache rather than
## silently leaving pa14 as NULL.
if (inherits(test, "try-error") || !exists("pa14", envir=pa, inherits=FALSE)) {
  pa14 <- load_genbank_annotations(accession="NC_008463.1")
  ## save a copy of this data structure to avoid having to redownload it
  save(list=c("pa14"), file="pa14.rda")
} else {
  pa14 <- pa$pa14
  rm(pa)
}
## Reduce the microbesonline table to its identifier and description columns.
pa14_annot <- paeruginosa_annotations[, c("sysName", "name", "desc", "COGDesc")]
## Names beginning with PA14_ look like locus tags rather than gene names,
## so replace them with an empty string.
pa_names <- grepl(pattern="^PA14_", x=pa14_annot[["name"]])
pa14_annot[["name"]][pa_names] <- ""
## Keep a csv copy of the reduced table.
write.csv(x=pa14_annot, file="pa14_annotations.csv")
Unfortunately, it appears that the IDs in the Pseudomonas gff file do not match the ones I get from MicrobesOnline. So let us load the IDs from the gff file and see if I can cross-reference the two sets.
## Read the PA14 gff annotations from the reference/ directory.
gff_annot <- load_gff_annotations(gff="reference/paeruginosa_pa14.gff")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Returning a df with 16 columns and 11946 rows.
## Name the gff rows after their Parent values, made syntactically valid and unique.
rownames(gff_annot) <- make.names(gff_annot[["Parent"]], unique=TRUE)
## Alias is the join key against sysName, so coerce it to character first.
gff_annot[["Alias"]] <- as.character(gff_annot[["Alias"]])
## all.y=TRUE keeps every microbesonline row, whether or not the gff has a
## matching Alias.
all_annot <- merge(x=gff_annot, y=pa14_annot, by.x="Alias", by.y="sysName", all.y=TRUE)
## Name the merged rows after the gff ID column.
rownames(all_annot) <- make.names(all_annot[["ID"]], unique=TRUE)
## Record the R version, platform, and package versions used for this run.
pander::pander(sessionInfo())
R version 3.5.1 (2018-07-02)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: hpgltools(v.2018.03)
loaded via a namespace (and not attached): Biobase(v.2.40.0), httr(v.1.3.1), bit64(v.0.9-7), foreach(v.1.4.4), assertthat(v.0.2.0), highr(v.0.7), stats4(v.3.5.1), pander(v.0.6.2), selectr(v.0.4-1), blob(v.1.1.1), GenomeInfoDbData(v.1.1.0), Rsamtools(v.1.32.2), yaml(v.2.1.19), progress(v.1.2.0), lattice(v.0.20-35), pillar(v.1.3.0), RSQLite(v.2.1.1), backports(v.1.1.2), glue(v.1.3.0), digest(v.0.6.15), GenomicRanges(v.1.32.4), XVector(v.0.20.0), rvest(v.0.3.2), colorspace(v.1.3-2), htmltools(v.0.3.6), Matrix(v.1.2-14), plyr(v.1.8.4), XML(v.3.98-1.12), pkgconfig(v.2.0.1), devtools(v.1.13.6), biomaRt(v.2.36.1), zlibbioc(v.1.26.0), purrr(v.0.2.5), scales(v.0.5.0), BiocParallel(v.1.14.2), tibble(v.1.4.2), IRanges(v.2.14.10), ggplot2(v.3.0.0), withr(v.2.1.2), SummarizedExperiment(v.1.10.1), GenomicFeatures(v.1.32.0), BiocGenerics(v.0.26.0), lazyeval(v.0.2.1), magrittr(v.1.5), crayon(v.1.3.4), memoise(v.1.1.0), evaluate(v.0.11), xml2(v.1.2.0), tools(v.3.5.1), data.table(v.1.11.4), prettyunits(v.1.0.2), hms(v.0.4.2), matrixStats(v.0.53.1), stringr(v.1.3.1), S4Vectors(v.0.18.3), munsell(v.0.5.0), DelayedArray(v.0.6.1), AnnotationDbi(v.1.42.1), bindrcpp(v.0.2.2), Biostrings(v.2.48.0), compiler(v.3.5.1), GenomeInfoDb(v.1.16.0), rlang(v.0.2.1), grid(v.3.5.1), RCurl(v.1.95-4.11), iterators(v.1.0.10), bitops(v.1.0-6), base64enc(v.0.1-3), rmarkdown(v.1.10), gtable(v.0.2.0), codetools(v.0.2-15), DBI(v.1.0.0), roxygen2(v.6.0.1), curl(v.3.2), R6(v.2.2.2), GenomicAlignments(v.1.16.0), knitr(v.1.20), dplyr(v.0.7.6), rtracklayer(v.1.40.3), bit(v.1.1-14), bindr(v.0.1.1), commonmark(v.1.5), rprojroot(v.1.3-2), stringi(v.1.2.3), parallel(v.3.5.1), Rcpp(v.0.12.17) and tidyselect(v.0.2.4)
## Report which hpgltools commit is in use, as returned by get_git_commit().
message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset c730ef178f8e57bbf3819e21cf5e6cfe879e6328
## R> packrat::restore()
## This is hpgltools commit: Fri Jul 13 17:21:39 2018 -0400: c730ef178f8e57bbf3819e21cf5e6cfe879e6328
## Build the save file name as <rmd name without .Rmd>-v<ver>.rda.xz
## The pattern is anchored and sub() is used so that only a trailing .Rmd
## extension is removed; the replacement argument is spelled out in full
## instead of relying on partial argument matching.
this_save <- paste0(sub(pattern="\\.Rmd$", replacement="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 01_annotation_20180718-v20180718.rda.xz
## Write the save file via saveme(), using the file name held in this_save.
## NOTE(review): sm() presumably silences the output of saveme() -- confirm.
tmp <- sm(saveme(filename=this_save))