1 Gathering Mus musculus annotation data

BioMart is a fairly reliable source of mouse annotation data.

## Attach the Mus.musculus annotation package.  The package name lives in
## mouse_pkg so that it is spelled exactly once.
mouse_pkg <- "Mus.musculus"
## Only call require.auto() when the package is missing from every library
## path (presumably it installs the package -- verify against hpgltools).
if (!(mouse_pkg %in% .packages(all.available=TRUE))) {
    require.auto(mouse_pkg)
}
## character.only=TRUE makes library() read the package name from the variable
## instead of treating the symbol 'mouse_pkg' itself as the package name.
tt <- sm(library(mouse_pkg, character.only=TRUE))
## Request transcript-keyed annotations (keytype "ensembltrans") from the
## Mus.musculus object, asking for the cdschrom, definition and genename fields.
## NOTE(review): the recorded output below shows this call failing inside
## .tableJoinSelector(), so mm_tx_org is never created in this build -- TODO
## confirm whether this keytype/fields combination is supported.
mm_tx_org <- sm(load_orgdb_annotations(Mus.musculus, keytype="ensembltrans",
                                 fields=c("cdschrom", "definition", "genename")))
## Error in .tableJoinSelector(tKey): No query for this combination of tables. Please add gtc to the interpolator
## Keep only the 'genes' element of the result (fails here because the call
## above did not produce mm_tx_org).
mm_tx_genes <- mm_tx_org$genes
## Error in eval(expr, envir, enclos): object 'mm_tx_org' not found
## The inclusion of the 'definition' columns means that we also end up with a bunch
## of redundant entries, like 200,000 of them; therefore I am going to whittle down
## the list.
## NOTE(review): Mus.musculus was already attached a few lines earlier, so this
## second library() call looks redundant -- confirm before removing.
tt <- sm(library(Mus.musculus))
##mm_go <- load_orgdb_go(orgdb=Mus.musculus)
## Gene ontology data is taken from biomart; the orgdb variant is left
## commented out above.
mm_go <- load_biomart_go(species="mmusculus")
## The biomart annotations file already exists, loading from it.
mm_annot <- load_biomart_annotations(species="mmusculus")
## The biomart annotations file already exists, loading from it.
## Build a two-column gene ID / length table, with row names derived from the
## gene IDs and made unique by make.names().
## NOTE(review): the recorded "incorrect number of dimensions" error shows that
## mm_annot is not two-dimensional here; presumably load_biomart_annotations()
## returns a list wrapping the table -- TODO confirm and index the right element.
## The two follow-on errors are a consequence of mm_lengths never being created.
mm_lengths <- mm_annot[, c("geneID", "length")]
## Error in mm_annot[, c("geneID", "length")]: incorrect number of dimensions
rownames(mm_lengths) <- make.names(mm_lengths[["geneID"]], unique=TRUE)
## Error in make.names(mm_lengths[["geneID"]], unique = TRUE): object 'mm_lengths' not found
colnames(mm_lengths) <- c("ID", "length")
## Error in colnames(mm_lengths) <- c("ID", "length"): object 'mm_lengths' not found
## Whittle down the transcript table: keep only rows whose row names contain no
## literal '.' (presumably the dotted names are the redundant entries mentioned
## above -- verify).  Both statements fail in this build because mm_tx_genes
## was never created.
mm_tx_unique <- !grepl(pattern="\\.", x=rownames(mm_tx_genes))
## Error in rownames(mm_tx_genes): object 'mm_tx_genes' not found
mm_tx_genes <- mm_tx_genes[mm_tx_unique, ]
## Error in eval(expr, envir, enclos): object 'mm_tx_genes' not found
##mmtx_annotations <- get_biomart_annotations(species="mmusculus")
## Query Ensembl directly through biomaRt, using the dec2015 archive host and
## the mouse gene dataset.
mart <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL", host="dec2015.archive.ensembl.org")
dataset <- "mmusculus_gene_ensembl"
ensembl <- try(biomaRt::useDataset(dataset, mart=mart))
## Stop here with a clear message rather than letting listAttributes()/getBM()
## fail on a 'try-error' object further down.
if (inherits(ensembl, "try-error")) {
    stop("Unable to select the biomart dataset: ", dataset, call.=FALSE)
}
lots_of_rows <- biomaRt::listAttributes(ensembl)  ## List of possible attributes
## wanted_attributes <- c("ensembl_gene_id", "ensembl_transcript_id", "ensembl_peptide_id", "chromosome_name", "start_position","end_position","description", "entrezgene","hgnc_symbol","hgnc_id","uniprot_sptrembl","uniprot_swissprot","uniprot_genename")  ## attributes Lucia and I chose, but too many
## The attributes are requested in three smaller groups; every group carries
## the gene and transcript IDs so that the results can be joined afterwards.
wanted_attributes_global <- c("ensembl_gene_id", "ensembl_transcript_id", "chromosome_name",
                              "start_position", "end_position", "strand", "description")
wanted_attributes_names <- c("ensembl_gene_id", "ensembl_transcript_id", "entrezgene", "hgnc_symbol", "hgnc_id")
wanted_attributes_uniprot <- c("ensembl_gene_id", "ensembl_transcript_id", "uniprot_sptrembl", "uniprot_swissprot")

wanted_global_annotations <- biomaRt::getBM(attributes=wanted_attributes_global, mart=ensembl)
dim(wanted_global_annotations)
## [1] 114083      7
## NOTE(review): wanted_names_annotations is fetched but not merged below.
wanted_names_annotations <- biomaRt::getBM(attributes=wanted_attributes_names, mart=ensembl)
wanted_uniprot_annotations <- biomaRt::getBM(attributes=wanted_attributes_uniprot, mart=ensembl)
dim(wanted_uniprot_annotations)
## [1] 57040     4
## Both tables carry ensembl_gene_id and ensembl_transcript_id.  Joining on the
## gene ID alone pairs every transcript row of a gene with every uniprot row of
## that gene and leaves duplicated ensembl_transcript_id.x/.y columns, so join
## on both shared keys.  all.y=TRUE still keeps every uniprot row.
## NOTE(review): anything downstream that referred to ensembl_transcript_id.x
## or .y must now use ensembl_transcript_id.
wanted_annotations <- merge(wanted_global_annotations, wanted_uniprot_annotations,
                            by=c("ensembl_gene_id", "ensembl_transcript_id"), all.y=TRUE)

length(unique(wanted_annotations$ensembl_gene_id))
## [1] 22683
## A gene can still span several rows, so make.names() appends suffixes to keep
## the row names unique.
rownames(wanted_annotations) <- make.names(wanted_annotations[["ensembl_gene_id"]], unique=TRUE)
## Print the R version, platform, locale and attached/loaded packages for this build.
pander::pander(sessionInfo())

R version 3.4.4 (2018-03-15)

**Platform:** x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C

attached base packages: parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: Mus.musculus(v.1.3.1), TxDb.Mmusculus.UCSC.mm10.knownGene(v.3.4.0), org.Mm.eg.db(v.3.5.0), GO.db(v.3.5.0), OrganismDbi(v.1.20.0), GenomicFeatures(v.1.30.3), GenomicRanges(v.1.30.3), GenomeInfoDb(v.1.14.0), AnnotationDbi(v.1.40.0), IRanges(v.2.12.0), S4Vectors(v.0.16.0), Biobase(v.2.38.0), BiocGenerics(v.0.24.0) and hpgltools(v.2018.03)

loaded via a namespace (and not attached): Rcpp(v.0.12.16), lattice(v.0.20-35), prettyunits(v.1.0.2), Rsamtools(v.1.30.0), Biostrings(v.2.46.0), assertthat(v.0.2.0), rprojroot(v.1.3-2), digest(v.0.6.15), foreach(v.1.4.4), R6(v.2.2.2), plyr(v.1.8.4), backports(v.1.1.2), RSQLite(v.2.0), evaluate(v.0.10.1), httr(v.1.3.1), ggplot2(v.2.2.1), BiocInstaller(v.1.28.0), pillar(v.1.2.1), zlibbioc(v.1.24.0), rlang(v.0.2.0.9001), progress(v.1.1.2), curl(v.3.1), lazyeval(v.0.2.1), data.table(v.1.10.4-3), blob(v.1.1.0), Matrix(v.1.2-12), rmarkdown(v.1.9), devtools(v.1.13.5), RMySQL(v.0.10.14), BiocParallel(v.1.12.0), pander(v.0.6.1), stringr(v.1.3.0), RCurl(v.1.95-4.10), bit(v.1.1-12), biomaRt(v.2.34.2), munsell(v.0.4.3), DelayedArray(v.0.4.1), compiler(v.3.4.4), rtracklayer(v.1.38.3), pkgconfig(v.2.0.1), base64enc(v.0.1-3), htmltools(v.0.3.6), SummarizedExperiment(v.1.8.1), tibble(v.1.4.2), GenomeInfoDbData(v.1.0.0), roxygen2(v.6.0.1), codetools(v.0.2-15), matrixStats(v.0.53.1), XML(v.3.98-1.10), withr(v.2.1.2), GenomicAlignments(v.1.14.1), bitops(v.1.0-6), commonmark(v.1.4), grid(v.3.4.4), RBGL(v.1.54.0), gtable(v.0.2.0), DBI(v.0.8), magrittr(v.1.5), scales(v.0.5.0.9000), graph(v.1.56.0), stringi(v.1.1.7), XVector(v.0.18.0), xml2(v.1.2.0), iterators(v.1.0.9), tools(v.3.4.4), bit64(v.0.9-7), yaml(v.2.1.18), colorspace(v.1.3-2), memoise(v.1.1.0) and knitr(v.1.20)

## Report the hpgltools commit this build was made with.
message("This is hpgltools commit: ", get_git_commit())
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 1b009834267dea125ee94934203413fbd606e783
## R> packrat::restore()
## This is hpgltools commit: Mon Apr 23 14:59:56 2018 -0400: 1b009834267dea125ee94934203413fbd606e783
## Announce the output file name, then hand it to saveme().
message("Saving to ", savefile)
## Saving to annotation_mmusculus_v20170628.rda.xz
tmp <- sm(saveme(filename = savefile))
LS0tCnRpdGxlOiAiSS5zY2FwdWxhcmlzIDIwMTcwNjI4OiBJbmZlY3RlZCBtaWNlIGFubm90YXRpb24gaW5mb3JtYXRpb24uIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiBodG1sX2RvY3VtZW50OgogIGNvZGVfZG93bmxvYWQ6IHRydWUKICBjb2RlX2ZvbGRpbmc6IHNob3cKICBmaWdfY2FwdGlvbjogdHJ1ZQogIGZpZ19oZWlnaHQ6IDcKICBmaWdfd2lkdGg6IDcKICBoaWdobGlnaHQ6IGRlZmF1bHQKICBrZWVwX21kOiBmYWxzZQogIG1vZGU6IHNlbGZjb250YWluZWQKICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICBzZWxmX2NvbnRhaW5lZDogdHJ1ZQogIHRoZW1lOiByZWFkYWJsZQogIHRvYzogdHJ1ZQogIHRvY19mbG9hdDoKICAgIGNvbGxhcHNlZDogZmFsc2UKICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCi0tLQoKPHN0eWxlPgogIGJvZHkgLm1haW4tY29udGFpbmVyIHsKICAgIG1heC13aWR0aDogMTYwMHB4OwogIH0KPC9zdHlsZT4KCmBgYHtyIG9wdGlvbnMsIGluY2x1ZGU9RkFMU0V9CmlmICghaXNUUlVFKGdldDAoInNraXBfbG9hZCIpKSkgewogIGxpYnJhcnkoaHBnbHRvb2xzKQogIHR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgifi9ocGdsdG9vbHMiKQogIGtuaXRyOjpvcHRzX2tuaXQkc2V0KHByb2dyZXNzPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZT1UUlVFLAogICAgICAgICAgICAgICAgICAgICAgIHdpZHRoPTkwLAogICAgICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKICBrbml0cjo6b3B0c19jaHVuayRzZXQoZXJyb3I9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTgsCiAgICAgICAgICAgICAgICAgICAgICAgIGZpZy5oZWlnaHQ9OCwKICAgICAgICAgICAgICAgICAgICAgICAgZHBpPTk2KQogIG9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICAgICAga25pdHIuZHVwbGljYXRlLmxhYmVsPSJhbGxvdyIpCiAgZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZT0xMCkpCiAgdmVyIDwtICIyMDE3MDYyOCIKICBwcmV2aW91c19maWxlIDwtIHBhc3RlMCgiMDFfYW5ub3RhdGlvbl9tbXVzY3VsdXNfdiIsIHZlciwgIi5SbWQiKQoKICB0bXAgPC0gdHJ5KHNtKGxvYWRtZShmaWxlbmFtZT1nc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IlxcLnJkYVxcLnh6IiwgeD1wcmV2aW91c19maWxlKSkpKQogIHJtZF9maWxlIDwtIHBhc3RlMCgiYW5ub3RhdGlvbl9tbXVzY3VsdXNfdiIsIHZlciwgIi5SbWQiKQogIHNhdmVmaWxlIDwtIGdzdWIocGF0dGVybj0iXFwuUm1kIiwgcmVwbGFjZT0iXFwucmRhXFwueHoiLCB4PXJtZF9maWxlKQp9CmBgYAoKR2F0aGVyaW5nIE11cyBtdXNjdWx1cyBhbm5vdGF0aW9uIGRhdGEKPT09PT09PT09PT09
PT09PT09PT09PT09PT09PT09PT09PT09PT0KCmJpb21hcnQgaXMgcHJldHR5IHJlbGlhYmxlIGZvciBnZXR0aW5nIG1vdXNlIGFubm90YXRpb24gZGF0YS4KCmBgYHtyIG1vdXNlX2Fubm90YXRpb25zfQptb3VzZV9wa2cgPC0gIk11cy5tdXNjdWx1cyIKaWYgKCJNdXMubXVzY3VsdXMiICVpbiUgLnBhY2thZ2VzKGFsbC5hdmFpbGFibGU9VFJVRSkpIHsKICAgIHR0IDwtIHNtKGxpYnJhcnkoIk11cy5tdXNjdWx1cyIpKQp9IGVsc2UgewogICAgcmVxdWlyZS5hdXRvKCJNdXMubXVzY3VsdXMiKQogICAgdHQgPC0gc20obGlicmFyeSgiTXVzLm11c2N1bHVzIikpCn0KbW1fdHhfb3JnIDwtIHNtKGxvYWRfb3JnZGJfYW5ub3RhdGlvbnMoTXVzLm11c2N1bHVzLCBrZXl0eXBlPSJlbnNlbWJsdHJhbnMiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmaWVsZHM9YygiY2RzY2hyb20iLCAiZGVmaW5pdGlvbiIsICJnZW5lbmFtZSIpKSkKbW1fdHhfZ2VuZXMgPC0gbW1fdHhfb3JnJGdlbmVzCiMjIFRoZSBpbmNsdXNpb24gb2YgdGhlICdkZWZpbml0aW9uJyBjb2x1bW5zIG1lYW5zIHRoYXQgd2UgYWxzbyBlbmQgdXAgd2l0aCBhIGJ1bmNoCiMjIG9mIHJlZHVuZGFudCBlbnRyaWVzLCBsaWtlIDIwMCwwMDAgb2YgdGhlbTsgdGhlcmVmb3JlIEkgYW0gZ29pbmcgdG8gd2hpdHRsZSBkb3duCiMjIHRoZSBsaXN0Lgp0dCA8LSBzbShsaWJyYXJ5KE11cy5tdXNjdWx1cykpCiMjbW1fZ28gPC0gbG9hZF9vcmdkYl9nbyhvcmdkYj1NdXMubXVzY3VsdXMpCm1tX2dvIDwtIGxvYWRfYmlvbWFydF9nbyhzcGVjaWVzPSJtbXVzY3VsdXMiKQoKbW1fYW5ub3QgPC0gbG9hZF9iaW9tYXJ0X2Fubm90YXRpb25zKHNwZWNpZXM9Im1tdXNjdWx1cyIpCm1tX2xlbmd0aHMgPC0gbW1fYW5ub3RbLCBjKCJnZW5lSUQiLCAibGVuZ3RoIildCnJvd25hbWVzKG1tX2xlbmd0aHMpIDwtIG1ha2UubmFtZXMobW1fbGVuZ3Roc1tbImdlbmVJRCJdXSwgdW5pcXVlPVRSVUUpCmNvbG5hbWVzKG1tX2xlbmd0aHMpIDwtIGMoIklEIiwgImxlbmd0aCIpCgptbV90eF91bmlxdWUgPC0gIWdyZXBsKHBhdHRlcm49IlxcLiIsIHg9cm93bmFtZXMobW1fdHhfZ2VuZXMpKQptbV90eF9nZW5lcyA8LSBtbV90eF9nZW5lc1ttbV90eF91bmlxdWUsIF0KIyNtbXR4X2Fubm90YXRpb25zIDwtIGdldF9iaW9tYXJ0X2Fubm90YXRpb25zKHNwZWNpZXM9Im1tdXNjdWx1cyIpCm1hcnQgPC0gYmlvbWFSdDo6dXNlTWFydChiaW9tYXJ0PSJFTlNFTUJMX01BUlRfRU5TRU1CTCIsIGhvc3Q9ImRlYzIwMTUuYXJjaGl2ZS5lbnNlbWJsLm9yZyIpCmRhdGFzZXQgPC0gcGFzdGUwKCJtbXVzY3VsdXNfZ2VuZV9lbnNlbWJsIikKZW5zZW1ibCA8LSB0cnkoYmlvbWFSdDo6dXNlRGF0YXNldChkYXRhc2V0LCBtYXJ0PW1hcnQpKQpsb3RzX29mX3Jvd3MgPC0gYmlvbWFSdDo6bGlzdEF0dHJpYnV0ZXMoZW5zZW1ibCkgICMjIExpc3Qgb2YgcG9zc2libGUgYXR0cmlidXRlcwojIyB3YW50ZWRfYXR0cmli
dXRlcyA8LSBjKCJlbnNlbWJsX2dlbmVfaWQiLCAiZW5zZW1ibF90cmFuc2NyaXB0X2lkIiwgImVuc2VtYmxfcGVwdGlkZV9pZCIsICJjaHJvbW9zb21lX25hbWUiLCAic3RhcnRfcG9zaXRpb24iLCJlbmRfcG9zaXRpb24iLCJkZXNjcmlwdGlvbiIsICJlbnRyZXpnZW5lIiwiaGduY19zeW1ib2wiLCJoZ25jX2lkIiwidW5pcHJvdF9zcHRyZW1ibCIsInVuaXByb3Rfc3dpc3Nwcm90IiwidW5pcHJvdF9nZW5lbmFtZSIpICAjIyBhdHRyaWJ1dGVzIEx1Y2lhIGFuZCBJIGNob3NlLCBidXQgdG9vIG1hbnkKd2FudGVkX2F0dHJpYnV0ZXNfZ2xvYmFsIDwtIGMoImVuc2VtYmxfZ2VuZV9pZCIsICJlbnNlbWJsX3RyYW5zY3JpcHRfaWQiLCAiY2hyb21vc29tZV9uYW1lIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgInN0YXJ0X3Bvc2l0aW9uIiwgImVuZF9wb3NpdGlvbiIsICJzdHJhbmQiLCAiZGVzY3JpcHRpb24iKQp3YW50ZWRfYXR0cmlidXRlc19uYW1lcyA8LSBjKCJlbnNlbWJsX2dlbmVfaWQiLCAiZW5zZW1ibF90cmFuc2NyaXB0X2lkIiwgImVudHJlemdlbmUiLCAiaGduY19zeW1ib2wiLCAiaGduY19pZCIpCndhbnRlZF9hdHRyaWJ1dGVzX3VuaXByb3QgPC0gYygiZW5zZW1ibF9nZW5lX2lkIiwgImVuc2VtYmxfdHJhbnNjcmlwdF9pZCIsICJ1bmlwcm90X3NwdHJlbWJsIiwgInVuaXByb3Rfc3dpc3Nwcm90IikKCndhbnRlZF9nbG9iYWxfYW5ub3RhdGlvbnMgPC0gYmlvbWFSdDo6Z2V0Qk0oYXR0cmlidXRlcz13YW50ZWRfYXR0cmlidXRlc19nbG9iYWwsIG1hcnQ9ZW5zZW1ibCkKZGltKHdhbnRlZF9nbG9iYWxfYW5ub3RhdGlvbnMpCndhbnRlZF9uYW1lc19hbm5vdGF0aW9ucyA8LSBiaW9tYVJ0OjpnZXRCTShhdHRyaWJ1dGVzPXdhbnRlZF9hdHRyaWJ1dGVzX25hbWVzLCBtYXJ0PWVuc2VtYmwpCndhbnRlZF91bmlwcm90X2Fubm90YXRpb25zIDwtIGJpb21hUnQ6OmdldEJNKGF0dHJpYnV0ZXM9d2FudGVkX2F0dHJpYnV0ZXNfdW5pcHJvdCwgbWFydD1lbnNlbWJsKQpkaW0od2FudGVkX3VuaXByb3RfYW5ub3RhdGlvbnMpCndhbnRlZF9hbm5vdGF0aW9ucyA8LSBtZXJnZSh3YW50ZWRfZ2xvYmFsX2Fubm90YXRpb25zLCB3YW50ZWRfdW5pcHJvdF9hbm5vdGF0aW9ucywKICAgICAgICAgICAgICAgICAgICAgICAgICAgIGJ5Lng9ImVuc2VtYmxfZ2VuZV9pZCIsIGJ5Lnk9ImVuc2VtYmxfZ2VuZV9pZCIsIGFsbC55PVRSVUUpCgpsZW5ndGgodW5pcXVlKHdhbnRlZF9hbm5vdGF0aW9ucyRlbnNlbWJsX2dlbmVfaWQpKQpyb3duYW1lcyh3YW50ZWRfYW5ub3RhdGlvbnMpIDwtIG1ha2UubmFtZXMod2FudGVkX2Fubm90YXRpb25zW1siZW5zZW1ibF9nZW5lX2lkIl1dLCB1bmlxdWU9VFJVRSkKYGBgCgpgYGB7ciBzYXZlbWV9CnBhbmRlcjo6cGFuZGVyKHNlc3Npb25JbmZvKCkpCm1lc3NhZ2UocGFzdGUwKCJUaGlzIGlzIGhwZ2x0b29scyBjb21taXQ6ICIsIGdldF9naXRfY29tbWl0KCkpKQptZXNzYWdlKHBhc3RlMCgiU2F2aW5n
IHRvICIsIHNhdmVmaWxlKSkKdG1wIDwtIHNtKHNhdmVtZShmaWxlbmFtZT1zYXZlZmlsZSkpCmBgYAo=