The Orgdb packages provide a quick and dirty method for gathering annotation data, lets see what we get!
mouse_pkg <- "Mus.musculus"
if ("Mus.musculus" %in% .packages(all.available=TRUE)) {
tt <- sm(library("Mus.musculus"))
} else {
require.auto("Mus.musculus")
tt <- sm(library("Mus.musculus"))
}
mm_tx_org <- sm(load_orgdb_annotations(Mus.musculus, keytype="ensembltrans",
fields=c("definition", "genename")))
Now that we have the mouse data loaded into a big data frame, the goal is to pull out the most useful parts of it. We will also combine that information with the extant orgdb data.
mm_tx_genes <- mm_tx_org$genes
## The inclusion of the 'definition' columns means that we also end up with a bunch
## of redundant entries, like 200,000 of them; therefore I am going to whittle down
## the list.
tt <- sm(library(Mus.musculus))
##mm_go <- load_orgdb_go(orgdb=Mus.musculus)
mm_go <- load_biomart_go(species="mmusculus")
## The biomart annotations file already exists, loading from it.
mm_annot <- load_biomart_annotations(species="mmusculus")
## The biomart annotations file already exists, loading from it.
mm_annot <- mm_annot[["annotation"]]
mm_lengths <- mm_annot[, c("geneID", "length")]
rownames(mm_lengths) <- make.names(mm_lengths[["geneID"]], unique=TRUE)
colnames(mm_lengths) <- c("ID", "length")
mm_tx_unique <- !grepl(pattern="\\.", x=rownames(mm_tx_genes))
mm_tx_genes <- mm_tx_genes[mm_tx_unique, ]
##mmtx_annotations <- get_biomart_annotations(species="mmusculus")
mart <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL", host="dec2015.archive.ensembl.org")
dataset <- paste0("mmusculus_gene_ensembl")
ensembl <- try(biomaRt::useDataset(dataset, mart=mart))
lots_of_rows <- biomaRt::listAttributes(ensembl) ## List of possible attributes
## wanted_attributes <- c("ensembl_gene_id", "ensembl_transcript_id", "ensembl_peptide_id", "chromosome_name", "start_position","end_position","description", "entrezgene","hgnc_symbol","hgnc_id","uniprot_sptrembl","uniprot_swissprot","uniprot_genename") ## attributes Lucia and I chose, but too many
wanted_attributes_global <- c("ensembl_gene_id", "ensembl_transcript_id", "chromosome_name",
"start_position", "end_position", "strand", "description")
wanted_attributes_names <- c("ensembl_gene_id", "ensembl_transcript_id", "entrezgene", "hgnc_symbol", "hgnc_id")
wanted_attributes_uniprot <- c("ensembl_gene_id", "ensembl_transcript_id", "uniprot_sptrembl", "uniprot_swissprot")
wanted_global_annotations <- biomaRt::getBM(attributes=wanted_attributes_global, mart=ensembl)
dim(wanted_global_annotations)
## [1] 114083 7
wanted_names_annotations <- biomaRt::getBM(attributes=wanted_attributes_names, mart=ensembl)
wanted_uniprot_annotations <- biomaRt::getBM(attributes=wanted_attributes_uniprot, mart=ensembl)
dim(wanted_uniprot_annotations)
## [1] 57040 4
wanted_global_annotations <- data.table::as.data.table(wanted_global_annotations)
wanted_names_annotations <- data.table::as.data.table(wanted_names_annotations)
wanted_uniprot_annotations <- data.table::as.data.table(wanted_uniprot_annotations)
Now let us bring together these data sources into a (hopefully) comprehesive table.
wanted_annotations <- merge(wanted_global_annotations,
wanted_names_annotations,
by.x="ensembl_transcript_id",
by.y="ensembl_transcript_id",
all.y=TRUE)
wanted_annotations <- merge(wanted_annotations,
wanted_uniprot_annotations,
by.x="ensembl_transcript_id",
by.y="ensembl_transcript_id",
all.x=TRUE)
wanted_annotations <- as.data.frame(wanted_annotations)
wanted_annotations <- wanted_annotations[, c(
"ensembl_transcript_id", "ensembl_gene_id", "chromosome_name", "start_position", "end_position",
"strand", "description", "entrezgene", "hgnc_symbol", "hgnc_id", "uniprot_sptrembl",
"uniprot_swissprot")]
length(unique(wanted_annotations$ensembl_gene_id))
## [1] 22684
rownames(wanted_annotations) <- make.names(wanted_annotations[["ensembl_gene_id"]], unique=TRUE)
pander::pander(sessionInfo())
R version 3.4.4 (2018-03-15)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: hpgltools(v.2018.03), Mus.musculus(v.1.3.1), TxDb.Mmusculus.UCSC.mm10.knownGene(v.3.4.0), org.Mm.eg.db(v.3.5.0), GO.db(v.3.5.0), OrganismDbi(v.1.20.0), GenomicFeatures(v.1.30.3), GenomicRanges(v.1.30.3), GenomeInfoDb(v.1.14.0), AnnotationDbi(v.1.40.0), IRanges(v.2.12.0), S4Vectors(v.0.16.0), Biobase(v.2.38.0) and BiocGenerics(v.0.24.0)
loaded via a namespace (and not attached): httr(v.1.3.1), RMySQL(v.0.10.14), bit64(v.0.9-7), foreach(v.1.4.4), assertthat(v.0.2.0), pander(v.0.6.1), RBGL(v.1.54.0), blob(v.1.1.0), GenomeInfoDbData(v.1.0.0), Rsamtools(v.1.30.0), yaml(v.2.1.18), progress(v.1.1.2), pillar(v.1.2.1), RSQLite(v.2.0), backports(v.1.1.2), lattice(v.0.20-35), digest(v.0.6.15), XVector(v.0.18.0), colorspace(v.1.3-2), htmltools(v.0.3.6), Matrix(v.1.2-12), plyr(v.1.8.4), XML(v.3.98-1.10), pkgconfig(v.2.0.1), devtools(v.1.13.5), biomaRt(v.2.34.2), zlibbioc(v.1.24.0), scales(v.0.5.0), BiocParallel(v.1.12.0), tibble(v.1.4.2), ggplot2(v.2.2.1), withr(v.2.1.2), SummarizedExperiment(v.1.8.1), lazyeval(v.0.2.1), magrittr(v.1.5), memoise(v.1.1.0), evaluate(v.0.10.1), xml2(v.1.2.0), graph(v.1.56.0), BiocInstaller(v.1.28.0), tools(v.3.4.4), data.table(v.1.10.4-3), prettyunits(v.1.0.2), matrixStats(v.0.53.1), stringr(v.1.3.0), munsell(v.0.4.3), DelayedArray(v.0.4.1), Biostrings(v.2.46.0), compiler(v.3.4.4), rlang(v.0.2.0), grid(v.3.4.4), RCurl(v.1.95-4.10), iterators(v.1.0.9), bitops(v.1.0-6), base64enc(v.0.1-3), rmarkdown(v.1.9), gtable(v.0.2.0), codetools(v.0.2-15), curl(v.3.1), DBI(v.0.8), roxygen2(v.6.0.1), R6(v.2.2.2), GenomicAlignments(v.1.14.1), knitr(v.1.20), rtracklayer(v.1.38.3), bit(v.1.1-12), commonmark(v.1.4), rprojroot(v.1.3-2), stringi(v.1.1.7) and Rcpp(v.0.12.16)
message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 5d8c266e48bb9f73cdac8300e5c7c9f5baf003dc
## R> packrat::restore()
## This is hpgltools commit: Wed Mar 21 15:55:32 2018 -0400: 5d8c266e48bb9f73cdac8300e5c7c9f5baf003dc
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 01_annotation_mmusculus-v20170703.rda.xz
tmp <- sm(saveme(filename=this_save))