The Orgdb packages provide a quick and dirty method for gathering annotation data, lets see what we get!
mouse_pkg <- "Mus.musculus"
if ("Mus.musculus" %in% .packages(all.available=TRUE)) {
tt <- sm(library("Mus.musculus"))
} else {
please_install("Mus.musculus")
tt <- sm(library("Mus.musculus"))
}
mm_tx_org <- sm(load_orgdb_annotations(Mus.musculus, keytype="ensembltrans",
fields=c("definition", "genename")))
Now that we have the mouse data loaded into a big data frame, the goal is to pull out the most useful parts of it. We will also combine that information with the extant orgdb data.
mm_tx_genes <- mm_tx_org$genes
## The inclusion of the 'definition' columns means that we also end up with a bunch
## of redundant entries, like 200,000 of them; therefore I am going to whittle down
## the list.
tt <- sm(library(Mus.musculus))
##mm_go <- load_orgdb_go(orgdb=Mus.musculus)
mm_go <- load_biomart_go(species="mmusculus")
## The biomart annotations file already exists, loading from it.
mm_annot <- load_biomart_annotations(species="mmusculus")
## The biomart annotations file already exists, loading from it.
mm_annot <- mm_annot[["annotation"]]
mm_lengths <- mm_annot[, c("geneID", "length")]
rownames(mm_lengths) <- make.names(mm_lengths[["geneID"]], unique=TRUE)
colnames(mm_lengths) <- c("ID", "length")
mm_tx_unique <- !grepl(pattern="\\.", x=rownames(mm_tx_genes))
mm_tx_genes <- mm_tx_genes[mm_tx_unique, ]
##mmtx_annotations <- get_biomart_annotations(species="mmusculus")
mart <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL", host="dec2015.archive.ensembl.org")
dataset <- paste0("mmusculus_gene_ensembl")
ensembl <- try(biomaRt::useDataset(dataset, mart=mart))
lots_of_rows <- biomaRt::listAttributes(ensembl) ## List of possible attributes
## wanted_attributes <- c("ensembl_gene_id", "ensembl_transcript_id", "ensembl_peptide_id", "chromosome_name", "start_position","end_position","description", "entrezgene","hgnc_symbol","hgnc_id","uniprot_sptrembl","uniprot_swissprot","uniprot_genename") ## attributes Lucia and I chose, but too many
wanted_attributes_global <- c("ensembl_gene_id", "ensembl_transcript_id", "chromosome_name",
"start_position", "end_position", "strand", "description")
wanted_attributes_names <- c("ensembl_gene_id", "ensembl_transcript_id", "entrezgene", "hgnc_symbol", "hgnc_id")
wanted_attributes_uniprot <- c("ensembl_gene_id", "ensembl_transcript_id", "uniprot_sptrembl", "uniprot_swissprot")
wanted_global_annotations <- biomaRt::getBM(attributes=wanted_attributes_global, mart=ensembl)
dim(wanted_global_annotations)
## [1] 114083 7
wanted_names_annotations <- biomaRt::getBM(attributes=wanted_attributes_names, mart=ensembl)
wanted_uniprot_annotations <- biomaRt::getBM(attributes=wanted_attributes_uniprot, mart=ensembl)
dim(wanted_uniprot_annotations)
## [1] 57040 4
wanted_global_annotations <- data.table::as.data.table(wanted_global_annotations)
wanted_names_annotations <- data.table::as.data.table(wanted_names_annotations)
wanted_uniprot_annotations <- data.table::as.data.table(wanted_uniprot_annotations)
Now let us bring together these data sources into a (hopefully) comprehesive table.
wanted_annotations <- merge(wanted_global_annotations,
wanted_names_annotations,
by.x="ensembl_transcript_id",
by.y="ensembl_transcript_id",
all.y=TRUE)
wanted_annotations <- merge(wanted_annotations,
wanted_uniprot_annotations,
by.x="ensembl_transcript_id",
by.y="ensembl_transcript_id",
all.x=TRUE)
wanted_annotations <- as.data.frame(wanted_annotations)
wanted_annotations <- wanted_annotations[, c(
"ensembl_transcript_id", "ensembl_gene_id", "chromosome_name", "start_position", "end_position",
"strand", "description", "entrezgene", "hgnc_symbol", "hgnc_id", "uniprot_sptrembl",
"uniprot_swissprot")]
length(unique(wanted_annotations$ensembl_gene_id))
## [1] 22684
rownames(wanted_annotations) <- make.names(wanted_annotations[["ensembl_gene_id"]], unique=TRUE)
pander::pander(sessionInfo())
R version 3.4.4 (2018-03-15)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: Mus.musculus(v.1.3.1), TxDb.Mmusculus.UCSC.mm10.knownGene(v.3.4.0), org.Mm.eg.db(v.3.5.0), GO.db(v.3.5.0), OrganismDbi(v.1.20.0), GenomicFeatures(v.1.30.3), GenomicRanges(v.1.30.3), GenomeInfoDb(v.1.14.0), AnnotationDbi(v.1.40.0), IRanges(v.2.12.0), S4Vectors(v.0.16.0), Biobase(v.2.38.0), BiocGenerics(v.0.24.0) and hpgltools(v.2018.03)
loaded via a namespace (and not attached): Rcpp(v.0.12.16), lattice(v.0.20-35), prettyunits(v.1.0.2), Rsamtools(v.1.30.0), Biostrings(v.2.46.0), assertthat(v.0.2.0), rprojroot(v.1.3-2), digest(v.0.6.15), foreach(v.1.4.4), R6(v.2.2.2), plyr(v.1.8.4), backports(v.1.1.2), RSQLite(v.2.0), evaluate(v.0.10.1), httr(v.1.3.1), ggplot2(v.2.2.1), BiocInstaller(v.1.28.0), pillar(v.1.2.1), zlibbioc(v.1.24.0), rlang(v.0.2.0.9001), progress(v.1.1.2), curl(v.3.1), lazyeval(v.0.2.1), data.table(v.1.10.4-3), blob(v.1.1.0), Matrix(v.1.2-12), rmarkdown(v.1.9), devtools(v.1.13.5), RMySQL(v.0.10.14), BiocParallel(v.1.12.0), pander(v.0.6.1), stringr(v.1.3.0), RCurl(v.1.95-4.10), bit(v.1.1-12), biomaRt(v.2.34.2), munsell(v.0.4.3), DelayedArray(v.0.4.1), compiler(v.3.4.4), rtracklayer(v.1.38.3), pkgconfig(v.2.0.1), base64enc(v.0.1-3), htmltools(v.0.3.6), SummarizedExperiment(v.1.8.1), tibble(v.1.4.2), GenomeInfoDbData(v.1.0.0), roxygen2(v.6.0.1), codetools(v.0.2-15), matrixStats(v.0.53.1), XML(v.3.98-1.10), withr(v.2.1.2), GenomicAlignments(v.1.14.1), bitops(v.1.0-6), commonmark(v.1.4), grid(v.3.4.4), RBGL(v.1.54.0), gtable(v.0.2.0), DBI(v.0.8), magrittr(v.1.5), scales(v.0.5.0.9000), graph(v.1.56.0), stringi(v.1.1.7), XVector(v.0.18.0), xml2(v.1.2.0), iterators(v.1.0.9), tools(v.3.4.4), bit64(v.0.9-7), yaml(v.2.1.18), colorspace(v.1.3-2), memoise(v.1.1.0) and knitr(v.1.20)
message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 7de4503f6bb5724c28cce24af5dbee22bb1c0cae
## R> packrat::restore()
## This is hpgltools commit: Thu Apr 12 22:08:53 2018 -0400: 7de4503f6bb5724c28cce24af5dbee22bb1c0cae
message(paste0("Saving to ", savefile))
## Saving to 01_annotation_mmusculus_v20170614.rda.xz
tmp <- sm(saveme(filename=savefile))