There are a few methods of importing annotation data into R. I will attempt some of them in preparation for loading them into the M.musculus RNASeq data.
AnnotationHub is a newer service and has promise to be an excellent top-level resource for gathering annotation data.
tmp <- sm(library(AnnotationHub))
ah = sm(AnnotationHub())
orgdbs <- sm(query(ah, "OrgDb"))
mm_orgdb <- sm(query(ah, c("OrgDB", "musculus")))
mm_orgdb <- mm_orgdb[[1]]
## loading from cache '/home/trey//.AnnotationHub/64720'
mm_orgdb
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: MOUSE_DB
## | ORGANISM: Mus musculus
## | SPECIES: Mouse
## | EGSOURCEDATE: 2017-Nov6
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 10090
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 2017-Nov01
## | GOEGSOURCEDATE: 2017-Nov6
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Mus musculus)
## | GPSOURCEURL:
## | GPSOURCEDATE: 2017-Aug8
## | ENSOURCEDATE: 2017-Aug23
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Tue Nov 7 21:07:58 2017
##
## Please see: help('select') for usage information
## Holy crap it worked!
mm_annotv1 <- load_orgdb_annotations(
mm_orgdb,
keytype="entrezid",
fields=c("ensembl", "entrezid", "ensembltrans", "refseq", "genename", "symbol"))
## Unable to find TYPE in the db, removing it.
## Unable to find CHR in the db, removing it.
## Unable to find TXSTRAND in the db, removing it.
## Unable to find TXSTART in the db, removing it.
## Unable to find TXEND in the db, removing it.
## Extracted all gene ids.
## 'select()' returned 1:many mapping between keys and columns
mm_annotv1 <- mm_annotv1[["genes"]]
head(mm_annotv1)
## entrezid genename ensembl ensembltrans
## X11287 11287 PZP, alpha-2-macroglobulin like ENSMUSG00000030359 <NA>
## X11287.1 11287 PZP, alpha-2-macroglobulin like ENSMUSG00000030359 <NA>
## X11298 11298 arylalkylamine N-acetyltransferase ENSMUSG00000020804 <NA>
## X11298.1 11298 arylalkylamine N-acetyltransferase ENSMUSG00000020804 <NA>
## X11298.2 11298 arylalkylamine N-acetyltransferase ENSMUSG00000020804 <NA>
## X11298.3 11298 arylalkylamine N-acetyltransferase ENSMUSG00000020804 <NA>
## refseq symbol
## X11287 NM_007376 Pzp
## X11287.1 NP_031402 Pzp
## X11298 NM_009591 Aanat
## X11298.1 NP_033721 Aanat
## X11298.2 NR_033223 Aanat
## X11298.3 XM_017314223 Aanat
A completely separate and competing annotation source is biomart.
mm_annotv2 <- load_biomart_annotations(species="mmusculus", overwrite=TRUE)
## Successfully connected to the mmusculus_gene_ensembl database.
## Finished downloading ensembl gene annotations.
## Finished downloading ensembl structure annotations.
## Dropping haplotype chromosome annotations, set drop_haplotypes=FALSE if this is bad.
## Saving annotations to mmusculus_biomart_annotations.rda.
## Finished save().
mm_annotv2_df <- mm_annotv2[["annotation"]]
rownames(mm_annotv2_df) <- paste0(rownames(mm_annotv2_df), ".", mm_annotv2_df[["transcript_version"]])
head(mm_annotv2_df)
## ensembl_transcript_id ensembl_gene_id version transcript_version
## ENSMUST00000000001.4 ENSMUST00000000001 ENSMUSG00000000001 4 4
## ENSMUST00000000003.13 ENSMUST00000000003 ENSMUSG00000000003 15 13
## ENSMUST00000000010.8 ENSMUST00000000010 ENSMUSG00000020875 9 8
## ENSMUST00000000028.13 ENSMUST00000000028 ENSMUSG00000000028 14 13
## ENSMUST00000000033.11 ENSMUST00000000033 ENSMUSG00000048583 16 11
## ENSMUST00000000049.5 ENSMUST00000000049 ENSMUSG00000000049 11 5
## hgnc_symbol
## ENSMUST00000000001.4
## ENSMUST00000000003.13
## ENSMUST00000000010.8
## ENSMUST00000000028.13
## ENSMUST00000000033.11
## ENSMUST00000000049.5
## description
## ENSMUST00000000001.4 guanine nucleotide binding protein (G protein), alpha inhibiting 3 [Source:MGI Symbol;Acc:MGI:95773]
## ENSMUST00000000003.13 probasin [Source:MGI Symbol;Acc:MGI:1860484]
## ENSMUST00000000010.8 homeobox B9 [Source:MGI Symbol;Acc:MGI:96190]
## ENSMUST00000000028.13 cell division cycle 45 [Source:MGI Symbol;Acc:MGI:1338073]
## ENSMUST00000000033.11 insulin-like growth factor 2 [Source:MGI Symbol;Acc:MGI:96434]
## ENSMUST00000000049.5 apolipoprotein H [Source:MGI Symbol;Acc:MGI:88058]
## gene_biotype cds_length chromosome_name strand start_position
## ENSMUST00000000001.4 protein_coding 1065 3 - 108107280
## ENSMUST00000000003.13 protein_coding 525 X - 77837901
## ENSMUST00000000010.8 protein_coding 753 11 + 96271457
## ENSMUST00000000028.13 protein_coding 1701 16 - 18780447
## ENSMUST00000000033.11 protein_coding 543 7 - 142650766
## ENSMUST00000000049.5 protein_coding 1038 11 + 108343354
## end_position
## ENSMUST00000000001.4 108146146
## ENSMUST00000000003.13 77853623
## ENSMUST00000000010.8 96276595
## ENSMUST00000000028.13 18811987
## ENSMUST00000000033.11 142666816
## ENSMUST00000000049.5 108414396
mm_ontology <- load_biomart_go("mmusculus")$go
## The biomart annotations file already exists, loading from it.
In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments.
## The old way of getting genome/annotation data
mm_gff <- "reference/mmusculus.gtf.gz"
mm_gff_annotations <- load_gff_annotations(mm_gff, id_col="transcript_id")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Trying attempt: rtracklayer::import.gff2(gff, sequenceRegionsAsSeqinfo=TRUE)
## Had a successful gff import with rtracklayer::import.gff2(gff, sequenceRegionsAsSeqinfo=TRUE)
## Returning a df with 21 columns and 1405203 rows.
rownames(mm_gff_annotations) <- make.names(mm_gff_annotations$transcript_id, unique=TRUE)
head(mm_gff_annotations)
## seqnames start end width strand source
## NA. chr1 3054233 3054733 501 + pseudogene
## ENSMUST00000160944 chr1 3054233 3054733 501 + unprocessed_pseudogene
## ENSMUST00000160944.1 chr1 3054233 3054733 501 + unprocessed_pseudogene
## NA..1 chr1 3102016 3102125 110 + snRNA
## ENSMUST00000082908 chr1 3102016 3102125 110 + snRNA
## ENSMUST00000082908.1 chr1 3102016 3102125 110 + snRNA
## type score phase gene_id gene_name gene_source
## NA. gene NA NA ENSMUSG00000090025 Gm16088 havana
## ENSMUST00000160944 transcript NA NA ENSMUSG00000090025 Gm16088 havana
## ENSMUST00000160944.1 exon NA NA ENSMUSG00000090025 Gm16088 havana
## NA..1 gene NA NA ENSMUSG00000064842 Gm26206 ensembl
## ENSMUST00000082908 transcript NA NA ENSMUSG00000064842 Gm26206 ensembl
## ENSMUST00000082908.1 exon NA NA ENSMUSG00000064842 Gm26206 ensembl
## gene_biotype transcript_id transcript_name transcript_source
## NA. pseudogene <NA> <NA> <NA>
## ENSMUST00000160944 pseudogene ENSMUST00000160944 Gm16088-001 havana
## ENSMUST00000160944.1 pseudogene ENSMUST00000160944 Gm16088-001 havana
## NA..1 snRNA <NA> <NA> <NA>
## ENSMUST00000082908 snRNA ENSMUST00000082908 Gm26206-201 ensembl
## ENSMUST00000082908.1 snRNA ENSMUST00000082908 Gm26206-201 ensembl
## tag exon_number exon_id ccds_id protein_id
## NA. <NA> <NA> <NA> <NA> <NA>
## ENSMUST00000160944 mRNA_start_NF <NA> <NA> <NA> <NA>
## ENSMUST00000160944.1 mRNA_start_NF 1 ENSMUSE00000848981 <NA> <NA>
## NA..1 <NA> <NA> <NA> <NA> <NA>
## ENSMUST00000082908 <NA> <NA> <NA> <NA> <NA>
## ENSMUST00000082908.1 <NA> 1 ENSMUSE00000522066 <NA> <NA>
In the following block we create an expressionset using the sample sheet and the annotations.
Annoyingly, the gff annotations are keyed in a peculiar fashion. Therefore I need to do a little work to merge them.
pander::pander(sessionInfo())
R version 3.4.4 (2018-03-15)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: hpgltools(v.2018.03), AnnotationDbi(v.1.40.0), IRanges(v.2.12.0), S4Vectors(v.0.16.0), Biobase(v.2.38.0), AnnotationHub(v.2.10.1) and BiocGenerics(v.0.24.0)
loaded via a namespace (and not attached): nlme(v.3.1-131.1), matrixStats(v.0.53.1), bitops(v.1.0-6), pbkrtest(v.0.4-7), devtools(v.1.13.5), bit64(v.0.9-7), doParallel(v.1.0.11), progress(v.1.1.2), httr(v.1.3.1), rprojroot(v.1.3-2), GenomeInfoDb(v.1.14.0), backports(v.1.1.2), tools(v.3.4.4), R6(v.2.2.2), KernSmooth(v.2.23-15), DBI(v.0.8), lazyeval(v.0.2.1), colorspace(v.1.3-2), withr(v.2.1.2), prettyunits(v.1.0.2), RMySQL(v.0.10.14), bit(v.1.1-12), curl(v.3.1), compiler(v.3.4.4), xml2(v.1.2.0), DelayedArray(v.0.4.1), rtracklayer(v.1.38.3), caTools(v.1.17.1), scales(v.0.5.0.9000), quadprog(v.1.5-5), commonmark(v.1.4), stringr(v.1.3.0), digest(v.0.6.15), Rsamtools(v.1.30.0), minqa(v.1.2.4), rmarkdown(v.1.9), variancePartition(v.1.8.1), colorRamps(v.2.3), XVector(v.0.18.0), base64enc(v.0.1-3), pkgconfig(v.2.0.1), htmltools(v.0.3.6), lme4(v.1.1-15), limma(v.3.34.9), rlang(v.0.2.0.9001), RSQLite(v.2.0), BiocInstaller(v.1.28.0), shiny(v.1.0.5), BiocParallel(v.1.12.0), gtools(v.3.5.0), RCurl(v.1.95-4.10), magrittr(v.1.5), GenomeInfoDbData(v.1.0.0), Matrix(v.1.2-12), Rcpp(v.0.12.16), munsell(v.0.4.3), stringi(v.1.1.7), yaml(v.2.1.18), MASS(v.7.3-49), SummarizedExperiment(v.1.8.1), zlibbioc(v.1.24.0), gplots(v.3.0.1), plyr(v.1.8.4), grid(v.3.4.4), blob(v.1.1.0), gdata(v.2.18.0), ggrepel(v.0.7.0), lattice(v.0.20-35), Biostrings(v.2.46.0), splines(v.3.4.4), pander(v.0.6.1), GenomicFeatures(v.1.30.3), knitr(v.1.20), pillar(v.1.2.1), GenomicRanges(v.1.30.3), reshape2(v.1.4.3), codetools(v.0.2-15), biomaRt(v.2.34.2), XML(v.3.98-1.10), evaluate(v.0.10.1), data.table(v.1.10.4-3), nloptr(v.1.0.4), httpuv(v.1.3.6.2), foreach(v.1.4.4), gtable(v.0.2.0), assertthat(v.0.2.0), ggplot2(v.2.2.1), mime(v.0.5), xtable(v.1.8-2), roxygen2(v.6.0.1), tibble(v.1.4.2), iterators(v.1.0.9), GenomicAlignments(v.1.14.1), memoise(v.1.1.0), directlabels(v.2017.03.31) and interactiveDisplayBase(v.1.16.0)
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 01_annotation_mmusculus_201804-v20180410.rda.xz
tt <- sm(saveme(filename=this_save))