This document illustrates some ways to import annotation data into an RNAseq experiment. It will be reused throughout these analyses and as a result the output may be hidden. If you wish to view it directly: annotation.html
There are a few methods of importing annotation data into R. The following are two attempts, the second is currently being used in these analyses.
AnnotationHub is a newer service and has promise to be an excellent top-level resource for gathering annotation data.
## This doesn't quite work yet
tmp <- sm(library(AnnotationHub))
ah = sm(AnnotationHub())
orgdbs <- sm(query(ah, "OrgDb"))
sc_orgdb <- sm(query(ah, c("OrgDB", "Saccharomyces"))) ## AH49589 | org.Sc.sgd.db.sqlite
sc_orgdb <- ah[["AH57980"]]
## loading from cache '/home/trey//.AnnotationHub/64726'
class(sc_orgdb)
## [1] "OrgDb"
## attr(,"package")
## [1] "AnnotationDbi"
OrgDbs provide a set of links between different annotation resources, in the case of yeast that includes GO/KEGG/SGD IDs. We also want links between gene annotations and these gene IDs. The TxDb system provides this information.
please_install("TxDb.Scerevisiae.UCSC.sacCer3.sgdGene")
## [1] 0
tmp <- sm(library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene))
sc_txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
There is a non-zero chance we will want to use the actual genome sequence along with these annotations. The BSGenome packages provide that functionality.
tmp <- please_install("BSgenome.Scerevisiae.UCSC.sacCer3")
A completely separate and competing annotation source is biomart.
sc_annotation <- sm(load_biomart_annotations("scerevisiae"))$annotation
sc_ontology <- sm(load_biomart_go("scerevisiae"))$go
At this point we have multiple possible ways to mix and match these annotation data. Ideally I would like to put them all into a single format, organismDb is a good choice therefore.
tmp <- sm(library(OrganismDbi))
## The following 2 invocations of makeOrganism do not work because not all the resources are available.
##sc_organismdb <- makeOrganismDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL", dataset="scerevisiae_gene_ensembl",
## host="dec2015.archive.ensembl.org")
##sc_organismdb <- makeOrganismDbFromTxDb(sc_txdb)
test <- try(library(Sac.cer3))
if (class(test) == "try-error") {
gd = list(join1 = c(GO.db="GOID", org.Sc.sgd.db="GO"),
join2 = c(org.Sc.sgd.db="ENTREZID", TxDb.Scerevisiae.UCSC.sacCer3.sgdGene="GENEID"))
tt <- try(makeOrganismPackage(pkgname="Sac.cer3", # simplify typing!
graphData=gd, organism="Saccharomyces cerevisiae",
version="0.0.1", maintainer="atb <abelew@gmail.com>",
author="unknown <unknown@genomicsclass.github.io/book/pages/bioc2_rpacks.html>",
destDir=".",
license="Artistic-2.0"))
install.packages("Sac.cer3", repos=NULL, type="source")
test <- try(library(Sac.cer3))
}
In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments.
## The old way of getting genome/annotation data
sc_gff <- "reference/scerevisiae.gff.gz"
sc_fasta <- "reference/scerevisiae.fasta.xz"
sc_gff_annotations <- sm(load_gff_annotations(sc_gff, type="gene"))
rownames(sc_gff_annotations) <- make.names(sc_gff_annotations$transcript_name, unique=TRUE)
The following block sets up a master experiment containing one set of anontations and all the reads. Note that the following is specific to the previous (version 1) set of data.
sc_annotation[["fwd_location"]] <- paste0(sc_annotation[["chromosome"]], "_", sc_annotation[["start"]])
sc_annotation[["rev_location"]] <- paste0(sc_annotation[["chromosome"]], "_", sc_annotation[["end"]])
sc_gff_annotations[["fwd_location"]] <- paste0(sc_gff_annotations[["seqnames"]], "_", sc_gff_annotations[["start"]])
sc_gff_annotations[["rev_location"]] <- paste0(sc_gff_annotations[["seqnames"]], "_", sc_gff_annotations[["end"]])
sc_gff_annotations[["gff_rowname"]] <- rownames(sc_gff_annotations)
sc_fwd_annotations <- merge(sc_annotation, sc_gff_annotations, by="fwd_location")
sc_rev_annotations <- merge(sc_annotation, sc_gff_annotations, by="rev_location")
colnames(sc_fwd_annotations) <- c("location","transcriptID","geneID", "Description",
"Type", "length", "chromosome", "strand", "start",
"end", "location.x", "seqnames",
"start.y", "end.y", "width", "strand.y", "source", "type",
"score", "phase", "exon_number", "gene_id", "ID", "p_id",
"protein_id", "transcript_id", "transcript_name", "tss_id",
"seqedit", "location.y", "gff_rowname")
colnames(sc_rev_annotations) <- colnames(sc_fwd_annotations)
sc_all_annotations <- rbind(sc_fwd_annotations, sc_rev_annotations)
rownames(sc_all_annotations) <- make.names(sc_all_annotations[["gff_rowname"]], unique=TRUE)
sc_all_annotations <- sc_all_annotations[, c("transcriptID","geneID","Description","Type","length","chromosome","strand","start","end","tss_id")]
colnames(sc_all_annotations) <- c("transcriptID","geneID","Description","Type","length","chromosome","strand","start","end","tss_id")
sc_all_annotations[["location"]] <- paste0(sc_all_annotations[["chromosome"]], "_", sc_all_annotations[["start"]], "_", sc_all_annotations[["end"]])
sc_expt <- sm(create_expt("all_samples.csv", gene_info=sc_gff_annotations))
dim(Biobase::exprs(sc_expt$expressionset))
## [1] 6691 12
knitr::kable(head(Biobase::fData(sc_expt$expressionset)))
seqnames | start | end | width | strand | source | type | score | phase | exon_number | gene_id | ID | p_id | protein_id | transcript_id | transcript_name | tss_id | seqedit | fwd_location | rev_location | gff_rowname | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AAC1 | XIII | 387318 | 388244 | 927 | - | protein_coding | gene | undefined | 0 | 1 | YMR056C | AAC1 | P3747 | YMR056C | YMR056C | AAC1 | TSS5132 | undefined | XIII_387318 | XIII_388244 | AAC1 |
AAC3 | II | 415983 | 416903 | 921 | + | protein_coding | gene | undefined | 0 | 1 | YBR085W | AAC3 | P443 | YBR085W | YBR085W | AAC3 | TSS1609 | undefined | II_415983 | II_416903 | AAC3 |
AAD10 | X | 727405 | 728268 | 864 | + | protein_coding | gene | undefined | 0 | 1 | YJR155W | AAD10 | P1106 | YJR155W | YJR155W | AAD10 | TSS5024 | undefined | X_727405 | X_728268 | AAD10 |
AAD14 | XIV | 16121 | 17248 | 1128 | - | protein_coding | gene | undefined | 0 | 1 | YNL331C | AAD14 | P3139 | YNL331C | YNL331C | AAD14 | TSS6941 | undefined | XIV_16121 | XIV_17248 | AAD14 |
AAD15 | XV | 1650 | 2078 | 429 | - | protein_coding | gene | undefined | 0 | 1 | YOL165C | AAD15 | P1525 | YOL165C | YOL165C | AAD15 | TSS108 | undefined | XV_1650 | XV_2078 | AAD15 |
AAD16 | VI | 14308 | 14763 | 456 | - | protein_coding | gene | undefined | 0 | 1 | YFL057C | AAD16 | P4716 | YFL057C | YFL057C | AAD16 | TSS2145 | undefined | VI_14308 | VI_14763 | AAD16 |
exo_expt <- sm(create_expt("all_samples_merged.csv", gene_info=sc_gff_annotations))
v3_expt <- create_expt("sample_sheets/all_samples.xlsx", gene_info=sc_gff_annotations, file_column="bt2file")
## Reading the sample metadata.
## The sample definitions comprises: 28, 18 rows, columns.
## Reading count tables.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0774/outputs/bowtie2_scerevisiae/hpgl0774_forward-trimmed.count.xz contains 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0775/outputs/bowtie2_scerevisiae/hpgl0775_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0776/outputs/bowtie2_scerevisiae/hpgl0776_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0777/outputs/bowtie2_scerevisiae/hpgl0777_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0778/outputs/bowtie2_scerevisiae/hpgl0778_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0779/outputs/bowtie2_scerevisiae/hpgl0779_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0780/outputs/bowtie2_scerevisiae/hpgl0780_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0781/outputs/bowtie2_scerevisiae/hpgl0781_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0782/outputs/bowtie2_scerevisiae/hpgl0782_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0783/outputs/bowtie2_scerevisiae/hpgl0783_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0784/outputs/bowtie2_scerevisiae/hpgl0784_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0785/outputs/bowtie2_scerevisiae/hpgl0785_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0786/outputs/bowtie2_scerevisiae/hpgl0786_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0787/outputs/bowtie2_scerevisiae/hpgl0787_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0788/outputs/bowtie2_scerevisiae/hpgl0788_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2015/preprocessing/v2/hpgl0789/outputs/bowtie2_scerevisiae/hpgl0789_forward-trimmed.count.xz contains 7131 rows and merges to 7131 rows.
## Finished reading count tables.
## Matched 6692 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
library(biomaRt)
int_mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="dec2015.archive.ensembl.org")
int_ensembl <- try(biomaRt::useDataset("scerevisiae_gene_ensembl", mart=int_mart))
int_search <- getBM(attributes=c("ensembl_gene_id", "ensembl_exon_id", "transcript_biotype",
"transcript_start", "transcript_end", "transcript_length"), mart=int_ensembl)
multi_exons <- grepl(pattern="\\.2", x=int_search[["ensembl_exon_id"]])
multi_exons <- int_search[multi_exons, ]
multi_annots <- int_search[["ensembl_gene_id"]] %in% multi_exons[["ensembl_gene_id"]]
multi_annots <- int_search[multi_annots, ]
## Damn it sets the same stop/start for all exons and same length, that is dumb.
head(multi_annots)
## ensembl_gene_id ensembl_exon_id transcript_biotype transcript_start transcript_end
## 7 YPR098C YPR098C.1 protein_coding 728947 729528
## 8 YPR098C YPR098C.2 protein_coding 728947 729528
## 50 YDR305C YDR305C.1 protein_coding 1072746 1073488
## 51 YDR305C YDR305C.2 protein_coding 1072746 1073488
## 75 YPR187W YPR187W.1 protein_coding 911257 911800
## 76 YPR187W YPR187W.2 protein_coding 911257 911800
## transcript_length
## 7 486
## 8 486
## 50 654
## 51 654
## 75 468
## 76 468
## The set of all intron-containing CDS yeast genes. However I have not found definitive
## start/ends for the introns.
At this point, we should have everything necessary to play!
if (!isTRUE(get0("skip_load"))) {
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 5d8c266e48bb9f73cdac8300e5c7c9f5baf003dc
## R> packrat::restore()
## This is hpgltools commit: Wed Mar 21 15:55:32 2018 -0400: 5d8c266e48bb9f73cdac8300e5c7c9f5baf003dc
## Saving to annotation-v20151102.rda.xz