There are a few methods of importing annotation data into R. I will attempt some of them in preparation for loading them into the S.cerevisiae RNASeq data.
AnnotationHub is a newer service and has promise to be an excellent top-level resource for gathering annotation data.
## Attach AnnotationHub and create the hub object.
## sm() is defined outside this section; presumably it silences the wrapped call -- confirm.
tmp <- sm(library(AnnotationHub))
ah <- sm(AnnotationHub())
## First every OrgDb record, then only those also matching "Saccharomyces".
## The search terms match as substrings, so Schizosaccharomyces and
## Zygosaccharomyces records are returned as well (see the listing printed below).
orgdbs <- sm(query(ah, "OrgDb"))
sc_orgdb <- sm(query(ah, c("OrgDB", "Saccharomyces"))) ## AH57980 | org.Sc.sgd.db.sqlite
sc_orgdb
## AnnotationHub with 7 records
## # snapshotDate(): 2017-10-27
## # $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
## # $species: Saccharomyces cerevisiae, Saccharomyces eubayanus, Schizosaccharomyces cry...
## # $rdataclass: OrgDb
## # additional mcols(): taxonomyid, genome, description, coordinate_1_based,
## # maintainer, rdatadateadded, preparerclass, tags, rdatapath, sourceurl,
## # sourcetype
## # retrieve records with, e.g., 'object[["AH57980"]]'
##
## title
## AH57980 | org.Sc.sgd.db.sqlite
## AH59735 | org.Schizosaccharomyces_pombe.eg.sqlite
## AH59859 | org.Saccharomyces_eubayanus.eg.sqlite
## AH59874 | org.Schizosaccharomyces_cryophilus_OY26.eg.sqlite
## AH59893 | org.Schizosaccharomyces_octosporus_yFS286.eg.sqlite
## AH59899 | org.Zygosaccharomyces_rouxii.eg.sqlite
## AH59913 | org.Schizosaccharomyces_japonicus_yFS275.eg.sqlite
## Retrieve the org.Sc.sgd.db.sqlite record from the hub by its accession.
## NOTE(review): the accession is hard-coded and may differ under another hub snapshot -- confirm before re-running.
sc_orgdb <- ah[["AH57980"]]
## loading from cache '/home/trey//.AnnotationHub/64726'
sc_orgdb
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: YEAST_DB
## | ORGANISM: Saccharomyces cerevisiae
## | SPECIES: Yeast
## | YGSOURCENAME: Yeast Genome
## | YGSOURCEURL: http://downloads.yeastgenome.org/
## | YGSOURCEDATE: 14-Jan-2017
## | CENTRALID: ORF
## | TAXID: 559292
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 2017-Nov01
## | EGSOURCEDATE: 2017-Nov6
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | ENSOURCEDATE: 2017-Aug23
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Tue Nov 7 21:11:11 2017
##
## Please see: help('select') for usage information
## Pull annotations for the listed fields out of the OrgDb.
## The result is a list; its 'genes' element is the data frame used afterwards.
sc_annotv1 <- load_orgdb_annotations(sc_orgdb,
fields=c("alias", "description", "entrezid", "genename", "sgd"))
## Unable to find TYPE in the db, removing it.
## Unable to find CHR in the db, removing it.
## Unable to find TXSTRAND in the db, removing it.
## Unable to find TXSTART in the db, removing it.
## Unable to find TXEND in the db, removing it.
## Extracted all gene ids.
## 'select()' returned 1:many mapping between keys and columns
summary(sc_annotv1)
## Length Class Mode
## genes 6 data.frame list
## transcripts 0 -none- NULL
sc_annotv1 <- sc_annotv1[["genes"]]
head(sc_annotv1)
## ensembl genename alias
## YAL068C YAL068C PAU8 seripauperin PAU8
## YAL068C.1 YAL068C PAU9 seripauperin PAU9
## YAL068C.2 YAL068C PAU11 seripauperin PAU11
## YGL261C YGL261C PAU8 seripauperin PAU8
## YGL261C.1 YGL261C PAU9 seripauperin PAU9
## YGL261C.2 YGL261C PAU11 seripauperin PAU11
## description
## YAL068C Protein of unknown function; member of the seripauperin multigene family encoded mainly in subtelomeric regions
## YAL068C.1 Protein of unknown function; member of the seripauperin multigene family encoded mainly in subtelomeric regions; SWAT-GFP and mCherry fusion proteins localize to the endoplasmic reticulum and vacuole respectively
## YAL068C.2 Putative protein of unknown function; member of the seripauperin multigene family encoded mainly in subtelomeric regions; mRNA expression appears to be regulated by SUT1 and UPC2
## YGL261C Protein of unknown function; member of the seripauperin multigene family encoded mainly in subtelomeric regions
## YGL261C.1 Protein of unknown function; member of the seripauperin multigene family encoded mainly in subtelomeric regions; SWAT-GFP and mCherry fusion proteins localize to the endoplasmic reticulum and vacuole respectively
## YGL261C.2 Putative protein of unknown function; member of the seripauperin multigene family encoded mainly in subtelomeric regions; mRNA expression appears to be regulated by SUT1 and UPC2
## entrezid sgd
## YAL068C 851229 S000002142
## YAL068C.1 852163 S000007592
## YAL068C.2 852630 S000003230
## YGL261C 851229 S000002142
## YGL261C.1 852163 S000007592
## YGL261C.2 852630 S000003230
please_install("TxDb.Scerevisiae.UCSC.sacCer3.sgdGene")
## [1] 0
tmp <- sm(library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene))
sc_txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
There is a non-zero chance we will want to use the actual genome sequence along with these annotations. The BSgenome packages provide that functionality.
tt <- sm(please_install("BSgenome.Scerevisiae.UCSC.sacCer3"))
A completely separate and competing annotation source is biomart.
sc_annotv2 <- sm(load_biomart_annotations("scerevisiae"))$annotation
head(sc_annotv2)
## transcriptID geneID
## X15S_rRNA 15S_rRNA 15S_rRNA
## X21S_rRNA 21S_rRNA 21S_rRNA
## HRA1 HRA1 HRA1
## ICR1 ICR1 ICR1
## LSR1 LSR1 LSR1
## NME1 NME1 NME1
## Description
## X15S_rRNA Ribosomal RNA of the small mitochondrial ribosomal subunit; MSU1 allele suppresses ochre stop mutations in mitochondrial protein-coding genes [Source:SGD;Acc:S000007287]
## X21S_rRNA Mitochondrial 21S rRNA; intron encodes the I-SceI DNA endonuclease [Source:SGD;Acc:S000007288]
## HRA1 Non-protein-coding RNA; substrate of RNase P, possibly involved in rRNA processing, specifically maturation of 20S precursor into the mature 18S rRNA [Source:SGD;Acc:S000119380]
## ICR1 Long intergenic regulatory ncRNA; has a key role in regulating transcription of the nearby protein-coding ORF FLO11; initiated far upstream from FLO11 and transcribed across much of the large promoter of FLO11, repressing FLO11 transcription in cis [Source:SGD;Acc:S000132612]
## LSR1 U2 spliceosomal RNA (U2 snRNA), component of the spliceosome; pairs with the branchpoint sequence; functionally equivalent to mammalian U2 snRNA; stress-induced pseudouridylations at positions 56 and 93 may contribute to regulation of splicing [Source:SGD;Acc:S000006478]
## NME1 RNA component of RNase MRP; RNase MRP cleaves pre-rRNA and has a role in cell cycle-regulated degradation of daughter cell-specific mRNAs; human ortholog is implicated in cartilage-hair hypoplasia (CHH) [Source:SGD;Acc:S000007436]
## Type length chromosome strand start end
## X15S_rRNA rRNA NA Mito 1 6546 8194
## X21S_rRNA rRNA NA Mito 1 58009 62447
## HRA1 ncRNA NA I 1 99305 99868
## ICR1 ncRNA NA IX -1 393884 397082
## LSR1 snRNA NA II -1 680688 681862
## NME1 snoRNA NA XIV 1 585587 585926
sc_ontology <- sm(load_biomart_go("scerevisiae"))$go
head(sc_ontology)
## ID GO
## 1 YHR055C GO:0046872
## 2 YHR055C GO:0005829
## 3 YHR055C GO:0016209
## 4 YHR055C GO:0004784
## 5 YHR055C GO:0019430
## 6 YHR055C GO:0005507
In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments.
## The old way of getting genome/annotation data
sc_gff <- "reference/scerevisiae.gff.gz"
sc_gff_annotations <- load_gff_annotations(sc_gff, type="gene")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Trying attempt: rtracklayer::import.gff2(gff, sequenceRegionsAsSeqinfo=TRUE)
## Had a successful gff import with rtracklayer::import.gff2(gff, sequenceRegionsAsSeqinfo=TRUE)
## Returning a df with 18 columns and 7050 rows.
## Key the gff annotations by transcript name; make.names() makes the names
## syntactically valid (e.g. "-" becomes ".") and unique=TRUE disambiguates repeats.
## Use [[ ]] rather than $ to match the rest of this file and avoid partial matching.
rownames(sc_gff_annotations) <- make.names(sc_gff_annotations[["transcript_name"]], unique=TRUE)
head(sc_gff_annotations)
## seqnames start end width strand source type score phase exon_number
## YAL069W I 335 646 312 + protein_coding gene NA 0 1
## YAL068W.A I 538 789 252 + protein_coding gene NA 0 1
## PAU8 I 1810 2169 360 - protein_coding gene NA 0 1
## YAL067W.A I 2480 2704 225 + protein_coding gene NA 0 1
## SEO1 I 7238 9016 1779 - protein_coding gene NA 0 1
## YAL066W I 10091 10396 306 + protein_coding gene NA 0 1
## gene_id ID p_id protein_id transcript_id transcript_name tss_id
## YAL069W YAL069W YAL069W P3633 YAL069W YAL069W YAL069W TSS1128
## YAL068W.A YAL068W-A YAL068W-A P5377 YAL068W-A YAL068W-A YAL068W-A TSS5439
## PAU8 YAL068C PAU8 P6023 YAL068C YAL068C PAU8 TSS249
## YAL067W.A YAL067W-A YAL067W-A P4547 YAL067W-A YAL067W-A YAL067W-A TSS1248
## SEO1 YAL067C SEO1 P5747 YAL067C YAL067C SEO1 TSS5464
## YAL066W YAL066W YAL066W P1766 YAL066W YAL066W YAL066W TSS2674
## seqedit
## YAL069W <NA>
## YAL068W.A <NA>
## PAU8 <NA>
## YAL067W.A <NA>
## SEO1 <NA>
## YAL066W <NA>
In the following block we create an expressionset using the sample sheet and the annotations.
Annoyingly, the gff annotations are keyed in a peculiar fashion. Therefore I need to do a little work to merge them.
## Merge the biomart annotations (sc_annotv2) with the gff annotations.
## The two tables share no identifier column, so they are joined on
## "<chromosome>_<coordinate>" keys: once using the start coordinate
## ("fwd_location") and once using the end coordinate ("rev_location").

## Start by making locations for the biomart data
sc_annotv2[["fwd_location"]] <- paste0(sc_annotv2[["chromosome"]], "_", sc_annotv2[["start"]])
sc_annotv2[["rev_location"]] <- paste0(sc_annotv2[["chromosome"]], "_", sc_annotv2[["end"]])
## Do the same for the gff annotations
sc_gff_annotations[["fwd_location"]] <- paste0(sc_gff_annotations[["seqnames"]], "_", sc_gff_annotations[["start"]])
sc_gff_annotations[["rev_location"]] <- paste0(sc_gff_annotations[["seqnames"]], "_", sc_gff_annotations[["end"]])
## Carry the gff rownames through the merges, which discard rownames.
sc_gff_annotations[["gff_rowname"]] <- rownames(sc_gff_annotations)
## Now merge them.
sc_fwd_annotations <- merge(sc_annotv2, sc_gff_annotations, by="fwd_location")
sc_rev_annotations <- merge(sc_annotv2, sc_gff_annotations, by="rev_location")
## Give the key columns the same names in both merges so the results can be
## stacked.  merge() already names every other column identically in the two
## results (clashing names get the .x/.y suffixes), so only the three location
## columns need renaming.  They are looked up by name rather than by position,
## and a missing column stops the script instead of silently mislabelling data.
location_names <- c("location", "location.x", "location.y")
fwd_keys <- match(c("fwd_location", "rev_location.x", "rev_location.y"),
                  colnames(sc_fwd_annotations))
rev_keys <- match(c("rev_location", "fwd_location.x", "fwd_location.y"),
                  colnames(sc_rev_annotations))
stopifnot(!anyNA(fwd_keys), !anyNA(rev_keys))
colnames(sc_fwd_annotations)[fwd_keys] <- location_names
colnames(sc_rev_annotations)[rev_keys] <- location_names
## Stack the two merges.  A row matching on both start and end is present in
## both merges and therefore appears twice; make.names(unique=TRUE) leaves the
## first copy with the plain gff rowname and adds a numeric suffix to later copies.
sc_all_annotations <- rbind(sc_fwd_annotations, sc_rev_annotations)
rownames(sc_all_annotations) <- make.names(sc_all_annotations[["gff_rowname"]], unique=TRUE)
## Keep the biomart-side columns plus the gff tss_id, and drop the .x suffixes.
sc_all_annotations <- sc_all_annotations[, c("transcriptID", "geneID", "Description", "Type",
                                             "length", "chromosome", "strand.x", "start.x", "end.x",
                                             "tss_id")]
colnames(sc_all_annotations) <- c("transcriptID", "geneID", "Description", "Type", "length",
                                  "chromosome", "strand", "start", "end", "tss_id")
## A single "<chromosome>_<start>_<end>" label for each row.
sc_all_annotations[["location"]] <- paste0(sc_all_annotations[["chromosome"]], "_", sc_all_annotations[["start"]], "_", sc_all_annotations[["end"]])
## Build the experiment from the sample sheet, passing the merged annotations as gene information.
## file_column names the sample-sheet column holding each sample's count-table path
## (the 'bowtiefile' column of the metadata printed further down contains *.count.xz paths).
sc1_expt <- create_expt(metadata="sample_sheets/all_samples.xlsx",
gene_info=sc_all_annotations,
file_column="bowtiefile")
## Reading the sample metadata.
## The sample definitions comprises: 28, 18 rows, columns.
## Reading count tables.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0564/hpgl0564_scerevisiae.count.xz contains 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0565/hpgl0565_scerevisiae.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0566/hpgl0566_scerevisiae.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0567/hpgl0567_scerevisiae.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0568/hpgl0568_scerevisiae.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0569/hpgl0569_scerevisiae.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0570/hpgl0570_scerevisiae.count.xz contains 7131 rows and merges to 7131 rows.
## /cbcb/nelsayed-scratch/atb/rnaseq/scerevisiae_cbf5_2016/preprocessing/v1/hpgl0571/hpgl0571_scerevisiae.count.xz contains 7131 rows and merges to 7131 rows.
## preprocessing/wt/bowtie_out/wt_forward-trimmed-v0M1.count.xz contains 6697 rows and merges to 7131 rows.
## preprocessing/upf1/bowtie_out/upf1_forward-trimmed-v0M1.count.xz contains 6697 rows and merges to 7131 rows.
## preprocessing/upf2/bowtie_out/upf2_forward-trimmed-v0M1.count.xz contains 6697 rows and merges to 7131 rows.
## preprocessing/upf3/bowtie_out/upf3_forward-trimmed-v0M1.count.xz contains 6697 rows and merges to 7131 rows.
## Finished reading count tables.
## Matched 6539 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
head(exprs(sc1_expt$expressionset))
## hpgl0564 hpgl0565 hpgl0566 hpgl0567 hpgl0568 hpgl0569 hpgl0570 hpgl0571 wt upf1
## AAC1 141 90 91 155 351 144 384 120 131 84
## AAC3 236 140 119 253 267 117 326 120 183 245
## AAD10 189 167 183 224 283 132 326 89 178 602
## AAD14 389 262 230 341 375 221 547 148 117 424
## AAD15 94 66 50 104 103 53 125 47 3 32
## AAD16 226 149 140 218 290 152 339 129 80 291
## upf2 upf3
## AAC1 146 124
## AAC3 250 228
## AAD10 573 480
## AAD14 480 388
## AAD15 70 46
## AAD16 271 259
head(fData(sc1_expt$expressionset))
## transcriptID geneID
## AAC1 YMR056C YMR056C
## AAC3 YBR085W YBR085W
## AAD10 YJR155W YJR155W
## AAD14 YNL331C YNL331C
## AAD15 YOL165C YOL165C
## AAD16 YFL057C YFL057C
## Description
## AAC1 Mitochondrial inner membrane ADP/ATP translocator; exchanges cytosolic ADP for mitochondrially synthesized ATP; phosphorylated; Aac1p is a minor isoform while Pet9p is the major ADP/ATP translocator; relocalizes from mitochondrion to cytoplasm upon DNA replication stress [Source:SGD;Acc:S000004660]
## AAC3 Mitochondrial inner membrane ADP/ATP translocator; exchanges cytosolic ADP for mitochondrially synthesized ATP; expressed under anaerobic conditions; similar to Aac1p; has roles in maintenance of viability and in respiration; AAC3 has a paralog, PET9, that arose from the whole genome duplication [Source:SGD;Acc:S000000289]
## AAD10 Putative aryl-alcohol dehydrogenase; similar to P. chrysosporium aryl-alcohol dehydrogenase; mutational analysis has not yet revealed a physiological role; members of the AAD gene family comprise three pairs (AAD3 + AAD15, AAD6/AAD16 + AAD4, AAD10 + AAD14) whose two genes are more related to one another than to other members of the family [Source:SGD;Acc:S000003916]
## AAD14 Putative aryl-alcohol dehydrogenase; similar to P. chrysosporium aryl-alcohol dehydrogenase; mutational analysis has not yet revealed a physiological role; members of the AAD gene family comprise three pairs (AAD3 + AAD15, AAD6/AAD16 + AAD4, AAD10 + AAD14) whose two genes are more related to one another than to other members of the family [Source:SGD;Acc:S000005275]
## AAD15 Putative aryl-alcohol dehydrogenase; similar to P. chrysosporium aryl-alcohol dehydrogenase; mutational analysis has not yet revealed a physiological role; AAD15 has a paralog, AAD3, that arose from a segmental duplication; members of the AAD gene family comprise three pairs (AAD3 + AAD15, AAD6/AAD16 + AAD4, AAD10 + AAD14) whose two genes are more related to one another than to other members of the family [Source:SGD;Acc:S000005525]
## AAD16 Putative aryl alcohol dehydrogenase; similar to Phanerochaete chrysosporium aryl alcohol dehydrogenase; ORFs AAD6/YFL056C and AAD16/YFL057C are displaced from one another by -1 frameshift; members of the AAD gene family comprise three pairs (AAD3 + AAD15, AAD6/AAD16 + AAD4, AAD10 + AAD14) whose two genes are more related to one another than to other members of the family [Source:SGD;Acc:S000001837]
## Type length chromosome strand start end tss_id location
## AAC1 protein_coding 930 XIII -1 387315 388244 TSS5132 XIII_387315_388244
## AAC3 protein_coding 924 II 1 415983 416906 TSS1609 II_415983_416906
## AAD10 protein_coding 867 X 1 727405 728271 TSS5024 X_727405_728271
## AAD14 protein_coding 1131 XIV -1 16118 17248 TSS6941 XIV_16118_17248
## AAD15 protein_coding 432 XV -1 1647 2078 TSS108 XV_1647_2078
## AAD16 protein_coding 459 VI -1 14305 14763 TSS2145 VI_14305_14763
head(pData(sc1_expt$expressionset))
## sampleid strain condition batch originalbatch tube cbf5igv upf1igv
## hpgl0564 hpgl0564 yJD1524 wtc_wtu y y A wt wt
## hpgl0565 hpgl0565 yJD1524 wtc_wtu y y B wt wt
## hpgl0566 hpgl0566 yJD1524 wtc_wtu y y E wt wt
## hpgl0567 hpgl0567 yJD1524 wtc_wtu y y F wt wt
## hpgl0568 hpgl0568 yJD1525 mtc_wtu y y B mut wt
## hpgl0569 hpgl0569 yJD1525 mtc_wtu y y C mut wt
## incubationtime
## hpgl0564 unknown
## hpgl0565 unknown
## hpgl0566 unknown
## hpgl0567 unknown
## hpgl0568 unknown
## hpgl0569 unknown
## genotype
## hpgl0564 wt ade2-1 can1-100 his3-11 leu2-3, 112 trp1-1 ura3-1 cbf5::TRP1 + CBF5 on pRS313
## hpgl0565 wt ade2-1 can1-100 his3-11 leu2-3, 112 trp1-1 ura3-1 cbf5::TRP1 + CBF5 on pRS313
## hpgl0566 wt ade2-1 can1-100 his3-11 leu2-3, 112 trp1-1 ura3-1 cbf5::TRP1 + CBF5 on pRS313
## hpgl0567 wt ade2-1 can1-100 his3-11 leu2-3, 112 trp1-1 ura3-1 cbf5::TRP1 + CBF5 on pRS313
## hpgl0568 d95a ade2-1 can1-100 his3-11 leu2-3, 112 trp1-1 ura3-1 cbf5::TRP1 + CBF5 D95A on pRS313
## hpgl0569 d95a ade2-1 can1-100 his3-11 leu2-3, 112 trp1-1 ura3-1 cbf5::TRP1 + CBF5 D95A on pRS313
## conc bttotalreads bttotalmapped btleftmapped btrightmapped
## hpgl0564 619.4 27385278 23432324 15438470 11946808
## hpgl0565 629.0 17813593 15224673 10061396 7752197
## hpgl0566 375.4 8973978 7682107 5078415 3895563
## hpgl0567 720.3 23744501 20191005 13457860 10286641
## hpgl0568 440.7 23311126 19999162 13141126 10170000
## hpgl0569 423.5 10683277 9147161 6034715 4648562
## bowtiefile bt2file intronfile file
## hpgl0564 preprocessing/v1/hpgl0564/hpgl0564_scerevisiae.count.xz <NA> <NA> null
## hpgl0565 preprocessing/v1/hpgl0565/hpgl0565_scerevisiae.count.xz <NA> <NA> null
## hpgl0566 preprocessing/v1/hpgl0566/hpgl0566_scerevisiae.count.xz <NA> <NA> null
## hpgl0567 preprocessing/v1/hpgl0567/hpgl0567_scerevisiae.count.xz <NA> <NA> null
## hpgl0568 preprocessing/v1/hpgl0568/hpgl0568_scerevisiae.count.xz <NA> <NA> null
## hpgl0569 preprocessing/v1/hpgl0569/hpgl0569_scerevisiae.count.xz <NA> <NA> null
## Unless a variable named skip_load exists and is TRUE, print the session
## information, report the hpgltools commit, and pass a versioned filename to saveme().
## rmd_file, ver, get_git_commit() and saveme() are defined outside this section.
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  ## Drop ".Rmd" from the document name and append the version, e.g. <name>-v<ver>.rda.xz.
  ## 'replacement' is spelled out in full; the previous 'replace=' only worked
  ## through partial argument matching.
  this_save <- paste0(gsub(pattern="\\.Rmd", replacement="", x=rmd_file), "-v", ver, ".rda.xz")
  tmp <- sm(saveme(filename=this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 5d8c266e48bb9f73cdac8300e5c7c9f5baf003dc
## R> packrat::restore()
## This is hpgltools commit: Wed Mar 21 15:55:32 2018 -0400: 5d8c266e48bb9f73cdac8300e5c7c9f5baf003dc