1 Annotation version: 20171205

1.1 Genome annotation input

1.1.1 Read a gff file

Most annotations of interest can be loaded directly from the gff files used in the alignments. More in-depth information for the human transcriptome may be extracted from biomart.

## Paths to the reference files (the older, file-based way of getting
## genome/annotation data).
mtb_cds <- file.path("reference", "mtb_cds.fasta")
mtb_genome <- file.path("reference", "mtuberculosis_h37rv_genbank.fasta")
mtb_gff <- file.path("reference", "mycobacterium_tuberculosis_h37rv_2.gff.gz")

## Load the entries of type "gene" from the gff and index the resulting table
## by its ID column.  sm() is presumably the hpgltools wrapper that silences
## messages -- confirm against the package.
mtb_annotations <- sm(load_gff_annotations(mtb_gff, type = "gene"))
rownames(mtb_annotations) <- mtb_annotations[["ID"]]

1.1.2 Download from microbesonline

## Query microbesonline with the search string "37" and preview the first six
## hits in order to find the taxonomy id of the Mtb genome.
ids <- get_microbesonline_ids("37")
head(ids, n = 6L)
##   taxonomyId                            shortName
## 1      83332     Mycobacterium tuberculosis H37Rv
## 2     243273            Mycoplasma genitalium G37
## 3     316274 Herpetosiphon aurantiacus ATCC 23779
## 4     331111             Escherichia coli E24377A
## 5     338966      Pelobacter propionicus DSM 2379
## 6     350704         Pseudomonas aeruginosa C3719
## The first hit, Mycobacterium tuberculosis H37Rv, carries taxonomy id 83332.
mtb_microbes <- load_microbesonline_annotations(ids = 83332)
## Querying microbesonline for: Mycobacterium tuberculosis H37Rv.
## Manual walk-through of querying UniProt via UniProt.ws.  hpgltools wraps
## these steps in load_uniprot_annotations().
library(UniProt.ws)
## List the columns describing each available species.
colnames(availableUniprotSpecies())
## Find the species entries matching Mtb and print them to pick a taxon id.
found <- availableUniprotSpecies(pattern = "Mycobacterium tuberculosis")
found
## NOTE(review): 13120 differs from the taxonomy id used for microbesonline
## (83332) -- confirm it against the entries printed from 'found'.
mtb_uniprot <- UniProt.ws(13120)
## keys() and select() must agree on the keytype: the keys fetched here are
## handed to select() below, which interprets them as ENTREZ_GENE ids.
mtb_keys <- keys(x = mtb_uniprot, keytype = "ENTREZ_GENE")
mtb_keys
columns <- c("UNIGENE", "ENSEMBL")
result <- select(mtb_uniprot, keys = mtb_keys, columns = columns,
                 keytype = "ENTREZ_GENE")

1.2 Getting ontology data

## Fetch the GO data from microbesonline, using the same taxonomy id (83332)
## that was passed to load_microbesonline_annotations().
mtb_go <- load_microbesonline_go(id=83332)
## Collecting go data for: Mycobacterium tuberculosis H37Rv.

2 Cross reference Keith’s work

I want to be able to cross-reference some work from Keith. His gene IDs have the form MTBxxxx.

## Read the limma results (presumably Keith's -- confirm) as a tab-separated
## table with a header row.
all_de <- read.table(
  file = "limma_result.csv",
  sep = "\t",
  header = TRUE
)

## Column-by-column overview of the first element of the microbesonline
## annotations.
summary(mtb_microbes[[1L]])
##     locusId          accession               GI             scaffoldId  
##  Min.   :   31772   Length:4611        Min.   :1.56e+07   Min.   :7022  
##  1st Qu.:   33068   Class :character   1st Qu.:1.56e+07   1st Qu.:7022  
##  Median :   34351   Mode  :character   Median :1.56e+07   Median :7022  
##  Mean   : 1665113                      Mean   :2.11e+07   Mean   :7022  
##  3rd Qu.:   35642                      3rd Qu.:1.56e+07   3rd Qu.:7022  
##  Max.   :11685601                      Max.   :1.61e+08   Max.   :7022  
##                                        NA's   :622                      
##      start              stop            strand            sysName         
##  Min.   :      1   Min.   :   1524   Length:4611        Length:4611       
##  1st Qu.:1135016   1st Qu.:1134353   Class :character   Class :character  
##  Median :2367711   Median :2368442   Mode  :character   Mode  :character  
##  Mean   :2273832   Mean   :2273812                                        
##  3rd Qu.:3333271   3rd Qu.:3332429                                        
##  Max.   :4410929   Max.   :4410786                                        
##                                                                           
##      name               desc               COG               COGFun         
##  Length:4611        Length:4611        Length:4611        Length:4611       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    COGDesc            TIGRFam           TIGRRoles              GO           
##  Length:4611        Length:4611        Length:4611        Length:4611       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##       EC               ECDesc         
##  Length:4611        Length:4611       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
## Keep only the first element of mtb_microbes (the table summarized above).
## NOTE(review): this overwrites mtb_microbes in place, so re-running the line
## would apply [[1]] to the already-extracted table -- confirm it runs once.
mtb_microbes <- mtb_microbes[[1]]
## Wrap up the report unless skip_load is set to TRUE: print the session
## information, report the hpgltools commit, and save the session data to a
## file named after this Rmd and the annotation version.
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  ## NOTE(review): the rendered output shows this prefix twice on one line,
  ## which suggests get_git_commit() returns more than one value -- confirm.
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  ## Strip only the trailing ".Rmd" extension; 'replacement' is spelled out in
  ## full rather than relying on partial argument matching.
  rmd_base <- sub(pattern = "\\.Rmd$", replacement = "", x = rmd_file)
  this_save <- paste0(rmd_base, "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename = this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 08c9274d41bd6db1189f9330baf0b7ec57d22dc9> git reset yesterday
## R> packrat::restore()
## This is hpgltools commit: Wed Apr 4 11:03:17 2018 -0400: 08c9274d41bd6db1189f9330baf0b7ec57d22dc9This is hpgltools commit: Wed Apr 4 11:03:17 2018 -0400: yesterday
## Saving to 01_annotation-v20171205.rda.xz
LS0tCnRpdGxlOiAiTS50dWJlcmN1bG9zaXMgMjAxNzogQ29sbGVjdGluZyBhbm5vdGF0aW9uIGRhdGEuIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiBodG1sX2RvY3VtZW50OgogIGNvZGVfZG93bmxvYWQ6IHRydWUKICBjb2RlX2ZvbGRpbmc6IHNob3cKICBmaWdfY2FwdGlvbjogdHJ1ZQogIGZpZ19oZWlnaHQ6IDcKICBmaWdfd2lkdGg6IDcKICBoaWdobGlnaHQ6IGRlZmF1bHQKICBrZWVwX21kOiBmYWxzZQogIG1vZGU6IHNlbGZjb250YWluZWQKICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICBzZWxmX2NvbnRhaW5lZDogdHJ1ZQogIHRoZW1lOiByZWFkYWJsZQogIHRvYzogdHJ1ZQogIHRvY19mbG9hdDoKICAgIGNvbGxhcHNlZDogZmFsc2UKICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCi0tLQoKPHN0eWxlPgogIGJvZHkgLm1haW4tY29udGFpbmVyIHsKICAgIG1heC13aWR0aDogMTYwMHB4OwogIH0KPC9zdHlsZT4KCmBgYHtyIG9wdGlvbnMsIGluY2x1ZGU9RkFMU0V9CmlmICghaXNUUlVFKGdldDAoInNraXBfbG9hZCIpKSkgewogIGxpYnJhcnkoaHBnbHRvb2xzKQogIHR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgifi9ocGdsdG9vbHMiKQogIGtuaXRyOjpvcHRzX2tuaXQkc2V0KHByb2dyZXNzPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZT1UUlVFLAogICAgICAgICAgICAgICAgICAgICAgIHdpZHRoPTkwLAogICAgICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKICBrbml0cjo6b3B0c19jaHVuayRzZXQoZXJyb3I9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTgsCiAgICAgICAgICAgICAgICAgICAgICAgIGZpZy5oZWlnaHQ9OCwKICAgICAgICAgICAgICAgICAgICAgICAgZHBpPTk2KQogIG9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICAgICAga25pdHIuZHVwbGljYXRlLmxhYmVsPSJhbGxvdyIpCiAgZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZT0xMCkpCiAgdmVyIDwtICIyMDE3MTIwNSIKICBwcmV2aW91c19maWxlIDwtICJpbmRleC5SbWQiCgogIHRtcCA8LSB0cnkoc20obG9hZG1lKGZpbGVuYW1lPXBhc3RlMChnc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IiIsIHg9cHJldmlvdXNfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKSkpKQogIHJtZF9maWxlIDwtICIwMV9hbm5vdGF0aW9uLlJtZCIKfQpgYGAKCiMgQW5ub3RhdGlvbiB2ZXJzaW9uOiBgciB2ZXJgCgojIyBHZW5vbWUgYW5ub3RhdGlvbiBpbnB1dAoKIyMjIFJlYWQgYSBnZmYgZmlsZQoKSW4gY29udHJhc3QsIGl0IGlzIHBvc3NpYmxlIHRvIGxvYWQgbW9zdCBhbm5vdGF0aW9ucyBvZiBpbnRlcmVzdCBkaXJlY3RseSBmcm9tIHRoZSBnZmYgZmlsZXMgdXNlZCBp
bgp0aGUgYWxpZ25tZW50cy4gIE1vcmUgaW4tZGVwdGggaW5mb3JtYXRpb24gZm9yIHRoZSBodW1hbiB0cmFuc2NyaXB0b21lIG1heSBiZSBleHRyYWN0ZWQgZnJvbSBiaW9tYXJ0LgoKYGBge3IgZ2Vub21lX2lucHV0LCBjYWNoZT1UUlVFfQojIyBUaGUgb2xkIHdheSBvZiBnZXR0aW5nIGdlbm9tZS9hbm5vdGF0aW9uIGRhdGEKbXRiX2dmZiA8LSAicmVmZXJlbmNlL215Y29iYWN0ZXJpdW1fdHViZXJjdWxvc2lzX2gzN3J2XzIuZ2ZmLmd6IgoKbXRiX2dlbm9tZSA8LSAicmVmZXJlbmNlL210dWJlcmN1bG9zaXNfaDM3cnZfZ2VuYmFuay5mYXN0YSIKbXRiX2NkcyA8LSAicmVmZXJlbmNlL210Yl9jZHMuZmFzdGEiCgptdGJfYW5ub3RhdGlvbnMgPC0gc20obG9hZF9nZmZfYW5ub3RhdGlvbnMobXRiX2dmZiwgdHlwZT0iZ2VuZSIpKQpyb3duYW1lcyhtdGJfYW5ub3RhdGlvbnMpIDwtIG10Yl9hbm5vdGF0aW9uc1tbIklEIl1dCmBgYAoKIyMjIERvd25sb2FkIGZyb20gbWljcm9iZXNvbmxpbmUKCmBgYHtyIG1pY3JvYmVzb25saW5lfQojIyBGaXJzdCBmaWd1cmUgb3V0IHRoZSBJRCBmb3IgdGhlIE10YiBnZW5vbWU6CmlkcyA8LSBnZXRfbWljcm9iZXNvbmxpbmVfaWRzKCIzNyIpCmhlYWQoaWRzKQojIyBNeWNvYmFjdGVyaXVtIHR1YmVyY3Vsb3NpcyBIMzdSdiBpcyB0aGUgZmlyc3QgZW50cnkgYW5kIGhhcyBpZDogODMzMzIKbXRiX21pY3JvYmVzIDwtIGxvYWRfbWljcm9iZXNvbmxpbmVfYW5ub3RhdGlvbnMoaWRzPTgzMzMyKQpgYGAKCmBgYHtyIGdlbmJhbmssIGV2YWw9RkFMU0V9CiMjIEkgbWFkZSBhIG5pZnR5IGZ1bmN0aW9uIHRvIGRvIHRoaXMgc3R1ZmY6IGxvYWRfdW5pcHJvdF9hbm5vdGF0aW9ucygpLgpsaWJyYXJ5KFVuaVByb3Qud3MpCmNvbG5hbWVzKGF2YWlsYWJsZVVuaXByb3RTcGVjaWVzKCkpCmZvdW5kIDwtIGF2YWlsYWJsZVVuaXByb3RTcGVjaWVzKHBhdHRlcm49Ik15Y29iYWN0ZXJpdW0gdHViZXJjdWxvc2lzIikKaW5mbwptdGJfdW5pcHJvdCA8LSBVbmlQcm90LndzKDEzMTIwKQptdGJfa2V5cyA8LSBrZXlzKHg9bXRiX3VuaXByb3QsIGtleXR5cGU9IlVDU0MiKQptdGJfa2V5cwpjb2x1bW5zIDwtIGMoIlVOSUdFTkUiLCAiRU5TRU1CTCIpCnJlc3VsdCA8LSBzZWxlY3QobXRiX3VuaXByb3QsIG10Yl9rZXlzLCBjb2x1bW5zLCAiRU5UUkVaX0dFTkUiKQpgYGAKCiMjIEdldHRpbmcgb250b2xvZ3kgZGF0YQoKYGBge3Igb250b2xvZ3l9Cm10Yl9nbyA8LSBsb2FkX21pY3JvYmVzb25saW5lX2dvKGlkPTgzMzMyKQpgYGAKCiMgQ3Jvc3MgcmVmZXJlbmNlIEtlaXRoJ3Mgd29yawoKSSB3YW50IHRvIGJlIGFibGUgdG8gY3Jvc3MgcmVmZXJlbmNlIHNvbWUgd29yayBmcm9tIEtlaXRoLgpIaXMgZ2VuZSBJRHMgYXJlIE1UQnh4eHgKCmBgYHtyIHhyZWZ9CmFsbF9kZSA8LSByZWFkLnRhYmxlKCJsaW1tYV9yZXN1bHQuY3N2IiwgaGVhZGVyPVRSVUUsIHNlcD0iXHQiKQoKc3VtbWFyeShtdGJfbWljcm9iZXNbWzFdXSkKbXRi
X21pY3JvYmVzIDwtIG10Yl9taWNyb2Jlc1tbMV1dCmBgYAoKYGBge3Igc2F2ZW1lfQppZiAoIWlzVFJVRShnZXQwKCJza2lwX2xvYWQiKSkpIHsKICBwYW5kZXI6OnBhbmRlcihzZXNzaW9uSW5mbygpKQogIG1lc3NhZ2UocGFzdGUwKCJUaGlzIGlzIGhwZ2x0b29scyBjb21taXQ6ICIsIGdldF9naXRfY29tbWl0KCkpKQogIHRoaXNfc2F2ZSA8LSBwYXN0ZTAoZ3N1YihwYXR0ZXJuPSJcXC5SbWQiLCByZXBsYWNlPSIiLCB4PXJtZF9maWxlKSwgIi12IiwgdmVyLCAiLnJkYS54eiIpCiAgbWVzc2FnZShwYXN0ZTAoIlNhdmluZyB0byAiLCB0aGlzX3NhdmUpKQogIHRtcCA8LSBzbShzYXZlbWUoZmlsZW5hbWU9dGhpc19zYXZlKSkKfQpgYGAK