1 Annotation version: 20171205

1.1 Genome annotation input

1.1.1 Read a gff file

In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments. More in-depth information for the human transcriptome may be extracted from biomart.

## The old way of getting genome/annotation data
mtb_gff <- "reference/mycobacterium_tuberculosis_h37rv_2.gff.gz"

mtb_genome <- "reference/mtuberculosis_h37rv_genbank.fasta"
mtb_cds <- "reference/mtb_cds.fasta"

mtb_annotations <- sm(load_gff_annotations(mtb_gff, type="gene"))
rownames(mtb_annotations) <- mtb_annotations[["ID"]]

2 A quick pattern match

This little block is intended to seek out peptide sequences with the highly degenerate pattern: Y(E|D) with a varying number of amino acids between the Y and (E or D).

mtb_cds <- "reference/mtb_cds.fasta"

get_hits <- function(patterns, peptide_file) {
  pep_seq <- Biostrings::readAAStringSet(peptide_file)
  pep_lst <- as.data.frame(pep_seq)[["x"]]
  position_df <- data.frame(row.names=names(pep_seq))
  number_df <- data.frame(row.names=names(pep_seq))
  pct_df <- data.frame(row.names=names(pep_seq))
  nt_df <- data.frame(row.names=names(pep_seq))
  ct_df <- data.frame(row.names=names(pep_seq))
  for (pat in patterns) {
    column <- gregexpr(pattern=pat, text=pep_lst)
    names(column) <- names(pep_seq)
    for (r in 1:length(column)) {
      name <- names(column)[r]
      row <- column[[r]]
      len <- nchar(pep_lst[r])
      pct <- as.numeric(row) / len
      ## An important caveat here:  There may be more than one value.
      op <- options(warn=2)
      if (pct[1] < 0) {
        pct_df[r, pat] <- -1
      } else {
        pct_df[r, pat] <- toString(pct)
      }
      options(op)
      nt_str <- ""
      ct_str <- ""
      for (i in pct) {
        if (i < 0) {
          next
        }
        if (i < 0.1) {
          nt_str <- paste0(nt_str, ", ", i)
        }
        if (i > 0.9) {
          ct_str <- paste0(ct_str, ", ", i)
        }
      }
      nt_str <- gsub(pattern="^\\, ", replacement="", x=nt_str)
      ct_str <- gsub(pattern="^\\, ", replacement="", x=ct_str)
      nt_df[r, pat] <- nt_str
      ct_df[r, pat] <- ct_str
      position_df[r, pat] <- toString(as.numeric(row))
      number_df[r, pat] <- length(as.numeric(row))
    }
  }
  retlst <- list(
    "numbers" = number_df,
    "pct" = pct_df,
    "nterminals" = nt_df,
    "cterminals" = ct_df,
    "positions" = position_df)
  return(retlst)
}
patterns <- c("Y(E|D)", "Y.(E|D)", "Y..(E|D)", "Y...(E|D)", "Y....(E|D)", "Y.....(E|D)")
stuff <- get_hits(patterns, mtb_cds)
write.csv(file="excel/positions_by_pattern.csv", stuff[["positions"]])
write.csv(file="excel/number_by_pattern.csv", stuff[["numbers"]])
write.csv(file="excel/pct_by_pattern.csv", stuff[["pct"]])
write.csv(file="excel/nterminals.csv", stuff[["nterminals"]])
write.csv(file="excel/cterminals.csv", stuff[["cterminals"]])

2.0.1 Download from microbesonline

## First figure out the ID for the Mtb genome:
ids <- get_microbesonline_ids("37")
## Mycobacterium tuberculosis H37Rv is 83332
mtb_microbes <- load_microbesonline_annotations(ids=83332)
mtb_go <- load_microbesonline_go(id=83332)
mtb_uniprotws <- load_uniprotws_annotations(species="Mycobacterium tuberculosis")
## More than 1 species was returned, they follow; the first was chosen arbitrarily.
##   taxon ID
## 1   419947
## 2   443149
## 3   443150
## 4   652616
## 5   336982
## 6   478434
## 7     1773
##                                                        Species name
## 1            Mycobacterium tuberculosis (strain ATCC 25177 / H37Ra)
## 2                      Mycobacterium tuberculosis (strain CCDC5079)
## 3                      Mycobacterium tuberculosis (strain CCDC5180)
## 4 Mycobacterium tuberculosis (strain ATCC 35801 / TMC 107 / Erdman)
## 5                           Mycobacterium tuberculosis (strain F11)
## 6                Mycobacterium tuberculosis (strain KZN 1435 / MDR)
## 7                                        Mycobacterium tuberculosis
## Getting mapping data for A5U205 ... and P_GI
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename=this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 7de4503f6bb5724c28cce24af5dbee22bb1c0cae
## R> packrat::restore()
## This is hpgltools commit: Thu Apr 12 22:08:53 2018 -0400: 7de4503f6bb5724c28cce24af5dbee22bb1c0cae
## Saving to 01_annotation-v20171205.rda.xz
LS0tCnRpdGxlOiAiTS50dWJlcmN1bG9zaXM6IEFubm90YXRpb24gZGF0YSBmb3IgcHJvdGVvbWljcyBhbmFseXNlcyAodW5pcHJvdCkuIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiBodG1sX2RvY3VtZW50OgogIGNvZGVfZG93bmxvYWQ6IHRydWUKICBjb2RlX2ZvbGRpbmc6IHNob3cKICBmaWdfY2FwdGlvbjogdHJ1ZQogIGZpZ19oZWlnaHQ6IDcKICBmaWdfd2lkdGg6IDcKICBoaWdobGlnaHQ6IGRlZmF1bHQKICBrZWVwX21kOiBmYWxzZQogIG1vZGU6IHNlbGZjb250YWluZWQKICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICBzZWxmX2NvbnRhaW5lZDogdHJ1ZQogIHRoZW1lOiByZWFkYWJsZQogIHRvYzogdHJ1ZQogIHRvY19mbG9hdDoKICAgIGNvbGxhcHNlZDogZmFsc2UKICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCi0tLQoKPHN0eWxlPgogIGJvZHkgLm1haW4tY29udGFpbmVyIHsKICAgIG1heC13aWR0aDogMTYwMHB4OwogIH0KPC9zdHlsZT4KCmBgYHtyIG9wdGlvbnMsIGluY2x1ZGU9RkFMU0V9CmlmICghaXNUUlVFKGdldDAoInNraXBfbG9hZCIpKSkgewogIGxpYnJhcnkoaHBnbHRvb2xzKQogIHR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgifi9ocGdsdG9vbHMiKQogIGtuaXRyOjpvcHRzX2tuaXQkc2V0KHByb2dyZXNzPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZT1UUlVFLAogICAgICAgICAgICAgICAgICAgICAgIHdpZHRoPTkwLAogICAgICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKICBrbml0cjo6b3B0c19jaHVuayRzZXQoZXJyb3I9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTgsCiAgICAgICAgICAgICAgICAgICAgICAgIGZpZy5oZWlnaHQ9OCwKICAgICAgICAgICAgICAgICAgICAgICAgZHBpPTk2KQogIG9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFLAogICAgICAgICAgICAgICAgICAgICAgICAga25pdHIuZHVwbGljYXRlLmxhYmVsPSJhbGxvdyIpCiAgZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZT0xMCkpCiAgc2V0LnNlZWQoMSkKICB2ZXIgPC0gIjIwMTcxMjA1IgogIHByZXZpb3VzX2ZpbGUgPC0gImluZGV4LlJtZCIKCiAgdG1wIDwtIHRyeShzbShsb2FkbWUoZmlsZW5hbWU9cGFzdGUwKGdzdWIocGF0dGVybj0iXFwuUm1kIiwgcmVwbGFjZT0iIiwgeD1wcmV2aW91c19maWxlKSwgIi12IiwgdmVyLCAiLnJkYS54eiIpKSkpCiAgcm1kX2ZpbGUgPC0gIjAxX2Fubm90YXRpb24uUm1kIgp9CmBgYAoKIyBBbm5vdGF0aW9uIHZlcnNpb246IGByIHZlcmAKCiMjIEdlbm9tZSBhbm5vdGF0aW9uIGlucHV0CgojIyMgUmVhZCBhIGdmZiBmaWxlCgpJbiBjb250cmFzdCwgaXQgaXMgcG9zc2libGUgdG8gbG9hZCBtb3N0IGFubm90YXRpb25zIG9mIGludGVyZXN0IGRpcmVjdGx5IGZyb20gdGhlIGdmZiBmaWxlcyB1c2VkIGluCnRoZSBhbGlnbm1lbnRzLiAgTW9yZSBpbi1kZXB0aCBpbmZvcm1hdGlvbiBmb3IgdGhlIGh1bWFuIHRyYW5zY3JpcHRvbWUgbWF5IGJlIGV4dHJhY3RlZCBmcm9tIGJpb21hcnQuCgpgYGB7ciBnZW5vbWVfaW5wdXQsIGNhY2hlPVRSVUV9CiMjIFRoZSBvbGQgd2F5IG9mIGdldHRpbmcgZ2Vub21lL2Fubm90YXRpb24gZGF0YQptdGJfZ2ZmIDwtICJyZWZlcmVuY2UvbXljb2JhY3Rlcml1bV90dWJlcmN1bG9zaXNfaDM3cnZfMi5nZmYuZ3oiCgptdGJfZ2Vub21lIDwtICJyZWZlcmVuY2UvbXR1YmVyY3Vsb3Npc19oMzdydl9nZW5iYW5rLmZhc3RhIgptdGJfY2RzIDwtICJyZWZlcmVuY2UvbXRiX2Nkcy5mYXN0YSIKCm10Yl9hbm5vdGF0aW9ucyA8LSBzbShsb2FkX2dmZl9hbm5vdGF0aW9ucyhtdGJfZ2ZmLCB0eXBlPSJnZW5lIikpCnJvd25hbWVzKG10Yl9hbm5vdGF0aW9ucykgPC0gbXRiX2Fubm90YXRpb25zW1siSUQiXV0KYGBgCgojIEEgcXVpY2sgcGF0dGVybiBtYXRjaAoKVGhpcyBsaXR0bGUgYmxvY2sgaXMgaW50ZW5kZWQgdG8gc2VlayBvdXQgcGVwdGlkZSBzZXF1ZW5jZXMgd2l0aCB0aGUgaGlnaGx5IGRlZ2VuZXJhdGUgcGF0dGVybjoKWShFfEQpIHdpdGggYSB2YXJ5aW5nIG51bWJlciBvZiBhbWlubyBhY2lkcyBiZXR3ZWVuIHRoZSBZIGFuZCAoRSBvciBEKS4KCmBgYHtyIHl4eGR9Cm10Yl9jZHMgPC0gInJlZmVyZW5jZS9tdGJfY2RzLmZhc3RhIgoKZ2V0X2hpdHMgPC0gZnVuY3Rpb24ocGF0dGVybnMsIHBlcHRpZGVfZmlsZSkgewogIHBlcF9zZXEgPC0gQmlvc3RyaW5nczo6cmVhZEFBU3RyaW5nU2V0KHBlcHRpZGVfZmlsZSkKICBwZXBfbHN0IDwtIGFzLmRhdGEuZnJhbWUocGVwX3NlcSlbWyJ4Il1dCiAgcG9zaXRpb25fZGYgPC0gZGF0YS5mcmFtZShyb3cubmFtZXM9bmFtZXMocGVwX3NlcSkpCiAgbnVtYmVyX2RmIDwtIGRhdGEuZnJhbWUocm93Lm5hbWVzPW5hbWVzKHBlcF9zZXEpKQogIHBjdF9kZiA8LSBkYXRhLmZyYW1lKHJvdy5uYW1lcz1uYW1lcyhwZXBfc2VxKSkKICBudF9kZiA8LSBkYXRhLmZyYW1lKHJvdy5uYW1lcz1uYW1lcyhwZXBfc2VxKSkKICBjdF9kZiA8LSBkYXRhLmZyYW1lKHJvdy5uYW1lcz1uYW1lcyhwZXBfc2VxKSkKICBmb3IgKHBhdCBpbiBwYXR0ZXJucykgewogICAgY29sdW1uIDwtIGdyZWdleHByKHBhdHRlcm49cGF0LCB0ZXh0PXBlcF9sc3QpCiAgICBuYW1lcyhjb2x1bW4pIDwtIG5hbWVzKHBlcF9zZXEpCiAgICBmb3IgKHIgaW4gMTpsZW5ndGgoY29sdW1uKSkgewogICAgICBuYW1lIDwtIG5hbWVzKGNvbHVtbilbcl0KICAgICAgcm93IDwtIGNvbHVtbltbcl1dCiAgICAgIGxlbiA8LSBuY2hhcihwZXBfbHN0W3JdKQogICAgICBwY3QgPC0gYXMubnVtZXJpYyhyb3cpIC8gbGVuCiAgICAgICMjIEFuIGltcG9ydGFudCBjYXZlYXQgaGVyZTogIFRoZXJlIG1heSBiZSBtb3JlIHRoYW4gb25lIHZhbHVlLgogICAgICBvcCA8LSBvcHRpb25zKHdhcm49MikKICAgICAgaWYgKHBjdFsxXSA8IDApIHsKICAgICAgICBwY3RfZGZbciwgcGF0XSA8LSAtMQogICAgICB9IGVsc2UgewogICAgICAgIHBjdF9kZltyLCBwYXRdIDwtIHRvU3RyaW5nKHBjdCkKICAgICAgfQogICAgICBvcHRpb25zKG9wKQogICAgICBudF9zdHIgPC0gIiIKICAgICAgY3Rfc3RyIDwtICIiCiAgICAgIGZvciAoaSBpbiBwY3QpIHsKICAgICAgICBpZiAoaSA8IDApIHsKICAgICAgICAgIG5leHQKICAgICAgICB9CiAgICAgICAgaWYgKGkgPCAwLjEpIHsKICAgICAgICAgIG50X3N0ciA8LSBwYXN0ZTAobnRfc3RyLCAiLCAiLCBpKQogICAgICAgIH0KICAgICAgICBpZiAoaSA+IDAuOSkgewogICAgICAgICAgY3Rfc3RyIDwtIHBhc3RlMChjdF9zdHIsICIsICIsIGkpCiAgICAgICAgfQogICAgICB9CiAgICAgIG50X3N0ciA8LSBnc3ViKHBhdHRlcm49Il5cXCwgIiwgcmVwbGFjZW1lbnQ9IiIsIHg9bnRfc3RyKQogICAgICBjdF9zdHIgPC0gZ3N1YihwYXR0ZXJuPSJeXFwsICIsIHJlcGxhY2VtZW50PSIiLCB4PWN0X3N0cikKICAgICAgbnRfZGZbciwgcGF0XSA8LSBudF9zdHIKICAgICAgY3RfZGZbciwgcGF0XSA8LSBjdF9zdHIKICAgICAgcG9zaXRpb25fZGZbciwgcGF0XSA8LSB0b1N0cmluZyhhcy5udW1lcmljKHJvdykpCiAgICAgIG51bWJlcl9kZltyLCBwYXRdIDwtIGxlbmd0aChhcy5udW1lcmljKHJvdykpCiAgICB9CiAgfQogIHJldGxzdCA8LSBsaXN0KAogICAgIm51bWJlcnMiID0gbnVtYmVyX2RmLAogICAgInBjdCIgPSBwY3RfZGYsCiAgICAibnRlcm1pbmFscyIgPSBudF9kZiwKICAgICJjdGVybWluYWxzIiA9IGN0X2RmLAogICAgInBvc2l0aW9ucyIgPSBwb3NpdGlvbl9kZikKICByZXR1cm4ocmV0bHN0KQp9CnBhdHRlcm5zIDwtIGMoIlkoRXxEKSIsICJZLihFfEQpIiwgIlkuLihFfEQpIiwgIlkuLi4oRXxEKSIsICJZLi4uLihFfEQpIiwgIlkuLi4uLihFfEQpIikKc3R1ZmYgPC0gZ2V0X2hpdHMocGF0dGVybnMsIG10Yl9jZHMpCndyaXRlLmNzdihmaWxlPSJleGNlbC9wb3NpdGlvbnNfYnlfcGF0dGVybi5jc3YiLCBzdHVmZltbInBvc2l0aW9ucyJdXSkKd3JpdGUuY3N2KGZpbGU9ImV4Y2VsL251bWJlcl9ieV9wYXR0ZXJuLmNzdiIsIHN0dWZmW1sibnVtYmVycyJdXSkKd3JpdGUuY3N2KGZpbGU9ImV4Y2VsL3BjdF9ieV9wYXR0ZXJuLmNzdiIsIHN0dWZmW1sicGN0Il1dKQp3cml0ZS5jc3YoZmlsZT0iZXhjZWwvbnRlcm1pbmFscy5jc3YiLCBzdHVmZltbIm50ZXJtaW5hbHMiXV0pCndyaXRlLmNzdihmaWxlPSJleGNlbC9jdGVybWluYWxzLmNzdiIsIHN0dWZmW1siY3Rlcm1pbmFscyJdXSkKYGBgCgojIyMgRG93bmxvYWQgZnJvbSBtaWNyb2Jlc29ubGluZQoKYGBge3IgbWljcm9iZXNvbmxpbmUsIGV2YWw9RkFMU0V9CiMjIEZpcnN0IGZpZ3VyZSBvdXQgdGhlIElEIGZvciB0aGUgTXRiIGdlbm9tZToKaWRzIDwtIGdldF9taWNyb2Jlc29ubGluZV9pZHMoIjM3IikKIyMgTXljb2JhY3Rlcml1bSB0dWJlcmN1bG9zaXMgSDM3UnYgaXMgODMzMzIKbXRiX21pY3JvYmVzIDwtIGxvYWRfbWljcm9iZXNvbmxpbmVfYW5ub3RhdGlvbnMoaWRzPTgzMzMyKQptdGJfZ28gPC0gbG9hZF9taWNyb2Jlc29ubGluZV9nbyhpZD04MzMzMikKYGBgCgpgYGB7ciB1bmlwcm90d3N9Cm10Yl91bmlwcm90d3MgPC0gbG9hZF91bmlwcm90d3NfYW5ub3RhdGlvbnMoc3BlY2llcz0iTXljb2JhY3Rlcml1bSB0dWJlcmN1bG9zaXMiKQpgYGAKCmBgYHtyIHNhdmVtZX0KaWYgKCFpc1RSVUUoZ2V0MCgic2tpcF9sb2FkIikpKSB7CiAgcGFuZGVyOjpwYW5kZXIoc2Vzc2lvbkluZm8oKSkKICBtZXNzYWdlKHBhc3RlMCgiVGhpcyBpcyBocGdsdG9vbHMgY29tbWl0OiAiLCBnZXRfZ2l0X2NvbW1pdCgpKSkKICB0aGlzX3NhdmUgPC0gcGFzdGUwKGdzdWIocGF0dGVybj0iXFwuUm1kIiwgcmVwbGFjZT0iIiwgeD1ybWRfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKQogIG1lc3NhZ2UocGFzdGUwKCJTYXZpbmcgdG8gIiwgdGhpc19zYXZlKSkKICB0bXAgPC0gc20oc2F2ZW1lKGZpbGVuYW1lPXRoaXNfc2F2ZSkpCn0KYGBgCg==