M.tuberculosis 20180913 proteomics: Collecting annotation data.

A fresh running of all proteomics tasks
Annotation version: 20180913

A fresh running of all proteomics tasks

I think I finally worked out all(most?) of the kinks in the processing of DIA data. Thus I want to have a fresh run of all the tasks required to interpret the results.

Annotation version: 20180913

Genome annotation input

Read a gff file

In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments. More in-depth information for the human transcriptome may be extracted from biomart.

## The old way of getting genome/annotation data
mtb_gff <- "reference/mycobacterium_tuberculosis_h37rv_2.gff.gz"
mtb_genome <- "reference/mtuberculosis_h37rv_genbank.fasta"
mtb_cds <- "reference/mtb_cds.fasta"

mtb_annotations <- sm(load_gff_annotations(mtb_gff, type="gene"))
colnames(mtb_annotations) <- gsub(pattern="\\.", replacement="", x=colnames(mtb_annotations))
mtb_annotations[["description"]] <- gsub(pattern="\\+", replacement=" ",
                                         x=mtb_annotations[["description"]])
mtb_annotations[["function"]] <- gsub(pattern="\\+", replacement=" ",
                                         x=mtb_annotations[["function"]])
rownames(mtb_annotations) <- mtb_annotations[["ID"]]

Download from microbesonline

Apparently I queried the microbesonline too often and now I get an error whenever I try to use them, this disappoints me.

## I made a nifty function to do this stuff: load_uniprotws_annotations().
## It is slow, though.
mtb_microbes <- load_microbesonline_annotations(id=83332)

## The species being downloaded is: Mycobacterium tuberculosis H37Rv

## mtb_uniprot_annot <- load_uniprotws_annotations()
mtb_uniprot_annot <- sm(load_uniprot_annotations(file="reference/uniprot_3AUP000001584.txt.gz"))

Getting ontology data

mtb_go <- load_microbesonline_go(id=83332)

## The species being downloaded is: Mycobacterium tuberculosis H37Rv and is being downloaded as 83332.tab.

Some pattern matching

This little block is intended to seek out peptide sequences with the highly degenerate pattern: Y(E|D) with a varying number of amino acids between the Y and (E or D).

get_hits <- function(patterns, peptide_file, pct_limit=0.25) {
  pep_seq <- Biostrings::readAAStringSet(peptide_file)
  pep_lst <- as.data.frame(pep_seq)[["x"]]
  pos_df <- data.frame(stringsAsFactors=FALSE)
  pep_names <- names(pep_seq)
  rowname <- ""
  for (r in 1:length(pep_names)) {
    rowname <- pep_names[r]
    new_row <- vector()
    found_hits <- FALSE
    for (p in 1:length(patterns)) {
      pat <- patterns[[p]]
      pname <- names(patterns)[p]
      column <- gregexpr(pattern=pat, text=pep_lst)
      names(column) <- names(pep_seq)
      name <- names(column)[r]
      row <- column[[r]]
      len <- nchar(pep_lst[r])
      pct <- as.numeric(row) / len
      cdist <- len - as.numeric(row)
      pos_name <- glue::glue("pos_{pname}")
      num_name <- glue::glue("number_{pname}")
      nt_name <- glue::glue("nt_{pname}")
      ct_name <- glue::glue("ct_{pname}")
      pct_name <- glue::glue("pct_{pname}")
      pos_name <- glue::glue("pos_{pname}")
      ## An important caveat here:  There may be more than one value.
      new_row["rowname"] <- rowname
      number_hits <- length(as.numeric(row))
      if (as.numeric(row)[1] == -1) {
        number_hits <- "0"
      }
      new_row[num_name] <- number_hits
      new_row[pos_name] <- toString(as.numeric(row))
      if (new_row[pos_name] == "-1") {
        new_row[pos_name] <- "0"
      } else {
        found_hits <- TRUE
      }
      new_row["length"] <- len
      op <- options(warn=2)
      if (pct[1] < 0) {
        new_row[pct_name] <- "0"
      } else {
        new_row[pct_name] <- toString(pct)
      }
      options(op)
      nt_str <- ""
      ct_str <- ""
      max_pct <- 1 - pct_limit
      for (i in pct) {
        if (i < 0) {
          next
        }
        if (i < pct_limit) {
          nt_str <- paste0(nt_str, ", ", i)
        }
        if (i > max_pct) {
          ct_str <- paste0(ct_str, ", ", i)
        }
      }
      nt_str <- gsub(pattern="^\\, ", replacement="", x=nt_str)
      ct_str <- gsub(pattern="^\\, ", replacement="", x=ct_str)
      new_row[nt_name] <- nt_str
      new_row[ct_name] <- ct_str
    }
    if (isTRUE(found_hits)) {
      pos_df <- rbind(pos_df, new_row, stringsAsFactors=FALSE)
      colnames(pos_df) <- names(new_row)
    }
  } ## End for every gene
  rownames(pos_df) <- pos_df[["rowname"]]
  pos_df <- pos_df[-1, ]

  return(pos_df)
}

##patterns <- c("Y(E|D)", "Y.(E|D)", "Y..(E|D)", "Y...(E|D)", "Y....(E|D)", "Y.....(E|D)")
mtb_cds <- "reference/mtb_cds.fasta"
patterns <- c("Y..(E|D)", "Y...(E|D)")
names(patterns) <- c("two", "three")
yed_patterns <- get_hits(patterns, mtb_cds)
written <- write_xls(yed_patterns, excel="positions_by_patterns.xlsx")

## Saving to: positions_by_patterns.xlsx

Add the YED patterns to the annotation data

mtb_annotations <- merge(mtb_annotations, yed_patterns, by="row.names", all.x=TRUE)
rownames(mtb_annotations) <- mtb_annotations[["Row.names"]]
mtb_annotations <- mtb_annotations[-1, ]

Compare subset of interesting proteins vs. all

Najib and Volker requested a comparison of the distribution pattern hits of all proteins vs. the distribution in a specific subset of proteins. I don’t actually know the subset desired, so I will assume the

pe_ids <- read.table("reference/annotated_pe_genes.txt")[["V1"]]

yed_pe <- yed_patterns[pe_ids, ]

all_numbers <- as.numeric(yed_patterns[["number_two"]])
pe_numbers <- as.numeric(yed_pe[["number_two"]])

allthree_numbers <- as.numeric(yed_patterns[["number_three"]])
pethree_numbers <- as.numeric(yed_pe[["number_three"]])

all_pct <- yed_patterns[["pct_two"]]
pe_pct <- yed_pe[["pct_two"]]
all_positions <- yed_patterns[["pos_two"]]
pe_positions <- yed_pe[["pos_two"]]

sampled_numbers_two <- sample(x=all_numbers, size=1000, replace=TRUE)
summary(sampled_numbers_two)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00    1.15    2.00   13.00

pe_numbers[is.na(pe_numbers)] <- 0
summary(pe_numbers)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.329   1.000   3.000

sampled_numbers_three <- sample(x=allthree_numbers, size=1000, replace=TRUE)
summary(sampled_numbers_three)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00    1.23    2.00   11.00

pethree_numbers[is.na(pethree_numbers)] <- 0
summary(pethree_numbers)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.708   1.000   4.000

if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename=this_save))
}

## If you wish to reproduce this exact build of hpgltools, invoke the following:

## > git clone http://github.com/abelew/hpgltools.git

## > git reset 7c4477bb4fa3639cc6cf7940216e4c4b8cbee7ce

## This is hpgltools commit: Fri Oct 26 17:27:11 2018 -0400: 7c4477bb4fa3639cc6cf7940216e4c4b8cbee7ce

## Saving to 01_annotation_20180913-v20180913.rda.xz