1 A fresh running of all proteomics tasks

I think I finally worked out all(most?) of the kinks in the processing of DIA data. Thus I want to have a fresh run of all the tasks required to interpret the results.

2 Annotation version: 20190718

2.2 Getting ontology data

## The species being downloaded is: Mycobacterium tuberculosis H37Rv and is being downloaded as 83332.tab.

2.3 Some pattern matching

This little block is intended to seek out peptide sequences with the highly degenerate pattern: Y(E|D) with a varying number of amino acids between the Y and (E or D).

get_hits <- function(patterns, peptide_file, pct_limit=0.25) {
  pep_seq <- Biostrings::readAAStringSet(peptide_file)
  pep_lst <- as.data.frame(pep_seq)[["x"]]
  pos_df <- data.frame(stringsAsFactors=FALSE)
  pep_names <- names(pep_seq)
  rowname <- ""
  for (r in 1:length(pep_names)) {
    rowname <- pep_names[r]
    new_row <- vector()
    found_hits <- FALSE
    for (p in 1:length(patterns)) {
      pat <- patterns[[p]]
      pname <- names(patterns)[p]
      column <- gregexpr(pattern=pat, text=pep_lst)
      names(column) <- names(pep_seq)
      name <- names(column)[r]
      row <- column[[r]]
      len <- nchar(pep_lst[r])
      pct <- as.numeric(row) / len
      cdist <- len - as.numeric(row)
      pos_name <- glue::glue("pos_{pname}")
      num_name <- glue::glue("number_{pname}")
      nt_name <- glue::glue("nt_{pname}")
      ct_name <- glue::glue("ct_{pname}")
      pct_name <- glue::glue("pct_{pname}")
      pos_name <- glue::glue("pos_{pname}")
      ## An important caveat here:  There may be more than one value.
      new_row["rowname"] <- rowname
      number_hits <- length(as.numeric(row))
      if (as.numeric(row)[1] == -1) {
        number_hits <- "0"
      }
      new_row[num_name] <- number_hits
      new_row[pos_name] <- toString(as.numeric(row))
      if (new_row[pos_name] == "-1") {
        new_row[pos_name] <- "0"
      } else {
        found_hits <- TRUE
      }
      new_row["length"] <- len
      op <- options(warn=2)
      if (pct[1] < 0) {
        new_row[pct_name] <- "0"
      } else {
        new_row[pct_name] <- toString(pct)
      }
      options(op)
      nt_str <- ""
      ct_str <- ""
      max_pct <- 1 - pct_limit
      for (i in pct) {
        if (i < 0) {
          next
        }
        if (i < pct_limit) {
          nt_str <- paste0(nt_str, ", ", i)
        }
        if (i > max_pct) {
          ct_str <- paste0(ct_str, ", ", i)
        }
      }
      nt_str <- gsub(pattern="^\\, ", replacement="", x=nt_str)
      ct_str <- gsub(pattern="^\\, ", replacement="", x=ct_str)
      new_row[nt_name] <- nt_str
      new_row[ct_name] <- ct_str
    }
    if (isTRUE(found_hits)) {
      pos_df <- rbind(pos_df, new_row, stringsAsFactors=FALSE)
      colnames(pos_df) <- names(new_row)
    }
  } ## End for every gene
  rownames(pos_df) <- pos_df[["rowname"]]
  pos_df <- pos_df[-1, ]

  return(pos_df)
}

##patterns <- c("Y(E|D)", "Y.(E|D)", "Y..(E|D)", "Y...(E|D)", "Y....(E|D)", "Y.....(E|D)")
mtb_cds <- "reference/mtb_cds.fasta"
patterns <- c("Y..(E|D)", "Y...(E|D)")
names(patterns) <- c("two", "three")
yed_patterns <- get_hits(patterns, mtb_cds)
written <- write_xls(yed_patterns, excel="positions_by_patterns.xlsx")
## Saving to: positions_by_patterns.xlsx
## Note: zip::zip() is deprecated, please use zip::zipr() instead

2.6 Compare subset of interesting proteins vs. all

Najib and Volker requested a comparison of the distribution pattern hits of all proteins vs. the distribution in a specific subset of proteins. I don’t actually know the subset desired, so I will assume the

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00    1.17    2.00   13.00
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.329   1.000   3.000
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     1.0     1.2     2.0    10.0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.708   1.000   4.000

4 Gathering parameters

In this first block, I will set a couple of variables and source a file containing the parameters for the rest of the script.

## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 8ca465bb9928ffe95082f64aed9cf64799bbf8e6
## This is hpgltools commit: Wed Jul 31 16:40:59 2019 -0400: 8ca465bb9928ffe95082f64aed9cf64799bbf8e6
## Saving to 01_preprocessing_20190718-v20190718.rda.xz
---
title: "M.tuberculosis 20190718 proteomics: Preprocessing."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
  rmdformats::readthedown:
    code_download: true
    code_folding: show
    df_print: paged
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    width: 300
    keep_md: false
    mode: selfcontained
    toc_float: true
  BiocStyle::html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    keep_md: false
    mode: selfcontained
    toc_float: true
  html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    keep_md: false
    mode: selfcontained
    number_sections: true
    self_contained: true
    theme: readable
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: false
---

<style type="text/css">
body, td {
  font-size: 16px;
}
code.r{
  font-size: 16px;
}
pre {
 font-size: 16px
}
</style>

```{r options, include=FALSE}
library("hpgltools")
tt <- devtools::load_all("/data/hpgltools")
knitr::opts_knit$set(width=120,
                     progress=TRUE,
                     verbose=TRUE,
                     echo=TRUE)
knitr::opts_chunk$set(error=TRUE,
                      dpi=96)
old_options <- options(digits=4,
                       stringsAsFactors=FALSE,
                       knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
ver <- "20190718"
previous_file <- "index.Rmd"

tmp <- try(sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz"))))
rmd_file <- paste0("01_preprocessing_", ver, ".Rmd")
```

```{bash parameters, eval=FALSE}
export VERSION="20190718"
source "parameters/${VERSION}_settings.sh"
```


A fresh running of all proteomics tasks
=======================================

I think I finally worked out all(most?) of the kinks in the processing of DIA data.
Thus I want to have a fresh run of all the tasks required to interpret the results.

# Annotation version: `r ver`

## Genome annotation input

### Read a gff file

In contrast, it is possible to load most annotations of interest directly from the gff files used in
the alignments.  More in-depth information for the human transcriptome may be extracted from biomart.

```{r genome_input}
## The old way of getting genome/annotation data
mtb_gff <- "reference/mycobacterium_tuberculosis_h37rv_2.gff.gz"
mtb_genome <- "reference/mtuberculosis_h37rv_genbank.fasta"
mtb_cds <- "reference/mtb_cds.fasta"

mtb_annotations <- sm(load_gff_annotations(mtb_gff, type="gene"))
colnames(mtb_annotations) <- gsub(pattern="\\.", replacement="", x=colnames(mtb_annotations))
mtb_annotations[["description"]] <- gsub(pattern="\\+", replacement=" ",
                                         x=mtb_annotations[["description"]])
mtb_annotations[["function"]] <- gsub(pattern="\\+", replacement=" ",
                                      x=mtb_annotations[["function"]])
rownames(mtb_annotations) <- mtb_annotations[["ID"]]
```

### Download from microbesonline

Apparently I queried the microbesonline too often and now I get an error
whenever I try to use them, this disappoints me.

```{r genbank}
## I made a nifty function to do this stuff: load_uniprotws_annotations().
## It is slow, though.
mtb_microbes <- load_microbesonline_annotations(id=83332)
mtb_uniprot_annot <- sm(load_uniprot_annotations(file="reference/uniprot_3AUP000001584.txt.gz"))
```

## Getting ontology data

```{r ontology}
mtb_go <- load_microbesonline_go(id=83332)
```

## Some pattern matching

This little block is intended to seek out peptide sequences with the highly degenerate pattern:
Y(E|D) with a varying number of amino acids between the Y and (E or D).

```{r pattern}
get_hits <- function(patterns, peptide_file, pct_limit=0.25) {
  pep_seq <- Biostrings::readAAStringSet(peptide_file)
  pep_lst <- as.data.frame(pep_seq)[["x"]]
  pos_df <- data.frame(stringsAsFactors=FALSE)
  pep_names <- names(pep_seq)
  rowname <- ""
  for (r in 1:length(pep_names)) {
    rowname <- pep_names[r]
    new_row <- vector()
    found_hits <- FALSE
    for (p in 1:length(patterns)) {
      pat <- patterns[[p]]
      pname <- names(patterns)[p]
      column <- gregexpr(pattern=pat, text=pep_lst)
      names(column) <- names(pep_seq)
      name <- names(column)[r]
      row <- column[[r]]
      len <- nchar(pep_lst[r])
      pct <- as.numeric(row) / len
      cdist <- len - as.numeric(row)
      pos_name <- glue::glue("pos_{pname}")
      num_name <- glue::glue("number_{pname}")
      nt_name <- glue::glue("nt_{pname}")
      ct_name <- glue::glue("ct_{pname}")
      pct_name <- glue::glue("pct_{pname}")
      pos_name <- glue::glue("pos_{pname}")
      ## An important caveat here:  There may be more than one value.
      new_row["rowname"] <- rowname
      number_hits <- length(as.numeric(row))
      if (as.numeric(row)[1] == -1) {
        number_hits <- "0"
      }
      new_row[num_name] <- number_hits
      new_row[pos_name] <- toString(as.numeric(row))
      if (new_row[pos_name] == "-1") {
        new_row[pos_name] <- "0"
      } else {
        found_hits <- TRUE
      }
      new_row["length"] <- len
      op <- options(warn=2)
      if (pct[1] < 0) {
        new_row[pct_name] <- "0"
      } else {
        new_row[pct_name] <- toString(pct)
      }
      options(op)
      nt_str <- ""
      ct_str <- ""
      max_pct <- 1 - pct_limit
      for (i in pct) {
        if (i < 0) {
          next
        }
        if (i < pct_limit) {
          nt_str <- paste0(nt_str, ", ", i)
        }
        if (i > max_pct) {
          ct_str <- paste0(ct_str, ", ", i)
        }
      }
      nt_str <- gsub(pattern="^\\, ", replacement="", x=nt_str)
      ct_str <- gsub(pattern="^\\, ", replacement="", x=ct_str)
      new_row[nt_name] <- nt_str
      new_row[ct_name] <- ct_str
    }
    if (isTRUE(found_hits)) {
      pos_df <- rbind(pos_df, new_row, stringsAsFactors=FALSE)
      colnames(pos_df) <- names(new_row)
    }
  } ## End for every gene
  rownames(pos_df) <- pos_df[["rowname"]]
  pos_df <- pos_df[-1, ]

  return(pos_df)
}

##patterns <- c("Y(E|D)", "Y.(E|D)", "Y..(E|D)", "Y...(E|D)", "Y....(E|D)", "Y.....(E|D)")
mtb_cds <- "reference/mtb_cds.fasta"
patterns <- c("Y..(E|D)", "Y...(E|D)")
names(patterns) <- c("two", "three")
yed_patterns <- get_hits(patterns, mtb_cds)
written <- write_xls(yed_patterns, excel="positions_by_patterns.xlsx")
```

## Random query from Volker

What proteins do not have glycine?

Glycine : Gly : G

```{r glycine}
pep_seq <- Biostrings::readAAStringSet(mtb_cds)
pep_lst <- as.data.frame(pep_seq)[["x"]]
pos_df <- data.frame(stringsAsFactors=FALSE)
pep_names <- names(pep_seq)
pep_df <- as.data.frame(pep_seq)
glycine_hits <- grepl(pattern="G", x=pep_df[["x"]])
rownames(pep_df)[!glycine_hits]
```

## Add the YED patterns to the annotation data

```{r yed_added}
mtb_annotations <- merge(mtb_annotations, yed_patterns, by="row.names", all.x=TRUE)
rownames(mtb_annotations) <- mtb_annotations[["Row.names"]]
mtb_annotations <- mtb_annotations[-1, ]
```

## Compare subset of interesting proteins vs. all

Najib and Volker requested a comparison of the distribution pattern hits of all
proteins vs. the distribution in a specific subset of proteins.  I don't
actually know the subset desired, so I will assume the

```{r pe_testing}
pe_ids <- read.table("reference/annotated_pe_genes.txt")[["V1"]]

yed_pe <- yed_patterns[pe_ids, ]

all_numbers <- as.numeric(yed_patterns[["number_two"]])
pe_numbers <- as.numeric(yed_pe[["number_two"]])

allthree_numbers <- as.numeric(yed_patterns[["number_three"]])
pethree_numbers <- as.numeric(yed_pe[["number_three"]])

all_pct <- yed_patterns[["pct_two"]]
pe_pct <- yed_pe[["pct_two"]]
all_positions <- yed_patterns[["pos_two"]]
pe_positions <- yed_pe[["pos_two"]]

sampled_numbers_two <- sample(x=all_numbers, size=1000, replace=TRUE)
summary(sampled_numbers_two)
pe_numbers[is.na(pe_numbers)] <- 0
summary(pe_numbers)

sampled_numbers_three <- sample(x=allthree_numbers, size=1000, replace=TRUE)
summary(sampled_numbers_three)
pethree_numbers[is.na(pethree_numbers)] <- 0
summary(pethree_numbers)
```

# Read the sample sheet and examine the new raw data

```{r read_samples, eval=FALSE}
sample_sheet <- glue::glue("sample_sheets/Mtb_dia_samples_{ver}.xlsx")
savefile <- glue::glue("mzxml_dia_data_{ver}.rda")

if (file.exists(savefile)) {
  load(savefile)
} else {
  mzxml_data <- extract_msraw_data(sample_sheet, parallel=FALSE,
                                   allow_window_overlap=FALSE,
                                   file_column="filename",
                                   savefile=savefile)

  mzml_data <- extract_msraw_data(sample_sheet, parallel=FALSE,
                                  format="mzML",
                                  allow_window_overlap=FALSE,
                                  file_column="mzmlfile",
                                  savefile="testing.rda")
}

intensity_boxplot <- plot_mzxml_boxplot(mzxml_data)
pp(file=glue::glue("images/dia_mzxml_intensities-v{ver}.png"), image=intensity_boxplot)

retention_boxplot <- plot_mzxml_boxplot(mzxml_data, table="scans", column="peakscount")
pp(file=glue::glue("images/dia_mzxml_retention-v{ver}.png"), image=retention_boxplot)

mz_boxplot <- plot_mzxml_boxplot(mzxml_data, table="scans", column="basepeakmz")
pp(file=glue::glue("images/dia_mzxml_mzbase-v{ver}.png"), image=mz_boxplot)

scanintensity_boxplot <- plot_mzxml_boxplot(mzxml_data, table="scans", column="basepeakintensity")
pp(file=glue::glue("images/dia_mzxml_scanintensity-v{ver}.png"), image=scanintensity_boxplot)

##intensity_wrt_mz <- plot_intensity_mz(mzxml_data, x_scale="log", y_scale="log")
##pp(file=paste0("images/dia_intensity_wrt_mz_dia-v{ver}.png"), image=intensity_wrt_mz)
```

# Gathering parameters

In this first block, I will set a couple of variables and source a file
containing the parameters for the rest of the script.

```{r saveme}
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename=this_save))
}
```
