1 Rework DIA-Umpire to feed OpenSWATH.

In the following blocks I use DIA-Umpire to create transition libraries for OpenSWATH, then run OpenSWATH and score the runs.

1.5 Combine the statistics

## Rerunning because writing the file failed.
## Build the SpecLib library files from the iProphet peptide identifications.
## NOTE(review): -cP0.4237 is presumably the probability cutoff derived from Mayu;
## the value recorded beside the Mayu extraction step is 0.43291 -- confirm which is intended.
spectrast \
    -cNSpecLib -cICID-QTOF \
    -cf "Protein! ~ DECOY_" \
    -cP0.4237 \
    -c_IRTreference/irt.txt \
    -c_IRR results/05_dia_umpire_prophet/iProphet.pep.xml

## Derive SpecLib_cons from SpecLib.splib (presumably the consensus library, per -cAC; confirm).
spectrast \
    -cNSpecLib_cons \
    -cICID-QTOF \
    -cAC SpecLib.splib

## Convert SpecLib_cons.sptxt into an openswath-style transition table,
## using the acquisition window definitions passed via -w.
spectrast2tsv.py \
    -l 350,2000 \
    -s b,y \
    -x 1,2 \
    -o 6 \
    -n 6 \
    -p 0.05 \
    -d -e \
    -k openswath \
    -w windows/2018_0817BrikenTrypsinDIA19.txt \
    -a SpecLib_cons_openswath.tsv \
    SpecLib_cons.sptxt

## Convert the transition table from tsv to TraML.
TargetedFileConverter \
    -in SpecLib_cons_openswath.tsv \
    -in_type tsv \
    -out SpecLib_cons_openswath.TraML \
    -out_type TraML

## Generate decoys with the shuffle method, writing SpecLib_cons_openswath_decoy.TraML.
## The commented options below are left disabled.
OpenSwathDecoyGenerator \
    -in SpecLib_cons_openswath.TraML \
    -out SpecLib_cons_openswath_decoy.TraML \
    -method shuffle
##    -exclude_similar \
##    -similarity_threshold 0.05 \
##    -identity_threshold 0.7

## Write the TraML produced by OpenSwathDecoyGenerator back out as tsv.
TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.tsv \
    -out_type tsv

## Write the same TraML as pqp; this is the file passed to OpenSwathWorkflow via -tr.
TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.pqp \
    -out_type pqp

export VERSION=${VERSION:-20190327}
echo "Loading environment modules and parameters for version: ${VERSION}."
## NOTE(review): the settings file is assumed to define SWATH_OUTDIR, PYPROPHET_OUTDIR,
## TRIC_OUTDIR, MZ_WINDOWS, DDA_METHOD and SEARCH_METHOD, all of which are used below.
source "parameters/${VERSION}_settings.sh"

echo "Invoking the OpenSwathWorkflow using local comet-derived transitions."
type="diaumpire"
input_type="mzXML"
export TRANSITION_PREFIX="SpecLib_cons_openswath_decoy"
echo "Checking in, the transition library is: ${TRANSITION_PREFIX}.pqp"
base_mzxmldir="results/01${input_type}/dia/${VERSION}"
## The -name pattern is quoted so the shell cannot expand it against files in the
## current directory before find sees it.
swath_inputs=$(/usr/bin/find "${base_mzxmldir}" -name "*.${input_type}" -print | sort)
echo "Checking in, the inputs are: ${swath_inputs}"
mkdir -p "${SWATH_OUTDIR}_${type}"
pypdir="${PYPROPHET_OUTDIR}_${type}"
mkdir -p "${pypdir}"
## Word splitting of ${swath_inputs} is intentional: one path per iteration.
for input in ${swath_inputs}
do
    name=$(basename "${input}" ".${input_type}")
    echo "Starting openswath run, library type ${type} for ${name} using ${MZ_WINDOWS} windows at $(date)."
    swath_output_prefix="${SWATH_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    pyprophet_output_prefix="${PYPROPHET_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    echo "Deleting previous swath output file: ${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.tsv"
    ## Arguments shared by both invocations; only the output option differs.
    swath_args=(
        -in "${input}"
        -force
        -sort_swath_maps
        -min_upper_edge_dist 1
        -mz_correction_function "quadratic_regression_delta_ppm"
        -Scoring:TransitionGroupPicker:background_subtraction "original"
        -Scoring:stop_report_after_feature "5"
        -swath_windows_file "windows/openswath_${name}.txt"
        -tr "${TRANSITION_PREFIX}.pqp"
    )
    ## The workflow is run once per output format (tsv, then osw).
    OpenSwathWorkflow "${swath_args[@]}" -out_tsv "${swath_output_prefix}.tsv"
    OpenSwathWorkflow "${swath_args[@]}" -out_osw "${swath_output_prefix}.osw"
    ##2>"${swath_output_prefix}_osw.log" 1>&2
done
## Merge the per-run .osw files, score the merged file, and export the result.
## The directories are taken from the settings variables rather than from loop
## leftovers, so they are defined even when the loop above processed no inputs.
swath_out="${SWATH_OUTDIR}_${type}"
pyprophet_out="${PYPROPHET_OUTDIR}_${type}/openswath_merged.osw"
echo "Merging osw files to ${pyprophet_out}"
pyprophet merge \
          --template "${TRANSITION_PREFIX}.pqp" \
          --out="${pyprophet_out}" \
          "${swath_out}"/*.osw
pyprophet score --in="${pyprophet_out}"
pyprophet export --in="${pyprophet_out}" --out "test.tsv"
## pyprophet always exports to the current working directory.
## Move the export next to the merged .osw file, using the same base name.
final_name="${pyprophet_out%.osw}.tsv"
echo "${final_name}"
mv "test.tsv" "${final_name}"
ls -ld "${pyprophet_out}"

## Align the exported pyprophet tables with TRIC's feature_alignment.py.
tric_tb="${TRIC_OUTDIR}_tuberculist"
mkdir -p "${tric_tb}"
## Every option line ends in a backslash so that the redirections belong to
## feature_alignment.py and its stdout/stderr are actually captured.
feature_alignment.py \
    --force \
    --in "./${pypdir}/"*.tsv \
    --out "${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv" \
    --out_matrix "${tric_tb}/${DDA_METHOD}_outmatrix.tsv" \
    --out_meta "${tric_tb}/${DDA_METHOD}_meta.tsv" \
    2>"${tric_tb}/feature_alignment.err" \
    1>"${tric_tb}/feature_alignment.out"
echo "Wrote final output to ${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv"

2 DEP usage

Thanks to Vivek, I am now aware of DEP, which does everything I wish MSstats did. I think the matrix produced by TRIC’s feature_alignment.py, together with my annotations and sample sheet, provides what DEP requires.

Let us see if this is true.

2.2 Preprocess intensities in preparation for DEP

## Loading SWATH2stats
## Not checking that the files are identical between the annotation and data.
## Number of non-decoy peptides: 17081
## Number of decoy peptides: 1801
## Decoy rate: 0.1054
## The average FDR by run on assay level is 0.014
## The average FDR by run on peptide level is 0.016
## The average FDR by run on protein level is 0.001
## Target assay FDR: 0.02
## Required overall m-score cutoff: 0.0031623
## achieving assay FDR: 0.0194
## Target protein FDR: 0.02
## Required overall m-score cutoff: 0.01
## achieving protein FDR: 0.00115
## Original dimension: 221952, new dimension: 211415, difference: 10537.
## Peptides need to have been quantified in more conditions than: 48 in order to pass this percentage-based threshold.
## Fraction of peptides selected: 0.00058
## Original dimension: 224796, new dimension: 680, difference: 224116.
## Target protein FDR: 0.01
## Required overall m-score cutoff: 0.01
## achieving protein FDR: 0
## filter_mscore_fdr is filtering the data...
## finding m-score cutoff to achieve desired protein FDR in protein master list..
## finding m-score cutoff to achieve desired global peptide FDR..
## Target peptide FDR: 0.05
## Required overall m-score cutoff: 0.01
## Achieving peptide FDR: 0
## Proteins selected: 
## Total proteins selected: 2412
## Final target proteins: 2412
## Final decoy proteins: 0
## Peptides mapping to these protein entries selected:
## Total mapping peptides: 16868
## Final target peptides: 16868
## Final decoy peptides: 0
## Total peptides selected from:
## Total peptides: 16868
## Final target peptides: 16868
## Final decoy peptides: 0
## Individual run FDR quality of the peptides was not calculated
## as not every run contains a decoy.
## The decoys have been removed from the returned data.
## Number of proteins detected: 2363
## Protein identifiers: Rv0577, Rv0242c, Rv3012c, Rv2467, Rv3715c, Rv2220
## Number of proteins detected that are supported by a proteotypic peptide: 2337
## Number of proteotypic peptides detected: 16728
## Number of proteins detected: 2337
## First 6 protein identifiers: Rv0577, Rv0242c, Rv3012c, Rv2467, Rv3715c, Rv2220
## Before filtering:
##   Number of proteins: 2337
##   Number of peptides: 16728
## 
## Percentage of peptides removed: 25.94%
## 
## After filtering:
##   Number of proteins: 2331
##   Number of peptides: 12388
## Before filtering:
##   Number of proteins: 2331
##   Number of peptides: 12388
## 
## Percentage of peptides removed: 0%
## 
## After filtering:
##   Number of proteins: 2284
##   Number of peptides: 12388
## Protein overview matrix results/swath2stats/20190327/protein_all.csv written to working folder.
## [1] 2434   45
## Protein overview matrix results/swath2stats/20190327/protein_matrix_mscore.csv written to working folder.
## [1] 2412   45
## Peptide overview matrix results/swath2stats/20190327/peptide_matrix_mscore.csv written to working folder.
## [1] 16868    45
## Protein overview matrix results/swath2stats/20190327/protein_matrix_filtered.csv written to working folder.
## [1] 2284   45
## Peptide overview matrix results/swath2stats/20190327/peptide_matrix_filtered.csv written to working folder.
## [1] 144819     45

3 Pass the data to DEP and see what happens.

## 
## Attaching package: 'DEP'
## The following objects are masked from 'package:hpgltools':
## 
##     plot_heatmap, plot_pca

## vsn2: 1628 x 43 matrix (1 stratum).
## Please use 'meanSdPlot' to verify the fit.

## Loading required package: imputeLCMD
## Loading required package: tmvtnorm
## Loading required package: mvtnorm
## Loading required package: Matrix
## Loading required package: stats4
## Loading required package: gmm
## Loading required package: sandwich
## Loading required package: norm
## Loading required package: pcaMethods
## 
## Attaching package: 'pcaMethods'
## The following object is masked from 'package:stats':
## 
##     loadings
## Loading required package: impute
## [1] 1.557

## Tested contrasts: wt_filtrate_vs_wt_whole, wt_filtrate_vs_delta_filtrate, wt_filtrate_vs_comp_filtrate, wt_filtrate_vs_delta_whole, wt_filtrate_vs_comp_whole, wt_whole_vs_delta_filtrate, wt_whole_vs_comp_filtrate, wt_whole_vs_delta_whole, wt_whole_vs_comp_whole, delta_filtrate_vs_comp_filtrate, delta_filtrate_vs_delta_whole, delta_filtrate_vs_comp_whole, comp_filtrate_vs_delta_whole, comp_filtrate_vs_comp_whole, delta_whole_vs_comp_whole
## Tested contrasts: wt_filtrate_vs_wt_whole, delta_filtrate_vs_wt_filtrate, comp_filtrate_vs_wt_filtrate, wt_filtrate_vs_delta_filtrate, wt_filtrate_vs_comp_filtrate, wt_whole_vs_delta_whole, wt_whole_vs_comp_whole

## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 0abc58e173be7300595d30d407b7efd4e4a512d6
## This is hpgltools commit: Thu May 9 14:56:34 2019 -0400: 0abc58e173be7300595d30d407b7efd4e4a512d6
## Saving to dia_umpire_20190308-v20190327.rda.xz

R version 3.5.3 (2019-03-11)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: imputeLCMD(v.2.0), impute(v.1.56.0), pcaMethods(v.1.74.0), norm(v.1.0-9.5), tmvtnorm(v.1.4-10), gmm(v.1.6-2), sandwich(v.2.5-1), Matrix(v.1.2-17), mvtnorm(v.1.0-10), DEP(v.1.4.1), SWATH2stats(v.1.13.5), testthat(v.2.1.1), hpgltools(v.1.0), Biobase(v.2.42.0) and BiocGenerics(v.0.28.0)

loaded via a namespace (and not attached): shinydashboard(v.0.7.1), tidyselect(v.0.2.5), lme4(v.1.1-21), htmlwidgets(v.1.3), RSQLite(v.2.1.1), AnnotationDbi(v.1.44.0), grid(v.3.5.3), BiocParallel(v.1.16.6), devtools(v.2.0.2), munsell(v.0.5.0), codetools(v.0.2-16), preprocessCore(v.1.44.0), DT(v.0.6), withr(v.2.1.2), colorspace(v.1.4-1), GOSemSim(v.2.8.0), knitr(v.1.23), rstudioapi(v.0.10), DOSE(v.3.8.2), mzID(v.1.20.1), labeling(v.0.3), urltools(v.1.7.3), GenomeInfoDbData(v.1.2.0), polyclip(v.1.10-0), bit64(v.0.9-7), farver(v.1.1.0), rprojroot(v.1.3-2), xfun(v.0.7), R6(v.2.4.0), doParallel(v.1.0.14), GenomeInfoDb(v.1.18.2), bitops(v.1.0-6), fgsea(v.1.8.0), gridGraphics(v.0.4-1), DelayedArray(v.0.8.0), assertthat(v.0.2.1), promises(v.1.0.1), scales(v.1.0.0), ggraph(v.1.0.2), enrichplot(v.1.2.0), gtable(v.0.3.0), affy(v.1.60.0), sva(v.3.30.1), processx(v.3.3.1), rlang(v.0.3.4), genefilter(v.1.64.0), mzR(v.2.16.2), GlobalOptions(v.0.1.0), splines(v.3.5.3), rtracklayer(v.1.42.2), lazyeval(v.0.2.2), selectr(v.0.4-1), europepmc(v.0.3), BiocManager(v.1.30.4), yaml(v.2.2.0), reshape2(v.1.4.3), GenomicFeatures(v.1.34.8), backports(v.1.1.4), httpuv(v.1.5.1), qvalue(v.2.14.1), clusterProfiler(v.3.10.1), tools(v.3.5.3), usethis(v.1.5.0), ggplotify(v.0.0.3), ggplot2(v.3.1.1), affyio(v.1.52.0), gplots(v.3.0.1.1), RColorBrewer(v.1.1-2), sessioninfo(v.1.1.1), MSnbase(v.2.8.3), ggridges(v.0.5.1), Rcpp(v.1.0.1), plyr(v.1.8.4), base64enc(v.0.1-3), progress(v.1.2.2), zlibbioc(v.1.28.0), purrr(v.0.3.2), RCurl(v.1.95-4.12), ps(v.1.3.0), prettyunits(v.1.0.2), GetoptLong(v.0.1.7), viridis(v.0.5.1), cowplot(v.0.9.4), zoo(v.1.8-5), S4Vectors(v.0.20.1), cluster(v.2.0.9), SummarizedExperiment(v.1.12.0), ggrepel(v.0.8.1), colorRamps(v.2.3), fs(v.1.3.1), variancePartition(v.1.12.3), magrittr(v.1.5), data.table(v.1.12.2), DO.db(v.2.9), openxlsx(v.4.1.0), circlize(v.0.4.6), triebeard(v.0.3.0), packrat(v.0.5.0), ProtGenerics(v.1.14.0), matrixStats(v.0.54.0), pkgload(v.1.0.2), mime(v.0.6), 
hms(v.0.4.2), evaluate(v.0.13), xtable(v.1.8-4), pbkrtest(v.0.4-7), XML(v.3.98-1.19), IRanges(v.2.16.0), gridExtra(v.2.3), shape(v.1.4.4), compiler(v.3.5.3), biomaRt(v.2.38.0), tibble(v.2.1.1), KernSmooth(v.2.23-15), ncdf4(v.1.16.1), crayon(v.1.3.4), minqa(v.1.2.4), htmltools(v.0.3.6), later(v.0.8.0), mgcv(v.1.8-28), tidyr(v.0.8.3), DBI(v.1.0.0), tweenr(v.1.0.1), ComplexHeatmap(v.1.20.0), MASS(v.7.3-51.4), boot(v.1.3-22), readr(v.1.3.1), cli(v.1.1.0), vsn(v.3.50.0), gdata(v.2.18.0), igraph(v.1.2.4.1), GenomicRanges(v.1.34.0), pkgconfig(v.2.0.2), rvcheck(v.0.1.3), GenomicAlignments(v.1.18.1), MALDIquant(v.1.19.3), xml2(v.1.2.0), foreach(v.1.4.4), annotate(v.1.60.1), XVector(v.0.22.0), rvest(v.0.3.4), stringr(v.1.4.0), callr(v.3.2.0), digest(v.0.6.19), Biostrings(v.2.50.2), rmarkdown(v.1.12), fastmatch(v.1.1-0), curl(v.3.3), shiny(v.1.3.2), Rsamtools(v.1.34.1), gtools(v.3.8.1), rjson(v.0.2.20), nloptr(v.1.2.1), nlme(v.3.1-140), jsonlite(v.1.6), desc(v.1.2.0), viridisLite(v.0.3.0), limma(v.3.38.3), pillar(v.1.4.0), lattice(v.0.20-38), httr(v.1.4.0), pkgbuild(v.1.0.3), survival(v.2.44-1.1), GO.db(v.3.7.0), glue(v.1.3.1), remotes(v.2.0.4), fdrtool(v.1.2.15), zip(v.2.0.2), UpSetR(v.1.3.3), iterators(v.1.0.10), pander(v.0.6.3), bit(v.1.1-14), ggforce(v.0.2.2), stringi(v.1.4.3), blob(v.1.1.1), caTools(v.1.17.1.2), memoise(v.1.1.0) and dplyr(v.0.8.1)

---
title: "M. tuberculosis 20190327: DIA-Umpire based OpenSWATH workflow."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
  html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    keep_md: false
    mode: selfcontained
    number_sections: true
    self_contained: true
    theme: readable
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: false
  rmdformats::readthedown:
    code_download: true
    code_folding: show
    df_print: paged
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    width: 300
    keep_md: false
    mode: selfcontained
    toc_float: true
  BiocStyle::html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    keep_md: false
    mode: selfcontained
    toc_float: true
---

<style type="text/css">
body, td {
  font-size: 16px;
}
code.r{
  font-size: 16px;
}
pre {
 font-size: 16px
}
</style>

```{r options, include=FALSE}
## Load hpgltools, then overlay the development checkout on top of it.
library("hpgltools")
tt <- devtools::load_all("/data/hpgltools")
## Package-level knitr options.
knitr::opts_knit$set(width=120,
                     progress=TRUE,
                     verbose=TRUE)
## Chunk-level defaults; echo is a chunk option, so it is set here rather than
## in opts_knit where it had no effect.
knitr::opts_chunk$set(echo=TRUE,
                      error=TRUE,
                      dpi=96)
## Keep the previous option values so they can be restored if needed.
old_options <- options(digits=4,
                       stringsAsFactors=FALSE,
                       knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
rundate <- format(Sys.Date(), format="%Y%m%d")
previous_file <- "02_estimation_infection_20180822.Rmd"
## ver is used below to build versioned input and output paths.
ver <- "20190327"

##tmp <- sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz")))
## rmd_file is used by the saveme chunk to name the saved session.
rmd_file <- "dia_umpire_20190308.Rmd"
```

# Rework DIA-Umpire to feed OpenSWATH.

In the following blocks I use DIA-Umpire to create transition libraries for OpenSWATH,
then run OpenSWATH and score the runs.

## Invoke DIA Umpire

```{bash umpire, eval=FALSE}
## Run DIA-Umpire signal extraction on every DIA mzXML file that does not yet
## have a corresponding *_Q1.mgf output.
cd ~/scratch/proteomics/mycobacterium_tuberculosis_2018
module add openms

type="mzXML"
export VERSION="20190228"
basedir="${HOME}/scratch/proteomics/mycobacterium_tuberculosis_2018"
base_input="${basedir}/results/01${type}/dia/${VERSION}/"
umpire_inputs=$(/usr/bin/find "${base_input}" -name "*.${type}" | sort)
echo "Checking in: ${umpire_inputs}"
## Word splitting of ${umpire_inputs} is intentional: one path per iteration.
for input in ${umpire_inputs};
do
    in_name=$(basename "${input}" ".${type}")
    out_name="${in_name}_Q1.mgf"
    ## Skip inputs whose output is already present; the original test was
    ## negated, which skipped missing outputs and reran finished ones.
    if [[ -f "${base_input}/${out_name}" ]]; then
        echo "The output file: ${out_name} already exists."
    else
        java -jar DIA_Umpire_SE.jar "${input}" diaumpire_se.params
    fi
done
```

## Convert DIA Umpire results

```{bash convert, eval=FALSE}
## Convert the DIA-Umpire mgf files to mzXML, writing directly into the
## destination directory. This avoids sweeping up unrelated *.mzXML files from
## the working directory and avoids mv renaming a file when the directory is missing.
mkdir -p results/03_dia_umpire_mzxml
msconvert --mzXML -o results/03_dia_umpire_mzxml results/02mgf/*.mgf
```

## Search the Umpire results

```{bash search_umpire, eval=FALSE}
## Run comet against every mzXML file converted from the DIA-Umpire output,
## using the DIA-Umpire specific parameter file.
comet_params="parameters/comet_dia_umpire_params.txt"
comet "-P${comet_params}" results/03_dia_umpire_mzxml/*.mzXML
```

## Merge them

```{bash merge_xinteract, eval=FALSE}
## Combine the per-run comet pep.xml files into interact.comet.pep.xml.
xinteract \
    -dDECOY_ \
    -OARPd \
    -Ninteract.comet.pep.xml \
    results/03_dia_umpire_mzxml/*.pep.xml

## Make sure the destination is a directory; otherwise mv would rename the
## file to "results/04_dia_umpire_xinteract" and the next step could not find it.
mkdir -p results/04_dia_umpire_xinteract
mv interact.comet.pep.xml results/04_dia_umpire_xinteract/
```

## Combine the statistics

```{bash protein_prophet, eval=FALSE}
## Shared input and output locations for the three tools below.
interact_pepxml="results/04_dia_umpire_xinteract/interact.comet.pep.xml"
prophet_dir="results/05_dia_umpire_prophet"
iprophet_pepxml="${prophet_dir}/iProphet.pep.xml"

## Protein-level output from the merged pep.xml.
ProteinProphet "${interact_pepxml}" "${prophet_dir}/combined.prot.xml"

## iProphet output from the same merged pep.xml.
InterProphetParser DECOY=DECOY "${interact_pepxml}" "${iprophet_pepxml}"

## Run Mayu on the iProphet output against the reference fasta.
Mayu.pl -A "${iprophet_pepxml}" -C reference/mtb_irt.fasta -E DECOY
```

```{r extract_pct_mayu, eval=FALSE}
## Read a value out of the Mayu main output table and print it (presumably the
## probability cutoff for the chosen protein FDR; confirm against the helper).
## NOTE(review): the value recorded below (0.43291) differs from the -cP0.4237
## passed to spectrast in the following chunk -- confirm which one is intended.
mayu_output <- "../2019-05-12_12.37.03_main_1.07.csv"
number <- hpgltools::extract_mayu_pps_fdr(mayu_output)
message("The number is: ", number)
## 0.43291
```


```{bash umpire_contd, eval=FALSE}
## Rerunning because writing the file failed.
## Build the SpecLib library files from the iProphet peptide identifications.
## NOTE(review): -cP0.4237 is presumably the probability cutoff derived from Mayu;
## the extract_pct_mayu chunk records 0.43291 -- confirm which is intended.
spectrast \
    -cNSpecLib -cICID-QTOF \
    -cf "Protein! ~ DECOY_" \
    -cP0.4237 \
    -c_IRTreference/irt.txt \
    -c_IRR results/05_dia_umpire_prophet/iProphet.pep.xml

## Derive SpecLib_cons from SpecLib.splib (presumably the consensus library, per -cAC; confirm).
spectrast \
    -cNSpecLib_cons \
    -cICID-QTOF \
    -cAC SpecLib.splib

## Convert SpecLib_cons.sptxt into an openswath-style transition table,
## using the acquisition window definitions passed via -w.
spectrast2tsv.py \
    -l 350,2000 \
    -s b,y \
    -x 1,2 \
    -o 6 \
    -n 6 \
    -p 0.05 \
    -d -e \
    -k openswath \
    -w windows/2018_0817BrikenTrypsinDIA19.txt \
    -a SpecLib_cons_openswath.tsv \
    SpecLib_cons.sptxt

## Convert the transition table from tsv to TraML.
TargetedFileConverter \
    -in SpecLib_cons_openswath.tsv \
    -in_type tsv \
    -out SpecLib_cons_openswath.TraML \
    -out_type TraML

## Generate decoys with the shuffle method, writing SpecLib_cons_openswath_decoy.TraML.
## The commented options below are left disabled.
OpenSwathDecoyGenerator \
    -in SpecLib_cons_openswath.TraML \
    -out SpecLib_cons_openswath_decoy.TraML \
    -method shuffle
##    -exclude_similar \
##    -similarity_threshold 0.05 \
##    -identity_threshold 0.7

## Write the TraML produced by OpenSwathDecoyGenerator back out as tsv.
TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.tsv \
    -out_type tsv

## Write the same TraML as pqp; this is the file passed to OpenSwathWorkflow via -tr.
TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.pqp \
    -out_type pqp

export VERSION=${VERSION:-20190327}
echo "Loading environment modules and parameters for version: ${VERSION}."
## NOTE(review): the settings file is assumed to define SWATH_OUTDIR, PYPROPHET_OUTDIR,
## TRIC_OUTDIR, MZ_WINDOWS, DDA_METHOD and SEARCH_METHOD, all of which are used below.
source "parameters/${VERSION}_settings.sh"

echo "Invoking the OpenSwathWorkflow using local comet-derived transitions."
type="diaumpire"
input_type="mzXML"
export TRANSITION_PREFIX="SpecLib_cons_openswath_decoy"
echo "Checking in, the transition library is: ${TRANSITION_PREFIX}.pqp"
base_mzxmldir="results/01${input_type}/dia/${VERSION}"
## The -name pattern is quoted so the shell cannot expand it against files in the
## current directory before find sees it.
swath_inputs=$(/usr/bin/find "${base_mzxmldir}" -name "*.${input_type}" -print | sort)
echo "Checking in, the inputs are: ${swath_inputs}"
mkdir -p "${SWATH_OUTDIR}_${type}"
pypdir="${PYPROPHET_OUTDIR}_${type}"
mkdir -p "${pypdir}"
## Word splitting of ${swath_inputs} is intentional: one path per iteration.
for input in ${swath_inputs}
do
    name=$(basename "${input}" ".${input_type}")
    echo "Starting openswath run, library type ${type} for ${name} using ${MZ_WINDOWS} windows at $(date)."
    swath_output_prefix="${SWATH_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    pyprophet_output_prefix="${PYPROPHET_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    echo "Deleting previous swath output file: ${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.tsv"
    ## Arguments shared by both invocations; only the output option differs.
    swath_args=(
        -in "${input}"
        -force
        -sort_swath_maps
        -min_upper_edge_dist 1
        -mz_correction_function "quadratic_regression_delta_ppm"
        -Scoring:TransitionGroupPicker:background_subtraction "original"
        -Scoring:stop_report_after_feature "5"
        -swath_windows_file "windows/openswath_${name}.txt"
        -tr "${TRANSITION_PREFIX}.pqp"
    )
    ## The workflow is run once per output format (tsv, then osw).
    OpenSwathWorkflow "${swath_args[@]}" -out_tsv "${swath_output_prefix}.tsv"
    OpenSwathWorkflow "${swath_args[@]}" -out_osw "${swath_output_prefix}.osw"
    ##2>"${swath_output_prefix}_osw.log" 1>&2
done
## Merge the per-run .osw files, score the merged file, and export the result.
## The directories are taken from the settings variables rather than from loop
## leftovers, so they are defined even when the loop above processed no inputs.
swath_out="${SWATH_OUTDIR}_${type}"
pyprophet_out="${PYPROPHET_OUTDIR}_${type}/openswath_merged.osw"
echo "Merging osw files to ${pyprophet_out}"
pyprophet merge \
          --template "${TRANSITION_PREFIX}.pqp" \
          --out="${pyprophet_out}" \
          "${swath_out}"/*.osw
pyprophet score --in="${pyprophet_out}"
pyprophet export --in="${pyprophet_out}" --out "test.tsv"
## pyprophet always exports to the current working directory.
## Move the export next to the merged .osw file, using the same base name.
final_name="${pyprophet_out%.osw}.tsv"
echo "${final_name}"
mv "test.tsv" "${final_name}"
ls -ld "${pyprophet_out}"

## Align the exported pyprophet tables with TRIC's feature_alignment.py.
tric_tb="${TRIC_OUTDIR}_tuberculist"
mkdir -p "${tric_tb}"
## Every option line ends in a backslash so that the redirections belong to
## feature_alignment.py and its stdout/stderr are actually captured.
feature_alignment.py \
    --force \
    --in "./${pypdir}/"*.tsv \
    --out "${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv" \
    --out_matrix "${tric_tb}/${DDA_METHOD}_outmatrix.tsv" \
    --out_meta "${tric_tb}/${DDA_METHOD}_meta.tsv" \
    2>"${tric_tb}/feature_alignment.err" \
    1>"${tric_tb}/feature_alignment.out"
echo "Wrote final output to ${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv"
```

# DEP usage

Thanks to Vivek, I am now aware of DEP, which does everything I wish MSstats did.
I think the matrix produced by TRIC's feature_alignment.py, together with my
annotations and sample sheet, provides what DEP requires.

Let us see if this is true.

## Protein annotations

```{r protein_annotations}
## Reference files for M. tuberculosis H37Rv.
mtb_gff <- "reference/mycobacterium_tuberculosis_h37rv_2.gff.gz"
mtb_genome <- "reference/mtuberculosis_h37rv_genbank.fasta"
mtb_cds <- "reference/mtb_cds.fasta"

## Gene-level annotations from the gff, indexed by their ID column.
mtb_annotations <- sm(load_gff_annotations(mtb_gff, type="gene"))
## Remove literal dots from the column names.
colnames(mtb_annotations) <- gsub(x=colnames(mtb_annotations),
                                  pattern=".", replacement="", fixed=TRUE)
## Replace literal plus signs with spaces in the two free-text columns.
mtb_annotations[["description"]] <- gsub(x=mtb_annotations[["description"]],
                                         pattern="+", replacement=" ", fixed=TRUE)
mtb_annotations[["function"]] <- gsub(x=mtb_annotations[["function"]],
                                      pattern="+", replacement=" ", fixed=TRUE)
rownames(mtb_annotations) <- mtb_annotations[["ID"]]

## MicrobesOnline annotations for id 83332.
mtb_microbes <- load_microbesonline_annotations(id=83332)
```

## Preprocess intensities in preparation for DEP

```{r swath2stats, fig.show="hide"}
## Read the tab-separated TRIC output for this version.
tric_data <- read.csv(
  paste0("results/tric/", ver, "/whole_8mz_dia_umpire/comet_HCD.tsv"), sep="\t")
## Keep only the text before the last underscore of each protein name.
tric_data[["ProteinName"]] <- gsub(pattern="^(.*)_.*$", replacement="\\1",
                                   x=tric_data[["ProteinName"]])
sample_annot <- extract_metadata(paste0("sample_sheets/Mtb_dia_samples_", ver, ".xlsx"))
## Drop sample sheet rows whose names start with "s..".
kept <- ! grepl(x=rownames(sample_annot), pattern="^s\\.\\.")
sample_annot <- sample_annot[kept, ]
## The forked SWATH2stats provides the extra arguments used below.
devtools::load_all("~/scratch/git/SWATH2stats_myforked")
s2s_exp <- SWATH2stats::sample_annotation(data=tric_data, check_files=FALSE,
                             sample_annotation=sample_annot,
                             fullpeptidename_column="fullpeptidename")

decoy_lists <- assess_decoy_rate(s2s_exp)
## This seems a bit high to me, yesno?
fdr_overall <- assess_fdr_overall(s2s_exp, output="Rconsole", plot=TRUE)

byrun_fdr <- assess_fdr_byrun(s2s_exp, FFT=0.7, plot=TRUE, output="Rconsole")
## Protein FDR target shared by mscore4protfdr() and filter_mscore_fdr().
protein_fdr_target <- 0.02
chosen_mscore <- mscore4assayfdr(s2s_exp, FFT=0.7, fdr_target=0.02)
## prot_score is the m-score cutoff which achieves protein_fdr_target; it is
## an m-score, not an FDR, so it must not be passed as an FDR target below.
prot_score <- mscore4protfdr(s2s_exp, FFT=0.7, fdr_target=protein_fdr_target)

filtered_ms <- filter_mscore(s2s_exp, chosen_mscore)
filtered_fq <- filter_mscore_freqobs(s2s_exp, 0.01, 0.8, rm.decoy=FALSE)
## The original passed prot_score (an m-score cutoff) as the protein FDR
## target, which silently changed the target from 0.02 to the cutoff value.
filtered_ms_fdr <- filter_mscore_fdr(filtered_ms, FFT=0.7,
                                     overall_protein_fdr_target=protein_fdr_target,
                                     upper_overall_peptide_fdr_limit=0.05)
filtered_ms_fdr_pr <- filter_proteotypic_peptides(filtered_ms_fdr)
filtered_ms_fdr_pr_all <- filter_all_peptides(filtered_ms_fdr_pr)
filtered_ms_fdr_pr_all_str <- filter_on_max_peptides(data=filtered_ms_fdr_pr_all, n_peptides=10)
filtered_all_filters <- filter_on_min_peptides(data=filtered_ms_fdr_pr_all_str, n_peptides=3)

## Output directory for the matrices; recursive=TRUE because the path is
## several levels deep and the parent directories may not exist yet.
matrix_prefix <- file.path("results", "swath2stats", ver)
if (!dir.exists(matrix_prefix)) {
  dir.create(matrix_prefix, recursive=TRUE)
}
## Unfiltered protein matrix.
protein_matrix_all <- write_matrix_proteins(
  s2s_exp, write.csv=TRUE,
  filename=file.path(matrix_prefix, "protein_all.csv"))
dim(protein_matrix_all)
## Protein and peptide matrices after the m-score filter only.
protein_matrix_mscore <- write_matrix_proteins(
  filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "protein_matrix_mscore.csv"))
dim(protein_matrix_mscore)
peptide_matrix_mscore <- write_matrix_peptides(
  filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "peptide_matrix_mscore.csv"))
dim(peptide_matrix_mscore)
## Protein and peptide matrices after all filters; the protein matrix feeds DEP.
protein_matrix_filtered <- write_matrix_proteins(
  filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "protein_matrix_filtered.csv"))
dim(protein_matrix_filtered)
peptide_matrix_filtered <- write_matrix_peptides(
  filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "peptide_matrix_filtered.csv"))
dim(peptide_matrix_filtered)
```

# Pass the data to DEP and see what happens.

```{r start_DEP}
library(DEP)
## Rename the intensity columns: keep the text from the last "2018" onward,
## prefixed with "s"; the first column holds the protein identifiers.
intensities <- protein_matrix_filtered
cols <- gsub(x=colnames(intensities), pattern="^.*(2018.*$)", replacement="s\\1")
cols[[1]] <- "Protein"
colnames(intensities) <- cols

unique_intensities <- make_unique(intensities, "Protein", "Protein", delim=";")
intensity_columns <- grep(pattern="2018", x=cols)

## Experimental design: label, condition and a unique replicate name per sample.
sample_annot[["label"]] <- as.character(rownames(sample_annot))
sample_annot[["condition"]] <- as.character(sample_annot[["condition"]])
sample_annot[["replicate"]] <- make.names(
  paste0(sample_annot[["condition"]], "_", sample_annot[["batch"]]), unique=TRUE)
design <- sample_annot[, c("label", "condition", "replicate")]

mtb_se <- DEP::make_se(unique_intensities, intensity_columns, design)

## Missing value filtering and the associated diagnostics.
DEP::plot_frequency(mtb_se)
mtb_filt <- DEP::filter_missval(mtb_se, thr=1)
DEP::plot_numbers(mtb_filt)
DEP::plot_coverage(mtb_filt)

## Normalization.
mtb_norm <- DEP::normalize_vsn(mtb_filt)
DEP::plot_normalization(mtb_filt, mtb_norm)

DEP::plot_missval(mtb_filt)
DEP::plot_detect(mtb_filt)

## Imputation.
mtb_imp <- DEP::impute(mtb_norm, fun="MinProb", q=0.01)
DEP::plot_imputation(mtb_norm, mtb_imp)

## All pairwise contrasts are kept under their own name; previously this
## result was overwritten immediately by the manual contrasts below.
mtb_diff_all <- DEP::test_diff(mtb_imp, type="all")
## The contrasts of interest, which are used for everything downstream.
mtb_diff <- DEP::test_diff(mtb_imp, type="manual",
                           test=c("wt_filtrate_vs_wt_whole",
                                  "delta_filtrate_vs_wt_filtrate",
                                  "comp_filtrate_vs_wt_filtrate",
                                  "wt_filtrate_vs_delta_filtrate",
                                  "wt_filtrate_vs_comp_filtrate",
                                  "wt_whole_vs_delta_whole",
                                  "wt_whole_vs_comp_whole"))

mtb_dep <- DEP::add_rejections(mtb_diff, alpha=0.05, lfc=0.6)
## mtb_pca <- DEP::plot_pca(mtb_dep)
## The PCA plotter provided by DEP has some problems.

DEP::plot_cor(mtb_dep)
DEP::plot_heatmap(mtb_dep, type="centered", kmeans=TRUE)
DEP::plot_heatmap(mtb_dep, type="contrast", kmeans=TRUE)

DEP::plot_volcano(mtb_dep, contrast="wt_whole_vs_delta_whole")
DEP::plot_volcano(mtb_dep, contrast="wt_filtrate_vs_delta_filtrate")

DEP::plot_single(mtb_dep, proteins=c("Rv0287"))
DEP::plot_single(mtb_dep, proteins=c("Rv0287"), type="centered")
DEP::plot_cond(mtb_dep)

mtb_result <- DEP::get_results(mtb_dep)
```


```{r saveme}
## Save the session unless skip_load is set to TRUE, then print the session info.
if (!isTRUE(get0("skip_load"))) {
  message("This is hpgltools commit: ", get_git_commit())
  ## Build "<rmd basename>-v<ver>.rda.xz"; the pattern is anchored so only a
  ## trailing ".Rmd" is removed, and the argument name is spelled out in full
  ## rather than relying on partial matching of "replace".
  this_save <- paste0(gsub(pattern="\\.Rmd$", replacement="", x=rmd_file), "-v", ver, ".rda.xz")
  message("Saving to ", this_save)
  tmp <- sm(saveme(filename=this_save))
  pander::pander(sessionInfo())
}
```

```{r loadme, eval=FALSE}
## Presumably reloads the session written by the saveme chunk (confirm against loadme()).
## this_save is only defined when the saveme chunk has run in this session.
tmp <- loadme(filename=this_save)
```
