1 Rework DIA-Umpire to feed OpenSWATH.

In the following blocks I want to use DIA Umpire to create transition libraries for openswath, then I want to run openswath and score the runs.

1.5 Combine the statistics

## Rerunning because writing the file failed.
spectrast \
    -cNSpecLib -cICID-QTOF \
    -cf "Protein! ~ DECOY_" \
    -cP0.4237 \
    -c_IRTreference/irt.txt \
    -c_IRR results/05_dia_umpire_prophet/iProphet.pep.xml

spectrast \
    -cNSpecLib_cons \
    -cICID-QTOF \
    -cAC SpecLib.splib

spectrast2tsv.py \
    -l 350,2000 \
    -s b,y \
    -x 1,2 \
    -o 6 \
    -n 6 \
    -p 0.05 \
    -d -e \
    -k openswath \
    -w windows/2018_0817BrikenTrypsinDIA19.txt \
    -a SpecLib_cons_openswath.tsv \
    SpecLib_cons.sptxt

TargetedFileConverter \
    -in SpecLib_cons_openswath.tsv \
    -in_type tsv \
    -out SpecLib_cons_openswath.TraML \
    -out_type TraML

OpenSwathDecoyGenerator \
    -in SpecLib_cons_openswath.TraML \
    -out SpecLib_cons_openswath_decoy.TraML \
    -method shuffle
##    -exclude_similar \
##    -similarity_threshold 0.05 \
##    -identity_threshold 0.7

TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.tsv \
    -out_type tsv

TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.pqp \
    -out_type pqp

export VERSION=${VERSION:-20190327}
echo "Loading environment modules and parameters for version: ${VERSION}."
source "parameters/${VERSION}_settings.sh"

echo "Invoking the OpenSwathWorkflow using local comet-derived transitions."
type="diaumpire"
input_type="mzXML"
export TRANSITION_PREFIX="SpecLib_cons_openswath_decoy"
echo "Checking in, the transition library is: ${TRANSITION_PREFIX}.pqp"
base_mzxmldir="results/01${input_type}/dia/${VERSION}"
swath_inputs=$(/usr/bin/find "${base_mzxmldir}" -name *.${input_type} -print | sort)
echo "Checking in, the inputs are: ${swath_inputs}"
mkdir -p "${SWATH_OUTDIR}_${type}"
pypdir="${PYPROPHET_OUTDIR}_${type}"
mkdir -p "${pypdir}"
for input in ${swath_inputs}
do
    name=$(basename "${input}" ".${input_type}")
    echo "Starting openswath run, library type ${type} for ${name} using ${MZ_WINDOWS} windows at $(date)."
    swath_output_prefix="${SWATH_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    pyprophet_output_prefix="${PYPROPHET_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    echo "Deleting previous swath output file: ${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.tsv"
    OpenSwathWorkflow \
        -in "${input}" \
        -force \
        -sort_swath_maps \
        -min_upper_edge_dist 1 \
        -mz_correction_function "quadratic_regression_delta_ppm" \
        -Scoring:TransitionGroupPicker:background_subtraction "original" \
        -Scoring:stop_report_after_feature "5" \
        -swath_windows_file "windows/openswath_${name}.txt" \
        -tr "${TRANSITION_PREFIX}.pqp" \
        -out_tsv "${swath_output_prefix}.tsv"
    OpenSwathWorkflow \
        -in "${input}" \
        -force \
        -sort_swath_maps \
        -min_upper_edge_dist 1 \
        -mz_correction_function "quadratic_regression_delta_ppm" \
        -Scoring:TransitionGroupPicker:background_subtraction "original" \
        -Scoring:stop_report_after_feature "5" \
        -swath_windows_file "windows/openswath_${name}.txt" \
        -tr "${TRANSITION_PREFIX}.pqp" \
        -out_osw "${swath_output_prefix}.osw"
    ##2>"${swath_output_prefix}_osw.log" 1>&2
done
swath_out=$(dirname ${swath_output_prefix})
pyprophet_out="$(dirname "${pyprophet_output_prefix}")/openswath_merged.osw"
echo "Merging osw files to ${pyprophet_out}"
pyprophet merge \
          --template "${TRANSITION_PREFIX}.pqp" \
          --out="${pyprophet_out}" \
          ${swath_out}/*.osw
pyprophet score --in="${pyprophet_out}"
pyprophet export --in="${pyprophet_out}" --out "test.tsv"
## pyprophet always exports to the current working directory.
final_name="$(dirname ${pyprophet_out})/$(basename ${pyprophet_out} ".osw").tsv"
echo $final_name
mv "test.tsv"
ls -ld "${pyprophet_out}"

tric_tb="${TRIC_OUTDIR}_tuberculist"
mkdir -p "${tric_tb}"
feature_alignment.py \
    --force \
    --in "./${pypdir}/"*.tsv \
    --out "${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv" \
    --out_matrix "${tric_tb}/${DDA_METHOD}_outmatrix.tsv" \
    --out_meta "${tric_tb}/${DDA_METHOD}_meta.tsv"
2>"${tric_tb}/feature_alignment.err" \
 1>"${tric_tb}/feature_alignment.out"
echo "Wrote final output to ${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv"

2 DEP usage

Thanks to Vivek, I now am aware of DEP, which does everything I wish MSstats did. The matrix given to me by tric’s feature_alignment.py I think gives me what DEP requires, along with my annotations and sample sheet.

Let us see if this is true.

2.2 Preprocess intensities in preparation for DEP

##  [1] "sampleid"           "tubeid"             "tubelabel"         
##  [4] "figurereplicate"    "figurename"         "sampledescription" 
##  [7] "bioreplicate"       "lcrun"              "msrun"             
## [10] "technicalreplicate" "replicatestate"     "run"               
## [13] "exptid"             "genotype"           "collectiontype"    
## [16] "condition"          "batch"              "windowsize"        
## [19] "enzyme"             "harvestdate"        "prepdate"          
## [22] "rundate"            "runinfo"            "rawfile"           
## [25] "filename"           "mzmlfile"           "diascored"         
## [28] "tuberculistscored"  "includeexclude"
##  [1] "s2018_0315Briken01"           "s2018_0315Briken02"          
##  [3] "s2018_0315Briken03"           "s2018_0315Briken04"          
##  [5] "s2018_0315Briken05"           "s2018_0315Briken06"          
##  [7] "s2018_0315Briken21"           "s2018_0315Briken22"          
##  [9] "s2018_0315Briken23"           "s2018_0315Briken24"          
## [11] "s2018_0315Briken25"           "s2018_0315Briken26"          
## [13] "s2018_0502BrikenDIA01"        "s2018_0502BrikenDIA02"       
## [15] "s2018_0502BrikenDIA03"        "s2018_0502BrikenDIA04"       
## [17] "s2018_0502BrikenDIA05"        "s2018_0502BrikenDIA06"       
## [19] "s2018_0502BrikenDIA07"        "s2018_0502BrikenDIA08"       
## [21] "s2018_0502BrikenDIA09"        "s2018_0502BrikenDIA10"       
## [23] "s2018_0502BrikenDIA11"        "s2018_0502BrikenDIA12"       
## [25] "s2018_0726Briken01"           "s2018_0726Briken02"          
## [27] "s2018_0726Briken03"           "s2018_0726Briken04"          
## [29] "s2018_0726Briken05"           "s2018_0726Briken06"          
## [31] "s2018_0726Briken07"           "s2018_0726Briken08"          
## [33] "s2018_0726Briken09"           "s2018_0726Briken11"          
## [35] "s2018_0726Briken12"           "s2018_0726Briken13"          
## [37] "s2018_0726Briken14"           "s2018_0726Briken15"          
## [39] "s2018_0726Briken16"           "s2018_0726Briken17"          
## [41] "s2018_0726Briken18"           "s2018_0726Briken19"          
## [43] "s2018_0817BrikenTrypsinDIA01" "s2018_0817BrikenTrypsinDIA02"
## [45] "s2018_0817BrikenTrypsinDIA03" "s2018_0817BrikenTrypsinDIA04"
## [47] "s2018_0817BrikenTrypsinDIA05" "s2018_0817BrikenTrypsinDIA06"
## [49] "s2018_0817BrikenTrypsinDIA07" "s2018_0817BrikenTrypsinDIA08"
## [51] "s2018_0817BrikenTrypsinDIA09" "s2018_0817BrikenTrypsinDIA11"
## [53] "s2018_0817BrikenTrypsinDIA12" "s2018_0817BrikenTrypsinDIA13"
## [55] "s2018_0817BrikenTrypsinDIA14" "s2018_0817BrikenTrypsinDIA15"
## [57] "s2018_0817BrikenTrypsinDIA16" "s2018_0817BrikenTrypsinDIA17"
## [59] "s2018_0817BrikenTrypsinDIA18" "s2018_0817BrikenTrypsinDIA19"
##  [1] "results/01mzXML/dia/20190327/2018_0315Briken01.mzXML"          
##  [2] "results/01mzXML/dia/20190327/2018_0315Briken02.mzXML"          
##  [3] "results/01mzXML/dia/20190327/2018_0315Briken03.mzXML"          
##  [4] "results/01mzXML/dia/20190327/2018_0315Briken04.mzXML"          
##  [5] "results/01mzXML/dia/20190327/2018_0315Briken05.mzXML"          
##  [6] "results/01mzXML/dia/20190327/2018_0315Briken06.mzXML"          
##  [7] "results/01mzXML/dia/20190327/2018_0315Briken21.mzXML"          
##  [8] "results/01mzXML/dia/20190327/2018_0315Briken22.mzXML"          
##  [9] "results/01mzXML/dia/20190327/2018_0315Briken23.mzXML"          
## [10] "results/01mzXML/dia/20190327/2018_0315Briken24.mzXML"          
## [11] "results/01mzXML/dia/20190327/2018_0315Briken25.mzXML"          
## [12] "results/01mzXML/dia/20190327/2018_0315Briken26.mzXML"          
## [13] "results/01mzXML/dia/20190327/2018_0502BrikenDIA01.mzXML"       
## [14] "results/01mzXML/dia/20190327/2018_0502BrikenDIA02.mzXML"       
## [15] "results/01mzXML/dia/20190327/2018_0502BrikenDIA03.mzXML"       
## [16] "results/01mzXML/dia/20190327/2018_0502BrikenDIA04.mzXML"       
## [17] "results/01mzXML/dia/20190327/2018_0502BrikenDIA05.mzXML"       
## [18] "results/01mzXML/dia/20190327/2018_0502BrikenDIA06.mzXML"       
## [19] "results/01mzXML/dia/20190327/2018_0502BrikenDIA07.mzXML"       
## [20] "results/01mzXML/dia/20190327/2018_0502BrikenDIA08.mzXML"       
## [21] "results/01mzXML/dia/20190327/2018_0502BrikenDIA09.mzXML"       
## [22] "results/01mzXML/dia/20190327/2018_0502BrikenDIA10.mzXML"       
## [23] "results/01mzXML/dia/20190327/2018_0502BrikenDIA11.mzXML"       
## [24] "results/01mzXML/dia/20190327/2018_0502BrikenDIA12.mzXML"       
## [25] "results/01mzXML/dia/20190327/2018_0726Briken01.mzXML"          
## [26] "results/01mzXML/dia/20190327/2018_0726Briken02.mzXML"          
## [27] "results/01mzXML/dia/20190327/2018_0726Briken03.mzXML"          
## [28] "results/01mzXML/dia/20190327/2018_0726Briken04.mzXML"          
## [29] "results/01mzXML/dia/20190327/2018_0726Briken05.mzXML"          
## [30] "results/01mzXML/dia/20190327/2018_0726Briken06.mzXML"          
## [31] "results/01mzXML/dia/20190327/2018_0726Briken07.mzXML"          
## [32] "results/01mzXML/dia/20190327/2018_0726Briken08.mzXML"          
## [33] "results/01mzXML/dia/20190327/2018_0726Briken09.mzXML"          
## [34] "results/01mzXML/dia/20190327/2018_0726Briken11.mzXML"          
## [35] "results/01mzXML/dia/20190327/2018_0726Briken12.mzXML"          
## [36] "results/01mzXML/dia/20190327/2018_0726Briken13.mzXML"          
## [37] "results/01mzXML/dia/20190327/2018_0726Briken14.mzXML"          
## [38] "results/01mzXML/dia/20190327/2018_0726Briken15.mzXML"          
## [39] "results/01mzXML/dia/20190327/2018_0726Briken16.mzXML"          
## [40] "results/01mzXML/dia/20190327/2018_0726Briken17.mzXML"          
## [41] "results/01mzXML/dia/20190327/2018_0726Briken18.mzXML"          
## [42] "results/01mzXML/dia/20190327/2018_0726Briken19.mzXML"          
## [43] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA01.mzXML"
## [44] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA02.mzXML"
## [45] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA03.mzXML"
## [46] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA04.mzXML"
## [47] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA05.mzXML"
## [48] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA06.mzXML"
## [49] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA07.mzXML"
## [50] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA08.mzXML"
## [51] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA09.mzXML"
## [52] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA11.mzXML"
## [53] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA12.mzXML"
## [54] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA13.mzXML"
## [55] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA14.mzXML"
## [56] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA15.mzXML"
## [57] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA16.mzXML"
## [58] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA17.mzXML"
## [59] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA18.mzXML"
## [60] "results/01mzXML/dia/20190327/2018_0817BrikenTrypsinDIA19.mzXML"
## Loading SWATH2stats
## Found the same mzXML files in the annotations and data.
## Number of non-decoy peptides: 17081
## Number of decoy peptides: 1801
## Decoy rate: 0.1054
## The average FDR by run on assay level is 0.015
## The average FDR by run on peptide level is 0.016
## The average FDR by run on protein level is 0.001
## Target assay FDR: 0.02
## Required overall m-score cutoff: 0.0031623
## achieving assay FDR: 0.0194
## Target protein FDR: 0.02
## Required overall m-score cutoff: 0.01
## achieving protein FDR: 0.00115
## Original dimension: 221952, new dimension: 211415, difference: 10537.
## Peptides need to have been quantified in more conditions than: 48 in order to pass this percentage-based threshold.
## Fraction of peptides selected: 0.00058
## Original dimension: 224796, new dimension: 680, difference: 224116.
## Error in mscore4protfdr(data, FFT, fdr_target = overall_protein_fdr_target): object 'prot_score' not found
## Error in is.data.frame(x): object 'ump_filtered_ms_fdr' not found
## Error in gsub(pattern = "^[[:digit:]]\\/", replacement = "", x = data[[column]]): object 'ump_filtered_ms_fdr_pr' not found
## Error in is.data.frame(x): object 'ump_filtered_ms_fdr_pr_all' not found
## Error in is.data.frame(x): object 'ump_filtered_ms_fdr_pr_all_str' not found
## Error in file.exists(matrix_prefix): object 'matrix_prefix' not found
## Error in file.path(matrix_prefix, "ump_protein_all.csv"): object 'matrix_prefix' not found
## Error in eval(expr, envir, enclos): object 'protein_matrix_all' not found
## Error in file.path(matrix_prefix, "ump_protein_matrix_mscore.csv"): object 'matrix_prefix' not found
## Error in eval(expr, envir, enclos): object 'protein_matrix_mscore' not found
## Error in file.path(matrix_prefix, "ump_peptide_matrix_mscore.csv"): object 'matrix_prefix' not found
## Error in eval(expr, envir, enclos): object 'peptide_matrix_mscore' not found
## Error in aggregate(data[, "intensity"], by = list(data[["proteinname"]], : object 'ump_filtered_all_filters' not found
## Error in eval(expr, envir, enclos): object 'protein_matrix_filtered' not found
## Error in is.data.frame(x): object 'ump_filtered_all_filters' not found
## Error in eval(expr, envir, enclos): object 'peptide_matrix_filtered' not found

3 Look at raw data

## Reading results/01mzML/dia/20190327/2018_0315Briken01.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken01.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken02.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken02.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken03.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken03.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken04.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken04.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken05.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken05.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken06.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken06.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken21.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken21.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken22.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken22.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken23.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken23.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken24.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken24.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken25.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken25.mzML not found
## Reading results/01mzML/dia/20190327/2018_0315Briken26.mzML
## Error in mzR::openMSfile(file) : 
##   File results/01mzML/dia/20190327/2018_0315Briken26.mzML not found
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA01.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA02.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA03.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA04.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA05.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA06.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA07.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA08.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA09.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA10.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA11.mzML
## Reading results/01mzML/dia/20190327/2018_0502BrikenDIA12.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken01.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken02.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken03.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken04.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken05.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken06.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken07.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken08.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken09.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken11.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken12.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken13.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken14.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken15.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken16.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken17.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken18.mzML
## Reading results/01mzML/dia/20190327/2018_0726Briken19.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA01.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA02.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA03.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA04.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA05.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA06.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA07.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA08.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA09.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA11.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA12.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA13.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA14.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA15.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA16.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA17.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA18.mzML
## Reading results/01mzML/dia/20190327/2018_0817BrikenTrypsinDIA19.mzML
## Error in names(res) <- rownames(sample_definitions): 'names' attribute [60] must be the same length as the vector [48]

4 Get intensities

## Error in eval(expr, envir, enclos): object 'protein_matrix_filtered' not found
## Error in colnames(intensities): object 'intensities' not found
## Error in cols[[1]] <- "Protein": object 'cols' not found
## Error in eval(expr, envir, enclos): object 'cols' not found
## Error in eval(expr, envir, enclos): object 'intensities' not found
## Error in intensities[["Protein"]] <- NULL: object 'intensities' not found
## Error in colnames(intensities): object 'intensities' not found
## Error in colnames(intensities): object 'intensities' not found
## Error in colnames(intensities): object 'intensities' not found
## Error in `[.data.frame`(sample_annot, reordered, ): object 'reordered' not found
## Reading the sample metadata.
## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("undefined",
## "undefined", : invalid factor level, NA generated

## Warning in `[<-.factor`(`*tmp*`, iseq, value = c("undefined",
## "undefined", : invalid factor level, NA generated
## The sample definitions comprises: 60 rows(samples) and 29 columns(metadata fields).
## Error in create_expt(sample_annot, count_dataframe = intensities, gene_info = mtb_annotations): object 'intensities' not found

5 Pass the data to DEP and see what happens.

## Loading DEP
wtf <- function (proteins_unique, columns, expdesign) {
  assertthat::assert_that(is.data.frame(proteins_unique), is.integer(columns),
                          is.data.frame(expdesign))
  if (any(!c("name", "ID") %in% colnames(proteins_unique))) {
    stop("'name' and/or 'ID' columns are not present in '",
         deparse(substitute(proteins_unique)), "'.\nRun make_unique() to obtain the required columns",
         call. = FALSE)
  }
  if (any(!c("label", "condition", "replicate") %in% colnames(expdesign))) {
    stop("'label', 'condition' and/or 'replicate' columns",
         "are not present in the experimental design", call. = FALSE)
  }
  if (any(!apply(proteins_unique[, columns], 2, is.numeric))) {
    stop("specified 'columns' should be numeric", "\nRun make_se_parse() with the appropriate columns as argument",
         call. = FALSE)
  }
  if (tibble::is.tibble(proteins_unique))
    proteins_unique <- as.data.frame(proteins_unique)
  if (tibble::is.tibble(expdesign))
    expdesign <- as.data.frame(expdesign)
  rownames(proteins_unique) <- proteins_unique$name
  raw <- proteins_unique[, columns]
  raw[raw == 0] <- NA
  raw <- log2(raw)
  expdesign <- mutate(expdesign, condition = make.names(condition))
  ## I changed the following because it didn't make sense to me.
  if (is.null(expdesign[["ID"]])) {
    expdesign <- expdesign %>%
      tidyr::unite(condition, replicate, remove=FALSE)
  }

  rownames(expdesign) <- expdesign$ID
  matched <- match(make.names(delete_prefix(expdesign$label)),
                   make.names(delete_prefix(colnames(raw))))
  if (any(is.na(matched))) {
    stop("None of the labels in the experimental design match ",
         "with column names in 'proteins_unique'", "\nRun make_se() with the correct labels in the experimental design",
         "and/or correct columns specification")
  }
  colnames(raw)[matched] <- expdesign$ID
  raw <- raw[, !is.na(colnames(raw))][rownames(expdesign)]
  row_data <- proteins_unique[, -columns]
  rownames(row_data) <- row_data$name
  se <- SummarizedExperiment(assays = as.matrix(raw), colData = expdesign,
                             rowData = row_data)
  return(se)
}
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:DEP':
## 
##     first, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:DEP':
## 
##     collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:DEP':
## 
##     count, rowRanges
## The following objects are masked from 'package:hpgltools':
## 
##     anyMissing, rowMedians
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply
## Error in pData(protein_expt): object 'protein_expt' not found
## Error in design[["sampleid"]]: object of type 'closure' is not subsettable
## Error in design[["sampleid"]]: object of type 'closure' is not subsettable
## Error in exprs(protein_expt): object 'protein_expt' not found
## Error in exprs(protein_expt): object 'protein_expt' not found
## Error in rownames(mtb_unique): object 'mtb_unique' not found
## Error in rownames(mtb_unique): object 'mtb_unique' not found
## Error in is.data.frame(proteins_unique): object 'mtb_unique' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_se' not found
## Error in assay(mtb_se): object 'mtb_se' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_se' not found
## Error in assay(mtb_filt): object 'mtb_filt' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_se' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_se' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_se' not found
## Error in eval(expr, p): object 'mtb_se' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_se' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_se' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_norm' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_norm' not found
## Error in eval(expr, p): object 'mtb_norm' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_imp' not found
## Error in inherits(se, "SummarizedExperiment"): object 'mtb_imp' not found
## Error in inherits(diff, "SummarizedExperiment"): object 'mtb_diff' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in inherits(dep, "SummarizedExperiment"): object 'mtb_dep' not found
## Error in hpgltools::write_xls(data = mtb_result, excel = "excel/dep_result.xlsx"): object 'mtb_result' not found

6 Minor change to plot_missval

DEP has a neat function to plot missing values. Sadly, it does not return the actual matrix, only the plot. This is nice and all, but I need the matrix, ergo this minor change.

## Error in assay(se): object 'mtb_se' not found
## Error in summary(def_mtrx): object 'def_mtrx' not found
## Error in rowSums(def_mtrx): object 'def_mtrx' not found
## Error in head(defined_by_protein): object 'defined_by_protein' not found
## Error in colSums(def_mtrx): object 'def_mtrx' not found
## Error in eval(expr, envir, enclos): object 'defined_by_sample' not found

7 Request 20190523

Compare our ‘normal’ openswath output via hpgltools analysis vs. the umpire version. Secondary goal: With and without imputation.

## Loading SWATH2stats
## Found the same mzXML files in the annotations and data.
## Number of non-decoy peptides: 21557
## Number of decoy peptides: 939
## Decoy rate: 0.0436

## The average FDR by run on assay level is 0.009
## The average FDR by run on peptide level is 0.01
## The average FDR by run on protein level is 0.047

## Target assay FDR: 0.02
## Required overall m-score cutoff: 0.0070795
## achieving assay FDR: 0.0181
## Target protein FDR: 0.02
## Required overall m-score cutoff: 0.00089125
## achieving protein FDR: 0.0182
## Original dimension: 133447, new dimension: 128204, difference: 5243.
## Peptides need to have been quantified in more conditions than: 9.6 in order to pass this percentage-based threshold.
## Fraction of peptides selected: 0.11
## Original dimension: 135427, new dimension: 33028, difference: 102399.
## Target protein FDR: 0.000891250938133746
## Required overall m-score cutoff: 0.01
## achieving protein FDR: 0
## filter_mscore_fdr is filtering the data...
## finding m-score cutoff to achieve desired protein FDR in protein master list..
## finding m-score cutoff to achieve desired global peptide FDR..
## Target peptide FDR: 0.05
## Required overall m-score cutoff: 0.01
## Achieving peptide FDR: 0
## Proteins selected: 
## Total proteins selected: 2999
## Final target proteins: 2999
## Final decoy proteins: 0
## Peptides mapping to these protein entries selected:
## Total mapping peptides: 20921
## Final target peptides: 20921
## Final decoy peptides: 0
## Total peptides selected from:
## Total peptides: 20921
## Final target peptides: 20921
## Final decoy peptides: 0
## Individual run FDR quality of the peptides was not calculated
## as not every run contains a decoy.
## The decoys have been removed from the returned data.
## Number of proteins detected: 3016
## Protein identifiers: Rv2524c, Rv3716c, Rv1270c, Rv0724, Rv0161, Rv2535c
## Number of proteins detected that are supported by a proteotypic peptide: 2888
## Number of proteotypic peptides detected: 20772
## Number of proteins detected: 2890
## First 6 protein identifiers: Rv2524c, Rv3716c, Rv1270c, Rv0724, Rv0161, Rv2535c
## Before filtering:
##   Number of proteins: 2888
##   Number of peptides: 20772
## 
## Percentage of peptides removed: 21.87%
## 
## After filtering:
##   Number of proteins: 2861
##   Number of peptides: 16230
## Before filtering:
##   Number of proteins: 2861
##   Number of peptides: 16230
## 
## Percentage of peptides removed: 0.04%
## 
## After filtering:
##   Number of proteins: 2603
##   Number of peptides: 16223
## Protein overview matrix results/swath2stats/20180913/osw_protein_all.csv written to working folder.
## [1] 3873   13
## Protein overview matrix results/swath2stats/20180913/osw_protein_matrix_mscore.csv written to working folder.
## [1] 2999   13
## Peptide overview matrix results/swath2stats/20180913/osw_peptide_matrix_mscore.csv written to working folder.
## [1] 20921    13
## Protein overview matrix results/swath2stats/20180913/osw_protein_matrix_filtered.csv written to working folder.
## [1] 2603   13
## Peptide overview matrix results/swath2stats/20180913/osw_peptide_matrix_filtered.csv written to working folder.
## [1] 93860    13
## The library contains 5 transitions per precursor.
## The data table was transformed into a table containing one row per transition.
## One or several columns required by MSstats were not in the data. The columns were created and filled with NAs.
## Missing columns: productcharge, isotopelabeltype
## isotopelabeltype was filled with light.
##  [1] "X2018_0502BrikenDIA01.mzML"       
##  [2] "X2018_0502BrikenDIA02.mzML"       
##  [3] "X2018_0502BrikenDIA03.mzML"       
##  [4] "X2018_0502BrikenDIA04.mzML"       
##  [5] "X2018_0502BrikenDIA05.mzML"       
##  [6] "X2018_0502BrikenDIA06.mzML"       
##  [7] "X2018_0502BrikenDIA07.mzML"       
##  [8] "X2018_0502BrikenDIA08.mzML"       
##  [9] "X2018_0502BrikenDIA09.mzML"       
## [10] "X2018_0502BrikenDIA10.mzML"       
## [11] "X2018_0502BrikenDIA11.mzML"       
## [12] "X2018_0502BrikenDIA12.mzML"       
## [13] "X2018_0726Briken01.mzML"          
## [14] "X2018_0726Briken02.mzML"          
## [15] "X2018_0726Briken03.mzML"          
## [16] "X2018_0726Briken04.mzML"          
## [17] "X2018_0726Briken05.mzML"          
## [18] "X2018_0726Briken06.mzML"          
## [19] "X2018_0726Briken07.mzML"          
## [20] "X2018_0726Briken08.mzML"          
## [21] "X2018_0726Briken09.mzML"          
## [22] "X2018_0726Briken11.mzML"          
## [23] "X2018_0726Briken12.mzML"          
## [24] "X2018_0726Briken13.mzML"          
## [25] "X2018_0726Briken14.mzML"          
## [26] "X2018_0726Briken15.mzML"          
## [27] "X2018_0726Briken16.mzML"          
## [28] "X2018_0726Briken17.mzML"          
## [29] "X2018_0726Briken18.mzML"          
## [30] "X2018_0726Briken19.mzML"          
## [31] "X2018_0817BrikenTrypsinDIA01.mzML"
## [32] "X2018_0817BrikenTrypsinDIA02.mzML"
## [33] "X2018_0817BrikenTrypsinDIA03.mzML"
## [34] "X2018_0817BrikenTrypsinDIA04.mzML"
## [35] "X2018_0817BrikenTrypsinDIA05.mzML"
## [36] "X2018_0817BrikenTrypsinDIA06.mzML"
## [37] "X2018_0817BrikenTrypsinDIA07.mzML"
## [38] "X2018_0817BrikenTrypsinDIA08.mzML"
## [39] "X2018_0817BrikenTrypsinDIA09.mzML"
## [40] "X2018_0817BrikenTrypsinDIA11.mzML"
## [41] "X2018_0817BrikenTrypsinDIA12.mzML"
## [42] "X2018_0817BrikenTrypsinDIA13.mzML"
## [43] "X2018_0817BrikenTrypsinDIA14.mzML"
## [44] "X2018_0817BrikenTrypsinDIA15.mzML"
## [45] "X2018_0817BrikenTrypsinDIA16.mzML"
## [46] "X2018_0817BrikenTrypsinDIA17.mzML"
## [47] "X2018_0817BrikenTrypsinDIA18.mzML"
## [48] "X2018_0817BrikenTrypsinDIA19.mzML"
## Reading the sample metadata.
## The sample definitions comprises: 48 rows(samples) and 28 columns(metadata fields).
## Matched 2632 annotations and counts.
## Bringing together the count matrix and gene information.
## The final expressionset has 2632 rows and 48 columns.

8 Perform comparisons

For the first and simplest comparison, I will take the median by condition for these three data sets and see how they compare. Then I will subset the data into whole vs. filtered and do the logFC comparisons and compare again. Finally I will repeat these processes with my version of the imputation provided by DEP.

## The factor delta_filtrate has 3 rows.
## The factor delta_whole has 3 rows.
## The factor wt_filtrate has 3 rows.
## The factor wt_whole has 3 rows.
## The factor delta_filtrate has 9 rows.
## The factor comp_filtrate has 10 rows.
## The factor delta_whole has 8 rows.
## The factor comp_whole has 9 rows.
## The factor wt_filtrate has 6 rows.
## The factor wt_whole has 6 rows.
## Error in pData(data): object 'ump_expt' not found
## Error in merge(all, ump_medians, by = "row.names"): object 'ump_medians' not found
## Error in `[.data.frame`(all, , c("delta_filtrate", "delta_filtrate.x", : undefined columns selected
## Error in cor.test(test_df[[1]], test_df[[2]], method = "spearman"): object 'test_df' not found
## Error in cor.test(test_df[[1]], test_df[[3]], method = "spearman"): object 'test_df' not found
## Error in `[.data.frame`(all, , c("delta_filtrate", "delta_filtrate.y")): undefined columns selected
## Error in cor.test(test_df[[1]], test_df[[2]]): object 'test_df' not found
## Error in normalize_expt(ump_expt, filter = TRUE): object 'ump_expt' not found
## Error in normalize_expt(input, filter = TRUE, batch = FALSE, transform = "log2", : object 'ump_norm' not found
## Error in combine_de_tables(ump_de, keepers = keepers, excel = paste0("excel/diaumpire_tables-v", : object 'ump_de' not found

9 Add imputation

## Found 11011 zeros in the data.
## The data has not been filtered.
## Filtering the data, turn on force to stop this.
## This function will replace the expt$expressionset slot with:
## pofa(data)
## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
##  some metrics are easier to see when the data is log2 transformed, but
##  EdgeR/DESeq do not accept transformed data.
## Leaving the data unconverted.  It is often advisable to cpm/rpkm
##  the data to normalize for sampling differences, keep in mind though that rpkm
##  has some annoying biases, and voom() by default does a cpm (though hpgl_voom()
##  will try to detect this).
## Leaving the data unnormalized.  This is necessary for DESeq, but
##  EdgeR/limma might benefit from normalization.  Good choices include quantile,
##  size-factor, tmm, etc.
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: pofa
## Removing 471 low-count genes (2132 remaining).
## Step 2: not normalizing the data.
## Step 3: not converting the data.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
## Invoking impute from MSnbase with the MinProb method.
## Loading required package: imputeLCMD
## Loading required package: tmvtnorm
## Loading required package: mvtnorm
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## Loading required package: gmm
## Loading required package: sandwich
## Loading required package: norm
## Loading required package: pcaMethods
## 
## Attaching package: 'pcaMethods'
## The following object is masked from 'package:stats':
## 
##     loadings
## Loading required package: impute
## [1] 51260527
## The counts remained the same?
## Found 46806 zeros in the data.
## The data has not been filtered.
## Filtering the data, turn on force to stop this.
## This function will replace the expt$expressionset slot with:
## pofa(data)
## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
##  some metrics are easier to see when the data is log2 transformed, but
##  EdgeR/DESeq do not accept transformed data.
## Leaving the data unconverted.  It is often advisable to cpm/rpkm
##  the data to normalize for sampling differences, keep in mind though that rpkm
##  has some annoying biases, and voom() by default does a cpm (though hpgl_voom()
##  will try to detect this).
## Leaving the data unnormalized.  This is necessary for DESeq, but
##  EdgeR/limma might benefit from normalization.  Good choices include quantile,
##  size-factor, tmm, etc.
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: pofa
## Removing 868 low-count genes (1764 remaining).
## Step 2: not normalizing the data.
## Step 3: not converting the data.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
## Invoking impute from MSnbase with the bpca method.
## The counts remained the same?
## Error in exprs(expt): object 'ump_expt' not found
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 70c617a152947724e6b6f07de9c37021b60133ec
## This is hpgltools commit: Mon Jun 3 11:41:31 2019 -0400: 70c617a152947724e6b6f07de9c37021b60133ec
## Saving to dia_umpire_20190308-v20190327.rda.xz

R version 3.5.3 (2019-03-11)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: imputeLCMD(v.2.0), impute(v.1.56.0), pcaMethods(v.1.74.0), norm(v.1.0-9.5), tmvtnorm(v.1.4-10), gmm(v.1.6-2), sandwich(v.2.5-1), Matrix(v.1.2-17), mvtnorm(v.1.0-10), SWATH2stats(v.1.13.5), SummarizedExperiment(v.1.12.0), DelayedArray(v.0.8.0), BiocParallel(v.1.16.6), matrixStats(v.0.54.0), GenomicRanges(v.1.34.0), GenomeInfoDb(v.1.18.2), IRanges(v.2.16.0), S4Vectors(v.0.20.1), DEP(v.1.5.3), testthat(v.2.1.1), hpgltools(v.1.0), Biobase(v.2.42.0) and BiocGenerics(v.0.28.0)

loaded via a namespace (and not attached): shinydashboard(v.0.7.1), tidyselect(v.0.2.5), lme4(v.1.1-21), htmlwidgets(v.1.3), RSQLite(v.2.1.1), AnnotationDbi(v.1.44.0), grid(v.3.5.3), devtools(v.2.0.2), munsell(v.0.5.0), codetools(v.0.2-16), preprocessCore(v.1.44.0), DT(v.0.6), withr(v.2.1.2), colorspace(v.1.4-1), GOSemSim(v.2.8.0), knitr(v.1.23), rstudioapi(v.0.10), DOSE(v.3.8.2), mzID(v.1.20.1), labeling(v.0.3), urltools(v.1.7.3), GenomeInfoDbData(v.1.2.0), polyclip(v.1.10-0), bit64(v.0.9-7), farver(v.1.1.0), rprojroot(v.1.3-2), xfun(v.0.7), R6(v.2.4.0), doParallel(v.1.0.14), bitops(v.1.0-6), fgsea(v.1.8.0), gridGraphics(v.0.4-1), assertthat(v.0.2.1), promises(v.1.0.1), scales(v.1.0.0), ggraph(v.1.0.2), enrichplot(v.1.2.0), gtable(v.0.3.0), affy(v.1.60.0), sva(v.3.30.1), processx(v.3.3.1), rlang(v.0.3.4), genefilter(v.1.64.0), mzR(v.2.16.2), GlobalOptions(v.0.1.0), splines(v.3.5.3), rtracklayer(v.1.42.2), lazyeval(v.0.2.2), selectr(v.0.4-1), europepmc(v.0.3), BiocManager(v.1.30.4), yaml(v.2.2.0), reshape2(v.1.4.3), GenomicFeatures(v.1.34.8), backports(v.1.1.4), httpuv(v.1.5.1), qvalue(v.2.14.1), clusterProfiler(v.3.10.1), tools(v.3.5.3), usethis(v.1.5.0), ggplotify(v.0.0.3), ggplot2(v.3.1.1), affyio(v.1.52.0), gplots(v.3.0.1.1), RColorBrewer(v.1.1-2), sessioninfo(v.1.1.1), MSnbase(v.2.8.3), ggridges(v.0.5.1), Rcpp(v.1.0.1), plyr(v.1.8.4), base64enc(v.0.1-3), progress(v.1.2.2), zlibbioc(v.1.28.0), purrr(v.0.3.2), RCurl(v.1.95-4.12), ps(v.1.3.0), prettyunits(v.1.0.2), GetoptLong(v.0.1.7), viridis(v.0.5.1), cowplot(v.0.9.4), zoo(v.1.8-5), cluster(v.2.0.9), ggrepel(v.0.8.1), colorRamps(v.2.3), fs(v.1.3.1), variancePartition(v.1.12.3), magrittr(v.1.5), data.table(v.1.12.2), DO.db(v.2.9), openxlsx(v.4.1.0), circlize(v.0.4.6), triebeard(v.0.3.0), packrat(v.0.5.0), ProtGenerics(v.1.14.0), pkgload(v.1.0.2), mime(v.0.6), hms(v.0.4.2), evaluate(v.0.13), xtable(v.1.8-4), pbkrtest(v.0.4-7), XML(v.3.98-1.19), shape(v.1.4.4), gridExtra(v.2.3), compiler(v.3.5.3), biomaRt(v.2.38.0), tibble(v.2.1.1), KernSmooth(v.2.23-15), ncdf4(v.1.16.1), crayon(v.1.3.4), minqa(v.1.2.4), htmltools(v.0.3.6), later(v.0.8.0), mgcv(v.1.8-28), tidyr(v.0.8.3), DBI(v.1.0.0), tweenr(v.1.0.1), ComplexHeatmap(v.1.20.0), MASS(v.7.3-51.4), boot(v.1.3-22), readr(v.1.3.1), cli(v.1.1.0), vsn(v.3.50.0), gdata(v.2.18.0), igraph(v.1.2.4.1), pkgconfig(v.2.0.2), rvcheck(v.0.1.3), GenomicAlignments(v.1.18.1), MALDIquant(v.1.19.3), xml2(v.1.2.0), foreach(v.1.4.4), annotate(v.1.60.1), XVector(v.0.22.0), rvest(v.0.3.4), stringr(v.1.4.0), callr(v.3.2.0), digest(v.0.6.19), Biostrings(v.2.50.2), rmarkdown(v.1.12), fastmatch(v.1.1-0), curl(v.3.3), shiny(v.1.3.2), Rsamtools(v.1.34.1), gtools(v.3.8.1), rjson(v.0.2.20), nloptr(v.1.2.1), nlme(v.3.1-140), jsonlite(v.1.6), desc(v.1.2.0), viridisLite(v.0.3.0), limma(v.3.38.3), pillar(v.1.4.0), lattice(v.0.20-38), httr(v.1.4.0), pkgbuild(v.1.0.3), survival(v.2.44-1.1), GO.db(v.3.7.0), glue(v.1.3.1), remotes(v.2.0.4), zip(v.2.0.2), fdrtool(v.1.2.15), UpSetR(v.1.3.3), iterators(v.1.0.10), pander(v.0.6.3), bit(v.1.1-14), ggforce(v.0.2.2), stringi(v.1.4.3), blob(v.1.1.1), caTools(v.1.17.1.2), memoise(v.1.1.0) and dplyr(v.0.8.1)

---
title: "M. tuberculosis 20190327: DIA-Umpire based OpenSWATH workflow."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
  html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    keep_md: false
    mode: selfcontained
    number_sections: true
    self_contained: true
    theme: readable
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: false
  rmdformats::readthedown:
    code_download: true
    code_folding: show
    df_print: paged
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    width: 300
    keep_md: false
    mode: selfcontained
    toc_float: true
  BiocStyle::html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: tango
    keep_md: false
    mode: selfcontained
    toc_float: true
---

<style type="text/css">
body, td {
  font-size: 16px;
}
code.r{
  font-size: 16px;
}
pre {
 font-size: 16px
}
</style>

```{r options, include=FALSE}
library("hpgltools")
tt <- devtools::load_all("/data/hpgltools")
knitr::opts_knit$set(width=120,
                     progress=TRUE,
                     verbose=TRUE,
                     echo=TRUE)
knitr::opts_chunk$set(error=TRUE,
                      dpi=96)
old_options <- options(digits=4,
                       stringsAsFactors=FALSE,
                       knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
rundate <- format(Sys.Date(), format="%Y%m%d")
previous_file <- "02_estimation_infection_20180822.Rmd"
ver <- "20190327"

##tmp <- sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz")))
rmd_file <- "dia_umpire_20190308.Rmd"
```

# Rework DIA-Umpire to feed OpenSWATH.

In the following blocks I want to use DIA Umpire to create transition libraries for openswath,
then I want to run openswath and score the runs.

## Invoke DIA Umpire

```{bash umpire, eval=FALSE}
cd ~/scratch/proteomics/mycobacterium_tuberculosis_2018
module add openms

type="mzXML"
export VERSION="20190228"
basedir="${HOME}/scratch/proteomics/mycobacterium_tuberculosis_2018"
base_input="${basedir}/results/01${type}/dia/${VERSION}/"
umpire_inputs=$(/usr/bin/find "${base_input}" -name "*.${type}" | sort)
echo "Checking in: ${umpire_inputs}"
for input in ${umpire_inputs};
do
    in_name=$(basename $input ".${type}")
    out_name="${in_name}_Q1.mgf"
    if [[ ! -f "${base_input}/${out_name}" ]]; then
        echo "The output file: ${out_name} already exists."
    else
        java -jar DIA_Umpire_SE.jar ${input} diaumpire_se.params
    fi
done
```

## Convert DIA Umpire results

```{bash convert, eval=FALSE}
msconvert --mzXML results/02mgf/*.mgf
mv *.mzXML results/03_dia_umpire_mzxml
```

## Search the Umpire results

```{bash search_umpire, eval=FALSE}
comet \
    -Pparameters/comet_dia_umpire_params.txt \
    results/03_dia_umpire_mzxml/*.mzXML
```

## Merge them

```{bash merge_xinteract, eval=FALSE}
xinteract \
    -dDECOY_ \
    -OARPd \
    -Ninteract.comet.pep.xml \
    results/03_dia_umpire_mzxml/*.pep.xml

mv interact.comet.pep.xml results/04_dia_umpire_xinteract
```

## Combine the statistics

```{bash protein_prophet, eval=FALSE}
ProteinProphet \
    results/04_dia_umpire_xinteract/interact.comet.pep.xml \
    results/05_dia_umpire_prophet/combined.prot.xml

InterProphetParser \
    DECOY=DECOY \
    results/04_dia_umpire_xinteract/interact.comet.pep.xml \
    results/05_dia_umpire_prophet/iProphet.pep.xml

Mayu.pl \
    -A results/05_dia_umpire_prophet/iProphet.pep.xml \
    -C reference/mtb_irt.fasta \
    -E DECOY
```

```{r extract_pct_mayu, eval=FALSE}
mayu_output <- "../2019-05-12_12.37.03_main_1.07.csv"
number <- hpgltools::extract_mayu_pps_fdr(mayu_output)
message("The number is: ", number)
## 0.43291
```


```{bash umpire_contd, eval=FALSE}
## Rerunning because writing the file failed.
spectrast \
    -cNSpecLib -cICID-QTOF \
    -cf "Protein! ~ DECOY_" \
    -cP0.4237 \
    -c_IRTreference/irt.txt \
    -c_IRR results/05_dia_umpire_prophet/iProphet.pep.xml

spectrast \
    -cNSpecLib_cons \
    -cICID-QTOF \
    -cAC SpecLib.splib

spectrast2tsv.py \
    -l 350,2000 \
    -s b,y \
    -x 1,2 \
    -o 6 \
    -n 6 \
    -p 0.05 \
    -d -e \
    -k openswath \
    -w windows/2018_0817BrikenTrypsinDIA19.txt \
    -a SpecLib_cons_openswath.tsv \
    SpecLib_cons.sptxt

TargetedFileConverter \
    -in SpecLib_cons_openswath.tsv \
    -in_type tsv \
    -out SpecLib_cons_openswath.TraML \
    -out_type TraML

OpenSwathDecoyGenerator \
    -in SpecLib_cons_openswath.TraML \
    -out SpecLib_cons_openswath_decoy.TraML \
    -method shuffle
##    -exclude_similar \
##    -similarity_threshold 0.05 \
##    -identity_threshold 0.7

TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.tsv \
    -out_type tsv

TargetedFileConverter \
    -in SpecLib_cons_openswath_decoy.TraML \
    -in_type TraML \
    -out SpecLib_cons_openswath_decoy.pqp \
    -out_type pqp

export VERSION=${VERSION:-20190327}
echo "Loading environment modules and parameters for version: ${VERSION}."
source "parameters/${VERSION}_settings.sh"

echo "Invoking the OpenSwathWorkflow using local comet-derived transitions."
type="diaumpire"
input_type="mzXML"
export TRANSITION_PREFIX="SpecLib_cons_openswath_decoy"
echo "Checking in, the transition library is: ${TRANSITION_PREFIX}.pqp"
base_mzxmldir="results/01${input_type}/dia/${VERSION}"
swath_inputs=$(/usr/bin/find "${base_mzxmldir}" -name *.${input_type} -print | sort)
echo "Checking in, the inputs are: ${swath_inputs}"
mkdir -p "${SWATH_OUTDIR}_${type}"
pypdir="${PYPROPHET_OUTDIR}_${type}"
mkdir -p "${pypdir}"
for input in ${swath_inputs}
do
    name=$(basename "${input}" ".${input_type}")
    echo "Starting openswath run, library type ${type} for ${name} using ${MZ_WINDOWS} windows at $(date)."
    swath_output_prefix="${SWATH_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    pyprophet_output_prefix="${PYPROPHET_OUTDIR}_${type}/${name}_${DDA_METHOD}"
    echo "Deleting previous swath output file: ${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.osw"
    rm -f "${swath_output_prefix}.tsv"
    OpenSwathWorkflow \
        -in "${input}" \
        -force \
        -sort_swath_maps \
        -min_upper_edge_dist 1 \
        -mz_correction_function "quadratic_regression_delta_ppm" \
        -Scoring:TransitionGroupPicker:background_subtraction "original" \
        -Scoring:stop_report_after_feature "5" \
        -swath_windows_file "windows/openswath_${name}.txt" \
        -tr "${TRANSITION_PREFIX}.pqp" \
        -out_tsv "${swath_output_prefix}.tsv"
    OpenSwathWorkflow \
        -in "${input}" \
        -force \
        -sort_swath_maps \
        -min_upper_edge_dist 1 \
        -mz_correction_function "quadratic_regression_delta_ppm" \
        -Scoring:TransitionGroupPicker:background_subtraction "original" \
        -Scoring:stop_report_after_feature "5" \
        -swath_windows_file "windows/openswath_${name}.txt" \
        -tr "${TRANSITION_PREFIX}.pqp" \
        -out_osw "${swath_output_prefix}.osw"
    ##2>"${swath_output_prefix}_osw.log" 1>&2
done
swath_out=$(dirname ${swath_output_prefix})
pyprophet_out="$(dirname "${pyprophet_output_prefix}")/openswath_merged.osw"
echo "Merging osw files to ${pyprophet_out}"
pyprophet merge \
          --template "${TRANSITION_PREFIX}.pqp" \
          --out="${pyprophet_out}" \
          ${swath_out}/*.osw
pyprophet score --in="${pyprophet_out}"
pyprophet export --in="${pyprophet_out}" --out "test.tsv"
## pyprophet always exports to the current working directory.
final_name="$(dirname ${pyprophet_out})/$(basename ${pyprophet_out} ".osw").tsv"
echo $final_name
mv "test.tsv"
ls -ld "${pyprophet_out}"

tric_tb="${TRIC_OUTDIR}_tuberculist"
mkdir -p "${tric_tb}"
feature_alignment.py \
    --force \
    --in "./${pypdir}/"*.tsv \
    --out "${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv" \
    --out_matrix "${tric_tb}/${DDA_METHOD}_outmatrix.tsv" \
    --out_meta "${tric_tb}/${DDA_METHOD}_meta.tsv"
2>"${tric_tb}/feature_alignment.err" \
 1>"${tric_tb}/feature_alignment.out"
echo "Wrote final output to ${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv"
```

# DEP usage

Thanks to Vivek, I now am aware of DEP, which does everything I wish MSstats did.
The matrix given to me by tric's feature_alignment.py I think gives me what DEP
requires, along with my annotations and sample sheet.

Let us see if this is true.

## Protein annotations

```{r protein_annotations}
mtb_gff <- "reference/mycobacterium_tuberculosis_h37rv_2.gff.gz"
mtb_genome <- "reference/mtuberculosis_h37rv_genbank.fasta"
mtb_cds <- "reference/mtb_cds.fasta"

mtb_annotations <- sm(load_gff_annotations(mtb_gff, type="gene"))
colnames(mtb_annotations) <- gsub(pattern="\\.", replacement="", x=colnames(mtb_annotations))
mtb_annotations[["description"]] <- gsub(pattern="\\+", replacement=" ",
                                         x=mtb_annotations[["description"]])
mtb_annotations[["function"]] <- gsub(pattern="\\+", replacement=" ",
                                      x=mtb_annotations[["function"]])
rownames(mtb_annotations) <- mtb_annotations[["ID"]]

mtb_microbes <- load_microbesonline_annotations(id=83332)
```

## Preprocess intensities in preparation for DEP

```{r swath2stats, fig.show="hide"}
ver <- "20190327"
ump_data <- read.csv(
  paste0("results/tric/", ver, "/whole_8mz_dia_umpire/comet_HCD.tsv"), sep="\t")
ump_data[["ProteinName"]] <- gsub(pattern="^(.*)_.*$", replacement="\\1",
                                  x=ump_data[["ProteinName"]])
sample_annot <- extract_metadata(paste0("sample_sheets/Mtb_dia_samples_20190521.xlsx"))
colnames(sample_annot)
rownames(sample_annot)
levels(as.factor(ump_data[["filename"]]))

devtools::load_all("~/scratch/git/SWATH2stats_myforked")
ump_s2s <- SWATH2stats::sample_annotation(data=ump_data,
                                          sample_annotation=sample_annot,
                                          fullpeptidename_column="fullpeptidename")

decoy_lists <- assess_decoy_rate(ump_s2s)
## This seems a bit high to me, yesno?
ump_fdr <- assess_fdr_overall(ump_s2s, output="Rconsole", plot=TRUE)

ump_byrun_fdr <- assess_fdr_byrun(ump_s2s, FFT=0.7, plot=TRUE, output="Rconsole")
ump_chosen_mscore2 <- mscore4assayfdr(ump_s2s, FFT=0.7, fdr_target=0.02)
ump_prot_score <- mscore4protfdr(ump_s2s, FFT=0.7, fdr_target=0.02)

ump_filtered_ms <- filter_mscore(ump_s2s, ump_chosen_mscore2)
ump_filtered_fq <- filter_mscore_freqobs(ump_s2s, 0.01, 0.8, rm.decoy=FALSE)
ump_filtered_ms_fdr <- filter_mscore_fdr(ump_filtered_ms, FFT=0.7,
                                         overall_protein_fdr_target=prot_score,
                                         upper_overall_peptide_fdr_limit=0.05)
ump_filtered_ms_fdr_pr <- filter_proteotypic_peptides(ump_filtered_ms_fdr)
ump_filtered_ms_fdr_pr_all <- filter_all_peptides(ump_filtered_ms_fdr_pr)
ump_filtered_ms_fdr_pr_all_str <- filter_on_max_peptides(data=ump_filtered_ms_fdr_pr_all, n_peptides=10)
ump_filtered_all_filters <- filter_on_min_peptides(data=ump_filtered_ms_fdr_pr_all_str, n_peptides=3)

ump_matrix_prefix <- file.path("results", "swath2stats", ver)
if (!file.exists(matrix_prefix)) {
  dir.create(matrix_prefix)
}
protein_matrix_all <- write_matrix_proteins(
  ump_s2s, write.csv=TRUE,
  filename=file.path(matrix_prefix, "ump_protein_all.csv"))
dim(protein_matrix_all)
protein_matrix_mscore <- write_matrix_proteins(
  ump_filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "ump_protein_matrix_mscore.csv"))
dim(protein_matrix_mscore)
peptide_matrix_mscore <- write_matrix_peptides(
  ump_filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "ump_peptide_matrix_mscore.csv"))
dim(peptide_matrix_mscore)
protein_matrix_filtered <- write_matrix_proteins(
  ump_filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "ump_protein_matrix_filtered.csv"))
dim(protein_matrix_filtered)
peptide_matrix_filtered <- write_matrix_peptides(
  ump_filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "ump_peptide_matrix_filtered.csv"))
dim(peptide_matrix_filtered)
```

# Look at raw data

```{r msraw}
mzml_data <- extract_msraw_data(sample_annot, parallel=FALSE,
                                format="mzML",
                                allow_window_overlap=FALSE,
                                file_column="mzmlfile",
                                savefile="testing.rda")
```

# Get intensities

```{r tb_expt, fig.show="hide"}
intensities <- protein_matrix_filtered
cols <- gsub(x=colnames(intensities), pattern="^.*(2018.*$)", replacement="s\\1")
cols[[1]] <- "Protein"
colnames(intensities) <- cols
rownames(intensities) <- intensities[["Protein"]]
intensities[["Protein"]] <- NULL

## Check the sample names of the intensity matrix and sample annotations.
colnames(intensities) %in% rownames(sample_annot)
rownames(sample_annot) %in% colnames(intensities)
##rownames(sample_annot)
##colnames(intensities)

reordered <- colnames(intensities)
metadata <- sample_annot[reordered, ]
ump_expt <- create_expt(sample_annot,
                        count_dataframe=intensities,
                        gene_info=mtb_annotations)
```


# Pass the data to DEP and see what happens.

```{r make_se_shenanigans}
devtools::load_all("~/scratch/git/DEP")
wtf <- function (proteins_unique, columns, expdesign) {
  assertthat::assert_that(is.data.frame(proteins_unique), is.integer(columns),
                          is.data.frame(expdesign))
  if (any(!c("name", "ID") %in% colnames(proteins_unique))) {
    stop("'name' and/or 'ID' columns are not present in '",
         deparse(substitute(proteins_unique)), "'.\nRun make_unique() to obtain the required columns",
         call. = FALSE)
  }
  if (any(!c("label", "condition", "replicate") %in% colnames(expdesign))) {
    stop("'label', 'condition' and/or 'replicate' columns",
         "are not present in the experimental design", call. = FALSE)
  }
  if (any(!apply(proteins_unique[, columns], 2, is.numeric))) {
    stop("specified 'columns' should be numeric", "\nRun make_se_parse() with the appropriate columns as argument",
         call. = FALSE)
  }
  if (tibble::is.tibble(proteins_unique))
    proteins_unique <- as.data.frame(proteins_unique)
  if (tibble::is.tibble(expdesign))
    expdesign <- as.data.frame(expdesign)
  rownames(proteins_unique) <- proteins_unique$name
  raw <- proteins_unique[, columns]
  raw[raw == 0] <- NA
  raw <- log2(raw)
  expdesign <- mutate(expdesign, condition = make.names(condition))
  ## I changed the following because it didn't make sense to me.
  if (is.null(expdesign[["ID"]])) {
    expdesign <- expdesign %>%
      tidyr::unite(condition, replicate, remove=FALSE)
  }

  rownames(expdesign) <- expdesign$ID
  matched <- match(make.names(delete_prefix(expdesign$label)),
                   make.names(delete_prefix(colnames(raw))))
  if (any(is.na(matched))) {
    stop("None of the labels in the experimental design match ",
         "with column names in 'proteins_unique'", "\nRun make_se() with the correct labels in the experimental design",
         "and/or correct columns specification")
  }
  colnames(raw)[matched] <- expdesign$ID
  raw <- raw[, !is.na(colnames(raw))][rownames(expdesign)]
  row_data <- proteins_unique[, -columns]
  rownames(row_data) <- row_data$name
  se <- SummarizedExperiment(assays = as.matrix(raw), colData = expdesign,
                             rowData = row_data)
  return(se)
}
```

```{r start_DEP}
library(DEP)
library(SummarizedExperiment)

design <- pData(protein_expt)
design[["label"]] <- design[["sampleid"]]
design[["replicate"]] <- design[["sampleid"]]

my_se <- SummarizedExperiment(
  assays=exprs(protein_expt),
  colData=design,
  rowData=fData(protein_expt))

mtb_unique <- as.data.frame(exprs(protein_expt))
mtb_unique[["name"]] <- rownames(mtb_unique)
mtb_unique[["ID"]] <- rownames(mtb_unique)
intensity_columns <- 1:60
## mtb_se <- wtf(mtb_unique, intensity_columns, design)
mtb_se <- make_se(mtb_unique, intensity_columns, design)

DEP::plot_frequency(mtb_se)
dim(assay(mtb_se))
mtb_filt <- DEP::filter_missval(mtb_se, thr=2)
dim(assay(mtb_filt))
DEP::plot_numbers(mtb_se)
DEP::plot_coverage(mtb_se)

mtb_norm <- DEP::normalize_vsn(mtb_se)
DEP::plot_normalization(mtb_se, mtb_norm)

DEP::plot_missval(mtb_se)
DEP::plot_detect(mtb_se)

mtb_imp <- DEP::impute(mtb_norm, fun="MinProb", q=0.01)
mtb_imp_man <- DEP::impute(mtb_norm, fun="man", shift=1.8, scale=0.3)
DEP::plot_imputation(mtb_norm, mtb_imp)

mtb_diff <- DEP::test_diff(mtb_imp, type="all")
mtb_diff <- DEP::test_diff(mtb_imp, type="manual",
                           test=c("wt_filtrate_vs_wt_whole",
                                  "delta_filtrate_vs_wt_filtrate",
                                  "comp_filtrate_vs_wt_filtrate",
                                  "wt_filtrate_vs_delta_filtrate",
                                  "wt_filtrate_vs_comp_filtrate",
                                  "wt_whole_vs_delta_whole",
                                  "wt_whole_vs_comp_whole"))

mtb_dep <- DEP::add_rejections(mtb_diff, alpha=0.05, lfc=0.6)
## mtb_pca <- DEP::plot_pca(mtb_dep)
## The PCA plotter provided by DEP has some problems.

DEP::plot_cor(mtb_dep)
DEP::plot_heatmap(mtb_dep, type="centered", kmeans=TRUE)
DEP::plot_heatmap(mtb_dep, type="contrast", kmeans=TRUE)

DEP::plot_volcano(mtb_dep, contrast="wt_whole_vs_delta_whole")
DEP::plot_volcano(mtb_dep, contrast="wt_filtrate_vs_delta_filtrate")
DEP::plot_cond(mtb_dep)
mtb_result <- DEP::get_results(mtb_dep)
plot_single(mtb_dep, proteins = c("Rv0287", "Rv0288"))
written_dep <- hpgltools::write_xls(data=mtb_result, excel="excel/dep_result.xlsx")
```

# Minor change to plot_missval

DEP has a neat function to plot missing values.  Sadly, it does not return the
actual matrix, only the plot.  This is nice and all, but I need the matrix, ergo
this minor change.

```{r my_plot_missing}
count_defined <- function(se) {
  df <- as.data.frame(assay(se))
  na_idx <- is.na(df)
  defined_mtrx <- ifelse(is.na(df), 0, 1)
  return(defined_mtrx)
}

def_mtrx <- count_defined(mtb_se)
summary(def_mtrx)
defined_by_protein <- rowSums(def_mtrx)
head(defined_by_protein)
defined_by_sample <- colSums(def_mtrx)
defined_by_sample
```

# Request 20190523

Compare our 'normal' openswath output via hpgltools analysis vs. the umpire
version.  Secondary goal: With and without imputation.

```{r compare umpire_vs_normal}
ver <- "20180913"
tric_data <- read.csv(
  paste0("results/tric/", ver, "/whole_8mz_tuberculist/comet_HCD.tsv"), sep="\t")
tric_data[["ProteinName"]] <- gsub(pattern="^(.*)_.*$", replacement="\\1",
                                   x=tric_data[["ProteinName"]])
sample_annot <- extract_metadata(paste0("sample_sheets/Mtb_dia_samples_", ver, ".xlsx"))
kept <- ! grepl(x=rownames(sample_annot), pattern="^s\\.\\.")
sample_annot <- sample_annot[kept, ]
devtools::load_all("~/scratch/git/SWATH2stats_myforked")
s2s_exp <- sample_annotation(data=tric_data,
                             sample_annotation=sample_annot,
                             fullpeptidename_column="fullpeptidename")

decoy_lists <- assess_decoy_rate(s2s_exp)
## This seems a bit high to me, yesno?
fdr_overall <- assess_fdr_overall(s2s_exp, output="Rconsole", plot=TRUE)
byrun_fdr <- assess_fdr_byrun(s2s_exp, FFT=0.7, plot=TRUE, output="Rconsole")
chosen_mscore <- mscore4assayfdr(s2s_exp, FFT=0.7, fdr_target=0.02)
prot_score <- mscore4protfdr(s2s_exp, FFT=0.7, fdr_target=0.02)
filtered_ms <- filter_mscore(s2s_exp, chosen_mscore)
filtered_fq <- filter_mscore_freqobs(s2s_exp, 0.01, 0.8, rm.decoy=FALSE)
filtered_ms_fdr <- filter_mscore_fdr(filtered_ms, FFT=0.7,
                                     overall_protein_fdr_target=prot_score,
                                     upper_overall_peptide_fdr_limit=0.05)
filtered_ms_fdr_pr <- filter_proteotypic_peptides(filtered_ms_fdr)
filtered_ms_fdr_pr_all <- filter_all_peptides(filtered_ms_fdr_pr)
filtered_ms_fdr_pr_all_str <- filter_on_max_peptides(data=filtered_ms_fdr_pr_all, n_peptides=10)
filtered_all_filters <- filter_on_min_peptides(data=filtered_ms_fdr_pr_all_str, n_peptides=3)

matrix_prefix <- file.path("results", "swath2stats", ver)
if (!file.exists(matrix_prefix)) {
  dir.create(matrix_prefix)
}
protein_matrix_all <- write_matrix_proteins(
  s2s_exp, write.csv=TRUE,
  filename=file.path(matrix_prefix, "osw_protein_all.csv"))
dim(protein_matrix_all)
protein_matrix_mscore <- write_matrix_proteins(
  filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "osw_protein_matrix_mscore.csv"))
dim(protein_matrix_mscore)
peptide_matrix_mscore <- write_matrix_peptides(
  filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "osw_peptide_matrix_mscore.csv"))
dim(peptide_matrix_mscore)
protein_matrix_filtered <- write_matrix_proteins(
  filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "osw_protein_matrix_filtered.csv"))
dim(protein_matrix_filtered)
peptide_matrix_filtered <- write_matrix_peptides(
  filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "osw_peptide_matrix_filtered.csv"))
dim(peptide_matrix_filtered)
cols <- colnames(filtered_all_filters)
disaggregated <- disaggregate(filtered_all_filters, all.columns=TRUE)
msstats_input <- convert_MSstats(disaggregated)

prot_mtrx <- protein_matrix_filtered
rownames(prot_mtrx) <- gsub(pattern="^1\\/", replacement="", x=prot_mtrx[["proteinname"]])
prot_mtrx <- prot_mtrx[, -1]
## Important question: Did SWATH2stats reorder my data?
colnames(prot_mtrx) <- gsub(pattern="^(.*)(2018.*)$", replacement="s\\2", x=colnames(prot_mtrx))
reordered <- colnames(prot_mtrx)
metadata <- sample_annot[reordered, ]
osw_expt <- sm(create_expt(metadata,
                           count_dataframe=prot_mtrx,
                           gene_info=mtb_annotations))
```

```{r compare_encylopedia}
ver <- "20190327"
enc_metadata <- hpgltools:::read_metadata("sample_sheets/Mtb_dia_samples_encyclopedia_20190327.xlsx")
rownames(enc_metadata) <- paste0("s", enc_metadata[["sampleid"]])
enc_matrix <- read.table("encyclopedia/most_samples_quant_report.elib.proteins.txt", header=TRUE)
enc_pep_matrix <- read.table("encyclopedia/most_samples_quant_report.elib.peptides.txt", header=TRUE)
rownames(enc_matrix) <- enc_matrix[["Protein"]]
enc_matrix <- enc_matrix[, -1]
enc_matrix <- enc_matrix[, -1]
enc_matrix <- enc_matrix[, -1]
colnames(enc_matrix)
colnames(enc_matrix) <- gsub(pattern="X", replacement="s", x=colnames(enc_matrix))
colnames(enc_matrix) <- gsub(pattern="\\.mzML", replacement="", x=colnames(enc_matrix))
colnames(enc_matrix) <- gsub(pattern="^X", replacement="s", x=colnames(enc_matrix))

na_idx <- is.na(enc_matrix)
enc_matrix[na_idx] <- 0

enc_expt <- create_expt(metadata=enc_metadata, count_dataframe=enc_matrix, gene_info=NULL)
```

# Perform comparisons

For the first and simplest comparison, I will take the median by condition for
these three data sets and see how they compare.  Then I will subset the data
into whole vs. filtered and do the logFC comparisons and compare again.  Finally
I will repeat these processes with my version of the imputation provided by DEP.

```{r first_comparisons}
osw_medians <- median_by_factor(osw_expt)
enc_medians <- median_by_factor(enc_expt)
ump_medians <- median_by_factor(ump_expt)

all <- merge(osw_medians, enc_medians, by="row.names")
rownames(all) <- all[["Row.names"]]
all[["Row.names"]] <- NULL
all <- merge(all, ump_medians, by="row.names")
rownames(all) <- all[["Row.names"]]
all[["Row.names"]] <- NULL

## OpenSWATH 'normal' vs. EncyclopeDIA
test_df <- all[, c("delta_filtrate", "delta_filtrate.x", "delta_filtrate.y")]
cor.test(test_df[[1]], test_df[[2]], method="spearman")
cor.test(test_df[[1]], test_df[[3]], method="spearman")

## OpenSWATH 'normal' vs. DIAUmpire
test_df <- all[, c("delta_filtrate", "delta_filtrate.y")]
cor.test(test_df[[1]], test_df[[2]])

keepers <- list(
  "wt_cfwhole" = c("wt_filtrate", "wt_whole"),
  "delta_cfwhole" = c("delta_filtrate", "delta_whole"),
  "whole_deltawt" = c("delta_whole", "wt_whole"),
  "cf_deltawt" = c("delta_filtrate", "wt_filtrate"))
ump_norm <- normalize_expt(ump_expt, filter=TRUE)
ump_de <- all_pairwise(ump_norm, force=TRUE)
whole_tables <- combine_de_tables(
  ump_de,
  keepers=keepers,
  excel=paste0("excel/diaumpire_tables-v", ver, ".xlsx"))
```

# Add imputation

```{r imputation_addition}
osw_imputed <- impute_expt(osw_expt, fun="MinProb", q=0.05)
enc_imputed <- impute_expt(enc_expt)
ump_imputed <- impute_expt(ump_expt, fun="knn", k=10, rowmax=0.9, force=TRUE)
```


```{r saveme}
if (!isTRUE(get0("skip_load"))) {
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename=this_save))
  pander::pander(sessionInfo())
}
```

```{r loadme, eval=FALSE}
tmp <- loadme(filename=this_save)
```
