In the following blocks I use DIA-Umpire to create transition libraries for OpenSWATH, then run OpenSWATH and score the resulting runs.
## Run the DIA-Umpire signal extraction jar on every input file of the chosen
## type found beneath base_input, skipping inputs whose Q1 mgf already exists.
cd ~/scratch/proteomics/mycobacterium_tuberculosis_2018
module add openms
type="mzXML"
export VERSION="20190228"
basedir="${HOME}/scratch/proteomics/mycobacterium_tuberculosis_2018"
base_input="${basedir}/results/01${type}/dia/${VERSION}/"
umpire_inputs=$(/usr/bin/find "${base_input}" -name "*.${type}" | sort)
echo "Checking in: ${umpire_inputs}"
## umpire_inputs is deliberately left unquoted so that each path found becomes
## one loop iteration (paths containing whitespace are not supported).
for input in ${umpire_inputs}; do
  in_name=$(basename "${input}" ".${type}")
  out_name="${in_name}_Q1.mgf"
  ## The original test was negated, which reported "already exists" for
  ## missing outputs and re-ran the extraction only for finished ones.
  if [[ -f "${base_input}/${out_name}" ]]; then
    echo "The output file: ${out_name} already exists."
  else
    java -jar DIA_Umpire_SE.jar "${input}" diaumpire_se.params
  fi
done
## Post-process the xinteract peptide results: ProteinProphet and
## InterProphetParser both read interact.comet.pep.xml and write into
## results/05_dia_umpire_prophet, then Mayu is run on the iProphet output.
ProteinProphet results/04_dia_umpire_xinteract/interact.comet.pep.xml \
  results/05_dia_umpire_prophet/combined.prot.xml
InterProphetParser DECOY=DECOY \
  results/04_dia_umpire_xinteract/interact.comet.pep.xml \
  results/05_dia_umpire_prophet/iProphet.pep.xml
## -E DECOY is presumably the decoy identifier prefix -- confirm against Mayu's usage.
Mayu.pl -A results/05_dia_umpire_prophet/iProphet.pep.xml \
  -C reference/mtb_irt.fasta -E DECOY
## Read the Mayu csv sitting one directory up and report the value which
## hpgltools extracts from it.
mayu_output <- file.path("..", "2019-05-12_12.37.03_main_1.07.csv")
number <- hpgltools::extract_mayu_pps_fdr(
  mayu_output)
message("The number is: ",
        number)
## 0.43291
## Rerunning because writing the file failed.
## First pass: create the library named SpecLib from the iProphet results,
## applying the "Protein! ~ DECOY_" filter and the iRT file reference/irt.txt.
## NOTE(review): -cP0.4237 is presumably the probability cutoff; it does not
## match the 0.43291 printed by the Mayu step above -- confirm which is intended.
spectrast -cNSpecLib -cICID-QTOF -cf "Protein! ~ DECOY_" -cP0.4237 \
  -c_IRTreference/irt.txt \
  -c_IRR results/05_dia_umpire_prophet/iProphet.pep.xml
## Second pass: create SpecLib_cons from the SpecLib.splib written above.
spectrast -cNSpecLib_cons -cICID-QTOF -cAC SpecLib.splib
## Convert the SpecLib_cons.sptxt library into an openswath-keyed tsv, using
## the window definitions in windows/2018_0817BrikenTrypsinDIA19.txt.
spectrast2tsv.py -l 350,2000 -s b,y -x 1,2 -o 6 -n 6 -p 0.05 -d -e \
  -k openswath \
  -w windows/2018_0817BrikenTrypsinDIA19.txt \
  -a SpecLib_cons_openswath.tsv \
  SpecLib_cons.sptxt
## Step 1: tsv -> TraML.
TargetedFileConverter -in SpecLib_cons_openswath.tsv -in_type tsv \
  -out SpecLib_cons_openswath.TraML -out_type TraML
## Step 2: write a second TraML via the decoy generator's shuffle method.
OpenSwathDecoyGenerator -in SpecLib_cons_openswath.TraML \
  -out SpecLib_cons_openswath_decoy.TraML -method shuffle
## Options which were tried and are currently disabled:
## -exclude_similar \
## -similarity_threshold 0.05 \
## -identity_threshold 0.7
## Step 3: export the decoy-containing TraML as both tsv and pqp.
TargetedFileConverter -in SpecLib_cons_openswath_decoy.TraML -in_type TraML \
  -out SpecLib_cons_openswath_decoy.tsv -out_type tsv
TargetedFileConverter -in SpecLib_cons_openswath_decoy.TraML -in_type TraML \
  -out SpecLib_cons_openswath_decoy.pqp -out_type pqp
## Prepare an OpenSwathWorkflow run: choose the version (defaulting to
## 20190327 when VERSION is unset or empty), source its settings file, collect
## the input files and create the output directories.
export VERSION=${VERSION:-20190327}
echo "Loading environment modules and parameters for version: ${VERSION}."
## NOTE(review): SWATH_OUTDIR, PYPROPHET_OUTDIR and the other upper-case
## settings used below are presumably defined by this file -- confirm.
source "parameters/${VERSION}_settings.sh"
echo "Invoking the OpenSwathWorkflow using local comet-derived transitions."
type="diaumpire"
input_type="mzXML"
export TRANSITION_PREFIX="SpecLib_cons_openswath_decoy"
echo "Checking in, the transition library is: ${TRANSITION_PREFIX}.pqp"
base_mzxmldir="results/01${input_type}/dia/${VERSION}"
## The -name pattern must be quoted; unquoted, the shell expands it against
## the current directory first whenever a matching file happens to be there.
swath_inputs=$(/usr/bin/find "${base_mzxmldir}" -name "*.${input_type}" -print | sort)
echo "Checking in, the inputs are: ${swath_inputs}"
mkdir -p "${SWATH_OUTDIR}_${type}"
pypdir="${PYPROPHET_OUTDIR}_${type}"
mkdir -p "${pypdir}"
## Run OpenSwathWorkflow twice per input with the same settings: once writing
## a tsv and once writing an osw file.  Previous outputs are removed first.
for input in ${swath_inputs}; do
  name=$(basename "${input}" ".${input_type}")
  echo "Starting openswath run, library type ${type} for ${name} using ${MZ_WINDOWS} windows at $(date)."
  swath_output_prefix="${SWATH_OUTDIR}_${type}/${name}_${DDA_METHOD}"
  pyprophet_output_prefix="${PYPROPHET_OUTDIR}_${type}/${name}_${DDA_METHOD}"
  echo "Deleting previous swath output file: ${swath_output_prefix}.osw"
  rm -f "${swath_output_prefix}.osw" "${swath_output_prefix}.tsv"
  ## Arguments common to both invocations; only the output option differs.
  osw_shared_args=(
    -in "${input}"
    -force
    -sort_swath_maps
    -min_upper_edge_dist 1
    -mz_correction_function "quadratic_regression_delta_ppm"
    -Scoring:TransitionGroupPicker:background_subtraction "original"
    -Scoring:stop_report_after_feature "5"
    -swath_windows_file "windows/openswath_${name}.txt"
    -tr "${TRANSITION_PREFIX}.pqp"
  )
  OpenSwathWorkflow "${osw_shared_args[@]}" -out_tsv "${swath_output_prefix}.tsv"
  OpenSwathWorkflow "${osw_shared_args[@]}" -out_osw "${swath_output_prefix}.osw"
  ## Disabled log redirection:
  ##2>"${swath_output_prefix}_osw.log" 1>&2
done
## Merge every per-run osw file into one file, score it, export it as a tsv,
## and move that tsv next to the merged osw file.
## The directories are recovered from the prefixes left behind by the
## OpenSwathWorkflow loop.
swath_out=$(dirname "${swath_output_prefix}")
pyprophet_out="$(dirname "${pyprophet_output_prefix}")/openswath_merged.osw"
echo "Merging osw files to ${pyprophet_out}"
## Only the directory is quoted so that the *.osw glob still expands.
pyprophet merge \
  --template "${TRANSITION_PREFIX}.pqp" \
  --out="${pyprophet_out}" \
  "${swath_out}"/*.osw
pyprophet score --in="${pyprophet_out}"
pyprophet export --in="${pyprophet_out}" --out "test.tsv"
## pyprophet always exports to the current working directory.
final_name="$(dirname "${pyprophet_out}")/$(basename "${pyprophet_out}" ".osw").tsv"
echo "${final_name}"
## The original mv had no destination and therefore always failed.
mv "test.tsv" "${final_name}"
ls -ld "${pyprophet_out}"
## Align the exported tsv files with feature_alignment.py, writing the result,
## matrix and meta files into tric_tb and capturing stderr/stdout there too.
tric_tb="${TRIC_OUTDIR}_tuberculist"
mkdir -p "${tric_tb}"
## Every argument line needs a trailing backslash: without the one after
## --out_meta the two redirections ran as a separate, empty command and the
## .err/.out files never received any output.
feature_alignment.py \
  --force \
  --in "./${pypdir}/"*.tsv \
  --out "${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv" \
  --out_matrix "${tric_tb}/${DDA_METHOD}_outmatrix.tsv" \
  --out_meta "${tric_tb}/${DDA_METHOD}_meta.tsv" \
  2>"${tric_tb}/feature_alignment.err" \
  1>"${tric_tb}/feature_alignment.out"
echo "Wrote final output to ${tric_tb}/${SEARCH_METHOD}_${DDA_METHOD}.tsv"
Thanks to Vivek, I am now aware of DEP, which does everything I wish MSstats did. I think the matrix produced by TRIC's feature_alignment.py, along with my annotations and sample sheet, gives me what DEP requires.
Let us see if this is true.
## Reference files used throughout the analysis.
mtb_gff <- "reference/mycobacterium_tuberculosis_h37rv_2.gff.gz"
mtb_genome <- "reference/mtuberculosis_h37rv_genbank.fasta"
mtb_cds <- "reference/mtb_cds.fasta"
## Load the gene-level gff annotations, drop the dots from the column names,
## turn '+' into a space in the two free-text columns, and key rows by ID.
mtb_annotations <- sm(load_gff_annotations(mtb_gff, type="gene"))
colnames(mtb_annotations) <- gsub(pattern=".", replacement="",
                                  x=colnames(mtb_annotations), fixed=TRUE)
mtb_annotations[c("description", "function")] <- lapply(
  mtb_annotations[c("description", "function")],
  function(text) gsub(pattern="\\+", replacement=" ", x=text))
rownames(mtb_annotations) <- mtb_annotations[["ID"]]
## Fetch the microbesonline annotations for taxon id 83332.
mtb_microbes <- load_microbesonline_annotations(id=83332)
## The species being downloaded is: Mycobacterium tuberculosis H37Rv
## Read the tab-separated TRIC result for this version and keep only the part
## of each ProteinName before its final underscore.
tric_data <- read.delim(
  paste0("results/tric/", ver, "/whole_8mz_dia_umpire/comet_HCD.tsv"))
tric_data[["ProteinName"]] <- gsub(
  pattern="^(.*)_.*$", replacement="\\1", x=tric_data[["ProteinName"]])
## Load the sample sheet and discard rows whose names begin with "s..".
sample_annot <- extract_metadata(
  paste0("sample_sheets/Mtb_dia_samples_", ver, ".xlsx"))
kept <- !grepl(pattern="^s\\.\\.", x=rownames(sample_annot))
sample_annot <- sample_annot[kept, ]
## Use the locally forked SWATH2stats to attach the annotations to the data.
devtools::load_all("~/scratch/git/SWATH2stats_myforked")
## Loading SWATH2stats
s2s_exp <- SWATH2stats::sample_annotation(
  data=tric_data,
  check_files=FALSE,
  sample_annotation=sample_annot,
  fullpeptidename_column="fullpeptidename")
## Not checking that the files are identical between the annotation and data.
## Number of non-decoy peptides: 17081
## Number of decoy peptides: 1801
## Decoy rate: 0.1054
## This seems a bit high to me, yesno?
## Assess the overall FDR of the annotated experiment, sending the report to
## the R console and requesting a plot.
fdr_overall <- assess_fdr_overall(s2s_exp, output="Rconsole", plot=TRUE)
## The average FDR by run on assay level is 0.014
## The average FDR by run on peptide level is 0.016
## The average FDR by run on protein level is 0.001
## Target assay FDR: 0.02
## Required overall m-score cutoff: 0.0031623
## achieving assay FDR: 0.0194
## Target protein FDR: 0.02
## Required overall m-score cutoff: 0.01
## achieving protein FDR: 0.00115
## Original dimension: 221952, new dimension: 211415, difference: 10537.
## Peptides need to have been quantified in more conditions than: 48 in order to pass this percentage-based threshold.
## Fraction of peptides selected: 0.00058
## Original dimension: 224796, new dimension: 680, difference: 224116.
## Filter by m-score using prot_score as the overall protein FDR target and
## 0.05 as the upper limit for the overall peptide FDR.
## NOTE(review): filtered_ms and prot_score are not assigned anywhere in the
## visible part of this document -- confirm that an earlier chunk defines them.
filtered_ms_fdr <- filter_mscore_fdr(filtered_ms, FFT=0.7,
overall_protein_fdr_target=prot_score,
upper_overall_peptide_fdr_limit=0.05)
## Target protein FDR: 0.01
## Required overall m-score cutoff: 0.01
## achieving protein FDR: 0
## filter_mscore_fdr is filtering the data...
## finding m-score cutoff to achieve desired protein FDR in protein master list..
## finding m-score cutoff to achieve desired global peptide FDR..
## Target peptide FDR: 0.05
## Required overall m-score cutoff: 0.01
## Achieving peptide FDR: 0
## Proteins selected:
## Total proteins selected: 2412
## Final target proteins: 2412
## Final decoy proteins: 0
## Peptides mapping to these protein entries selected:
## Total mapping peptides: 16868
## Final target peptides: 16868
## Final decoy peptides: 0
## Total peptides selected from:
## Total peptides: 16868
## Final target peptides: 16868
## Final decoy peptides: 0
## Individual run FDR quality of the peptides was not calculated
## as not every run contains a decoy.
## The decoys have been removed from the returned data.
## Number of proteins detected: 2363
## Protein identifiers: Rv0577, Rv0242c, Rv3012c, Rv2467, Rv3715c, Rv2220
## Number of proteins detected that are supported by a proteotypic peptide: 2337
## Number of proteotypic peptides detected: 16728
## Number of proteins detected: 2337
## First 6 protein identifiers: Rv0577, Rv0242c, Rv3012c, Rv2467, Rv3715c, Rv2220
## Before filtering:
## Number of proteins: 2337
## Number of peptides: 16728
##
## Percentage of peptides removed: 25.94%
##
## After filtering:
## Number of proteins: 2331
## Number of peptides: 12388
## Before filtering:
## Number of proteins: 2331
## Number of peptides: 12388
##
## Percentage of peptides removed: 0%
##
## After filtering:
## Number of proteins: 2284
## Number of peptides: 12388
## Directory which receives the protein/peptide matrices for this version.
## recursive=TRUE is required because the path is three levels deep and
## dir.create() does not create missing parent directories by default.
matrix_prefix <- file.path("results", "swath2stats", ver)
if (!file.exists(matrix_prefix)) {
  dir.create(matrix_prefix, recursive=TRUE)
}
## Write csv matrices beneath matrix_prefix for the unfiltered data, the
## m-score filtered data, and the fully filtered data.
protein_matrix_all <- write_matrix_proteins(s2s_exp, write.csv=TRUE,
  filename=file.path(matrix_prefix, "protein_all.csv"))
## Protein overview matrix results/swath2stats/20190327/protein_all.csv written to working folder.
## [1] 2434 45
protein_matrix_mscore <- write_matrix_proteins(filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "protein_matrix_mscore.csv"))
## Protein overview matrix results/swath2stats/20190327/protein_matrix_mscore.csv written to working folder.
## [1] 2412 45
peptide_matrix_mscore <- write_matrix_peptides(filtered_ms, write.csv=TRUE,
  filename=file.path(matrix_prefix, "peptide_matrix_mscore.csv"))
## Peptide overview matrix results/swath2stats/20190327/peptide_matrix_mscore.csv written to working folder.
## [1] 16868 45
protein_matrix_filtered <- write_matrix_proteins(filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "protein_matrix_filtered.csv"))
## Protein overview matrix results/swath2stats/20190327/protein_matrix_filtered.csv written to working folder.
## [1] 2284 45
peptide_matrix_filtered <- write_matrix_peptides(filtered_all_filters, write.csv=TRUE,
  filename=file.path(matrix_prefix, "peptide_matrix_filtered.csv"))
## Peptide overview matrix results/swath2stats/20190327/peptide_matrix_filtered.csv written to working folder.
## [1] 144819 45
##
## Attaching package: 'DEP'
## The following objects are masked from 'package:hpgltools':
##
## plot_heatmap, plot_pca
## Build the DEP input from the filtered protein matrix.
## Every column name containing "2018" is reduced to "s" followed by the text
## from that match onward, and the first column is named "Protein".
intensities <- protein_matrix_filtered
cols <- sub(pattern="^.*(2018.*$)", replacement="s\\1", x=colnames(intensities))
cols[1] <- "Protein"
colnames(intensities) <- cols
unique_intensities <- make_unique(intensities, "Protein", "Protein", delim=";")
intensity_columns <- grep(pattern="2018", x=cols)
## The design given to make_se() holds three character columns taken from the
## sample sheet: label (its row names), condition, and a unique replicate name
## made from condition and batch.
sample_annot[["label"]] <- as.character(rownames(sample_annot))
sample_annot[["condition"]] <- as.character(sample_annot[["condition"]])
sample_annot[["replicate"]] <- make.names(
  paste(sample_annot[["condition"]], sample_annot[["batch"]], sep="_"),
  unique=TRUE)
design <- sample_annot[, c("label", "condition", "replicate")]
mtb_se <- DEP::make_se(unique_intensities, intensity_columns, design)
DEP::plot_frequency(mtb_se)
## vsn2: 1628 x 43 matrix (1 stratum).
## Please use 'meanSdPlot' to verify the fit.
## Loading required package: imputeLCMD
## Loading required package: tmvtnorm
## Loading required package: mvtnorm
## Loading required package: Matrix
## Loading required package: stats4
## Loading required package: gmm
## Loading required package: sandwich
## Loading required package: norm
## Loading required package: pcaMethods
##
## Attaching package: 'pcaMethods'
## The following object is masked from 'package:stats':
##
## loadings
## Loading required package: impute
## [1] 1.557
## Tested contrasts: wt_filtrate_vs_wt_whole, wt_filtrate_vs_delta_filtrate, wt_filtrate_vs_comp_filtrate, wt_filtrate_vs_delta_whole, wt_filtrate_vs_comp_whole, wt_whole_vs_delta_filtrate, wt_whole_vs_comp_filtrate, wt_whole_vs_delta_whole, wt_whole_vs_comp_whole, delta_filtrate_vs_comp_filtrate, delta_filtrate_vs_delta_whole, delta_filtrate_vs_comp_whole, comp_filtrate_vs_delta_whole, comp_filtrate_vs_comp_whole, delta_whole_vs_comp_whole
## Test a hand-picked set of contrasts on the imputed data, flag the
## significant proteins (alpha 0.05, lfc 0.6), and plot the correlations.
mtb_diff <- DEP::test_diff(
  mtb_imp,
  type="manual",
  test=c(
    "wt_filtrate_vs_wt_whole",
    "delta_filtrate_vs_wt_filtrate",
    "comp_filtrate_vs_wt_filtrate",
    "wt_filtrate_vs_delta_filtrate",
    "wt_filtrate_vs_comp_filtrate",
    "wt_whole_vs_delta_whole",
    "wt_whole_vs_comp_whole"))
## Tested contrasts: wt_filtrate_vs_wt_whole, delta_filtrate_vs_wt_filtrate, comp_filtrate_vs_wt_filtrate, wt_filtrate_vs_delta_filtrate, wt_filtrate_vs_comp_filtrate, wt_whole_vs_delta_whole, wt_whole_vs_comp_whole
mtb_dep <- DEP::add_rejections(mtb_diff,
                               alpha=0.05,
                               lfc=0.6)
## mtb_pca <- DEP::plot_pca(mtb_dep)
## The PCA plotter provided by DEP has some problems.
DEP::plot_cor(mtb_dep)
## Unless skip_load is set to TRUE, report the hpgltools commit, save the
## session to a file named after the Rmd file and version, and print the
## session information.
if (!isTRUE(get0("skip_load"))) {
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  this_save <- paste0(
    gsub(pattern="\\.Rmd", replacement="", x=rmd_file),
    "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename=this_save))
  pander::pander(sessionInfo())
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 0abc58e173be7300595d30d407b7efd4e4a512d6
## This is hpgltools commit: Thu May 9 14:56:34 2019 -0400: 0abc58e173be7300595d30d407b7efd4e4a512d6
## Saving to dia_umpire_20190308-v20190327.rda.xz
R version 3.5.3 (2019-03-11)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: imputeLCMD(v.2.0), impute(v.1.56.0), pcaMethods(v.1.74.0), norm(v.1.0-9.5), tmvtnorm(v.1.4-10), gmm(v.1.6-2), sandwich(v.2.5-1), Matrix(v.1.2-17), mvtnorm(v.1.0-10), DEP(v.1.4.1), SWATH2stats(v.1.13.5), testthat(v.2.1.1), hpgltools(v.1.0), Biobase(v.2.42.0) and BiocGenerics(v.0.28.0)
loaded via a namespace (and not attached): shinydashboard(v.0.7.1), tidyselect(v.0.2.5), lme4(v.1.1-21), htmlwidgets(v.1.3), RSQLite(v.2.1.1), AnnotationDbi(v.1.44.0), grid(v.3.5.3), BiocParallel(v.1.16.6), devtools(v.2.0.2), munsell(v.0.5.0), codetools(v.0.2-16), preprocessCore(v.1.44.0), DT(v.0.6), withr(v.2.1.2), colorspace(v.1.4-1), GOSemSim(v.2.8.0), knitr(v.1.23), rstudioapi(v.0.10), DOSE(v.3.8.2), mzID(v.1.20.1), labeling(v.0.3), urltools(v.1.7.3), GenomeInfoDbData(v.1.2.0), polyclip(v.1.10-0), bit64(v.0.9-7), farver(v.1.1.0), rprojroot(v.1.3-2), xfun(v.0.7), R6(v.2.4.0), doParallel(v.1.0.14), GenomeInfoDb(v.1.18.2), bitops(v.1.0-6), fgsea(v.1.8.0), gridGraphics(v.0.4-1), DelayedArray(v.0.8.0), assertthat(v.0.2.1), promises(v.1.0.1), scales(v.1.0.0), ggraph(v.1.0.2), enrichplot(v.1.2.0), gtable(v.0.3.0), affy(v.1.60.0), sva(v.3.30.1), processx(v.3.3.1), rlang(v.0.3.4), genefilter(v.1.64.0), mzR(v.2.16.2), GlobalOptions(v.0.1.0), splines(v.3.5.3), rtracklayer(v.1.42.2), lazyeval(v.0.2.2), selectr(v.0.4-1), europepmc(v.0.3), BiocManager(v.1.30.4), yaml(v.2.2.0), reshape2(v.1.4.3), GenomicFeatures(v.1.34.8), backports(v.1.1.4), httpuv(v.1.5.1), qvalue(v.2.14.1), clusterProfiler(v.3.10.1), tools(v.3.5.3), usethis(v.1.5.0), ggplotify(v.0.0.3), ggplot2(v.3.1.1), affyio(v.1.52.0), gplots(v.3.0.1.1), RColorBrewer(v.1.1-2), sessioninfo(v.1.1.1), MSnbase(v.2.8.3), ggridges(v.0.5.1), Rcpp(v.1.0.1), plyr(v.1.8.4), base64enc(v.0.1-3), progress(v.1.2.2), zlibbioc(v.1.28.0), purrr(v.0.3.2), RCurl(v.1.95-4.12), ps(v.1.3.0), prettyunits(v.1.0.2), GetoptLong(v.0.1.7), viridis(v.0.5.1), cowplot(v.0.9.4), zoo(v.1.8-5), S4Vectors(v.0.20.1), cluster(v.2.0.9), SummarizedExperiment(v.1.12.0), ggrepel(v.0.8.1), colorRamps(v.2.3), fs(v.1.3.1), variancePartition(v.1.12.3), magrittr(v.1.5), data.table(v.1.12.2), DO.db(v.2.9), openxlsx(v.4.1.0), circlize(v.0.4.6), triebeard(v.0.3.0), packrat(v.0.5.0), ProtGenerics(v.1.14.0), matrixStats(v.0.54.0), pkgload(v.1.0.2), mime(v.0.6), 
hms(v.0.4.2), evaluate(v.0.13), xtable(v.1.8-4), pbkrtest(v.0.4-7), XML(v.3.98-1.19), IRanges(v.2.16.0), gridExtra(v.2.3), shape(v.1.4.4), compiler(v.3.5.3), biomaRt(v.2.38.0), tibble(v.2.1.1), KernSmooth(v.2.23-15), ncdf4(v.1.16.1), crayon(v.1.3.4), minqa(v.1.2.4), htmltools(v.0.3.6), later(v.0.8.0), mgcv(v.1.8-28), tidyr(v.0.8.3), DBI(v.1.0.0), tweenr(v.1.0.1), ComplexHeatmap(v.1.20.0), MASS(v.7.3-51.4), boot(v.1.3-22), readr(v.1.3.1), cli(v.1.1.0), vsn(v.3.50.0), gdata(v.2.18.0), igraph(v.1.2.4.1), GenomicRanges(v.1.34.0), pkgconfig(v.2.0.2), rvcheck(v.0.1.3), GenomicAlignments(v.1.18.1), MALDIquant(v.1.19.3), xml2(v.1.2.0), foreach(v.1.4.4), annotate(v.1.60.1), XVector(v.0.22.0), rvest(v.0.3.4), stringr(v.1.4.0), callr(v.3.2.0), digest(v.0.6.19), Biostrings(v.2.50.2), rmarkdown(v.1.12), fastmatch(v.1.1-0), curl(v.3.3), shiny(v.1.3.2), Rsamtools(v.1.34.1), gtools(v.3.8.1), rjson(v.0.2.20), nloptr(v.1.2.1), nlme(v.3.1-140), jsonlite(v.1.6), desc(v.1.2.0), viridisLite(v.0.3.0), limma(v.3.38.3), pillar(v.1.4.0), lattice(v.0.20-38), httr(v.1.4.0), pkgbuild(v.1.0.3), survival(v.2.44-1.1), GO.db(v.3.7.0), glue(v.1.3.1), remotes(v.2.0.4), fdrtool(v.1.2.15), zip(v.2.0.2), UpSetR(v.1.3.3), iterators(v.1.0.10), pander(v.0.6.3), bit(v.1.1-14), ggforce(v.0.2.2), stringi(v.1.4.3), blob(v.1.1.1), caTools(v.1.17.1.2), memoise(v.1.1.0) and dplyr(v.0.8.1)