1 Attempt some quantification/queries

Let's see if we can get anything out of the data.

1.1 An expressionSet?

Can I extract the intensities and do something with them?

## Read the two output matrices written by tric (feature_alignment.py).
cid_intensities <- read.csv(file="results/tric/CID_outmatrix.tsv", sep="\t")
hcd_intensities <- read.csv(file="results/tric/HCD_outmatrix.tsv", sep="\t")

## Copy the protein names into a "rownames" column, dropping any leading
## "<digits>/" prefix on the way.
cid_intensities[["rownames"]] <- sub(pattern="^[[:digit:]]+\\/",
                                     replacement="",
                                     x=cid_intensities[["Protein"]])
hcd_intensities[["rownames"]] <- sub(pattern="^[[:digit:]]+\\/",
                                     replacement="",
                                     x=hcd_intensities[["Protein"]])

## Use syntactically valid, de-duplicated versions of those names as the
## row names of each data frame.
row.names(cid_intensities) <- make.names(cid_intensities[["rownames"]], unique=TRUE)
row.names(hcd_intensities) <- make.names(hcd_intensities[["rownames"]], unique=TRUE)

## Simplify the column names because they are way too long: everything from
## the last "_vs" onward is dropped, then a "cid_"/"hcd_" prefix is added.
shorter_colnames <- colnames(cid_intensities)
shorter_colnames <- gsub(pattern="^(.*)_vs.*$", replacement="\\1", x=shorter_colnames)
colnames(cid_intensities) <- paste0("cid_", shorter_colnames)
## Shorten the HCD names from the HCD table's own columns, so that a
## difference in column order or count between the two files cannot
## mislabel the HCD columns.
hcd_shorter_colnames <- gsub(pattern="^(.*)_vs.*$", replacement="\\1",
                             x=colnames(hcd_intensities))
colnames(hcd_intensities) <- paste0("hcd_", hcd_shorter_colnames)

## CID: convert to a data.table, make the name column syntactically valid
## and unique, then replace every NA with 0.
cid <- data.table::as.data.table(cid_intensities)
cid[["cid_rownames"]] <- make.names(cid[["cid_rownames"]], unique=TRUE)
nas <- is.na(cid)
cid[nas] <- 0

## HCD: the same three steps.
hcd <- data.table::as.data.table(hcd_intensities)
hcd[["hcd_rownames"]] <- make.names(hcd[["hcd_rownames"]], unique=TRUE)
nas <- is.na(hcd)
hcd[nas] <- 0

## Add a numeric decoy column: 1 when the protein name starts with "DECOY_",
## 0 otherwise.
decoys <- grepl(pattern="^DECOY_", x=cid[["cid_Protein"]])
cid[["decoy"]] <- as.numeric(decoys)
decoys <- grepl(pattern="^DECOY_", x=hcd[["hcd_Protein"]])
hcd[["decoy"]] <- as.numeric(decoys)

## Make some medians for the columns of interest.  Each median is taken
## row-wise over every column whose name matches the given pattern.
intensity_cols <- grep(pattern="Intensity", x=colnames(cid))
intense <- cid[, intensity_cols, with=FALSE]
cid[["median_intense"]] <- matrixStats::rowMedians(as.matrix(intense))
intensity_cols <- grep(pattern="Intensity", x=colnames(hcd))
intense <- hcd[, intensity_cols, with=FALSE]
hcd[["median_intense"]] <- matrixStats::rowMedians(as.matrix(intense))
## Repeat for RT
rt_cols <- grep(pattern="_RT_", x=colnames(cid))
rt <- cid[, rt_cols, with=FALSE]
cid[["median_rt"]] <- matrixStats::rowMedians(as.matrix(rt))
rt_cols <- grep(pattern="_RT_", x=colnames(hcd))
rt <- hcd[, rt_cols, with=FALSE]
hcd[["median_rt"]] <- matrixStats::rowMedians(as.matrix(rt))
## And score
score_cols <- grep(pattern="_score_", x=colnames(cid))
score <- cid[, score_cols, with=FALSE]
cid[["median_score"]] <- matrixStats::rowMedians(as.matrix(score))
## The HCD score must come from the "_score_" columns as well; matching
## "_RT_" here would only reproduce median_rt.
score_cols <- grep(pattern="_score_", x=colnames(hcd))
score <- hcd[, score_cols, with=FALSE]
hcd[["median_score"]] <- matrixStats::rowMedians(as.matrix(score))

## Now plot them.
library(ggplot2)

## Collect the per-row summaries and melt them; this is done before decoy is
## converted to a factor below.
cid_scores <- cid[, c("median_score", "median_rt", "median_intense", "decoy")]
melted <- reshape2::melt(cid_scores)
## No id variables; using all as measure variables

## decoy becomes a factor so that it can drive the manual fill colors.
cid[["decoy"]] <- as.factor(cid[["decoy"]])
hcd[["decoy"]] <- as.factor(hcd[["decoy"]])
## Fill colors keyed by decoy level.
decoy_fill <- c("0"="darkblue", "1"="darkred")

## Density (as counts) of the median score on a log2 x-axis, filled by decoy.
cid_score_dist <- ggplot(data=cid,
                         aes_string(x="median_score", fill="decoy")) +
  geom_density(aes_string(x="median_score", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  scale_fill_manual(values=decoy_fill)
cid_score_dist
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 2734 rows containing non-finite values (stat_density).

hcd_score_dist <- ggplot(data=hcd,
                         aes_string(x="median_score", fill="decoy")) +
  geom_density(aes_string(x="median_score", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  scale_fill_manual(values=decoy_fill)
hcd_score_dist
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1863 rows containing non-finite values (stat_density).

library(dplyr)
## Fill colors keyed by decoy level.
decoy_fill <- c("0"="darkblue", "1"="darkred")

## CID: split the rows into decoys and non-decoys; the mean of each group's
## median intensity is drawn as a dashed vertical line over the densities.
decoy_idx <- cid[, "decoy"] == 1
decoys <- as.data.frame(cid)[decoy_idx, ]
nodecoys <- as.data.frame(cid)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_intense"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_intense"]], na.rm=TRUE)
cid_intense_dist <- ggplot(data=cid,
                           aes_string(x="median_intense", fill="decoy")) +
  geom_density(aes_string(x="median_intense", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_fill_manual(values=decoy_fill)
cid_intense_dist
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 2734 rows containing non-finite values (stat_density).

## HCD: the same split, means and plot.
decoy_idx <- hcd[, "decoy"] == 1
decoys <- as.data.frame(hcd)[decoy_idx, ]
nodecoys <- as.data.frame(hcd)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_intense"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_intense"]], na.rm=TRUE)
hcd_intense_dist <- ggplot(data=hcd,
                           aes_string(x="median_intense", fill="decoy")) +
  geom_density(aes_string(x="median_intense", y="..count..", fill="decoy")) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_x_continuous(trans=scales::log2_trans()) +
  scale_fill_manual(values=decoy_fill)
hcd_intense_dist
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 3147 rows containing non-finite values (stat_density).

## Fill colors keyed by decoy level.
decoy_fill <- c("0"="darkblue", "1"="darkred")

## CID: split the rows into decoys and non-decoys; the mean of each group's
## median RT is drawn as a dashed vertical line over the densities.
decoy_idx <- cid[, "decoy"] == 1
decoys <- as.data.frame(cid)[decoy_idx, ]
nodecoys <- as.data.frame(cid)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_rt"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_rt"]], na.rm=TRUE)
cid_rt_dist <- ggplot(data=cid,
                      aes_string(x="median_rt", fill="decoy")) +
  geom_density(aes_string(x="median_rt", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_fill_manual(values=decoy_fill)
cid_rt_dist
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1448 rows containing non-finite values (stat_density).

## HCD: the same split, means and plot.
decoy_idx <- hcd[, "decoy"] == 1
decoys <- as.data.frame(hcd)[decoy_idx, ]
nodecoys <- as.data.frame(hcd)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_rt"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_rt"]], na.rm=TRUE)
hcd_rt_dist <- ggplot(data=hcd,
                      aes_string(x="median_rt", fill="decoy")) +
  geom_density(aes_string(x="median_rt", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_fill_manual(values=decoy_fill)
hcd_rt_dist
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1863 rows containing non-finite values (stat_density).

index.html

2 Index version: 20180215

3 TODO

  • 2017-06-14:

index.html

---
title: "Preprocessing DIA data"
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: tango
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: cosmo
  toc: true
  toc_float:
    collapsed: false
    smooth_scroll: false
---

<style>
  body .main-container {
    max-width: 1600px;
}
</style>

```{r options, include=FALSE}
## Attach hpgltools, then load the working copy from ~/hpgltools on top of it.
library("hpgltools")
tt <- devtools::load_all("~/hpgltools")
## Document-wide knitr settings.
knitr::opts_knit$set(progress=TRUE,
                     verbose=TRUE,
                     width=90,
                     echo=TRUE)
## Defaults applied to every chunk.
knitr::opts_chunk$set(error=TRUE,
                      fig.width=8,
                      fig.height=8,
                      dpi=96)
## options() returns the previous values; they are kept in old_options.
old_options <- options(
  digits=4,
  stringsAsFactors=FALSE,
  knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
##set.seed(1)
ver <- "20180215"
previous_file <- "index.Rmd"

## Build "<previous_file without .Rmd>-v<ver>.rda.xz" and hand it to the
## hpgltools helpers sm()/loadme() -- presumably this restores the saved state
## of the previous document (TODO confirm).  try() keeps a failure here from
## aborting the chunk.  The gsub() argument is spelled out as "replacement"
## instead of relying on partial argument matching.
tmp <- try(sm(loadme(filename=paste0(
  gsub(pattern="\\.Rmd", replacement="", x=previous_file),
  "-v", ver, ".rda.xz"))))
```

# Attempt some quantification/queries

Let's see if we can get anything out of the data.

## An expressionSet?

Can I extract the intensities and do something with them?

```{r expt_swath}
## Read the two output matrices written by tric (feature_alignment.py).
cid_intensities <- read.csv(file="results/tric/CID_outmatrix.tsv", sep="\t")
hcd_intensities <- read.csv(file="results/tric/HCD_outmatrix.tsv", sep="\t")

## Copy the protein names into a "rownames" column, dropping any leading
## "<digits>/" prefix on the way.
cid_intensities[["rownames"]] <- sub(pattern="^[[:digit:]]+\\/",
                                     replacement="",
                                     x=cid_intensities[["Protein"]])
hcd_intensities[["rownames"]] <- sub(pattern="^[[:digit:]]+\\/",
                                     replacement="",
                                     x=hcd_intensities[["Protein"]])

## Use syntactically valid, de-duplicated versions of those names as the
## row names of each data frame.
row.names(cid_intensities) <- make.names(cid_intensities[["rownames"]], unique=TRUE)
row.names(hcd_intensities) <- make.names(hcd_intensities[["rownames"]], unique=TRUE)

## Simplify the column names because they are way too long: everything from
## the last "_vs" onward is dropped, then a "cid_"/"hcd_" prefix is added.
shorter_colnames <- colnames(cid_intensities)
shorter_colnames <- gsub(pattern="^(.*)_vs.*$", replacement="\\1", x=shorter_colnames)
colnames(cid_intensities) <- paste0("cid_", shorter_colnames)
## Shorten the HCD names from the HCD table's own columns, so that a
## difference in column order or count between the two files cannot
## mislabel the HCD columns.
hcd_shorter_colnames <- gsub(pattern="^(.*)_vs.*$", replacement="\\1",
                             x=colnames(hcd_intensities))
colnames(hcd_intensities) <- paste0("hcd_", hcd_shorter_colnames)

## CID: convert to a data.table, make the name column syntactically valid
## and unique, then replace every NA with 0.
cid <- data.table::as.data.table(cid_intensities)
cid[["cid_rownames"]] <- make.names(cid[["cid_rownames"]], unique=TRUE)
nas <- is.na(cid)
cid[nas] <- 0

## HCD: the same three steps.
hcd <- data.table::as.data.table(hcd_intensities)
hcd[["hcd_rownames"]] <- make.names(hcd[["hcd_rownames"]], unique=TRUE)
nas <- is.na(hcd)
hcd[nas] <- 0

## Add a numeric decoy column: 1 when the protein name starts with "DECOY_",
## 0 otherwise.
decoys <- grepl(pattern="^DECOY_", x=cid[["cid_Protein"]])
cid[["decoy"]] <- as.numeric(decoys)
decoys <- grepl(pattern="^DECOY_", x=hcd[["hcd_Protein"]])
hcd[["decoy"]] <- as.numeric(decoys)

## Make some medians for the columns of interest.  Each median is taken
## row-wise over every column whose name matches the given pattern.
intensity_cols <- grep(pattern="Intensity", x=colnames(cid))
intense <- cid[, intensity_cols, with=FALSE]
cid[["median_intense"]] <- matrixStats::rowMedians(as.matrix(intense))
intensity_cols <- grep(pattern="Intensity", x=colnames(hcd))
intense <- hcd[, intensity_cols, with=FALSE]
hcd[["median_intense"]] <- matrixStats::rowMedians(as.matrix(intense))
## Repeat for RT
rt_cols <- grep(pattern="_RT_", x=colnames(cid))
rt <- cid[, rt_cols, with=FALSE]
cid[["median_rt"]] <- matrixStats::rowMedians(as.matrix(rt))
rt_cols <- grep(pattern="_RT_", x=colnames(hcd))
rt <- hcd[, rt_cols, with=FALSE]
hcd[["median_rt"]] <- matrixStats::rowMedians(as.matrix(rt))
## And score
score_cols <- grep(pattern="_score_", x=colnames(cid))
score <- cid[, score_cols, with=FALSE]
cid[["median_score"]] <- matrixStats::rowMedians(as.matrix(score))
## The HCD score must come from the "_score_" columns as well; matching
## "_RT_" here would only reproduce median_rt.
score_cols <- grep(pattern="_score_", x=colnames(hcd))
score <- hcd[, score_cols, with=FALSE]
hcd[["median_score"]] <- matrixStats::rowMedians(as.matrix(score))

## Now plot them.
library(ggplot2)

## Collect the per-row summaries and melt them; this is done before decoy is
## converted to a factor below.
cid_scores <- cid[, c("median_score", "median_rt", "median_intense", "decoy")]
melted <- reshape2::melt(cid_scores)

## decoy becomes a factor so that it can drive the manual fill colors.
cid[["decoy"]] <- as.factor(cid[["decoy"]])
hcd[["decoy"]] <- as.factor(hcd[["decoy"]])
## Fill colors keyed by decoy level.
decoy_fill <- c("0"="darkblue", "1"="darkred")

## Density (as counts) of the median score on a log2 x-axis, filled by decoy.
cid_score_dist <- ggplot(data=cid,
                         aes_string(x="median_score", fill="decoy")) +
  geom_density(aes_string(x="median_score", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  scale_fill_manual(values=decoy_fill)
cid_score_dist

hcd_score_dist <- ggplot(data=hcd,
                         aes_string(x="median_score", fill="decoy")) +
  geom_density(aes_string(x="median_score", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  scale_fill_manual(values=decoy_fill)
hcd_score_dist

library(dplyr)
## Fill colors keyed by decoy level.
decoy_fill <- c("0"="darkblue", "1"="darkred")

## CID: split the rows into decoys and non-decoys; the mean of each group's
## median intensity is drawn as a dashed vertical line over the densities.
decoy_idx <- cid[, "decoy"] == 1
decoys <- as.data.frame(cid)[decoy_idx, ]
nodecoys <- as.data.frame(cid)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_intense"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_intense"]], na.rm=TRUE)
cid_intense_dist <- ggplot(data=cid,
                           aes_string(x="median_intense", fill="decoy")) +
  geom_density(aes_string(x="median_intense", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_fill_manual(values=decoy_fill)
cid_intense_dist

## HCD: the same split, means and plot.
decoy_idx <- hcd[, "decoy"] == 1
decoys <- as.data.frame(hcd)[decoy_idx, ]
nodecoys <- as.data.frame(hcd)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_intense"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_intense"]], na.rm=TRUE)
hcd_intense_dist <- ggplot(data=hcd,
                           aes_string(x="median_intense", fill="decoy")) +
  geom_density(aes_string(x="median_intense", y="..count..", fill="decoy")) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_x_continuous(trans=scales::log2_trans()) +
  scale_fill_manual(values=decoy_fill)
hcd_intense_dist

## Fill colors keyed by decoy level.
decoy_fill <- c("0"="darkblue", "1"="darkred")

## CID: split the rows into decoys and non-decoys; the mean of each group's
## median RT is drawn as a dashed vertical line over the densities.
decoy_idx <- cid[, "decoy"] == 1
decoys <- as.data.frame(cid)[decoy_idx, ]
nodecoys <- as.data.frame(cid)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_rt"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_rt"]], na.rm=TRUE)
cid_rt_dist <- ggplot(data=cid,
                      aes_string(x="median_rt", fill="decoy")) +
  geom_density(aes_string(x="median_rt", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_fill_manual(values=decoy_fill)
cid_rt_dist

## HCD: the same split, means and plot.
decoy_idx <- hcd[, "decoy"] == 1
decoys <- as.data.frame(hcd)[decoy_idx, ]
nodecoys <- as.data.frame(hcd)[!decoy_idx, ]
decoy_xint <- mean(decoys[["median_rt"]], na.rm=TRUE)
nodecoy_xint <- mean(nodecoys[["median_rt"]], na.rm=TRUE)
hcd_rt_dist <- ggplot(data=hcd,
                      aes_string(x="median_rt", fill="decoy")) +
  geom_density(aes_string(x="median_rt", y="..count..", fill="decoy")) +
  scale_x_continuous(trans=scales::log2_trans()) +
  geom_vline(xintercept=decoy_xint, color="darkred",
             linetype="dashed", size=1) +
  geom_vline(xintercept=nodecoy_xint, color="darkblue",
             linetype="dashed", size=1) +
  scale_fill_manual(values=decoy_fill)
hcd_rt_dist
```

[index.html](index.html)

# Index version: `r ver`

# TODO

* 2017-06-14:

[index.html](index.html)
