As per Yan’s suggestion, I am using the existing results, found in the file: “2017_0811BrikenMultiConsensus.xlsx”
Unfortunately, this table is in a particularly strange format: it is a triply nested table of tables of tables in one worksheet in Excel. At one level, this is a really novel and interesting use of Excel, but on every other level it is sub-optimal. The primary reason it is sub-optimal is that the column headings end up being repeated once for every new sub-table (874 of the inner tables plus however many middle tables), which makes it nearly unparseable.
I am displeased I need to do this level of data wrangling.
The following function is my first attempt at handling this parsing puzzle.
For the interested, the structure is:
parse_difficult_excel <- function(xlsx_file, test_row=92150) {
old_options <- options(java.parameters="-Xmx20G")
message(paste0("Reading ", xlsx_file))
result <- readxl::read_xlsx(path=xlsx_file, sheet=1, col_names=FALSE)
message("Starting parse.")
group_data <- list()
bar <- utils::txtProgressBar(style=3)
for (r in 1:nrow(result)) {
row <- as.data.frame(result[r, ])
row[, is.na(row)] <- ""
pct_done <- r / nrow(result)
setTxtProgressBar(bar, pct_done)
## The following 3 stanzas handle the creation of the levels of our data structure
## The first defines the protein group
if (row[, 1] == "Checked") {
group_colnames <- as.character(row)
group_keepers <- !grepl(pattern="^$", x=group_colnames)
group_keepers[1] <- FALSE
group_colnames <- group_colnames[group_keepers]
next
}
## When the 2nd column is 'Checked', then this defines a new protein in the group.
if (row[, 2] == "Checked") {
protein_colnames <- as.character(row)
protein_keepers <- !grepl(pattern="^$", x=protein_colnames)
protein_keepers[2] <- FALSE
protein_colnames <- protein_colnames[protein_keepers]
next
}
## When the 3rd column is 'Checked', then this starts a peptide definition
if (row[, 3] == "Checked") {
peptide_colnames <- as.character(row)
peptide_keepers <- !grepl(pattern="^$", x=peptide_colnames)
peptide_keepers[3] <- FALSE
peptide_colnames <- peptide_colnames[peptide_keepers]
next
}
## Once the column names for the data are defined, we consider how to
## Fill in the actual data, the protein group is probably the least interesting.
if (row[, 1] == FALSE | row[, 1] == TRUE) {
group_information <- row[group_keepers]
colnames(group_information) <- group_colnames
group_information[["ID"]] <- sub(pattern="^.* GN=(\\w+) .*$",
replacement="\\1",
x=group_information[["Group Description"]])
group_accession <- group_information[["Protein Group ID"]]
group_list <- list(
"summary" = group_information,
"data" = list())
group_data[[group_accession]] <- group_list
next
}
## When the 2nd column is FALSE, then this defined a protein in the group.
## The protein data structure is likely the most interesting.
if (row[, 2] == FALSE | row[, 2] == TRUE) {
protein_information <- row[protein_keepers]
colnames(protein_information) <- protein_colnames
protein_information[["ID"]] <- sub(pattern="^.* GN=(\\w+) .*$",
replacement="\\1",
x=protein_information[["Description"]])
protein_accession <- protein_information[["Accession"]]
protein_list <- list(
"summary" = protein_information,
"data" = data.frame())
group_data[[group_accession]][["data"]][[protein_accession]] <- protein_list
next
}
## When the 3rd group is FALSE, then this adds a peptide.
## The peptide data structure is the most detailed, but probably not the most interesting.
if (row[, 3] == FALSE | row[, 3] == TRUE) {
peptide_information <- row[peptide_keepers]
colnames(peptide_information) <- peptide_colnames
current <- group_data[[group_accession]][["data"]][[protein_accession]][["data"]]
new <- rbind(current, peptide_information)
group_data[[group_accession]][["data"]][[protein_accession]][["data"]] <- new
next
}
} ## End iterating over ever row of this unholy stupid data structure.
close(bar)
message("Finished parsing, reorganizing the protein data.")
protein_df <- data.frame()
peptide_df <- data.frame()
protein_names <- c()
message(paste0("Starting to iterate over ", length(group_data), " groups."))
bar <- utils::txtProgressBar(style=3)
for (g in 1:length(group_data)) {
pct_done <- g / length(group_data)
setTxtProgressBar(bar, pct_done)
group <- as.character(names(group_data)[g])
protein_group <- group_data[[group]][["data"]]
protein_accessions <- names(protein_group)
for (p in 1:length(protein_accessions)) {
protein <- protein_accessions[p]
protein_names <- c(protein_names, protein)
protein_summary <- group_data[[group]][["data"]][[protein]][["summary"]]
protein_df <- rbind(protein_df, protein_summary)
peptide_data <- group_data[[group]][["data"]][[protein]][["data"]]
## peptide_df <- rbind(peptide_df, peptide_data)
}
} ## End of the for loop
close(bar)
retlist <- list(
"group_data" = group_data,
"protein_data" = protein_df,
"peptide_data" = peptide_df)
return(retlist)
}
##xlsx_file <- "previous_results/2017_0811BrikenMultiConsensus.xlsx"
xlsx_file <- "previous_results/2017_1101BrikenMulti_PE_PPE_Complete.xlsx"
less_difficult <- parse_difficult_excel(xlsx_file)
## Reading previous_results/2017_1101BrikenMulti_PE_PPE_Complete.xlsx
## Starting parse.
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 3%
|
|=== | 4%
|
|==== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 7%
|
|====== | 8%
|
|======= | 8%
|
|======= | 9%
|
|======== | 9%
|
|======== | 10%
|
|======== | 11%
|
|========= | 11%
|
|========= | 12%
|
|========== | 12%
|
|========== | 13%
|
|=========== | 13%
|
|=========== | 14%
|
|============ | 14%
|
|============ | 15%
|
|============ | 16%
|
|============= | 16%
|
|============= | 17%
|
|============== | 17%
|
|============== | 18%
|
|=============== | 18%
|
|=============== | 19%
|
|================ | 19%
|
|================ | 20%
|
|================ | 21%
|
|================= | 21%
|
|================= | 22%
|
|================== | 22%
|
|================== | 23%
|
|=================== | 23%
|
|=================== | 24%
|
|==================== | 24%
|
|==================== | 25%
|
|==================== | 26%
|
|===================== | 26%
|
|===================== | 27%
|
|====================== | 27%
|
|====================== | 28%
|
|======================= | 28%
|
|======================= | 29%
|
|======================== | 29%
|
|======================== | 30%
|
|======================== | 31%
|
|========================= | 31%
|
|========================= | 32%
|
|========================== | 32%
|
|========================== | 33%
|
|=========================== | 33%
|
|=========================== | 34%
|
|============================ | 34%
|
|============================ | 35%
|
|============================ | 36%
|
|============================= | 36%
|
|============================= | 37%
|
|============================== | 37%
|
|============================== | 38%
|
|=============================== | 38%
|
|=============================== | 39%
|
|================================ | 39%
|
|================================ | 40%
|
|================================ | 41%
|
|================================= | 41%
|
|================================= | 42%
|
|================================== | 42%
|
|================================== | 43%
|
|=================================== | 43%
|
|=================================== | 44%
|
|==================================== | 44%
|
|==================================== | 45%
|
|==================================== | 46%
|
|===================================== | 46%
|
|===================================== | 47%
|
|====================================== | 47%
|
|====================================== | 48%
|
|======================================= | 48%
|
|======================================= | 49%
|
|======================================== | 49%
|
|======================================== | 50%
|
|======================================== | 51%
|
|========================================= | 51%
|
|========================================= | 52%
|
|========================================== | 52%
|
|========================================== | 53%
|
|=========================================== | 53%
|
|=========================================== | 54%
|
|============================================ | 54%
|
|============================================ | 55%
|
|============================================ | 56%
|
|============================================= | 56%
|
|============================================= | 57%
|
|============================================== | 57%
|
|============================================== | 58%
|
|=============================================== | 58%
|
|=============================================== | 59%
|
|================================================ | 59%
|
|================================================ | 60%
|
|================================================ | 61%
|
|================================================= | 61%
|
|================================================= | 62%
|
|================================================== | 62%
|
|================================================== | 63%
|
|=================================================== | 63%
|
|=================================================== | 64%
|
|==================================================== | 64%
|
|==================================================== | 65%
|
|==================================================== | 66%
|
|===================================================== | 66%
|
|===================================================== | 67%
|
|====================================================== | 67%
|
|====================================================== | 68%
|
|======================================================= | 68%
|
|======================================================= | 69%
|
|======================================================== | 69%
|
|======================================================== | 70%
|
|======================================================== | 71%
|
|========================================================= | 71%
|
|========================================================= | 72%
|
|========================================================== | 72%
|
|========================================================== | 73%
|
|=========================================================== | 73%
|
|=========================================================== | 74%
|
|============================================================ | 74%
|
|============================================================ | 75%
|
|============================================================ | 76%
|
|============================================================= | 76%
|
|============================================================= | 77%
|
|============================================================== | 77%
|
|============================================================== | 78%
|
|=============================================================== | 78%
|
|=============================================================== | 79%
|
|================================================================ | 79%
|
|================================================================ | 80%
|
|================================================================ | 81%
|
|================================================================= | 81%
|
|================================================================= | 82%
|
|================================================================== | 82%
|
|================================================================== | 83%
|
|=================================================================== | 83%
|
|=================================================================== | 84%
|
|==================================================================== | 84%
|
|==================================================================== | 85%
|
|==================================================================== | 86%
|
|===================================================================== | 86%
|
|===================================================================== | 87%
|
|====================================================================== | 87%
|
|====================================================================== | 88%
|
|======================================================================= | 88%
|
|======================================================================= | 89%
|
|======================================================================== | 89%
|
|======================================================================== | 90%
|
|======================================================================== | 91%
|
|========================================================================= | 91%
|
|========================================================================= | 92%
|
|========================================================================== | 92%
|
|========================================================================== | 93%
|
|=========================================================================== | 93%
|
|=========================================================================== | 94%
|
|============================================================================ | 94%
|
|============================================================================ | 95%
|
|============================================================================ | 96%
|
|============================================================================= | 96%
|
|============================================================================= | 97%
|
|============================================================================== | 97%
|
|============================================================================== | 98%
|
|=============================================================================== | 98%
|
|=============================================================================== | 99%
|
|================================================================================| 99%
|
|================================================================================| 100%
## Finished parsing, reorganizing the protein data.
## Starting to iterate over 2574 groups.
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 3%
|
|=== | 4%
|
|==== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 7%
|
|====== | 8%
|
|======= | 8%
|
|======= | 9%
|
|======== | 9%
|
|======== | 10%
|
|======== | 11%
|
|========= | 11%
|
|========= | 12%
|
|========== | 12%
|
|========== | 13%
|
|=========== | 13%
|
|=========== | 14%
|
|============ | 14%
|
|============ | 15%
|
|============ | 16%
|
|============= | 16%
|
|============= | 17%
|
|============== | 17%
|
|============== | 18%
|
|=============== | 18%
|
|=============== | 19%
|
|================ | 19%
|
|================ | 20%
|
|================ | 21%
|
|================= | 21%
|
|================= | 22%
|
|================== | 22%
|
|================== | 23%
|
|=================== | 23%
|
|=================== | 24%
|
|==================== | 24%
|
|==================== | 25%
|
|==================== | 26%
|
|===================== | 26%
|
|===================== | 27%
|
|====================== | 27%
|
|====================== | 28%
|
|======================= | 28%
|
|======================= | 29%
|
|======================== | 29%
|
|======================== | 30%
|
|======================== | 31%
|
|========================= | 31%
|
|========================= | 32%
|
|========================== | 32%
|
|========================== | 33%
|
|=========================== | 33%
|
|=========================== | 34%
|
|============================ | 34%
|
|============================ | 35%
|
|============================ | 36%
|
|============================= | 36%
|
|============================= | 37%
|
|============================== | 37%
|
|============================== | 38%
|
|=============================== | 38%
|
|=============================== | 39%
|
|================================ | 39%
|
|================================ | 40%
|
|================================ | 41%
|
|================================= | 41%
|
|================================= | 42%
|
|================================== | 42%
|
|================================== | 43%
|
|=================================== | 43%
|
|=================================== | 44%
|
|==================================== | 44%
|
|==================================== | 45%
|
|==================================== | 46%
|
|===================================== | 46%
|
|===================================== | 47%
|
|====================================== | 47%
|
|====================================== | 48%
|
|======================================= | 48%
|
|======================================= | 49%
|
|======================================== | 49%
|
|======================================== | 50%
|
|======================================== | 51%
|
|========================================= | 51%
|
|========================================= | 52%
|
|========================================== | 52%
|
|========================================== | 53%
|
|=========================================== | 53%
|
|=========================================== | 54%
|
|============================================ | 54%
|
|============================================ | 55%
|
|============================================ | 56%
|
|============================================= | 56%
|
|============================================= | 57%
|
|============================================== | 57%
|
|============================================== | 58%
|
|=============================================== | 58%
|
|=============================================== | 59%
|
|================================================ | 59%
|
|================================================ | 60%
|
|================================================ | 61%
|
|================================================= | 61%
|
|================================================= | 62%
|
|================================================== | 62%
|
|================================================== | 63%
|
|=================================================== | 63%
|
|=================================================== | 64%
|
|==================================================== | 64%
|
|==================================================== | 65%
|
|==================================================== | 66%
|
|===================================================== | 66%
|
|===================================================== | 67%
|
|====================================================== | 67%
|
|====================================================== | 68%
|
|======================================================= | 68%
|
|======================================================= | 69%
|
|======================================================== | 69%
|
|======================================================== | 70%
|
|======================================================== | 71%
|
|========================================================= | 71%
|
|========================================================= | 72%
|
|========================================================== | 72%
|
|========================================================== | 73%
|
|=========================================================== | 73%
|
|=========================================================== | 74%
|
|============================================================ | 74%
|
|============================================================ | 75%
|
|============================================================ | 76%
|
|============================================================= | 76%
|
|============================================================= | 77%
|
|============================================================== | 77%
|
|============================================================== | 78%
|
|=============================================================== | 78%
|
|=============================================================== | 79%
|
|================================================================ | 79%
|
|================================================================ | 80%
|
|================================================================ | 81%
|
|================================================================= | 81%
|
|================================================================= | 82%
|
|================================================================== | 82%
|
|================================================================== | 83%
|
|=================================================================== | 83%
|
|=================================================================== | 84%
|
|==================================================================== | 84%
|
|==================================================================== | 85%
|
|==================================================================== | 86%
|
|===================================================================== | 86%
|
|===================================================================== | 87%
|
|====================================================================== | 87%
|
|====================================================================== | 88%
|
|======================================================================= | 88%
|
|======================================================================= | 89%
|
|======================================================================== | 89%
|
|======================================================================== | 90%
|
|======================================================================== | 91%
|
|========================================================================= | 91%
|
|========================================================================= | 92%
|
|========================================================================== | 92%
|
|========================================================================== | 93%
|
|=========================================================================== | 93%
|
|=========================================================================== | 94%
|
|============================================================================ | 94%
|
|============================================================================ | 95%
|
|============================================================================ | 96%
|
|============================================================================= | 96%
|
|============================================================================= | 97%
|
|============================================================================== | 97%
|
|============================================================================== | 98%
|
|=============================================================================== | 98%
|
|=============================================================================== | 99%
|
|================================================================================| 99%
|
|================================================================================| 100%
The function above is not immediately helpful, as it only wrangles the crazy excel into a less-crazy data structure, but if we wish to graph the data or play with it, we need to extract the elements of interest. For the moment those are primarily the peptide fragments and whether they were found in the trypsin digestions, chymotrypsin digestions, or both.
This data frame now contains everything we are likely to want to visualize.
Unfortunately, this data structure also suffers from a couple weaknesses:
Therefore, I will take a moment to fix these problems before trying any plots.
## Set some reasonable row names
protein_df <- less_difficult[["protein_data"]]
rownames(protein_df) <- make.names(protein_names, unique=TRUE)
## Error in `row.names<-.data.frame`(`*tmp*`, value = value): invalid 'row.names' length
## Set the column names to something legible.
## Each line denotes my mental organizational unit
new_colnames <- c("mascot_fdr", "sequest_fdr", "master", "accession", "description",
"mascot_q", "sequest_q", "coverage", "contaminant", "marked_as",
"num_peptides", "num_psms", "num_unique_pep", "num_prot_groups", "num_aas",
"mw", "pi", "mascot_score", "sequest_score", "num_peptides_mascot",
"num_peptides_sequest", "bp", "cc", "mf", "pfam_ids",
"entrez_gene_id", "ensembl_gene_id", "gene_symbol", "chromosome", "kegg",
"wikipathways", "found_trypsin_lysc_cf", "found_trypsin_lysc_wcl", "found_chymotrypsin_cf", "found_chymotrypsin_wcl",
"modifications", "id")
colnames(protein_df) <- new_colnames
## Now make sure the columns which should be numeric, are numeric.
numeric_cols <- c("mascot_q", "sequest_q", "coverage", "num_peptides", "num_psms",
"num_unique_pep", "num_prot_groups", "num_aas", "mw", "pi",
"mascot_score", "sequest_score",
"num_peptides_mascot", "num_peptides_sequest")
for (col in numeric_cols) {
protein_df[[col]] <- as.numeric(protein_df[[col]])
}
factor_cols <- c("found_trypsin_lysc_cf", "found_trypsin_lysc_wcl",
"found_chymotrypsin_cf", "found_chymotrypsin_wcl")
for (col in factor_cols) {
protein_df[[col]] <- as.factor(protein_df[[col]])
}
## Finally, set a 'state' column describing the 4 columns of chymotrypsin/trypsin digestions.
protein_df[["state"]] <- "ambiguous"
## If something is found in one chymo trypsin, but in neither trypsin, set it to chymo_only.
chymo_hits <- (protein_df[["found_chymotrypsin_cf"]] == "High" | protein_df[["found_chymotrypsin_wcl"]] == "High") &
(protein_df[["found_trypsin_lysc_cf"]] != "High" & protein_df[["found_trypsin_lysc_wcl"]] != "High")
protein_df[chymo_hits, "state"] <- "chymo_only"
## Conversely, if found in trypsin but not chymo, set it to tryp_only.
tryp_hits <- (protein_df[["found_trypsin_lysc_cf"]] == "High" | protein_df[["found_trypsin_lysc_wcl"]] == "High") &
(protein_df[["found_chymotrypsin_cf"]] != "High" & protein_df[["found_chymotrypsin_wcl"]] != "High")
protein_df[tryp_hits, "state"] <- "tryp_only"
no_hits <- protein_df[["found_trypsin_lysc_cf"]] == "Not Found" &
protein_df[["found_trypsin_lysc_wcl"]] == "Not Found" &
protein_df[["found_chymotrypsin_cf"]] == "Not Found" &
protein_df[["found_chymotrypsin_wcl"]] == "Not Found"
protein_df[no_hits, "state"] <- "None"
protein_df[["state"]] <- as.factor(protein_df[["state"]])
chymo_df <- protein_df[chymo_hits, ]
tryp_df <- protein_df[tryp_hits, ]
Now that the data has been wrangled, we can use the various R plotters to look at the data relatively quickly and easily.
library(ggplot2)
coverage_wrt_pi <- ggplot(protein_df, aes_string(x="pi", y="coverage", colour="state")) +
geom_point(alpha=0.6, size=2) +
geom_density_2d()
coverage_wrt_mw <- ggplot(protein_df, aes_string(x="mw", y="coverage", colour="state")) +
geom_point(alpha=0.6, size=2) +
geom_density_2d()
protein_df$logmascot <- log2(protein_df$mascot_score)
scores_by_state <- ggplot(protein_df, aes_string(x="state", y="logmascot")) +
geom_boxplot(aes_string(fill="state"))
Yan suggested that peptides with low-pIs would be less covered, especially in the chymotrypsin samples.
pp(file="images/coverage_wrt_pi.png")
## NULL
coverage_wrt_pi
## Warning: Computation failed in `stat_density2d()`:
## bandwidths must be strictly positive
dev.off()
## X11cairo
## 2
## To me, it looks like there might be something to that idea.
Maybe there is a trend of coverage and molecular weight by sample-type.
pp(file="images/coverage_wrt_mw.png")
## NULL
coverage_wrt_mw
## Warning: Computation failed in `stat_density2d()`:
## bandwidths must be strictly positive
dev.off()
## X11cairo
## 2
## It looks to me that the trypsin is slightly better than chymotrypsin.
## It is not clear in the excel, but I presume this is in kDa.
A quick boxplot showing mascot scores by what was observed.
scores_by_state
## Warning: Removed 96 rows containing non-finite values (stat_boxplot).
all_de <- read.table("limma_result.csv", header=TRUE, sep="\t")
colnames(all_de) <- c("ID", "logFC", "AveExpr", "adj.P.Val", "abbrev_id", "description", "gene_length", "type")
all_de <- merge(all_de, mtb_annotations, by.x="ID", by.y="ID")
dim(all_de)
## [1] 3995 21
pe_ids <- read.table("reference/annotated_pe_genes.txt")
colnames(pe_ids) <- "locus_tag"
pe_annotations <- merge(pe_ids, mtb_annotations, by="locus_tag")
rownames(pe_annotations) <- pe_annotations[["ID"]]
dim(pe_annotations)
## [1] 161 14
uniprot_annot <- load_uniprot_annotations(file="reference/uniprot_3AUP000001584.txt.gz")
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 3%
|
|=== | 4%
|
|==== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|====== | 7%
|
|====== | 8%
|
|======= | 8%
|
|======= | 9%
|
|======== | 9%
|
|======== | 10%
|
|======== | 11%
|
|========= | 11%
|
|========= | 12%
|
|========== | 12%
|
|========== | 13%
|
|=========== | 13%
|
|=========== | 14%
|
|============ | 14%
|
|============ | 15%
|
|============ | 16%
|
|============= | 16%
|
|============= | 17%
|
|============== | 17%
|
|============== | 18%
|
|=============== | 18%
|
|=============== | 19%
|
|================ | 19%
|
|================ | 20%
|
|================ | 21%
|
|================= | 21%
|
|================= | 22%
|
|================== | 22%
|
|================== | 23%
|
|=================== | 23%
|
|=================== | 24%
|
|==================== | 24%
|
|==================== | 25%
|
|==================== | 26%
|
|===================== | 26%
|
|===================== | 27%
|
|====================== | 27%
|
|====================== | 28%
|
|======================= | 28%
|
|======================= | 29%
|
|======================== | 29%
|
|======================== | 30%
|
|======================== | 31%
|
|========================= | 31%
|
|========================= | 32%
|
|========================== | 32%
|
|========================== | 33%
|
|=========================== | 33%
|
|=========================== | 34%
|
|============================ | 34%
|
|============================ | 35%
|
|============================ | 36%
|
|============================= | 36%
|
|============================= | 37%
|
|============================== | 37%
|
|============================== | 38%
|
|=============================== | 38%
|
|=============================== | 39%
|
|================================ | 39%
|
|================================ | 40%
|
|================================ | 41%
|
|================================= | 41%
|
|================================= | 42%
|
|================================== | 42%
|
|================================== | 43%
|
|=================================== | 43%
|
|=================================== | 44%
|
|==================================== | 44%
|
|==================================== | 45%
|
|==================================== | 46%
|
|===================================== | 46%
|
|===================================== | 47%
|
|====================================== | 47%
|
|====================================== | 48%
|
|======================================= | 48%
|
|======================================= | 49%
|
|======================================== | 49%
|
|======================================== | 50%
|
|======================================== | 51%
|
|========================================= | 51%
|
|========================================= | 52%
|
|========================================== | 52%
|
|========================================== | 53%
|
|=========================================== | 53%
|
|=========================================== | 54%
|
|============================================ | 54%
|
|============================================ | 55%
|
|============================================ | 56%
|
|============================================= | 56%
|
|============================================= | 57%
|
|============================================== | 57%
|
|============================================== | 58%
|
|=============================================== | 58%
|
|=============================================== | 59%
|
|================================================ | 59%
|
|================================================ | 60%
|
|================================================ | 61%
|
|================================================= | 61%
|
|================================================= | 62%
|
|================================================== | 62%
|
|================================================== | 63%
|
|=================================================== | 63%
|
|=================================================== | 64%
|
|==================================================== | 64%
|
|==================================================== | 65%
|
|==================================================== | 66%
|
|===================================================== | 66%
|
|===================================================== | 67%
|
|====================================================== | 67%
|
|====================================================== | 68%
|
|======================================================= | 68%
|
|======================================================= | 69%
|
|======================================================== | 69%
|
|======================================================== | 70%
|
|======================================================== | 71%
|
|========================================================= | 71%
|
|========================================================= | 72%
|
|========================================================== | 72%
|
|========================================================== | 73%
|
|=========================================================== | 73%
|
|=========================================================== | 74%
|
|============================================================ | 74%
|
|============================================================ | 75%
|
|============================================================ | 76%
|
|============================================================= | 76%
|
|============================================================= | 77%
|
|============================================================== | 77%
|
|============================================================== | 78%
|
|=============================================================== | 78%
|
|=============================================================== | 79%
|
|================================================================ | 79%
|
|================================================================ | 80%
|
|================================================================ | 81%
|
|================================================================= | 81%
|
|================================================================= | 82%
|
|================================================================== | 82%
|
|================================================================== | 83%
|
|=================================================================== | 83%
|
|=================================================================== | 84%
|
|==================================================================== | 84%
|
|==================================================================== | 85%
|
|==================================================================== | 86%
|
|===================================================================== | 86%
|
|===================================================================== | 87%
|
|====================================================================== | 87%
|
|====================================================================== | 88%
|
|======================================================================= | 88%
|
|======================================================================= | 89%
|
|======================================================================== | 89%
|
|======================================================================== | 90%
|
|======================================================================== | 91%
|
|========================================================================= | 91%
|
|========================================================================= | 92%
|
|========================================================================== | 92%
|
|========================================================================== | 93%
|
|=========================================================================== | 93%
|
|=========================================================================== | 94%
|
|============================================================================ | 94%
|
|============================================================================ | 95%
|
|============================================================================ | 96%
|
|============================================================================= | 96%
|
|============================================================================= | 97%
|
|============================================================================== | 97%
|
|============================================================================== | 98%
|
|=============================================================================== | 98%
|
|=============================================================================== | 99%
|
|================================================================================| 99%
|
|================================================================================| 100%
## Finished parsing, creating data frame.
dim(uniprot_annot)
## [1] 3993 41
uniprot_rna <- merge(all_de, uniprot_annot, by.x="ID", by.y="loci")
dim(uniprot_rna)
## [1] 3891 61
uniprot_pe <- merge(pe_annotations, uniprot_annot, by.x="ID", by.y="loci")
dim(uniprot_pe)
## [1] 157 54
uniprot_pe_peptides <- merge(uniprot_annot, protein_df, by.x="primary_accessions", by.y="accession")
uniprot_pe_peptides <- merge(uniprot_pe_peptides, pe_ids, by.x="loci", by.y="locus_tag")
dim(uniprot_pe_peptides)
## [1] 40 79
uniprot_rna_protein <- merge(uniprot_rna, protein_df, by.x="primary_accessions", by.y="accession")
dim(protein_df)
## [1] 2671 39
dim(uniprot_rna_protein)
## [1] 2511 99
uniprot_rna_pe <- merge(uniprot_pe, uniprot_rna_protein, by="ID")
## Warning in merge.data.frame(uniprot_pe, uniprot_rna_protein, by = "ID"): column names
## 'description.x', 'description.y' are duplicated in the result
dim(uniprot_rna_pe)
## [1] 40 152
## This is exhausting, I will call these big and little from now on
big <- uniprot_rna_protein
rownames(big) <- make.names(big$ID, unique=TRUE)
ordered <- order(rownames(big))
big <- big[ordered, ]
little <- uniprot_rna_pe
rownames(little) <- make.names(little$ID, unique=TRUE)
ordered <- order(rownames(little))
little <- little[ordered, ]
shared <- rownames(big) %in% rownames(little)
## pe_de_annot <- merge(pe_de, mtb_annotations, by="locus_tag")
## The contrast is written as wt - delta, so add back the logFC I think
## Hopefully I did not reverse this in my head.
big$ave_minus_log <- big$AveExpr - big$logFC
big$ave_minus_log <- big$AveExpr - big$logFC
big$pseudo_rpkm <- (big$ave_minus_log / big$width) * 1000
maximum_rpkm <- max(big$pseudo_rpkm)
big$express_vs_max <- big$pseudo_rpkm / maximum_rpkm
## Since we constant ordered the dataframes above, this is valid.
little$pseudo_rpkm <- big$pseudo_rpkm[shared]
little$express_vs_max <- big$express_vs_max[shared]
write.csv(x=little, file="pe_relative_expression_annotation-20171205.csv")
library(hpgltools)
library(ggplot2)
test <- list(
"all_peptides" = big$express_vs_max,
"pe_peptides" = little$express_vs_max)
multi <- plot_multihistogram(test)
## Used Bon Ferroni corrected t test(s) between columns.
multi$plot
rpkm_vs_psms <- big[, c("pseudo_rpkm", "num_psms")]
little_rpkm_vs_psms <- little[, c("pseudo_rpkm", "num_psms")]
rpkm_vs_psms$num_psms <- log2(rpkm_vs_psms$num_psms + 1)
little_rpkm_vs_psms$num_psms <- log2(little_rpkm_vs_psms$num_psms + 1)
rpkm_psms_linear <- plot_linear_scatter(rpkm_vs_psms)
## Used Bon Ferroni corrected t test(s) between columns.
rpkm_psms_linear$scatter
rpkm_psms_linear$cor
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 5.8, df = 2500, p-value = 9e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07575 0.15296
## sample estimates:
## cor
## 0.1145
rpkm_psms_linear$lm_rsq
## [1] 0.007207
big_little <- ggplot(data=rpkm_vs_psms, aes_string(x="pseudo_rpkm", y="num_psms")) +
geom_point()
big_little
big_little <- big_little +
geom_point(data=little_rpkm_vs_psms, aes_string(x="pseudo_rpkm", y="num_psms"), colour="red")
pp(file="rpkm_vs_psms.png")
## NULL
big_little
dev.off()
## X11cairo
## 2
rpkm_vs_peptides <- big[, c("pseudo_rpkm", "num_peptides")]
rpkm_peptides_linear <- plot_linear_scatter(rpkm_vs_peptides)
## Used Bon Ferroni corrected t test(s) between columns.
rpkm_peptides_linear$scatter
rpkm_peptides_linear$cor
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = -4.8, df = 2500, p-value = 2e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.13318 -0.05564
## sample estimates:
## cor
## -0.09455
rpkm_psms_linear$lm_rsq
## [1] 0.007207
little_rpkm_vs_peptides <- little[, c("pseudo_rpkm", "num_peptides")]
big_little <- ggplot(data=rpkm_vs_peptides, aes_string(x="pseudo_rpkm", y="num_peptides")) +
geom_point()
big_little
big_little <- big_little +
geom_point(data=little_rpkm_vs_peptides, aes_string(x="pseudo_rpkm", y="num_peptides"), colour="red")
big_little
rpkm_vs_coverage <- big[, c("pseudo_rpkm", "coverage")]
rpkm_coverage_linear <- plot_linear_scatter(rpkm_vs_coverage)
## Used Bon Ferroni corrected t test(s) between columns.
rpkm_coverage_linear$scatter
rpkm_coverage_linear$cor
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 21, df = 2500, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3532 0.4198
## sample estimates:
## cor
## 0.387
rpkm_coverage_linear$lm_rsq
## [1] 0.1618
little_rpkm_vs_coverage <- little[, c("pseudo_rpkm", "coverage")]
big_little <- ggplot(data=rpkm_vs_coverage, aes_string(x="pseudo_rpkm", y="coverage")) +
geom_point()
big_little
big_little <- big_little +
geom_point(data=little_rpkm_vs_coverage, aes_string(x="pseudo_rpkm", y="coverage"), colour="red")
big_little
pander::pander(sessionInfo())
R version 3.4.3 (2017-11-30)
**Platform:** x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: hpgltools(v.2017.10) and ggplot2(v.2.2.1)
loaded via a namespace (and not attached): reshape2(v.1.4.2), pander(v.0.6.1), colorspace(v.1.3-2), htmltools(v.0.3.6), yaml(v.2.1.15), base64enc(v.0.1-3), rlang(v.0.1.4), withr(v.2.1.0), BiocGenerics(v.0.24.0), readxl(v.1.0.0), foreach(v.1.4.3), plyr(v.1.8.4), robustbase(v.0.92-8), stringr(v.1.2.0), munsell(v.0.4.3), commonmark(v.1.4), gtable(v.0.2.0), cellranger(v.1.1.0), devtools(v.1.13.4), codetools(v.0.2-15), memoise(v.1.1.0), evaluate(v.0.10.1), labeling(v.0.3), Biobase(v.2.38.0), knitr(v.1.17), parallel(v.3.4.3), DEoptimR(v.1.0-8), Rcpp(v.0.12.14), readr(v.1.1.1), scales(v.0.5.0), backports(v.1.1.1), hms(v.0.4.0), digest(v.0.6.12), stringi(v.1.1.6), grid(v.3.4.3), rprojroot(v.1.2), tools(v.3.4.3), magrittr(v.1.5), lazyeval(v.0.2.1), tibble(v.1.3.4), pkgconfig(v.2.0.1), MASS(v.7.3-47), data.table(v.1.10.4-3), xml2(v.1.1.1), rmarkdown(v.1.8), roxygen2(v.6.0.1), iterators(v.1.0.8), R6(v.2.2.2) and compiler(v.3.4.3)
message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset e98d5f5d939340766de61c5d354576f11c2769db
## R> packrat::restore()
## This is hpgltools commit: Tue Dec 5 22:18:58 2017 -0500: e98d5f5d939340766de61c5d354576f11c2769db
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 02_eval_result_201711-v20171205.rda.xz
tmp <- sm(saveme(filename=this_save))