M.tuberculosis 20180913 proteomics: Collecting annotation data.
A fresh running of all proteomics tasks
I think I finally worked out all(most?) of the kinks in the processing of DIA data. Thus I want to have a fresh run of all the tasks required to interpret the results.
Annotation version: 20180913
Genome annotation input
Read a gff file
In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments. More in-depth information for the human transcriptome may be extracted from biomart.
## The old way of getting genome/annotation data
mtb_gff <- "reference/mycobacterium_tuberculosis_h37rv_2.gff.gz"
mtb_genome <- "reference/mtuberculosis_h37rv_genbank.fasta"
mtb_cds <- "reference/mtb_cds.fasta"
mtb_annotations <- sm(load_gff_annotations(mtb_gff, type="gene"))
colnames(mtb_annotations) <- gsub(pattern="\\.", replacement="", x=colnames(mtb_annotations))
mtb_annotations[["description"]] <- gsub(pattern="\\+", replacement=" ",
x=mtb_annotations[["description"]])
mtb_annotations[["function"]] <- gsub(pattern="\\+", replacement=" ",
x=mtb_annotations[["function"]])
rownames(mtb_annotations) <- mtb_annotations[["ID"]]
Download from microbesonline
Apparently I queried the microbesonline too often and now I get an error whenever I try to use them, this disappoints me.
## I made a nifty function to do this stuff: load_uniprotws_annotations().
## It is slow, though.
mtb_microbes <- load_microbesonline_annotations(id=83332)
## The species being downloaded is: Mycobacterium tuberculosis H37Rv
Getting ontology data
## The species being downloaded is: Mycobacterium tuberculosis H37Rv and is being downloaded as 83332.tab.
Some pattern matching
This little block is intended to seek out peptide sequences with the highly degenerate pattern: Y(E|D) with a varying number of amino acids between the Y and (E or D).
get_hits <- function(patterns, peptide_file, pct_limit=0.25) {
pep_seq <- Biostrings::readAAStringSet(peptide_file)
pep_lst <- as.data.frame(pep_seq)[["x"]]
pos_df <- data.frame(stringsAsFactors=FALSE)
pep_names <- names(pep_seq)
rowname <- ""
for (r in 1:length(pep_names)) {
rowname <- pep_names[r]
new_row <- vector()
found_hits <- FALSE
for (p in 1:length(patterns)) {
pat <- patterns[[p]]
pname <- names(patterns)[p]
column <- gregexpr(pattern=pat, text=pep_lst)
names(column) <- names(pep_seq)
name <- names(column)[r]
row <- column[[r]]
len <- nchar(pep_lst[r])
pct <- as.numeric(row) / len
cdist <- len - as.numeric(row)
pos_name <- glue::glue("pos_{pname}")
num_name <- glue::glue("number_{pname}")
nt_name <- glue::glue("nt_{pname}")
ct_name <- glue::glue("ct_{pname}")
pct_name <- glue::glue("pct_{pname}")
pos_name <- glue::glue("pos_{pname}")
## An important caveat here: There may be more than one value.
new_row["rowname"] <- rowname
number_hits <- length(as.numeric(row))
if (as.numeric(row)[1] == -1) {
number_hits <- "0"
}
new_row[num_name] <- number_hits
new_row[pos_name] <- toString(as.numeric(row))
if (new_row[pos_name] == "-1") {
new_row[pos_name] <- "0"
} else {
found_hits <- TRUE
}
new_row["length"] <- len
op <- options(warn=2)
if (pct[1] < 0) {
new_row[pct_name] <- "0"
} else {
new_row[pct_name] <- toString(pct)
}
options(op)
nt_str <- ""
ct_str <- ""
max_pct <- 1 - pct_limit
for (i in pct) {
if (i < 0) {
next
}
if (i < pct_limit) {
nt_str <- paste0(nt_str, ", ", i)
}
if (i > max_pct) {
ct_str <- paste0(ct_str, ", ", i)
}
}
nt_str <- gsub(pattern="^\\, ", replacement="", x=nt_str)
ct_str <- gsub(pattern="^\\, ", replacement="", x=ct_str)
new_row[nt_name] <- nt_str
new_row[ct_name] <- ct_str
}
if (isTRUE(found_hits)) {
pos_df <- rbind(pos_df, new_row, stringsAsFactors=FALSE)
colnames(pos_df) <- names(new_row)
}
} ## End for every gene
rownames(pos_df) <- pos_df[["rowname"]]
pos_df <- pos_df[-1, ]
return(pos_df)
}
##patterns <- c("Y(E|D)", "Y.(E|D)", "Y..(E|D)", "Y...(E|D)", "Y....(E|D)", "Y.....(E|D)")
mtb_cds <- "reference/mtb_cds.fasta"
patterns <- c("Y..(E|D)", "Y...(E|D)")
names(patterns) <- c("two", "three")
yed_patterns <- get_hits(patterns, mtb_cds)
written <- write_xls(yed_patterns, excel="positions_by_patterns.xlsx")
## Saving to: positions_by_patterns.xlsx
Add the YED patterns to the annotation data
Compare subset of interesting proteins vs. all
Najib and Volker requested a comparison of the distribution pattern hits of all proteins vs. the distribution in a specific subset of proteins. I don’t actually know the subset desired, so I will assume the
pe_ids <- read.table("reference/annotated_pe_genes.txt")[["V1"]]
yed_pe <- yed_patterns[pe_ids, ]
all_numbers <- as.numeric(yed_patterns[["number_two"]])
pe_numbers <- as.numeric(yed_pe[["number_two"]])
allthree_numbers <- as.numeric(yed_patterns[["number_three"]])
pethree_numbers <- as.numeric(yed_pe[["number_three"]])
all_pct <- yed_patterns[["pct_two"]]
pe_pct <- yed_pe[["pct_two"]]
all_positions <- yed_patterns[["pos_two"]]
pe_positions <- yed_pe[["pos_two"]]
sampled_numbers_two <- sample(x=all_numbers, size=1000, replace=TRUE)
summary(sampled_numbers_two)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 1.15 2.00 13.00
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.329 1.000 3.000
sampled_numbers_three <- sample(x=allthree_numbers, size=1000, replace=TRUE)
summary(sampled_numbers_three)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 1.23 2.00 11.00
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.708 1.000 4.000
if (!isTRUE(get0("skip_load"))) {
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 7c4477bb4fa3639cc6cf7940216e4c4b8cbee7ce
## This is hpgltools commit: Fri Oct 26 17:27:11 2018 -0400: 7c4477bb4fa3639cc6cf7940216e4c4b8cbee7ce
## Saving to 01_annotation_20180913-v20180913.rda.xz