## Time-stamp: 

## Please remember to do a make in the myr/ directory before render()ing this.
## This block serves to load requisite libraries and set some options.
options(java.parameters="-Xmx32g -Xms4g")  ## used for xlconnect -- damn 4g wasn't enough
library("myr")
autoloads()
opts_knit$set(progress=TRUE, verbose=TRUE, stop_on_error=FALSE, error=TRUE, fig.width=800/192, fig.height=800/192, dpi=19)
theme_set(theme_bw(base_size=10))
set.seed(1)

Looking for interesting RNAs in mouse exosomes.

Loading/Saving data and generating reports

The following block is intended to only be evaluated manually when playing with data and generating reports. It is not evaluated by knitr so that the data is generated de-novo when the report is made.

load("RData")
##rm(list=ls())
save(list=ls(all=TRUE), file="RData")
render("exosome.rmd", output_format="pdf_document")
##render("exosome.rmd", output_format="html_document")
render("exosome.rmd", 'knitrBootstrap::bootstrap_document')

Genome annotation input

Annotation data for the genes is important for everything which follows. The TritrypDB reference gff files are the source for this information.

I dumped the homo sapiens gtf file into human.tab and substituted all the '; ' into tabs so that it is possible to easily extract all the annotation data from the otherwise unwieldy gtf file.

The resulting columns of interest (to me, anyway) are: V9: gene_id ENSG000... <<-- the count tables use this V10: transcript_id ENST.... V12: gene_name RP11... V13: transcript_name RP11-... V14: protein_id ENSP...

The following block I told to cache, because loading the human annotations takes too long.

gff2tooltip = function(filename, type) {
    annotations = as.data.frame(import.gff3(filename))
    annotations = subset(annotations, type==type)
    annotations$ID = gsub("exon:", "", annotations$ID)
    rownames(annotations) = make.names(annotations$ID, unique=TRUE)
    if (is.null(annotations$Name)) {
        tooltip = annotations[,c("ID","Parent")]
        tooltip$tooltip = paste(tooltip$ID, tooltip$Parent, sep=";")        
    } else {
        tooltip = annotations[,c("ID","Name","Parent")]        
        tooltip$tooltip = paste(tooltip$ID, tooltip$Name, tooltip$Parent, sep=";")
    }
    tooltip$tooltip = gsub("exon:", "", tooltip$tooltip)
    tooltip$tooltip = gsub("character\\(0\\)", "", tooltip$tooltip)    
    tooltip$tooltip = gsub("NA", "", tooltip$tooltip)
    tooltip$tooltip = gsub("", "", tooltip$tooltip)        
    tooltip = tooltip[-1]
    tooltip = tooltip[-1]
    tooltip = tooltip[-1]
    ret = list(genes=annotations, tooltip=tooltip)
    return(ret)
}

miRNA =  gff2tooltip("reference/mmusculus_mi.gff.xz", "transcript")
snRNA = gff2tooltip("reference/mmusculus_sn.gff.xz", "transcript")
snoRNA = gff2tooltip("reference/mmusculus_sno.gff.xz", "transcript")
miscRNA = gff2tooltip("reference/mmusculus_misc.gff.xz", "transcript")
lincRNA = gff2tooltip("reference/mmusculus_linc.gff.xz", "transcript")
antiRNA = gff2tooltip("reference/mmusculus_antisense.gff.xz", "transcript")
exonRNA = gff2tooltip("reference/mmusculus_exon.gff.xz", "exon")
pseudo =  gff2tooltip("reference/mmusculus_pseudo.gff.xz", "transcript")
fivep = gff2tooltip("reference/mmusculus_fiveputr.gff.xz", "exon")
threep = gff2tooltip("reference/mmusculus_threeputr.gff.xz", "exon")
rintron = gff2tooltip("reference/mmusculus_rintron.gff.xz", "exon")

Set up experimental design

The following couple of lines reads the csv of samples and sets some initial colors for plots.

Initial sample estimation

The following block of code sets up an experiment of all the samples and plots the various metrics used to estimate batch effect and sample quality, etc.

mirna = create_expt("samples.csv", suffix="_forward-trimmed-def_mi.count.xz", genes=miRNA$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

snrna = create_expt("samples.csv", suffix="_forward-trimmed-def_sn.count.xz", genes=snRNA$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

snorna = create_expt("samples.csv", suffix="_forward-trimmed-def_sno.count.xz", genes=snoRNA$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

miscrna = create_expt("samples.csv", suffix="_forward-trimmed-def_misc.count.xz", genes=miscRNA$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

lincrna = create_expt("samples.csv", suffix="_forward-trimmed-def_linc.count.xz", genes=lincRNA$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

antirna = create_expt("samples.csv", suffix="_forward-trimmed-def_antisense.count.xz", genes=antiRNA$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

pseudorna = create_expt("samples.csv", suffix="_forward-trimmed-def_pseudo.count.xz", genes=pseudo$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

exonrna = create_expt("samples.csv", suffix="_forward-trimmed-def_exon.count.xz", genes=exonRNA$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

fiveprna = create_expt("samples.csv", suffix="_forward-trimmed-def_fiveputr.count.xz", genes=fivep$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

threeprna = create_expt("samples.csv", suffix="_forward-trimmed-def_threeputr.count.xz", genes=threep$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

rintronrna = create_expt("samples.csv", suffix="_forward-trimmed-def_rintron.count.xz", genes=rintron$genes, by_sample=TRUE)

## [1] "This function needs the conditions and batches to be an explicit column in the sample sheet."
## [1] "Please note that thus function assumes a specific set of columns in the sample sheet:"
## [1] "The most important ones are: Sample.ID, Stage, Type."
## [1] "Other columns it will attempt to create by itself, but if"
## [1] "batch and condition are provided, that is a nice help."

Quantifying relative species

Najib's nice idea was to take the % of each category against the whole and just show that.

mi_found = colSums(exprs(mirna$expressionset))
sn_found = colSums(exprs(snrna$expressionset))
sno_found = colSums(exprs(snorna$expressionset))
misc_found = colSums(exprs(miscrna$expressionset))
linc_found = colSums(exprs(lincrna$expressionset))
anti_found = colSums(exprs(antirna$expressionset))
pseudo_found = colSums(exprs(pseudorna$expressionset))
exon_found = colSums(exprs(exonrna$expressionset))
fivep_found = colSums(exprs(fiveprna$expressionset))
threep_found = colSums(exprs(threeprna$expressionset))
rintron_found = colSums(exprs(rintronrna$expressionset))

total = mi_found
total = rbind(total, sn_found, sno_found, misc_found, linc_found, anti_found, pseudo_found, exon_found, fivep_found, threep_found, rintron_found)
rownames(total) = c("mi", "sn", "sno","misc","linc","anti","pseudo","exon","fivep","threep", "rintron")
totals = colSums(total)

pct = as.data.frame(total)
pct$hpgl0522 = (pct$hpgl0522 / totals["hpgl0522"]) * 100.0
pct$hpgl0523 = (pct$hpgl0523 / totals["hpgl0523"]) * 100.0
pct$hpgl0524 = (pct$hpgl0524 / totals["hpgl0524"]) * 100.0
pct$hpgl0525 = (pct$hpgl0525 / totals["hpgl0525"]) * 100.0
pct$hpgl0526 = (pct$hpgl0526 / totals["hpgl0526"]) * 100.0
pct$hpgl0527 = (pct$hpgl0527 / totals["hpgl0527"]) * 100.0
pct

##             hpgl0522   hpgl0523  hpgl0524   hpgl0525    hpgl0526
## mi       0.170214364  4.6937173  6.719178  6.6107012  0.30379676
## sn       0.005198314 32.8096675 10.524452  9.9348611  0.21527044
## sno      0.008275577 10.9020987  9.029677  3.8649893  0.06427539
## misc     0.042203704  1.3100507  4.011671  4.5604661  0.49505948
## linc     2.070889271 25.9561705 33.173892 26.8711015 36.34498860
## anti     0.361765287  0.1873744  0.807763  0.7640509  1.00842876
## pseudo   5.408793731  0.8302983  1.991098  2.7292153  1.73665156
## exon    52.433830263 11.7468378 17.947262 26.9382199 45.37310996
## fivep    2.402290492  0.5010640  1.901453  7.9272908  2.21482573
## threep  27.107521956  8.8425816 10.429641  5.2025717  6.72612430
## rintron  9.989017040  2.2201392  3.463913  4.5965322  5.51746901
##            hpgl0527
## mi       0.39949682
## sn       0.03091192
## sno      0.04368293
## misc     0.69953385
## linc    25.11203235
## anti     1.54078622
## pseudo   1.70125268
## exon    45.21577404
## fivep   13.13301136
## threep   4.57379613
## rintron  7.54972172

mi_norm = my_norm(expt=mirna, transform="log2", norm="quant", convert="cpm")$counts
mi_test = mi_norm[,c("hpgl0524","hpgl0525")]
mi_scatter = my_linear_scatter(mi_test, loess=TRUE, identity=TRUE)

## Warning in lmrob.S(x, y, control = control, mf = mf): S-estimated scale ==
## 0: Probably exact fit; check your data

## [1] "No binwidth nor bins provided, setting it to 0.0306054087304394 in order to have 500 bins."

mi_scatter$scatter

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : pseudoinverse used at -0.076514

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : neighborhood radius 1.192

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : reciprocal condition number 1.4068e-28

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : There are other near singularities as well. 1.2443

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : pseudoinverse used at -0.076514

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : neighborhood radius 1.192

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : reciprocal condition number 1.4068e-28

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : There are other near singularities as well. 1.2443

mi_scatter$lm_model

## 
## Call:
## robustbase::lmrob(formula = second ~ first, data = df, method = "SMDM")
## 
## Exact fit detected
## 
## Coefficients:
## (Intercept)        first  
##           0            0

mi_scatter$correlation

## 
##  Pearson's product-moment correlation
## 
## data:  df[, 1] and df[, 2]
## t = 92.2091, df = 1971, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8923570 0.9089928
## sample estimates:
##       cor 
## 0.9010056

misc_norm = my_norm(expt=miscrna, transform="log2", norm="quant", convert="cpm")$counts
misc_test = misc_norm[,c("hpgl0524","hpgl0525")]
misc_scatter = my_linear_scatter(misc_test, loess=TRUE, identity=TRUE)

## Warning in lmrob.S(x, y, control = control, mf = mf): S-estimated scale ==
## 0: Probably exact fit; check your data

## [1] "No binwidth nor bins provided, setting it to 0.0297091037652765 in order to have 500 bins."

misc_scatter$scatter

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : pseudoinverse used at -0.074273

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : neighborhood radius 1.2967

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : reciprocal condition number 8.3078e-31

## Warning in simpleLoess(y, x, w, span, degree, parametric, drop.square,
## normalize, : There are other near singularities as well. 1.4942

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : pseudoinverse used at -0.074273

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : neighborhood radius 1.2967

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : reciprocal condition number 8.3078e-31

## Warning in predLoess(y, x, newx, s, weights, pars$robust, pars$span,
## pars$degree, : There are other near singularities as well. 1.4942

misc_scatter$lm_model

## 
## Call:
## robustbase::lmrob(formula = second ~ first, data = df, method = "SMDM")
## 
## Exact fit detected
## 
## Coefficients:
## (Intercept)        first  
##           0            0

misc_scatter$correlation

## 
##  Pearson's product-moment correlation
## 
## data:  df[, 1] and df[, 2]
## t = 50.728, df = 588, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8860276 0.9162153
## sample estimates:
##       cor 
## 0.9022207

sn_norm = my_norm(expt=snrna, transform="log2", norm="quant", convert="cpm")$counts
sn_test = sn_norm[,c("hpgl0524","hpgl0525")]
sn_scatter = my_linear_scatter(sn_test, loess=TRUE, identity=TRUE)

## Warning in lmrob.S(x, y, control = control, mf = mf): find_scale() did not
## converge in 'maxit.scale' (= 200) iterations

## Warning in lmrob.S(x, y, control = control, mf = mf): find_scale() did not
## converge in 'maxit.scale' (= 200) iterations

## Warning in lmrob.S(x, y, control = control, mf = mf): find_scale() did not
## converge in 'maxit.scale' (= 200) iterations

## Warning in lmrob.S(x, y, control = control, mf = mf): find_scale() did not
## converge in 'maxit.scale' (= 200) iterations

## Warning in lmrob.S(x, y, control = control, mf = mf): find_scale() did not
## converge in 'maxit.scale' (= 200) iterations

## Warning in lmrob.S(x, y, control = control, mf = mf): find_scale() did not
## converge in 'maxit.scale' (= 200) iterations

## Warning in lmrob.S(x, y, control = control, mf = mf): S-estimated scale ==
## 0: Probably exact fit; check your data

## [1] "No binwidth nor bins provided, setting it to 0.0316787848939751 in order to have 500 bins."

sn_scatter$scatter

sn_scatter$lm_model

## 
## Call:
## robustbase::lmrob(formula = second ~ first, data = df, method = "SMDM")
## 
## Exact fit detected
## 
## Coefficients:
## (Intercept)        first  
##           0            0

sn_scatter$correlation

## 
##  Pearson's product-moment correlation
## 
## data:  df[, 1] and df[, 2]
## t = 83.2766, df = 1381, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9040039 0.9215517
## sample estimates:
##       cor 
## 0.9132001

exon_norm = my_norm(expt = exonrna, transform="log2", norm="quant", convert="cpm")$counts
exon_test = exon_norm[,c("hpgl0524","hpgl0525")]
ex_scatter = my_linear_scatter(exon_test, loess=TRUE, identity=TRUE)

## [1] "No binwidth nor bins provided, setting it to 0.0359943148478198 in order to have 500 bins."

ex_scatter$scatter

ex_scatter$lm_model

## 
## Call:
## robustbase::lmrob(formula = second ~ first, data = df, method = "SMDM")
## 
## Coefficients:
## (Intercept)        first  
##      1.3149       0.5973

ex_scatter$correlation

## 
##  Pearson's product-moment correlation
## 
## data:  df[, 1] and df[, 2]
## t = 111.338, df = 25790, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5614579 0.5779435
## sample estimates:
##      cor 
## 0.569758

linc_norm = my_norm(expt = lincrna, transform="log2", norm="quant", convert="cpm")$counts
linc_test = linc_norm[,c("hpgl0524","hpgl0525")]
linc_scatter = my_linear_scatter(linc_test, loess=TRUE, identity=TRUE)

## [1] "No binwidth nor bins provided, setting it to 0.0353558306128205 in order to have 500 bins."

linc_scatter$scatter

linc_scatter$lm_model

## 
## Call:
## robustbase::lmrob(formula = second ~ first, data = df, method = "SMDM")
## 
## Coefficients:
## (Intercept)        first  
##      0.5220       0.8911

linc_scatter$correlation

## 
##  Pearson's product-moment correlation
## 
## data:  df[, 1] and df[, 2]
## t = 90.9097, df = 2385, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8716214 0.8896138
## sample estimates:
##       cor 
## 0.8809355

rintron_norm = my_norm(expt = rintronrna, transform="log2", norm="quant", convert="cpm")$counts
rintron_test = rintron_norm[,c("hpgl0524","hpgl0525")]
rintron_scatter = my_linear_scatter(rintron_test, loess=TRUE, identity=TRUE)

## Warning in lmrob.S(x, y, control = control, mf = mf): S-estimated scale ==
## 0: Probably exact fit; check your data

## Error in lmrob..D..fit(init, x, mf = mf): NA/NaN/Inf in foreign function call (arg 5)

rintron_scatter$scatter

rintron_scatter$lm_model

## 
## Call:
## robustbase::lmrob(formula = second ~ first, data = df, method = "SMDM")
## 
## Coefficients:
## (Intercept)        first  
##      0.8800       0.4712

rintron_scatter$correlation

## 
##  Pearson's product-moment correlation
## 
## data:  df[, 1] and df[, 2]
## t = 72.0075, df = 11481, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5450477 0.5702505
## sample estimates:
##       cor 
## 0.5577777

Simple subtraction

Since I don't have replicates, I will do a simple subtraction to try to pull interesting loci out...

mi_sub = as.data.frame(exprs(normalize_expt(expt=mirna)$expressionset))

## [1] "This function defaults to using the original expressionset for normalization."

mi_sub$exo_minus_cell = mi_sub$hpgl0525 - mi_sub$hpgl0524

up_mi = subset(mi_sub, exo_minus_cell >= 1.6)[,c("hpgl0524","hpgl0525","exo_minus_cell")]
rownames(up_mi) = gsub(":\\d+","", rownames(up_mi))
up_mi = up_mi[order(up_mi$exo_minus_cell, decreasing=TRUE),]
up_mi = merge(miRNA$genes, up_mi, by.x="ID", by.y="row.names")
up_mi = up_mi[,c("ID","Name","Parent","hpgl0524","hpgl0525","exo_minus_cell")]

down_mi = subset(mi_sub, exo_minus_cell <= -1.6)[,c("hpgl0524","hpgl0525","exo_minus_cell")]
rownames(down_mi) = gsub(":\\d+","", rownames(down_mi))
down_mi = down_mi[order(down_mi$exo_minus_cell, decreasing=TRUE),]
down_mi = merge(miRNA$genes, down_mi, by.x="ID", by.y="row.names")
down_mi = down_mi[,c("ID","Name","Parent","hpgl0524","hpgl0525","exo_minus_cell")]


## Repeat for linc
linc_sub = as.data.frame(exprs(normalize_expt(expt=lincrna)$expressionset))

## [1] "This function defaults to using the original expressionset for normalization."

linc_sub$exo_minus_cell = linc_sub$hpgl0525 - linc_sub$hpgl0524

up_linc = subset(linc_sub, exo_minus_cell >= 1.6)[,c("hpgl0524","hpgl0525","exo_minus_cell")]
rownames(up_linc) = gsub(":\\d+","", rownames(up_linc))
up_linc = up_linc[order(up_linc$exo_minus_cell, decreasing=TRUE),]
up_linc = merge(lincRNA$genes, up_linc, by.x="ID", by.y="row.names")
up_linc = up_linc[,c("ID","Name","hpgl0524","hpgl0525","exo_minus_cell")]

down_linc = subset(linc_sub, exo_minus_cell <= -1.6)[,c("hpgl0524","hpgl0525","exo_minus_cell")]
rownames(down_linc) = gsub(":\\d+","", rownames(down_linc))
down_linc = down_linc[order(down_linc$exo_minus_cell, decreasing=TRUE),]
down_linc = merge(lincRNA$genes, down_linc, by.x="ID", by.y="row.names")
down_linc = down_linc[,c("ID","Name","Parent","hpgl0524","hpgl0525","exo_minus_cell")]

write_xls(up_mi, sheet="up_mi")
write_xls(down_mi, sheet="down_mi")
write_xls(up_linc, sheet="up_linc")
write_xls(down_linc, sheet="down_linc")