Initial metrics

There are a set of initial metrics of the non-molested data we want to set up. These include stuff like box plots, counts of non-zero genes, some initial correlations.

Count the number of non-zero genes

One potentially interesting metric is the # of non-zero genes by condition

non_zero_genes = data.frame(id=colnames(wt_merged),
    nonzero=colSums(wt_merged > 1),
    count=colSums(wt_merged)*1e-6,
    condition=wt_data_design$condition,
    batch=wt_data_design$batch)
write.csv(non_zero_genes, file="txt/non_zero_genes.csv")

nonzero_plot = ggplot(non_zero_genes, aes(count, nonzero, color=condition, shape=batch)) +
    scale_shape_identity() +
    geom_point(stat="identity",size=5) +
    theme(axis.text.x=element_text(angle=-90)) +
    geom_text(aes(label=id), size=3,vjust=-1) +
    xlab("Relative library size in millions") +
    ylab("Number of non-zero genes") +
    theme_bw()

nonzero_plot

plot of chunk non_zero_genes

write_plots("nonzero_wrt_size", nonzero_plot)

## Cairo 
##     2

dev.off()

## null device 
##           1

Filter low count genes.

This is something my normalization function handles, so I will not modify the count tables here. However I will perform the metric so that we can see the numbers.

total_genes = dim(wt_merged)[1]
filtered = wt_merged[rowSums(wt_merged) > dim(wt_merged)[2],]
num_filtered = dim(filtered)[1]
print(paste("Total Number Of Genes: ",
            total_genes,
            " / Number Of Genes After Filtering: ",
            num_filtered,
            " / Percentage Of Genes after filtering: ",
            round(100 * num_filtered / total_genes, digits=2),
            "%"))

## [1] "Total Number Of Genes:  8334  / Number Of Genes After Filtering:  8114  / Percentage Of Genes after filtering:  97.36 %"

Plot the raw library size.

I have a function in my_functions.R which does this relatively easily.

my_libsize(wt_merged, wt_design, wt_colors)

plot of chunk plot_libsizes

write_plots("libsizes", nonzero_plot)

## Cairo 
##     2

dev.off()

## null device 
##           1

Initial, raw data metrics

my_functions.R also has some helpers for dealing with these various plots

I am including a raw boxplot to just show that rnaseq data is pretty heavily centered at zero.

## First the goofy looking utterly raw boxplot
require.auto("reshape")
raw_boxplot = my_boxplot(wt_merged, wt_colors)

## Using id as id variables
## Using gene, variable as id variables

raw_boxplot

plot of chunk initial_raw_plots

dev.off()

## null device 
##           1

## Now a non-normalized, cpm, log2 boxplot
wt_merged_log2cpm = my_norm(wt_merged, out_type="cpm", filter="log2")
raw_log2cpm_boxplot = my_boxplot(wt_merged_log2cpm$counts, wt_colors)

## Using id as id variables
## Using gene, variable as id variables

raw_log2cpm_boxplot

plot of chunk initial_raw_plots

write_plots("log2cpm_boxplot", raw_log2cpm_boxplot)

## Warning: semi-transparency is not supported on this device: reported only
## once per page

## Cairo 
##     2

dev.off()

## null device 
##           1

## Lets graph the raw data right quick
wt_merged_cor = my_corheat(wt_merged, wt_colors, design=wt_data_design)

plot of chunk initial_raw_plots

##wt_merged_cor
##write_plots("initial_pearson_correlation_heatmap", wt_merged_cor)
wt_merged_cor = my_corheat(wt_merged, wt_colors, design=wt_data_design, method="spearman")

plot of chunk initial_raw_plots

wt_merged_smc = my_smc(wt_merged, wt_colors)

plot of chunk initial_raw_plots

wt_merged_dis = my_disheat(wt_merged, wt_colors, design=wt_data_design)

plot of chunk initial_raw_plots

##wt_merged_dis
##write_plots("initial_euclidean_distance_heatmap", wt_merged_dis)
wt_merged_smd = my_smd(wt_merged, wt_colors)

plot of chunk initial_raw_plots

wt_merged_pca = my_pca(wt_merged, wt_colors, wt_data_design)

## [1] 1 1 2 2 3 3

## Loading required package: proto

wt_merged_pca

plot of chunk initial_raw_plots

write_plots("initial_merged_pca", wt_merged_pca)

## Cairo 
##     2

Normalized, log2 data metrics

Now lets do a normalization of the data and run those metrics again.

wt_merged_quant_log2cpm = my_norm(wt_merged, norm_type="quant", out_type="cpm", filter="log2")
## Pull out the counts for ease of use
wt_quant_log2cpm = wt_merged_quant_log2cpm$counts
## Now copy/paste the stuff from above with the new data frame.
wt_quant_log2cpm_cor = my_corheat(wt_quant_log2cpm, wt_colors, design=wt_data_design)

plot of chunk normalize_replot

##wt_quant_log2cpm_cor
##write_plots("quant_log2cpm_correlation_heatmap", wt_quant_log2cpm_cor)
wt_quant_log2cpm_cor = my_corheat(wt_quant_log2cpm, wt_colors, design=wt_data_design, method="spearman")

plot of chunk normalize_replot

wt_quant_log2cpm_smc = my_smc(wt_quant_log2cpm, wt_colors)

plot of chunk normalize_replot

wt_quant_log2cpm_dis = my_disheat(wt_quant_log2cpm, wt_colors, design=wt_data_design)

plot of chunk normalize_replot

##wt_quant_log2cpm_dis
##write_plots("quant_log2cpm_euclidean_heatmap", wt_quant_log2cpm_dis)
wt_quant_log2cpm_smd = my_smd(wt_quant_log2cpm, wt_colors)

plot of chunk normalize_replot

wt_quant_log2cpm_pca = my_pca(wt_quant_log2cpm, wt_colors, wt_data_design)

## [1] 1 1 2 2 3 3

wt_quant_log2cpm_pca

plot of chunk normalize_replot

write_plots("quant_log2cpm_pca", wt_quant_log2cpm_pca)

## Cairo 
##     2

## One thing my little pca function does not do is print out the table of components
wt_merged_decomposed = makeSVD(wt_merged)
components_table = pcRes(wt_merged_decomposed$v,
    wt_merged_decomposed$d,
    wt_data_design$condition, wt_data_design$batch)
components_table

##   propVar cumPropVar cond.R2 batch.R2
## 1   63.18      63.18   26.73    27.69
## 2   20.76      83.94   61.49    25.13
## 3    8.64      92.58    9.56    87.22
## 4    4.94      97.52    1.73    49.96
## 5    2.49     100.01    0.49    10.01
## 6    0.00     100.01    9.08    33.86

## Hey check it out, the batch effect is not very bad.

Save data

yay!