There are a set of initial metrics of the non-molested data we want to set up. These include stuff like box plots, counts of non-zero genes, some initial correlations.
One potentially interesting metric is the # of non-zero genes by condition
non_zero_genes = data.frame(id=colnames(wt_merged),
nonzero=colSums(wt_merged > 1),
count=colSums(wt_merged)*1e-6,
condition=wt_data_design$condition,
batch=wt_data_design$batch)
write.csv(non_zero_genes, file="txt/non_zero_genes.csv")
nonzero_plot = ggplot(non_zero_genes, aes(count, nonzero, color=condition, shape=batch)) +
scale_shape_identity() +
geom_point(stat="identity",size=5) +
theme(axis.text.x=element_text(angle=-90)) +
geom_text(aes(label=id), size=3,vjust=-1) +
xlab("Relative library size in millions") +
ylab("Number of non-zero genes") +
theme_bw()
nonzero_plot
write_plots("nonzero_wrt_size", nonzero_plot)
## Cairo
## 2
dev.off()
## null device
## 1
This is something my normalization function handles, so I will not modify the count tables here. However I will perform the metric so that we can see the numbers.
total_genes = dim(wt_merged)[1]
filtered = wt_merged[rowSums(wt_merged) > dim(wt_merged)[2],]
num_filtered = dim(filtered)[1]
print(paste("Total Number Of Genes: ",
total_genes,
" / Number Of Genes After Filtering: ",
num_filtered,
" / Percentage Of Genes after filtering: ",
round(100 * num_filtered / total_genes, digits=2),
"%"))
## [1] "Total Number Of Genes: 8334 / Number Of Genes After Filtering: 8114 / Percentage Of Genes after filtering: 97.36 %"
I have a function in my_functions.R which does this relatively easily.
my_libsize(wt_merged, wt_design, wt_colors)
write_plots("libsizes", nonzero_plot)
## Cairo
## 2
dev.off()
## null device
## 1
my_functions.R also has some helpers for dealing with these various plots
I am including a raw boxplot to just show that rnaseq data is pretty heavily centered at zero.
## First the goofy looking utterly raw boxplot
require.auto("reshape")
raw_boxplot = my_boxplot(wt_merged, wt_colors)
## Using id as id variables
## Using gene, variable as id variables
raw_boxplot
dev.off()
## null device
## 1
## Now a non-normalized, cpm, log2 boxplot
wt_merged_log2cpm = my_norm(wt_merged, out_type="cpm", filter="log2")
raw_log2cpm_boxplot = my_boxplot(wt_merged_log2cpm$counts, wt_colors)
## Using id as id variables
## Using gene, variable as id variables
raw_log2cpm_boxplot
write_plots("log2cpm_boxplot", raw_log2cpm_boxplot)
## Warning: semi-transparency is not supported on this device: reported only
## once per page
## Cairo
## 2
dev.off()
## null device
## 1
## Lets graph the raw data right quick
wt_merged_cor = my_corheat(wt_merged, wt_colors, design=wt_data_design)
##wt_merged_cor
##write_plots("initial_pearson_correlation_heatmap", wt_merged_cor)
wt_merged_cor = my_corheat(wt_merged, wt_colors, design=wt_data_design, method="spearman")
wt_merged_smc = my_smc(wt_merged, wt_colors)
wt_merged_dis = my_disheat(wt_merged, wt_colors, design=wt_data_design)
##wt_merged_dis
##write_plots("initial_euclidean_distance_heatmap", wt_merged_dis)
wt_merged_smd = my_smd(wt_merged, wt_colors)
wt_merged_pca = my_pca(wt_merged, wt_colors, wt_data_design)
## [1] 1 1 2 2 3 3
## Loading required package: proto
wt_merged_pca
write_plots("initial_merged_pca", wt_merged_pca)
## Cairo
## 2
Now lets do a normalization of the data and run those metrics again.
wt_merged_quant_log2cpm = my_norm(wt_merged, norm_type="quant", out_type="cpm", filter="log2")
## Pull out the counts for ease of use
wt_quant_log2cpm = wt_merged_quant_log2cpm$counts
## Now copy/paste the stuff from above with the new data frame.
wt_quant_log2cpm_cor = my_corheat(wt_quant_log2cpm, wt_colors, design=wt_data_design)
##wt_quant_log2cpm_cor
##write_plots("quant_log2cpm_correlation_heatmap", wt_quant_log2cpm_cor)
wt_quant_log2cpm_cor = my_corheat(wt_quant_log2cpm, wt_colors, design=wt_data_design, method="spearman")
wt_quant_log2cpm_smc = my_smc(wt_quant_log2cpm, wt_colors)
wt_quant_log2cpm_dis = my_disheat(wt_quant_log2cpm, wt_colors, design=wt_data_design)
##wt_quant_log2cpm_dis
##write_plots("quant_log2cpm_euclidean_heatmap", wt_quant_log2cpm_dis)
wt_quant_log2cpm_smd = my_smd(wt_quant_log2cpm, wt_colors)
wt_quant_log2cpm_pca = my_pca(wt_quant_log2cpm, wt_colors, wt_data_design)
## [1] 1 1 2 2 3 3
wt_quant_log2cpm_pca
write_plots("quant_log2cpm_pca", wt_quant_log2cpm_pca)
## Cairo
## 2
## One thing my little pca function does not do is print out the table of components
wt_merged_decomposed = makeSVD(wt_merged)
components_table = pcRes(wt_merged_decomposed$v,
wt_merged_decomposed$d,
wt_data_design$condition, wt_data_design$batch)
components_table
## propVar cumPropVar cond.R2 batch.R2
## 1 63.18 63.18 26.73 27.69
## 2 20.76 83.94 61.49 25.13
## 3 8.64 92.58 9.56 87.22
## 4 4.94 97.52 1.73 49.96
## 5 2.49 100.01 0.49 10.01
## 6 0.00 100.01 9.08 33.86
## Hey check it out, the batch effect is not very bad.
yay!