This will be a very minimal analysis until we get some replicates.
I am using mm38_95.
## My load_biomart_annotations() function defaults to human, so that will be quick.
mm_annot <- load_biomart_annotations(species="mmusculus")
## The biomart annotations file already exists, loading from it.
mm_annot <- mm_annot[["annotation"]]
mm_annot[["txid"]] <- paste0(mm_annot[["ensembl_transcript_id"]], ".", mm_annot[["version"]])
rownames(mm_annot) <- make.names(mm_annot[["ensembl_gene_id"]], unique=TRUE)
tx_gene_map <- mm_annot[, c("txid", "ensembl_gene_id")]
So, I now have 2 data frames of parasite annotations and 1 human.
I am going to write a quick sample sheet in the current working directory called ‘all_samples.xlsx’ and put the names of the count tables in it.
Here I combine the metadata, count data, and annotations.
It is worth noting that the gene IDs from htseq-count probably do not match the annotations retrieved because they are likely exon-based rather than gene based. This is not really a problem, but don’t forget it!
mm38_salmon <- sm(create_expt("sample_sheets/all_samples.xlsx", tx_gene_map=tx_gene_map,
gene_info=mm_annot, file_column="salmonfile"))
mmtx_annot <- mm_annot
rownames(mmtx_annot) <- mm_annot[["txid"]]
mm38_saltx <- sm(create_expt("sample_sheets/all_samples.xlsx",
gene_info=mmtx_annot, file_column="salmonfile"))
hisat_annot <- mm_annot
rownames(hisat_annot) <- paste0("gene.", rownames(hisat_annot))
mm38_hisat <- create_expt("sample_sheets/all_samples.xlsx",
gene_info=hisat_annot)
## Reading the sample metadata.
## The sample definitions comprises: 8 rows(samples) and 8 columns(metadata fields).
## Reading count tables.
## Reading count tables with read.table().
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_01/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_02/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows and merges to 25788 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_03/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows and merges to 25788 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_04/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows and merges to 25788 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_05/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows and merges to 25788 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_06/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows and merges to 25788 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_07/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows and merges to 25788 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mmusculus_iprgc_2019/preprocessing/iprgc_08/outputs/hisat2_mm38_95/r1_trimmed.count.xz contains 25788 rows and merges to 25788 rows.
## Finished reading count tables.
## Matched 25554 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 25783 rows and 8 columns.
In this block I will calculate all the diagnostic plots, but not show them. I will show them next with a little annotation.
I will leave the output for the first of each invocation and silence it for the second.
mm38_norm_sa <- normalize_expt(mm38_salmon, norm="quant", convert="cpm",
transform="log2", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 2794 low-count genes (3970 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 1105 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
mm38_norm_hi <- normalize_expt(mm38_hisat, norm="quant", convert="cpm",
transform="log2", filter=TRUE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 12233 low-count genes (13550 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 19 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Error in plot_nonzero(expt, title = nonzero_title, ...) :
## object 'mm38_norm' not found
## Error in plot_libsize(expt, title = libsize_title, ...) :
## object 'mm38_norm' not found
## Error in plot_boxplot(expt, title = boxplot_title, ...) :
## object 'mm38_norm' not found
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, :
## object 'mm38_norm' not found
## Error in plot_sm(expt, method = cormethod, title = smc_title, ...) :
## object 'mm38_norm' not found
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, :
## object 'mm38_norm' not found
## Error in plot_sm(expt, method = distmethod, title = smd_title, ...) :
## object 'mm38_norm' not found
## Error in plot_pca(expt, title = pca_title, ...) :
## object 'mm38_norm' not found
## Error in plot_pca(..., pc_method = "tsne") : object 'mm38_norm' not found
## Error in plot_density(expt, title = dens_title, ...) :
## object 'mm38_norm' not found
## Error in plot_variance_coefficients(expt, title = dens_title, ...) :
## object 'mm38_norm' not found
## Error in plot_topn(expt, title = topn_title, ...) :
## object 'mm38_norm' not found
## Error in plot_nonzero(expt, title = nonzero_title, ...) :
## object 'mm38_norm' not found
## Error in plot_libsize(expt, title = libsize_title, ...) :
## object 'mm38_norm' not found
## Error in plot_boxplot(expt, title = boxplot_title, ...) :
## object 'mm38_norm' not found
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, :
## object 'mm38_norm' not found
## Error in plot_sm(expt, method = cormethod, title = smc_title, ...) :
## object 'mm38_norm' not found
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, :
## object 'mm38_norm' not found
## Error in plot_sm(expt, method = distmethod, title = smd_title, ...) :
## object 'mm38_norm' not found
## Error in plot_pca(expt, title = pca_title, ...) :
## object 'mm38_norm' not found
## Error in plot_pca(..., pc_method = "tsne") : object 'mm38_norm' not found
## Error in plot_density(expt, title = dens_title, ...) :
## object 'mm38_norm' not found
## Error in plot_variance_coefficients(expt, title = dens_title, ...) :
## object 'mm38_norm' not found
## Error in plot_topn(expt, title = topn_title, ...) :
## object 'mm38_norm' not found
## Error in normalize_expt(expt, filter = TRUE): object 'mm38_norm' not found
## Error in eval(expr, envir, enclos): object 'mm38n_plots_hi' not found
## Error in eval(expr, envir, enclos): object 'mm38n_plots_hi' not found
The only interesting DE I see in this is to compare the retinas to the dlgns. I can treat them as replicates and compare.
These differential expression analyses are EXPLICITLY NOT what you care about. However, they are useful for two purposes:
When we receive full replicate sets, this cheater method of encapsulating the data will not longer be required.
mm_sa <- set_expt_conditions(mm38_salmon, fact="celltype")
mm_norm_sa <- sm(normalize_expt(mm_sa, convert="rpkm", transform="log2", column="cds_length"))
plot_pca(mm_norm_sa)$plot
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 2794 low-count genes (3970 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 1105 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Plotting a PCA before surrogates/batch inclusion.
## Not putting labels on the plot.
## Assuming no batch in model for testing pca.
## Not putting labels on the plot.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
Until we get full replicates, I will do simple subtractions.
In an attempt to keep some clarity in the terms used, I want to define them now. There are three contexts in which we will consider the data:
The individual sample type. When considering individual samples, I will use three terms in this and only this context: wild-type (wt), het, and mut.
The individual translatome. These are defines as something / baseline. I will exclusively call the wt samples ‘baseline’ when speaking in this context. I will exclusively state ‘normal’ when referring to het / wt samples, and I will state ‘ko’ when referring to mut / wt samples in the translatome context.
Translatome vs. translatome. Whenever comparing translatomes, I will use the names as in #2 and always put the numerator first when writing the name of a comparison.
The most complex example of the above nomenclature is:
“normko_retdlgn is defined as normret_vs_normdlgn - koret_vs_kodlgn”
This states we are examining at the translatome context: (norm(retina translatome) - norm(dlgn translatome)) - (ko(retina translatome) - ko(dlgn translatome))
Which in turn is synonymous to the following at the sample context: ((rethet - retwt) - (dlgnhet - dlgnwt)) - ((retko - retwt) - (dlgnko - dlgnwt))
Now let us associate the various variable names with the appropriate samples:
dlgnwt <- "iprgc_01"
retwt <- "iprgc_02"
scnwt <- "iprgc_03"
dlgnhet <- "iprgc_04"
rethet <- "iprgc_05"
scnhet <- NULL ## Does not yet exist.
dlgnmut <- "iprgc_06"
retmut <- "iprgc_07"
scnmut <- "iprgc_08"
Give these variable names, now lets associate columns of the expression data with them. These are at the sample context, so the appropriate names are: ‘wt’, ‘het’, and ‘mut’. In each case I will prefix the genotype with the tissue type: ‘ret’, ‘dlgn’, and ‘scn’. Thus ‘retwt’ refers to the sample used to calculate the translatome retina baseline; in contrast ‘dlgnmut’ is the sample which provides the dlgn knockout.
## Sample context
mm38_norm <- mm_norm_sa
dlgnwt <- exprs(mm38_norm)[, dlgnwt]
retwt <- exprs(mm38_norm)[, retwt]
scnwt <- exprs(mm38_norm)[, scnwt]
dlgnhet <- exprs(mm38_norm)[, dlgnhet]
rethet <- exprs(mm38_norm)[, rethet]
dlgnmut <- exprs(mm38_norm)[, dlgnmut]
retmut <- exprs(mm38_norm)[, retmut]
scnmut <- exprs(mm38_norm)[, scnmut]
Each of the above 8 variables provides 1 column of information. We have 3 baseline comparisons available to us. In each of these we compare one wt sample to another.
## Baseline comparisons
wt_dlgnret <- dlgnwt - retwt
wt_scnret <- scnwt - retwt
wt_dlgnscn <- dlgnwt - scnwt
Simultaneously, we have 5 available translatomes. This are provided by comparing each het or mut to the associated wt. These will therefore receive names: ‘norm’ and ‘ko’ instead of ‘het’ and ‘mut’.
## Translatome context
normret <- rethet - retwt
koret <- retmut - retwt
koscn <- scnmut - scnwt
normdlgn <- dlgnhet - dlgnwt
kodlgn <- dlgnmut - dlgnwt
Given these translatomes, there are a few contrasts of likely interest. These are performed by comparing the relevant translatomes.
Will will split these into 4 separate categories: het vs het, ko vs ko, ko vs het, and ratio vs ratio.
Finally, note that we are being explicitly redundant in these definitions. I am making variable names for both the a/b ratio and the b/a ratio. Thus we have some redundantly redundant (haha) flexibility when deciding on what we want to plot.
## ko vs ko
koret_vs_kodlgn <- koret - kodlgn
kodlgn_vs_koret <- kodlgn - koret
koret_vs_koscn <- koret - koscn
koscn_vs_koret <- koscn - koret
kodlgn_vs_koscn <- kodlgn - koscn
koscn_vs_kodlgn <- koscn - kodlgn
On the other hand, I am assuming we always want the normals as denominators and kos as numerators.
Finally, here is the ratio of ratios example I printed above:
I named it ‘normko_retdlgn’ in an attempt to make clear that it is actually: (normret/normdlgn)/(koret/kodlgn)
or stated differently: “norm divided by ko for ret divided by dlgn.”
My matrix of data will now contain 1 column for each of the above 27 samples/comparisons.
pair_mtrx <- cbind(
## Individual samples
dlgnwt, retwt, scnwt, dlgnhet, rethet, dlgnmut, retmut, scnmut,
## Baseline comparisons
wt_dlgnret, wt_scnret, wt_dlgnscn,
## Baseline subtractions
normdlgn, normret, kodlgn, koret, koscn,
## het_vs_het, of which there is only 1 because we do not have hetscn
normdlgn_vs_normret, normret_vs_normdlgn,
## ko_vs_ko, of which we have 3
koret_vs_kodlgn, kodlgn_vs_koret,
koret_vs_koscn, koscn_vs_koret,
kodlgn_vs_koscn, koscn_vs_kodlgn,
## ko_vs_het, 3 including one getting around missing hetscn
koret_vs_normret, kodlgn_vs_normdlgn,
## ratio of ratios
normko_retdlgn)
I am not sure if we will use these indexes, but I am writing these out as subsets of genes to look at. These indexes are stating that, given a cutoff (0), we want to look at only the genes which have higher x / baseline values than the cutoff.
## Queries about gene subsets.
## These are all in the context of translatomes.
cutoff <- 0
ret_kept_idx <- normret > cutoff & koret > cutoff
scn_kept_idx <- koscn > cutoff
dlgn_kept_idx <- normdlgn > cutoff & kodlgn > cutoff
ret_dlgn_kept_idx <- ret_kept_idx & dlgn_kept_idx
ret_scn_kept_idx <- ret_kept_idx & scn_kept_idx
dlgn_scn_kept_idx <- dlgn_kept_idx & scn_kept_idx
##normdlgn_vs_normret[!ret_dlgn_kept_idx] <- NA
##normret_vs_normdlgn[!ret_dlgn_kept_idx] <- NA
##koret_vs_kodlgn[!ret_dlgn_kept_idx] <- NA
##kodlgn_vs_koret[!ret_dlgn_kept_idx] <- NA
##koret_vs_koscn[!ret_scn_kept_idx] <- NA
##koscn_vs_koret[!ret_scn_kept_idx] <- NA
##kodlgn_vs_koscn[!dlgn_scn_kept_idx] <- NA
##koscn_vs_kodlgn[!dlgn_scn_kept_idx] <- NA
##koret_vs_normret[!ret_kept_idx] <- NA
##kodlgn_vs_normdlgn[!dlgn_kept_idx] <- NA
##normko_retdlgn <- normko_retdlgn[!ret_dlgn_kept_idx] <- NA
I will use my function combine_de_tables() to add this information to my existing annotation data along with the results from the statistically valid comparison of the three tissue types.
mm_tables <- sm(combine_de_tables(
mm_de_sa, extra_annot=pair_mtrx,
excel=glue::glue("excel/{rundate}mm_salmon_tables-v{ver}.xlsx")))
## Put retina baseline on y axis as black, retina het on x axis as black.
## Then recolor a subset of these as red, the reds are when normret > 0
library(ggplot2)
plotted <- as.data.frame(pair_mtrx[, c("rethet", "retwt")])
red_idx <- normret > 0
plotted[, "color"] <- ifelse(red_idx, "black", "red")
ret_hetwt <- ggplot2::ggplot(plotted, aes_string(x="rethet",
y="retwt",
color="color")) +
geom_point(alpha=0.5) +
scale_color_manual(values=c("black", "red"))
ret_hetwt
## Warning: Removed 37 rows containing missing values (geom_point).
## Axon translatome specific
## x-axis: normdlgn_vs_normret or normret_vs_normdlgn,
## ^^^^
## y-axis: dlgnwt-retwt (baseline dlgn - baseline retina)
plotted <- as.data.frame(pair_mtrx[, c("normdlgn_vs_normret", "wt_dlgnret")])
red_idx <- normret > 0
plotted[, "color"] <- ifelse(red_idx, "black", "red")
axon_trans_ret_target <- ggplot2::ggplot(plotted, aes_string(x="normdlgn_vs_normret",
y="wt_dlgnret",
color="color")) +
geom_point(alpha=0.5) +
scale_color_manual(values=c("black", "red"))
axon_trans_ret_target
## Warning: Removed 37 rows containing missing values (geom_point).
## DLGN translatome wrt. Retina translatome
plotted <- pair_mtrx[, c("normret", "normdlgn")]
normret_normdlgn <- plot_scatter(plotted)
normret_normdlgn
plotted <- pair_mtrx[, c("koret", "koscn")]
koret_koscn_plot <- plot_scatter(plotted)
koret_koscn_plot
plotted <- pair_mtrx[, c("normdlgn", "kodlgn")]
normdlgn_kodlgn_plot <- plot_scatter(plotted)
normdlgn_kodlgn_plot
plotted <- pair_mtrx[, c("normret", "koret")]
normret_koret_plot <- plot_scatter(plotted)
normret_koret_plot
plotted <- pair_mtrx[, c("normret_vs_normdlgn", "koret_vs_kodlgn")]
normal_ko_axon_translatome <- plot_scatter(plotted)
normal_ko_axon_translatome
As I understand it, there is some interest in an ontology search using the ratio of ratios.
## [1] 1877
## [1] 476
ror_gprofiler_up <- simple_gprofiler(sig_genes=ror_up, species="mmusculus",
excel=glue::glue("excel/{rundate}mm_ror_gpfoiler_up-v{ver}.xlsx"))
## Performing gProfiler GO search of 1877 genes against mmusculus.
## GO search found 271 hits.
## Performing gProfiler KEGG search of 1877 genes against mmusculus.
## KEGG search found 8 hits.
## Performing gProfiler REAC search of 1877 genes against mmusculus.
## REAC search found 11 hits.
## Performing gProfiler MI search of 1877 genes against mmusculus.
## MI search found 0 hits.
## Performing gProfiler TF search of 1877 genes against mmusculus.
## TF search found 376 hits.
## Performing gProfiler CORUM search of 1877 genes against mmusculus.
## CORUM search found 1 hits.
## Performing gProfiler HP search of 1877 genes against mmusculus.
## HP search found 5 hits.
## Writing data to: excel/20200114mm_ror_gpfoiler_up-v20200114.xlsx.
## Error in `levels<-`(`*tmp*`, value = as.character(levels)) :
## factor level [29] is duplicated
## Error in `levels<-`(`*tmp*`, value = as.character(levels)) :
## factor level [29] is duplicated
## Error in `levels<-`(`*tmp*`, value = as.character(levels)) :
## factor level [29] is duplicated
## Error in `levels<-`(`*tmp*`, value = as.character(levels)) :
## factor level [29] is duplicated
## Finished writing data.
## Error in `levels<-`(`*tmp*`, value = as.character(levels)): factor level [29] is duplicated
ror_gprofiler_down <- simple_gprofiler(sig_genes=ror_down, species="mmusculus",
excel=glue::glue("excel/{rundate}mm_ror_gpfoiler_down-v{ver}.xlsx"))
## Performing gProfiler GO search of 476 genes against mmusculus.
## GO search found 82 hits.
## Performing gProfiler KEGG search of 476 genes against mmusculus.
## KEGG search found 7 hits.
## Performing gProfiler REAC search of 476 genes against mmusculus.
## REAC search found 2 hits.
## Performing gProfiler MI search of 476 genes against mmusculus.
## MI search found 0 hits.
## Performing gProfiler TF search of 476 genes against mmusculus.
## TF search found 6 hits.
## Performing gProfiler CORUM search of 476 genes against mmusculus.
## CORUM search found 0 hits.
## Performing gProfiler HP search of 476 genes against mmusculus.
## HP search found 0 hits.
## Writing data to: excel/20200114mm_ror_gpfoiler_down-v20200114.xlsx.
## Finished writing data.