index.html preprocessing.html

1 Annotation version: 20170905

There are a few methods of importing annotation data into R. The ‘new’ method is to use the DBI interfaces provided by bioconductor. These were a bit more difficult for me to learn, but now that I have a general idea of how they work, I grudgingly admit that they are far superior.

1.1 OrganismDbi/OrgDb/TxDb

The mouse OrganismDbi interface is contained the library ‘Mus.musculus.’ It brings together the OrgDb and TxDb packages for the mouse: ‘org.Mm.eg.db’ and ‘TxDb.Mmusculus.UCSC.mm10.knownGene’ respectively. I am going to need to figure out how to integrate the miRNA annotations into them, but for the moment I will work just with them and hopefully do them in a way which is cleaner and clearer than before.

## orgDb packages are responsible for providing mapping to other databases,
## like GO/KEGG/Entrez/Ensembl/etc
tt <- sm(require.auto("org.Mm.eg.db"))
tmp <- sm(library("org.Mm.eg.db"))
## In contrast, TxDb packages are responsible for providing information about
## each transcript, thus providing a link between the actual genome sequence
## and the transcripts encoded within.
tt <- sm(require.auto("TxDb.Mmusculus.UCSC.mm10.knownGene"))
tmp <- sm(library("TxDb.Mmusculus.UCSC.mm10.knownGene"))
## the Mus.musculus package actually loads the previous 2, but I want to be explicit.
tt <- sm(require.auto("Mus.musculus"))
tmp <- sm(library("Mus.musculus"))

## The following section gathers annotation data from the 2015 archive of ensembl.
## In this case, all the annotations are explicitly the mirbase data
## along with a subset of other ensembl fields.
ensembl_genes <- biomaRt::useEnsembl("ensembl", host="dec2015.archive.ensembl.org")
genes_datasets <- biomaRt::listDatasets(ensembl_genes)
mouse_genes <- biomaRt::useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl",
                                   host="dec2015.archive.ensembl.org")
mirna_attribs <- c("ensembl_gene_id", "ensembl_transcript_id", "description",
                  "external_gene_name", "mirbase_accession", "mirbase_id",
                  "mgi_transcript_name", "chromosome_name")
mouse_mirna_attribs <- biomaRt::listAttributes(mart=mouse_genes)
mm_mi_genes <- biomaRt::getBM(attributes=mirna_attribs, filters="biotype",
                              values="miRNA", mart=mouse_genes)
## The following is used by miRNAtab.Rmd
mm_mi_genes$ID <- paste0("chr", mm_mi_genes$chromosome_name, "_", mm_mi_genes$ensembl_gene_id)
rownames(mm_mi_genes) <- make.names(mm_mi_genes$ID, unique=TRUE)

tt <- sm(require.auto("mirbase.db"))
library("mirbase.db")
chosen_df <- mm_mi_genes[, c("ensembl_gene_id","mirbase_id")]
chosen_df$fivep_accession <- ""
chosen_df$fivep_id <- ""
chosen_df$threep_accession <- ""
chosen_df$threep_id <- ""
mikey <- "mmu-mir-344h-1"
names <- rownames(chosen_df)
for (minum in 1:length(chosen_df$mirbase_id)) {
    mikey <- chosen_df$mirbase_id[[minum]]
    if (mikey == "") {
        next
    }
    mi_mappings <- try(get(mikey, mirbaseMATURE), silent=TRUE)
    if (class(mi_mappings) == "try-error") {
        next
    }
    mi_accessions <- mi_mappings@matureAccession
    mi_names <- mi_mappings@matureName
    name <- names[[minum]]
    chosen_df[name, "fivep_accession"] <- mi_accessions[1]
    chosen_df[name, "threep_accession"] <- mi_accessions[2]
    chosen_df[name, "fivep_id"] <- mi_names[1]
    chosen_df[name, "threep_id"] <- mi_names[2]
}
write.table(x=chosen_df, file="reference/mi_mappings.tab")

ensembl_new <- biomaRt::useEnsembl("regulation")
new_datasets <- biomaRt::listDatasets(ensembl_new)
mouse_dataset <- biomaRt::useEnsembl(biomart="regulation", dataset="mmusculus_annotated_feature")
mirna_columns <- biomaRt::listAttributes(mouse_dataset)
mirna_columns
##                        name              description              page
## 1                    efo_id       EFO term accession annotated_feature
## 2           chromosome_name Chromosome/scaffold name annotated_feature
## 3          chromosome_start               Start (bp) annotated_feature
## 4            chromosome_end                 End (bp) annotated_feature
## 5         feature_type_name             Feature type annotated_feature
## 6        feature_type_class       Feature type class annotated_feature
## 7  feature_type_description Feature type description annotated_feature
## 8            epigenome_name           Epigenome name annotated_feature
## 9     epigenome_description    Epigenome description annotated_feature
## 10             project_name             Project name annotated_feature
## 11               archive_id SRA experiment accession annotated_feature
## 12             so_accession        SO term accession annotated_feature
## 13                  so_name             SO term name annotated_feature
new_columns <- c("efo_id", "so_accession", "feature_type_name")
## 2016-09-28 This seems to be timing out right now.
new_columns_download <- biomaRt::getBM(attributes=new_columns, mart=mouse_dataset)

## In contrast, when working with the transcripts from mice, we can pull
## everything directly from the OrganismDbi.
mm_tx_org <- sm(load_orgdb_annotations(Mus.musculus, keytype="ensembltrans",
                                 fields=c("cdschrom", "definition", "genename")))
## I think the gene IDs returned by this are already perfect for the transcripts
## in the count tables.
mm_tx_genes <- mm_tx_org$genes
## The inclusion of the 'definition' columns means that we also end up with a bunch
## of redundant entries, like 200,000 of them; therefore I am going to whittle down
## the list.
mm_tx_unique <- !grepl(pattern="\\.", x=rownames(mm_tx_genes))
mm_tx_genes <- mm_tx_genes[mm_tx_unique, ]

The Mus.musculus data does not seem to include the full set of genes, lets compare to what we get when using biomart.

##mmtx_annotations <- get_biomart_annotations(species="mmusculus")
mart <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL", host="dec2015.archive.ensembl.org")
dataset <- paste0("mmusculus_gene_ensembl")
ensembl <- try(biomaRt::useDataset(dataset, mart=mart))
lots_of_rows <- biomaRt::listAttributes(ensembl)  ## List of possible attributes
## wanted_attributes <- c("ensembl_gene_id", "ensembl_transcript_id", "ensembl_peptide_id", "chromosome_name", "start_position","end_position","description", "entrezgene","hgnc_symbol","hgnc_id","uniprot_sptrembl","uniprot_swissprot","uniprot_genename")  ## attributes Lucia and I chose, but too many
wanted_attributes_global <- c("ensembl_transcript_id", "ensembl_gene_id", "chromosome_name",
                              "start_position", "end_position", "strand", "description")
wanted_attributes_names <- c("ensembl_transcript_id", "entrezgene", "hgnc_symbol", "hgnc_id")
wanted_attributes_uniprot <- c("ensembl_transcript_id", "uniprot_sptrembl", "uniprot_swissprot")

wanted_global_annotations <- biomaRt::getBM(attributes=wanted_attributes_global, mart=ensembl)
dim(wanted_global_annotations)
## [1] 114083      7
wanted_names_annotations <- biomaRt::getBM(attributes=wanted_attributes_names, mart=ensembl)
wanted_uniprot_annotations <- biomaRt::getBM(attributes=wanted_attributes_uniprot, mart=ensembl)
dim(wanted_uniprot_annotations)
## [1] 57040     3
wanted_annotations <- merge(wanted_global_annotations, wanted_uniprot_annotations,
                            by.x="ensembl_transcript_id", by.y="ensembl_transcript_id", all.y=TRUE)

length(unique(wanted_annotations$ensembl_transcript_id))
## [1] 56999
rownames(wanted_annotations) <- make.names(wanted_annotations[["ensembl_transcript_id"]], unique=TRUE)

1.2 Prepare annotation tables

The annotation data given by load_annotations() is nice, but needs a little cleaning to match up with the count tables from the experiment.

## I happen to know that the gene IDs of the miRNA data are of the format
## 'chrx_ENSMUSG' so I will change the rownames of this slightly.
mm_mi_genes[["ID"]] <- paste0("chr", mm_mi_genes[["chromosome_name"]],
                              "_", mm_mi_genes[["ensembl_gene_id"]])
rownames(mm_mi_genes) <- make.names(mm_mi_genes[["ID"]], unique=TRUE)
rownames(mm_tx_genes) <- make.names(mm_tx_genes$ensembltrans, unique=TRUE)

2 Creating an Expressionset

The expressionset is the primary sequencing data format used in R. It brings together the experiment metadata (condition/batch/etc), gene annotations, and observed counts. The excel file containing the metadata also provides filenames containing the appropriate count tables.

mm_mi <- sm(create_expt(metadata="sample_sheets/all_samplesv1.xlsx", gene_info=mm_mi_genes,
                        file_column="mifile"))
## The IDs in the expressionset for the miRNAs are of the format 'chr1_ENSMUSG....'
mm_tx <- sm(create_expt(metadata="sample_sheets/all_samplesv1.xlsx", gene_info=wanted_annotations,
                        file_column="txfile"))
## The IDs in the expressionset for transcripts are of the format 'ENSMUST....'

In the previous block, we created the primary expressionsets used in the following work. However, much of it will need to performed with subsets of this data, so let us create those here.

## Because of the shenanigans I did to change the seed length for alignments, the counts are not
## guaranteed to be in integers, so round them here.
mm_mir <- mm_mi
Biobase::exprs(mm_mir$expressionset) <- round(Biobase::exprs(mm_mir$expressionset))
mm_txr <- mm_tx
Biobase::exprs(mm_txr$expressionset) <- round(Biobase::exprs(mm_txr$expressionset))

## Subsets!
mmmi_small <- expt_subset(mm_mir, subset="librarytype=='small'")
mmmi_large <- expt_subset(mm_mir, subset="librarytype=='large'")
mmtx_small <- expt_subset(mm_txr, subset="librarytype=='small'")
mmtx_large <- expt_subset(mm_txr, subset="librarytype=='large'")

mature_annot <- read.table("reference/mi_mappings.tab")
mature_fivep <- mature_annot[, c("fivep_accession","fivep_id")]
fivep_completed <- as.logical(mature_fivep[[1]] != "")
mature_fivep <- mature_fivep[fivep_completed, ]
mature_threep <- mature_annot[, c("threep_accession","threep_id")]
threep_completed <- as.logical(mature_threep[[1]] != "" & !is.na(mature_threep[[1]]))
mature_threep <- mature_threep[threep_completed, ]
colnames(mature_fivep) <- c("accession","id")
colnames(mature_threep) <- c("accession","id")
all_mature <- as.data.frame(rbind(mature_fivep, mature_threep))
rownames(all_mature) <-  make.names(gsub(pattern="\\-", replacement="\\.", x=all_mature$id), unique=TRUE)

mmmi_mature <- sm(create_expt(metadata="sample_sheets/all_samplesv1.xlsx", gene_info=all_mature,
                              file_column="maturefile"))

At this point, we should have everything necessary to perform the various analyses. So save the current data for reuse elsewhere.

pander::pander(sessionInfo())

R version 3.4.1 (2017-06-30)

**Platform:** x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C

attached base packages: parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: mirbase.db(v.1.2.0), Mus.musculus(v.1.3.1), GO.db(v.3.4.1), OrganismDbi(v.1.18.0), TxDb.Mmusculus.UCSC.mm10.knownGene(v.3.4.0), GenomicFeatures(v.1.28.4), GenomicRanges(v.1.28.4), GenomeInfoDb(v.1.12.2), org.Mm.eg.db(v.3.4.1), AnnotationDbi(v.1.38.2), IRanges(v.2.10.3), S4Vectors(v.0.14.3), Biobase(v.2.36.2), BiocGenerics(v.0.22.0) and hpgltools(v.2017.01)

loaded via a namespace (and not attached): Rcpp(v.0.12.12), lattice(v.0.20-35), Rsamtools(v.1.28.0), Biostrings(v.2.44.2), rprojroot(v.1.2), digest(v.0.6.12), foreach(v.1.4.3), R6(v.2.2.2), plyr(v.1.8.4), backports(v.1.1.0), RSQLite(v.2.0), evaluate(v.0.10.1), ggplot2(v.2.2.1), BiocInstaller(v.1.26.1), zlibbioc(v.1.22.0), rlang(v.0.1.2), lazyeval(v.0.2.0), data.table(v.1.10.4), blob(v.1.1.0), Matrix(v.1.2-11), rmarkdown(v.1.6), devtools(v.1.13.3), BiocParallel(v.1.10.1), pander(v.0.6.1), stringr(v.1.2.0), RCurl(v.1.95-4.8), bit(v.1.1-12), biomaRt(v.2.32.1), munsell(v.0.4.3), DelayedArray(v.0.2.7), compiler(v.3.4.1), rtracklayer(v.1.36.4), pkgconfig(v.2.0.1), base64enc(v.0.1-3), htmltools(v.0.3.6), SummarizedExperiment(v.1.6.3), tibble(v.1.3.4), GenomeInfoDbData(v.0.99.0), roxygen2(v.6.0.1), codetools(v.0.2-15), matrixStats(v.0.52.2), XML(v.3.98-1.9), crayon(v.1.3.2), withr(v.2.0.0), GenomicAlignments(v.1.12.2), bitops(v.1.0-6), commonmark(v.1.4), grid(v.3.4.1), RBGL(v.1.52.0), gtable(v.0.2.0), DBI(v.0.7), magrittr(v.1.5), scales(v.0.5.0), graph(v.1.54.0), stringi(v.1.1.5), XVector(v.0.16.0), testthat(v.1.0.2), xml2(v.1.1.1), openxlsx(v.4.0.17), RColorBrewer(v.1.1-2), iterators(v.1.0.8), tools(v.3.4.1), bit64(v.0.9-7), yaml(v.2.1.14), colorspace(v.1.3-2), memoise(v.1.1.0) and knitr(v.1.17)

message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 739e4a10345a89efb17b37e7de07c5811491ccea
## R> packrat::restore()
## This is hpgltools commit: Thu Aug 31 11:24:06 2017 -0400: 739e4a10345a89efb17b37e7de07c5811491ccea
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 01_annotation-v20170905.rda.xz
tmp <- sm(saveme(filename=this_save))
---
title: "Annotation data used for M.musculus samples."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: tango
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: cosmo
  toc: true
  toc_float:
    collapsed: false
    smooth_scroll: false
---

<style>
  body .main-container {
    max-width: 1600px;
  }
</style>

```{r options, include=FALSE}
library(hpgltools)
tt <- devtools::load_all("~/hpgltools")
knitr::opts_knit$set(progress=TRUE,
                     verbose=TRUE,
                     width=90,
                     echo=TRUE)
knitr::opts_chunk$set(error=TRUE,
                      fig.width=8,
                      fig.height=8,
                      dpi=96)
old_options <- options(digits=4,
                       stringsAsFactors=FALSE,
                       knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
set.seed(1)
ver <- "20170905"
previous_file <- "index.Rmd"

tmp <- try(sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz"))))

rmd_file <- "01_annotation.Rmd"
```

```{r rendering, include=FALSE, eval=FALSE}
rmarkdown::render(rmd_file)

rmarkdown::render(rmd_file, output_format="pdf_document")
```

[index.html](index.html) [preprocessing.html](preprocessing.html)

# Annotation version: `r ver`

There are a few methods of importing annotation data into R.  The 'new' method is to use the DBI
interfaces provided by bioconductor.  These were a bit more difficult for me to learn, but now that
I have a general idea of how they work, I grudgingly admit that they are far superior.

## OrganismDbi/OrgDb/TxDb

The mouse OrganismDbi interface is contained the library 'Mus.musculus.'  It brings together the
OrgDb and TxDb packages for the mouse: 'org.Mm.eg.db' and 'TxDb.Mmusculus.UCSC.mm10.knownGene'
respectively.  I am going to need to figure out how to integrate the miRNA annotations into them,
but for the moment I will work just with them and hopefully do them in a way which is cleaner and
clearer than before.

```{r mmusculus_orgdb}
## orgDb packages are responsible for providing mapping to other databases,
## like GO/KEGG/Entrez/Ensembl/etc
tt <- sm(require.auto("org.Mm.eg.db"))
tmp <- sm(library("org.Mm.eg.db"))
## In contrast, TxDb packages are responsible for providing information about
## each transcript, thus providing a link between the actual genome sequence
## and the transcripts encoded within.
tt <- sm(require.auto("TxDb.Mmusculus.UCSC.mm10.knownGene"))
tmp <- sm(library("TxDb.Mmusculus.UCSC.mm10.knownGene"))
## the Mus.musculus package actually loads the previous 2, but I want to be explicit.
tt <- sm(require.auto("Mus.musculus"))
tmp <- sm(library("Mus.musculus"))

## The following section gathers annotation data from the 2015 archive of ensembl.
## In this case, all the annotations are explicitly the mirbase data
## along with a subset of other ensembl fields.
ensembl_genes <- biomaRt::useEnsembl("ensembl", host="dec2015.archive.ensembl.org")
genes_datasets <- biomaRt::listDatasets(ensembl_genes)
mouse_genes <- biomaRt::useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl",
                                   host="dec2015.archive.ensembl.org")
mirna_attribs <- c("ensembl_gene_id", "ensembl_transcript_id", "description",
                  "external_gene_name", "mirbase_accession", "mirbase_id",
                  "mgi_transcript_name", "chromosome_name")
mouse_mirna_attribs <- biomaRt::listAttributes(mart=mouse_genes)
mm_mi_genes <- biomaRt::getBM(attributes=mirna_attribs, filters="biotype",
                              values="miRNA", mart=mouse_genes)
## The following is used by miRNAtab.Rmd
mm_mi_genes$ID <- paste0("chr", mm_mi_genes$chromosome_name, "_", mm_mi_genes$ensembl_gene_id)
rownames(mm_mi_genes) <- make.names(mm_mi_genes$ID, unique=TRUE)

tt <- sm(require.auto("mirbase.db"))
library("mirbase.db")
chosen_df <- mm_mi_genes[, c("ensembl_gene_id","mirbase_id")]
chosen_df$fivep_accession <- ""
chosen_df$fivep_id <- ""
chosen_df$threep_accession <- ""
chosen_df$threep_id <- ""
mikey <- "mmu-mir-344h-1"
names <- rownames(chosen_df)
for (minum in 1:length(chosen_df$mirbase_id)) {
    mikey <- chosen_df$mirbase_id[[minum]]
    if (mikey == "") {
        next
    }
    mi_mappings <- try(get(mikey, mirbaseMATURE), silent=TRUE)
    if (class(mi_mappings) == "try-error") {
        next
    }
    mi_accessions <- mi_mappings@matureAccession
    mi_names <- mi_mappings@matureName
    name <- names[[minum]]
    chosen_df[name, "fivep_accession"] <- mi_accessions[1]
    chosen_df[name, "threep_accession"] <- mi_accessions[2]
    chosen_df[name, "fivep_id"] <- mi_names[1]
    chosen_df[name, "threep_id"] <- mi_names[2]
}
write.table(x=chosen_df, file="reference/mi_mappings.tab")

ensembl_new <- biomaRt::useEnsembl("regulation")
new_datasets <- biomaRt::listDatasets(ensembl_new)
mouse_dataset <- biomaRt::useEnsembl(biomart="regulation", dataset="mmusculus_annotated_feature")
mirna_columns <- biomaRt::listAttributes(mouse_dataset)
mirna_columns
new_columns <- c("efo_id", "so_accession", "feature_type_name")
## 2016-09-28 This seems to be timing out right now.
new_columns_download <- biomaRt::getBM(attributes=new_columns, mart=mouse_dataset)

## In contrast, when working with the transcripts from mice, we can pull
## everything directly from the OrganismDbi.
mm_tx_org <- sm(load_orgdb_annotations(Mus.musculus, keytype="ensembltrans",
                                 fields=c("cdschrom", "definition", "genename")))
## I think the gene IDs returned by this are already perfect for the transcripts
## in the count tables.
mm_tx_genes <- mm_tx_org$genes
## The inclusion of the 'definition' columns means that we also end up with a bunch
## of redundant entries, like 200,000 of them; therefore I am going to whittle down
## the list.
mm_tx_unique <- !grepl(pattern="\\.", x=rownames(mm_tx_genes))
mm_tx_genes <- mm_tx_genes[mm_tx_unique, ]
```

The Mus.musculus data does not seem to include the full set of genes, lets compare to what we get when using biomart.

```{r compare_tx_biomart}
##mmtx_annotations <- get_biomart_annotations(species="mmusculus")
mart <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL", host="dec2015.archive.ensembl.org")
dataset <- paste0("mmusculus_gene_ensembl")
ensembl <- try(biomaRt::useDataset(dataset, mart=mart))
lots_of_rows <- biomaRt::listAttributes(ensembl)  ## List of possible attributes
## wanted_attributes <- c("ensembl_gene_id", "ensembl_transcript_id", "ensembl_peptide_id", "chromosome_name", "start_position","end_position","description", "entrezgene","hgnc_symbol","hgnc_id","uniprot_sptrembl","uniprot_swissprot","uniprot_genename")  ## attributes Lucia and I chose, but too many
wanted_attributes_global <- c("ensembl_transcript_id", "ensembl_gene_id", "chromosome_name",
                              "start_position", "end_position", "strand", "description")
wanted_attributes_names <- c("ensembl_transcript_id", "entrezgene", "hgnc_symbol", "hgnc_id")
wanted_attributes_uniprot <- c("ensembl_transcript_id", "uniprot_sptrembl", "uniprot_swissprot")

wanted_global_annotations <- biomaRt::getBM(attributes=wanted_attributes_global, mart=ensembl)
dim(wanted_global_annotations)
wanted_names_annotations <- biomaRt::getBM(attributes=wanted_attributes_names, mart=ensembl)
wanted_uniprot_annotations <- biomaRt::getBM(attributes=wanted_attributes_uniprot, mart=ensembl)
dim(wanted_uniprot_annotations)
wanted_annotations <- merge(wanted_global_annotations, wanted_uniprot_annotations,
                            by.x="ensembl_transcript_id", by.y="ensembl_transcript_id", all.y=TRUE)

length(unique(wanted_annotations$ensembl_transcript_id))
rownames(wanted_annotations) <- make.names(wanted_annotations[["ensembl_transcript_id"]], unique=TRUE)
```

## Prepare annotation tables

The annotation data given by load_annotations() is nice, but needs a little cleaning to match up
with the count tables from the experiment.

```{r cleanup_annotations}
## I happen to know that the gene IDs of the miRNA data are of the format
## 'chrx_ENSMUSG' so I will change the rownames of this slightly.
mm_mi_genes[["ID"]] <- paste0("chr", mm_mi_genes[["chromosome_name"]],
                              "_", mm_mi_genes[["ensembl_gene_id"]])
rownames(mm_mi_genes) <- make.names(mm_mi_genes[["ID"]], unique=TRUE)
rownames(mm_tx_genes) <- make.names(mm_tx_genes$ensembltrans, unique=TRUE)
```

# Creating an Expressionset

The expressionset is the primary sequencing data format used in R.  It brings together the
experiment metadata (condition/batch/etc), gene annotations, and observed counts.  The excel file
containing the metadata also provides filenames containing the appropriate count tables.

```{r experimental_design}
mm_mi <- sm(create_expt(metadata="sample_sheets/all_samplesv1.xlsx", gene_info=mm_mi_genes,
                        file_column="mifile"))
## The IDs in the expressionset for the miRNAs are of the format 'chr1_ENSMUSG....'
mm_tx <- sm(create_expt(metadata="sample_sheets/all_samplesv1.xlsx", gene_info=wanted_annotations,
                        file_column="txfile"))
## The IDs in the expressionset for transcripts are of the format 'ENSMUST....'
```

In the previous block, we created the primary expressionsets used in the
following work.  However, much of it will need to performed with subsets of this
data, so let us create those here.

```{r create_subsets}
## Because of the shenanigans I did to change the seed length for alignments, the counts are not
## guaranteed to be in integers, so round them here.
mm_mir <- mm_mi
Biobase::exprs(mm_mir$expressionset) <- round(Biobase::exprs(mm_mir$expressionset))
mm_txr <- mm_tx
Biobase::exprs(mm_txr$expressionset) <- round(Biobase::exprs(mm_txr$expressionset))

## Subsets!
mmmi_small <- expt_subset(mm_mir, subset="librarytype=='small'")
mmmi_large <- expt_subset(mm_mir, subset="librarytype=='large'")
mmtx_small <- expt_subset(mm_txr, subset="librarytype=='small'")
mmtx_large <- expt_subset(mm_txr, subset="librarytype=='large'")

mature_annot <- read.table("reference/mi_mappings.tab")
mature_fivep <- mature_annot[, c("fivep_accession","fivep_id")]
fivep_completed <- as.logical(mature_fivep[[1]] != "")
mature_fivep <- mature_fivep[fivep_completed, ]
mature_threep <- mature_annot[, c("threep_accession","threep_id")]
threep_completed <- as.logical(mature_threep[[1]] != "" & !is.na(mature_threep[[1]]))
mature_threep <- mature_threep[threep_completed, ]
colnames(mature_fivep) <- c("accession","id")
colnames(mature_threep) <- c("accession","id")
all_mature <- as.data.frame(rbind(mature_fivep, mature_threep))
rownames(all_mature) <-  make.names(gsub(pattern="\\-", replacement="\\.", x=all_mature$id), unique=TRUE)

mmmi_mature <- sm(create_expt(metadata="sample_sheets/all_samplesv1.xlsx", gene_info=all_mature,
                              file_column="maturefile"))
```

At this point, we should have everything necessary to perform the various analyses.  So save the current data for reuse elsewhere.

```{r saveme}
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
```
