1 Annotation version: 20171102

Any annotation data will need to come from our trinotate result. An interesting caveat, kallisto uses a transcript database. tximport is the library used to read the counts/transcript into R, it is able to take a set of mappings between gene/transcript in order to provide gene counts rather than transcript.

An important aspect of using trinity for this creation of our annotation data: there are many putative transcripts for each putative gene, many of which are probably not real, but instead are artifacts of the assembler. Therefore we need to do something to try to distinguish between the reals and fakers. The following code will attempt to address this problem.

1.1 Load all trinotate data

putative_annotations <- sm(load_trinotate_annotations("reference/trinotate.csv"))
annot_by_gene <- putative_annotations
## Let us make one set of annotations by transcript ID and one by gene ID
rownames(annot_by_gene) <- make.names(putative_annotations[["gene_id"]], unique=TRUE)
annot_by_tx <- putative_annotations
rownames(annot_by_tx) <- make.names(putative_annotations[["transcript_id"]], unique=TRUE)

1.2 Create the expressionsets

In order to make functional expressionsets of this data, we will need to use the mappings from gene <-> transcripts, that is the final argument to create_expt().

## I split the annotations into one set keyed by gene IDs and one by transcript IDs.
## This way it is possible to make an expressionset of transcripts or genes by
## removing/keeping the tx_gene_map argument.

## Why would you want to do this? you might ask...  Well, sadly the annotation database
## Does not always have filled-in data for the first transcript for a given gene ID,
## therefore the annotation information for those genes will be lost when we merge
## the count tables / annotation tables by gene ID.
## This is of course also true for transcript IDs, but for a vastly smaller number of them.
transcript_gene_map <- putative_annotations[, c("transcript_id", "gene_id")]
k10_raw <- create_expt(metadata="sample_sheets/all_samples.xlsx",
                       gene_info=annot_by_gene,
                       tx_gene_map=transcript_gene_map)
## Reading the sample metadata.
## The sample definitions comprises: 10, 7 rows, columns.
## Reading count tables.
## Using the transcript<->gene mapping.
## Finished reading count tables.
## Matched 58486 annotations and counts.
## Bringing together the count matrix and gene information.
putative_annotations <- fData(k10_raw)  ## A cheap way to get back the unique gene IDs.
## On further examination, we decided to remove both samples 1003 and 1008
## which Sandra pointed out are in fact from the same flow cytometry isolation.
k8_raw <- create_expt(metadata="sample_sheets/kept_samples.xlsx",
                      gene_info=annot_by_gene,
                      tx_gene_map=transcript_gene_map)
## Reading the sample metadata.
## The sample definitions comprises: 8, 7 rows, columns.
## Reading count tables.
## Using the transcript<->gene mapping.
## Finished reading count tables.
## Matched 58486 annotations and counts.
## Bringing together the count matrix and gene information.
## If we do not use the tx_gene_map information, we get 234,330 transcripts in the data set.
## Instead, we get 58,486 genes.

1.2.1 How many putative features in the raw data?

dim(exprs(k10_raw))
## [1] 58486    10
dim(exprs(k8_raw))
## [1] 58486     8

1.3 Threshold cutoff

Start by dropping all features with less than 4 counts across all samples. This should get rid of a significant number of the false positive transcripts. While I am at it, I will drop the features annotated as rRNA.

## Perform a count/gene cutoff here
threshold <- 4
method <- "cbcb"  ## other methods include 'genefilter' and 'simple'
k10_count <- sm(normalize_expt(k10_raw, filter=method, thresh=threshold))
k8_count <- sm(normalize_expt(k8_raw, filter=method, thresh=threshold))

## First get rid of the rRNA
sb_rrna <- putative_annotations[["rrna_subunit"]] != ""
rrna_droppers <- rownames(putative_annotations)[sb_rrna]
k10_norrna <- exclude_genes_expt(expt=k10_raw, ids=rrna_droppers)
## Before removal, there were 58486 entries.
## Now there are 58481 entries.
## Percent kept: 99.994, 99.994, 99.994, 99.996, 99.996, 99.993, 99.991, 99.994, 99.996, 99.995
## Percent removed: 0.006, 0.006, 0.006, 0.004, 0.004, 0.007, 0.009, 0.006, 0.004, 0.005
k8_norrna <- exclude_genes_expt(expt=k8_raw, ids=rrna_droppers)
## Before removal, there were 58486 entries.
## Now there are 58481 entries.
## Percent kept: 99.994, 99.994, 99.994, 99.996, 99.993, 99.991, 99.994, 99.995
## Percent removed: 0.006, 0.006, 0.006, 0.004, 0.007, 0.009, 0.006, 0.005

1.3.1 How many high-count features are in the data?

dim(exprs(k10_count))
## [1] 24132    10
dim(exprs(k8_count))
## [1] 23211     8

1.4 Confidence cutoff

Now lets drop those features for which we have relatively lower confidence in the BLAST results from trinotate.

## Keep only the blastx hits with a low e-value and high identity.
filtered_keepers <- putative_annotations[["blastx_evalue"]] <= 1e-10 &
  putative_annotations[["blastx_identity"]] >= 60
keepers <- rownames(putative_annotations)[filtered_keepers]
## This excludes all but 33,451 transcripts.

## Now keep only those transcript IDs which fall into the above categories.
k10_conf <- exclude_genes_expt(expt=k10_norrna, ids=keepers, method="keep")
## Before removal, there were 58481 entries.
## Now there are 4888 entries.
## Percent kept: 23.611, 22.922, 24.308, 23.931, 24.798, 24.603, 25.471, 22.574, 23.289, 25.368
## Percent removed: 76.389, 77.078, 75.692, 76.069, 75.202, 75.397, 74.529, 77.426, 76.711, 74.632
k8_conf <- exclude_genes_expt(expt=k8_norrna, ids=keepers, method="keep")
## Before removal, there were 58481 entries.
## Now there are 4888 entries.
## Percent kept: 23.611, 22.922, 24.308, 24.798, 24.603, 25.471, 22.574, 25.368
## Percent removed: 76.389, 77.078, 75.692, 75.202, 75.397, 74.529, 77.426, 74.632

1.5 How many high-confidence features are in the data?

dim(exprs(k10_conf))
## [1] 4888   10
dim(exprs(k8_conf))
## [1] 4888    8

1.6 Both cutoffs

Finally, lets be super-strict and remove both.

## Perform both the confidence and count filtration
k10_conf_count <- sm(normalize_expt(k10_conf, filter=method, thresh=threshold))
k8_conf_count <- sm(normalize_expt(k8_conf, filter=method, thresh=threshold))

rrna_expt <- exclude_genes_expt(expt=k10_raw, ids=sb_rrna, method="keep")
## Before removal, there were 58486 entries.
## Now there are 5 entries.
## Percent kept: 0.006, 0.006, 0.006, 0.004, 0.004, 0.007, 0.009, 0.006, 0.004, 0.005
## Percent removed: 99.994, 99.994, 99.994, 99.996, 99.996, 99.993, 99.991, 99.994, 99.996, 99.995
## This has only 21 putative rRNA transcripts.

1.6.1 How many high-count and high-confidence features?

dim(exprs(k10_conf_count))
## [1] 4392   10
dim(exprs(k8_conf_count))
## [1] 4328    8

2 Plot changes in number of putative genes

There are a few ways to visualize the changes wrought in the data by these operations. Here are two bar plots which attempt to show what happened, the first stacks the relatively-sized library sizes atop one another, the second stacks them on the x-axis.

genes_by_subset <- data.frame(
  samples = c(rep("ten", 4), rep("eight", 4)),
  colors = c(rep("#1B9E77", 4), rep("#7570B3", 4)),
  alpha = c("#1B9E77FF", "#1B9E77CC", "#1B0E77AA",
            "#1B0E7777", "#7570B3FF", "#7570B3CC", "#7570B3AA", "#7570B377"),
  type = rep(c("raw", "count", "conf", "conf_count"), 2),
  genes = c(nrow(exprs(k10_raw)), nrow(exprs(k10_count)),
            nrow(exprs(k10_conf)), nrow(exprs(k10_conf_count)),
            nrow(exprs(k8_raw)), nrow(exprs(k8_count)),
            nrow(exprs(k8_conf)), nrow(exprs(k8_conf_count))))

library(ggplot2)
columns <- ggplot(genes_by_subset, aes(x=samples, y=genes)) +
  geom_col(position="identity", aes(fill=colors, alpha=alpha), color="black") +
  scale_fill_manual(values=c(levels(as.factor(genes_by_subset$colors)))) +
  theme(axis.text=ggplot2::element_text(size=10, colour="black"),
        axis.text.x=ggplot2::element_text(angle=90, vjust=0.5),
        legend.position="none")
c1 <- genes_by_subset[["samples"]] == "ten"
c2 <- genes_by_subset[["samples"]] == "eight"
tmp <- cbind(genes_by_subset[c1, "genes"], genes_by_subset[c2, "genes"])
colnames(tmp) <- c("ten", "eight")
rownames(tmp) <- c("raw", "count", "conf", "conf_count")
tmp
##              ten eight
## raw        58486 58486
## count      24132 23211
## conf        4888  4888
## conf_count  4392  4328
pp(file="images/genes_by_columns.png", image=columns)
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename=this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 2a0661d6e37f8a3d8831eb3bbd6347c0d9c4b3b7
## R> packrat::restore()
## This is hpgltools commit: Thu Mar 29 16:59:07 2018 -0400: 2a0661d6e37f8a3d8831eb3bbd6347c0d9c4b3b7
## Saving to 01_annotation-v20171102.rda.xz
---
title: "S.betaceum 201708: annotation data"
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: default
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: readable
  toc: true
  toc_float:
    collapsed: false
    smooth_scroll: false
---

<style>
  body .main-container {
    max-width: 1600px;
  }
</style>

```{r options, include=FALSE}
if (!isTRUE(get0("skip_load"))) {
  library(hpgltools)
  tt <- devtools::load_all("~/hpgltools")
  knitr::opts_knit$set(progress=TRUE,
                       verbose=TRUE,
                       width=90,
                       echo=TRUE)
  knitr::opts_chunk$set(error=TRUE,
                        fig.width=8,
                        fig.height=8,
                        dpi=96)
  old_options <- options(digits=4,
                         stringsAsFactors=FALSE,
                         knitr.duplicate.label="allow")
  ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
  ver <- "20171102"
  previous_file <- "index.Rmd"

  tmp <- try(sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz"))))
  rmd_file <- "01_annotation.Rmd"
}
```

# Annotation version: `r ver`

Any annotation data will need to come from our trinotate result.
An interesting caveat, kallisto uses a transcript database.  tximport is the library
used to read the counts/transcript into R, it is able to take a set of mappings between
gene/transcript in order to provide gene counts rather than transcript.

An important aspect of using trinity for this creation of our annotation data:
there are many putative transcripts for each putative gene, many of which are
probably not real, but instead are artifacts of the assembler.  Therefore we
need to do something to try to distinguish between the reals and fakers.  The
following code will attempt to address this problem.

## Load all trinotate data

```{r annotation}
putative_annotations <- sm(load_trinotate_annotations("reference/trinotate.csv"))
annot_by_gene <- putative_annotations
## Let us make one set of annotations by transcript ID and one by gene ID
rownames(annot_by_gene) <- make.names(putative_annotations[["gene_id"]], unique=TRUE)
annot_by_tx <- putative_annotations
rownames(annot_by_tx) <- make.names(putative_annotations[["transcript_id"]], unique=TRUE)
```

## Create the expressionsets

In order to make functional expressionsets of this data, we will need to use
the mappings from gene <-> transcripts, that is the final argument to
create_expt().

```{r create_expt}
## I split the annotations into one set keyed by gene IDs and one by transcript IDs.
## This way it is possible to make an expressionset of transcripts or genes by
## removing/keeping the tx_gene_map argument.

## Why would you want to do this? you might ask...  Well, sadly the annotation database
## Does not always have filled-in data for the first transcript for a given gene ID,
## therefore the annotation information for those genes will be lost when we merge
## the count tables / annotation tables by gene ID.
## This is of course also true for transcript IDs, but for a vastly smaller number of them.
transcript_gene_map <- putative_annotations[, c("transcript_id", "gene_id")]
k10_raw <- create_expt(metadata="sample_sheets/all_samples.xlsx",
                       gene_info=annot_by_gene,
                       tx_gene_map=transcript_gene_map)
putative_annotations <- fData(k10_raw)  ## A cheap way to get back the unique gene IDs.
## On further examination, we decided to remove both samples 1003 and 1008
## which Sandra pointed out are in fact from the same flow cytometry isolation.
k8_raw <- create_expt(metadata="sample_sheets/kept_samples.xlsx",
                      gene_info=annot_by_gene,
                      tx_gene_map=transcript_gene_map)
## If we do not use the tx_gene_map information, we get 234,330 transcripts in the data set.
## Instead, we get 58,486 genes.
```

### How many putative features in the raw data?

```{r raw_features}
dim(exprs(k10_raw))
dim(exprs(k8_raw))
```

## Threshold cutoff

Start by dropping all features with less than 4 counts across all samples.  This
should get rid of a significant number of the false positive transcripts.  While
I am at it, I will drop the features annotated as rRNA.

```{r threshold_cutoff}
## Perform a count/gene cutoff here
threshold <- 4
method <- "cbcb"  ## other methods include 'genefilter' and 'simple'
k10_count <- sm(normalize_expt(k10_raw, filter=method, thresh=threshold))
k8_count <- sm(normalize_expt(k8_raw, filter=method, thresh=threshold))

## First get rid of the rRNA
sb_rrna <- putative_annotations[["rrna_subunit"]] != ""
rrna_droppers <- rownames(putative_annotations)[sb_rrna]
k10_norrna <- exclude_genes_expt(expt=k10_raw, ids=rrna_droppers)
k8_norrna <- exclude_genes_expt(expt=k8_raw, ids=rrna_droppers)
```

### How many high-count features are in the data?

```{r count_features}
dim(exprs(k10_count))
dim(exprs(k8_count))
```

## Confidence cutoff

Now lets drop those features for which we have relatively lower confidence in
the BLAST results from trinotate.

```{r confidence_cutoff}
## Keep only the blastx hits with a low e-value and high identity.
filtered_keepers <- putative_annotations[["blastx_evalue"]] <= 1e-10 &
  putative_annotations[["blastx_identity"]] >= 60
keepers <- rownames(putative_annotations)[filtered_keepers]
## This excludes all but 33,451 transcripts.

## Now keep only those transcript IDs which fall into the above categories.
k10_conf <- exclude_genes_expt(expt=k10_norrna, ids=keepers, method="keep")
k8_conf <- exclude_genes_expt(expt=k8_norrna, ids=keepers, method="keep")
```

## How many high-confidence features are in the data?

```{r conf_features}
dim(exprs(k10_conf))
dim(exprs(k8_conf))
```

## Both cutoffs

Finally, lets be super-strict and remove both.

```{r both_cutoff}
## Perform both the confidence and count filtration
k10_conf_count <- sm(normalize_expt(k10_conf, filter=method, thresh=threshold))
k8_conf_count <- sm(normalize_expt(k8_conf, filter=method, thresh=threshold))

rrna_expt <- exclude_genes_expt(expt=k10_raw, ids=sb_rrna, method="keep")
## This has only 21 putative rRNA transcripts.
```

### How many high-count and high-confidence features?

```{r conf_count_features}
dim(exprs(k10_conf_count))
dim(exprs(k8_conf_count))
```
# Plot changes in number of putative genes

There are a few ways to visualize the changes wrought in the data by these
operations.  Here are two bar plots which attempt to show what happened, the
first stacks the relatively-sized library sizes atop one another, the second
stacks them on the x-axis.

```{r putative_genes}
genes_by_subset <- data.frame(
  samples = c(rep("ten", 4), rep("eight", 4)),
  colors = c(rep("#1B9E77", 4), rep("#7570B3", 4)),
  alpha = c("#1B9E77FF", "#1B9E77CC", "#1B0E77AA",
            "#1B0E7777", "#7570B3FF", "#7570B3CC", "#7570B3AA", "#7570B377"),
  type = rep(c("raw", "count", "conf", "conf_count"), 2),
  genes = c(nrow(exprs(k10_raw)), nrow(exprs(k10_count)),
            nrow(exprs(k10_conf)), nrow(exprs(k10_conf_count)),
            nrow(exprs(k8_raw)), nrow(exprs(k8_count)),
            nrow(exprs(k8_conf)), nrow(exprs(k8_conf_count))))

library(ggplot2)
columns <- ggplot(genes_by_subset, aes(x=samples, y=genes)) +
  geom_col(position="identity", aes(fill=colors, alpha=alpha), color="black") +
  scale_fill_manual(values=c(levels(as.factor(genes_by_subset$colors)))) +
  theme(axis.text=ggplot2::element_text(size=10, colour="black"),
        axis.text.x=ggplot2::element_text(angle=90, vjust=0.5),
        legend.position="none")
c1 <- genes_by_subset[["samples"]] == "ten"
c2 <- genes_by_subset[["samples"]] == "eight"
tmp <- cbind(genes_by_subset[c1, "genes"], genes_by_subset[c2, "genes"])
colnames(tmp) <- c("ten", "eight")
rownames(tmp) <- c("raw", "count", "conf", "conf_count")
tmp

pp(file="images/genes_by_columns.png", image=columns)
```

```{r saveme}
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename=this_save))
}
```
