1 Spyogenes TNSeq infections: 20170706

2 TODO list

  • 20170522: Get started.

3 Finished TODO

4 Installation and setup

These are rmarkdown documents which make heavy use of the hpgltools package. The following section demonstrates how to set that up in a clean R environment.

## Use R's install.packages to install devtools.
install.packages("devtools")
## Use devtools to install hpgltools.
devtools::install_github("abelew/hpgltools")
## Load hpgltools into the R environment.
library(hpgltools)
pander::pander(sessionInfo())

R version 3.5.1 (2018-07-02)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C

attached base packages: stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: hpgltools(v.2018.03)

loaded via a namespace (and not attached): Rcpp(v.0.12.17), compiler(v.3.5.1), pillar(v.1.3.0), plyr(v.1.8.4), bindr(v.0.1.1), base64enc(v.0.1-3), iterators(v.1.0.10), tools(v.3.5.1), digest(v.0.6.15), memoise(v.1.1.0), evaluate(v.0.10.1), tibble(v.1.4.2), gtable(v.0.2.0), pkgconfig(v.2.0.1), rlang(v.0.2.1), foreach(v.1.4.4), commonmark(v.1.5), yaml(v.2.1.19), parallel(v.3.5.1), bindrcpp(v.0.2.2), xml2(v.1.2.0), roxygen2(v.6.0.1), withr(v.2.1.2), stringr(v.1.3.1), dplyr(v.0.7.6), knitr(v.1.20), devtools(v.1.13.6), rprojroot(v.1.3-2), grid(v.3.5.1), tidyselect(v.0.2.4), glue(v.1.2.0), data.table(v.1.11.4), Biobase(v.2.40.0), R6(v.2.2.2), rmarkdown(v.1.10), pander(v.0.6.2), ggplot2(v.3.0.0), purrr(v.0.2.5), magrittr(v.1.5), backports(v.1.1.2), scales(v.0.5.0), codetools(v.0.2-15), htmltools(v.0.3.6), BiocGenerics(v.0.26.0), assertthat(v.0.2.0), colorspace(v.1.3-2), stringi(v.1.2.3), lazyeval(v.0.2.1), munsell(v.0.5.0) and crayon(v.1.3.4)

this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to index-v20170706.rda.xz
tt <- sm(saveme(filename=this_save))

4.1 Can we produce a master library which is the sum of three others?

The three others are: hpgl0906, hpgl0907, hpgl0908

cd preprocessing
mkdir hpgl0906 hpgl0907 hpgl0908
cd hpgl0906 && rsync -av ~/scratch/tnseq/spyogenes_5448v2/preprocessing/tnseq/hpgl0906/ ./ && cd ..
cd hpgl0907 && rsync -av ~/scratch/tnseq/spyogenes_5448v2/preprocessing/tnseq/hpgl0907/ ./ && cd ..
cd hpgl0908 && rsync -av ~/scratch/tnseq/spyogenes_5448v2/preprocessing/tnseq/hpgl0908/ ./ && cd ..
bamfiles="../hpgl0906/outputs/bowtie_mgas_5005/hpgl0906-trimmed_ca_ta-v0M1.bam \
          ../hpgl0907/outputs/bowtie_mgas_5005/hpgl0907-trimmed_ca_ta-v0M1.bam \
          ../hpgl0908/outputs/bowtie_mgas_5005/hpgl0908-trimmed_ca_ta-v0M1.bam"
samtools merge combined.bam ${bamfiles}

Done.

4.2 For each library(separate and master) what are:

  1. Number of total reads
  2. Strictly aligned
  3. Randomly aligned
  4. The sum of 2,3
  5. Failed aligned reads
  6. Plasmid hits (this will take some time as I neglected to run these alignments)
  7. Unique insertion sites
  8. Saturation index
  9. Average distance

4.2.1 hpgl0906

## The following should answer 1-5 above.
cd preprocessing/
bamtools stats < hpgl0906.bam
## bash: line 2: hpgl0906.bam: No such file or directory

4.2.2 hpgl0907

cd preprocessing/
bamtools stats < hpgl0907.bam
## bash: line 1: hpgl0907.bam: No such file or directory

4.2.3 hpgl0908

cd preprocessing/
bamtools stats < hpgl0908.bam
## bash: line 1: hpgl0908.bam: No such file or directory

4.2.4 combined

cd preprocessing/
bamtools stats < combined.bam
## bash: line 1: combined.bam: No such file or directory

4.2.5 Saturation index

The answer for this is in the R function tnseq_saturation().

file <- "preprocessing/hpgl0906/outputs/essentiality/hpgl0906-v0M1.wig"
hpgl0906_saturation <- tnseq_saturation(data=file)
## Warning in max(data_list, na.rm = TRUE): no non-missing arguments to max;
## returning -Inf
## Error in FUN(X[[i]], ...): only defined on a data frame with all numeric variables
file <- "preprocessing/hpgl0907/outputs/essentiality/hpgl0907-v0M1.wig"
hpgl0907_saturation <- tnseq_saturation(data=file)
## Warning in max(data_list, na.rm = TRUE): no non-missing arguments to max;
## returning -Inf
## Error in FUN(X[[i]], ...): only defined on a data frame with all numeric variables
file <- "preprocessing/hpgl0908/outputs/essentiality/hpgl0908-v0M1.wig"
hpgl0908_saturation <- tnseq_saturation(data=file)
## Warning in max(data_list, na.rm = TRUE): no non-missing arguments to max;
## returning -Inf
## Error in FUN(X[[i]], ...): only defined on a data frame with all numeric variables
file <- "preprocessing/hpgl0909/outputs/essentiality/hpgl0909-v0M1.wig"
hpgl0909_saturation <- tnseq_saturation(data=file)
## Warning in max(data_list, na.rm = TRUE): no non-missing arguments to max;
## returning -Inf
## Error in FUN(X[[i]], ...): only defined on a data frame with all numeric variables
file <- "preprocessing/hpgl0910/outputs/essentiality/hpgl0910-v0M1.wig"
hpgl0910_saturation <- tnseq_saturation(data=file)
## Warning in max(data_list, na.rm = TRUE): no non-missing arguments to max;
## returning -Inf
## Error in FUN(X[[i]], ...): only defined on a data frame with all numeric variables
file <- "preprocessing/hpgl0911/outputs/essentiality/hpgl0911-v0M1.wig"
hpgl0911_saturation <- tnseq_saturation(data=file)
## Warning in max(data_list, na.rm = TRUE): no non-missing arguments to max;
## returning -Inf
## Error in FUN(X[[i]], ...): only defined on a data frame with all numeric variables
## Ok, now have stats for the individual libraries.
all_table <- merge(hpgl0906_saturation$hits_by_position,
                   hpgl0907_saturation$hits_by_position, by="Start")
## Error in merge(hpgl0906_saturation$hits_by_position, hpgl0907_saturation$hits_by_position, : object 'hpgl0906_saturation' not found
all_table <- merge(all_table,
                   hpgl0908_saturation$hits_by_position, by="Start")
## Error in merge(all_table, hpgl0908_saturation$hits_by_position, by = "Start"): object 'all_table' not found
all_table$sum <- 0
## Error in all_table$sum <- 0: object 'all_table' not found
for (r in 1:nrow(all_table)) {
  all_table[r, "sum"] <- all_table[r, "Reads.x"] + all_table[r, "Reads.y"] + all_table[r, "Reads"]
}
## Error in nrow(all_table): object 'all_table' not found
all_table <- all_table[, c("Start", "sum")]
## Error in eval(expr, envir, enclos): object 'all_table' not found
combined_saturation <- tnseq_saturation(data=all_table, column="sum")
## Error in tnseq_saturation(data = all_table, column = "sum"): object 'all_table' not found
all_table2 <- merge(hpgl0909_saturation$hits_by_position,
                    hpgl0910_saturation$hits_by_position, by="Start")
## Error in merge(hpgl0909_saturation$hits_by_position, hpgl0910_saturation$hits_by_position, : object 'hpgl0909_saturation' not found
all_table2 <- merge(all_table2,
                    hpgl0911_saturation$hits_by_position, by="Start")
## Error in merge(all_table2, hpgl0911_saturation$hits_by_position, by = "Start"): object 'all_table2' not found
all_table2$sum <- 0
## Error in all_table2$sum <- 0: object 'all_table2' not found
for (r in 1:nrow(all_table2)) {
  all_table2[r, "sum"] <- all_table2[r, "Reads.x"] + all_table2[r, "Reads.y"] + all_table2[r, "Reads"]
}
## Error in nrow(all_table2): object 'all_table2' not found
all_table2 <- all_table2[, c("Start", "sum")]
## Error in eval(expr, envir, enclos): object 'all_table2' not found
combined_saturation2 <- tnseq_saturation(data=all_table2, column="sum")
## Error in tnseq_saturation(data = all_table2, column = "sum"): object 'all_table2' not found

4.2.5.1 Unique insertion sites

I presume but am not certain that this is the number of > singleton hits.

hpgl0906_saturation$eq_0
## Error in eval(expr, envir, enclos): object 'hpgl0906_saturation' not found
hpgl0906_saturation$gt_1
## Error in eval(expr, envir, enclos): object 'hpgl0906_saturation' not found
hpgl0907_saturation$eq_0
## Error in eval(expr, envir, enclos): object 'hpgl0907_saturation' not found
hpgl0907_saturation$gt_1
## Error in eval(expr, envir, enclos): object 'hpgl0907_saturation' not found
hpgl0908_saturation$eq_0
## Error in eval(expr, envir, enclos): object 'hpgl0908_saturation' not found
hpgl0908_saturation$gt_1
## Error in eval(expr, envir, enclos): object 'hpgl0908_saturation' not found
combined_saturation$eq_0
## Error in eval(expr, envir, enclos): object 'combined_saturation' not found
combined_saturation$gt_1
## Error in eval(expr, envir, enclos): object 'combined_saturation' not found
hpgl0909_saturation$eq_0
## Error in eval(expr, envir, enclos): object 'hpgl0909_saturation' not found
hpgl0909_saturation$gt_1
## Error in eval(expr, envir, enclos): object 'hpgl0909_saturation' not found
hpgl0910_saturation$eq_0
## Error in eval(expr, envir, enclos): object 'hpgl0910_saturation' not found
hpgl0910_saturation$gt_1
## Error in eval(expr, envir, enclos): object 'hpgl0910_saturation' not found
hpgl0911_saturation$eq_0
## Error in eval(expr, envir, enclos): object 'hpgl0911_saturation' not found
hpgl0911_saturation$gt_1
## Error in eval(expr, envir, enclos): object 'hpgl0911_saturation' not found
combined_saturation2$eq_0
## Error in eval(expr, envir, enclos): object 'combined_saturation2' not found
combined_saturation2$gt_1
## Error in eval(expr, envir, enclos): object 'combined_saturation2' not found
hpgl0906_saturation$ratios[1]
## Error in eval(expr, envir, enclos): object 'hpgl0906_saturation' not found
hpgl0906_saturation$ratios[4]
## Error in eval(expr, envir, enclos): object 'hpgl0906_saturation' not found
hpgl0906_saturation$ratios[6]
## Error in eval(expr, envir, enclos): object 'hpgl0906_saturation' not found
hpgl0906_saturation$plot
## Error in eval(expr, envir, enclos): object 'hpgl0906_saturation' not found
hpgl0907_saturation$ratios[1]
## Error in eval(expr, envir, enclos): object 'hpgl0907_saturation' not found
hpgl0907_saturation$ratios[4]
## Error in eval(expr, envir, enclos): object 'hpgl0907_saturation' not found
hpgl0907_saturation$ratios[6]
## Error in eval(expr, envir, enclos): object 'hpgl0907_saturation' not found
hpgl0907_saturation$plot
## Error in eval(expr, envir, enclos): object 'hpgl0907_saturation' not found
hpgl0908_saturation$ratios[1]
## Error in eval(expr, envir, enclos): object 'hpgl0908_saturation' not found
hpgl0908_saturation$ratios[4]
## Error in eval(expr, envir, enclos): object 'hpgl0908_saturation' not found
hpgl0908_saturation$ratios[6]
## Error in eval(expr, envir, enclos): object 'hpgl0908_saturation' not found
hpgl0908_saturation$plot
## Error in eval(expr, envir, enclos): object 'hpgl0908_saturation' not found
combined_saturation$ratios[1]
## Error in eval(expr, envir, enclos): object 'combined_saturation' not found
combined_saturation$ratios[4]
## Error in eval(expr, envir, enclos): object 'combined_saturation' not found
combined_saturation$ratios[6]
## Error in eval(expr, envir, enclos): object 'combined_saturation' not found
combined_saturation$plot
## Error in eval(expr, envir, enclos): object 'combined_saturation' not found

4.3 How many TAs are in the MGAS5005 genome?

The answer to this question should be easily searchable in either the annotation data for the genome and/or the precursor files for essentiality (which collects hits on every TA).

The following counts the number of lines in the tas.txt file. The answer should be that -1, as the first line is a header.

cd preprocessing/hpgl0906/outputs/essentiality/
wc hpgl0906-trimmed_ca_ta-v0M1_tas.txt
## wc: hpgl0906-trimmed_ca_ta-v0M1_tas.txt: No such file or directory
---
title: "S.pyogenes 2017: TNSeq index."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: default
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: readable
  toc: true
  toc_float:
    collapsed: false
    smooth_scroll: false
---

<style>
  body .main-container {
    max-width: 1600px;
}
</style>

```{r options, include=FALSE}
library("hpgltools")
tt <- devtools::load_all("~/hpgltools")
knitr::opts_knit$set(progress=TRUE,
                     verbose=TRUE,
                     width=90,
                     echo=TRUE)
knitr::opts_chunk$set(error=TRUE,
                      fig.width=8,
                      fig.height=8,
                      dpi=96)
old_options <- options(digits=4,
                       stringsAsFactors=FALSE,
                       knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
set.seed(1)
ver <- "20170706"
previous_file <- "index.Rmd"

tmp <- try(sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz"))))

rmd_file <- "index.Rmd"
```

# Spyogenes TNSeq infections: `r ver`

# TODO list

* 20170522: Get started.

# Finished TODO

# Installation and setup

These are rmarkdown documents which make heavy use of the hpgltools package.  The following section
demonstrates how to set that up in a clean R environment.

```{r setup, eval=FALSE}
## Use R's install.packages to install devtools.
install.packages("devtools")
## Use devtools to install hpgltools.
devtools::install_github("abelew/hpgltools")
## Load hpgltools into the R environment.
library(hpgltools)
```

```{r saveme}
pander::pander(sessionInfo())
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tt <- sm(saveme(filename=this_save))
```

## Can we produce a master library which is the sum of three others?

The three others are: hpgl0906, hpgl0907, hpgl0908

```{r copy_combine, engine='bash', eval=FALSE}
cd preprocessing
mkdir hpgl0906 hpgl0907 hpgl0908
cd hpgl0906 && rsync -av ~/scratch/tnseq/spyogenes_5448v2/preprocessing/tnseq/hpgl0906/ ./ && cd ..
cd hpgl0907 && rsync -av ~/scratch/tnseq/spyogenes_5448v2/preprocessing/tnseq/hpgl0907/ ./ && cd ..
cd hpgl0908 && rsync -av ~/scratch/tnseq/spyogenes_5448v2/preprocessing/tnseq/hpgl0908/ ./ && cd ..
bamfiles="../hpgl0906/outputs/bowtie_mgas_5005/hpgl0906-trimmed_ca_ta-v0M1.bam \
          ../hpgl0907/outputs/bowtie_mgas_5005/hpgl0907-trimmed_ca_ta-v0M1.bam \
          ../hpgl0908/outputs/bowtie_mgas_5005/hpgl0908-trimmed_ca_ta-v0M1.bam"
samtools merge combined.bam ${bamfiles}
```

Done.

## For each library(separate and master) what are:

1.  Number of total reads
2.  Strictly aligned
3.  Randomly aligned
4.  The sum of 2,3
5.  Failed aligned reads
6.  Plasmid hits (this will take some time as I neglected to run these alignments)
7.  Unique insertion sites
8.  Saturation index
9.  Average distance

### hpgl0906

```{r number_reads_906, engine='bash'}
## The following should answer 1-5 above.
cd preprocessing/
bamtools stats < hpgl0906.bam
```

### hpgl0907

```{r number_reads_907, engine='bash'}
cd preprocessing/
bamtools stats < hpgl0907.bam
```

### hpgl0908

```{r number_reads_908, engine='bash'}
cd preprocessing/
bamtools stats < hpgl0908.bam
```

### combined

```{r number_reads_combined, engine='bash'}
cd preprocessing/
bamtools stats < combined.bam
```

### Saturation index

The answer for this is in the R function tnseq_saturation().

```{r saturation}
file <- "preprocessing/hpgl0906/outputs/essentiality/hpgl0906-v0M1.wig"
hpgl0906_saturation <- tnseq_saturation(data=file)
file <- "preprocessing/hpgl0907/outputs/essentiality/hpgl0907-v0M1.wig"
hpgl0907_saturation <- tnseq_saturation(data=file)
file <- "preprocessing/hpgl0908/outputs/essentiality/hpgl0908-v0M1.wig"
hpgl0908_saturation <- tnseq_saturation(data=file)

file <- "preprocessing/hpgl0909/outputs/essentiality/hpgl0909-v0M1.wig"
hpgl0909_saturation <- tnseq_saturation(data=file)
file <- "preprocessing/hpgl0910/outputs/essentiality/hpgl0910-v0M1.wig"
hpgl0910_saturation <- tnseq_saturation(data=file)
file <- "preprocessing/hpgl0911/outputs/essentiality/hpgl0911-v0M1.wig"
hpgl0911_saturation <- tnseq_saturation(data=file)

## Ok, now have stats for the individual libraries.
all_table <- merge(hpgl0906_saturation$hits_by_position,
                   hpgl0907_saturation$hits_by_position, by="Start")
all_table <- merge(all_table,
                   hpgl0908_saturation$hits_by_position, by="Start")
all_table$sum <- 0
for (r in 1:nrow(all_table)) {
  all_table[r, "sum"] <- all_table[r, "Reads.x"] + all_table[r, "Reads.y"] + all_table[r, "Reads"]
}
all_table <- all_table[, c("Start", "sum")]
combined_saturation <- tnseq_saturation(data=all_table, column="sum")

all_table2 <- merge(hpgl0909_saturation$hits_by_position,
                    hpgl0910_saturation$hits_by_position, by="Start")
all_table2 <- merge(all_table2,
                    hpgl0911_saturation$hits_by_position, by="Start")
all_table2$sum <- 0
for (r in 1:nrow(all_table2)) {
  all_table2[r, "sum"] <- all_table2[r, "Reads.x"] + all_table2[r, "Reads.y"] + all_table2[r, "Reads"]
}
all_table2 <- all_table2[, c("Start", "sum")]
combined_saturation2 <- tnseq_saturation(data=all_table2, column="sum")
```

#### Unique insertion sites

I presume but am not certain that this is the number of > singleton hits.

```{r unique_sites}
hpgl0906_saturation$eq_0
hpgl0906_saturation$gt_1

hpgl0907_saturation$eq_0
hpgl0907_saturation$gt_1

hpgl0908_saturation$eq_0
hpgl0908_saturation$gt_1

combined_saturation$eq_0
combined_saturation$gt_1

hpgl0909_saturation$eq_0
hpgl0909_saturation$gt_1

hpgl0910_saturation$eq_0
hpgl0910_saturation$gt_1

hpgl0911_saturation$eq_0
hpgl0911_saturation$gt_1

combined_saturation2$eq_0
combined_saturation2$gt_1
```

```{r saturation_indexes}
hpgl0906_saturation$ratios[1]
hpgl0906_saturation$ratios[4]
hpgl0906_saturation$ratios[6]
hpgl0906_saturation$plot

hpgl0907_saturation$ratios[1]
hpgl0907_saturation$ratios[4]
hpgl0907_saturation$ratios[6]
hpgl0907_saturation$plot

hpgl0908_saturation$ratios[1]
hpgl0908_saturation$ratios[4]
hpgl0908_saturation$ratios[6]
hpgl0908_saturation$plot

combined_saturation$ratios[1]
combined_saturation$ratios[4]
combined_saturation$ratios[6]
combined_saturation$plot
```

## How many TAs are in the MGAS5005 genome?

The answer to this question should be easily searchable in either the annotation data for the
genome and/or the precursor files for essentiality (which collects hits on every TA).

The following counts the number of lines in the tas.txt file.
The answer should be that -1, as the first line is a header.

```{r number_tas, engine='bash'}
cd preprocessing/hpgl0906/outputs/essentiality/
wc hpgl0906-trimmed_ca_ta-v0M1_tas.txt
```
