index.html preprocessing.html

1 S.pyogenes TNSeq version: 20171010

1.1 Can we produce a master library which is the sum of three others?

The three others are: hpgl0837, hpgl0838, hpgl0839

## Work inside preprocessing/; every path below is relative to it.
cd preprocessing
## Copy each library's preprocessing tree out of scratch.  mkdir -p keeps the
## step rerunnable when the directories already exist.
for lib in hpgl0837 hpgl0838 hpgl0839; do
  mkdir -p "${lib}"
  rsync -av ~/scratch/tnseq/spyogenes_5448v2/preprocessing/tnseq/"${lib}"/ "${lib}"/
done
## The library directories sit directly under the current directory
## (preprocessing/), so the BAM paths must not climb out of it with '../'.
## An array keeps one path per element without relying on word splitting.
bamfiles=(
  hpgl0837/outputs/bowtie_mgas_5005/hpgl0837-trimmed_ca_ta-v0M1.bam
  hpgl0838/outputs/bowtie_mgas_5005/hpgl0838-trimmed_ca_ta-v0M1.bam
  hpgl0839/outputs/bowtie_mgas_5005/hpgl0839-trimmed_ca_ta-v0M1.bam
)
## Merge the three per-library alignments into the master library.
samtools merge combined.bam "${bamfiles[@]}"

Done.

1.2 For each library (separate and master), what are:

  1. Number of total reads
  2. Strictly aligned
  3. Randomly aligned
  4. The sum of 2,3
  5. Failed aligned reads
  6. Plasmid hits (this will take some time as I neglected to run these alignments)
  7. Unique insertion sites
  8. Saturation index
  9. Average distance

1.2.1 hpgl0837

## The following should answer 1-5 above.
## NOTE(review): the report printed below lists total and mapped reads only;
## it does not split strictly vs. randomly aligned reads -- confirm where
## items 2 and 3 are expected to come from.
cd preprocessing/
## Feed the hpgl0837 BAM to bamtools on stdin and print its summary statistics.
bamtools stats < hpgl0837.bam
## 
## **********************************************
## Stats for BAM file(s): 
## **********************************************
## 
## Total reads:       4166286
## Mapped reads:      2172707   (52.1497%)
## Forward strand:    3061735   (73.4884%)
## Reverse strand:    1104551   (26.5116%)
## Failed QC:         0 (0%)
## Duplicates:        0 (0%)
## Paired-end reads:  0 (0%)

1.2.2 hpgl0838

## Summary read statistics for the hpgl0838 alignments (BAM supplied on stdin).
cd preprocessing/
bamtools stats < hpgl0838.bam
## 
## **********************************************
## Stats for BAM file(s): 
## **********************************************
## 
## Total reads:       3248362
## Mapped reads:      1536284   (47.2941%)
## Forward strand:    2573976   (79.2392%)
## Reverse strand:    674386    (20.7608%)
## Failed QC:         0 (0%)
## Duplicates:        0 (0%)
## Paired-end reads:  0 (0%)

1.2.3 hpgl0839

## Summary read statistics for the hpgl0839 alignments (BAM supplied on stdin).
cd preprocessing/
bamtools stats < hpgl0839.bam
## 
## **********************************************
## Stats for BAM file(s): 
## **********************************************
## 
## Total reads:       3925737
## Mapped reads:      2527090   (64.3724%)
## Forward strand:    2730038   (69.5421%)
## Reverse strand:    1195699   (30.458%)
## Failed QC:         0 (0%)
## Duplicates:        0 (0%)
## Paired-end reads:  0 (0%)

1.2.4 combined

## Summary read statistics for the merged (master) library, combined.bam,
## supplied on stdin.
cd preprocessing/
bamtools stats < combined.bam
## 
## **********************************************
## Stats for BAM file(s): 
## **********************************************
## 
## Total reads:       11340385
## Mapped reads:      6236081   (54.99%)
## Forward strand:    8365749   (73.7695%)
## Reverse strand:    2974636   (26.2305%)
## Failed QC:         0 (0%)
## Duplicates:        0 (0%)
## Paired-end reads:  0 (0%)

1.2.5 Saturation index

The answer for this is in the R function tnseq_saturation().

## Compute saturation statistics for each library from its essentiality wig
## file.  The paths are passed directly so that no variable named 'file'
## masks base::file().
hpgl0837_saturation <- tnseq_saturation(
  data="preprocessing/hpgl0837/outputs/essentiality/hpgl0837-trimmed_ca_ta-v0M1.wig")
hpgl0838_saturation <- tnseq_saturation(
  data="preprocessing/hpgl0838/outputs/essentiality/hpgl0838-trimmed_ca_ta-v0M1.wig")
hpgl0839_saturation <- tnseq_saturation(
  data="preprocessing/hpgl0839/outputs/essentiality/hpgl0839-trimmed_ca_ta-v0M1.wig")

## Ok, now have stats for the individual libraries.
## Join the three per-position tables on Start.  merge() suffixes the clashing
## Reads columns of the first two tables (.x, .y); the third table's Reads
## column then has no clash and keeps its plain name.
## Note: merge() defaults to an inner join, so a position missing from any
## one library is dropped from the combined table.
all_table <- merge(hpgl0837_saturation$hits_by_position,
                   hpgl0838_saturation$hits_by_position, by="Start")
all_table <- merge(all_table,
                   hpgl0839_saturation$hits_by_position, by="Start")
## Per-position total across the three libraries, computed on whole columns
## rather than row by row (also well-defined when the table has zero rows).
all_table$sum <- all_table[["Reads.x"]] +
  all_table[["Reads.y"]] +
  all_table[["Reads"]]
## Keep only the position and the summed reads, then compute the saturation
## statistics of the combined library from the 'sum' column.
all_table <- all_table[, c("Start", "sum")]
combined_saturation <- tnseq_saturation(data=all_table, column="sum")

1.2.5.1 Unique insertion sites

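I presume, but am not certain, that this is the number of insertion sites with more than a single hit. Note, however, that eq_0 + gt_1 below comes to 132,711 for every library, which is the total number of TAs counted in section 1.3; this suggests that gt_1 actually counts every site with at least one hit.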

## Print, for each library and for the combined table, the eq_0 and gt_1
## entries of the saturation result.
## NOTE(review): presumably eq_0 is the number of sites without reads and
## gt_1 the number of sites with reads; confirm the exact cutoff used for
## gt_1 in tnseq_saturation().
hpgl0837_saturation$eq_0
##      0 
## 124098
hpgl0837_saturation$gt_1
## [1] 8613
hpgl0838_saturation$eq_0
##      0 
## 126356
hpgl0838_saturation$gt_1
## [1] 6355
hpgl0839_saturation$eq_0
##      0 
## 118634
hpgl0839_saturation$gt_1
## [1] 14077
combined_saturation$eq_0
##      0 
## 107917
combined_saturation$gt_1
## [1] 24794
## For each library and for the combined table, print the 1st, 4th and 6th
## entries of $ratios (named "1", "8" and "32" in the output) and then draw
## the $plot element.
## NOTE(review): the names are presumably minimum-read cutoffs, and the values
## print quoted, i.e. they are stored as character strings -- confirm both in
## tnseq_saturation().
hpgl0837_saturation$ratios[1]
##            1 
## "0.06940483"
hpgl0837_saturation$ratios[4]
##            8 
## "0.03237764"
hpgl0837_saturation$ratios[6]
##           32 
## "0.02089478"
hpgl0837_saturation$plot
## Warning: Removed 2020 rows containing non-finite values (stat_bin).
## Warning: Removed 2020 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_bar).

hpgl0838_saturation$ratios[1]
##            1 
## "0.05029441"
hpgl0838_saturation$ratios[4]
##            8 
## "0.02428852"
hpgl0838_saturation$ratios[6]
##           32 
## "0.01438792"
hpgl0838_saturation$plot
## Warning: Removed 1322 rows containing non-finite values (stat_bin).
## Warning: Removed 1322 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).

hpgl0839_saturation$ratios[1]
##            1 
## "0.11865907"
hpgl0839_saturation$ratios[4]
##            8 
## "0.07349495"
hpgl0839_saturation$ratios[6]
##           32 
## "0.05283477"
hpgl0839_saturation$plot
## Warning: Removed 4902 rows containing non-finite values (stat_bin).
## Warning: Removed 4902 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_bar).

combined_saturation$ratios[1]
##            1 
## "0.22975064"
combined_saturation$ratios[4]
##            8 
## "0.13490924"
combined_saturation$ratios[6]
##           32 
## "0.09394257"
combined_saturation$plot
## Warning: Removed 7961 rows containing non-finite values (stat_bin).
## Warning: Removed 7961 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).

1.3 How many TAs are in the MGAS5005 genome?

The answer to this question should be easy to find in the annotation data for the genome and/or in the precursor files for essentiality (which collect hits on every TA).

The following counts the number of lines in the tas.txt file. The number of TAs should be that line count minus one, because the first line is a header.

cd preprocessing/hpgl0837/outputs/essentiality/
## wc with no options prints the line, word and byte counts, in that order;
## the first number is the line count (header line included).
wc hpgl0837-trimmed_ca_ta-v0M1_tas.txt
##  132712  530848 1783514 hpgl0837-trimmed_ca_ta-v0M1_tas.txt
LS0tCnRpdGxlOiAiUy5weW9nZW5lcyBUTlNlcSBxdWVzdGlvbnMiCmF1dGhvcjogImF0YiBhYmVsZXdAZ21haWwuY29tIgpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiCm91dHB1dDoKIGh0bWxfZG9jdW1lbnQ6CiAgY29kZV9kb3dubG9hZDogdHJ1ZQogIGNvZGVfZm9sZGluZzogc2hvdwogIGZpZ19jYXB0aW9uOiB0cnVlCiAgZmlnX2hlaWdodDogNwogIGZpZ193aWR0aDogNwogIGhpZ2hsaWdodDogZGVmYXVsdAogIGtlZXBfbWQ6IGZhbHNlCiAgbW9kZTogc2VsZmNvbnRhaW5lZAogIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogIHNlbGZfY29udGFpbmVkOiB0cnVlCiAgdGhlbWU6IHJlYWRhYmxlCiAgdG9jOiB0cnVlCiAgdG9jX2Zsb2F0OgogICAgY29sbGFwc2VkOiBmYWxzZQogICAgc21vb3RoX3Njcm9sbDogZmFsc2UKLS0tCgo8c3R5bGU+CiAgYm9keSAubWFpbi1jb250YWluZXIgewogICAgbWF4LXdpZHRoOiAxNjAwcHg7CiAgfQo8L3N0eWxlPgoKYGBge3Igb3B0aW9ucywgaW5jbHVkZT1GQUxTRX0KbGlicmFyeShocGdsdG9vbHMpCnR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgifi9ocGdsdG9vbHMiKQprbml0cjo6b3B0c19rbml0JHNldChwcm9ncmVzcz1UUlVFLAogICAgICAgICAgICAgICAgICAgICB2ZXJib3NlPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgIHdpZHRoPTkwLAogICAgICAgICAgICAgICAgICAgICBlY2hvPVRSVUUpCmtuaXRyOjpvcHRzX2NodW5rJHNldChlcnJvcj1UUlVFLAogICAgICAgICAgICAgICAgICAgICAgZmlnLndpZHRoPTgsCiAgICAgICAgICAgICAgICAgICAgICBmaWcuaGVpZ2h0PTgsCiAgICAgICAgICAgICAgICAgICAgICBkcGk9OTYpCm9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycz1GQUxTRSwKICAgICAgICAgICAgICAgICAgICAgICBrbml0ci5kdXBsaWNhdGUubGFiZWw9ImFsbG93IikKZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZT0xMCkpCnNldC5zZWVkKDEpCnZlciA8LSAiMjAxNzEwMTAiCnByZXZpb3VzX2ZpbGUgPC0gImluZGV4LlJtZCIKCnRtcCA8LSB0cnkoc20obG9hZG1lKGZpbGVuYW1lPXBhc3RlMChnc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IiIsIHg9cHJldmlvdXNfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKSkpKQoKcm1kX2ZpbGUgPC0gImluZGV4LlJtZCIKYGBgCgpgYGB7ciByZW5kZXJpbmcsIGluY2x1ZGU9RkFMU0UsIGV2YWw9RkFMU0V9CnJtYXJrZG93bjo6cmVuZGVyKHJtZF9maWxlKQoKcm1hcmtkb3duOjpyZW5kZXIocm1kX2ZpbGUsIG91dHB1dF9mb3JtYXQ9InBkZl9kb2N1bWVudCIpCmBgYAoKW2luZGV4Lmh0bWxdKGluZGV4Lmh0bWwpIFtwcmVwcm9jZXNzaW5nLmh0bWxdKHByZXByb2Nlc3NpbmcuaHRtbCkKCiMgUy5weW9nZW5lcyBUTlNlcSB2ZXJzaW9uOiBgciB2ZXJgCgojIyBDYW4gd2UgcHJvZHVjZSBhIG1hc3RlciBs
aWJyYXJ5IHdoaWNoIGlzIHRoZSBzdW0gb2YgdGhyZWUgb3RoZXJzPwoKVGhlIHRocmVlIG90aGVycyBhcmU6IGhwZ2wwODM3LCBocGdsMDgzOCwgaHBnbDA4MzkKCmBgYHtyIGNvcHlfY29tYmluZSwgZW5naW5lPSdiYXNoJywgZXZhbD1GQUxTRX0KY2QgcHJlcHJvY2Vzc2luZwpta2RpciBocGdsMDgzNyBocGdsMDgzOCBocGdsMDgzOQpjZCBocGdsMDgzNyAmJiByc3luYyAtYXYgfi9zY3JhdGNoL3Ruc2VxL3NweW9nZW5lc181NDQ4djIvcHJlcHJvY2Vzc2luZy90bnNlcS9ocGdsMDgzNy8gLi8gJiYgY2QgLi4KY2QgaHBnbDA4MzggJiYgcnN5bmMgLWF2IH4vc2NyYXRjaC90bnNlcS9zcHlvZ2VuZXNfNTQ0OHYyL3ByZXByb2Nlc3NpbmcvdG5zZXEvaHBnbDA4MzgvIC4vICYmIGNkIC4uCmNkIGhwZ2wwODM5ICYmIHJzeW5jIC1hdiB+L3NjcmF0Y2gvdG5zZXEvc3B5b2dlbmVzXzU0NDh2Mi9wcmVwcm9jZXNzaW5nL3Ruc2VxL2hwZ2wwODM5LyAuLyAmJiBjZCAuLgpiYW1maWxlcz0iLi4vaHBnbDA4Mzcvb3V0cHV0cy9ib3d0aWVfbWdhc181MDA1L2hwZ2wwODM3LXRyaW1tZWRfY2FfdGEtdjBNMS5iYW0gXAogICAgICAgICAgLi4vaHBnbDA4Mzgvb3V0cHV0cy9ib3d0aWVfbWdhc181MDA1L2hwZ2wwODM4LXRyaW1tZWRfY2FfdGEtdjBNMS5iYW0gXAogICAgICAgICAgLi4vaHBnbDA4Mzkvb3V0cHV0cy9ib3d0aWVfbWdhc181MDA1L2hwZ2wwODM5LXRyaW1tZWRfY2FfdGEtdjBNMS5iYW0iCnNhbXRvb2xzIG1lcmdlIGNvbWJpbmVkLmJhbSAke2JhbWZpbGVzfQpgYGAKCkRvbmUuCgojIyBGb3IgZWFjaCBsaWJyYXJ5KHNlcGFyYXRlIGFuZCBtYXN0ZXIpIHdoYXQgYXJlOgoKMS4gIE51bWJlciBvZiB0b3RhbCByZWFkcwoyLiAgU3RyaWN0bHkgYWxpZ25lZAozLiAgUmFuZG9tbHkgYWxpZ25lZAo0LiAgVGhlIHN1bSBvZiAyLDMKNS4gIEZhaWxlZCBhbGlnbmVkIHJlYWRzCjYuICBQbGFzbWlkIGhpdHMgKHRoaXMgd2lsbCB0YWtlIHNvbWUgdGltZSBhcyBJIG5lZ2xlY3RlZCB0byBydW4gdGhlc2UgYWxpZ25tZW50cykKNy4gIFVuaXF1ZSBpbnNlcnRpb24gc2l0ZXMKOC4gIFNhdHVyYXRpb24gaW5kZXgKOS4gIEF2ZXJhZ2UgZGlzdGFuY2UKCiMjIyBocGdsMDgzNwoKYGBge3IgbnVtYmVyX3JlYWRzXzgzNywgZW5naW5lPSdiYXNoJ30KIyMgVGhlIGZvbGxvd2luZyBzaG91bGQgYW5zd2VyIDEtNSBhYm92ZS4KY2QgcHJlcHJvY2Vzc2luZy8KYmFtdG9vbHMgc3RhdHMgPCBocGdsMDgzNy5iYW0KYGBgCgojIyMgaHBnbDA4MzgKCmBgYHtyIG51bWJlcl9yZWFkc184MzgsIGVuZ2luZT0nYmFzaCd9CmNkIHByZXByb2Nlc3NpbmcvCmJhbXRvb2xzIHN0YXRzIDwgaHBnbDA4MzguYmFtCmBgYAoKIyMjIGhwZ2wwODM5CgpgYGB7ciBudW1iZXJfcmVhZHNfODM5LCBlbmdpbmU9J2Jhc2gnfQpjZCBwcmVwcm9jZXNzaW5nLwpiYW10b29scyBzdGF0cyA8IGhwZ2wwODM5LmJhbQpgYGAKCiMjIyBjb21iaW5lZAoKYGBge3IgbnVtYmVyX3JlYWRzX2NvbWJp
bmVkLCBlbmdpbmU9J2Jhc2gnfQpjZCBwcmVwcm9jZXNzaW5nLwpiYW10b29scyBzdGF0cyA8IGNvbWJpbmVkLmJhbQpgYGAKCiMjIyBTYXR1cmF0aW9uIGluZGV4CgpUaGUgYW5zd2VyIGZvciB0aGlzIGlzIGluIHRoZSBSIGZ1bmN0aW9uIHRuc2VxX3NhdHVyYXRpb24oKS4KCmBgYHtyIHNhdHVyYXRpb259CmZpbGUgPC0gInByZXByb2Nlc3NpbmcvaHBnbDA4Mzcvb3V0cHV0cy9lc3NlbnRpYWxpdHkvaHBnbDA4MzctdHJpbW1lZF9jYV90YS12ME0xLndpZyIKaHBnbDA4Mzdfc2F0dXJhdGlvbiA8LSB0bnNlcV9zYXR1cmF0aW9uKGRhdGE9ZmlsZSkKZmlsZSA8LSAicHJlcHJvY2Vzc2luZy9ocGdsMDgzOC9vdXRwdXRzL2Vzc2VudGlhbGl0eS9ocGdsMDgzOC10cmltbWVkX2NhX3RhLXYwTTEud2lnIgpocGdsMDgzOF9zYXR1cmF0aW9uIDwtIHRuc2VxX3NhdHVyYXRpb24oZGF0YT1maWxlKQpmaWxlIDwtICJwcmVwcm9jZXNzaW5nL2hwZ2wwODM5L291dHB1dHMvZXNzZW50aWFsaXR5L2hwZ2wwODM5LXRyaW1tZWRfY2FfdGEtdjBNMS53aWciCmhwZ2wwODM5X3NhdHVyYXRpb24gPC0gdG5zZXFfc2F0dXJhdGlvbihkYXRhPWZpbGUpCgojIyBPaywgbm93IGhhdmUgc3RhdHMgZm9yIHRoZSBpbmRpdmlkdWFsIGxpYnJhcmllcy4KYWxsX3RhYmxlIDwtIG1lcmdlKGhwZ2wwODM3X3NhdHVyYXRpb24kaGl0c19ieV9wb3NpdGlvbiwKICAgICAgICAgICAgICAgICAgIGhwZ2wwODM4X3NhdHVyYXRpb24kaGl0c19ieV9wb3NpdGlvbiwgYnk9IlN0YXJ0IikKYWxsX3RhYmxlIDwtIG1lcmdlKGFsbF90YWJsZSwKICAgICAgICAgICAgICAgICAgIGhwZ2wwODM5X3NhdHVyYXRpb24kaGl0c19ieV9wb3NpdGlvbiwgYnk9IlN0YXJ0IikKYWxsX3RhYmxlJHN1bSA8LSAwCmZvciAociBpbiAxOm5yb3coYWxsX3RhYmxlKSkgewogIGFsbF90YWJsZVtyLCAic3VtIl0gPC0gYWxsX3RhYmxlW3IsICJSZWFkcy54Il0gKyBhbGxfdGFibGVbciwgIlJlYWRzLnkiXSArIGFsbF90YWJsZVtyLCAiUmVhZHMiXQp9CmFsbF90YWJsZSA8LSBhbGxfdGFibGVbLCBjKCJTdGFydCIsICJzdW0iKV0KY29tYmluZWRfc2F0dXJhdGlvbiA8LSB0bnNlcV9zYXR1cmF0aW9uKGRhdGE9YWxsX3RhYmxlLCBjb2x1bW49InN1bSIpCmBgYAoKIyMjIyBVbmlxdWUgaW5zZXJ0aW9uIHNpdGVzCgpJIHByZXN1bWUgYnV0IGFtIG5vdCBjZXJ0YWluIHRoYXQgdGhpcyBpcyB0aGUgbnVtYmVyIG9mID4gc2luZ2xldG9uIGhpdHMuCgpgYGB7ciB1bmlxdWVfc2l0ZXN9CmhwZ2wwODM3X3NhdHVyYXRpb24kZXFfMApocGdsMDgzN19zYXR1cmF0aW9uJGd0XzEKCmhwZ2wwODM4X3NhdHVyYXRpb24kZXFfMApocGdsMDgzOF9zYXR1cmF0aW9uJGd0XzEKCmhwZ2wwODM5X3NhdHVyYXRpb24kZXFfMApocGdsMDgzOV9zYXR1cmF0aW9uJGd0XzEKCmNvbWJpbmVkX3NhdHVyYXRpb24kZXFfMApjb21iaW5lZF9zYXR1cmF0aW9uJGd0XzEKYGBgCgpgYGB7ciBzYXR1cmF0aW9uX2luZGV4ZXN9CmhwZ2wwODM3
X3NhdHVyYXRpb24kcmF0aW9zWzFdCmhwZ2wwODM3X3NhdHVyYXRpb24kcmF0aW9zWzRdCmhwZ2wwODM3X3NhdHVyYXRpb24kcmF0aW9zWzZdCmhwZ2wwODM3X3NhdHVyYXRpb24kcGxvdAoKaHBnbDA4Mzhfc2F0dXJhdGlvbiRyYXRpb3NbMV0KaHBnbDA4Mzhfc2F0dXJhdGlvbiRyYXRpb3NbNF0KaHBnbDA4Mzhfc2F0dXJhdGlvbiRyYXRpb3NbNl0KaHBnbDA4Mzhfc2F0dXJhdGlvbiRwbG90CgpocGdsMDgzOV9zYXR1cmF0aW9uJHJhdGlvc1sxXQpocGdsMDgzOV9zYXR1cmF0aW9uJHJhdGlvc1s0XQpocGdsMDgzOV9zYXR1cmF0aW9uJHJhdGlvc1s2XQpocGdsMDgzOV9zYXR1cmF0aW9uJHBsb3QKCmNvbWJpbmVkX3NhdHVyYXRpb24kcmF0aW9zWzFdCmNvbWJpbmVkX3NhdHVyYXRpb24kcmF0aW9zWzRdCmNvbWJpbmVkX3NhdHVyYXRpb24kcmF0aW9zWzZdCmNvbWJpbmVkX3NhdHVyYXRpb24kcGxvdApgYGAKCiMjIEhvdyBtYW55IFRBcyBhcmUgaW4gdGhlIE1HQVM1MDA1IGdlbm9tZT8KClRoZSBhbnN3ZXIgdG8gdGhpcyBxdWVzdGlvbiBzaG91bGQgYmUgZWFzaWx5IHNlYXJjaGFibGUgaW4gZWl0aGVyIHRoZSBhbm5vdGF0aW9uIGRhdGEgZm9yIHRoZQpnZW5vbWUgYW5kL29yIHRoZSBwcmVjdXJzb3IgZmlsZXMgZm9yIGVzc2VudGlhbGl0eSAod2hpY2ggY29sbGVjdHMgaGl0cyBvbiBldmVyeSBUQSkuCgpUaGUgZm9sbG93aW5nIGNvdW50cyB0aGUgbnVtYmVyIG9mIGxpbmVzIGluIHRoZSB0YXMudHh0IGZpbGUuClRoZSBhbnN3ZXIgc2hvdWxkIGJlIHRoYXQgLTEsIGFzIHRoZSBmaXJzdCBsaW5lIGlzIGEgaGVhZGVyLgoKYGBge3IgbnVtYmVyX3RhcywgZW5naW5lPSdiYXNoJ30KY2QgcHJlcHJvY2Vzc2luZy9ocGdsMDgzNy9vdXRwdXRzL2Vzc2VudGlhbGl0eS8Kd2MgaHBnbDA4MzctdHJpbW1lZF9jYV90YS12ME0xX3Rhcy50eHQKYGBgCgo=