1 Introduction

2 Annotation

hs_annot <- sm(load_biomart_annotations(year = "2020"))
hs_annot <- hs_annot[["annotation"]]
hs_annot[["transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE)
tx_gene_map <- hs_annot[, c("transcript", "ensembl_gene_id")]

summary(hs_annot)

##  ensembl_transcript_id ensembl_gene_id       version     transcript_version
##  Length:227921         Length:227921      Min.   : 1.0   Min.   : 1.00     
##  Class :character      Class :character   1st Qu.: 6.0   1st Qu.: 1.00     
##  Mode  :character      Mode  :character   Median :12.0   Median : 1.00     
##                                           Mean   :10.7   Mean   : 3.08     
##                                           3rd Qu.:16.0   3rd Qu.: 5.00     
##                                           Max.   :29.0   Max.   :17.00     
##                                                                            
##  hgnc_symbol        description        gene_biotype         cds_length    
##  Length:227921      Length:227921      Length:227921      Min.   :     3  
##  Class :character   Class :character   Class :character   1st Qu.:   357  
##  Mode  :character   Mode  :character   Mode  :character   Median :   694  
##                                                           Mean   :  1139  
##                                                           3rd Qu.:  1446  
##                                                           Max.   :107976  
##                                                           NA's   :127343  
##  chromosome_name       strand          start_position      end_position     
##  Length:227921      Length:227921      Min.   :5.77e+02   Min.   :6.47e+02  
##  Class :character   Class :character   1st Qu.:3.11e+07   1st Qu.:3.12e+07  
##  Mode  :character   Mode  :character   Median :6.04e+07   Median :6.06e+07  
##                                        Mean   :7.41e+07   Mean   :7.42e+07  
##                                        3rd Qu.:1.09e+08   3rd Qu.:1.09e+08  
##                                        Max.   :2.49e+08   Max.   :2.49e+08  
##                                                                             
##   transcript       
##  Length:227921     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

hs_go <- sm(load_biomart_go()[["go"]])
hs_length <- hs_annot[, c("ensembl_gene_id", "cds_length")]
colnames(hs_length) <- c("ID", "length")

3 Sample Estimation

3.1 Generate expressionsets

The sample sheet is copied from our shared online sheet and updated with each release of sequencing data.

samplesheet <- "sample_sheets/macrogen_samples.xlsx"

3.1.1 Hisat2 expressionsets

## Create the expressionset and immediately pass it to a filter
## removing the non protein coding genes.
hs_expt <- create_expt(samplesheet,
                       file_column = "file",
                       gene_info = hs_annot) %>%
  exclude_genes_expt(column = "gene_biotype", method = "keep", pattern = "protein_coding")

## Reading the sample metadata.

## The sample definitions comprises: 27 rows(samples) and 4 columns(metadata fields).

## Reading count tables.

## Reading count files with read.table().

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20132/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20133/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20134/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20179/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20187/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10036/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10046/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10063/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10073/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10093/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/m1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/m2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/sfb/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/su1158/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/su1160/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.

## Finished reading count data.

## Matched 21452 annotations and counts.

## Bringing together the count matrix and gene information.

## Some annotations were lost in merging, setting them to 'undefined'.

## Saving the expressionset to 'expt.rda'.

## The final expressionset has 21481 rows and 27 columns.

## Before removal, there were 21481 genes, now there are 19941.

## There are 5 samples which kept less than 90 percent counts.

## wt1029_b2 wt1029_b3 wt1031_b2 wt2008_b2 wt2010_b3 
##     83.67     88.98     89.81     86.08     86.82

3.1.1.1 Initial metrics

libsize <- plot_libsize(hs_expt)
libsize$plot

nonzero <- plot_nonzero(hs_expt)
nonzero$plot

## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

all_norm <- sm(normalize_expt(hs_expt, transform = "log2", norm = "quant",
                              convert = "cpm", filter = TRUE))

all_pca <- plot_pca(all_norm, plot_labels = FALSE, plot_title = "PCA - Cell type")
all_pca$plot

tmp <- loadme(filename = savefile)

LS0tCnRpdGxlOiAiU29tZSBtYWNyb2dlbiBzYW1wbGVzIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiBodG1sX2RvY3VtZW50OgogIGNvZGVfZG93bmxvYWQ6IHRydWUKICBjb2RlX2ZvbGRpbmc6IHNob3cKICBmaWdfY2FwdGlvbjogdHJ1ZQogIGZpZ19oZWlnaHQ6IDcKICBmaWdfd2lkdGg6IDcKICBoaWdobGlnaHQ6IGRlZmF1bHQKICBrZWVwX21kOiBmYWxzZQogIG1vZGU6IHNlbGZjb250YWluZWQKICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICBzZWxmX2NvbnRhaW5lZDogdHJ1ZQogIHRoZW1lOiByZWFkYWJsZQogIHRvYzogdHJ1ZQogIHRvY19mbG9hdDoKICAgY29sbGFwc2VkOiBmYWxzZQogICBzbW9vdGhfc2Nyb2xsOiBmYWxzZQotLS0KCjxzdHlsZT4KICBib2R5IC5tYWluLWNvbnRhaW5lciB7CiAgICBtYXgtd2lkdGg6IDE2MDBweDsKICB9Cjwvc3R5bGU+CgpgYGB7ciBvcHRpb25zLCBpbmNsdWRlID0gRkFMU0V9CmxpYnJhcnkoaHBnbHRvb2xzKQp0dCA8LSBzbShkZXZ0b29sczo6bG9hZF9hbGwoIn4vaHBnbHRvb2xzIikpCmtuaXRyOjpvcHRzX2tuaXQkc2V0KHByb2dyZXNzID0gVFJVRSwKICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZSA9IFRSVUUsCiAgICAgICAgICAgICAgICAgICAgIHdpZHRoID0gMTIwLAogICAgICAgICAgICAgICAgICAgICBlY2hvID0gVFJVRSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KGVycm9yID0gVFJVRSwKICAgICAgICAgICAgICAgICAgICAgIGZpZy53aWR0aCA9IDEyLAogICAgICAgICAgICAgICAgICAgICAgZmlnLmhlaWdodCA9IDEyLAogICAgICAgICAgICAgICAgICAgICAgZHBpID0gOTYpCm9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzID0gNCwKICAgICAgICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UsCiAgICAgICAgICAgICAgICAgICAgICAga25pdHIuZHVwbGljYXRlLmxhYmVsID0gImFsbG93IikKZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZSA9IDEyKSkKdmVyIDwtICIyMDIxMDUiCnJ1bmRhdGUgPC0gZm9ybWF0KFN5cy5EYXRlKCksIGZvcm1hdCA9ICIlWSVtJWQiKQoKcm1kX2ZpbGUgPC0gZ2x1ZTo6Z2x1ZSgibWFjcm9nZW5fc2FtcGxlcy5SbWQiKQpzYXZlZmlsZSA8LSBnc3ViKHBhdHRlcm4gPSAiXFwuUm1kIiwgcmVwbGFjZSA9ICJcXC5yZGFcXC54eiIsIHggPSBybWRfZmlsZSkKYGBgCgojIEludHJvZHVjdGlvbgoKIyBBbm5vdGF0aW9uCgpgYGB7ciBoc19hbm5vdH0KaHNfYW5ub3QgPC0gc20obG9hZF9iaW9tYXJ0X2Fubm90YXRpb25zKHllYXIgPSAiMjAyMCIpKQpoc19hbm5vdCA8LSBoc19hbm5vdFtbImFubm90YXRpb24iXV0KaHNfYW5ub3RbWyJ0cmFuc2NyaXB0Il1dIDwtIHBhc3RlMChyb3duYW1lcyhoc19hbm5vdCksICIuIiwgaHNfYW5ub3RbWyJ2ZXJzaW9uIl1dKQpyb3duYW1lcyhoc19hbm5vdCkgPC0gbWFrZS5uYW1lcyhoc19hbm5vdFtbImVuc2VtYmxfZ2VuZV9pZCJdXSwgdW5pcXVlID0gVFJVRSkKdHhfZ2VuZV9tYXAgPC0gaHNfYW5ub3RbLCBjKCJ0cmFuc2NyaXB0IiwgImVuc2VtYmxfZ2VuZV9pZCIpXQoKc3VtbWFyeShoc19hbm5vdCkKYGBgCgpgYGB7ciBoc19nb30KaHNfZ28gPC0gc20obG9hZF9iaW9tYXJ0X2dvKClbWyJnbyJdXSkKaHNfbGVuZ3RoIDwtIGhzX2Fubm90WywgYygiZW5zZW1ibF9nZW5lX2lkIiwgImNkc19sZW5ndGgiKV0KY29sbmFtZXMoaHNfbGVuZ3RoKSA8LSBjKCJJRCIsICJsZW5ndGgiKQpgYGAKCiMgU2FtcGxlIEVzdGltYXRpb24KCiMjIEdlbmVyYXRlIGV4cHJlc3Npb25zZXRzCgpUaGUgc2FtcGxlIHNoZWV0IGlzIGNvcGllZCBmcm9tIG91ciBzaGFyZWQgb25saW5lIHNoZWV0IGFuZCB1cGRhdGVkIHdpdGggZWFjaCByZWxlYXNlCm9mIHNlcXVlbmNpbmcgZGF0YS4KCmBgYHtyIHNhbXBsZXNoZWV0fQpzYW1wbGVzaGVldCA8LSAic2FtcGxlX3NoZWV0cy9tYWNyb2dlbl9zYW1wbGVzLnhsc3giCmBgYAoKIyMjIEhpc2F0MiBleHByZXNzaW9uc2V0cwoKYGBge3IgYWxsX25ld19oaXNhdDJ9CiMjIENyZWF0ZSB0aGUgZXhwcmVzc2lvbnNldCBhbmQgaW1tZWRpYXRlbHkgcGFzcyBpdCB0byBhIGZpbHRlcgojIyByZW1vdmluZyB0aGUgbm9uIHByb3RlaW4gY29kaW5nIGdlbmVzLgpoc19leHB0IDwtIGNyZWF0ZV9leHB0KHNhbXBsZXNoZWV0LAogICAgICAgICAgICAgICAgICAgICAgIGZpbGVfY29sdW1uID0gImZpbGUiLAogICAgICAgICAgICAgICAgICAgICAgIGdlbmVfaW5mbyA9IGhzX2Fubm90KSAlPiUKICBleGNsdWRlX2dlbmVzX2V4cHQoY29sdW1uID0gImdlbmVfYmlvdHlwZSIsIG1ldGhvZCA9ICJrZWVwIiwgcGF0dGVybiA9ICJwcm90ZWluX2NvZGluZyIpCmBgYAoKIyMjIyBJbml0aWFsIG1ldHJpY3MKCmBgYHtyIGluaXRpYWxfbWV0cmljc30KbGlic2l6ZSA8LSBwbG90X2xpYnNpemUoaHNfZXhwdCkKbGlic2l6ZSRwbG90Cgpub256ZXJvIDwtIHBsb3Rfbm9uemVybyhoc19leHB0KQpub256ZXJvJHBsb3QKYGBgCgpgYGB7ciBwcmVfcXVlc3Rpb25zfQphbGxfbm9ybSA8LSBzbShub3JtYWxpemVfZXhwdChoc19leHB0LCB0cmFuc2Zvcm0gPSAibG9nMiIsIG5vcm0gPSAicXVhbnQiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICBjb252ZXJ0ID0gImNwbSIsIGZpbHRlciA9IFRSVUUpKQoKYWxsX3BjYSA8LSBwbG90X3BjYShhbGxfbm9ybSwgcGxvdF9sYWJlbHMgPSBGQUxTRSwgcGxvdF90aXRsZSA9ICJQQ0EgLSBDZWxsIHR5cGUiKQphbGxfcGNhJHBsb3QKYGBgCgpgYGB7ciBsb2FkbWVfYWZ0ZXIsIGV2YWwgPSBGQUxTRX0KdG1wIDwtIGxvYWRtZShmaWxlbmFtZSA9IHNhdmVmaWxlKQpgYGAK

Some macrogen samples

atb abelew@gmail.com

2021-05-05