1 Extract some count tables

I will use this sheet for exporting data to collaborators. Currently, that is only the folks interested in doing some analyses with the first PLOS NTD paper.

It is worth noting that I recently found some errors in my accounting for these and the plosntd2 samples (I had the pathogen strains reversed).

hs_annot <- load_biomart_annotations()$annotation
## The biomart annotations file already exists, loading from it.
rownames(hs_annot) <- make.names(
  paste0(hs_annot[["ensembl_transcript_id"]], ".",
         hs_annot[["transcript_version"]]),
  unique=TRUE)
hs_tx_gene <- hs_annot[, c("ensembl_gene_id", "ensembl_transcript_id")]
hs_tx_gene[["id"]] <- rownames(hs_tx_gene)
hs_tx_gene <- hs_tx_gene[, c("id", "ensembl_gene_id")]
new_hs_annot <- hs_annot
rownames(new_hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)

Note that I need to overwrite the savefile of the annotations and not use the archive server, since 38v99 came out in 202001.

sample_sheet <- glue::glue("sample_sheets/UMD_leishmania_host_metasheet_{ver}.xlsx")
## As of 20200222, I have only performed hisat2 mapping for the plos ntd2 data.
prefix <- "excel/plos_ntd_host_hisat2-v"
savefile <- glue::glue("mbio_expressionset-v{ver}.rda")
all_expt <- create_expt(metadata=sample_sheet,
                        file_column="hg3891salmon",
                        gene_info=new_hs_annot,
                        tx_gene_map=hs_tx_gene,
                        savefile=savefile)
## Reading the sample metadata.
## Dropped 2 rows from the sample metadata because they were blank.
## The sample definitions comprises: 441 rows(samples) and 68 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count data.
## Warning in create_expt(metadata = sample_sheet, file_column = "hg3891salmon", :
## Some samples were removed when cross referencing the samples against the count
## data.
## Matched 16933 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 16933 rows and 267 columns.
mbio_expt <- subset_expt(all_expt, subset="study=='mbio'")
## Using a subset expression.
## There were 267, now there are 82 samples.
mbio_expt <- set_expt_conditions(mbio_expt, fact="infectionstatus")
mbio_expt <- set_expt_batches(mbio_expt, fact="donor")

mbio_norm <- normalize_expt(mbio_expt, filter=TRUE, convert="cpm", norm="quant", transform="log2")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 5857 low-count genes (11076 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## The method is: raw.
## Step 4: not doing batch correction.
## Step 4: transforming the data with log2.
## transform_counts: Found 59018 values equal to 0, adding 1 to the matrix.
tt <- plot_pca(mbio_norm, plot_labels="normal")
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
LS0tCnRpdGxlOiAiVXNlIHRoaXMgc2hlZXQgdG8gZXh0cmFjdCBzdWJzZXRzIG9mIHRoZSBkYXRhLiIKYXV0aG9yOiAiYXRiIGFiZWxld0BnbWFpbC5jb20iCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGZpZ19jYXB0aW9uOiB0cnVlCiAgICBmaWdfaGVpZ2h0OiA3CiAgICBmaWdfd2lkdGg6IDcKICAgIGhpZ2hsaWdodDogdGFuZ28KICAgIGtlZXBfbWQ6IGZhbHNlCiAgICBtb2RlOiBzZWxmY29udGFpbmVkCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIHNlbGZfY29udGFpbmVkOiB0cnVlCiAgICB0aGVtZTogcmVhZGFibGUKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OgogICAgICBjb2xsYXBzZWQ6IGZhbHNlCiAgICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCiAgcm1kZm9ybWF0czo6cmVhZHRoZWRvd246CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGRmX3ByaW50OiBwYWdlZAogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAgd2lkdGg6IDMwMAogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIHRvY19mbG9hdDogdHJ1ZQogIEJpb2NTdHlsZTo6aHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIHRvY19mbG9hdDogdHJ1ZQotLS0KCjxzdHlsZSB0eXBlPSJ0ZXh0L2NzcyI+CmJvZHksIHRkIHsKICBmb250LXNpemU6IDE2cHg7Cn0KY29kZS5yewogIGZvbnQtc2l6ZTogMTZweDsKfQpwcmUgewogZm9udC1zaXplOiAxNnB4Cn0KPC9zdHlsZT4KCmBgYHtyIG9wdGlvbnMsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoImhwZ2x0b29scyIpCnR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgiL2RhdGEvaHBnbHRvb2xzIikKa25pdHI6Om9wdHNfa25pdCRzZXQod2lkdGg9MTIwLAogICAgICAgICAgICAgICAgICAgICBwcm9ncmVzcz1UUlVFLAogICAgICAgICAgICAgICAgICAgICB2ZXJib3NlPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KGVycm9yPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICBkcGk9OTYpCm9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycz1GQUxTRSwKICAgICAgICAgICAgICAgICAgICAgICBrbml0ci5kdXBsaWNhdGUubGFiZWw9ImFsbG93IikKZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxvdDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZT0xMCkpCnJ1bmRhdGUgPC0gZm9ybWF0KFN5cy5EYXRlKCksIGZvcm1hdD0iJVklbSVkIikKcHJldmlvdXNfZmlsZSA8LSAiIgp2ZXIgPC0gIjIwMjAwMzA0IgoKIyN0bXAgPC0gc20obG9hZG1lKGZpbGVuYW1lPXBhc3RlMChnc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IiIsIHg9cHJldmlvdXNfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKSkpCiMjcm1kX2ZpbGUgPC0gIjAzX2V4cHJlc3Npb25faW5mZWN0aW9uXzIwMTgwODIyLlJtZCIKYGBgCgojIEV4dHJhY3Qgc29tZSBjb3VudCB0YWJsZXMKCkkgd2lsbCB1c2UgdGhpcyBzaGVldCBmb3IgZXhwb3J0aW5nIGRhdGEgdG8gY29sbGFib3JhdG9ycy4gIEN1cnJlbnRseSwgdGhhdCBpcwpvbmx5IHRoZSBmb2xrcyBpbnRlcmVzdGVkIGluIGRvaW5nIHNvbWUgYW5hbHlzZXMgd2l0aCB0aGUgZmlyc3QgUExPUyBOVEQgcGFwZXIuCgpJdCBpcyB3b3J0aCBub3RpbmcgdGhhdCBJIHJlY2VudGx5IGZvdW5kIHNvbWUgZXJyb3JzIGluIG15IGFjY291bnRpbmcgZm9yIHRoZXNlCmFuZCB0aGUgcGxvc250ZDIgc2FtcGxlcyAoSSBoYWQgdGhlIHBhdGhvZ2VuIHN0cmFpbnMgcmV2ZXJzZWQpLgoKYGBge3IgYW5ub3RhdGlvbnMsIGZpZy5zaG93PSJoaWRlIn0KaHNfYW5ub3QgPC0gbG9hZF9iaW9tYXJ0X2Fubm90YXRpb25zKCkkYW5ub3RhdGlvbgpyb3duYW1lcyhoc19hbm5vdCkgPC0gbWFrZS5uYW1lcygKICBwYXN0ZTAoaHNfYW5ub3RbWyJlbnNlbWJsX3RyYW5zY3JpcHRfaWQiXV0sICIuIiwKICAgICAgICAgaHNfYW5ub3RbWyJ0cmFuc2NyaXB0X3ZlcnNpb24iXV0pLAogIHVuaXF1ZT1UUlVFKQpoc190eF9nZW5lIDwtIGhzX2Fubm90WywgYygiZW5zZW1ibF9nZW5lX2lkIiwgImVuc2VtYmxfdHJhbnNjcmlwdF9pZCIpXQpoc190eF9nZW5lW1siaWQiXV0gPC0gcm93bmFtZXMoaHNfdHhfZ2VuZSkKaHNfdHhfZ2VuZSA8LSBoc190eF9nZW5lWywgYygiaWQiLCAiZW5zZW1ibF9nZW5lX2lkIildCm5ld19oc19hbm5vdCA8LSBoc19hbm5vdApyb3duYW1lcyhuZXdfaHNfYW5ub3QpIDwtIG1ha2UubmFtZXMoaHNfYW5ub3RbWyJlbnNlbWJsX2dlbmVfaWQiXV0sIHVuaXF1ZT1UUlVFKQpgYGAKCk5vdGUgdGhhdCBJIG5lZWQgdG8gb3ZlcndyaXRlIHRoZSBzYXZlZmlsZSBvZiB0aGUgYW5ub3RhdGlvbnMgYW5kIG5vdCB1c2UgdGhlCmFyY2hpdmUgc2VydmVyLCBzaW5jZSAzOHY5OSBjYW1lIG91dCBpbiAyMDIwMDEuCgpgYGB7ciBleHRyYWN0X3Bsb3NudGQxLCBmaWcuc2hvdz0iaGlkZSJ9CnNhbXBsZV9zaGVldCA8LSBnbHVlOjpnbHVlKCJzYW1wbGVfc2hlZXRzL1VNRF9sZWlzaG1hbmlhX2hvc3RfbWV0YXNoZWV0X3t2ZXJ9Lnhsc3giKQojIyBBcyBvZiAyMDIwMDIyMiwgSSBoYXZlIG9ubHkgcGVyZm9ybWVkIGhpc2F0MiBtYXBwaW5nIGZvciB0aGUgcGxvcyBudGQyIGRhdGEuCnByZWZpeCA8LSAiZXhjZWwvcGxvc19udGRfaG9zdF9oaXNhdDItdiIKc2F2ZWZpbGUgPC0gZ2x1ZTo6Z2x1ZSgibWJpb19leHByZXNzaW9uc2V0LXZ7dmVyfS5yZGEiKQphbGxfZXhwdCA8LSBjcmVhdGVfZXhwdChtZXRhZGF0YT1zYW1wbGVfc2hlZXQsCiAgICAgICAgICAgICAgICAgICAgICAgIGZpbGVfY29sdW1uPSJoZzM4OTFzYWxtb24iLAogICAgICAgICAgICAgICAgICAgICAgICBnZW5lX2luZm89bmV3X2hzX2Fubm90LAogICAgICAgICAgICAgICAgICAgICAgICB0eF9nZW5lX21hcD1oc190eF9nZW5lLAogICAgICAgICAgICAgICAgICAgICAgICBzYXZlZmlsZT1zYXZlZmlsZSkKbWJpb19leHB0IDwtIHN1YnNldF9leHB0KGFsbF9leHB0LCBzdWJzZXQ9InN0dWR5PT0nbWJpbyciKQptYmlvX2V4cHQgPC0gc2V0X2V4cHRfY29uZGl0aW9ucyhtYmlvX2V4cHQsIGZhY3Q9ImluZmVjdGlvbnN0YXR1cyIpCm1iaW9fZXhwdCA8LSBzZXRfZXhwdF9iYXRjaGVzKG1iaW9fZXhwdCwgZmFjdD0iZG9ub3IiKQoKbWJpb19ub3JtIDwtIG5vcm1hbGl6ZV9leHB0KG1iaW9fZXhwdCwgZmlsdGVyPVRSVUUsIGNvbnZlcnQ9ImNwbSIsIG5vcm09InF1YW50IiwgdHJhbnNmb3JtPSJsb2cyIikKdHQgPC0gcGxvdF9wY2EobWJpb19ub3JtLCBwbG90X2xhYmVscz0ibm9ybWFsIikKYGBgCgpgYGB7ciBzYXZlbWUsIGV2YWw9RkFMU0V9CnBhbmRlcjo6cGFuZGVyKHNlc3Npb25JbmZvKCkpCm1lc3NhZ2UocGFzdGUwKCJUaGlzIGlzIGhwZ2x0b29scyBjb21taXQ6ICIsIGdldF9naXRfY29tbWl0KCkpKQp0aGlzX3NhdmUgPC0gcGFzdGUwKGdzdWIocGF0dGVybj0iXFwuUm1kIiwgcmVwbGFjZT0iIiwgeD1ybWRfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKQptZXNzYWdlKHBhc3RlMCgiU2F2aW5nIHRvICIsIHRoaXNfc2F2ZSkpCnRtcCA8LSBzbShzYXZlbWUoZmlsZW5hbWU9dGhpc19zYXZlKSkKYGBgCg==