1 Extract some count tables

I will use this sheet for exporting data to collaborators. Currently, that is only the folks interested in doing some analyses with the first PLOS NTD paper.

It is worth noting that I recently found some errors in my accounting for the samples from this paper and for the plosntd2 samples (I had the pathogen strains reversed).

## Fetch the human annotations from the useast Ensembl host and keep only the
## 'annotation' element of the result.  overwrite=TRUE is passed so that the
## previously saved annotation file is replaced rather than reused.
hs_annot <- load_biomart_annotations(host="useast.ensembl.org", overwrite=TRUE)$annotation

## Successfully connected to the hsapiens_gene_ensembl database.

## Cache found

## Finished downloading ensembl gene annotations.

## Cache found

## Finished downloading ensembl structure annotations.

## Dropping haplotype chromosome annotations, set drop_haplotypes=FALSE if this is bad.

## Saving annotations to hsapiens_biomart_annotations.rda.

## Finished save().

## Key the annotation table by versioned transcript ID, i.e.
## "<ensembl_transcript_id>.<transcript_version>"; make.names(unique=TRUE)
## keeps the row names syntactically valid and free of duplicates.
rownames(hs_annot) <- make.names(
  paste(hs_annot[["ensembl_transcript_id"]],
        hs_annot[["transcript_version"]],
        sep="."),
  unique=TRUE)
## Two-column transcript-to-gene map: 'id' repeats the row names assigned above
## and 'ensembl_gene_id' is taken from the same row of the annotations.
hs_tx_gene <- data.frame(
  "id"=rownames(hs_annot),
  "ensembl_gene_id"=hs_annot[["ensembl_gene_id"]],
  row.names=rownames(hs_annot),
  stringsAsFactors=FALSE)
## A second copy of the annotations, this time keyed by (de-duplicated) gene ID.
new_hs_annot <- hs_annot
rownames(new_hs_annot) <- make.names(new_hs_annot[["ensembl_gene_id"]],
                                     unique=TRUE)

Note that I need to overwrite the saved annotation file and query the current Ensembl server rather than the archive server, because Ensembl release 99 of GRCh38 (38v99) only came out in January 2020.

## Common prefix for every hisat2-derived output written below (rda/csv/xlsx).
## As of 20200222, I have only performed hisat2 mapping for the plos ntd2 data.
prefix <- "excel/plos_ntd_host_hisat2-v"
## The metadata sheet carries the same 'ver' tag as the outputs.
sample_sheet <- glue::glue("sample_sheets/UMD_leishmania_host_metasheet_{ver}.xlsx")
## Build the expt from the count files listed in the 'hg3891hisat2' metadata
## column, attach the gene-keyed annotations, and save it next to the exports.
plosntd1_expt <- create_expt(
  metadata=sample_sheet,
  gene_info=new_hs_annot,
  file_column="hg3891hisat2",
  savefile=glue::glue("{prefix}{ver}.rda"))

## Reading the sample metadata.

## Dropped 2 rows from the sample metadata because they were blank.

## The sample definitions comprises: 441 rows(samples) and 68 columns(metadata fields).

## Reading count tables.

## Reading count tables with read.table().

## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0725/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows.

## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0726/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.

## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0727/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.

## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0728/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.

## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0729/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.

## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0730/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1108/outputs/hisat2_hg38_91/E1.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1109/outputs/hisat2_hg38_91/E2.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1110/outputs/hisat2_hg38_91/E3.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1111/outputs/hisat2_hg38_91/E4.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1112/outputs/hisat2_hg38_91/E5.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1113/outputs/hisat2_hg38_91/E6.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1114/outputs/hisat2_hg38_91/E7.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1115/outputs/hisat2_hg38_91/E8.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1116/outputs/hisat2_hg38_91/L1.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1117/outputs/hisat2_hg38_91/L2.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1118/outputs/hisat2_hg38_91/L3.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1119/outputs/hisat2_hg38_91/L4.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1120/outputs/hisat2_hg38_91/L5.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1121/outputs/hisat2_hg38_91/L6.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1122/outputs/hisat2_hg38_91/L7.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1123/outputs/hisat2_hg38_91/L8.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1124/outputs/hisat2_hg38_91/L9.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1125/outputs/hisat2_hg38_91/L10.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1126/outputs/hisat2_hg38_91/L11.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1127/outputs/hisat2_hg38_91/L12.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1128/outputs/hisat2_hg38_91/L13.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1129/outputs/hisat2_hg38_91/L14.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1130/outputs/hisat2_hg38_91/L15.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1131/outputs/hisat2_hg38_91/L16.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1132/outputs/hisat2_hg38_91/L17.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1133/outputs/hisat2_hg38_91/N1.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1134/outputs/hisat2_hg38_91/N2.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1135/outputs/hisat2_hg38_91/N3.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1136/outputs/hisat2_hg38_91/N4.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1137/outputs/hisat2_hg38_91/N5.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1138/outputs/hisat2_hg38_91/N6.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1139/outputs/hisat2_hg38_91/N7.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1140/outputs/hisat2_hg38_91/N8.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1141/outputs/hisat2_hg38_91/N9.count.xz contains 58307 rows and merges to 58307 rows.

## preprocessing/hpgl1142/outputs/hisat2_hg38_91/N10.count.xz contains 58307 rows and merges to 58307 rows.

## Finished reading count tables.

## Matched 57755 annotations and counts.

## Bringing together the count matrix and gene information.

## Some annotations were lost in merging, setting them to 'undefined'.

## The final expressionset has 58302 rows and 41 columns.

## Use the 'infectionstatus' metadata column as the experimental condition.
plosntd1_expt <- set_expt_conditions(plosntd1_expt, fact="infectionstatus")

## Export the raw count matrix as csv.  readr::write_csv() never writes row
## names, and the gene IDs exist only as the row names of the count matrix, so
## they are copied into an explicit leading 'gene_id' column; without it the
## exported rows cannot be mapped back to genes.  check.names=FALSE keeps the
## sample names exactly as they appear in the matrix.
## The output file is passed positionally because readr renamed that argument
## from 'path' to 'file' in version 1.4.0; the position is the same in both.
written_csv <- readr::write_csv(
  data.frame("gene_id"=rownames(exprs(plosntd1_expt)),
             exprs(plosntd1_expt),
             check.names=FALSE,
             stringsAsFactors=FALSE),
  glue::glue("{prefix}{ver}.csv"))
## Write the multi-sheet excel workbook for the same expt.
written_xls <- write_expt(plosntd1_expt,
                          excel=glue::glue("{prefix}{ver}.xlsx"))

## Writing the first sheet, containing a legend and some summary data.

## Writing the raw reads.

## Graphing the raw reads.

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Attempting mixed linear model with: ~  (1|condition) + (1|batch)

## Fitting the expressionset to the model, this is slow.

## Dividing work into 100 chunks...

## 
## Total:184 s

## Placing factor: condition at the beginning of the model.

## Writing the normalized reads.

## Graphing the normalized reads.

## Attempting mixed linear model with: ~  (1|condition) + (1|batch)

## Fitting the expressionset to the model, this is slow.

## Dividing work into 100 chunks...

## 
## Total:154 s

## Placing factor: condition at the beginning of the model.

## Writing the median reads by factor.

## Common prefix for every salmon-derived output written below (rda/csv/xlsx).
salmon_prefix <- "excel/plos_ntd_host_salmon-v"
## Same sample sheet as the hisat2 expt, but reading the files listed in the
## 'hg3899salmon' column; tx_gene_map supplies the transcript-to-gene mapping.
salmon_plosntd1_expt <- create_expt(
  metadata=sample_sheet,
  gene_info=new_hs_annot,
  tx_gene_map=hs_tx_gene,
  file_column="hg3899salmon",
  savefile=glue::glue("{salmon_prefix}{ver}.rda"))

## Reading the sample metadata.

## Dropped 2 rows from the sample metadata because they were blank.

## The sample definitions comprises: 441 rows(samples) and 68 columns(metadata fields).

## Reading count tables.

## Using the transcript to gene mapping.

## Reading salmon data with tximport.

## Finished reading count tables.

## Matched 19999 annotations and counts.

## Bringing together the count matrix and gene information.

## The mapped IDs are not the rownames of your gene information, changing them now.

## Some annotations were lost in merging, setting them to 'undefined'.

## The final expressionset has 19999 rows and 41 columns.

## Use the 'infectionstatus' metadata column as the experimental condition.
salmon_plosntd1_expt <- set_expt_conditions(salmon_plosntd1_expt, fact="infectionstatus")
## Re-save the expt now that its conditions are set.  This is the same path that
## was given to create_expt() as savefile, so that file is overwritten; try()
## keeps a failed save from aborting the rest of the export.
save_result <- try(save(salmon_plosntd1_expt, file=paste0(salmon_prefix, ver, ".rda")))

## Export the count matrix as csv.  readr::write_csv() never writes row names,
## and the gene IDs exist only as the row names of the count matrix, so they are
## copied into an explicit leading 'gene_id' column; without it the exported
## rows cannot be mapped back to genes.  check.names=FALSE keeps the sample
## names exactly as they appear in the matrix.
## The output file is passed positionally because readr renamed that argument
## from 'path' to 'file' in version 1.4.0; the position is the same in both.
salmon_written_csv <- readr::write_csv(
  data.frame("gene_id"=rownames(exprs(salmon_plosntd1_expt)),
             exprs(salmon_plosntd1_expt),
             check.names=FALSE,
             stringsAsFactors=FALSE),
  glue::glue("{salmon_prefix}{ver}.csv"))
## Write the multi-sheet excel workbook for the same expt.
salmon_written_xls <- write_expt(salmon_plosntd1_expt,
                                 excel=glue::glue("{salmon_prefix}{ver}.xlsx"))

## Writing the first sheet, containing a legend and some summary data.

## Writing the raw reads.

## Graphing the raw reads.

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

## Attempting mixed linear model with: ~  (1|condition) + (1|batch)

## Fitting the expressionset to the model, this is slow.

## Dividing work into 100 chunks...

## 
## Total:96 s

## Placing factor: condition at the beginning of the model.

## Writing the normalized reads.

## Graphing the normalized reads.

## Attempting mixed linear model with: ~  (1|condition) + (1|batch)

## Fitting the expressionset to the model, this is slow.

## Dividing work into 100 chunks...

## 
## Total:118 s

## Placing factor: condition at the beginning of the model.

## Writing the median reads by factor.

## Example for collaborators: restore the hisat2 savefile and list what is now
## in the workspace.  The filename is assembled from 'prefix' and 'ver' rather
## than a hard-coded date, so this always reads the savefile belonging to the
## current version of the sheet.  load() returns the names of the objects it
## restored, which are kept in 'fun'.
fun <- load(paste0(prefix, ver, ".rda"))
ls()

##  [1] "expt"                 "fun"                  "hs_annot"            
##  [4] "hs_tx_gene"           "new_hs_annot"         "old_options"         
##  [7] "plosntd1_expt"        "prefix"               "previous_file"       
## [10] "rundate"              "salmon_plosntd1_expt" "salmon_prefix"       
## [13] "salmon_written_csv"   "salmon_written_xls"   "sample_sheet"        
## [16] "save_result"          "tt"                   "ver"                 
## [19] "written_csv"          "written_xls"

## Show the top-level elements of 'expt' (length, class and mode of each).
summary(expt)

##                  Length Class         Mode     
## title             1     -none-        character
## notes             1     -none-        character
## initial_metadata 69     data.frame    list     
## expressionset     1     ExpressionSet S4       
## design           69     data.frame    list     
## conditions       41     factor        numeric  
## batches          41     factor        numeric  
## samplenames      41     -none-        character
## colors           41     -none-        character
## state             5     -none-        list     
## libsize          41     -none-        numeric

## Pull the ExpressionSet out of 'expt' and show the first row of its counts.
## NOTE(review): despite the salmon_ prefix of this variable, 'expt' appears to
## come from the hisat2 savefile loaded just above -- confirm the intended source.
salmon_plosntd1_exprs <- expt[["expressionset"]]
head(exprs(salmon_plosntd1_exprs), n=1)

##                 hpgl0725 hpgl0726 hpgl0727 hpgl0728 hpgl0729 hpgl0730 hpgl1108
## ENSG00000000003      150      338      387      681      787      458     1017
##                 hpgl1109 hpgl1110 hpgl1111 hpgl1112 hpgl1113 hpgl1114 hpgl1115
## ENSG00000000003      742      742     1585      871     1697      835      383
##                 hpgl1116 hpgl1117 hpgl1118 hpgl1119 hpgl1120 hpgl1121 hpgl1122
## ENSG00000000003      652      934      445      428      424      592      192
##                 hpgl1123 hpgl1124 hpgl1125 hpgl1126 hpgl1127 hpgl1128 hpgl1129
## ENSG00000000003      117      439     1009      664      971      889      396
##                 hpgl1130 hpgl1131 hpgl1132 hpgl1133 hpgl1134 hpgl1135 hpgl1136
## ENSG00000000003      653      408      445     4579     3436      954     1021
##                 hpgl1137 hpgl1138 hpgl1139 hpgl1140 hpgl1141 hpgl1142
## ENSG00000000003     1126     1126     1120      687      900     1508

## First row of the per-sample metadata stored in the ExpressionSet.
head(pData(salmon_plosntd1_exprs), n=1)

##          sampleid    study    lab         host hoststrain hostcelltype
## hpgl0725 hpgl0725 plosntd2 mosser homo_sapiens       <NA>         skin
##          hostcellsource infectstate differentiationmethod stimulation
## hpgl0725           <NA>         yes                  <NA>        <NA>
##          pathogenspecies pathogenstrain pathogenstage  expttime moi
## hpgl0725    lamazonensis           <NA>    amastigote undefined  NA
##          numberparasitecells numberhostcells infectstateold donor celltype
## hpgl0725                  NA              NA       infected  <NA>     skin
##                state studybatch skipped        host1 pathogenspecies1
## hpgl0725 la_infected       <NA>      no homo_sapiens     lamazonensis
##                                                    hg3891salmon
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_hg38_91/quant.sf
##                                                    hg3899salmon
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_hg38_99/quant.sf
##                                                            hg3891hisat2
## hpgl0725 preprocessing/hpgl0725/outputs/hisat2_hg38_91/forward.count.xz
##          mmusculusfile lmajorfile
## hpgl0725          <NA>       <NA>
##                                                         lmexicanafile
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_lmexicana_v36/quant.sf
##          lpanamensisfile lbraziliensisfile
## hpgl0725            <NA>              <NA>
##                                                         lamazonensisfile
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_lamazonensis_v44/quant.sf
##          samplename infectionstatus samplealias parasitedetectionstatus
## hpgl0725       <NA>    diffuse&#10;        <NA>                    <NA>
##          lesionsizemm illnessdurationdays patientage patientsex experimentalias
## hpgl0725           NA                 22y         50          F            <NA>
##          tubelabel tubealias expperson exptdate cellsperwell infectionperiod
## hpgl0725      <NA>      <NA>      <NA>       NA           NA            <NA>
##          media parasiteenrichment parasitesperinfectedcell parasitesper100cells
## hpgl0725  <NA>               <NA>                     <NA>                   NA
##          percentinfectedcells parasitecellrange rnangul libraryconstruction
## hpgl0725                 <NA>              <NA>      NA                <NA>
##          sraaccession condition    batch notes totalreads trimmedreads
## hpgl0725   SRR7275003      skin plosntd2  <NA>         NA     60200944
##          mappedhost mappedparasite        host2 percentmappedhost
## hpgl0725   28558522        7742236 homo_sapiens            0.4744
##          percentmappedparasite file
## hpgl0725                0.1286 null

## First row of the per-gene annotations stored in the ExpressionSet.
head(fData(salmon_plosntd1_exprs), n=1)

##                 ensembl_transcript_id ensembl_gene_id version
## ENSG00000000003       ENST00000373020 ENSG00000000003      15
##                 transcript_version hgnc_symbol
## ENSG00000000003                  9      TSPAN6
##                                                       description
## ENSG00000000003 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
##                   gene_biotype cds_length chromosome_name strand start_position
## ENSG00000000003 protein_coding        738               X      -      100627108
##                 end_position
## ENSG00000000003    100639991

## Record the R session and the hpgltools commit used for this run.
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
## Derive the savefile name from the Rmd filename by removing only its trailing
## ".Rmd" extension (anchored with $, so a ".Rmd" elsewhere in the name is left
## alone) and appending the version tag.  'replacement' is spelled out in full
## instead of relying on partial argument matching.
## NOTE(review): rmd_file is not assigned in the visible code; it must be
## defined before this chunk is evaluated.
this_save <- paste0(sub(pattern="\\.Rmd$", replacement="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))

LS0tCnRpdGxlOiAiVXNlIHRoaXMgc2hlZXQgdG8gZXh0cmFjdCBzdWJzZXRzIG9mIHRoZSBkYXRhLiIKYXV0aG9yOiAiYXRiIGFiZWxld0BnbWFpbC5jb20iCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGZpZ19jYXB0aW9uOiB0cnVlCiAgICBmaWdfaGVpZ2h0OiA3CiAgICBmaWdfd2lkdGg6IDcKICAgIGhpZ2hsaWdodDogdGFuZ28KICAgIGtlZXBfbWQ6IGZhbHNlCiAgICBtb2RlOiBzZWxmY29udGFpbmVkCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIHNlbGZfY29udGFpbmVkOiB0cnVlCiAgICB0aGVtZTogcmVhZGFibGUKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OgogICAgICBjb2xsYXBzZWQ6IGZhbHNlCiAgICAgIHNtb290aF9zY3JvbGw6IGZhbHNlCiAgcm1kZm9ybWF0czo6cmVhZHRoZWRvd246CiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cKICAgIGRmX3ByaW50OiBwYWdlZAogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAgd2lkdGg6IDMwMAogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIHRvY19mbG9hdDogdHJ1ZQogIEJpb2NTdHlsZTo6aHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIHRvY19mbG9hdDogdHJ1ZQotLS0KCjxzdHlsZSB0eXBlPSJ0ZXh0L2NzcyI+CmJvZHksIHRkIHsKICBmb250LXNpemU6IDE2cHg7Cn0KY29kZS5yewogIGZvbnQtc2l6ZTogMTZweDsKfQpwcmUgewogZm9udC1zaXplOiAxNnB4Cn0KPC9zdHlsZT4KCmBgYHtyIG9wdGlvbnMsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoImhwZ2x0b29scyIpCnR0IDwtIGRldnRvb2xzOjpsb2FkX2FsbCgiL2RhdGEvaHBnbHRvb2xzIikKa25pdHI6Om9wdHNfa25pdCRzZXQod2lkdGg9MTIwLAogICAgICAgICAgICAgICAgICAgICBwcm9ncmVzcz1UUlVFLAogICAgICAgICAgICAgICAgICAgICB2ZXJib3NlPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgIGVjaG89VFJVRSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KGVycm9yPVRSVUUsCiAgICAgICAgICAgICAgICAgICAgICBkcGk9OTYpCm9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzPTQsCiAgICAgICAgICAgICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycz1GQUxTRSwKICAgICAgICAgICAgICAgICAgICAgICBrbml0ci5kdXBsaWNhdGUubGFiZWw9ImFsbG93IikKZ2dwbG90Mjo6dGhlbWVfc2V0KGdncGxv
dDI6OnRoZW1lX2J3KGJhc2Vfc2l6ZT0xMCkpCnJ1bmRhdGUgPC0gZm9ybWF0KFN5cy5EYXRlKCksIGZvcm1hdD0iJVklbSVkIikKcHJldmlvdXNfZmlsZSA8LSAiIgp2ZXIgPC0gIjIwMjAwMzA0IgoKIyN0bXAgPC0gc20obG9hZG1lKGZpbGVuYW1lPXBhc3RlMChnc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IiIsIHg9cHJldmlvdXNfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKSkpCiMjcm1kX2ZpbGUgPC0gIjAzX2V4cHJlc3Npb25faW5mZWN0aW9uXzIwMTgwODIyLlJtZCIKYGBgCgojIEV4dHJhY3Qgc29tZSBjb3VudCB0YWJsZXMKCkkgd2lsbCB1c2UgdGhpcyBzaGVldCBmb3IgZXhwb3J0aW5nIGRhdGEgdG8gY29sbGFib3JhdG9ycy4gIEN1cnJlbnRseSwgdGhhdCBpcwpvbmx5IHRoZSBmb2xrcyBpbnRlcmVzdGVkIGluIGRvaW5nIHNvbWUgYW5hbHlzZXMgd2l0aCB0aGUgZmlyc3QgUExPUyBOVEQgcGFwZXIuCgpJdCBpcyB3b3J0aCBub3RpbmcgdGhhdCBJIHJlY2VudGx5IGZvdW5kIHNvbWUgZXJyb3JzIGluIG15IGFjY291bnRpbmcgZm9yIHRoZXNlCmFuZCB0aGUgcGxvc250ZDIgc2FtcGxlcyAoSSBoYWQgdGhlIHBhdGhvZ2VuIHN0cmFpbnMgcmV2ZXJzZWQpLgoKYGBge3IgYW5ub3RhdGlvbnMsIGZpZy5zaG93PSJoaWRlIn0KaHNfYW5ub3QgPC0gbG9hZF9iaW9tYXJ0X2Fubm90YXRpb25zKGhvc3Q9InVzZWFzdC5lbnNlbWJsLm9yZyIsIG92ZXJ3cml0ZT1UUlVFKSRhbm5vdGF0aW9uCnJvd25hbWVzKGhzX2Fubm90KSA8LSBtYWtlLm5hbWVzKAogIHBhc3RlMChoc19hbm5vdFtbImVuc2VtYmxfdHJhbnNjcmlwdF9pZCJdXSwgIi4iLAogICAgICAgICBoc19hbm5vdFtbInRyYW5zY3JpcHRfdmVyc2lvbiJdXSksCiAgdW5pcXVlPVRSVUUpCmhzX3R4X2dlbmUgPC0gaHNfYW5ub3RbLCBjKCJlbnNlbWJsX2dlbmVfaWQiLCAiZW5zZW1ibF90cmFuc2NyaXB0X2lkIildCmhzX3R4X2dlbmVbWyJpZCJdXSA8LSByb3duYW1lcyhoc190eF9nZW5lKQpoc190eF9nZW5lIDwtIGhzX3R4X2dlbmVbLCBjKCJpZCIsICJlbnNlbWJsX2dlbmVfaWQiKV0KbmV3X2hzX2Fubm90IDwtIGhzX2Fubm90CnJvd25hbWVzKG5ld19oc19hbm5vdCkgPC0gbWFrZS5uYW1lcyhoc19hbm5vdFtbImVuc2VtYmxfZ2VuZV9pZCJdXSwgdW5pcXVlPVRSVUUpCmBgYAoKTm90ZSB0aGF0IEkgbmVlZCB0byBvdmVyd3JpdGUgdGhlIHNhdmVmaWxlIG9mIHRoZSBhbm5vdGF0aW9ucyBhbmQgbm90IHVzZSB0aGUKYXJjaGl2ZSBzZXJ2ZXIsIHNpbmNlIDM4djk5IGNhbWUgb3V0IGluIDIwMjAwMS4KCmBgYHtyIGV4dHJhY3RfcGxvc250ZDEsIGZpZy5zaG93PSJoaWRlIn0Kc2FtcGxlX3NoZWV0IDwtIGdsdWU6OmdsdWUoInNhbXBsZV9zaGVldHMvVU1EX2xlaXNobWFuaWFfaG9zdF9tZXRhc2hlZXRfe3Zlcn0ueGxzeCIpCiMjIEFzIG9mIDIwMjAwMjIyLCBJIGhhdmUgb25seSBwZXJmb3JtZWQgaGlzYXQyIG1hcHBpbmcgZm9yIHRoZSBwbG9zIG50ZDIgZGF0
YS4KcHJlZml4IDwtICJleGNlbC9wbG9zX250ZF9ob3N0X2hpc2F0Mi12IgpwbG9zbnRkMV9leHB0IDwtIGNyZWF0ZV9leHB0KG1ldGFkYXRhPXNhbXBsZV9zaGVldCwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmaWxlX2NvbHVtbj0iaGczODkxaGlzYXQyIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBnZW5lX2luZm89bmV3X2hzX2Fubm90LAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIHNhdmVmaWxlPWdsdWU6OmdsdWUoIntwcmVmaXh9e3Zlcn0ucmRhIikpCnBsb3NudGQxX2V4cHQgPC0gc2V0X2V4cHRfY29uZGl0aW9ucyhwbG9zbnRkMV9leHB0LCBmYWN0PSJpbmZlY3Rpb25zdGF0dXMiKQoKd3JpdHRlbl9jc3YgPC0gcmVhZHI6OndyaXRlX2Nzdih4PWFzLmRhdGEuZnJhbWUoZXhwcnMocGxvc250ZDFfZXhwdCkpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHBhdGg9Z2x1ZTo6Z2x1ZSgie3ByZWZpeH17dmVyfS5jc3YiKSkKd3JpdHRlbl94bHMgPC0gd3JpdGVfZXhwdChwbG9zbnRkMV9leHB0LAogICAgICAgICAgICAgICAgICAgICAgICAgIGV4Y2VsPWdsdWU6OmdsdWUoIntwcmVmaXh9e3Zlcn0ueGxzeCIpKQoKc2FsbW9uX3ByZWZpeCA8LSAiZXhjZWwvcGxvc19udGRfaG9zdF9zYWxtb24tdiIKc2FsbW9uX3Bsb3NudGQxX2V4cHQgPC0gY3JlYXRlX2V4cHQoc2FtcGxlX3NoZWV0LAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmaWxlX2NvbHVtbj0iaGczODk5c2FsbW9uIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZ2VuZV9pbmZvPW5ld19oc19hbm5vdCwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHhfZ2VuZV9tYXA9aHNfdHhfZ2VuZSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc2F2ZWZpbGU9Z2x1ZTo6Z2x1ZSgie3NhbG1vbl9wcmVmaXh9e3Zlcn0ucmRhIikpCnNhbG1vbl9wbG9zbnRkMV9leHB0IDwtIHNldF9leHB0X2NvbmRpdGlvbnMoc2FsbW9uX3Bsb3NudGQxX2V4cHQsIGZhY3Q9ImluZmVjdGlvbnN0YXR1cyIpCnNhdmVfcmVzdWx0IDwtIHRyeShzYXZlKHNhbG1vbl9wbG9zbnRkMV9leHB0LCBmaWxlPXBhc3RlMChzYWxtb25fcHJlZml4LCB2ZXIsICIucmRhIikpKQoKc2FsbW9uX3dyaXR0ZW5fY3N2IDwtIHJlYWRyOjp3cml0ZV9jc3YoeD1hcy5kYXRhLmZyYW1lKGV4cHJzKHNhbG1vbl9wbG9zbnRkMV9leHB0KSksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHBhdGg9Z2x1ZTo6Z2x1ZSgie3NhbG1vbl9wcmVmaXh9e3Zlcn0uY3N2IikpCnNhbG1vbl93cml0dGVuX3hscyA8LSB3cml0ZV9leHB0KHNhbG1vbl9wbG9zbnRkMV9leHB0LAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBleGNlbD1nbHVlOjpnbHVlKCJ7c2FsbW9uX3ByZWZpeH17dmVyfS54bHN4IikpCmBgYAoKYGBge3IgZXhhbXBsZV9leHByZXNzaW9uc2V0X2V4dHJhY3Rpb259CmZ1
biA8LSBsb2FkKCJleGNlbC9wbG9zX250ZF9ob3N0X2hpc2F0Mi12MjAyMDAzMDQucmRhIikKbHMoKQpzdW1tYXJ5KGV4cHQpCnNhbG1vbl9wbG9zbnRkMV9leHBycyA8LSBleHB0W1siZXhwcmVzc2lvbnNldCJdXQpoZWFkKGV4cHJzKHNhbG1vbl9wbG9zbnRkMV9leHBycyksIG49MSkKaGVhZChwRGF0YShzYWxtb25fcGxvc250ZDFfZXhwcnMpLCBuPTEpCmhlYWQoZkRhdGEoc2FsbW9uX3Bsb3NudGQxX2V4cHJzKSwgbj0xKQpgYGAKCmBgYHtyIHNhdmVtZSwgZXZhbD1GQUxTRX0KcGFuZGVyOjpwYW5kZXIoc2Vzc2lvbkluZm8oKSkKbWVzc2FnZShwYXN0ZTAoIlRoaXMgaXMgaHBnbHRvb2xzIGNvbW1pdDogIiwgZ2V0X2dpdF9jb21taXQoKSkpCnRoaXNfc2F2ZSA8LSBwYXN0ZTAoZ3N1YihwYXR0ZXJuPSJcXC5SbWQiLCByZXBsYWNlPSIiLCB4PXJtZF9maWxlKSwgIi12IiwgdmVyLCAiLnJkYS54eiIpCm1lc3NhZ2UocGFzdGUwKCJTYXZpbmcgdG8gIiwgdGhpc19zYXZlKSkKdG1wIDwtIHNtKHNhdmVtZShmaWxlbmFtZT10aGlzX3NhdmUpKQpgYGAK

Use this sheet to extract subsets of the data.

atb abelew@gmail.com

2020-03-05

1 Extract some count tables