# Path to the sample sheet (metadata spreadsheet) that is handed to
# create_expt() further down.
sample_sheet <- glue::glue("sample_sheets/tmrc2_samples_202203.xlsx")

1 Introduction

This is mostly just a run of this worksheet to reacquaint myself with it.

This document is intended to provide a general overview of the TMRC2 samples which have thus far been sequenced. In some cases, this includes only those samples starting in 2019; in other instances I am including our previous (2015-2016) samples.

In all cases the processing performed was:

  1. Default trimming was performed.
  2. Hisat2 was used to map the remaining reads against the Leishmania panamensis genome revision 36.
  3. The alignments from hisat2 were used to count reads/gene against the revision 36 annotations with htseq.
  4. These alignments were also passed to the pileup functionality of samtools and the vcf/bcf utilities in order to make a matrix of all observed differences between each sample with respect to the reference.

The analyses in this document use the matrices of counts/gene from #3 and variants/position from #4 in order to provide some images and metrics describing the samples we have sequenced so far.

2 Annotations

Everything which follows depends on the existing TriTrypDB annotations, revision 46, circa 2019. The following block loads a database of these annotations and turns it into a matrix where the rows are genes and the columns are the annotation types provided by TriTrypDB.

The same database was used to create a matrix of orthologous genes between L.panamensis and all of the other species in the TriTrypDB.

# Attach EuPathDB and the L. panamensis OrgDb package quietly; `tt` only holds
# the return values of the library() calls.
orgdb <- "org.Lpanamensis.MHOMCOL81L13.v46.eg.db"
tt <- sm(library(EuPathDB))
tt <- sm(library(orgdb, character.only = TRUE))
pan_db <- org.Lpanamensis.MHOMCOL81L13.v46.eg.db
all_fields <- columns(pan_db)

# Gene-level annotation table, keyed by "gid".
all_lp_annot <- sm(load_orgdb_annotations(
  pan_db,
  keytype = "gid",
  fields = c("annot_gene_entrez_id", "annot_gene_name", "annot_strand",
             "annot_chromosome", "annot_cds_length", "annot_gene_product")))
all_lp_annot <- all_lp_annot[["genes"]]
# Lower-case the product descriptions (presumably so that the lower-case terms
# used by the semantic filter later on match them -- confirm).
all_lp_annot[["annot_gene_product"]] <- tolower(all_lp_annot[["annot_gene_product"]])

# Two-column ID/length table built from the CDS lengths.
lp_lengths <- all_lp_annot[, c("gid", "annot_cds_length")]
colnames(lp_lengths) <- c("ID", "length")

# GO annotations and the ortholog table from the same database.
lp_go <- sm(load_orgdb_go(pan_db))
orthos <- sm(EuPathDB::extract_eupath_orthologs(db = pan_db))

# The hisat2-based expressionset uses this same annotation table.
hisat_annot <- all_lp_annot

3 Load a genome

# Download the TriTrypDB metadata table.  Per the recorded output below, this
# also appends to metadata CSV files under EuPathDB/metadata/.
meta <- EuPathDB::download_eupath_metadata(webservice="tritrypdb")
## Unable to find species names for 2 species.
## Leishmania sp. Ghana MHOM/GH/2012/GH5, Leishmania sp. Namibia MPRO/NA/1975/252/LV425
## Appending to an existing file: EuPathDB/metadata/biocv3.14_tritrypdbv56_metadata.csv
## Appending to an existing file: EuPathDB/metadata/GRanges_biocv3.14_tritrypdbv56_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrgDb_biocv3.14_tritrypdbv56_metadata.csv
## Appending to an existing file: EuPathDB/metadata/TxDb_biocv3.14_tritrypdbv56_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrganismDbi_biocv3.14_tritrypdbv56_metadata.csv
## Appending to an existing file: EuPathDB/metadata/BSgenome_biocv3.14_tritrypdbv56_metadata.csv
## Appending to an existing file: EuPathDB/metadata/biocv3.14_tritrypdbv56_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/GRanges_biocv3.14_tritrypdbv56_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrgDb_biocv3.14_tritrypdbv56_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/TxDb_biocv3.14_tritrypdbv56_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrganismDbi_biocv3.14_tritrypdbv56_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/BSgenome_biocv3.14_tritrypdbv56_invalid_metadata.csv
# Pick the L. panamensis entry out of that metadata.  Per the recorded output,
# two strains match and the first one (MHOM/COL/81/L13) is used.
lp_entry <- EuPathDB::get_eupath_entry(species="Leishmania panamensis", metadata=meta)
## Found the following hits: Leishmania panamensis MHOM/COL/81/L13, Leishmania panamensis strain MHOM/PA/94/PSC-1, choosing the first.
## Using: Leishmania panamensis MHOM/COL/81/L13.
# List the metadata fields available for the chosen entry.
colnames(lp_entry)
##  [1] "AnnotationVersion"  "AnnotationSource"   "BiocVersion"       
##  [4] "DataProvider"       "Genome"             "GenomeSource"      
##  [7] "GenomeVersion"      "NumArrayGene"       "NumChipChipGene"   
## [10] "NumChromosome"      "NumCodingGene"      "NumCommunity"      
## [13] "NumContig"          "NumEC"              "NumEST"            
## [16] "NumGene"            "NumGO"              "NumOrtholog"       
## [19] "NumOtherGene"       "NumPopSet"          "NumProteomics"     
## [22] "NumPseudogene"      "NumRNASeq"          "NumRTPCR"          
## [25] "NumSNP"             "NumTFBS"            "Organellar"        
## [28] "ReferenceStrain"    "MegaBP"             "PrimaryKey"        
## [31] "ProjectID"          "RecordClassName"    "SourceID"          
## [34] "SourceVersion"      "TaxonomyID"         "TaxonomyName"      
## [37] "URLGenome"          "URLGFF"             "URLProtein"        
## [40] "Coordinate_1_based" "Maintainer"         "SourceUrl"         
## [43] "Tags"               "BsgenomePkg"        "GrangesPkg"        
## [46] "OrganismdbiPkg"     "OrgdbPkg"           "TxdbPkg"           
## [49] "Taxon"              "Genus"              "Species"           
## [52] "Strain"             "BsgenomeFile"       "GrangesFile"       
## [55] "OrganismdbiFile"    "OrgdbFile"          "TxdbFile"          
## [58] "GenusSpecies"       "TaxonUnmodified"    "TaxonCanonical"    
## [61] "TaxonXref"
# Name of the pre-built BSgenome package for the reference strain; the
# commented-out line is the call that would build it from the EuPathDB entry.
testing_panamensis <- "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53"
## testing_panamensis <- EuPathDB::make_eupath_bsgenome(entry=lp_entry, eu_version="v46")
# The name is already a character string, so it is passed to library() as-is.
library(testing_panamensis, character.only = TRUE)
## Loading required package: BSgenome
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: rtracklayer
# Look up the object which carries the same name as the package just attached.
genome <- get0(testing_panamensis)

4 TODO:

Resequence samples: TMRC20002, TMRC20006, TMRC20004 (maybe TMRC20008 and TMRC20029)

5 Generate Expressionsets and Sample Estimation

The process of sample estimation takes two primary inputs:

  1. The sample sheet, which contains all the metadata we currently have on hand, including filenames for the outputs of #3 and #4 above.
  2. The gene annotations.

An expressionset is a data structure used in R to examine RNASeq data. It is composed of annotations, metadata, and expression data. In the case of our processing pipeline, the location of the expression data is provided by the filenames in the metadata.

The first lines of the following block create the Expressionset. All of the following lines perform various normalizations and generate plots from it.

5.1 Notes

The following samples are much lower coverage:

  • TMRC20002
  • TMRC20006
  • TMRC20007
  • TMRC20008

20210610: I made some manual changes to the sample sheet which I downloaded, filling in some zymodeme with ‘unknown’

5.2 TODO:

  1. Do the multi-gene family removal right here instead of way down at the bottom
  2. Add zymodeme snps to the annotation later.
  3. Start phylogenetic analysis of variant table.
# Metadata columns to sanitize and convert to factors once the expressionset
# exists.
# NOTE(review): the original vector listed "zymodemecategorical" twice; the
# duplicate is dropped here.  If a different column was intended in that
# slot, add it.
sanitize_columns <- c("passagenumber", "clinicalresponse", "clinicalcategorical",
                      "zymodemecategorical")
# Build the expressionset from the sample sheet and the annotations, then:
#   - use the zymodeme column as the experimental condition,
#   - apply the nonzero = 8550 sample filter,
#   - apply the coverage = 5000000 sample filter,
#   - remove genes whose product annotation matches amastin, gp63 or
#     leishmanolysin,
#   - sanitize the listed metadata columns and store them as factors.
lp_expt <- create_expt(sample_sheet,
                       gene_info = hisat_annot,
                       annotation = orgdb,
                       id_column = "hpglidentifier",
                       file_column = "lpanamensisv36hisatfile") %>%
  set_expt_conditions(fact = "zymodemecategorical") %>%
  subset_expt(nonzero = 8550) %>%
  subset_expt(coverage = 5000000) %>%
  semantic_expt_filter(semantic = c("amastin", "gp63", "leishmanolysin"),
                       semantic_column = "annot_gene_product") %>%
  sanitize_expt_metadata(columns = sanitize_columns) %>%
  set_expt_factors(columns = sanitize_columns, class = "factor")
## Reading the sample metadata.
## Dropped 11 rows from the sample metadata because they were blank.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 110 rows(samples) and 64 columns(metadata fields).
## Warning in create_expt(sample_sheet, gene_info = hisat_annot, annotation =
## orgdb, : Some samples were removed when cross referencing the samples against
## the count data.
## Matched 8778 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 8778 rows and 103 columns.
## The samples (and read coverage) removed when filtering 8550 non-zero genes are:
## TMRC20002 TMRC20006 
##  11681227   6670348
## subset_expt(): There were 103, now there are 101 samples.
## The samples removed (and read coverage) when filtering samples with less than 5e+06 reads are:
## TMRC20004 TMRC20029 
##    564812   1658096
## subset_expt(): There were 101, now there are 99 samples.
## semantic_expt_filter(): Removed 68 genes.
# Basic quality-control plots for the filtered expressionset.  Each pp() call
# is given a file name and a plot object (presumably it writes the image to
# that path -- confirm against pp()'s documentation).

# Library size per sample.
libsizes <- plot_libsize(lp_expt)
pp(file = "images/lp_expt_libsizes.png", image = libsizes$plot, width = 14, height = 9)
## Warning in pp(file = "images/lp_expt_libsizes.png", image = libsizes$plot, :
## There is no device to shut down.
## I think samples 7,10 should be removed at minimum, probably also 9,11
# Non-zero gene plot per sample.
nonzero <- plot_nonzero(lp_expt)
pp(file = "images/lp_nonzero.png", image = nonzero$plot, width = 9, height = 9)
## Warning: ggrepel: 80 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning in pp(file = "images/lp_nonzero.png", image = nonzero$plot, width = 9, :
## There is no device to shut down.
## Warning: ggrepel: 80 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Per-sample boxplot of counts (log scale, per the recorded message).
lp_box <- plot_boxplot(lp_expt)
## 7895 entries are 0.  We are on a log scale, adding 1 to the data.
pp(file = "images/lp_expt_boxplot.png", image = lp_box, width = 12, height = 9)
## Warning in pp(file = "images/lp_expt_boxplot.png", image = lp_box, width = 12, :
## There is no device to shut down.
# Library sizes before/after filtering; both plots are printed inline.
filter_plot <- plot_libsize_prepost(lp_expt)
filter_plot$lowgene_plot
## Warning: Using alpha for a discrete variable is not advised.
filter_plot$count_plot

5.3 Distribution Visualization

Najib’s favorite plots are of course the PCA/t-SNE. These are nice to look at in order to get a sense of the relationships between samples. They also provide a good opportunity to see what happens when one applies different normalizations, surrogate analyses, filters, etc. In addition, one may set different experimental factors as the primary ‘condition’ (usually the color of plots) and surrogate ‘batches’.

5.4 By Susceptibility

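Take column ‘Q’ in the sample sheet and make a categorical version of it with these parameters (given here as percentages; the code below applies them to fractional values, e.g. 0.35 for 35):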

  • 0 <= x <= 35 is resistant
  • 36 <= x <= 48 is ambiguous
  • 49 <= x is sensitive
# Convert the historical susceptibility measurements into categories.
# Entries which are not numeric become NA under as.numeric() and are
# labelled "unknown".
starting <- as.numeric(pData(lp_expt)[["susceptibilityinfectionreduction32ugmlsbvhistoricaldata"]])
## Warning: NAs introduced by coercion
na_idx <- is.na(starting)
sus_categorical <- rep("unknown", length(starting))

# Every non-missing value receives exactly one label.  The original cutoffs
# (<= 0.35, 0.36 to 0.48, >= 0.49) left any value strictly between 0.35 and
# 0.36, or between 0.48 and 0.49, unlabelled (it stayed as a number string);
# the ambiguous band now covers everything between the two outer cutoffs.
# Masking with !na_idx also keeps NA out of the logical subscripts.
resist_idx <- !na_idx & starting <= 0.35
indeterminant_idx <- !na_idx & starting > 0.35 & starting < 0.49
susceptible_idx <- !na_idx & starting >= 0.49
sus_categorical[resist_idx] <- "resistant"
sus_categorical[indeterminant_idx] <- "ambiguous"
sus_categorical[susceptible_idx] <- "sensitive"

# Store the category in the sample metadata and show the tally.
pData(lp_expt)[["sus_category"]] <- sus_categorical
table(sus_categorical)
## sus_categorical
## ambiguous resistant sensitive   unknown 
##         5        12        32        50
# Colors keyed by the zymodeme condition labels.
clinical_colors <- list(
    "z1.0" = "#333333",
    "z2.0" = "#555555",
    "z3.0" = "#777777",
    "z2.1" = "#874400",
    "z2.2" = "#0000cc",
    "z2.3" = "#cc0000",
    "z2.4" = "#df7000",
    "unknown" = "#cbcbcb",
    "null" = "#000000")
# Keep the existing (zymodeme) condition, use the susceptibility category
# vector as the batch, and attach the colors.
clinical_samples <- lp_expt %>%
  set_expt_batches(fact = sus_categorical) %>%
  set_expt_colors(clinical_colors)
# Tally of samples per condition.
table(pData(clinical_samples)[["condition"]])
## 
##    null unknown    z1.0    z2.0    z2.1    z2.2    z2.3    z2.4    z3.0 
##       2       4       1       1       7      42      39       2       1
# Normalize all samples (low-count filter, cpm, quantile, log2) and plot a PCA.
clinical_norm <- normalize_expt(clinical_samples, norm = "quant", transform = "log2",
                                   convert = "cpm", filter = TRUE)
## Removing 140 low-count genes (8570 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
zymo_pca <- plot_pca(clinical_norm, plot_title = "PCA of parasite expression values",
                     plot_labels = FALSE)
pp(file = "images/zymo_pca_sus_shape.png", image = zymo_pca$plot)
## Warning in pp(file = "images/zymo_pca_sus_shape.png", image = zymo_pca$plot):
## There is no device to shut down.
# Repeat the PCA using only the z2.2 and z2.3 samples.
only_two_types <- subset_expt(clinical_samples, subset = "condition=='z2.3'|condition=='z2.2'")
## subset_expt(): There were 99, now there are 81 samples.
only_two_norm <- sm(normalize_expt(only_two_types, norm = "quant", transform = "log2",
                                   convert = "cpm", batch = FALSE, filter = TRUE))
onlytwo_pca <- plot_pca(only_two_norm, plot_title = "PCA of z2.2 and z2.3 parasite expression values",
                     plot_labels = FALSE)
# NOTE(review): this image is written as .pdf while the others are .png --
# confirm that is intended.
pp(file = "images/zymo_z2.2_z2.3_pca_sus_shape.pdf", image = onlytwo_pca$plot)

# 3D rendering built from the all-sample PCA result.
zymo_3dpca <- plot_3d_pca(zymo_pca)

zymo_3dpca$plot
# t-SNE of the same samples; no norm = argument is given for this one.
clinical_n <- sm(normalize_expt(clinical_samples, transform = "log2",
                                convert = "cpm", batch = FALSE, filter = TRUE))
zymo_tsne <- plot_tsne(clinical_n, plot_title = "TSNE of parasite expression values")
zymo_tsne$plot
## Warning: ggrepel: 60 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Same samples with batch = "svaseq" added to the normalization, followed by
# PCA and t-SNE plots of the adjusted data.
clinical_nb <- normalize_expt(clinical_samples, convert = "cpm", transform = "log2",
                         filter = TRUE, batch = "svaseq")
## Removing 140 low-count genes (8570 remaining).
## batch_counts: Before batch/surrogate estimation, 1477 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 4969 entries are 0<x<1: 1%.
## Setting 621 low elements to zero.
## transform_counts: Found 621 values equal to 0, adding 1 to the matrix.
clinical_nb_pca <- plot_pca(clinical_nb, plot_title = "PCA of parasite expression values",
                            plot_labels = FALSE)
pp(file = "images/clinical_nb_pca_sus_shape.png", image = clinical_nb_pca$plot)
## Warning in pp(file = "images/clinical_nb_pca_sus_shape.png", image =
## clinical_nb_pca$plot): There is no device to shut down.
clinical_nb_tsne <- plot_tsne(clinical_nb, plot_title = "TSNE of parasite expression values")
clinical_nb_tsne$plot
## Warning: ggrepel: 41 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Correlation heatmap of the quantile-normalized data.  The title is given as
# a single-line string: the original literal was wrapped across source lines,
# which embedded a newline and a run of indentation spaces in the title.
corheat <- plot_corheat(clinical_norm,
                        plot_title = "Correlation heatmap of parasite expression values")
corheat$plot

# plot_sm() on the same data; per its recorded message it performs a
# correlation.
plot_sm(clinical_norm)$plot
## Performing correlation.

5.5 By Cure/Fail status

# Colors keyed by cure/fail status.  Per the recorded warning, "notapplicable"
# does not occur among the remaining samples.
cf_colors <- list(
    "cure" = "#006f00",
    "fail" = "#9dffa0",
    "unknown" = "#cbcbcb",
    "notapplicable" = "#000000")
# Condition = clinical outcome, batch = susceptibility category vector.
cf_expt <- set_expt_conditions(lp_expt, fact = "clinicalcategorical") %>%
  set_expt_batches(fact = sus_categorical) %>%
  set_expt_colors(cf_colors)
## Warning in set_expt_colors(., cf_colors): Colors for the following categories
## are not being used: notapplicable.
# Tally of samples per outcome.
table(pData(cf_expt)[["condition"]])
## 
##    cure    fail unknown 
##      37      37      25
# PCA of the cure/fail expressionset after quantile normalization.
cf_norm <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
                          norm = "quant", filter = TRUE)
## Removing 140 low-count genes (8570 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
start_cf <- plot_pca(cf_norm, plot_title = "PCA of parasite expression values",
                     plot_labels = FALSE)
pp(file = "images/cf_sus_shape.png", image = start_cf$plot)
## Warning in pp(file = "images/cf_sus_shape.png", image = start_cf$plot): There is
## no device to shut down.
# The same with batch = "svaseq" added; note the recorded warning about
# combining quantile normalization with sva.
cf_nb <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
                        norm = "quant", filter = TRUE, batch = "svaseq")
## Warning in normalize_expt(cf_expt, convert = "cpm", transform = "log2", :
## Quantile normalization and sva do not always play well together.
## Removing 140 low-count genes (8570 remaining).
## batch_counts: Before batch/surrogate estimation, 2 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 6129 entries are 0<x<1: 1%.
## Setting 218 low elements to zero.
## transform_counts: Found 218 values equal to 0, adding 1 to the matrix.
cf_nb_pca <- plot_pca(cf_nb, plot_title = "PCA of parasite expression values",
                      plot_labels = FALSE)
pp(file = "images/cf_sus_share_nb.png", image = cf_nb_pca$plot)
## Warning in pp(file = "images/cf_sus_share_nb.png", image = cf_nb_pca$plot):
## There is no device to shut down.
# NOTE(review): this repeats the cf_norm computation at the top of this chunk
# with the same arguments (only their order differs).
cf_norm <- normalize_expt(cf_expt, transform = "log2", convert = "cpm",
                          filter = TRUE, norm = "quant")
## Removing 140 low-count genes (8570 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
# Relate the first six principal components to four metadata factors; the
# ANOVA p-values and the correlation heatmap are printed below.
test <- pca_information(cf_norm,
                        expt_factors = c("clinicalcategorical", "zymodemecategorical",
                                         "pathogenstrain", "passagenumber"),
                        num_components = 6, plot_pcas = TRUE)
test$anova_p
##                           PC1     PC2     PC3       PC4     PC5     PC6
## clinicalcategorical 3.565e-01 0.33131 0.13900 1.743e-03 0.50959 0.50856
## zymodemecategorical 4.993e-06 0.05503 0.02020 8.292e-03 0.01682 0.01981
## pathogenstrain      5.438e-01 0.69615 0.55415 5.312e-05 0.02296 0.19502
## passagenumber       9.413e-01 0.39295 0.02494 7.185e-03 0.56085 0.03434
test$cor_heatmap

# Colors keyed by susceptibility category.
sus_colors <- list(
    "resistant" = "#8563a7",
    "sensitive" = "#8d0000",
    "ambiguous" = "#cbcbcb",
    "unknown" = "#000000")
# Condition = susceptibility category, batch = zymodeme.  The two subsets drop
# the samples whose batch is 'z24' and then 'z21' (2 and 7 samples, per the
# recorded output).
sus_expt <- set_expt_conditions(lp_expt, fact = "sus_category") %>%
  set_expt_batches(fact = "zymodemecategorical") %>%
  set_expt_colors(colors = sus_colors) %>%
  subset_expt(subset = "batch!='z24'") %>%
  subset_expt(subset = "batch!='z21'")
## subset_expt(): There were 99, now there are 97 samples.
## subset_expt(): There were 97, now there are 90 samples.
# PCA after quantile normalization.
sus_norm <- normalize_expt(sus_expt, transform = "log2", convert = "cpm",
                           norm = "quant", filter = TRUE)
## Removing 141 low-count genes (8569 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
sus_pca <- plot_pca(sus_norm, plot_title = "PCA of parasite expression values",
                    plot_labels = FALSE)
pp(file = "images/sus_norm_pca.png", image = sus_pca[["plot"]])
## Warning in pp(file = "images/sus_norm_pca.png", image = sus_pca[["plot"]]):
## There is no device to shut down.
# PCA with batch = "svaseq" and no norm = argument.
sus_nb <- normalize_expt(sus_expt, transform = "log2", convert = "cpm",
                         batch = "svaseq", filter = TRUE)
## Removing 141 low-count genes (8569 remaining).
## batch_counts: Before batch/surrogate estimation, 1325 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 4446 entries are 0<x<1: 1%.
## Setting 288 low elements to zero.
## transform_counts: Found 288 values equal to 0, adding 1 to the matrix.
sus_nb_pca <- plot_pca(sus_nb, plot_title = "PCA of parasite expression values",
                       plot_labels = FALSE)
pp(file = "images/sus_nb_pca.png", image = sus_nb_pca[["plot"]])
## Warning in pp(file = "images/sus_nb_pca.png", image = sus_nb_pca[["plot"]]):
## There is no device to shut down.

6 Zymodeme analyses

The following sections perform a series of analyses which seek to elucidate differences between the zymodemes 2.2 and 2.3 either through differential expression or variant profiles.

6.1 Differential expression

6.1.1 With respect to zymodeme attribution

TODO: Do this with and without sva and compare the results.

# Differential expression between zymodemes z2.2 and z2.3, once without a
# batch term and once with svaseq.  `ver` (the version tag interpolated into
# the output file names) is not defined in this part of the document; the
# recorded output shows it expanding to 202203.
zy_expt <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
## subset_expt(): There were 99, now there are 81 samples.
# NOTE(review): zy_norm is not used by any call visible in this document.
zy_norm <- normalize_expt(zy_expt, filter = TRUE, convert = "cpm", norm = "quant")
## Removing 158 low-count genes (8552 remaining).
# Pairwise comparisons with model_batch = FALSE; combined tables and the
# significant subset are written to excel/ (plus a .gmt file under gmt/).
zy_de_nobatch <- all_pairwise(zy_expt, filter = TRUE, model_batch = FALSE)
## Plotting a PCA before surrogate/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
zy_table_nobatch <- combine_de_tables(
    zy_de_nobatch, excel = glue::glue("excel/zy_tables_nobatch-v{ver}.xlsx"),
    gmt = glue::glue("gmt/zymodeme_nobatch-v{ver}.gmt"))
## Deleting the file excel/zy_tables_nobatch-v202203.xlsx before writing the tables.
zy_sig_nobatch <- extract_significant_genes(
    zy_table_nobatch,
    excel = glue::glue("excel/zy_sig_nobatch-v{ver}.xlsx"))
## Deleting the file excel/zy_sig_nobatch-v202203.xlsx before writing the tables.
# The same comparisons with model_batch = "svaseq".
zy_de_sva <- all_pairwise(zy_expt, filter = TRUE, model_batch = "svaseq")
## batch_counts: Before batch/surrogate estimation, 920 entries are x==0: 0%.
## Plotting a PCA before surrogate/batch inclusion.
## Using svaseq to visualize before/after batch inclusion.
## Performing a test normalization with: raw
## Removing 0 low-count genes (8552 remaining).
## batch_counts: Before batch/surrogate estimation, 920 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 2988 entries are 0<x<1: 0%.
## Setting 407 low elements to zero.
## transform_counts: Found 407 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
zy_table_sva <- combine_de_tables(
    zy_de_sva, excel = glue::glue("excel/zy_tables_sva-v{ver}.xlsx"),
    gmt = glue::glue("gmt/zymodeme_sva-v{ver}.gmt"))
zy_sig_sva <- extract_significant_genes(
    zy_table_sva,
    excel = glue::glue("excel/zy_sig_sva-v{ver}.xlsx"))

6.1.2 Images of zymodeme DE

# DESeq2 MA plot for the z2.3 vs z2.2 contrast.  The combined tables are stored
# as zy_table_sva and zy_table_nobatch; no object named `zy_table` exists, so
# the original call failed with "object 'zy_table' not found".  The
# svaseq-adjusted table is used here, in line with the svaseq-based cure/fail
# and susceptibility comparisons.
pp(file = "images/zymo_ma.png", image = zy_table_sva[["plots"]][["z23_vs_z22"]][["deseq_ma_plots"]][["plot"]])

6.2 With respect to cure/failure

In contrast, we can search for genes which are differentially expressed with respect to cure/failure status.

# Pairwise differential expression by cure/fail status with svaseq in the
# model; the combined and significant tables are written to excel/.  `ver` is
# defined outside this part of the document.
cf_de <- all_pairwise(cf_expt, filter = TRUE, model_batch = "svaseq")
## batch_counts: Before batch/surrogate estimation, 1477 entries are x==0: 0%.
## Plotting a PCA before surrogate/batch inclusion.
## Using svaseq to visualize before/after batch inclusion.
## Performing a test normalization with: raw
## Removing 0 low-count genes (8570 remaining).
## batch_counts: Before batch/surrogate estimation, 1477 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 4969 entries are 0<x<1: 1%.
## Setting 326 low elements to zero.
## transform_counts: Found 326 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
cf_table <- combine_de_tables(cf_de, excel = glue::glue("excel/cf_tables-v{ver}.xlsx"))
## Deleting the file excel/cf_tables-v202203.xlsx before writing the tables.
cf_sig <- extract_significant_genes(cf_table, excel = glue::glue("excel/cf_sig-v{ver}.xlsx"))
## Deleting the file excel/cf_sig-v202203.xlsx before writing the tables.
# DESeq2 MA plot for the fail vs cure contrast.
pp(file = "images/cf_ma.png", image = cf_table[["plots"]][["fail_vs_cure"]][["deseq_ma_plots"]][["plot"]])
## Warning in pp(file = "images/cf_ma.png", image = cf_table[["plots"]]
## [["fail_vs_cure"]][["deseq_ma_plots"]][["plot"]]): There is no device to shut
## down.

6.3 With respect to susceptibility

Finally, we can use our category of susceptibility and look for genes which change from sensitive to resistant. Keep in mind, though, that for the moment we have a lot of ambiguous and unknown strains.

# Pairwise differential expression by susceptibility category with svaseq in
# the model; the combined and significant tables are written to excel/.
# `ver` is defined outside this part of the document.
sus_de <- all_pairwise(sus_expt, filter = TRUE, model_batch = "svaseq")
## batch_counts: Before batch/surrogate estimation, 1325 entries are x==0: 0%.
## Plotting a PCA before surrogate/batch inclusion.
## Using svaseq to visualize before/after batch inclusion.
## Performing a test normalization with: raw
## Removing 0 low-count genes (8569 remaining).
## batch_counts: Before batch/surrogate estimation, 1325 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 4446 entries are 0<x<1: 1%.
## Setting 288 low elements to zero.
## transform_counts: Found 288 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
sus_table <- combine_de_tables(sus_de, excel = glue::glue("excel/sus_tables-v{ver}.xlsx"))
## Deleting the file excel/sus_tables-v202203.xlsx before writing the tables.
sus_sig <- extract_significant_genes(sus_table, excel = glue::glue("excel/sus_sig-v{ver}.xlsx"))
## Deleting the file excel/sus_sig-v202203.xlsx before writing the tables.
# Show the first 20 rows of the DESeq2 "ups" table for sensitive vs resistant.
knitr::kable(head(sus_sig$deseq$ups$sensitive_vs_resistant, n = 20))
gid annotgeneproduct annotgenetype chromosome start end strand annotgeneentrezid annotgenename annotstrand annotchromosome annotcdslength length deseq_logfc deseq_adjp edger_logfc edger_adjp limma_logfc limma_adjp basic_nummed basic_denmed basic_numvar basic_denvar basic_logfc basic_t basic_p basic_adjp deseq_basemean deseq_lfcse deseq_stat deseq_p ebseq_fc ebseq_logfc ebseq_c1mean ebseq_c2mean ebseq_mean ebseq_var ebseq_postfc ebseq_ppee ebseq_ppde ebseq_adjp edger_logcpm edger_lr edger_p limma_ave limma_t limma_b limma_p limma_adjp_ihw deseq_adjp_ihw edger_adjp_ihw ebseq_adjp_ihw basic_adjp_ihw lfc_meta lfc_var lfc_varbymed p_meta p_var
LPAL13_000017600 LPAL13_000017600 hypothetical protein, conserved protein coding LPAL13_SCAF000146 359 586 + forward Not Assigned 228.0 227 6.516 0 6.498 0e+00 6.244 0.0281 4.2460 -1.2210 5.0139 2.5901 5.468 8.898 0.0000 0.0000 533.60 0.6889 9.458 0 76.865 6.264 12.7287 979.15 709.45 4.389e+05 61.361 0e+00 1.0000 0e+00 4.4350 49.41 0 2.2490 2.824 -2.3140 0.0059 2.917e-02 5.373e-18 1.482e-09 9.312e-01 1.322e-07 7.222 6.597e+00 9.134e-01 1.957e-03 1.149e-05
LPAL13_000053200 LPAL13_000053200 hypothetical protein protein coding LPAL13_SCAF000804 5037 5249 - reverse Not Assigned 213.0 212 8.774 0 10.020 0e+00 5.877 0.0118 0.9857 -4.2710 8.6300 0.1283 5.257 9.778 0.0000 0.0000 69.19 1.1080 7.921 0 13050.813 13.672 0.0000 130.50 94.08 8.140e+03 36.589 1e+00 0.0000 1e+00 1.5420 44.83 0 -0.9567 3.233 -1.9370 0.0017 1.194e-02 1.551e-12 1.086e-08 0.000e+00 7.937e-09 8.675 2.464e+00 2.840e-01 5.770e-04 9.988e-07
LPAL13_300029400 LPAL13_300029400 hypothetical protein, conserved protein coding LpaL13_30 853953 854150 - reverse 30 198.0 197 6.298 0 6.200 0e+00 4.836 0.0016 1.5930 -2.6400 1.8368 1.9475 4.233 8.993 0.0000 0.0000 77.04 0.7707 8.172 0 62.352 5.962 2.0413 127.89 92.77 9.658e+03 23.160 0e+00 0.0000 0e+00 1.6540 43.35 0 -0.1533 4.044 0.1172 0.0001 1.529e-03 2.730e-13 1.745e-08 0.000e+00 1.349e-06 5.767 2.744e-01 4.758e-02 3.767e-05 4.256e-09
LPAL13_200050100 LPAL13_200050100 hypothetical protein protein coding LpaL13_20.1 1627529 1627717 + forward 20.1 189.0 188 5.211 0 5.162 0e+00 4.533 0.0020 2.2530 -1.9310 1.6946 2.4470 4.184 8.227 0.0000 0.0000 98.56 0.6293 8.282 0 24.142 4.593 8.0130 193.68 141.87 2.250e+04 17.160 0e+00 1.0000 0e+00 2.0340 44.14 0 0.5193 3.963 0.2097 0.0002 1.857e-03 1.486e-13 1.457e-08 9.700e-01 9.153e-06 5.055 6.586e-01 1.303e-01 5.027e-05 7.580e-09
LPAL13_350011800 LPAL13_350011800 hypothetical protein, conserved protein coding LpaL13_35 171009 171242 + forward 35 234.0 233 5.108 0 5.083 0e+00 4.441 0.0040 2.6630 -1.2370 2.7805 1.1593 3.901 9.037 0.0000 0.0000 153.50 0.6217 8.216 0 31.869 4.994 9.1905 293.20 213.94 5.991e+04 23.676 0e+00 1.0000 0e+00 2.6400 42.66 0 0.9495 3.688 -0.2933 0.0004 3.779e-03 2.141e-13 2.149e-08 9.312e-01 4.837e-08 4.965 6.069e-01 1.222e-01 1.307e-04 5.125e-08
LPAL13_000014000 LPAL13_000014000 hypothetical protein protein coding LPAL13_SCAF000119 655 942 + forward Not Assigned 288.0 287 4.006 0 3.993 0e+00 3.886 0.0047 2.3310 -1.1660 1.7387 1.9422 3.497 7.491 0.0000 0.0000 108.20 0.5494 7.292 0 16.712 4.063 11.6908 195.53 144.23 1.334e+04 13.033 0e+00 1.0000 0e+00 2.1660 37.41 0 0.8882 3.624 -0.4496 0.0005 4.803e-03 1.371e-10 1.349e-07 9.496e-01 1.519e-05 4.198 7.557e-01 1.800e-01 1.618e-04 7.857e-08
LPAL13_000035500 LPAL13_000035500 hypothetical protein, conserved protein coding LPAL13_SCAF000492 7045 7410 + forward Not Assigned 366.0 365 4.408 0 4.397 0e+00 3.919 0.0229 4.1870 0.7591 4.1887 0.5867 3.428 7.992 0.0000 0.0000 433.50 0.5555 7.936 0 18.070 4.176 49.2017 889.23 654.80 3.850e+05 17.441 0e+00 1.0000 0e+00 4.1390 41.36 0 2.7560 2.926 -2.0630 0.0044 2.182e-02 1.495e-12 2.978e-08 9.440e-01 8.570e-08 4.495 7.589e-01 1.688e-01 1.458e-03 6.377e-06
LPAL13_000040700 LPAL13_000040700 hypothetical protein, conserved protein coding LPAL13_SCAF000598 54 1067 + forward Not Assigned 1014.0 1013 6.523 0 6.927 0e+00 3.386 0.0324 -0.9791 -4.0710 6.0269 0.5495 3.092 6.309 0.0000 0.0000 18.56 0.9725 6.708 0 222.274 7.796 0.1376 32.79 23.68 8.649e+02 9.466 0e+00 0.0000 0e+00 -0.1583 32.71 0 -2.1450 2.757 -2.7910 0.0071 3.235e-02 5.465e-09 8.909e-07 0.000e+00 7.637e-06 5.605 7.924e-01 1.414e-01 2.370e-03 1.685e-05
LPAL13_220019500 LPAL13_220019500 hypothetical protein protein coding LpaL13_22 578260 578538 + forward 22 279.0 278 3.782 0 3.776 0e+00 3.358 0.0185 3.3850 0.3006 2.5163 0.8056 3.084 8.009 0.0000 0.0000 268.50 0.4942 7.652 0 13.781 3.785 31.5983 435.57 322.84 7.699e+04 12.699 0e+00 1.0000 0e+00 3.4530 41.34 0 2.3990 3.025 -1.8040 0.0033 1.757e-02 1.065e-11 4.229e-08 9.312e-01 1.917e-07 3.854 3.461e-01 8.980e-02 1.088e-03 3.549e-06
LPAL13_040019400 LPAL13_040019400 hypothetical protein protein coding LpaL13_04 440768 441127 - reverse 4 360.0 359 5.015 0 4.927 0e+00 3.338 0.0046 -0.5200 -3.4430 1.8788 1.1668 2.923 7.358 0.0000 0.0000 22.46 0.8429 5.949 0 42.411 5.406 0.8143 34.95 25.42 1.781e+03 8.529 0e+00 0.0000 0e+00 0.0017 27.93 0 -1.8490 3.634 -1.1620 0.0005 4.329e-03 2.681e-07 6.608e-06 0.000e+00 4.484e-06 4.447 1.892e-03 4.253e-04 1.568e-04 7.368e-08
LPAL13_080009800 LPAL13_080009800 tuzin, putative protein coding LpaL13_08 184254 185207 + forward 8 954.0 953 3.219 0 3.213 0e+00 3.088 0.0002 7.1420 4.2810 1.6072 0.4529 2.861 9.559 0.0000 0.0000 2887.00 0.4442 7.247 0 8.759 3.131 623.5924 5462.11 4111.83 1.567e+07 8.976 0e+00 1.0000 0e+00 6.8640 39.58 0 5.9790 4.856 3.7210 0.0000 2.037e-04 1.918e-10 6.740e-08 1.000e+00 4.597e-09 3.246 1.516e-01 4.669e-02 1.733e-06 9.013e-12
LPAL13_000011700 LPAL13_000011700 hypothetical protein protein coding LPAL13_SCAF000076 101 364 - reverse Not Assigned 264.0 263 21.850 0 7.497 0e+00 3.325 0.0067 -1.4690 -4.2710 6.0895 0.1283 2.803 6.158 0.0000 0.0000 12.37 1.2010 18.190 0 2517.168 11.298 0.0000 25.16 18.14 5.252e+02 7.665 1e+00 0.0000 1e+00 -0.6478 25.02 0 -2.8750 3.473 -1.5740 0.0008 6.729e-03 2.560e-70 2.064e-05 0.000e+00 2.056e-05 8.823 3.037e+01 3.442e+00 2.672e-04 2.137e-07
LPAL13_170012200 LPAL13_170012200 hypothetical protein protein coding LpaL13_17 289959 290222 - reverse 17 264.0 263 3.705 0 3.696 0e+00 3.220 0.0044 3.1250 0.3747 3.6684 0.2339 2.750 7.408 0.0000 0.0000 206.60 0.5578 6.643 0 13.493 3.754 31.6002 426.50 316.29 9.550e+04 12.517 0e+00 1.0000 0e+00 3.0830 31.29 0 1.6800 3.647 -0.1488 0.0004 4.154e-03 7.819e-09 1.658e-06 9.700e-01 4.995e-07 3.609 1.217e-01 3.373e-02 1.498e-04 6.729e-08
LPAL13_170014500 LPAL13_170014500 hypothetical protein, conserved protein coding LpaL13_17 361708 362040 + forward 17 333.0 332 5.295 0 5.135 3e-04 3.409 0.0069 -0.6621 -3.3780 6.1072 1.5098 2.716 4.780 0.0000 0.0004 19.82 1.0600 4.997 0 44.414 5.473 0.9965 44.69 32.50 1.669e+03 10.301 0e+00 0.0000 0e+00 -0.2210 18.46 0 -2.4070 3.464 -1.4520 0.0008 7.135e-03 1.885e-05 3.986e-04 0.000e+00 3.894e-04 4.693 1.893e-02 4.035e-03 2.810e-04 2.221e-07
LPAL13_130006100 LPAL13_130006100 hypothetical protein protein coding LpaL13_13 26114 26335 - reverse 13 222.0 221 3.105 0 3.099 0e+00 2.632 0.0029 3.4690 0.9328 0.9517 0.2971 2.536 10.770 0.0000 0.0000 227.50 0.3836 8.092 0 8.019 3.003 48.3159 387.52 292.86 5.261e+04 7.663 0e+00 1.0000 0e+00 3.2240 50.71 0 2.5790 3.815 0.3649 0.0003 2.765e-03 4.731e-13 1.026e-09 9.496e-01 6.771e-10 2.965 2.408e-03 8.123e-04 8.433e-05 2.134e-08
LPAL13_080010600 LPAL13_080010600 hypothetical protein, conserved protein coding LpaL13_08 195555 195749 - reverse 8 195.0 194 6.085 0 7.347 0e+00 2.986 0.0075 -1.7770 -4.2710 4.6928 0.1283 2.495 6.197 0.0000 0.0000 10.45 1.0880 5.591 0 2040.483 10.995 0.0000 20.39 14.70 5.086e+02 6.431 1e+00 0.0000 1e+00 -0.9224 28.25 0 -3.0540 3.428 -1.7060 0.0009 7.444e-03 1.437e-06 5.720e-06 0.000e+00 1.606e-05 5.084 1.981e+00 3.897e-01 3.093e-04 2.869e-07
LPAL13_190012600 LPAL13_190012600 hypothetical protein, conserved protein coding LpaL13_19 254293 254577 - reverse 19 285.0 284 3.389 0 3.377 0e+00 2.821 0.0028 1.9870 -0.4466 1.6023 0.3237 2.433 8.677 0.0000 0.0000 90.90 0.4410 7.685 0 9.447 3.240 16.3290 154.35 115.83 8.229e+03 7.959 0e+00 1.0000 0e+00 1.9190 43.74 0 1.1280 3.825 0.2225 0.0002 2.664e-03 9.371e-12 1.610e-08 9.496e-01 1.904e-08 3.242 2.045e-02 6.306e-03 8.157e-05 1.996e-08
LPAL13_240030500 LPAL13_240030500 hypothetical protein protein coding LpaL13_24 828719 828913 + forward 24 195.0 194 2.286 0 2.278 0e+00 2.596 0.0146 3.2990 0.8968 1.1635 4.0704 2.403 3.914 0.0017 0.0098 214.80 0.4373 5.226 0 4.499 2.170 78.1925 351.83 275.46 3.702e+04 4.393 4e-04 0.9996 4e-04 3.1260 23.45 0 2.3860 3.136 -1.5350 0.0023 1.464e-02 6.890e-06 3.931e-05 9.700e-01 9.810e-03 2.391 1.794e-01 7.505e-02 7.772e-04 1.809e-06
LPAL13_210015300 LPAL13_210015300 core histone h2a/h2b/h3/h4, putative protein coding LpaL13_21 324301 324699 + forward 21 399.0 398 2.639 0 2.635 0e+00 2.517 0.0000 8.7390 6.3970 1.7325 0.5224 2.342 7.428 0.0000 0.0000 10640.00 0.4143 6.369 0 5.972 2.578 2793.1315 16680.09 12804.66 1.102e+08 6.108 6e-04 0.9994 6e-04 8.7450 32.74 0 7.9440 5.686 6.8870 0.0000 1.724e-05 3.474e-08 9.489e-07 1.000e+00 7.154e-07 2.603 4.465e-02 1.715e-02 5.990e-08 8.953e-15
LPAL13_000018000 LPAL13_000018000 hypothetical protein protein coding LPAL13_SCAF000166 2 412 + forward Not Assigned 411.0 410 2.745 0 2.740 0e+00 2.713 0.0004 6.7360 4.4450 1.8804 0.1284 2.291 8.575 0.0000 0.0000 2229.00 0.4374 6.274 0 6.059 2.599 662.2008 4012.15 3077.28 5.393e+06 6.157 0e+00 1.0000 0e+00 6.4910 31.71 0 5.6950 4.568 2.6250 0.0000 4.071e-04 8.079e-08 1.756e-06 1.000e+00 3.436e-08 2.762 1.185e-01 4.292e-02 5.353e-06 8.566e-11
knitr::kable(head(sus_sig$deseq$downs$sensitive_vs_resistant, n = 20))
gid annotgeneproduct annotgenetype chromosome start end strand annotgeneentrezid annotgenename annotstrand annotchromosome annotcdslength length deseq_logfc deseq_adjp edger_logfc edger_adjp limma_logfc limma_adjp basic_nummed basic_denmed basic_numvar basic_denvar basic_logfc basic_t basic_p basic_adjp deseq_basemean deseq_lfcse deseq_stat deseq_p ebseq_fc ebseq_logfc ebseq_c1mean ebseq_c2mean ebseq_mean ebseq_var ebseq_postfc ebseq_ppee ebseq_ppde ebseq_adjp edger_logcpm edger_lr edger_p limma_ave limma_t limma_b limma_p limma_adjp_ihw deseq_adjp_ihw edger_adjp_ihw ebseq_adjp_ihw basic_adjp_ihw lfc_meta lfc_var lfc_varbymed p_meta p_var
LPAL13_000033300 LPAL13_000033300 hypothetical protein, conserved protein coding LPAL13_SCAF000463 551 811 + forward Not Assigned 261.0 260 -5.015 0.0004 -4.945 0.0004 -6.975 0.0000 -3.5090 3.4490 11.6176 0.0606 -6.957 -11.290 0.0000 0.0000 121.200 1.1950 -4.197 0.0000 0.1453 -2.7828 327.07 47.52 125.532 2.384e+04 0.1538 0.0000 0.0000 0.0000 2.2930 17.650 0.0000 -0.9768 -7.590 13.660 0.0000 6.127e-08 4.175e-04 4.428e-04 0.000e+00 9.349e-10 -5.645 0.000e+00 0.000e+00 1.786e-05 2.394e-10
LPAL13_000012000 LPAL13_000012000 hypothetical protein protein coding LPAL13_SCAF000080 710 1159 - reverse Not Assigned 450.0 449 -2.508 0.0011 -2.513 0.0003 -4.395 0.0000 0.0045 3.9300 8.0596 0.1671 -3.926 -7.501 0.0000 0.0000 195.800 0.6420 -3.906 0.0001 0.2451 -2.0287 472.54 115.80 215.358 5.531e+04 0.2570 0.0255 0.9745 0.0255 2.9840 18.760 0.0000 1.2590 -5.790 7.421 0.0000 1.104e-05 1.050e-03 2.717e-04 9.135e-01 9.115e-07 -3.084 7.809e-02 -2.532e-02 3.620e-05 2.533e-09
LPAL13_310035500 LPAL13_310035500 hypothetical protein protein coding LpaL13_31 1198439 1198957 - reverse 31 519.0 518 -2.435 0.0115 -2.366 0.0062 -3.629 0.0000 -4.2510 -0.4971 3.6075 0.4687 -3.754 -9.521 0.0000 0.0000 6.494 0.7872 -3.093 0.0020 0.2686 -1.8967 18.64 5.00 8.807 3.123e+02 0.3182 0.0000 0.0000 0.0000 -1.8510 11.200 0.0008 -3.1090 -6.935 8.355 0.0000 4.499e-07 1.105e-02 7.794e-03 0.000e+00 2.548e-09 -2.807 5.070e-04 -1.806e-04 9.331e-04 9.921e-07
LPAL13_000038400 LPAL13_000038400 expression-site associated gene (esag3), putative protein coding LPAL13_SCAF000573 101 1360 + forward Not Assigned 1260.0 1259 -2.908 0.0000 -2.916 0.0000 -4.030 0.0000 4.4940 8.2100 3.4258 0.0364 -3.716 -11.030 0.0000 0.0000 3442.000 0.5401 -5.384 0.0000 0.1798 -2.4751 9214.42 1657.20 3766.192 1.711e+07 0.1852 0.0000 0.0000 0.0000 7.1180 35.940 0.0000 5.7900 -6.120 8.762 0.0000 4.300e-06 3.633e-06 2.604e-07 0.000e+00 1.672e-09 -3.318 2.341e-02 -7.055e-03 3.348e-08 1.292e-15
LPAL13_000038500 LPAL13_000038500 hypothetical protein protein coding LPAL13_SCAF000575 39 251 + forward Not Assigned 213.0 212 -2.306 0.0013 -2.304 0.0011 -3.746 0.0000 -2.2950 1.3320 5.1691 0.6945 -3.627 -7.653 0.0000 0.0000 26.800 0.6001 -3.842 0.0001 0.2673 -1.9034 78.06 20.86 36.822 2.446e+03 0.2931 0.0000 0.0000 0.0000 0.1349 15.450 0.0001 -1.3950 -6.207 7.829 0.0000 3.139e-06 1.295e-03 9.790e-04 0.000e+00 1.917e-07 -2.700 3.996e-02 -1.480e-02 6.895e-05 3.905e-09
LPAL13_000012100 LPAL13_000012100 hypothetical protein protein coding LPAL13_SCAF000080 1637 1894 - reverse Not Assigned 258.0 257 -2.113 0.0064 -2.112 0.0033 -3.594 0.0000 -2.3000 1.1280 6.1645 0.6559 -3.429 -6.810 0.0000 0.0000 28.310 0.6366 -3.319 0.0009 0.2847 -1.8127 67.23 19.13 32.556 1.831e+03 0.3143 0.0044 0.9956 0.0044 0.2417 12.680 0.0004 -1.3420 -5.506 5.459 0.0000 2.986e-05 6.112e-03 3.293e-03 9.483e-01 1.694e-06 -2.480 7.348e-02 -2.963e-02 4.251e-04 2.066e-07
LPAL13_350063000 LPAL13_350063000 hypothetical protein protein coding LpaL13_35 1964328 1964543 - reverse 35 216.0 215 -2.703 0.0000 -2.691 0.0000 -3.602 0.0000 -2.2160 1.1400 2.3028 0.2296 -3.356 -10.980 0.0000 0.0000 19.930 0.4549 -5.942 0.0000 0.1795 -2.4781 56.76 10.18 23.179 6.353e+02 0.1955 0.0000 1.0000 0.0000 -0.2968 38.860 0.0000 -1.3780 -7.571 12.570 0.0000 6.127e-08 2.681e-07 8.861e-08 9.440e-01 1.393e-10 -3.027 3.922e-02 -1.296e-02 1.105e-09 2.261e-18
LPAL13_310031300 LPAL13_310031300 hypothetical protein, conserved protein coding LpaL13_31 1084772 1085059 - reverse 31 288.0 287 -2.328 0.0005 -2.320 0.0001 -3.744 0.0000 -1.1760 2.1420 4.6373 0.7854 -3.317 -7.153 0.0000 0.0000 57.850 0.5595 -4.160 0.0000 0.2604 -1.9413 145.07 37.77 67.713 6.939e+03 0.2783 0.0017 0.9983 0.0017 1.2250 20.460 0.0000 -0.1636 -6.394 9.190 0.0000 1.874e-06 4.525e-04 1.685e-04 9.312e-01 7.154e-07 -2.756 1.883e-02 -6.832e-03 1.265e-05 2.855e-10
LPAL13_340039600 LPAL13_340039600 hypothetical protein protein coding LpaL13_34 1247554 1247757 - reverse 34 204.0 203 -2.300 0.0001 -2.305 0.0000 -3.315 0.0000 1.2080 4.2620 3.8993 0.1029 -3.054 -8.332 0.0000 0.0000 214.600 0.4935 -4.662 0.0000 0.2145 -2.2212 583.09 125.04 252.873 5.809e+04 0.2195 0.0000 1.0000 0.0000 3.1100 25.900 0.0000 2.0000 -5.493 6.307 0.0000 3.027e-05 7.059e-05 1.813e-05 9.440e-01 1.283e-07 -2.606 3.793e-03 -1.455e-03 1.294e-06 2.551e-12
LPAL13_310031000 LPAL13_310031000 hypothetical protein, conserved protein coding LpaL13_31 1075172 1075459 - reverse 31 288.0 287 -2.407 0.0000 -2.395 0.0000 -3.382 0.0000 -1.9540 0.9518 3.3816 0.5378 -2.906 -7.407 0.0000 0.0000 20.320 0.4049 -5.946 0.0000 0.2853 -1.8095 57.59 16.42 27.913 1.029e+03 0.3067 0.3119 0.6881 0.3119 -0.1491 39.530 0.0000 -1.3960 -8.675 16.640 0.0000 1.795e-09 2.681e-07 6.793e-08 6.911e-01 3.625e-07 -2.765 5.951e-03 -2.152e-03 1.025e-09 2.263e-18
LPAL13_050005000 LPAL13_050005000 hypothetical protein protein coding LpaL13_05 3394 3612 - reverse 5 219.0 218 -2.071 0.0018 -2.077 0.0006 -3.002 0.0000 -0.1041 2.6690 3.3321 0.1596 -2.773 -7.978 0.0000 0.0000 85.560 0.5538 -3.740 0.0002 0.2818 -1.8273 188.14 53.01 90.722 7.118e+03 0.2904 0.0000 1.0000 0.0000 1.7850 16.870 0.0000 0.5013 -5.393 5.786 0.0000 3.693e-05 1.771e-03 7.626e-04 9.496e-01 1.665e-07 -2.317 3.797e-02 -1.639e-02 7.482e-05 9.298e-09
LPAL13_310039200 LPAL13_310039200 hypothetical protein protein coding LpaL13_31 1301745 1301972 - reverse 31 228.0 227 -2.382 0.0000 -2.386 0.0000 -2.809 0.0000 1.3100 3.7380 1.5983 0.2096 -2.429 -9.244 0.0000 0.0000 180.000 0.3972 -5.999 0.0000 0.2858 -1.8069 411.28 117.54 199.516 3.648e+04 0.2970 0.1922 0.8078 0.1922 2.8700 43.340 0.0000 2.0670 -6.797 11.660 0.0000 6.028e-07 2.034e-07 2.126e-08 7.801e-01 4.121e-09 -2.661 4.936e-02 -1.855e-02 1.095e-09 9.626e-19
LPAL13_140019300 LPAL13_140019300 bt1 family, putative protein coding LpaL13_14 530784 531350 + forward 14 567.0 566 -2.498 0.0000 -2.501 0.0000 -2.580 0.0000 4.6760 7.0960 0.6199 1.0310 -2.420 -7.436 0.0000 0.0000 1856.000 0.3807 -6.561 0.0000 0.1845 -2.4380 4939.45 911.50 2035.580 5.488e+06 0.1904 0.0000 1.0000 0.0000 6.2270 52.610 0.0000 5.4780 -6.589 10.760 0.0000 1.052e-06 1.759e-08 5.056e-10 9.794e-01 3.532e-05 -2.694 1.452e-01 -5.389e-02 1.082e-09 3.340e-18
LPAL13_340039700 LPAL13_340039700 snare domain containing protein, putative protein coding LpaL13_34 1248192 1248947 - reverse 34 756.0 755 -1.868 0.0000 -1.873 0.0000 -2.142 0.0000 4.6020 6.7060 0.7162 0.0946 -2.104 -11.950 0.0000 0.0000 1278.000 0.3204 -5.831 0.0000 0.2740 -1.8677 3160.69 866.02 1506.397 1.486e+06 0.2781 0.0000 1.0000 0.0000 5.6880 44.020 0.0000 5.2390 -6.801 11.680 0.0000 6.535e-07 6.329e-07 1.463e-08 1.000e+00 3.673e-11 -1.942 1.008e-03 -5.192e-04 2.253e-09 8.275e-18
LPAL13_290026200 LPAL13_290026200 hypothetical protein protein coding LpaL13_29 864413 864616 + forward 29 204.0 203 -1.112 0.2969 -1.110 0.2673 -2.091 0.0137 -2.6940 -0.5934 6.4773 4.7782 -2.101 -2.696 0.0128 0.0451 17.470 0.7972 -1.395 0.1630 0.5295 -0.9172 41.11 21.76 27.161 2.397e+03 0.5780 0.0299 0.9701 0.0299 -0.5489 2.202 0.1379 -2.6650 -3.166 -1.644 0.0021 1.367e-02 2.863e-01 2.673e-01 9.247e-01 4.217e-02 -1.383 9.399e-02 -6.798e-02 1.010e-01 7.490e-03
LPAL13_050016500 LPAL13_050016500 unspecified product snoRNA encoding LpaL13_05 undefined undefined + forward 5 0.0 undefined -1.093 0.1436 -1.093 0.1467 -2.411 0.0009 -1.4900 0.6061 3.6118 1.0047 -2.096 -4.685 0.0000 0.0006 25.990 0.5799 -1.885 0.0595 0.6737 -0.5699 46.02 31.00 35.194 3.534e+03 0.7128 0.6946 0.3054 0.6946 0.1063 3.561 0.0591 -1.1800 -4.276 1.477 0.0000 9.005e-04 1.385e-01 1.466e-01 3.382e-01 5.258e-04 -1.512 3.160e-01 -2.090e-01 3.956e-02 1.171e-03
LPAL13_350073400 LPAL13_350073400 hypothetical protein protein coding LpaL13_35 2342701 2342883 + forward 35 183.0 182 -1.539 0.0005 -1.542 0.0004 -2.017 0.0003 -0.3133 1.7570 1.1734 0.8268 -2.070 -6.336 0.0000 0.0000 44.150 0.3718 -4.139 0.0000 0.2995 -1.7394 115.90 34.70 57.364 6.356e+03 0.3251 0.0001 0.9999 0.0001 0.8158 17.800 0.0000 0.1407 -4.733 3.280 0.0000 2.586e-04 4.853e-04 4.150e-04 9.312e-01 4.689e-05 -1.699 0.000e+00 0.000e+00 2.265e-05 1.779e-10
LPAL13_140019100 LPAL13_140019100 bt1 family, putative protein coding LpaL13_14 525164 525514 + forward 14 351.0 350 -1.995 0.0000 -1.999 0.0000 -2.192 0.0000 3.9610 6.0150 0.4570 0.5080 -2.055 -8.600 0.0000 0.0000 875.400 0.3076 -6.487 0.0000 0.2486 -2.0078 2079.16 516.97 952.933 7.514e+05 0.2529 0.0000 1.0000 0.0000 5.1430 52.330 0.0000 4.6860 -7.546 15.040 0.0000 6.127e-08 1.876e-08 5.056e-10 9.496e-01 2.627e-06 -2.069 6.287e-03 -3.038e-03 4.276e-11 1.913e-21
LPAL13_170015400 LPAL13_170015400 hypothetical protein, conserved protein coding LpaL13_17 395975 396307 + forward 17 333.0 332 -1.880 0.0000 -1.884 0.0000 -2.184 0.0000 1.2120 3.2570 1.1891 0.1240 -2.045 -9.268 0.0000 0.0000 143.200 0.3208 -5.859 0.0000 0.3282 -1.6074 285.56 93.71 147.248 1.387e+04 0.3330 0.0000 1.0000 0.0000 2.5290 37.800 0.0000 2.0010 -6.277 9.432 0.0000 2.553e-06 3.854e-07 1.269e-07 9.700e-01 4.192e-09 -1.994 6.750e-04 -3.386e-04 6.096e-09 3.803e-17
LPAL13_180013900 LPAL13_180013900 hypothetical protein protein coding LpaL13_18 351792 352085 + forward 18 294.0 293 -1.249 0.0054 -1.251 0.0038 -2.094 0.0001 -0.2902 1.5670 1.1425 0.0804 -1.857 -8.898 0.0000 0.0000 39.130 0.3700 -3.376 0.0007 0.4352 -1.2002 75.25 32.74 44.605 7.867e+02 0.4379 0.0018 0.9982 0.0018 0.6741 12.380 0.0004 0.0816 -5.301 5.220 0.0000 5.386e-05 5.238e-03 3.744e-03 9.496e-01 1.515e-08 -1.508 6.346e-02 -4.208e-02 3.902e-04 1.361e-07
sus_ma <- sus_table[["plots"]][["sensitive_vs_resistant"]][["deseq_ma_plots"]][["plot"]]
pp(file = "images/sus_ma.png", image = sus_ma)
## Warning in pp(file = "images/sus_ma.png", image = sus_ma): There is no device to
## shut down.
## test <- ggplt(sus_ma)

6.4 Ontology searches

Now let us look for ontology categories which are increased in the 2.3 samples followed by the 2.2 samples.

## Gene categories more represented in the 2.3 group.
zy_go_up <- simple_goseq(sig_genes = zy_table[["significant"]][["deseq"]][["ups"]][[1]],
                         go_db = lp_go, length_db = lp_lengths)
## Error in simple_goseq(sig_genes = zy_table[["significant"]][["deseq"]][["ups"]][[1]], : object 'zy_table' not found
## Gene categories more represented in the 2.2 group.
zy_go_down <- simple_goseq(sig_genes = zy_table[["significant"]][["deseq"]][["downs"]][[1]],
                           go_db = lp_go, length_db = lp_lengths)
## Error in simple_goseq(sig_genes = zy_table[["significant"]][["deseq"]][["downs"]][[1]], : object 'zy_table' not found

6.4.1 A couple plots from the differential expression

6.4.1.1 Number of genes in agreement among DE methods, 2.3 more than 2.2

In the function ‘combined_de_tables()’ above, one of the tasks performed is to look at the agreement among DESeq2, limma, and edgeR. The following show a couple of these for the set of genes observed with |log2 fold-change| >= 1 (i.e. at least a two-fold change in either direction) and adjusted p-value <= 0.05.

zy_table[["venns"]][[1]][["p_lfc1"]][["up_noweight"]]
## Error in eval(expr, envir, enclos): object 'zy_table' not found

6.4.1.2 Number of genes in agreement among DE methods, 2.2 more than 2.3

zy_table[["venns"]][[1]][["p_lfc1"]][["down_noweight"]]
## Error in eval(expr, envir, enclos): object 'zy_table' not found

6.4.1.3 goseq ontology plots of groups of genes, 2.3 more than 2.2

zy_go_up$pvalue_plots$bpp_plot_over
## Error in eval(expr, envir, enclos): object 'zy_go_up' not found

6.4.1.4 goseq ontology plots of groups of genes, 2.2 more than 2.3

zy_go_down$pvalue_plots$bpp_plot_over
## Error in eval(expr, envir, enclos): object 'zy_go_down' not found

6.5 Look for agreement between sensitivity and zymodemes

Remind myself, the data structures are (zy|sus)_(de|table|sig).

zy_df <- zy_table[["data"]][["z23_vs_z22"]]
## Error in eval(expr, envir, enclos): object 'zy_table' not found
sus_df <- sus_table[["data"]][["sensitive_vs_resistant"]]

both_df <- merge(zy_df, sus_df, by = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'zy_df' not found
plot_df <- both_df[, c("deseq_logfc.x", "deseq_logfc.y")]
## Error in eval(expr, envir, enclos): object 'both_df' not found
rownames(plot_df) <- both_df[["Row.names"]]
## Error in eval(expr, envir, enclos): object 'both_df' not found
colnames(plot_df) <- c("z23_vs_z22", "sensitive_vs_resistant")
## Error in colnames(plot_df) <- c("z23_vs_z22", "sensitive_vs_resistant"): object 'plot_df' not found
compare <- plot_linear_scatter(plot_df)
## Error in data.frame(df[, c(1, 2)]): object 'plot_df' not found
pp(file = "images/compare_sus_zy.png", image = compare$scatter)
## Error in compare$scatter: object of type 'closure' is not subsettable
compare$cor
## Error in compare$cor: object of type 'closure' is not subsettable

6.6 Zymodeme enzyme gene IDs

Najib read me an email listing off the gene names associated with the zymodeme classification. I took those names and cross referenced them against the Leishmania panamensis gene annotations and found the following:

They are:

  1. ALAT: LPAL13_120010900 – alanine aminotransferase
  2. ASAT: LPAL13_340013000 – aspartate aminotransferase
  3. G6PD: LPAL13_000054100 – glucose-6-phosphate 1-dehydrogenase
  4. NH: LPAL13_140006100, LPAL13_180018500 – inosine-guanine nucleoside hydrolase
  5. MPI: LPAL13_320022300 (maybe) – mannose phosphate isomerase (I chose phosphomannose isomerase)

Given these 6 gene IDs (NH has two gene IDs associated with it), I can do some looking for specific differences among the various samples.

6.6.1 Expression levels of zymodeme genes

The following creates a colorspace (red to green) heatmap showing the observed expression of these genes in every sample.

## Gene IDs of the zymodeme enzymes, listed in the same order as the labels in
## my_names.  Only real gene IDs belong here: a placeholder "other" entry
## matches no gene (exclude_genes_expt() keeps 6 genes) and would leave
## my_names one element longer than the number of rows being plotted.
my_genes <- c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
              "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300")
## NH has two gene IDs associated with it, hence NHv1 and NHv2.
my_names <- c("ALAT", "ASAT", "G6PD", "NHv1", "NHv2", "MPI")
stopifnot(length(my_genes) == length(my_names))

## Keep only the zymodeme genes from zy_norm.
zymo_expt <- exclude_genes_expt(zy_norm, ids = my_genes, method = "keep")
## Before removal, there were 8552 genes, now there are 6.
## There are 81 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20065 TMRC20005 TMRC20066 TMRC20039 TMRC20037 TMRC20038 TMRC20067 
##   0.13112   0.12486   0.13219   0.10585   0.13002   0.11005   0.11289   0.11638 
## TMRC20068 TMRC20041 TMRC20015 TMRC20009 TMRC20010 TMRC20016 TMRC20011 TMRC20012 
##   0.11548   0.11805   0.11474   0.11355   0.10983   0.10596   0.11022   0.12064 
## TMRC20013 TMRC20017 TMRC20014 TMRC20018 TMRC20019 TMRC20070 TMRC20020 TMRC20021 
##   0.12057   0.10637   0.10895   0.11461   0.12243   0.11254   0.11012   0.10621 
## TMRC20022 TMRC20024 TMRC20036 TMRC20069 TMRC20033 TMRC20026 TMRC20031 TMRC20076 
##   0.13068   0.11248   0.12022   0.11623   0.11264   0.13851   0.10019   0.12014 
## TMRC20073 TMRC20055 TMRC20079 TMRC20071 TMRC20078 TMRC20094 TMRC20042 TMRC20058 
##   0.12261   0.13482   0.12672   0.12330   0.13414   0.11739   0.13150   0.11804 
## TMRC20072 TMRC20059 TMRC20048 TMRC20088 TMRC20060 TMRC20077 TMRC20074 TMRC20063 
##   0.14328   0.11018   0.10308   0.12936   0.10847   0.12197   0.12073   0.11670 
## TMRC20053 TMRC20052 TMRC20064 TMRC20075 TMRC20051 TMRC20050 TMRC20049 TMRC20062 
##   0.11816   0.11041   0.11382   0.11108   0.12829   0.11533   0.13953   0.12853 
## TMRC20110 TMRC20080 TMRC20043 TMRC20083 TMRC20054 TMRC20085 TMRC20046 TMRC20089 
##   0.13865   0.11539   0.11361   0.12385   0.12772   0.12203   0.13688   0.11549 
## TMRC20090 TMRC20044 TMRC20105 TMRC20109 TMRC20098 TMRC20096 TMRC20097 TMRC20101 
##   0.11177   0.13387   0.12213   0.12677   0.11638   0.11664   0.11893   0.11896 
## TMRC20092 TMRC20099 TMRC20100 TMRC20087 TMRC20104 TMRC20086 TMRC20107 TMRC20081 
##   0.11565   0.12209   0.11064   0.12335   0.11726   0.10987   0.09648   0.10460 
## TMRC20106 
##   0.09811
## Label each row by looking up its gene ID in my_genes, instead of assuming
## that the rows of zymo_expt come back in the same order (and number) as the
## entries of my_names.
zymo_labels <- my_names[match(rownames(exprs(zymo_expt)), my_genes)]
zymo_heatmap <- plot_sample_heatmap(zymo_expt, row_label = zymo_labels)
zymo_heatmap

6.7 Empirically observed Zymodeme genes from differential expression analysis

In contrast, the following plots take the set of genes which are shared among all differential expression methods (|lfc| >= 1.0 and adjp <= 0.05) and use them to make categories of genes which are increased in 2.3 or 2.2.

shared_zymo <- intersect_significant(zy_table)
## Error in is.data.frame(x): object 'zy_table' not found
up_shared <- shared_zymo[["ups"]][[1]][["data"]][["all"]]
## Error in eval(expr, envir, enclos): object 'shared_zymo' not found
rownames(up_shared)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'up_shared' not found
upshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(up_shared), method = "keep")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'up_shared' not found

We can plot a quick heatmap to get a sense of the differences observed between the genes which are different between the two zymodemes.

6.7.1 Heatmap of zymodeme gene expression increased in 2.3 vs. 2.2

high_23_heatmap <- plot_sample_heatmap(upshared_expt, row_label = rownames(up_shared))
## Error in plot_sample_heatmap(upshared_expt, row_label = rownames(up_shared)): object 'upshared_expt' not found
high_23_heatmap
## Error in eval(expr, envir, enclos): object 'high_23_heatmap' not found

6.7.2 Heatmap of zymodeme gene expression increased in 2.2 vs. 2.3

down_shared <- shared_zymo[["downs"]][[1]][["data"]][["all"]]
## Error in eval(expr, envir, enclos): object 'shared_zymo' not found
downshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(down_shared), method = "keep")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'down_shared' not found
high_22_heatmap <- plot_sample_heatmap(downshared_expt, row_label = rownames(down_shared))
## Error in plot_sample_heatmap(downshared_expt, row_label = rownames(down_shared)): object 'downshared_expt' not found
high_22_heatmap
## Error in eval(expr, envir, enclos): object 'high_22_heatmap' not found

7 SNP profiles

Now I will combine our previous samples and our new samples in the hopes of finding variant positions which help elucidate currently unknown aspects of either group via their clustering to known samples from the other group. In other words, we do not know the zymodeme annotations for the old samples nor the strain identities (or the shortcut ‘chronic vs. self-healing’) for the new samples. I hope to make educated guesses given the variant profiles. There are some differences in how the previous and current data sets were analyzed (though I have since redone the old samples so it should be trivial to remove those differences now).

I added our 2016 data to a specific TMRC2 sample sheet, dated 20191203. Thus I will load the data here. That previous data was mapped using tophat, so I will also need to make some changes to the gene names to accommodate the two mappings.

old_expt <- create_expt("sample_sheets/tmrc2_samples_20191203.xlsx",
                        file_column = "tophat2file")
## Reading the sample metadata.
## Dropped 13 rows from the sample metadata because they were blank.
## The sample definitions comprises: 50 rows(samples) and 38 columns(metadata fields).
## Warning in create_expt("sample_sheets/tmrc2_samples_20191203.xlsx", file_column
## = "tophat2file"): Some samples were removed when cross referencing the samples
## against the count data.
## Matched 8841 annotations and counts.
## Bringing together the count matrix and gene information.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 8841 rows and 33 columns.
## Bring the gene IDs of the two data sets into agreement: strip the leading
## "exon_" from both, then the trailing ".E1" (lp_expt) or ".1" (old_expt).
## Both patterns are anchored, so each can match at most once per ID.
tt <- lp_expt[["expressionset"]]
rownames(tt) <- sub(pattern = "\\.E1$", replacement = "",
                    x = sub(pattern = "^exon_", replacement = "",
                            x = rownames(tt)))
lp_expt[["expressionset"]] <- tt

tt <- old_expt[["expressionset"]]
rownames(tt) <- sub(pattern = "\\.1$", replacement = "",
                    x = sub(pattern = "^exon_", replacement = "",
                            x = rownames(tt)))
old_expt[["expressionset"]] <- tt
## tt was only a scratch copy of the expressionsets.
rm(tt)

7.1 Create the SNP expressionset

One other important caveat, we have a group of new samples which have not yet run through the variant search pipeline, so I need to remove them from consideration. Though it looks like they finished overnight…

## The next line drops the samples which are missing the SNP pipeline.
lp_snp <- subset_expt(lp_expt, subset="!is.na(pData(lp_expt)[['bcftable']])")
## subset_expt(): There were 99, now there are 67 samples.
new_snps <- sm(count_expt_snps(lp_snp, annot_column = "bcftable"))
old_snps <- sm(count_expt_snps(old_expt, annot_column = "bcftable", snp_column = 2))

nonzero_snps <- exprs(new_snps) != 0
colSums(nonzero_snps)
## tmrc20001 tmrc20065 tmrc20005 tmrc20007 tmrc20008 tmrc20027 tmrc20028 tmrc20032 
##     31443     73441      1965      2133      2786    291022    290524    116072 
## tmrc20040 tmrc20066 tmrc20039 tmrc20037 tmrc20038 tmrc20067 tmrc20068 tmrc20041 
##     34648     69275      4048     79861     80805     75518     74157     34972 
## tmrc20015 tmrc20009 tmrc20010 tmrc20016 tmrc20011 tmrc20012 tmrc20013 tmrc20017 
##     95041      7410     87192     95605      5297        10     89292      6689 
## tmrc20014 tmrc20018 tmrc20019 tmrc20070 tmrc20020 tmrc20021 tmrc20022 tmrc20025 
##      6440     82235      3021     78538      3209     88148      2608    279253 
## tmrc20024 tmrc20036 tmrc20069 tmrc20033 tmrc20026 tmrc20031 tmrc20073 tmrc20055 
##      4981     32060      3304      4443      2851      3150     78772      2819 
## tmrc20079 tmrc20071 tmrc20078 tmrc20042 tmrc20058 tmrc20072 tmrc20059 tmrc20048 
##     78013     74977      3287      2490     78849     31425     77883     76892 
## tmrc20057 tmrc20056 tmrc20060 tmrc20077 tmrc20063 tmrc20053 tmrc20052 tmrc20064 
##     32003      2977      2770      3091      1592      2836     77467     75335 
## tmrc20051 tmrc20050 tmrc20062 tmrc20080 tmrc20043 tmrc20054 tmrc20046 tmrc20047 
##     75845      3399     75667     81666     77526     77155     31433     75963 
## tmrc20044 tmrc20045 tmrc20061 
##      3132     30107     96058
both_snps <- combine_expts(new_snps, old_snps)
both_norm <- normalize_expt(both_snps, transform = "log2", convert = "cpm", filter = TRUE)
## Removing 0 low-count genes (670128 remaining).
## transform_counts: Found 61643658 values equal to 0, adding 1 to the matrix.
## strains <- both_norm[["design"]][["strain"]]
both_strain <- set_expt_conditions(both_norm, fact = "strain")

The data structure ‘both_norm’ now contains our 2016 data along with the newer data collected since 2019.

7.2 Plot of SNP profiles for zymodemes

The following plot shows the SNP profiles of all samples (old and new) where the colors at the top show either the 2.2 strains (orange), 2.3 strains (green), the previous samples (purple), or the various lab strains (pink etc).

old_new_variant_heatmap <- plot_disheat(both_norm)
pp(file = "images/raw_snp_disheat.png", image = old_new_variant_heatmap,
   height = 12, width = 12)
## Warning in pp(file = "images/raw_snp_disheat.png", image =
## old_new_variant_heatmap, : There is no device to shut down.

The function get_snp_sets() takes the provided metadata factor (in this case ‘condition’) and looks for variants which are exclusive to each element in it. In this case, this is looking for differences between 2.2 and 2.3, as well as the set shared among them.

snp_sets <- get_snp_sets(both_snps, factor = "condition")
## The factor z2.3 has 27 rows.
## The factor z2.2 has 28 rows.
## The factor unknown has 4 rows.
## The factor z1.0 has only 1 row.
## The factor z3.0 has only 1 row.
## The factor z2.0 has only 1 row.
## The factor z2.1 has 3 rows.
## The factor z2.4 has only 1 row.
## The factor null has only 1 row.
## The factor sh has 13 rows.
## The factor chr has 14 rows.
## The factor inf has 6 rows.
## Iterating over 727 elements.
## The rendered output recorded below shows combine() rejecting these two
## experiments with "objects have different annotations"
## (org.Lpanamensis.MHOMCOL81L13.v46.eg.db vs. org.Hs.eg.db).  Copy the
## annotation label of lp_expt onto old_expt before combining them.
## NOTE(review): presumably old_expt carries org.Hs.eg.db only as a default
## from create_expt() -- confirm before relying on the merged object.
tt <- old_expt[["expressionset"]]
Biobase::annotation(tt) <- Biobase::annotation(lp_expt[["expressionset"]])
old_expt[["expressionset"]] <- tt
rm(tt)
both_expt <- combine_expts(lp_expt, old_expt)
## Error in combine(exp1, exp2): objects have different annotations: org.Lpanamensis.MHOMCOL81L13.v46.eg.db, org.Hs.eg.db
snp_genes <- sm(snps_vs_genes(both_expt, snp_sets, expt_name_col = "chromosome"))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'both_expt' not found
## I think we have some metrics here we can plot...
snp_subset <- sm(snp_subset_genes(
  both_expt, both_snps,
  genes = c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
            "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300")))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'both_expt' not found
zymo_heat <- plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset)))
## Error in plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset))): object 'snp_subset' not found
zymo_heat
## Error in eval(expr, envir, enclos): object 'zymo_heat' not found

Didn’t I create a set of densities by chromosome? Oh I think they come in from get_snp_sets()

7.3 SNPS associated with clinical response in the TMRC samples

clinical_sets <- get_snp_sets(new_snps, factor = "clinicalresponse")
## The factor cure has 26 rows.
## The factor failure has 21 rows.
## The factor laboratory line has only 1 row.
## The factor laboratory line miltefosine resistant has only 1 row.
## The factor nd has 14 rows.
## The factor reference strain has 4 rows.
## Iterating over 695 elements.
## Variant densities returned by get_snp_sets(); only the entries whose names
## contain "LpaL" are kept for plotting.
density_vec <- clinical_sets[["density"]]
chromosome_idx <- grep(pattern = "LpaL", x = names(density_vec))
## One row per kept entry: the density value plus its name as a 'chr' column.
density_df <- as.data.frame(density_vec[chromosome_idx])
density_df[["chr"]] <- rownames(density_df)
colnames(density_df) <- c("density_vec", "chr")
## aes_string() is deprecated in ggplot2; aes() with the .data pronoun is the
## supported way to refer to columns by name.  The calls are namespaced to
## match the geom/theme calls below.
ggplot2::ggplot(density_df,
                ggplot2::aes(x = .data[["chr"]], y = .data[["density_vec"]])) +
  ggplot2::geom_col() +
  ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
                 axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5))

## clinical_written <- write_variants(new_snps)

7.3.1 Cross reference these variants by gene

clinical_genes <- sm(snps_vs_genes(lp_expt, clinical_sets, expt_name_col = "chromosome"))

## Join the per-gene SNP summary to the gene annotations and keep columns
## 1, 2, 4 and 15 of the merged table, which are renamed just below.
snp_density <- merge(as.data.frame(clinical_genes[["summary_by_gene"]]),
                     as.data.frame(fData(lp_expt)),
                     by = "row.names")[, c(1, 2, 4, 15)]
colnames(snp_density) <- c("name", "snps", "product", "length")
## Lower-case the product so the filter below is case-insensitive.
snp_density[["product"]] <- tolower(snp_density[["product"]])
snp_density[["length"]] <- as.numeric(snp_density[["length"]])

## SNPs divided by gene length, sorted from the highest density down.
snp_density[["density"]] <- snp_density[["snps"]] / snp_density[["length"]]
snp_idx <- order(snp_density[["density"]], decreasing = TRUE)
snp_density <- snp_density[snp_idx, ]

## Drop every gene whose product matches any of these patterns.
removers <- c("amastin", "gp63", "leishmanolysin")
keep_idx <- !grepl(pattern = paste(removers, collapse = "|"),
                   x = snp_density[["product"]])
snp_density <- snp_density[keep_idx, ]
## Filter these for [A|a]mastin gp63 Leishmanolysin
clinical_snps <- snps_intersections(lp_expt, clinical_sets, chr_column = "chromosome")

fail_ref_snps <- as.data.frame(clinical_snps[["inters"]][["failure, reference strain"]])
cure_snps <- as.data.frame(clinical_snps[["inters"]][["cure"]])

head(fail_ref_snps)
##                                           seqnames  start    end width strand
## chr_LpaL13-10_pos_327353_ref_T_alt_C     LpaL13-10 327353 327354     2      +
## chr_LpaL13-13_pos_167047_ref_G_alt_C     LpaL13-13 167047 167048     2      +
## chr_LpaL13-15_pos_42885_ref_A_alt_G      LpaL13-15  42885  42886     2      +
## chr_LpaL13-20.1_pos_111781_ref_T_alt_C LpaL13-20.1 111781 111782     2      +
## chr_LpaL13-20.1_pos_85158_ref_C_alt_G  LpaL13-20.1  85158  85159     2      +
## chr_LpaL13-20.2_pos_48545_ref_T_alt_C  LpaL13-20.2  48545  48546     2      +
head(cure_snps)
##                                           seqnames  start    end width strand
## chr_LpaL13-08_pos_184791_ref_T_alt_A     LpaL13-08 184791 184792     2      +
## chr_LpaL13-20.1_pos_369935_ref_C_alt_T LpaL13-20.1 369935 369936     2      +
## chr_LpaL13-20.1_pos_370282_ref_C_alt_T LpaL13-20.1 370282 370283     2      +
## chr_LpaL13-20.1_pos_371356_ref_T_alt_C LpaL13-20.1 371356 371357     2      +
## chr_LpaL13-20.1_pos_380785_ref_A_alt_G LpaL13-20.1 380785 380786     2      +
## chr_LpaL13-20.1_pos_382801_ref_A_alt_C LpaL13-20.1 382801 382802     2      +
## Attach the per-gene summaries of the "cure" and
## "failure, reference strain" SNP sets to the gene annotations of lp_expt.
annot <- fData(lp_expt)
## Remember the feature order: merge() sorts its result by the key, while the
## table written back must stay aligned with the rows of the expressionset.
gene_order <- rownames(annot)
clinical_interest <- as.data.frame(clinical_snps[["gene_summaries"]][["cure"]])
clinical_interest <- merge(clinical_interest,
                           as.data.frame(clinical_snps[["gene_summaries"]][["failure, reference strain"]]),
                           by = "row.names")
rownames(clinical_interest) <- clinical_interest[["Row.names"]]
clinical_interest[["Row.names"]] <- NULL
colnames(clinical_interest) <- c("cure_snps","fail_snps")
## all.x = TRUE keeps genes which have no row in clinical_interest (their
## cure_snps/fail_snps become NA) instead of dropping them from the
## annotations.
annot <- merge(annot, clinical_interest, by = "row.names", all.x = TRUE)
rownames(annot) <- annot[["Row.names"]]
annot[["Row.names"]] <- NULL
## Restore the original gene order before writing the annotations back.
annot <- annot[gene_order, , drop = FALSE]
fData(lp_expt$expressionset) <- annot

8 Zymodeme for new samples

The heatmap produced here should show the variants only for the zymodeme genes.

8.1 Hunt for snp clusters

I am thinking that if we find clusters of locations which are variant, that might provide some PCR testing possibilities.

## Drop the 2.1, 2.4, unknown, and null
## Only the samples whose condition is z2.2 or z2.3 are kept.
pruned_snps <- subset_expt(new_snps, subset="condition=='z2.2'|condition=='z2.3'")
## subset_expt(): There were 67, now there are 55 samples.
## Group the variants by the zymodemecategorical metadata column; the recorded
## output below reports the two levels found, z22 and z23.
new_sets <- get_snp_sets(pruned_snps, factor = "zymodemecategorical")
## The factor z22 has 28 rows.
## The factor z23 has 27 rows.
## Iterating over 695 elements.
summary(new_sets)
##               Length Class      Mode     
## medians         3    data.frame list     
## possibilities   2    -none-     character
## intersections   3    -none-     list     
## chr_data      695    -none-     list     
## set_names       4    -none-     list     
## invert_names    4    -none-     list     
## density       695    -none-     numeric
## 1000000: 2.2
## 0100000: 2.3

## The intersections are keyed by one digit per factor level: "10" and "01" are
## the two single-level sets, in the order the levels were reported above.
summary(new_sets[["intersections"]][["10"]])
##    Length     Class      Mode 
##       890 character character
summary(new_sets[["intersections"]][["01"]])
##    Length     Class      Mode 
##     76189 character character

Thus we see that there are 890 variants associated with 2.2 and 76,189 associated with 2.3.

8.1.1 A small function for searching for potential PCR primers

The following function uses the positional data to look for sequential mismatches associated with zymodeme in the hopes that there will be some regions which would provide good potential targets for a PCR-based assay.

## Distance from each variant to the previous variant on the same chromosome.
## The first variant of every chromosome keeps its own position as its distance.
## The inputs must already be sorted by chromosome and then by position.
.variant_distances <- function(chr, pos) {
  n <- length(pos)
  if (n == 0) {
    return(numeric(0))
  }
  same_chr <- c(FALSE, chr[-1] == chr[-n])
  has_previous <- which(same_chr)
  dist <- pos
  dist[has_previous] <- pos[has_previous] - pos[has_previous - 1]
  dist
}

## Search one intersection of a set of variants for runs of closely spaced
## positions, in the hope of finding targets for a PCR-based assay.
##
## @param snp_sets List containing "intersections" (named list of character
##   vectors of variant names of the form chr_<chr>_pos_<pos>_ref_<ref>_alt_<alt>)
##   and "set_names" (named list mapping the same names to condition labels).
## @param conditions Either a numeric index into the intersections or the
##   condition label to look up in set_names.  NULL chooses the first one.
## @param minimum Minimum length of a run of sequential variants to report.
## @param maximum_separation Largest distance between neighboring variants
##   which still counts as sequential.
## @return Data frame with columns chr and pos containing the last position of
##   every sufficiently long run.  The table of close variant pairs lying
##   200-1000 nt from the previous close pair (reference G/C only) is attached
##   as the attribute "remaining".
##
## As a side effect this writes doubles.csv, one_away.csv, and two_away.csv to
## the working directory and prints the head of the table of distances.
sequential_variants <- function(snp_sets, conditions = NULL, minimum = 3, maximum_separation = 3) {
  if (is.null(conditions)) {
    conditions <- 1
  }
  intersection_sets <- snp_sets[["intersections"]]
  intersection_names <- snp_sets[["set_names"]]
  if (is.numeric(conditions)) {
    chosen_intersection <- conditions
  } else {
    intersection_idx <- intersection_names == conditions
    chosen_intersection <- names(intersection_names)[which(intersection_idx)]
    if (length(chosen_intersection) != 1) {
      stop("Expected exactly one intersection named '", conditions, "', found ",
           length(chosen_intersection), ".", call. = FALSE)
    }
  }

  possible_positions <- intersection_sets[[chosen_intersection]]
  if (length(possible_positions) == 0) {
    ## Nothing to search; the loops below cannot handle an empty table.
    message("There are 0 candidate regions.")
    return(data.frame("chr" = character(0), "pos" = numeric(0), stringsAsFactors = FALSE))
  }

  position_table <- data.frame(row.names = possible_positions)
  pat <- "^chr_(.+)_pos_(.+)_ref_.*$"
  position_table[["chr"]] <- gsub(pattern = pat, replacement = "\\1", x = rownames(position_table))
  position_table[["pos"]] <- as.numeric(gsub(pattern = pat, replacement = "\\2", x = rownames(position_table)))
  position_idx <- order(position_table[, "chr"], position_table[, "pos"])
  position_table <- position_table[position_idx, ]
  position_table[["dist"]] <- .variant_distances(position_table[["chr"]], position_table[["pos"]])

  ## Variants which are adjacent to, or 1-2 nt away from, their predecessor.
  doubles <- position_table[position_table[["dist"]] == 1, ]
  write.csv(doubles, "doubles.csv")
  one_away <- position_table[position_table[["dist"]] == 2, ]
  write.csv(one_away, "one_away.csv")
  two_away <- position_table[position_table[["dist"]] == 3, ]
  write.csv(two_away, "two_away.csv")

  ## Among those close variants, measure the distance to the previous close
  ## variant; pairs separated by an amplicon-sized gap are primer candidates.
  combined <- rbind(doubles, one_away, two_away)
  combined <- combined[order(combined[, "chr"], combined[, "pos"]), ]
  combined[["dist_pair"]] <- .variant_distances(combined[["chr"]], combined[["pos"]])

  dist_pair_maximum <- 1000
  dist_pair_minimum <- 200
  dist_pair_idx <- combined[["dist_pair"]] <= dist_pair_maximum &
    combined[["dist_pair"]] >= dist_pair_minimum
  remaining <- combined[dist_pair_idx, ]
  ## Keep only the variants whose reference base is G or C.
  no_weak_idx <- grepl(pattern = "ref_(G|C)", x = rownames(remaining))
  remaining <- remaining[no_weak_idx, ]

  print(head(table(position_table[["dist"]])))
  sequentials <- position_table[["dist"]] <= maximum_separation
  message("There are ", sum(sequentials), " candidate regions.")

  ## Use run length encoding to find the runs of sequential variants and record
  ## the length of each run on its final row.
  rle_result <- rle(sequentials)
  rle_lengths <- rle_result[["lengths"]]
  true_runs <- which(rle_result[["values"]])
  run_ends <- cumsum(rle_lengths)[true_runs]
  last_sequential <- numeric(nrow(position_table))
  last_sequential[run_ends] <- rle_lengths[true_runs]
  position_table[["last_sequential"]] <- last_sequential
  message("The maximum sequential set is: ", max(position_table[["last_sequential"]]), ".")

  wanted_idx <- position_table[["last_sequential"]] >= minimum
  wanted <- position_table[wanted_idx, c("chr", "pos")]
  attr(wanted, "remaining") <- remaining
  return(wanted)
}

## Search the variants of each zymodeme for runs of neighbors at most 2 nt apart;
## for z22 a run of length 1 suffices, for z23 at least 2 are required.
zymo22_sequentials <- sequential_variants(new_sets, conditions = "z22", minimum=1, maximum_separation=2)
dim(zymo22_sequentials)
## 7 candidate regions for zymodeme 2.2 -- thus I am betting that the reference strain is a 2.2
zymo23_sequentials <- sequential_variants(new_sets, conditions = "z23",
                                          minimum = 2, maximum_separation = 2)
dim(zymo23_sequentials)
## In contrast, there are lots (587) of interesting regions for 2.3!

8.1.2 Extract a promising region from the genome

The first 4 candidate regions from my set of remaining: * Chr Pos. Distance * LpaL13-15 238433 448 * LpaL13-18 142844 613 * LpaL13-29 830342 252 * LpaL13-33 1331507 843

Let's define a few terms: * Third: Each of the 4 above positions. * Second: Third - Distance * End: Third + PrimerLen * Start: Second - PrimerLen

In each instance, these are the last positions, so we want to grab three things:

  • The entire region from End -> Start, this way we can have a quick sanity check.
  • Start -> Second.
  • (Third -> End) <- Reverse complemented
## Each of the four sections below repeats the same recipe for one candidate:
##   third  = position of the last variant of the pair
##   second = third - amplicon_length
##   start  = second - primer_length
##   end    = third + primer_length
## and then extracts the whole region (start..end) as a sanity check, the 5'
## primer (start..second), and the reverse complement of third..end as the 3' primer.
## NOTE(review): if subseq() treats both coordinates as inclusive, each primer is
## primer_length + 1 bases long -- confirm that this is intended.
## primer_length and amplicon_length are plain globals which are reassigned in
## every section, so the sections must run in order.

## * LpaL13-15 238433 448
first_candidate_chr <- genome[["LpaL13_15"]]
primer_length <- 22
amplicon_length <- 448
first_candidate_third <- 238433
first_candidate_second <- first_candidate_third - amplicon_length
first_candidate_start <- first_candidate_second - primer_length
first_candidate_end <- first_candidate_third + primer_length
first_candidate_region <- subseq(first_candidate_chr, first_candidate_start, first_candidate_end)
first_candidate_region
first_candidate_5p <- subseq(first_candidate_chr, first_candidate_start, first_candidate_second)
as.character(first_candidate_5p)
first_candidate_3p <- spgs::reverseComplement(subseq(first_candidate_chr, first_candidate_third, first_candidate_end))
first_candidate_3p


## * LpaL13-18 142844 613
second_candidate_chr <- genome[["LpaL13_18"]]
primer_length <- 22
amplicon_length <- 613
second_candidate_third <- 142844
second_candidate_second <- second_candidate_third - amplicon_length
second_candidate_start <- second_candidate_second - primer_length
second_candidate_end <- second_candidate_third + primer_length
second_candidate_region <- subseq(second_candidate_chr, second_candidate_start, second_candidate_end)
second_candidate_region
second_candidate_5p <- subseq(second_candidate_chr, second_candidate_start, second_candidate_second)
as.character(second_candidate_5p)
second_candidate_3p <- spgs::reverseComplement(subseq(second_candidate_chr, second_candidate_third, second_candidate_end))
second_candidate_3p


## * LpaL13-29 830342 252
third_candidate_chr <- genome[["LpaL13_29"]]
primer_length <- 22
amplicon_length <- 252
third_candidate_third <- 830342
third_candidate_second <- third_candidate_third - amplicon_length
third_candidate_start <- third_candidate_second - primer_length
third_candidate_end <- third_candidate_third + primer_length
third_candidate_region <- subseq(third_candidate_chr, third_candidate_start, third_candidate_end)
third_candidate_region
third_candidate_5p <- subseq(third_candidate_chr, third_candidate_start, third_candidate_second)
as.character(third_candidate_5p)
third_candidate_3p <- spgs::reverseComplement(subseq(third_candidate_chr, third_candidate_third, third_candidate_end))
third_candidate_3p
## You are a garbage polypyrimidine tract.
## Which is actually interesting if the mutations mess it up.


## * LpaL13-33 1331507 843
fourth_candidate_chr <- genome[["LpaL13_33"]]
primer_length <- 22
amplicon_length <- 843
fourth_candidate_third <- 1331507
fourth_candidate_second <- fourth_candidate_third - amplicon_length
fourth_candidate_start <- fourth_candidate_second - primer_length
fourth_candidate_end <- fourth_candidate_third + primer_length
fourth_candidate_region <- subseq(fourth_candidate_chr, fourth_candidate_start, fourth_candidate_end)
fourth_candidate_region
fourth_candidate_5p <- subseq(fourth_candidate_chr, fourth_candidate_start, fourth_candidate_second)
as.character(fourth_candidate_5p)
fourth_candidate_3p <- spgs::reverseComplement(subseq(fourth_candidate_chr, fourth_candidate_third, fourth_candidate_end))
fourth_candidate_3p

8.2 Go hunting for Sanger sequencing regions

I made a fun little function which should find regions which have lots of variants associated with a given experimental factor.

## Keep the z2.2/z2.3 samples which have a bcftable entry.  The second filter
## string refers to 'pheno' by name, so the two subsets must stay separate steps.
pheno <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
pheno <- subset_expt(pheno, subset = "!is.na(pData(pheno)[['bcftable']])")
pheno_snps <- sm(count_expt_snps(pheno, annot_column = "bcftable"))

fun_stuff <- snp_density_primers(
  pheno_snps,
  bsgenome = "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53",
  gff = "reference/TriTrypDB-53_LpanamensisMHOMCOL81L13.gff")

## Discard the favorites whose row name contains SCAF, then expose the row names
## as a leading 'bin' column.
primer_favorites <- fun_stuff[["favorites"]]
drop_scaffolds <- grepl(x = rownames(primer_favorites), pattern = "SCAF")
favorite_primer_regions <- primer_favorites[!drop_scaffolds, ]
favorite_primer_regions[["bin"]] <- rownames(favorite_primer_regions)
library(dplyr)
favorite_primer_regions <- relocate(favorite_primer_regions, bin)

8.3 Combine this table with 2.2/2.3 genes

Here is my note from our meeting:

Cross reference primers to DE genes of 2.2/2.3 and/or resistance/susceptible, add a column to the primer spreadsheet with the DE genes (in retrospect I am guessing this actually means to put the logFC as a column).

One nice thing, I did a semantic removal on the lp_expt, so the set of logFC/pvalues should not have any of the offending types; thus I should be able to automagically get rid of them in the merge.

## Cross reference the favorite primer regions against two differential
## expression tables, matching on the closest_gene_before_id column.
## The '## Error' lines are the output recorded when this document was rendered;
## zy_table and favorite_primer_regions did not exist in that session.
logfc <- zy_table[["data"]][["z23_vs_z22"]]
## Error in eval(expr, envir, enclos): object 'zy_table' not found
logfc_columns <- logfc[, c("deseq_logfc", "deseq_adjp")]
## Error in eval(expr, envir, enclos): object 'logfc' not found
colnames(logfc_columns) <- c("z23_logfc", "z23_adjp")
## Error in colnames(logfc_columns) <- c("z23_logfc", "z23_adjp"): object 'logfc_columns' not found
new_table <- merge(favorite_primer_regions, logfc_columns,
                   by.x = "closest_gene_before_id", by.y = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'favorite_primer_regions' not found
## Repeat the merge with the sensitive vs. resistant table, then move the 'bin'
## column back to the front.
sus <- sus_table[["data"]][["sensitive_vs_resistant"]]
sus_columns <- sus[, c("deseq_logfc", "deseq_adjp")]
colnames(sus_columns) <- c("sus_logfc", "sus_adjp")
new_table <- merge(new_table, sus_columns,
                   by.x = "closest_gene_before_id", by.y = "row.names") %>%
  relocate(bin)
## Error in relocate(., bin): could not find function "relocate"
written <- write_xlsx(data=new_table,
                      excel="excel/favorite_primers_xref_zy_sus.xlsx")
## Error in write_xlsx(data = new_table, excel = "excel/favorite_primers_xref_zy_sus.xlsx"): object 'new_table' not found

8.4 Make a heatmap describing the clustering of variants

We can cross reference the variants against the zymodeme status and plot a heatmap of the results and hopefully see how they separate.

## pruned_snps <- subset_expt(new_snps, subset="condition=='z2.2'|condition=='z2.3'")
## Cross reference the variant sets against the genes of lp_expt.
snp_genes <- sm(snps_vs_genes(lp_expt, new_sets, expt_name_col = "chromosome"))

##new_zymo_norm <- normalize_expt(pruned_snps, filter = TRUE, convert = "cpm", norm = "quant", transform = TRUE)
##new_zymo_norm <- set_expt_conditions(new_zymo_norm, fact = "zymodemecategorical")
## One color per zymodeme level: z22 is blue and z23 is red.
clinical_colors_v2 <- list(
    "z22" = "#0000cc",
    "z23" = "#cc0000")
## Normalize the pruned variant data, use the zymodeme as the condition, and
## apply the colors above.
new_zymo_norm <- normalize_expt(pruned_snps, filter = TRUE, convert = "cpm", norm = "quant", transform = TRUE) %>%
  set_expt_conditions(fact = "zymodemecategorical") %>%
  set_expt_colors(clinical_colors_v2)
## Removing 0 low-count genes (568627 remaining).
## transform_counts: Found 28953155 values equal to 0, adding 1 to the matrix.
## Plot the heatmap, write it to a pdf, and also display it inline.
zymo_heat <- plot_disheat(new_zymo_norm)
pp(file = "images/onlyz22_z23_snp_heatmap.pdf", image=zymo_heat[["plot"]])

zymo_heat[["plot"]]

8.4.1 Annotated heatmap of variants

Now let us try to make a heatmap which includes some of the annotation data.

## Build an annotated heatmap of the sample correlations of both_norm, with the
## zymodeme and clinical outcome annotating the columns and the strain the rows.
des <- both_norm[["design"]]
## Samples without a strain are labeled "unknown".
undef_idx <- is.na(des[["strain"]])
des[undef_idx, "strain"] <- "unknown"

##hmcols <- colorRampPalette(c("yellow","black","darkblue"))(256)
correlations <- hpgl_cor(exprs(both_norm))

## Record the missing zymodemes before coercing the columns to character; the
## "unknown" label is only assigned after the coercion.
zymo_missing_idx <- is.na(des[["zymodemecategorical"]])
des[["zymodemecategorical"]] <- as.character(des[["zymodemecategorical"]])
des[["clinicalcategorical"]] <- as.character(des[["clinicalcategorical"]])
des[zymo_missing_idx, "zymodemecategorical"] <- "unknown"
mydendro <- list(
  "clustfun" = hclust,
  "lwd" = 2.0)
col_data <- as.data.frame(des[, c("zymodemecategorical", "clinicalcategorical")])

## The missing outcomes are found via the original column name, so this lookup
## must happen before col_data is renamed below.
unknown_clinical <- is.na(col_data[["clinicalcategorical"]])
row_data <- as.data.frame(des[, c("strain")])
colnames(col_data) <- c("zymodeme", "outcome")
col_data[unknown_clinical, "outcome"] <- "undefined"

colnames(row_data) <- c("strain")
myannot <- list(
  "Col" = list("data" = col_data),
  "Row" = list("data" = row_data))
myclust <- list("cuth" = 1.0,
                "col" = BrewerClusterCol)
mylabs <- list(
  "Row" = list("nrow" = 4),
  "Col" = list("nrow" = 4))
## 240-step palette from dark blue to beige.
hmcols <- colorRampPalette(c("darkblue", "beige"))(240)
map1 <- annHeatmap2(
  correlations,
  dendrogram = mydendro,
  annotation = myannot,
  cluster = myclust,
  labels = mylabs,
  ## The following controls if the picture is symmetric
  scale = "none",
  col = hmcols)
## Warning in breakColors(breaks, col): more colors than classes: ignoring 29 last
## colors
pp(file = "images/dendro_heatmap.png", image = map1, height = 20, width = 20)
## Warning in pp(file = "images/dendro_heatmap.png", image = map1, height = 20, :
## There is no device to shut down.
## annotated Heatmap
## 
## Rows: 'dendrogram' with 2 branches and 100 members total, at height 6.173 
##   11  annotation variable(s)
## Cols: 'dendrogram' with 2 branches and 100 members total, at height 6.173 
##   13  annotation variable(s)

Print the larger heatmap so that all the labels appear. Keep in mind that as we get more samples, this image needs to continue getting bigger.

big heatmap

## Build a per-variant table for CMplot: for every condition, the proportion of
## its samples in which the variant has a value greater than 5.
## The '## Error' lines are the output recorded when this document was rendered;
## pheno_snps did not exist in that session, so every later step failed too.
xref_prop <- table(pheno_snps[["conditions"]])
## Error in eval(quote(list(...)), env): object 'pheno_snps' not found
pheno_snps$conditions
## Error in eval(expr, envir, enclos): object 'pheno_snps' not found
idx_tbl <- exprs(pheno_snps) > 5
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'pheno_snps' not found
new_tbl <- data.frame(row.names = rownames(exprs(pheno_snps)))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'pheno_snps' not found
## NOTE(review): idx_tbl[, idx_cols] drops to a vector when a condition has a
## single sample, which rowSums() cannot handle -- confirm this cannot happen.
for (n in names(xref_prop)) {
  new_tbl[[n]] <- 0
  idx_cols <- which(pheno_snps[["conditions"]] == n)
  prop_col <- rowSums(idx_tbl[, idx_cols]) / xref_prop[n]
  new_tbl[n] <- prop_col
}
## Error in eval(expr, envir, enclos): object 'xref_prop' not found
## Keep only the rows whose name contains LpaL13.
keepers <- grepl(x = rownames(new_tbl), pattern = "LpaL13")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grepl': error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'new_tbl' not found
new_tbl <- new_tbl[keepers, ]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
## Convert each proportion into 1.001 - proportion and cap the result at 1, so
## that a proportion of 1 becomes 0.001 rather than 0.
new_tbl[["strong22"]] <- 1.001 - new_tbl[["z2.2"]]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[["strong23"]] <- 1.001 - new_tbl[["z2.3"]]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
s22_na <- new_tbl[["strong22"]] > 1
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[s22_na, "strong22"] <- 1
## Error in new_tbl[s22_na, "strong22"] <- 1: object 'new_tbl' not found
s23_na <- new_tbl[["strong23"]] > 1
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[s23_na, "strong23"] <- 1
## Error in new_tbl[s23_na, "strong23"] <- 1: object 'new_tbl' not found
## Split the variant name into the SNP, Chromosome, and Position columns.
## NOTE(review): gsub() returns text, so Position remains a character column;
## confirm that CMplot accepts this or coerce it to a number.
new_tbl[["SNP"]] <- rownames(new_tbl)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'new_tbl' not found
new_tbl[["Chromosome"]] <- gsub(x = new_tbl[["SNP"]], pattern = "chr_(.*)_pos_.*", replacement = "\\1")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': object 'new_tbl' not found
new_tbl[["Position"]] <- gsub(x = new_tbl[["SNP"]], pattern = ".*_pos_(\\d+)_.*", replacement = "\\1")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': object 'new_tbl' not found
new_tbl <- new_tbl[, c("SNP", "Chromosome", "Position", "strong22", "strong23")]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
library(CMplot)
## Much appreciate for using CMplot.
## Full description, Bug report, Suggestion and the latest codes:
## https://github.com/YinLiLin/CMplot
## First plot only the strong23 column, then both columns as multiple tracks.
simplify <- new_tbl
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
simplify[["strong22"]] <- NULL
## Error in simplify[["strong22"]] <- NULL: object 'simplify' not found
CMplot(simplify, bin.size = 100000)
## Error in is.data.frame(x): object 'simplify' not found
CMplot(new_tbl, plot.type="m", multracks=TRUE, threshold = c(0.01, 0.05),
       threshold.lwd=c(1,1), threshold.col=c("black","grey"),
       amplify=TRUE, bin.size=10000,
       chr.den.col=c("darkgreen", "yellow", "red"),
       signal.col=c("red", "green", "blue"),
       signal.cex=1, file="jpg", memo="", dpi=300, file.output=TRUE, verbose=TRUE)
## Error in is.data.frame(x): object 'new_tbl' not found

SNP Density Circular Manhattan Rectangular Manhattan QQ

8.5 Try out MatrixEQTL

This tool looks a little opaque, but provides sample data with things that make sense to me and should be pretty easy to recapitulate in our data.

  1. covariates.txt: Columns are samples, rows are things from pData – the most likely ones of interest for our data would be zymodeme, sensitivity
  2. geneloc.txt: columns are ‘geneid’, ‘chr’, ‘left’, ‘right’. I guess I can assume left and right are start/stop; in which case this is trivially acquirable from fData.
  3. ge.txt: This appears to be a log(rpkm/cpm) table with rows as genes and columns as samples
  4. snpsloc.txt: columns are ‘snpid’, ‘chr’, ‘pos’
  5. snps.txt: columns are samples, rows are the ids from snpsloc, values are 0, 1, 2. I assume 0 is identical and 1..12 are the various A->TGC T->AGC C->AGT G->ACT
## Assemble the five inputs MatrixEQTL wants: covariates, gene locations, gene
## expression, SNP locations, and the SNP matrix.
## For this, let us use the 'new_snps' data structure.
## Caveat here: these need to be coerced to numbers.
my_covariates <- pData(new_snps)[, c("zymodemecategorical", "clinicalcategorical")]
for (col in colnames(my_covariates)) {
  my_covariates[[col]] <- as.numeric(as.factor(my_covariates[[col]]))
}
## MatrixEQTL wants the samples as columns.
my_covariates <- t(my_covariates)

my_geneloc <- fData(lp_expt)[, c("gid", "chromosome", "start", "end")]
colnames(my_geneloc) <- c("geneid", "chr", "left", "right")

my_ge <- exprs(normalize_expt(lp_expt, transform = "log2", filter = TRUE, convert = "cpm"))
used_samples <- tolower(colnames(my_ge)) %in% colnames(exprs(new_snps))
my_ge <- my_ge[, used_samples]

## Parse the chromosome and position out of the variant names, which look like
## chr_<chromosome>_pos_<position>_ref_<ref>_alt_<alt>.  The names are kept as an
## explicit 'snpid' column so that the table has the snpid/chr/pos layout.
snp_ids <- rownames(exprs(new_snps))
snp_pattern <- "^chr_(.+)_pos_(.+)_ref_.*$"
my_snpsloc <- data.frame(
  "snpid" = snp_ids,
  "chr" = gsub(pattern = snp_pattern, replacement = "\\1", x = snp_ids),
  "pos" = as.integer(gsub(pattern = snp_pattern, replacement = "\\2", x = snp_ids)),
  stringsAsFactors = FALSE)
## Oh, caveat here: Because of the way I stored the data,
## I could have duplicate rows which presumably will make matrixEQTL sad
test <- duplicated(my_snpsloc[, c("chr", "pos")])
## Each duplicated row would be another variant at that position;
## so in theory we would do a rle to number them I am guessing
## However, I do not have different variants so I think I can ignore this for the moment
## but will need to make my matrix either 0 or 1.
if (sum(test) > 0) {
  message("There are: ", sum(test), " duplicated entries.")
  keep_idx <- ! test
  my_snpsloc <- my_snpsloc[keep_idx, ]
}

## Reduce the variant matrix to presence (1) / absence (0).
my_snps <- exprs(new_snps)
one_idx <- my_snps > 0
my_snps[one_idx] <- 1

## Ok, at this point I think I have all the pieces which this method wants...
## Oh, no I guess not; it actually wants the data as a set of filenames...
library(MatrixEQTL)
## Write each input as a quoted, tab separated table including its row and
## column names.  These calls assume that the eqtl/ directory already exists.
write.table(my_snps, "eqtl/snps.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(my_snps, "eqtl/snps.tsv", )
write.table(my_snpsloc, "eqtl/snpsloc.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(my_snpsloc, "eqtl/snpsloc.tsv")
write.table(as.data.frame(my_ge), "eqtl/ge.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(as.data.frame(my_ge), "eqtl/ge.tsv")
write.table(as.data.frame(my_geneloc), "eqtl/geneloc.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(as.data.frame(my_geneloc), "eqtl/geneloc.tsv")
write.table(as.data.frame(my_covariates), "eqtl/covariates.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(as.data.frame(my_covariates), "eqtl/covariates.tsv")

## Model to fit; the alternatives are modelANOVA and modelLINEAR_CROSS.
useModel <- modelLINEAR

## Input files written above.
SNP_file_name <- "eqtl/snps.tsv"
snps_location_file_name <- "eqtl/snpsloc.tsv"
expression_file_name <- "eqtl/ge.tsv"
gene_location_file_name <- "eqtl/geneloc.tsv"
covariates_file_name <- "eqtl/covariates.tsv"
## The results for the local (cis) and distant (trans) pairs go to temporary files.
output_file_name_cis <- tempfile()
output_file_name_tra <- tempfile()
## Only the associations which are significant at this level are saved.
pvOutputThreshold_cis <- 0.1
pvOutputThreshold_tra <- 0.1
## Error covariance matrix; numeric() requests the identity.
errorCovariance <- numeric()
## errorCovariance <- read.table("Sample_Data/errorCovariance.txt")
## Maximum distance for a gene-SNP pair to count as local.
cisDist <- 1e6

## Genotypes: tab delimited, NA marks missing values, one row of column labels,
## one column of row labels, read in slices of 2,000 rows.
snps <- SlicedData$new()
snps$fileDelimiter <- "\t"
snps$fileOmitCharacters <- "NA"
snps$fileSkipRows <- 1
snps$fileSkipColumns <- 1
snps$fileSliceSize <- 2000
snps$LoadFile(SNP_file_name)

## Gene expression, read with the same layout as the genotypes.
gene <- SlicedData$new()
gene$fileDelimiter <- "\t"
gene$fileOmitCharacters <- "NA"
gene$fileSkipRows <- 1
gene$fileSkipColumns <- 1
gene$fileSliceSize <- 2000
gene$LoadFile(expression_file_name)

## Covariates; these are only loaded when a file name was provided.
cvrt <- SlicedData$new()
cvrt$fileDelimiter <- "\t"
cvrt$fileOmitCharacters <- "NA"
cvrt$fileSkipRows <- 1
cvrt$fileSkipColumns <- 1
if (length(covariates_file_name) > 0) {
  cvrt$LoadFile(covariates_file_name)
}

## Read the SNP and gene locations and run the analysis.
snpspos <- read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE)
genepos <- read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE)

me <- Matrix_eQTL_main(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name_tra,
    pvOutputThreshold = pvOutputThreshold_tra,
    useModel = useModel,
    errorCovariance = errorCovariance,
    verbose = TRUE,
    output_file_name.cis = output_file_name_cis,
    pvOutputThreshold.cis = pvOutputThreshold_cis,
    snpspos = snpspos,
    genepos = genepos,
    cisDist = cisDist,
    pvalue.hist = "qqplot",
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)
## Unless a variable named skip_load exists and is TRUE, print the session
## information, report the hpgltools commit in use, and save the session to
## savefile.  get0() returns NULL when skip_load is undefined, so the default
## is to save.
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  message(paste0("Saving to ", savefile))
  tmp <- sm(saveme(filename = savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset c07ff314c871617d0f3c45257d3c46e61270ed3e
## This is hpgltools commit: Thu Mar 17 11:25:56 2022 -0400: c07ff314c871617d0f3c45257d3c46e61270ed3e
## Saving to tmrc2_02sample_estimation_v202203.rda.xz
## Load the saved session back from the same file.
tmp <- loadme(filename = savefile)
---
title: "TMRC2 Comprehensive Data Analysis: 202201"
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: default
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: readable
  toc: true
  toc_float:
   collapsed: false
   smooth_scroll: false
---

<style>
  body .main-container {
    max-width: 1600px;
  }
</style>

```{r options, include = FALSE}
## Load hpgltools (and then the development copy in ~/hpgltools), set the knitr
## and R options used throughout the document, and derive the file names of
## this document and of its saved session from the version string.
library(hpgltools)
tt <- devtools::load_all("~/hpgltools")
knitr::opts_knit$set(progress = TRUE,
                     verbose = TRUE,
                     width = 90,
                     echo = TRUE)
knitr::opts_chunk$set(error = TRUE,
                      fig.width = 8,
                      fig.height = 8,
                      dpi = 96)
old_options <- options(digits = 4,
                       stringsAsFactors = FALSE,
                       knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size = 12))
ver <- "202203"
rundate <- format(Sys.Date(), format = "%Y%m%d")

## tmp <- try(sm(loadme(filename = gsub(pattern = "\\.Rmd", replace = "\\.rda\\.xz", x = previous_file))))
rmd_file <- glue::glue("tmrc2_02sample_estimation_v{ver}.Rmd")
## Spell out 'replacement' instead of relying on partial argument matching, and
## anchor the pattern so that only the trailing extension is swapped.
savefile <- gsub(pattern = "\\.Rmd$", replacement = ".rda.xz", x = rmd_file)

library(Heatplus)
```

```{r current_samplesheet}
## Path of the sample sheet used to build the expressionset.  The string has no
## {} fields, so glue() has nothing to interpolate here.
sample_sheet <- glue::glue("sample_sheets/tmrc2_samples_202203.xlsx")
```

# Introduction

This is mostly just a run of this worksheet to reacquaint myself with it.

This document is intended to provide a general overview of the TMRC2 samples
which have thus far been sequenced.  In some cases, this includes only those
samples starting in 2019; in other instances I am including our previous
(2015-2016) samples.

In all cases the processing performed was:

1.  Default trimming was performed.
2.  Hisat2 was used to map the remaining reads against the Leishmania
    panamensis genome revision 36.
3.  The alignments from hisat2 were used to count reads/gene against the
    revision 36 annotations with htseq.
4.  These alignments were also passed to the pileup functionality of samtools
    and the vcf/bcf utilities in order to make a matrix of all observed
    differences between each sample with respect to the reference.

The analyses in this document use the matrices of counts/gene from #3 and
variants/position from #4 in order to provide some images and metrics describing
the samples we have sequenced so far.

# Annotations

Everything which follows depends on the existing TriTrypDB annotations revision
46, circa 2019.  The following block loads a database of these annotations and
turns it into a matrix where the rows are genes and columns are all the
annotation types provided by TriTrypDB.

The same database was used to create a matrix of orthologous genes between
L.panamensis and all of the other species in the TriTrypDB.

```{r annot}
## Attach EuPathDB and the L. panamensis OrgDb package named by 'orgdb'.
tt <- sm(library(EuPathDB))
orgdb <- "org.Lpanamensis.MHOMCOL81L13.v46.eg.db"
tt <- sm(library(orgdb, character.only = TRUE))
pan_db <- org.Lpanamensis.MHOMCOL81L13.v46.eg.db
all_fields <- columns(pan_db)

## Fetch a handful of annotation fields keyed by gene ID and keep the gene table.
wanted_fields <- c("annot_gene_entrez_id", "annot_gene_name",
                   "annot_strand", "annot_chromosome", "annot_cds_length",
                   "annot_gene_product")
lp_annotations <- sm(load_orgdb_annotations(pan_db, keytype = "gid", fields = wanted_fields))
all_lp_annot <- lp_annotations[["genes"]]

lp_go <- sm(load_orgdb_go(pan_db))
## Two column table of gene IDs and CDS lengths.
lp_lengths <- setNames(all_lp_annot[, c("gid", "annot_cds_length")], c("ID", "length"))
## Lower-case the product descriptions.
all_lp_annot[["annot_gene_product"]] <- tolower(all_lp_annot[["annot_gene_product"]])
orthos <- sm(EuPathDB::extract_eupath_orthologs(db = pan_db))

## The hisat-based expressionset below uses these annotations as its gene information.
hisat_annot <- all_lp_annot
```

# Load a genome

```{r genome}
## Look up the TriTrypDB metadata entry for L. panamensis and show its columns.
meta <- EuPathDB::download_eupath_metadata(webservice = "tritrypdb")
lp_entry <- EuPathDB::get_eupath_entry(species = "Leishmania panamensis", metadata = meta)
colnames(lp_entry)
## Attach the BSgenome package by name and fetch the object it provides;
## get0() yields NULL if no object of that name exists.
testing_panamensis <- "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53"
## testing_panamensis <- EuPathDB::make_eupath_bsgenome(entry=lp_entry, eu_version="v46")
library(testing_panamensis, character.only = TRUE)
genome <- get0(testing_panamensis)
```

# TODO:

Resequence samples: TMRC20002, TMRC20006, TMRC20004 (maybe TMRC20008 and TMRC20029)

# Generate Expressionsets and Sample Estimation

The process of sample estimation takes two primary inputs:

1.  The sample sheet, which contains all the metadata we currently have on hand,
    including filenames for the outputs of #3 and #4 above.
2.  The gene annotations.

An expressionset is a data structure used in R to examine RNASeq data.  It
is comprised of annotations, metadata, and expression data.  In the case of our
processing pipeline, the location of the expression data is provided by the
filenames in the metadata.

The first lines of the following block create the Expressionset.  All of the
following lines perform various normalizations and generate plots from it.

## Notes

The following samples are much lower coverage:

* TMRC20002
* TMRC20006
* TMRC20007
* TMRC20008

20210610: I made some manual changes to the sample sheet which I
downloaded, filling in some zymodeme with 'unknown'

## TODO:

1.  Do the multi-gene family removal right here instead of way down at the bottom
2.  Add zymodeme snps to the annotation later.
3.  Start phylogenetic analysis of variant table.

```{r new_samples_hisat}
## Metadata columns to sanitize and convert to factors.
## NOTE(review): "zymodemecategorical" used to be listed twice; if the second
## entry was meant to be a different column, add it here.
sanitize_columns <- c("passagenumber", "clinicalresponse", "clinicalcategorical",
                      "zymodemecategorical")
## Create the expressionset from the sample sheet, use the zymodeme as the
## condition, apply the nonzero and coverage subsets, remove the multi-gene
## families by product description, and clean up the metadata columns above.
lp_expt <- create_expt(sample_sheet,
                       gene_info = hisat_annot,
                       annotation = orgdb,
                       id_column = "hpglidentifier",
                       file_column = "lpanamensisv36hisatfile") %>%
  set_expt_conditions(fact = "zymodemecategorical") %>%
  subset_expt(nonzero = 8550) %>%
  subset_expt(coverage = 5000000) %>%
  semantic_expt_filter(semantic = c("amastin", "gp63", "leishmanolysin"),
                       semantic_column = "annot_gene_product") %>%
  sanitize_expt_metadata(columns = sanitize_columns) %>%
  set_expt_factors(columns = sanitize_columns, class = "factor")

## Library sizes, nonzero genes, and count distributions of the remaining samples.
libsizes <- plot_libsize(lp_expt)
pp(file = "images/lp_expt_libsizes.png", image = libsizes[["plot"]], width = 14, height = 9)
## I think samples 7,10 should be removed at minimum, probably also 9,11
nonzero <- plot_nonzero(lp_expt)
pp(file = "images/lp_nonzero.png", image = nonzero[["plot"]], width = 9, height = 9)

lp_box <- plot_boxplot(lp_expt)
pp(file = "images/lp_expt_boxplot.png", image = lp_box, width = 12, height = 9)

filter_plot <- plot_libsize_prepost(lp_expt)
filter_plot[["lowgene_plot"]]
filter_plot[["count_plot"]]
```

## Distribution Visualization

Najib's favorite plots are of course the PCA/TSNE.  These are nice to look at in
order to get a sense of the relationships between samples.  They also provide a
good opportunity to see what happens when one applies different normalizations,
surrogate analyses, filters, etc.  In addition, one may set different
experimental factors as the primary 'condition' (usually the color of plots) and
surrogate 'batches'.

## By Susceptibility

Make a categorical version of column 'Q' in the sample sheet using the following cutoffs (written here as percentages; the code below applies them as fractions, e.g. 0.35):

* 0 <= x <= 35 is resistant
* 36 <= x <= 48 is ambiguous
* 49 <= x is sensitive

```{r susceptibility}
## Bin the historical susceptibility measurement into categories and store the
## result in the metadata as 'sus_category'.
starting <- as.numeric(pData(lp_expt)[["susceptibilityinfectionreduction32ugmlsbvhistoricaldata"]])
na_idx <- is.na(starting)
## Start from "unknown" so missing (or non-numeric) measurements keep that label.
sus_categorical <- rep("unknown", length(starting))

## The bins are contiguous: everything strictly between the resistant and
## sensitive cutoffs is ambiguous.  With the previous 0.36/0.48 bounds, values
## such as 0.355 or 0.485 matched no bin and kept their numeric value as a label.
resist_idx <- !na_idx & starting <= 0.35
sus_categorical[resist_idx] <- "resistant"
indeterminant_idx <- !na_idx & starting > 0.35 & starting < 0.49
sus_categorical[indeterminant_idx] <- "ambiguous"
susceptible_idx <- !na_idx & starting >= 0.49
sus_categorical[susceptible_idx] <- "sensitive"

pData(lp_expt)[["sus_category"]] <- sus_categorical
table(sus_categorical)
```

```{r pre_questions}
## Colors keyed by the zymodeme condition names.
clinical_colors <- list(
  "z1.0" = "#333333",
  "z2.0" = "#555555",
  "z3.0" = "#777777",
  "z2.1" = "#874400",
  "z2.2" = "#0000cc",
  "z2.3" = "#cc0000",
  "z2.4" = "#df7000",
  "unknown" = "#cbcbcb",
  "null" = "#000000")
## Use the susceptibility categories as the batch factor and apply the colors.
clinical_samples <- set_expt_colors(
  set_expt_batches(lp_expt, fact = sus_categorical),
  clinical_colors)
table(pData(clinical_samples)[["condition"]])

## Quantile/log2/cpm normalized PCA of all samples.
clinical_norm <- normalize_expt(
  clinical_samples,
  filter = TRUE, convert = "cpm", norm = "quant", transform = "log2")
zymo_pca <- plot_pca(
  clinical_norm,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE)
pp(file = "images/zymo_pca_sus_shape.png", image = zymo_pca$plot)

## The same view restricted to the z2.2 and z2.3 samples.
only_two_types <- subset_expt(
  clinical_samples,
  subset = "condition=='z2.3'|condition=='z2.2'")
only_two_norm <- sm(normalize_expt(
  only_two_types,
  filter = TRUE, batch = FALSE, convert = "cpm", norm = "quant", transform = "log2"))
onlytwo_pca <- plot_pca(
  only_two_norm,
  plot_title = "PCA of z2.2 and z2.3 parasite expression values",
  plot_labels = FALSE)
pp(file = "images/zymo_z2.2_z2.3_pca_sus_shape.pdf", image = onlytwo_pca$plot)

## 3D rendering of the first PCA.
zymo_3dpca <- plot_3d_pca(zymo_pca)
zymo_3dpca$plot

## TSNE without quantile normalization.
clinical_n <- sm(normalize_expt(
  clinical_samples,
  filter = TRUE, batch = FALSE, convert = "cpm", transform = "log2"))
zymo_tsne <- plot_tsne(clinical_n, plot_title = "TSNE of parasite expression values")
zymo_tsne$plot

## Repeat the PCA/TSNE with svaseq as the batch method.
clinical_nb <- normalize_expt(
  clinical_samples,
  filter = TRUE, batch = "svaseq", convert = "cpm", transform = "log2")
clinical_nb_pca <- plot_pca(
  clinical_nb,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE)
pp(file = "images/clinical_nb_pca_sus_shape.png", image = clinical_nb_pca$plot)

clinical_nb_tsne <- plot_tsne(clinical_nb, plot_title = "TSNE of parasite expression values")
clinical_nb_tsne$plot

## Correlation heatmap of the normalized data.  The title is kept on a single
## line: the previous literal was wrapped across source lines, which embedded
## newlines and indentation whitespace in the rendered title.
corheat <- plot_corheat(clinical_norm,
                        plot_title = "Correlation heatmap of parasite expression values")
corheat$plot

plot_sm(clinical_norm)$plot
```

## By Cure/Fail status

```{r cf_status}
## Colors keyed by the cure/fail categories.
cf_colors <- list(
    "cure" = "#006f00",
    "fail" = "#9dffa0",
    "unknown" = "#cbcbcb",
    "notapplicable" = "#000000")
## Condition: cure/fail category; batch: susceptibility category.
cf_expt <- set_expt_conditions(lp_expt, fact = "clinicalcategorical") %>%
  set_expt_batches(fact = sus_categorical) %>%
  set_expt_colors(cf_colors)
table(pData(cf_expt)[["condition"]])

## cf_norm is reused by pca_information() below.  It used to be recomputed there
## with exactly the same arguments, so it is now computed once.
cf_norm <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
                          norm = "quant", filter = TRUE)
start_cf <- plot_pca(cf_norm, plot_title = "PCA of parasite expression values",
                     plot_labels = FALSE)
pp(file = "images/cf_sus_shape.png", image = start_cf$plot)

## The same PCA with svaseq as the batch method.
cf_nb <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
                        norm = "quant", filter = TRUE, batch = "svaseq")
cf_nb_pca <- plot_pca(cf_nb, plot_title = "PCA of parasite expression values",
                      plot_labels = FALSE)
pp(file = "images/cf_sus_share_nb.png", image = cf_nb_pca$plot)

## Compare the first 6 components against the listed metadata factors.
test <- pca_information(cf_norm,
                        expt_factors = c("clinicalcategorical", "zymodemecategorical",
                                         "pathogenstrain", "passagenumber"),
                        num_components = 6, plot_pcas = TRUE)
test$anova_p
test$cor_heatmap
```

```{r susceptibility_pca}
## Colors keyed by the susceptibility categories.
sus_colors <- list(
  "resistant" = "#8563a7",
  "sensitive" = "#8d0000",
  "ambiguous" = "#cbcbcb",
  "unknown" = "#000000")

## Condition: susceptibility; batch: zymodeme.  The z24 and z21 batches are
## removed one after the other.
sus_expt <- set_expt_conditions(lp_expt, fact = "sus_category")
sus_expt <- set_expt_batches(sus_expt, fact = "zymodemecategorical")
sus_expt <- set_expt_colors(sus_expt, colors = sus_colors)
sus_expt <- subset_expt(sus_expt, subset = "batch!='z24'")
sus_expt <- subset_expt(sus_expt, subset = "batch!='z21'")

## Quantile normalized PCA.
sus_norm <- normalize_expt(
  sus_expt,
  filter = TRUE, convert = "cpm", norm = "quant", transform = "log2")
sus_pca <- plot_pca(
  sus_norm,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE)
pp(file = "images/sus_norm_pca.png", image = sus_pca[["plot"]])

## PCA with svaseq as the batch method.
sus_nb <- normalize_expt(
  sus_expt,
  filter = TRUE, batch = "svaseq", convert = "cpm", transform = "log2")
sus_nb_pca <- plot_pca(
  sus_nb,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE)
pp(file = "images/sus_nb_pca.png", image = sus_nb_pca[["plot"]])
```

# Zymodeme analyses

The following sections perform a series of analyses which seek to elucidate
differences between the zymodemes 2.2 and 2.3 either through differential
expression or variant profiles.

## Differential expression

### With respect to zymodeme attribution

TODO: Do this with and without sva and compare the results.

```{r zymo_de, fig.show = "hide"}
## Restrict to the z2.2 and z2.3 samples; zy_norm is used by the heatmaps later.
zy_expt <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
zy_norm <- normalize_expt(zy_expt, filter = TRUE, convert = "cpm", norm = "quant")

## Pairwise comparisons without a batch term in the model.
zy_de_nobatch <- all_pairwise(zy_expt, filter = TRUE, model_batch = FALSE)
zy_table_nobatch <- combine_de_tables(
    zy_de_nobatch, excel = glue::glue("excel/zy_tables_nobatch-v{ver}.xlsx"),
    gmt = glue::glue("gmt/zymodeme_nobatch-v{ver}.gmt"))
zy_sig_nobatch <- extract_significant_genes(
    zy_table_nobatch,
    excel = glue::glue("excel/zy_sig_nobatch-v{ver}.xlsx"))

## The same comparisons using svaseq.
zy_de_sva <- all_pairwise(zy_expt, filter = TRUE, model_batch = "svaseq")
zy_table_sva <- combine_de_tables(
    zy_de_sva, excel = glue::glue("excel/zy_tables_sva-v{ver}.xlsx"),
    gmt = glue::glue("gmt/zymodeme_sva-v{ver}.gmt"))
zy_sig_sva <- extract_significant_genes(
    zy_table_sva,
    excel = glue::glue("excel/zy_sig_sva-v{ver}.xlsx"))

## The chunks which follow all read 'zy_table', which this chunk did not create.
## Point it at the svaseq result, matching the cure/fail and susceptibility
## comparisons which also use model_batch = "svaseq".
## NOTE(review): confirm that the sva table (rather than the nobatch table) is
## the one the downstream plots and ontology searches should use.
zy_table <- zy_table_sva
```

### Images of zymodeme DE

```{r zymod_de_pictures}
## Save the DESeq2 MA plot of the z2.3 vs z2.2 comparison.
zymo_ma <- zy_table[["plots"]][["z23_vs_z22"]][["deseq_ma_plots"]][["plot"]]
pp(file = "images/zymo_ma.png", image = zymo_ma)
```

## With respect to cure/failure

In contrast, we can search for genes which are differentially
expressed with respect to cure/failure status.

```{r curefail_de, fig.show = "hide"}
## Pairwise cure/fail comparisons using svaseq, written out as xlsx files.
cf_de <- all_pairwise(cf_expt, filter = TRUE, model_batch = "svaseq")
cf_table <- combine_de_tables(
  cf_de,
  excel = glue::glue("excel/cf_tables-v{ver}.xlsx"))
cf_sig <- extract_significant_genes(
  cf_table,
  excel = glue::glue("excel/cf_sig-v{ver}.xlsx"))

## Save the DESeq2 MA plot of fail vs cure.
cf_ma <- cf_table[["plots"]][["fail_vs_cure"]][["deseq_ma_plots"]][["plot"]]
pp(file = "images/cf_ma.png", image = cf_ma)
```

## With respect to susceptibility

Finally, we can use our category of susceptibility and look for genes
which change from sensitive to resistant.  Keep in mind, though, that
for the moment we have a lot of ambiguous and unknown strains.

```{r curefail_de02, fig.show = "hide"}
## Pairwise susceptibility comparisons using svaseq, written out as xlsx files.
sus_de <- all_pairwise(sus_expt, filter = TRUE, model_batch = "svaseq")
sus_table <- combine_de_tables(
  sus_de,
  excel = glue::glue("excel/sus_tables-v{ver}.xlsx"))
sus_sig <- extract_significant_genes(
  sus_table,
  excel = glue::glue("excel/sus_sig-v{ver}.xlsx"))
```

```{r zymod_de_pictures01}
## Show the first 20 rows of the significant up and down DESeq2 tables.
sus_ups <- sus_sig$deseq$ups$sensitive_vs_resistant
knitr::kable(head(sus_ups, n = 20))

sus_downs <- sus_sig$deseq$downs$sensitive_vs_resistant
knitr::kable(head(sus_downs, n = 20))

## Save the DESeq2 MA plot of sensitive vs resistant.
sus_ma <- sus_table[["plots"]][["sensitive_vs_resistant"]][["deseq_ma_plots"]][["plot"]]
pp(file = "images/sus_ma.png", image = sus_ma)

## test <- ggplt(sus_ma)
```

## Ontology searches

Now let us look for ontology categories which are increased in the 2.3
samples followed by the 2.2 samples.

```{r go, fig.show = "hide"}
## First entry of the DESeq2 up/down significant tables.
zy_up_genes <- zy_table[["significant"]][["deseq"]][["ups"]][[1]]
zy_down_genes <- zy_table[["significant"]][["deseq"]][["downs"]][[1]]

## Gene categories more represented in the 2.3 group.
zy_go_up <- simple_goseq(
  sig_genes = zy_up_genes,
  go_db = lp_go,
  length_db = lp_lengths)

## Gene categories more represented in the 2.2 group.
zy_go_down <- simple_goseq(
  sig_genes = zy_down_genes,
  go_db = lp_go,
  length_db = lp_lengths)
```

### A couple plots from the differential expression

#### Number of genes in agreement among DE methods, 2.3 more than 2.2

In the function 'combine_de_tables()' above, one of the tasks
performed is to look at the agreement among DESeq2, limma, and edgeR.
The following show a couple of these for the set of genes observed
with |log2 fold-change| >= 1 (at least a 2-fold change) and adjusted p-value <= 0.05.

```{r de_plots}
## Venn diagram of the up genes shared among methods (first contrast, p_lfc1).
zy_venns <- zy_table[["venns"]][[1]][["p_lfc1"]]
zy_venns[["up_noweight"]]
```

#### Number of genes in agreement among DE methods, 2.2 more than 2.3

```{r de_plots01}
## Venn diagram of the down genes shared among methods (first contrast, p_lfc1).
zy_venns <- zy_table[["venns"]][[1]][["p_lfc1"]]
zy_venns[["down_noweight"]]
```

#### goseq ontology plots of groups of genes, 2.3 more than 2.2


```{r goseq_up}
## Print the 'bpp_plot_over' p-value plot for the genes higher in 2.3.
zy_go_up_plots <- zy_go_up$pvalue_plots
zy_go_up_plots$bpp_plot_over
```

#### goseq ontology plots of groups of genes, 2.2 more than 2.3

```{r goseq_down}
## Print the 'bpp_plot_over' p-value plot for the genes higher in 2.2.
zy_go_down_plots <- zy_go_down$pvalue_plots
zy_go_down_plots$bpp_plot_over
```

## Look for agreement between sensitivity and zymodemes

Remind myself, the data structures are (zy|sus)_(de|table|sig).

```{r sensitive_vs_zymo}
## Pull the combined tables for the two contrasts of interest.
zy_df <- zy_table[["data"]][["z23_vs_z22"]]
sus_df <- sus_table[["data"]][["sensitive_vs_resistant"]]

## Join on gene ID; merge() suffixes the shared column names with .x (zymodeme)
## and .y (susceptibility).
both_df <- merge(zy_df, sus_df, by = "row.names")
plot_df <- setNames(
  both_df[, c("deseq_logfc.x", "deseq_logfc.y")],
  c("z23_vs_z22", "sensitive_vs_resistant"))
rownames(plot_df) <- both_df[["Row.names"]]

## Scatter plot and correlation of the two sets of DESeq2 logFC values.
compare <- plot_linear_scatter(plot_df)
pp(file = "images/compare_sus_zy.png", image = compare$scatter)
compare$cor
```

## Zymodeme enzyme gene IDs

Najib read me an email listing off the gene names associated with the zymodeme
classification.  I took those names and cross referenced them against the
Leishmania panamensis gene annotations and found the following:

They are:

1. ALAT: LPAL13_120010900 -- alanine aminotransferase
2. ASAT: LPAL13_340013000 -- aspartate aminotransferase
3. G6PD: LPAL13_000054100 -- glucose-6-phosphate 1-dehydrogenase
4. NH: LPAL13_140006100, LPAL13_180018500 -- inosine-guanine nucleoside hydrolase
5. MPI: LPAL13_320022300 (maybe) -- mannose phosphate isomerase (I chose phosphomannose isomerase)

Given these 6 gene IDs (NH has two gene IDs associated with it), I can do some
looking for specific differences among the various samples.

### Expression levels of zymodeme genes

The following creates a colorspace (red to green) heatmap showing the observed
expression of these genes in every sample.

```{r zymodemes}
## Zymodeme enzyme names mapped to their gene IDs (plus the literal "other").
zymodeme_ids <- c(
  "ALAT" = "LPAL13_120010900",
  "ASAT" = "LPAL13_340013000",
  "G6PD" = "LPAL13_000054100",
  "NHv1" = "LPAL13_140006100",
  "NHv2" = "LPAL13_180018500",
  "MPI" = "LPAL13_320022300",
  "other" = "other")
my_genes <- unname(zymodeme_ids)
my_names <- names(zymodeme_ids)

## Keep only those genes from the normalized data and plot them as a heatmap.
zymo_expt <- exclude_genes_expt(zy_norm, ids = my_genes, method = "keep")
zymo_heatmap <- plot_sample_heatmap(zymo_expt, row_label = my_names)
zymo_heatmap
```

## Empirically observed Zymodeme genes from differential expression analysis

In contrast, the following plots take the set of genes which are shared among
all differential expression methods (|lfc| >= 1.0 and adjp <= 0.05) and use them
to make categories of genes which are increased in 2.3 or 2.2.

```{r zymodeme_genes_empirical}
## Genes found by all DE methods; take the 'all' table of the first up entry.
shared_zymo <- intersect_significant(zy_table)
up_shared <- shared_zymo[["ups"]][[1]][["data"]][["all"]]
up_shared_ids <- rownames(up_shared)
up_shared_ids
## Keep only those genes in the normalized data.
upshared_expt <- exclude_genes_expt(zy_norm, ids = up_shared_ids, method = "keep")
```

We can plot a quick heatmap to get a sense of the differences observed
between the genes which are different between the two zymodemes.

### Heatmap of zymodeme gene expression increased in 2.3 vs. 2.2

```{r zymoempup}
## Heatmap of the shared up genes, labeled by gene ID.
up_labels <- rownames(up_shared)
high_23_heatmap <- plot_sample_heatmap(upshared_expt, row_label = up_labels)
high_23_heatmap
```

### Heatmap of zymodeme gene expression increased in 2.2 vs. 2.3

```{r zymoemdown}
## The same as above for the shared down genes.
down_shared <- shared_zymo[["downs"]][[1]][["data"]][["all"]]
down_labels <- rownames(down_shared)
downshared_expt <- exclude_genes_expt(zy_norm, ids = down_labels, method = "keep")
high_22_heatmap <- plot_sample_heatmap(downshared_expt, row_label = down_labels)
high_22_heatmap
```

# SNP profiles

Now I will combine our previous samples and our new samples in the
hopes of finding variant positions which help elucidate currently
unknown aspects of either group via their clustering to known samples
from the other group. In other words, we do not know the zymodeme
annotations for the old samples nor the strain identities (or the
shortcut 'chronic vs. self-healing') for the new samples. I hope to
make educated guesses given the variant profiles. There are some
differences in how the previous and current data sets were analyzed
(though I have since redone the old samples so it should be trivial to
remove those differences now).

I added our 2016 data to a specific TMRC2 sample sheet,
dated 20191203.  Thus I will load the data here.  That previous data
was mapped using tophat, so I will also need to make some changes to
the gene names to accommodate the two mappings.

```{r oldnew_variants}
## Load the earlier samples from their own sample sheet.
old_expt <- create_expt("sample_sheets/tmrc2_samples_20191203.xlsx",
                        file_column = "tophat2file")

## Strip the 'exon_' prefix and the trailing '.E1' from the new gene names.
tt <- lp_expt[["expressionset"]]
rownames(tt) <- gsub(pattern = "\\.E1$", replacement = "",
                     x = gsub(pattern = "^exon_", replacement = "", x = rownames(tt)))
lp_expt[["expressionset"]] <- tt

## The old names carry the same prefix, but a trailing '.1' instead.
tt <- old_expt[["expressionset"]]
rownames(tt) <- gsub(pattern = "\\.1$", replacement = "",
                     x = gsub(pattern = "^exon_", replacement = "", x = rownames(tt)))
old_expt[["expressionset"]] <- tt
rm(tt)
```

## Create the SNP expressionset

One other important caveat, we have a group of new samples which have
not yet run through the variant search pipeline, so I need to remove
them from consideration.  Though it looks like they finished overnight...

```{r count_expt_old_new}
## The next line drops the samples which are missing the SNP pipeline.
## Keep only the samples which have a bcftable entry.  The subset string refers
## to lp_expt by name, so it depends on the current global lp_expt.
lp_snp <- subset_expt(lp_expt, subset="!is.na(pData(lp_expt)[['bcftable']])")
## Count the variants for the new and old samples; both read the 'bcftable'
## metadata column, the old samples additionally pass snp_column = 2.
new_snps <- sm(count_expt_snps(lp_snp, annot_column = "bcftable"))
old_snps <- sm(count_expt_snps(old_expt, annot_column = "bcftable", snp_column = 2))

## Number of nonzero entries per sample in the new data.
nonzero_snps <- exprs(new_snps) != 0
colSums(nonzero_snps)

## Combine old and new, then normalize the combined set.
both_snps <- combine_expts(new_snps, old_snps)
both_norm <- normalize_expt(both_snps, transform = "log2", convert = "cpm", filter = TRUE)

## strains <- both_norm[["design"]][["strain"]]
## A copy of the normalized data with the strain as the condition.
both_strain <- set_expt_conditions(both_norm, fact = "strain")
```

The data structure 'both_norm' now contains our 2016 data along with
the newer data collected since 2019.

## Plot of SNP profiles for zymodemes

The following plot shows the SNP profiles of all samples (old and new) where the
colors at the top show either the 2.2 strains (orange), 2.3 strains (green), the
previous samples (purple), or the various lab strains (pink etc).

```{r plotting_variants}
## Distance heatmap of the combined (old and new) normalized variant data.
old_new_variant_heatmap <- plot_disheat(both_norm)
pp(image = old_new_variant_heatmap,
   file = "images/raw_snp_disheat.png",
   width = 12, height = 12)
```

The function get_snp_sets() takes the provided metadata factor (in
this case 'condition') and looks for variants which are exclusive to
each element in it.  In this case, this is looking for differences
between 2.2 and 2.3, as well as the set shared among them.

```{r get_snp_sets1}
## Variant sets by condition, and the combined old/new expression data.
snp_sets <- get_snp_sets(both_snps, factor = "condition")
both_expt <- combine_expts(lp_expt, old_expt)

snp_genes <- sm(snps_vs_genes(both_expt, snp_sets, expt_name_col = "chromosome"))
## I think we have some metrics here we can plot...

## Restrict the variants to the six zymodeme enzyme genes and plot them.
zymodeme_gene_ids <- c(
  "LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
  "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300")
snp_subset <- sm(snp_subset_genes(both_expt, both_snps, genes = zymodeme_gene_ids))
zymo_heat <- plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset)))
zymo_heat
```

Didn't I create a set of densities by chromosome?
Oh I think they come in from get_snp_sets()

## SNPS associated with clinical response in the TMRC samples

```{r snp_clinical}
## Variant sets of the new samples, split by clinical response.
clinical_sets <- get_snp_sets(new_snps, factor = "clinicalresponse")

## Plot the 'density' element of the sets, keeping only the entries whose name
## contains "LpaL" (presumably the chromosomes rather than scaffolds -- confirm).
density_vec <- clinical_sets[["density"]]
chromosome_idx <- grep(pattern = "LpaL", x = names(density_vec))
density_df <- as.data.frame(density_vec[chromosome_idx])
density_df[["chr"]] <- rownames(density_df)
colnames(density_df) <- c("density_vec", "chr")
## One bar per entry, with the x-axis labels rotated so they remain legible.
ggplot(density_df, aes_string(x = "chr", y = "density_vec")) +
  ggplot2::geom_col() +
  ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
                 axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5))

## clinical_written <- write_variants(new_snps)
```

### Cross reference these variants by gene

```{r snp_classifications}
clinical_genes <- sm(snps_vs_genes(lp_expt, clinical_sets, expt_name_col = "chromosome"))

## Join the per-gene summary to the gene annotations by gene ID.
gene_summary <- as.data.frame(clinical_genes[["summary_by_gene"]])
gene_annotations <- as.data.frame(fData(lp_expt))
snp_density <- merge(gene_summary, gene_annotations, by = "row.names")
## Keep merged columns 1, 2, 4 and 15 and name them for the steps below.
snp_density <- snp_density[, c(1, 2, 4, 15)]
colnames(snp_density) <- c("name", "snps", "product", "length")
snp_density[["product"]] <- tolower(snp_density[["product"]])
snp_density[["length"]] <- as.numeric(snp_density[["length"]])
## Variants per unit of gene length, highest first.
snp_density[["density"]] <- snp_density[["snps"]] / snp_density[["length"]]
snp_idx <- order(snp_density[["density"]], decreasing = TRUE)
snp_density <- snp_density[snp_idx, ]

## Filter these for [A|a]mastin gp63 Leishmanolysin
## A single alternation drops every product matching any of the patterns.
removers <- c("amastin", "gp63", "leishmanolysin")
drop_idx <- grepl(pattern = paste(removers, collapse = "|"), x = snp_density[["product"]])
snp_density <- snp_density[!drop_idx, ]
```


```{r snp_intersections}
clinical_snps <- snps_intersections(lp_expt, clinical_sets, chr_column = "chromosome")

fail_ref_snps <- as.data.frame(clinical_snps[["inters"]][["failure, reference strain"]])
cure_snps <- as.data.frame(clinical_snps[["inters"]][["cure"]])

head(fail_ref_snps)
head(cure_snps)

## Combine the per-gene summaries of the cure and failure sets into one table
## with the columns cure_snps and fail_snps.
annot <- fData(lp_expt)
clinical_interest <- as.data.frame(clinical_snps[["gene_summaries"]][["cure"]])
clinical_interest <- merge(clinical_interest,
                           as.data.frame(clinical_snps[["gene_summaries"]][["failure, reference strain"]]),
                           by = "row.names")
rownames(clinical_interest) <- clinical_interest[["Row.names"]]
clinical_interest[["Row.names"]] <- NULL
colnames(clinical_interest) <- c("cure_snps","fail_snps")

## Add the two columns to the gene annotations.  merge() sorts by the key and,
## by default, drops rows without a match; the annotations are written back as
## the feature data, so every gene is kept (all.x = TRUE, unmatched genes get
## NA) and the original row order is restored afterwards.
original_ids <- rownames(annot)
annot <- merge(annot, clinical_interest, by = "row.names", all.x = TRUE)
rownames(annot) <- annot[["Row.names"]]
annot[["Row.names"]] <- NULL
annot <- annot[original_ids, , drop = FALSE]
fData(lp_expt$expressionset) <- annot
```

# Zymodeme for new samples

The heatmap produced here should show the variants only for the zymodeme genes.

## Hunt for snp clusters

I am thinking that if we find clusters of locations which are variant, that
might provide some PCR testing possibilities.

```{r new_zymo}
## Drop the 2.1, 2.4, unknown, and null
## (only the samples whose condition is z2.2 or z2.3 are kept).
pruned_snps <- subset_expt(new_snps, subset="condition=='z2.2'|condition=='z2.3'")
## Variant sets split by the zymodemecategorical metadata column.
new_sets <- get_snp_sets(pruned_snps, factor = "zymodemecategorical")
summary(new_sets)
## The original notes on the intersection names follow; the code below looks up
## the two-character names "10" and "01" (presumably 2.2 and 2.3 -- confirm
## against new_sets[["set_names"]]).
## 1000000: 2.2
## 0100000: 2.3

summary(new_sets[["intersections"]][["10"]])
summary(new_sets[["intersections"]][["01"]])
```

Thus we see that there are 511 variants associated with 2.2 and 49,790 associated with 2.3.

### A small function for searching for potential PCR primers

The following function uses the positional data to look for sequential
mismatches associated with zymodeme in the hopes that there will be
some regions which would provide good potential targets for a
PCR-based assay.

```{r sequential_search, eval=FALSE}
## Distance from each variant to the previous one on the same chromosome.
## Expects chr/pos sorted by chromosome and then position; the first variant of
## each chromosome gets its own position as the distance.
.distance_to_previous <- function(chr, pos) {
  n <- length(pos)
  if (n == 0) {
    return(numeric(0))
  }
  same_chr <- c(FALSE, chr[-1] == chr[-n])
  previous <- c(0, pos[-n])
  ifelse(same_chr, pos - previous, pos)
}

## Find runs of closely spaced variants within one of the sets of snp_sets.
##
## snp_sets: list with the elements 'intersections' (variant names of the form
##   chr_<chr>_pos_<pos>_ref_...) and 'set_names' (set name by intersection).
## conditions: numeric index of the intersection, or the set name to look up in
##   'set_names'; NULL selects the first intersection.
## minimum: minimum run length (counted in variants) for a run to be returned.
## maximum_separation: largest distance between neighbours within a run.
##
## Returns a data frame (columns chr and pos) with the last variant of every
## sufficiently long run.  As a side effect, doubles.csv, one_away.csv and
## two_away.csv (variants 1, 2 and 3 bases after their predecessor) are written
## to the working directory.
sequential_variants <- function(snp_sets, conditions = NULL, minimum = 3, maximum_separation = 3) {
  if (is.null(conditions)) {
    conditions <- 1
  }
  intersection_sets <- snp_sets[["intersections"]]
  intersection_names <- snp_sets[["set_names"]]
  if (is.numeric(conditions)) {
    chosen_intersection <- conditions
  } else {
    intersection_idx <- intersection_names == conditions
    chosen_intersection <- names(intersection_names)[which(intersection_idx)]
    ## Fail with a readable message instead of an opaque subscript error.
    if (length(chosen_intersection) != 1) {
      stop("Expected exactly one SNP set named '", conditions, "', found ",
           length(chosen_intersection), ".", call. = FALSE)
    }
  }

  possible_positions <- intersection_sets[[chosen_intersection]]
  if (length(possible_positions) == 0) {
    message("There are no variants in the chosen set.")
    return(data.frame("chr" = character(0), "pos" = numeric(0)))
  }

  ## Parse chromosome and position out of the variant names and sort by them.
  position_table <- data.frame(row.names = possible_positions)
  pat <- "^chr_(.+)_pos_(.+)_ref_.*$"
  position_table[["chr"]] <- gsub(pattern = pat, replacement = "\\1", x = rownames(position_table))
  position_table[["pos"]] <- as.numeric(gsub(pattern = pat, replacement = "\\2", x = rownames(position_table)))
  position_idx <- order(position_table[, "chr"], position_table[, "pos"])
  position_table <- position_table[position_idx, ]
  position_table[["dist"]] <- .distance_to_previous(position_table[["chr"]],
                                                    position_table[["pos"]])

  ## Working interactively here.

  doubles <- position_table[["dist"]] == 1
  doubles <- position_table[doubles, ]
  write.csv(doubles, "doubles.csv")

  one_away <- position_table[["dist"]] == 2
  one_away <- position_table[one_away, ]
  write.csv(one_away, "one_away.csv")

  two_away <- position_table[["dist"]] == 3
  two_away <- position_table[two_away, ]
  write.csv(two_away, "two_away.csv")

  ## Distances between successive close pairs.  'remaining' is not part of the
  ## return value; it is kept for inspection when stepping through the function.
  ## The previous row-by-row loop failed when 'combined' had no rows.
  combined <- rbind(doubles, one_away, two_away)
  position_idx <- order(combined[, "chr"], combined[, "pos"])
  combined <- combined[position_idx, ]
  combined[["dist_pair"]] <- .distance_to_previous(combined[["chr"]], combined[["pos"]])

  dist_pair_maximum <- 1000
  dist_pair_minimum <- 200
  dist_pair_idx <- combined[["dist_pair"]] <= dist_pair_maximum &
    combined[["dist_pair"]] >= dist_pair_minimum
  remaining <- combined[dist_pair_idx, ]
  no_weak_idx <- grepl(pattern="ref_(G|C)", x=rownames(remaining))
  remaining <- remaining[no_weak_idx, ]

  print(head(table(position_table[["dist"]])))
  sequentials <- position_table[["dist"]] <= maximum_separation
  message("There are ", sum(sequentials), " candidate regions.")

  ## Run length encoding finds the runs of neighbouring variants; the length of
  ## every TRUE run is recorded on the last row of that run.
  rle_result <- rle(sequentials)
  rle_values <- rle_result[["values"]]
  rle_lengths <- rle_result[["lengths"]]
  true_sequentials <- rle_lengths[rle_values]
  rle_idx <- cumsum(rle_lengths)[which(rle_values)]

  position_table[["last_sequential"]] <- 0
  position_table[rle_idx, "last_sequential"] <- true_sequentials
  message("The maximum sequential set is: ", max(position_table[["last_sequential"]]), ".")

  wanted_idx <- position_table[["last_sequential"]] >= minimum
  wanted <- position_table[wanted_idx, c("chr", "pos")]
  return(wanted)
}

## Search the z22 set; runs of a single variant are accepted here.
zymo22_sequentials <- sequential_variants(
  new_sets,
  conditions = "z22",
  minimum = 1, maximum_separation = 2)
dim(zymo22_sequentials)
## 7 candidate regions for zymodeme 2.2 -- thus I am betting that the reference strain is a 2.2

## Search the z23 set, requiring runs of at least two variants.
zymo23_sequentials <- sequential_variants(
  new_sets,
  conditions = "z23",
  minimum = 2, maximum_separation = 2)
dim(zymo23_sequentials)
## In contrast, there are lots (587) of interesting regions for 2.3!
```

### Extract a promising region from the genome

The first 4 candidate regions from my set of remaining:
* Chr       Pos.   Distance
* LpaL13-15 238433 448
* LpaL13-18 142844 613
* LpaL13-29 830342 252
* LpaL13-33 1331507 843

Let's define a couple of terms:
* Third: Each of the 4 above positions.
* Second: Third - Distance
* End: Third + PrimerLen
* Start: Second - Primerlen

In each instance, these are the last positions, so we want to grab three things:

* The entire region from End -> Start, this way we can have a quick sanity check.
* Start -> Second.
* (Third -> End) <- Reverse complemented

```{r extract_bsgenome, eval=FALSE}
## Every candidate uses the same primer length.
primer_length <- 22

## * LpaL13-15 238433 448
first_candidate_chr <- genome[["LpaL13_15"]]
amplicon_length <- 448
first_candidate_third <- 238433
first_candidate_second <- first_candidate_third - amplicon_length
first_candidate_start <- first_candidate_second - primer_length
first_candidate_end <- first_candidate_third + primer_length
## Whole region, as a sanity check.
first_candidate_region <- subseq(first_candidate_chr,
                                 first_candidate_start,
                                 first_candidate_end)
first_candidate_region
## Start -> Second.
first_candidate_5p <- subseq(first_candidate_chr,
                             first_candidate_start,
                             first_candidate_second)
as.character(first_candidate_5p)
## Third -> End, reverse complemented.
first_candidate_3p <- spgs::reverseComplement(
  subseq(first_candidate_chr, first_candidate_third, first_candidate_end))
first_candidate_3p


## * LpaL13-18 142844 613
second_candidate_chr <- genome[["LpaL13_18"]]
amplicon_length <- 613
second_candidate_third <- 142844
second_candidate_second <- second_candidate_third - amplicon_length
second_candidate_start <- second_candidate_second - primer_length
second_candidate_end <- second_candidate_third + primer_length
second_candidate_region <- subseq(second_candidate_chr,
                                  second_candidate_start,
                                  second_candidate_end)
second_candidate_region
second_candidate_5p <- subseq(second_candidate_chr,
                              second_candidate_start,
                              second_candidate_second)
as.character(second_candidate_5p)
second_candidate_3p <- spgs::reverseComplement(
  subseq(second_candidate_chr, second_candidate_third, second_candidate_end))
second_candidate_3p


## * LpaL13-29 830342 252
third_candidate_chr <- genome[["LpaL13_29"]]
amplicon_length <- 252
third_candidate_third <- 830342
third_candidate_second <- third_candidate_third - amplicon_length
third_candidate_start <- third_candidate_second - primer_length
third_candidate_end <- third_candidate_third + primer_length
third_candidate_region <- subseq(third_candidate_chr,
                                 third_candidate_start,
                                 third_candidate_end)
third_candidate_region
third_candidate_5p <- subseq(third_candidate_chr,
                             third_candidate_start,
                             third_candidate_second)
as.character(third_candidate_5p)
third_candidate_3p <- spgs::reverseComplement(
  subseq(third_candidate_chr, third_candidate_third, third_candidate_end))
third_candidate_3p
## You are a garbage polypyrimidine tract.
## Which is actually interesting if the mutations mess it up.


## * LpaL13-33 1331507 843
fourth_candidate_chr <- genome[["LpaL13_33"]]
amplicon_length <- 843
fourth_candidate_third <- 1331507
fourth_candidate_second <- fourth_candidate_third - amplicon_length
fourth_candidate_start <- fourth_candidate_second - primer_length
fourth_candidate_end <- fourth_candidate_third + primer_length
fourth_candidate_region <- subseq(fourth_candidate_chr,
                                  fourth_candidate_start,
                                  fourth_candidate_end)
fourth_candidate_region
fourth_candidate_5p <- subseq(fourth_candidate_chr,
                              fourth_candidate_start,
                              fourth_candidate_second)
as.character(fourth_candidate_5p)
fourth_candidate_3p <- spgs::reverseComplement(
  subseq(fourth_candidate_chr, fourth_candidate_third, fourth_candidate_end))
fourth_candidate_3p
```

## Go hunting for Sanger sequencing regions

I made a fun little function which should find regions which have lots of variants
associated with a given experimental factor.

```{r sanger_fun, eval=FALSE}
## z2.2/z2.3 samples which have a bcftable entry.  The second subset string
## refers to 'pheno' by name, so the two subsets must stay separate statements.
pheno <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
pheno <- subset_expt(pheno, subset = "!is.na(pData(pheno)[['bcftable']])")
pheno_snps <- sm(count_expt_snps(pheno, annot_column = "bcftable"))

fun_stuff <- snp_density_primers(
  pheno_snps,
  bsgenome = "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53",
  gff = "reference/TriTrypDB-53_LpanamensisMHOMCOL81L13.gff")
## Drop the regions whose name contains SCAF and expose the region name as the
## first column, 'bin'.
drop_scaffolds <- grepl(pattern = "SCAF", x = rownames(fun_stuff[["favorites"]]))
favorite_primer_regions <- fun_stuff[["favorites"]][!drop_scaffolds, ]
favorite_primer_regions[["bin"]] <- rownames(favorite_primer_regions)
library(dplyr)
favorite_primer_regions <- relocate(favorite_primer_regions, bin)
```

## Combine this table with 2.2/2.3 genes

Here is my note from our meeting:

Cross reference primers to DE genes of 2.2/2.3 and/or resistance/susceptible,
add a column to the primer spreadsheet with the DE genes (in retrospect I am guessing
this actually means to put the logFC as a column).

One nice thing, I did a semantic removal on the lp_expt, so the set of logFC/pvalues
should not have any of the offending types; thus I should be able to automagically
get rid of them in the merge.

```{r xref_primers_deg}
## NOTE(review): favorite_primer_regions is created in the sanger_fun chunk,
## which is marked eval=FALSE; confirm it exists before knitting this chunk.
deseq_columns <- c("deseq_logfc", "deseq_adjp")

## DESeq2 logFC and adjusted p-value of the zymodeme contrast, joined on the ID
## of the closest gene before each primer region.
logfc <- zy_table[["data"]][["z23_vs_z22"]]
logfc_columns <- logfc[, deseq_columns]
colnames(logfc_columns) <- c("z23_logfc", "z23_adjp")
new_table <- merge(
  favorite_primer_regions, logfc_columns,
  by.x = "closest_gene_before_id", by.y = "row.names")

## The same two columns from the susceptibility contrast.
sus <- sus_table[["data"]][["sensitive_vs_resistant"]]
sus_columns <- sus[, deseq_columns]
colnames(sus_columns) <- c("sus_logfc", "sus_adjp")
new_table <- merge(
  new_table, sus_columns,
  by.x = "closest_gene_before_id", by.y = "row.names")
## Put 'bin' first again and write the spreadsheet.
new_table <- relocate(new_table, bin)
written <- write_xlsx(data = new_table,
                      excel = "excel/favorite_primers_xref_zy_sus.xlsx")
```


## Make a heatmap describing the clustering of variants

We can cross reference the variants against the zymodeme status and
plot a heatmap of the results and hopefully see how they separate.

```{r zymo_heatmaps}
## NOTE(review): pruned_snps is not created in this chunk; presumably it comes
## from an earlier chunk.  It was once derived here as:
## pruned_snps <- subset_expt(new_snps, subset="condition=='z2.2'|condition=='z2.3'")
snp_genes <- sm(snps_vs_genes(lp_expt, new_sets, expt_name_col = "chromosome"))

## One color per zymodeme category.
clinical_colors_v2 <- list(
  "z22" = "#0000cc",
  "z23" = "#cc0000")

## Normalize the variant counts, then relabel the conditions by zymodeme and
## apply the colors above.
new_zymo_norm <- normalize_expt(pruned_snps, filter = TRUE, convert = "cpm",
                                norm = "quant", transform = TRUE)
new_zymo_norm <- set_expt_conditions(new_zymo_norm, fact = "zymodemecategorical")
new_zymo_norm <- set_expt_colors(new_zymo_norm, clinical_colors_v2)

## Draw the distance heatmap, save a PDF copy, and show it in the document.
zymo_heat <- plot_disheat(new_zymo_norm)
pp(file = "images/onlyz22_z23_snp_heatmap.pdf", image = zymo_heat[["plot"]])
zymo_heat[["plot"]]
```

### Annotated heatmap of variants

Now let us try to make a heatmap which includes some of the annotation data.

```{r zymo_heat_panel_genes}
## Sample-by-sample correlations of the normalized data; this is the matrix
## which gets drawn.
correlations <- hpgl_cor(exprs(both_norm))

## Work on a copy of the experimental design and label the missing values.
des <- both_norm[["design"]]
undef_idx <- is.na(des[["strain"]])
des[undef_idx, "strain"] <- "unknown"

## Record which zymodemes are missing before coercing to character, then
## label them as well.
zymo_missing_idx <- is.na(des[["zymodemecategorical"]])
des[["zymodemecategorical"]] <- as.character(des[["zymodemecategorical"]])
des[["clinicalcategorical"]] <- as.character(des[["clinicalcategorical"]])
des[zymo_missing_idx, "zymodemecategorical"] <- "unknown"

## Column annotations: zymodeme and clinical outcome (missing -> "undefined").
col_data <- as.data.frame(des[, c("zymodemecategorical", "clinicalcategorical")])
unknown_clinical <- is.na(col_data[["clinicalcategorical"]])
colnames(col_data) <- c("zymodeme", "outcome")
col_data[unknown_clinical, "outcome"] <- "undefined"

## Row annotations: the strain of each sample.
row_data <- as.data.frame(des[, c("strain")])
colnames(row_data) <- c("strain")

## Settings handed to annHeatmap2().
myannot <- list(
  "Col" = list("data" = col_data),
  "Row" = list("data" = row_data))
mydendro <- list(
  "clustfun" = hclust,
  "lwd" = 2.0)
myclust <- list("cuth" = 1.0,
                "col" = BrewerClusterCol)
mylabs <- list(
  "Row" = list("nrow" = 4),
  "Col" = list("nrow" = 4))
## An earlier palette was colorRampPalette(c("yellow","black","darkblue"))(256)
hmcols <- colorRampPalette(c("darkblue", "beige"))(240)

map1 <- annHeatmap2(
  correlations,
  dendrogram = mydendro,
  annotation = myannot,
  cluster = myclust,
  labels = mylabs,
  ## Per the original note, scale controls if the picture is symmetric.
  scale = "none",
  col = hmcols)
## Written large so that the labels remain legible.
pp(file = "images/dendro_heatmap.png", image = map1, height = 20, width = 20)
```

Print the larger heatmap so that all the labels appear.  Keep in mind
that as we get more samples, this image needs to continue getting
bigger.

![big heatmap](images/dendro_heatmap.png)


```{r theresa_idea}
## For every variant, compute the proportion of samples in each condition in
## which it was observed (more than 5 reads), turn those proportions into
## per-condition scores, and plot them along the chromosomes with CMplot.
## NOTE(review): pheno_snps is created in the sanger_fun chunk, which is marked
## eval=FALSE; it must exist in the session for this chunk to run.
xref_prop <- table(pheno_snps[["conditions"]])
pheno_snps[["conditions"]]
idx_tbl <- exprs(pheno_snps) > 5
new_tbl <- data.frame(row.names = rownames(exprs(pheno_snps)))
for (n in names(xref_prop)) {
  idx_cols <- which(pheno_snps[["conditions"]] == n)
  ## drop = FALSE keeps a one-column matrix when a condition has a single
  ## sample; otherwise the subset collapses to a vector and rowSums() fails.
  observed <- rowSums(idx_tbl[, idx_cols, drop = FALSE])
  ## xref_prop[[n]] extracts the bare count.  xref_prop[n] is a length-1 table
  ## (a 1-d array), and recycling that against a longer vector is deprecated.
  new_tbl[[n]] <- observed / xref_prop[[n]]
}

## Keep only the rows whose name contains "LpaL13".
keepers <- grepl(x = rownames(new_tbl), pattern = "LpaL13")
new_tbl <- new_tbl[keepers, ]

## Flip the proportions: a variant seen in every sample of a condition scores
## 0.001 (never exactly 0) and one seen in none scores 1.001, capped at 1.
new_tbl[["strong22"]] <- pmin(1.001 - new_tbl[["z2.2"]], 1)
new_tbl[["strong23"]] <- pmin(1.001 - new_tbl[["z2.3"]], 1)

## CMplot wants SNP name, chromosome and position first, followed by the
## score columns; chromosome and position are parsed from the row names.
new_tbl[["SNP"]] <- rownames(new_tbl)
new_tbl[["Chromosome"]] <- gsub(x = new_tbl[["SNP"]], pattern = "chr_(.*)_pos_.*", replacement = "\\1")
new_tbl[["Position"]] <- gsub(x = new_tbl[["SNP"]], pattern = ".*_pos_(\\d+)_.*", replacement = "\\1")
new_tbl <- new_tbl[, c("SNP", "Chromosome", "Position", "strong22", "strong23")]

library(CMplot)
## A copy carrying only the strong23 score for the default set of plots.
simplify <- new_tbl
simplify[["strong22"]] <- NULL

CMplot(simplify, bin.size = 100000)

## Manhattan plots of both scores, written to jpg files (file.output = TRUE).
CMplot(new_tbl, plot.type = "m", multracks = TRUE, threshold = c(0.01, 0.05),
       threshold.lwd = c(1, 1), threshold.col = c("black", "grey"),
       amplify = TRUE, bin.size = 10000,
       chr.den.col = c("darkgreen", "yellow", "red"),
       signal.col = c("red", "green", "blue"),
       signal.cex = 1, file = "jpg", memo = "", dpi = 300, file.output = TRUE, verbose = TRUE)
```

![SNP Density](SNP-Density.ratio.jpg)
![Circular Manhattan](Circular-Manhattan.ratio.jpg)
![Rectangular Manhattan](Rectangular-Manhattan.ratio.jpg)
![QQ](QQplot.ratio.jpg)

## Try out MatrixEQTL

This tool looks a little opaque, but provides sample data with things
that make sense to me and should be pretty easy to recapitulate in our
data.

1.  covariates.txt: Columns are samples, rows are things from pData -- the
    most likely ones of interest for our data would be zymodeme,
    sensitivity
2.  geneloc.txt: columns are 'geneid', 'chr', 'left', 'right'.  I
    guess I can assume left and right are start/stop; in which case
    this is trivially acquirable from fData.
3.  ge.txt: This appears to be a log(rpkm/cpm) table with rows as genes and
    columns as samples
4.  snpsloc.txt: columns are 'snpid', 'chr', 'pos'
5.  snps.txt: columns are samples, rows are the ids from snpsloc,
    values are 0,1,2.  I assume 0 is identical and 1..12 are the various
    A->TGC T->AGC C->AGT G->ACT

```{r matrixeqtl, eval=FALSE}
## Build the five MatrixEQTL inputs (covariates, gene locations, expression,
## SNP locations, SNP genotypes) from 'new_snps' and 'lp_expt', write them to
## tab separated files under eqtl/, and run Matrix_eQTL_main() on those files.
library(MatrixEQTL)

## Covariates -- caveat: these need to be coerced to numbers, so each category
## is replaced by the integer code of its factor level.  After t() the samples
## are the columns.
my_covariates <- pData(new_snps)[, c("zymodemecategorical", "clinicalcategorical")]
for (col in colnames(my_covariates)) {
  my_covariates[[col]] <- as.numeric(as.factor(my_covariates[[col]]))
}
my_covariates <- t(my_covariates)

## Gene locations, renamed to the column names MatrixEQTL uses.
my_geneloc <- fData(lp_expt)[, c("gid", "chromosome", "start", "end")]
colnames(my_geneloc) <- c("geneid", "chr", "left", "right")

## Expression: log2(cpm) of the filtered counts.
my_ge <- exprs(normalize_expt(lp_expt, transform = "log2", filter = TRUE, convert = "cpm"))

## Genotypes: every observed variant becomes 1, everything else stays 0.
my_snps <- exprs(new_snps)
my_snps[my_snps > 0] <- 1

## MatrixEQTL pairs samples by column position, so restrict all three tables
## to the shared samples and put them in the same order.  The expression
## sample IDs are lower-cased before matching, as in the original comparison.
## NOTE(review): assumes the row names of pData(new_snps) are the column names
## of exprs(new_snps) -- confirm.
ge_samples <- tolower(colnames(my_ge))
shared_samples <- intersect(colnames(my_snps), ge_samples)
my_snps <- my_snps[, shared_samples, drop = FALSE]
my_covariates <- my_covariates[, shared_samples, drop = FALSE]
my_ge <- my_ge[, match(shared_samples, ge_samples), drop = FALSE]
colnames(my_ge) <- shared_samples

## SNP locations, parsed from row names shaped like chr_<chr>_pos_<pos>_ref_...
## The IDs go in a real 'snpid' column: data.frame(rownames = ...) only makes
## a column with that name and leaves the row names as 1..n.
snp_ids <- rownames(my_snps)
snp_pattern <- "^chr_(.+)_pos_(.+)_ref_.*$"
my_snpsloc <- data.frame(
  snpid = snp_ids,
  chr = gsub(pattern = snp_pattern, replacement = "\\1", x = snp_ids),
  pos = as.numeric(gsub(pattern = snp_pattern, replacement = "\\2", x = snp_ids)),
  stringsAsFactors = FALSE)

## Oh, caveat here: because of the way I stored the data, I could have more
## than one row per position, which presumably will make matrixEQTL sad.
## Each duplicated row would be another variant at that position; so in theory
## we would number them.  For the moment keep the first one, and drop the
## matching genotype rows so the two tables stay in step.
test <- duplicated(my_snpsloc[, c("chr", "pos")])
if (sum(test) > 0) {
  message("There are: ", sum(test), " duplicated entries.")
  keep_idx <- !test
  my_snpsloc <- my_snpsloc[keep_idx, ]
  my_snps <- my_snps[keep_idx, , drop = FALSE]
}

## Ok, at this point I think I have all the pieces which this method wants...
## Oh, no I guess not; it actually wants the data as a set of filenames...
SNP_file_name <- "eqtl/snps.tsv"
snps_location_file_name <- "eqtl/snpsloc.tsv"
expression_file_name <- "eqtl/ge.tsv"
gene_location_file_name <- "eqtl/geneloc.tsv"
covariates_file_name <- "eqtl/covariates.tsv"
dir.create("eqtl", showWarnings = FALSE, recursive = TRUE)

## The three sample matrices carry their row names as the first column, which
## is the column fileSkipColumns = 1 skips when they are loaded below.
write.table(my_snps, SNP_file_name, na = "NA", col.names = TRUE,
            row.names = TRUE, sep = "\t", quote = TRUE)
write.table(as.data.frame(my_ge), expression_file_name, na = "NA", col.names = TRUE,
            row.names = TRUE, sep = "\t", quote = TRUE)
write.table(as.data.frame(my_covariates), covariates_file_name, na = "NA", col.names = TRUE,
            row.names = TRUE, sep = "\t", quote = TRUE)
## The location tables already hold their IDs in the first column, so no row
## names are written and read.table() returns exactly the columns listed.
write.table(my_snpsloc, snps_location_file_name, na = "NA", col.names = TRUE,
            row.names = FALSE, sep = "\t", quote = TRUE)
write.table(as.data.frame(my_geneloc), gene_location_file_name, na = "NA", col.names = TRUE,
            row.names = FALSE, sep = "\t", quote = TRUE)

useModel <- modelLINEAR # modelANOVA, modelLINEAR, or modelLINEAR_CROSS

## Output file names
output_file_name_cis <- tempfile()
output_file_name_tra <- tempfile()
## Only associations significant at this level will be saved
pvOutputThreshold_cis <- 0.1
pvOutputThreshold_tra <- 0.1
## Error covariance matrix
## Set to numeric() for identity.
errorCovariance <- numeric()
## errorCovariance <- read.table("Sample_Data/errorCovariance.txt")
## Distance for local gene-SNP pairs
cisDist <- 1e6

## Load genotype data
snps <- SlicedData$new()
snps$fileDelimiter <- "\t"      # the TAB character
snps$fileOmitCharacters <- "NA" # denote missing values;
snps$fileSkipRows <- 1          # one row of column labels
snps$fileSkipColumns <- 1       # one column of row labels
snps$fileSliceSize <- 2000      # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name)
## Load gene expression data
gene <- SlicedData$new()
gene$fileDelimiter <- "\t"      # the TAB character
gene$fileOmitCharacters <- "NA" # denote missing values;
gene$fileSkipRows <- 1          # one row of column labels
gene$fileSkipColumns <- 1       # one column of row labels
gene$fileSliceSize <- 2000      # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name)
## Load covariates
cvrt <- SlicedData$new()
cvrt$fileDelimiter <- "\t"      # the TAB character
cvrt$fileOmitCharacters <- "NA" # denote missing values;
cvrt$fileSkipRows <- 1          # one row of column labels
cvrt$fileSkipColumns <- 1       # one column of row labels
if (length(covariates_file_name) > 0) {
  cvrt$LoadFile(covariates_file_name)
}

## Run the analysis
snpspos <- read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE)
genepos <- read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE)

me <- Matrix_eQTL_main(
  snps = snps,
  gene = gene,
  cvrt = cvrt,
  output_file_name = output_file_name_tra,
  pvOutputThreshold = pvOutputThreshold_tra,
  useModel = useModel,
  errorCovariance = errorCovariance,
  verbose = TRUE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = "qqplot",
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE)
```



```{r saveme}
## Report the session and save the workspace, unless a variable named
## skip_load exists and is TRUE (get0() returns NULL when it does not exist).
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message("This is hpgltools commit: ", get_git_commit())
  message("Saving to ", savefile)
  tmp <- sm(saveme(filename = savefile))
}
```

```{r loadme_after, eval = FALSE}
## Counterpart of the saveme() call above: load from the same savefile.
## This chunk is marked eval = FALSE, so it only runs when invoked by hand.
tmp <- loadme(filename = savefile)
```
