libsizes <- plot_libsize(lp_expt)
dev <- pp("images/lp_expt_libsizes.png", width = 14, height = 9)
libsizes$plot
closed <- dev.off()
libsizes$plot

## I think samples 7,10 should be removed at minimum, probably also 9,11
nonzero <- plot_nonzero(lp_expt)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
dev <- pp(file = "images/lp_nonzero.png", width=9, height = 9)
nonzero$plot
## Warning: ggrepel: 81 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
closed <- dev.off()
nonzero$plot
## Warning: ggrepel: 83 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

lp_box <- plot_boxplot(lp_expt)
## 8122 entries are 0.  We are on a log scale, adding 1 to the data.
dev <- pp(file = "images/lp_expt_boxplot.png", width = 16, height = 9)
lp_box
closed <- dev.off()
lp_box

filter_plot <- plot_libsize_prepost(lp_expt)
filter_plot$lowgene_plot
## Warning: Using alpha for a discrete variable is not advised.

filter_plot$count_plot

table(pData(lp_expt)[["zymodemecategorical"]])
## 
##   b2904 unknown     z10     z15     z20     z21     z22     z23     z24     z30 
##       1       2       1       1       1       7      43      41       2       1 
##     z32 
##       1
table(pData(lp_expt)[["clinicalresponse"]])
## 
##                                  cure                               failure 
##                                    38                                    38 
##                       laboratory line laboratory line miltefosine resistant 
##                                     1                                     1 
##                                    nd                      reference strain 
##                                    19                                     4

0.1 Distribution Visualizations

Najib’s favorite plots are of course the PCA/TSNE. These are nice to look at in order to get a sense of the relationships between samples. They also provide a good opportunity to see what happens when one applies different normalizations, surrogate analyses, filters, etc. In addition, one may set different experimental factors as the primary ‘condition’ (usually the color of plots) and surrogate ‘batches’.

0.2 By Susceptibility

Column ‘Q’ in the sample sheet, make a categorical version of it with these parameters:

  • 0 <= x <= 35 is resistant
  • 36 <= x <= 48 is ambiguous
  • 49 <= x is sensitive
strain_norm <- normalize_expt(lp_strain, norm = "quant", transform = "log2",
                              convert = "cpm", filter = TRUE)
## Removing 134 low-count genes (8576 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
zymo_pca <- plot_pca(strain_norm, plot_title = "PCA of parasite expression values",
                     plot_labels = FALSE)
ggplt(zymo_pca$plot)
## [1] "ggplot.html"
dev <- pp(file = "images/promastigote_zymocol_sensshape.png")
zymo_pca$plot
closed <- dev.off()
zymo_pca$plot

zymo_tsne <- plot_tsne(strain_norm, plot_title = "TSNE of parasite expression values")
## plot labels was not set and there are more than 100 samples, disabling it.
zymo_tsne$plot

strain_nb <- normalize_expt(lp_strain, convert = "cpm", transform = "log2",
                            filter = TRUE, batch = "svaseq")
## Removing 134 low-count genes (8576 remaining).
## Setting 738 low elements to zero.
## transform_counts: Found 738 values equal to 0, adding 1 to the matrix.
strain_nb_pca <- plot_pca(strain_nb, plot_title = "PCA of parasite expression values",
                          plot_labels = FALSE)
dev <- pp(file = "images/clinical_nb_pca_sus_shape.png")
strain_nb_pca$plot
closed <- dev.off()
strain_nb_pca$plot

strain_nb_tsne <- plot_tsne(strain_nb, plot_title = "TSNE of parasite expression values")
## plot labels was not set and there are more than 100 samples, disabling it.
strain_nb_tsne$plot

## Correlation heatmap of the normalized samples in strain_norm.  The title is
## written as a single-line string: wrapping the literal across source lines
## embeds the newline and the indentation spaces in the title itself.
corheat <- plot_corheat(
  strain_norm,
  plot_title = "Correlation heatmap of parasite expression values")
corheat$plot

plot_sm(strain_norm)$plot
## Performing correlation.

0.3 Limit to three strains: 2.1/2.2/2.3

## Keep only the samples whose condition is z2.1, z2.2 or z2.3.
only_three_types <- subset_expt(
  lp_strain,
  subset = "condition=='z2.1'|condition=='z2.3'|condition=='z2.2'")
## subset_expt(): There were 101, now there are 91 samples.
## Normalize quietly via sm(), then use the 'phase' metadata column as batch.
only_three_norm <- sm(normalize_expt(only_three_types, norm = "quant", transform = "log2",
                                     convert = "cpm", batch = FALSE, filter = TRUE)) %>%
  set_expt_batches(fact = "phase")
onlythree_pca <- plot_pca(
  only_three_norm,
  plot_title = "PCA of z2.1, z2.2 and z2.3 parasite expression values",
  plot_labels = FALSE)
## Capture the results of pp() and dev.off(), as the other chunks in this
## document do, so the device number is not echoed into the report.
dev <- pp(file = "images/promastigote_threetypes_zymocol_noshape.png")
onlythree_pca$plot
closed <- dev.off()
onlythree_pca$plot

0.4 Limit to just two strains: 2.2/2.3

lp_two_strains_norm <- sm(normalize_expt(lp_two_strains, norm = "quant", transform = "log2",
                                         convert = "cpm", batch = FALSE, filter = TRUE))
onlytwo_pca <- plot_pca(lp_two_strains_norm, plot_title = "PCA of z2.2 and z2.3 parasite expression values",
                        plot_labels = FALSE)
dev <- pp(file = "images/zymo_z2.2_z2.3_pca_sus_shape.pdf")
onlytwo_pca$plot
closed <- dev.off()
onlytwo_pca$plot

0.5 By Cure/Fail status

## Filter, quantile-normalize, cpm-convert and log2-transform the cure/fail
## data, then draw its PCA both to a png file and into the report.
cf_norm <- normalize_expt(
  lp_cf,
  filter = TRUE,
  norm = "quant",
  convert = "cpm",
  transform = "log2")
## Removing 134 low-count genes (8576 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
start_cf <- plot_pca(
  cf_norm,
  plot_labels = FALSE,
  plot_title = "PCA of parasite expression values")
dev <- pp(file = "images/cf_sus_shape.png")
start_cf[["plot"]]
closed <- dev.off()
start_cf[["plot"]]

cf_nb <- normalize_expt(lp_cf_known, convert = "cpm", transform = "log2",
                        filter = TRUE, batch = "svaseq")
## Removing 162 low-count genes (8548 remaining).
## Setting 117 low elements to zero.
## transform_counts: Found 117 values equal to 0, adding 1 to the matrix.
cf_nb_pca <- plot_pca(cf_nb, plot_title = "PCA of parasite expression values",
                      plot_labels = FALSE)
dev <- pp(file = "images/cf_sus_share_nb.png")
cf_nb_pca$plot
closed <- dev.off()
cf_nb_pca$plot

cf_norm <- normalize_expt(lp_cf, transform = "log2", convert = "cpm",
                          filter = TRUE, norm = "quant")
## Removing 134 low-count genes (8576 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
test <- pca_information(cf_norm,
                        expt_factors = c("clinicalcategorical", "zymodemecategorical",
                                         "pathogenstrain", "passagenumber"),
                        num_components = 6, plot_pcas = TRUE)
## plot labels was not set and there are more than 100 samples, disabling it.
test$anova_p
##                           PC1      PC2    PC3       PC4       PC5       PC6
## clinicalcategorical 3.139e-01 0.457872 0.9691 7.839e-03 2.264e-01 3.371e-01
## zymodemecategorical 4.787e-07 0.001621 0.5959 5.970e-02 3.966e-05 5.040e-01
## pathogenstrain      4.747e-01 0.870333 0.6433 5.629e-05 1.889e-02 2.316e-01
## passagenumber       9.502e-01 0.174448 0.4657 3.136e-02 8.602e-01 5.429e-06
test$cor_heatmap

sus_norm <- normalize_expt(lp_susceptibility, transform = "log2", convert = "cpm",
                           norm = "quant", filter = TRUE)
## Removing 134 low-count genes (8576 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
sus_pca <- plot_pca(sus_norm, plot_title = "PCA of parasite expression values",
                    plot_labels = FALSE)
dev <- pp(file = "images/sus_norm_pca.png")
sus_pca[["plot"]]
closed <- dev.off()
sus_pca[["plot"]]

sus_nb <- normalize_expt(lp_susceptibility, transform = "log2", convert = "cpm",
                         batch = "svaseq", filter = TRUE)
## Removing 134 low-count genes (8576 remaining).
## Setting 405 low elements to zero.
## transform_counts: Found 405 values equal to 0, adding 1 to the matrix.
sus_nb_pca <- plot_pca(sus_nb, plot_title = "PCA of parasite expression values",
                       plot_labels = FALSE)
dev <- pp(file = "images/sus_nb_pca.png")
sus_nb_pca[["plot"]]
closed <- dev.off()
sus_nb_pca[["plot"]]

0.6 Zymodeme enzyme gene IDs

Najib read me an email listing off the gene names associated with the zymodeme classification. I took those names and cross referenced them against the Leishmania panamensis gene annotations and found the following:

They are:

  1. ALAT: LPAL13_120010900 – alanine aminotransferase
  2. ASAT: LPAL13_340013000 – aspartate aminotransferase
  3. G6PD: LPAL13_000054100 – glucose-6-phosphate 1-dehydrogenase
  4. NH: LPAL13_140006100, LPAL13_180018500 – inosine-guanine nucleoside hydrolase
  5. MPI: LPAL13_320022300 (maybe) – mannose phosphate isomerase (I chose phosphomannose isomerase)

Given these 6 gene IDs (NH has two gene IDs associated with it), I can do some looking for specific differences among the various samples.

0.6.1 Expression levels of zymodeme genes

The following creates a colorspace (red to green) heatmap showing the observed expression of these genes in every sample.

## Gene IDs of the zymodeme marker enzymes and, in the same order, the short
## enzyme names used to label them.  Only real gene IDs are listed: a
## placeholder entry cannot match any row and would leave the label vector
## longer than the set of rows kept below.
my_genes <- c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
              "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300")
my_names <- c("ALAT", "ASAT", "G6PD", "NHv1", "NHv2", "MPI")

zymo_expt <- exclude_genes_expt(strain_norm, ids = my_genes, method = "keep")
## Align the labels with the rows actually present in the subset.  The row
## order returned above is not guaranteed to follow the order of `ids`, and a
## gene removed by filtering would otherwise shift every following label.
my_names <- my_names[match(rownames(exprs(zymo_expt)), my_genes)]
## remove_genes_expt(), before removal, there were 8576 genes, now there are 6.
## There are 101 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20065 TMRC20005 TMRC20007 TMRC20008 TMRC20027 TMRC20028 TMRC20032 
##   0.08652   0.08512   0.08414   0.08695   0.08365   0.08470   0.08796   0.08394 
## TMRC20040 TMRC20066 TMRC20039 TMRC20037 TMRC20038 TMRC20067 TMRC20068 TMRC20041 
##   0.08260   0.08191   0.08481   0.08204   0.08359   0.08402   0.08449   0.08315 
## TMRC20015 TMRC20009 TMRC20010 TMRC20016 TMRC20011 TMRC20012 TMRC20013 TMRC20017 
##   0.08490   0.08382   0.08432   0.08365   0.08356   0.08550   0.08577   0.08344 
## TMRC20014 TMRC20018 TMRC20019 TMRC20070 TMRC20020 TMRC20021 TMRC20022 TMRC20025 
##   0.08400   0.08355   0.08372   0.08410   0.08220   0.08198   0.08548   0.08592 
## TMRC20024 TMRC20036 TMRC20069 TMRC20033 TMRC20026 TMRC20031 TMRC20076 TMRC20073 
##   0.08229   0.08273   0.08271   0.08278   0.08754   0.08204   0.08331   0.08490 
## TMRC20055 TMRC20079 TMRC20071 TMRC20078 TMRC20094 TMRC20042 TMRC20058 TMRC20072 
##   0.08446   0.08525   0.08434   0.08392   0.08409   0.08430   0.08318   0.08411 
## TMRC20059 TMRC20048 TMRC20057 TMRC20088 TMRC20056 TMRC20060 TMRC20077 TMRC20074 
##   0.08360   0.08241   0.08607   0.08494   0.08475   0.08320   0.08402   0.08375 
## TMRC20063 TMRC20053 TMRC20052 TMRC20064 TMRC20075 TMRC20051 TMRC20050 TMRC20049 
##   0.08251   0.08292   0.08267   0.08314   0.08374   0.08448   0.08262   0.08544 
## TMRC20062 TMRC20110 TMRC20080 TMRC20043 TMRC20083 TMRC20054 TMRC20085 TMRC20046 
##   0.08427   0.08519   0.08222   0.08343   0.08444   0.08488   0.08429   0.08544 
## TMRC20093 TMRC20089 TMRC20047 TMRC20090 TMRC20044 TMRC20045 TMRC20061 TMRC20105 
##   0.08460   0.08355   0.08430   0.08171   0.08531   0.08388   0.08348   0.08449 
## TMRC20108 TMRC20109 TMRC20098 TMRC20096 TMRC20097 TMRC20101 TMRC20092 TMRC20082 
##   0.08313   0.08458   0.08489   0.08363   0.08338   0.08366   0.08318   0.08277 
## TMRC20102 TMRC20099 TMRC20100 TMRC20091 TMRC20084 TMRC20087 TMRC20103 TMRC20104 
##   0.08338   0.08468   0.08324   0.08503   0.08319   0.08445   0.08440   0.08415 
## TMRC20086 TMRC20107 TMRC20081 TMRC20106 TMRC20095 
##   0.08366   0.08155   0.08221   0.08079   0.07790
zymo_heatmap <- plot_sample_heatmap(zymo_expt, row_label = my_names)
zymo_heatmap

new_conditions <- paste0(pData(hs_macrophage)[["macrophagetreatment"]], "_",
                         pData(hs_macrophage)[["macrophagezymodeme"]])

tmrc2_macrophage_norm <- normalize_expt(hs_macrophage, filter=TRUE, norm="quant",
                                        convert="cpm", transform="log2")
## Removing 10021 low-count genes (11460 remaining).
## transform_counts: Found 6 values equal to 0, adding 1 to the matrix.
macrophage_hs_pca <- plot_pca(tmrc2_macrophage_norm, plot_labels=FALSE)
pp(file="images/macrophage_hs_infection.png")
macrophage_hs_pca$plot
dev.off()
## png 
##   2
macrophage_hs_pca$plot

hs_macrophage_drugzymo <- set_expt_conditions(hs_macrophage,
                                              fact = new_conditions)
hs_macrophage_drugzymo_norm <- normalize_expt(hs_macrophage_drugzymo,
                                              filter=TRUE, norm="quant", convert="cpm",
                                              transform="log2")
## Removing 10021 low-count genes (11460 remaining).
## transform_counts: Found 6 values equal to 0, adding 1 to the matrix.
plot_pca(hs_macrophage_drugzymo_norm)$plot

tmrc2_macro_nosb_drugzymo <- subset_expt(hs_macrophage_drugzymo,
                                         subset="drug!='Antimony'") %>%
  subset_expt(subset="macrophagetreatment!='uninf'")
## subset_expt(): There were 28, now there are 14 samples.
## subset_expt(): There were 14, now there are 12 samples.
tmrc2_macro_nosb_drugzymo_norm <- normalize_expt(tmrc2_macro_nosb_drugzymo,
                                                 filter=TRUE, convert="cpm",
                                                 norm="quant", transform="log2")
## Removing 10435 low-count genes (11046 remaining).
## transform_counts: Found 1 values equal to 0, adding 1 to the matrix.
pp(file="images/tmrc2_macro_nosb_drugzymo_pca.png",
   image=plot_pca(tmrc2_macro_nosb_drugzymo_norm, plot_labels=FALSE)$plot)
## Warning in pp(file = "images/tmrc2_macro_nosb_drugzymo_pca.png", image =
## plot_pca(tmrc2_macro_nosb_drugzymo_norm, : There is no device to shut down.
new_conditions <- paste0(pData(lp_macrophage)[["macrophagetreatment"]], "_",
                         pData(lp_macrophage)[["macrophagezymodeme"]])
lp_macrophage <- set_expt_conditions(lp_macrophage, fact = new_conditions)

macrophage_libsize <- plot_libsize(lp_macrophage)
pp(file="images/tmrc2_macrophage_lp_libsize.png")
macrophage_libsize$plot
dev.off()
## png 
##   2
macrophage_libsize$plot

lp_macrophage_norm <- normalize_expt(lp_macrophage,
                                     filter=TRUE, norm="quant", transform="log2",
                                     convert="cpm")
## Removing 188 low-count genes (8522 remaining).
## transform_counts: Found 4 values equal to 0, adding 1 to the matrix.
lp_macrophage_pca <- plot_pca(lp_macrophage_norm, plot_labels=FALSE)
pp(file="images/amastigote_zymocol_includesb.png")
lp_macrophage_pca$plot
dev.off()
## png 
##   2
lp_macrophage_pca$plot

lp_macrophage_nosb <- subset_expt(lp_macrophage,
                                  subset="condition!='inf_sb_z2.3'")
## subset_expt(): There were 11, now there are 10 samples.
lp_macrophage_nosb_norm <- normalize_expt(lp_macrophage_nosb,
                                          filter=TRUE, norm="quant", transform="log2",
                                          convert="cpm")
## Removing 190 low-count genes (8520 remaining).
## transform_counts: Found 4 values equal to 0, adding 1 to the matrix.
lp_macrophage_nosb_pca <- plot_pca(lp_macrophage_nosb_norm,
                                   plot_labels=FALSE)
pp(file="images/amastigote_zymocol_excludesb.png")
lp_macrophage_nosb_pca$plot
dev.off()
## png 
##   2
lp_macrophage_nosb_pca$plot

lp_macrophage_de <- all_pairwise(lp_macrophage,
                                 model_batch="svaseq", filter=TRUE)
## This DE analysis will perform all pairwise comparisons among:
## 
## inf_sb_z2.3    inf_z2.2    inf_z2.3 
##           1           5           5
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (8522 remaining).
## Setting 53 low elements to zero.
## transform_counts: Found 53 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.

tmrc2_parasite_keepers <- list(
    "z23nosb_vs_z22nosb" = c("infz23", "infz22"))
lp_macrophage_table <- combine_de_tables(
  lp_macrophage_de, keepers = tmrc2_parasite_keepers,
  excel=glue::glue("excel/macrophage_parasite_infection_de-v{ver}.xlsx"))
lp_macrophage_sig <- extract_significant_genes(
    lp_macrophage_table,
    excel=glue::glue("excel/macrophage_parasite_sig-v{ver}.xlsx"))
## Using p column: limma_adjp.
## Using p column: edger_adjp.
## Using p column: deseq_adjp.
## Using p column: ebseq_adjp.
## Using p column: basic_adjp.

A recent suggestion included a query about the relationship of our amastigote TMRC2 samples which were the result of infecting a set of macrophages vs. these promastigote samples.

So far, we have kept these two experiments separate, now let us merge them.

tmrc2_macrophage_norm <- normalize_expt(lp_macrophage, transform="log2", convert="cpm",
                                        norm="quant", filter=TRUE)
## Removing 188 low-count genes (8522 remaining).
## transform_counts: Found 4 values equal to 0, adding 1 to the matrix.
all_tmrc2 <- combine_expts(lp_expt, lp_macrophage)

## Build a copy of the merged data annotated with the parasite stage.
## Every sample starts out labelled as a promastigote.
all_nosb <- all_tmrc2
pData(all_nosb)[["stage"]] <- "promastigote"
## Samples without a macrophagetreatment value get an explicit 'undefined'
## so that the string comparisons below never see NA.
na_idx <- is.na(pData(all_nosb)[["macrophagetreatment"]])
pData(all_nosb)[na_idx, "macrophagetreatment"] <- "undefined"
## Drop the sample(s) whose treatment is 'inf_sb'.
all_nosb <- subset_expt(all_nosb, subset="macrophagetreatment!='inf_sb'")
## subset_expt(): There were 112, now there are 111 samples.
## The remaining samples with treatment 'inf' are relabelled as amastigotes.
ama_idx <- pData(all_nosb)[["macrophagetreatment"]] == "inf"
pData(all_nosb)[ama_idx, "stage" ] <- "amastigote"

## Copy the stage into the 'batch' column before normalizing and plotting.
pData(all_nosb)[["batch"]] <- pData(all_nosb)[["stage"]]
all_norm <- normalize_expt(all_nosb, convert="cpm", norm="quant", transform="log2", filter=TRUE)
## Removing 129 low-count genes (8581 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(all_norm)$plot
## plot labels was not set and there are more than 100 samples, disabling it.

I think the above picture is sort of the opposite of what we want to compare in a DE analysis for this set of data, e.g. we want to compare promastigotes to amastigotes?

## Use the 'condition' column as the batch factor and the 'stage' column as
## the condition for the comparison below.
all_nosb <- set_expt_conditions(
  set_expt_batches(all_nosb, fact = "condition"),
  fact = "stage")

two_zymo <- subset_expt(
  all_nosb,
  subset = "zymodemecategorical=='z22'|zymodemecategorical=='z23'|zymodemecategorical=='unknown'")
## subset_expt(): There were 111, now there are 86 samples.
## NOTE(review): two_zymo is not used by the call below, which compares all
## samples in all_nosb -- confirm whether the subset was meant to be passed.
pro_ama <- all_pairwise(all_nosb, model_batch = "svaseq", filter = TRUE)
## This DE analysis will perform all pairwise comparisons among:
## 
##   amastigote promastigote 
##           10          101
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (8581 remaining).
## Setting 539 low elements to zero.
## transform_counts: Found 539 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
pro_ama_table <- combine_de_tables(pro_ama, excel="excel/tmrc2_pro_vs_ama.xlsx")
## Deleting the file excel/tmrc2_pro_vs_ama.xlsx before writing the tables.

1 Human macrophage comparison

new_conditions <- paste0(pData(hs_macrophage)[["macrophagetreatment"]], "_",
                         pData(hs_macrophage)[["macrophagezymodeme"]])
hs_macrophage <- set_expt_conditions(hs_macrophage, fact = new_conditions)

hs_macrophage_de <- all_pairwise(hs_macrophage, model_batch="svaseq", filter=TRUE)
## This DE analysis will perform all pairwise comparisons among:
## 
##   inf_sb_z2.2   inf_sb_z2.3      inf_z2.2      inf_z2.3    uninf_none 
##             6             6             6             6             2 
## uninf_sb_none 
##             2
## This analysis will include surrogate estimates from: svaseq.
## This will pre-filter the input data using normalize_expt's: TRUE argument.
## Removing 0 low-count genes (11460 remaining).
## Setting 757 low elements to zero.
## transform_counts: Found 757 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.

## Contrasts of interest for the human macrophage data.  Each entry is
## `first_vs_second` = c(first, second), where the strings are the condition
## names printed by all_pairwise() above with the '_' and '.' characters
## removed (e.g. 'inf_sb_z2.3' is written 'infsbz23').
## The two '*sb_vs_sb' contrasts compare antimony-treated infected samples
## against antimony-treated uninfected samples, so their first element must
## be the 'infsb' condition rather than the untreated 'inf' one.
tmrc2_human_keepers <- list(
    "z23nosb_vs_uninf" = c("infz23", "uninfnone"),
    "z22nosb_vs_uninf" = c("infz22", "uninfnone"),
    "z23nosb_vs_z22nosb" = c("infz23", "infz22"),
    "z23sb_vs_z22sb" = c("infsbz23", "infsbz22"),
    "z23sb_vs_z23nosb" = c("infsbz23", "infz23"),
    "z22sb_vs_z22nosb" = c("infsbz22", "infz22"),
    "z23sb_vs_sb" = c("infsbz23", "uninfsbnone"),
    "z22sb_vs_sb" = c("infsbz22", "uninfsbnone"),
    "sb_vs_uninf" = c("uninfsbnone", "uninfnone"))
hs_macrophage_table <- combine_de_tables(
    hs_macrophage_de,
    keepers = tmrc2_human_keepers,
    excel=glue::glue("excel/macrophage_human_table-v{ver}.xlsx"))
hs_macrophage_sig <- extract_significant_genes(
    hs_macrophage_table,
    excel=glue::glue("excel/macrophage_human_sig-v{ver}.xlsx"))
## Using p column: limma_adjp.
## Using p column: edger_adjp.
## Using p column: deseq_adjp.
## Using p column: basic_adjp.

2 SNP profiles

Over the last couple of weeks, I redid all the variant searches with a newer, (I think) more sensitive and more specific variant tool. In addition I changed my script which interprets the results so that it is able to extract any tags from it, instead of just the one or two that my previous script handled. In addition, at least in theory it is now able to provide the set of amino acid substitutions for every gene in species without or with introns (not really relevant for Leishmania panamensis).

However, as of this writing, I have not re-performed the same tasks with the 2016 data, primarily because it will require remapping all of the samples. As a result, for the moment I cannot combine the older and newer samples. Thus, any of the following blocks which use the 2016 data are currently disabled.

old_expt <- create_expt("sample_sheets/tmrc2_samples_20191203.xlsx",
                        file_column = "tophat2file")
## Reading the sample metadata.
## Dropped 13 rows from the sample metadata because the sample ID is blank.
## The sample definitions comprises: 50 rows(samples) and 38 columns(metadata fields).
## Warning in create_expt("sample_sheets/tmrc2_samples_20191203.xlsx", file_column
## = "tophat2file"): Some samples were removed when cross referencing the samples
## against the count data.
## Matched 8841 annotations and counts.
## Bringing together the count matrix and gene information.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 8841 features and 33 samples.
##tt <- lp_expt[["expressionset"]]
##rownames(tt) <- gsub(pattern = "^exon_", replacement = "", x = rownames(tt))
##rownames(tt) <- gsub(pattern = "\\.E1$", replacement = "", x = rownames(tt))
##lp_expt$expressionset <- tt

## Strip the leading 'exon_' and the trailing '.1' from the feature names of
## the older expressionset, then store the renamed object back in old_expt.
old_eset <- old_expt[["expressionset"]]
cleaned_ids <- gsub(pattern = "^exon_", replacement = "", x = rownames(old_eset))
cleaned_ids <- gsub(pattern = "\\.1$", replacement = "", x = cleaned_ids)
rownames(old_eset) <- cleaned_ids
old_expt[["expressionset"]] <- old_eset
rm(old_eset, cleaned_ids)

2.1 Create the SNP expressionset

One other important caveat, we have a group of new samples which have not yet run through the variant search pipeline, so I need to remove them from consideration. Though it looks like they finished overnight…

## The next line drops the samples which are missing the SNP pipeline.
lp_snp <- subset_expt(lp_expt, subset="!is.na(pData(lp_expt)[['freebayessummary']])")
## subset_expt(): There were 101, now there are 101 samples.
new_snps <- count_expt_snps(lp_snp, annot_column = "freebayessummary", snp_column="PAIRED")
## New names:
## • `DP` -> `DP...3`
## • `RO` -> `RO...8`
## • `AO` -> `AO...9`
## • `QR` -> `QR...12`
## • `QA` -> `QA...13`
## • `DP` -> `DP...42`
## • `RO` -> `RO...43`
## • `QR` -> `QR...44`
## • `AO` -> `AO...45`
## • `QA` -> `QA...46`
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## • `DP` -> `DP...3`
## • `RO` -> `RO...8`
## • `AO` -> `AO...9`
## • `QR` -> `QR...12`
## • `QA` -> `QA...13`
## • `DP` -> `DP...42`
## • `RO` -> `RO...43`
## • `QR` -> `QR...44`
## • `AO` -> `AO...45`
## • `QA` -> `QA...46`
## Warning: NAs introduced by coercion
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## • `DP` -> `DP...3`
## • `RO` -> `RO...8`
## • `AO` -> `AO...9`
## • `QR` -> `QR...12`
## • `QA` -> `QA...13`
## • `DP` -> `DP...42`
## • `RO` -> `RO...43`
## • `QR` -> `QR...44`
## • `AO` -> `AO...45`
## • `QA` -> `QA...46`
old_snps <- count_expt_snps(old_expt, annot_column = "bcftable", snp_column = 2)
## The rownames are missing the chromosome identifier,
## they probably came from an older version of this method.
nonzero_snps <- exprs(new_snps) != 0
colSums(nonzero_snps)
## tmrc20001 tmrc20065 tmrc20005 tmrc20007 tmrc20008 tmrc20027 tmrc20028 tmrc20032 
##         0     93649         0         0         0    351343    338580    146302 
## tmrc20040 tmrc20066 tmrc20039 tmrc20037 tmrc20038 tmrc20067 tmrc20068 tmrc20041 
##     58753     93615     25115     98958     97676     93954     96583     53184 
## tmrc20015 tmrc20009 tmrc20010 tmrc20016 tmrc20011 tmrc20012 tmrc20013 tmrc20017 
##     96398     15890     93816    146124     13914       456     94766     48288 
## tmrc20014 tmrc20018 tmrc20019 tmrc20070 tmrc20020 tmrc20021 tmrc20022 tmrc20025 
##     17245    140438     14829     97336     15484    101127     18143    364240 
## tmrc20024 tmrc20036 tmrc20069 tmrc20033 tmrc20026 tmrc20031 tmrc20076 tmrc20073 
##     18471     60087     18792     33663     15074     19139     18385     96169 
## tmrc20055 tmrc20079 tmrc20071 tmrc20078 tmrc20094 tmrc20042 tmrc20058 tmrc20072 
##     22246     96224     94353     18836     87878     19734     94524     50292 
## tmrc20059 tmrc20048 tmrc20057 tmrc20088 tmrc20056 tmrc20060 tmrc20077 tmrc20074 
##     94091     97164     48944     15594     22683     21506     18773     22132 
## tmrc20063 tmrc20053 tmrc20052 tmrc20064 tmrc20075 tmrc20051 tmrc20050 tmrc20049 
##     28254     20181    100709     93173     97982     94125     17200     16168 
## tmrc20062 tmrc20110 tmrc20080 tmrc20043 tmrc20083 tmrc20054 tmrc20085 tmrc20046 
##     93677     16997     96528     95623     21167     93603     89765     48608 
## tmrc20093 tmrc20089 tmrc20047 tmrc20090 tmrc20044 tmrc20045 tmrc20061 tmrc20105 
##     48254     90421     92637     91564     14861     50403    116906     86758 
## tmrc20108 tmrc20109 tmrc20098 tmrc20096 tmrc20097 tmrc20101 tmrc20092 tmrc20082 
##     97005     17932     92927     17534     46863     17753     16578    108121 
## tmrc20102 tmrc20099 tmrc20100 tmrc20091 tmrc20084 tmrc20087 tmrc20103 tmrc20104 
##     92380     91383     94381     15059     46548     14947     49368     94237 
## tmrc20086 tmrc20107 tmrc20081 tmrc20106 tmrc20095 
##     15813     95370     19533     18830     81200
## My old_snps is using an older annotation incorrectly, so fix it here:
Biobase::annotation(old_snps$expressionset) <- Biobase::annotation(new_snps$expressionset)
both_snps <- combine_expts(new_snps, old_snps)
both_norm <- normalize_expt(both_snps, transform = "log2", norm = "quant")
## transform_counts: Found 207502544 values equal to 0, adding 1 to the matrix.
## strains <- both_norm[["design"]][["strain"]]
both_strain <- set_expt_conditions(both_norm, fact = "strain")

The data structure ‘both_norm’ now contains our 2016 data along with the newer data collected since 2019.

2.2 Plot of SNP profiles for zymodemes

The following plot shows the SNP profiles of all samples (old and new) where the colors at the top show either the 2.2 strains (orange), 2.3 strains (green), the previous samples (purple), or the various lab strains (pink etc).

new_variant_heatmap <- plot_disheat(new_snps)
dev <- pp(file = "images/raw_snp_disheat.png", height=12, width=12)
new_variant_heatmap$plot
closed <- dev.off()
new_variant_heatmap$plot

The function get_snp_sets() takes the provided metadata factor (in this case ‘condition’) and looks for variants which are exclusive to each element in it. In this case, this is looking for differences between 2.2 and 2.3, as well as the set shared among them.

snp_sets <- get_snp_sets(both_snps, factor = "condition")
## The factor z2.3 has 41 rows.
## The factor z2.2 has 43 rows.
## The factor unknown has 2 rows.
## The factor z1.0 has only 1 row.
## The factor b2904 has only 1 row.
## The factor z3.0 has only 1 row.
## The factor z2.0 has only 1 row.
## The factor z1.5 has only 1 row.
## The factor z2.1 has 7 rows.
## The factor z2.4 has 2 rows.
## The factor z3.2 has only 1 row.
## The factor sh has 13 rows.
## The factor chr has 14 rows.
## The factor inf has 6 rows.
## Copy the annotation label of the current data onto the older data before
## merging the two (presumably required by combine_expts() -- confirm).
current_annotation <- Biobase::annotation(lp_expt[["expressionset"]])
Biobase::annotation(old_expt[["expressionset"]]) <- current_annotation
both_expt <- combine_expts(lp_expt, old_expt)

snp_genes <- sm(snps_vs_genes(both_expt, snp_sets, expt_name_col = "chromosome"))
## I think we have some metrics here we can plot...
snp_subset <- snp_subset_genes(
  both_expt, both_snps,
  genes = c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
            "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300"))
## remove_genes_expt(), before removal, there were 1514127 genes, now there are 179.
## There are 134 samples which kept less than 90 percent counts.
## tmrc20001 tmrc20065 tmrc20005 tmrc20007 tmrc20008 tmrc20027 tmrc20028 tmrc20032 
##  0.000000  0.010678  0.000000  0.000000  0.000000  0.018785  0.020084  0.010253 
## tmrc20040 tmrc20066 tmrc20039 tmrc20037 tmrc20038 tmrc20067 tmrc20068 tmrc20041 
##  0.008510  0.012818  0.015927  0.012126  0.013309  0.012772  0.011389  0.005641 
## tmrc20015 tmrc20009 tmrc20010 tmrc20016 tmrc20011 tmrc20012 tmrc20013 tmrc20017 
##  0.010374  0.000000  0.012046  0.007528  0.007586  0.219298  0.010836  0.004142 
## tmrc20014 tmrc20018 tmrc20019 tmrc20070 tmrc20020 tmrc20021 tmrc20022 tmrc20025 
##  0.005799  0.007833  0.006744  0.011301  0.012917  0.009889  0.000000  0.018120 
## tmrc20024 tmrc20036 tmrc20069 tmrc20033 tmrc20026 tmrc20031 tmrc20076 tmrc20073 
##  0.005414  0.003329  0.021286  0.011882  0.006634  0.005225  0.005439  0.011438 
## tmrc20055 tmrc20079 tmrc20071 tmrc20078 tmrc20094 tmrc20042 tmrc20058 tmrc20072 
##  0.017981  0.012471  0.011658  0.010618  0.013655  0.010135  0.010579  0.003977 
## tmrc20059 tmrc20048 tmrc20057 tmrc20088 tmrc20056 tmrc20060 tmrc20077 tmrc20074 
##  0.010628  0.013379  0.006129  0.019238  0.004409  0.013950  0.005327  0.013555 
## tmrc20063 tmrc20053 tmrc20052 tmrc20064 tmrc20075 tmrc20051 tmrc20050 tmrc20049 
##  0.010618  0.019821  0.010923  0.011806  0.015309  0.011687  0.005814  0.018555 
## tmrc20062 tmrc20110 tmrc20080 tmrc20043 tmrc20083 tmrc20054 tmrc20085 tmrc20046 
##  0.010675  0.011767  0.012432  0.013595  0.009449  0.011752  0.012254  0.002057 
## tmrc20093 tmrc20089 tmrc20047 tmrc20090 tmrc20044 tmrc20045 tmrc20061 tmrc20105 
##  0.004145  0.012165  0.014033  0.013106  0.013458  0.001984  0.013686  0.013832 
## tmrc20108 tmrc20109 tmrc20098 tmrc20096 tmrc20097 tmrc20101 tmrc20092 tmrc20082 
##  0.011340  0.022306  0.011837  0.022813  0.004268  0.005633  0.018096  0.010469 
## tmrc20102 tmrc20099 tmrc20100 tmrc20091 tmrc20084 tmrc20087 tmrc20103 tmrc20104 
##  0.011907  0.010943  0.011655  0.013281  0.008593  0.006690  0.004051  0.011673 
## tmrc20086 tmrc20107 tmrc20081 tmrc20106 tmrc20095  hpgl0242  hpgl0243  hpgl0244 
##  0.006324  0.013631  0.010239  0.010621  0.020936  0.000000  0.029118  0.027772 
##  hpgl0245  hpgl0246  hpgl0247  hpgl0248  hpgl0316  hpgl0318  hpgl0320  hpgl0322 
##  0.009257  0.028169  0.069020  0.000000  0.013550  0.106838  0.058167  0.052041 
##  hpgl0631  hpgl0632  hpgl0633  hpgl0634  hpgl0635  hpgl0636  hpgl0638  hpgl0639 
##  0.083820  0.000000  0.032016  0.048212  0.030793  0.000000  0.000000  0.029641 
##  hpgl0641  hpgl0643  hpgl0651  hpgl0652  hpgl0653  hpgl0654  hpgl0655  hpgl0656 
##  0.024917  0.109469  0.086478  0.000000  0.036742  0.040519  0.035387  0.000000 
##  hpgl0658  hpgl0659  hpgl0660  hpgl0661  hpgl0662  hpgl0663 
##  0.084983  0.000000  0.038113  0.033367  0.028960  0.000000
## Heatmap of the gene-restricted variants; each row is labelled with its row
## name from exprs(snp_subset).
zymo_heat <- plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset)))
zymo_heat

2.3 Compare variants to DE genes

Najib has asked a few times about the relationship between variants and DE genes. In subsequent conversations I figured out that what he really wants to learn about is variants in the UTR (most likely 5’) which might affect expression of genes. The following explicitly does not answer that question, but addresses a parallel one: is there a relationship between variants in the CDS and differential expression?

## Tabulate the per-gene variant counts from snp_genes and put them on a
## log2(x + 1) scale.
vars_df <- data.frame(ID = names(snp_genes$summary_by_gene), variants = as.numeric(snp_genes$summary_by_gene))
vars_df[["variants"]] <- log2(vars_df[["variants"]] + 1)
## NOTE(review): zy_df is not created anywhere in this document (see the
## recorded error below); presumably it is a differential expression table
## keyed by gene ID with a deseq_logfc column -- confirm and load it first.
## Every later statement in this chunk fails for the same reason.
vars_by_de_gene <- merge(zy_df, vars_df, by.x="row.names", by.y="ID")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'zy_df' not found
## Correlate logFC with the (log2) variant count per gene.
cor.test(vars_by_de_gene$deseq_logfc, vars_by_de_gene$variants)
## Error in cor.test(vars_by_de_gene$deseq_logfc, vars_by_de_gene$variants): object 'vars_by_de_gene' not found
variants_wrt_logfc <- plot_linear_scatter(vars_by_de_gene[, c("deseq_logfc", "variants")])
## Error in data.frame(df[, c(1, 2)]): object 'vars_by_de_gene' not found
variants_wrt_logfc$scatter
## Error in eval(expr, envir, enclos): object 'variants_wrt_logfc' not found
## It looks like there might be some genes of interest, even though this is not actually
## the question of interest.

Didn’t I create a set of densities by chromosome? Oh I think they come in from get_snp_sets()

2.4 SNPS associated with clinical response in the TMRC samples

clinical_sets <- get_snp_sets(new_snps, factor = "clinicalresponse")
## The factor cure has 38 rows.
## The factor failure has 38 rows.
## The factor laboratory line has only 1 row.
## The factor laboratory line miltefosine resistant has only 1 row.
## The factor nd has 19 rows.
## The factor reference strain has 4 rows.
## Pull the named density vector out of the SNP sets and keep only the entries
## whose names contain "LpaL" (presumably the chromosomes, not the scaffolds).
density_vec <- clinical_sets[["density"]]
chromosome_idx <- grep(pattern = "LpaL", x = names(density_vec))
density_df <- as.data.frame(density_vec[chromosome_idx])
density_df[["chr"]] <- rownames(density_df)
colnames(density_df) <- c("density_vec", "chr")
## One bar per chromosome.  aes_string() is deprecated in current ggplot2, so
## map the columns with aes(); every ggplot2 call is namespace-qualified so the
## chunk does not depend on ggplot2 being attached.
ggplot2::ggplot(density_df, ggplot2::aes(x = chr, y = density_vec)) +
  ggplot2::geom_col() +
  ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
                 axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5))

## clinical_written <- write_variants(new_snps)

2.4.1 Cross reference these variants by gene

## Cross reference the clinical-response variant sets against the genes.
clinical_genes <- snps_vs_genes(
  lp_expt, clinical_sets, expt_name_col = "chromosome")

## Join the per-gene variant counts to the gene annotations by row name and
## keep four columns by position.
## NOTE(review): columns 4 and 15 are assumed to be the product description and
## the gene length -- confirm against fData(lp_expt) if the annotation changes.
snp_density <- merge(
  as.data.frame(clinical_genes[["summary_by_gene"]]),
  as.data.frame(fData(lp_expt)),
  by = "row.names")[, c(1, 2, 4, 15)]
colnames(snp_density) <- c("name", "snps", "product", "length")
snp_density[["product"]] <- tolower(snp_density[["product"]])
snp_density[["length"]] <- as.numeric(snp_density[["length"]])
## Variants per unit of gene length, densest genes first.
snp_density[["density"]] <- snp_density[["snps"]] / snp_density[["length"]]
snp_idx <- order(snp_density[["density"]], decreasing = TRUE)
snp_density <- snp_density[snp_idx, ]

## Filter these for [A|a]mastin gp63 Leishmanolysin: drop every gene whose
## (lower-cased) product mentions one of these families.
removers <- c("amastin", "gp63", "leishmanolysin")
drop_idx <- grepl(pattern = paste(removers, collapse = "|"),
                  x = snp_density[["product"]])
snp_density <- snp_density[!drop_idx, ]

clinical_snps <- snps_intersections(
  lp_expt, clinical_sets, chr_column = "chromosome")

## Variants seen in "failure, reference strain" followed by those seen only in
## "failure"; the cure table holds the variants seen only in "cure".
fail_ref_snps <- rbind(
  as.data.frame(clinical_snps[["inters"]][["failure, reference strain"]]),
  as.data.frame(clinical_snps[["inters"]][["failure"]]))
cure_snps <- as.data.frame(clinical_snps[["inters"]][["cure"]])

head(fail_ref_snps)
##                                       seqnames  start    end width strand
## chr_LpaL13-01_pos_110212_ref_T_alt_C LpaL13-01 110212 110213     2      +
## chr_LpaL13-01_pos_156486_ref_T_alt_C LpaL13-01 156486 156487     2      +
## chr_LpaL13-02_pos_143639_ref_T_alt_C LpaL13-02 143639 143640     2      +
## chr_LpaL13-02_pos_196792_ref_A_alt_C LpaL13-02 196792 196793     2      +
## chr_LpaL13-02_pos_197657_ref_T_alt_C LpaL13-02 197657 197658     2      +
## chr_LpaL13-02_pos_198494_ref_T_alt_C LpaL13-02 198494 198495     2      +
head(cure_snps)
##                                       seqnames  start    end width strand
## chr_LpaL13-01_pos_137363_ref_C_alt_A LpaL13-01 137363 137364     2      +
## chr_LpaL13-01_pos_140306_ref_C_alt_A LpaL13-01 140306 140307     2      +
## chr_LpaL13-01_pos_169299_ref_A_alt_G LpaL13-01 169299 169300     2      +
## chr_LpaL13-02_pos_71147_ref_G_alt_A  LpaL13-02  71147  71148     2      +
## chr_LpaL13-02_pos_76744_ref_A_alt_G  LpaL13-02  76744  76745     2      +
## chr_LpaL13-02_pos_76932_ref_G_alt_A  LpaL13-02  76932  76933     2      +
## write.csv() does not create missing directories; without csv/ both writes
## fail with "cannot open file ... No such file or directory", so make sure the
## directory exists before writing the variant names.
dir.create("csv", showWarnings = FALSE, recursive = TRUE)
write.csv(file="csv/cure_variants.txt", x=rownames(cure_snps))
write.csv(file="csv/fail_variants.txt", x=rownames(fail_ref_snps))
## Add per-gene variant counts for the cure and failure sets to the gene
## annotations of lp_expt.
annot <- fData(lp_expt)
## Remember the feature order: merge() sorts its result by the key, and the
## feature data must stay aligned with the rows of the expressionset.
original_gene_order <- rownames(annot)
clinical_interest <- as.data.frame(clinical_snps[["gene_summaries"]][["cure"]])
clinical_interest <- merge(clinical_interest,
                           as.data.frame(clinical_snps[["gene_summaries"]][["failure, reference strain"]]),
                           by = "row.names")
rownames(clinical_interest) <- clinical_interest[["Row.names"]]
clinical_interest[["Row.names"]] <- NULL
colnames(clinical_interest) <- c("cure_snps","fail_snps")
## all.x = TRUE keeps every annotated gene; the default inner join silently
## dropped genes without an entry in both summaries, leaving the feature data
## shorter than (and ordered differently from) the expression matrix.  Genes
## without counts get NA in cure_snps/fail_snps.
annot <- merge(annot, clinical_interest, by = "row.names", all.x = TRUE)
rownames(annot) <- annot[["Row.names"]]
annot[["Row.names"]] <- NULL
annot <- annot[original_gene_order, , drop = FALSE]
fData(lp_expt$expressionset) <- annot

3 Zymodeme for new samples

The heatmap produced here should show the variants only for the zymodeme genes.

3.1 Hunt for snp clusters

I am thinking that if we find clusters of locations which are variant, that might provide some PCR testing possibilities.

## Keep only the z2.2 and z2.3 samples (dropping the 2.1, 2.4, unknown, and
## null), then recompute the variant sets with zymodemecategorical as the factor.
pruned_snps <- subset_expt(new_snps, subset="condition=='z2.2'|condition=='z2.3'")
## subset_expt(): There were 101, now there are 84 samples.
new_sets <- get_snp_sets(pruned_snps, factor = "zymodemecategorical")
## The factor z22 has 43 rows.
## The factor z23 has 41 rows.
summary(new_sets)
##               Length Class      Mode     
## medians         3    data.frame list     
## possibilities   2    -none-     character
## intersections   3    -none-     list     
## chr_data      726    -none-     list     
## set_names       4    -none-     list     
## invert_names    4    -none-     list     
## density       726    -none-     numeric
## 1000000: 2.2
## 0100000: 2.3

summary(new_sets[["intersections"]][["10"]])
##    Length     Class      Mode 
##      3562 character character
## write.csv() cannot create csv/ itself; make sure it exists before writing
## the variants of the "10" intersection.
dir.create("csv", showWarnings = FALSE, recursive = TRUE)
write.csv(file="csv/variants_22.csv", x=new_sets[["intersections"]][["10"]])
summary(new_sets[["intersections"]][["01"]])
##    Length     Class      Mode 
##     81485 character character
## write.csv() cannot create csv/ itself; make sure it exists before writing
## the variants of the "01" intersection.
dir.create("csv", showWarnings = FALSE, recursive = TRUE)
write.csv(file="csv/variants_23.csv", x=new_sets[["intersections"]][["01"]])

Thus we see that there are 3,562 variants associated with 2.2 and 81,485 associated with 2.3.

3.1.1 A small function for searching for potential PCR primers

The following function uses the positional data to look for sequential mismatches associated with zymodeme in the hopes that there will be some regions which would provide good potential targets for a PCR-based assay.

## Find runs of closely spaced variants within one intersection of a SNP set.
##
## Arguments:
##   snp_sets: list with an "intersections" element (character vectors of
##     variant names shaped like chr_<chromosome>_pos_<position>_ref_<X>_alt_<Y>)
##     and a "set_names" element mapping intersection names to condition labels.
##   conditions: numeric index of the intersection, or the condition label to
##     look up in set_names; NULL means the first intersection.
##   minimum: smallest run length to report.
##   maximum_separation: largest distance between neighbouring variants which
##     still counts as sequential.
##
## Returns a data frame (columns chr and pos, variant names as row names) with
## the final variant of every run of at least 'minimum' consecutive variants
## which each lie within 'maximum_separation' of their predecessor.  The table
## of PCR-sized candidate pairs (computed but discarded by earlier versions of
## this function) is attached as the "remaining" attribute.
##
## Side effects: writes doubles.csv, one_away.csv and two_away.csv into the
## working directory, prints the head of the distance table and emits two
## messages.
sequential_variants <- function(snp_sets, conditions = NULL, minimum = 3, maximum_separation = 3) {
  ## Distance from each variant to the previous one on the same chromosome; the
  ## first variant of every chromosome keeps its absolute position.  The input
  ## must already be sorted by chromosome and position.  Vectorized, so it is
  ## fast on tens of thousands of rows and safe on zero rows.
  distance_to_previous <- function(chr, pos) {
    dist <- pos
    n <- length(pos)
    if (n > 1) {
      same_chr <- which(chr[-1] == chr[-n])
      dist[same_chr + 1] <- pos[same_chr + 1] - pos[same_chr]
    }
    dist
  }

  if (is.null(conditions)) {
    conditions <- 1
  }
  intersection_sets <- snp_sets[["intersections"]]
  intersection_names <- snp_sets[["set_names"]]
  if (is.numeric(conditions)) {
    chosen_intersection <- conditions
  } else {
    chosen_intersection <- names(intersection_names)[which(intersection_names == conditions)]
    ## Fail with a readable message instead of an opaque subscript error.
    if (length(chosen_intersection) != 1) {
      stop("Expected exactly one intersection for condition '", conditions,
           "', found ", length(chosen_intersection), ".", call. = FALSE)
    }
  }

  ## Parse chromosome and position out of the variant names and sort by them.
  possible_positions <- intersection_sets[[chosen_intersection]]
  position_table <- data.frame(row.names = possible_positions)
  pat <- "^chr_(.+)_pos_(.+)_ref_.*$"
  position_table[["chr"]] <- gsub(pattern = pat, replacement = "\\1", x = rownames(position_table))
  position_table[["pos"]] <- as.numeric(gsub(pattern = pat, replacement = "\\2", x = rownames(position_table)))
  position_idx <- order(position_table[, "chr"], position_table[, "pos"])
  position_table <- position_table[position_idx, ]
  position_table[["dist"]] <- distance_to_previous(position_table[["chr"]],
                                                   position_table[["pos"]])

  ## Working interactively here: dump the variants which are 1, 2 and 3 bases
  ## past their predecessor for inspection.
  doubles <- position_table[["dist"]] == 1
  doubles <- position_table[doubles, ]
  write.csv(doubles, "doubles.csv")

  one_away <- position_table[["dist"]] == 2
  one_away <- position_table[one_away, ]
  write.csv(one_away, "one_away.csv")

  two_away <- position_table[["dist"]] == 3
  two_away <- position_table[two_away, ]
  write.csv(two_away, "two_away.csv")

  ## Distance between successive close pairs; pairs 200-1000 bases apart whose
  ## reference base is G or C are kept as potential PCR amplicon ends.
  combined <- rbind(doubles, one_away, two_away)
  combined <- combined[order(combined[, "chr"], combined[, "pos"]), ]
  combined[["dist_pair"]] <- distance_to_previous(combined[["chr"]], combined[["pos"]])

  dist_pair_maximum <- 1000
  dist_pair_minimum <- 200
  dist_pair_idx <- combined[["dist_pair"]] <= dist_pair_maximum &
    combined[["dist_pair"]] >= dist_pair_minimum
  remaining <- combined[dist_pair_idx, ]
  no_weak_idx <- grepl(pattern="ref_(G|C)", x=rownames(remaining))
  remaining <- remaining[no_weak_idx, ]

  print(head(table(position_table[["dist"]])))
  sequentials <- position_table[["dist"]] <= maximum_separation
  message("There are ", sum(sequentials), " candidate regions.")

  ## Run length encoding finds the stretches of consecutive sequential
  ## variants; record each stretch's length on its final row.
  runs <- rle(sequentials)
  true_runs <- which(runs[["values"]])
  run_ends <- cumsum(runs[["lengths"]])[true_runs]
  last_sequential <- numeric(nrow(position_table))
  last_sequential[run_ends] <- runs[["lengths"]][true_runs]
  position_table[["last_sequential"]] <- last_sequential
  ## c(0, ...) keeps max() quiet when the chosen intersection is empty.
  message("The maximum sequential set is: ", max(c(0, last_sequential)), ".")

  wanted <- position_table[last_sequential >= minimum, c("chr", "pos")]
  attr(wanted, "remaining") <- remaining
  wanted
}

## Runs of at least 1 variant within 2 bases of its predecessor for z2.2.
zymo22_sequentials <- sequential_variants(
  new_sets,
  conditions = "z22", minimum = 1, maximum_separation = 2)
dim(zymo22_sequentials)
## 7 candidate regions for zymodeme 2.2 -- thus I am betting that the reference strain is a 2.2
## Runs of at least 2 for z2.3, same separation.
zymo23_sequentials <- sequential_variants(
  new_sets,
  conditions = "z23", minimum = 2, maximum_separation = 2)
dim(zymo23_sequentials)
## In contrast, there are lots (587) of interesting regions for 2.3!

3.1.2 Extract a promising region from the genome

The first 4 candidate regions from my set of remaining (Chr, Pos., Distance):

  • LpaL13-15, 238433, 448
  • LpaL13-18, 142844, 613
  • LpaL13-29, 830342, 252
  • LpaL13-33, 1331507, 843

Let's define a couple of terms:

  • Third: Each of the 4 above positions.
  • Second: Third - Distance
  • End: Third + PrimerLen
  • Start: Second - PrimerLen

In each instance, these are the last positions, so we want to grab three things:

  • The entire region from Start -> End, this way we can have a quick sanity check.
  • Start -> Second.
  • (Third -> End) <- Reverse complemented
## Every candidate uses the same primer length.
## NOTE(review): assuming subseq() here is the XVector/Biostrings one, start and
## end are both inclusive, so each 5p/3p piece below is primer_length + 1 bases
## long -- confirm that is intended.
primer_length <- 22

## Candidate 1: LpaL13-15, third = 238433, distance = 448
first_candidate_chr <- genome[["LpaL13_15"]]
first_candidate_third <- 238433
amplicon_length <- 448
first_candidate_second <- first_candidate_third - amplicon_length
first_candidate_end <- first_candidate_third + primer_length
first_candidate_start <- first_candidate_second - primer_length
## Whole region (Start -> End) as a sanity check.
first_candidate_region <- subseq(first_candidate_chr,
                                 start = first_candidate_start,
                                 end = first_candidate_end)
first_candidate_region
## Forward piece: Start -> Second.
first_candidate_5p <- subseq(first_candidate_chr,
                             start = first_candidate_start,
                             end = first_candidate_second)
as.character(first_candidate_5p)
## Reverse piece: Third -> End, reverse complemented.
first_candidate_3p <- spgs::reverseComplement(
  subseq(first_candidate_chr,
         start = first_candidate_third,
         end = first_candidate_end))
first_candidate_3p

## Candidate 2: LpaL13-18, third = 142844, distance = 613
second_candidate_chr <- genome[["LpaL13_18"]]
second_candidate_third <- 142844
amplicon_length <- 613
second_candidate_second <- second_candidate_third - amplicon_length
second_candidate_end <- second_candidate_third + primer_length
second_candidate_start <- second_candidate_second - primer_length
second_candidate_region <- subseq(second_candidate_chr,
                                  start = second_candidate_start,
                                  end = second_candidate_end)
second_candidate_region
second_candidate_5p <- subseq(second_candidate_chr,
                              start = second_candidate_start,
                              end = second_candidate_second)
as.character(second_candidate_5p)
second_candidate_3p <- spgs::reverseComplement(
  subseq(second_candidate_chr,
         start = second_candidate_third,
         end = second_candidate_end))
second_candidate_3p


## Candidate 3: LpaL13-29, third = 830342, distance = 252
third_candidate_chr <- genome[["LpaL13_29"]]
third_candidate_third <- 830342
amplicon_length <- 252
third_candidate_second <- third_candidate_third - amplicon_length
third_candidate_end <- third_candidate_third + primer_length
third_candidate_start <- third_candidate_second - primer_length
third_candidate_region <- subseq(third_candidate_chr,
                                 start = third_candidate_start,
                                 end = third_candidate_end)
third_candidate_region
third_candidate_5p <- subseq(third_candidate_chr,
                             start = third_candidate_start,
                             end = third_candidate_second)
as.character(third_candidate_5p)
third_candidate_3p <- spgs::reverseComplement(
  subseq(third_candidate_chr,
         start = third_candidate_third,
         end = third_candidate_end))
third_candidate_3p
## This one is a garbage polypyrimidine tract,
## which is actually interesting if the mutations mess it up.


## Candidate 4: LpaL13-33, third = 1331507, distance = 843
fourth_candidate_chr <- genome[["LpaL13_33"]]
fourth_candidate_third <- 1331507
amplicon_length <- 843
fourth_candidate_second <- fourth_candidate_third - amplicon_length
fourth_candidate_end <- fourth_candidate_third + primer_length
fourth_candidate_start <- fourth_candidate_second - primer_length
fourth_candidate_region <- subseq(fourth_candidate_chr,
                                  start = fourth_candidate_start,
                                  end = fourth_candidate_end)
fourth_candidate_region
fourth_candidate_5p <- subseq(fourth_candidate_chr,
                              start = fourth_candidate_start,
                              end = fourth_candidate_second)
as.character(fourth_candidate_5p)
fourth_candidate_3p <- spgs::reverseComplement(
  subseq(fourth_candidate_chr,
         start = fourth_candidate_third,
         end = fourth_candidate_end))
fourth_candidate_3p

3.2 Go hunting for Sanger sequencing regions

I made a fun little function which should find regions which have lots of variants associated with a given experimental factor.

## Keep the z2.2/z2.3 samples, then only those which have a value in the
## bcftable metadata column, and count the variants for those samples.
pheno <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
## subset_expt(): There were 101, now there are 84 samples.
pheno <- subset_expt(pheno, subset = "!is.na(pData(pheno)[['bcftable']])")
## subset_expt(): There were 84, now there are 55 samples.
pheno_snps <- sm(count_expt_snps(pheno, annot_column = "bcftable"))

## Search for variant-dense regions using the named BSgenome package and gff
## annotation.
## NOTE(review): in the recorded run this call fails inside
## seq_len(chromosomes) after an "NAs introduced by coercion" warning, so
## fun_stuff is never created -- needs fixing before the following sections
## can run.
fun_stuff <- snp_density_primers(
    pheno_snps,
    bsgenome = "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53",
    gff = "reference/TriTrypDB-53_LpanamensisMHOMCOL81L13.gff")
## Loading required package: BSgenome
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: rtracklayer
## Warning in seq_len(chromosomes): first element used of 'length.out' argument
## Warning in snp_density_primers(pheno_snps, bsgenome =
## "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53", : NAs introduced by coercion
## Error in seq_len(chromosomes): argument must be coercible to non-negative integer
## Drop the favorites whose row names contain "SCAF" and keep the row name of
## each remaining region in a "bin" column.
## NOTE(review): these statements fail in the recorded run only because
## fun_stuff does not exist (the snp_density_primers() call errored).
drop_scaffolds <- grepl(x = rownames(fun_stuff$favorites), pattern = "SCAF")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grepl': error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'fun_stuff' not found
favorite_primer_regions <- fun_stuff[["favorites"]][!drop_scaffolds, ]
## Error in eval(expr, envir, enclos): object 'fun_stuff' not found
favorite_primer_regions[["bin"]] <- rownames(favorite_primer_regions)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'favorite_primer_regions' not found
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Biostrings':
## 
##     collapse, intersect, setdiff, setequal, union
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:hpgltools':
## 
##     combine
## The following object is masked from 'package:testthat':
## 
##     matches
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Move the bin column to the front of the table.
favorite_primer_regions <- favorite_primer_regions %>%
  relocate(bin)
## Error in relocate(., bin): object 'favorite_primer_regions' not found

3.3 Combine this table with 2.2/2.3 genes

Here is my note from our meeting:

Cross reference primers to DE genes of 2.2/2.3 and/or resistance/susceptible, add a column to the primer spreadsheet with the DE genes (in retrospect I am guessing this actually means to put the logFC as a column).

One nice thing, I did a semantic removal on the lp_expt, so the set of logFC/pvalues should not have any of the offending types; thus I should be able to automagically get rid of them in the merge.

## NOTE(review): nothing in this chunk ran in the recorded session:
## zy_table_sva and sus_table_sva are not created in this document and
## favorite_primer_regions is missing because of the earlier failure.
## Pull the logFC and adjusted p-value columns of the z23_vs_z22 table.
logfc <- zy_table_sva[["data"]][["z23_vs_z22"]]
## Error in eval(expr, envir, enclos): object 'zy_table_sva' not found
logfc_columns <- logfc[, c("deseq_logfc", "deseq_adjp")]
## Error in eval(expr, envir, enclos): object 'logfc' not found
colnames(logfc_columns) <- c("z23_logfc", "z23_adjp")
## Error in colnames(logfc_columns) <- c("z23_logfc", "z23_adjp"): object 'logfc_columns' not found
## Attach them to the primer regions through the closest_gene_before_id column.
## merge() defaults to an inner join, so regions whose gene is absent from the
## DE table are dropped here.
new_table <- merge(favorite_primer_regions, logfc_columns,
                   by.x = "closest_gene_before_id", by.y = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'favorite_primer_regions' not found
## Same again for the sensitive_vs_resistant table.
sus <- sus_table_sva[["data"]][["sensitive_vs_resistant"]]
## Error in eval(expr, envir, enclos): object 'sus_table_sva' not found
sus_columns <- sus[, c("deseq_logfc", "deseq_adjp")]
## Error in eval(expr, envir, enclos): object 'sus' not found
colnames(sus_columns) <- c("sus_logfc", "sus_adjp")
## Error in colnames(sus_columns) <- c("sus_logfc", "sus_adjp"): object 'sus_columns' not found
new_table <- merge(new_table, sus_columns,
                   by.x = "closest_gene_before_id", by.y = "row.names") %>%
  relocate(bin)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'new_table' not found
## Write the cross referenced table to a spreadsheet.
written <- write_xlsx(data=new_table,
                      excel="excel/favorite_primers_xref_zy_sus.xlsx")
## Error in write_xlsx(data = new_table, excel = "excel/favorite_primers_xref_zy_sus.xlsx"): object 'new_table' not found

3.4 Make a heatmap describing the clustering of variants

We can cross reference the variants against the zymodeme status and plot a heatmap of the results and hopefully see how they separate.

## Cross reference the z2.2/z2.3 variant sets against the genes; this replaces
## the snp_genes computed from the combined experiment.
snp_genes <- sm(
  snps_vs_genes(lp_expt, new_sets, expt_name_col = "chromosome"))

## One color per zymodeme.
clinical_colors_v2 <- list("z22" = "#0000cc", "z23" = "#cc0000")

## Normalize the pruned variants (normq = "quant"), then use the zymodeme as
## the experimental condition and apply the colors above.
new_zymo_norm <- normalize_expt(pruned_snps, normq = "quant")
new_zymo_norm <- set_expt_conditions(new_zymo_norm, fact = "zymodemecategorical")
new_zymo_norm <- set_expt_colors(new_zymo_norm, clinical_colors_v2)

## Draw the heatmap once into the pdf and once inline.
zymo_heat <- plot_disheat(new_zymo_norm)
dev <- pp(file = "images/onlyz22_z23_snp_heatmap.pdf", width = 12, height = 12)
zymo_heat[["plot"]]
closed <- dev.off()
zymo_heat[["plot"]]

3.4.1 Annotated heatmap of variants

Now let us try to make a heatmap which includes some of the annotation data.

## Build an annotated heatmap of the sample-to-sample correlations in both_norm.
## Samples without a recorded strain are labelled "unknown".
des <- both_norm[["design"]]
undef_idx <- is.na(des[["strain"]])
des[undef_idx, "strain"] <- "unknown"

##hmcols <- colorRampPalette(c("yellow","black","darkblue"))(256)
correlations <- hpgl_cor(exprs(both_norm))
## Warning in stats::cor(df, method = method, ...): the standard deviation is zero
## Correlations which came back NA (see the warning above) are set to 0.
na_idx <- is.na(correlations)
correlations[na_idx] <- 0

## Note which samples lack a zymodeme, convert both annotation columns to
## character, and label the missing zymodemes "unknown".
zymo_missing_idx <- is.na(des[["zymodemecategorical"]])
des[["zymodemecategorical"]] <- as.character(des[["zymodemecategorical"]])
des[["clinicalcategorical"]] <- as.character(des[["clinicalcategorical"]])
des[zymo_missing_idx, "zymodemecategorical"] <- "unknown"
mydendro <- list(
  "clustfun" = hclust,
  "lwd" = 2.0)
## Column annotations: zymodeme and clinical outcome.
col_data <- as.data.frame(des[, c("zymodemecategorical", "clinicalcategorical")])

## The missing outcomes are found before the columns are renamed and are
## labelled "undefined" afterwards.
unknown_clinical <- is.na(col_data[["clinicalcategorical"]])
## Row annotation: strain.
## NOTE(review): if des is a plain data.frame, selecting the single strain
## column yields a bare vector, so row_data does not carry the sample names as
## row names -- confirm annHeatmap2() does not need them.
row_data <- as.data.frame(des[, c("strain")])
colnames(col_data) <- c("zymodeme", "outcome")
col_data[unknown_clinical, "outcome"] <- "undefined"

colnames(row_data) <- c("strain")
myannot <- list(
  "Col" = list("data" = col_data),
  "Row" = list("data" = row_data))
myclust <- list("cuth" = 1.0,
                "col" = BrewerClusterCol)
mylabs <- list(
  "Row" = list("nrow" = 4),
  "Col" = list("nrow" = 4))
## 240 shades from dark blue to beige; the recorded warning below shows that
## the last 26 are not used.
hmcols <- colorRampPalette(c("darkblue", "beige"))(240)
zymo_annot_heat <- annHeatmap2(
    correlations,
    dendrogram = mydendro,
    annotation = myannot,
    cluster = myclust,
    labels = mylabs,
    ## The following controls if the picture is symmetric
    scale = "none",
    col = hmcols)
## Warning in breakColors(breaks, col): more colors than classes: ignoring 26 last
## colors
## Draw once into the png and once inline.
dev <- pp(file = "images/dendro_heatmap.png", height = 20, width = 20)
plot(zymo_annot_heat)
closed <- dev.off()
plot(zymo_annot_heat)

Print the larger heatmap so that all the labels appear. Keep in mind that as we get more samples, this image needs to continue getting bigger.

big heatmap

xref_prop <- table(pheno_snps[["conditions"]])
pheno_snps$conditions
##  [1] "z2.3" "z2.3" "z2.2" "z2.3" "z2.2" "z2.3" "z2.3" "z2.3" "z2.3" "z2.2"
## [11] "z2.3" "z2.2" "z2.3" "z2.3" "z2.2" "z2.2" "z2.3" "z2.2" "z2.2" "z2.3"
## [21] "z2.2" "z2.3" "z2.2" "z2.3" "z2.2" "z2.2" "z2.2" "z2.2" "z2.2" "z2.2"
## [31] "z2.2" "z2.3" "z2.2" "z2.3" "z2.3" "z2.2" "z2.2" "z2.3" "z2.2" "z2.3"
## [41] "z2.3" "z2.2" "z2.2" "z2.2" "z2.2" "z2.3" "z2.3" "z2.3" "z2.2" "z2.3"
## [51] "z2.3" "z2.3" "z2.3" "z2.2" "z2.2"
## TRUE wherever a variant has a value greater than 5 in a sample.
idx_tbl <- exprs(pheno_snps) > 5
## For every condition, the proportion of its samples in which each variant
## passes that threshold.
new_tbl <- data.frame(row.names = rownames(exprs(pheno_snps)))
for (n in names(xref_prop)) {
  idx_cols <- which(pheno_snps[["conditions"]] == n)
  ## drop = FALSE keeps a matrix even when a condition has a single sample;
  ## without it rowSums() fails on the resulting vector.
  new_tbl[[n]] <- rowSums(idx_tbl[, idx_cols, drop = FALSE]) / xref_prop[[n]]
}
## Keep only the variants whose names contain "LpaL13".
keepers <- grepl(x = rownames(new_tbl), pattern = "LpaL13")
new_tbl <- new_tbl[keepers, , drop = FALSE]
## Turn the proportions into p-value-like scores: variants present in every
## sample of a zymodeme score 0.001 and the scores are capped at 1.
new_tbl[["strong22"]] <- pmin(1.001 - new_tbl[["z2.2"]], 1)
new_tbl[["strong23"]] <- pmin(1.001 - new_tbl[["z2.3"]], 1)

## Split the variant names into the SNP / Chromosome / Position columns and
## put them first, followed by the two score columns.
new_tbl[["SNP"]] <- rownames(new_tbl)
new_tbl[["Chromosome"]] <- gsub(x = new_tbl[["SNP"]], pattern = "chr_(.*)_pos_.*", replacement = "\\1")
new_tbl[["Position"]] <- gsub(x = new_tbl[["SNP"]], pattern = ".*_pos_(\\d+)_.*", replacement = "\\1")
new_tbl <- new_tbl[, c("SNP", "Chromosome", "Position", "strong22", "strong23")]

library(CMplot)
## Much appreciate for using CMplot.
## Full description, Bug report, Suggestion and the latest codes:
## https://github.com/YinLiLin/CMplot
## Plot a single trait: everything in new_tbl except the strong22 scores.
simplify <- new_tbl[, c("SNP", "Chromosome", "Position", "strong23")]

CMplot(simplify, bin.size = 100000)
##  SNP-Density Plotting.
##  Circular-Manhattan Plotting strong23.
##  Rectangular-Manhattan Plotting strong23.
##  QQ Plotting strong23.
##  Plots are stored in: /mnt/cbcb/fs01_abelew/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_git
## Multi-track Manhattan plots (plot.type = "m", multracks = TRUE) of both the
## strong22 and strong23 scores with thresholds at 0.01 and 0.05, written to
## jpg files at 300 dpi.
CMplot(new_tbl, plot.type="m", multracks=TRUE, threshold = c(0.01, 0.05),
       threshold.lwd=c(1,1), threshold.col=c("black","grey"),
       amplify=TRUE, bin.size=10000,
       chr.den.col=c("darkgreen", "yellow", "red"),
       signal.col=c("red", "green", "blue"),
       signal.cex=1, file="jpg", memo="", dpi=300, file.output=TRUE, verbose=TRUE)
##  Multracks-Manhattan Plotting strong22.
##  Multracks-Manhattan Plotting strong23.
##  Multraits-Rectangular Plotting...(finished 100%)
##  Plots are stored in: /mnt/cbcb/fs01_abelew/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_git

3.5 Try out MatrixEQTL

This tool looks a little opaque, but provides sample data with things that make sense to me and should be pretty easy to recapitulate in our data.

  1. covariates.txt: Columns are samples, rows are things from pData – the most likely ones of interest for our data would be zymodeme, sensitivity
  2. geneloc.txt: columns are ‘geneid’, ‘chr’, ‘left’, ‘right’. I guess I can assume left and right are start/stop; in which case this is trivially acquirable from fData.
  3. ge.txt: This appears to be a log(rpkm/cpm) table with rows as genes and columns as samples
  4. snpsloc.txt: columns are ‘snpid’, ‘chr’, ‘pos’
  5. snps.txt: columns are samples, rows are the ids from snpsloc, values are 0,1,2. I assume 0 is identical and 1..12 are the various A->TGC T->AGC C->AGT G->ACT
## For this, let us use the 'new_snps' data structure.
## Caveat here: these need to be coerced to numbers.
my_covariates <- pData(new_snps)[, c("zymodemecategorical", "clinicalcategorical")]
for (col in colnames(my_covariates)) {
  my_covariates[[col]] <- as.numeric(as.factor(my_covariates[[col]]))
}
my_covariates <- t(my_covariates)

my_geneloc <- fData(lp_expt)[, c("gid", "chromosome", "start", "end")]
colnames(my_geneloc) <- c("geneid", "chr", "left", "right")

my_ge <- exprs(normalize_expt(lp_expt, transform = "log2", filter = TRUE, convert = "cpm"))
used_samples <- tolower(colnames(my_ge)) %in% colnames(exprs(new_snps))
my_ge <- my_ge[, used_samples]

## Build the SNP location table: one row per variant holding its identifier,
## chromosome, and position.
## The identifiers are stored in a column named 'rownames'; the data frame's
## real row names are only 1..n, so chr and pos must be parsed from that
## column and not from rownames(my_snpsloc).
my_snpsloc <- data.frame(rownames = rownames(exprs(new_snps)),
                         stringsAsFactors = FALSE)
## Oh, caveat here: Because of the way I stored the data,
## I could have duplicate rows which presumably will make matrixEQTL sad
## The first capture group of the identifier is the chromosome, the second is
## the position.
## NOTE(review): 'pos' is left as the captured text -- confirm that it reads
## back as a number once the table has gone through write.table()/read.table().
my_snpsloc[["chr"]] <- gsub(pattern = "^chr_(.+)_pos(.+)_ref_.*$", replacement = "\\1",
                            x = my_snpsloc[["rownames"]])
my_snpsloc[["pos"]] <- gsub(pattern = "^chr_(.+)_pos(.+)_ref_.*$", replacement = "\\2",
                            x = my_snpsloc[["rownames"]])
test <- duplicated(my_snpsloc)
## Each duplicated row would be another variant at that position;
## so in theory we would do a rle to number them I am guessing
## However, I do not have different variants so I think I can ignore this for the moment
## but will need to make my matrix either 0 or 1.
if (sum(test) > 0) {
  ## Report the count from the logical vector 'test'; the function
  ## 'duplicated' itself cannot be summed.
  message("There are: ", sum(test), " duplicated entries.")
  keep_idx <- ! test
  my_snpsloc <- my_snpsloc[keep_idx, ]
}

## Reduce the variant matrix to presence/absence: every value above zero
## becomes 1, everything else is left untouched.
my_snps <- exprs(new_snps)
one_idx <- my_snps > 0
my_snps <- replace(my_snps, one_idx, 1)

## All the pieces are assembled, but MatrixEQTL wants them as files on disk
## rather than as in-memory objects, so write each table out first.
library(MatrixEQTL)
## Every table is written the same way: tab-separated, quoted, with both row
## and column labels, and "NA" for missing values.  The tables and their
## destination paths are paired by position and written in that order.
## (readr::write_tsv() on the same objects would be an alternative writer.)
invisible(Map(
  function(eqtl_table, eqtl_path) {
    write.table(eqtl_table, eqtl_path, na = "NA", col.names = TRUE,
                row.names = TRUE, sep = "\t", quote = TRUE)
  },
  list(my_snps,
       my_snpsloc,
       as.data.frame(my_ge),
       as.data.frame(my_geneloc),
       as.data.frame(my_covariates)),
  c("eqtl/snps.tsv",
    "eqtl/snpsloc.tsv",
    "eqtl/ge.tsv",
    "eqtl/geneloc.tsv",
    "eqtl/covariates.tsv")))

## Model choice: one of modelANOVA, modelLINEAR, or modelLINEAR_CROSS.
useModel <- modelLINEAR

## Input files: genotypes, SNP positions, expression, gene positions,
## and covariates.
SNP_file_name <- "eqtl/snps.tsv"
snps_location_file_name <- "eqtl/snpsloc.tsv"
expression_file_name <- "eqtl/ge.tsv"
gene_location_file_name <- "eqtl/geneloc.tsv"
covariates_file_name <- "eqtl/covariates.tsv"

## Output files: the cis and trans results each go to a fresh temporary file.
output_file_name_cis <- tempfile()
output_file_name_tra <- tempfile()

## Significance cutoffs: only associations at or below these p-values are saved.
pvOutputThreshold_cis <- 0.1
pvOutputThreshold_tra <- 0.1

## Error covariance matrix; numeric() means identity.
## A matrix could instead be read from a file, e.g.
## read.table("Sample_Data/errorCovariance.txt").
errorCovariance <- numeric()

## Maximum distance at which a gene-SNP pair counts as local (cis).
cisDist <- 1e6
## Genotypes: tab-delimited, "NA" marks missing values, one header row and one
## label column are skipped, and the file is read 2,000 rows at a time.
snps <- SlicedData$new()
snps$fileDelimiter <- "\t"
snps$fileOmitCharacters <- "NA"
snps$fileSkipRows <- 1
snps$fileSkipColumns <- 1
snps$fileSliceSize <- 2000
snps$LoadFile(SNP_file_name)

## Gene expression: same file layout and slice size as the genotypes.
gene <- SlicedData$new()
gene$fileDelimiter <- "\t"
gene$fileOmitCharacters <- "NA"
gene$fileSkipRows <- 1
gene$fileSkipColumns <- 1
gene$fileSliceSize <- 2000
gene$LoadFile(expression_file_name)

## Covariates: same layout, no explicit slice size; the file is only loaded
## when a covariate file name has been provided.
cvrt <- SlicedData$new()
cvrt$fileDelimiter <- "\t"
cvrt$fileOmitCharacters <- "NA"
cvrt$fileSkipRows <- 1
cvrt$fileSkipColumns <- 1
if (length(covariates_file_name) > 0) {
  cvrt$LoadFile(covariates_file_name)
}
## Read the SNP and gene position tables back in, then run the analysis.
snpspos <- read.table(snps_location_file_name, header = TRUE,
                      stringsAsFactors = FALSE)
genepos <- read.table(gene_location_file_name, header = TRUE,
                      stringsAsFactors = FALSE)

## Trans results use the plain output file/threshold arguments; cis results
## use the '.cis' variants together with the position tables and cisDist.
me <- Matrix_eQTL_main(
  snps = snps,
  gene = gene,
  cvrt = cvrt,
  output_file_name = output_file_name_tra,
  pvOutputThreshold = pvOutputThreshold_tra,
  useModel = useModel,
  errorCovariance = errorCovariance,
  verbose = TRUE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = "qqplot",
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE)
## Report the session and save state unless 'skip_load' is set to TRUE.
## get0() returns NULL when 'skip_load' does not exist, so an undefined
## variable means the block runs.
if (!isTRUE(get0("skip_load"))) {
  ## Print the R session information.
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  message(paste0("Saving to ", savefile))
  ## presumably saveme() writes the current state to 'savefile' and sm()
  ## silences its output -- confirm against hpgltools.
  tmp <- sm(saveme(filename = savefile))
}
## The comment lines below are the messages recorded when this document was
## rendered.
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 605cc89b5f1cadea6923b53ac71e234ba0181fe7
## This is hpgltools commit: Wed Aug 10 22:39:40 2022 -0400: 605cc89b5f1cadea6923b53ac71e234ba0181fe7
## Saving to tmrc2_visualization_202207.rda.xz
## presumably loadme() restores the state previously saved to 'savefile' --
## confirm against hpgltools.
tmp <- loadme(filename = savefile)
