## Path of the sample sheet which is handed to create_expt() further down.
sample_sheet <- glue::glue(
  "sample_sheets/tmrc2_samples_20220106.xlsx"
)

1 Introduction

This is mostly just a run of this worksheet to reacquaint myself with it.

This document is intended to provide a general overview of the TMRC2 samples which have thus far been sequenced. In some cases, this includes only those samples starting in 2019; in other instances I am including our previous (2015-2016) samples.

In all cases the processing performed was:

  1. Default trimming was performed.
  2. Hisat2 was used to map the remaining reads against the Leishmania panamensis genome revision 36.
  3. The alignments from hisat2 were used to count reads/gene against the revision 36 annotations with htseq.
  4. These alignments were also passed to the pileup functionality of samtools and the vcf/bcf utilities in order to make a matrix of all observed differences between each sample with respect to the reference.

The analyses in this document use the matrices of counts/gene from #3 and variants/position from #4 in order to provide some images and metrics describing the samples we have sequenced so far.

2 Annotations

Everything which follows depends on the existing TriTrypDB annotations revision 46, circa 2019. The following block loads a database of these annotations and turns it into a matrix where the rows are genes and columns are all the annotation types provided by TriTrypDB.

The same database was used to create a matrix of orthologous genes between L.panamensis and all of the other species in the TriTrypDB.

## Attach EuPathDB and the L. panamensis OrgDb package (v46); the calls are
## wrapped in sm(), which presumably silences the startup messages.
tt <- sm(library(EuPathDB))
tt <- sm(library(org.Lpanamensis.MHOMCOL81L13.v46.eg.db))
pan_db <- org.Lpanamensis.MHOMCOL81L13.v46.eg.db
all_fields <- columns(pan_db)

## Extract a handful of annotation fields keyed by "gid" and keep the
## "genes" element of the result.
all_lp_annot <- sm(
  load_orgdb_annotations(
    pan_db,
    keytype = "gid",
    fields = c(
      "annot_gene_entrez_id", "annot_gene_name",
      "annot_strand", "annot_chromosome", "annot_cds_length",
      "annot_gene_product"
    )
  )
)[["genes"]]

## Lower-case the product descriptions; the semantic filter applied later
## matches against this column.
all_lp_annot[["annot_gene_product"]] <- tolower(
  all_lp_annot[["annot_gene_product"]]
)

## GO annotations, a two-column (ID, length) table, and the ortholog table.
lp_go <- sm(load_orgdb_go(pan_db))
lp_lengths <- all_lp_annot[, c("gid", "annot_cds_length")]
colnames(lp_lengths) <- c("ID", "length")
orthos <- sm(EuPathDB::extract_eupath_orthologs(db = pan_db))

## The hisat-derived counts use the annotation rownames unchanged.
hisat_annot <- all_lp_annot
## rownames(hisat_annot) <- paste0("exon_", rownames(hisat_annot), ".E1")

3 Load a genome

## Download the EuPathDB metadata from the tritrypdb webservice.
meta <- EuPathDB::download_eupath_metadata(webservice = "tritrypdb")
## Unable to find species names for 2 species.
## Leishmania sp. Ghana MHOM/GH/2012/GH5, Leishmania sp. Namibia MPRO/NA/1975/252/LV425
## Appending to an existing file: EuPathDB/metadata/biocv3.14_tritrypdbv55_metadata.csv
## Appending to an existing file: EuPathDB/metadata/GRanges_biocv3.14_tritrypdbv55_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrgDb_biocv3.14_tritrypdbv55_metadata.csv
## Appending to an existing file: EuPathDB/metadata/TxDb_biocv3.14_tritrypdbv55_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrganismDbi_biocv3.14_tritrypdbv55_metadata.csv
## Appending to an existing file: EuPathDB/metadata/BSgenome_biocv3.14_tritrypdbv55_metadata.csv
## Appending to an existing file: EuPathDB/metadata/biocv3.14_tritrypdbv55_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/GRanges_biocv3.14_tritrypdbv55_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrgDb_biocv3.14_tritrypdbv55_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/TxDb_biocv3.14_tritrypdbv55_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/OrganismDbi_biocv3.14_tritrypdbv55_invalid_metadata.csv
## Appending to an existing file: EuPathDB/metadata/BSgenome_biocv3.14_tritrypdbv55_invalid_metadata.csv
## Pick the L. panamensis entry out of the metadata and list its columns.
lp_entry <- EuPathDB::get_eupath_entry(
  species = "Leishmania panamensis",
  metadata = meta
)
## Found the following hits: Leishmania panamensis MHOM/COL/81/L13, Leishmania panamensis strain MHOM/PA/94/PSC-1, choosing the first.
## Using: Leishmania panamensis MHOM/COL/81/L13.
colnames(lp_entry)
##  [1] "AnnotationVersion"  "AnnotationSource"   "BiocVersion"       
##  [4] "DataProvider"       "Genome"             "GenomeSource"      
##  [7] "GenomeVersion"      "NumArrayGene"       "NumChipChipGene"   
## [10] "NumChromosome"      "NumCodingGene"      "NumCommunity"      
## [13] "NumContig"          "NumEC"              "NumEST"            
## [16] "NumGene"            "NumGO"              "NumOrtholog"       
## [19] "NumOtherGene"       "NumPopSet"          "NumProteomics"     
## [22] "NumPseudogene"      "NumRNASeq"          "NumRTPCR"          
## [25] "NumSNP"             "NumTFBS"            "Organellar"        
## [28] "ReferenceStrain"    "MegaBP"             "PrimaryKey"        
## [31] "ProjectID"          "RecordClassName"    "SourceID"          
## [34] "SourceVersion"      "TaxonomyID"         "TaxonomyName"      
## [37] "URLGenome"          "URLGFF"             "URLProtein"        
## [40] "Coordinate_1_based" "Maintainer"         "SourceUrl"         
## [43] "Tags"               "BsgenomePkg"        "GrangesPkg"        
## [46] "OrganismdbiPkg"     "OrgdbPkg"           "TxdbPkg"           
## [49] "Taxon"              "Genus"              "Species"           
## [52] "Strain"             "BsgenomeFile"       "GrangesFile"       
## [55] "OrganismdbiFile"    "OrgdbFile"          "TxdbFile"          
## [58] "GenusSpecies"       "TaxonUnmodified"    "TaxonCanonical"    
## [61] "TaxonXref"
## Name of the genome package to attach.  The commented-out call below is the
## alternative which would (presumably) build the package from lp_entry.
testing_panamensis <- "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53"
## testing_panamensis <- EuPathDB::make_eupath_bsgenome(entry=lp_entry, eu_version="v46")
library(
  package = as.character(testing_panamensis),
  character.only = TRUE
)
## Loading required package: BSgenome
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: rtracklayer
## Fetch the object carrying the same name as the package; get0() yields NULL
## rather than an error when no such object exists.
genome <- get0(x = as.character(testing_panamensis))

4 TODO:

Resequence samples: TMRC20002, TMRC20006, TMRC20004 (maybe TMRC20008 and TMRC20029)

5 Generate Expressionsets and Sample Estimation

The process of sample estimation takes two primary inputs:

  1. The sample sheet, which contains all the metadata we currently have on hand, including filenames for the outputs of #3 and #4 above.
  2. The gene annotations.

An expressionset is a data structure used in R to examine RNASeq data. It is composed of annotations, metadata, and expression data. In the case of our processing pipeline, the location of the expression data is provided by the filenames in the metadata.

The first lines of the following block create the Expressionset. All of the following lines perform various normalizations and generate plots from it.

5.1 Notes

The following samples are much lower coverage:

  • TMRC20002
  • TMRC20006
  • TMRC20007
  • TMRC20008

20210610: I made some manual changes to the sample sheet which I downloaded, filling in some zymodeme with ‘unknown’

5.2 TODO:

  1. Do the multi-gene family removal right here instead of way down at the bottom
  2. Add zymodeme snps to the annotation later.
  3. Start phylogenetic analysis of variant table.
## Metadata columns which get sanitized and then converted to factors.
sanitize_columns <- c(
  "passagenumber", "clinicalresponse", "clinicalcategorical",
  "zymodemecategorical", "phenotypiccharacteristics"
)

## Create the expressionset from the sample sheet and the annotations.
lp_expt <- sm(
  create_expt(
    sample_sheet,
    gene_info = hisat_annot,
    id_column = "hpglidentifier",
    file_column = "lpanamensisv36hisatfile"
  )
)
## Use the zymodeme as the experimental condition.
lp_expt <- set_expt_conditions(lp_expt, fact = "zymodemecategorical")
## Drop samples by number of non-zero genes, then by read coverage.
lp_expt <- subset_expt(lp_expt, nonzero = 8550)
lp_expt <- subset_expt(lp_expt, coverage = 5000000)
## Filter genes whose product description matches these multi-gene families.
lp_expt <- semantic_expt_filter(
  lp_expt,
  semantic = c("amastin", "gp63", "leishmanolysin"),
  semantic_column = "annot_gene_product"
)
## Tidy the metadata columns listed above and turn them into factors.
lp_expt <- sanitize_expt_metadata(lp_expt, columns = sanitize_columns)
lp_expt <- set_expt_factors(
  lp_expt,
  columns = sanitize_columns,
  class = "factor"
)
## The samples (and read coverage) removed when filtering 8550 non-zero genes are:
## TMRC20002 TMRC20006 
##  11681227   6670348
## subset_expt(): There were 75, now there are 73 samples.
## The samples removed (and read coverage) when filtering samples with less than 5e+06 reads are:
## TMRC20004 TMRC20029 
##    564812   1658096
## subset_expt(): There were 73, now there are 71 samples.
## semantic_expt_filter(): Removed 68 genes.
## Library sizes of the samples which survived the filters.
libsizes <- plot_libsize(lp_expt)
pp(
  file = "images/lp_expt_libsizes.png",
  image = libsizes[["plot"]],
  width = 14,
  height = 9
)

## I think samples 7,10 should be removed at minimum, probably also 9,11
nonzero <- plot_nonzero(lp_expt)
pp(
  file = "images/lp_nonzero.png",
  image = nonzero[["plot"]],
  width = 9,
  height = 9
)
## Warning: ggrepel: 50 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 53 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Boxplot of the per-sample count distributions.
lp_box <- plot_boxplot(lp_expt)
## 5364 entries are 0.  We are on a log scale, adding 1 to the data.
pp(
  file = "images/lp_expt_boxplot.png",
  image = lp_box,
  width = 12,
  height = 9
)

## Library sizes before/after filtering; print both resulting plots.
filter_plot <- plot_libsize_prepost(lp_expt)
filter_plot[["lowgene_plot"]]
## Warning: Using alpha for a discrete variable is not advised.

filter_plot[["count_plot"]]

5.3 Distribution Visualization

Najib’s favorite plots are of course the PCA/TSNE. These are nice to look at in order to get a sense of the relationships between samples. They also provide a good opportunity to see what happens when one applies different normalizations, surrogate analyses, filters, etc. In addition, one may set different experimental factors as the primary ‘condition’ (usually the color of plots) and surrogate ‘batches’.

5.4 By Susceptibility

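Take column ‘Q’ of the sample sheet and make a categorical version of it with the following parameters. They are written here as percentages; the code below compares against the corresponding fractions (e.g. 0.35 and 0.49):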

  • 0 <= x <= 35 is resistant
  • 36 <= x <= 48 is ambiguous
  • 49 <= x is sensitive
## Bin the historical susceptibility measurements into categories and store
## the result in the expressionset metadata as "sus_category".
## The cutoffs are fractions: <= 0.35 resistant, >= 0.49 sensitive, anything
## in between ambiguous; samples without a numeric value are "unknown".
starting <- as.numeric(pData(lp_expt)[["susceptibilityinfectionreduction32ugmlsbvhistoricaldata"]])
na_idx <- is.na(starting)
## Start from an all-"unknown" character vector so that a value which matches
## no bin can never leak through as a stringified number.
sus_categorical <- rep("unknown", length(starting))

## Comparisons against NA yield NA; which() drops those positions so the
## samples without a value keep their "unknown" label.
resist_idx <- starting <= 0.35
sus_categorical[which(resist_idx)] <- "resistant"
## Open bounds make the bins contiguous: values such as 0.355 or 0.485 used
## to fall between the 0.35/0.36 and 0.48/0.49 cutoffs and were left
## unclassified.
indeterminant_idx <- starting > 0.35 & starting < 0.49
sus_categorical[which(indeterminant_idx)] <- "ambiguous"
susceptible_idx <- starting >= 0.49
sus_categorical[which(susceptible_idx)] <- "sensitive"

pData(lp_expt$expressionset)[["sus_category"]] <- sus_categorical
## Colors assigned to each zymodeme category.
clinical_colors <- list(
  "z2.1" = "#0000cc",
  "z2.3" = "#874400",
  "z2.2" = "#df7000",
  "z2.4" = "#cc0000",
  "unknown" = "#cbcbcb",
  "null" = "#000000"
)

## Use the susceptibility category as the batch factor and apply the colors.
clinical_samples <- set_expt_batches(lp_expt, fact = sus_categorical)
clinical_samples <- set_expt_colors(clinical_samples, clinical_colors)

## Quantile-normalized, cpm-converted, log2-transformed, filtered data
## without any batch adjustment.
clinical_norm <- sm(
  normalize_expt(
    clinical_samples,
    norm = "quant",
    transform = "log2",
    convert = "cpm",
    batch = FALSE,
    filter = TRUE
  )
)
zymo_pca <- plot_pca(
  clinical_norm,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE
)
pp(file = "images/zymo_pca_sus_shape.png", image = zymo_pca[["plot"]])

## Three-dimensional rendering of the same PCA.
zymo_3dpca <- plot_3d_pca(zymo_pca)
zymo_3dpca[["plot"]]
## Same conversion/transformation as above but without the "quant"
## normalization; used for the TSNE.
clinical_n <- sm(
  normalize_expt(
    clinical_samples,
    transform = "log2",
    convert = "cpm",
    batch = FALSE,
    filter = TRUE
  )
)
zymo_tsne <- plot_tsne(
  clinical_n,
  plot_title = "TSNE of parasite expression values"
)
zymo_tsne[["plot"]]
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Repeat the normalization with batch = "svaseq", then redo the PCA and TSNE.
clinical_nb <- normalize_expt(
  clinical_samples,
  convert = "cpm",
  transform = "log2",
  filter = TRUE,
  batch = "svaseq"
)
## Removing 142 low-count genes (8568 remaining).
## batch_counts: Before batch/surrogate estimation, 1008 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 3380 entries are 0<x<1: 1%.
## Setting 324 low elements to zero.
## transform_counts: Found 324 values equal to 0, adding 1 to the matrix.
clinical_nb_pca <- plot_pca(
  clinical_nb,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE
)
pp(
  file = "images/clinical_nb_pca_sus_shape.png",
  image = clinical_nb_pca[["plot"]]
)

clinical_nb_tsne <- plot_tsne(
  clinical_nb,
  plot_title = "TSNE of parasite expression values"
)
clinical_nb_tsne[["plot"]]
## Warning: ggrepel: 52 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Correlation heatmap of the normalized data.
## The title previously spanned three source lines, which embedded a newline,
## a run of indentation spaces and a trailing newline in the rendered title;
## it is now a single-line string.
corheat <- plot_corheat(
  clinical_norm,
  plot_title = "Correlation heatmap of parasite expression values"
)
corheat$plot

## Print the plot element returned by plot_sm() for the same data.
plot_sm(clinical_norm)$plot
## Performing correlation.

5.5 By Cure/Fail status

## Colors for the cure/fail categories.
cf_colors <- list(
  "cure" = "#006f00",
  "fail" = "#9dffa0",
  "unknown" = "#cbcbcb",
  "notapplicable" = "#000000"
)

## Condition: clinical outcome; batch: susceptibility category.
cf_expt <- set_expt_conditions(lp_expt, fact = "clinicalcategorical")
cf_expt <- set_expt_batches(cf_expt, fact = sus_categorical)
cf_expt <- set_expt_colors(cf_expt, cf_colors)
## Warning in set_expt_colors(., cf_colors): Colors for the following categories
## are not being used: notapplicable.
cf_norm <- normalize_expt(
  cf_expt,
  convert = "cpm",
  transform = "log2",
  norm = "quant",
  filter = TRUE
)
## Removing 142 low-count genes (8568 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
start_cf <- plot_pca(
  cf_norm,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE
)
pp(file = "images/cf_sus_shape.png", image = start_cf[["plot"]])

## Cure/fail data again, this time with batch = "svaseq" on top of the
## quantile normalization (see the warning recorded below).
cf_nb <- normalize_expt(
  cf_expt,
  convert = "cpm",
  transform = "log2",
  norm = "quant",
  filter = TRUE,
  batch = "svaseq"
)
## Warning in normalize_expt(cf_expt, convert = "cpm", transform = "log2", :
## Quantile normalization and sva do not always play well together.
## Removing 142 low-count genes (8568 remaining).
## batch_counts: Before batch/surrogate estimation, 2 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 4130 entries are 0<x<1: 1%.
## Setting 154 low elements to zero.
## transform_counts: Found 154 values equal to 0, adding 1 to the matrix.
cf_nb_pca <- plot_pca(
  cf_nb,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE
)
## NOTE(review): "share" in this file name looks like a typo for "shape"
## (compare images/cf_sus_shape.png) -- confirm before renaming.
pp(file = "images/cf_sus_share_nb.png", image = cf_nb_pca[["plot"]])

## Recreate the normalized cure/fail data and test how strongly each listed
## metadata factor is associated with the first six principal components.
cf_norm <- normalize_expt(
  cf_expt,
  transform = "log2",
  convert = "cpm",
  filter = TRUE,
  norm = "quant"
)
## Removing 142 low-count genes (8568 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
test <- pca_information(
  cf_norm,
  expt_factors = c(
    "clinicalcategorical", "zymodemecategorical",
    "pathogenstrain", "passagenumber"
  ),
  num_components = 6,
  plot_pcas = TRUE
)
test[["anova_p"]]
##                           PC1      PC2     PC3       PC4     PC5     PC6
## clinicalcategorical 2.850e-01 0.331134 0.47800 1.728e-03 0.69347 0.42085
## zymodemecategorical 6.720e-06 0.005922 0.70766 3.336e-02 0.01269 0.09026
## pathogenstrain      7.092e-01 0.776303 0.84356 4.512e-06 0.01811 0.56192
## passagenumber       8.896e-01 0.237729 0.04096 2.795e-02 0.22294 0.41299
test[["cor_heatmap"]]

## Colors for the susceptibility categories.
sus_colors <- list(
  "resistant" = "#8563a7",
  "sensitive" = "#8d0000",
  "ambiguous" = "#cbcbcb",
  "unknown" = "#000000"
)

## Condition: susceptibility category; batch: zymodeme.
sus_expt <- set_expt_conditions(lp_expt, fact = "sus_category")
sus_expt <- set_expt_batches(sus_expt, fact = "zymodemecategorical")
sus_expt <- set_expt_colors(sus_expt, colors = sus_colors)

sus_norm <- normalize_expt(
  sus_expt,
  transform = "log2",
  convert = "cpm",
  norm = "quant",
  filter = TRUE
)
## Removing 142 low-count genes (8568 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
sus_pca <- plot_pca(
  sus_norm,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE
)
pp(file = "images/sus_norm_pca.png", image = sus_pca[["plot"]])

## Susceptibility data with batch = "svaseq" (no "quant" normalization here).
sus_nb <- normalize_expt(
  sus_expt,
  transform = "log2",
  convert = "cpm",
  batch = "svaseq",
  filter = TRUE
)
## Removing 142 low-count genes (8568 remaining).
## batch_counts: Before batch/surrogate estimation, 1008 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 3380 entries are 0<x<1: 1%.
## Setting 229 low elements to zero.
## transform_counts: Found 229 values equal to 0, adding 1 to the matrix.
sus_nb_pca <- plot_pca(
  sus_nb,
  plot_title = "PCA of parasite expression values",
  plot_labels = FALSE
)
pp(file = "images/sus_nb_pca.png", image = sus_nb_pca[["plot"]])

6 Zymodeme analyses

The following sections perform a series of analyses which seek to elucidate differences between the zymodemes 2.2 and 2.3 either through differential expression or variant profiles.

6.1 Differential expression

6.1.1 With respect to zymodeme attribution

TODO: Do this with and without sva and compare the results.

## Restrict the data to the two zymodemes of interest.
zy_expt <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
## subset_expt(): There were 71, now there are 55 samples.
zy_norm <- normalize_expt(zy_expt, filter = TRUE, convert = "cpm", norm = "quant")
## Removing 160 low-count genes (8550 remaining).

## Differential expression without any batch/surrogate term in the model.
## This call previously also passed model_batch = "svaseq", which made it a
## duplicate of zy_de and left nothing to compare against.
zy_de_nobatch <- sm(all_pairwise(zy_expt, filter = TRUE, model_batch = FALSE))
## Differential expression with surrogate variables estimated by svaseq.
zy_de <- sm(all_pairwise(zy_expt, filter = TRUE, model_batch = "svaseq"))
## Combine the per-method results and extract the significant genes; `ver`
## is not defined in this part of the document and must already exist.
zy_table <- sm(combine_de_tables(zy_de, excel = glue::glue("excel/zy_tables-v{ver}.xlsx")))
zy_sig <- sm(extract_significant_genes(zy_table, excel = glue::glue("excel/zy_sig-v{ver}.xlsx")))

6.1.2 Images of zymodeme DE

## Write the DESeq MA plot of the z2.3 vs. z2.2 contrast to disk.
pp(
  file = "images/zymo_ma.png",
  image = zy_table[["plots"]][["z23_vs_z22"]][["deseq_ma_plots"]][["plot"]]
)

6.2 With respect to cure/failure

In contrast, we can search for genes which are differentially expressed with respect to cure/failure status.

## Cure/fail contrasts with svaseq in the model, followed by the combined
## tables and the significant subset (both also written to excel).
cf_de <- sm(
  all_pairwise(cf_expt, filter = TRUE, model_batch = "svaseq")
)
cf_table <- sm(
  combine_de_tables(
    cf_de,
    excel = glue::glue("excel/cf_tables-v{ver}.xlsx")
  )
)
cf_sig <- sm(
  extract_significant_genes(
    cf_table,
    excel = glue::glue("excel/cf_sig-v{ver}.xlsx")
  )
)

6.3 With respect to susceptibility

Finally, we can use our category of susceptibility and look for genes which change from sensitive to resistant. Keep in mind, though, that for the moment we have a lot of ambiguous and unknown strains.

## Susceptibility contrasts with svaseq in the model, followed by the
## combined tables and the significant subset (both also written to excel).
sus_de <- sm(
  all_pairwise(sus_expt, filter = TRUE, model_batch = "svaseq")
)
sus_table <- sm(
  combine_de_tables(
    sus_de,
    excel = glue::glue("excel/sus_tables-v{ver}.xlsx")
  )
)
sus_sig <- sm(
  extract_significant_genes(
    sus_table,
    excel = glue::glue("excel/sus_sig-v{ver}.xlsx")
  )
)
## Show the first 20 rows of the DESeq "ups" table, sensitive vs. resistant.
knitr::kable(
  head(sus_sig[["deseq"]][["ups"]][["sensitive_vs_resistant"]], n = 20)
)
gid annotgeneproduct annotgenetype chromosome start end strand annotgeneentrezid annotgenename annotstrand annotchromosome annotcdslength length deseq_logfc deseq_adjp edger_logfc edger_adjp limma_logfc limma_adjp basic_nummed basic_denmed basic_numvar basic_denvar basic_logfc basic_t basic_p basic_adjp deseq_basemean deseq_lfcse deseq_stat deseq_p ebseq_fc ebseq_logfc ebseq_c1mean ebseq_c2mean ebseq_mean ebseq_var ebseq_postfc ebseq_ppee ebseq_ppde ebseq_adjp edger_logcpm edger_lr edger_p limma_ave limma_t limma_b limma_p limma_adjp_ihw deseq_adjp_ihw edger_adjp_ihw ebseq_adjp_ihw basic_adjp_ihw lfc_meta lfc_var lfc_varbymed p_meta p_var
LPAL13_000044900 LPAL13_000044900 actin-related protein 2, putative protein coding LPAL13_SCAF000645 507 1685 - reverse Not Assigned 1179.0 1178 29.300 0e+00 13.430 0e+00 9.0070 0.2369 3.9690 -4.1880 15.756 0.1172 8.157 10.200 0.0000 0.0000 841.70 1.1550 25.370 0 115812.91 16.821 0.0000 1158.119 782.513 4.955e+05 356.464 1.0000 0.0000 1.0000 4.9800 60.02 0 1.3620 1.619 -4.2760 0.1099 2.370e-01 4.861e-138 1.647e-11 0.000e+00 5.941e-08 14.280 1.013e+01 7.092e-01 3.663e-02 4.026e-03
LPAL13_000035800 LPAL13_000035800 hypothetical protein protein coding LPAL13_SCAF000500 737 1006 - reverse Not Assigned 270.0 269 14.910 0e+00 14.080 0e+00 10.4500 0.2580 5.3190 -3.9740 15.252 0.3680 9.293 11.610 0.0000 0.0000 2880.00 1.1430 13.040 0 31670.08 14.951 0.1254 4287.362 2896.907 1.341e+07 1311.656 0.0000 0.0000 0.0000 6.7500 86.47 0 2.2850 1.554 -4.3360 0.1248 2.577e-01 2.908e-35 1.154e-16 0.000e+00 7.479e-09 16.200 9.099e+00 5.616e-01 4.160e-02 5.192e-03
LPAL13_320026300 LPAL13_320026300 hypothetical protein, conserved protein coding LpaL13_32 754268 755485 - reverse 32 1218.0 1217 14.210 0e+00 13.330 0e+00 9.8160 0.2961 4.7860 -4.0330 18.222 0.6163 8.819 9.984 0.0000 0.0000 1521.00 1.1160 12.740 0 17750.55 14.116 0.1173 2258.771 1526.235 2.020e+06 672.288 0.0000 0.0000 0.0000 5.8310 77.84 0 2.0190 1.445 -4.4300 0.1531 2.965e-01 1.034e-33 5.258e-15 0.000e+00 4.003e-08 12.360 9.588e-02 7.757e-03 5.103e-02 7.813e-03
LPAL13_000053200 LPAL13_000053200 hypothetical protein protein coding LPAL13_SCAF000804 5037 5249 - reverse Not Assigned 213.0 212 8.774 0e+00 10.180 0e+00 5.8300 0.0288 1.0040 -4.1880 9.040 0.1172 5.193 8.521 0.0000 0.0000 75.03 1.0880 8.064 0 12533.34 13.614 0.0000 125.323 84.678 7.766e+03 39.301 1.0000 0.0000 1.0000 1.5420 52.91 0 -0.9226 2.861 -2.8140 0.0056 3.481e-02 4.408e-13 2.647e-10 0.000e+00 7.765e-07 8.168 4.425e-02 5.417e-03 1.866e-03 1.045e-05
LPAL13_000051300 LPAL13_000051300 hypothetical protein, conserved protein coding LPAL13_SCAF000772 11 2344 + forward Not Assigned 2334.0 2333 8.477 0e+00 9.468 0e+00 3.7190 0.2694 0.2373 -4.0070 10.238 0.5216 4.244 6.306 0.0000 0.0000 135.60 1.2650 6.701 0 1753.26 10.776 0.0861 168.514 113.889 6.534e+04 56.718 0.0000 0.0000 0.0000 2.4020 32.16 0 -1.1240 1.520 -4.3650 0.1330 2.690e-01 4.676e-09 1.618e-06 0.000e+00 2.766e-05 6.999 2.573e+00 3.676e-01 4.433e-02 5.896e-03
LPAL13_000040700 LPAL13_000040700 hypothetical protein, conserved protein coding LPAL13_SCAF000598 54 1067 + forward Not Assigned 1014.0 1013 7.671 0e+00 6.800 0e+00 2.8180 0.1002 -1.2510 -3.9900 6.479 0.5345 2.740 4.971 0.0000 0.0004 20.16 1.1500 6.669 0 186.50 7.543 0.1278 25.699 17.405 3.919e+02 8.201 0.0000 0.0000 0.0000 -0.1417 25.17 0 -2.3660 2.188 -3.7130 0.0321 1.004e-01 5.117e-09 2.694e-05 0.000e+00 3.968e-04 5.586 1.673e+00 2.996e-01 1.069e-02 3.430e-04
LPAL13_000017600 LPAL13_000017600 hypothetical protein, conserved protein coding LPAL13_SCAF000146 359 586 + forward Not Assigned 228.0 227 6.599 0e+00 6.582 0e+00 5.9140 0.0594 4.4470 -1.1540 4.361 2.6176 5.601 8.940 0.0000 0.0000 615.90 0.6798 9.707 0 80.79 6.336 11.8247 956.082 649.836 3.876e+05 64.957 0.0000 1.0000 0.0000 4.5290 58.28 0 2.3410 2.483 -3.0470 0.0155 5.934e-02 5.443e-19 3.255e-11 9.676e-01 2.057e-07 6.658 2.129e+00 3.198e-01 5.160e-03 7.988e-05
LPAL13_300029400 LPAL13_300029400 hypothetical protein, conserved protein coding LpaL13_30 853953 854150 - reverse 30 198.0 197 6.334 0e+00 6.254 0e+00 4.9430 0.0024 1.7270 -2.5570 1.492 1.9180 4.284 9.143 0.0000 0.0000 89.08 0.7409 8.549 0 64.90 6.020 1.8983 123.831 84.285 8.962e+03 24.716 0.0000 1.0000 0.0000 1.7340 54.93 0 0.0165 3.995 -0.5115 0.0002 2.361e-03 1.330e-14 1.268e-10 9.650e-01 1.550e-06 5.915 5.901e-01 9.977e-02 5.337e-05 8.544e-09
LPAL13_080010600 LPAL13_080010600 hypothetical protein, conserved protein coding LpaL13_08 195555 195749 - reverse 8 195.0 194 5.937 0e+00 7.435 0e+00 2.3330 0.0513 -1.8840 -4.1880 4.892 0.1172 2.304 5.084 0.0000 0.0004 10.97 1.1490 5.165 0 1825.43 10.834 0.0000 18.244 12.327 4.601e+02 6.571 1.0000 0.0000 1.0000 -0.9372 26.93 0 -3.1350 2.562 -3.3070 0.0126 6.156e-02 9.050e-06 1.393e-05 0.000e+00 4.302e-04 4.938 4.273e+00 8.653e-01 4.200e-03 5.292e-05
LPAL13_000011700 LPAL13_000011700 hypothetical protein protein coding LPAL13_SCAF000076 101 364 - reverse Not Assigned 264.0 263 5.837 0e+00 7.381 0e+00 2.6660 0.0452 -1.3950 -4.1880 6.725 0.1172 2.793 5.291 0.0000 0.0003 13.83 1.1890 4.910 0 2415.59 11.238 0.0000 24.146 16.315 4.529e+02 8.171 1.0000 0.0000 1.0000 -0.6026 24.80 0 -2.9340 2.632 -3.1710 0.0105 4.521e-02 2.898e-05 3.232e-05 0.000e+00 3.013e-04 5.108 1.611e+00 3.154e-01 3.494e-03 3.660e-05
LPAL13_040019400 LPAL13_040019400 hypothetical protein protein coding LpaL13_04 440768 441127 - reverse 4 360.0 359 5.467 0e+00 5.338 0e+00 3.4590 0.0204 -0.4395 -3.3620 1.762 1.1875 2.922 7.098 0.0000 0.0000 33.36 0.8746 6.251 0 43.94 5.457 0.7564 33.666 22.992 1.721e+03 9.142 0.0000 0.0000 0.0000 0.3743 31.59 0 -1.6880 3.026 -2.5930 0.0035 2.038e-02 5.815e-08 1.928e-06 0.000e+00 8.066e-06 4.777 1.077e-01 2.254e-02 1.161e-03 4.046e-06
LPAL13_170014500 LPAL13_170014500 hypothetical protein, conserved protein coding LpaL13_17 361708 362040 + forward 17 333.0 332 5.167 0e+00 4.916 3e-04 2.7620 0.0290 -0.6550 -3.2950 6.983 1.4800 2.640 4.161 0.0002 0.0020 21.04 0.9833 5.255 0 46.88 5.551 0.9255 43.850 29.929 1.585e+03 11.208 0.0000 0.0000 0.0000 -0.2718 18.70 0 -2.4250 2.857 -2.7770 0.0057 2.911e-02 6.836e-06 3.877e-04 0.000e+00 1.994e-03 4.197 2.010e-01 4.790e-02 1.892e-03 1.065e-05
LPAL13_350011800 LPAL13_350011800 hypothetical protein, conserved protein coding LpaL13_35 171009 171242 + forward 35 234.0 233 5.163 0e+00 5.148 0e+00 4.6090 0.0026 2.8750 -1.1550 2.429 1.1467 4.030 9.181 0.0000 0.0000 174.70 0.5695 9.066 0 34.83 5.122 8.5427 297.854 204.023 5.757e+04 26.123 0.0000 1.0000 0.0000 2.7050 58.33 0 1.1580 3.938 -0.0230 0.0002 2.642e-03 2.111e-16 3.255e-11 9.650e-01 6.496e-08 4.958 6.883e-01 1.388e-01 6.487e-05 1.262e-08
LPAL13_200050100 LPAL13_200050100 hypothetical protein protein coding LpaL13_20.1 1627529 1627717 + forward 20.1 189.0 188 5.114 0e+00 5.083 0e+00 4.6430 0.0017 2.4570 -1.8560 1.010 2.3968 4.313 8.802 0.0000 0.0000 116.30 0.5978 8.555 0 25.75 4.686 7.4499 192.056 132.184 2.099e+04 18.447 0.0000 1.0000 0.0000 2.1490 51.00 0 0.7904 4.129 0.0911 0.0001 2.041e-03 1.305e-14 5.942e-10 9.676e-01 9.799e-06 5.002 1.092e+00 2.183e-01 3.370e-05 3.407e-09
LPAL13_080010800 LPAL13_080010800 hypothetical protein protein coding LpaL13_08 199409 199792 - reverse 8 384.0 383 5.065 1e-04 6.542 0e+00 1.6980 0.2085 -2.3590 -4.1880 4.126 0.1172 1.829 4.375 0.0002 0.0018 10.25 1.0820 4.681 0 1036.53 10.018 0.0000 10.355 6.997 1.037e+02 3.870 1.0000 0.0000 1.0000 -0.8838 24.72 0 -3.1090 1.714 -4.2060 0.0910 2.089e-01 7.164e-05 3.308e-05 0.000e+00 1.773e-03 3.916 3.049e+00 7.785e-01 3.034e-02 2.762e-03
LPAL13_000011800 LPAL13_000011800 hypothetical protein, conserved protein coding LPAL13_SCAF000076 446 640 - reverse Not Assigned 195.0 194 4.744 2e-04 5.365 3e-04 0.9541 0.4747 -2.5010 -3.9920 3.638 0.5576 1.491 3.404 0.0017 0.0102 11.03 1.0720 4.424 0 61.09 5.933 0.1379 9.026 6.143 7.072e+01 3.201 0.9990 0.0010 0.9990 -0.8584 18.80 0 -3.0600 1.029 -4.6930 0.3073 4.744e-01 1.774e-04 3.603e-04 1.773e-02 1.023e-02 3.320 3.640e+00 1.096e+00 1.024e-01 3.148e-02
LPAL13_000014000 LPAL13_000014000 hypothetical protein protein coding LPAL13_SCAF000119 655 942 + forward Not Assigned 288.0 287 4.278 0e+00 4.268 0e+00 3.9120 0.0114 2.4330 -1.1070 1.521 1.9542 3.540 7.486 0.0000 0.0000 128.20 0.5257 8.137 0 17.50 4.129 10.8620 190.207 132.041 1.264e+04 13.784 0.0000 1.0000 0.0000 2.2850 49.29 0 1.0980 3.311 -1.4520 0.0015 1.136e-02 3.161e-13 1.293e-09 9.609e-01 1.616e-05 4.339 9.720e-01 2.240e-01 4.953e-04 7.361e-07
LPAL13_000035500 LPAL13_000035500 hypothetical protein, conserved protein coding LPAL13_SCAF000492 7045 7410 + forward Not Assigned 366.0 365 4.244 0e+00 4.247 0e+00 3.8130 0.0290 4.5240 0.8016 2.358 0.5645 3.722 9.900 0.0000 0.0000 509.00 0.5882 7.214 0 19.25 4.267 45.7435 880.950 610.072 3.355e+05 18.528 0.0000 1.0000 0.0000 4.2590 38.49 0 2.8070 2.857 -2.2370 0.0057 3.509e-02 2.215e-10 1.077e-07 9.650e-01 8.739e-09 4.278 9.389e-01 2.195e-01 1.886e-03 1.067e-05
LPAL13_000026500 LPAL13_000026500 hypothetical protein protein coding LPAL13_SCAF000301 144 494 - reverse Not Assigned 351.0 350 4.099 0e+00 4.058 1e-04 2.4050 0.1735 0.1342 -2.3950 5.419 1.9803 2.529 4.092 0.0003 0.0025 43.82 0.7856 5.218 0 20.01 4.323 2.7286 54.802 37.913 1.498e+03 9.648 0.0002 0.9998 0.0002 0.8896 22.05 0 -0.8872 1.840 -4.0380 0.0701 1.737e-01 7.827e-06 8.662e-05 9.676e-01 2.483e-03 3.298 2.748e-01 8.333e-02 2.337e-02 1.638e-03
LPAL13_220019500 LPAL13_220019500 hypothetical protein protein coding LpaL13_22 578260 578538 + forward 22 279.0 278 3.766 0e+00 3.767 0e+00 3.0520 0.0282 3.5420 0.3451 2.128 0.8014 3.196 8.201 0.0000 0.0000 284.60 0.4888 7.703 0 14.41 3.849 29.3640 423.356 295.575 6.945e+04 13.320 0.0000 1.0000 0.0000 3.4280 45.31 0 2.3620 2.872 -2.2080 0.0054 2.827e-02 6.678e-12 6.457e-09 9.650e-01 2.914e-07 3.738 3.673e-01 9.825e-02 1.809e-03 9.817e-06
## Show the first 20 rows of the DESeq "downs" table, sensitive vs. resistant.
knitr::kable(
  head(sus_sig[["deseq"]][["downs"]][["sensitive_vs_resistant"]], n = 20)
)
gid annotgeneproduct annotgenetype chromosome start end strand annotgeneentrezid annotgenename annotstrand annotchromosome annotcdslength length deseq_logfc deseq_adjp edger_logfc edger_adjp limma_logfc limma_adjp basic_nummed basic_denmed basic_numvar basic_denvar basic_logfc basic_t basic_p basic_adjp deseq_basemean deseq_lfcse deseq_stat deseq_p ebseq_fc ebseq_logfc ebseq_c1mean ebseq_c2mean ebseq_mean ebseq_var ebseq_postfc ebseq_ppee ebseq_ppde ebseq_adjp edger_logcpm edger_lr edger_p limma_ave limma_t limma_b limma_p limma_adjp_ihw deseq_adjp_ihw edger_adjp_ihw ebseq_adjp_ihw basic_adjp_ihw lfc_meta lfc_var lfc_varbymed p_meta p_var
LPAL13_000033300 LPAL13_000033300 hypothetical protein, conserved protein coding LPAL13_SCAF000463 551 811 + forward Not Assigned 261.0 260 -5.470 0.0003 -5.372 0.0007 -5.948 0.0000 -3.7270 3.4570 10.5693 0.0607 -7.184 -10.980 0 0e+00 128.700 1.2590 -4.344 0.0000 0.1257 -2.992 303.97 38.190 124.390 2.181e+04 0.1338 0.0000 0.0000 0.0000 2.2660 16.67 0.0000 -1.0450 -5.584 4.7710 0e+00 3.750e-05 2.310e-04 8.774e-04 0.000e+00 2.320e-08 -5.597 0.000e+00 0.000e+00 1.964e-05 5.091e-10
LPAL13_000038400 LPAL13_000038400 expression-site associated gene (esag3), putative protein coding LPAL13_SCAF000573 101 1360 + forward Not Assigned 1260.0 1259 -2.890 0.0000 -2.883 0.0000 -3.228 0.0001 4.6320 8.2100 3.1208 0.0361 -3.578 -10.010 0 0e+00 3613.000 0.5417 -5.335 0.0000 0.1769 -2.499 8563.68 1514.852 3800.959 1.502e+07 0.1810 0.0000 0.0000 0.0000 7.0710 32.07 0.0000 5.8030 -5.353 5.2830 0e+00 7.083e-05 4.901e-06 1.316e-06 0.000e+00 6.552e-08 -2.980 2.311e-02 -7.757e-03 3.978e-07 3.538e-13
LPAL13_350063000 LPAL13_350063000 hypothetical protein protein coding LpaL13_35 1964328 1964543 - reverse 35 216.0 215 -2.787 0.0000 -2.760 0.0000 -3.430 0.0000 -2.3360 1.1760 2.1326 0.2175 -3.511 -10.920 0 0e+00 20.820 0.4834 -5.767 0.0000 0.1430 -2.806 52.76 7.535 22.203 6.022e+02 0.1650 0.0000 1.0000 0.0000 -0.3551 32.28 0.0000 -1.4490 -6.980 8.1370 0e+00 9.781e-07 6.585e-07 1.463e-06 9.609e-01 5.752e-09 -3.009 2.562e-03 -8.513e-04 7.635e-09 3.534e-17
LPAL13_140019300 LPAL13_140019300 bt1 family, putative protein coding LpaL13_14 530784 531350 + forward 14 567.0 566 -2.647 0.0000 -2.642 0.0000 -2.524 0.0000 4.6450 7.1000 0.4797 1.0258 -2.455 -7.588 0 0e+00 1850.000 0.3812 -6.943 0.0000 0.1690 -2.565 4590.02 775.713 2012.785 5.149e+06 0.1732 0.0000 1.0000 0.0000 6.1060 54.17 0.0000 5.3900 -6.983 11.6400 0e+00 9.781e-07 1.215e-09 1.268e-10 9.956e-01 3.706e-05 -2.637 1.599e-01 -6.065e-02 4.897e-10 7.135e-19
LPAL13_000012000 LPAL13_000012000 hypothetical protein protein coding LPAL13_SCAF000080 710 1159 - reverse Not Assigned 450.0 449 -2.597 0.0006 -2.588 0.0003 -3.018 0.0033 0.0920 3.9390 7.6668 0.1677 -3.847 -6.794 0 0e+00 204.300 0.6316 -4.113 0.0000 0.2273 -2.137 439.13 99.824 209.870 4.713e+04 0.2373 0.1104 0.8896 0.1104 2.9350 18.91 0.0000 1.3490 -3.853 0.3639 3e-04 3.273e-03 5.777e-04 3.565e-04 8.804e-01 1.427e-05 -2.699 3.322e-02 -1.231e-02 1.041e-04 1.825e-08
LPAL13_310039200 LPAL13_310039200 hypothetical protein protein coding LpaL13_31 1301745 1301972 - reverse 31 228.0 227 -2.424 0.0000 -2.419 0.0000 -2.442 0.0000 1.2180 3.7480 1.3520 0.2102 -2.530 -9.456 0 0e+00 189.000 0.4028 -6.018 0.0000 0.2518 -1.990 382.33 96.248 189.031 3.283e+04 0.2618 0.3602 0.6398 0.3602 2.8250 39.27 0.0000 1.9810 -5.653 6.3710 0e+00 3.272e-05 1.888e-07 8.181e-08 6.661e-01 1.829e-08 -2.505 9.974e-02 -3.981e-02 1.114e-07 3.655e-14
LPAL13_310035500 LPAL13_310035500 hypothetical protein protein coding LpaL13_31 1198439 1198957 - reverse 31 519.0 518 -2.381 0.0059 -2.284 0.0045 -3.219 0.0000 -4.1820 -0.4332 4.0735 0.4619 -3.749 -8.353 0 0e+00 6.986 0.7064 -3.370 0.0008 0.3087 -1.696 17.33 5.343 9.229 3.055e+02 0.3627 0.0000 0.0000 0.0000 -1.9240 12.08 0.0005 -3.1420 -7.026 5.9880 0e+00 9.518e-07 5.431e-03 6.168e-03 0.000e+00 2.057e-07 -2.637 1.925e-03 -7.302e-04 4.202e-04 1.469e-07
LPAL13_340039600 LPAL13_340039600 hypothetical protein protein coding LpaL13_34 1247554 1247757 - reverse 34 204.0 203 -2.226 0.0002 -2.222 0.0001 -2.731 0.0005 1.2470 4.2710 3.6166 0.1032 -3.024 -7.724 0 0e+00 225.900 0.5086 -4.377 0.0000 0.2131 -2.230 541.90 115.473 253.773 5.388e+04 0.2179 0.0000 1.0000 0.0000 3.0670 20.97 0.0000 1.9970 -4.581 2.6640 0e+00 6.390e-04 2.224e-04 1.488e-04 9.609e-01 2.350e-06 -2.435 8.498e-03 -3.489e-03 1.227e-05 5.938e-11
LPAL13_310031000 LPAL13_310031000 hypothetical protein, conserved protein coding LpaL13_31 1075172 1075459 - reverse 31 288.0 287 -2.225 0.0000 -2.203 0.0000 -2.878 0.0000 -2.0110 0.9902 3.4994 0.5164 -3.001 -7.015 0 0e+00 26.070 0.4422 -5.032 0.0000 0.2783 -1.845 53.53 14.891 27.423 9.837e+02 0.3026 0.1041 0.8959 0.1041 0.0442 25.63 0.0000 -1.1600 -6.196 6.2140 0e+00 7.298e-06 1.793e-05 2.417e-05 8.856e-01 2.953e-06 -2.435 0.000e+00 0.000e+00 3.120e-07 5.784e-14
LPAL13_000012100 LPAL13_000012100 hypothetical protein protein coding LPAL13_SCAF000080 1637 1894 - reverse Not Assigned 258.0 257 -2.194 0.0093 -2.179 0.0075 -3.393 0.0003 -2.2170 1.1630 6.2496 0.6344 -3.380 -6.141 0 0e+00 30.040 0.6857 -3.200 0.0014 0.2868 -1.802 62.47 17.913 32.365 1.713e+03 0.3159 0.0526 0.9474 0.0526 0.2151 10.88 0.0010 -1.4100 -4.876 2.4920 0e+00 3.045e-04 9.270e-03 8.836e-03 9.356e-01 2.741e-05 -2.573 1.242e-01 -4.826e-02 7.845e-04 4.935e-07
LPAL13_310031300 LPAL13_310031300 hypothetical protein, conserved protein coding LpaL13_31 1084772 1085059 - reverse 31 288.0 287 -2.134 0.0024 -2.122 0.0016 -3.169 0.0002 -1.0760 2.1610 3.9624 0.7670 -3.237 -6.863 0 0e+00 62.920 0.5806 -3.676 0.0002 0.2357 -2.085 134.87 31.779 65.216 6.123e+03 0.2525 0.0099 0.9901 0.0099 1.2360 14.58 0.0001 -0.2025 -4.969 3.2940 0e+00 2.022e-04 2.379e-03 2.121e-03 9.584e-01 3.680e-06 -2.423 2.657e-02 -1.097e-02 1.253e-04 1.350e-08
LPAL13_140019100 LPAL13_140019100 bt1 family, putative protein coding LpaL13_14 525164 525514 + forward 14 351.0 350 -2.003 0.0000 -1.998 0.0000 -2.040 0.0000 3.9170 6.0240 0.3510 0.5070 -2.107 -8.882 0 0e+00 869.900 0.2988 -6.703 0.0000 0.2312 -2.113 1932.12 446.698 928.457 7.009e+05 0.2344 0.0000 1.0000 0.0000 5.0180 52.46 0.0000 4.5850 -7.335 13.0600 0e+00 5.447e-07 4.676e-09 2.970e-10 9.609e-01 2.897e-06 -2.057 4.177e-02 -2.031e-02 1.192e-10 3.558e-20
LPAL13_050005000 LPAL13_050005000 hypothetical protein protein coding LpaL13_05 3394 3612 - reverse 5 219.0 218 -1.993 0.0056 -1.986 0.0035 -2.736 0.0002 0.1994 2.6790 2.0375 0.1572 -2.480 -8.063 0 0e+00 88.780 0.5888 -3.384 0.0007 0.2837 -1.817 174.87 49.604 90.231 6.026e+03 0.2922 0.0002 0.9998 0.0002 1.7250 12.71 0.0004 0.4855 -4.996 3.8090 0e+00 2.276e-04 5.662e-03 4.140e-03 9.609e-01 5.395e-07 -2.217 1.753e-02 -7.908e-03 3.606e-04 1.258e-07
LPAL13_000038500 LPAL13_000038500 hypothetical protein protein coding LPAL13_SCAF000575 39 251 + forward Not Assigned 213.0 212 -1.978 0.0067 -1.963 0.0097 -3.314 0.0000 -1.9410 1.3660 4.6543 0.6628 -3.307 -6.731 0 0e+00 31.090 0.5964 -3.317 0.0009 0.2989 -1.742 72.58 21.686 38.191 2.293e+03 0.3243 0.1752 0.8248 0.1752 0.2162 10.24 0.0014 -1.2950 -5.590 4.5350 0e+00 4.344e-05 6.321e-03 1.256e-02 8.291e-01 5.673e-06 -2.298 2.433e-01 -1.059e-01 7.605e-04 4.864e-07
LPAL13_340039700 LPAL13_340039700 snare domain containing protein, putative protein coding LpaL13_34 1248192 1248947 - reverse 34 756.0 755 -1.798 0.0000 -1.794 0.0000 -1.930 0.0000 4.6130 6.7130 0.6868 0.0937 -2.100 -11.180 0 0e+00 1390.000 0.3094 -5.810 0.0000 0.2704 -1.887 2937.42 794.360 1489.406 1.369e+06 0.2739 0.0000 1.0000 0.0000 5.6930 38.61 0.0000 5.2760 -6.469 9.5800 0e+00 3.701e-06 5.447e-07 8.727e-08 9.487e-01 2.029e-09 -1.790 3.196e-02 -1.786e-02 6.333e-09 3.442e-17
LPAL13_170015400 LPAL13_170015400 hypothetical protein, conserved protein coding LpaL13_17 395975 396307 + forward 17 333.0 332 -1.715 0.0000 -1.708 0.0000 -1.699 0.0001 1.2420 3.2650 0.9481 0.1241 -2.023 -9.210 0 0e+00 148.900 0.2814 -6.095 0.0000 0.3164 -1.660 265.43 83.965 142.820 1.263e+04 0.3213 0.0000 1.0000 0.0000 2.4750 37.57 0.0000 1.9960 -5.333 5.1870 0e+00 7.409e-05 1.166e-07 1.322e-07 9.609e-01 3.537e-08 -1.702 4.225e-02 -2.482e-02 3.903e-07 4.548e-13
LPAL13_210015500 LPAL13_210015500 core histone h2a/h2b/h3/h4, putative protein coding LpaL13_21 326108 326506 + forward 21 399.0 398 -1.415 0.0015 -1.404 0.0004 -1.438 0.0045 4.9010 6.6470 2.6083 0.2832 -1.746 -4.882 0 4e-04 2143.000 0.3701 -3.822 0.0001 0.4722 -1.082 2956.62 1396.232 1902.304 2.503e+06 0.4907 0.9914 0.0086 0.9914 6.3210 18.04 0.0000 5.6650 -3.716 -0.3655 4e-04 4.497e-03 1.522e-03 3.184e-04 4.523e-02 4.434e-04 -1.498 2.465e-02 -1.646e-02 1.874e-04 3.965e-08
LPAL13_140019200 LPAL13_140019200 inositol-3-phosphate synthase protein coding LpaL13_14 527711 529291 + INO1 forward 14 1581.0 1580 -1.414 0.0000 -1.409 0.0000 -1.490 0.0000 8.8260 10.4100 0.1715 0.4377 -1.580 -7.588 0 0e+00 19060.000 0.2377 -5.946 0.0000 0.3502 -1.514 36500.63 12780.792 20473.713 1.790e+08 0.3525 0.0000 1.0000 0.0000 9.4720 41.49 0.0000 9.2510 -6.301 8.9390 0e+00 5.648e-06 2.763e-07 2.036e-08 1.000e+00 4.852e-05 -1.522 1.763e-02 -1.158e-02 9.103e-09 1.784e-16
LPAL13_320038700 LPAL13_320038700 hypothetical protein, conserved protein coding LpaL13_32 1175024 1175257 + forward 32 234.0 233 -1.413 0.0000 -1.406 0.0000 -1.442 0.0000 2.5490 3.9280 0.4267 0.1122 -1.379 -8.484 0 0e+00 255.300 0.2271 -6.222 0.0000 0.4234 -1.240 421.88 178.602 257.503 2.064e+04 0.4270 0.0009 0.9991 0.0009 3.2540 40.65 0.0000 3.0110 -5.952 7.5730 0e+00 1.437e-05 6.640e-08 4.747e-08 9.609e-01 1.169e-07 -1.490 2.028e-02 -1.361e-02 3.376e-08 3.351e-15
LPAL13_040007800 LPAL13_040007800 hypothetical protein, conserved protein coding LpaL13_04 77524 78306 + forward 4 783.0 782 -1.378 0.0000 -1.373 0.0000 -1.246 0.0000 6.5520 7.7350 0.1432 0.3718 -1.183 -6.174 0 3e-04 4021.000 0.2295 -6.007 0.0000 0.3941 -1.343 6683.93 2634.211 3947.634 7.264e+06 0.3986 0.0001 0.9999 0.0001 7.2270 44.53 0.0000 7.0050 -5.829 7.0500 0e+00 2.049e-05 2.015e-07 7.246e-09 9.679e-01 3.080e-04 -1.364 6.839e-02 -5.014e-02 5.564e-08 8.971e-15
## Pull the stored DESeq2 MA plot for the sensitive vs. resistant contrast out
## of the combined table and hand it to pp() with images/sus_ma.png as the file.
sus_ma <- sus_table[["plots"]][["sensitive_vs_resistant"]][["deseq_ma_plots"]][["plot"]]
pp(file = "images/sus_ma.png", image = sus_ma)

## test <- ggplt(sus_ma)

6.4 Ontology searches

Now let us look for ontology categories which are increased in the 2.3 samples followed by the 2.2 samples.

## Gene categories more represented in the 2.3 group: run goseq on the first
## set of DESeq2 'ups', using the L. panamensis GO table and CDS lengths.
zy_up_genes <- zy_sig[["deseq"]][["ups"]][[1]]
zy_go_up <- sm(simple_goseq(sig_genes = zy_up_genes,
                            go_db = lp_go,
                            length_db = lp_lengths))

## Gene categories more represented in the 2.2 group: the same search on the
## first set of DESeq2 'downs'.
zy_down_genes <- zy_sig[["deseq"]][["downs"]][[1]]
zy_go_down <- sm(simple_goseq(sig_genes = zy_down_genes,
                              go_db = lp_go,
                              length_db = lp_lengths))

6.4.1 A couple plots from the differential expression

6.4.1.1 Number of genes in agreement among DE methods, 2.3 more than 2.2

In the function ‘combined_de_tables()’ above, one of the tasks performed is to look at the agreement among DESeq2, limma, and edgeR. The following show a couple of these for the set of genes observed with a |log2 fold-change| >= 1 (i.e. at least a 2-fold change) and an adjusted p-value <= 0.05.

## Show the stored 'up_noweight' venn of the first contrast (the p_lfc1 set).
zy_table[["venns"]][[1]][["p_lfc1"]][["up_noweight"]]

6.4.1.2 Number of genes in agreement among DE methods, 2.2 more than 2.3

## Show the stored 'down_noweight' venn of the first contrast (the p_lfc1 set).
zy_table[["venns"]][[1]][["p_lfc1"]][["down_noweight"]]

6.4.1.3 goseq ontology plots of groups of genes, 2.3 more than 2.2

## Display the stored 'bpp_plot_over' p-value plot from the 2.3 goseq result.
zy_go_up[["pvalue_plots"]][["bpp_plot_over"]]

6.4.1.4 goseq ontology plots of groups of genes, 2.2 more than 2.3

## Display the stored 'bpp_plot_over' p-value plot from the 2.2 goseq result.
zy_go_down[["pvalue_plots"]][["bpp_plot_over"]]

6.5 Look for agreement between sensitivity and zymodemes

Remind myself, the data structures are (zy|sus)_(de|table|sig).

## Per-gene result tables of the two contrasts being compared.
zy_df <- zy_table[["data"]][["z23_vs_z22"]]
sus_df <- sus_table[["data"]][["sensitive_vs_resistant"]]

## Join on gene ID; merge() suffixes the zymodeme columns with .x and the
## sensitivity columns with .y.  Keep only the two DESeq2 log fold changes.
both_df <- merge(zy_df, sus_df, by = "row.names")
wanted_columns <- c("deseq_logfc.x", "deseq_logfc.y")
plot_df <- both_df[, wanted_columns]
dimnames(plot_df) <- list(both_df[["Row.names"]],
                          c("z23_vs_z22", "sensitive_vs_resistant"))

compare <- plot_linear_scatter(plot_df)
## Warning in plot_multihistogram(df): NAs introduced by coercion
pp(file = "images/compare_sus_zy.png", image = compare[["scatter"]])

compare[["cor"]]
## 
##  Pearson's product-moment correlation
## 
## data:  df[, 1] and df[, 2]
## t = -193, df = 8548, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9060 -0.8981
## sample estimates:
##     cor 
## -0.9021

6.6 Zymodeme enzyme gene IDs

Najib read me an email listing off the gene names associated with the zymodeme classification. I took those names and cross referenced them against the Leishmania panamensis gene annotations and found the following:

They are:

  1. ALAT: LPAL13_120010900 – alanine aminotransferase
  2. ASAT: LPAL13_340013000 – aspartate aminotransferase
  3. G6PD: LPAL13_000054100 – glucose-6-phosphate 1-dehydrogenase
  4. NH: LPAL13_140006100, LPAL13_180018500 – inosine-guanine nucleoside hydrolase
  5. MPI: LPAL13_320022300 (maybe) – mannose phosphate isomerase (I chose phosphomannose isomerase)

Given these 6 gene IDs (NH has two gene IDs associated with it), I can do some looking for specific differences among the various samples.

6.6.1 Expression levels of zymodeme genes

The following creates a colorspace (red to green) heatmap showing the observed expression of these genes in every sample.

## Gene IDs of the zymodeme enzymes, listed in the same order as the short
## labels in my_names.
## NOTE(review): "other" is not a gene ID; the message printed below reports
## that only 6 genes are kept, so my_names is one element longer than the
## resulting subset.
my_genes <- c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
              "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300",
              "other")
my_names <- c("ALAT", "ASAT", "G6PD", "NHv1", "NHv2", "MPI", "other")

## Restrict the normalized zymodeme data to just these genes.
zymo_expt <- exclude_genes_expt(zy_norm, ids = my_genes, method = "keep")
## Before removal, there were 8550 genes, now there are 6.
## There are 55 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20065 TMRC20005 TMRC20066 TMRC20039 TMRC20037 TMRC20038 TMRC20067 
##    0.1312    0.1249    0.1320    0.1060    0.1300    0.1102    0.1130    0.1165 
## TMRC20068 TMRC20041 TMRC20015 TMRC20009 TMRC20010 TMRC20016 TMRC20011 TMRC20012 
##    0.1157    0.1181    0.1149    0.1137    0.1101    0.1062    0.1104    0.1208 
## TMRC20013 TMRC20017 TMRC20014 TMRC20018 TMRC20019 TMRC20070 TMRC20020 TMRC20021 
##    0.1207    0.1066    0.1092    0.1147    0.1224    0.1127    0.1102    0.1063 
## TMRC20022 TMRC20024 TMRC20036 TMRC20069 TMRC20033 TMRC20031 TMRC20055 TMRC20079 
##    0.1307    0.1126    0.1202    0.1163    0.1128    0.1005    0.1346    0.1268 
## TMRC20071 TMRC20078 TMRC20058 TMRC20072 TMRC20059 TMRC20048 TMRC20060 TMRC20077 
##    0.1234    0.1340    0.1181    0.1430    0.1104    0.1033    0.1087    0.1220 
## TMRC20074 TMRC20063 TMRC20053 TMRC20052 TMRC20064 TMRC20075 TMRC20051 TMRC20050 
##    0.1208    0.1167    0.1182    0.1105    0.1140    0.1113    0.1283    0.1154 
## TMRC20049 TMRC20062 TMRC20080 TMRC20043 TMRC20054 TMRC20046 TMRC20044 
##    0.1393    0.1285    0.1154    0.1137    0.1278    0.1367    0.1337
## Label the heatmap rows with the enzyme names.  my_names has 7 entries while
## zymo_expt holds 6 genes (the "other" placeholder matched nothing), and the
## subset's row order need not follow my_genes, so look each label up by the
## gene ID of its row instead of passing my_names positionally.
zymo_labels <- my_names[match(rownames(exprs(zymo_expt)), my_genes)]
zymo_heatmap <- plot_sample_heatmap(zymo_expt, row_label = zymo_labels)
zymo_heatmap

6.7 Empirically observed Zymodeme genes from differential expression analysis

In contrast, the following plots take the set of genes which are shared among all differential expression methods (|lfc| >= 1.0 and adjp <= 0.05) and use them to make categories of genes which are increased in 2.3 or 2.2.

## Collect the genes of the zymodeme contrast which are shared among the
## differential expression methods.
shared_zymo <- intersect_significant(zy_table)
## Deleting the file excel/intersect_significant.xlsx before writing the tables.
## Table of the shared 'ups' for the first contrast; its row names are gene IDs.
up_shared <- shared_zymo[["ups"]][[1]][["data"]][["all"]]
rownames(up_shared)
##  [1] "LPAL13_000033300" "LPAL13_000012000" "LPAL13_000038500" "LPAL13_000012100"
##  [5] "LPAL13_310031300" "LPAL13_000038400" "LPAL13_050005000" "LPAL13_340039600"
##  [9] "LPAL13_310031000" "LPAL13_310039200" "LPAL13_350063000" "LPAL13_180013900"
## [13] "LPAL13_140019300" "LPAL13_210015500" "LPAL13_350013200" "LPAL13_340039700"
## [17] "LPAL13_350073400" "LPAL13_250006300" "LPAL13_170015400" "LPAL13_330024000"
## [21] "LPAL13_140019100" "LPAL13_320038700" "LPAL13_000052700" "LPAL13_140019200"
## [25] "LPAL13_210005000" "LPAL13_230011200" "LPAL13_330021800" "LPAL13_240009700"
## [29] "LPAL13_160014500" "LPAL13_350073200" "LPAL13_050009600" "LPAL13_250025700"
## [33] "LPAL13_230011500" "LPAL13_160014100" "LPAL13_230011400" "LPAL13_040007800"
## [37] "LPAL13_230011300" "LPAL13_020006700" "LPAL13_310032500" "LPAL13_310028500"
## Subset the normalized data to the shared up genes.
upshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(up_shared), method = "keep")
## Before removal, there were 8550 genes, now there are 40.
## There are 55 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20065 TMRC20005 TMRC20066 TMRC20039 TMRC20037 TMRC20038 TMRC20067 
##    0.3322    0.4156    0.1039    0.3671    0.1512    0.3873    0.5153    0.3105 
## TMRC20068 TMRC20041 TMRC20015 TMRC20009 TMRC20010 TMRC20016 TMRC20011 TMRC20012 
##    0.3625    0.1575    0.4002    0.1335    0.3897    0.2896    0.1460    0.1261 
## TMRC20013 TMRC20017 TMRC20014 TMRC20018 TMRC20019 TMRC20070 TMRC20020 TMRC20021 
##    0.3401    0.1453    0.1548    0.3148    0.1255    0.3827    0.1178    0.3447 
## TMRC20022 TMRC20024 TMRC20036 TMRC20069 TMRC20033 TMRC20031 TMRC20055 TMRC20079 
##    0.1226    0.1371    0.1805    0.1545    0.1339    0.1099    0.1670    0.5184 
## TMRC20071 TMRC20078 TMRC20058 TMRC20072 TMRC20059 TMRC20048 TMRC20060 TMRC20077 
##    0.4733    0.1748    0.5681    0.1654    0.2709    0.2979    0.1251    0.1243 
## TMRC20074 TMRC20063 TMRC20053 TMRC20052 TMRC20064 TMRC20075 TMRC20051 TMRC20050 
##    0.1484    0.1369    0.1613    0.4133    0.3801    0.3249    0.5661    0.1338 
## TMRC20049 TMRC20062 TMRC20080 TMRC20043 TMRC20054 TMRC20046 TMRC20044 
##    0.1526    0.5845    0.4227    0.3880    0.5060    0.1546    0.1557

We can plot a quick heatmap to get a sense of the differences observed between the genes which are different between the two zymodemes.

6.7.1 Heatmap of zymodeme gene expression increased in 2.3 vs. 2.2

## Take the row labels from the plotted subset itself.  up_shared is a results
## table whose row order need not match the rows of upshared_expt, so passing
## rownames(up_shared) positionally can attach gene IDs to the wrong rows.
high_23_heatmap <- plot_sample_heatmap(upshared_expt,
                                       row_label = rownames(exprs(upshared_expt)))
high_23_heatmap

6.7.2 Heatmap of zymodeme gene expression increased in 2.2 vs. 2.3

## Table of the shared 'downs' for the first contrast, then the normalized data
## restricted to those genes.
down_shared <- shared_zymo[["downs"]][[1]][["data"]][["all"]]
downshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(down_shared), method = "keep")
## Before removal, there were 8550 genes, now there are 68.
## There are 55 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20065 TMRC20005 TMRC20066 TMRC20039 TMRC20037 TMRC20038 TMRC20067 
##    0.2203    0.1878    0.6713    0.2263    0.6575    0.2105    0.2015    0.2387 
## TMRC20068 TMRC20041 TMRC20015 TMRC20009 TMRC20010 TMRC20016 TMRC20011 TMRC20012 
##    0.2024    0.6894    0.1877    0.6366    0.1683    0.2077    0.5678    0.5511 
## TMRC20013 TMRC20017 TMRC20014 TMRC20018 TMRC20019 TMRC20070 TMRC20020 TMRC20021 
##    0.1656    0.6549    0.6493    0.1597    0.6529    0.1858    0.6854    0.1538 
## TMRC20022 TMRC20024 TMRC20036 TMRC20069 TMRC20033 TMRC20031 TMRC20055 TMRC20079 
##    0.6762    0.7158    0.6760    0.6975    0.7211    0.6116    0.7106    0.1868 
## TMRC20071 TMRC20078 TMRC20058 TMRC20072 TMRC20059 TMRC20048 TMRC20060 TMRC20077 
##    0.1697    0.5313    0.2196    0.5354    0.1380    0.1543    0.7607    0.5706 
## TMRC20074 TMRC20063 TMRC20053 TMRC20052 TMRC20064 TMRC20075 TMRC20051 TMRC20050 
##    0.6620    0.6333    0.5715    0.1784    0.1946    0.1804    0.1904    0.6151 
## TMRC20049 TMRC20062 TMRC20080 TMRC20043 TMRC20054 TMRC20046 TMRC20044 
##    0.6976    0.1887    0.1547    0.1729    0.2026    0.6344    0.6226
## Take the row labels from the plotted subset itself.  down_shared is a
## results table whose row order need not match the rows of downshared_expt, so
## passing rownames(down_shared) positionally can mislabel the rows.
high_22_heatmap <- plot_sample_heatmap(downshared_expt,
                                       row_label = rownames(exprs(downshared_expt)))
high_22_heatmap

7 SNP profiles

Now I will combine our previous samples and our new samples in the hopes of finding variant positions which help elucidate currently unknown aspects of either group via their clustering to known samples from the other group. In other words, we do not know the zymodeme annotations for the old samples nor the strain identities (or the shortcut ‘chronic vs. self-healing’) for the new samples. I hope to make educated guesses given the variant profiles. There are some differences in how the previous and current data sets were analyzed (though I have since redone the old samples so it should be trivial to remove those differences now).

I added our 2016 data to a specific TMRC2 sample sheet, dated 20191203. Thus I will load the data here. That previous data was mapped using tophat, so I will also need to make some changes to the gene names to accommodate the two mappings.

## Load the 2016 samples; their count tables are named in the 'tophat2file'
## column of the 20191203 sample sheet.
old_expt <- sm(create_expt(
  "sample_sheets/tmrc2_samples_20191203.xlsx",
  file_column = "tophat2file"))

## Bring both sets of gene IDs to a common form: drop the leading 'exon_' and
## the trailing '.E1' from the new data ...
tt <- lp_expt[["expressionset"]]
rownames(tt) <- gsub(pattern = "\\.E1$", replacement = "",
                     x = gsub(pattern = "^exon_", replacement = "", x = rownames(tt)))
lp_expt[["expressionset"]] <- tt

## ... and the leading 'exon_' and the trailing '.1' from the old data.
tt <- old_expt[["expressionset"]]
rownames(tt) <- gsub(pattern = "\\.1$", replacement = "",
                     x = gsub(pattern = "^exon_", replacement = "", x = rownames(tt)))
old_expt[["expressionset"]] <- tt
rm(tt)

7.1 Create the SNP expressionset

One other important caveat, we have a group of new samples which have not yet run through the variant search pipeline, so I need to remove them from consideration. Though it looks like they finished overnight…

## The next line drops the samples which are missing the SNP pipeline.
lp_snp <- subset_expt(lp_expt, subset="!is.na(pData(lp_expt)[['bcftable']])")
## subset_expt(): There were 71, now there are 67 samples.
## Count the variants per sample from the files named in the 'bcftable' column.
new_snps <- sm(count_expt_snps(lp_snp, annot_column = "bcftable"))
## Error : 'preprocessing/TMRC20080/outputs/vcfutils_lpanamensis_v36/r1_trimmed_lpanamensis_v36_count.txt' does not exist in current working directory ('/mnt/cbcb/fs01_abelew/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019').
## Error : 'preprocessing/TMRC20080/outputs/vcfutils_lpanamensis_v36/r1_trimmed_lpanamensis_v36_count.txt' does not exist in current working directory ('/mnt/cbcb/fs01_abelew/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019').
## Error in Biobase::`sampleNames<-`(`*tmp*`, value = colnames(snp_exprs)): number of new names (66) should equal number of rows in AnnotatedDataFrame (67)
## NOTE(review): per the errors above, TMRC20080 passes the is.na() filter but
## its count file is missing, so 66 tables were read for 67 samples and
## new_snps was never created; everything below that uses new_snps fails.
old_snps <- sm(count_expt_snps(old_expt, annot_column = "bcftable", snp_column = 2))

## Combine the new and old variant sets, normalize them, and set the
## experimental condition from the 'strain' metadata factor.
both_snps <- combine_expts(new_snps, old_snps)
## Error in combine_expts(new_snps, old_snps): object 'new_snps' not found
both_norm <- sm(normalize_expt(both_snps, transform = "log2", convert = "cpm", filter = TRUE))
## Error in normalize_expt(both_snps, transform = "log2", convert = "cpm", : object 'both_snps' not found
## strains <- both_norm[["design"]][["strain"]]
both_strain <- set_expt_conditions(both_norm, fact = "strain")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'both_norm' not found

The data structure ‘both_norm’ now contains our 2016 data along with the newer data collected since 2019.

7.2 Plot of SNP profiles for zymodemes

The following plot shows the SNP profiles of all samples (old and new) where the colors at the top show either the 2.2 strains (orange), 2.3 strains (green), the previous samples (purple), or the various lab strains (pink etc).

## Heatmap from plot_disheat() of the normalized variant profiles, handed to
## pp() for a 12x12 image.
## NOTE(review): this plots both_norm, not both_strain, so the condition set
## from 'strain' in the previous chunk is not used here — confirm intent.
old_new_variant_heatmap <- plot_disheat(both_norm)
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, : object 'both_norm' not found
pp(file = "images/raw_snp_disheat.png", image = old_new_variant_heatmap,
   height = 12, width = 12)
## Error in pp(file = "images/raw_snp_disheat.png", image = old_new_variant_heatmap, : object 'old_new_variant_heatmap' not found

The function get_snp_sets() takes the provided metadata factor (in this case ‘condition’) and looks for variants which are exclusive to each element in it. In this case, this is looking for differences between 2.2 and 2.3, as well as the set shared among them.

## Variant sets per level of 'condition', computed from the un-normalized
## combined variant data.
snp_sets <- get_snp_sets(both_snps, factor = "condition")
## Error in get_snp_sets(both_snps, factor = "condition"): object 'both_snps' not found
## Combined expression data (new + old), used for the gene annotations below.
both_expt <- combine_expts(lp_expt, old_expt)

## Cross reference the variant sets against the genes.
snp_genes <- sm(snps_vs_genes(both_expt, snp_sets, expt_name_col = "chromosome"))
## Error in snps_vs_genes(both_expt, snp_sets, expt_name_col = "chromosome"): object 'snp_sets' not found
## I think we have some metrics here we can plot...
## Restrict the variants to the six zymodeme enzyme genes and plot them.
snp_subset <- sm(snp_subset_genes(
  both_expt, both_snps,
  genes = c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
            "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300")))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'both_snps' not found
zymo_heat <- plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset)))
## Error in plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset))): object 'snp_subset' not found
zymo_heat
## Error in eval(expr, envir, enclos): object 'zymo_heat' not found

Didn’t I create a set of densities by chromosome? Oh I think they come in from get_snp_sets()

7.3 SNPS associated with clinical response in the TMRC samples

## Variant sets per level of the 'clinicalresponse' factor (new samples only).
clinical_sets <- get_snp_sets(new_snps, factor = "clinicalresponse")
## Error in get_snp_sets(new_snps, factor = "clinicalresponse"): object 'new_snps' not found
## Named vector of densities; keep only the entries whose name contains
## "LpaL", which drops the LPAL13_SCAF* scaffolds (grep is case sensitive).
density_vec <- clinical_sets[["density"]]
## Error in eval(expr, envir, enclos): object 'clinical_sets' not found
chromosome_idx <- grep(pattern = "LpaL", x = names(density_vec))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grep': object 'density_vec' not found
## Two-column data frame (density, chromosome name) for plotting.
density_df <- as.data.frame(density_vec[chromosome_idx])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'density_vec' not found
density_df[["chr"]] <- rownames(density_df)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'density_df' not found
colnames(density_df) <- c("density_vec", "chr")
## Error in colnames(density_df) <- c("density_vec", "chr"): object 'density_df' not found
## Bar plot of variant density per chromosome.  aes_string() is deprecated in
## ggplot2, so map the columns through the .data pronoun instead; the axis
## labels are set explicitly so they remain the plain column names.
ggplot2::ggplot(density_df,
                ggplot2::aes(x = .data[["chr"]], y = .data[["density_vec"]])) +
  ggplot2::geom_col() +
  ggplot2::labs(x = "chr", y = "density_vec") +
  ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
                 axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5))
## Error in ggplot(density_df, aes_string(x = "chr", y = "density_vec")): object 'density_df' not found
## clinical_written <- write_variants(new_snps)

7.3.1 Cross reference these variants by gene

## Cross reference the clinical-response variant sets against the genes.
clinical_genes <- sm(snps_vs_genes(lp_expt, clinical_sets, expt_name_col = "chromosome"))
## Error in snps_vs_genes(lp_expt, clinical_sets, expt_name_col = "chromosome"): object 'clinical_sets' not found
## Join the per-gene variant summary with the gene annotations on gene ID.
snp_density <- merge(as.data.frame(clinical_genes[["summary_by_gene"]]),
                     as.data.frame(fData(lp_expt)),
                     by = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_genes' not found
## Keep four columns by position and rename them.
## NOTE(review): indices 1, 2, 4 and 15 depend on the column order of
## fData(lp_expt); confirm they still point at the product and length columns.
snp_density <- snp_density[, c(1, 2, 4, 15)]
## Error in eval(expr, envir, enclos): object 'snp_density' not found
colnames(snp_density) <- c("name", "snps", "product", "length")
## Error in colnames(snp_density) <- c("name", "snps", "product", "length"): object 'snp_density' not found
## Lower-case the product so the name filters which follow are case insensitive.
snp_density[["product"]] <- tolower(snp_density[["product"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'tolower': object 'snp_density' not found
snp_density[["length"]] <- as.numeric(snp_density[["length"]])
## Error in eval(expr, envir, enclos): object 'snp_density' not found
## Variants per unit of gene length, sorted from densest to sparsest.
snp_density[["density"]] <- snp_density[["snps"]] / snp_density[["length"]]
## Error in eval(expr, envir, enclos): object 'snp_density' not found
snp_idx <- order(snp_density[["density"]], decreasing = TRUE)
## Error in eval(quote(list(...)), env): object 'snp_density' not found
snp_density <- snp_density[snp_idx, ]
## Error in eval(expr, envir, enclos): object 'snp_density' not found
## Drop every gene whose (lower-cased) product mentions one of these families;
## a single alternation pattern removes the same rows as filtering one by one.
removers <- c("amastin", "gp63", "leishmanolysin")
drop_idx <- grepl(pattern = paste(removers, collapse = "|"),
                  x = snp_density[["product"]])
snp_density <- snp_density[!drop_idx, ]
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grepl': object 'snp_density' not found
## Filter these for [A|a]mastin gp63 Leishmanolysin
## Variant intersections among the clinical-response groups.
clinical_snps <- snps_intersections(lp_expt, clinical_sets, chr_column = "chromosome")
## Error in snps_intersections(lp_expt, clinical_sets, chr_column = "chromosome"): object 'clinical_sets' not found
## Pull out the 'failure, reference strain' and 'cure' intersections and peek
## at their first rows.
fail_ref_snps <- as.data.frame(clinical_snps[["inters"]][["failure, reference strain"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_snps' not found
cure_snps <- as.data.frame(clinical_snps[["inters"]][["cure"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_snps' not found
head(fail_ref_snps)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'fail_ref_snps' not found
head(cure_snps)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'cure_snps' not found
## Build a per-gene table with one column from the 'cure' summary and one from
## the 'failure, reference strain' summary, keyed by gene ID.
annot <- fData(lp_expt)
clinical_interest <- as.data.frame(clinical_snps[["gene_summaries"]][["cure"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_snps' not found
clinical_interest <- merge(clinical_interest,
                           as.data.frame(clinical_snps[["gene_summaries"]][["failure, reference strain"]]),
                           by = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'clinical_interest' not found
## merge() moved the gene IDs into a 'Row.names' column; restore them as row
## names and drop the helper column.
rownames(clinical_interest) <- clinical_interest[["Row.names"]]
## Error in eval(expr, envir, enclos): object 'clinical_interest' not found
clinical_interest[["Row.names"]] <- NULL
## Error in clinical_interest[["Row.names"]] <- NULL: object 'clinical_interest' not found
colnames(clinical_interest) <- c("cure_snps","fail_snps")
## Error in colnames(clinical_interest) <- c("cure_snps", "fail_snps"): object 'clinical_interest' not found
## Add the per-gene cure/fail variant counts to the gene annotations.  merge()
## sorts by key and, without all.x, drops genes that have no counts, so keep
## every gene (missing counts become NA) and restore the original feature
## order before writing the table back as the feature data.
feature_order <- rownames(annot)
annot <- merge(annot, clinical_interest, by = "row.names", all.x = TRUE)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'y' in selecting a method for function 'merge': object 'clinical_interest' not found
rownames(annot) <- annot[["Row.names"]]
annot[["Row.names"]] <- NULL
annot <- annot[feature_order, , drop = FALSE]
fData(lp_expt$expressionset) <- annot

8 Zymodeme for new samples

The heatmap produced here should show the variants only for the zymodeme genes.

8.1 Hunt for snp clusters

I am thinking that if we find clusters of locations which are variant, that might provide some PCR testing possibilities.

## Variant sets per level of the 'phenotypiccharacteristics' factor for the
## new samples.
new_sets <- get_snp_sets(new_snps, factor = "phenotypiccharacteristics")
## Error in get_snp_sets(new_snps, factor = "phenotypiccharacteristics"): object 'new_snps' not found
summary(new_sets)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'new_sets' not found
## 1000000: 2.2
## 0100000: 2.3

## NOTE(review): the set names noted above have 7 digits while the lookups
## below use 5-digit names; confirm which names new_sets actually uses.
summary(new_sets[["intersections"]][["10000"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'new_sets' not found
summary(new_sets[["intersections"]][["01000"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'new_sets' not found

Thus we see that there are 511 variants associated with 2.2 and 49,790 associated with 2.3.

8.1.1 A small function for searching for potential PCR primers

The following function uses the positional data to look for sequential mismatches associated with zymodeme in the hopes that there will be some regions which would provide good potential targets for a PCR-based assay.

## Find the ends of runs of closely spaced variants in one intersection of a
## SNP set, as candidate regions for a PCR-based assay.
##
## Arguments:
##   snp_sets: list with the elements 'intersections' (list of variant-name
##     vectors shaped like chr_<chr>_pos_<pos>_ref_...) and 'set_names' (named
##     mapping from intersection name to a descriptive label).
##   conditions: numeric index into the intersections, or a label looked up in
##     set_names; NULL selects the first intersection.
##   minimum: smallest run length for the last variant of a run to be
##     returned, where the run length is the number of consecutive variants
##     lying within maximum_separation of their predecessor.
##   maximum_separation: largest distance to the previous variant on the same
##     chromosome for a variant to count as part of a run.
##
## Returns a data frame with the columns 'chr' and 'pos' (row names are the
## variant names) holding the last variant of every qualifying run; it has
## zero rows when nothing qualifies.  The table of variants 1-3 bases from
## their predecessor which are also 200-1000 bases from the previous such
## variant and have a G/C reference base is attached as the attribute
## 'primer_candidates'.
##
## Side effects: writes doubles.csv, one_away.csv and two_away.csv to the
## working directory, prints the head of the distance table and emits two
## messages.  Stops when a label in 'conditions' does not match exactly one
## set name.
sequential_variants <- function(snp_sets, conditions = NULL, minimum = 3, maximum_separation = 3) {
  ## Distance from each variant to the previous one on the same chromosome;
  ## the first variant of a chromosome is given its own position.  The input
  ## must already be ordered by chromosome and position.
  gap_to_previous <- function(chr, pos) {
    n <- length(pos)
    if (n == 0) {
      return(numeric(0))
    }
    previous <- c(0, pos[-n])
    starts_chromosome <- c(TRUE, chr[-1] != chr[-n])
    previous[starts_chromosome] <- 0
    pos - previous
  }

  if (is.null(conditions)) {
    conditions <- 1
  }
  intersection_sets <- snp_sets[["intersections"]]
  intersection_names <- snp_sets[["set_names"]]
  if (is.numeric(conditions)) {
    chosen_intersection <- conditions
  } else {
    chosen_intersection <- names(intersection_names)[which(intersection_names == conditions)]
    ## Zero matches would fail obscurely and several matches would be treated
    ## as a recursive index by [[ ]], so insist on exactly one.
    if (length(chosen_intersection) != 1) {
      stop("Expected exactly one set named '", conditions, "', found ",
           length(chosen_intersection), ".", call. = FALSE)
    }
  }

  possible_positions <- intersection_sets[[chosen_intersection]]
  if (length(possible_positions) == 0) {
    ## Nothing to search; return the same shape as a search without hits.
    message("There are 0 candidate regions.")
    return(data.frame(chr = character(0), pos = numeric(0)))
  }

  ## Parse chromosome and position out of the variant names, then order them.
  position_table <- data.frame(row.names = possible_positions)
  pat <- "^chr_(.+)_pos_(.+)_ref_.*$"
  position_table[["chr"]] <- gsub(pattern = pat, replacement = "\\1", x = rownames(position_table))
  position_table[["pos"]] <- as.numeric(gsub(pattern = pat, replacement = "\\2", x = rownames(position_table)))
  position_idx <- order(position_table[["chr"]], position_table[["pos"]])
  position_table <- position_table[position_idx, ]
  position_table[["dist"]] <- gap_to_previous(position_table[["chr"]], position_table[["pos"]])

  ## Variants directly adjacent to, one base away from and two bases away from
  ## their predecessor, each written out for inspection.
  doubles <- position_table[position_table[["dist"]] == 1, ]
  write.csv(doubles, "doubles.csv")
  one_away <- position_table[position_table[["dist"]] == 2, ]
  write.csv(one_away, "one_away.csv")
  two_away <- position_table[position_table[["dist"]] == 3, ]
  write.csv(two_away, "two_away.csv")

  ## The union of those three tables, which is still ordered by chromosome and
  ## position because position_table is, with the distance between successive
  ## members.
  combined <- position_table[position_table[["dist"]] %in% c(1, 2, 3), ]
  combined[["dist_pair"]] <- gap_to_previous(combined[["chr"]], combined[["pos"]])

  ## Keep members 200-1000 bases from the previous one whose reference base is
  ## G or C.
  dist_pair_maximum <- 1000
  dist_pair_minimum <- 200
  dist_pair_idx <- combined[["dist_pair"]] <= dist_pair_maximum &
    combined[["dist_pair"]] >= dist_pair_minimum
  remaining <- combined[dist_pair_idx, ]
  no_weak_idx <- grepl(pattern = "ref_(G|C)", x = rownames(remaining))
  remaining <- remaining[no_weak_idx, ]

  print(head(table(position_table[["dist"]])))
  sequentials <- position_table[["dist"]] <= maximum_separation
  message("There are ", sum(sequentials), " candidate regions.")

  ## Run length encoding gives the length of every stretch of consecutive
  ## close variants; record each length on the last row of its stretch.
  rle_result <- rle(sequentials)
  close_runs <- which(rle_result[["values"]])
  run_ends <- cumsum(rle_result[["lengths"]])[close_runs]
  position_table[["last_sequential"]] <- 0
  position_table[["last_sequential"]][run_ends] <- rle_result[["lengths"]][close_runs]
  message("The maximum sequential set is: ", max(position_table[["last_sequential"]]), ".")

  wanted_idx <- position_table[["last_sequential"]] >= minimum
  wanted <- position_table[wanted_idx, c("chr", "pos")]
  ## Expose the primer candidates instead of discarding them.
  attr(wanted, "primer_candidates") <- remaining
  return(wanted)
}

## Look for runs of closely spaced variants which are specific to each zymodeme.
## Note that the two calls use different 'minimum' values (1 vs. 2).
zymo22_sequentials <- sequential_variants(
  new_sets,
  conditions = "22",
  minimum = 1,
  maximum_separation = 2)
dim(zymo22_sequentials)
## 7 candidate regions for zymodeme 2.2 -- thus I am betting that the reference strain is a 2.2
zymo23_sequentials <- sequential_variants(
  new_sets,
  conditions = "23",
  minimum = 2,
  maximum_separation = 2)
dim(zymo23_sequentials)
## In contrast, there are lots (587) of interesting regions for 2.3!

8.1.2 Extract a promising region from the genome

The first 4 candidate regions from my set of remaining:

  • Chr, Pos., Distance
  • LpaL13-15, 238433, 448
  • LpaL13-18, 142844, 613
  • LpaL13-29, 830342, 252
  • LpaL13-33, 1331507, 843

Let's define a couple of terms:

  • Third: Each of the 4 above positions.
  • Second: Third - Distance
  • End: Third + PrimerLen
  • Start: Second - PrimerLen

In each instance, these are the last positions, so we want to grab three things:

  • The entire region from Start -> End, this way we can have a quick sanity check.
  • Start -> Second.
  • (Third -> End) <- Reverse complemented
## Collect the coordinates and sequences used to design primers around one
## candidate region.  The same arithmetic is needed for every candidate, so it
## lives in one place instead of being pasted once per region.
##
## chr_seq: sequence of the chromosome, anything subseq() accepts.
## third: position of the last variant of the candidate ('Third').
## amplicon_length: distance from 'Second' to 'Third'.
## primer_length: number of bases taken outside of 'Second' and 'Third'.
##
## Returns a list containing the 'second', 'start' and 'end' coordinates, the
## full 'region' (start -> end), the 'five_prime' piece (start -> second) and
## the reverse complemented 'three_prime' piece (third -> end).
## NOTE(review): subseq() is called with both coordinates, so each primer piece
## spans primer_length + 1 bases if both ends are inclusive -- confirm that
## including the variant position itself is intended.
candidate_primer_regions <- function(chr_seq, third, amplicon_length,
                                     primer_length = 22) {
  second <- third - amplicon_length
  region_start <- second - primer_length
  region_end <- third + primer_length
  list(
    "second" = second,
    "start" = region_start,
    "end" = region_end,
    "region" = subseq(chr_seq, region_start, region_end),
    "five_prime" = subseq(chr_seq, region_start, second),
    "three_prime" = spgs::reverseComplement(subseq(chr_seq, third, region_end)))
}

## * LpaL13-15 238433 448
first_candidate_chr <- genome[["LpaL13_15"]]
primer_length <- 22
amplicon_length <- 448
first_candidate_third <- 238433
first_candidate <- candidate_primer_regions(
  first_candidate_chr, first_candidate_third, amplicon_length, primer_length)
first_candidate_second <- first_candidate[["second"]]
first_candidate_start <- first_candidate[["start"]]
first_candidate_end <- first_candidate[["end"]]
first_candidate_region <- first_candidate[["region"]]
first_candidate_region
first_candidate_5p <- first_candidate[["five_prime"]]
as.character(first_candidate_5p)
first_candidate_3p <- first_candidate[["three_prime"]]
first_candidate_3p


## * LpaL13-18 142844 613
second_candidate_chr <- genome[["LpaL13_18"]]
primer_length <- 22
amplicon_length <- 613
second_candidate_third <- 142844
second_candidate <- candidate_primer_regions(
  second_candidate_chr, second_candidate_third, amplicon_length, primer_length)
second_candidate_second <- second_candidate[["second"]]
second_candidate_start <- second_candidate[["start"]]
second_candidate_end <- second_candidate[["end"]]
second_candidate_region <- second_candidate[["region"]]
second_candidate_region
second_candidate_5p <- second_candidate[["five_prime"]]
as.character(second_candidate_5p)
second_candidate_3p <- second_candidate[["three_prime"]]
second_candidate_3p


## * LpaL13-29 830342 252
third_candidate_chr <- genome[["LpaL13_29"]]
primer_length <- 22
amplicon_length <- 252
third_candidate_third <- 830342
third_candidate <- candidate_primer_regions(
  third_candidate_chr, third_candidate_third, amplicon_length, primer_length)
third_candidate_second <- third_candidate[["second"]]
third_candidate_start <- third_candidate[["start"]]
third_candidate_end <- third_candidate[["end"]]
third_candidate_region <- third_candidate[["region"]]
third_candidate_region
third_candidate_5p <- third_candidate[["five_prime"]]
as.character(third_candidate_5p)
third_candidate_3p <- third_candidate[["three_prime"]]
third_candidate_3p
## You are a garbage polypyrimidine tract.
## Which is actually interesting if the mutations mess it up.


## * LpaL13-33 1331507 843
fourth_candidate_chr <- genome[["LpaL13_33"]]
primer_length <- 22
amplicon_length <- 843
fourth_candidate_third <- 1331507
fourth_candidate <- candidate_primer_regions(
  fourth_candidate_chr, fourth_candidate_third, amplicon_length, primer_length)
fourth_candidate_second <- fourth_candidate[["second"]]
fourth_candidate_start <- fourth_candidate[["start"]]
fourth_candidate_end <- fourth_candidate[["end"]]
fourth_candidate_region <- fourth_candidate[["region"]]
fourth_candidate_region
fourth_candidate_5p <- fourth_candidate[["five_prime"]]
as.character(fourth_candidate_5p)
fourth_candidate_3p <- fourth_candidate[["three_prime"]]
fourth_candidate_3p

8.2 Go hunting for Sanger sequencing regions

I made a fun little function which should find regions which have lots of variants associated with a given experimental factor.

## Keep only the z2.2/z2.3 samples which have a 'bcftable' entry and count
## their variants.
pheno <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
pheno <- subset_expt(pheno, subset = "!is.na(pData(pheno)[['bcftable']])")
pheno_snps <- sm(count_expt_snps(pheno, annot_column = "bcftable"))

## Ask for primer candidates, then discard every bin whose name contains
## 'SCAF' and put the bin name in the first column of the table.
fun_stuff <- snp_density_primers(
  pheno_snps,
  bsgenome = "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53",
  gff = "reference/TriTrypDB-53_LpanamensisMHOMCOL81L13.gff")
drop_scaffolds <- grepl(x = rownames(fun_stuff[["favorites"]]), pattern = "SCAF")
favorite_primer_regions <- fun_stuff[["favorites"]][!drop_scaffolds, ]
favorite_primer_regions[["bin"]] <- rownames(favorite_primer_regions)
library(dplyr)
favorite_primer_regions <- relocate(favorite_primer_regions, bin)

8.3 Combine this table with 2.2/2.3 genes

Here is my note from our meeting:

Cross reference primers to DE genes of 2.2/2.3 and/or resistance/susceptible, add a column to the primer spreadsheet with the DE genes (in retrospect I am guessing this actually means to put the logFC as a column).

One nice thing, I did a semantic removal on the lp_expt, so the set of logFC/pvalues should not have any of the offending types; thus I should be able to automagically get rid of them in the merge.

## Attach the zymodeme (z2.3 vs. z2.2) and sensitivity DESeq2 logFC/adjusted
## p-value columns to the table of favorite primer regions, keyed on the ID
## of the closest gene before each region.
## merge() defaults to all = FALSE, so primer regions whose gene is absent
## from either DE table are dropped from the result.
logfc <- zy_table[["data"]][["z23_vs_z22"]]
logfc_columns <- logfc[, c("deseq_logfc", "deseq_adjp")]
colnames(logfc_columns) <- c("z23_logfc", "z23_adjp")
new_table <- merge(favorite_primer_regions, logfc_columns,
                   by.x = "closest_gene_before_id", by.y = "row.names")
sus <- sus_table[["data"]][["sensitive_vs_resistant"]]
sus_columns <- sus[, c("deseq_logfc", "deseq_adjp")]
colnames(sus_columns) <- c("sus_logfc", "sus_adjp")
## merge() puts the key column first; move 'bin' back to the front.
new_table <- merge(new_table, sus_columns,
                   by.x = "closest_gene_before_id", by.y = "row.names") %>%
  relocate(bin)
## The output filename was previously an unterminated string, which made the
## entire chunk fail to parse.
written <- write_xlsx(data = new_table,
                      excel = "excel/favorite_primers_xref_zy_sus.xlsx")

8.4 Make a heatmap describing the clustering of variants

We can cross reference the variants against the zymodeme status and plot a heatmap of the results and hopefully see how they separate.

## Cross reference the variants against the genes, then normalize the variant
## data, set the conditions from the 'phenotypiccharacteristics' column, and
## plot the result with plot_disheat().
## NOTE(review): the recorded output shows every statement failing because
## 'new_sets' and 'new_snps' did not exist when this was rendered; confirm the
## chunk which creates them is evaluated before this one.
snp_genes <- sm(snps_vs_genes(lp_expt, new_sets, expt_name_col = "chromosome"))
## Error in snps_vs_genes(lp_expt, new_sets, expt_name_col = "chromosome"): object 'new_sets' not found
new_zymo_norm <- normalize_expt(new_snps, filter = TRUE, convert = "cpm", norm = "quant", transform = TRUE)
## Error in normalize_expt(new_snps, filter = TRUE, convert = "cpm", norm = "quant", : object 'new_snps' not found
new_zymo_norm <- set_expt_conditions(new_zymo_norm, fact = "phenotypiccharacteristics")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'new_zymo_norm' not found
zymo_heat <- plot_disheat(new_zymo_norm)
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, : object 'new_zymo_norm' not found
zymo_heat[["plot"]]
## Error in eval(expr, envir, enclos): object 'zymo_heat' not found

8.4.1 Annotated heatmap of variants

Now let us try to make a heatmap which includes some of the annotation data.

## Build an annotated heatmap of the sample correlations from 'both_norm'.
## Columns are annotated with zymodeme and clinical outcome, rows with strain.
## NOTE(review): 'both_norm' did not exist when this was rendered, so every
## statement which depends on it is followed by its recorded error.
des <- both_norm[["design"]]
## Error in eval(expr, envir, enclos): object 'both_norm' not found
## Samples without a strain are labelled "unknown".
undef_idx <- is.na(des[["strain"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[undef_idx, "strain"] <- "unknown"
## Error in des[undef_idx, "strain"] <- "unknown": object 'des' not found
##hmcols <- colorRampPalette(c("yellow","black","darkblue"))(256)
correlations <- hpgl_cor(exprs(both_norm))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'both_norm' not found
## Record which zymodeme entries are missing before converting the columns to
## character, then label the missing ones "unknown".
zymo_missing_idx <- is.na(des[["phenotypiccharacteristics"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[["phenotypiccharacteristics"]] <- as.character(des[["phenotypiccharacteristics"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[["clinicalcategorical"]] <- as.character(des[["clinicalcategorical"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[zymo_missing_idx, "phenotypiccharacteristics"] <- "unknown"
## Error in des[zymo_missing_idx, "phenotypiccharacteristics"] <- "unknown": object 'des' not found
## Dendrogram settings handed to annHeatmap2().
mydendro <- list(
  "clustfun" = hclust,
  "lwd" = 2.0)
col_data <- as.data.frame(des[, c("phenotypiccharacteristics", "clinicalcategorical")])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'des' not found
## Missing clinical outcomes are located here, before the columns are renamed,
## and are set to "undefined" below.
unknown_clinical <- is.na(col_data[["clinicalcategorical"]])
## Error in eval(expr, envir, enclos): object 'col_data' not found
row_data <- as.data.frame(des[, c("strain")])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'des' not found
colnames(col_data) <- c("zymodeme", "outcome")
## Error in colnames(col_data) <- c("zymodeme", "outcome"): object 'col_data' not found
col_data[unknown_clinical, "outcome"] <- "undefined"
## Error in col_data[unknown_clinical, "outcome"] <- "undefined": object 'col_data' not found
colnames(row_data) <- c("strain")
## Error in colnames(row_data) <- c("strain"): object 'row_data' not found
## Annotation, cluster, label and color settings handed to annHeatmap2().
myannot <- list(
  "Col" = list("data" = col_data),
  "Row" = list("data" = row_data))
## Error in eval(expr, envir, enclos): object 'col_data' not found
myclust <- list("cuth" = 1.0,
                "col" = BrewerClusterCol)
mylabs <- list(
  "Row" = list("nrow" = 4),
  "Col" = list("nrow" = 4))
hmcols <- colorRampPalette(c("darkblue", "beige"))(240)
map1 <- annHeatmap2(
  correlations,
  dendrogram = mydendro,
  annotation = myannot,
  cluster = myclust,
  labels = mylabs,
  ## The following controls if the picture is symmetric
  scale = "none",
  col = hmcols)
## Error in annHeatmap2(correlations, dendrogram = mydendro, annotation = myannot, : object 'correlations' not found
## Write the heatmap to a png file.
pp(file = "images/dendro_heatmap.png", image = map1, height = 20, width = 20)
## Error in pp(file = "images/dendro_heatmap.png", image = map1, height = 20, : object 'map1' not found

Print the larger heatmap so that all the labels appear. Keep in mind that as we get more samples, this image needs to continue getting bigger.

big heatmap

## For every condition, compute the proportion of its samples in which each
## variant was observed more than 5 times.
## The '## Error' lines are the output recorded when this was rendered without
## 'pheno_snps' available.
xref_prop <- table(pheno_snps[["conditions"]])
## Error in eval(quote(list(...)), env): object 'pheno_snps' not found
pheno_snps$conditions
## Error in eval(expr, envir, enclos): object 'pheno_snps' not found
idx_tbl <- exprs(pheno_snps) > 5
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'pheno_snps' not found
new_tbl <- data.frame(row.names = rownames(exprs(pheno_snps)))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'pheno_snps' not found
for (n in names(xref_prop)) {
  idx_cols <- which(pheno_snps[["conditions"]] == n)
  ## drop = FALSE keeps a matrix when a condition has only one sample;
  ## without it rowSums() fails on the resulting vector.
  ## [[n]] extracts the group size as a plain number.
  prop_col <- rowSums(idx_tbl[, idx_cols, drop = FALSE]) / xref_prop[[n]]
  new_tbl[[n]] <- prop_col
}
## Error in eval(expr, envir, enclos): object 'xref_prop' not found
## Prepare the per-condition proportions for CMplot and draw the plots.
## The '## Error' lines are the output recorded when this was rendered without
## 'new_tbl' available.
## Keep only the variants whose name contains 'LpaL13'.
keepers <- grepl(x = rownames(new_tbl), pattern = "LpaL13")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grepl': error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'new_tbl' not found
new_tbl <- new_tbl[keepers, ]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
## Invert the proportions so that variants seen in every sample of a condition
## get the smallest value (0.001); anything which ends up above 1 is set to 1.
new_tbl[["strong22"]] <- 1.001 - new_tbl[["z2.2"]]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[["strong23"]] <- 1.001 - new_tbl[["z2.3"]]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
s22_na <- new_tbl[["strong22"]] > 1
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[s22_na, "strong22"] <- 1
## Error in new_tbl[s22_na, "strong22"] <- 1: object 'new_tbl' not found
s23_na <- new_tbl[["strong23"]] > 1
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[s23_na, "strong23"] <- 1
## Error in new_tbl[s23_na, "strong23"] <- 1: object 'new_tbl' not found
## Split the variant names into chromosome and position columns.
## gsub() returns character, so 'Position' is not numeric here.
## NOTE(review): confirm CMplot accepts a character position column.
new_tbl[["SNP"]] <- rownames(new_tbl)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'new_tbl' not found
new_tbl[["Chromosome"]] <- gsub(x = new_tbl[["SNP"]], pattern = "chr_(.*)_pos_.*", replacement = "\\1")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': object 'new_tbl' not found
new_tbl[["Position"]] <- gsub(x = new_tbl[["SNP"]], pattern = ".*_pos_(\\d+)_.*", replacement = "\\1")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': object 'new_tbl' not found
new_tbl <- new_tbl[, c("SNP", "Chromosome", "Position", "strong22", "strong23")]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
library(CMplot)
## Much appreciate for using CMplot.
## Full description, Bug report, Suggestion and the latest codes:
## https://github.com/YinLiLin/CMplot
## 'simplify' is a copy with the strong22 column removed, leaving strong23 as
## the only value column for the first plot.
simplify <- new_tbl
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
simplify[["strong22"]] <- NULL
## Error in simplify[["strong22"]] <- NULL: object 'simplify' not found
CMplot(simplify, bin.size = 100000)
## Error in is.data.frame(x): object 'simplify' not found
## Plot both value columns together and write the result as jpg files.
CMplot(new_tbl, plot.type="m", multracks=TRUE, threshold = c(0.01, 0.05),
       threshold.lwd=c(1,1), threshold.col=c("black","grey"),
       amplify=TRUE, bin.size=10000,
       chr.den.col=c("darkgreen", "yellow", "red"),
       signal.col=c("red", "green", "blue"),
       signal.cex=1, file="jpg", memo="", dpi=300, file.output=TRUE, verbose=TRUE)
## Error in is.data.frame(x): object 'new_tbl' not found

SNP Density Circular Manhattan Rectangular Manhattan QQ

8.5 Try out MatrixEQTL

This tool looks a little opaque, but provides sample data with things that make sense to me and should be pretty easy to recapitulate in our data.

  1. covariates.txt: Columns are samples, rows are things from pData – the most likely ones of interest for our data would be zymodeme, sensitivity
  2. geneloc.txt: columns are ‘geneid’, ‘chr’, ‘left’, ‘right’. I guess I can assume left and right are start/stop; in which case this is trivially acquirable from fData.
  3. ge.txt: This appears to be a log(rpkm/cpm) table with rows as genes and columns as samples
  4. snpsloc.txt: columns are ‘snpid’, ‘chr’, ‘pos’
  5. snps.txt: columns are samples, rows are the ids from snpsloc, values are 0, 1, 2. I assume 0 is identical and 1..12 are the various A->TGC T->AGC C->AGT G->ACT
## Build the five MatrixEQTL input tables (covariates, gene locations, gene
## expression, variant locations, variants) and write them under eqtl/.
## For this, let us use the 'new_snps' data structure.
## Caveat here: these need to be coerced to numbers.
my_covariates <- pData(new_snps)[, c("phenotypiccharacteristics", "clinicalcategorical")]
for (col in colnames(my_covariates)) {
  my_covariates[[col]] <- as.numeric(as.factor(my_covariates[[col]]))
}
## MatrixEQTL wants covariates as rows and samples as columns.
my_covariates <- t(my_covariates)

my_geneloc <- fData(lp_expt)[, c("gid", "chromosome", "start", "end")]
colnames(my_geneloc) <- c("geneid", "chr", "left", "right")

my_ge <- exprs(normalize_expt(lp_expt, transform = "log2", filter = TRUE, convert = "cpm"))
used_samples <- tolower(colnames(my_ge)) %in% colnames(exprs(new_snps))
my_ge <- my_ge[, used_samples]

## The variant names look like chr_<chromosome>_pos_<position>_ref_...;
## parse them directly.  Previously the names were stored in a column called
## 'rownames', so rownames(my_snpsloc) was just 1..n and nothing was parsed,
## and the pattern lacked the underscore after 'pos', which would have left a
## leading '_' in the position.
snp_ids <- rownames(exprs(new_snps))
snp_pattern <- "^chr_(.+)_pos_(.+)_ref_.*$"
my_snpsloc <- data.frame(
  "snpid" = snp_ids,
  "chr" = gsub(pattern = snp_pattern, replacement = "\\1", x = snp_ids),
  "pos" = as.numeric(gsub(pattern = snp_pattern, replacement = "\\2", x = snp_ids)),
  stringsAsFactors = FALSE)
## Oh, caveat here: Because of the way I stored the data,
## I could have duplicate rows which presumably will make matrixEQTL sad
test <- duplicated(my_snpsloc[, c("chr", "pos")])
## Each duplicated row would be another variant at that position;
## so in theory we would do a rle to number them I am guessing
## However, I do not have different variants so I think I can ignore this for the moment
## but will need to make my matrix either 0 or 1.
## NOTE(review): the dropped positions remain in my_snps; confirm that
## MatrixEQTL tolerates variants without a location entry.
if (sum(test) > 0) {
  message("There are: ", sum(test), " duplicated entries.")
  keep_idx <- ! test
  my_snpsloc <- my_snpsloc[keep_idx, ]
}

## Collapse the variant counts to presence (1) / absence (0).
my_snps <- exprs(new_snps)
one_idx <- my_snps > 0
my_snps[one_idx] <- 1

## Ok, at this point I think I have all the pieces which this method wants...
## Oh, no I guess not; it actually wants the data as a set of filenames...
library(MatrixEQTL)
## Make sure the output directory exists before writing into it.
dir.create("eqtl", showWarnings = FALSE)
write.table(my_snps, "eqtl/snps.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(my_snps, "eqtl/snps.tsv", )
write.table(my_snpsloc, "eqtl/snpsloc.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(my_snpsloc, "eqtl/snpsloc.tsv")
write.table(as.data.frame(my_ge), "eqtl/ge.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(as.data.frame(my_ge), "eqtl/ge.tsv")
write.table(as.data.frame(my_geneloc), "eqtl/geneloc.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(as.data.frame(my_geneloc), "eqtl/geneloc.tsv")
write.table(as.data.frame(my_covariates), "eqtl/covariates.tsv", na = "NA", col.names = TRUE, row.names = TRUE, sep = "\t", quote = TRUE)
## readr::write_tsv(as.data.frame(my_covariates), "eqtl/covariates.tsv")

## Configure and run MatrixEQTL on the tables written under eqtl/.
## One of: modelANOVA, modelLINEAR, or modelLINEAR_CROSS
useModel <- modelLINEAR

## Input files: genotypes, variant locations, expression, gene locations
## and covariates.
SNP_file_name <- "eqtl/snps.tsv"
snps_location_file_name <- "eqtl/snpsloc.tsv"
expression_file_name <- "eqtl/ge.tsv"
gene_location_file_name <- "eqtl/geneloc.tsv"
covariates_file_name <- "eqtl/covariates.tsv"
## The cis and trans results go to temporary files.
output_file_name_cis <- tempfile()
output_file_name_tra <- tempfile()
## Associations are saved only when significant at these levels.
pvOutputThreshold_cis <- 0.1
pvOutputThreshold_tra <- 0.1
## Error covariance matrix; numeric() means identity.
errorCovariance <- numeric()
## errorCovariance <- read.table("Sample_Data/errorCovariance.txt")
## Maximum distance for a gene-SNP pair to count as local.
cisDist <- 1e6

## Genotypes: tab separated, "NA" is missing, one header row and one column
## of row labels are skipped, and the file is read 2,000 rows at a time.
snps <- SlicedData$new()
snps$fileDelimiter <- "\t"
snps$fileOmitCharacters <- "NA"
snps$fileSkipRows <- 1
snps$fileSkipColumns <- 1
snps$fileSliceSize <- 2000
snps$LoadFile(SNP_file_name)

## Gene expression: same layout as the genotypes.
gene <- SlicedData$new()
gene$fileDelimiter <- "\t"
gene$fileOmitCharacters <- "NA"
gene$fileSkipRows <- 1
gene$fileSkipColumns <- 1
gene$fileSliceSize <- 2000
gene$LoadFile(expression_file_name)

## Covariates: same layout, loaded only when a filename was provided.
cvrt <- SlicedData$new()
cvrt$fileDelimiter <- "\t"
cvrt$fileOmitCharacters <- "NA"
cvrt$fileSkipRows <- 1
cvrt$fileSkipColumns <- 1
if (length(covariates_file_name) > 0) {
  cvrt$LoadFile(covariates_file_name)
}

## Read the location tables and run the analysis.
snpspos <- read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE)
genepos <- read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE)

me <- Matrix_eQTL_main(
  snps = snps,
  gene = gene,
  cvrt = cvrt,
  output_file_name = output_file_name_tra,
  pvOutputThreshold = pvOutputThreshold_tra,
  useModel = useModel,
  errorCovariance = errorCovariance,
  verbose = TRUE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = "qqplot",
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE)
## Unless a variable named 'skip_load' exists and is TRUE, print the session
## information, report the hpgltools commit, and save to 'savefile'.
## get0() returns NULL when 'skip_load' is undefined, so the default is to save.
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  message(paste0("Saving to ", savefile))
  tmp <- sm(saveme(filename = savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset b006f25aaf505e4650ed89d020b1cb7e4b46f6a0
## This is hpgltools commit: Mon Dec 13 14:13:47 2021 -0500: b006f25aaf505e4650ed89d020b1cb7e4b46f6a0
## Saving to tmrc2_02sample_estimation_v202201.rda.xz
## NOTE(review): presumably restores what saveme() wrote to 'savefile' -- confirm.
tmp <- loadme(filename = savefile)
