## Path (relative to the working directory) of the sample sheet which is handed
## to create_expt() when lp_expt is built further down.
## NOTE(review): the rendered output below reports that this file does not exist,
## which is what makes every later chunk fail -- confirm the path before re-running.
sample_sheet <- glue::glue("sample_sheets/tmrc2_samples_202206.xlsx")
This is mostly just a run of this worksheet to reacquaint myself with it.
This document is intended to provide a general overview of the TMRC2 samples which have thus far been sequenced. In some cases, this includes only those samples starting in 2019; in other instances I am including our previous (2015-2016) samples.
In all cases the processing performed was:
The analyses in this document use the matrices of counts/gene from #3 and variants/position from #4 in order to provide some images and metrics describing the samples we have sequenced so far.
Everything which follows depends on the existing TriTrypDB annotations, revision 46 (circa 2019). The following block loads a database of these annotations and turns it into a matrix where the rows are genes and the columns are all of the annotation types provided by TriTrypDB.
The same database was used to create a matrix of orthologous genes between L. panamensis and all of the other species in the TriTrypDB.
## Load the L. panamensis annotations (TriTrypDB v46 OrgDb) and derive the
## tables used later: gene annotations, GO terms, gene lengths and orthologs.
## sm() presumably silences the messages printed by the wrapped call -- TODO confirm.
tt <- sm(library(EuPathDB))
## Name of the OrgDb package; also passed as 'annotation' to create_expt() below.
orgdb <- "org.Lpanamensis.MHOMCOL81L13.v46.eg.db"
tt <- sm(library(orgdb, character.only=TRUE))
## NOTE(review): this symbol is written out by hand and must be kept in sync
## with the package name stored in 'orgdb' above.
pan_db <- org.Lpanamensis.MHOMCOL81L13.v46.eg.db
## Every column the OrgDb offers; not used again in this section.
all_fields <- columns(pan_db)
## Request a fixed set of annotation fields keyed by 'gid' and keep only the
## 'genes' element of the returned object.
all_lp_annot <- sm(load_orgdb_annotations(
pan_db,
keytype = "gid",
fields = c("annot_gene_entrez_id", "annot_gene_name",
"annot_strand", "annot_chromosome", "annot_cds_length",
"annot_gene_product")))$genes
## GO table; passed as 'go_db' to simple_goseq() further down.
lp_go <- sm(load_orgdb_go(pan_db))
## Two-column (ID, length) table taken from the CDS lengths; passed as
## 'length_db' to simple_goseq() further down.
lp_lengths <- all_lp_annot[, c("gid", "annot_cds_length")]
colnames(lp_lengths) <- c("ID", "length")
## Lower-case the product descriptions; the terms later given to
## semantic_expt_filter() for this column are all lower-case.
all_lp_annot[["annot_gene_product"]] <- tolower(all_lp_annot[["annot_gene_product"]])
orthos <- sm(EuPathDB::extract_eupath_orthologs(db = pan_db))
## Copy of the annotations used as 'gene_info' when the expressionset is created.
hisat_annot <- all_lp_annot
meta <- sm(EuPathDB::download_eupath_metadata(webservice="tritrypdb"))
lp_entry <- EuPathDB::get_eupath_entry(species="Leishmania panamensis", metadata=meta)
## Found the following hits: Leishmania panamensis MHOM/COL/81/L13, Leishmania panamensis strain MHOM/PA/94/PSC-1, choosing the first.
## Using: Leishmania panamensis MHOM/COL/81/L13.
colnames(lp_entry)
## [1] "AnnotationVersion" "AnnotationSource" "BiocVersion"
## [4] "DataProvider" "Genome" "GenomeSource"
## [7] "GenomeVersion" "NumArrayGene" "NumChipChipGene"
## [10] "NumChromosome" "NumCodingGene" "NumCommunity"
## [13] "NumContig" "NumEC" "NumEST"
## [16] "NumGene" "NumGO" "NumOrtholog"
## [19] "NumOtherGene" "NumPopSet" "NumProteomics"
## [22] "NumPseudogene" "NumRNASeq" "NumRTPCR"
## [25] "NumSNP" "NumTFBS" "Organellar"
## [28] "ReferenceStrain" "MegaBP" "PrimaryKey"
## [31] "ProjectID" "RecordClassName" "SourceID"
## [34] "SourceVersion" "TaxonomyID" "TaxonomyName"
## [37] "URLGenome" "URLGFF" "URLProtein"
## [40] "Coordinate_1_based" "Maintainer" "SourceUrl"
## [43] "Tags" "BsgenomePkg" "GrangesPkg"
## [46] "OrganismdbiPkg" "OrgdbPkg" "TxdbPkg"
## [49] "Taxon" "Genus" "Species"
## [52] "Strain" "BsgenomeFile" "GrangesFile"
## [55] "OrganismdbiFile" "OrgdbFile" "TxdbFile"
## [58] "GenusSpecies" "TaxonUnmodified" "TaxonCanonical"
## [61] "TaxonXref"
## Attach a pre-built BSgenome package by name and fetch the genome object
## which carries the same name as the package.
## NOTE(review): the package name ends in v53 while the annotations loaded above
## and the commented-out builder call below refer to v46 -- confirm this mix of
## releases is intended.
testing_panamensis <- "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53"
## testing_panamensis <- EuPathDB::make_eupath_bsgenome(entry=lp_entry, eu_version="v46")
library(as.character(testing_panamensis), character.only=TRUE)
## Loading required package: BSgenome
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: rtracklayer
## get0() returns NULL rather than raising an error when no object of that name
## is found, so 'genome' should be checked before it is used.
genome <- get0(as.character(testing_panamensis))
Resequence samples: TMRC20002, TMRC20006, TMRC20004 (maybe TMRC20008 and TMRC20029)
The process of sample estimation takes two primary inputs:
An expressionset is a data structure used in R to examine RNASeq data. It comprises annotations, metadata, and expression data. In the case of our processing pipeline, the location of the expression data is provided by the filenames in the metadata.
The first lines of the following block create the Expressionset. All of the following lines perform various normalizations and generate plots from it.
The following samples are much lower coverage:
20210610: I made some manual changes to the sample sheet which I downloaded, filling in some zymodeme with ‘unknown’
## Metadata columns which are sanitized and then converted to factors when the
## expressionset is created below.
## "zymodemecategorical" used to be listed twice, which made both steps process
## the same column twice; each column is now listed exactly once.
## NOTE(review): if the second entry was meant to be a different column, add it here.
sanitize_columns <- c("passagenumber", "clinicalresponse", "clinicalcategorical",
                      "zymodemecategorical")
## Build the expressionset from the sample sheet and filter/clean it in one pipeline.
## Sample IDs come from the 'hpglidentifier' column and the count-table paths from
## the 'lpanamensisv36hisatfile' column of the sheet.
## NOTE(review): the file column name says v36 while the annotations are v46 --
## confirm the counts and annotations use compatible gene IDs.
lp_expt <- create_expt(sample_sheet,
gene_info = hisat_annot,
annotation = orgdb,
id_column = "hpglidentifier",
file_column = "lpanamensisv36hisatfile") %>%
## The zymodeme call becomes the experimental condition.
set_expt_conditions(fact = "zymodemecategorical") %>%
## Presumably drops samples with fewer than 8550 observed genes and then samples
## with fewer than 5,000,000 reads -- TODO confirm against subset_expt().
subset_expt(nonzero = 8550) %>%
subset_expt(coverage = 5000000) %>%
## Presumably removes genes whose product description matches one of these terms
## -- TODO confirm the direction of the filter.
semantic_expt_filter(semantic = c("amastin", "gp63", "leishmanolysin"),
semantic_column = "annot_gene_product") %>%
## Clean the listed metadata columns, then store them as factors.
sanitize_expt_metadata(columns = sanitize_columns) %>%
set_expt_factors(columns = sanitize_columns, class = "factor")
## Reading the sample metadata.
## Error in read.xlsx.default(xlsxFile = file, sheet = 1) :
## File does not exist.
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': error in evaluating the argument 'object' in selecting a method for function 'pData': error in evaluating the argument 'object' in selecting a method for function 'exprs': error in evaluating the argument 'object' in selecting a method for function 'sampleNames': error in evaluating the argument 'object' in selecting a method for function 'sampleNames': error in evaluating the argument 'object' in selecting a method for function 'pData': Unable to read the metadata file: sample_sheets/tmrc2_samples_202206.xlsx
libsizes <- plot_libsize(lp_expt)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_libsize': object 'lp_expt' not found
dev <- pp(file = "images/lp_expt_libsizes.png", width = 14, height = 9)
## Writing png?
libsizes$plot
## Error in eval(expr, envir, enclos): object 'libsizes' not found
closed <- dev.off()
libsizes$plot
## Error in eval(expr, envir, enclos): object 'libsizes' not found
## I think samples 7,10 should be removed at minimum, probably also 9,11
nonzero <- plot_nonzero(lp_expt)
## Error in plot_nonzero(lp_expt): object 'lp_expt' not found
dev <- pp(file = "images/lp_nonzero.png", width=9, height=9)
## Writing png?
nonzero$plot
## Error in eval(expr, envir, enclos): object 'nonzero' not found
closed <- dev.off()
lp_box <- plot_boxplot(lp_expt)
## Error in plot_boxplot(lp_expt): object 'lp_expt' not found
dev <- pp(file = "images/lp_expt_boxplot.png", width = 12, height = 9)
## Writing png?
lp_box
## Error in eval(expr, envir, enclos): object 'lp_box' not found
closed <- dev.off()
lp_box
## Error in eval(expr, envir, enclos): object 'lp_box' not found
filter_plot <- plot_libsize_prepost(lp_expt)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_libsize': object 'lp_expt' not found
filter_plot$lowgene_plot
## Error in eval(expr, envir, enclos): object 'filter_plot' not found
filter_plot$count_plot
## Error in eval(expr, envir, enclos): object 'filter_plot' not found
table(pData(lp_expt)[["zymodemecategorical"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_expt' not found
table(pData(lp_expt)[["clinicalresponse"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_expt' not found
Najib’s favorite plots are of course the PCA/TSNE. These are nice to look at in order to get a sense of the relationships between samples. They also provide a good opportunity to see what happens when one applies different normalizations, surrogate analyses, filters, etc. In addition, one may set different experimental factors as the primary ‘condition’ (usually the color of plots) and surrogate ‘batches’.
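Column ‘Q’ in the sample sheet holds the historical susceptibility values; make a categorical version of it with these parameters: values <= 0.35 are ‘resistant’, values from 0.36 to 0.48 are ‘ambiguous’, values >= 0.49 are ‘sensitive’, and missing values are ‘unknown’.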
## Turn the historical susceptibility score into a four-level category and store
## it in the sample metadata as 'sus_category'.
##   missing value      -> "unknown"
##   score <= 0.35      -> "resistant"
##   0.35 < score < 0.49 -> "ambiguous"
##   score >= 0.49      -> "sensitive"
## The previous version tested 0.36 <= score <= 0.48 for the middle bin, so any
## score falling between 0.35 and 0.36 or between 0.48 and 0.49 matched no bin and
## kept its numeric text (e.g. "0.355") as its category. The middle bin now covers
## the whole open interval, so every non-missing score receives a label.
starting <- as.numeric(pData(lp_expt)[["susceptibilityinfectionreduction32ugmlsbvhistoricaldata"]])
## Index vectors are made NA-free so they are safe to reuse for subsetting.
na_idx <- is.na(starting)
resist_idx <- !na_idx & starting <= 0.35
indeterminant_idx <- !na_idx & starting > 0.35 & starting < 0.49
susceptible_idx <- !na_idx & starting >= 0.49
## Start from "unknown" so missing scores need no separate assignment.
sus_categorical <- rep("unknown", length(starting))
sus_categorical[resist_idx] <- "resistant"
sus_categorical[indeterminant_idx] <- "ambiguous"
sus_categorical[susceptible_idx] <- "sensitive"
pData(lp_expt)[["sus_category"]] <- sus_categorical
table(sus_categorical)
## Error in eval(quote(list(...)), env): object 'sus_categorical' not found
clinical_colors <- list(
"z1.0" = "#333333",
"z2.0" = "#555555",
"z3.0" = "#777777",
"z2.1" = "#874400",
"z2.2" = "#0000cc",
"z2.3" = "#cc0000",
"z2.4" = "#df7000",
"unknown" = "#cbcbcb",
"null" = "#000000")
clinical_samples <- lp_expt %>%
set_expt_batches(fact = sus_categorical) %>%
set_expt_colors(clinical_colors)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_expt' not found
table(pData(clinical_samples)[["condition"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'clinical_samples' not found
clinical_norm <- normalize_expt(clinical_samples, norm = "quant", transform = "log2",
convert = "cpm", filter = TRUE)
## Error in normalize_expt(clinical_samples, norm = "quant", transform = "log2", : object 'clinical_samples' not found
zymo_pca <- plot_pca(clinical_norm, plot_title = "PCA of parasite expression values",
plot_labels = FALSE)
## Error in plot_pca(clinical_norm, plot_title = "PCA of parasite expression values", : object 'clinical_norm' not found
ggplt(zymo_pca$plot)
## Error in plotly::ggplotly(gg, ...): object 'zymo_pca' not found
dev <- pp(file = "images/zymo_pca_sus_shape.png")
## Writing png?
zymo_pca$plot
## Error in eval(expr, envir, enclos): object 'zymo_pca' not found
closed <- dev.off()
zymo_pca$plot
## Error in eval(expr, envir, enclos): object 'zymo_pca' not found
only_two_types <- subset_expt(clinical_samples, subset = "condition=='z2.3'|condition=='z2.2'")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'sampleNames': object 'clinical_samples' not found
only_two_norm <- sm(normalize_expt(only_two_types, norm = "quant", transform = "log2",
convert = "cpm", batch = FALSE, filter = TRUE))
## Error in normalize_expt(only_two_types, norm = "quant", transform = "log2", : object 'only_two_types' not found
onlytwo_pca <- plot_pca(only_two_norm, plot_title = "PCA of z2.2 and z2.3 parasite expression values",
plot_labels = FALSE)
## Error in plot_pca(only_two_norm, plot_title = "PCA of z2.2 and z2.3 parasite expression values", : object 'only_two_norm' not found
dev <- pp(file = "images/zymo_z2.2_z2.3_pca_sus_shape.pdf")
onlytwo_pca$plot
## Error in eval(expr, envir, enclos): object 'onlytwo_pca' not found
closed <- dev.off()
onlytwo_pca$plot
## Error in eval(expr, envir, enclos): object 'onlytwo_pca' not found
zymo_3dpca <- plot_3d_pca(zymo_pca)
## Error in plot_3d_pca(zymo_pca): object 'zymo_pca' not found
zymo_3dpca$plot
## Error in eval(expr, envir, enclos): object 'zymo_3dpca' not found
clinical_n <- sm(normalize_expt(clinical_samples, transform = "log2",
convert = "cpm", batch = FALSE, filter = TRUE))
## Error in normalize_expt(clinical_samples, transform = "log2", convert = "cpm", : object 'clinical_samples' not found
zymo_tsne <- plot_tsne(clinical_n, plot_title = "TSNE of parasite expression values")
## Error in plot_pca(..., pc_method = "tsne"): object 'clinical_n' not found
zymo_tsne$plot
## Error in eval(expr, envir, enclos): object 'zymo_tsne' not found
clinical_nb <- normalize_expt(clinical_samples, convert = "cpm", transform = "log2",
filter = TRUE, batch = "svaseq")
## Error in normalize_expt(clinical_samples, convert = "cpm", transform = "log2", : object 'clinical_samples' not found
clinical_nb_pca <- plot_pca(clinical_nb, plot_title = "PCA of parasite expression values",
plot_labels = FALSE)
## Error in plot_pca(clinical_nb, plot_title = "PCA of parasite expression values", : object 'clinical_nb' not found
dev <- pp(file = "images/clinical_nb_pca_sus_shape.png")
## Writing png?
clinical_nb_pca$plot
## Error in eval(expr, envir, enclos): object 'clinical_nb_pca' not found
closed <- dev.off()
clinical_nb_pca$plot
## Error in eval(expr, envir, enclos): object 'clinical_nb_pca' not found
clinical_nb_tsne <- plot_tsne(clinical_nb, plot_title = "TSNE of parasite expression values")
## Error in plot_pca(..., pc_method = "tsne"): object 'clinical_nb' not found
clinical_nb_tsne$plot
## Error in eval(expr, envir, enclos): object 'clinical_nb_tsne' not found
corheat <- plot_corheat(clinical_norm, plot_title = "Correlation heatmap of parasite
expression values
")
## Error in "expt" %in% class(expt): object 'clinical_norm' not found
corheat$plot
## Error in eval(expr, envir, enclos): object 'corheat' not found
plot_sm(clinical_norm)$plot
## Error in plot_sm(clinical_norm): object 'clinical_norm' not found
cf_colors <- list(
"cure" = "#006f00",
"fail" = "#9dffa0",
"unknown" = "#cbcbcb",
"notapplicable" = "#000000")
cf_expt <- set_expt_conditions(lp_expt, fact = "clinicalcategorical") %>%
set_expt_batches(fact = sus_categorical) %>%
set_expt_colors(cf_colors)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': error in evaluating the argument 'object' in selecting a method for function 'pData': error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_expt' not found
table(pData(cf_expt)[["condition"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'cf_expt' not found
cf_norm <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
norm = "quant", filter = TRUE)
## Error in normalize_expt(cf_expt, convert = "cpm", transform = "log2", : object 'cf_expt' not found
start_cf <- plot_pca(cf_norm, plot_title = "PCA of parasite expression values",
plot_labels = FALSE)
## Error in plot_pca(cf_norm, plot_title = "PCA of parasite expression values", : object 'cf_norm' not found
dev <- pp(file = "images/cf_sus_shape.png")
## Writing png?
start_cf$plot
## Error in eval(expr, envir, enclos): object 'start_cf' not found
closed <- dev.off()
start_cf$plot
## Error in eval(expr, envir, enclos): object 'start_cf' not found
cf_nb <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
filter = TRUE, batch = "svaseq")
## Error in normalize_expt(cf_expt, convert = "cpm", transform = "log2", : object 'cf_expt' not found
cf_nb_pca <- plot_pca(cf_nb, plot_title = "PCA of parasite expression values",
plot_labels = FALSE)
## Error in plot_pca(cf_nb, plot_title = "PCA of parasite expression values", : object 'cf_nb' not found
dev <- pp(file = "images/cf_sus_share_nb.png")
## Writing png?
cf_nb_pca$plot
## Error in eval(expr, envir, enclos): object 'cf_nb_pca' not found
closed <- dev.off()
cf_nb_pca$plot
## Error in eval(expr, envir, enclos): object 'cf_nb_pca' not found
cf_norm <- normalize_expt(cf_expt, transform = "log2", convert = "cpm",
filter = TRUE, norm = "quant")
## Error in normalize_expt(cf_expt, transform = "log2", convert = "cpm", : object 'cf_expt' not found
test <- pca_information(cf_norm,
expt_factors = c("clinicalcategorical", "zymodemecategorical",
"pathogenstrain", "passagenumber"),
num_components = 6, plot_pcas = TRUE)
## Error in pca_information(cf_norm, expt_factors = c("clinicalcategorical", : object 'cf_norm' not found
test$anova_p
## Error in eval(expr, envir, enclos): object 'test' not found
test$cor_heatmap
## Error in eval(expr, envir, enclos): object 'test' not found
sus_colors <- list(
"resistant" = "#8563a7",
"sensitive" = "#8d0000",
"ambiguous" = "#cbcbcb",
"unknown" = "#000000")
sus_expt <- set_expt_conditions(lp_expt, fact = "sus_category") %>%
set_expt_batches(fact = "zymodemecategorical") %>%
set_expt_colors(colors = sus_colors) %>%
subset_expt(subset = "batch!='z24'") %>%
subset_expt(subset = "batch!='z21'")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'sampleNames': error in evaluating the argument 'object' in selecting a method for function 'sampleNames': error in evaluating the argument 'object' in selecting a method for function 'pData': error in evaluating the argument 'object' in selecting a method for function 'pData': error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_expt' not found
sus_norm <- normalize_expt(sus_expt, transform = "log2", convert = "cpm",
norm = "quant", filter = TRUE)
## Error in normalize_expt(sus_expt, transform = "log2", convert = "cpm", : object 'sus_expt' not found
sus_pca <- plot_pca(sus_norm, plot_title = "PCA of parasite expression values",
plot_labels = FALSE)
## Error in plot_pca(sus_norm, plot_title = "PCA of parasite expression values", : object 'sus_norm' not found
dev <- pp(file = "images/sus_norm_pca.png")
## Writing png?
sus_pca[["plot"]]
## Error in eval(expr, envir, enclos): object 'sus_pca' not found
closed <- dev.off()
sus_pca[["plot"]]
## Error in eval(expr, envir, enclos): object 'sus_pca' not found
sus_nb <- normalize_expt(sus_expt, transform = "log2", convert = "cpm",
batch = "svaseq", filter = TRUE)
## Error in normalize_expt(sus_expt, transform = "log2", convert = "cpm", : object 'sus_expt' not found
sus_nb_pca <- plot_pca(sus_nb, plot_title = "PCA of parasite expression values",
plot_labels = FALSE)
## Error in plot_pca(sus_nb, plot_title = "PCA of parasite expression values", : object 'sus_nb' not found
dev <- pp(file = "images/sus_nb_pca.png")
## Writing png?
sus_nb_pca[["plot"]]
## Error in eval(expr, envir, enclos): object 'sus_nb_pca' not found
closed <- dev.off()
sus_nb_pca[["plot"]]
## Error in eval(expr, envir, enclos): object 'sus_nb_pca' not found
The following sections perform a series of analyses which seek to elucidate differences between the zymodemes 2.2 and 2.3 either through differential expression or variant profiles.
TODO: Do this with and without sva and compare the results.
zy_expt <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'sampleNames': object 'lp_expt' not found
zy_norm <- normalize_expt(zy_expt, filter = TRUE, convert = "cpm", norm = "quant")
## Error in normalize_expt(zy_expt, filter = TRUE, convert = "cpm", norm = "quant"): object 'zy_expt' not found
zy_de_nobatch <- all_pairwise(zy_expt, filter = TRUE, model_batch = FALSE)
## Error in normalize_expt(input, filter = filter): object 'zy_expt' not found
zy_table_nobatch <- combine_de_tables(
zy_de_nobatch, excel = glue::glue("excel/zy_tables_nobatch-v{ver}.xlsx"),
gmt = glue::glue("gmt/zymodeme_nobatch-v{ver}.gmt"))
## Error in combine_de_tables(zy_de_nobatch, excel = glue::glue("excel/zy_tables_nobatch-v{ver}.xlsx"), : object 'zy_de_nobatch' not found
zy_sig_nobatch <- extract_significant_genes(
zy_table_nobatch,
excel = glue::glue("excel/zy_sig_nobatch-v{ver}.xlsx"))
## Error in extract_significant_genes(zy_table_nobatch, excel = glue::glue("excel/zy_sig_nobatch-v{ver}.xlsx")): object 'zy_table_nobatch' not found
zy_de_sva <- all_pairwise(zy_expt, filter = TRUE, model_batch = "svaseq")
## Error in normalize_expt(input, filter = filter): object 'zy_expt' not found
zy_table_sva <- combine_de_tables(
zy_de_sva, excel = glue::glue("excel/zy_tables_sva-v{ver}.xlsx"),
gmt = glue::glue("gmt/zymodeme_sva-v{ver}.gmt"))
## Error in combine_de_tables(zy_de_sva, excel = glue::glue("excel/zy_tables_sva-v{ver}.xlsx"), : object 'zy_de_sva' not found
zy_sig_sva <- extract_significant_genes(
zy_table_sva,
excel = glue::glue("excel/zy_sig_sva-v{ver}.xlsx"))
## Error in extract_significant_genes(zy_table_sva, excel = glue::glue("excel/zy_sig_sva-v{ver}.xlsx")): object 'zy_table_sva' not found
dev <- pp(file = "images/zymo_ma.png")
## Writing png?
zy_table_sva[["plots"]][["z23_vs_z22"]][["deseq_ma_plots"]][["plot"]]
## Error in eval(expr, envir, enclos): object 'zy_table_sva' not found
closed <- dev.off()
zy_table_sva[["plots"]][["z23_vs_z22"]][["deseq_ma_plots"]][["plot"]]
## Error in eval(expr, envir, enclos): object 'zy_table_sva' not found
In contrast, we can search for genes which are differentially expressed with respect to cure/failure status.
cf_de <- all_pairwise(cf_expt, filter = TRUE, model_batch = "svaseq")
## Error in normalize_expt(input, filter = filter): object 'cf_expt' not found
cf_table <- combine_de_tables(cf_de, excel = glue::glue("excel/cf_tables-v{ver}.xlsx"))
## Error in combine_de_tables(cf_de, excel = glue::glue("excel/cf_tables-v{ver}.xlsx")): object 'cf_de' not found
cf_sig <- extract_significant_genes(cf_table, excel = glue::glue("excel/cf_sig-v{ver}.xlsx"))
## Error in extract_significant_genes(cf_table, excel = glue::glue("excel/cf_sig-v{ver}.xlsx")): object 'cf_table' not found
dev <- pp(file = "images/cf_ma.png")
## Writing png?
cf_table[["plots"]][["fail_vs_cure"]][["deseq_ma_plots"]][["plot"]]
## Error in eval(expr, envir, enclos): object 'cf_table' not found
closed <- dev.off()
cf_table[["plots"]][["fail_vs_cure"]][["deseq_ma_plots"]][["plot"]]
## Error in eval(expr, envir, enclos): object 'cf_table' not found
Finally, we can use our category of susceptibility and look for genes which change from sensitive to resistant. Keep in mind, though, that for the moment we have a lot of ambiguous and unknown strains.
sus_de_sva <- all_pairwise(sus_expt, filter = TRUE, model_batch = "svaseq")
## Error in normalize_expt(input, filter = filter): object 'sus_expt' not found
sus_table_sva <- combine_de_tables(sus_de_sva, excel = glue::glue("excel/sus_tables_sva-v{ver}.xlsx"))
## Error in combine_de_tables(sus_de_sva, excel = glue::glue("excel/sus_tables_sva-v{ver}.xlsx")): object 'sus_de_sva' not found
sus_sig_sva <- extract_significant_genes(sus_table_sva, excel = glue::glue("excel/sus_sig_sva-v{ver}.xlsx"))
## Error in extract_significant_genes(sus_table_sva, excel = glue::glue("excel/sus_sig_sva-v{ver}.xlsx")): object 'sus_table_sva' not found
sus_de_nobatch <- all_pairwise(sus_expt, filter = TRUE, model_batch = FALSE)
## Error in normalize_expt(input, filter = filter): object 'sus_expt' not found
sus_table_nobatch <- combine_de_tables(sus_de_nobatch, excel = glue::glue("excel/sus_tables_nobatch-v{ver}.xlsx"))
## Error in combine_de_tables(sus_de_nobatch, excel = glue::glue("excel/sus_tables_nobatch-v{ver}.xlsx")): object 'sus_de_nobatch' not found
sus_sig_nobatch <- extract_significant_genes(sus_table_nobatch, excel = glue::glue("excel/sus_sig_nobatch-v{ver}.xlsx"))
## Error in extract_significant_genes(sus_table_nobatch, excel = glue::glue("excel/sus_sig_nobatch-v{ver}.xlsx")): object 'sus_table_nobatch' not found
knitr::kable(head(sus_sig_sva$deseq$ups$sensitive_vs_resistant, n = 20))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'sus_sig_sva' not found
knitr::kable(head(sus_sig_sva$deseq$downs$sensitive_vs_resistant, n = 20))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'sus_sig_sva' not found
sus_ma <- sus_table_sva[["plots"]][["sensitive_vs_resistant"]][["deseq_ma_plots"]][["plot"]]
## Error in eval(expr, envir, enclos): object 'sus_table_sva' not found
dev <- pp(file = "images/sus_ma_sva.png")
## Writing png?
sus_ma
## Error in eval(expr, envir, enclos): object 'sus_ma' not found
closed <- dev.off()
sus_ma
## Error in eval(expr, envir, enclos): object 'sus_ma' not found
## test <- ggplt(sus_ma)
Now let us look for ontology categories which are increased in the 2.3 samples followed by the 2.2 samples.
## Gene categories more represented in the 2.3 group.
zy_go_up <- simple_goseq(sig_genes = zy_sig_sva[["deseq"]][["ups"]][[1]],
go_db = lp_go, length_db = lp_lengths)
## Error in simple_goseq(sig_genes = zy_sig_sva[["deseq"]][["ups"]][[1]], : object 'zy_sig_sva' not found
## Gene categories more represented in the 2.2 group.
zy_go_down <- simple_goseq(sig_genes = zy_sig_sva[["deseq"]][["downs"]][[1]],
go_db = lp_go, length_db = lp_lengths)
## Error in simple_goseq(sig_genes = zy_sig_sva[["deseq"]][["downs"]][[1]], : object 'zy_sig_sva' not found
In the function ‘combine_de_tables()’ above, one of the tasks performed is to look at the agreement among DESeq2, limma, and edgeR. The following show a couple of these for the set of genes observed with a |log2 fold-change| >= 1 (i.e. at least a 2-fold change) and adjusted p-value <= 0.05.
zy_table_sva[["venns"]][[1]][["p_lfc1"]][["up_noweight"]]
## Error in eval(expr, envir, enclos): object 'zy_table_sva' not found
zy_table_sva[["venns"]][[1]][["p_lfc1"]][["down_noweight"]]
## Error in eval(expr, envir, enclos): object 'zy_table_sva' not found
zy_go_up[["pvalue_plots"]][["bpp_plot_over"]]
## Error in eval(expr, envir, enclos): object 'zy_go_up' not found
zy_go_down[["pvalue_plots"]][["bpp_plot_over"]]
## Error in eval(expr, envir, enclos): object 'zy_go_down' not found
Remind myself, the data structures are (zy|sus)_(de|table|sig).
zy_df <- zy_table_sva[["data"]][["z23_vs_z22"]]
## Error in eval(expr, envir, enclos): object 'zy_table_sva' not found
sus_df <- sus_table_sva[["data"]][["sensitive_vs_resistant"]]
## Error in eval(expr, envir, enclos): object 'sus_table_sva' not found
both_df <- merge(zy_df, sus_df, by = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'zy_df' not found
plot_df <- both_df[, c("deseq_logfc.x", "deseq_logfc.y")]
## Error in eval(expr, envir, enclos): object 'both_df' not found
rownames(plot_df) <- both_df[["Row.names"]]
## Error in eval(expr, envir, enclos): object 'both_df' not found
colnames(plot_df) <- c("z23_vs_z22", "sensitive_vs_resistant")
## Error in colnames(plot_df) <- c("z23_vs_z22", "sensitive_vs_resistant"): object 'plot_df' not found
compare <- plot_linear_scatter(plot_df)
## Error in data.frame(df[, c(1, 2)]): object 'plot_df' not found
dev <- pp(file = "images/compare_sus_zy.png")
## Writing png?
compare$scatter
## Error in compare$scatter: object of type 'closure' is not subsettable
closed <- dev.off()
compare$scatter
## Error in compare$scatter: object of type 'closure' is not subsettable
compare$cor
## Error in compare$cor: object of type 'closure' is not subsettable
Najib read me an email listing off the gene names associated with the zymodeme classification. I took those names and cross referenced them against the Leishmania panamensis gene annotations and found the following:
They are:
Given these 6 gene IDs (NH has two gene IDs associated with it), I can do some looking for specific differences among the various samples.
The following creates a colorspace (red to green) heatmap showing the observed expression of these genes in every sample.
## Heatmap of the genes associated with the zymodeme classification.
## my_genes and my_names are parallel vectors (gene ID -> short enzyme name);
## the final "other" entry is a placeholder and is not a gene ID.
my_genes <- c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
              "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300",
              "other")
my_names <- c("ALAT", "ASAT", "G6PD", "NHv1", "NHv2", "MPI", "other")
zymo_expt <- exclude_genes_expt(zy_norm, ids = my_genes, method = "keep")
## Row labels are applied by position, so build them from the rows which actually
## remain (in the order they occur in the data) instead of passing my_names as-is:
## my_names has one entry more than there are genes, and the data need not be in
## the same order as my_genes.
zymo_ids <- rownames(exprs(zymo_expt))
zymo_labels <- my_names[match(zymo_ids, my_genes)]
zymo_heatmap <- plot_sample_heatmap(zymo_expt, row_label = zymo_labels)
zymo_heatmap
In contrast, the following plots take the set of genes which are shared among all differential expression methods (|lfc| >= 1.0 and adjp <= 0.05) and use them to make categories of genes which are increased in 2.3 or 2.2.
shared_zymo <- intersect_significant(zy_table_sva)
## Deleting the file excel/intersect_significant.xlsx before writing the tables.
## Error in is.data.frame(x): object 'zy_table_sva' not found
up_shared <- shared_zymo[["ups"]][[1]][["data"]][["all"]]
## Error in eval(expr, envir, enclos): object 'shared_zymo' not found
rownames(up_shared)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'up_shared' not found
upshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(up_shared), method = "keep")
## Error in exclude_genes_expt(zy_norm, ids = rownames(up_shared), method = "keep"): object 'zy_norm' not found
We can plot a quick heatmap to get a sense of the differences observed between the genes which are different between the two zymodemes.
high_23_heatmap <- plot_sample_heatmap(upshared_expt, row_label = rownames(up_shared))
## Error in plot_sample_heatmap(upshared_expt, row_label = rownames(up_shared)): object 'upshared_expt' not found
high_23_heatmap
## Error in eval(expr, envir, enclos): object 'high_23_heatmap' not found
down_shared <- shared_zymo[["downs"]][[1]][["data"]][["all"]]
## Error in eval(expr, envir, enclos): object 'shared_zymo' not found
downshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(down_shared), method = "keep")
## Error in exclude_genes_expt(zy_norm, ids = rownames(down_shared), method = "keep"): object 'zy_norm' not found
high_22_heatmap <- plot_sample_heatmap(downshared_expt, row_label = rownames(down_shared))
## Error in plot_sample_heatmap(downshared_expt, row_label = rownames(down_shared)): object 'downshared_expt' not found
high_22_heatmap
## Error in eval(expr, envir, enclos): object 'high_22_heatmap' not found
Over the last couple of weeks, I redid all the variant searches with a newer, (I think) more sensitive and more specific variant tool. In addition I changed my script which interprets the results so that it is able to extract any tags from it, instead of just the one or two that my previous script handled. In addition, at least in theory it is now able to provide the set of amino acid substitutions for every gene in species without or with introns (not really relevant for Leishmania panamensis).
However, as of this writing, I have not re-performed the same tasks with the 2016 data, primarily because it will require remapping all of the samples. As a result, for the moment I cannot combine the older and newer samples. Thus, any of the following blocks which use the 2016 data are currently disabled.
## Load the older sample sheet (tophat2 count files) and bring the gene IDs of
## both expressionsets into the same form: strip the leading "exon_" and the
## trailing ".E1" (new data) or ".1" (old data).
old_expt <- create_expt(
  "sample_sheets/tmrc2_samples_20191203.xlsx",
  file_column = "tophat2file")

## Newer samples.
tt <- lp_expt[["expressionset"]]
rownames(tt) <- sub("\\.E1$", "", sub("^exon_", "", rownames(tt)))
lp_expt[["expressionset"]] <- tt

## Older samples.
tt <- old_expt[["expressionset"]]
rownames(tt) <- sub("\\.1$", "", sub("^exon_", "", rownames(tt)))
old_expt[["expressionset"]] <- tt

## The scratch variable is no longer needed.
rm(tt)
One other important caveat, we have a group of new samples which have not yet run through the variant search pipeline, so I need to remove them from consideration. Though it looks like they finished overnight…
## The next line drops the samples which are missing the SNP pipeline.
lp_snp <- subset_expt(lp_expt, subset="!is.na(pData(lp_expt)[['freebayessummary']])")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'sampleNames': object 'lp_expt' not found
new_snps <- count_expt_snps(lp_snp, annot_column = "freebayessummary", snp_column="PAIRED")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'lp_snp' not found
nonzero_snps <- exprs(new_snps) != 0
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'new_snps' not found
colSums(nonzero_snps)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colSums': object 'nonzero_snps' not found
## My old_snps is using an older annotation incorrectly, so fix it here by
## copying the annotation string from the new data before combining.
Biobase::annotation(old_snps$expressionset) <- Biobase::annotation(new_snps$expressionset)
## Combine the new and old variant experiments into one data structure.
both_snps <- combine_expts(new_snps, old_snps)
## log2(cpm()) of the filtered, combined variant data.
both_norm <- normalize_expt(both_snps, transform = "log2", convert = "cpm", filter = TRUE)
## strains <- both_norm[["design"]][["strain"]]
## Use the 'strain' metadata column as the experimental condition.
both_strain <- set_expt_conditions(both_norm, fact = "strain")
The data structure ‘both_norm’ now contains our 2016 data along with the newer data collected since 2019.
The following plot shows the SNP profiles of all samples (old and new) where the colors at the top show either the 2.2 strains (orange), 2.3 strains (green), the previous samples (purple), or the various lab strains (pink etc).
new_variant_heatmap <- plot_disheat(new_snps)
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, : object 'new_snps' not found
dev <- pp(file = "images/raw_snp_disheat.png", height=12, width=12)
## Writing png?
new_variant_heatmap$plot
## Error in eval(expr, envir, enclos): object 'new_variant_heatmap' not found
closed <- dev.off()
new_variant_heatmap$plot
## Error in eval(expr, envir, enclos): object 'new_variant_heatmap' not found
The function get_snp_sets() takes the provided metadata factor (in this case ‘condition’) and looks for variants which are exclusive to each element in it. In this case, this is looking for differences between 2.2 and 2.3, as well as the set shared among them.
## Collect the variants by the 'condition' metadata factor.
snp_sets <- get_snp_sets(both_snps, factor = "condition")
## Give the old expressionset the same annotation string as the new one so
## that the two experiments may be combined.
Biobase::annotation(old_expt$expressionset) <- Biobase::annotation(lp_expt$expressionset)
both_expt <- combine_expts(lp_expt, old_expt)
## Cross reference the variant sets against the gene annotations.
snp_genes <- sm(snps_vs_genes(both_expt, snp_sets, expt_name_col = "chromosome"))
## I think we have some metrics here we can plot...
## Restrict the variants to a handful of genes of interest and plot them.
snp_subset <- sm(snp_subset_genes(
  both_expt, both_snps,
  genes = c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
            "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300")))
zymo_heat <- plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset)))
zymo_heat
Didn’t I create a set of densities by chromosome? Oh I think they come in from get_snp_sets()
clinical_sets <- get_snp_sets(new_snps, factor = "clinicalresponse")
## Error in get_snp_sets(new_snps, factor = "clinicalresponse"): object 'new_snps' not found
density_vec <- clinical_sets[["density"]]
## Error in eval(expr, envir, enclos): object 'clinical_sets' not found
chromosome_idx <- grep(pattern = "LpaL", x = names(density_vec))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grep': object 'density_vec' not found
density_df <- as.data.frame(density_vec[chromosome_idx])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'density_vec' not found
density_df[["chr"]] <- rownames(density_df)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'density_df' not found
colnames(density_df) <- c("density_vec", "chr")
## Error in colnames(density_df) <- c("density_vec", "chr"): object 'density_df' not found
ggplot(density_df, aes_string(x = "chr", y = "density_vec")) +
ggplot2::geom_col() +
ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5))
## Error in ggplot(density_df, aes_string(x = "chr", y = "density_vec")): object 'density_df' not found
## clinical_written <- write_variants(new_snps)
clinical_genes <- sm(snps_vs_genes(lp_expt, clinical_sets, expt_name_col = "chromosome"))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'lp_expt' not found
snp_density <- merge(as.data.frame(clinical_genes[["summary_by_gene"]]),
as.data.frame(fData(lp_expt)),
by = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_genes' not found
snp_density <- snp_density[, c(1, 2, 4, 15)]
## Error in eval(expr, envir, enclos): object 'snp_density' not found
colnames(snp_density) <- c("name", "snps", "product", "length")
## Error in colnames(snp_density) <- c("name", "snps", "product", "length"): object 'snp_density' not found
snp_density[["product"]] <- tolower(snp_density[["product"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'tolower': object 'snp_density' not found
snp_density[["length"]] <- as.numeric(snp_density[["length"]])
## Error in eval(expr, envir, enclos): object 'snp_density' not found
snp_density[["density"]] <- snp_density[["snps"]] / snp_density[["length"]]
## Error in eval(expr, envir, enclos): object 'snp_density' not found
snp_idx <- order(snp_density[["density"]], decreasing = TRUE)
## Error in eval(quote(list(...)), env): object 'snp_density' not found
snp_density <- snp_density[snp_idx, ]
## Error in eval(expr, envir, enclos): object 'snp_density' not found
## Drop every gene whose product matches any of the unwanted families.
removers <- c("amastin", "gp63", "leishmanolysin")
## One logical mask per pattern, OR-ed together into a single drop mask.
drop_idx <- Reduce(`|`, lapply(removers, grepl, x = snp_density[["product"]]))
snp_density <- snp_density[!drop_idx, ]
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grepl': object 'snp_density' not found
## Filter these for [A|a]mastin gp63 Leishmanolysin
clinical_snps <- snps_intersections(lp_expt, clinical_sets, chr_column = "chromosome")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'lp_expt' not found
fail_ref_snps <- as.data.frame(clinical_snps[["inters"]][["failure, reference strain"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_snps' not found
fail_ref_snps <- rbind(fail_ref_snps,
as.data.frame(clinical_snps[["inters"]][["failure"]]))
## Error in eval(quote(list(...)), env): object 'fail_ref_snps' not found
cure_snps <- as.data.frame(clinical_snps[["inters"]][["cure"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_snps' not found
head(fail_ref_snps)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'fail_ref_snps' not found
head(cure_snps)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'cure_snps' not found
write.csv(file="csv/cure_variants.txt", x=rownames(cure_snps))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'cure_snps' not found
write.csv(file="csv/fail_variants.txt", x=rownames(fail_ref_snps))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'fail_ref_snps' not found
annot <- fData(lp_expt)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'lp_expt' not found
clinical_interest <- as.data.frame(clinical_snps[["gene_summaries"]][["cure"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'clinical_snps' not found
clinical_interest <- merge(clinical_interest,
as.data.frame(clinical_snps[["gene_summaries"]][["failure, reference strain"]]),
by = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'clinical_interest' not found
rownames(clinical_interest) <- clinical_interest[["Row.names"]]
## Error in eval(expr, envir, enclos): object 'clinical_interest' not found
clinical_interest[["Row.names"]] <- NULL
## Error in clinical_interest[["Row.names"]] <- NULL: object 'clinical_interest' not found
colnames(clinical_interest) <- c("cure_snps","fail_snps")
## Error in colnames(clinical_interest) <- c("cure_snps", "fail_snps"): object 'clinical_interest' not found
annot <- merge(annot, clinical_interest, by = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'annot' not found
rownames(annot) <- annot[["Row.names"]]
## Error in eval(expr, envir, enclos): object 'annot' not found
annot[["Row.names"]] <- NULL
## Error in annot[["Row.names"]] <- NULL: object 'annot' not found
fData(lp_expt$expressionset) <- annot
## Error in eval(expr, envir, enclos): object 'annot' not found
The heatmap produced here should show the variants only for the zymodeme genes.
I am thinking that if we find clusters of locations which are variant, that might provide some PCR testing possibilities.
## Drop the 2.1, 2.4, unknown, and null
pruned_snps <- subset_expt(new_snps, subset="condition=='z2.2'|condition=='z2.3'")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'sampleNames': object 'new_snps' not found
new_sets <- get_snp_sets(pruned_snps, factor = "zymodemecategorical")
## Error in get_snp_sets(pruned_snps, factor = "zymodemecategorical"): object 'pruned_snps' not found
summary(new_sets)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'new_sets' not found
## 1000000: 2.2
## 0100000: 2.3
summary(new_sets[["intersections"]][["10"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'new_sets' not found
write.csv(file="csv/variants_22.csv", x=new_sets[["intersections"]][["10"]])
## Error in is.data.frame(x): object 'new_sets' not found
summary(new_sets[["intersections"]][["01"]])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'new_sets' not found
write.csv(file="csv/variants_23.csv", x=new_sets[["intersections"]][["01"]])
## Error in is.data.frame(x): object 'new_sets' not found
Thus we see that there are 3,553 variants associated with 2.2 and 81,589 associated with 2.3.
The following function uses the positional data to look for sequential mismatches associated with zymodeme in the hopes that there will be some regions which would provide good potential targets for a PCR-based assay.
## Find runs of closely spaced variants which are exclusive to one condition.
##
## The variant identifiers of the chosen intersection are expected to look like
## 'chr_<chromosome>_pos_<position>_ref_...'.  They are sorted by chromosome and
## position, the distance from each variant to its predecessor is computed, and
## runs of variants separated by no more than maximum_separation are collected.
##
## @param snp_sets List with the elements 'intersections' (variant identifiers
##   per set) and 'set_names' (names of those sets), as used below.
## @param conditions Either a numeric index into 'intersections' or the name
##   of a set found in 'set_names'.  NULL chooses the first intersection.
## @param minimum Minimum run length required to report the last variant of a
##   run.
## @param maximum_separation Largest distance between neighboring variants
##   which still counts as sequential.
## @param write_intermediate When TRUE (the default, the previous behavior),
##   write doubles.csv, one_away.csv and two_away.csv to the working directory.
## @return data.frame with the columns 'chr' and 'pos', one row per run of at
##   least 'minimum' variants (the last variant of the run).  The attribute
##   'remaining' holds the G/C-reference variants from close pairs which lie
##   200-1000 nt from the previous close pair on the same chromosome.
sequential_variants <- function(snp_sets, conditions = NULL, minimum = 3,
                                maximum_separation = 3,
                                write_intermediate = TRUE) {
  ## Distance from each (sorted) variant to the previous one on the same
  ## chromosome; the first variant of a chromosome receives its own position.
  ## NOTE(review): a variant within maximum_separation of the chromosome start
  ## therefore counts as 'sequential' -- confirm this is intended.
  distance_to_previous <- function(chr, pos) {
    n <- length(pos)
    if (n == 0L) {
      return(numeric(0))
    }
    same_chr <- c(FALSE, chr[-1L] == chr[-n])
    previous_pos <- c(0, pos[-n])
    ifelse(same_chr, pos - previous_pos, pos)
  }

  if (is.null(conditions)) {
    conditions <- 1
  }
  intersection_sets <- snp_sets[["intersections"]]
  intersection_names <- snp_sets[["set_names"]]
  if (is.numeric(conditions)) {
    chosen_intersection <- conditions
  } else {
    matched <- which(intersection_names == conditions)
    if (length(matched) != 1L) {
      stop("Expected exactly one set named '", conditions, "', found ",
           length(matched), ".", call. = FALSE)
    }
    chosen_intersection <- names(intersection_names)[matched]
  }

  possible_positions <- intersection_sets[[chosen_intersection]]
  if (length(possible_positions) == 0L) {
    ## Nothing to search; the loops used previously failed on empty input.
    message("There are no variants in the chosen set.")
    return(data.frame(chr = character(0), pos = numeric(0)))
  }

  position_table <- data.frame(row.names = possible_positions)
  pat <- "^chr_(.+)_pos_(.+)_ref_.*$"
  variant_ids <- rownames(position_table)
  position_table[["chr"]] <- gsub(pattern = pat, replacement = "\\1", x = variant_ids)
  position_table[["pos"]] <- as.numeric(gsub(pattern = pat, replacement = "\\2", x = variant_ids))
  position_idx <- order(position_table[["chr"]], position_table[["pos"]])
  position_table <- position_table[position_idx, ]
  position_table[["dist"]] <- distance_to_previous(position_table[["chr"]],
                                                   position_table[["pos"]])

  ## Variants which are 1, 2, or 3 nucleotides after their predecessor.
  doubles <- position_table[which(position_table[["dist"]] == 1), ]
  one_away <- position_table[which(position_table[["dist"]] == 2), ]
  two_away <- position_table[which(position_table[["dist"]] == 3), ]
  if (isTRUE(write_intermediate)) {
    write.csv(doubles, "doubles.csv")
    write.csv(one_away, "one_away.csv")
    write.csv(two_away, "two_away.csv")
  }

  ## The union of the three tables above; position_table is already sorted by
  ## chromosome and position, so the subset is too.
  combined <- position_table[position_table[["dist"]] %in% c(1, 2, 3), ]
  combined[["dist_pair"]] <- distance_to_previous(combined[["chr"]],
                                                  combined[["pos"]])
  ## Keep the close pairs which are 200-1000 nt after the previous close pair.
  dist_pair_maximum <- 1000
  dist_pair_minimum <- 200
  dist_pair_idx <- combined[["dist_pair"]] <= dist_pair_maximum &
    combined[["dist_pair"]] >= dist_pair_minimum
  remaining <- combined[which(dist_pair_idx), ]
  ## Keep only the variants whose identifier has a G or C reference base.
  no_weak_idx <- grepl(pattern = "ref_(G|C)", x = rownames(remaining))
  remaining <- remaining[no_weak_idx, ]

  print(head(table(position_table[["dist"]])))
  sequentials <- position_table[["dist"]] <= maximum_separation
  message("There are ", sum(sequentials), " candidate regions.")
  ## Use run length encoding to find the runs of sequential variants and
  ## record the length of each run on the row where it ends.
  rle_result <- rle(sequentials)
  close_runs <- which(rle_result[["values"]])
  run_lengths <- rle_result[["lengths"]]
  run_ends <- cumsum(run_lengths)[close_runs]
  last_sequential <- numeric(nrow(position_table))
  last_sequential[run_ends] <- run_lengths[close_runs]
  position_table[["last_sequential"]] <- last_sequential
  message("The maximum sequential set is: ", max(last_sequential), ".")

  wanted_idx <- position_table[["last_sequential"]] >= minimum
  wanted <- position_table[wanted_idx, c("chr", "pos")]
  ## Previously computed and thrown away; expose it without changing the
  ## shape of the returned table.
  attr(wanted, "remaining") <- remaining
  return(wanted)
}
zymo22_sequentials <- sequential_variants(new_sets, conditions = "z22", minimum=1, maximum_separation=2)
dim(zymo22_sequentials)
## 7 candidate regions for zymodeme 2.2 -- thus I am betting that the reference strain is a 2.2
zymo23_sequentials <- sequential_variants(new_sets, conditions = "z23",
minimum = 2, maximum_separation = 2)
dim(zymo23_sequentials)
## In contrast, there are lots (587) of interesting regions for 2.3!
The first 4 candidate regions from my set of remaining: * Chr Pos. Distance * LpaL13-15 238433 448 * LpaL13-18 142844 613 * LpaL13-29 830342 252 * LpaL13-33 1331507 843
Let's define a couple of terms: * Third: Each of the 4 above positions. * Second: Third - Distance * End: Third + PrimerLen * Start: Second - PrimerLen
In each instance, these are the last positions, so we want to grab three things:
## The four sections below repeat the same steps for each candidate region:
##   third  = position of the candidate (the last position of the pair)
##   second = third - amplicon_length
##   start  = second - primer_length
##   end    = third + primer_length
## The full region (start..end), the 5' piece (start..second) and the reverse
## complement of the 3' piece (third..end) are extracted from the chromosome.
## NOTE(review): if subseq() treats both coordinates as inclusive, each primer
## piece is primer_length + 1 nucleotides long -- confirm this is intended.

## Candidate 1: LpaL13-15, position 238433, distance 448
first_candidate_chr <- genome[["LpaL13_15"]]
primer_length <- 22
amplicon_length <- 448
first_candidate_third <- 238433
first_candidate_second <- first_candidate_third - amplicon_length
first_candidate_start <- first_candidate_second - primer_length
first_candidate_end <- first_candidate_third + primer_length
first_candidate_region <- subseq(first_candidate_chr, first_candidate_start, first_candidate_end)
first_candidate_region
first_candidate_5p <- subseq(first_candidate_chr, first_candidate_start, first_candidate_second)
as.character(first_candidate_5p)
first_candidate_3p <- spgs::reverseComplement(subseq(first_candidate_chr, first_candidate_third, first_candidate_end))
first_candidate_3p
## Candidate 2: LpaL13-18, position 142844, distance 613
second_candidate_chr <- genome[["LpaL13_18"]]
primer_length <- 22
amplicon_length <- 613
second_candidate_third <- 142844
second_candidate_second <- second_candidate_third - amplicon_length
second_candidate_start <- second_candidate_second - primer_length
second_candidate_end <- second_candidate_third + primer_length
second_candidate_region <- subseq(second_candidate_chr, second_candidate_start, second_candidate_end)
second_candidate_region
second_candidate_5p <- subseq(second_candidate_chr, second_candidate_start, second_candidate_second)
as.character(second_candidate_5p)
second_candidate_3p <- spgs::reverseComplement(subseq(second_candidate_chr, second_candidate_third, second_candidate_end))
second_candidate_3p
## Candidate 3: LpaL13-29, position 830342, distance 252
third_candidate_chr <- genome[["LpaL13_29"]]
primer_length <- 22
amplicon_length <- 252
third_candidate_third <- 830342
third_candidate_second <- third_candidate_third - amplicon_length
third_candidate_start <- third_candidate_second - primer_length
third_candidate_end <- third_candidate_third + primer_length
third_candidate_region <- subseq(third_candidate_chr, third_candidate_start, third_candidate_end)
third_candidate_region
third_candidate_5p <- subseq(third_candidate_chr, third_candidate_start, third_candidate_second)
as.character(third_candidate_5p)
third_candidate_3p <- spgs::reverseComplement(subseq(third_candidate_chr, third_candidate_third, third_candidate_end))
third_candidate_3p
## You are a garbage polypyrimidine tract.
## Which is actually interesting if the mutations mess it up.
## Candidate 4: LpaL13-33, position 1331507, distance 843
fourth_candidate_chr <- genome[["LpaL13_33"]]
primer_length <- 22
amplicon_length <- 843
fourth_candidate_third <- 1331507
fourth_candidate_second <- fourth_candidate_third - amplicon_length
fourth_candidate_start <- fourth_candidate_second - primer_length
fourth_candidate_end <- fourth_candidate_third + primer_length
fourth_candidate_region <- subseq(fourth_candidate_chr, fourth_candidate_start, fourth_candidate_end)
fourth_candidate_region
fourth_candidate_5p <- subseq(fourth_candidate_chr, fourth_candidate_start, fourth_candidate_second)
as.character(fourth_candidate_5p)
fourth_candidate_3p <- spgs::reverseComplement(subseq(fourth_candidate_chr, fourth_candidate_third, fourth_candidate_end))
fourth_candidate_3p
I made a fun little function which should find regions which have lots of variants associated with a given experimental factor.
pheno <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'sampleNames': object 'lp_expt' not found
pheno <- subset_expt(pheno, subset = "!is.na(pData(pheno)[['bcftable']])")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'sampleNames': object 'pheno' not found
pheno_snps <- sm(count_expt_snps(pheno, annot_column = "bcftable"))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'pheno' not found
fun_stuff <- snp_density_primers(
pheno_snps,
bsgenome = "BSGenome.Leishmania.panamensis.MHOMCOL81L13.v53",
gff = "reference/TriTrypDB-53_LpanamensisMHOMCOL81L13.gff")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'pheno_snps' not found
drop_scaffolds <- grepl(x = rownames(fun_stuff$favorites), pattern = "SCAF")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grepl': error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'fun_stuff' not found
favorite_primer_regions <- fun_stuff[["favorites"]][!drop_scaffolds, ]
## Error in eval(expr, envir, enclos): object 'fun_stuff' not found
favorite_primer_regions[["bin"]] <- rownames(favorite_primer_regions)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'favorite_primer_regions' not found
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Biostrings':
##
## collapse, intersect, setdiff, setequal, union
## The following object is masked from 'package:XVector':
##
## slice
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:hpgltools':
##
## combine
## The following object is masked from 'package:testthat':
##
## matches
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
favorite_primer_regions <- favorite_primer_regions %>%
relocate(bin)
## Error in relocate(., bin): object 'favorite_primer_regions' not found
Here is my note from our meeting:
Cross reference primers to DE genes of 2.2/2.3 and/or resistance/susceptible, add a column to the primer spreadsheet with the DE genes (in retrospect I am guessing this actually means to put the logFC as a column).
One nice thing, I did a semantic removal on the lp_expt, so the set of logFC/pvalues should not have any of the offending types; thus I should be able to automagically get rid of them in the merge.
logfc <- zy_table_sva[["data"]][["z23_vs_z22"]]
## Error in eval(expr, envir, enclos): object 'zy_table_sva' not found
logfc_columns <- logfc[, c("deseq_logfc", "deseq_adjp")]
## Error in eval(expr, envir, enclos): object 'logfc' not found
colnames(logfc_columns) <- c("z23_logfc", "z23_adjp")
## Error in colnames(logfc_columns) <- c("z23_logfc", "z23_adjp"): object 'logfc_columns' not found
new_table <- merge(favorite_primer_regions, logfc_columns,
by.x = "closest_gene_before_id", by.y = "row.names")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'favorite_primer_regions' not found
sus <- sus_table_sva[["data"]][["sensitive_vs_resistant"]]
## Error in eval(expr, envir, enclos): object 'sus_table_sva' not found
sus_columns <- sus[, c("deseq_logfc", "deseq_adjp")]
## Error in eval(expr, envir, enclos): object 'sus' not found
colnames(sus_columns) <- c("sus_logfc", "sus_adjp")
## Error in colnames(sus_columns) <- c("sus_logfc", "sus_adjp"): object 'sus_columns' not found
new_table <- merge(new_table, sus_columns,
by.x = "closest_gene_before_id", by.y = "row.names") %>%
relocate(bin)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'merge': object 'new_table' not found
written <- write_xlsx(data=new_table,
excel="excel/favorite_primers_xref_zy_sus.xlsx")
## Error in write_xlsx(data = new_table, excel = "excel/favorite_primers_xref_zy_sus.xlsx"): object 'new_table' not found
We can cross reference the variants against the zymodeme status and plot a heatmap of the results and hopefully see how they separate.
## pruned_snps <- subset_expt(new_snps, subset="condition=='z2.2'|condition=='z2.3'")
snp_genes <- sm(snps_vs_genes(lp_expt, new_sets, expt_name_col = "chromosome"))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'lp_expt' not found
##new_zymo_norm <- normalize_expt(pruned_snps, filter = TRUE, convert = "cpm", norm = "quant", transform = TRUE)
##new_zymo_norm <- set_expt_conditions(new_zymo_norm, fact = "zymodemecategorical")
clinical_colors_v2 <- list(
"z22" = "#0000cc",
"z23" = "#cc0000")
new_zymo_norm <- normalize_expt(pruned_snps, filter = TRUE, convert = "cpm", norm = "quant", transform = TRUE) %>%
set_expt_conditions(fact = "zymodemecategorical") %>%
set_expt_colors(clinical_colors_v2)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': error in evaluating the argument 'object' in selecting a method for function 'pData': object 'pruned_snps' not found
zymo_heat <- plot_disheat(new_zymo_norm)
## Error in plot_heatmap(expt_data, expt_colors = expt_colors, expt_design = expt_design, : object 'new_zymo_norm' not found
dev <- pp(file = "images/onlyz22_z23_snp_heatmap.pdf", width=12, height=12)
zymo_heat[["plot"]]
## Error in eval(expr, envir, enclos): object 'zymo_heat' not found
closed <- dev.off()
zymo_heat[["plot"]]
## Error in eval(expr, envir, enclos): object 'zymo_heat' not found
Now let us try to make a heatmap which includes some of the annotation data.
des <- both_norm[["design"]]
## Error in eval(expr, envir, enclos): object 'both_norm' not found
undef_idx <- is.na(des[["strain"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[undef_idx, "strain"] <- "unknown"
## Error in des[undef_idx, "strain"] <- "unknown": object 'des' not found
##hmcols <- colorRampPalette(c("yellow","black","darkblue"))(256)
correlations <- hpgl_cor(exprs(both_norm))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'both_norm' not found
zymo_missing_idx <- is.na(des[["zymodemecategorical"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[["zymodemecategorical"]] <- as.character(des[["zymodemecategorical"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[["clinicalcategorical"]] <- as.character(des[["clinicalcategorical"]])
## Error in eval(expr, envir, enclos): object 'des' not found
des[zymo_missing_idx, "zymodemecategorical"] <- "unknown"
## Error in des[zymo_missing_idx, "zymodemecategorical"] <- "unknown": object 'des' not found
mydendro <- list(
"clustfun" = hclust,
"lwd" = 2.0)
col_data <- as.data.frame(des[, c("zymodemecategorical", "clinicalcategorical")])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'des' not found
unknown_clinical <- is.na(col_data[["clinicalcategorical"]])
## Error in eval(expr, envir, enclos): object 'col_data' not found
row_data <- as.data.frame(des[, c("strain")])
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'des' not found
colnames(col_data) <- c("zymodeme", "outcome")
## Error in colnames(col_data) <- c("zymodeme", "outcome"): object 'col_data' not found
col_data[unknown_clinical, "outcome"] <- "undefined"
## Error in col_data[unknown_clinical, "outcome"] <- "undefined": object 'col_data' not found
colnames(row_data) <- c("strain")
## Error in colnames(row_data) <- c("strain"): object 'row_data' not found
myannot <- list(
"Col" = list("data" = col_data),
"Row" = list("data" = row_data))
## Error in eval(expr, envir, enclos): object 'col_data' not found
myclust <- list("cuth" = 1.0,
"col" = BrewerClusterCol)
mylabs <- list(
"Row" = list("nrow" = 4),
"Col" = list("nrow" = 4))
hmcols <- colorRampPalette(c("darkblue", "beige"))(240)
zymo_annot_heat <- annHeatmap2(
correlations,
dendrogram = mydendro,
annotation = myannot,
cluster = myclust,
labels = mylabs,
## The following controls if the picture is symmetric
scale = "none",
col = hmcols)
## Error in annHeatmap2(correlations, dendrogram = mydendro, annotation = myannot, : object 'correlations' not found
dev <- pp(file = "images/dendro_heatmap.png", height = 20, width = 20)
## Writing png?
plot(zymo_annot_heat)
## Error in plot(zymo_annot_heat): object 'zymo_annot_heat' not found
closed <- dev.off()
plot(zymo_annot_heat)
## Error in plot(zymo_annot_heat): object 'zymo_annot_heat' not found
Print the larger heatmap so that all the labels appear. Keep in mind that as we get more samples, this image needs to continue getting bigger.
big heatmap
xref_prop <- table(pheno_snps[["conditions"]])
## Error in eval(quote(list(...)), env): object 'pheno_snps' not found
pheno_snps$conditions
## Error in eval(expr, envir, enclos): object 'pheno_snps' not found
idx_tbl <- exprs(pheno_snps) > 5
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'pheno_snps' not found
new_tbl <- data.frame(row.names = rownames(exprs(pheno_snps)))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'pheno_snps' not found
## For every condition, compute the fraction of its samples in which each
## variant passes the threshold recorded in idx_tbl.
for (n in names(xref_prop)) {
  new_tbl[[n]] <- 0
  idx_cols <- which(pheno_snps[["conditions"]] == n)
  ## drop = FALSE keeps a matrix when a condition has only one sample;
  ## without it the subset collapses to a vector and rowSums() fails.
  ## xref_prop[[n]] extracts the plain sample count for this condition.
  prop_col <- rowSums(idx_tbl[, idx_cols, drop = FALSE]) / xref_prop[[n]]
  new_tbl[n] <- prop_col
}
## Error in eval(expr, envir, enclos): object 'xref_prop' not found
keepers <- grepl(x = rownames(new_tbl), pattern = "LpaL13")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'grepl': error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'new_tbl' not found
new_tbl <- new_tbl[keepers, ]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[["strong22"]] <- 1.001 - new_tbl[["z2.2"]]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[["strong23"]] <- 1.001 - new_tbl[["z2.3"]]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
s22_na <- new_tbl[["strong22"]] > 1
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[s22_na, "strong22"] <- 1
## Error in new_tbl[s22_na, "strong22"] <- 1: object 'new_tbl' not found
s23_na <- new_tbl[["strong23"]] > 1
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
new_tbl[s23_na, "strong23"] <- 1
## Error in new_tbl[s23_na, "strong23"] <- 1: object 'new_tbl' not found
new_tbl[["SNP"]] <- rownames(new_tbl)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'rownames': object 'new_tbl' not found
new_tbl[["Chromosome"]] <- gsub(x = new_tbl[["SNP"]], pattern = "chr_(.*)_pos_.*", replacement = "\\1")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': object 'new_tbl' not found
new_tbl[["Position"]] <- gsub(x = new_tbl[["SNP"]], pattern = ".*_pos_(\\d+)_.*", replacement = "\\1")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'gsub': object 'new_tbl' not found
new_tbl <- new_tbl[, c("SNP", "Chromosome", "Position", "strong22", "strong23")]
## Error in eval(expr, envir, enclos): object 'new_tbl' not found
library(CMplot)
## Much appreciate for using CMplot.
## Full description, Bug report, Suggestion and the latest codes:
## https://github.com/YinLiLin/CMplot
## NOTE: the last recorded run of this section stopped with
## "object 'new_tbl' not found"; new_tbl has to be built before plotting.

## First plot: the table without the strong22 column, binned at 100000.
simplify <- new_tbl[, colnames(new_tbl) != "strong22", drop = FALSE]
CMplot(simplify, bin.size = 100000)

## Second plot: both score columns, plot.type "m" with multracks, two
## threshold lines, and the result written out as a 300 dpi jpg.
CMplot(
  new_tbl,
  plot.type = "m",
  multracks = TRUE,
  threshold = c(0.01, 0.05),
  threshold.lwd = c(1, 1),
  threshold.col = c("black", "grey"),
  amplify = TRUE,
  bin.size = 10000,
  chr.den.col = c("darkgreen", "yellow", "red"),
  signal.col = c("red", "green", "blue"),
  signal.cex = 1,
  file = "jpg",
  memo = "",
  dpi = 300,
  file.output = TRUE,
  verbose = TRUE
)
This tool (MatrixEQTL) looks a little opaque, but it provides sample data whose structure makes sense to me, so it should be pretty easy to recapitulate with our data.
## For this, let us use the 'new_snps' data structure.
## Caveat here: the covariates need to be coerced to numbers, so every
## categorical column is replaced by the integer codes of its factor levels.
my_covariates <- pData(new_snps)[, c("zymodemecategorical", "clinicalcategorical")]
my_covariates[] <- lapply(my_covariates,
                          function(x) as.numeric(as.factor(x)))
## Transposed so that covariates are rows and samples are columns.
my_covariates <- t(my_covariates)

## Gene locations: id, chromosome, start and end taken from the annotations.
my_geneloc <- fData(lp_expt)[, c("gid", "chromosome", "start", "end")]
colnames(my_geneloc) <- c("geneid", "chr", "left", "right")

## Expression: filtered, cpm-converted, log2-transformed values, restricted to
## the samples which (after lower-casing their names) also appear in new_snps.
my_ge <- exprs(normalize_expt(lp_expt, transform = "log2", filter = TRUE,
                              convert = "cpm"))
used_samples <- tolower(colnames(my_ge)) %in% colnames(exprs(new_snps))
## drop = FALSE keeps my_ge a matrix even if only one sample is shared.
my_ge <- my_ge[, used_samples, drop = FALSE]
## NOTE(review): nothing here reorders the columns of my_ge to match
## exprs(new_snps), nor trims the SNP/covariate samples to those in my_ge.
## Matrix_eQTL presumably requires identical sample columns in the same
## order across all three inputs -- confirm before trusting the results.
## SNP locations: one row per SNP with its identifier, chromosome, position.
## Note that 'rownames = ' creates a *column* called "rownames" holding the
## SNP identifiers; it does not set the row names of the data frame (those
## stay 1..n).  The identifiers therefore have to be read from that column:
## parsing rownames(my_snpsloc) would only ever see "1", "2", ...
my_snpsloc <- data.frame(rownames = rownames(exprs(new_snps)),
                         stringsAsFactors = FALSE)
## Oh, caveat here: Because of the way I stored the data,
## I could have duplicate rows which presumably will make matrixEQTL sad
## The optional "_" after "pos" keeps the separator out of the captured
## position for names of the form chr_<chr>_pos_<pos>_ref_...
snp_name_pattern <- "^chr_(.+)_pos_?(.+)_ref_.*$"
my_snpsloc[["chr"]] <- gsub(pattern = snp_name_pattern, replacement = "\\1",
                            x = my_snpsloc[["rownames"]])
my_snpsloc[["pos"]] <- gsub(pattern = snp_name_pattern, replacement = "\\2",
                            x = my_snpsloc[["rownames"]])
test <- duplicated(my_snpsloc)
## Each duplicated row would be another variant at that position;
## so in theory we would do a rle to number them I am guessing
## However, I do not have different variants so I think I can ignore this for the moment
## but will need to make my matrix either 0 or 1.
## NOTE(review): the identifier column is part of the comparison, so only
## rows with identical identifiers are counted as duplicates here.
if (sum(test) > 0) {
  ## Report the number of duplicated rows (sum(duplicated) would try to sum
  ## the duplicated() function itself and fail).
  message("There are: ", sum(test), " duplicated entries.")
  keep_idx <- ! test
  my_snpsloc <- my_snpsloc[keep_idx, ]
}

## Reduce the SNP matrix to presence/absence: every value above 0 becomes 1.
my_snps <- exprs(new_snps)
one_idx <- my_snps > 0
my_snps[one_idx] <- 1
## Ok, at this point I think I have all the pieces which this method wants...
## Oh, no I guess not; it actually wants the data as a set of filenames...
library(MatrixEQTL)
## Matrix_eQTL is given file names, so write every input table to eqtl/.
## write.table() does not create missing directories; make sure it exists.
dir.create("eqtl", showWarnings = FALSE, recursive = TRUE)

## Write one table as a tab-separated file with a header row, row names,
## quoted strings and "NA" for missing values.
write_eqtl_table <- function(x, path) {
  write.table(x, path, na = "NA", col.names = TRUE, row.names = TRUE,
              sep = "\t", quote = TRUE)
}

write_eqtl_table(my_snps, "eqtl/snps.tsv")
write_eqtl_table(my_snpsloc, "eqtl/snpsloc.tsv")
write_eqtl_table(as.data.frame(my_ge), "eqtl/ge.tsv")
write_eqtl_table(as.data.frame(my_geneloc), "eqtl/geneloc.tsv")
write_eqtl_table(as.data.frame(my_covariates), "eqtl/covariates.tsv")
## Alternative left for reference: readr::write_tsv(as.data.frame(x), path)
## Settings for the Matrix_eQTL run.

## Model to fit: one of modelANOVA, modelLINEAR, or modelLINEAR_CROSS.
useModel <- modelLINEAR

## Input files: genotypes, SNP locations, expression, gene locations and
## covariates.
SNP_file_name <- "eqtl/snps.tsv"
snps_location_file_name <- "eqtl/snpsloc.tsv"
expression_file_name <- "eqtl/ge.tsv"
gene_location_file_name <- "eqtl/geneloc.tsv"
covariates_file_name <- "eqtl/covariates.tsv"

## Output files for the cis and trans results (temporary files).
output_file_name_cis <- tempfile()
output_file_name_tra <- tempfile()

## Only associations significant at these levels will be saved.
pvOutputThreshold_cis <- 0.1
pvOutputThreshold_tra <- 0.1

## Error covariance matrix; numeric() means identity.
## Alternative: errorCovariance <- read.table("Sample_Data/errorCovariance.txt")
errorCovariance <- numeric()

## Maximum distance for a gene-SNP pair to count as local (cis).
cisDist <- 1e6
## Genotypes: tab-delimited, "NA" marks missing values, skip one row of
## column labels and one column of row labels, read 2,000 rows per slice.
snps <- SlicedData$new()
snps$fileDelimiter <- "\t"
snps$fileOmitCharacters <- "NA"
snps$fileSkipRows <- 1
snps$fileSkipColumns <- 1
snps$fileSliceSize <- 2000
snps$LoadFile(SNP_file_name)

## Gene expression: same file layout and slice size as the genotypes.
gene <- SlicedData$new()
gene$fileDelimiter <- "\t"
gene$fileOmitCharacters <- "NA"
gene$fileSkipRows <- 1
gene$fileSkipColumns <- 1
gene$fileSliceSize <- 2000
gene$LoadFile(expression_file_name)

## Covariates: same file layout; no slice size is set here, and the file is
## only loaded when a covariate file name has been provided.
cvrt <- SlicedData$new()
cvrt$fileDelimiter <- "\t"
cvrt$fileOmitCharacters <- "NA"
cvrt$fileSkipRows <- 1
cvrt$fileSkipColumns <- 1
if (length(covariates_file_name) > 0) {
  cvrt$LoadFile(covariates_file_name)
}
## Read the SNP and gene location tables back in, then run the analysis.
snpspos <- read.table(snps_location_file_name,
                      header = TRUE, stringsAsFactors = FALSE)
genepos <- read.table(gene_location_file_name,
                      header = TRUE, stringsAsFactors = FALSE)

me <- Matrix_eQTL_main(
  ## Data sets and model.
  snps = snps,
  gene = gene,
  cvrt = cvrt,
  useModel = useModel,
  errorCovariance = errorCovariance,
  ## Trans results: output file and p-value cutoff.
  output_file_name = output_file_name_tra,
  pvOutputThreshold = pvOutputThreshold_tra,
  ## Cis results: output file, p-value cutoff, locations and distance.
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  ## Reporting options.
  verbose = TRUE,
  pvalue.hist = "qqplot",
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE
)
## Unless a variable named skip_load exists and is TRUE, report the session
## details and save to `savefile`.  get0() returns NULL when skip_load is not
## defined, so the default is to run this block.
if (!isTRUE(get0("skip_load"))) {
## Print the R session and package versions.
pander::pander(sessionInfo())
## get_git_commit(), sm() and saveme() are helpers defined outside this chunk
## (presumably from hpgltools -- confirm).
message(paste0("This is hpgltools commit: ", get_git_commit()))
message(paste0("Saving to ", savefile))
tmp <- sm(saveme(filename = savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset ee4d224084d5338793b0f4f7be352c1473426849
## This is hpgltools commit: Mon Jun 6 13:12:47 2022 -0400: ee4d224084d5338793b0f4f7be352c1473426849
## Saving to tmrc2_02sample_estimation_v202206.rda.xz
## NOTE(review): presumably restores what saveme() wrote to `savefile`;
## confirm that loading immediately after saving is intended.
tmp <- loadme(filename = savefile)