This document is intended to provide a general overview of the TMRC2 samples which have thus far been sequenced. In some cases, this includes only those samples starting in 2019; in other instances I am including our previous (2015-2016) samples.
In all cases the processing performed was:
The analyses in this document use the matrices of counts/gene from #3 and variants/position from #4 in order to provide some images and metrics describing the samples we have sequenced so far.
Everything which follows depends on the existing TriTrypDB annotations, revision 46, circa 2019. The following block loads a database of these annotations and turns it into a matrix where the rows are genes and the columns are all the annotation types provided by TriTrypDB.
The same database was used to create a matrix of orthologous genes between L.panamensis and all of the other species in the TriTrypDB.
## Attach EuPathDB and the L. panamensis OrgDb package. sm() is a project
## helper; presumably it silences the startup chatter -- confirm against its
## definition. The return value is parked in the scratch variable 'tt'.
tt <- sm(library(EuPathDB))
tt <- sm(library(org.Lpanamensis.MHOMCOL81L13.v46.eg.db))
pan_db <- org.Lpanamensis.MHOMCOL81L13.v46.eg.db
## Every column name the OrgDb offers.
all_fields <- columns(pan_db)
## Request six annotation columns keyed by "gid" and keep the 'genes'
## element of whatever load_orgdb_annotations() returns.
all_lp_annot <- sm(load_orgdb_annotations(
pan_db,
keytype = "gid",
fields = c("annot_gene_entrez_id", "annot_gene_name",
"annot_strand", "annot_chromosome", "annot_cds_length",
"annot_gene_product")))$genes
## GO annotations from the same database; used later by simple_goseq().
lp_go <- sm(load_orgdb_go(pan_db))
## Two-column ID/length table, also handed to simple_goseq() later.
lp_lengths <- all_lp_annot[, c("gid", "annot_cds_length")]
colnames(lp_lengths) <- c("ID", "length")
## Lower-case the product descriptions so later pattern matches
## ("amastin", "gp63", ...) do not depend on capitalisation.
all_lp_annot[["annot_gene_product"]] <- tolower(all_lp_annot[["annot_gene_product"]])
orthos <- sm(EuPathDB::extract_eupath_orthologs(db = pan_db))
## The annotation table passed to create_expt() as gene_info.
hisat_annot <- all_lp_annot
## Disabled: would rename rows to the "exon_<id>.E1" form.
## rownames(hisat_annot) <- paste0("exon_", rownames(hisat_annot), ".E1")
Resequence samples: TMRC20002, TMRC20006, TMRC20004 (maybe TMRC20008 and TMRC20029)
The process of sample estimation takes two primary inputs:
An expressionset is a data structure used in R to examine RNASeq data. It is comprised of annotations, metadata, and expression data. In the case of our processing pipeline, the location of the expression data is provided by the filenames in the metadata.
The first lines of the following block create the Expressionset. All of the following lines perform various normalizations and generate plots from it.
The following samples are much lower coverage:
## Build the expressionset from the sample sheet, then apply each
## post-processing step as its own statement instead of one pipe chain.
sample_sheet <- glue::glue("sample_sheets/tmrc2_samples_20210528.xlsx")
lp_expt <- sm(create_expt(sample_sheet,
                          gene_info = hisat_annot,
                          id_column = "hpglidentifier",
                          file_column = "lpanamensisv36hisatfile"))
## Use the zymodeme column as the experimental condition.
lp_expt <- set_expt_conditions(lp_expt, fact = "zymodemecategorical")
## Drop the samples which fail the nonzero = 8600 cutoff.
lp_expt <- subset_expt(lp_expt, nonzero = 8600)
## Filter genes by matching these terms against the product column.
lp_expt <- semantic_expt_filter(lp_expt,
                                semantic = c("amastin", "gp63", "leishmanolysin"),
                                semantic_column = "annot_gene_product")
## The samples (and read coverage) removed when filtering 8600 non-zero genes are:
## TMRC20002 TMRC20004 TMRC20006 TMRC20029 TMRC20008
## 11681227 564812 6670348 1658096 6249790
## subset_expt(): There were 48, now there are 43 samples.
## semantic_expt_filter(): Removed 68 genes.
## Library-size plot for every sample.
libsizes <- plot_libsize(lp_expt)
libsizes$plot
## I think samples 7,10 should be removed at minimum, probably also 9,11
## Non-zero gene plot for every sample.
nonzero <- plot_nonzero(lp_expt)
nonzero$plot
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Per-sample count distributions (the echoed message below shows this is
## drawn on a log scale after adding 1).
plot_boxplot(lp_expt)
## 2826 entries are 0. We are on a log scale, adding 1 to the data.
## Two views from plot_libsize_prepost(): 'lowgene_plot' and 'count_plot'.
filter_plot <- plot_libsize_prepost(lp_expt)
filter_plot$lowgene_plot
## Warning: Using alpha for a discrete variable is not advised.
filter_plot$count_plot
Najib’s favorite plots are of course the PCA/TSNE. These are nice to look at in order to get a sense of the relationships between samples. They also provide a good opportunity to see what happens when one applies different normalizations, surrogate analyses, filters, etc. In addition, one may set different experimental factors as the primary ‘condition’ (usually the color of plots) and surrogate ‘batches’.
Column ‘Q’ in the sample sheet, make a categorical version of it with these parameters:
## Turn the historical susceptibility measurement into a category:
##   NA             -> "unknown"
##   <= 0.35        -> "resistant"
##   (0.35, 0.49)   -> "ambiguous"
##   >= 0.49        -> "sensitive"
## The bins are contiguous, so a value such as 0.355 or 0.485 can no longer
## fall between two bins and be left behind as its numeric string.
starting <- as.numeric(pData(lp_expt)[["susceptibilityinfectionreduction32ugmlsbvhistoricaldata"]])
## Start with every sample as "unknown"; only non-missing values are
## re-labelled below.
sus_categorical <- rep("unknown", length(starting))
na_idx <- is.na(starting)
resist_idx <- !na_idx & starting <= 0.35
indeterminant_idx <- !na_idx & starting > 0.35 & starting < 0.49
susceptible_idx <- !na_idx & starting >= 0.49
sus_categorical[resist_idx] <- "resistant"
sus_categorical[indeterminant_idx] <- "ambiguous"
sus_categorical[susceptible_idx] <- "sensitive"
## Store the category in the sample metadata so it can be used as a factor.
pData(lp_expt$expressionset)[["sus_category"]] <- sus_categorical
## Use the susceptibility category (passed as a vector, not a column name)
## as the batch factor; the condition is unchanged from lp_expt.
clinical_samples <- lp_expt %>%
set_expt_batches(fact = sus_categorical)
## Filtered, cpm-converted, quantile-normalized, log2 data with no batch
## adjustment (batch = FALSE).
clinical_norm <- sm(normalize_expt(clinical_samples, norm = "quant", transform = "log2",
convert = "cpm", batch = FALSE, filter = TRUE))
zymo_pca <- plot_pca(clinical_norm, plot_title = "PCA of parasite expression values")
## pp() is a project helper; here it is given an output file and the plot.
pp(file = "images/zymo_pca_sus_shape.png", image = zymo_pca$plot)
## 3D view built from the PCA result above.
zymo_3dpca <- plot_3d_pca(zymo_pca)
zymo_3dpca$plot
zymo_tsne <- plot_tsne(clinical_norm, plot_title = "TSNE of parasite expression values")
zymo_tsne$plot
## Same samples, this time with batch = "svaseq" and without the quantile
## normalization used for clinical_norm.
clinical_nb <- normalize_expt(clinical_samples, convert = "cpm", transform = "log2",
filter = TRUE, batch = "svaseq")
## Removing 146 low-count genes (8564 remaining).
## batch_counts: Before batch/surrogate estimation, 507 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 1893 entries are 0<x<1: 1%.
## Setting 129 low elements to zero.
## transform_counts: Found 129 values equal to 0, adding 1 to the matrix.
## PCA and TSNE of the svaseq-adjusted data; the PCA is also written to disk.
clinical_nb_pca <- plot_pca(clinical_nb, plot_title = "PCA of parasite expression values")
pp(file = "images/clinical_nb_pca_sus_shape.png", image = clinical_nb_pca$plot)
clinical_nb_tsne <- plot_tsne(clinical_nb, plot_title = "TSNE of parasite expression values")
clinical_nb_tsne$plot
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Correlation heatmap of the normalized (not svaseq-adjusted) data.
## NOTE(review): the title literal spans three source lines, so it contains
## embedded newlines and a trailing newline -- confirm this is intentional.
corheat <- plot_corheat(clinical_norm, plot_title = "Correlation heatmap of parasite
expression values
")
corheat$plot
## Print the 'plot' element returned by plot_sm() for the same data.
plot_sm(clinical_norm)$plot
## Performing correlation.
## Cure/fail view: condition = clinical category, batch = susceptibility
## category (again passed as a vector).
cf_expt <- set_expt_conditions(lp_expt, fact = "clinicalcategorical") %>%
set_expt_batches(fact = sus_categorical)
## Filtered, cpm, quantile-normalized, log2; no batch adjustment requested.
cf_norm <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
norm = "quant", filter = TRUE)
## Removing 146 low-count genes (8564 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
start_cf <- plot_pca(cf_norm, plot_title = "PCA of parasite expression values")
pp(file = "images/cf_sus_shape.png", image = start_cf$plot)
## As cf_norm, but adding batch = "svaseq" on top of quantile normalization;
## the echoed warning below cautions that the two do not always combine well.
cf_nb <- normalize_expt(cf_expt, convert = "cpm", transform = "log2",
norm = "quant", filter = TRUE, batch = "svaseq")
## Warning in normalize_expt(cf_expt, convert = "cpm", transform = "log2", :
## Quantile normalization and sva do not always play well together.
## Removing 146 low-count genes (8564 remaining).
## batch_counts: Before batch/surrogate estimation, 2 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 2294 entries are 0<x<1: 1%.
## Setting 94 low elements to zero.
## transform_counts: Found 94 values equal to 0, adding 1 to the matrix.
cf_nb_pca <- plot_pca(cf_nb, plot_title = "PCA of parasite expression values")
## NOTE(review): the sibling file is "cf_sus_shape.png"; "share" here may be
## a typo -- confirm before renaming, since other documents may link to it.
pp(file = "images/cf_sus_share_nb.png", image = cf_nb_pca$plot)
## NOTE(review): same input and same arguments (in a different order) as the
## cf_norm computed earlier in this document; this recomputation looks
## redundant.
cf_norm <- normalize_expt(cf_expt, transform = "log2", convert = "cpm",
filter = TRUE, norm = "quant")
## Removing 146 low-count genes (8564 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
## Relate the first six principal components to four metadata factors.
test <- pca_information(cf_norm,
expt_factors = c("clinicalcategorical", "zymodemecategorical",
"pathogenstrain", "passagenumber"),
num_components = 6, plot_pcas = TRUE)
## Factor-by-component table of ANOVA p-values (echoed below).
test$anova_p
## PC1 PC2 PC3 PC4 PC5 PC6
## clinicalcategorical 0.000e+00 0.00000 0.000e+00 0.0000 0.0000 0.0000
## zymodemecategorical 2.065e-06 0.11542 3.763e-01 0.2677 0.3496 0.7188
## pathogenstrain 9.790e-01 0.08496 5.768e-06 0.4458 0.5482 0.9486
## passagenumber 0.000e+00 0.00000 0.000e+00 0.0000 0.0000 0.0000
test$cor_heatmap
## Susceptibility view: condition = the sus_category metadata column,
## batch = zymodeme.
sus_expt <- set_expt_conditions(lp_expt, fact = "sus_category") %>%
set_expt_batches(fact = "zymodemecategorical")
## Filtered, cpm, quantile-normalized, log2; no batch adjustment requested.
sus_norm <- normalize_expt(sus_expt, transform = "log2", convert = "cpm",
norm = "quant", filter = TRUE)
## Removing 146 low-count genes (8564 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
sus_pca <- plot_pca(sus_norm, plot_title = "PCA of parasite expression values")
sus_pca$plot
## svaseq-adjusted version (no quantile normalization in this call).
sus_nb <- normalize_expt(sus_expt, transform = "log2", convert = "cpm",
batch = "svaseq", filter = TRUE)
## Removing 146 low-count genes (8564 remaining).
## batch_counts: Before batch/surrogate estimation, 507 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 1893 entries are 0<x<1: 1%.
## Setting 109 low elements to zero.
## transform_counts: Found 109 values equal to 0, adding 1 to the matrix.
sus_nb_pca <- plot_pca(sus_nb, plot_title = "PCA of parasite expression values")
pp(file = "images/sus_nb_pca.png", image = sus_nb_pca$plot)
At this time, we do not have very many samples, so the set of metrics/plots is fairly limited. There is really only one factor in the metadata which we can use for performing differential expression analyses, the ‘zymodeme’.
The following sections perform a series of analyses which seek to elucidate differences between the zymodemes 2.2 and 2.3 either through differential expression or variant profiles.
TODO: Do this with and without sva and compare the results.
## Keep only the samples whose condition is z2.2 or z2.3.
zy_expt <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
## subset_expt(): There were 43, now there are 23 samples.
## Filtered, cpm-converted, quantile-normalized copy; used later for heatmaps.
zy_norm <- normalize_expt(zy_expt, filter = TRUE, convert = "cpm", norm = "quant")
## Removing 167 low-count genes (8543 remaining).
## Pairwise contrasts WITHOUT a batch/surrogate term. This call previously
## passed model_batch = "svaseq", which made it a duplicate of zy_de and
## defeated the with/without-sva comparison.
zy_de_nobatch <- sm(all_pairwise(zy_expt, filter = TRUE, model_batch = FALSE))
## Pairwise contrasts with svaseq surrogate estimates in the model.
zy_de <- sm(all_pairwise(zy_expt, filter = TRUE, model_batch = "svaseq"))
## Combine the per-method results and pull out the significant genes; both
## steps also write an excel workbook whose name embeds 'ver' (defined
## outside this section).
zy_table <- sm(combine_de_tables(zy_de, excel = glue::glue("excel/zy_tables-v{ver}.xlsx")))
zy_sig <- sm(extract_significant_genes(zy_table, excel = glue::glue("excel/zy_sig-v{ver}.xlsx")))
## DESeq2 MA plot for the z2.3 vs z2.2 contrast.
zy_table[["plots"]][["z23_vs_z22"]][["deseq_ma_plots"]][["plot"]]
In contrast, we can search for genes which are differentially expressed with respect to cure/failure status.
## Pairwise contrasts across the clinical categories of cf_expt with svaseq
## in the model, followed by the combined and significant tables; both of
## the latter also write an excel workbook whose name embeds 'ver'.
cf_de <- sm(all_pairwise(cf_expt, filter = TRUE, model_batch = "svaseq"))
cf_table <- sm(combine_de_tables(cf_de, excel = glue::glue("excel/cf_tables-v{ver}.xlsx")))
cf_sig <- sm(extract_significant_genes(cf_table, excel = glue::glue("excel/cf_sig-v{ver}.xlsx")))
Finally, we can use our category of susceptibility and look for genes which change from sensitive to resistant. Keep in mind, though, that for the moment we have a lot of ambiguous and unknown strains.
## Pairwise contrasts across the susceptibility categories of sus_expt with
## svaseq in the model, followed by the combined and significant tables;
## both of the latter also write an excel workbook whose name embeds 'ver'.
sus_de <- sm(all_pairwise(sus_expt, filter = TRUE, model_batch = "svaseq"))
sus_table <- sm(combine_de_tables(sus_de, excel = glue::glue("excel/sus_tables-v{ver}.xlsx")))
sus_sig <- sm(extract_significant_genes(sus_table, excel = glue::glue("excel/sus_sig-v{ver}.xlsx")))
Now let us look for ontology categories which are increased in the 2.3 samples followed by the 2.2 samples.
## Gene categories more represented in the 2.3 group.
## Input is the first DESeq2 "ups" table from zy_sig, with the GO and length
## tables built from the OrgDb.
zy_go_up <- sm(simple_goseq(sig_genes = zy_sig[["deseq"]][["ups"]][[1]],
go_db = lp_go, length_db = lp_lengths))
## Gene categories more represented in the 2.2 group.
## Same call on the first DESeq2 "downs" table.
zy_go_down <- sm(simple_goseq(sig_genes = zy_sig[["deseq"]][["downs"]][[1]],
go_db = lp_go, length_db = lp_lengths))
In the function ‘combine_de_tables()’ above, one of the tasks performed is to look at the agreement among DESeq2, limma, and edgeR. The following show a couple of these for the set of genes observed with |log2 fold-change| >= 1 (i.e. at least a 2-fold change in either direction) and adjusted p-value <= 0.05.
## Venn diagrams for the first contrast under the "p_lfc1" criteria: the
## up and down "noweight" variants.
zy_table[["venns"]][[1]][["p_lfc1"]][["up_noweight"]]
zy_table[["venns"]][[1]][["p_lfc1"]][["down_noweight"]]
## DESeq2 MA plot of the first contrast.
zy_table$plots[[1]][["deseq_ma_plots"]][["plot"]]
## goseq p-value plots ('bpp_plot_over') for the up and down gene sets.
zy_go_up$pvalue_plots$bpp_plot_over
zy_go_down$pvalue_plots$bpp_plot_over
Najib read me an email listing off the gene names associated with the zymodeme classification. I took those names and cross referenced them against the Leishmania panamensis gene annotations and found the following:
They are:
Given these 6 gene IDs (NH has two gene IDs associated with it), I can do some looking for specific differences among the various samples.
The following creates a colorspace (red to green) heatmap showing the observed expression of these genes in every sample.
## Gene IDs for the zymodeme marker enzymes and the matching short names.
## The trailing "other" entry is a placeholder which matches no gene.
my_genes <- c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
              "LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300",
              "other")
my_names <- c("ALAT", "ASAT", "G6PD", "NHv1", "NHv2", "MPI", "other")
## Keep only those genes from the normalized zymodeme data.
zymo_expt <- exclude_genes_expt(zy_norm, ids = my_genes, method = "keep")
## Before removal, there were 8543 genes, now there are 6.
## There are 23 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20005 TMRC20039 TMRC20037 TMRC20038 TMRC20041 TMRC20015 TMRC20009
## 0.1307 0.1315 0.1296 0.1098 0.1126 0.1176 0.1144 0.1133
## TMRC20010 TMRC20016 TMRC20011 TMRC20012 TMRC20013 TMRC20017 TMRC20014 TMRC20018
## 0.1096 0.1058 0.1099 0.1203 0.1202 0.1062 0.1087 0.1142
## TMRC20021 TMRC20022 TMRC20053 TMRC20052 TMRC20051 TMRC20050 TMRC20054
## 0.1059 0.1302 0.1179 0.1102 0.1277 0.1149 0.1273
## Build the labels from the rows which were actually kept, looked up by
## gene ID. Passing my_names directly supplied 7 labels for 6 rows and
## assumed the kept rows come back in my_genes order, which would attach the
## wrong enzyme name to a row whenever the matrix order differs.
zymo_labels <- my_names[match(rownames(exprs(zymo_expt)), my_genes)]
zymo_heatmap <- plot_sample_heatmap(zymo_expt, row_label = zymo_labels)
zymo_heatmap
In contrast, the following plots take the set of genes which are shared among all differential expression methods (|lfc| >= 1.0 and adjp <= 0.05) and use them to make categories of genes which are increased in 2.3 or 2.2.
## Genes on which the differential expression methods agree; per the echoed
## message, this call also rewrites excel/intersect_significant.xlsx.
shared_zymo <- intersect_significant(zy_table)
## Deleting the file excel/intersect_significant.xlsx before writing the tables.
## The "all" table of the first "ups" entry; its row names are gene IDs.
up_shared <- shared_zymo[["ups"]][[1]][["data"]][["all"]]
rownames(up_shared)
## [1] "LPAL13_000033300" "LPAL13_000012000" "LPAL13_310031300" "LPAL13_000038400"
## [5] "LPAL13_000038500" "LPAL13_000012100" "LPAL13_340039600" "LPAL13_050005000"
## [9] "LPAL13_310031000" "LPAL13_310039200" "LPAL13_210015500" "LPAL13_350063000"
## [13] "LPAL13_270034100" "LPAL13_140019300" "LPAL13_340039700" "LPAL13_350013200"
## [17] "LPAL13_180013900" "LPAL13_170015400" "LPAL13_330021800" "LPAL13_240009700"
## [21] "LPAL13_140019100" "LPAL13_330021900" "LPAL13_140019200" "LPAL13_250025700"
## [25] "LPAL13_320038700" "LPAL13_350073200" "LPAL13_310028500" "LPAL13_210005000"
## [29] "LPAL13_230011200" "LPAL13_300031600" "LPAL13_230011400" "LPAL13_110015700"
## [33] "LPAL13_040007800" "LPAL13_290016200" "LPAL13_230011500" "LPAL13_310032500"
## [37] "LPAL13_000045100" "LPAL13_160014500" "LPAL13_000010600"
## Restrict the normalized zymodeme data to those shared 'up' genes.
upshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(up_shared), method = "keep")
## Before removal, there were 8543 genes, now there are 39.
## There are 23 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20005 TMRC20039 TMRC20037 TMRC20038 TMRC20041 TMRC20015 TMRC20009
## 0.4148 0.1539 0.2236 0.6113 0.6912 0.1913 0.5184 0.1865
## TMRC20010 TMRC20016 TMRC20011 TMRC20012 TMRC20013 TMRC20017 TMRC20014 TMRC20018
## 0.4754 0.3863 0.1949 0.1527 0.4594 0.2529 0.2052 0.4372
## TMRC20021 TMRC20022 TMRC20053 TMRC20052 TMRC20051 TMRC20050 TMRC20054
## 0.4913 0.1731 0.2528 0.5766 0.7879 0.2603 0.6963
We can plot a quick heatmap to get a sense of the differences observed between the genes which are different between the two zymodemes.
## Heatmap of the shared genes from the 'ups' table. Row labels come from
## the subset itself, so each label matches the row it annotates; the row
## names of up_shared follow the DE table's order, which need not be the
## row order of the expression matrix.
high_23_heatmap <- plot_sample_heatmap(upshared_expt,
                                       row_label = rownames(exprs(upshared_expt)))
high_23_heatmap
## The same for the shared genes from the 'downs' table.
down_shared <- shared_zymo[["downs"]][[1]][["data"]][["all"]]
downshared_expt <- exclude_genes_expt(zy_norm, ids = rownames(down_shared), method = "keep")
## Before removal, there were 8543 genes, now there are 61.
## There are 23 samples which kept less than 90 percent counts.
## TMRC20001 TMRC20005 TMRC20039 TMRC20037 TMRC20038 TMRC20041 TMRC20015 TMRC20009
## 0.2175 0.6777 0.6555 0.1982 0.1892 0.6830 0.1799 0.6311
## TMRC20010 TMRC20016 TMRC20011 TMRC20012 TMRC20013 TMRC20017 TMRC20014 TMRC20018
## 0.1650 0.2055 0.5672 0.5549 0.1627 0.6520 0.6407 0.1590
## TMRC20021 TMRC20022 TMRC20053 TMRC20052 TMRC20051 TMRC20050 TMRC20054
## 0.1592 0.6787 0.5680 0.1789 0.1838 0.6124 0.1955
high_22_heatmap <- plot_sample_heatmap(downshared_expt,
                                       row_label = rownames(exprs(downshared_expt)))
high_22_heatmap
Now I will combine our previous samples and our new samples in the hopes of finding variant positions which help elucidate currently unknown aspects of either group via their clustering to known samples from the other group. In other words, we do not know the zymodeme annotations for the old samples nor the strain identities (or the shortcut ‘chronic vs. self-healing’) for the new samples. I hope to make educated guesses given the variant profiles. There are some differences in how the previous and current data sets were analyzed (though I have since redone the old samples so it should be trivial to remove those differences now).
I added our 2016 data to a specific TMRC2 sample sheet, dated 20191203. Thus I will load the data here. That previous data was mapped using tophat, so I will also need to make some changes to the gene names to accommodate the two mappings.
## Load the older samples; their count files are named in the "tophat2file"
## column of the 20191203 sample sheet.
old_expt <- sm(create_expt("sample_sheets/tmrc2_samples_20191203.xlsx",
file_column = "tophat2file"))
## Strip a leading "exon_" and a trailing ".E1" from the feature names of
## the current data ('tt' is reused as a scratch variable).
tt <- lp_expt$expressionset
rownames(tt) <- gsub(pattern = "^exon_", replacement = "", x = rownames(tt))
rownames(tt) <- gsub(pattern = "\\.E1$", replacement = "", x = rownames(tt))
lp_expt$expressionset <- tt
## Same for the old data, where the suffix to remove is ".1" rather than
## ".E1", so that both experiments share feature names.
tt <- old_expt$expressionset
rownames(tt) <- gsub(pattern = "^exon_", replacement = "", x = rownames(tt))
rownames(tt) <- gsub(pattern = "\\.1$", replacement = "", x = rownames(tt))
old_expt$expressionset <- tt
One other important caveat, we have a group of new samples which have not yet run through the variant search pipeline, so I need to remove them from consideration. Though it looks like they finished overnight…
## The next line drops the samples which are missing the SNP pipeline.
## (The echoed message shows that all 43 samples currently have a 'bcftable'
## entry, so nothing is removed.)
lp_snp <- subset_expt(lp_expt, subset="!is.na(pData(lp_expt)[['bcftable']])")
## subset_expt(): There were 43, now there are 43 samples.
## Variant tables for the new and old samples; the old samples additionally
## pass snp_column = 2.
new_snps <- sm(count_expt_snps(lp_snp, annot_column = "bcftable"))
old_snps <- sm(count_expt_snps(old_expt, annot_column = "bcftable", snp_column = 2))
## Merge the two variant sets, then filter, cpm-convert and log2 transform.
both_snps <- combine_expts(new_snps, old_snps)
both_norm <- sm(normalize_expt(both_snps, transform = "log2", convert = "cpm", filter = TRUE))
## strains <- both_norm[["design"]][["strain"]]
## Use the 'strain' metadata column as the condition for the plots below.
both_norm <- set_expt_conditions(both_norm, fact = "strain")
The data structure ‘both_norm’ now contains our 2016 data along with the newer data collected since 2019.
The following plot shows the SNP profiles of all samples (old and new) where the colors at the top show either the 2.2 strains (orange), 2.3 strains (green), the previous samples (purple), or the various lab strains (pink etc).
## Heatmap from plot_disheat() of the combined old + new variant data,
## handed to pp() with a file name and 12 x 12 dimensions.
old_new_variant_heatmap <- plot_disheat(both_norm)
pp(file = "images/raw_snp_disheat.png", image = old_new_variant_heatmap,
height = 12, width = 12)
The function get_snp_sets() takes the provided metadata factor (in this case ‘condition’) and looks for variants which are exclusive to each element in it. In this case, this is looking for differences between 2.2 and 2.3, as well as the set shared among them.
## Group the combined variants by the 'condition' factor; the echoed
## messages list how many rows each level has.
snp_sets <- get_snp_sets(both_snps, factor = "condition")
## The factor z2.3 has 12 rows.
## The factor z2.2 has 11 rows.
## The factor unknown has 20 rows.
## The factor sh has 13 rows.
## The factor chr has 14 rows.
## The factor inf has 6 rows.
## Iterating over 727 elements.
## Combine the expression data of both experiments so the variants can be
## related to genes.
both_expt <- combine_expts(lp_expt, old_expt)
snp_genes <- sm(snps_vs_genes(both_expt, snp_sets, expt_name_col = "chromosome"))
## I think we have some metrics here we can plot...
## Variants restricted to the six zymodeme marker genes.
snp_subset <- sm(snp_subset_genes(
both_expt, both_snps,
genes = c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
"LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300")))
## zymo_heat <- plot_sample_heatmap(snp_subset, row_label = rownames(exprs(snp_subset)))
Didn’t I create a set of densities by chromosome? Oh I think they come in from get_snp_sets()
## Group the new-sample variants by clinical response.
clinical_sets <- get_snp_sets(new_snps, factor = "clinicalresponse")
## The factor Cure has 17 rows.
## The factor Failure has 13 rows.
## The factor Laboratory line has only 1 row.
## The factor ND has 3 rows.
## The factor Reference strain has 4 rows.
## The factor unknown has 5 rows.
## Iterating over 693 elements.
## The 'density' element is a named numeric vector; keep only the entries
## whose name contains "LpaL".
density_vec <- clinical_sets[["density"]]
chromosome_idx <- grep(pattern = "LpaL", x = names(density_vec))
## Turn the vector into a two-column data frame: the value and its name.
density_df <- as.data.frame(density_vec[chromosome_idx])
density_df[["chr"]] <- rownames(density_df)
colnames(density_df) <- c("density_vec", "chr")
## One bar per retained name, with the x labels rotated 90 degrees.
## NOTE(review): aes_string() is deprecated in current ggplot2 releases.
ggplot(density_df, aes_string(x = "chr", y = "density_vec")) +
ggplot2::geom_col() +
ggplot2::theme(axis.text = ggplot2::element_text(size = 10, colour = "black"),
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5))
## clinical_written <- write_variants(new_snps)
## Per-gene variant summary for the clinical-response sets.
clinical_genes <- sm(snps_vs_genes(lp_expt, clinical_sets, expt_name_col = "chromosome"))
## Join the per-gene summary to the gene annotations on their row names.
gene_summary <- as.data.frame(clinical_genes[["summary_by_gene"]])
gene_annotations <- as.data.frame(fData(lp_expt))
snp_density <- merge(gene_summary, gene_annotations, by = "row.names")
## Keep four columns by position and rename them.
## NOTE(review): positions 4 and 15 are assumed to be the product and the
## CDS length -- confirm against the column order of fData(lp_expt).
snp_density <- snp_density[, c(1, 2, 4, 15)]
colnames(snp_density) <- c("name", "snps", "product", "length")
snp_density[["product"]] <- tolower(snp_density[["product"]])
snp_density[["length"]] <- as.numeric(snp_density[["length"]])
## Variants per unit of length, sorted from most to least dense.
snp_density[["density"]] <- snp_density[["snps"]] / snp_density[["length"]]
snp_idx <- order(snp_density[["density"]], decreasing = TRUE)
snp_density <- snp_density[snp_idx, ]
## Drop every gene whose product mentions one of these families, using one
## combined pattern rather than one pass per family.
removers <- c("amastin", "gp63", "leishmanolysin")
drop_idx <- grepl(pattern = paste(removers, collapse = "|"),
                  x = snp_density[["product"]])
snp_density <- snp_density[!drop_idx, ]
## Filter these for [A|a]mastin gp63 Leishmanolysin
## Intersect the clinical-response variant sets with the gene annotations;
## the result has (at least) 'inters' and 'gene_summaries' elements, each
## indexed by clinical response.
clinical_snps <- snps_intersections(lp_expt, clinical_sets, chr_column = "chromosome")
## First few variant positions in the "Failure" entry.
head(as.data.frame(clinical_snps$inters[["Failure"]]))
## seqnames start end width strand
## chr_LpaL13-02_pos_91233_ref_G_alt_T LpaL13-02 91233 91234 2 +
## chr_LpaL13-20.1_pos_344505_ref_T_alt_C LpaL13-20.1 344505 344506 2 +
## chr_LpaL13-29_pos_484124_ref_G_alt_A LpaL13-29 484124 484125 2 +
## First few variant positions in the "Cure" entry.
head(as.data.frame(clinical_snps$inters[["Cure"]]))
## seqnames start end width strand
## chr_LpaL13-08_pos_184791_ref_T_alt_A LpaL13-08 184791 184792 2 +
## chr_LpaL13-10_pos_347757_ref_A_alt_C LpaL13-10 347757 347758 2 +
## chr_LpaL13-11_pos_433123_ref_C_alt_T LpaL13-11 433123 433124 2 +
## chr_LpaL13-15_pos_47170_ref_G_alt_C LpaL13-15 47170 47171 2 +
## chr_LpaL13-20.1_pos_106634_ref_G_alt_A LpaL13-20.1 106634 106635 2 +
## chr_LpaL13-20.1_pos_369935_ref_C_alt_T LpaL13-20.1 369935 369936 2 +
## Per-gene counts for "Failure" (echoed below in descending order).
head(clinical_snps$gene_summaries$Failure)
## LPAL13_020007000 LPAL13_200014300 LPAL13_290018100 LPAL13_000005000
## 1 1 1 0
## LPAL13_000005400 LPAL13_000005500
## 0 0
## Per-gene counts for "Cure"; the first 100 entries are printed.
head(clinical_snps$gene_summaries$Cure, n = 100)
## LPAL13_200017900 LPAL13_200014600 LPAL13_230015000 LPAL13_200015100
## 4 3 3 2
## LPAL13_200017600 LPAL13_200017800 LPAL13_200019500 LPAL13_200019600
## 2 2 2 2
## LPAL13_080009800 LPAL13_100014700 LPAL13_110015500 LPAL13_150006300
## 1 1 1 1
## LPAL13_200008300 LPAL13_200014900 LPAL13_200015000 LPAL13_200015200
## 1 1 1 1
## LPAL13_200015300 LPAL13_200016400 LPAL13_200016500 LPAL13_200016900
## 1 1 1 1
## LPAL13_200017200 LPAL13_310008900 LPAL13_310034900 LPAL13_330014300
## 1 1 1 1
## LPAL13_000005000 LPAL13_000005400 LPAL13_000005500 LPAL13_000005600
## 0 0 0 0
## LPAL13_000005700 LPAL13_000005800 LPAL13_000005900 LPAL13_000006000
## 0 0 0 0
## LPAL13_000006100 LPAL13_000006200 LPAL13_000006300 LPAL13_000006400
## 0 0 0 0
## LPAL13_000006500 LPAL13_000006600 LPAL13_000006700 LPAL13_000006800
## 0 0 0 0
## LPAL13_000006900 LPAL13_000007400 LPAL13_000007500 LPAL13_000007600
## 0 0 0 0
## LPAL13_000007700 LPAL13_000007800 LPAL13_000007900 LPAL13_000008000
## 0 0 0 0
## LPAL13_000008300 LPAL13_000008400 LPAL13_000008500 LPAL13_000008600
## 0 0 0 0
## LPAL13_000008700 LPAL13_000008800 LPAL13_000008900 LPAL13_000009000
## 0 0 0 0
## LPAL13_000009100 LPAL13_000009200 LPAL13_000009300 LPAL13_000009400
## 0 0 0 0
## LPAL13_000009500 LPAL13_000009600 LPAL13_000009700 LPAL13_000009800
## 0 0 0 0
## LPAL13_000009900 LPAL13_000010000 LPAL13_000010100 LPAL13_000010200
## 0 0 0 0
## LPAL13_000010300 LPAL13_000010400 LPAL13_000010500 LPAL13_000010600
## 0 0 0 0
## LPAL13_000010700 LPAL13_000010800 LPAL13_000010900 LPAL13_000011000
## 0 0 0 0
## LPAL13_000011100 LPAL13_000011200 LPAL13_000011300 LPAL13_000011400
## 0 0 0 0
## LPAL13_000011500 LPAL13_000011600 LPAL13_000011700 LPAL13_000011800
## 0 0 0 0
## LPAL13_000011900 LPAL13_000012000 LPAL13_000012100 LPAL13_000012200
## 0 0 0 0
## LPAL13_000012300 LPAL13_000012400 LPAL13_000012500 LPAL13_000012600
## 0 0 0 0
## LPAL13_000012700 LPAL13_000012800 LPAL13_000012900 LPAL13_000013000
## 0 0 0 0
## LPAL13_000013100 LPAL13_000013200 LPAL13_000013300 LPAL13_000013400
## 0 0 0 0
## Attach the per-gene Cure/Failure variant counts to the gene annotations.
annot <- fData(lp_expt)
## Remember the feature order; merge() sorts its result and, by default,
## keeps only the rows present in both inputs.
original_order <- rownames(annot)
cure_counts <- as.data.frame(clinical_snps[["gene_summaries"]][["Cure"]])
fail_counts <- as.data.frame(clinical_snps[["gene_summaries"]][["Failure"]])
clinical_interest <- merge(cure_counts, fail_counts, by = "row.names")
rownames(clinical_interest) <- clinical_interest[["Row.names"]]
clinical_interest[["Row.names"]] <- NULL
colnames(clinical_interest) <- c("cure_snps","fail_snps")
## all.x = TRUE keeps every annotated gene (NA where no count exists), and
## the final re-index restores the original feature order. Without both,
## the table written back could have fewer or re-sorted rows and no longer
## line up with the expression matrix.
annot <- merge(annot, clinical_interest, by = "row.names", all.x = TRUE)
rownames(annot) <- annot[["Row.names"]]
annot[["Row.names"]] <- NULL
annot <- annot[original_order, , drop = FALSE]
fData(lp_expt$expressionset) <- annot
The heatmap produced here should show the variants only for the zymodeme genes.
I am thinking that if we find clusters of locations which are variant, that might provide some PCR testing possibilities.
## Group the new-sample variants by the 'phenotypiccharacteristics' factor
## (levels, per the echoed messages: 2.2, 2.3, Laboratory line, Reference
## strain, unknown).
new_sets <- get_snp_sets(new_snps, factor = "phenotypiccharacteristics")
## The factor 2.2 has 11 rows.
## The factor 2.3 has 12 rows.
## The factor Laboratory line has only 1 row.
## The factor Reference strain has 4 rows.
## The factor unknown has 15 rows.
## Iterating over 693 elements.
## Overview of the elements in the returned list.
summary(new_sets)
## Length Class Mode
## medians 6 data.frame list
## possibilities 5 -none- character
## intersections 31 -none- list
## chr_data 693 -none- list
## set_names 32 -none- list
## invert_names 32 -none- list
## density 693 -none- numeric
## 1000000: 2.2
## 0100000: 2.3
## NOTE(review): the two lookups below use the key "100000" and both come
## back NULL; with 5 'possibilities' the intended key may have a different
## number of digits -- confirm against names(new_sets[["intersections"]]).
summary(new_sets[["intersections"]][["100000"]])
## Length Class Mode
## 0 NULL NULL
dim(new_sets$intersections[["100000"]])
## NULL
## Find runs of closely spaced variant positions inside one intersection of
## a get_snp_sets() style result.
##
## @param snp_sets List with an "intersections" element (each entry a
##   character vector of position names shaped like
##   "chr_<chromosome>_pos_<position>_ref_<base>_alt_<base>") and a
##   "set_names" element mapping intersection keys to set names.
## @param conditions Either a numeric index into the intersections, or the
##   set name exactly as it appears in "set_names". NULL selects the first
##   intersection.
## @param minimum Smallest number of consecutive close gaps for a run to be
##   reported.
## @param maximum_separation Largest distance between neighbouring positions
##   on the same chromosome for the pair to count as sequential.
## @return data.frame with columns "chr" and "pos" (row names are the
##   original position names) holding the last position of every qualifying
##   run. Zero rows when the chosen intersection is empty or has no
##   qualifying run.
sequential_variants <- function(snp_sets, conditions = NULL, minimum = 3, maximum_separation = 3) {
  if (is.null(conditions)) {
    conditions <- 1
  }
  intersection_sets <- snp_sets[["intersections"]]
  intersection_names <- snp_sets[["set_names"]]
  if (is.numeric(conditions)) {
    chosen_intersection <- conditions
  } else {
    ## which() discards NA comparisons; exactly one set must match, because
    ## zero or several keys cannot be used with [[ ]] below.
    matched <- which(intersection_names == conditions)
    if (length(matched) != 1) {
      stop("Expected exactly one set named '", conditions, "', found ",
           length(matched), ".", call. = FALSE)
    }
    chosen_intersection <- names(intersection_names)[matched]
  }
  possible_positions <- intersection_sets[[chosen_intersection]]
  ## An absent or empty intersection has no runs; return an empty table of
  ## the same shape instead of failing further down.
  if (length(possible_positions) == 0) {
    return(data.frame(chr = character(0), pos = numeric(0),
                      stringsAsFactors = FALSE))
  }

  ## Split each position name into its chromosome and coordinate.
  position_table <- data.frame(row.names = possible_positions)
  pat <- "^chr_(.+)_pos_(.+)_ref_.*$"
  position_table[["chr"]] <- gsub(pattern = pat, replacement = "\\1", x = rownames(position_table))
  position_table[["pos"]] <- as.numeric(gsub(pattern = pat, replacement = "\\2", x = rownames(position_table)))
  position_idx <- order(position_table[["chr"]], position_table[["pos"]])
  position_table <- position_table[position_idx, ]

  ## Distance to the previous position on the same chromosome; the first
  ## position of each chromosome keeps its own coordinate as the distance.
  chr <- position_table[["chr"]]
  pos <- position_table[["pos"]]
  same_chr <- c(FALSE, chr[-1] == chr[-length(chr)])
  dist <- pos
  dist[same_chr] <- (pos - c(0, pos[-length(pos)]))[same_chr]
  position_table[["dist"]] <- dist

  ## Run-length encode the "close to the previous position" flags and write
  ## each TRUE run's length onto the row where that run ends.
  runs <- rle(position_table[["dist"]] <= maximum_separation)
  close_runs <- which(runs[["values"]])
  run_ends <- cumsum(runs[["lengths"]])[close_runs]
  position_table[["last_sequential"]] <- 0
  position_table[["last_sequential"]][run_ends] <- runs[["lengths"]][close_runs]

  wanted_idx <- position_table[["last_sequential"]] >= minimum
  position_table[wanted_idx, c("chr", "pos")]
}
## Runs of closely spaced variants in the set named "2.2" (default minimum
## and maximum_separation); each reported row is the end of one run.
zymo22_sequentials <- sequential_variants(new_sets, conditions = "2.2")
zymo22_sequentials
## chr pos
## chr_LpaL13-05_pos_260512_ref_G_alt_C LpaL13-05 260512
## chr_LpaL13-24_pos_163302_ref_A_alt_C LpaL13-24 163302
## The same search for the set named "2.3".
zymo23_sequentials <- sequential_variants(new_sets, conditions = "2.3")
zymo23_sequentials
## chr pos
## chr_LpaL13-05_pos_183858_ref_G_alt_A LpaL13-05 183858
## chr_LpaL13-08_pos_174502_ref_T_alt_G LpaL13-08 174502
## chr_LpaL13-09_pos_210577_ref_G_alt_C LpaL13-09 210577
## chr_LpaL13-09_pos_338720_ref_C_alt_G LpaL13-09 338720
## chr_LpaL13-09_pos_375148_ref_C_alt_T LpaL13-09 375148
## chr_LpaL13-11_pos_478993_ref_T_alt_G LpaL13-11 478993
## chr_LpaL13-11_pos_489159_ref_G_alt_A LpaL13-11 489159
## chr_LpaL13-14_pos_221315_ref_A_alt_G LpaL13-14 221315
## chr_LpaL13-28_pos_592641_ref_A_alt_C LpaL13-28 592641
## chr_LpaL13-31_pos_98759_ref_G_alt_T LpaL13-31 98759
## chr_LpaL13-32_pos_314579_ref_C_alt_A LpaL13-32 314579
## chr_LpaL13-35_pos_26430_ref_G_alt_A LpaL13-35 26430
## Relate the zymodeme variant sets to genes. This overwrites the snp_genes
## computed earlier from the combined old + new data.
snp_genes <- sm(snps_vs_genes(lp_expt, new_sets, expt_name_col = "chromosome"))
## NOTE(review): every other normalize_expt() call in this document passes
## transform = "log2"; transform = TRUE here may not select the same
## transformation -- confirm.
new_zymo_norm <- normalize_expt(new_snps, filter = TRUE, convert = "cpm", norm = "quant", transform = TRUE)
## Removing 0 low-count genes (544782 remaining).
## transform_counts: Found 7670178 values equal to 0, adding 1 to the matrix.
## Use the zymodeme factor as the condition for the heatmap.
new_zymo_norm <- set_expt_conditions(new_zymo_norm, fact = "phenotypiccharacteristics")
zymo_heat <- plot_disheat(new_zymo_norm)
## Variants of the new samples restricted to the six zymodeme marker genes.
zymo_subset <- snp_subset_genes(lp_expt, new_snps,
genes = c("LPAL13_120010900", "LPAL13_340013000", "LPAL13_000054100",
"LPAL13_140006100", "LPAL13_180018500", "LPAL13_320022300"))
## Warning in .Seqinfo.mergexy(x, y): Each of the 2 combined objects has sequence levels not in the other:
## - in 'x': LPAL13-SCAF000002, LPAL13-SCAF000003, LPAL13-SCAF000004, LPAL13-SCAF000005, LPAL13-SCAF000009, LPAL13-SCAF000010, LPAL13-SCAF000013, LPAL13-SCAF000014, LPAL13-SCAF000015, LPAL13-SCAF000018, LPAL13-SCAF000019, LPAL13-SCAF000020, LPAL13-SCAF000022, LPAL13-SCAF000023, LPAL13-SCAF000026, LPAL13-SCAF000029, LPAL13-SCAF000030, LPAL13-SCAF000031, LPAL13-SCAF000032, LPAL13-SCAF000035, LPAL13-SCAF000036, LPAL13-SCAF000037, LPAL13-SCAF000038, LPAL13-SCAF000042, LPAL13-SCAF000043, LPAL13-SCAF000045, LPAL13-SCAF000047, LPAL13-SCAF000049, LPAL13-SCAF000050, LPAL13-SCAF000052, LPAL13-SCAF000054, LPAL13-SCAF000056, LPAL13-SCAF000057, LPAL13-SCAF000058, LPAL13-SCAF000060, LPAL13-SCAF000066, LPAL13-SCAF000067, LPAL13-SCAF000069, LPAL13-SCAF000070, LPAL13-SCAF000072, LPAL13-SCAF000073, LPAL13-SCAF000081, LPAL13-SCAF000082, LPAL13-SCAF000083, LPAL13-SCAF000085, LPAL13-SCAF000086, LPAL13-SCAF000088, LPAL13-SCAF000090, LPAL13-SCAF000091, LPAL13-SCAF000092, LPAL13-SCAF000095, LPAL13-SCAF000098, LPAL13-SCAF000101, LPAL13-SCAF000103, LPAL13-SCAF000106, LPAL13-SCAF000109, LPAL13-SCAF000111, LPAL13-SCAF000112, LPAL13-SCAF000113, LPAL13-SCAF000118, LPAL13-SCAF000125, LPAL13-SCAF000126, LPAL13-SCAF000128, LPAL13-SCAF000138, LPAL13-SCAF000139, LPAL13-SCAF000140, LPAL13-SCAF000141, LPAL13-SCAF000144, LPAL13-SCAF000145, LPAL13-SCAF000147, LPAL13-SCAF000148, LPAL13-SCAF000150, LPAL13-SCAF000151, LPAL13-SCAF000152, LPAL13-SCAF000154, LPAL13-SCAF000155, LPAL13-SCAF000156, LPAL13-SCAF000157, LPAL13-SCAF000158, LPAL13-SCAF000159, LPAL13-SCAF000160, LPAL13-SCAF000161, LPAL13-SCAF000163, LPAL13-SCAF000164, LPAL13-SCAF000167, LPAL13-SCAF000168, LPAL13-SCAF000169, LPAL13-SCAF000170, LPAL13-SCAF000175, LPAL13-SCAF000177, LPAL13-SCAF000178, LPAL13-SCAF000179, LPAL13-SCAF000180, LPAL13-SCAF000183, LPAL13-SCAF000184, LPAL13-SCAF000185, LPAL13-SCAF000189, LPAL13-SCAF000190, LPAL13-SCAF000192, LPAL13-SCAF000195, LPAL13-SCAF000196, LPAL13-SCAF000198, LPAL13-SCAF000199, LPAL13-SCAF000204, 
LPAL13-SCAF000207, LPAL13-SCAF000208, LPAL13-SCAF000210, LPAL13-SCAF000212, LPAL13-SCAF000213, LPAL13-SCAF000214, LPAL13-SCAF000215, LPAL13-SCAF000216, LPAL13-SCAF000218, LPAL13-SCAF000219, LPAL13-SCAF000221, LPAL13-SCAF000222, LPAL13-SCAF000223, LPAL13-SCAF000224, LPAL13-SCAF000225, LPAL13-SCAF000226, LPAL13-SCAF000228, LPAL13-SCAF000232, LPAL13-SCAF000234, LPAL13-SCAF000236, LPAL13-SCAF000238, LPAL13-SCAF000240, LPAL13-SCAF000241, LPAL13-SCAF000242, LPAL13-SCAF000243, LPAL13-SCAF000244, LPAL13-SCAF000246, LPAL13-SCAF000247, LPAL13-SCAF000249, LPAL13-SCAF000251, LPAL13-SCAF000252, LPAL13-SCAF000254, LPAL13-SCAF000255, LPAL13-SCAF000257, LPAL13-SCAF000258, LPAL13-SCAF000260, LPAL13-SCAF000262, LPAL13-SCAF000263, LPAL13-SCAF000264, LPAL13-SCAF000268, LPAL13-SCAF000269, LPAL13-SCAF000270, LPAL13-SCAF000272, LPAL13-SCAF000273, LPAL13-SCAF000274, LPAL13-SCAF000275, LPAL13-SCAF000276, LPAL13-SCAF000277, LPAL13-SCAF000278, LPAL13-SCAF000279, LPAL13-SCAF000280, LPAL13-SCAF000282, LPAL13-SCAF000283, LPAL13-SCAF000284, LPAL13-SCAF000289, LPAL13-SCAF000290, LPAL13-SCAF000293, LPAL13-SCAF000294, LPAL13-SCAF000297, LPAL13-SCAF000298, LPAL13-SCAF000299, LPAL13-SCAF000304, LPAL13-SCAF000305, LPAL13-SCAF000306, LPAL13-SCAF000307, LPAL13-SCAF000308, LPAL13-SCAF000310, LPAL13-SCAF000311, LPAL13-SCAF000312, LPAL13-SCAF000313, LPAL13-SCAF000315, LPAL13-SCAF000318, LPAL13-SCAF000323, LPAL13-SCAF000324, LPAL13-SCAF000325, LPAL13-SCAF000327, LPAL13-SCAF000329, LPAL13-SCAF000331, LPAL13-SCAF000332, LPAL13-SCAF000333, LPAL13-SCAF000334, LPAL13-SCAF000336, LPAL13-SCAF000341, LPAL13-SCAF000342, LPAL13-SCAF000343, LPAL13-SCAF000344, LPAL13-SCAF000345, LPAL13-SCAF000346, LPAL13-SCAF000348, LPAL13-SCAF000349, LPAL13-SCAF000350, LPAL13-SCAF000351, LPAL13-SCAF000352, LPAL13-SCAF000353, LPAL13-SCAF000354, LPAL13-SCAF000355, LPAL13-SCAF000356, LPAL13-SCAF000357, LPAL13-SCAF000359, LPAL13-SCAF000360, LPAL13-SCAF000361, LPAL13-SCAF000362, LPAL13-SCAF000365, LPAL13-SCAF000366, LPAL13-SCAF000369, 
LPAL13-SCAF000371, LPAL13-SCAF000372, LPAL13-SCAF000373, LPAL13-SCAF000375, LPAL13-SCAF000376, LPAL13-SCAF000377, LPAL13-SCAF000378, LPAL13-SCAF000379, LPAL13-SCAF000380, LPAL13-SCAF000381, LPAL13-SCAF000382, LPAL13-SCAF000383, LPAL13-SCAF000384, LPAL13-SCAF000385, LPAL13-SCAF000386, LPAL13-SCAF000387, LPAL13-SCAF000388, LPAL13-SCAF000389, LPAL13-SCAF000390, LPAL13-SCAF000392, LPAL13-SCAF000393, LPAL13-SCAF000394, LPAL13-SCAF000395, LPAL13-SCAF000396, LPAL13-SCAF000397, LPAL13-SCAF000398, LPAL13-SCAF000399, LPAL13-SCAF000402, LPAL13-SCAF000404, LPAL13-SCAF000406, LPAL13-SCAF000407, LPAL13-SCAF000408, LPAL13-SCAF000409, LPAL13-SCAF000410, LPAL13-SCAF000411, LPAL13-SCAF000412, LPAL13-SCAF000413, LPAL13-SCAF000414, LPAL13-SCAF000415, LPAL13-SCAF000416, LPAL13-SCAF000418, LPAL13-SCAF000422, LPAL13-SCAF000423, LPAL13-SCAF000425, LPAL13-SCAF000427, LPAL13-SCAF000428, LPAL13-SCAF000429, LPAL13-SCAF000431, LPAL13-SCAF000433, LPAL13-SCAF000435, LPAL13-SCAF000437, LPAL13-SCAF000438, LPAL13-SCAF000439, LPAL13-SCAF000441, LPAL13-SCAF000442, LPAL13-SCAF000443, LPAL13-SCAF000444, LPAL13-SCAF000445, LPAL13-SCAF000449, LPAL13-SCAF000450, LPAL13-SCAF000451, LPAL13-SCAF000452, LPAL13-SCAF000454, LPAL13-SCAF000455, LPAL13-SCAF000457, LPAL13-SCAF000458, LPAL13-SCAF000462, LPAL13-SCAF000464, LPAL13-SCAF000466, LPAL13-SCAF000467, LPAL13-SCAF000472, LPAL13-SCAF000473, LPAL13-SCAF000474, LPAL13-SCAF000475, LPAL13-SCAF000476, LPAL13-SCAF000478, LPAL13-SCAF000479, LPAL13-SCAF000480, LPAL13-SCAF000481, LPAL13-SCAF000482, LPAL13-SCAF000485, LPAL13-SCAF000487, LPAL13-SCAF000489, LPAL13-SCAF000493, LPAL13-SCAF000494, LPAL13-SCAF000495, LPAL13-SCAF000497, LPAL13-SCAF000498, LPAL13-SCAF000499, LPAL13-SCAF000501, LPAL13-SCAF000502, LPAL13-SCAF000504, LPAL13-SCAF000506, LPAL13-SCAF000509, LPAL13-SCAF000510, LPAL13-SCAF000513, LPAL13-SCAF000514, LPAL13-SCAF000516, LPAL13-SCAF000517, LPAL13-SCAF000518, LPAL13-SCAF000519, LPAL13-SCAF000520, LPAL13-SCAF000521, LPAL13-SCAF000523, LPAL13-SCAF000524, 
LPAL13-SCAF000525, LPAL13-SCAF000526, LPAL13-SCAF000530, LPAL13-SCAF000531, LPAL13-SCAF000534, LPAL13-SCAF000543, LPAL13-SCAF000545, LPAL13-SCAF000546, LPAL13-SCAF000550, LPAL13-SCAF000551, LPAL13-SCAF000557, LPAL13-SCAF000559, LPAL13-SCAF000561, LPAL13-SCAF000565, LPAL13-SCAF000571, LPAL13-SCAF000579, LPAL13-SCAF000581, LPAL13-SCAF000583, LPAL13-SCAF000584, LPAL13-SCAF000589, LPAL13-SCAF000592, LPAL13-SCAF000594, LPAL13-SCAF000595, LPAL13-SCAF000596, LPAL13-SCAF000597, LPAL13-SCAF000600, LPAL13-SCAF000602, LPAL13-SCAF000604, LPAL13-SCAF000606, LPAL13-SCAF000608, LPAL13-SCAF000609, LPAL13-SCAF000612, LPAL13-SCAF000613, LPAL13-SCAF000615, LPAL13-SCAF000620, LPAL13-SCAF000621, LPAL13-SCAF000623, LPAL13-SCAF000624, LPAL13-SCAF000629, LPAL13-SCAF000630, LPAL13-SCAF000631, LPAL13-SCAF000632, LPAL13-SCAF000633, LPAL13-SCAF000634, LPAL13-SCAF000635, LPAL13-SCAF000638, LPAL13-SCAF000640, LPAL13-SCAF000642, LPAL13-SCAF000647, LPAL13-SCAF000648, LPAL13-SCAF000657, LPAL13-SCAF000658, LPAL13-SCAF000660, LPAL13-SCAF000662, LPAL13-SCAF000663, LPAL13-SCAF000664, LPAL13-SCAF000665, LPAL13-SCAF000667, LPAL13-SCAF000669, LPAL13-SCAF000670, LPAL13-SCAF000671, LPAL13-SCAF000673, LPAL13-SCAF000674, LPAL13-SCAF000675, LPAL13-SCAF000676, LPAL13-SCAF000677, LPAL13-SCAF000678, LPAL13-SCAF000680, LPAL13-SCAF000683, LPAL13-SCAF000684, LPAL13-SCAF000685, LPAL13-SCAF000686, LPAL13-SCAF000687, LPAL13-SCAF000689, LPAL13-SCAF000690, LPAL13-SCAF000691, LPAL13-SCAF000692, LPAL13-SCAF000693, LPAL13-SCAF000694, LPAL13-SCAF000696, LPAL13-SCAF000699, LPAL13-SCAF000701, LPAL13-SCAF000702, LPAL13-SCAF000703, LPAL13-SCAF000705, LPAL13-SCAF000706, LPAL13-SCAF000708, LPAL13-SCAF000709, LPAL13-SCAF000710, LPAL13-SCAF000712, LPAL13-SCAF000715, LPAL13-SCAF000718, LPAL13-SCAF000721, LPAL13-SCAF000724, LPAL13-SCAF000725, LPAL13-SCAF000728, LPAL13-SCAF000729, LPAL13-SCAF000730, LPAL13-SCAF000731, LPAL13-SCAF000733, LPAL13-SCAF000736, LPAL13-SCAF000739, LPAL13-SCAF000740, LPAL13-SCAF000741, LPAL13-SCAF000742, 
LPAL13-SCAF000743, LPAL13-SCAF000745, LPAL13-SCAF000746, LPAL13-SCAF000747, LPAL13-SCAF000749, LPAL13-SCAF000750, LPAL13-SCAF000751, LPAL13-SCAF0007
## Before removal, there were 544782 genes, now there are 83.
## There are 43 samples which kept less than 90 percent counts.
## tmrc20001 tmrc20005 tmrc20007 tmrc20027 tmrc20028 tmrc20032 tmrc20040 tmrc20039
## 0.037035 0.041720 0.053085 0.059906 0.077365 0.037129 0.015234 0.041766
## tmrc20037 tmrc20038 tmrc20041 tmrc20015 tmrc20009 tmrc20010 tmrc20016 tmrc20011
## 0.028449 0.029649 0.008748 0.026217 0.000000 0.027716 0.026359 0.024992
## tmrc20012 tmrc20013 tmrc20017 tmrc20014 tmrc20018 tmrc20019 tmrc20020 tmrc20021
## 0.000000 0.029377 0.020294 0.018363 0.032806 0.079907 0.072428 0.032435
## tmrc20022 tmrc20025 tmrc20024 tmrc20036 tmrc20033 tmrc20026 tmrc20031 tmrc20042
## 0.000000 0.063343 0.040538 0.008628 0.000000 0.081882 0.045886 0.106630
## tmrc20048 tmrc20053 tmrc20052 tmrc20051 tmrc20050 tmrc20043 tmrc20054 tmrc20046
## 0.029476 0.000000 0.033177 0.035482 0.057916 0.032996 0.036392 0.005881
## tmrc20047 tmrc20044 tmrc20045
## 0.034909 0.065649 0.006047
## Use the phenotypiccharacteristics column as the condition of the zymodeme
## subset.
zymo_subset <- set_expt_conditions(zymo_subset, fact = "phenotypiccharacteristics")
## zymo_heat <- plot_sample_heatmap(zymo_subset, row_label = rownames(exprs(snp_subset)))

## Correlation matrix computed by hpgl_cor() from the expression values of
## both_norm; this is the matrix drawn by the heatmap below.
correlations <- hpgl_cor(exprs(both_norm))

## Work on a copy of the design and replace missing strain and
## phenotypiccharacteristics entries with the literal "unknown".
des <- both_norm$design
undef_idx <- is.na(des[["strain"]])
des[undef_idx, "strain"] <- "unknown"
zymo_missing_idx <- is.na(des[["phenotypiccharacteristics"]])
des[zymo_missing_idx, "phenotypiccharacteristics"] <- "unknown"

## Column annotation: phenotypiccharacteristics shown as "zymodeme" and
## clinicalcategorical shown as "outcome"; missing outcomes become "undefined".
col_data <- as.data.frame(des[, c("phenotypiccharacteristics", "clinicalcategorical")])
unknown_clinical <- is.na(col_data[["clinicalcategorical"]])
colnames(col_data) <- c("zymodeme", "outcome")
col_data[unknown_clinical, "outcome"] <- "undefined"

## Row annotation: the strain of each sample.
row_data <- as.data.frame(des[, "strain"])
colnames(row_data) <- "strain"

## Settings handed to annHeatmap2().
mydendro <- list(clustfun = hclust, lwd = 2.0)
myannot <- list(
  Col = list(data = col_data),
  Row = list(data = row_data))
myclust <- list(cuth = 1.0, col = BrewerClusterCol)
mylabs <- list(
  Row = list(nrow = 4),
  Col = list(nrow = 4))
##hmcols <- colorRampPalette(c("yellow","black","darkblue"))(256)
hmcols <- colorRampPalette(c("darkblue", "beige"))(240)

map1 <- annHeatmap2(
  correlations,
  dendrogram = mydendro,
  annotation = myannot,
  cluster = myclust,
  labels = mylabs,
  ## The following controls if the picture is symmetric
  scale = "none",
  col = hmcols)
## Warning in breakColors(breaks, col): more colors than classes: ignoring 28 last
## colors
## Hand the annotated heatmap to pp() so it is written to
## images/dendro_heatmap.png with a height and width of 12.
pp(
  file = "images/dendro_heatmap.png",
  image = map1,
  height = 12,
  width = 12)
## annotated Heatmap
##
## Rows: 'dendrogram' with 2 branches and 76 members total, at height 5.092
## 11 annotation variable(s)
## Cols: 'dendrogram' with 2 branches and 76 members total, at height 5.092
## 9 annotation variable(s)
## plot(map1)
The following uses the same information to make some guesses about the strains used in the new samples.
## Second annotated heatmap of the same correlations: columns are annotated by
## condition, rows by strain, and annHeatmap2() is left to choose its own
## colors and scaling (the col argument stays commented out).
correlations <- hpgl_cor(exprs(both_norm))

## Fresh copy of the design with missing strains labelled "unknown".
des <- both_norm$design
undef_idx <- is.na(des[["strain"]])
des[undef_idx, "strain"] <- "unknown"

## One-column annotation tables for the heatmap margins.
col_data <- as.data.frame(des[, "condition"])
colnames(col_data) <- "condition"
row_data <- as.data.frame(des[, "strain"])
colnames(row_data) <- "strain"

## Settings handed to annHeatmap2().
mydendro <- list(clustfun = hclust, lwd = 2.0)
myannot <- list(
  Col = list(data = col_data),
  Row = list(data = row_data))
myclust <- list(cuth = 1.0, col = BrewerClusterCol)
mylabs <- list(
  Row = list(nrow = 4),
  Col = list(nrow = 4))
##hmcols <- colorRampPalette(c("yellow","black","darkblue"))(256)
hmcols <- colorRampPalette(c("darkblue", "beige"))(170)

map1 <- annHeatmap2(
  correlations,
  dendrogram = mydendro,
  annotation = myannot,
  cluster = myclust,
  labels = mylabs)
## col = hmcols)
plot(map1)
## Restrict lp_expt to the samples whose condition is z2.2 or z2.3.
pheno <- subset_expt(lp_expt, subset = "condition=='z2.2'|condition=='z2.3'")
## subset_expt(): There were 43, now there are 23 samples.

## Of those, keep only the samples with a non-missing bcftable entry.
pheno <- subset_expt(pheno, subset = "!is.na(pData(pheno)[['bcftable']])")
## subset_expt(): There were 23, now there are 23 samples.

## Build the variant data set from the files named in the bcftable column,
## count the samples per condition, and print the per-sample conditions.
pheno_snps <- sm(count_expt_snps(pheno, annot_column = "bcftable"))
xref_prop <- table(pheno_snps[["conditions"]])
pheno_snps[["conditions"]]
## [1] "z2.3" "z2.2" "z2.2" "z2.3" "z2.3" "z2.2" "z2.3" "z2.2" "z2.3" "z2.3"
## [11] "z2.2" "z2.2" "z2.3" "z2.2" "z2.2" "z2.3" "z2.3" "z2.2" "z2.2" "z2.3"
## [21] "z2.3" "z2.2" "z2.3"
## For every variant position, compute the fraction of samples in each
## condition that carry it, then arrange the result as the
## SNP / Chromosome / Position / value table passed to CMplot().

## TRUE wherever a sample's value at a position is greater than 5.
idx_tbl <- exprs(pheno_snps) > 5
new_tbl <- data.frame(row.names = rownames(exprs(pheno_snps)))

## The difference computed below needs both conditions; stop early instead of
## failing later with an unrelated replacement-length error.
stopifnot(all(c("z2.2", "z2.3") %in% names(xref_prop)))

for (n in names(xref_prop)) {
  idx_cols <- which(pheno_snps[["conditions"]] == n)
  ## drop = FALSE keeps a one-column matrix when a condition has exactly one
  ## sample; without it the subset collapses to a vector and rowSums() stops
  ## because its input no longer has two dimensions.
  ## xref_prop[[n]] is the plain sample count for this condition.
  prop_col <- rowSums(idx_tbl[, idx_cols, drop = FALSE]) / xref_prop[[n]]
  new_tbl[n] <- prop_col
}

## Despite the column name, "ratio" holds the difference between the two
## per-condition proportions, so it ranges from -1 to 1.
new_tbl[["ratio"]] <- new_tbl[["z2.2"]] - new_tbl[["z2.3"]]

## Keep only the rows whose name contains "LpaL13".
keepers <- grepl(x = rownames(new_tbl), pattern = "LpaL13")
new_tbl <- new_tbl[keepers, ]

## Split row names of the form chr_<chromosome>_pos_<position>_... into their
## chromosome and position parts.
## NOTE(review): gsub() returns character, so Position is stored as text;
## confirm that the plotting code accepts that.
new_tbl[["SNP"]] <- rownames(new_tbl)
new_tbl[["Chromosome"]] <- gsub(x = new_tbl[["SNP"]], pattern = "chr_(.*)_pos_.*", replacement = "\\1")
new_tbl[["Position"]] <- gsub(x = new_tbl[["SNP"]], pattern = ".*_pos_(\\d+)_.*", replacement = "\\1")
new_tbl <- new_tbl[, c("SNP", "Chromosome", "Position", "ratio")]
library(CMplot)
## Much appreciate for using CMplot.
## Full description, Bug report, Suggestion and the latest codes:
## https://github.com/YinLiLin/CMplot
## Plot the SNP / Chromosome / Position / ratio table with CMplot's default
## settings.
## NOTE(review): the "ratio" column is the difference z2.2 - z2.3 of two
## proportions, not a p-value. CMplot's Manhattan and QQ plots are presumably
## designed around p-values (and a -log10 transform by default) -- confirm
## that the defaults are appropriate for these values.
CMplot(new_tbl)
## SNP-Density Plotting.
## Circular-Manhattan Plotting ratio.
## Rectangular-Manhattan Plotting ratio.
## QQ Plotting ratio.
## Plots are stored in: /mnt/cbcb/fs01_abelew/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019
## Unless a variable named skip_load exists and is TRUE, print the session
## information, report the hpgltools commit, and call saveme() on savefile.
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message("This is hpgltools commit: ", get_git_commit())
  message("Saving to ", savefile)
  tmp <- sm(saveme(filename = savefile))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 68b1ce610bf0c750d9a3ed2f6bd2a529b1744c29
## This is hpgltools commit: Thu May 27 17:01:01 2021 -0400: 68b1ce610bf0c750d9a3ed2f6bd2a529b1744c29
## Saving to tmrc2_02sample_estimation_v202105.rda.xz
## Call loadme() on the same savefile used by saveme() above.
## NOTE(review): presumably this restores the saved objects and is meant to be
## run by hand rather than on every render -- confirm.
tmp <- loadme(filename = savefile)