mers <- "06"

06mers simple analysis

Visualize the mers

Now I have something like a count table which I will try out and see.

sample_sheet <- paste0("all_samples", mers, ".csv")
mers_expt <- sm(create_expt(metadata=sample_sheet, gene_info=NULL))

Create some metrics

norm_mers <- sm(normalize_expt(mers_expt, transform="log2", convert="cpm", norm="quant"))
graph_raw <- sm(graph_metrics(mers_expt))
graph_norm <- sm(graph_metrics(norm_mers))
filt_mers <- sm(normalize_expt(mers_expt, filter=TRUE))
batch_mers <- sm(normalize_expt(filt_mers, transform="log2", norm="quant",
                                batch="ssva"))
graph_batch <- sm(graph_metrics(batch_mers))

View some metrics

graph_raw$libsize

graph_raw$libsize_summary
##    condition   min   1st median   mean    3rd    max
## 1:     mt_ex 70589 79796  92068  89798 102070 104464
## 2:     wt_ex 28906 39952  50998  74363  97092 143185
## 3:     wt_st 32208 75638 119068 119068 162499 205929
## 4:     mt_st 21019 28980  36940  60773  80650 124361
## There are more reads of size == 4 than I assumed.
graph_raw$density

## There appears to me to be a pretty clear density shift.
graph_raw$corheat

## The correlations of the raw data as interesting
graph_raw$boxplot

graph_norm$pcaplot
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

## wow that is a far cleaner split than ever I would have guessed
graph_norm$disheat

graph_batch$pcaplot
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

## An even stronger split when batch is taken into account, so the 3 vs. 10 exclusions do matter.

Look for up/down mers?

Man this seems weird to me, but lets see

Perform DE-ish tests

keepers <- list(
  "mtex_vs_wtex" = c("mt_ex", "wt_ex"),
  "mtst_vs_wtst" = c("mt_st", "wt_st"),
  "wtst_vs_wtex" = c("wt_st", "wt_ex"),
  "mtst_vs_mtex" = c("mt_st", "mt_ex")
  )
comparisons <- sm(all_pairwise(mers_expt, model_batch=FALSE))

fake_annotations <- Biobase::exprs(mers_expt$expressionset)
excel_file <- paste0("excel/", mers, "mers.xlsx")
tables <- sm(combine_de_tables(
  comparisons, excel=excel_file, extra_annot=fake_annotations, keepers=keepers,
  rda=paste0("combined_", mers, ".rda")))
sig_excel <- paste0("excel/top_", mers, "mers.xlsx")

sig <- sm(extract_significant_genes(combined=tables, according_to="deseq", excel=sig_excel, n=16))

Exponential growth delta with respect to wild-type

scatter_ex <- extract_coefficient_scatter(comparisons, x="wt_ex", y="mt_ex", type="deseq")
## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing wt_ex and mt_ex.
uppers_ex <- scatter_ex$data[rownames(scatter_ex$data) %in%
                          rownames(sig$deseq[["ups"]][["mt_ex_vs_wt_ex"]]), ]
downers_ex <- scatter_ex$data[rownames(scatter_ex$data) %in%
                           rownames(sig$deseq[["downs"]][["mt_ex_vs_wt_ex"]]), ]

sc_ex <- scatter_ex$scatter
sc_ex <- sc_ex +
  ggplot2::scale_x_continuous(limits = c(0, 20)) +
  ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_ex

sc_ex <- sc_ex +
  ggrepel::geom_text_repel(data=uppers_ex,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=-0.2, nudge_y=2.5,
                           ggplot2::aes(x=wt_ex, y=mt_ex, label=rownames(uppers_ex))) +
  ggplot2::geom_point(data=uppers_ex, ggplot2::aes(x=wt_ex, y=mt_ex), colour="blue") +
  ggrepel::geom_text_repel(data=downers_ex,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=2.5, nudge_y=-0.2,
                           ggplot2::aes(x=wt_ex, y=mt_ex, label=rownames(downers_ex))) +
  ggplot2::geom_point(data=downers_ex, ggplot2::aes(x=wt_ex, y=mt_ex), colour="blue") +
  ggplot2::scale_x_continuous(limits = c(0, 20)) +
  ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_ex

scatter_ex$both_histogram$plot

up_names <- rownames(uppers_ex)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
  up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
  up_pwm <- MotIV::makePWM(up_prob)
  up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}

down_names <- rownames(downers_ex)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
  down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
  down_pwm <- MotIV::makePWM(down_prob)
  down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}

Stationary growth delta with respect to wild-type

scatter_st <- extract_coefficient_scatter(comparisons, x="wt_st", y="mt_st", type="deseq")
## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing wt_st and mt_st.
uppers_st <- scatter_st$data[rownames(scatter_st$data) %in%
                          rownames(sig$deseq[["ups"]][["mt_st_vs_wt_st"]]), ]
downers_st <- scatter_st$data[rownames(scatter_st$data) %in%
                           rownames(sig$deseq[["downs"]][["mt_st_vs_wt_st"]]), ]

sc_st <- scatter_st$scatter
sc_st <- sc_st +
  ggplot2::scale_x_continuous(limits = c(0, 20)) +
  ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_st
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

sc_st <- sc_st +
  ggrepel::geom_text_repel(data=uppers_st,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=-0.2, nudge_y=2.5,
                           ggplot2::aes(x=wt_st, y=mt_st, label=rownames(uppers_st))) +
  ggplot2::geom_point(data=uppers_st, ggplot2::aes(x=wt_st, y=mt_st), colour="blue") +
  ggrepel::geom_text_repel(data=downers_st,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=2.5, nudge_y=-0.2,
                           ggplot2::aes(x=wt_st, y=mt_st, label=rownames(downers_st))) +
  ggplot2::geom_point(data=downers_st, ggplot2::aes(x=wt_st, y=mt_st), colour="blue")
sc_st
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

## I told it to label the top 10 most 'significant' 4-mers on either side of the distribution.
scatter_st$both_histogram$plot

up_names <- rownames(uppers_st)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
  up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
  up_pwm <- MotIV::makePWM(up_prob)
  up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}

down_names <- rownames(downers_st)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
  down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
  down_pwm <- MotIV::makePWM(down_prob)
  down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}

Wild-type stationary with respect to exponential

scatter_wt <- extract_coefficient_scatter(comparisons, x="wt_ex", y="wt_st", type="deseq")
## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing wt_ex and wt_st.
uppers_wt <- scatter_wt$data[rownames(scatter_wt$data) %in%
                             rownames(sig$deseq[["ups"]][["wt_st_vs_wt_ex"]]), ]
downers_wt <- scatter_wt$data[rownames(scatter_wt$data) %in%
                              rownames(sig$deseq[["downs"]][["wt_st_vs_wt_ex"]]), ]
sc_wt <- scatter_wt$scatter
sc_wt <- sc_wt +
  ggplot2::scale_x_continuous(limits = c(0, 20)) +
  ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_wt
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

sc_wt <- sc_wt +
  ggrepel::geom_text_repel(data=uppers_wt,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=-0.2, nudge_y=2.5,
                           ggplot2::aes(x=wt_ex, y=wt_st, label=rownames(uppers_wt))) +
  ggplot2::geom_point(data=uppers_wt, ggplot2::aes(x=wt_ex, y=wt_st), colour="blue") +
  ggrepel::geom_text_repel(data=downers_wt,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=2.5, nudge_y=-0.2,
                           ggplot2::aes(x=wt_ex, y=wt_st, label=rownames(downers_wt))) +
  ggplot2::geom_point(data=downers_wt, ggplot2::aes(x=wt_ex, y=wt_st), colour="blue") +
  ggplot2::scale_x_continuous(limits = c(0, 20)) +
  ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_wt
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

scatter_wt$both_histogram$plot

up_names <- rownames(uppers_wt)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
  up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
  up_pwm <- MotIV::makePWM(up_prob)
  up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}

down_names <- rownames(downers_ex)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
  down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
  down_pwm <- MotIV::makePWM(down_prob)
  down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}

Delta stationary with respect to exponential

scatter_mt <- extract_coefficient_scatter(comparisons, x="mt_ex", y="mt_st", type="deseq")
## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing mt_ex and mt_st.
uppers_mt <- scatter_mt$data[rownames(scatter_mt$data) %in%
                             rownames(sig$deseq[["ups"]][["mt_st_vs_mt_ex"]]), ]
downers_mt <- scatter_mt$data[rownames(scatter_mt$data) %in%
                              rownames(sig$deseq[["downs"]][["mt_st_vs_mt_ex"]]), ]
sc_mt <- scatter_mt$scatter
sc_mt <- sc_mt +
  ggplot2::scale_x_continuous(limits = c(0, 20)) +
  ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_mt

sc_mt <- sc_mt +
  ggrepel::geom_text_repel(data=uppers_mt,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=-0.2, nudge_y=2.5,
                           ggplot2::aes(x=mt_ex, y=mt_st, label=rownames(uppers_mt))) +
  ggplot2::geom_point(data=uppers_mt, ggplot2::aes(x=mt_ex, y=mt_st), colour="blue") +
  ggrepel::geom_text_repel(data=downers_mt,
                           arrow=arrow(length=unit(0.01, "npc")),
                           nudge_x=2.5, nudge_y=-0.2,
                           ggplot2::aes(x=mt_ex, y=mt_st, label=rownames(downers_mt))) +
  ggplot2::geom_point(data=downers_mt, ggplot2::aes(x=mt_ex, y=mt_st), colour="blue") +
  ggplot2::scale_x_continuous(limits = c(0, 20)) +
  ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_mt

scatter_mt$both_histogram$plot

up_names <- rownames(uppers_mt)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
  up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
  up_pwm <- MotIV::makePWM(up_prob)
  up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}

down_names <- rownames(downers_ex)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
  down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
  down_pwm <- MotIV::makePWM(down_prob)
  down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}

Hmm I am curious, I found some 8-mer which seemed to happen a bunch, is it over-represented in the genome? That sequence was ‘CGGCTGTT’

pa14 <- new.env()
load(file="../savefiles/01_annotation-v20171019.rda.xz", envir=pa14)
pa14 <- pa14$pa14
count_nmer <- function(pattern, genome=pa14$seq, mismatch=0) {
    dict <- Biostrings::PDict(pattern, max.mismatch=mismatch)
    result <- Biostrings::vcountPDict(dict, genome)[1,1]
    return(result)
}
count_nmer("CGGCTGTT", pa14$seq) ## interesting
## This appears to me to be an overrepresented pattern in the genome, as it appears 217 times
count_nmer("AAAGAACT", pa14$seq)

library(Biobase)
all_nmers <- exprs(mers_expt$expressionset)
four_mer_genome <- lapply(rownames(all_nmers), count_nmer)
all_nmers <- as.data.frame(cbind(all_nmers, unlist(four_mer_genome)))
colnames(all_nmers) <- c("hpgl0701","hpgl0702","hpgl0703","hpgl0704",
                         "hpgl0705","hpgl0706","hpgl0707","hpgl0708","genome")
tt = hpgl_norm(all_nmers, transform="log2")
ttt = as.data.frame(tt$count_table)
plot_density(ttt, colors=c("red","red","red","red",
                           "blue","blue","blue","blue","green"))
## The green is the number of times a given 4-mer is found in the genome while red and blue
## are the wt and mutant versions of the same thing

plot_histogram(ttt$genome)
plot_histogram(ttt[[1]])
plot_histogram(ttt[[2]])
plot_histogram(ttt[[3]])
plot_histogram(ttt[[4]])

plot_histogram(ttt[[5]])
plot_histogram(ttt[[6]])
plot_histogram(ttt[[7]])
plot_histogram(ttt[[8]])