mers <- "04"Now I have something like a count table which I will try out and see.
sample_sheet <- paste0("all_samples", mers, ".csv")
mers_expt <- sm(create_expt(metadata=sample_sheet, gene_info=NULL))norm_mers <- sm(normalize_expt(mers_expt, transform="log2", convert="cpm", norm="quant"))
graph_raw <- sm(graph_metrics(mers_expt))graph_norm <- sm(graph_metrics(norm_mers))filt_mers <- sm(normalize_expt(mers_expt, filter=TRUE))
batch_mers <- sm(normalize_expt(filt_mers, transform="log2", norm="quant",
batch="ssva"))
graph_batch <- sm(graph_metrics(batch_mers))graph_raw$libsizegraph_raw$libsize_summary## condition min 1st median mean 3rd max
## 1: mt_ex 11824 15464 19157 18700 22393 24661
## 2: wt_ex 6770 7892 9013 14482 18338 27663
## 3: wt_st 6456 13219 19982 19982 26746 33509
## 4: mt_st 4760 6672 8584 14033 18669 28754
## There are more reads of size == 4 than I assumed.
graph_raw$density## There appears to me to be a pretty clear density shift.
graph_raw$corheat## The correlations of the raw data as interesting
graph_raw$boxplotgraph_norm$pcaplot## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## wow that is a far cleaner split than ever I would have guessed
graph_norm$disheatgraph_batch$pcaplot## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## An even stronger split when batch is taken into account, so the 3 vs. 10 exclusions do matter.Man this seems weird to me, but lets see
keepers <- list(
"mtex_vs_wtex" = c("mt_ex", "wt_ex"),
"mtst_vs_wtst" = c("mt_st", "wt_st"),
"wtst_vs_wtex" = c("wt_st", "wt_ex"),
"mtst_vs_mtex" = c("mt_st", "mt_ex")
)
comparisons <- sm(all_pairwise(mers_expt, model_batch=FALSE))fake_annotations <- Biobase::exprs(mers_expt$expressionset)
excel_file <- paste0("excel/", mers, "mers.xlsx")
tables <- sm(combine_de_tables(
comparisons, excel=excel_file, extra_annot=fake_annotations, keepers=keepers,
rda=paste0("combined_", mers, ".rda")))
sig_excel <- paste0("excel/top_", mers, "mers.xlsx")sig <- sm(extract_significant_genes(combined=tables, according_to="deseq", excel=sig_excel, n=16))scatter_ex <- extract_coefficient_scatter(comparisons, x="wt_ex", y="mt_ex", type="deseq")## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing wt_ex and mt_ex.
uppers_ex <- scatter_ex$data[rownames(scatter_ex$data) %in%
rownames(sig$deseq[["ups"]][["mt_ex_vs_wt_ex"]]), ]
downers_ex <- scatter_ex$data[rownames(scatter_ex$data) %in%
rownames(sig$deseq[["downs"]][["mt_ex_vs_wt_ex"]]), ]
sc_ex <- scatter_ex$scatter
sc_ex <- sc_ex +
ggplot2::scale_x_continuous(limits = c(0, 20)) +
ggplot2::scale_y_continuous(limits = c(0, 20))## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_exsc_ex <- sc_ex +
ggrepel::geom_text_repel(data=uppers_ex,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=-0.2, nudge_y=2.5,
ggplot2::aes(x=wt_ex, y=mt_ex, label=rownames(uppers_ex))) +
ggplot2::geom_point(data=uppers_ex, ggplot2::aes(x=wt_ex, y=mt_ex), colour="blue") +
ggrepel::geom_text_repel(data=downers_ex,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=2.5, nudge_y=-0.2,
ggplot2::aes(x=wt_ex, y=mt_ex, label=rownames(downers_ex))) +
ggplot2::geom_point(data=downers_ex, ggplot2::aes(x=wt_ex, y=mt_ex), colour="blue") +
ggplot2::scale_x_continuous(limits = c(0, 20)) +
ggplot2::scale_y_continuous(limits = c(0, 20))## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_exscatter_ex$both_histogram$plotup_names <- rownames(uppers_ex)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
up_pwm <- MotIV::makePWM(up_prob)
up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}down_names <- rownames(downers_ex)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
down_pwm <- MotIV::makePWM(down_prob)
down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}scatter_st <- extract_coefficient_scatter(comparisons, x="wt_st", y="mt_st", type="deseq")## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing wt_st and mt_st.
uppers_st <- scatter_st$data[rownames(scatter_st$data) %in%
rownames(sig$deseq[["ups"]][["mt_st_vs_wt_st"]]), ]
downers_st <- scatter_st$data[rownames(scatter_st$data) %in%
rownames(sig$deseq[["downs"]][["mt_st_vs_wt_st"]]), ]
sc_st <- scatter_st$scatter
sc_st <- sc_st +
ggplot2::scale_x_continuous(limits = c(0, 20)) +
ggplot2::scale_y_continuous(limits = c(0, 20))## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_stsc_st <- sc_st +
ggrepel::geom_text_repel(data=uppers_st,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=-0.2, nudge_y=2.5,
ggplot2::aes(x=wt_st, y=mt_st, label=rownames(uppers_st))) +
ggplot2::geom_point(data=uppers_st, ggplot2::aes(x=wt_st, y=mt_st), colour="blue") +
ggrepel::geom_text_repel(data=downers_st,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=2.5, nudge_y=-0.2,
ggplot2::aes(x=wt_st, y=mt_st, label=rownames(downers_st))) +
ggplot2::geom_point(data=downers_st, ggplot2::aes(x=wt_st, y=mt_st), colour="blue")
sc_st## I told it to label the top 10 most 'significant' 4-mers on either side of the distribution.
scatter_st$both_histogram$plotup_names <- rownames(uppers_st)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
up_pwm <- MotIV::makePWM(up_prob)
up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}down_names <- rownames(downers_st)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
down_pwm <- MotIV::makePWM(down_prob)
down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}scatter_wt <- extract_coefficient_scatter(comparisons, x="wt_ex", y="wt_st", type="deseq")## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing wt_ex and wt_st.
uppers_wt <- scatter_wt$data[rownames(scatter_wt$data) %in%
rownames(sig$deseq[["ups"]][["wt_st_vs_wt_ex"]]), ]
downers_wt <- scatter_wt$data[rownames(scatter_wt$data) %in%
rownames(sig$deseq[["downs"]][["wt_st_vs_wt_ex"]]), ]
sc_wt <- scatter_wt$scatter
sc_wt <- sc_wt +
ggplot2::scale_x_continuous(limits = c(0, 20)) +
ggplot2::scale_y_continuous(limits = c(0, 20))## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_wtsc_wt <- sc_wt +
ggrepel::geom_text_repel(data=uppers_wt,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=-0.2, nudge_y=2.5,
ggplot2::aes(x=wt_ex, y=wt_st, label=rownames(uppers_wt))) +
ggplot2::geom_point(data=uppers_wt, ggplot2::aes(x=wt_ex, y=wt_st), colour="blue") +
ggrepel::geom_text_repel(data=downers_wt,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=2.5, nudge_y=-0.2,
ggplot2::aes(x=wt_ex, y=wt_st, label=rownames(downers_wt))) +
ggplot2::geom_point(data=downers_wt, ggplot2::aes(x=wt_ex, y=wt_st), colour="blue") +
ggplot2::scale_x_continuous(limits = c(0, 20)) +
ggplot2::scale_y_continuous(limits = c(0, 20))## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_wtscatter_wt$both_histogram$plotup_names <- rownames(uppers_wt)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
up_pwm <- MotIV::makePWM(up_prob)
up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}
down_names <- rownames(downers_ex)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
down_pwm <- MotIV::makePWM(down_prob)
down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}scatter_mt <- extract_coefficient_scatter(comparisons, x="mt_ex", y="mt_st", type="deseq")## This can do comparisons among the following columns in the pairwise result:
## mt_ex, mt_st, wt_ex, wt_st
## Actually comparing mt_ex and mt_st.
uppers_mt <- scatter_mt$data[rownames(scatter_mt$data) %in%
rownames(sig$deseq[["ups"]][["mt_st_vs_mt_ex"]]), ]
downers_mt <- scatter_mt$data[rownames(scatter_mt$data) %in%
rownames(sig$deseq[["downs"]][["mt_st_vs_mt_ex"]]), ]
sc_mt <- scatter_mt$scatter
sc_mt <- sc_mt +
ggplot2::scale_x_continuous(limits = c(0, 20)) +
ggplot2::scale_y_continuous(limits = c(0, 20))## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_mtsc_mt <- sc_mt +
ggrepel::geom_text_repel(data=uppers_mt,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=-0.2, nudge_y=2.5,
ggplot2::aes(x=mt_ex, y=mt_st, label=rownames(uppers_mt))) +
ggplot2::geom_point(data=uppers_mt, ggplot2::aes(x=mt_ex, y=mt_st), colour="blue") +
ggrepel::geom_text_repel(data=downers_mt,
arrow=arrow(length=unit(0.01, "npc")),
nudge_x=2.5, nudge_y=-0.2,
ggplot2::aes(x=mt_ex, y=mt_st, label=rownames(downers_mt))) +
ggplot2::geom_point(data=downers_mt, ggplot2::aes(x=mt_ex, y=mt_st), colour="blue") +
ggplot2::scale_x_continuous(limits = c(0, 20)) +
ggplot2::scale_y_continuous(limits = c(0, 20))## Scale for 'x' is already present. Adding another scale for 'x', which will replace the
## existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the
## existing scale.
sc_mtscatter_mt$both_histogram$plotup_names <- rownames(uppers_mt)
up_seq <- Biostrings::DNAStringSet(up_names)
if (length(up_seq) > 0) {
up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
up_pwm <- MotIV::makePWM(up_prob)
up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
}down_names <- rownames(downers_ex)
down_seq <- Biostrings::DNAStringSet(down_names)
if (length(down_seq) > 0) {
down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
down_pwm <- MotIV::makePWM(down_prob)
down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
}Hmm I am curious, I found some 8-mer which seemed to happen a bunch, is it over-represented in the genome? That sequence was ‘CGGCTGTT’
pa14 <- new.env()
load(file="../savefiles/01_annotation-v20171019.rda.xz", envir=pa14)
pa14 <- pa14$pa14
count_nmer <- function(pattern, genome=pa14$seq, mismatch=0) {
dict <- Biostrings::PDict(pattern, max.mismatch=mismatch)
result <- Biostrings::vcountPDict(dict, genome)[1,1]
return(result)
}
count_nmer("CGGCTGTT", pa14$seq) ## interesting
## This appears to me to be an overrepresented pattern in the genome, as it appears 217 times
count_nmer("AAAGAACT", pa14$seq)
library(Biobase)
all_nmers <- exprs(mers_expt$expressionset)
four_mer_genome <- lapply(rownames(all_nmers), count_nmer)
all_nmers <- as.data.frame(cbind(all_nmers, unlist(four_mer_genome)))
colnames(all_nmers) <- c("hpgl0701","hpgl0702","hpgl0703","hpgl0704",
"hpgl0705","hpgl0706","hpgl0707","hpgl0708","genome")
tt = hpgl_norm(all_nmers, transform="log2")
ttt = as.data.frame(tt$count_table)
plot_density(ttt, colors=c("red","red","red","red",
"blue","blue","blue","blue","green"))
## The green is the number of times a given 4-mer is found in the genome while red and blue
## are the wt and mutant versions of the same thing
plot_histogram(ttt$genome)
plot_histogram(ttt[[1]])
plot_histogram(ttt[[2]])
plot_histogram(ttt[[3]])
plot_histogram(ttt[[4]])
plot_histogram(ttt[[5]])
plot_histogram(ttt[[6]])
plot_histogram(ttt[[7]])
plot_histogram(ttt[[8]])