mers <- "06"

06mers simple analysis

Visualize the mers

Now I have something like a count table which I will try out and see.

sample_sheet <- paste0("all_samples", mers, ".csv")
mers_expt <- sm(create_expt(metadata=sample_sheet, gene_info=NULL))

Create some metrics

norm_mers <- sm(normalize_expt(mers_expt, transform="log2", convert="cpm", norm="quant"))
graph_raw <- sm(graph_metrics(mers_expt))
graph_norm <- sm(graph_metrics(norm_mers))
filt_mers <- sm(normalize_expt(mers_expt, filter=TRUE))
batch_mers <- sm(normalize_expt(filt_mers, transform="log2", norm="quant",
                                batch="ssva"))
graph_batch <- sm(graph_metrics(batch_mers))

View some metrics

graph_raw$libsize

graph_raw$libsize_summary
## NULL
## There are more reads of size == 4 than I assumed.
graph_raw$density

## There appears to me to be a pretty clear density shift.
graph_raw$corheat

## The correlations of the raw data as interesting
graph_raw$boxplot

graph_norm$pcaplot

## wow that is a far cleaner split than ever I would have guessed
graph_norm$disheat

graph_batch$pcaplot

## An even stronger split when batch is taken into account, so the 3 vs. 10 exclusions do matter.

Look for up/down mers?

Man this seems weird to me, but lets see

weird <- sm(all_pairwise(mers_expt, model_batch="ssva"))
fake_annotations <- Biobase::exprs(mers_expt$expressionset)
excel <- paste0("excel/", mers, "mers.xlsx")
keepers <- list("mt_vs_wt" = c("mt","wt"))
weird_table <- sm(combine_de_tables(
  weird, excel=excel, extra_annot=fake_annotations, keepers=keepers,
  rda=paste0("combined_", mers, ".rda")))
sig_excel <- paste0("excel/top_", mers, "mers.xlsx")

sig <- sm(extract_significant_genes(weird_table, according_to="all", excel=sig_excel, n=16))

weird_scatter <- extract_coefficient_scatter(weird$limma, x="wt", y="mt", type="limma")
## This can do comparisons among the following columns in the pairwise result:
## mt, wt
## Actually comparing wt and mt.
uppers <- weird_scatter$data[rownames(weird_scatter$data) %in% rownames(sig$limma$ups[[1]]), ]
downers <- weird_scatter$data[rownames(weird_scatter$data) %in% rownames(sig$limma$downs[[1]]), ]

sc <- weird_scatter$scatter
sc <- sc +
    ggrepel::geom_text_repel(data=uppers,
                             arrow=arrow(length=unit(0.01, "npc")),
                             nudge_x=-0.2, nudge_y=2.5,
                             ggplot2::aes(x=wt, y=mt, label=rownames(uppers))) +
    ggplot2::geom_point(data=uppers, ggplot2::aes(x=wt, y=mt), colour="blue") +
    ggrepel::geom_text_repel(data=downers,
                             arrow=arrow(length=unit(0.01, "npc")),
                             nudge_x=2.5, nudge_y=-0.2,
                             ggplot2::aes(x=wt, y=mt, label=rownames(downers))) +
    ggplot2::geom_point(data=downers, ggplot2::aes(x=wt, y=mt), colour="blue") +
    ggplot2::scale_x_continuous(limits = c(0, 20)) +
    ggplot2::scale_y_continuous(limits = c(0, 20))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
sc
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

## I told it to label the top 10 most 'significant' 4-mers on either side of the distribution.

weird_scatter$both_histogram$plot

## We can see that the distribution of 4-mers shifts quite a bit in the mutant

up_names <- rownames(uppers)
up_seq <- Biostrings::DNAStringSet(up_names)
down_names <- rownames(downers)
down_seq <- Biostrings::DNAStringSet(down_names)
up_prob <- Biostrings::consensusMatrix(up_seq, as.prob=TRUE)[1:4, ]
up_pwm <- MotIV::makePWM(up_prob)

logo_file <- paste0("images/up_logo_", mers, "mers.png")
png(file=logo_file)
up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))
dev.off()
## X11cairo 
##        2
up_logo <- seqLogo::seqLogo(as.data.frame(up_pwm@pwm))

down_prob <- Biostrings::consensusMatrix(down_seq, as.prob=TRUE)[1:4, ]
down_pwm <- MotIV::makePWM(down_prob)
logo_file <- paste0("images/down_logo_", mers, "mers.png")
png(file=logo_file)
down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))
dev.off()
## X11cairo 
##        2
down_logo <- seqLogo::seqLogo(as.data.frame(down_pwm@pwm))

Hmm I am curious, I found some 8-mer which seemed to happen a bunch, is it over-represented in the genome? That sequence was ‘CGGCTGTT’

pa14 <- new.env()
load(file="../savefiles/01_annotation-v20171019.rda.xz", envir=pa14)
pa14 <- pa14$pa14
count_nmer <- function(pattern, genome=pa14$seq, mismatch=0) {
    dict <- Biostrings::PDict(pattern, max.mismatch=mismatch)
    result <- Biostrings::vcountPDict(dict, genome)[1,1]
    return(result)
}
count_nmer("CGGCTGTT", pa14$seq) ## interesting
## This appears to me to be an overrepresented pattern in the genome, as it appears 217 times
count_nmer("AAAGAACT", pa14$seq)

library(Biobase)
all_nmers <- exprs(mers_expt$expressionset)
four_mer_genome <- lapply(rownames(all_nmers), count_nmer)
all_nmers <- as.data.frame(cbind(all_nmers, unlist(four_mer_genome)))
colnames(all_nmers) <- c("hpgl0701","hpgl0702","hpgl0703","hpgl0704",
                         "hpgl0705","hpgl0706","hpgl0707","hpgl0708","genome")
tt = hpgl_norm(all_nmers, transform="log2")
ttt = as.data.frame(tt$count_table)
plot_density(ttt, colors=c("red","red","red","red",
                           "blue","blue","blue","blue","green"))
## The green is the number of times a given 4-mer is found in the genome while red and blue
## are the wt and mutant versions of the same thing

plot_histogram(ttt$genome)
plot_histogram(ttt[[1]])
plot_histogram(ttt[[2]])
plot_histogram(ttt[[3]])
plot_histogram(ttt[[4]])

plot_histogram(ttt[[5]])
plot_histogram(ttt[[6]])
plot_histogram(ttt[[7]])
plot_histogram(ttt[[8]])