load_biomart_annotations()$annotation hs_annot <-
## The biomart annotations file already exists, loading from it.
rownames(hs_annot) <- make.names(
paste0(hs_annot[["ensembl_transcript_id"]], ".",
"transcript_version"]]),
hs_annot[[unique = TRUE)
hs_annot[, c("ensembl_gene_id", "ensembl_transcript_id")]
hs_tx_gene <-"id"]] <- rownames(hs_tx_gene)
hs_tx_gene[[ hs_tx_gene[, c("id", "ensembl_gene_id")]
hs_tx_gene <- hs_annot
new_hs_annot <-rownames(new_hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE)
Note that as of 20190124, two samples are still missing: hpgl0914 and hpgl0749. I am rerunning their mapping with salmon now with the assumption that I just missed them previously. I noted their status in the ‘skipped’ column of the online sample sheet.
Note 20190125: hpgl0749 mapped; but hpgl0914 has some weirdness still. It looks like the forward reads have a gzip CRC error, so I used zcat to extract the available data and will then retrim/remap.
Note: 20190210: All samples mapped.
Another note, I am using the ‘state’ column, which was missing the field ‘la_infected’ for the Leishmania amazonensis samples; this resulted in a set of ‘NA’ conditions. I therefore added la_infected to the relevant fields to the sample sheet online and my working sheet.
I also found an error in the time attribution for sample hpgl0461. The time was set to undefined, while in the study it was t24h. That has been changed in both the online and my copy of the sample sheet.
##sample_sheet <- "sample_sheets/all_leishmania_samples_20190124.xlsx"
"sample_sheets/all_leishmania_samples_20190225.xlsx"
sample_sheet <- create_expt(sample_sheet,
lots <-file_column = "hsapiensfile",
gene_info = new_hs_annot,
tx_gene_map = hs_tx_gene)
## Reading the sample metadata.
## Dropped 16 rows from the sample metadata because they were blank.
## The sample definitions comprises: 421 rows(samples) and 59 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count data.
## Warning in create_expt(sample_sheet, file_column = "hsapiensfile", gene_info
## = new_hs_annot, : Some samples were removed when cross referencing the samples
## against the count data.
## Matched 16933 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 16933 rows and 267 columns.
Now we have a 267 sample data set, but we only want the samples from the human portion of the mBio paper, which Najib helpfully defined in the ‘study’ column of the sample sheet as ‘mBio’.
Thus I will pull those samples from the sample sheet and set the conditions/batches to what I am assuming are reasonable values. An important caveat: we need to concatenate the existing columns: ‘expt_time’ and ‘state’ in order to get useful values for the condition.
In addition, I am removing the L. amazonensis samples for the moment.
subset_expt(lots, subset = "study=='mbio'") mbio_expt <-
## Using a subset expression.
## There were 267, now there are 82 samples.
subset_expt(mbio_expt, subset = "studybatch!='unused'") mbio_expt <-
## Using a subset expression.
## There were 82, now there are 66 samples.
##mbio_expt <- subset_expt(mbio_expt, subset = "pathogenspecies!='lamazonensis'")
##mbio_expt <- subset_expt(mbio_expt, subset = "donor!='thp1'")
set_expt_batches(mbio_expt, "studybatch")
mbio_expt <- pData(mbio_expt)
metadata <- paste0(metadata[["state"]], "_", metadata[["expttime"]])
new_condition <- set_expt_conditions(mbio_expt, "state") mbio_expt <-
Now make some plots and see if I get similar ones to those observed in the paper.
Here is the link with the PCA plots and such: https://mbio.asm.org/content/7/3/e00027-16/figures-only
Unless I am mistaken, the only things I have to compare against are some fancy PCA plots in the main paper and a few raw-ish ones in the supplemental.
plot_libsize(mbio_expt)
libsize <-$plot libsize
This first plot makes no attempt to handle the various batch effects in the data.
sm(normalize_expt(mbio_expt, transform = "log2", convert = "cpm", filter = TRUE, norm = "quant"))
mbio_norm <- plot_pca(mbio_norm, size_column = "expttime", plot_labels = FALSE, cis = NULL,
mbio_pca <-size_order = c("t4h", "t24h", "t48h", "t72h"))
## Not putting labels on the PC plot.
##mbio_pca <- plot_pca(mbio_norm)
$plot mbio_pca
## Warning: Use of `pca_data[["condition"]]` is discouraged. Use
## `.data[["condition"]]` instead.
In this iteration, we use limma’s function to remove batch effect, which I think is what was used in order to make the figure in the paper. This is borne out by the fact that the image generated is nearly identical to the one in the paper.
function(counts, lib.size = NULL, thresh = 1, minSamples = 2) {
filterCounts = 2^cbcbSEQ::log2CPM(counts, lib.size = lib.size)$y
cpms <- rowSums(cpms > thresh) >= minSamples
keep <- counts[keep, ]
counts <-
counts
} pData(mbio_expt)$condition
condition <- table(condition)
x <- exprs(mbio_expt)
countsTable <-dim(countsTable)
##[1] 20956 72
filterCounts(countsTable, thresh = 1, minSamples = min(x))
counts <-dim(counts)
##[1] 13801 72
##12666 prior to adding in THP1 batch
##Quantile normalize counts
library(preprocessCore)
cbcbSEQ::qNorm(counts)
countsSubQ <-
##Explore data for batch effects
##Transform data to log2 counts per million
cbcbSEQ::log2CPM(countsSubQ)
x <-
##Compute principal components
library(corpcor)
cbcbSEQ::makeSVD(x$y)
s <-
pData(mbio_expt)$batch
batch <-::pcRes(s$v, s$d, condition, batch)
cbcbSEQ::plotPC(s$v, s$d) cbcbSEQ
sm(normalize_expt(t1, transform = "log2", convert = "cpm",
mbio_batch1 <-filter = TRUE, norm = "quant", batch = "limma"))
## Error in normalize_expt(t1, transform = "log2", convert = "cpm", filter = TRUE, : object 't1' not found
plot_pca(mbio_batch1,
mbio_pca1 <-cis = NULL,
size_column = "expttime", size_order = c("t4h", "t24h", "t48h", "t72h"))
## Error in plot_pca(mbio_batch1, cis = NULL, size_column = "expttime", size_order = c("t4h", : object 'mbio_batch1' not found
##mbio_pca <- plot_pca(mbio_norm)
$plot mbio_pca1
## Error in eval(expr, envir, enclos): object 'mbio_pca1' not found
Finally, I employ my favorite method: svaseq(). This squishes the time-based differences in the data and highlights the differences between the various infection states.
sm(normalize_expt(mbio_expt, transform = "log2", convert = "cpm",
mbio_batch2 <-filter = TRUE, batch = "svaseq"))
plot_pca(mbio_batch2, size_column = "expttime", plot_labels = FALSE,
mbio_pca2 <-cis = NULL, size_order = c("t4h", "t24h", "t48h", "t72h"))
## Not putting labels on the PC plot.
##mbio_pca <- plot_pca(mbio_norm)
$plot mbio_pca2
## Warning: Use of `pca_data[["condition"]]` is discouraged. Use
## `.data[["condition"]]` instead.
I do not recall what methods were used to estimate the ‘bead effect’ in the data. Therefore I am copy/pasting the relevant logs from Laura and will then try to recapitulate the tasks performed separately.
I think the makeTab() function is what was used to regenerate the p-values for the bead-adjusted data.
Following the function definition is a representative invocation performed by Laura. (I copy/pasted from her log with minor formatting changes).
## Quantile normalize counts
qNorm(counts)
countsSubQ <-## Specify model
model.matrix(~0+condition+batch)
mod =## Use voom to transform quantile-normalized count data to log2-counts per million, estimate mean-variance relationship
## and use m-v relationship to computer appropriate observational-level weights
voom(countsSubQ, mod)
v <-## Fit a linear model for each gene using the specified design contained in v
lmFit(v)
fit <-
function(contrFit, coef1, coef2, ...) {
makeTab <-## Compute test statistic
pmin(abs(contrFit$t[, coef1]), abs(contrFit$t[, coef2]))
stat <-## Compute pvalue for stat
pmax(contrFit$p.value[, coef1], contrFit$p.value[, coef2])
pval <-## Adjust pvalue for multiple testing
p.adjust(pval, method = "BH")
adj.pval <-## Make the toptable
topTable(contrFit, coef = coef1, sort.by = "none", ...)
tab <- colnames(contrFit$coef)[coef1]
coef1_name <- colnames(contrFit$coef)[coef2]
coef2_name <- data.frame(tab$ID, tab$logFC, contrFit$coef[, coef2], tab$AveExpr,
new_tab <-$t, contrFit$t[, coef2], stat, pval, adj.pval)
tab new_tab[order(-stat), ]
new_tab <-
colnames(new_tab) <- c("ID", paste0("logFC_", coef1_name),
paste0("logFC_", coef2_name),
"AveExpr",
paste0("t_", coef1_name),
paste0("t_", coef2_name),
"stat",
"P.Value",
"adj.P.Value")
new_tab
}
## eBayes finds an F-statistic from the set of t-statistics for that gene
makeContrasts(uninf_inf=(conditioninfLM24-conditionuninf24),
beads24.infLM24.contr.mat <-beads_inf=(conditioninfLM24-conditionbeads24),
levels = v$design)
contrasts.fit(fit, beads24.infLM24.contr.mat)
beads24.infLM24.fit <- eBayes(beads24.infLM24.fit)
beads24.infLM24.eb <-
makeTab(beads24.infLM24.eb, 1, 2, number = nrow(v$E)) beads24.infLM24.topTab <-
As far as the above goes, it mostly makes sense. My question is, how do we get the modified logFC values? Presumably that is later down in the log.
Looking further down I found the following invocations, which partially but incompletely answer my question.
## Define makeTab2 function
## construct a DE result table for infection vs. uninfected and beads
## contrFit: the result of eBayes after conrasts.fit
## cellmeansFit: the cell means fit (lmFit(v) above)
## conjContrasts: the 'conjuctive' null test (infection vs. uninf AND infect vs. beads)
## disjContrast: the 'other' test (beads vs. uninf)
function(contrFit, cellmeansFit, conjContrasts, disjContrast) {
makeTab2 <-## Get average expression for all relevant terms
rowSums(contrFit$contrasts[, c(conjContrasts, disjContrast)] != 0)
contr_level_counts <-## Define the condition levels involved in the tests
names(contr_level_counts)[contr_level_counts > 0]
levels_to_use <-## Extract the average counts for each, make into table
cellmeansFit$coef[, levels_to_use]
ave_expression_mat <- data.frame(ID = rownames(ave_expression_mat))
exp_table <- cbind(exp_table, as.data.frame(ave_expression_mat))
exp_table <-names(exp_table)[-1] <- paste(
"AveExpr", gsub("condition","",levels_to_use),
sep = ":")
## Compute test statistic, adjusted pval, and logFC for conjuctive test
## Add to table
rowMins(abs(contrFit$t[, conjContrasts]))
stat <- rowMaxs(contrFit$p.value[, conjContrasts])
pval <- p.adjust(pval, method = "BH")
adj.pval <- as.data.frame(contrFit$coef[, conjContrasts])
fcs <-names(fcs) <- paste("logFC", names(fcs), sep = ":")
as.data.frame(apply(contrFit$p.value[, conjContrasts], 2,
conj_pvals <-method = "BH"))
p.adjust, names(conj_pvals) <- paste("adj.P.Val", names(conj_pvals), sep = ":")
data.frame(ID = rownames(contrFit))
conj_table <- cbind(conj_table, fcs, conj_pvals, stat = stat, adj.P.Value = adj.pval)
conj_table <-names(conj_table)[seq(2 + 2 * length(conjContrasts), ncol(conj_table))] <- paste(
c("stat","adj.P.Value"),
paste(conjContrasts,collapse = ":"),
sep = ":")
## Make the table for the 'other' test
data.frame(ID = rownames(contrFit),
disj_table <-logFC = contrFit$coef[, disjContrast],
adj.P.Value = p.adjust(contrFit$p.value[, disjContrast], method = "BH"))
names(disj_table)[-1] <- paste(c("logFC", "adj.P.Value"), disjContrast, sep = ":")
## Combine tables, making sure all tables are in the same order
stopifnot(all(exp_table$ID == conj_table$ID & exp_table$ID == disj_table$ID))
cbind(exp_table, conj_table[, -1], disj_table[, -1])
out_table <-
## order output table by the statistic in the disjunctive test
order(-stat)
o <-
out_table[o,]
}
makeContrasts(uninf_inf=((conditioninfLM24-conditionuninf24)-(conditioninfLM4-conditionuninf4)),
infLM4.infLM24.contr.mat <-beads_inf=((conditioninfLM24-conditionbeads24)-(conditioninfLM4-conditionbeads4)),
uninf_beads=((conditionbeads24-conditionuninf24)-(conditionbeads4-conditionuninf4)), levels = v$design)
contrasts.fit(fit, infLM4.infLM24.contr.mat)
infLM4.infLM24.fit <- eBayes(infLM4.infLM24.fit)
infLM4.infLM24.eb <-
makeTab2(infLM4.infLM24.eb, fit, c("uninf_inf", "beads_inf"),
infLM4.infLM24.topTab <-c("uninf_beads"))
I think that is everything performed. If I understand what I see, then it is doing the following:
I do not see how this set of operations gives us a better picture of the effect of beads during an infection. The primary thing I see in it is the modification of the p-values and the compound contrast of (infy-uninfy)-(infx-uninfx) It seems to me that this might instead be time for an interaction model?
With the above in mind, it is pretty trivial for me to perform limma/edger with the same contrasts. I will first invoke my interpretation of the paper contrasts using limma_pairwise() and for the 4 hour data lmajor data.
After rereading the previous implementation, I think I get it. It was in fact using two contrasts: infected/uninfected and infected/beads. It reported the infected/beads result and then took the least significant of the p-value and t statistics of the two contrasts, re-adjusted them, and reported these.
list(
keepers <-"4hpi_uninf" = c("lm_infected_t4h", "uninfected_t4h"),
"4hpi_beads" = c("lm_infected_t4h", "bead_t4h")
) set_expt_conditions(mbio_expt, new_condition)
mbio_filt <- sm(normalize_expt(mbio_filt, filter = TRUE))
mbio_filt <-## Something weird happened, my counts are somehow getting cast as non-integers!
## Thus I am invoking force = TRUE until I figure out what is going on.
sm(all_pairwise(mbio_filt, model_batch = TRUE,
mbio_pairwise <-do_ebseq = FALSE, force = TRUE))
glue::glue("excel/{rundate}_mbio_pairwise_tables-v{ver}.xlsx")
excel_file <- sm(combine_de_tables(mbio_pairwise, keepers = keepers, excel = excel_file)) mbio_tables <-
$comp$heat mbio_pairwise
"plots"]][["4hpi_uninf"]][["deseq_ma_plots"]][["plot"]] mbio_tables[[
I saved the worksheet ‘infLM4_before’ as inline-supplementary-material-5_infLM4_before.csv It is 4 hpi / uninfected, which is happily a contrast I performed.
readr::read_csv("excel/inline-supplementary-material-5_infLM4_before.csv") old_table <-
## Warning: Missing column names filled in: 'X2' [2], 'X3' [3], 'X4' [4], 'X5' [5]
##
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## `DE genes in L. major-infected human macrophages relative to uninfected controls, 4 hpi, not accounting for phagocytosis` = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character()
## )
colnames(old_table) <- old_table[1, ]
as.data.frame(old_table[-1, ])
old_table <-
rownames(old_table) <- old_table[["ID"]]
mbio_tables[["data"]][["4hpi_uninf"]]
new_table <- merge(old_table, new_table, by = "row.names")
common <-dim(common)
## [1] 4308 52
"Fold change"]] <- as.numeric(common[["Fold change"]])
common[[ common[["Fold change"]] < 0
neg_idx <-"Fold change"] <- log2(1 / (-1 * common[neg_idx, "Fold change"]))
common[neg_idx, !neg_idx, "Fold change"] <- log2(common[!neg_idx, "Fold change"])
common[
cor.test(common[["Fold change"]], common[["limma_logfc"]])
##
## Pearson's product-moment correlation
##
## data: common[["Fold change"]] and common[["limma_logfc"]]
## t = 111, df = 4306, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8534 0.8689
## sample estimates:
## cor
## 0.8613
plot_linear_scatter(common[, c("Fold change", "limma_logfc")])
test <-$scatter test
readr::read_csv("excel/inline-supplementary-material-5_infLM4_before.csv") old_table <-
## Warning: Missing column names filled in: 'X2' [2], 'X3' [3], 'X4' [4], 'X5' [5]
##
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## `DE genes in L. major-infected human macrophages relative to uninfected controls, 4 hpi, not accounting for phagocytosis` = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character()
## )
colnames(old_table) <- old_table[1, ]
as.data.frame(old_table[-1, ])
old_table <-rownames(old_table) <- old_table[["ID"]]
mbio_tables[["data"]][["4hpi_beads"]]
new_table <- merge(old_table, new_table, by = "row.names")
common <-dim(common)
## [1] 4308 52
common[["Fold change"]] < 0
neg_idx <-"Fold change"]] <- as.numeric(common[["Fold change"]])
common[["Fold change"] <- log2(1 / (-1 * common[neg_idx, "Fold change"]))
common[neg_idx, !neg_idx, "Fold change"] <- log2(common[!neg_idx, "Fold change"])
common[
cor.test(common[["Fold change"]], common[["limma_logfc"]])
##
## Pearson's product-moment correlation
##
## data: common[["Fold change"]] and common[["limma_logfc"]]
## t = 93, df = 4306, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8058 0.8258
## sample estimates:
## cor
## 0.816
plot_linear_scatter(common[, c("Fold change", "limma_logfc")])
test <-$scatter test
readr::read_csv("excel/inline-supplementary-material-5_infLM4_after.csv") old_table <-
## Warning: Missing column names filled in: 'X2' [2], 'X3' [3], 'X4' [4], 'X5' [5],
## 'X6' [6]
##
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## `DE genes in L. major-infected human macrophages relative to uninfected controls, 4 hpi, with accounting for phagocytosis` = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_character()
## )
colnames(old_table) <- old_table[1, ]
as.data.frame(old_table[-1, ])
old_table <-rownames(old_table) <- old_table[["ID"]]
mbio_tables[["data"]][["4hpi_beads"]]
new_table <- merge(old_table, new_table, by = "row.names")
common <-dim(common)
## [1] 2470 53
"Fold change beads v inf"]] <- log2(as.numeric(common[["Fold change uninf v inf"]])) common[[
## Warning: NaNs produced
cor.test(common[["Fold change beads v inf"]], common[["limma_logfc"]])
##
## Pearson's product-moment correlation
##
## data: common[["Fold change beads v inf"]] and common[["limma_logfc"]]
## t = 36, df = 955, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7353 0.7883
## sample estimates:
## cor
## 0.7631
plot_linear_scatter(common[, c("Fold change beads v inf", "limma_logfc")])
test <-$scatter test
mbio_tables[["data"]][[1]]
t1_infuninf <- mbio_tables[["data"]][[2]]
t2_beaduninf <- data.frame(
lfc_pvals <-row.names = rownames(t1_infuninf),
"l2fc" = t1_infuninf[["limma_logfc"]],
"infuninf" = t1_infuninf[["limma_adjp"]],
"beaduninf" = t2_beaduninf[["limma_adjp"]])
rownames(lfc_pvals) <- rownames(t1_infuninf)
"worst"]] <- 1.0
lfc_pvals[[library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:testthat':
##
## matches
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:hpgltools':
##
## combine
## The following object is masked from 'package:testthat':
##
## matches
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
lfc_pvals %>%
lfc_pvals <- rownames_to_column("rownames") %>%
rowwise() %>%
mutate(worst = pmax(infuninf, beaduninf))
merge(old_table, lfc_pvals, by.x = "ID", by.y = "rownames", all.x = TRUE)
merged <-"adj.P.Value"]] <- as.numeric(merged[["adj.P.Value"]])
merged[["infuninf"]] <- as.numeric(merged[["infuninf"]])
merged[["beaduninf"]] <- as.numeric(merged[["beaduninf"]])
merged[["worst"]] <- as.numeric(merged[["worst"]])
merged[[cor.test(merged[["adj.P.Value"]], merged[["worst"]])
##
## Pearson's product-moment correlation
##
## data: merged[["adj.P.Value"]] and merged[["worst"]]
## t = 12, df = 2468, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1989 0.2734
## sample estimates:
## cor
## 0.2365
cor.test(merged[["adj.P.Value"]], merged[["infuninf"]])
##
## Pearson's product-moment correlation
##
## data: merged[["adj.P.Value"]] and merged[["infuninf"]]
## t = 9, df = 2468, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1408 0.2171
## sample estimates:
## cor
## 0.1792
cor.test(merged[["adj.P.Value"]], merged[["beaduninf"]])
##
## Pearson's product-moment correlation
##
## data: merged[["adj.P.Value"]] and merged[["beaduninf"]]
## t = 12, df = 2468, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1945 0.2691
## sample estimates:
## cor
## 0.2321
## I think this is suggestive that the two pvalue metrics are similar?
In order to recreate figure 5, I think all I need to do is generate a set of logFCs for the mouse data used in Figure 5/Table S7 and compare them. If my regenerated logFCs are similar, then I can call it success, as my human logFCs have a correlation coefficient of ~ 0.96.
The only problem with doing this is that it looks to me that I have data from multiple mouse experiments all with the study name ‘lminfectome’. I think therefore, I must figure out what samples I actually want to use and therefore presumably split the ‘lminfectome’ experiment into a couple/few separate experiments.
load_biomart_annotations(species = "mmusculus")$annotation mm_annot <-
## The biomart annotations file already exists, loading from it.
rownames(mm_annot) <- make.names(
paste0(mm_annot[["ensembl_transcript_id"]], ".",
"transcript_version"]]),
mm_annot[[unique = TRUE)
mm_annot[, c("ensembl_gene_id", "ensembl_transcript_id")]
mm_tx_gene <-"id"]] <- rownames(mm_tx_gene)
mm_tx_gene[[ mm_tx_gene[, c("id", "ensembl_gene_id")]
mm_tx_gene <- mm_annot
new_mm_annot <-rownames(new_mm_annot) <- make.names(mm_annot[["ensembl_gene_id"]], unique = TRUE)
create_expt(sample_sheet,
lots_mm <-file_column = "mmusculusfile",
gene_info = new_mm_annot,
tx_gene_map = mm_tx_gene)
## Reading the sample metadata.
## Dropped 16 rows from the sample metadata because they were blank.
## The sample definitions comprises: 421 rows(samples) and 59 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count data.
## Warning in create_expt(sample_sheet, file_column = "mmusculusfile", gene_info
## = new_mm_annot, : Some samples were removed when cross referencing the samples
## against the count data.
## Matched 19660 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 19660 rows and 100 columns.
subset_expt(lots_mm, subset = "study=='bmc'") bmc_expt <-
## Using a subset expression.
## There were 100, now there are 24 samples.
set_expt_batches(bmc_expt, "studybatch")
bmc_expt <- pData(bmc_expt)
metadata <- paste0(metadata[["infectstate"]], "_", metadata[["expttime"]])
new_condition <- set_expt_conditions(bmc_expt, new_condition)
bmc_expt <-
sm(normalize_expt(bmc_expt, transform = "log2", filter = TRUE,
bmc_norm <-convert = "cpm", norm = "quant"))
plot_pca(bmc_norm)
test <-$plot test
sm(normalize_expt(bmc_expt, transform = "log2", filter = TRUE,
bmc_normbatch <-convert = "cpm", norm = "quant", batch = "limma"))
plot_pca(bmc_normbatch)
test <-$plot test
sm(normalize_expt(bmc_expt, transform = "log2", filter = TRUE,
bmc_normbatch <-convert = "cpm", norm = "quant", batch = "svaseq"))
plot_pca(bmc_normbatch)
test <-$plot test
## This plot is similar to the one observed in figure 2B of
## https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-015-2237-2
## YAY!
all_pairwise(bmc_expt, model_batch = TRUE, force = TRUE) bmc_pairwise <-
## Plotting a PCA before surrogate/batch inclusion.
## Not putting labels on the PC plot.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Not putting labels on the PC plot.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
sm(combine_de_tables(bmc_pairwise))
bmc_tables <-
readr::read_tsv("excel/inline-supplementary-material-7.csv", skip = 1) compare_table <-
##
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## `Gene Symbol` = col_character(),
## `Human ID` = col_character(),
## `logFC in human` = col_double(),
## `Mouse ID` = col_character(),
## `logFC in mouse` = col_double()
## )
bmc_tables$data[["yes_t4h_vs_no_t4h"]]
bmc_table <- merge(compare_table, bmc_table, by.x = "Mouse ID", by.y = "row.names", all.x = TRUE)
compare_merged <-cor.test(compare_merged[["logFC in mouse"]], compare_merged[["limma_logfc"]])
##
## Pearson's product-moment correlation
##
## data: compare_merged[["logFC in mouse"]] and compare_merged[["limma_logfc"]]
## t = 133, df = 1557, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9542 0.9623
## sample estimates:
## cor
## 0.9584
In the following blocks, I will attempt to directly address the ‘Big Questions’ posed by Najib in my TODO document. For the moment, it will not be very well organized, as I did some of this stuff before Najib’s questions appeared. Once I finish, I will reorganize it though.
Which genes are DE in human macrophages at 4 hours upon infection with L. major?
This question was addressed above.
This question is mostly addressed above, but needs to be expanded slightly. It is not a very interesting question to me, to be honest; since my degree of agreement with Laura’s previous analyses is very high, I am content to just say: “whatever she said is correct for this question, lets move to something new.”