## The biomart annotations file already exists, loading from it.
rownames(hs_annot) <- make.names(
paste0(hs_annot[["ensembl_transcript_id"]], ".",
hs_annot[["transcript_version"]]),
unique=TRUE)
hs_tx_gene <- hs_annot[, c("ensembl_gene_id", "ensembl_transcript_id")]
hs_tx_gene[["id"]] <- rownames(hs_tx_gene)
hs_tx_gene <- hs_tx_gene[, c("id", "ensembl_gene_id")]
new_hs_annot <- hs_annot
rownames(new_hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)
lots <- create_expt(metadata="sample_sheets/many_samples.xlsx",
gene_info=new_hs_annot,
tx_gene_map=hs_tx_gene)## Reading the sample metadata.
## The sample definitions comprises: 285 rows(samples) and 30 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 19629 annotations and counts.
## Bringing together the count matrix and gene information.
## Hey, your new gene map IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
The gene set collections available in GSVA are a bit out of date; thus the following should create a fresh copy using the data files downloaded from Broad.
There are two redundant ways to get the same GSC. In the first instance, we parse the out of the larger XML file provided by Broad.
The following instance is the one I will actually use, just because I feel like it will take less time to get them from the gmt files.
## In this second instance, we get them from the gmt file for c7 explicitly.
broad_c7 <- GSEABase::getGmt("reference/msigdb_v6.2/c7.all.v6.2.entrez.gmt",
collectionType=GSEABase::BroadCollection(category="c7"),
geneIdType=GSEABase::EntrezIdentifier())
library(xCell)
data(xCell.data)
xcell_data <- xCell.data[["signatures"]]
xcell_entrez <- convert_gsc_ids(xcell_data, from_type="SYMBOL")## Converting the rownames() of the expressionset to ENTREZID.
Take the above expressionset and perform GSVA using it and a simple, controlled subset as the trainer.
## There were 266, now there are 203 samples.
## This function will replace the expt$expressionset slot with:
## rpkm(hpgl(data))
## It backs up the current data into a slot named:
## expt$backup_expressionset. It will also save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep the libsizes in mind
## when invoking limma. The appropriate libsize is the non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: hpgl
## Removing 5595 low-count genes (14034 remaining).
## Step 2: not normalizing the data.
## Step 3: converting the data with rpkm.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
pair_delta <- exprs(macr_norm)[, "hpgl0376"] / exprs(macr_norm)[, "hpgl0375"]
pair_up_idx <- order(pair_delta, decreasing=TRUE)
pair_down_idx <- order(pair_delta, decreasing=FALSE)
pair_up <- pair_delta[pair_up_idx]
pair_down <- pair_delta[pair_down_idx]
single_inf <- exprs(macr_norm)[, "hpgl0376"]
single_uninf <- exprs(macr_norm)[, "hpgl0375"]
singleinf_high_idx <- order(single_inf, decreasing=TRUE)
singleinf_low_idx <- order(single_inf, decreasing=FALSE)
singleinf_high <- single_inf[singleinf_high_idx]
singleinf_low <- single_inf[singleinf_low_idx]
singleuninf_high_idx <- order(single_uninf, decreasing=TRUE)
singleuninf_low_idx <- order(single_uninf, decreasing=FALSE)
singleuninf_high <- single_uninf[singleuninf_high_idx]
singleuninf_low <- single_uninf[singleuninf_low_idx]
num <- 100
up <- names(head(pair_up, n=num))
dn <- names(head(pair_down, n=num))
pair_gsc <- make_gsc_from_ids(up, dn, pair_names=c("up", "dn"))## Converting the rownames() of the expressionset to ENTREZID.
hghinf <- names(head(singleinf_high, n=num))
lowinf <- names(head(singleinf_low, n=num))
inf_gsc <- make_gsc_from_ids(hghinf, lowinf,
category_name="infected",
pair_names=c("hgh", "low"))## Converting the rownames() of the expressionset to ENTREZID.
hghuninf <- names(head(singleuninf_high, n=num))
lowuninf <- names(head(singleuninf_low, n=num))
uninf_gsc <- make_gsc_from_ids(hghuninf, lowuninf,
category_name="uninfected",
pair_names=c("hgh", "low"))## Converting the rownames() of the expressionset to ENTREZID.
our_lst <- list(
pair_gsc[[1]], pair_gsc[[2]],
inf_gsc[[1]], inf_gsc[[2]],
uninf_gsc[[1]], uninf_gsc[[2]])
names(our_lst) <- c(names(pair_gsc)[1], names(pair_gsc)[2],
names(inf_gsc)[1], names(inf_gsc)[2],
names(uninf_gsc)[1], names(uninf_gsc)[2])
for (c in 1:length(our_lst)) {
name <- names(our_lst)[c]
gs <- our_lst[[c]]
collection[[name]] <- gs
}
tt <- sm(library(GSEABase))
collection_gsc <- GeneSetCollection(collection)our_test <- sm(simple_gsva(macr_norm, signatures=collection_gsc))
our_test_data <- exprs(our_test$expt)
rank_score <- order(our_test_data[, "hpgl0376"], decreasing=TRUE)
top_rank <- our_test_data[rank_score, ]
top_100 <- head(top_rank[, "hpgl0376"], n=100)
head(print(names(top_100)), n=40)## [1] "Melanocytes%FANTOM%1.txt"
## [2] "GSE9988_ANTI_TREM1_VS_VEHICLE_TREATED_MONOCYTES_UP"
## [3] "GSE9988_ANTI_TREM1_VS_CTRL_TREATED_MONOCYTES_UP"
## [4] "GSE9988_ANTI_TREM1_AND_LPS_VS_CTRL_TREATED_MONOCYTES_UP"
## [5] "Mesangial cells%ENCODE%2.txt"
## [6] "GSE9988_ANTI_TREM1_AND_LPS_VS_VEHICLE_TREATED_MONOCYTES_UP"
## [7] "Epithelial cells%HPCA%2.txt"
## [8] "GSE22140_HEALTHY_VS_ARTHRITIC_GERMFREE_MOUSE_CD4_TCELL_DN"
## [9] "Chondrocytes%HPCA%3.txt"
## [10] "GSE9988_LOW_LPS_VS_ANTI_TREM1_AND_LPS_MONOCYTE_DN"
## [11] "ELSAYED_MACROPHAGE_INFECTED_HGH"
## [12] "Epithelial cells%HPCA%1.txt"
## [13] "GSE9988_LPS_VS_LPS_AND_ANTI_TREM1_MONOCYTE_DN"
## [14] "NK cells%NOVERSHTERN%1.txt"
## [15] "Monocytes%IRIS%2.txt"
## [16] "GSE9988_LOW_LPS_VS_CTRL_TREATED_MONOCYTE_UP"
## [17] "Neurons%ENCODE%1.txt"
## [18] "Monocytes%BLUEPRINT%3.txt"
## [19] "GSE2706_UNSTIM_VS_2H_R848_DC_DN"
## [20] "GSE9988_LOW_LPS_VS_VEHICLE_TREATED_MONOCYTE_UP"
## [21] "GSE9988_LPS_VS_VEHICLE_TREATED_MONOCYTE_UP"
## [22] "ELSAYED_MACROPHAGE_UNINFECTED_HGH"
## [23] "cDC%HPCA%3.txt"
## [24] "GSE22140_GERMFREE_VS_SPF_MOUSE_CD4_TCELL_DN"
## [25] "GSE22886_DAY0_VS_DAY1_MONOCYTE_IN_CULTURE_DN"
## [26] "Monocytes%NOVERSHTERN%3.txt"
## [27] "Chondrocytes%HPCA%2.txt"
## [28] "GSE19401_NAIVE_VS_IMMUNIZED_MOUSE_PLN_FOLLICULAR_DC_UP"
## [29] "Keratinocytes%ENCODE%3.txt"
## [30] "GSE9988_LPS_VS_CTRL_TREATED_MONOCYTE_UP"
## [31] "GSE2706_UNSTIM_VS_2H_LPS_DC_DN"
## [32] "HSC%FANTOM%3.txt"
## [33] "HSC%NOVERSHTERN%2.txt"
## [34] "GSE22196_HEALTHY_VS_OBESE_MOUSE_SKIN_GAMMADELTA_TCELL_DN"
## [35] "Monocytes%IRIS%1.txt"
## [36] "Eosinophils%BLUEPRINT%3.txt"
## [37] "Sebocytes%FANTOM%2.txt"
## [38] "GSE29617_CTRL_VS_DAY7_TIV_FLU_VACCINE_PBMC_2008_UP"
## [39] "HSC%FANTOM%2.txt"
## [40] "Monocytes%HPCA%1.txt"
## [41] "Eosinophils%NOVERSHTERN%1.txt"
## [42] "Monocytes%IRIS%3.txt"
## [43] "Monocytes%BLUEPRINT%1.txt"
## [44] "GSE2706_UNSTIM_VS_2H_LPS_AND_R848_DC_DN"
## [45] "Eosinophils%BLUEPRINT%1.txt"
## [46] "GSE36891_POLYIC_TLR3_VS_PAM_TLR2_STIM_PERITONEAL_MACROPHAGE_UP"
## [47] "GSE9988_ANTI_TREM1_VS_LOW_LPS_MONOCYTE_UP"
## [48] "GSE1432_1H_VS_24H_IFNG_MICROGLIA_UP"
## [49] "GSE9988_ANTI_TREM1_VS_LPS_MONOCYTE_UP"
## [50] "GSE14769_UNSTIM_VS_80MIN_LPS_BMDM_DN"
## [51] "GSE18791_CTRL_VS_NEWCASTLE_VIRUS_DC_2H_DN"
## [52] "Monocytes%NOVERSHTERN%2.txt"
## [53] "pDC%FANTOM%2.txt"
## [54] "aDC%IRIS%1.txt"
## [55] "GMP%NOVERSHTERN%1.txt"
## [56] "GMP%NOVERSHTERN%2.txt"
## [57] "GMP%NOVERSHTERN%3.txt"
## [58] "GSE37605_TREG_VS_TCONV_NOD_FOXP3_FUSION_GFP_UP"
## [59] "GSE22886_DAY1_VS_DAY7_MONOCYTE_IN_CULTURE_UP"
## [60] "NK cells%NOVERSHTERN%2.txt"
## [61] "Chondrocytes%HPCA%1.txt"
## [62] "CMP%BLUEPRINT%1.txt"
## [63] "GSE9988_ANTI_TREM1_VS_ANTI_TREM1_AND_LPS_MONOCYTE_DN"
## [64] "Tregs%HPCA%1.txt"
## [65] "Erythrocytes%FANTOM%2.txt"
## [66] "GSE36891_UNSTIM_VS_POLYIC_TLR3_STIM_PERITONEAL_MACROPHAGE_UP"
## [67] "GSE14769_UNSTIM_VS_40MIN_LPS_BMDM_DN"
## [68] "GSE30971_CTRL_VS_LPS_STIM_MACROPHAGE_WBP7_HET_2H_UP"
## [69] "CMP%BLUEPRINT%2.txt"
## [70] "Tregs%HPCA%3.txt"
## [71] "GSE19401_UNSTIM_VS_RETINOIC_ACID_AND_PAM2CSK4_STIM_FOLLICULAR_DC_DN"
## [72] "GSE14769_UNSTIM_VS_60MIN_LPS_BMDM_DN"
## [73] "GSE12392_IFNAR_KO_VS_IFNB_KO_CD8_NEG_SPLEEN_DC_UP"
## [74] "iDC%HPCA%3.txt"
## [75] "CMP%BLUEPRINT%3.txt"
## [76] "GSE25123_CTRL_VS_ROSIGLITAZONE_STIM_PPARG_KO_MACROPHAGE_UP"
## [77] "Myocytes%FANTOM%2.txt"
## [78] "Sebocytes%FANTOM%1.txt"
## [79] "pDC%FANTOM%3.txt"
## [80] "GSE46606_UNSTIM_VS_CD40L_IL2_IL5_1DAY_STIMULATED_IRF4HIGH_SORTED_BCELL_DN"
## [81] "GSE3920_UNTREATED_VS_IFNG_TREATED_ENDOTHELIAL_CELL_UP"
## [82] "NK cells%FANTOM%1.txt"
## [83] "Eosinophils%BLUEPRINT%2.txt"
## [84] "CD4+ Tem%NOVERSHTERN%2.txt"
## [85] "CD4+ Tem%NOVERSHTERN%3.txt"
## [86] "GSE30971_CTRL_VS_LPS_STIM_MACROPHAGE_WBP7_KO_2H_UP"
## [87] "GSE30971_CTRL_VS_LPS_STIM_MACROPHAGE_WBP7_KO_4H_UP"
## [88] "Sebocytes%FANTOM%3.txt"
## [89] "GSE369_PRE_VS_POST_IL6_INJECTION_SOCS3_KO_LIVER_UP"
## [90] "GSE6269_FLU_VS_STAPH_AUREUS_INF_PBMC_DN"
## [91] "GSE40666_STAT1_KO_VS_STAT4_KO_CD8_TCELL_DN"
## [92] "GSE45365_WT_VS_IFNAR_KO_CD8A_DC_DN"
## [93] "Pericytes%FANTOM%2.txt"
## [94] "GSE27434_WT_VS_DNMT1_KO_TREG_DN"
## [95] "Astrocytes%ENCODE%3.txt"
## [96] "GSE34156_NOD2_LIGAND_VS_TLR1_TLR2_LIGAND_6H_TREATED_MONOCYTE_DN"
## [97] "GSE19923_WT_VS_HEB_AND_E2A_KO_DP_THYMOCYTE_DN"
## [98] "GSE21360_SECONDARY_VS_QUATERNARY_MEMORY_CD8_TCELL_UP"
## [99] "GSE29617_CTRL_VS_TIV_FLU_VACCINE_PBMC_2008_UP"
## [100] "Neurons%FANTOM%1.txt"
## [1] "Melanocytes%FANTOM%1.txt"
## [2] "GSE9988_ANTI_TREM1_VS_VEHICLE_TREATED_MONOCYTES_UP"
## [3] "GSE9988_ANTI_TREM1_VS_CTRL_TREATED_MONOCYTES_UP"
## [4] "GSE9988_ANTI_TREM1_AND_LPS_VS_CTRL_TREATED_MONOCYTES_UP"
## [5] "Mesangial cells%ENCODE%2.txt"
## [6] "GSE9988_ANTI_TREM1_AND_LPS_VS_VEHICLE_TREATED_MONOCYTES_UP"
## [7] "Epithelial cells%HPCA%2.txt"
## [8] "GSE22140_HEALTHY_VS_ARTHRITIC_GERMFREE_MOUSE_CD4_TCELL_DN"
## [9] "Chondrocytes%HPCA%3.txt"
## [10] "GSE9988_LOW_LPS_VS_ANTI_TREM1_AND_LPS_MONOCYTE_DN"
## [11] "ELSAYED_MACROPHAGE_INFECTED_HGH"
## [12] "Epithelial cells%HPCA%1.txt"
## [13] "GSE9988_LPS_VS_LPS_AND_ANTI_TREM1_MONOCYTE_DN"
## [14] "NK cells%NOVERSHTERN%1.txt"
## [15] "Monocytes%IRIS%2.txt"
## [16] "GSE9988_LOW_LPS_VS_CTRL_TREATED_MONOCYTE_UP"
## [17] "Neurons%ENCODE%1.txt"
## [18] "Monocytes%BLUEPRINT%3.txt"
## [19] "GSE2706_UNSTIM_VS_2H_R848_DC_DN"
## [20] "GSE9988_LOW_LPS_VS_VEHICLE_TREATED_MONOCYTE_UP"
## [21] "GSE9988_LPS_VS_VEHICLE_TREATED_MONOCYTE_UP"
## [22] "ELSAYED_MACROPHAGE_UNINFECTED_HGH"
## [23] "cDC%HPCA%3.txt"
## [24] "GSE22140_GERMFREE_VS_SPF_MOUSE_CD4_TCELL_DN"
## [25] "GSE22886_DAY0_VS_DAY1_MONOCYTE_IN_CULTURE_DN"
## [26] "Monocytes%NOVERSHTERN%3.txt"
## [27] "Chondrocytes%HPCA%2.txt"
## [28] "GSE19401_NAIVE_VS_IMMUNIZED_MOUSE_PLN_FOLLICULAR_DC_UP"
## [29] "Keratinocytes%ENCODE%3.txt"
## [30] "GSE9988_LPS_VS_CTRL_TREATED_MONOCYTE_UP"
## [31] "GSE2706_UNSTIM_VS_2H_LPS_DC_DN"
## [32] "HSC%FANTOM%3.txt"
## [33] "HSC%NOVERSHTERN%2.txt"
## [34] "GSE22196_HEALTHY_VS_OBESE_MOUSE_SKIN_GAMMADELTA_TCELL_DN"
## [35] "Monocytes%IRIS%1.txt"
## [36] "Eosinophils%BLUEPRINT%3.txt"
## [37] "Sebocytes%FANTOM%2.txt"
## [38] "GSE29617_CTRL_VS_DAY7_TIV_FLU_VACCINE_PBMC_2008_UP"
## [39] "HSC%FANTOM%2.txt"
## [40] "Monocytes%HPCA%1.txt"
The scores from gsva are a little difficult to interpret, but follow a normal distribution. Thus the following block serves to rescore them as their z-score within their distribution.
hpgl_0376_likelihoods <- gsva_likelihoods(our_test, sample="hpgl0376")
hpgl_0375_likelihoods <- gsva_likelihoods(our_test, sample="hpgl0375")
es_infected_hgh_likelihoods <- gsva_likelihoods(our_test, category="ELSAYED_MACROPHAGE_INFECTED_HGH")## Examining row: ELSAYED_MACROPHAGE_INFECTED_HGH against the data.
## Here is a histogram showing the odd distributions of scores after rescoring...
plot_histogram(hpgl_0376_likelihoods)A corollary question from scoring: what signatures are shared/unique to each set of genes, infected/uninfected? We can use a set of venn diagrams to explore that question…
sets <- list("inf" = names(inf_top200), "uninf" = names(uninf_top200))
venn <- Vennerable::Venn(Sets=sets)
plot(venn, doWeights=FALSE)intersections <- venn@IntersectionSets
inf_alone_signatures <- intersections[[2]]
annot <- fData(our_test$expt)
inf_annot <- annot[inf_alone_signatures, ]
gene_ids <- inf_annot[[2]]
ret <- list()
for (i in 1:length(gene_ids)) {
id_lst <- gene_ids[i]
ids <- strsplit(id_lst, ", ")[[1]]
for (j in 1:length(ids)) {
element <- ids[j]
if (is.null(ret[[element]])) {
ret[[element]] <- 1
} else {
ret[[element]] <- ret[[element]] + 1
}
}
}
ret <- ret[order(as.numeric(ret), decreasing=TRUE) ]uninf_alone_signatures <- intersections[[3]]
annot <- fData(our_test$expt)
uninf_annot <- annot[uninf_alone_signatures, ]
gene_ids <- uninf_annot[[2]]
ret2 <- list()
for (i in 1:length(gene_ids)) {
id_lst <- gene_ids[i]
ids <- strsplit(id_lst, ", ")[[1]]
for (j in 1:length(ids)) {
element <- ids[j]
if (is.null(ret[[element]])) {
ret2[[element]] <- 1
} else {
ret2[[element]] <- ret[[element]] + 1
}
}
}
ret2 <- ret2[order(as.numeric(ret2), decreasing=TRUE) ]## $`1847`
## [1] 43
##
## $`5743`
## [1] 42
##
## $`3552`
## [1] 41
##
## $`8870`
## [1] 39
##
## $`7128`
## [1] 39
##
## $`2920`
## [1] 39
Now lets go one step deeper, completing a logical circle…
## Query from Najib: a venn of the genes in these infected/uninfected sets
infected_sig_genes <- names(ret)
uninfected_sig_genes <- names(ret2)
sets <- list("inf" = infected_sig_genes, "uninf" = uninfected_sig_genes)
venn <- Vennerable::Venn(Sets=sets)
plot(venn, doWeights=FALSE)gene_intersections <- venn@IntersectionSets
only_infected_genes <- gene_intersections[[2]]
only_uninfected_genes <- gene_intersections[[3]]
only_infected_genes_frequency <- ret[only_infected_genes]
only_uninfected_genes_frequency <- ret2[only_uninfected_genes]
ten_or_more <- data.frame(row.names=tail(only_infected_genes, n=42))
ten_or_more$frequency <- tail(as.numeric(only_infected_genes_frequency), n=42)
library(org.Hs.eg.db)
ensembl_ids <- AnnotationDbi::select(x=org.Hs.eg.db,
keys=rownames(ten_or_more),
keytype="ENTREZID",
columns=c("ENSEMBL"))## 'select()' returned 1:many mapping between keys and columns
merged_ten <- merge(ten_or_more, ensembl_ids, by.x="row.names", by.y="ENTREZID")
merged_ten <- merge(merged_ten, hs_annot, by.x="ENSEMBL", by.y="ensembl_gene_id")
colnames(merged_ten)[2] <- "entrezid"
write.csv(file="/tmp/merged_ten.csv", merged_ten)The following block attempts to formalize the logic above and make it possible to query these intersections in more flexible and hopefully informative ways.
## Examining the mean score for experimental factor: infected from column: infectstate.
## Examining the mean score for experimental factor: uninfected from column: infectstate.
What about genes implicated in the infected-only, uninfected-only, and shared signature sets? Well, our intersect_signatures() function provides an opportunity to query even that…
## This next line should print the top 6 gene IDs and the number of signatures they were
## observed in among the signatures which are unique to the infected samples.
## This is getting a bit meta...
head(sigint$signature_genes[["inf"]])## 9816 3559 6723 23212 29889 1503
## 25 23 22 22 21 21
if (!isTRUE(get0("skip_load"))) {
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
message(paste0("Saving to ", savefile))
tmp <- sm(saveme(filename=savefile))
}## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 7c4477bb4fa3639cc6cf7940216e4c4b8cbee7ce
## This is hpgltools commit: Fri Oct 26 17:27:11 2018 -0400: 7c4477bb4fa3639cc6cf7940216e4c4b8cbee7ce
## Saving to 05_gsva_testing_v20180828.rda.xz