There is one important caveat: I only just remembered to queue up the salmon quantifications on the cluster. They should finish shortly, but this should nevertheless give a good idea of how similar the results are and the degree to which salmon/hisat2 agree.
Even before starting, I am willing to bet a very large sum that my salmon numbers will be nearly or completely identical to the numbers from dragen. Before moving on to compare my hisat numbers to dragen’s salmon, I bet that when we compare the count tables on a per-sample basis, the correlations are > 0.9 for every sample.
hs_annot <- sm(load_biomart_annotations(year = "2020"))
hs_annot <- hs_annot[["annotation"]]
hs_annot[["transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE)
tx_gene_map <- hs_annot[, c("transcript", "ensembl_gene_id")]
summary(hs_annot)
## ensembl_transcript_id ensembl_gene_id version transcript_version
## Length:227921 Length:227921 Min. : 1.0 Min. : 1.00
## Class :character Class :character 1st Qu.: 6.0 1st Qu.: 1.00
## Mode :character Mode :character Median :12.0 Median : 1.00
## Mean :10.7 Mean : 3.08
## 3rd Qu.:16.0 3rd Qu.: 5.00
## Max. :29.0 Max. :17.00
##
## hgnc_symbol description gene_biotype cds_length
## Length:227921 Length:227921 Length:227921 Min. : 3
## Class :character Class :character Class :character 1st Qu.: 357
## Mode :character Mode :character Mode :character Median : 694
## Mean : 1139
## 3rd Qu.: 1446
## Max. :107976
## NA's :127343
## chromosome_name strand start_position end_position
## Length:227921 Length:227921 Min. :5.77e+02 Min. :6.47e+02
## Class :character Class :character 1st Qu.:3.11e+07 1st Qu.:3.12e+07
## Mode :character Mode :character Median :6.04e+07 Median :6.06e+07
## Mean :7.41e+07 Mean :7.42e+07
## 3rd Qu.:1.09e+08 3rd Qu.:1.09e+08
## Max. :2.49e+08 Max. :2.49e+08
##
## transcript
## Length:227921
## Class :character
## Mode :character
##
##
##
##
hisat <- create_expt("sample_sheets/dragen_samples_202105.xlsx", file = "hg38100hisatfile",
gene_info = hs_annot) %>%
set_expt_conditions(fact = "clinicaloutcome") %>%
set_expt_batches(fact = "typeofcells")
## Reading the sample metadata.
## The sample definitions comprises: 8 rows(samples) and 58 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/TMRC30109/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows.
## preprocessing/TMRC30110/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## preprocessing/TMRC30111/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## preprocessing/TMRC30112/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## preprocessing/TMRC30113/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## preprocessing/TMRC30114/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## preprocessing/TMRC30115/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## preprocessing/TMRC30116/outputs/hisat2_hg38_100/r1_trimmed.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## Finished reading count data.
## Matched 21452 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21481 rows and 8 columns.
dragen <- create_expt("sample_sheets/dragen_samples_202105.xlsx", file = "hg3891salmonfile",
gene_info = hs_annot, tx_gene_map = tx_gene_map) %>%
set_expt_conditions(fact = "clinicaloutcome") %>%
set_expt_batches(fact = "typeofcells")
## Reading the sample metadata.
## The sample definitions comprises: 8 rows(samples) and 58 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count data.
## Matched 35866 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 35866 rows and 8 columns.
hisat_norm <- normalize_expt(hisat, filter = TRUE, norm = "quant",
convert = "cpm", transform = "log2")
## Removing 8507 low-count genes (12974 remaining).
## transform_counts: Found 142 values equal to 0, adding 1 to the matrix.
plot_pca(hisat_norm)$plot
dragen_norm <- normalize_expt(dragen, filter = TRUE, norm = "quant",
convert = "cpm", transform = "log2")
## Removing 28054 low-count genes (7812 remaining).
## transform_counts: Found 30473 values equal to 0, adding 1 to the matrix.
plot_pca(dragen_norm)$plot
merged_table <- merge(exprs(hisat), exprs(dragen), by = "row.names")
cor.test(merged_table[["TMRC30109.x"]], merged_table[["TMRC30109.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30109.x"]] and merged_table[["TMRC30109.y"]]
## t = 415, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9882 0.9896
## sample estimates:
## cor
## 0.9889
cor.test(merged_table[["TMRC30110.x"]], merged_table[["TMRC30110.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30110.x"]] and merged_table[["TMRC30110.y"]]
## t = 345, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9831 0.9851
## sample estimates:
## cor
## 0.9841
cor.test(merged_table[["TMRC30111.x"]], merged_table[["TMRC30111.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30111.x"]] and merged_table[["TMRC30111.y"]]
## t = 452, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9900 0.9912
## sample estimates:
## cor
## 0.9906
cor.test(merged_table[["TMRC30112.x"]], merged_table[["TMRC30112.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30112.x"]] and merged_table[["TMRC30112.y"]]
## t = 247, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9678 0.9716
## sample estimates:
## cor
## 0.9698
cor.test(merged_table[["TMRC30113.x"]], merged_table[["TMRC30113.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30113.x"]] and merged_table[["TMRC30113.y"]]
## t = 141, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9095 0.9197
## sample estimates:
## cor
## 0.9147
cor.test(merged_table[["TMRC30114.x"]], merged_table[["TMRC30114.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30114.x"]] and merged_table[["TMRC30114.y"]]
## t = 133, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9004 0.9116
## sample estimates:
## cor
## 0.9061
cor.test(merged_table[["TMRC30115.x"]], merged_table[["TMRC30115.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30115.x"]] and merged_table[["TMRC30115.y"]]
## t = 294, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9769 0.9796
## sample estimates:
## cor
## 0.9782
cor.test(merged_table[["TMRC30116.x"]], merged_table[["TMRC30116.y"]])
##
## Pearson's product-moment correlation
##
## data: merged_table[["TMRC30116.x"]] and merged_table[["TMRC30116.y"]]
## t = 107, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8567 0.8726
## sample estimates:
## cor
## 0.8648
Well, I was not completely correct, 30116 is a little lower.
hisat_de <- all_pairwise(hisat, model_batch = FALSE)
## Plotting a PCA before surrogate/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
dragen_de <- all_pairwise(dragen, model_batch = FALSE)
## Plotting a PCA before surrogate/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
hisat_table <- combine_de_tables(hisat_de)
dragen_table <- combine_de_tables(dragen_de)
comparison <- compare_de_results(hisat_table, dragen_table)
## Testing method: limma.
## Adding method: limma to the set.
## Testing method: deseq.
## Adding method: deseq to the set.
## Testing method: edger.
## Adding method: edger to the set.
## Starting method limma, table Failure_vs_Cure.
## Warning in cor(x = fs[["x"]], y = fs[["y"]], method = cor_method): the standard
## deviation is zero
## Starting method deseq, table Failure_vs_Cure.
## Warning in cor(x = fs[["x"]], y = fs[["y"]], method = cor_method): the standard
## deviation is zero
## Starting method edger, table Failure_vs_Cure.
## Warning in cor(x = fs[["x"]], y = fs[["y"]], method = cor_method): the standard
## deviation is zero
tt <- merge(hisat_table[["data"]][[1]], dragen_table[["data"]][[1]], by = "row.names")
cor.test(tt[, "deseq_logfc.x"], tt[, "deseq_logfc.y"])
##
## Pearson's product-moment correlation
##
## data: tt[, "deseq_logfc.x"] and tt[, "deseq_logfc.y"]
## t = 45, df = 3879, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5669 0.6081
## sample estimates:
## cor
## 0.5879
hmm <- plot_linear_scatter(tt[, c("deseq_logfc.x", "deseq_logfc.y")])
hmm$scatter
Keep in mind that I explicitly did not filter the input before passing it to DESeq/limma/EdgeR, thus this is a worst-case scenario comparison between the two methods, especially given the large range of coverage.
The dots on the y-axis are where htseq has decided not to count a given gene, presumably because of the quality scores returned by hisat2, multi-mapping, or due to difficult to handle introns.
The dots on the x-axis are where salmon has discounted the results for a given gene, presumably because of a multi-gene family.
Lets prove myself right or wrong…
hisat_de2 <- all_pairwise(hisat, model_batch = FALSE, filter = TRUE)
## Plotting a PCA before surrogate/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
dragen_de2 <- all_pairwise(dragen, model_batch = FALSE, filter = TRUE)
## Plotting a PCA before surrogate/batch inclusion.
## Assuming no batch in model for testing pca.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
hisat_table2 <- combine_de_tables(hisat_de2)
dragen_table2 <- combine_de_tables(dragen_de2)
comparison2 <- compare_de_results(hisat_table2, dragen_table2)
## Testing method: limma.
## Adding method: limma to the set.
## Testing method: deseq.
## Adding method: deseq to the set.
## Testing method: edger.
## Adding method: edger to the set.
## Starting method limma, table Failure_vs_Cure.
## Warning in cor(x = fs[["x"]], y = fs[["y"]], method = cor_method): the standard
## deviation is zero
## Starting method deseq, table Failure_vs_Cure.
## Starting method edger, table Failure_vs_Cure.
comparison2[["logfc"]]
## Failure_vs_Cure
## 0.7556
tt <- merge(hisat_table2[["data"]][[1]], dragen_table2[["data"]][[1]], by = "row.names")
cor.test(tt[, "deseq_logfc.x"], tt[, "deseq_logfc.y"])
##
## Pearson's product-moment correlation
##
## data: tt[, "deseq_logfc.x"] and tt[, "deseq_logfc.y"]
## t = 43, df = 975, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7829 0.8269
## sample estimates:
## cor
## 0.806
hmm2 <- plot_linear_scatter(tt[, c("deseq_logfc.x", "deseq_logfc.y")])
hmm2$scatter
hmm2$cor
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 43, df = 975, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7829 0.8269
## sample estimates:
## cor
## 0.806
hmm2 <- plot_linear_scatter(tt[, c("limma_logfc.x", "limma_logfc.y")])
hmm2$scatter
hmm2$cor
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 27, df = 975, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6209 0.6921
## sample estimates:
## cor
## 0.658
tmp <- loadme(filename = savefile)