This worksheet attempts to describe some analyses performed as part of “Consolidation of a molecular and clinical signature of healing in cutaneous leishmaniasis is achieved during the first ten days of treatment.”
This dataset comprises 36 samples across 12 patient biopsies with varying drug treatment (Antimonial vs. Miltefosine), clinic visit (pre-treatment, mid treatment, end of treatment), Cure/Fail, and infecting parasite.
This dataset was initially analyzed by Najib as part of his goal to learn R and get a better understanding of how we process data. As a result I did not very closely examine the data. Given that we are likely to publish this data soon, that should be addressed.
<- "sample_sheets/wellcome_trust_samples-v202209.xlsx"
samplesheet
<- make_rnaseq_spec()
rna_spec <- gather_preprocessing_metadata(samplesheet, specification = rna_spec, species = "hg38_100") modified
## Using provided specification
## Starting trimomatic_input.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*trimomatic/*-trimomatic.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*trimomatic/*-trimomatic.stderr.
## Not including new entries for: trimomatic_input, it is empty.
## Starting trimomatic_output.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*trimomatic/*-trimomatic.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*trimomatic/*-trimomatic.stderr.
## Not including new entries for: trimomatic_output, it is empty.
## Starting trimomatic_ratio.
## Checking input_file_spec: .
## Missing data to calculate the ratio between: trimomatic_output and trimomatic_input.
## Not including new entries for: trimomatic_percent, it is empty.
## Starting fastqc_pct_gc.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*fastqc/*_fastqc/fastqc_data.txt.
## Example filename: preprocessing/TMRC30097/outputs/*fastqc/*_fastqc/fastqc_data.txt.
## Starting fastqc_most_overrepresented.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*fastqc/*_fastqc/fastqc_data.txt.
## Skipping for now
## Not including new entries for: fastqc_most_overrepresented, it is empty.
## Starting hisat_rrna_single_concordant.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*hisat2_{species}/hisat2_*rRNA*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hisat2_*rRNA*.stderr.
## Not including new entries for: hisat_rrna_single_concordant, it is empty.
## Starting hisat_rrna_multi_concordant.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*hisat2_{species}/hisat2_*rRNA*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hisat2_*rRNA*.stderr.
## Not including new entries for: hisat_rrna_multi_concordant, it is empty.
## Starting hisat_rrna_percent.
## Checking input_file_spec: .
## Missing data to calculate the ratio between: hisat_rrna_multi_concordant and trimomatic_output.
## Not including new entries for: hisat_rrna_percent, it is empty.
## Starting hisat_genome_single_concordant.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*hisat2_{species}/hisat2_*genome*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_single_concordant, it is empty.
## Starting hisat_genome_multi_concordant.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*hisat2_{species}/hisat2_*genome*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_multi_concordant, it is empty.
## Starting hisat_genome_single_all.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*hisat2_{species}/hisat2_*genome*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_single_all, it is empty.
## Starting hisat_genome_multi_all.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*hisat2_{species}/hisat2_*genome*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_multi_all, it is empty.
## Starting hisat_genome_percent.
## Checking input_file_spec: .
## Missing data to calculate the ratio between: hisat_genome_single_concordant and trimomatic_output.
## Not including new entries for: hisat_genome_percent, it is empty.
## Starting hisat_count_table.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*hisat2_{species}/{species}_{type}*.count.xz.
## Example filename: preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30097/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30075/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30085/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30086/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30087/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30101/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30089/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30090/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30081/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30092/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30104/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30125/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30106/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30114/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30095/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30108/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30126/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30127/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30120/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30128/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30129/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30130/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30124/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30131/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30109/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30084/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30098/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30099/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30100/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30110/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30111/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30102/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30091/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30112/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30163/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30073/outputs/*hisat2_hg38_100/hg38_100_genome*.count.xz.
## Not including new entries for: hisat_count_table, it is empty.
## Starting salmon_stranded.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*salmon_{species}/salmon_*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*salmon_*/salmon_*.stderr.
## Starting salmon_count_table.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*salmon_{species}/quant.sf.
## Example filename: preprocessing/TMRC30097/outputs/*salmon_hg38_100/quant.sf.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30126/outputs/*salmon_hg38_100/quant.sf.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/TMRC30129/outputs/*salmon_hg38_100/quant.sf.
## Starting salmon_mapped.
## Checking input_file_spec: {basedir}/{meta[['sampleid']]}/outputs/*salmon_{species}gg/salmon_*.stderr.
## Example filename: preprocessing/TMRC30097/outputs/*salmon_*gg/salmon_*.stderr.
## Not including new entries for: salmon_mapped, it is empty.
## Writing new metadata to: sample_sheets/wellcome_trust_samples-v202209_modified.xlsx
## Deleting the file sample_sheets/wellcome_trust_samples-v202209_modified.xlsx before writing the tables.
We take the annotation data from ensembl’s biomart instance. The genome which was used to map the data was hg38 revision 100. My default when using biomart is to load the data from 1 year before the current date.
<- load_biomart_annotations(year = "2019")[["annotation"]] hs_annot
## The biomart annotations file already exists, loading from it.
"transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
hs_annot[[rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE)
<- hs_annot[, c("transcript", "ensembl_gene_id")]
tx_gene_map
summary(hs_annot)
## ensembl_transcript_id ensembl_gene_id version transcript_version hgnc_symbol description gene_biotype cds_length
## Length:226754 Length:226754 Min. : 1.0 Min. : 1.00 Length:226754 Length:226754 Length:226754 Min. : 3
## Class :character Class :character 1st Qu.: 6.0 1st Qu.: 1.00 Class :character Class :character Class :character 1st Qu.: 357
## Mode :character Mode :character Median :12.0 Median : 1.00 Mode :character Mode :character Mode :character Median : 690
## Mean :10.5 Mean : 3.07 Mean : 1134
## 3rd Qu.:15.0 3rd Qu.: 5.00 3rd Qu.: 1440
## Max. :26.0 Max. :17.00 Max. :107976
## NA's :126871
## chromosome_name strand start_position end_position transcript
## Length:226754 Length:226754 Min. :5.77e+02 Min. :6.47e+02 Length:226754
## Class :character Class :character 1st Qu.:3.11e+07 1st Qu.:3.12e+07 Class :character
## Mode :character Mode :character Median :6.04e+07 Median :6.05e+07 Mode :character
## Mean :7.41e+07 Mean :7.42e+07
## 3rd Qu.:1.09e+08 3rd Qu.:1.09e+08
## Max. :2.49e+08 Max. :2.49e+08
##
<- load_biomart_go()[["go"]] hs_go
## The biomart annotations file already exists, loading from it.
<- hs_annot[, c("ensembl_gene_id", "cds_length")]
hs_length colnames(hs_length) <- c("ID", "length")
Start out by creating a few datastructures which define the primary questions in the data.
I am going to grab colors from the TMRC3 data with the assumption that there are good choices there.
<- list(
color_choices "cf_visit" = list(
"cure_v1" = "#D95F0E",
"failure_v1" = "#FEC44F",
"cure_v2" = "#DD1C77",
"failure_v2" = "#C994C7",
"cure_v3" = "#3182BD",
"failure_v3" = "#9ECAE1"),
"cf" = list(
"cure" = "#998EC3",
"failure" = "#F1A340"),
"visit" = list(
"v1" = "#33EE33",
"v2" = "#11AA11",
"v3" = "#134413"),
"visitbi" = list(
"v1" = "#BB0000",
"vother" = "#0000BB"),
"species" = list(
"lvpanamensis" = "#FFC300",
"lvbraziliensis" = "#525252"),
"donor" = list(
"d2008" = "#B78415",
"d1029" = "#93752C",
"d1036" = "#7E6EA2",
"d1037" = "#B3499C",
"d1031" = "#BD6332",
"d2002" = "#7D8F31",
"d2009" = "#8E7037",
"d2010" = "#666666",
"d1019" = "#1B9E77",
"d2004" = "#E0A604",
"d2001" = "#CF3F76",
"d2003" = "#A0A811"),
"drug" = list(
"antimoniate" = "#3182AA",
"miltefosine" = "#C994AA"))
The set of color choices demonstrates the complexity of the experimental design. We are looking at combinations of time, outcome, species, and drug. Inherent in these is an unspecified donor effect.
The initial datastructure will use a factor combining the clinical outcome and visit as the experimental condition of interest. We will follow this up by splitting off the various factors of interest.
<- create_expt(metadata = samplesheet, gene_info = hs_annot,
wellcome_outtime file_column = "hg38100hisatfile") %>%
set_expt_batches(fact = "drug") %>%
sanitize_metadata(spaces = TRUE)
## Reading the sample metadata.
## The sample definitions comprises: 36 rows(samples) and 48 columns(metadata fields).
## Matched 21433 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21481 features and 36 samples.
## The number of samples by batch are:
##
## Antimoniate Miltefosine
## 18 18
<- paste0(pData(wellcome_outtime)[["clinicaloutcome"]], "_v",
outcome_time pData(wellcome_outtime)[["visitnumber"]])
<- set_expt_conditions(wellcome_outtime, fact = outcome_time) %>%
wellcome_outtime set_expt_colors(color_choices[["cf_visit"]])
## The numbers of samples by condition are:
##
## cure_v1 cure_v2 cure_v3 failure_v1 failure_v2 failure_v3
## 5 5 5 7 7 7
pData(wellcome_outtime)[["visitnumber"]] <- paste0("v", pData(wellcome_outtime)[["visitnumber"]])
First, let us get a quick view of the samples in their native state. There are a few samples which are candidates for removal due to lower coverage.
<- plot_legend(wellcome_outtime)
legend $plot legend
plot_libsize(wellcome_outtime)
## Library sizes of 36 samples, \
## ranging from 2,410,677 to 220,494,183.
plot_nonzero(wellcome_outtime, plot_labels = "repel")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## A non-zero genes plot of 36 samples.
## These samples have an average 45.52 CPM coverage and 16644 genes observed, ranging from 14215 to
## 17990.
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider increasing max.overlaps
<- subset_expt(wellcome_outtime, nonzero = 15000) wellcome_filtered
## The samples (and read coverage) removed when filtering 15000 non-zero genes are:
## subset_expt(): There were 36, now there are 34 samples.
Looking at the nonzero plot, it seems that 15k genes is a reasonable cutoff, which would remove 2 samples. So, for the moment I will split the data up into two sets, pre/post removal.
Given that we don’t have enough samples for a full rank matrix of donor/drug/time/cf, I am going to create separate data structures where each individual factor is the condition, starting with donor.
<- set_expt_conditions(wellcome_outtime, fact = "donor",
wellcome_donor colors = color_choices)
## The numbers of samples by condition are:
##
## d1019 d1029 d1031 d1036 d1037 d2001 d2002 d2003 d2004 d2008 d2009 d2010
## 3 3 3 3 3 3 3 3 3 3 3 3
<- set_expt_conditions(wellcome_filtered, fact = "donor",
wellcome_filtdonor colors = color_choices)
## The numbers of samples by condition are:
##
## d1019 d1029 d1031 d1036 d1037 d2001 d2002 d2003 d2004 d2008 d2009 d2010
## 3 3 3 3 3 3 3 2 3 3 2 3
In this case, the condition will only be visit number.
<- set_expt_conditions(wellcome_outtime, fact = "visitnumber",
wellcome_time colors = color_choices[["visit"]])
## The numbers of samples by condition are:
##
## v1 v2 v3
## 12 12 12
<- set_expt_conditions(wellcome_filtered, fact = "visitnumber",
wellcome_filttime colors = color_choices[["visit"]])
## The numbers of samples by condition are:
##
## v1 v2 v3
## 11 12 11
Conversely, we can split by clinical outcome.
<- set_expt_conditions(wellcome_outtime, fact = "clinicaloutcome",
wellcome_outcome colors = color_choices[["cf"]]) %>%
set_expt_batches(fact = "visitnumber")
## The numbers of samples by condition are:
##
## cure failure
## 15 21
## The number of samples by batch are:
##
## v1 v2 v3
## 12 12 12
<- set_expt_conditions(wellcome_filtered, fact = "clinicaloutcome",
wellcome_outcome_filt colors = color_choices[["cf"]]) %>%
set_expt_batches(fact = "visitnumber")
## The numbers of samples by condition are:
##
## cure failure
## 15 19
## The number of samples by batch are:
##
## v1 v2 v3
## 11 12 11
… or drug used.
<- set_expt_conditions(wellcome_outtime, fact = "drug",
wellcome_drug colors = color_choices[["drug"]]) %>%
set_expt_batches(fact = "visitnumber")
## The numbers of samples by condition are:
##
## antimoniate miltefosine
## 18 18
## The number of samples by batch are:
##
## v1 v2 v3
## 12 12 12
<- set_expt_conditions(wellcome_filtered, fact = "drug",
wellcome_drugfilt colors = color_choices[["drug"]]) %>%
set_expt_batches(fact = "visitnumber")
## The numbers of samples by condition are:
##
## antimoniate miltefosine
## 18 16
## The number of samples by batch are:
##
## v1 v2 v3
## 11 12 11
… and the infecting parasite.
<- set_expt_conditions(wellcome_outtime, fact = "infectingspecies",
wellcome_parasite colors = color_choices[["species"]]) %>%
set_expt_batches(fact = "visitnumber")
## The numbers of samples by condition are:
##
## lvbraziliensis lvpanamensis
## 12 24
## The number of samples by batch are:
##
## v1 v2 v3
## 12 12 12
<- set_expt_conditions(wellcome_filtered, fact = "infectingspecies",
wellcome_parafilt colors = color_choices[["species"]]) %>%
set_expt_batches(fact = "visitnumber")
## The numbers of samples by condition are:
##
## lvbraziliensis lvpanamensis
## 11 23
## The number of samples by batch are:
##
## v1 v2 v3
## 11 12 11
<- plot_meta_sankey(
drug_visit_species_cf factors = c("drug", "visitnumber", "infectingspecies", "clinicaloutcome"),
wellcome_outtime, color_choices = color_choices)
## Warning: attributes are not identical across measure variables; they will be dropped
drug_visit_species_cf
## A sankey plot describing the metadata of 36 samples,
## including 41 out of 44 nodes and traversing metadata factors:
## drug, visitnumber, infectingspecies, clinicaloutcome.
Note that there do not appear to be any miltefosine, braziliensis, fails.
Given the large number of variables in this data, lets start by getting a feeling for the amount of variance in each. Given that we don’t have a full rank with respect to clinical outcome, I assume that trying variance partition with the 4 primary factors will fail, but let us see…
<- simple_varpart(
all_varpart_donor
wellcome_filtered,factors = c("donor", "visitnumber"))
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be unreliable
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be unreliable
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be unreliable
##
## Total:119 s
all_varpart_donor
## The result of using variancePartition with the model: x[['model_string']]
<- simple_varpart(
all_varpart_dvic
wellcome_filtered,factors = c("drug", "visitnumber", "infectingspecies", "clinicaloutcome"))
##
## Total:140 s
all_varpart_dvic
## The result of using variancePartition with the model: x[['model_string']]
To my eyes, it appears that donor is dominant factor, followed by: visit, species, drug, and outcome. I think this observation may have an effect on the current state of the manuscript - which if I recall properly states that donor is not a significant factor in the data.
Let us ask if there are categories associated with the genes associated with each of these categories and see if they ‘make sense’.
<- order(all_varpart_donor[["fitted_df"]][["donor"]])
top_300_donor_genes_idx <- tail(all_varpart_donor[["fitted_df"]][top_300_donor_genes_idx, ], n = 300)
top_300_donor_genes summary(top_300_donor_genes)
## donor visitnumber Residuals
## Min. :0.634 Min. :0.0000 Min. :0.000
## 1st Qu.:0.648 1st Qu.:0.0329 1st Qu.:0.213
## Median :0.673 Median :0.0661 Median :0.246
## Mean :0.697 Mean :0.0635 Mean :0.239
## 3rd Qu.:0.729 3rd Qu.:0.0910 3rd Qu.:0.271
## Max. :1.000 Max. :0.1494 Max. :0.345
<- simple_gprofiler(rownames(top_300_donor_genes)) donor_genes_gp
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
donor_genes_gp
## A set of ontologies produced by gprofiler using 300
## genes against the hsapiens annotations and significance cutoff 0.05.
## There are 83 GO hits, 0, KEGG hits, 18 reactome hits, 0 wikipathway hits, 8 transcription factor hits, 0 miRNA hits, 0 HPA hits, 0 HP hits, and 0 CORUM hits.
## Category BP is the most populated with 30 hits.
<- order(all_varpart_dvic[["fitted_df"]][["visitnumber"]])
top_300_visit_genes_idx <- tail(all_varpart_dvic[["fitted_df"]][top_300_visit_genes_idx, ], n = 300)
top_300_visit_genes summary(top_300_visit_genes)
## drug visitnumber infectingspecies clinicaloutcome Residuals
## Min. :0.00000 Min. :0.351 Min. :0.00000 Min. :0.00000 Min. :0.380
## 1st Qu.:0.00098 1st Qu.:0.369 1st Qu.:0.00206 1st Qu.:0.00074 1st Qu.:0.523
## Median :0.00353 Median :0.392 Median :0.01071 Median :0.00336 Median :0.568
## Mean :0.00941 Mean :0.405 Mean :0.01937 Mean :0.00789 Mean :0.559
## 3rd Qu.:0.01049 3rd Qu.:0.429 3rd Qu.:0.02541 3rd Qu.:0.00890 3rd Qu.:0.601
## Max. :0.08742 Max. :0.599 Max. :0.20026 Max. :0.08396 Max. :0.645
<- simple_gprofiler(rownames(top_300_visit_genes)) visit_genes_gp
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
visit_genes_gp
## A set of ontologies produced by gprofiler using 300
## genes against the hsapiens annotations and significance cutoff 0.05.
## There are 39 GO hits, 0, KEGG hits, 2 reactome hits, 0 wikipathway hits, 5 transcription factor hits, 0 miRNA hits, 2 HPA hits, 0 HP hits, and 0 CORUM hits.
## Category BP is the most populated with 22 hits.
<- order(all_varpart_dvic[["fitted_df"]][["drug"]])
top_300_drug_genes_idx <- tail(all_varpart_dvic[["fitted_df"]][top_300_drug_genes_idx, ], n = 300)
top_300_drug_genes summary(top_300_drug_genes)
## drug visitnumber infectingspecies clinicaloutcome Residuals
## Min. :0.0855 Min. :0.0005 Min. :0.0000 Min. :0.00000 Min. :0.475
## 1st Qu.:0.0952 1st Qu.:0.0384 1st Qu.:0.0035 1st Qu.:0.00241 1st Qu.:0.682
## Median :0.1068 Median :0.0819 Median :0.0121 Median :0.01358 Median :0.733
## Mean :0.1166 Mean :0.0934 Mean :0.0330 Mean :0.02867 Mean :0.728
## 3rd Qu.:0.1281 3rd Qu.:0.1316 3rd Qu.:0.0434 3rd Qu.:0.04383 3rd Qu.:0.789
## Max. :0.2484 Max. :0.4005 Max. :0.3613 Max. :0.17113 Max. :0.890
<- simple_gprofiler(rownames(top_300_drug_genes)) drug_genes_gp
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
drug_genes_gp
## A set of ontologies produced by gprofiler using 300
## genes against the hsapiens annotations and significance cutoff 0.05.
## There are 29 GO hits, 0, KEGG hits, 0 reactome hits, 0 wikipathway hits, 0 transcription factor hits, 0 miRNA hits, 2 HPA hits, 0 HP hits, and 0 CORUM hits.
## Category BP is the most populated with 14 hits.
<- order(all_varpart_dvic[["fitted_df"]][["infectingspecies"]])
top_300_species_genes_idx <- tail(all_varpart_dvic[["fitted_df"]][top_300_species_genes_idx, ], n = 300)
top_300_species_genes summary(top_300_species_genes)
## drug visitnumber infectingspecies clinicaloutcome Residuals
## Min. :0.00000 Min. :0.00027 Min. :0.201 Min. :0.00000 Min. :0.450
## 1st Qu.:0.00194 1st Qu.:0.03845 1st Qu.:0.214 1st Qu.:0.00116 1st Qu.:0.604
## Median :0.00678 Median :0.06786 Median :0.238 Median :0.00505 Median :0.650
## Mean :0.01451 Mean :0.07222 Mean :0.254 Mean :0.01734 Mean :0.642
## 3rd Qu.:0.01880 3rd Qu.:0.10072 3rd Qu.:0.273 3rd Qu.:0.02279 3rd Qu.:0.693
## Max. :0.20728 Max. :0.30268 Max. :0.440 Max. :0.17757 Max. :0.775
<- simple_gprofiler(rownames(top_300_species_genes)) species_genes_gp
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
species_genes_gp
## A set of ontologies produced by gprofiler using 300
## genes against the hsapiens annotations and significance cutoff 0.05.
## There are 32 GO hits, 0, KEGG hits, 0 reactome hits, 0 wikipathway hits, 4 transcription factor hits, 3 miRNA hits, 0 HPA hits, 0 HP hits, and 0 CORUM hits.
## Category CC is the most populated with 15 hits.
<- order(all_varpart_dvic[["fitted_df"]][["clinicaloutcome"]])
top_300_cf_genes_idx <- tail(all_varpart_dvic[["fitted_df"]][top_300_cf_genes_idx, ], n = 300)
top_300_cf_genes summary(top_300_cf_genes)
## drug visitnumber infectingspecies clinicaloutcome Residuals
## Min. :0.0000 Min. :0.00014 Min. :0.0000 Min. :0.101 Min. :0.405
## 1st Qu.:0.0070 1st Qu.:0.04878 1st Qu.:0.0057 1st Qu.:0.111 1st Qu.:0.641
## Median :0.0228 Median :0.09591 Median :0.0204 Median :0.126 Median :0.693
## Mean :0.0313 Mean :0.10356 Mean :0.0416 Mean :0.131 Mean :0.692
## 3rd Qu.:0.0441 3rd Qu.:0.14862 3rd Qu.:0.0547 3rd Qu.:0.144 3rd Qu.:0.752
## Max. :0.2115 Max. :0.31194 Max. :0.3507 Max. :0.255 Max. :0.864
<- simple_gprofiler(rownames(top_300_cf_genes)) cf_genes_gp
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
cf_genes_gp
## A set of ontologies produced by gprofiler using 300
## genes against the hsapiens annotations and significance cutoff 0.05.
## There are 112 GO hits, 1, KEGG hits, 5 reactome hits, 0 wikipathway hits, 0 transcription factor hits, 0 miRNA hits, 0 HPA hits, 0 HP hits, and 0 CORUM hits.
## Category BP is the most populated with 30 hits.
Here are the PCA results before/after removing the two samples that I called shenanigans on. They are quite similar.
<- normalize_expt(wellcome_outtime, filter = TRUE,
wellcome_outtime_norm convert = "cpm", transform = "log2", norm = "quant")
## Removing 7327 low-count genes (14154 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outtime_norm)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by cure_v1, cure_v2, cure_v3, failure_v1, failure_v2, failure_v3
## Shapes are defined by antimoniate, miltefosine.
<- normalize_expt(wellcome_filtered, filter = TRUE,
wellcome_outfilt_norm convert = "cpm", transform = "log2", norm = "quant")
## Removing 7361 low-count genes (14120 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outfilt_norm)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by cure_v1, cure_v2, cure_v3, failure_v1, failure_v2, failure_v3
## Shapes are defined by antimoniate, miltefosine.
From my perspective, it seems that the most interesting GSVA comparison is to look for scores changing between the cure and fail samples.
<- simple_gsva(wellcome_outcome_filt) wt_gsva
## Converting the rownames() of the expressionset to ENTREZID.
## 1630 ENSEMBL ID's didn't have a matching ENTEREZ ID. Dropping them now.
## Before conversion, the expressionset has 21481 entries.
## After conversion, the expressionset has 20006 entries.
wt_gsva
## GSVA result using method: ssgsea against the c2 dataset.
## Scores range from: -0.4572 to: 0.5428.
<- get_sig_gsva_categories(wt_gsva, excel = "excel/wellcome_outcome_filt_gsva.xlsx") wt_gsva_sig
## Starting limma pairwise comparison.
## libsize was not specified, this parameter has profound effects on limma's result.
## Using the libsize from expt$libsize.
## Limma step 1/6: choosing model.
## Assuming this data is similar to a micro array and not performign voom.
## Limma step 3/6: running lmFit with method: ls.
## Limma step 4/6: making and fitting contrasts with no intercept. (~ 0 + factors)
## Limma step 5/6: Running eBayes with robust = FALSE and trend = FALSE.
## Limma step 6/6: Writing limma outputs.
## Limma step 6/6: 1/1: Creating table: failure_vs_cure. Adjust = BH
## Limma step 6/6: 1/2: Creating table: cure. Adjust = BH
## Limma step 6/6: 2/2: Creating table: failure. Adjust = BH
## The factor cure has 15 rows.
## The factor failure has 19 rows.
## Testing each factor against the others.
## Scoring cure against everything else.
## Scoring failure against everything else.
## Deleting the file excel/wellcome_outcome_filt_gsva.xlsx before writing the tables.
wt_gsva_sig
## The set of GSVA categories deemed significantly higher than the
## distribution of all scores. It comprises 44 gene sets.
## Error in xy.coords(x, y, xlabel, ylabel, log): 'x' is a list, but does not have components 'x' and 'y'
<- normalize_expt(wellcome_filttime, norm = "quant", convert = "cpm",
wellcome_time_norm filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_norm)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by v1, v2, v3
## Shapes are defined by antimoniate, miltefosine.
<- normalize_expt(wellcome_filttime, convert = "cpm",
wellcome_time_nb batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## Setting 904 low elements to zero.
## transform_counts: Found 904 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_time_nb)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by v1, v2, v3
## Shapes are defined by antimoniate, miltefosine.
<- normalize_expt(wellcome_outcome_filt, norm = "quant", convert = "cpm",
wellcome_outcome_norm filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outcome_norm, plot_labels=FALSE)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by cure, failure
## Shapes are defined by v1, v2, v3.
<- normalize_expt(wellcome_outcome_filt, convert = "cpm",
wellcome_outcome_nb batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## Setting 819 low elements to zero.
## transform_counts: Found 819 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outcome_nb)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by cure, failure
## Shapes are defined by v1, v2, v3.
Wow, there really isn’t much C/F variance, is there.
<- normalize_expt(wellcome_drugfilt, norm = "quant", convert = "cpm",
wellcome_drug_norm filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_drug_norm)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by antimoniate, miltefosine
## Shapes are defined by v1, v2, v3.
<- normalize_expt(wellcome_drugfilt, convert = "cpm",
wellcome_drug_nb batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## Setting 868 low elements to zero.
## transform_counts: Found 868 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_drug_nb)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by antimoniate, miltefosine
## Shapes are defined by v1, v2, v3.
<- normalize_expt(wellcome_filtered, norm = "quant", convert = "cpm",
wellcome_outtime_norm filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outtime_norm)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by cure_v1, cure_v2, cure_v3, failure_v1, failure_v2, failure_v3
## Shapes are defined by antimoniate, miltefosine.
<- normalize_expt(wellcome_filtered, convert = "cpm",
wellcome_outtime_nb batch = "svaseq", filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## Setting 928 low elements to zero.
## transform_counts: Found 928 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_outtime_nb)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by cure_v1, cure_v2, cure_v3, failure_v1, failure_v2, failure_v3
## Shapes are defined by antimoniate, miltefosine.
<- normalize_expt(wellcome_parafilt, norm = "quant", convert = "cpm",
wellcome_parasite_norm filter = TRUE, transform = "log2")
## Removing 7361 low-count genes (14120 remaining).
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_parasite_norm)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by lvbraziliensis, lvpanamensis
## Shapes are defined by v1, v2, v3.
<- normalize_expt(wellcome_parafilt, norm = "quant", convert = "cpm",
wellcome_parasite_nb batch = "svaseq", filter = TRUE, transform = "log2")
## Warning in normalize_expt(wellcome_parafilt, norm = "quant", convert = "cpm", : Quantile normalization and sva do not always play well together.
## Removing 7361 low-count genes (14120 remaining).
## Setting 649 low elements to zero.
## transform_counts: Found 649 values equal to 0, adding 1 to the matrix.
plot_pca(wellcome_parasite_nb)
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by lvbraziliensis, lvpanamensis
## Shapes are defined by v1, v2, v3.
Given the above variance partition and PCA plots, we can surmise that some metadata factors are much more likely to give interesting results than others (Drug far more than C/F, for example). With that in mind, let us perform the various possible DE analyses with each factor and see what comes out. Each of these runs of all pairwise will be performed once with SVA estimates in the model, and once with the drug treatment as the batch factor.
Start with the combined factor of {outcome}_{visit}. Given the PCA, I expect to see a little bit of information with batch in the model, oddly less information with SVA.
<- all_pairwise(wellcome_filtered, model_batch = "svaseq", filter = TRUE) outtime_sva_de
##
## cure_v1 cure_v2 cure_v3 failure_v1 failure_v2 failure_v3
## 5 5 5 6 7 6
## Removing 0 low-count genes (14120 remaining).
## Setting 928 low elements to zero.
## transform_counts: Found 928 values equal to 0, adding 1 to the matrix.
outtime_sva_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: svaseq.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
<- list(
outtime_keepers "curev1v2" = c("curev2", "curev1"),
"curev1v3" = c("curev3", "curev1"),
"curev2v3" = c("curev3", "curev2"),
"failv1v2" = c("failurev2", "failurev1"),
"failv1v3" = c("failurev3", "failurev1"),
"failv2v3" = c("failurev3", "failurev2"),
"cfv1" = c("failurev1", "curev1"),
"cfv2" = c("failurev2", "curev2"),
"cfv3" = c("failurev3", "curev3"))
<- combine_de_tables(
outtime_sva_table
outtime_sva_de,keepers = outtime_keepers,
excel = glue::glue("excel/wellcome_outtime_table_sva-v{ver}.xlsx"))
## Adding venn plots for curev1v2.
## Adding venn plots for curev1v3.
## Adding venn plots for curev2v3.
## Adding venn plots for failv1v2.
## Adding venn plots for failv1v3.
## Adding venn plots for failv2v3.
## Adding venn plots for cfv1.
## Adding venn plots for cfv2.
## Adding venn plots for cfv3.
outtime_sva_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 curev2_vs_curev1 539 335 528 292 162 187
## 2 curev3_vs_curev1 803 684 803 618 489 549
## 3 curev3_vs_curev2 13 3 6 3 1 0
## 4 failurev2_vs_failurev1 256 59 195 51 0 0
## 5 failurev3_vs_failurev1 1232 617 1186 611 827 791
## 6 failurev3_vs_failurev2 136 86 99 65 16 36
## 7 failurev1_vs_curev1 34 210 24 149 1 12
## 8 failurev2_vs_curev2 12 17 4 6 0 0
## 9 failurev3_vs_curev3 4 0 0 0 0 0
<- extract_significant_genes(
outtime_sva_sig
outtime_sva_table,excel = glue::glue("excel/wellcome_outtime_sig_sva-v{ver}.xlsx"))
outtime_sva_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## curev1v2 162 187 528 292 539 335 0 0
## curev1v3 489 549 803 618 803 684 20 61
## curev2v3 1 0 6 3 13 3 0 0
## failv1v2 0 0 195 51 256 59 0 0
## failv1v3 827 791 1186 611 1232 617 0 0
## failv2v3 16 36 99 65 136 86 0 0
## cfv1 1 12 24 149 34 210 0 0
## cfv2 0 0 4 6 12 17 0 0
## cfv3 0 0 0 0 4 0 0 0
<- all_pairwise(wellcome_filtered, model_batch = TRUE, filter = TRUE) outtime_batch_de
##
## cure_v1 cure_v2 cure_v3 failure_v1 failure_v2 failure_v3
## 5 5 5 6 7 6
##
## antimoniate miltefosine
## 18 16
outtime_batch_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: batch in model/limma.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
<- combine_de_tables(
outtime_batch_table
outtime_batch_de,keepers = outtime_keepers,
excel = glue::glue("excel/wellcome_outtime_table_batch-v{ver}.xlsx"))
## Adding venn plots for curev1v2.
## Adding venn plots for curev1v3.
## Adding venn plots for curev2v3.
## Adding venn plots for failv1v2.
## Adding venn plots for failv1v3.
## Adding venn plots for failv2v3.
## Adding venn plots for cfv1.
## Adding venn plots for cfv2.
## Adding venn plots for cfv3.
outtime_batch_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 curev2_vs_curev1 437 260 392 217 159 147
## 2 curev3_vs_curev1 624 580 611 523 455 489
## 3 curev3_vs_curev2 5 4 1 1 1 0
## 4 failurev2_vs_failurev1 196 33 128 28 0 2
## 5 failurev3_vs_failurev1 1028 526 960 514 707 726
## 6 failurev3_vs_failurev2 46 40 24 23 0 7
## 7 failurev1_vs_curev1 69 266 49 216 9 36
## 8 failurev2_vs_curev2 8 7 0 5 0 0
## 9 failurev3_vs_curev3 0 0 0 0 0 0
<- extract_significant_genes(
outtime_batch_sig
outtime_batch_table,excel = glue::glue("excel/wellcome_outtime_sig_batch-v{ver}.xlsx"))
outtime_batch_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## curev1v2 159 147 392 217 437 260 0 0
## curev1v3 455 489 611 523 624 580 20 61
## curev2v3 1 0 1 1 5 4 0 0
## failv1v2 0 2 128 28 196 33 0 0
## failv1v3 707 726 960 514 1028 526 0 0
## failv2v3 0 7 24 23 46 40 0 0
## cfv1 9 36 49 216 69 266 0 0
## cfv2 0 0 0 5 8 7 0 0
## cfv3 0 0 0 0 0 0 0 0
Now let us compare the time points, with and without SVA.
<- all_pairwise(wellcome_filttime, model_batch = "svaseq", filter = TRUE) time_sva_de
##
## v1 v2 v3
## 11 12 11
## Removing 0 low-count genes (14120 remaining).
## Setting 904 low elements to zero.
## transform_counts: Found 904 values equal to 0, adding 1 to the matrix.
time_sva_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: svaseq.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
<- list(
time_keepers "v1v2" = c("v2", "v1"),
"v1v3" = c("v3", "v1"),
"v2v3" = c("v3", "v2"))
<- combine_de_tables(
time_sva_table
time_sva_de,keepers = time_keepers,
excel = glue::glue("excel/wellcome_time_table_sva-v{ver}.xlsx"))
## Adding venn plots for v1v2.
## Adding venn plots for v1v3.
## Adding venn plots for v2v3.
time_sva_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 v2_vs_v1 626 217 619 217 315 290
## 2 v3_vs_v1 1254 669 1274 650 904 789
## 3 v3_vs_v2 205 184 189 170 54 52
<- extract_significant_genes(
time_sva_sig
time_sva_table,excel = glue::glue("excel/wellcome_time_sig_sva-v{ver}.xlsx"))
time_sva_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## v1v2 315 290 619 217 626 217 142 190
## v1v3 904 789 1274 650 1254 669 656 650
## v2v3 54 52 189 170 205 184 0 0
<- all_pairwise(wellcome_filttime, model_batch = "batchseq", filter = TRUE) time_batch_de
##
## v1 v2 v3
## 11 12 11
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 2
## Removing 0 low-count genes (14120 remaining).
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 7
## Setting 831 low elements to zero.
## transform_counts: Found 831 values equal to 0, adding 1 to the matrix.
time_batch_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: batchseq.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
<- combine_de_tables(
time_batch_table
time_batch_de,keepers = time_keepers,
excel = glue::glue("excel/wellcome_time_table_batch-v{ver}.xlsx"))
## Adding venn plots for v1v2.
## Adding venn plots for v1v3.
## Adding venn plots for v2v3.
time_batch_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 v2_vs_v1 342 92 314 94 141 77
## 2 v3_vs_v1 416 242 340 206 37 39
## 3 v3_vs_v2 12 39 6 35 0 0
<- extract_significant_genes(
time_batch_sig
time_batch_table,excel = glue::glue("excel/wellcome_time_batch_sva-v{ver}.xlsx"))
time_batch_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## v1v2 141 77 314 94 342 92 142 190
## v1v3 37 39 340 206 416 242 656 650
## v2v3 0 0 6 35 12 39 0 0
The infecting strain I think should prove one of the more interesting comparisons.
<- all_pairwise(wellcome_parafilt, model_batch = "svaseq", filter = TRUE) parasite_sva_de
##
## lvbraziliensis lvpanamensis
## 11 23
## Removing 0 low-count genes (14120 remaining).
## Setting 922 low elements to zero.
## transform_counts: Found 922 values equal to 0, adding 1 to the matrix.
parasite_sva_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: svaseq.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
## lvpanamensis_vs_lvbraziliensis
## limma_vs_deseq 0.6960
## limma_vs_edger 0.8564
## limma_vs_basic 0.9152
## limma_vs_noiseq -0.8289
## deseq_vs_edger 0.8951
## deseq_vs_basic 0.6937
## deseq_vs_noiseq -0.6436
## edger_vs_basic 0.8441
## edger_vs_noiseq -0.8205
## basic_vs_noiseq -0.8786
<- combine_de_tables(
parasite_sva_table
parasite_sva_de,excel = glue::glue("excel/wellcome_parasite_table_sva-v{ver}.xlsx"))
## Adding venn plots for lvpanamensis_vs_lvbraziliensis.
parasite_sva_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 lvpanamensis_vs_lvbraziliensis 110 80 83 79 44 70
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
<- extract_significant_genes(
parasite_sva_sig
parasite_sva_table,excel = glue::glue("excel/wellcome_parasite_sig_sva-v{ver}.xlsx"))
parasite_sva_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## lvpanamensis_vs_lvbraziliensis 44 70 83 79 110 80 1 3
<- all_pairwise(wellcome_parafilt, model_batch = "batchseq", filter = TRUE) parasite_batch_de
##
## lvbraziliensis lvpanamensis
## 11 23
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 2
## Removing 0 low-count genes (14120 remaining).
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 6
## Setting 899 low elements to zero.
## transform_counts: Found 899 values equal to 0, adding 1 to the matrix.
parasite_batch_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: batchseq.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
## lvpanamensis_vs_lvbraziliensis
## limma_vs_deseq 0.7055
## limma_vs_edger 0.8580
## limma_vs_basic 0.9070
## limma_vs_noiseq -0.8221
## deseq_vs_edger 0.9041
## deseq_vs_basic 0.6906
## deseq_vs_noiseq -0.6408
## edger_vs_basic 0.8268
## edger_vs_noiseq -0.8024
## basic_vs_noiseq -0.8786
<- combine_de_tables(
parasite_batch_table
parasite_batch_de,excel = glue::glue("excel/wellcome_parasite_table_batch-v{ver}.xlsx"))
## Adding venn plots for lvpanamensis_vs_lvbraziliensis.
parasite_batch_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 lvpanamensis_vs_lvbraziliensis 136 93 113 103 47 103
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
<- extract_significant_genes(
parasite_batch_sig
parasite_batch_table,excel = glue::glue("excel/wellcome_parasite_sig_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_parasite_sig_sva-v20230627.xlsx before writing the tables.
parasite_batch_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## lvpanamensis_vs_lvbraziliensis 47 103 113 103 136 93 1 3
<- all_pairwise(wellcome_outcome_filt, model_batch = "svaseq", filter = TRUE) outcome_sva_de
##
## cure failure
## 15 19
## Removing 0 low-count genes (14120 remaining).
## Setting 819 low elements to zero.
## transform_counts: Found 819 values equal to 0, adding 1 to the matrix.
outcome_sva_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: svaseq.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
## failure_vs_cure
## limma_vs_deseq 0.8086
## limma_vs_edger 0.8238
## limma_vs_basic 0.7541
## limma_vs_noiseq -0.6946
## deseq_vs_edger 0.9959
## deseq_vs_basic 0.6992
## deseq_vs_noiseq -0.6866
## edger_vs_basic 0.7174
## edger_vs_noiseq -0.7037
## basic_vs_noiseq -0.9077
<- combine_de_tables(
outcome_sva_table
outcome_sva_de,excel = glue::glue("excel/wellcome_outcome_table_sva-v{ver}.xlsx"))
## Adding venn plots for failure_vs_cure.
outcome_sva_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 failure_vs_cure 11 28 10 18 0 0
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
<- extract_significant_genes(
outcome_sva_sig
outcome_sva_table,excel = glue::glue("excel/wellcome_outcome_sig_sva-v{ver}.xlsx"))
outcome_sva_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## failure_vs_cure 0 0 10 18 11 28 0 0
<- all_pairwise(wellcome_outcome_filt, model_batch = "batchseq", filter = TRUE) outcome_batch_de
##
## cure failure
## 15 19
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 2
## Removing 0 low-count genes (14120 remaining).
## sva warning: controls provided so supervised sva is being performed.
## Number of significant surrogate variables is: 7
## Setting 836 low elements to zero.
## transform_counts: Found 836 values equal to 0, adding 1 to the matrix.
outcome_batch_de
## A pairwise differential expression with results from: basic, deseq, edger, limma, noiseq.
## This used a surrogate/batch estimate from: batchseq.
## The primary analysis performed 10 comparisons.
## The logFC agreement among the methods follows:
## failure_vs_cure
## limma_vs_deseq 0.7606
## limma_vs_edger 0.7912
## limma_vs_basic 0.4550
## limma_vs_noiseq -0.3958
## deseq_vs_edger 0.9920
## deseq_vs_basic 0.3760
## deseq_vs_noiseq -0.3833
## edger_vs_basic 0.3961
## edger_vs_noiseq -0.4019
## basic_vs_noiseq -0.9077
<- combine_de_tables(
outcome_batch_table
outcome_batch_de,excel = glue::glue("excel/wellcome_outcome_table_batch-v{ver}.xlsx"))
## Adding venn plots for failure_vs_cure.
outcome_batch_table
## A set of combined differential expression results.
## table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 failure_vs_cure 7 9 1 3 0 0
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
<- extract_significant_genes(
outcome_batch_sig
outcome_batch_table,excel = glue::glue("excel/wellcome_outcome_sig_sva-v{ver}.xlsx"))
## Deleting the file excel/wellcome_outcome_sig_sva-v20230627.xlsx before writing the tables.
outcome_batch_sig
## A set of genes deemed significant according to limma, edger, deseq, basic.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
## limma_up limma_down edger_up edger_down deseq_up deseq_down basic_up basic_down
## failure_vs_cure 0 0 1 3 7 9 0 0
<- simple_gsva(wellcome_filtered, signature_category="c2") wellcome_gsva_c2
## Converting the rownames() of the expressionset to ENTREZID.
## 1630 ENSEMBL ID's didn't have a matching ENTEREZ ID. Dropping them now.
## Before conversion, the expressionset has 21481 entries.
## After conversion, the expressionset has 20006 entries.
<- get_sig_gsva_categories(
wellcome_gsva_c2_sig
wellcome_gsva_c2,excel="excel/wellcome_gsva_c2.xlsx")
## Starting limma pairwise comparison.
## libsize was not specified, this parameter has profound effects on limma's result.
## Using the libsize from expt$libsize.
## Limma step 1/6: choosing model.
## Assuming this data is similar to a micro array and not performign voom.
## Limma step 3/6: running lmFit with method: ls.
## Limma step 4/6: making and fitting contrasts with no intercept. (~ 0 + factors)
## Limma step 5/6: Running eBayes with robust = FALSE and trend = FALSE.
## Limma step 6/6: Writing limma outputs.
## Limma step 6/6: 1/15: Creating table: curev2_vs_curev1. Adjust = BH
## Limma step 6/6: 2/15: Creating table: curev3_vs_curev1. Adjust = BH
## Limma step 6/6: 3/15: Creating table: failurev1_vs_curev1. Adjust = BH
## Limma step 6/6: 4/15: Creating table: failurev2_vs_curev1. Adjust = BH
## Limma step 6/6: 5/15: Creating table: failurev3_vs_curev1. Adjust = BH
## Limma step 6/6: 6/15: Creating table: curev3_vs_curev2. Adjust = BH
## Limma step 6/6: 7/15: Creating table: failurev1_vs_curev2. Adjust = BH
## Limma step 6/6: 8/15: Creating table: failurev2_vs_curev2. Adjust = BH
## Limma step 6/6: 9/15: Creating table: failurev3_vs_curev2. Adjust = BH
## Limma step 6/6: 10/15: Creating table: failurev1_vs_curev3. Adjust = BH
## Limma step 6/6: 11/15: Creating table: failurev2_vs_curev3. Adjust = BH
## Limma step 6/6: 12/15: Creating table: failurev3_vs_curev3. Adjust = BH
## Limma step 6/6: 13/15: Creating table: failurev2_vs_failurev1. Adjust = BH
## Limma step 6/6: 14/15: Creating table: failurev3_vs_failurev1. Adjust = BH
## Limma step 6/6: 15/15: Creating table: failurev3_vs_failurev2. Adjust = BH
## Limma step 6/6: 1/6: Creating table: curev1. Adjust = BH
## Limma step 6/6: 2/6: Creating table: curev2. Adjust = BH
## Limma step 6/6: 3/6: Creating table: curev3. Adjust = BH
## Limma step 6/6: 4/6: Creating table: failurev1. Adjust = BH
## Limma step 6/6: 5/6: Creating table: failurev2. Adjust = BH
## Limma step 6/6: 6/6: Creating table: failurev3. Adjust = BH
## The factor cure_v1 has 5 rows.
## The factor cure_v2 has 5 rows.
## The factor cure_v3 has 5 rows.
## The factor failure_v1 has 6 rows.
## The factor failure_v2 has 7 rows.
## The factor failure_v3 has 6 rows.
## Testing each factor against the others.
## Scoring cure_v1 against everything else.
## Scoring cure_v2 against everything else.
## Scoring cure_v3 against everything else.
## Scoring failure_v1 against everything else.
## Scoring failure_v2 against everything else.
## Scoring failure_v3 against everything else.
## Deleting the file excel/wellcome_gsva_c2.xlsx before writing the tables.
<- simple_gsva(wellcome_filtered, signature_category="c7") wellcome_gsva_c7
## Converting the rownames() of the expressionset to ENTREZID.
## 1630 ENSEMBL ID's didn't have a matching ENTEREZ ID. Dropping them now.
## Before conversion, the expressionset has 21481 entries.
## After conversion, the expressionset has 20006 entries.
<- get_sig_gsva_categories(
wellcome_gsva_c7_sig
wellcome_gsva_c7,excel="excel/wellcome_gsva_c7.xlsx")
## Starting limma pairwise comparison.
## libsize was not specified, this parameter has profound effects on limma's result.
## Using the libsize from expt$libsize.
## Limma step 1/6: choosing model.
## Assuming this data is similar to a micro array and not performign voom.
## Limma step 3/6: running lmFit with method: ls.
## Limma step 4/6: making and fitting contrasts with no intercept. (~ 0 + factors)
## Limma step 5/6: Running eBayes with robust = FALSE and trend = FALSE.
## Limma step 6/6: Writing limma outputs.
## Limma step 6/6: 1/15: Creating table: curev2_vs_curev1. Adjust = BH
## Limma step 6/6: 2/15: Creating table: curev3_vs_curev1. Adjust = BH
## Limma step 6/6: 3/15: Creating table: failurev1_vs_curev1. Adjust = BH
## Limma step 6/6: 4/15: Creating table: failurev2_vs_curev1. Adjust = BH
## Limma step 6/6: 5/15: Creating table: failurev3_vs_curev1. Adjust = BH
## Limma step 6/6: 6/15: Creating table: curev3_vs_curev2. Adjust = BH
## Limma step 6/6: 7/15: Creating table: failurev1_vs_curev2. Adjust = BH
## Limma step 6/6: 8/15: Creating table: failurev2_vs_curev2. Adjust = BH
## Limma step 6/6: 9/15: Creating table: failurev3_vs_curev2. Adjust = BH
## Limma step 6/6: 10/15: Creating table: failurev1_vs_curev3. Adjust = BH
## Limma step 6/6: 11/15: Creating table: failurev2_vs_curev3. Adjust = BH
## Limma step 6/6: 12/15: Creating table: failurev3_vs_curev3. Adjust = BH
## Limma step 6/6: 13/15: Creating table: failurev2_vs_failurev1. Adjust = BH
## Limma step 6/6: 14/15: Creating table: failurev3_vs_failurev1. Adjust = BH
## Limma step 6/6: 15/15: Creating table: failurev3_vs_failurev2. Adjust = BH
## Limma step 6/6: 1/6: Creating table: curev1. Adjust = BH
## Limma step 6/6: 2/6: Creating table: curev2. Adjust = BH
## Limma step 6/6: 3/6: Creating table: curev3. Adjust = BH
## Limma step 6/6: 4/6: Creating table: failurev1. Adjust = BH
## Limma step 6/6: 5/6: Creating table: failurev2. Adjust = BH
## Limma step 6/6: 6/6: Creating table: failurev3. Adjust = BH
## The factor cure_v1 has 5 rows.
## The factor cure_v2 has 5 rows.
## The factor cure_v3 has 5 rows.
## The factor failure_v1 has 6 rows.
## The factor failure_v2 has 7 rows.
## The factor failure_v3 has 6 rows.
## Testing each factor against the others.
## Scoring cure_v1 against everything else.
## Scoring cure_v2 against everything else.
## Scoring cure_v3 against everything else.
## Scoring failure_v1 against everything else.
## Scoring failure_v2 against everything else.
## Scoring failure_v3 against everything else.
## Deleting the file excel/wellcome_gsva_c7.xlsx before writing the tables.
<- all_gprofiler(outtime_sva_sig)
outtime_gprofiler $curev1v2_up$pvalue_plots$bpp_plot_over outtime_gprofiler
## NULL
$failv1v2_up$pvalue_plots$kegg_plot_over outtime_gprofiler
## NULL
$cfv1_up$pvalue_plots$reactome_plot_over outtime_gprofiler
## NULL
$cfv1_down$pvalue_plots$reactome_plot_over outtime_gprofiler
## NULL