## This assumes salmon has run happily on all the samples in the cwd.
## This first command collects the salmon data and makes a table of tpm by sample.
multipleFieldSelection.py -i $(find -L . -name 'quant.sf') \
-k 1 \
-f 4 \
-o iso_tpm.txt
## Unfortunately, the previous step provides ensembl transcript IDs looking like:
## 'ENSTxxxx.1' rather than the canonical 'ENSTxxxx'
## The following step strips off the trailing .number
format_Ensembl_ids.R iso_tpm.txt
## Now make a library of splicing events (SE,SS,MX,RI,FL) from our annotations in the ioe format.
suppa.py generateEvents \
-i ~/scratch/libraries/genome/hg38_91.gtf \
-o hg38_19.events \
-e SE SS MX RI FL \
-f ioe
## Combine them into a single file (why the hell the previous step doesn't do this beats me)
awk '
FNR==1 && NR!=1 { while (/^<header>/) getline; }
1 {print}
' *.ioe > ensembl_hg38.events.ioe
## Calculate the psi score using the tpms for the events just collated.
suppa.py psiPerEvent \
--ioe-file ensembl_hg38.events.ioe \
--expression-file iso_tpm_formatted.txt \
-o events \
2>psi_per_event.out 1>&2
## Calculate the same metrics by isoform rather than event.
suppa.py psiPerIsoform \
-g ~/scratch/libraries/genome/hg38_91.gtf \
-e iso_tpm_formatted.txt \
-o uninf_inf \
2>psi_per_iso.out 1>&2
## Now make separate files of the infected and uninfected samples (this is also dumb imo)
split_file.R events.psi \
HPGL0082_R1_filtered_trimmed.fastq,HPGL0086_R1_filtered_trimmed.fastq,HPGL0090_R1_filtered_trimmed.fastq \
HPGL0083_R1_filtered_trimmed.fastq,HPGL0087_R1_filtered_trimmed.fastq,HPGL0091_R1_filtered_trimmed.fastq,HPGL0330_R1_filtered_trimmed.fastq,HPGL0331_R1_filtered_trimmed.fastq,HPGL0332_R1_filtered_trimmed.fastq \
uninf_events.psi \
inf_events.psi \
-e
split_file.R uninf_inf_isoforms.psi \
HPGL0082_R1_filtered_trimmed.fastq,HPGL0086_R1_filtered_trimmed.fastq,HPGL0090_R1_filtered_trimmed.fastq \
HPGL0083_R1_filtered_trimmed.fastq,HPGL0087_R1_filtered_trimmed.fastq,HPGL0091_R1_filtered_trimmed.fastq,HPGL0330_R1_filtered_trimmed.fastq,HPGL0331_R1_filtered_trimmed.fastq,HPGL0332_R1_filtered_trimmed.fastq \
uninf_isoforms.psi \
inf_isoforms.psi \
-e
## Using the split tpms, split psi by event, and the catalog of events;
## calculate differential splicing metrics.
## Make sure to use --save-tpm-events so that we get a bridge between ENSG->ENST->tpm->psi
suppa.py diffSplice \
-m empirical \
-gc -i ensembl_hg38.events.ioe \
-p uninf_events.psi inf_events.psi \
-e uninf.tpm inf.tpm \
-o uninf_inf_diffsplice \
--save_tpm_events \
2>diffsplice_event.out 1>&2
suppa.py diffSplice \
-m empirical \
-gc -i ensembl_hg38.events.ioe \
-p uninf_isoforms.psi inf_isoforms.psi \
-e uninf.tpm inf.tpm \
-o uninf_inf_isoform_diffsplice \
--save_tpm_events \
2>diffsplice_uninf_isoform.out 1>&2
suppa.py diffSplice \
-m empirical \
-gc -i ensembl_hg38.events.ioe \
-p uninf_isoforms.psi \
-e uninf.tpm \
-o uninf_isoform_diffsplice \
--save_tpm_events \
2>diffsplice_uninf_isoform.out 1>&2
suppa.py diffSplice \
-m empirical \
-gc -i ensembl_hg38.events.ioe \
-p inf_isoforms.psi \
-e inf.tpm \
-o inf_isoform_diffsplice \
--save_tpm_events \
2>diffsplice_inf_isoform.out 1>&2
## We don't have sufficient samples for clustering.
## suppa.py clusterEvents \
## --dpsi uninf_inf_diffsplice.dpsi \
## --psivec uninf_inf_diffsplice.psivec \
## --sig-threshold 0.05 \
## --eps 0.2 \
## --separation 0.11 \
## -dt 0.2 \
## --min-pts 10 \
## -c OPTICS \
## -o optics
Well, to be honest, I am moving backwards, I tried the various commands last night and will try to plot the results now, rather than, you know, the actual order that I did things…
basedir <- "preprocessing/outputs/suppa_hg38_91"
dpsi_file <- file.path(basedir, "uninf_inf_diffsplice.dpsi")
tpm_file <- file.path(basedir, "uninf_inf_diffsplice_avglogtpm.tab")
events_file <- file.path(basedir, "ensembl_hg38.events.ioe")
psi_file <- file.path(basedir, "uninf_inf_diffsplice.psivec")
initial_suppa_plots <- plot_suppa(dpsi_file, tpm_file,
events=events_file,
label_type="Skipping exon",
psi=psi_file)
initial_suppa_plots$ma
## Warning: Removed 43147 rows containing missing values (geom_point).
dpsi | pvalue | avglogtpm | log10pval | adjp | psig | adjpsig | gene_name | category | coordinates | plot_cat | event | alternative_transcripts | total_transcripts | denominator1 | denominator2 | denominator3 | numerator1 | numerator2 | numerator3 | numerator4 | numerator5 | numerator6 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000000003;A5:X:100635746-100636191:100635746-100636608:- | NaN | 1.0000 | -0.3956 | 0.0000 | 1.0000 | FALSE | FALSE | ENSG00000000003 | Alternate 5 prime | X:100635746-100636191:100635746-100636608:- | Insignificant | ENSG00000000003;A5:X:100635746-100636191:100635746-100636608:- | ENST00000496771 | ENST00000496771,ENST00000373020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
ENSG00000000003;A5:X:100635746-100636608:100635746-100636793:- | NaN | 1.0000 | -0.0988 | 0.0000 | 1.0000 | FALSE | FALSE | ENSG00000000003 | Alternate 5 prime | X:100635746-100636608:100635746-100636793:- | Insignificant | ENSG00000000003;A5:X:100635746-100636608:100635746-100636793:- | ENST00000373020 | ENST00000373020,ENST00000612152,ENST00000614008,ENST00000494424 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
ENSG00000000003;SE:X:100630866-100632485:100632568-100633405:- | -0.0875 | 0.2023 | -0.1432 | 0.6940 | 0.9302 | FALSE | FALSE | ENSG00000000003 | Skipping exon | X:100630866-100632485:100632568-100633405:- | Insignificant | ENSG00000000003;SE:X:100630866-100632485:100632568-100633405:- | ENST00000373020 | ENST00000373020,ENST00000612152 | 1.0000 | 0.5826 | 0.4456 | 0.8273 | 0.8367 | 0.4118 | 0.1723 | 1.0000 | 0.2833 |
ENSG00000000419;A3:20:50940955-50941105:50940933-50941105:- | NaN | 1.0000 | 0.8031 | 0.0000 | 1.0000 | FALSE | FALSE | ENSG00000000419 | Alternate 3 prime | 20:50940955-50941105:50940933-50941105:- | Insignificant | ENSG00000000419;A3:20:50940955-50941105:50940933-50941105:- | ENST00000494752 | ENST00000494752,ENST00000371584 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
ENSG00000000419;A3:20:50940955-50942031:50940933-50942031:- | NaN | 1.0000 | 1.3710 | 0.0000 | 1.0000 | FALSE | FALSE | ENSG00000000419 | Alternate 3 prime | 20:50940955-50942031:50940933-50942031:- | Insignificant | ENSG00000000419;A3:20:50940955-50942031:50940933-50942031:- | ENST00000466152 | ENST00000466152,ENST00000371588 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
ENSG00000000419;A5:20:50940933-50941105:50940933-50941129:- | -0.0806 | 0.2907 | 0.9346 | 0.5365 | 0.9302 | FALSE | FALSE | ENSG00000000419 | Alternate 5 prime | 20:50940933-50941105:50940933-50941129:- | Insignificant | ENSG00000000419;A5:20:50940933-50941105:50940933-50941129:- | ENST00000371584 | ENST00000413082,ENST00000371582,ENST00000371584 | 0.8497 | 0.7415 | 0.7522 | 0.7260 | 0.7368 | 0.6626 | 0.6987 | 0.6559 | 0.7234 |
## The biomart annotations file already exists, loading from it.
## Saving to: excel/suppa_table.xlsx
if (!isTRUE(get0("skip_load"))) {
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
pander::pander(sessionInfo())
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 1b009834267dea125ee94934203413fbd606e783
## R> packrat::restore()
## This is hpgltools commit: Mon Apr 23 14:59:56 2018 -0400: 1b009834267dea125ee94934203413fbd606e783
## Saving to 03_suppa_invocation-v20180302.rda.xz
R version 3.4.4 (2018-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.utf8, LC_NUMERIC=C, LC_TIME=en_US.utf8, LC_COLLATE=en_US.utf8, LC_MONETARY=en_US.utf8, LC_MESSAGES=en_US.utf8, LC_PAPER=en_US.utf8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.utf8 and LC_IDENTIFICATION=C
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: hpgltools(v.2018.03)
loaded via a namespace (and not attached): Rcpp(v.0.12.16), highr(v.0.6), compiler(v.3.4.4), pillar(v.1.2.1), plyr(v.1.8.4), base64enc(v.0.1-3), iterators(v.1.0.9), tools(v.3.4.4), digest(v.0.6.15), evaluate(v.0.10.1), memoise(v.1.1.0), tibble(v.1.4.2), gtable(v.0.2.0), rlang(v.0.2.0.9001), openxlsx(v.4.0.17), foreach(v.1.4.4), commonmark(v.1.4), ggrepel(v.0.7.0), yaml(v.2.1.18), parallel(v.3.4.4), withr(v.2.1.2), stringr(v.1.3.0), knitr(v.1.20), roxygen2(v.6.0.1), xml2(v.1.2.0), devtools(v.1.13.5), rprojroot(v.1.3-2), grid(v.3.4.4), data.table(v.1.10.4-3), Biobase(v.2.38.0), R6(v.2.2.2), rmarkdown(v.1.9), pander(v.0.6.1), ggplot2(v.2.2.1), magrittr(v.1.5), backports(v.1.1.2), scales(v.0.5.0.9000), codetools(v.0.2-15), htmltools(v.0.3.6), BiocGenerics(v.0.24.0), colorspace(v.1.3-2), labeling(v.0.3), stringi(v.1.1.7), lazyeval(v.0.2.1) and munsell(v.0.4.3)