1 Introduction

This document will seek to perform some preprocessing, interpretation, de-novo assembly, and differential expression of a series of un/infected Oncomelania samples.

2 Preprocessing

I am not completely certain of the best putative parasite transcriptomes to compare each samples against. So let us start with trimming.

2.1 Trimming

I will use trimomatic to trim these reads followed by fastqc to get a sense of the sequencer quality.

cd preprocessing
start=$(pwd)
for i in $(/bin/ls); do
    cd ${i}
    cyoa --method trim --input r1.fastq.gz:r2.fastq.gz
    cd ${start}
done

2.2 De-novo assembly

If I understand properly the experimental design, all samples are the same host. So, I will concatenate all samples (post-parasite filtering) into one large set and use the tool ‘Trinity’ on them.

mkdir denovo_assembly
for i in $(find preprocessing/ -name r1_trimmed.fastq.xz); do
    cat $i >> denovo_assembly/r1_trimmed.fastq.xz
done
for i in $(find preprocessing/ -name r2_trimmed.fastq.xz); do
    cat $i >> denovo_assembly/r2_trimmed.fastq.xz
done

cd denovo_assembly
cyoa --method trinity --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz

cyoa --method trinotate --input outputs/trinotate/Trinity.fasta

2.2.1 Trinity script

In a similar fashion, here are the actual scripts used to run trinity/trinotate, and the post-processing methods employed by them.

## This is a trinity submission script.
mkdir -p outputs/60trinity_r1_trimmed && \
  Trinity --seqType fq --min_contig_length 600 --normalize_reads \
    --max_memory 90G --CPU 6 \
    --output outputs/60trinity_r1_trimmed \
    --left <(less r1_trimmed.fastq.xz) --right <(less r2_trimmed.fastq.xz)

2.2.2 Trinotate script

function cleanup {
  echo "Removing /tmp/Trinity.fasta.sqlite"
  rm  -f /tmp/Trinity.fasta.sqlite
}
trap cleanup EXIT

mkdir -p outputs/20trinotate60trinity_r1_trimmed
start=$(pwd)
cd outputs/20trinotate60trinity_r1_trimmed
ln -sf "/home/trey/sshfs/scratch/atb/rnaseq/oncomelania_2023/denovo_assembly/outputs/60trinity_r1_trimmed/Trinity.fasta" .
rm -f "Trinity.fasta.gene_trans_map"
if [[ -f "Trinity.fasta.gene_trans_map" ]]; then
  echo "The gene to transcript map already exists."
else
  ids=$({ grep "^>" /home/trey/sshfs/scratch/atb/rnaseq/oncomelania_2023/denovo_assembly/outputs/60trinity_r1_trimmed/Trinity.fasta || test $? = 1; } | sed 's/>//g' | awk '{print $1}')
  for i in ${ids}; do
    echo "${i}  ${i}" >> Trinity.fasta.gene_trans_map
  done
fi

cp /sw/local/trinotate/3.2.2/bin/Trinotate.sqlite /tmp/Trinity.fasta.sqlite
/sw/local/trinotate/3.2.2/bin/auto/autoTrinotate.pl \
  --conf /sw/local/trinotate/3.2.2/bin/auto/conf.txt \
  --Trinotate_sqlite /tmp/Trinity.fasta.sqlite \
  --transcripts Trinity.fasta \
  --gene_to_trans_map Trinity.fasta.gene_trans_map \
  --CPU 16
mv Trinotate.tsv Trinity.tsv
rm -f *.ok *.out *.outfmt6 *.cmds *.log
rm -rf TMHMM_* Trinity.fasta.ffn.trans*
cd ${start}

2.2.3 Trinity post processing

Trinity also provides some RSEM-based quantitation and quality assurance post processing methods.

start=$(pwd)
cd outputs/60trinity_r1_trimmed
/sw/local/trinity/2.15.1/util/TrinityStats.pl Trinity.fasta

/sw/local/trinity/2.15.1/util/align_and_estimate_abundance.pl \
  --output_dir align_estimate.out \
  --transcripts outputs/trinity_r1_trimmed/Trinity.fasta \
  --seqType fq \
  --left r1_trimmed.fastq.xz --right r2_trimmed.fastq.xz  \
  --est_method RSEM \
  --aln_method bowtie \
  --trinity_mode --prep_reference \
  2>trinpost_align_estimate.stderr \
  1>trinpost_align_estimate.stdout

/sw/local/trinity/2.15.1/util/abundance_estimates_to_matrix.pl \
  --est_method RSEM \
  --gene_trans_map Trinity.fasta.gene_trans_map \
  align_estimate.out/RSEM.isoform.results \
  2>trinpost_estimate_to_matrix.stderr \
  1>trinpost_estimate_to_matrix.stdout

/sw/local/trinity/2.15.1/util/SAM_nameSorted_to_uniq_count_stats.pl \
  bowtie_out/bowtie_out.nameSorted.bam \
  2>trinpost_count_stats.stderr \
  1>trinpost_count_stats.stdout

cd ${start}

2.2.4 An example of the trimming process

The above command creates a script run on our computing cluster to trim the reads. Here is an arbitrarily chosen script it generated:

#!/usr/bin/bash
#SBATCH --export=ALL --requeue --mail-type=NONE --open-mode=append
#SBATCH --chdir=/z1/scratch/atb/rnaseq/oncomelania_2023/preprocessing/3R2H-1
#SBATCH --job-name=01trim_3R2H-1_R1_001 --nice=0
#SBATCH --output=outputs/log.txt.sbatch
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=4
#SBATCH --time=36:00:00
#SBATCH --mem=24G
set -o errexit
set -o errtrace
set -o pipefail
export LESS='--buffers 0 -B'
echo "## Started /z1/scratch/atb/rnaseq/oncomelania_2023/preprocessing/3R2H-1/scripts/01trim_3R2H-1_R1_001.sh at $(date) on $(hostname) with id ${SLURM_JOBID}." >> outputs/log.txt
mod=$( { type -t module || true; } )
if [[ -z "${mod}" ]]; then
  module() {
    eval $(/usr/bin/modulecmd bash $*);
  }
  export -f module
fi
module add trimomatic
## This call to trimomatic removes illumina and epicentre adapters from 3R2H-1_R1_001.fastq.gz:3R2H-1_R2_001.fastq.gz.
## It also performs a sliding window removal of anything with quality <25;
## cutadapt provides an alternative to this tool.
## The original sequence data is recompressed and saved in the sequences/ directory.
mkdir -p outputs/01trimomatic
## Note that trimomatic prints all output and errors to STDERR, so send both to output
trimmomatic PE \
  -threads 1 \
  -phred33 \
  3R2H-1_R1_001.fastq.gz 3R2H-1_R2_001.fastq.gz \
  3R2H-1_R1_001-trimmed_paired.fastq 3R2H-1_R1_001-trimmed_unpaired.fastq \
  3R2H-1_R2_001-trimmed_paired.fastq 3R2H-1_R2_001-trimmed_unpaired.fastq \
   ILLUMINACLIP:/sw/local/cyoa/202302/prefix/lib/perl5/auto/share/dist/Bio-Adventure/genome/adapters.fa:2:20:10:2:keepBothReads  \
  SLIDINGWINDOW:4:20 MINLEN:50 \
  1>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stdout \
  2>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stderr
excepted=$( { grep "Exception" "outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stdout" || test $? = 1; } )
## The following is in case the illumina clipping fails, which it does if this has already been run I think.
if [[ "${excepted}" != "" ]]; then
  trimmomatic PE \
    -threads 1 \
    -phred33 \
    3R2H-1_R1_001.fastq.gz 3R2H-1_R2_001.fastq.gz \
    3R2H-1_R1_001-trimmed_paired.fastq 3R2H-1_R1_001-trimmed_unpaired.fastq \
    3R2H-1_R2_001-trimmed_paired.fastq 3R2H-1_R2_001-trimmed_unpaired.fastq \
     SLIDINGWINDOW:4:25 MINLEN:50 \
    1>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stdout \
    2>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stderr
fi
sleep 10
mv 3R2H-1_R1_001-trimmed_paired.fastq 3R2H-1_R1_001-trimmed.fastq
mv 3R2H-1_R2_001-trimmed_paired.fastq 3R2H-1_R2_001-trimmed.fastq

## Recompress the unpaired reads, this should not take long.
xz -9e -f 3R2H-1_R1_001-trimmed_unpaired.fastq
xz -9e -f 3R2H-1_R2_001-trimmed_unpaired.fastq
## Recompress the paired reads.
xz -9e -f 3R2H-1_R1_001-trimmed.fastq
xz -9e -f 3R2H-1_R2_001-trimmed.fastq
ln -sf 3R2H-1_R1_001-trimmed.fastq.xz r1_trimmed.fastq.xz
ln -sf 3R2H-1_R2_001-trimmed.fastq.xz r2_trimmed.fastq.xz

## The following lines give status codes and some logging
echo "Job status: $? " >> outputs/log.txt
minutes_used=$(( SECONDS / 60 ))
echo "  $(hostname) Finished ${SLURM_JOBID} 01trim_3R2H-1_R1_001.sh at $(date), it took ${minutes_used} minutes." >> outputs/log.txt
if [[ -x "$(command -v sstat)" && ! -z "${SLURM_JOBID}" ]]; then
  echo "  walltime used by ${SLURM_JOBID} was: ${minutes_used:-null} minutes." >> outputs/log.txt
  maxmem=$(sstat -n -P --format=MaxVMSize -j "${SLURM_JOBID}")
  ## I am not sure why, but when I run a script in an interactive session, the maxmem variable
  ## gets set correctly everytime, but when it is run by another node, sometimes it does not.
  ## Lets try and figure that out...
  echo "TESTME: ${maxmem}"
  if [[ ! -z "${maxmem}" ]]; then
    echo "  maximum memory used by ${SLURM_JOBID} was: ${maxmem}." >> outputs/log.txt
  else
    echo "  The maximum memory did not get set for this job: ${SLURM_JOBID}." >> outputs/log.txt
    sstat -P -j "${SLURM_JOBID}" >> outputs/log.txt
  fi
  echo "" >> outputs/log.txt
fi
touch outputs/logs/01trim_3R2H-1_R1_001.finished

The beginning and end of the script are used to tell the computing cluster some information about the job and collect some statistics about the run so that I may improve the job in the future.

In between is the meat of the process, in which it invokes the trimomatic software and performs the actual trimming. Upon completion it uses a very aggressive compression tool (xz) to compress the final results and save some disk space.

2.3 Kraken

I am going to use the kmer catalog tool, kraken2 in order to see what species are in this.

cd preprocessing
start=$(pwd)
for i in $(/bin/ls); do
    cd ${i}
    cyoa --method kraken --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz
    cd ${start}
done

2.3.1 Kraken script

Here is an example script generated to run kraken on the reads used for denovo assembly. I am going to strip out the stuff used to define the job on the cluster. As you can see, it is making use of the viral kraken database, there is another job which uses the bacterial database and differs only in the suffix of the various output files.

kraken2 --db ${KRAKEN2_DB_PATH}/viral \
  --report outputs/11kraken_viral/kraken_report.txt --use-mpa-style \
  --use-names  --paired <(less r1_trimmed.fastq.xz) <(less r2_trimmed.fastq.xz)  \
  --classified-out outputs/11kraken_viral/classified#.fastq.gz \
  --unclassified-out outputs/11kraken_viral/unclassified#.fastq.gz \
  2>outputs/11kraken_viral/kraken.stderr \
  1>outputs/11kraken_viral/kraken.stdout
if [ "$?" -ne "0" ]; then
  echo "Kraken returned an error."
fi

## Cleaning up after running kraken.
rm -f outputs/11kraken_viral/kraken.stdout
rm -f outputs/11kraken_viral/*.fastq.gz

3 Hisat2

The hisat2 process is a bit more involved and for the moment I am going to skip it because we primarily are using salmon for now.

cd preprocessing
start=$(pwd)
for i in $(/bin/ls -d 3* 5* M*); do
    cd $i
    cyoa --method hisat --species oncomelania_unknown --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz --stranded no
    cyoa --method salmon --species oncomelania_unknown --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz
    cd $start
done

3.0.1 Salmon quantification script

Another important note, I had not yet renamed the transcriptome to the correct species when I ran this, so there remain places which incorrectly state oncomelania. That has since been corrected, but not reflected in some of these files.

mkdir -p outputs/45salmon_oncomelania_unknown
salmon quant -i /home/trey/libraries/genome/indexes/oncomelania_unknown_salmon_index \
  -l A --gcBias --validateMappings  \
   -1 <(less r1_trimmed.fastq.xz) -2 <(less r2_trimmed.fastq.xz)  \
  -o outputs/45salmon_oncomelania_unknown \
  2>outputs/45salmon_oncomelania_unknown/salmon_oncomelania_unknown.stderr \
  1>outputs/45salmon_oncomelania_unknown/salmon_oncomelania_unknown.stdout

Everything before this point is performed on the computing cluster and provides the input for what is coming. These were mostly done via my little package ‘cyoa’.

4 End of the pre processing

Everything which follows is performed on my workstation in R and uses my little package ‘hpgltools’.

5 Examine the samples and mapped reads

I called the denovo transcriptome that I created ‘oncomelania_unknown’ and mapped each sample against it using hisat2 and salmon. Given the sample sheet that Joy sent me, I think I can extract the relevant mapping statistics into it.

spec <- make_rnaseq_spec()
meta <- gather_preprocessing_metadata("sample_sheets/all_samples.xlsx", specification = spec)
## Using provided specification
## Example filename: preprocessing/3RNI-1/outputs/*trimomatic/*-trimomatic.stderr.
## Not including new entries for: trimomatic_input, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*trimomatic/*-trimomatic.stderr.
## Not including new entries for: trimomatic_output, it is empty.
## Missing data to calculate the ratio between: trimomatic_output and trimomatic_input.
## Not including new entries for: trimomatic_percent, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*fastqc/*_fastqc/fastqc_data.txt.
## Not including new entries for: fastqc_pct_gc, it is empty.
## Skipping for now
## Not including new entries for: fastqc_most_overrepresented, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/hisat2_*rRNA*.stderr.
## Not including new entries for: hisat_rrna_single_concordant, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/hisat2_*rRNA*.stderr.
## Not including new entries for: hisat_rrna_multi_concordant, it is empty.
## Missing data to calculate the ratio between: hisat_rrna_multi_concordant and trimomatic_output.
## Not including new entries for: hisat_rrna_percent, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_single_concordant, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_multi_concordant, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_single_all, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/hisat2_*genome*.stderr.
## Not including new entries for: hisat_genome_multi_all, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/hisat2_*genome*.stderr.
## Not including new entries for: hisat_unmapped, it is empty.
## Missing data to calculate the ratio between: hisat_genome_single_concordant and trimomatic_output.
## Not including new entries for: hisat_genome_percent, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/3RNI-1/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/MNI-3D/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/MNI-4E/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/3R2H-1/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/3R2H-2/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/3R2H-4/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/3R4H-1/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/3R4H-2/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/3R4H-4/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/5M-1/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/5M-2/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/M2H-4D/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/M2H-1E/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/M2H-3E/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/M4H-2D/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/M4H-1E/outputs/*hisat2_*/*_genome*.count.xz.
## Warning in dispatch_filename_search(meta, input_file_spec, verbose = verbose, : The input file is NA for:
## preprocessing/M4H-4E/outputs/*hisat2_*/*_genome*.count.xz.
## Not including new entries for: hisat_count_table, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*hisat2_*/*_genome*.count.xz.
## Not including new entries for: hisat_observed_genes, it is empty.
## Not including new entries for: hisat_observed_mean_exprs, it is empty.
## Not including new entries for: hisat_observed_median_exprs, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*salmon_*/salmon_*.stderr.
## Not including new entries for: salmon_stranded, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*salmon_*/quant.sf.
## Example filename: preprocessing/3RNI-1/outputs/*salmon_*/salmon_*.stderr.
## Not including new entries for: salmon_mapped, it is empty.
## Example filename: preprocessing/3RNI-1/outputs/*salmon_*/salmon_*.stderr.
## Not including new entries for: salmon_percent, it is empty.
## Writing new metadata to: sample_sheets/all_samples_modified.xlsx
## Deleting the file sample_sheets/all_samples_modified.xlsx before writing the tables.
new_meta_file <- meta[["new_file"]]

6 Collect annotations

I used trinotate to perform some blast etc searches against the denovo transcriptome I generated as well as passed it to interproscan. Strangely, it looks like interproscan didn’t finish, though I thought I checked it previously…

ok then, I will start with the trinotate results.

trinotate <- load_trinotate_annotations(
  "denovo_assembly/outputs/20trinotate60trinity_r1_trimmed/Trinotate.xls")
trinotate <- as.data.frame(trinotate)
rownames(trinotate) <- trinotate[["rownames"]]
trinotate[["rownames"]] <- NULL

7 Create an expressionset

Using the metadata I downloaded and modified with gather_metadata(), along with the annotations, lets make a dataset!

The following block assumes you have all of the processed/raw data, which is quite a large amount of space. If you wish to skip that step, one may instead just load the expt.rda file on box.

onco_expt <- create_expt(new_meta_file, gene_info = trinotate, file_column = "salmoncounttable")
## Reading the sample metadata.
## The sample definitions comprises: 17 rows(samples) and 37 columns(metadata fields).
## Matched 335124 annotations and counts.
## Bringing together the count matrix and gene information.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 335124 features and 17 samples.
onco_expt
## An expressionSet containing experiment with 335124
## genes and 17 samples. There are 38 metadata columns and
## 30 annotation columns; the primary condition is comprised of:
## uninfected, sjap_2h, sjap_4h, sjap_5m, sman_2h, sman_4h.
## Its current state is: raw(data).
plot_libsize(onco_expt)
## Library sizes of 17 samples, \
## ranging from 28,391,920 to 41,661,006.

plot_nonzero(onco_expt)
## The following samples have less than 217830.6 genes.
##  [1] "MNI-3D" "MNI-4E" "3R2H-1" "3R2H-2" "3R2H-4" "3R4H-1" "3R4H-4" "5M-1"   "5M-2"   "M2H-4D" "M2H-1E" "M2H-3E" "M4H-2D" "M4H-1E"
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Not putting labels on the plot.
## 
## A non-zero genes plot of 17 samples.
## These samples have an average 35.61 CPM coverage and 212103 genes observed, ranging from 197020 to
## 224047.

I think it might be worth while to collapse the apparent gene groups. Lets see what that does to the data…

trinotate[["collapsed_gid"]] <- gsub(x = rownames(trinotate), pattern = "^(.*_c\\d+_g\\d+)_i\\d+$",
                                     replacement = "\\1")
tx_gene_map <- trinotate[, c("gene_id", "collapsed_gid")]
collapsed_meta <- trinotate
rownames(collapsed_meta) <- make.names(collapsed_meta[["collapsed_gid"]], unique=TRUE)


onco_collapsed <- create_expt(new_meta_file, gene_info = collapsed_meta,
                             tx_gene_map = tx_gene_map, file_column = "salmoncounttable")
## Reading the sample metadata.
## The sample definitions comprises: 17 rows(samples) and 37 columns(metadata fields).
## In some cases, (notably salmon) the format of the IDs used by this can be tricky.
## It is likely to require the transcript ID followed by a '.' and the ensembl column:
## 'transcript_version', which is explicitly different than the gene version column.
## If this is not correctly performed, very few genes will be observed
## Matched 147473 annotations and counts.
## Bringing together the count matrix and gene information.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 147473 features and 17 samples.
onco_collapsed
## An expressionSet containing experiment with 147473
## genes and 17 samples. There are 38 metadata columns and
## 31 annotation columns; the primary condition is comprised of:
## uninfected, sjap_2h, sjap_4h, sjap_5m, sman_2h, sman_4h.
## Its current state is: raw(data).
plot_libsize(onco_collapsed)
## Library sizes of 17 samples, \
## ranging from 28,391,920 to 41,661,006.

plot_nonzero(onco_collapsed)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Not putting labels on the plot.
## 
## A non-zero genes plot of 17 samples.
## These samples have an average 35.61 CPM coverage and 112323 genes observed, ranging from 106556 to
## 119007.

plot_meta_sankey(onco_collapsed, factors = c("time", "parasite"))
## A sankey plot describing the metadata of 17 samples,
## including 10 out of 16 nodes and traversing metadata factors:
## time, parasite.

all_norm <- normalize_expt(onco_collapsed, transform = "log2", convert = "cpm", filter = TRUE)
## Removing 101648 low-count genes (45825 remaining).
## transform_counts: Found 96804 values equal to 0, adding 1 to the matrix.
all_pca <- plot_pca(all_norm)
all_pca
## The result of performing a fast_svd dimension reduction.
## The x-axis is PC1 and the y-axis is PC2
## Colors are defined by uninfected, sjap_2h, sjap_4h, sjap_5m, sman_2h, sman_4h
## Shapes are defined by A, B, C, X.

8 Examine the expressionset

For the moment, let us assume that the collapsed dataset is more appropriate. I think this will need further evaluation, I really expected there to be <100,000 genes after collapsing.

onco_collapsed <- load("expt.rda")
onco_early <- set_expt_batches(onco_early, fact = "parasite")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'onco_early' not found
onco_box <- plot_boxplot(onco_early)
## Error in plot_boxplot(onco_early): object 'onco_early' not found
onco_box$plot
## Error in eval(expr, envir, enclos): object 'onco_box' not found
onco_early_norm <- normalize_expt(onco_early, transform = "log2", convert = "cpm",
                           norm = "quant", filter = TRUE)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'expt' in selecting a method for function 'state': object 'onco_early' not found
onco_pca <- plot_pca(onco_early_norm)
## Error in plot_pca(onco_early_norm): object 'onco_early_norm' not found
onco_pca
## Error in eval(expr, envir, enclos): object 'onco_pca' not found
onco_early_nb <- normalize_expt(onco_early, transform = "log2", convert = "cpm",
                               filter = TRUE, batch = "svaseq")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'expt' in selecting a method for function 'state': object 'onco_early' not found
onco_pca <- plot_pca(onco_early_nb)
## Error in plot_pca(onco_early_nb): object 'onco_early_nb' not found
onco_pca
## Error in eval(expr, envir, enclos): object 'onco_pca' not found
only_inf <- subset_expt(onco_early, subset = "time!='undef'")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'expt' in selecting a method for function 'subset_expt': object 'onco_early' not found
onco_varpart <- simple_varpart(only_inf, factors = c("time", "parasite"))
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'only_inf' not found

While putting together the variance partition result, I noticed that there are at least a few genes with an extraordinary number of reads.

only_inf <- variance_expt(only_inf)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'expt' in selecting a method for function 'state': object 'only_inf' not found
annot <- fData(only_inf)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'fData': object 'only_inf' not found
highest_prop <- order(annot[["exprs_gene_rawprop"]], decreasing = TRUE)
## Error in eval(quote(list(...)), env): object 'annot' not found
head(annot[highest_prop, ], n = 10)
## Error in head(annot[highest_prop, ], n = 10): object 'annot' not found
highest_prop <- order(annot[["exprs_cv"]], decreasing = TRUE)
## Error in eval(quote(list(...)), env): object 'annot' not found
head(annot[highest_prop, ], n = 10)
## Error in head(annot[highest_prop, ], n = 10): object 'annot' not found

9 Perform initial DE

contrasts <- list(
  "mansoni_4v2" = c("sman4h", "sman2h"),
  "japonicum_4v2" = c("sjap4h", "sjap2h"),
  "4h_species" = c("sjap4h", "sman4h"),
  "2h_species" = c("sjap2h", "sman2h"))

start_de <- all_pairwise(only_inf, filter = TRUE, model_batch = "svaseq", force = TRUE)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'pData': object 'only_inf' not found
start_table <- combine_de_tables(start_de, keepers = contrasts,
                                 excel = "excel/start_tables.xlsx")
## Deleting the file excel/start_tables.xlsx before writing the tables.
## Error in get_expt_colors(apr[["input"]]): object 'start_de' not found
start_table
## Error in eval(expr, envir, enclos): object 'start_table' not found
start_sig <- extract_significant_genes(start_table, according_to = "deseq", lfc = 1, p = 0.05,
                                       excel = "excel/start_sig.xlsx")
## Deleting the file excel/start_sig.xlsx before writing the tables.
## Error in extract_significant_genes(start_table, according_to = "deseq", : object 'start_table' not found

10 Compare each group to uninfected

vs_uninf_contrasts <- list(
  "sj2h_uninf" = c("sjap2h", "uninfected"),
  "sj4h_uninf" = c("sjap4h", "uninfected"),
  "sm2h_uninf" = c("sman2h", "uninfected"),
  "sm4h_uninf" = c("sman4h", "uninfected"),
  "sj5m_uninf" = c("sjap5m", "uninfected"),
  "sj5m_sj2h" = c("sjap5m", "sjap2h"),
  "sj5m_sj4h" = c("sjap5m", "sjap4h"))

all_vs_uninf <- all_pairwise(onco_collapsed, filter = TRUE, model_batch = "svaseq", force = TRUE)
## 
## uninfected    sjap_2h    sjap_4h    sjap_5m    sman_2h    sman_4h 
##          3          3          3          2          3          3
## Removing 0 low-count genes (45825 remaining).
## Setting 60340 low elements to zero.
## transform_counts: Found 60340 values equal to 0, adding 1 to the matrix.
all_vs_uninf_tables <- combine_de_tables(all_vs_uninf, keepers = vs_uninf_contrasts,
                                         excel = "excel/all_vs_uninf_tables.xlsx")
## Adding venn plots for sj2h_uninf.
## Adding venn plots for sj4h_uninf.
## Adding venn plots for sm2h_uninf.
## Adding venn plots for sm4h_uninf.
## Adding venn plots for sj5m_uninf.
## Adding venn plots for sj5m_sj2h.
## Adding venn plots for sj5m_sj4h.
all_vs_uninf_tables
## A set of combined differential expression results.

##                           table deseq_sigup deseq_sigdown edger_sigup edger_sigdown limma_sigup limma_sigdown
## 1 uninfected_vs_sjap2h-inverted          20            12          76            39           0             0
## 2 uninfected_vs_sjap4h-inverted          63            35        3192           268           0             0
## 3 uninfected_vs_sman2h-inverted          46            12          65            57           0             0
## 4 uninfected_vs_sman4h-inverted          64            16         171            59           0             0
## 5 uninfected_vs_sjap5m-inverted       11072          7809        9631          3672       20586         29473
## 6              sjap5m_vs_sjap2h        9024          6825        8473          2625       20876         25683
## 7              sjap5m_vs_sjap4h        7971          6978        7493          2755       21025         25615

all_vs_uninf_sig <- extract_significant_genes(all_vs_uninf_tables, according_to = "deseq", lfc = 1, p = 0.05,
                                              excel = "excel/all_vs_uninf_sig.xlsx")
all_vs_uninf_sig
## A set of genes deemed significant according to deseq.
## The parameters defining significant were:
## LFC cutoff: 1 adj P cutoff: 0.05
##            deseq_up deseq_down
## sj2h_uninf       20         12
## sj4h_uninf       63         35
## sm2h_uninf       46         12
## sm4h_uninf       64         16
## sj5m_uninf    11072       7809
## sj5m_sj2h      9024       6825
## sj5m_sj4h      7971       6978

pander::pander(sessionInfo())

R version 4.2.0 (2022-04-22)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: ruv(v.0.9.7.1), BiocParallel(v.1.32.6), variancePartition(v.1.28.9), hpgltools(v.1.0), testthat(v.3.1.8), glue(v.1.6.2), reticulate(v.1.28), SummarizedExperiment(v.1.28.0), GenomicRanges(v.1.50.2), GenomeInfoDb(v.1.34.9), IRanges(v.2.32.0), S4Vectors(v.0.36.2), MatrixGenerics(v.1.10.0), matrixStats(v.0.63.0), Biobase(v.2.58.0) and BiocGenerics(v.0.44.0)

loaded via a namespace (and not attached): rappdirs(v.0.3.3), rtracklayer(v.1.58.0), R.methodsS3(v.1.8.2), tidyr(v.1.3.0), ggplot2(v.3.4.2), clusterGeneration(v.1.3.7), bit64(v.4.0.5), knitr(v.1.42), R.utils(v.2.12.2), DelayedArray(v.0.24.0), data.table(v.1.14.8), KEGGREST(v.1.38.0), RCurl(v.1.98-1.12), doParallel(v.1.0.17), generics(v.0.1.3), preprocessCore(v.1.60.2), GenomicFeatures(v.1.50.4), callr(v.3.7.3), RhpcBLASctl(v.0.23-42), cowplot(v.1.1.1), usethis(v.2.1.6), RSQLite(v.2.3.1), shadowtext(v.0.1.2), bit(v.4.0.5), tzdb(v.0.3.0), enrichplot(v.1.18.4), xml2(v.1.3.4), httpuv(v.1.6.10), viridis(v.0.6.3), tximport(v.1.26.1), xfun(v.0.39), hms(v.1.1.3), jquerylib(v.0.1.4), IHW(v.1.26.0), evaluate(v.0.21), promises(v.1.2.0.1), DEoptimR(v.1.0-13), fansi(v.1.0.4), restfulr(v.0.0.15), progress(v.1.2.2), caTools(v.1.18.2), dbplyr(v.2.3.2), geneplotter(v.1.76.0), igraph(v.1.4.2), DBI(v.1.1.3), htmlwidgets(v.1.6.2), purrr(v.1.0.1), ellipsis(v.0.3.2), dplyr(v.1.1.2), backports(v.1.4.1), annotate(v.1.76.0), aod(v.1.3.2), biomaRt(v.2.54.1), vctrs(v.0.6.3), remotes(v.2.4.2), cachem(v.1.0.8), withr(v.2.5.0), ggforce(v.0.4.1), HDO.db(v.0.99.1), robustbase(v.0.95-1), vroom(v.1.6.3), GenomicAlignments(v.1.34.1), treeio(v.1.22.0), fdrtool(v.1.2.17), prettyunits(v.1.1.1), DOSE(v.3.24.2), ape(v.5.7-1), lazyeval(v.0.2.2), crayon(v.1.5.2), genefilter(v.1.80.3), slam(v.0.1-50), labeling(v.0.4.2), edgeR(v.3.40.2), pkgconfig(v.2.0.3), tweenr(v.2.0.2), nlme(v.3.1-162), pkgload(v.1.3.2), devtools(v.2.4.5), rlang(v.1.1.1), lifecycle(v.1.0.3), miniUI(v.0.1.1.1), ggsankey(v.0.0.99999), downloader(v.0.4), filelock(v.1.0.2), BiocFileCache(v.2.6.1), rprojroot(v.2.0.3), polyclip(v.1.10-4), graph(v.1.76.0), Matrix(v.1.5-4), aplot(v.0.1.10), lpsymphony(v.1.26.3), boot(v.1.3-28.1), processx(v.3.8.1), png(v.0.1-8), viridisLite(v.0.4.2), rjson(v.0.2.21), bitops(v.1.0-7), R.oo(v.1.25.0), gson(v.0.1.0), KernSmooth(v.2.23-21), pander(v.0.6.5), Biostrings(v.2.66.0), blob(v.1.2.4), stringr(v.1.5.0), qvalue(v.2.30.0), readr(v.2.1.4), remaCor(v.0.0.11), gridGraphics(v.0.5-1), scales(v.1.2.1), memoise(v.2.0.1), GSEABase(v.1.60.0), magrittr(v.2.0.3), plyr(v.1.8.8), gplots(v.3.1.3), zlibbioc(v.1.44.0), compiler(v.4.2.0), scatterpie(v.0.1.9), BiocIO(v.1.8.0), RColorBrewer(v.1.1-3), lme4(v.1.1-33), DESeq2(v.1.38.3), Rsamtools(v.2.14.0), cli(v.3.6.1), XVector(v.0.38.0), urlchecker(v.1.0.1), patchwork(v.1.1.2), ps(v.1.7.5), MASS(v.7.3-60), mgcv(v.1.8-42), tidyselect(v.1.2.0), stringi(v.1.7.12), highr(v.0.10), yaml(v.2.3.7), GOSemSim(v.2.24.0), locfit(v.1.5-9.7), ggrepel(v.0.9.3), grid(v.4.2.0), sass(v.0.4.6), fastmatch(v.1.1-3), tools(v.4.2.0), parallel(v.4.2.0), rstudioapi(v.0.14), foreach(v.1.5.2), gridExtra(v.2.3), farver(v.2.1.1), ggraph(v.2.1.0), digest(v.0.6.31), shiny(v.1.7.4), networkD3(v.0.4), Rcpp(v.1.0.10), broom(v.1.0.4), later(v.1.3.1), httr(v.1.4.6), AnnotationDbi(v.1.60.2), Rdpack(v.2.4), colorspace(v.2.1-0), brio(v.1.1.3), XML(v.3.99-0.14), fs(v.1.6.2), splines(v.4.2.0), RBGL(v.1.74.0), yulab.utils(v.0.0.6), PROPER(v.1.30.0), tidytree(v.0.4.2), graphlayouts(v.1.0.0), ggplotify(v.0.1.0), plotly(v.4.10.1), sessioninfo(v.1.2.2), xtable(v.1.8-4), jsonlite(v.1.8.7), nloptr(v.2.0.3), ggtree(v.3.6.2), tidygraph(v.1.2.3), corpcor(v.1.6.10), UpSetR(v.1.4.0), ggfun(v.0.0.9), Vennerable(v.3.1.0.9000), R6(v.2.5.1), RUnit(v.0.4.32), profvis(v.0.3.8), pillar(v.1.9.0), htmltools(v.0.5.5), mime(v.0.12), fastmap(v.1.1.1), minqa(v.1.2.5), clusterProfiler(v.4.6.2), codetools(v.0.2-19), fgsea(v.1.24.0), pkgbuild(v.1.4.0), mvtnorm(v.1.1-3), utf8(v.1.2.3), lattice(v.0.21-8), bslib(v.0.4.2), tibble(v.3.2.1), sva(v.3.46.0), pbkrtest(v.0.5.2), curl(v.5.0.1), gtools(v.3.9.4), zip(v.2.3.0), GO.db(v.3.16.0), openxlsx(v.4.2.5.2), survival(v.3.5-5), limma(v.3.54.2), rmarkdown(v.2.21), desc(v.1.4.2), munsell(v.0.5.0), GenomeInfoDbData(v.1.2.9), iterators(v.1.0.14), reshape2(v.1.4.4), gtable(v.0.3.3) and rbibutils(v.2.2.13)

message("This is hpgltools commit: ", get_git_commit())
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 03b4182f2adc26f5d381a8049cc990d5dd4ef218
## This is hpgltools commit: Fri Aug 11 12:20:59 2023 -0400: 03b4182f2adc26f5d381a8049cc990d5dd4ef218
this_save <- paste0(gsub(pattern = "\\.Rmd", replace = "", x = rmd_file), "-v", ver, ".rda.xz")
#message("Saving to ", this_save)
#tmp <- sm(saveme(filename = this_save))
---
title: "Comparing Oncomelania transcriptomes pre/post infection."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
  html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: zenburn
    keep_md: false
    mode: selfcontained
    number_sections: true
    self_contained: true
    theme: readable
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: false
  rmdformats::readthedown:
    code_download: true
    code_folding: show
    df_print: paged
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: zenburn
    width: 300
    keep_md: false
    mode: selfcontained
    toc_float: true
  BiocStyle::html_document:
    code_download: true
    code_folding: show
    fig_caption: true
    fig_height: 7
    fig_width: 7
    highlight: zenburn
    keep_md: false
    mode: selfcontained
    toc_float: true
---

<style type="text/css">
body, td {
  font-size: 16px;
}
code.r{
  font-size: 16px;
}
pre {
 font-size: 16px
}
body .main-container {
  max-width: 1600px;
}
</style>

```{r options, include=FALSE}
library(hpgltools)
library(reticulate)
tt <- devtools::load_all("~/hpgltools")
knitr::opts_knit$set(
  progress = TRUE, verbose = TRUE, width = 90, echo = TRUE)
knitr::opts_chunk$set(
  error = TRUE, fig.width = 8, fig.height = 8, fig.retina = 2,
  fig.pos = "t", fig.align = "center", dpi = if (knitr::is_latex_output()) 72 else 300,
  out.width = "100%", dev = "png",
  dev.args = list(png = list(type = "cairo-png")))
old_options <- options(digits = 4,
                       stringsAsFactors = FALSE,
                       knitr.duplicate.label = "allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size = 12))
ver <- "202306"
previous_file <- ""
ver <- format(Sys.Date(), "%Y%m%d")

##tmp <- sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz")))
rmd_file <- "index.Rmd"
```

# Introduction

This document will seek to perform some preprocessing, interpretation,
de-novo assembly, and differential expression of a series of
un/infected Oncomelania samples.

# Preprocessing

I am not completely certain of the best putative parasite
transcriptomes to compare each samples against.  So let us start with
trimming.

## Trimming

I will use trimomatic to trim these reads followed by fastqc to get a
sense of the sequencer quality.

```{bash trimming, eval=FALSE}
cd preprocessing
start=$(pwd)
for i in $(/bin/ls); do
    cd ${i}
    cyoa --method trim --input r1.fastq.gz:r2.fastq.gz
    cd ${start}
done
```

## De-novo assembly

If I understand properly the experimental design, all samples are the
same host.  So, I will concatenate all samples (post-parasite
filtering) into one large set and use the tool 'Trinity' on them.

```{bash trinity, eval=FALSE}
mkdir denovo_assembly
for i in $(find preprocessing/ -name r1_trimmed.fastq.xz); do
    cat $i >> denovo_assembly/r1_trimmed.fastq.xz
done
for i in $(find preprocessing/ -name r2_trimmed.fastq.xz); do
    cat $i >> denovo_assembly/r2_trimmed.fastq.xz
done

cd denovo_assembly
cyoa --method trinity --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz

cyoa --method trinotate --input outputs/trinotate/Trinity.fasta
```

### Trinity script

In a similar fashion, here are the actual scripts used to run
trinity/trinotate, and the post-processing methods employed by them.

```{bash trinity_script, eval=FALSE}
## This is a trinity submission script.
mkdir -p outputs/60trinity_r1_trimmed && \
  Trinity --seqType fq --min_contig_length 600 --normalize_reads \
    --max_memory 90G --CPU 6 \
    --output outputs/60trinity_r1_trimmed \
    --left <(less r1_trimmed.fastq.xz) --right <(less r2_trimmed.fastq.xz)
```

### Trinotate script

```{bash trinotate_script, eval=FALSE}
function cleanup {
  echo "Removing /tmp/Trinity.fasta.sqlite"
  rm  -f /tmp/Trinity.fasta.sqlite
}
trap cleanup EXIT

mkdir -p outputs/20trinotate60trinity_r1_trimmed
start=$(pwd)
cd outputs/20trinotate60trinity_r1_trimmed
ln -sf "/home/trey/sshfs/scratch/atb/rnaseq/oncomelania_2023/denovo_assembly/outputs/60trinity_r1_trimmed/Trinity.fasta" .
rm -f "Trinity.fasta.gene_trans_map"
if [[ -f "Trinity.fasta.gene_trans_map" ]]; then
  echo "The gene to transcript map already exists."
else
  ids=$({ grep "^>" /home/trey/sshfs/scratch/atb/rnaseq/oncomelania_2023/denovo_assembly/outputs/60trinity_r1_trimmed/Trinity.fasta || test $? = 1; } | sed 's/>//g' | awk '{print $1}')
  for i in ${ids}; do
    echo "${i}  ${i}" >> Trinity.fasta.gene_trans_map
  done
fi

cp /sw/local/trinotate/3.2.2/bin/Trinotate.sqlite /tmp/Trinity.fasta.sqlite
/sw/local/trinotate/3.2.2/bin/auto/autoTrinotate.pl \
  --conf /sw/local/trinotate/3.2.2/bin/auto/conf.txt \
  --Trinotate_sqlite /tmp/Trinity.fasta.sqlite \
  --transcripts Trinity.fasta \
  --gene_to_trans_map Trinity.fasta.gene_trans_map \
  --CPU 16
mv Trinotate.tsv Trinity.tsv
rm -f *.ok *.out *.outfmt6 *.cmds *.log
rm -rf TMHMM_* Trinity.fasta.ffn.trans*
cd ${start}
```

### Trinity post processing

Trinity also provides some RSEM-based quantitation and quality
assurance post processing methods.

```{bash trinity_post_script, eval=FALSE}
start=$(pwd)
cd outputs/60trinity_r1_trimmed
/sw/local/trinity/2.15.1/util/TrinityStats.pl Trinity.fasta

/sw/local/trinity/2.15.1/util/align_and_estimate_abundance.pl \
  --output_dir align_estimate.out \
  --transcripts outputs/trinity_r1_trimmed/Trinity.fasta \
  --seqType fq \
  --left r1_trimmed.fastq.xz --right r2_trimmed.fastq.xz  \
  --est_method RSEM \
  --aln_method bowtie \
  --trinity_mode --prep_reference \
  2>trinpost_align_estimate.stderr \
  1>trinpost_align_estimate.stdout

/sw/local/trinity/2.15.1/util/abundance_estimates_to_matrix.pl \
  --est_method RSEM \
  --gene_trans_map Trinity.fasta.gene_trans_map \
  align_estimate.out/RSEM.isoform.results \
  2>trinpost_estimate_to_matrix.stderr \
  1>trinpost_estimate_to_matrix.stdout

/sw/local/trinity/2.15.1/util/SAM_nameSorted_to_uniq_count_stats.pl \
  bowtie_out/bowtie_out.nameSorted.bam \
  2>trinpost_count_stats.stderr \
  1>trinpost_count_stats.stdout

cd ${start}
```

### An example of the trimming process

The above command creates a script run on our computing cluster to
trim the reads. Here is an arbitrarily chosen script it generated:

```{bash trim_example, eval=FALSE}
#!/usr/bin/bash
#SBATCH --export=ALL --requeue --mail-type=NONE --open-mode=append
#SBATCH --chdir=/z1/scratch/atb/rnaseq/oncomelania_2023/preprocessing/3R2H-1
#SBATCH --job-name=01trim_3R2H-1_R1_001 --nice=0
#SBATCH --output=outputs/log.txt.sbatch
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=4
#SBATCH --time=36:00:00
#SBATCH --mem=24G
set -o errexit
set -o errtrace
set -o pipefail
export LESS='--buffers 0 -B'
echo "## Started /z1/scratch/atb/rnaseq/oncomelania_2023/preprocessing/3R2H-1/scripts/01trim_3R2H-1_R1_001.sh at $(date) on $(hostname) with id ${SLURM_JOBID}." >> outputs/log.txt
mod=$( { type -t module || true; } )
if [[ -z "${mod}" ]]; then
  module() {
    eval $(/usr/bin/modulecmd bash $*);
  }
  export -f module
fi
module add trimomatic
## This call to trimomatic removes illumina and epicentre adapters from 3R2H-1_R1_001.fastq.gz:3R2H-1_R2_001.fastq.gz.
## It also performs a sliding window removal of anything with quality <25;
## cutadapt provides an alternative to this tool.
## The original sequence data is recompressed and saved in the sequences/ directory.
mkdir -p outputs/01trimomatic
## Note that trimomatic prints all output and errors to STDERR, so send both to output
trimmomatic PE \
  -threads 1 \
  -phred33 \
  3R2H-1_R1_001.fastq.gz 3R2H-1_R2_001.fastq.gz \
  3R2H-1_R1_001-trimmed_paired.fastq 3R2H-1_R1_001-trimmed_unpaired.fastq \
  3R2H-1_R2_001-trimmed_paired.fastq 3R2H-1_R2_001-trimmed_unpaired.fastq \
   ILLUMINACLIP:/sw/local/cyoa/202302/prefix/lib/perl5/auto/share/dist/Bio-Adventure/genome/adapters.fa:2:20:10:2:keepBothReads  \
  SLIDINGWINDOW:4:20 MINLEN:50 \
  1>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stdout \
  2>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stderr
excepted=$( { grep "Exception" "outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stdout" || test $? = 1; } )
## The following is in case the illumina clipping fails, which it does if this has already been run I think.
if [[ "${excepted}" != "" ]]; then
  trimmomatic PE \
    -threads 1 \
    -phred33 \
    3R2H-1_R1_001.fastq.gz 3R2H-1_R2_001.fastq.gz \
    3R2H-1_R1_001-trimmed_paired.fastq 3R2H-1_R1_001-trimmed_unpaired.fastq \
    3R2H-1_R2_001-trimmed_paired.fastq 3R2H-1_R2_001-trimmed_unpaired.fastq \
     SLIDINGWINDOW:4:25 MINLEN:50 \
    1>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stdout \
    2>outputs/01trimomatic/3R2H-1_R1_001-trimomatic.stderr
fi
sleep 10
mv 3R2H-1_R1_001-trimmed_paired.fastq 3R2H-1_R1_001-trimmed.fastq
mv 3R2H-1_R2_001-trimmed_paired.fastq 3R2H-1_R2_001-trimmed.fastq

## Recompress the unpaired reads, this should not take long.
xz -9e -f 3R2H-1_R1_001-trimmed_unpaired.fastq
xz -9e -f 3R2H-1_R2_001-trimmed_unpaired.fastq
## Recompress the paired reads.
xz -9e -f 3R2H-1_R1_001-trimmed.fastq
xz -9e -f 3R2H-1_R2_001-trimmed.fastq
ln -sf 3R2H-1_R1_001-trimmed.fastq.xz r1_trimmed.fastq.xz
ln -sf 3R2H-1_R2_001-trimmed.fastq.xz r2_trimmed.fastq.xz

## The following lines give status codes and some logging
echo "Job status: $? " >> outputs/log.txt
minutes_used=$(( SECONDS / 60 ))
echo "  $(hostname) Finished ${SLURM_JOBID} 01trim_3R2H-1_R1_001.sh at $(date), it took ${minutes_used} minutes." >> outputs/log.txt
if [[ -x "$(command -v sstat)" && ! -z "${SLURM_JOBID}" ]]; then
  echo "  walltime used by ${SLURM_JOBID} was: ${minutes_used:-null} minutes." >> outputs/log.txt
  maxmem=$(sstat -n -P --format=MaxVMSize -j "${SLURM_JOBID}")
  ## I am not sure why, but when I run a script in an interactive session, the maxmem variable
  ## gets set correctly everytime, but when it is run by another node, sometimes it does not.
  ## Lets try and figure that out...
  echo "TESTME: ${maxmem}"
  if [[ ! -z "${maxmem}" ]]; then
    echo "  maximum memory used by ${SLURM_JOBID} was: ${maxmem}." >> outputs/log.txt
  else
    echo "  The maximum memory did not get set for this job: ${SLURM_JOBID}." >> outputs/log.txt
    sstat -P -j "${SLURM_JOBID}" >> outputs/log.txt
  fi
  echo "" >> outputs/log.txt
fi
touch outputs/logs/01trim_3R2H-1_R1_001.finished
```

The beginning and end of the script are used to tell the computing
cluster some information about the job and collect some statistics
about the run so that I may improve the job in the future.

In between is the meat of the process, in which it invokes the
trimomatic software and performs the actual trimming.  Upon completion
it uses a very aggressive compression tool (xz) to compress the final
results and save some disk space.

## Kraken

I am going to use the kmer catalog tool, kraken2 in order to see what
species are in this.

```{bash kraken, eval=FALSE}
cd preprocessing
start=$(pwd)
for i in $(/bin/ls); do
    cd ${i}
    cyoa --method kraken --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz
    cd ${start}
done
```

### Kraken script

Here is an example script generated to run kraken on the reads used
for denovo assembly.  I am going to strip out the stuff used to define
the job on the cluster.  As you can see, it is making use of the viral
kraken database, there is another job which uses the bacterial
database and differs only in the suffix of the various output files.

```{bash kraken_script, eval=FALSE}
kraken2 --db ${KRAKEN2_DB_PATH}/viral \
  --report outputs/11kraken_viral/kraken_report.txt --use-mpa-style \
  --use-names  --paired <(less r1_trimmed.fastq.xz) <(less r2_trimmed.fastq.xz)  \
  --classified-out outputs/11kraken_viral/classified#.fastq.gz \
  --unclassified-out outputs/11kraken_viral/unclassified#.fastq.gz \
  2>outputs/11kraken_viral/kraken.stderr \
  1>outputs/11kraken_viral/kraken.stdout
if [ "$?" -ne "0" ]; then
  echo "Kraken returned an error."
fi

## Cleaning up after running kraken.
rm -f outputs/11kraken_viral/kraken.stdout
rm -f outputs/11kraken_viral/*.fastq.gz
```

# Hisat2

The hisat2 process is a bit more involved and for the moment I am
going to skip it because we primarily are using salmon for now.

```{bash hisat, eval=FALSE}
cd preprocessing
start=$(pwd)
for i in $(/bin/ls -d 3* 5* M*); do
    cd $i
    cyoa --method hisat --species oncomelania_unknown --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz --stranded no
    cyoa --method salmon --species oncomelania_unknown --input r1_trimmed.fastq.xz:r2_trimmed.fastq.xz
    cd $start
done
```

### Salmon quantification script

Another important note, I had not yet renamed the transcriptome to the
correct species when I ran this, so there remain places which
incorrectly state oncomelania.  That has since been corrected, but not
reflected in some of these files.

```{bash salmon_script, eval=FALSE}
mkdir -p outputs/45salmon_oncomelania_unknown
salmon quant -i /home/trey/libraries/genome/indexes/oncomelania_unknown_salmon_index \
  -l A --gcBias --validateMappings  \
   -1 <(less r1_trimmed.fastq.xz) -2 <(less r2_trimmed.fastq.xz)  \
  -o outputs/45salmon_oncomelania_unknown \
  2>outputs/45salmon_oncomelania_unknown/salmon_oncomelania_unknown.stderr \
  1>outputs/45salmon_oncomelania_unknown/salmon_oncomelania_unknown.stdout
```

Everything before this point is performed on the computing cluster and
provides the input for what is coming.  These were mostly done via my
little package 'cyoa'.

# End of the pre processing

Everything which follows is performed on my workstation in R and uses
my little package 'hpgltools'.

# Examine the samples and mapped reads

I called the denovo transcriptome that I created 'oncomelania_unknown'
and mapped each sample against it using hisat2 and salmon.  Given the
sample sheet that Joy sent me, I think I can extract the relevant
mapping statistics into it.

```{r modify_samplesheet}
spec <- make_rnaseq_spec()
meta <- gather_preprocessing_metadata("sample_sheets/all_samples.xlsx", specification = spec)
new_meta_file <- meta[["new_file"]]
```

# Collect annotations

I used trinotate to perform some blast etc searches against the denovo
transcriptome I generated as well as passed it to interproscan.
Strangely, it looks like interproscan didn't finish, though I thought
I checked it previously...

ok then, I will start with the trinotate results.

```{r annotations}
trinotate <- load_trinotate_annotations(
  "denovo_assembly/outputs/20trinotate60trinity_r1_trimmed/Trinotate.xls")
trinotate <- as.data.frame(trinotate)
rownames(trinotate) <- trinotate[["rownames"]]
trinotate[["rownames"]] <- NULL
```

# Create an expressionset

Using the metadata I downloaded and modified with gather_metadata(),
along with the annotations, lets make a dataset!

The following block assumes you have all of the processed/raw data,
which is quite a large amount of space.  If you wish to skip that
step, one may instead just load the expt.rda file on box.

```{r exprs}
onco_expt <- create_expt(new_meta_file, gene_info = trinotate, file_column = "salmoncounttable")
onco_expt

plot_libsize(onco_expt)
plot_nonzero(onco_expt)
```

I think it might be worth while to collapse the apparent gene groups.
Lets see what that does to the data...

```{r collapse_genes}
trinotate[["collapsed_gid"]] <- gsub(x = rownames(trinotate), pattern = "^(.*_c\\d+_g\\d+)_i\\d+$",
                                     replacement = "\\1")
tx_gene_map <- trinotate[, c("gene_id", "collapsed_gid")]
collapsed_meta <- trinotate
rownames(collapsed_meta) <- make.names(collapsed_meta[["collapsed_gid"]], unique=TRUE)


onco_collapsed <- create_expt(new_meta_file, gene_info = collapsed_meta,
                             tx_gene_map = tx_gene_map, file_column = "salmoncounttable")
onco_collapsed
plot_libsize(onco_collapsed)
plot_nonzero(onco_collapsed)

plot_meta_sankey(onco_collapsed, factors = c("time", "parasite"))
all_norm <- normalize_expt(onco_collapsed, transform = "log2", convert = "cpm", filter = TRUE)
all_pca <- plot_pca(all_norm)
all_pca
```

# Examine the expressionset

For the moment, let us assume that the collapsed dataset is more
appropriate.  I think this will need further evaluation, I really
expected there to be <100,000 genes after collapsing.

```{r load_expt, eval=FALSE}
onco_collapsed <- load("expt.rda")
```

```{r exprs}
onco_early <- set_expt_batches(onco_early, fact = "parasite")

onco_box <- plot_boxplot(onco_early)
onco_box$plot

onco_early_norm <- normalize_expt(onco_early, transform = "log2", convert = "cpm",
                           norm = "quant", filter = TRUE)
onco_pca <- plot_pca(onco_early_norm)
onco_pca

onco_early_nb <- normalize_expt(onco_early, transform = "log2", convert = "cpm",
                               filter = TRUE, batch = "svaseq")
onco_pca <- plot_pca(onco_early_nb)
onco_pca

only_inf <- subset_expt(onco_early, subset = "time!='undef'")
onco_varpart <- simple_varpart(only_inf, factors = c("time", "parasite"))
```

While putting together the variance partition result, I noticed that
there are at least a few genes with an extraordinary number of reads.

```{r high_genes}
only_inf <- variance_expt(only_inf)
annot <- fData(only_inf)
highest_prop <- order(annot[["exprs_gene_rawprop"]], decreasing = TRUE)
head(annot[highest_prop, ], n = 10)

highest_prop <- order(annot[["exprs_cv"]], decreasing = TRUE)
head(annot[highest_prop, ], n = 10)
```

# Perform initial DE

```{r de_fun}
contrasts <- list(
  "mansoni_4v2" = c("sman4h", "sman2h"),
  "japonicum_4v2" = c("sjap4h", "sjap2h"),
  "4h_species" = c("sjap4h", "sman4h"),
  "2h_species" = c("sjap2h", "sman2h"))

start_de <- all_pairwise(only_inf, filter = TRUE, model_batch = "svaseq", force = TRUE)
start_table <- combine_de_tables(start_de, keepers = contrasts,
                                 excel = "excel/start_tables.xlsx")
start_table
start_sig <- extract_significant_genes(start_table, according_to = "deseq", lfc = 1, p = 0.05,
                                       excel = "excel/start_sig.xlsx")
```

# Compare each group to uninfected

```{r de_vs_uninf}
vs_uninf_contrasts <- list(
  "sj2h_uninf" = c("sjap2h", "uninfected"),
  "sj4h_uninf" = c("sjap4h", "uninfected"),
  "sm2h_uninf" = c("sman2h", "uninfected"),
  "sm4h_uninf" = c("sman4h", "uninfected"),
  "sj5m_uninf" = c("sjap5m", "uninfected"),
  "sj5m_sj2h" = c("sjap5m", "sjap2h"),
  "sj5m_sj4h" = c("sjap5m", "sjap4h"))

all_vs_uninf <- all_pairwise(onco_collapsed, filter = TRUE, model_batch = "svaseq", force = TRUE)
all_vs_uninf_tables <- combine_de_tables(all_vs_uninf, keepers = vs_uninf_contrasts,
                                         excel = "excel/all_vs_uninf_tables.xlsx")
all_vs_uninf_tables
all_vs_uninf_sig <- extract_significant_genes(all_vs_uninf_tables, according_to = "deseq", lfc = 1, p = 0.05,
                                              excel = "excel/all_vs_uninf_sig.xlsx")
all_vs_uninf_sig
```


```{r saveme}
pander::pander(sessionInfo())
message("This is hpgltools commit: ", get_git_commit())
this_save <- paste0(gsub(pattern = "\\.Rmd", replace = "", x = rmd_file), "-v", ver, ".rda.xz")
#message("Saving to ", this_save)
#tmp <- sm(saveme(filename = this_save))
```
