S.pyogenes 5448 pdxR RNASeq version: 202204
Preprocessing
I processed these samples with my cyoa tool as follows:
- Copied the data from the sequencer into the directory ‘preprocessing/’.
- Used a slightly involved shell command to create a directory for each sample and copy its reads into the ‘unprocessed/’ subdirectory within it.
- Invoked the following:
cd preprocessing
start=$(pwd)
# Run the cyoa RNASeq pipeline once per sample directory.
# Iterate over a directory-only glob instead of parsing `ls`, so stray files
# and names containing whitespace cannot break the loop.
for i in ./*/
do
    # If the directory cannot be entered, skip it: otherwise the `rm -rf`
    # below would delete outputs/scripts from whatever directory we are in.
    cd "$i" || continue
    rm -rf outputs scripts
    # Every file under unprocessed/ is joined into one colon-separated
    # --input value; the quotes keep it a single argument.
    cyoa --task pipe --method prnas --species spyogenes_5448 \
         --gff_type gene --gff_tag locus_tag \
         --input "$(/bin/ls unprocessed/* | tr '\n' ':' | sed 's/:$//g')"
    cd "$start" || exit 1
done
The above for loop goes into each sample and does the following:
- Trims the data, heavily compresses the outputs.
- Runs fastqc
- Runs hisat2 using my spyogenes_5448 indices.
- Converts the sam alignment to sorted/indexed bam.
- Makes a couple of extra copies of it with some filters.
- Compresses the aligned/unaligned reads.
- Runs htseq-count on the alignments to count reads/gene.
Note that the following steps were not actually run because I had a spelling error. Since they are not necessary for the RNASeq-specific analyses I want to do first, I ignored it. I am curious, though, to see if there are other mutations in these strains, so I will likely run those portions manually.
- Runs freebayes on the alignments to look for variants.
- Sorts/compresses the freebayes output.
- Does some parsing of the freebayes output and provides some tables about where mutations were observed.
Collect annotation information
Same two primary annotation sources, the gff file used for mapping/counting, and microbesonline.org. Note that since I moved to just downloading the material from the web interface, I no longer have a handy method to get the taxon ID, so I go there and hunt down the taxId manually.
Now that I am thinking about it, my 5448 genome/annotations are kind of old, I will ask and check to see if there is anything newer.
Also, 5448 does not have an entry at microbesonline.org, a fact which I forgot. I need to go poking in my notes to reconnect 5005 and 5448.
# Load the gene features from the 5448 GFF that was used for mapping and counting.
gff_annot <- load_gff_annotations("reference/spyogenes_5448.gff", type = "gene")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Returning a df with 14 columns and 1814 rows.
# Key the annotation rows by locus tag so later merges can join on row names.
rownames(gff_annot) <- gff_annot[["locus_tag"]]
head(gff_annot)
## seqnames start end width strand source type score
## SP5448_00005 CP008776 232 1587 1356 + EMBL/GenBank/SwissProt gene NA
## SP5448_00010 CP008776 1742 2878 1137 + EMBL/GenBank/SwissProt gene NA
## SP5448_00015 CP008776 2953 3150 198 + EMBL/GenBank/SwissProt gene NA
## SP5448_00020 CP008776 3480 4595 1116 + EMBL/GenBank/SwissProt gene NA
## SP5448_00025 CP008776 4665 5234 570 + EMBL/GenBank/SwissProt gene NA
## SP5448_00030 CP008776 5237 8740 3504 + EMBL/GenBank/SwissProt gene NA
## phase locus_tag gene gene_synonym note pseudo
## SP5448_00005 1 SP5448_00005 <NA> <NA> <NA>
## SP5448_00010 1 SP5448_00010 <NA> <NA> <NA>
## SP5448_00015 1 SP5448_00015 <NA> <NA> <NA>
## SP5448_00020 1 SP5448_00020 <NA> <NA> <NA>
## SP5448_00025 1 SP5448_00025 <NA> <NA> <NA>
## SP5448_00030 1 SP5448_00030 <NA> <NA> <NA>
# Load and parse the GenBank record for accession CP008776, the same sequence
# name that appears in the seqnames column of the 5448 GFF annotations.
mgas_data <- load_genbank_annotations(accession="CP008776")
## Loading required namespace: rentrez
## Done Parsing raw GenBank file text. [ 13.601 seconds ]
## 2022-04-26 15:23:50 Starting creation of gene GRanges
## 2022-04-26 15:23:53 Starting creation of CDS GRanges
## 2022-04-26 15:24:00 Starting creation of exon GRanges
## No exons read from genbank file. Assuming sections of CDS are full exons
## 2022-04-26 15:24:02 Starting creation of variant VRanges
## 2022-04-26 15:24:02 Starting creation of transcript GRanges
## No transcript features (mRNA) found, using spans of CDSs
## 2022-04-26 15:24:03 Starting creation of misc feature GRanges
## Warning in fill_stack_df(feats[!typs %in% c("gene", "exon", "CDS",
## "variation", : Got unexpected multi-value field(s) [ inference ]. The resulting
## column(s) will be of class CharacterList, rather than vector(s). Please contact
## the maintainer if multi-valuedness is expected/meaningful for the listed
## field(s).
## 2022-04-26 15:24:03 - Done creating GenBankRecord object [ 13.042 seconds ]
# Record the genome length. In this run load_genbank_annotations() returned no
# 'seq' element and width(NULL) is an error, so fall back to NA rather than
# aborting the chunk.
genome_seq <- mgas_data[["seq"]]
genome_size <- NA_integer_
if (!is.null(genome_seq)) {
  genome_size <- GenomicRanges::width(genome_seq)
}

# Convert the GenBank CDS annotations to a data frame keyed by locus tag.
mgas_cds <- as.data.frame(mgas_data[["cds"]])
rownames(mgas_cds) <- mgas_cds[["locus_tag"]]

# Drop 'translation' plus the columns that gff_annot also carries, so the
# merge below does not produce duplicated .x/.y versions of them.
redundant <- c("translation", "type", "strand", "seqnames", "start", "end",
               "locus_tag", "note", "gene", "gene_synonym", "width")
wanted <- !colnames(mgas_cds) %in% redundant
# drop = FALSE keeps a data frame even if only a single column survives.
mgas_cds <- mgas_cds[, wanted, drop = FALSE]

# Join the GenBank and GFF annotations on their shared locus-tag row names.
mgas_annot <- merge(mgas_cds, gff_annot, by = "row.names")
rownames(mgas_annot) <- mgas_annot[["Row.names"]]
mgas_annot[["Row.names"]] <- NULL
# Load the CDS features of the 5005 reference GFF. Its protein_id column is the
# join key used further down; presumably it also supplies old_locus_tag (verify).
ref_gff_annot <- load_gff_annotations("reference/spyogenes_5005.gff", type="CDS")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Returning a df with 19 columns and 1841 rows.
# 5448 has no microbesonline.org entry, so the MGAS5005 annotations are used instead.
microbes_annot <- load_microbesonline_annotations(species="5005")
## Found 1 entry.
## Streptococcus pyogenes MGAS5005Firmicutesyes2005-08-25yes101950293653
## The species being downloaded is: Streptococcus pyogenes MGAS5005
## Downloading: http://www.microbesonline.org/cgi-bin/genomeInfo.cgi?tId=293653;export=tab
Rerunning my fasta36 mapper across strains
Once upon a time I wrote a little tool to compare closely related species/strains. It is essentially a poor-man’s ortholog search. It has been a very long time since I last used it, and it required some modifications in order to work properly. The most notable problem was that the pathnames it used were too long for fasta36. I shortened them and fixed a couple of old errors, and it appears to work again.
# The singles table has no header line. With the default col_names = TRUE,
# read_tsv() turned the first hit (AKK69518.1 / AAZ50620.1:1.000:0) into the
# column names and dropped it from the data. Naming the columns explicitly
# keeps every row; col_types pins both columns to character and silences the
# column-specification message.
single_hits <- readr::read_tsv(
  "single_multi/outputs/fasta_spyogenes_5448_cds_spyogenes_5005_cds/spyogenes_5448_cds_singles.txt",
  col_names = c("from", "to"), col_types = "cc")

# Each hit looks like "ID:score:..."; keep only the ID before the first ':'.
single_hits[["to"]] <- gsub(pattern = "(^.*?):.*$", replacement = "\\1",
                            x = single_hits[["to"]], perl = TRUE)

# Left-join the 5448 annotations to their single hits, then to the reference
# GFF annotations of the hit, so every 5448 gene is kept even without a hit.
single_both <- merge(mgas_annot, single_hits, by.x = "protein_id", by.y = "from", all.x = TRUE)
single_both <- merge(single_both, ref_gff_annot, by.x = "to", by.y = "protein_id", all.x = TRUE)

# Genes without a reference hit have no old_locus_tag; fall back to the
# locus tag from the left-hand (5448) side of the merge.
missing_ids <- is.na(single_both[["old_locus_tag"]])
new_ids <- single_both[missing_ids, "locus_tag.x"]
single_both[missing_ids, "old_locus_tag"] <- new_ids

# Attach the microbesonline annotations via the old locus tag (sysName).
single_both <- merge(single_both, microbes_annot, by.x = "old_locus_tag", by.y = "sysName", all.x = TRUE)
# Row names must be unique and syntactically valid; derive them from gene_id.
rownames(single_both) <- make.names(single_both[["gene_id"]], unique = TRUE)
Create expressionSet
Note that I did the following to the sample sheet provided by Dr. McIver:
- Changed the dP_R20_4 sample to dP_R20_2 (the original sample name is still there in the original column).
- Added columns at the end containing the count table locations.
- Added columns ‘short_media’, ‘growth_phase’, ‘genotype’ which hopefully contain the relevant metadata extracted from the sample descriptions.
- Added a column ‘Experiment’ which is either ‘rofA’ or ‘pdxR’.
# Build the expressionset from the full sample sheet, keep only the pdxR
# experiment, and use genotype as the condition and growth phase as the batch.
pdxr_expt <- create_expt(
  gene_info = single_both,
  metadata = "sample_sheets/all_samples.xlsx",
  file_column = "spyogenes5448genecounts") %>%
  subset_expt(subset = "experiment=='pdxR'") %>%
  set_expt_conditions(fact = "genotype") %>%
  set_expt_batches(fact = "growthphase")
## Reading the sample metadata.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 72 rows(samples) and 27 columns(metadata fields).
## Matched 1723 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 1814 rows and 72 columns.
## subset_expt(): There were 72, now there are 36 samples.
I have 36 samples to play with, let us see what they look like.
Poke expressionSet
# Library size of each of the 36 pdxR samples.
pdxr_libsize <- plot_libsize(pdxr_expt)
pdxr_libsize$plot

# NOTE(review): presumably library sizes before/after low-count filtering -- confirm.
pdxr_filter_plot <- plot_libsize_prepost(pdxr_expt)
pdxr_filter_plot$count_plot

pdxr_filter_plot$lowgene_plot
## Warning: Using alpha for a discrete variable is not advised.

# NOTE(review): presumably the number of genes with non-zero counts per sample -- confirm.
pdxr_nonzero <- plot_nonzero(pdxr_expt)
pdxr_nonzero$plot
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Write the expressionset to a versioned Excel workbook (`ver` is defined elsewhere).
written <- write_expt(pdxr_expt, excel = glue::glue("excel/pdxr_expt-v{ver}.xlsx"))
## Deleting the file excel/pdxr_expt-v202204.xlsx before writing the tables.
## Writing the first sheet, containing a legend and some summary data.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
##
## Total:13 s
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
##
## Total:15 s
Quick visualizations
Without considering time
Let us start with some views of the data without thinking about batch effects.
# Low-count filter, quantile normalize, convert to cpm and log2 transform,
# without any batch adjustment.
pdxr_norm <- normalize_expt(
  pdxr_expt,
  transform = "log2",
  convert = "cpm",
  norm = "quant",
  filter = TRUE)
## Removing 31 low-count genes (1783 remaining).
## transform_counts: Found 12 values equal to 0, adding 1 to the matrix.
# PCA of the normalized data.
pdxr_pca <- plot_pca(pdxr_norm)
pdxr_pca$plot

# Heatmap from plot_disheat() on the same normalized data.
pdxr_heatmap <- plot_disheat(pdxr_norm)
pdxr_heatmap$plot

pdxr_sm <- plot_sm(pdxr_norm)
## Performing correlation.

Considering time
Repeat the previous plots, but this time using limma’s batch removal method (which is just taking residuals).
# Same normalization as pdxr_norm, plus batch="limma"; the batch factor of
# pdxr_expt was set to growthphase when the expressionset was created.
pdxr_nb <- normalize_expt(pdxr_expt, filter=TRUE, norm="quant", convert="cpm",
transform="log2", batch="limma")
## Removing 31 low-count genes (1783 remaining).
## If you receive a warning: 'NANs produced', one potential reason is that the data was quantile normalized.
## Setting 29 low elements to zero.
## transform_counts: Found 29 values equal to 0, adding 1 to the matrix.
# PCA after batch adjustment.
pdxr_nb_pca <- plot_pca(pdxr_nb)
pdxr_nb_pca$plot

# NOTE(review): the unadjusted section used plot_disheat(); this one uses
# plot_corheat(), so the two heatmaps are not like-for-like -- confirm intent.
pdxr_nb_heatmap <- plot_corheat(pdxr_nb)
pdxr_nb_heatmap$plot

Well, it is pretty clear that time is the dominant factor.
Differential Expression analyses
I am going to do the DE in 3 separate pieces:
- Only compare strains
- Only compare times
- Compare the concatenation of strains and times.
Strain only comparisons
# Pairwise DE across genotypes (the condition), with growth phase in the model
# as the batch factor and low-count genes filtered first.
strain_de <- all_pairwise(pdxr_expt, filter = TRUE, model_batch = TRUE)
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.

# Contrasts to keep; each entry pairs two genotype names from the condition factor.
strain_keepers <- list(
  delta_wt = c("delta", "WT"),
  complement_wt = c("complement", "WT"),
  delta_complement = c("delta", "complement"))
# Combine the results for the kept contrasts and write them to Excel.
strain_tables <- combine_de_tables(
  strain_de,
  excel = glue::glue("excel/pdxr_strain_tables-v{ver}.xlsx"),
  keepers = strain_keepers)
## Deleting the file excel/pdxr_strain_tables-v202204.xlsx before writing the tables.
# Extract the significant genes from the combined tables into their own workbook.
strain_sig <- extract_significant_genes(
  strain_tables,
  excel = glue::glue("excel/pdxr_strain_sig-v{ver}.xlsx"))
## Deleting the file excel/pdxr_strain_sig-v202204.xlsx before writing the tables.
Time only comparisons
# Swap the roles of the two factors: growth phase becomes the condition and
# genotype becomes the batch.
pdxr_time <- set_expt_conditions(pdxr_expt, fact="growthphase") %>%
set_expt_batches(fact="genotype")
# NOTE(review): unlike the strain comparison, this call does not pass
# filter=TRUE, so the two analyses may use different gene sets -- confirm intent.
time_de <- all_pairwise(pdxr_time, model_batch=TRUE)
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.

# Contrasts to keep. The list names (e.g. "e20m_start") are the keys to use
# when these tables are looked up later.
time_keepers <- list(
"e20m_start" = c("e20m", "start"),
"e60m_e20m" = c("e60m", "e20m"),
"e60m_start" = c("e60m", "start"))
# Combine the results for the kept contrasts and write them to Excel.
time_tables <- combine_de_tables(
time_de, keepers = time_keepers,
excel = glue::glue("excel/pdxr_time_tables-v{ver}.xlsx"))
## Deleting the file excel/pdxr_time_tables-v202204.xlsx before writing the tables.
# Extract the significant genes from the combined tables into their own workbook.
time_sig <- extract_significant_genes(
time_tables,
excel = glue::glue("excel/pdxr_time_sig-v{ver}.xlsx"))
## Deleting the file excel/pdxr_time_sig-v202204.xlsx before writing the tables.
Compare times vs strains
I always worry that comparing a data set across multiple conditions does not result in what I think it will. Let us therefore plot the logFCs of likely contrasts against each other.
# Compare the logFCs of one time contrast against one strain contrast.
# The time contrast is stored under its keeper name "e20m_start"; the previous
# lookup of "start_e20m" returned NULL, which left `both` without the .x
# columns and caused the cor.test() and plot_linear_scatter() errors.
# NOTE(review): presumably this contrast is e20m relative to start, i.e. the
# opposite direction to what the old name implied -- confirm the sign.
wanted_columns <- c("deseq_logfc", "deseq_adjp")
x_axis <- time_tables[["data"]][["e20m_start"]][, wanted_columns]
y_axis <- strain_tables[["data"]][["delta_wt"]][, wanted_columns]
# Subsetting NULL silently yields NULL, so fail loudly if a table is missing.
stopifnot(!is.null(x_axis), !is.null(y_axis))

# Inner-join on gene IDs; merge() suffixes the shared column names with .x/.y.
both <- merge(x_axis, y_axis, by = "row.names")
rownames(both) <- both[["Row.names"]]
both[["Row.names"]] <- NULL

cor.test(both[["deseq_logfc.x"]], both[["deseq_logfc.y"]], method = "spearman")
plotted <- plot_linear_scatter(both[, c("deseq_logfc.x", "deseq_logfc.y")])
Interaction model
I want to make sure that my methods of performing interaction models work as I think they do, and this data set looks to me like a perfect place to test that.
# Build one combined condition per sample from genotype and growth phase.
combined_factors <- paste0(pData(pdxr_expt)[["genotype"]], "_",
                           pData(pdxr_expt)[["growthphase"]])
# The original statement ended in a dangling `%>%`, which piped the result
# into the next assignment; each step is now a separate statement.
combined_expt <- set_expt_conditions(pdxr_expt, fact = combined_factors)
combined_de <- all_pairwise(combined_expt, model_batch = "svaseq", filter = TRUE)

# NOTE(review): combined_factors joins the two parts with "_" (e.g.
# "delta_e20m"), while the keepers below omit it ("deltae20m"). Confirm which
# form the condition names take once set_expt_conditions()/all_pairwise() have
# processed them; if the underscore survives, these keepers will not match.
combined_keepers <- list(
  # Genotype comparisons within each time point.
  "e20m_delta_vs_wt" = c("deltae20m", "WTe20m"),
  "e20m_delta_vs_complement" = c("deltae20m", "complemente20m"),
  "e60m_delta_vs_wt" = c("deltae60m", "WTe60m"),
  "e60m_delta_vs_complement" = c("deltae60m", "complemente60m"),
  "start_delta_vs_wt" = c("deltastart", "WTstart"),
  "start_delta_vs_complement" = c("deltastart", "complementstart"),
  # Time comparisons within each genotype.
  "WT_e20m_vs_start" = c("WTe20m", "WTstart"),
  "delta_e20m_vs_start" = c("deltae20m", "deltastart"),
  "complement_e20m_vs_start" = c("complemente20m", "complementstart"),
  "WT_e60m_vs_start" = c("WTe60m", "WTstart"),
  "delta_e60m_vs_start" = c("deltae60m", "deltastart"),
  "complement_e60m_vs_start" = c("complemente60m", "complementstart"),
  "WT_e60m_vs_e20m" = c("WTe60m", "WTe20m"),
  "delta_e60m_vs_e20m" = c("deltae60m", "deltae20m"),
  "complement_e60m_vs_e20m" = c("complemente60m", "complemente20m"))

# Combine the results for the kept contrasts, then extract the significant
# genes; both steps write a versioned Excel workbook.
combined_tables <- combine_de_tables(
  combined_de, keepers = combined_keepers,
  excel = glue::glue("excel/pdxr_combined_tables-v{ver}.xlsx"))
combined_sig <- extract_significant_genes(
  combined_tables,
  excel = glue::glue("excel/pdxr_combined_sig-v{ver}.xlsx"))
# Unless `skip_load` is defined and TRUE, report the session, note the
# hpgltools commit in use, and save the workspace to a versioned file.
if (!isTRUE(get0("skip_load"))) {
  pander::pander(sessionInfo())
  message(paste0("This is hpgltools commit: ", get_git_commit()))
  # Derive the save name from the Rmd file name. gsub()'s argument is spelled
  # out as `replacement` rather than relying on partial matching of `replace`.
  this_save <- paste0(gsub(pattern = "\\.Rmd", replacement = "", x = rmd_file), "-v", ver, ".rda.xz")
  message(paste0("Saving to ", this_save))
  tmp <- sm(saveme(filename = this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 09b2f72b4cb5e7f7f74b7a7970f5b2f7f3609e41
## This is hpgltools commit: Tue Apr 19 18:55:02 2022 -0400: 09b2f72b4cb5e7f7f74b7a7970f5b2f7f3609e41
## Saving to index_pdxR-v202204.rda.xz
