Note, I am using my previous worksheet from when last I worked with Rezia as a template for this (I copied it from there and am modifying it now).
I used my cyoa tool to process these samples by doing the following:
cd preprocessing
start=$(pwd)
for i in $(/bin/ls -d ./*)
do
cd $i
rm -rf outputs scripts
cyoa --task pipe --method prnas --species spyogenes_5448 \
--gff_type gene --gff_tag locus_tag \
--input $(/bin/ls unprocessed/* | tr '\n' ':' | sed 's/:$//g')
cd $start
done
The above for loop goes into each sample and does the following:
Note the following steps were not actually run because I had a speeling error. But since they are not necessary for the explicitly RNASeq analyses I first want to do, I ignored it. I am curious though to see if there are other mutations in these strains, so I will likely run those portions manually.
Same two primary annotation sources, the gff file used for mapping/counting, and microbesonline.org. Note that since I moved to just downloading the material from the web interface, I no longer have a handy method to get the taxon ID, so I go there and hunt down the taxId manually.
Now that I am thinking about it, my 5448 genome/annotations are kind of old, I will ask and check to see if there is anything newer.
Also, 5448 does not have an entry at microbesonline.org, a fact which I forgot. I need to go poking in my notes to reconnect 5005 and 5448.
gff_annot <- load_gff_annotations("reference/spyogenes_5448.gff", type = "gene")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Returning a df with 14 columns and 1814 rows.
rownames(gff_annot) <- gff_annot[["locus_tag"]]
head(gff_annot)
## seqnames start end width strand source type score
## SP5448_00005 CP008776 232 1587 1356 + EMBL/GenBank/SwissProt gene NA
## SP5448_00010 CP008776 1742 2878 1137 + EMBL/GenBank/SwissProt gene NA
## SP5448_00015 CP008776 2953 3150 198 + EMBL/GenBank/SwissProt gene NA
## SP5448_00020 CP008776 3480 4595 1116 + EMBL/GenBank/SwissProt gene NA
## SP5448_00025 CP008776 4665 5234 570 + EMBL/GenBank/SwissProt gene NA
## SP5448_00030 CP008776 5237 8740 3504 + EMBL/GenBank/SwissProt gene NA
## phase locus_tag gene gene_synonym note pseudo
## SP5448_00005 1 SP5448_00005 <NA> <NA> <NA>
## SP5448_00010 1 SP5448_00010 <NA> <NA> <NA>
## SP5448_00015 1 SP5448_00015 <NA> <NA> <NA>
## SP5448_00020 1 SP5448_00020 <NA> <NA> <NA>
## SP5448_00025 1 SP5448_00025 <NA> <NA> <NA>
## SP5448_00030 1 SP5448_00030 <NA> <NA> <NA>
mgas_data <- load_genbank_annotations(accession="CP008776")
## Loading required namespace: rentrez
## Done Parsing raw GenBank file text. [ 14.068 seconds ]
## 2022-04-26 15:06:47 Starting creation of gene GRanges
## 2022-04-26 15:06:50 Starting creation of CDS GRanges
## 2022-04-26 15:06:57 Starting creation of exon GRanges
## No exons read from genbank file. Assuming sections of CDS are full exons
## 2022-04-26 15:06:59 Starting creation of variant VRanges
## 2022-04-26 15:06:59 Starting creation of transcript GRanges
## No transcript features (mRNA) found, using spans of CDSs
## 2022-04-26 15:07:00 Starting creation of misc feature GRanges
## Warning in fill_stack_df(feats[!typs %in% c("gene", "exon", "CDS",
## "variation", : Got unexpected multi-value field(s) [ inference ]. The resulting
## column(s) will be of class CharacterList, rather than vector(s). Please contact
## the maintainer if multi-valuedness is expected/meaningful for the listed
## field(s).
## 2022-04-26 15:07:00 - Done creating GenBankRecord object [ 13.23 seconds ]
genome_size <- GenomicRanges::width(mgas_data$seq) ## This fails on travis?
## Error in (function (classes, fdef, mtable) : unable to find an inherited method for function 'width' for signature '"NULL"'
mgas_cds <- as.data.frame(mgas_data$cds)
## Get rid of amino acid sequence
rownames(mgas_cds) <- mgas_cds[["locus_tag"]]
wanted <- ! colnames(mgas_cds) %in% c("translation", "type", "strand", "seqnames", "start", "end", "locus_tag", "note", "gene", "gene_synonym", "width")
mgas_cds <- mgas_cds[, wanted]
## And EC_number because wtf is that?
mgas_annot <- merge(mgas_cds, gff_annot, by="row.names")
rownames(mgas_annot) <- mgas_annot[["Row.names"]]
mgas_annot[["Row.names"]] <- NULL
ref_gff_annot <- load_gff_annotations("reference/spyogenes_5005.gff", type="CDS")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Returning a df with 19 columns and 1841 rows.
microbes_annot <- load_microbesonline_annotations(species="5005")
## Found 1 entry.
## Streptococcus pyogenes MGAS5005Firmicutesyes2005-08-25yes101950293653
## The species being downloaded is: Streptococcus pyogenes MGAS5005
## Downloading: http://www.microbesonline.org/cgi-bin/genomeInfo.cgi?tId=293653;export=tab
single_hits <- readr::read_tsv("single_multi/outputs/fasta_spyogenes_5448_cds_spyogenes_5005_cds/spyogenes_5448_cds_singles.txt")
## Rows: 1340 Columns: 2
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): AKK69518.1, AAZ50620.1:1.000:0
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(single_hits) <- c("from", "to")
single_hits[["to"]] <- gsub(pattern="(^.*?):.*$", replacement="\\1",
x=single_hits[["to"]], perl=TRUE)
## The gff annotation has the protein_id column matching this cds entry.
single_both <- merge(mgas_annot, single_hits, by.x="protein_id", by.y="from", all.x=TRUE)
single_both <- merge(single_both, ref_gff_annot, by.x="to", by.y="protein_id", all.x=TRUE)
## Now pick up the missing old_locus_tags and put in the 5448 IDs
missing_ids <- is.na(single_both[["old_locus_tag"]])
new_ids <- single_both[missing_ids, "locus_tag.x"]
single_both[missing_ids, "old_locus_tag"] <- new_ids
single_both <- merge(single_both, microbes_annot, by.x="old_locus_tag", by.y="sysName", all.x=TRUE)
rownames(single_both) <- make.names(single_both[["gene_id"]], unique=TRUE)
Note that I did the following to the sample sheet provided by Dr. McIver:
rofa_expt <- create_expt(metadata = "sample_sheets/all_samples.xlsx",
gene_info = single_both, file_column = "spyogenes5448genecounts") %>%
subset_expt(subset="experiment=='rofA'") %>%
set_expt_conditions(fact="genotype") %>%
set_expt_batches(fact="growthphase")
## Reading the sample metadata.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 72 rows(samples) and 27 columns(metadata fields).
## Matched 1723 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 1814 rows and 72 columns.
## subset_expt(): There were 72, now there are 36 samples.
I have 36 samples to play with, let us see what they look like.
rofa_libsize <- plot_libsize(rofa_expt)
rofa_libsize$plot
rofa_filter_plot <- plot_libsize_prepost(rofa_expt)
rofa_filter_plot$count_plot
rofa_filter_plot$lowgene_plot
## Warning: Using alpha for a discrete variable is not advised.
rofa_nonzero <- plot_nonzero(rofa_expt)
rofa_nonzero$plot
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## awesome, there is a range of ~ 25 genes.
written <- write_expt(rofa_expt, excel = glue::glue("excel/rofA_expt-v{ver}.xlsx"))
## Deleting the file excel/rofA_expt-v202204.xlsx before writing the tables.
## Writing the first sheet, containing a legend and some summary data.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
##
## Total:14 s
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
##
## Total:13 s
Let us start with some views of the data without thinking about batch effects.
rofa_norm <- normalize_expt(rofa_expt, filter=TRUE, norm="quant", convert="cpm", transform="log2")
## Removing 40 low-count genes (1774 remaining).
rofa_pca <- plot_pca(rofa_norm)
rofa_pca$plot
rofa_heatmap <- plot_disheat(rofa_norm)
rofa_heatmap$plot
Repeat the previous plots, but this time using limma’s batch removal method (which is just a residuals).
rofa_time <- set_expt_conditions(rofa_expt, fact="growthphase") %>%
set_expt_batches(fact="genotype")
rofa_time_norm <- normalize_expt(rofa_time, filter=TRUE, norm="quant", convert="cpm",
transform="log2")
## Removing 40 low-count genes (1774 remaining).
rofa_time_pca <- plot_pca(rofa_time_norm)
rofa_time_pca$plot
rofa_time_nb <- normalize_expt(rofa_time, filter=TRUE, norm="quant", convert="cpm",
transform="log2", batch="limma")
## Removing 40 low-count genes (1774 remaining).
## If you receive a warning: 'NANs produced', one potential reason is that the data was quantile normalized.
## Setting 28 low elements to zero.
## transform_counts: Found 28 values equal to 0, adding 1 to the matrix.
rofa_time_nb_pca <- plot_pca(rofa_time_nb)
rofa_time_nb_pca$plot
Well, it is pretty clear that time is the dominant factor.
I am going to do the DE in 4 separate pieces:
strain_de <- all_pairwise(rofa_expt, model_batch=TRUE, filter=TRUE)
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
strain_keepers <- list(
"delta_wt" = c("delta", "WT"),
"revert_wt" = c("revert", "WT"),
"delta_revert" = c("delta", "revert"))
strain_tables <- combine_de_tables(
strain_de, keepers = strain_keepers,
excel = glue::glue("excel/rofa_strain_tables-v{ver}.xlsx"))
## Deleting the file excel/rofa_strain_tables-v202204.xlsx before writing the tables.
strain_sig <- extract_significant_genes(
strain_tables,
excel = glue::glue("excel/rofa_strain_sig-v{ver}.xlsx"))
## Deleting the file excel/rofa_strain_sig-v202204.xlsx before writing the tables.
time_de <- all_pairwise(rofa_time, model_batch=TRUE)
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
time_keepers <- list(
"transition_exponential" = c("transition", "exponential"),
"stationary_exponential" = c("stationary", "exponential"),
"stationary_transition" = c("stationary", "transition"))
time_tables <- combine_de_tables(
time_de, keepers = time_keepers,
excel = glue::glue("excel/rofa_time_tables-v{ver}.xlsx"))
## Deleting the file excel/rofa_time_tables-v202204.xlsx before writing the tables.
time_sig <- extract_significant_genes(
time_tables,
excel = glue::glue("excel/rofa_time_sig-v{ver}.xlsx"))
## Deleting the file excel/rofa_time_sig-v202204.xlsx before writing the tables.
I always worry that comparing a data set across multiple conditions results in not what I think it will. Let us therefore plot the logFCs of likely contrasts against each other.
x_axis <- time_tables[["data"]][["transition_exponential"]][, c("deseq_logfc", "deseq_adjp")]
y_axis <- strain_tables[["data"]][["delta_wt"]][, c("deseq_logfc", "deseq_adjp")]
both <- merge(x_axis, y_axis, by="row.names")
rownames(both) <- both[["Row.names"]]
both[["Row.names"]] <- NULL
cor.test(both[["deseq_logfc.x"]], both[["deseq_logfc.y"]], method="spearman")
## Warning in cor.test.default(both[["deseq_logfc.x"]], both[["deseq_logfc.y"]], :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: both[["deseq_logfc.x"]] and both[["deseq_logfc.y"]]
## S = 5.3e+08, p-value <2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4274
plotted <- plot_linear_scatter(both[, c("deseq_logfc.x", "deseq_logfc.y")])
## Warning in plot_multihistogram(df): NAs introduced by coercion
plotted$scatter
I want to make sure that my methods of performing interaction models work as I think it does, and this data set looks to me to be a perfect place to test that.
combined_factors <- paste0(pData(rofa_expt)[["genotype"]], "_",
pData(rofa_expt)[["growthphase"]])
combined_expt <- set_expt_conditions(rofa_expt, fact=combined_factors)
combined_de <- all_pairwise(combined_expt, model_batch="svaseq", filter=TRUE)
combined_keepers <- list(
"exponential_delta_vs_wt" = c("deltaexponential", "WTexponential"),
"exponential_delta_vs_revert" = c("deltaexponential", "revertexponential"),
"stationary_delta_vs_wt" = c("deltastationary", "WTstationary"),
"stationary_delta_vs_revert" = c("deltastationary", "revertstationary"),
"transition_delta_vs_wt" = c("deltatransition", "WTtransition"),
"transition_delta_vs_revert" = c("deltatransition", "reverttransition"),
"WT_exponential_vs_transition" = c("WTexponential", "WTtransition"),
"delta_exponential_vs_transition" = c("deltaexponential", "deltatransition"),
"revert_exponential_vs_transition" = c("revertexponential", "reverttransition"),
"WT_stationary_vs_transition" = c("WTstationary", "WTtransition"),
"delta_stationary_vs_transition" = c("deltastationary", "deltatransition"),
"revert_stationary_vs_transition" = c("revertstationary", "reverttransition"),
"WT_stationary_vs_exponential" = c("WTstationary", "WTexponential"),
"delta_stationary_vs_exponential" = c("deltastationary", "deltaexponential"),
"revert_stationary_vs_exponential" = c("revertstationary", "revertexponential"))
combined_tables <- combine_de_tables(
combined_de, keepers = combined_keepers,
excel = glue::glue("excel/rofa_combined_tables-v{ver}.xlsx"))
combined_sig <- extract_significant_genes(
combined_tables,
excel = glue::glue("excel/rofa_combined_sig-v{ver}.xlsx"))
Work on making a pretty picture, there are kind of a lot of potential contrasts, so lets just start with one.
circos_cfg <- circos_prefix(fData(rofa_expt), name="rofa", chr_column="seqnames.x", start_column="start.x",
stop_column = "end.x", strand_column="strand.x", cog_column = "COGFun")
## This assumes you have a colors.conf in circos/colors/ and fonts.conf in circos/fonts/
## It also assumes you have conf/ideogram.conf, conf/ticks.conf, and conf/housekeeping.conf
## It will write circos/conf/rofa.conf with a reasonable first approximation config file.
## Setting 95 undefined entries to the plus strand.
## Warning in circos_prefix(fData(rofa_expt), name = "rofa", chr_column =
## "seqnames.x", : NAs introduced by coercion
## Warning in circos_prefix(fData(rofa_expt), name = "rofa", chr_column =
## "seqnames.x", : NAs introduced by coercion
## Wrote karyotype to circos/conf/ideograms/rofa.conf
## This should match the ideogram= line in rofa.conf
## Wrote ticks to circos/conf/ticks_rofa.conf
kary <- circos_karyotype(circos_cfg, fasta = "reference/spyogenes_5448.fasta")
## Wrote karyotype to circos/conf/karyotypes/rofa.conf
## This should match the karyotype= line in rofa.conf
plus_minus <- circos_plus_minus(circos_cfg, width = 0.06, thickness = 40,
url_string = "http://www.microbesonline.org/cgi-bin/fetchLocus.cgi?locus=[id]")
## Writing data file: circos/data/rofa_plus_go.txt with the + strand GO data.
## Writing data file: circos/data/rofa_minus_go.txt with the - strand GO data.
## Wrote the +/- config files. Appending their inclusion to the master file.
## Returning the inner width: 0.88. Use it as the outer for the next ring.
exp_delta_vs_wt_hist <- circos_hist(circos_cfg, combined_tables[["data"]][["exponential_delta_vs_wt"]],
colname="deseq_logfc", basename="exp_deltawt",
outer=plus_minus)
## Error in merge(df, annot, by = "row.names"): object 'combined_tables' not found
cjb_suffix <- circos_suffix(circos_cfg)
made <- circos_make(circos_cfg, target="rofa")
if (!isTRUE(get0("skip_load"))) {
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern = "\\.Rmd", replace = "", x = rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename = this_save))
}
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 09b2f72b4cb5e7f7f74b7a7970f5b2f7f3609e41
## This is hpgltools commit: Tue Apr 19 18:55:02 2022 -0400: 09b2f72b4cb5e7f7f74b7a7970f5b2f7f3609e41
## Saving to index_rofA-v202204.rda.xz