I really do not know what I am doing with this data, so lets wing it!
Najib said to map this using L. tropica 590 and human. So let us gather those annotation sets.
## My load_biomart_annotations() function defaults to human, so that will be quick.
hs_annot <- load_biomart_annotations()
## The biomart annotations file already exists, loading from it.
hs_annot <- hs_annot[["annotation"]]
lt_annot <- load_gff_annotations("~/scratch/libraries/genome/ltropica590_v46.gff")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Returning a df with 15 columns and 35977 rows.
## To get the TriTrypDB annotations, I need to first figure out the full ID.
## Oh, I mispeeled it when I downloaded the genome, I wrote it as 'ltropica590',
## it should have been: ltropical590.
lt_entry <- EuPathDB::get_eupath_entry("L590", webservice="tritrypdb")
## Found: Leishmania tropica L590
## $bsgenome
## BSGenome.Leishmania.tropica.L590.v46
##
## $bsgenome_installed
## [1] FALSE
##
## $granges
## GRanges.Leishmania.tropica.L590.v46.rda
##
## $organismdbi
## tritrypdb.Leishmania.tropica.L590.v46
##
## $organismdbi_installed
## [1] FALSE
##
## $orgdb
## org.Ltropica.L590.v46.eg.db
##
## $orgdb_installed
## [1] TRUE
##
## $txdb
## TxDb.Leishmania.tropica.L590.TriTrypDB.v46
##
## $txdb_installed
## [1] TRUE
lt_entry <- "org.Ltropica.L590.v46.eg.db"
lt_orgdb <- EuPathDB::load_orgdb_annotations(lt_entry, fields="all")
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
##
## Selecting the following fields, this might be too many:
## ANNOT_BFD3_CDS, ANNOT_BFD3_MODEL, ANNOT_BFD6_CDS, ANNOT_BFD6_MODEL, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_DIF_CDS, ANNOT_DIF_MODEL, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_FC_BFD3_CDS, ANNOT_FC_BFD3_MODEL, ANNOT_FC_BFD6_CDS, ANNOT_FC_BFD6_MODEL, ANNOT_FC_DIF_CDS, ANNOT_FC_DIF_MODEL, ANNOT_FC_PF_CDS, ANNOT_FC_PF_MODEL, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_SOURCE_ID, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GO_COMPONENT, ANNOT_GO_FUNCTION, ANNOT_GO_ID_COMPONENT, ANNOT_GO_ID_FUNCTION, ANNOT_GO_ID_PROCESS, ANNOT_GO_PROCESS, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MATCHED_RESULT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NO_TET_CDS, ANNOT_NO_TET_MODEL, ANNOT_ORGANISM, ANNOT_PF_CDS, ANNOT_PF_MODEL, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PROJECT_ID, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SOURCE_ID, ANNOT_STRAND, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_URI, ANNOT_WDK_WEIGHT
## Extracted all gene ids.
## Attempting to select: ANNOT_BFD3_CDS, ANNOT_BFD3_MODEL, ANNOT_BFD6_CDS, ANNOT_BFD6_MODEL, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_DIF_CDS, ANNOT_DIF_MODEL, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_FC_BFD3_CDS, ANNOT_FC_BFD3_MODEL, ANNOT_FC_BFD6_CDS, ANNOT_FC_BFD6_MODEL, ANNOT_FC_DIF_CDS, ANNOT_FC_DIF_MODEL, ANNOT_FC_PF_CDS, ANNOT_FC_PF_MODEL, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_SOURCE_ID, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GO_COMPONENT, ANNOT_GO_FUNCTION, ANNOT_GO_ID_COMPONENT, ANNOT_GO_ID_FUNCTION, ANNOT_GO_ID_PROCESS, ANNOT_GO_PROCESS, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MATCHED_RESULT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NO_TET_CDS, ANNOT_NO_TET_MODEL, ANNOT_ORGANISM, ANNOT_PF_CDS, ANNOT_PF_MODEL, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PROJECT_ID, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SOURCE_ID, ANNOT_STRAND, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_URI, ANNOT_WDK_WEIGHT
## 'select()' returned 1:1 mapping between keys and columns
So, I now have 2 data frames of parasite annotations and 1 human.
I am going to write a quick sample sheet in the current working directory called ‘samples.xlsx’ and put the names of the count tables in it.
Here I combine the metadata, count data, and annotations.
It is worth noting that the gene IDs from htseq-count probably do not match the annotations retrieved because they are likely exon-based rather than gene based. This is not really a problem, but don’t forget it!
## Oops, the count table is by gene and I wrote the annotations by transcript. Doofus.
rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)
hg38_expt <- create_expt("sample_sheet.xlsx", gene_info=hs_annot, file_column="hsfile")
## Reading the sample metadata.
## The sample definitions comprises: 6 rows(samples) and 5 columns(metadata fields).
## Reading count tables.
## Reading count tables with read.table().
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/raw_data/salloum/INF1_1/processed/outputs/hisat2_hg38_91/INF1_S155_L003.count.xz contains 58307 rows.
## INF1_2/processed/outputs/hisat2_hg38_91/INF2_S0_L009.count.xz contains 58307 rows and merges to 58307 rows.
## THP1_1/processed/outputs/hisat2_hg38_91/THP1_1_S17_L001.count.xz contains 58307 rows and merges to 58307 rows.
## THP1_2/processed/outputs/hisat2_hg38_91/THP1_2_S0_L009.count.xz contains 58307 rows and merges to 58307 rows.
## Finished reading count tables.
## Matched 58243 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 58302 rows and 4 columns.
## And I entirely forgot to set the row names for the leishmania data!
## But it does contain the exon names, but with a '-' instead of a '.'...
rownames(lt_annot) <- paste0("exon_", lt_annot[["ID"]], ".E1")
lt_expt <- create_expt("sample_sheet.xlsx", gene_info=lt_annot, file="ltfile")
## Reading the sample metadata.
## The sample definitions comprises: 6 rows(samples) and 5 columns(metadata fields).
## Reading count tables.
## Reading count tables with read.table().
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/raw_data/salloum/INF1_1/processed/outputs/hisat2_ltropica590_v46/INF1_S155_L003.count.xz contains 9052 rows.
## INF1_2/processed/outputs/hisat2_ltropica590_v46/INF2_S0_L009.count.xz contains 9052 rows and merges to 9052 rows.
## LT2_1/processed/outputs/hisat2_ltropica590_v46/LT2_1_S156_L003.count.xz contains 9052 rows and merges to 9052 rows.
## LT2_2/processed/outputs/hisat2_ltropica590_v46/LT2_2_S0_L009.count.xz contains 9052 rows and merges to 9052 rows.
## Finished reading count tables.
## Matched 8938 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 9047 rows and 4 columns.
In this block I will calculate all the diagnostic plots, but not show them. I will show them next with a little annotation.
I will leave the output for the first of each invocation and silence it for the second.
## Graphing number of non-zero genes with respect to CPM by library.
## Graphing library sizes.
## Graphing a boxplot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, adding 1 to the data.
## Changed 158534 zero count features.
## Graphing a correlation heatmap.
## Graphing a standard median correlation.
## Performing correlation.
## Graphing a distance heatmap.
## Graphing a standard median distance.
## Performing distance.
## Graphing a PCA plot.
## Graphing a T-SNE plot.
## Warning in plot_pca(..., pc_method = "tsne"): TSNE: Attempting to auto-detect
## perplexity failed, setting it to 1.
## Plotting a density plot.
## This data will benefit from being displayed on the log scale.
## If this is not desired, set scale='raw'
## Some entries are 0. We are on log scale, setting them to 0.5.
## Changed 158534 zero count features.
## Plotting a CV plot.
## Naively calculating coefficient of variation/dispersion with respect to condition.
## Finished calculating dispersion estimates.
## Plotting the representation of the top-n genes.
## Plotting the expression of the top-n PC loaded genes.
## Printing a color to condition legend.
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 47141 low-count genes (11161 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
lt_norm <- sm(normalize_expt(lt_expt, norm="quant", convert="cpm", transform="log2", filter=TRUE))
hg38n_plots <- sm(graph_metrics(hg38_norm))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in plot_pca(..., pc_method = "tsne"): TSNE: Attempting to auto-detect
## perplexity failed, setting it to 1.
## Writing the normalized reads.
## Graphing the normalized reads.
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Writing the median reads by factor.
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in plot_pca(..., pc_method = "tsne"): TSNE: Attempting to auto-detect
## perplexity failed, setting it to 1.
## Writing the normalized reads.
## Graphing the normalized reads.
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Writing the median reads by factor.
## The legend will be the same for both the human and parasite data, except I decided to drop the
## THP samples from the parasite, I am reasonably certain there are no actual parasite reads there.
hg38_plots$libsize
## How many parasite genes have >0 reads?
## These plots suggest that Inf1_2 might benefit from more coverage.
hg38n_plots$density
Like the above, I am going to tell the 2nd invocation of each command to stay quiet.
## all_pairwise() invokes DESeq2, EdgeR, limma, EBSeq on the data, along with my own basic method.
hg38_de <- all_pairwise(hg38_expt, model_batch=FALSE)
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 47141 low-count genes (11161 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 2 values equal to 0, adding 1 to the matrix.
## Step 5: not doing batch correction.
## Plotting a PCA before surrogates/batch inclusion.
## Not putting labels on the plot.
## Assuming no batch in model for testing pca.
## Not putting labels on the plot.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
lt_de <- sm(all_pairwise(lt_expt, model_batch=FALSE))
## Now we want to make one big table from the tables provided by DESeq2, EdgeR, etc.
hg38_contrast <- list(
"inf_over_thp" = c("inf", "thp"))
hg38_table <- combine_de_tables(hg38_de, keepers=hg38_contrast,
excel=glue::glue("excel/{rundate}_hg38_91_contrast_tables.xlsx"))
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on 1/1: inf_over_thp which is: inf/thp.
## Found inverse table with thp_vs_inf
## Adding venn plots for inf_over_thp.
## Limma expression coefficients for inf_over_thp; R^2: 0.911; equation: y = 0.958x - 0.0379
## Deseq expression coefficients for inf_over_thp; R^2: 0.831; equation: y = 0.939x + 0.218
## Edger expression coefficients for inf_over_thp; R^2: 0.899; equation: y = 0.944x + 0.2
## Writing summary information, compare_plot is: TRUE.
## Performing save of excel/20191212_hg38_91_contrast_tables.xlsx.
lt_contrast <- list(
"inf_over_lt" = c("inf", "lt"))
lt_table <- sm(combine_de_tables(lt_de, keepers=lt_contrast,
excel=glue::glue("excel/{rundate}_lt_contrast_tables.xlsx")))
Now lets visualize some of the DE results. I was watching the plots generate while it wrote the tables, it looks like limma does not agree well with DESeq2/EdgeR for this data.
## Writing a legend of columns.
## Writing excel data according to limma for inf_over_thp: 1/5.
## After (adj)p filter, the up genes table has 4 genes.
## After (adj)p filter, the down genes table has 145 genes.
## After fold change filter, the up genes table has 4 genes.
## After fold change filter, the down genes table has 145 genes.
## Printing significant genes to the file: excel/significant_genes.xlsx
## 1/1: Creating significant table up_limma_inf_over_thp
## Writing excel data according to edger for inf_over_thp: 1/5.
## After (adj)p filter, the up genes table has 601 genes.
## After (adj)p filter, the down genes table has 414 genes.
## After fold change filter, the up genes table has 601 genes.
## After fold change filter, the down genes table has 414 genes.
## Printing significant genes to the file: excel/significant_genes.xlsx
## 1/1: Creating significant table up_edger_inf_over_thp
## Writing excel data according to deseq for inf_over_thp: 1/5.
## After (adj)p filter, the up genes table has 1166 genes.
## After (adj)p filter, the down genes table has 764 genes.
## After fold change filter, the up genes table has 1166 genes.
## After fold change filter, the down genes table has 764 genes.
## Printing significant genes to the file: excel/significant_genes.xlsx
## 1/1: Creating significant table up_deseq_inf_over_thp
## Writing excel data according to ebseq for inf_over_thp: 1/5.
## After (adj)p filter, the up genes table has 3422 genes.
## After (adj)p filter, the down genes table has 1062 genes.
## After fold change filter, the up genes table has 3080 genes.
## After fold change filter, the down genes table has 902 genes.
## Printing significant genes to the file: excel/significant_genes.xlsx
## 1/1: Creating significant table up_ebseq_inf_over_thp
## Writing excel data according to basic for inf_over_thp: 1/5.
## After (adj)p filter, the up genes table has 0 genes.
## After (adj)p filter, the down genes table has 0 genes.
## After fold change filter, the up genes table has 0 genes.
## After fold change filter, the down genes table has 0 genes.
## Printing significant genes to the file: excel/significant_genes.xlsx
## The up table inf_over_thp is empty.
## The down table inf_over_thp is empty.
## Adding significance bar plots.
## Performing gProfiler GO search of 1166 against hsapiens.
## GO search found 147 hits.
## Performing gProfiler KEGG search of 1166 against hsapiens.
## KEGG search found 7 hits.
## Performing gProfiler REAC search of 1166 against hsapiens.
## REAC search found 13 hits.
## Performing gProfiler MI search of 1166 against hsapiens.
## MI search found 0 hits.
## Performing gProfiler TF search of 1166 against hsapiens.
## TF search found 833 hits.
## Performing gProfiler CORUM search of 1166 against hsapiens.
## CORUM search found 10 hits.
## Performing gProfiler HP search of 1166 against hsapiens.
## HP search found 4 hits.
## Performing gProfiler GO search of 764 against hsapiens.
## GO search found 461 hits.
## Performing gProfiler KEGG search of 764 against hsapiens.
## KEGG search found 15 hits.
## Performing gProfiler REAC search of 764 against hsapiens.
## REAC search found 127 hits.
## Performing gProfiler MI search of 764 against hsapiens.
## MI search found 0 hits.
## Performing gProfiler TF search of 764 against hsapiens.
## TF search found 343 hits.
## Performing gProfiler CORUM search of 764 against hsapiens.
## CORUM search found 50 hits.
## Performing gProfiler HP search of 764 against hsapiens.
## HP search found 230 hits.