There are a few methods of importing annotation data into R. The following are two attempts, the second is currently being used in these analyses.
AnnotationHub is a newer service and has promise to be an excellent top-level resource for gathering annotation data.
tt <- sm(library(AnnotationHub))
ah <- sm(AnnotationHub())
orgdbs <- query(ah, "OrgDB")
annot_lm <- query(ah, c("OrgDB", "Friedlin"))
lm_name <- names(annot_lm)
annot_lm <- sm(annot_lm[[lm_name[[2]]]])
txtdbs <- query(ah, "TxDb")
## AH48429 appears to be panamensis
##annot_lp <- annot_lp[["AH48429"]]
annot_lp <- query(ah, c("OrgDB", "panamensis"))
lp_name <- names(annot_lp)
annot_lp <- sm(annot_lp[[lp_name[[2]]]])
Since this document was originally written, I have made substantial changes to how I create, load, and manipulate the eupathdb annotation data. As a result, this needs to be significantly reworked.
AnnotationHub is the new and fancier version of what OrganismDb does. Keith already made these for the parasites though, lets try and use one of those.
library(EuPathDB)
panamensis_entry <- get_eupath_entry("panamensis", webservice="tritrypdb")
testing_panamensis <- make_eupath_orgdb(panamensis_entry)
braziliensis_entry <- get_eupath_entry("braziliensis", webservice="tritrypdb")
testing_braziliensis <- make_eupath_orgdb(braziliensis_entry)
donovani_entry <- get_eupath_entry("donovani", webservice="tritrypdb")
testing_donovani <- make_eupath_orgdb(donovani_entry)
mexicana_entry <- get_eupath_entry("mexicana", webservice="tritrypdb")
testing_mexicana <- make_eupath_orgdb(mexicana_entry)
major_entry <- get_eupath_entry("major", webservice="tritrypdb")
testing_major <- make_eupath_orgdb(major_entry)
crith_entry <- get_eupath_entry("Crit", webservice="tritrypdb")
testing_crith <- make_eupath_orgdb(crith_entry)
Assuming the above packages got created, we may load them and extract the annotation data.
## Loading required package: GenomeInfoDbData
##
## This is EuPathDB version 1.6.0
## Read 'EuPathDB()' to get started.
##
## Attaching package: 'EuPathDB'
## The following objects are masked from 'package:hpgltools':
##
## download_uniprot_proteome, get_kegg_orgn,
## load_kegg_annotations, load_orgdb_annotations, load_orgdb_go,
## load_uniprot_annotations, orgdb_from_ah
## Bioconductor version 3.10 (BiocManager 1.30.8), ?BiocManager::install for
## help
## Found the following hits: Leishmania major strain Friedlin, Leishmania major strain LV39c5, Leishmania major strain SD 75.1, choosing the first.
## Using: Leishmania major strain Friedlin.
## org.Lmajor.Friedlin.v45.eg.db
wanted_fields <- c("annot_cds_length", "annot_chromosome", "annot_gene_entrez_id",
"annot_gene_name", "annot_strand", "gid", "go_go_id",
"go_go_term_name", "go_ontology",
"interpro_description" ,"interpro_e_value", "type_gene_type")
lm_org <- sm(EuPathDB::load_orgdb_annotations(major_names$orgdb,
keytype="gid",
fields=wanted_fields))
panamensis_entry <- get_eupath_entry("panamensis", webservice="tritrypdb")
## Found the following hits: Leishmania panamensis MHOM/COL/81/L13, Leishmania panamensis strain MHOM/PA/94/PSC-1, choosing the first.
## Using: Leishmania panamensis MHOM/COL/81/L13.
## org.Lpanamensis.MHOMCOL81L13.v45.eg.db
lp_org <- sm(EuPathDB::load_orgdb_annotations(panamensis_names$orgdb,
keytype="gid",
fields=wanted_fields))
braziliensis_names <- get_eupath_pkgnames("braziliensis")
braziliensis_names$orgdb
old_name <- "org.Lbraziliensis.MHOMBR75M2903.v38.eg.db"
lb_org <- sm(load_orgdb_annotations(old_name, keytype="gid", fields=wanted_fields))
##donovani_names <- get_eupath_pkgnames("donovani")
##donovani_names$orgdb
##ld_org <- load_orgdb_annotations(donovani_names$orgdb, keytype="gid", fields=wanted_fields)
##
##mexicana_names <- get_eupath_pkgnames("mexicana")
##mexicana_names$orgdb
##lmex_org <- load_orgdb_annotations(mexicana_names$orgdb, keytype="gid", fields=wanted_fields)
##
##fasciculata_names <- get_eupath_pkgnames("rithidia", metadata=tritryp_metadata)
##fasciculata_names$orgdb
##cf_org <- load_orgdb_annotations(fasciculata_names$orgdb, keytype="gid", fields=wanted_fields)
In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments. More in-depth information for the human transcriptome may be extracted from biomart.
## The old way of getting genome/annotation data
lp_gff <- "reference/lpanamensis.gff"
lb_gff <- "reference/lbraziliensis.gff"
hs_gff <- "reference/hsapiens.gtf"
lp_fasta <- "reference/lpanamensis.fasta.xz"
lb_fasta <- "reference/lbraziliensis.fasta.xz"
hs_fasta <- "reference/hsapiens.fasta.xz"
lp_annotations <- sm(load_gff_annotations(lp_gff, type="gene"))
rownames(lp_annotations) <- paste0("exon_", lp_annotations$web_id, ".1")
lb_annotations <- sm(load_gff_annotations(lb_gff, type="gene"))
hs_gff_annot <- sm(load_gff_annotations(hs_gff, id_col="gene_id"))
hs_annotations <- sm(load_biomart_annotations())$annotation
hs_annotations$ID <- hs_annotations$geneID
rownames(hs_annotations) <- make.names(hs_annotations[["ensembl_gene_id"]], unique=TRUE)
lp_size_dist <- plot_histogram(lp_annotations[["width"]])
lp_size_dist
hs_size_dist <- plot_histogram(hs_annotations[["cds_length"]])
hs_size_dist +
ggplot2::scale_x_continuous(limits=c(0, 20000))
## Warning: Removed 103681 rows containing non-finite values (stat_bin).
## Warning: Removed 103681 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
Annotation for gene ontologies may be gathered from a similarly large number of sources. The following are a couple.
## Try using biomart
hs_go_biomart <- sm(load_biomart_go())
## or the org.Hs.eg.db sqlite database
tt <- sm(library("Homo.sapiens"))
hs <- Homo.sapiens
##hs_go_ensembl <- load_orgdb_go(hs, hs_annotations$geneID)
##dim(hs_go_biomart)
##dim(hs_go_ensembl)
##hs_goids <- hs_go_biomart
## While testing, I called this desc, that will need to change.
##lp_tooltips <- make_tooltips(lp_annotations)
##lb_tooltips <- make_tooltips(lb_annotations)
lp_lengths <- lp_annotations[, c("ID", "width")]
lb_lengths <- lb_annotations[, c("ID", "width")]
hs_lengths <- hs_annotations[, c("ensembl_gene_id", "cds_length")]
colnames(hs_lengths) <- c("ID", "width")
lp_goids <- read.csv(file="reference/lpan_go.txt.xz", sep="\t", header=FALSE)
lb_goids <- read.csv(file="reference/lbraz_go.txt.xz", sep="\t", header=FALSE)
colnames(lp_goids) <- c("ID","GO","ont","name","source","tag")
colnames(lb_goids) <- c("ID","GO","ont","name","source","tag")
The macrophage experiment has samples across 2 contexts, the host and parasite. The following block sets up one experiment for each. If you open the all_samples-species.xlsx files, you will note immediately that a few different attempts were made at ascertaining the most likely experimental factors that contributed to the readily apparent batch effects.
Keep in mind that if I change the experimental design with new annotations, I must therefore regenerate the following.
hs_final_annotations <- hs_annotations
hs_final_annotations <- hs_final_annotations[, c("ensembl_transcript_id", "ensembl_gene_id", "cds_length",
"hgnc_symbol", "description", "gene_biotype")]
colnames(hs_final_annotations)[3] <- "length"
note <- "New experimental design factors by snp added 2016-09-20"
hs_expt <- sm(create_expt("sample_sheets/all_samples-combined.xlsx",
gene_info=hs_final_annotations,
file_column="humanfile",
notes=note))
knitr::kable(head(hs_expt$design, n=1))
sampleid | pathogenstrain | experimentname | tubelabel | alias | condition | batch | anotherbatch | snpclade | snpcladev2 | snpcladev3 | pathogenstrain.1 | label | donor | time | pctmappedparasite | pctcategory | state | sourcelab | expperson | pathogen | host | hostcelltype | noofhostcells | infectionperiodhpitimeofharvest | moiexposure | parasitespercell | pctinf | rnangul | rnaqcpassed | libraryconst | libqcpassed | index | descriptonandremarks | observation | lowercaseid | humanfile | parasitefile | bcftable | salmonreads | hssalmonmapped | hssalmonmaprate | lpsalmonmapped | lpsalmonmaprate | tophatpairs | hstophataligned | hstophatpct | hstophatmulti | hstophatdiscordant | hstophatconcordantpct | lptophataligned | lptophatpct | lptophatmulti | lptophatdiscordant | lpconcordantpct | variantpositions | file | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
HPGL0241 | HPGL0241 | none | macrophage | TM130-Nil (Blue label) | Nil | uninf | a | a | undef | undef | undef | none | uninf_1 | d130 | undef | undef | 0 | uninfected | Ade | Adriana | none | Human | Human macs | Max 2 mill | 2h - 24h chase period | NA | unknown | unknown | 468 | Y | Wanderson | Y | 1 | Uninfected human macrophages | NA | hpgl0241 | preprocessing/hpgl0241/outputs/tophat_hsapiens/accepted_paired.count.xz | undef | undef | 46628648 | 26156539 | 0.561 | NA | NA | 46319335 | 40905961 | 0.8831 | 1374099 | 1430888 | 0.8522 | NA | NA | NA | NA | NA | NA | null |
parasite_expt <- sm(create_expt("sample_sheets/all_samples-combined.xlsx",
gene_info=lp_annotations, file_column="parasitefile"))
knitr::kable(head(parasite_expt$design, n=3),
caption="The first three rows of the parasite experimental design.")
sampleid | pathogenstrain | experimentname | tubelabel | alias | condition | batch | anotherbatch | snpclade | snpcladev2 | snpcladev3 | pathogenstrain.1 | label | donor | time | pctmappedparasite | pctcategory | state | sourcelab | expperson | pathogen | host | hostcelltype | noofhostcells | infectionperiodhpitimeofharvest | moiexposure | parasitespercell | pctinf | rnangul | rnaqcpassed | libraryconst | libqcpassed | index | descriptonandremarks | observation | lowercaseid | humanfile | parasitefile | bcftable | salmonreads | hssalmonmapped | hssalmonmaprate | lpsalmonmapped | lpsalmonmaprate | tophatpairs | hstophataligned | hstophatpct | hstophatmulti | hstophatdiscordant | hstophatconcordantpct | lptophataligned | lptophatpct | lptophatmulti | lptophatdiscordant | lpconcordantpct | variantpositions | file | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
HPGL0242 | HPGL0242 | s2271 | macrophage | TM130-2271 | Self-Healing | sh | a | a | white | whitepink | right | s2271 | sh_2271 | d130 | undef | 30 | 3 | self_heal | Ade | Adriana | Lp | Human | Human macs | Max 2 mill | 2h - 24h chase period | 0.0486111111111111 | unknown | unknown | 276 | Y | Wanderson | Y | 8 | Infected human macrophages. | NA | hpgl0242 | preprocessing/hpgl0242/outputs/tophat_hsapiens/accepted_paired.count.xz | preprocessing/hpgl0242/outputs/tophat_lpanamensis/accepted_paired.count.xz | preprocessing/outputs/hpgl0242_parsed_count.txt | 42742857 | 17945935 | 0.4199 | 8023463 | 0.1877 | 42612353 | 25394266 | 0.5959 | 869649 | 784620 | 0.5775 | 13117819 | 0.3078 | 350277 | 263923 | 0.3016 | 3930 | null |
HPGL0244 | HPGL0244 | s5433 | macrophage | TM130-5433 | Chronic | chr | a | a | blue_self | blue | left | s5433 | chr_5433 | d130 | undef | 15 | 1 | chronic | Ade | Adriana | Lp | Human | Human macs | Max 2 mill | 2h - 24h chase period | 0.0486111111111111 | unknown | unknown | 261 | Y | Wanderson | Y | 27 | Infected human macrophages | NA | hpgl0244 | preprocessing/hpgl0244/outputs/tophat_hsapiens/accepted_paired.count.xz | preprocessing/hpgl0244/outputs/tophat_lpanamensis/accepted_paired.count.xz | preprocessing/outputs/hpgl0244_parsed_count.txt | 47150925 | 25281958 | 0.5362 | 3761371 | 0.0798 | 46925604 | 36379602 | 0.7753 | 1070964 | 991929 | 0.7541 | 5755998 | 0.1227 | 154830 | 116414 | 0.1202 | 85981 | null |
HPGL0245 | HPGL0245 | s1320 | macrophage | TM130-1320 | Chronic | chr | a | a | multicolor | yellowbrownmulti | right | s1320 | chr_1320 | d130 | undef | 40 | 4 | chronic | Ade | Adriana | Lp | Human | Human macs | Max 2 mill | 2h - 24h chase period | 0.0486111111111111 | unknown | unknown | 199 | Y | Wanderson | Y | 11 | Infected human macrophages | NA | hpgl0245 | preprocessing/hpgl0245/outputs/tophat_hsapiens/accepted_paired.count.xz | preprocessing/hpgl0245/outputs/tophat_lpanamensis/accepted_paired.count.xz | preprocessing/outputs/hpgl0245_parsed_count.txt | 44501247 | 15904014 | 0.3574 | 10310518 | 0.2317 | 44384269 | 22909280 | 0.5162 | 673982 | 600480 | 0.5026 | 16414676 | 0.3698 | 441931 | 310092 | 0.3628 | 38072 | null |
Table S1 is going to be a summary of the metadata in all_samples-combined This may also include some of the numbers regarding mapping %, etc.
Wanted columns:
Use the Tcruzi colors.
At this point, we should have everything necessary to perform the various analyses of the 4 sub-experiments. So save the current data for reuse elsewhere.
The experimental design is available here.
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset c2881f6d97e1ec981fd1481cf46d6bc875fac423
## This is hpgltools commit: Tue Oct 22 10:22:30 2019 -0400: c2881f6d97e1ec981fd1481cf46d6bc875fac423
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 01_annotation_20190914-v20190914.rda.xz
## The savefile is: /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_2016/savefiles/01_annotation_20190914-v20190914.rda.xz
## Renaming /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_2016/savefiles/01_annotation_20190914-v20190914.rda.xz to /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_2016/savefiles/01_annotation_20190914-v20190914.rda.xz.01.
## The save string is: con <- pipe(paste0('pxz > /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_2016/savefiles/01_annotation_20190914-v20190914.rda.xz'), 'wb'); save(list=ls(all.names=TRUE, envir=globalenv()),
## envir=globalenv(), file=con, compress=FALSE); close(con)
R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: Homo.sapiens(v.1.3.1), TxDb.Hsapiens.UCSC.hg19.knownGene(v.3.2.2), org.Hs.eg.db(v.3.8.2), GO.db(v.3.8.2), OrganismDbi(v.1.27.1), GenomicFeatures(v.1.37.4), GenomicRanges(v.1.37.17), GenomeInfoDb(v.1.21.2), org.Lpanamensis.MHOMCOL81L13.v45.eg.db(v.2019.10), org.Lmajor.Friedlin.v45.eg.db(v.2019.10), AnnotationDbi(v.1.47.1), IRanges(v.2.19.17), S4Vectors(v.0.23.25), futile.logger(v.1.4.3), EuPathDB(v.1.6.0), GenomeInfoDbData(v.1.2.1), hpgltools(v.1.0), Biobase(v.2.45.1) and BiocGenerics(v.0.31.6)
loaded via a namespace (and not attached): colorspace(v.1.4-1), ellipsis(v.0.3.0), XVector(v.0.25.0), base64enc(v.0.1-3), bit64(v.0.9-7), interactiveDisplayBase(v.1.23.0), xml2(v.1.2.2), codetools(v.0.2-16), knitr(v.1.25), zeallot(v.0.1.0), jsonlite(v.1.6), Rsamtools(v.2.1.7), dbplyr(v.1.4.2), graph(v.1.63.0), shiny(v.1.4.0), BiocManager(v.1.30.8), compiler(v.3.6.1), httr(v.1.4.1), backports(v.1.1.5), assertthat(v.0.2.1), Matrix(v.1.2-17), fastmap(v.1.0.1), lazyeval(v.0.2.2), later(v.1.0.0), formatR(v.1.7), htmltools(v.0.4.0), prettyunits(v.1.0.2), tools(v.3.6.1), gtable(v.0.3.0), glue(v.1.3.1), dplyr(v.0.8.3), rappdirs(v.0.3.1), Rcpp(v.1.0.2), vctrs(v.0.2.0), Biostrings(v.2.53.2), rtracklayer(v.1.45.6), iterators(v.1.0.12), xfun(v.0.10), stringr(v.1.4.0), openxlsx(v.4.1.0.1), testthat(v.2.2.1), rvest(v.0.3.4), mime(v.0.7), lifecycle(v.0.1.0), XML(v.3.98-1.20), AnnotationHub(v.2.17.10), zlibbioc(v.1.31.0), scales(v.1.0.0), rBiopaxParser(v.2.25.0), hms(v.0.5.1), promises(v.1.1.0), SummarizedExperiment(v.1.15.9), RBGL(v.1.61.0), RColorBrewer(v.1.1-2), lambda.r(v.1.2.4), yaml(v.2.2.0), curl(v.4.2), memoise(v.1.1.0), pander(v.0.6.3), ggplot2(v.3.2.1), biomaRt(v.2.41.9), stringi(v.1.4.3), RSQLite(v.2.1.2), highr(v.0.8), foreach(v.1.4.7), zip(v.2.0.4), BiocParallel(v.1.19.4), rlang(v.0.4.1), pkgconfig(v.2.0.3), matrixStats(v.0.55.0), bitops(v.1.0-6), evaluate(v.0.14), lattice(v.0.20-38), purrr(v.0.3.3), GenomicAlignments(v.1.21.7), labeling(v.0.3), bit(v.1.1-14), tidyselect(v.0.2.5), AnnotationForge(v.1.27.4), magrittr(v.1.5), R6(v.2.4.0), RUnit(v.0.4.32), DelayedArray(v.0.11.8), DBI(v.1.0.0), pillar(v.1.4.2), biocViews(v.1.53.3), RCurl(v.1.95-4.12), tibble(v.2.1.3), crayon(v.1.3.4), futile.options(v.1.0.1), BiocFileCache(v.1.9.1), rmarkdown(v.1.16), progress(v.1.2.2), grid(v.3.6.1), data.table(v.1.12.6), blob(v.1.2.0), digest(v.0.6.22), xtable(v.1.8-4), tidyr(v.1.0.0), httpuv(v.1.5.2), openssl(v.1.4.1), munsell(v.0.5.0), AnnotationHubData(v.1.15.13) and askpass(v.1.1)