This short document gathers the annotation data required to work with the data downloaded from SRA and similar sources.
# Load the two annotation sources used by this document: a GFF file and biomart.
# Location of the GFF annotation file.
hs_gff <- "reference/hg38_91.gff"
# Read the GFF annotations, passing "gene_id" as the ID column.
# NOTE(review): sm() presumably silences the output of the wrapped call -- confirm.
# hs_gff_annot is not used again in the visible part of this document.
hs_gff_annot <- sm(load_gff_annotations(hs_gff, id_col="gene_id"))
# Fetch the biomart annotations and keep only the $annotation element of the result.
hs_annot <- sm(load_biomart_annotations())$annotation
# NOTE(review): nothing else in this document refers to a `geneID` column; every
# other lookup uses "ensembl_gene_id".  If `geneID` is absent, the right-hand
# side is NULL and this assignment adds no column -- confirm the intended source.
hs_annot$ID <- hs_annot$geneID
# Key the rows by gene ID for now (they are re-keyed by transcript further down).
# make.names(unique=TRUE) makes the names syntactically valid and de-duplicates
# gene IDs which appear on more than one row.
rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)
# Print the number of rows and columns of the annotation table.
dim(hs_annot)
## [1] 197995 12
# Histogram of the cds_length column of the annotations.  The stored plot is
# left unmodified; only the printed copy has its x-axis limited to 0-20000.
hs_size_dist <- plot_histogram(hs_annot[["cds_length"]])
hs_size_dist + ggplot2::scale_x_continuous(limits=c(0, 20000))
## Warning: Removed 103681 rows containing non-finite values (stat_bin).
## Warning: Removed 103681 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
# Gene ontology data from biomart.
hs_go_biomart <- sm(load_biomart_go())
# Two-column table of gene ID and CDS length, with the columns renamed to
# "ID" and "width".  The row names of hs_annot are carried over unchanged.
hs_lengths <- setNames(
  hs_annot[, c("ensembl_gene_id", "cds_length")],
  c("ID", "width"))
# Re-key the annotations by versioned transcript ID: <transcript_id>.<version>,
# passed through make.names() so that every row name is valid and unique.
rownames(hs_annot) <- make.names(
  paste0(hs_annot[["ensembl_transcript_id"]], ".", hs_annot[["transcript_version"]]),
  unique=TRUE)
# Transcript-to-gene map: column "id" holds the versioned transcript ID (taken
# from the row names) and column "ensembl_gene_id" holds the owning gene.
# drop=FALSE keeps the single-column selection as a data frame with its row names.
hs_tx_gene <- hs_annot[, "ensembl_gene_id", drop=FALSE]
hs_tx_gene[["id"]] <- rownames(hs_tx_gene)
hs_tx_gene <- hs_tx_gene[, c("id", "ensembl_gene_id")]
# Build the expt from the sample sheet.
# hs_annot is keyed by versioned transcript ID at this point, so a copy keyed by
# gene ID is made to serve as the gene-level annotation table.
new_hs_annot <- hs_annot
rownames(new_hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)
# gene_info: the gene-keyed annotation copy made above.
# tx_gene_map: the transcript-to-gene table (columns "id" and "ensembl_gene_id").
# file_column: name of the sample sheet column to read the count files from;
# the rendered output reports that these are read as salmon data via tximport.
# NOTE(review): the rendered output reports 115 sample rows in the sheet but 102
# columns in the final expressionset -- confirm the difference is expected.
hs_expt <- create_expt("sample_sheets/macrophage_transcriptome_data_sources_20190701.xlsx",
gene_info=new_hs_annot,
tx_gene_map=hs_tx_gene,
file_column="salmonfile")
## Reading the sample metadata.
## Did not find the condition column in the sample sheet.
## Filling it in as undefined.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 115 rows(samples) and 32 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 19629 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 19629 rows and 102 columns.
# Make a copy of the expt restricted to protein-coding genes.
# cds_entries is a logical mask over the rows of fData(hs_expt).  %in% is used
# rather than == so that a missing (NA) gene_biotype is treated as "not protein
# coding" (FALSE) instead of propagating an NA into the row index below.
cds_entries <- fData(hs_expt)[["gene_biotype"]] %in% "protein_coding"
# Copy the expt and replace only its expressionset with the row-subset version.
hs_cds_expt <- hs_expt
hs_cds_expt$expressionset <- hs_cds_expt$expressionset[cds_entries, ]
# Feature annotations of the subset, kept for inspection.
new_cds_entries <- fData(hs_cds_expt)
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: hpgltools(v.1.0), Biobase(v.2.44.0) and BiocGenerics(v.0.30.0)
loaded via a namespace (and not attached): backports(v.1.1.4), fastmatch(v.1.1-0), plyr(v.1.8.4), igraph(v.1.2.4.1), lazyeval(v.0.2.2), splines(v.3.6.0), BiocParallel(v.1.18.0), usethis(v.1.5.0), GenomeInfoDb(v.1.20.0), ggplot2(v.3.2.0), urltools(v.1.7.3), sva(v.3.32.1), digest(v.0.6.19), foreach(v.1.4.4), htmltools(v.0.3.6), GOSemSim(v.2.10.0), viridis(v.0.5.1), GO.db(v.3.8.2), gdata(v.2.18.0), magrittr(v.1.5), memoise(v.1.1.0), doParallel(v.1.0.14), openxlsx(v.4.1.0.1), limma(v.3.40.2), remotes(v.2.1.0), readr(v.1.3.1), Biostrings(v.2.52.0), annotate(v.1.62.0), matrixStats(v.0.54.0), enrichplot(v.1.4.0), prettyunits(v.1.0.2), colorspace(v.1.4-1), blob(v.1.1.1), ggrepel(v.0.8.1), xfun(v.0.8), dplyr(v.0.8.1), tximport(v.1.12.3), callr(v.3.2.0), crayon(v.1.3.4), RCurl(v.1.95-4.12), jsonlite(v.1.6), genefilter(v.1.66.0), lme4(v.1.1-21), survival(v.2.44-1.1), iterators(v.1.0.10), glue(v.1.3.1), polyclip(v.1.10-0), gtable(v.0.3.0), zlibbioc(v.1.30.0), XVector(v.0.24.0), UpSetR(v.1.4.0), DelayedArray(v.0.10.0), pkgbuild(v.1.0.3), scales(v.1.0.0), DOSE(v.3.10.2), DBI(v.1.0.0), Rcpp(v.1.0.1), viridisLite(v.0.3.0), xtable(v.1.8-4), progress(v.1.2.2), gridGraphics(v.0.4-1), bit(v.1.1-14), europepmc(v.0.3), stats4(v.3.6.0), httr(v.1.4.0), fgsea(v.1.10.0), gplots(v.3.0.1.1), RColorBrewer(v.1.1-2), pkgconfig(v.2.0.2), XML(v.3.98-1.20), farver(v.1.1.0), labeling(v.0.3), ggplotify(v.0.0.3), tidyselect(v.0.2.5), rlang(v.0.4.0), reshape2(v.1.4.3), AnnotationDbi(v.1.46.0), munsell(v.0.5.0), tools(v.3.6.0), cli(v.1.1.0), RSQLite(v.2.1.1), ggridges(v.0.5.1), devtools(v.2.0.2), evaluate(v.0.14), stringr(v.1.4.0), yaml(v.2.2.0), processx(v.3.3.1), knitr(v.1.23), bit64(v.0.9-7), fs(v.1.3.1), pander(v.0.6.3), zip(v.2.0.2), caTools(v.1.17.1.2), purrr(v.0.3.2), ggraph(v.1.0.2), nlme(v.3.1-140), xml2(v.1.2.0), DO.db(v.2.9), biomaRt(v.2.40.1), compiler(v.3.6.0), pbkrtest(v.0.4-7), rstudioapi(v.0.10), variancePartition(v.1.14.0), testthat(v.2.1.1), tibble(v.2.1.3), tweenr(v.1.0.1), 
stringi(v.1.4.3), ps(v.1.3.0), GenomicFeatures(v.1.36.3), desc(v.1.2.0), lattice(v.0.20-38), Matrix(v.1.2-17), nloptr(v.1.2.1), pillar(v.1.4.2), triebeard(v.0.3.0), data.table(v.1.12.2), cowplot(v.0.9.4), bitops(v.1.0-6), rtracklayer(v.1.44.0), GenomicRanges(v.1.36.0), qvalue(v.2.16.0), colorRamps(v.2.3), R6(v.2.4.0), KernSmooth(v.2.23-15), gridExtra(v.2.3), IRanges(v.2.18.1), sessioninfo(v.1.1.1), codetools(v.0.2-16), boot(v.1.3-22), MASS(v.7.3-51.4), gtools(v.3.8.1), assertthat(v.0.2.1), pkgload(v.1.0.2), SummarizedExperiment(v.1.14.0), rprojroot(v.1.3-2), withr(v.2.1.2), GenomicAlignments(v.1.20.1), Rsamtools(v.2.0.0), S4Vectors(v.0.22.0), GenomeInfoDbData(v.1.2.1), mgcv(v.1.8-28), hms(v.0.4.2), clusterProfiler(v.3.12.0), grid(v.3.6.0), tidyr(v.0.8.3), minqa(v.1.2.4), rvcheck(v.0.1.3), rmarkdown(v.1.13), ggforce(v.0.2.2) and base64enc(v.0.1-3)
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 8b0982a32ca67b6e0038facd2536a24e06bd4da8
## This is hpgltools commit: Fri Jun 21 10:35:35 2019 -0400: 8b0982a32ca67b6e0038facd2536a24e06bd4da8
# Compose the savefile name as <Rmd basename>-v<ver>.rda.xz and report it.
# rmd_file and ver are defined outside of this section.
# The pattern is anchored with $ so that only a trailing ".Rmd" extension is
# removed, and the argument is spelled out as `replacement` instead of relying
# on partial matching of `replace`.
this_save <- paste0(sub(pattern="\\.Rmd$", replacement="", x=rmd_file), "-v", ver, ".rda.xz")
# message() concatenates its arguments itself, so no paste0() is needed.
message("Saving to ", this_save)
## Saving to 01_annotation_20190701-v20190701.rda.xz