The following section loads the microbesonline and genbank annotations for Mycobacterium tuberculosis.
## Looks like it is taxon ID 83332
mtb_annotations <- as.data.frame(load_microbesonline_annotations(species="Mycobacterium tuberculosis H37Rv"))
## Found 1 entry.
## Genome Phylum Paper Loaded Complete
## 2178 Mycobacterium tuberculosis H37Rv Actinobacteria yes 2007-05-08 yes
## #Chr. #Plasmids #Genes tax_id
## 2178 1 0 4047 83332
## The species being downloaded is: Mycobacterium tuberculosis H37Rv
## Downloading: http://www.microbesonline.org/cgi-bin/genomeInfo.cgi?tId=83332;export=tab
knitr::kable(head(mtb_annotations))
locusId | accession | GI | scaffoldId | start | stop | strand | sysName | name | desc | COG | COGFun | COGDesc | TIGRFam | TIGRRoles | GO | EC | ECDesc |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
31772 | NP_214515.1 | 15607143 | 7022 | 1 | 1524 | + | Rv0001 | dnaA | chromosomal replication initiation protein (NCBI) | COG593 | L | ATPase involved in DNA replication initiation | TIGR00362 chromosomal replication initiator protein DnaA [dnaA] | DNA metabolism:DNA replication, recombination, and repair | GO:0006270,GO:0006275,GO:0003688,GO:0017111,GO:0005524 | NA | NA |
31773 | NP_214516.1 | 15607144 | 7022 | 2052 | 3260 | + | Rv0002 | dnaN | DNA polymerase III subunit beta (NCBI) | COG592 | L | DNA polymerase sliding clamp subunit (PCNA homolog) | TIGR00663 DNA polymerase III, beta subunit [dnaN] | DNA metabolism:DNA replication, recombination, and repair | GO:0006260,GO:0003677,GO:0003893,GO:0008408,GO:0016449,GO:0019984,GO:0003889,GO:0003894,GO:0015999,GO:0016450,GO:0003890,GO:0003895,GO:0016000,GO:0016451,GO:0003891,GO:0016448,GO:0016452 | 2.7.7.7 | DNA-directed DNA polymerase. |
31774 | NP_214517.1 | 15607145 | 7022 | 3280 | 4437 | + | Rv0003 | recF | recombination protein F (NCBI) | COG1195 | L | Recombinational DNA repair ATPase (RecF pathway) | TIGR00611 DNA replication and repair protein RecF [recF] | DNA metabolism:DNA replication, recombination, and repair | GO:0006281,GO:0005694,GO:0003697,GO:0005524 | NA | NA |
31775 | NP_214518.1 | 15607146 | 7022 | 4434 | 4997 | + | Rv0004 | Rv0004 | hypothetical protein (NCBI) | COG5512 | R | Zn-ribbon-containing, possibly RNA-binding protein and truncated derivatives | NA | NA | NA | NA | NA |
31776 | NP_214519.1 | 15607147 | 7022 | 5123 | 7267 | + | Rv0005 | gyrB | DNA topoisomerase IV subunit B (NCBI) | COG187 | L | Type IIA topoisomerase (DNA gyrase/topo II, topoisomerase IV), B subunit | TIGR01059 DNA gyrase, B subunit [gyrB] | DNA metabolism:DNA replication, recombination, and repair | GO:0006304,GO:0006265,GO:0005694,GO:0003918,GO:0005524 | 5.99.1.3 | DNA topoisomerase (ATP-hydrolyzing). |
31777 | NP_214520.1 | 15607148 | 7022 | 7302 | 9818 | + | Rv0006 | gyrA | DNA gyrase subunit A (NCBI) | COG188 | L | Type IIA topoisomerase (DNA gyrase/topo II, topoisomerase IV), A subunit | TIGR01063 DNA gyrase, A subunit [gyrA] | DNA metabolism:DNA replication, recombination, and repair | GO:0006265,GO:0006268,GO:0005694,GO:0003918,GO:0005509,GO:0005524 | 5.99.1.3 | DNA topoisomerase (ATP-hydrolyzing). |
mtb_go <- load_microbesonline_go(species="Mycobacterium tuberculosis H37Rv", id_column="sysName")
## Found 1 entry.
## Genome Phylum Paper Loaded Complete
## 2178 Mycobacterium tuberculosis H37Rv Actinobacteria yes 2007-05-08 yes
## #Chr. #Plasmids #Genes tax_id
## 2178 1 0 4047 83332
## The species being downloaded is: Mycobacterium tuberculosis H37Rv and is being downloaded as 83332.tab.
colnames(mtb_go) <- c("ID", "GO")
mtb_gff <- load_gff_annotations(gff="~/scratch/libraries/genome/mtuberculosis_h37rv.gff")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Returning a df with 15 columns and 4008 rows.
rownames(mtb_gff) <- mtb_gff[["locus_tag"]]
mtb_annot <- merge(mtb_gff, mtb_annotations, by.x="row.names", by.y="sysName", all.x=TRUE)
rownames(mtb_annot) <- mtb_annot[["Row.names"]]
mtb_annot[["Row.names"]] <- NULL
This is the group of samples which were collected by the Briken lab and previously analyzed by members of the El-Sayed lab.
local_expt <- create_expt(metadata="sample_sheets/local_samples.xlsx",
file_column="mtbfile",
gene_info=mtb_annot)
## Reading the sample metadata.
## The sample definitions comprises: 44 rows(samples) and 4 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0082/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0083/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0084/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0085/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0086/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0087/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0088/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0089/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0090/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0091/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0092/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0093/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0094/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0095/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0130/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0131/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0132/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0133/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0134/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0135/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0136/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0137/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0138/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0139/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0140/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0141/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0330/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0331/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0332/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0333/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0334/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0335/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0336/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0511/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0512/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0513/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0514/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0515/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0516/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0517/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0518/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0519/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0520/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/hpgl0521/outputs/hisat2_mtuberculosis_h37rv/r1.count_mtuberculosis_h37rv_sno_gene_locus_tag.count.xz contains 4013 rows and merges to 4013 rows.
## Finished reading count data.
## Matched 4008 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 4008 rows and 44 columns.
Najib and Volker would like to focus for the moment on only hpgl IDs: 130-132, 330-332.
few_expt <- subset_expt(local_expt, subset="condition=='in_vitro'|condition=='thp1'")
## Using a subset expression.
## There were 44, now there are 6 samples.
few_filt <- normalize_expt(few_expt, filter=TRUE)
## This function will replace the expt$expressionset slot with:
## cbcb(data)
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unconverted. It is often advisable to cpm/rpkm
## the data to normalize for sampling differences, keep in mind though that rpkm
## has some annoying biases, and voom() by default does a cpm (though hpgl_voom()
## will try to detect this).
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 0 low-count genes (4008 remaining).
## Step 2: not normalizing the data.
## Step 3: not converting the data.
## Step 4: not transforming the data.
## Step 5: not doing batch correction.
few_write <- write_expt(few_expt, excel="excel/few_written.xlsx")
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## varpart sees only 1 batch, adjusting the model accordingly.
## Attempting mixed linear model with: ~ (1|condition)
## Fitting the expressionset to the model, this is slow.
## Dividing work into 100 chunks...
##
## Total:17 s
## Placing factor: condition at the beginning of the model.
## Writing the normalized reads.
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Graphing the normalized reads.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## varpart sees only 1 batch, adjusting the model accordingly.
## Attempting mixed linear model with: ~ (1|condition)
## Fitting the expressionset to the model, this is slow.
## Dividing work into 100 chunks...
##
## Total:22 s
## Placing factor: condition at the beginning of the model.
## Writing the median reads by factor.
## Error in `colnames<-`(`*tmp*`, value = "cv_"): attempt to set 'colnames' on an object with less than two dimensions
few_de <- all_pairwise(few_filt)
## Plotting a PCA before surrogates/batch inclusion.
## Not putting labels on the plot.
## Using limma's removeBatchEffect to visualize with(out) batch inclusion.
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Error in density.default(x, adjust = adj) : 'x' contains missing values
## Not putting labels on the plot.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
few_table <- combine_de_tables(few_de, excel="excel/few_samples_table.xlsx")
## Deleting the file excel/few_samples_table.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on table 1/1: thp1_vs_in_vitro
## Adding venn plots for thp1_vs_in_vitro.
## Limma expression coefficients for thp1_vs_in_vitro; R^2: 0.612; equation: y = 0.671x + 1.72
## Deseq expression coefficients for thp1_vs_in_vitro; R^2: 0.641; equation: y = 0.831x + 0.579
## Edger expression coefficients for thp1_vs_in_vitro; R^2: 0.581; equation: y = 1.15x - 1.67
## Writing summary information, compare_plot is: TRUE.
## Performing save of excel/few_samples_table.xlsx.
few_sig <- extract_significant_genes(few_table,
excel="excel/few_samples_sig.xlsx")
## Writing a legend of columns.
## Printing significant genes to the file: excel/few_samples_sig.xlsx
## 1/1: Creating significant table up_limma_thp1_vs_in_vitro
## Printing significant genes to the file: excel/few_samples_sig.xlsx
## 1/1: Creating significant table up_edger_thp1_vs_in_vitro
## Printing significant genes to the file: excel/few_samples_sig.xlsx
## 1/1: Creating significant table up_deseq_thp1_vs_in_vitro
## Printing significant genes to the file: excel/few_samples_sig.xlsx
## 1/1: Creating significant table up_ebseq_thp1_vs_in_vitro
## Printing significant genes to the file: excel/few_samples_sig.xlsx
## 1/1: Creating significant table up_basic_thp1_vs_in_vitro
## Adding significance bar plots.
mtb_lengths <- mtb_annot[, c("seqnames", "width")]
mtb_lengths[["seqnames"]] <- rownames(mtb_lengths)
colnames(mtb_lengths) <- c("ID", "length")
up_genes <- few_sig[["deseq"]][["ups"]][[1]]
up_go <- simple_goseq(sig_genes=up_genes, go_db=mtb_go, length_db=mtb_lengths,
excel="excel/up_goseq.xlsx")
## Using the row names of your table.
## Found 210 genes out of 390 from the sig_genes in the go_db.
## Found 390 genes out of 390 from the sig_genes in the length_db.
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
## simple_goseq(): Calculating q-values
## simple_goseq(): Filling godata with terms, this is slow.
## Testing that go categories are defined.
## Removing undefined categories.
## Gathering synonyms.
## Gathering category definitions.
## simple_goseq(): Making pvalue plots for the ontologies.
## Writing data to: excel/up_goseq.xlsx.
down_genes <- few_sig[["deseq"]][["downs"]][[1]]
down <- rownames(down_genes)
down_go <- simple_goseq(sig_genes=down, go_db=mtb_go, length_db=mtb_lengths)
## Found 319 genes out of 457 from the sig_genes in the go_db.
## Found 457 genes out of 457 from the sig_genes in the length_db.
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
## simple_goseq(): Calculating q-values
## simple_goseq(): Filling godata with terms, this is slow.
## Testing that go categories are defined.
## Removing undefined categories.
## Gathering synonyms.
## Gathering category definitions.
## simple_goseq(): Making pvalue plots for the ontologies.
up_go$pvalue_plots[[1]]
down_go$pvalue_plots[[1]]
In this context, exogenous just means samples which were not created here. E.g. samples I downloaded from SRA.
exo_expt <- create_expt(metadata="sample_sheets/exo_samples.xlsx",
file_column="mtbfile",
gene_info=mtb_gff)
## Reading the sample metadata.
## Did not find the batch column in the sample sheet.
## Filling it in as undefined.
## The sample definitions comprises: 19 rows(samples) and 12 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /mnt/sshfs/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/mycobacterium_tuberculosis_2020/preprocessing/SRR9214125/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows.
## preprocessing/SRR9214126/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214127/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214128/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214129/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214130/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214131/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214132/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214133/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214134/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214135/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214136/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214137/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214138/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214139/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214140/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214141/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214142/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## preprocessing/SRR9214143/outputs/hisat2_mtuberculosis_h37rv/r1.count.xz contains 4013 rows and merges to 4013 rows.
## Finished reading count data.
## Warning in create_expt(metadata = "sample_sheets/exo_samples.xlsx", file_column
## = "mtbfile", : Even after changing the rownames in gene info, they do not match
## the count table.
## Even after changing the rownames in gene info, they do not match the count table.
## Here are the first few rownames from the count tables:
## GeneID:11117810, GeneID:11117811, GeneID:11117812, GeneID:14515843, GeneID:14515844, GeneID:14515845
## Here are the first few rownames from the gene information table:
## Rv0001, Rv0002, Rv0003, Rv0004, Rv0005, Rv0006
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 4008 rows and 19 columns.
The following blocks will plot and print a few common metrics of the new data.
exo_plots <- sm(graph_metrics(exo_expt))
exo_norm <- sm(normalize_expt(exo_expt, transform="log2", norm="quant", filter=TRUE))
exon_plots <- sm(graph_metrics(exo_norm))
raw_plots$libsize
## Error in eval(expr, envir, enclos): object 'raw_plots' not found
raw_plots$density
## Error in eval(expr, envir, enclos): object 'raw_plots' not found
tmp <- plot_density(mtb_expt)
## Error in plot_density(mtb_expt): object 'mtb_expt' not found
tn <- normalize_expt(mtb_expt, transform="log2")
## Error in normalize_expt(mtb_expt, transform = "log2"): object 'mtb_expt' not found
tnp <- plot_density(tn)
## Error in plot_density(tn): object 'tn' not found
tmp_ggstats <- ggstatsplot::ggbetweenstats(
data=tnp$table, x=sample, y=counts,
notch=TRUE, mean.ci=TRUE, k=3,
pairwise.comparisons=FALSE)
## Registered S3 methods overwritten by 'broom.mixed':
## method from
## augment.lme broom
## augment.merMod broom
## glance.lme broom
## glance.merMod broom
## glance.stanreg broom
## tidy.brmsfit broom
## tidy.gamlss broom
## tidy.lme broom
## tidy.merMod broom
## tidy.rjags broom
## tidy.stanfit broom
## tidy.stanreg broom
## Error: object 'bartlett_message' is not exported by 'namespace:ipmisc'
tmp_ggstats
## Error in eval(expr, envir, enclos): object 'tmp_ggstats' not found
tmp_ggstats <- ggstatsplot::grouped_ggbetweenstats(
grouping.var=condition,
data=tnp$table, x=sample, y=counts,
notch=TRUE, mean.ci=TRUE, k=3,
pairwise.comparisons=FALSE)
## Error: object 'bartlett_message' is not exported by 'namespace:ipmisc'
tmp_ggstats
## Error in eval(expr, envir, enclos): object 'tmp_ggstats' not found
## Quick PCA
exon_pc_expt <- normalize_expt(exo_expt, transform="log2", filter=TRUE, convert="cpm",
norm="quant", batch="svaseq")
## This function will replace the expt$expressionset slot with:
## log2(svaseq(cpm(quant(cbcb(data)))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Warning in normalize_expt(exo_expt, transform = "log2", filter = TRUE, convert =
## "cpm", : Quantile normalization and sva do not always play well together.
## Step 1: performing count filter with option: cbcb
## Removing 20 low-count genes (3988 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## Step 4: transforming the data with log2.
## transform_counts: Found 11 values equal to 0, adding 1 to the matrix.
## Step 5: doing batch correction with svaseq.
## Note to self: If you get an error like 'x contains missing values' The data has too many 0's and needs a stronger low-count filter applied.
## Passing off to all_adjusters.
## batch_counts: Before batch/surrogate estimation, 75205 entries are x>1: 99%.
## batch_counts: Before batch/surrogate estimation, 11 entries are x==0: 0%.
## batch_counts: Before batch/surrogate estimation, 556 entries are 0<x<1: 1%.
## The be method chose 2 surrogate variable(s).
## Attempting svaseq estimation with 2 surrogates.
## There are 13 (0%) elements which are < 0 after batch correction.
## Setting low elements to zero.
pp(file="images/exo_pc.png", image=plot_pca(exon_pc_expt)$plot)
## Writing the image to: images/exo_pc.png and calling dev.off().
pander::pander(sessionInfo())
R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=en_US.UTF-8, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C
attached base packages: grid, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: Rgraphviz(v.2.30.0), graph(v.1.64.0), SparseM(v.1.78), topGO(v.2.38.1), ruv(v.0.9.7.1), BiocParallel(v.1.20.1), variancePartition(v.1.16.1), hpgltools(v.1.0), Biobase(v.2.46.0) and BiocGenerics(v.0.32.0)
loaded via a namespace (and not attached): rappdirs(v.0.3.1), rtracklayer(v.1.46.0), R.methodsS3(v.1.8.0), coda(v.0.19-3), tidyr(v.1.1.0), ggplot2(v.3.3.1), acepack(v.1.4.1), bit64(v.0.9-7), knitr(v.1.28), DelayedArray(v.0.12.3), R.utils(v.2.9.2), data.table(v.1.12.8), rpart(v.4.1-15), RCurl(v.1.98-1.2), doParallel(v.1.0.15), generics(v.0.0.2), GenomicFeatures(v.1.38.2), preprocessCore(v.1.48.0), callr(v.3.4.3), cowplot(v.1.0.0), usethis(v.1.6.1), RSQLite(v.2.2.0), europepmc(v.0.4), correlation(v.0.2.1), bit(v.1.1-15.2), enrichplot(v.1.6.1), httpuv(v.1.5.4), xml2(v.1.3.2), SummarizedExperiment(v.1.16.1), assertthat(v.0.2.1), viridis(v.0.5.1), xfun(v.0.14), hms(v.0.5.3), promises(v.1.1.1), evaluate(v.0.14), DEoptimR(v.1.0-8), fansi(v.0.4.1), progress(v.1.2.2), caTools(v.1.18.0), dbplyr(v.1.4.4), igraph(v.1.2.5), DBI(v.1.1.0), geneplotter(v.1.64.0), htmlwidgets(v.1.5.1), stats4(v.3.6.1), purrr(v.0.3.4), ellipsis(v.0.3.1), selectr(v.0.4-2), dplyr(v.1.0.0), backports(v.1.1.7), insight(v.0.8.5), ggcorrplot(v.0.1.3), annotate(v.1.64.0), biomaRt(v.2.42.1), vctrs(v.0.3.1), remotes(v.2.1.1), withr(v.2.2.0), ggforce(v.0.3.1), triebeard(v.0.3.0), robustbase(v.0.93-6), checkmate(v.2.0.0), GenomicAlignments(v.1.22.1), prettyunits(v.1.1.1), cluster(v.2.1.0), DOSE(v.3.12.0), crayon(v.1.3.4), genefilter(v.1.68.0), edgeR(v.3.28.1), pkgconfig(v.2.0.3), labeling(v.0.3), tweenr(v.1.0.1), GenomeInfoDb(v.1.22.1), nlme(v.3.1-148), pkgload(v.1.1.0), nnet(v.7.3-14), devtools(v.2.3.0), rlang(v.0.4.6), miniUI(v.0.1.1.1), lifecycle(v.0.2.0), skimr(v.2.1.1), groupedstats(v.1.0.1), BiocFileCache(v.1.10.2), directlabels(v.2020.1.31), rprojroot(v.1.3-2), polyclip(v.1.10-0), broomExtra(v.4.0.2), matrixStats(v.0.56.0), Matrix(v.1.2-18), urltools(v.1.7.3), boot(v.1.3-25), base64enc(v.0.1-3), geneLenDataBase(v.1.22.0), ggridges(v.0.5.2), processx(v.3.4.2), png(v.0.1-7), viridisLite(v.0.3.0), parameters(v.0.8.0), bitops(v.1.0-6), R.oo(v.1.23.0), KernSmooth(v.2.23-17), pander(v.0.6.3), ggExtra(v.0.9), Biostrings(v.2.54.0), blob(v.1.2.1), stringr(v.1.4.0), qvalue(v.2.18.0), readr(v.1.3.1), jpeg(v.0.1-8.1), gridGraphics(v.0.5-0), ggsignif(v.0.6.0), S4Vectors(v.0.24.4), scales(v.1.1.1), memoise(v.1.1.0), magrittr(v.1.5), plyr(v.1.8.6), gplots(v.3.0.3), gdata(v.2.18.0), zlibbioc(v.1.32.0), compiler(v.3.6.1), RColorBrewer(v.1.1-2), lme4(v.1.1-23), DESeq2(v.1.26.0), Rsamtools(v.2.2.3), cli(v.2.0.2), XVector(v.0.26.0), TMB(v.1.7.16), ps(v.1.3.3), htmlTable(v.1.13.3), Formula(v.1.2-3), MASS(v.7.3-51.6), mgcv(v.1.8-31), tidyselect(v.1.1.0), forcats(v.0.5.0), stringi(v.1.4.6), highr(v.0.8), yaml(v.2.2.1), GOSemSim(v.2.12.1), askpass(v.1.1), locfit(v.1.5-9.4), latticeExtra(v.0.6-29), ggrepel(v.0.8.2), fastmatch(v.1.1-0), tools(v.3.6.1), rstudioapi(v.0.11), foreach(v.1.5.0), foreign(v.0.8-76), ipmisc(v.3.0.0), gridExtra(v.2.3), farver(v.2.0.3), Rtsne(v.0.15), ggraph(v.2.0.3), digest(v.0.6.25), rvcheck(v.0.1.8), BiocManager(v.1.30.10), shiny(v.1.4.0.2), quadprog(v.1.5-8), Rcpp(v.1.0.4.6), broom(v.0.5.6), GenomicRanges(v.1.38.0), later(v.1.1.0.1), performance(v.0.4.6), httr(v.1.4.1), AnnotationDbi(v.1.48.0), effectsize(v.0.3.1), colorspace(v.1.4-1), rvest(v.0.3.5), XML(v.3.99-0.3), fs(v.1.4.1), IRanges(v.2.20.2), splines(v.3.6.1), RBGL(v.1.62.1), statmod(v.1.4.34), graphlayouts(v.0.7.0), ggplotify(v.0.0.5), sessioninfo(v.1.1.1), xtable(v.1.8-4), jsonlite(v.1.6.1), nloptr(v.1.2.2.1), tidygraph(v.1.2.0), corpcor(v.1.6.9), zeallot(v.0.1.0), testthat(v.2.3.2), R6(v.2.4.1), Vennerable(v.3.1.0.9000), broom.mixed(v.0.2.6), Hmisc(v.4.4-0), mime(v.0.9), pillar(v.1.4.4), htmltools(v.0.4.0), fastmap(v.1.0.1), glue(v.1.4.1), minqa(v.1.2.4), clusterProfiler(v.3.14.3), codetools(v.0.2-16), fgsea(v.1.12.0), pkgbuild(v.1.0.8), lattice(v.0.20-41), tibble(v.3.0.1), sva(v.3.34.0), pbkrtest(v.0.4-8.6), BiasedUrn(v.1.07), curl(v.4.3), colorRamps(v.2.3), gtools(v.3.8.2), zip(v.2.0.4), GO.db(v.3.10.0), openxlsx(v.4.1.5), openssl(v.1.4.1), survival(v.3.1-12), limma(v.3.42.2), repr(v.1.1.0), rmarkdown(v.2.2), desc(v.1.2.0), munsell(v.0.5.0), DO.db(v.2.9), fastcluster(v.1.1.25), GenomeInfoDbData(v.1.2.2), goseq(v.1.38.0), iterators(v.1.0.12), haven(v.2.3.1), reshape2(v.1.4.4), gtable(v.0.3.0) and bayestestR(v.0.6.0)
message(paste0("This is hpgltools commit: ", get_git_commit()))
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset 5f94c6aed834674e1567e1992ac0bdefab60ead1
## This is hpgltools commit: Tue Jun 9 09:50:10 2020 -0400: 5f94c6aed834674e1567e1992ac0bdefab60ead1
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
## Saving to 01_mtb_analyses_20200320-v20200320.rda.xz
tmp <- sm(saveme(filename=this_save))