There are several ways to import annotation data into R; a few of them are shown below.
## Convert the GenBank record for accession CP008776 into a collection of
## annotation objects; the summary() call at the end of this chunk lists the
## elements which come back.  sm() presumably silences the conversion's
## messages -- TODO confirm.
mgas_data <- sm(gbk2txdb(accession = "CP008776"))
## Width of the sequence(s) held in the record's seq element.
## NOTE(review): the original author flagged that this call may fail on travis.
genome_size <- GenomicRanges::width(mgas_data$seq)
## Flatten the CDS ranges into a data frame, one row per range.
mgas_cds <- as.data.frame(mgas_data$cds)
## Get rid of the amino acid sequence and the EC_number column.  They are
## removed by name rather than by position: positional indices shift after the
## first deletion, so dropping column 15 and then column 16 removes the
## original columns 15 and 17, which need not be the two columns intended.
## Assigning NULL to a column which does not exist is a harmless no-op.
mgas_cds[["translation"]] <- NULL
mgas_cds[["EC_number"]] <- NULL
## Index the annotations by locus tag.
rownames(mgas_cds) <- mgas_cds[["locus_tag"]]
summary(mgas_data)
## Length Class Mode
## others 82 GRanges S4
## exons 1723 GRanges S4
## cds 1723 GRanges S4
## intergenic 1496 GRanges S4
## genes 1814 GRanges S4
## txdb 1 TxDb S4
## seq 1 DNAStringSet S4
## Look up the entries matching "5005"; sm() presumably silences the chatter
## from the lookup -- TODO confirm.
microbe_ids <- sm(get_microbesonline_ids("5005"))
## Request the annotations for the taxonomy id(s) returned by the lookup.
mgas_df <- get_microbesonline_annotation(microbe_ids[["taxonomyId"]])
## $`Streptococcus pyogenes MGAS5005`
## [1] 293653
## Only the first element of the returned collection is kept.
mgas_df <- mgas_df[[1L]]
In contrast, it is possible to load most annotations of interest directly from the gff files used in the alignments. More in-depth information for the human transcriptome may be extracted from biomart.
## The old way of getting genome/annotation data: read it from local
## reference files.  Both paths are relative to the working directory.
sp_gff <- "reference/mgas_5448.gff"
## The fasta path is recorded next to the gff; it is not used in this chunk.
sp_fasta <- "reference/mgas_5448.fasta"
## Load the gff entries of type "gene".
## NOTE(review): gff2df() is defined outside this file; its name and the
## logged output suggest it returns a data frame -- confirm.
sp_annotations <- gff2df(sp_gff, type="gene")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo=FALSE)
## Returning a df with 14 columns and 1814 rows.
Biomart does not yet work with bacteria, but there is a REST API which might work; this should be looked into as necessary.
We have some annotations and will now create an expt containing the annotations and count data.
## Build the experiment from the sample sheet, attaching the CDS annotations
## gathered above as the per-gene information.
sp_expt <- sm(
  create_expt(
    metadata = "sample_sheets/rnaseq_tnseq_samples.xlsx",
    gene_info = mgas_cds
  )
)
## Peek at the feature annotations stored in the resulting expressionset.
head(Biobase::fData(sp_expt[["expressionset"]]))
## seqnames start end width strand type locus_tag
## SP5448_00005 5448 232 1587 1356 + CDS SP5448_00005
## SP5448_00010 5448 1742 2878 1137 + CDS SP5448_00010
## SP5448_00015 5448 2953 3150 198 + CDS SP5448_00015
## SP5448_00020 5448 3480 4595 1116 + CDS SP5448_00020
## SP5448_00025 5448 4665 5234 570 + CDS SP5448_00025
## SP5448_00030 5448 5237 8740 3504 + CDS SP5448_00030
## inference
## SP5448_00005 EXISTENCE: similar to AAsequence:SwissProt:C0M7C0.1
## SP5448_00010 EXISTENCE: similar to AAsequence:RefSeq:WP_002981935.1
## SP5448_00015 EXISTENCE: similar to AAsequence:RefSeq:WP_003048966.1
## SP5448_00020 EXISTENCE: similar to AAsequence:RefSeq:WP_000244522.1
## SP5448_00025 EXISTENCE: similar to AAsequence:SwissProt:P0DD50.1
## SP5448_00030 EXISTENCE: similar to AAsequence:RefSeq:WP_010921764.1
## note
## SP5448_00005 Derived by automated computational analysis usinggene prediction method: Protein Homology.
## SP5448_00010 Derived by automated computational analysis usinggene prediction method: Protein Homology.
## SP5448_00015 Derived by automated computational analysis usinggene prediction method: Protein Homology.
## SP5448_00020 Derived by automated computational analysis usinggene prediction method: Protein Homology.
## SP5448_00025 Derived by automated computational analysis usinggene prediction method: Protein Homology.
## SP5448_00030 Derived by automated computational analysis usinggene prediction method: Protein Homology.
## codon_start transl_table product
## SP5448_00005 1 11 chromosomal replication initiator protein DnaA
## SP5448_00010 1 11 DNA polymerase III subunit beta
## SP5448_00015 1 11 hypothetical protein
## SP5448_00020 1 11 GTP-binding protein
## SP5448_00025 1 11 peptidyl-tRNA hydrolase
## SP5448_00030 1 11 transcription-repair coupling factor
## protein_id
## SP5448_00005 AKK69518.1
## SP5448_00010 AKK69519.1
## SP5448_00015 AKK69520.1
## SP5448_00020 AKK69521.1
## SP5448_00025 AKK69522.1
## SP5448_00030 AKK69523.1
## translation
## SP5448_00005 MTENEQIFWNRVLELAQSQLKQATYEFFVHDARLLKVDKHIATIYLDQMKELFWEKNLKDVILTAGFEVYNAQISVDYVFEEDLMIEQNQTKINQKPKQQALNSLPTVTSDLNSKYSFENFIQGDENRWAVAASIAVANTPGTTYNPLFIWGGPGLGKTHLLNAIGNSVLLENPNARIKYITAENFINEFVIHIRLDTMDELKEKFRNLDLLLIDDIQSLAKKTLSGTQEEFFNTFNALHNNNKQIVLTSDRTPDHLNDLEDRLVTRFKWGLTVNITPPDFETRVAILTNKIQEYNFIFPQDTIEYLAGQFDSNVRDLEGALKDISLVANFKQIDTITVDIAAEAIRARKQDGPKMTVIPIEEIQAQVGKFYGVTVKEIKATKRTQNIVLARQVAMFLAREMTDNSLPKIGKEFGGRDHSTVLHAYNKIKNMISQDESLRIEIETIKNKIK
## SP5448_00010 MIQFSINRTLFIHALNTTKRAISTKNAIPILSSIKIEVTSTGVTLTGSNGQISIENTIPVSNENAGLLITSPGAILLEASFFINIISSLPDISINVKEIEQHQVVLTSGKSEITLKGKDVDQYPRLQEVSTENPLILKTKLLKSIIAETAFAASLQESRPILTGVHIVLSNHKDFKAVATDSHRMSQRLITLDNTSADFDVVIPSKSLREFSAVFTDDIETVEVFFSPSQILFRSEHISFYTRLLEGNYPDTDRLLMTEFETEVVFNTQSLRHAMERAFLISNATQNGTVKLEITQNHISAHVNSPEVGKVNEDLDIVSQSGSDLTISFNPTYLIESLKAIKSETVKIHFLSPVRPFTLTPGDEEESFIQLITPVRTN
## SP5448_00015 MYQIGSFVEMKKPHACVIKETGKKANQWKVLRVGADIKIQCTNCQHVIMMSRYDFERKLKKVLQP
## SP5448_00020 MALTAGIVGLPNVGKSTLFNAITKAGAEAANYPFATIDPNVGMVEVPDERLQKLTELITPKKTVPTTFEFTDIAGIVKGASRGEGLGNKFLANIREIDAIVHVVRAFDDENVMREQGREDAFVDPIADIDTINLELILADLESINKRYARVEKMARTQKDKESVAEFNVLQKIKPVLEDGKSARTIEFTEDEAKVVKGLFLLTTKPVLYVANVDEDKVANPDGIDYVKQIRDFAATENAEVVVISARAEEEISELDDEDKEEFLEAIGLTESGVDKLTRAAYHLLGLGTYFTAGEKEVRAWTFKRGIKAPQAAGIIHSDFERGFIRAVTMSYDDLMTYGSEKAVKEAGRLREEGKEYVVQDGDIMEFRFNV
## SP5448_00025 MVKMIVGLGNPGSKYEKTKHNIGFMAIDNIVKNLDVTFTDDKNFKAQIGSTFINHEKVYFVKPTTFMNNSGIAVKALLTYYNIDITDLIVIYDDLDMEVSKLRLRSKGSAGGHNGIKSIIAHIGTQEFNRIKVGIGRPLKGMTVINHVMGQFNTEDNIAISLTLDRVVNAVKFYLQENDFEKTMQKFNG
## SP5448_00030 MDILELFSQNKKVQSWHSGLTTLGRQLVMGLSGSSKTLAIASAYLDDQKKIVVVTSTQNEVEKLASDLSSLLDEELVFQFFADDVAAAEFIFASMDKALSRIETLQFLRNPKSQGVLIVSLSGLRILLPNPDVFTKSQIQLTVGEDYDSDTLTKQLMTIGYQKVSQVISPGEFSRRGDILDIYEITQELPYRLEFFGDDIDSIRQFHPETQKSFEQLEGIFINPASDLIFEVSDFQRGIEQLEKALQTAQDDKKSYLEDVLAVSKNGFKHKDIRKFQSLFYEKEWSLLDYIPKGTPIFFDDFQKLVDKNARFDLEIANLLTEDLQQGKALSNLNYFTDNYRELRHYKPATFFSNFHKGLGNIKFDQMHQLTQYAMQEFFNQFPLLIDEIKRYQKNQTTVIVQVESQYAYERLEKSFQDYQFRLPLVSANQIVSRESQIVIGAISSGFYFADEKLALITEHEIYHKKIKRRARRSNISNAERLKDYNELAVGDYVVHNVHGIGRFLGIETIQIQGIHRDYVTIQYQNSDRISLPIDQISSLSKYVSADGKEPKINKLNDGRFQKTKQKVARQVEDIADDLLKLYAERSQQKGFSFSPDDDLQRAFDDDFAFVETEDQLRSIKEIKADMESMQPMDRLLVGDVGFGKTEVAMRAAFKAVNDHKQVAVLVPTTVLAQQHYENFKARFENYPVEVDVLSRFRSKKEQAETLERVRKGQIDIIIGTHRLLSKDVVFSDLGLIVIDEEQRFGVKHKETLKELKTKVDVLTLTATPIPRTLHMSMLGIRDLSVIETPPTNRYPVQTYVLENNPGLVREAIIREMDRGGQIFYVYNKVDTIEKKVAELQELVPEASIGFVHGQMSEIQLENTLIDFINGDYDVLVATTIIETGVDISNVNTLFIENADHMGLSTLYQLRGRVGRSNRIAYAYLMYRPDKVLTEVSEKRLEAIKGFTELGSGFKIAMRDLSIRGAGNILGASQSGFIDSVGFEMYSQLLEQAIASKQGKTTVRQKGNTEINLQIDAYLPDDYIADERQKIDIYKRIREIQSREDYLNLQDELMDRFGEYPDQVAYLLEIALLKHYMDNAFAELVERKNNQVIVRFEVTSLTYFLTQDYFEALSKTHLKAKISEHQGKIDIVFDVRHQKDYRILEELMLFGERLSEIKIRKNNSVFK
## EC_number gene_synonym gene_id transcript_id
## SP5448_00005 character(0) character(0) SP5448_00005 SP5448_00005.1
## SP5448_00010 character(0) character(0) SP5448_00010 SP5448_00010.1
## SP5448_00015 character(0) character(0) SP5448_00015 SP5448_00015.1
## SP5448_00020 character(0) character(0) SP5448_00020 SP5448_00020.1
## SP5448_00025 character(0) character(0) SP5448_00025 SP5448_00025.1
## SP5448_00030 character(0) character(0) SP5448_00030 SP5448_00030.1
## Carve the full experiment into the subsets which are analysed separately.
## Each call hands expt_subset() a filter expression as a string; presumably it
## is evaluated against the columns of the sample sheet -- TODO confirm.
## RNASeq samples, leaving out batch 'z'.
thyexpress <- expt_subset(expt=sp_expt, subset="type=='RNASeq'&batch!='z'")
## Samples from the four conditions thyd0 through thyd3 (no batch restriction).
thytnseq <- expt_subset(expt=sp_expt, subset="condition=='thyd0'|condition=='thyd1'|condition=='thyd2'|condition=='thyd3'")
## RNASeq samples together with the thyd0-thyd3 samples, all without batch 'z'.
thyrnatn <- expt_subset(expt=sp_expt, subset="batch!='z'&(type=='RNASeq'|condition=='thyd0'|condition=='thyd1'|condition=='thyd2'|condition=='thyd3')")
## Samples whose media is 'mouse_skin', plus the 'subcut0' condition.
subcu <- expt_subset(expt=sp_expt, subset="media=='mouse_skin'|condition=='subcut0'")
## Batches k and l.
pmnexp1 <- expt_subset(expt=sp_expt, subset="batch=='k'|batch=='l'")
## Batches m, n and o.
pmnexp2 <- expt_subset(expt=sp_expt, subset="batch=='m'|batch=='n'|batch=='o'")
## Samples whose media is 'rabbit_abscess'.
abscess <- expt_subset(expt=sp_expt, subset="media=='rabbit_abscess'")
## Batch 'z' only, i.e. the batch excluded from thyexpress and thyrnatn.
rpoe <- expt_subset(expt=sp_expt, subset="batch=='z'")
## Selections by experiment name.
cpsy <- expt_subset(expt=sp_expt, subset="experiment=='rnaseq_cpsy'")
new_pmn <- expt_subset(expt=sp_expt, subset="experiment=='pmn_infection'")
## NOTE(review): this_save is not defined in this part of the file; saveme()
## presumably writes the current session to that filename -- confirm.
tmp <- sm(saveme(filename=this_save))