I am using a concatenation of Esmeraldo, Non-Esmeraldo, and the Unassigned.
load_gff_annotations("reference/tcruzi_clbrener_v37.gff") tc_annot <-
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Returning a df with 15 columns and 99449 rows.
library(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
##
library(org.Tcruzi.CL.Brener.Non.Esmeraldo.like.v50.eg.db)
##
library(org.Tcruzi.CL.Brener.v50.eg.db)
##
grepl(pattern = "^ANNOT_",
annot_column_idx <-x = keytypes(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db))
keytypes(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db)[annot_column_idx]
annot_columns <- load_orgdb_annotations("org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db",
esmer_org <-keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
load_orgdb_annotations("org.Tcruzi.CL.Brener.Non.Esmeraldo.like.v50.eg.db",
nonesmer_org <-keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
load_orgdb_annotations("org.Tcruzi.CL.Brener.v50.eg.db",
unas_org <-keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Some requested columns are not available: ANNOT_GENE_ENTREZ_LINK.
## The following are available: ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK_DISPLAYTEXT, ANNOT_GENE_ENTREZ_LINK_URL, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL, CHR_ID, EVIDENCE, EVIDENCEALL, GENE_TYPE, GID, GO, GOALL, GODB_EVIDENCE_CODE, GODB_GO_ID, GODB_GO_TERM_NAME, GODB_IS_NOT, GODB_ONTOLOGY, GODB_REFERENCE, GODB_SORT_KEY, GODB_SOURCE, GODB_SUPPORT_FOR_EVIDENCE_CODE_ASSIGNMENT, GODB_TRANSCRIPT_ID_S, GOSLIM_GO_ID, GOSLIM_GO_SLIM_TERM_NAME, GOSLIM_GO_TERM_NAME, GOSLIM_IS_NOT, GOSLIM_ONTOLOGY, GOSLIM_SLIM_GO_ID, INTERPRO_DESCRIPTION, INTERPRO_E_VALUE, INTERPRO_END_MIN, INTERPRO_ID, INTERPRO_NAME, INTERPRO_PRIMARY_ID, INTERPRO_SECONDARY_ID, INTERPRO_START_MIN, INTERPRO_TRANSCRIPT_ID_S, LINKOUT_DATABASE, LINKOUT_EXT_ID, LINKOUT_LINK_URL, LINKOUT_SOURCE_ID, ONTOLOGY, ONTOLOGYALL, ORTHOLOGS_COUNT, ORTHOLOGS_GID, ORTHOLOGS_ORGANISM, ORTHOLOGS_PRODUCT, ORTHOLOGS_SYNTENIC, PATHWAY_EC_NUMBER_MATCHED_IN_PATHWAY, PATHWAY_EXACT_EC_NUMBER_MATCH, PATHWAY_EXPASY_URL, PATHWAY_ID, PATHWAY_SOURCE, PATHWAY_SOURCE_ID, PATHWAY_X_REACTIONS_MATCHING_EC_NUMBER, PDB_P_VALUE, PDB_PDB_ID, PDB_PDB_MOLECULAR_DESCRIPTION, PDB_PDB_STRUCTURE, PDB_PDB_URL, PDB_PVALUE_EXP, PDB_PVALUE_MANT, PDB_TAXON, PDB_TRANSCRIPT_ID, PDB_X_IDENTITY, PDB_X_OF_VEUPATHDB_PROTEIN_COVERED, PUBMED_AUTHORS, PUBMED_DOI, PUBMED_ID, PUBMED_TITLE
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
esmer_org[["genes"]]
clb_annot <- rbind(clb_annot, nonesmer_org[["genes"]])
clb_annot <- unas_org[["genes"]]
unas_annot <-"annot_gene_entrez_link"]] <- ""
unas_annot[[ rbind(clb_annot, unas_annot)
clb_annot <-colnames(clb_annot) <- gsub(pattern = "^ANNOT_", replacement = "", x = colnames(clb_annot))
c("annot_gene_name", "annot_so_term_name", "annot_gene_location_text", "annot_gene_product",
wanted_columns <-"annot_pfam_description", "annot_pseudo_string", "annot_transcript_product")
clb_annot[, wanted_columns] sub_annot <-
create_expt("sample_sheets/all_samples.xlsx",
tc_hisat <-gene_info = sub_annot,
file_column = "tcruzihisatfile")
## Reading the sample metadata.
## The sample definitions comprises: 8 rows(samples) and 40 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /mnt/sshfs_10186/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/tcruzi_fernanda_2021/preprocessing/hpgl1216/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows.
## preprocessing/hpgl1217/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1218/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1219/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1220/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1221/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1222/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1223/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## Finished reading count data.
## Matched 25099 annotations and counts.
## Bringing together the count matrix and gene information.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 25099 rows and 8 columns.
exclude_genes_expt(tc_hisat, column = "annot_so_term_name",
tc_hisat_mrna <-method = "keep", patterns = "protein_coding")
## Before removal, there were 25099 entries.
## Now there are 23304 entries.
## Percent kept: 19.471, 21.623, 13.559, 11.884, 11.664, 13.531, 14.286, 22.946
## Percent removed: 80.529, 78.377, 86.441, 88.116, 88.336, 86.469, 85.714, 77.054
semantic_expt_filter(tc_hisat_mrna,
tc_hisat_mrna_semantic <-semantic_column = "annot_transcript_product")
## Hit 927 genes for term mucin.
## Now the matrix has 22377 elements.
## Hit 1523 genes for term sialidase.
## Now the matrix has 20854 elements.
## Hit 752 genes for term RHS.
## Now the matrix has 20102 elements.
## Hit 1331 genes for term MASP.
## Now the matrix has 18771 elements.
## Hit 564 genes for term DGF.
## Now the matrix has 18207 elements.
## Hit 424 genes for term GP63.
## Now the matrix has 17783 elements.
pp(file = "images/fm_libsize.png")
## Going to write the image to: images/fm_libsize.png when dev.off() is called.
plot_libsize(tc_hisat)$plot
dev.off()
## png
## 2
plot_libsize(tc_hisat_mrna_semantic)$plot
normalize_expt(tc_hisat_mrna, filter = "simple", convert = "cpm",
tc_first <-transform = "log2", norm = "quant")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(simple(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: simple
## Removing 1819 low-count genes (21485 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## The method is: raw.
## Step 4: not doing batch correction.
## Step 4: transforming the data with log2.
## transform_counts: Found 9946 values equal to 0, adding 1 to the matrix.
normalize_expt(tc_hisat_mrna_semantic, filter = "simple", convert = "cpm",
tc_second <-norm = "quant", transform = "log2")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(simple(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: simple
## Removing 978 low-count genes (16805 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## The method is: raw.
## Step 4: not doing batch correction.
## Step 4: transforming the data with log2.
## transform_counts: Found 5019 values equal to 0, adding 1 to the matrix.
pp(file = "images/initial_pca.png")
## Going to write the image to: images/initial_pca.png when dev.off() is called.
plot_pca(tc_first)$plot
dev.off()
## png
## 2
plot_pca(tc_second)$plot
subset_expt(tc_hisat_mrna_semantic,
tc_stationary <-subset="condition=='station_wt'|condition=='station_wt'")
## Using a subset expression.
## There were 8, now there are 2 samples.
normalize_expt(tc_stationary, filter = TRUE, norm = "quant",
tc_station_norm <-convert = "cpm", transform = "log2")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: cbcb
## Removing 4475 low-count genes (13308 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## The method is: raw.
## Step 4: not doing batch correction.
## Step 4: transforming the data with log2.
plot_pca(tc_station_norm)$plot
## Error in pc_table[, y_pc]: subscript out of bounds
If I heard correctly, the goal is to subtract the wt from the tagged samples. So, contrasts should take this into account.
list(
keepers <-"wt" = c("station_wt", "log_wt"),
"tagged" = c("station_tag", "log_tag"),
"log" = c("log_tag", "log_wt"),
"station" = c("station_tag", "station_wt"))
list(
extra <-"all" = c("station", "log"))
all_pairwise(tc_hisat_mrna_semantic, model_batch = FALSE, filter = TRUE) pairs <-
## Plotting a PCA before surrogate/batch inclusion.
## Not putting labels on the PC plot.
## Assuming no batch in model for testing pca.
## Not putting labels on the PC plot.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(pairs, keepers=keepers, excel="excel/test_tables.xlsx") tables <-
## Deleting the file excel/test_tables.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on 1/4: wt which is: station_wt/log_wt.
## Found table with station_wt_vs_log_wt
## Working on 2/4: tagged which is: station_tag/log_tag.
## Found table with station_tag_vs_log_tag
## Working on 3/4: log which is: log_tag/log_wt.
## Found inverse table with log_wt_vs_log_tag
## Working on 4/4: station which is: station_tag/station_wt.
## Found inverse table with station_wt_vs_station_tag
## Adding venn plots for wt.
## Limma expression coefficients for wt; R^2: 0.71; equation: y = 0.85x + 0.717
## Deseq expression coefficients for wt; R^2: 0.683; equation: y = 0.68x + 1.46
## Edger expression coefficients for wt; R^2: 0.672; equation: y = 0.646x + 1.9
## Adding venn plots for tagged.
## Limma expression coefficients for tagged; R^2: 0.698; equation: y = 0.835x + 0.797
## Deseq expression coefficients for tagged; R^2: 0.638; equation: y = 0.66x + 1.55
## Edger expression coefficients for tagged; R^2: 0.635; equation: y = 0.625x + 1.99
## Adding venn plots for log.
## Limma expression coefficients for log; R^2: 0.811; equation: y = 0.89x + 0.54
## Deseq expression coefficients for log; R^2: 0.801; equation: y = 0.872x + 0.557
## Edger expression coefficients for log; R^2: 0.804; equation: y = 0.875x + 0.634
## Adding venn plots for station.
## Limma expression coefficients for station; R^2: 0.757; equation: y = 0.864x + 0.668
## Deseq expression coefficients for station; R^2: 0.727; equation: y = 0.843x + 0.607
## Edger expression coefficients for station; R^2: 0.721; equation: y = 0.821x + 0.895
## Writing summary information, compare_plot is: TRUE.
## Performing save of excel/test_tables.xlsx.
I am thinking we should do a very-literal subtraction of the cpm counts.
data.frame(row.names=c("log1", "log3", "station1", "station2"))
new_meta <-"numerator"]] <- c("tag_log_r1", "tag_log_r2", "tag_station_r3", "tag_station_r4")
new_meta[["denominator"]] <- c("wt_log_r1", "wt_log_r2", "wt_station_r3", "wt_station_r4")
new_meta[["condition"]] <- c("log", "log", "station", "station")
new_meta[["batch"]] <- c("b1", "b2", "b3", "b4")
new_meta[[ subtract_expt(tc_hisat_mrna_semantic, new_meta = new_meta, handle_negative = "zero",
subtracted_expt <-sample_column = "sample")
## This function will replace the expt$expressionset slot with:
## cpm(data)
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Filter is false, this should likely be set to something, good
## choices include cbcb, kofa, pofa (anything but FALSE). If you want this to
## stay FALSE, keep in mind that if other normalizations are performed, then the
## resulting libsizes are likely to be strange (potentially negative!)
## Leaving the data in its current base format, keep in mind that
## some metrics are easier to see when the data is log2 transformed, but
## EdgeR/DESeq do not accept transformed data.
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: not doing count filtering.
## Step 2: not normalizing the data.
## Step 3: converting the data with cpm.
## The method is: raw.
## Step 4: not doing batch correction.
## Step 4: not transforming the data.
## Error in subtract_expt(tc_hisat_mrna_semantic, new_meta = new_meta, handle_negative = "zero", : object 'subtractions' not found
write_expt(subtracted_expt, excel="excel/subtracted_testing.xlsx") printed <-
## Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'exprs': object 'subtracted_expt' not found
::pander(sessionInfo())
pandermessage(paste0("This is hpgltools commit: ", get_git_commit()))
paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
this_save <-message(paste0("Saving to ", this_save))
sm(saveme(filename=this_save)) tmp <-
loadme(filename=this_save)