I am using a concatenation of Esmeraldo, Non-Esmeraldo, and the Unassigned.
load_gff_annotations("reference/tcruzi_clbrener_v37.gff") tc_annot <-
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Returning a df with 15 columns and 99449 rows.
library(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
##
library(org.Tcruzi.CL.Brener.Non.Esmeraldo.like.v50.eg.db)
##
library(org.Tcruzi.CL.Brener.v50.eg.db)
##
grepl(pattern = "^ANNOT_",
annot_column_idx <-x = keytypes(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db))
keytypes(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db)[annot_column_idx]
annot_columns <- load_orgdb_annotations("org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db",
esmer_org <-keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
load_orgdb_annotations("org.Tcruzi.CL.Brener.Non.Esmeraldo.like.v50.eg.db",
nonesmer_org <-keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
load_orgdb_annotations("org.Tcruzi.CL.Brener.v50.eg.db",
unas_org <-keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Some requested columns are not available: ANNOT_GENE_ENTREZ_LINK.
## The following are available: ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK_DISPLAYTEXT, ANNOT_GENE_ENTREZ_LINK_URL, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL, CHR_ID, EVIDENCE, EVIDENCEALL, GENE_TYPE, GID, GO, GOALL, GODB_EVIDENCE_CODE, GODB_GO_ID, GODB_GO_TERM_NAME, GODB_IS_NOT, GODB_ONTOLOGY, GODB_REFERENCE, GODB_SORT_KEY, GODB_SOURCE, GODB_SUPPORT_FOR_EVIDENCE_CODE_ASSIGNMENT, GODB_TRANSCRIPT_ID_S, GOSLIM_GO_ID, GOSLIM_GO_SLIM_TERM_NAME, GOSLIM_GO_TERM_NAME, GOSLIM_IS_NOT, GOSLIM_ONTOLOGY, GOSLIM_SLIM_GO_ID, INTERPRO_DESCRIPTION, INTERPRO_E_VALUE, INTERPRO_END_MIN, INTERPRO_ID, INTERPRO_NAME, INTERPRO_PRIMARY_ID, INTERPRO_SECONDARY_ID, INTERPRO_START_MIN, INTERPRO_TRANSCRIPT_ID_S, LINKOUT_DATABASE, LINKOUT_EXT_ID, LINKOUT_LINK_URL, LINKOUT_SOURCE_ID, ONTOLOGY, ONTOLOGYALL, ORTHOLOGS_COUNT, ORTHOLOGS_GID, ORTHOLOGS_ORGANISM, ORTHOLOGS_PRODUCT, ORTHOLOGS_SYNTENIC, PATHWAY_EC_NUMBER_MATCHED_IN_PATHWAY, PATHWAY_EXACT_EC_NUMBER_MATCH, PATHWAY_EXPASY_URL, PATHWAY_ID, PATHWAY_SOURCE, PATHWAY_SOURCE_ID, PATHWAY_X_REACTIONS_MATCHING_EC_NUMBER, PDB_P_VALUE, PDB_PDB_ID, PDB_PDB_MOLECULAR_DESCRIPTION, PDB_PDB_STRUCTURE, PDB_PDB_URL, PDB_PVALUE_EXP, PDB_PVALUE_MANT, PDB_TAXON, PDB_TRANSCRIPT_ID, PDB_X_IDENTITY, PDB_X_OF_VEUPATHDB_PROTEIN_COVERED, PUBMED_AUTHORS, PUBMED_DOI, PUBMED_ID, PUBMED_TITLE
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
esmer_org[["genes"]]
clb_annot <- rbind(clb_annot, nonesmer_org[["genes"]])
clb_annot <- unas_org[["genes"]]
unas_annot <-"annot_gene_entrez_link"]] <- ""
unas_annot[[ rbind(clb_annot, unas_annot)
clb_annot <-colnames(clb_annot) <- gsub(pattern = "^ANNOT_", replacement = "", x = colnames(clb_annot))
create_expt("sample_sheets/all_samples.xlsx",
tc_hisat <-gene_info = clb_annot,
file_column = "tcruzihisatfile")
## Reading the sample metadata.
## The sample definitions comprises: 8 rows(samples) and 38 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /mnt/sshfs_10186/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/tcruzi_fernanda_2021/preprocessing/hpgl1216/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows.
## preprocessing/hpgl1217/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1218/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1219/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1220/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1221/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1222/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1223/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## Finished reading count data.
## Matched 25099 annotations and counts.
## Bringing together the count matrix and gene information.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 25099 rows and 8 columns.
pp(file = "images/fm_libsize.png")
## Going to write the image to: images/fm_libsize.png when dev.off() is called.
plot_libsize(tc_hisat)$plot
dev.off()
## png
## 2
normalize_expt(tc_hisat, filter = "simple", convert = "cpm",
tc_first <-transform = "log2", norm = "quant")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(simple(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Not correcting the count-data for batch effects. If batch is
## included in EdgerR/limma's model, then this is probably wise; but in extreme
## batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: simple
## Removing 3262 low-count genes (21837 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## The method is: raw.
## Step 4: not doing batch correction.
## Step 4: transforming the data with log2.
## transform_counts: Found 14438 values equal to 0, adding 1 to the matrix.
pp(file = "images/initial_pca.png")
## Going to write the image to: images/initial_pca.png when dev.off() is called.
plot_pca(tc_first)$plot
dev.off()
## png
## 2
If I heard correctly, the goal is to subtract the wt from the tagged samples. So, contrasts should take this into account.
list(
keepers <-"wt" = c("station_wt", "log_wt"),
"tagged" = c("station_tag", "log_tag"),
"log" = c("log_tag", "log_wt"),
"station" = c("station_tag", "station_wt"))
list(
extra <-"all" = c("station", "log"))
all_pairwise(tc_hisat, model_batch = "svaseq", filter = TRUE) pairs <-
## batch_counts: Before batch/surrogate estimation, 119881 entries are x>1: 91%.
## batch_counts: Before batch/surrogate estimation, 6828 entries are x==0: 5%.
## The be method chose 1 surrogate variable.
## Attempting svaseq estimation with 1 surrogate.
## Plotting a PCA before surrogate/batch inclusion.
## Not putting labels on the PC plot.
## Using svaseq to visualize before/after batch inclusion.
## Performing a test normalization with: raw
## This function will replace the expt$expressionset slot with:
## log2(svaseq(cpm(cbcb(data))))
## It will save copies of each step along the way
## in expt$normalized with the corresponding libsizes. Keep libsizes in mind
## when invoking limma. The appropriate libsize is non-log(cpm(normalized)).
## This is most likely kept at:
## 'new_expt$normalized$intermediate_counts$normalization$libsizes'
## A copy of this may also be found at:
## new_expt$best_libsize
## Leaving the data unnormalized. This is necessary for DESeq, but
## EdgeR/limma might benefit from normalization. Good choices include quantile,
## size-factor, tmm, etc.
## Step 1: performing count filter with option: cbcb
## Removing 0 low-count genes (16505 remaining).
## Step 2: not normalizing the data.
## Step 3: converting the data with cpm.
## The method is: svaseq.
## Step 4: doing batch correction with svaseq.
## Using the current state of normalization.
## Passing the data to all_adjusters using the svaseq estimate type.
## batch_counts: Before batch/surrogate estimation, 113063 entries are x>1: 86%.
## batch_counts: Before batch/surrogate estimation, 6828 entries are x==0: 5%.
## batch_counts: Before batch/surrogate estimation, 12149 entries are 0<x<1: 9%.
## The be method chose 1 surrogate variable.
## Attempting svaseq estimation with 1 surrogate.
## There are 2147 (2%) elements which are < 0 after batch correction.
## Setting low elements to zero.
## Step 4: transforming the data with log2.
## transform_counts: Found 2147 values equal to 0, adding 1 to the matrix.
## Not putting labels on the PC plot.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
combine_de_tables(pairs, keepers=keepers, excel="excel/test_tables.xlsx") tables <-
## Deleting the file excel/test_tables.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on 1/4: wt which is: station_wt/log_wt.
## Found table with station_wt_vs_log_wt
## Working on 2/4: tagged which is: station_tag/log_tag.
## Found table with station_tag_vs_log_tag
## Working on 3/4: log which is: log_tag/log_wt.
## Found inverse table with log_wt_vs_log_tag
## Working on 4/4: station which is: station_tag/station_wt.
## Found inverse table with station_wt_vs_station_tag
## Adding venn plots for wt.
## Limma expression coefficients for wt; R^2: 0.493; equation: y = 0.675x + 0.788
## Deseq expression coefficients for wt; R^2: 0.435; equation: y = 0.575x + 1.88
## Edger expression coefficients for wt; R^2: 0.44; equation: y = 0.571x + 3.45
## Adding venn plots for tagged.
## Limma expression coefficients for tagged; R^2: 0.543; equation: y = 0.694x + 0.735
## Deseq expression coefficients for tagged; R^2: 0.437; equation: y = 0.552x + 2.01
## Edger expression coefficients for tagged; R^2: 0.454; equation: y = 0.56x + 3.99
## Adding venn plots for log.
## Limma expression coefficients for log; R^2: 0.762; equation: y = 0.827x + 0.443
## Deseq expression coefficients for log; R^2: 0.774; equation: y = 0.865x + 0.567
## Edger expression coefficients for log; R^2: 0.785; equation: y = 0.872x + 1.14
## Adding venn plots for station.
## Limma expression coefficients for station; R^2: 0.736; equation: y = 0.833x + 0.425
## Deseq expression coefficients for station; R^2: 0.673; equation: y = 0.828x + 0.654
## Edger expression coefficients for station; R^2: 0.678; equation: y = 0.83x + 1.23
## Writing summary information, compare_plot is: TRUE.
## Performing save of excel/test_tables.xlsx.
::pander(sessionInfo())
pandermessage(paste0("This is hpgltools commit: ", get_git_commit()))
paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
this_save <-message(paste0("Saving to ", this_save))
sm(saveme(filename=this_save)) tmp <-
loadme(filename=this_save)