1 T. cruzi

1.1 Annotations

I am using a concatenation of Esmeraldo, Non-Esmeraldo, and the Unassigned.

tc_annot <- load_gff_annotations("reference/tcruzi_clbrener_v37.gff")
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = TRUE)
## Trying attempt: rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Had a successful gff import with rtracklayer::import.gff3(gff, sequenceRegionsAsSeqinfo = FALSE)
## Returning a df with 15 columns and 99449 rows.
library(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
library(org.Tcruzi.CL.Brener.Non.Esmeraldo.like.v50.eg.db)
## 
library(org.Tcruzi.CL.Brener.v50.eg.db)
## 
annot_column_idx <- grepl(pattern = "^ANNOT_",
                          x = keytypes(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db))
annot_columns <- keytypes(org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db)[annot_column_idx]
esmer_org <- load_orgdb_annotations("org.Tcruzi.CL.Brener.Esmeraldo.like.v50.eg.db",
                                    keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
nonesmer_org <- load_orgdb_annotations("org.Tcruzi.CL.Brener.Non.Esmeraldo.like.v50.eg.db",
                                       keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
unas_org <- load_orgdb_annotations("org.Tcruzi.CL.Brener.v50.eg.db",
                                   keytype = "gid", fields = annot_columns)
## Unable to find CDSNAME, setting it to ANNOT_EXTERNAL_DB_NAME.
## Unable to find CDSCHROM in the db, removing it.
## Unable to find CDSSTRAND in the db, removing it.
## Unable to find CDSSTART in the db, removing it.
## Unable to find CDSEND in the db, removing it.
## Some requested columns are not available: ANNOT_GENE_ENTREZ_LINK.
## The following are available: ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_ENTREZ_LINK_DISPLAYTEXT, ANNOT_GENE_ENTREZ_LINK_URL, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL, CHR_ID, EVIDENCE, EVIDENCEALL, GENE_TYPE, GID, GO, GOALL, GODB_EVIDENCE_CODE, GODB_GO_ID, GODB_GO_TERM_NAME, GODB_IS_NOT, GODB_ONTOLOGY, GODB_REFERENCE, GODB_SORT_KEY, GODB_SOURCE, GODB_SUPPORT_FOR_EVIDENCE_CODE_ASSIGNMENT, GODB_TRANSCRIPT_ID_S, GOSLIM_GO_ID, GOSLIM_GO_SLIM_TERM_NAME, GOSLIM_GO_TERM_NAME, GOSLIM_IS_NOT, GOSLIM_ONTOLOGY, GOSLIM_SLIM_GO_ID, INTERPRO_DESCRIPTION, INTERPRO_E_VALUE, INTERPRO_END_MIN, INTERPRO_ID, INTERPRO_NAME, INTERPRO_PRIMARY_ID, INTERPRO_SECONDARY_ID, INTERPRO_START_MIN, INTERPRO_TRANSCRIPT_ID_S, LINKOUT_DATABASE, LINKOUT_EXT_ID, LINKOUT_LINK_URL, LINKOUT_SOURCE_ID, ONTOLOGY, ONTOLOGYALL, ORTHOLOGS_COUNT, ORTHOLOGS_GID, ORTHOLOGS_ORGANISM, ORTHOLOGS_PRODUCT, ORTHOLOGS_SYNTENIC, PATHWAY_EC_NUMBER_MATCHED_IN_PATHWAY, PATHWAY_EXACT_EC_NUMBER_MATCH, PATHWAY_EXPASY_URL, PATHWAY_ID, PATHWAY_SOURCE, PATHWAY_SOURCE_ID, PATHWAY_X_REACTIONS_MATCHING_EC_NUMBER, PDB_P_VALUE, PDB_PDB_ID, PDB_PDB_MOLECULAR_DESCRIPTION, PDB_PDB_STRUCTURE, PDB_PDB_URL, PDB_PVALUE_EXP, PDB_PVALUE_MANT, PDB_TAXON, PDB_TRANSCRIPT_ID, PDB_X_IDENTITY, PDB_X_OF_VEUPATHDB_PROTEIN_COVERED, PUBMED_AUTHORS, PUBMED_DOI, PUBMED_ID, PUBMED_TITLE
## Extracted all gene ids.
## Attempting to select: ANNOT_EXTERNAL_DB_NAME, GENE_TYPE, ANNOT_AA_SEQUENCE_ID, ANNOT_ANNOTATED_GO_COMPONENT, ANNOT_ANNOTATED_GO_FUNCTION, ANNOT_ANNOTATED_GO_ID_COMPONENT, ANNOT_ANNOTATED_GO_ID_FUNCTION, ANNOT_ANNOTATED_GO_ID_PROCESS, ANNOT_ANNOTATED_GO_PROCESS, ANNOT_ANTICODON, ANNOT_CDS, ANNOT_CDS_LENGTH, ANNOT_CHROMOSOME, ANNOT_CODING_END, ANNOT_CODING_START, ANNOT_EC_NUMBERS, ANNOT_EC_NUMBERS_DERIVED, ANNOT_EXON_COUNT, ANNOT_EXTERNAL_DB_NAME, ANNOT_EXTERNAL_DB_VERSION, ANNOT_FIVE_PRIME_UTR_LENGTH, ANNOT_GENE_CONTEXT_END, ANNOT_GENE_CONTEXT_START, ANNOT_GENE_END_MAX, ANNOT_GENE_END_MAX_TEXT, ANNOT_GENE_ENTREZ_ID, ANNOT_GENE_EXON_COUNT, ANNOT_GENE_HTS_NONCODING_SNPS, ANNOT_GENE_HTS_NONSYN_SYN_RATIO, ANNOT_GENE_HTS_NONSYNONYMOUS_SNPS, ANNOT_GENE_HTS_STOP_CODON_SNPS, ANNOT_GENE_HTS_SYNONYMOUS_SNPS, ANNOT_GENE_LOCATION_TEXT, ANNOT_GENE_NAME, ANNOT_GENE_ORTHOLOG_NUMBER, ANNOT_GENE_ORTHOMCL_NAME, ANNOT_GENE_PARALOG_NUMBER, ANNOT_GENE_PREVIOUS_IDS, ANNOT_GENE_PRODUCT, ANNOT_GENE_START_MIN, ANNOT_GENE_START_MIN_TEXT, ANNOT_GENE_TOTAL_HTS_SNPS, ANNOT_GENE_TRANSCRIPT_COUNT, ANNOT_GENE_TYPE, ANNOT_GENEDB_NEW_ID, ANNOT_GENEDB_NEW_PRODUCT, ANNOT_GENUS_SPECIES, ANNOT_HAS_MISSING_TRANSCRIPTS, ANNOT_INTERPRO_DESCRIPTION, ANNOT_INTERPRO_ID, ANNOT_IS_DEPRECATED, ANNOT_IS_PSEUDO, ANNOT_ISOELECTRIC_POINT, ANNOT_LOCATION_TEXT, ANNOT_MOLECULAR_WEIGHT, ANNOT_NCBI_TAX_ID, ANNOT_NEW_PRODUCT_NAME, ANNOT_ORTHOMCL_LINK, ANNOT_OVERVIEW, ANNOT_PFAM_DESCRIPTION, ANNOT_PFAM_ID, ANNOT_PIRSF_DESCRIPTION, ANNOT_PIRSF_ID, ANNOT_PREDICTED_GO_COMPONENT, ANNOT_PREDICTED_GO_FUNCTION, ANNOT_PREDICTED_GO_ID_COMPONENT, ANNOT_PREDICTED_GO_ID_FUNCTION, ANNOT_PREDICTED_GO_ID_PROCESS, ANNOT_PREDICTED_GO_PROCESS, ANNOT_PRIMARY_KEY, ANNOT_PROSITEPROFILES_DESCRIPTION, ANNOT_PROSITEPROFILES_ID, ANNOT_PROTEIN_LENGTH, ANNOT_PROTEIN_SEQUENCE, ANNOT_PROTEIN_SOURCE_ID, ANNOT_PSEUDO_STRING, ANNOT_SEQUENCE_DATABASE_NAME, ANNOT_SEQUENCE_ID, ANNOT_SIGNALP_PEPTIDE, ANNOT_SIGNALP_SCORES, ANNOT_SMART_DESCRIPTION, ANNOT_SMART_ID, ANNOT_SNPOVERVIEW, ANNOT_SO_ID, ANNOT_SO_TERM_DEFINITION, ANNOT_SO_TERM_NAME, ANNOT_SO_VERSION, ANNOT_STRAND, ANNOT_STRAND_PLUS_MINUS, ANNOT_SUPERFAMILY_DESCRIPTION, ANNOT_SUPERFAMILY_ID, ANNOT_THREE_PRIME_UTR_LENGTH, ANNOT_TIGRFAM_DESCRIPTION, ANNOT_TIGRFAM_ID, ANNOT_TM_COUNT, ANNOT_TRANS_FOUND_PER_GENE_INTERNAL, ANNOT_TRANSCRIPT_INDEX_PER_GENE, ANNOT_TRANSCRIPT_LENGTH, ANNOT_TRANSCRIPT_LINK, ANNOT_TRANSCRIPT_PRODUCT, ANNOT_TRANSCRIPT_SEQUENCE, ANNOT_TRANSCRIPTS_FOUND_PER_GENE, ANNOT_UNIPROT_ID, ANNOT_UNIPROT_ID_INTERNAL, ANNOT_UNIPROT_LINK_DISPLAYTEXT, ANNOT_UNIPROT_LINK_URL
## 'select()' returned 1:1 mapping between keys and columns
clb_annot <- esmer_org[["genes"]]
clb_annot <- rbind(clb_annot, nonesmer_org[["genes"]])
unas_annot <- unas_org[["genes"]]
unas_annot[["annot_gene_entrez_link"]] <- ""
clb_annot <- rbind(clb_annot, unas_annot)
colnames(clb_annot) <- gsub(pattern = "^ANNOT_", replacement = "", x = colnames(clb_annot))

1.2 Metadata

1.3 Create expressionset

1.4 Hisat2 expressionset

tc_hisat <- create_expt("sample_sheets/all_samples.xlsx",
                        gene_info = clb_annot,
                        file_column = "tcruzihisatfile")
## Reading the sample metadata.
## The sample definitions comprises: 8 rows(samples) and 38 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /mnt/sshfs_10186/cbcbsub01/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/tcruzi_fernanda_2021/preprocessing/hpgl1216/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows.
## preprocessing/hpgl1217/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1218/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1219/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1220/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1221/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1222/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## preprocessing/hpgl1223/outputs/hisat2_tcruzi_clbrener_v37/r1.count_tcruzi_clbrener_v37_sno_gene_ID.count.xz contains 25104 rows and merges to 25104 rows.
## Finished reading count data.
## Matched 25099 annotations and counts.
## Bringing together the count matrix and gene information.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 25099 rows and 8 columns.
pp(file = "images/fm_libsize.png")
## Going to write the image to: images/fm_libsize.png when dev.off() is called.
plot_libsize(tc_hisat)$plot
dev.off()
## png 
##   2
tc_first <- normalize_expt(tc_hisat, filter = "simple", convert = "cpm",
                           transform = "log2", norm = "quant")
## This function will replace the expt$expressionset slot with:
## log2(cpm(quant(simple(data))))
## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Not correcting the count-data for batch effects.  If batch is
##  included in EdgerR/limma's model, then this is probably wise; but in extreme
##  batch effects this is a good parameter to play with.
## Step 1: performing count filter with option: simple
## Removing 3262 low-count genes (21837 remaining).
## Step 2: normalizing the data with quant.
## Step 3: converting the data with cpm.
## The method is: raw.
## Step 4: not doing batch correction.
## Step 4: transforming the data with log2.
## transform_counts: Found 14438 values equal to 0, adding 1 to the matrix.
pp(file = "images/initial_pca.png")
## Going to write the image to: images/initial_pca.png when dev.off() is called.
plot_pca(tc_first)$plot
dev.off()
## png 
##   2

2 Subtractions

If I heard correctly, the goal is to subtract the wt from the tagged samples. So, contrasts should take this into account.

keepers <- list(
    "wt" = c("station_wt", "log_wt"),
    "tagged" = c("station_tag", "log_tag"),
    "log" = c("log_tag", "log_wt"),
    "station" = c("station_tag", "station_wt"))
extra <- list(
    "all" = c("station", "log"))
pairs <- all_pairwise(tc_hisat, model_batch = "svaseq", filter = TRUE)
## batch_counts: Before batch/surrogate estimation, 119881 entries are x>1: 91%.
## batch_counts: Before batch/surrogate estimation, 6828 entries are x==0: 5%.
## The be method chose 1 surrogate variable.
## Attempting svaseq estimation with 1 surrogate.
## Plotting a PCA before surrogate/batch inclusion.
## Not putting labels on the PC plot.
## Using svaseq to visualize before/after batch inclusion.
## Performing a test normalization with: raw
## This function will replace the expt$expressionset slot with:
## log2(svaseq(cpm(cbcb(data))))
## It will save copies of each step along the way
##  in expt$normalized with the corresponding libsizes. Keep libsizes in mind
##  when invoking limma.  The appropriate libsize is non-log(cpm(normalized)).
##  This is most likely kept at:
##  'new_expt$normalized$intermediate_counts$normalization$libsizes'
##  A copy of this may also be found at:
##  new_expt$best_libsize
## Leaving the data unnormalized.  This is necessary for DESeq, but
##  EdgeR/limma might benefit from normalization.  Good choices include quantile,
##  size-factor, tmm, etc.
## Step 1: performing count filter with option: cbcb
## Removing 0 low-count genes (16505 remaining).
## Step 2: not normalizing the data.
## Step 3: converting the data with cpm.
## The method is: svaseq.
## Step 4: doing batch correction with svaseq.
## Using the current state of normalization.
## Passing the data to all_adjusters using the svaseq estimate type.
## batch_counts: Before batch/surrogate estimation, 113063 entries are x>1: 86%.
## batch_counts: Before batch/surrogate estimation, 6828 entries are x==0: 5%.
## batch_counts: Before batch/surrogate estimation, 12149 entries are 0<x<1: 9%.
## The be method chose 1 surrogate variable.
## Attempting svaseq estimation with 1 surrogate.
## There are 2147 (2%) elements which are < 0 after batch correction.
## Setting low elements to zero.
## Step 4: transforming the data with log2.
## transform_counts: Found 2147 values equal to 0, adding 1 to the matrix.
## Not putting labels on the PC plot.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.

tables <- combine_de_tables(pairs, keepers=keepers, excel="excel/test_tables.xlsx")
## Deleting the file excel/test_tables.xlsx before writing the tables.
## Writing a legend of columns.
## Printing a pca plot before/after surrogates/batch estimation.
## Working on 1/4: wt which is: station_wt/log_wt.
## Found table with station_wt_vs_log_wt
## Working on 2/4: tagged which is: station_tag/log_tag.
## Found table with station_tag_vs_log_tag
## Working on 3/4: log which is: log_tag/log_wt.
## Found inverse table with log_wt_vs_log_tag
## Working on 4/4: station which is: station_tag/station_wt.
## Found inverse table with station_wt_vs_station_tag
## Adding venn plots for wt.

## Limma expression coefficients for wt; R^2: 0.493; equation: y = 0.675x + 0.788
## Deseq expression coefficients for wt; R^2: 0.435; equation: y = 0.575x + 1.88
## Edger expression coefficients for wt; R^2: 0.44; equation: y = 0.571x + 3.45
## Adding venn plots for tagged.

## Limma expression coefficients for tagged; R^2: 0.543; equation: y = 0.694x + 0.735
## Deseq expression coefficients for tagged; R^2: 0.437; equation: y = 0.552x + 2.01
## Edger expression coefficients for tagged; R^2: 0.454; equation: y = 0.56x + 3.99
## Adding venn plots for log.

## Limma expression coefficients for log; R^2: 0.762; equation: y = 0.827x + 0.443
## Deseq expression coefficients for log; R^2: 0.774; equation: y = 0.865x + 0.567
## Edger expression coefficients for log; R^2: 0.785; equation: y = 0.872x + 1.14
## Adding venn plots for station.

## Limma expression coefficients for station; R^2: 0.736; equation: y = 0.833x + 0.425
## Deseq expression coefficients for station; R^2: 0.673; equation: y = 0.828x + 0.654
## Edger expression coefficients for station; R^2: 0.678; equation: y = 0.83x + 1.23
## Writing summary information, compare_plot is: TRUE.
## Performing save of excel/test_tables.xlsx.

pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
loadme(filename=this_save)
LS0tCnRpdGxlOiAiRmVybmFuZGEncyBjb29sIFJCUCBiaW5kaW5nIGV4cGVyaW1lbnQuIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB0YW5nbwogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogICAgc2VsZl9jb250YWluZWQ6IHRydWUKICAgIHRoZW1lOiByZWFkYWJsZQogICAgdG9jOiB0cnVlCiAgICB0b2NfZmxvYXQ6CiAgICAgIGNvbGxhcHNlZDogZmFsc2UKICAgICAgc21vb3RoX3Njcm9sbDogZmFsc2UKICBybWRmb3JtYXRzOjpyZWFkdGhlZG93bjoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZGZfcHJpbnQ6IHBhZ2VkCiAgICBmaWdfY2FwdGlvbjogdHJ1ZQogICAgZmlnX2hlaWdodDogNwogICAgZmlnX3dpZHRoOiA3CiAgICBoaWdobGlnaHQ6IHRhbmdvCiAgICB3aWR0aDogMzAwCiAgICBrZWVwX21kOiBmYWxzZQogICAgbW9kZTogc2VsZmNvbnRhaW5lZAogICAgdG9jX2Zsb2F0OiB0cnVlCiAgQmlvY1N0eWxlOjpodG1sX2RvY3VtZW50OgogICAgY29kZV9kb3dubG9hZDogdHJ1ZQogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICBmaWdfY2FwdGlvbjogdHJ1ZQogICAgZmlnX2hlaWdodDogNwogICAgZmlnX3dpZHRoOiA3CiAgICBoaWdobGlnaHQ6IHRhbmdvCiAgICBrZWVwX21kOiBmYWxzZQogICAgbW9kZTogc2VsZmNvbnRhaW5lZAogICAgdG9jX2Zsb2F0OiB0cnVlCi0tLQoKPHN0eWxlIHR5cGU9InRleHQvY3NzIj4KYm9keSwgdGQgewogIGZvbnQtc2l6ZTogMTZweDsKfQpjb2RlLnJ7CiAgZm9udC1zaXplOiAxNnB4Owp9CnByZSB7CiBmb250LXNpemU6IDE2cHgKfQo8L3N0eWxlPgoKYGBge3Igb3B0aW9ucywgaW5jbHVkZT1GQUxTRX0KbGlicmFyeSgiaHBnbHRvb2xzIikKdHQgPC0gZGV2dG9vbHM6OmxvYWRfYWxsKCJ+L2hwZ2x0b29scyIpCmtuaXRyOjpvcHRzX2tuaXQkc2V0KHdpZHRoPTEyMCwKICAgICAgICAgICAgICAgICAgICAgcHJvZ3Jlc3M9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZT1UUlVFLAogICAgICAgICAgICAgICAgICAgICBlY2hvPVRSVUUpCmtuaXRyOjpvcHRzX2NodW5rJHNldChlcnJvcj1UUlVFLAogICAgICAgICAgICAgICAgICAgICAgZHBpPTk2KQpvbGRfb3B0aW9ucyA8LSBvcHRpb25zKGRpZ2l0cz00LAogICAgICAgICAgICAgICAgICAgICAgIHN0cmluZ3NBc0ZhY3RvcnM9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgICAga25pdHIuZHVwbGljYXRlLmxhYmVsPSJhbGxvdyIpCmdncGxvdDI6OnRoZW1lX3NldChnZ3Bsb3QyOjp0aGVtZV9idyhiYXNlX3NpemU9MTApKQpydW5kYXRlIDwtIGZvcm1hdChTeXMuRGF0ZSgpLCBmb3JtYXQ9IiVZJW0lZCIpCnByZXZpb3VzX2ZpbGUgPC0gInVuZGVmaW5lZC5SbWQiCnZlciA8LSAiMjAyMTAzIgoKIyN0bXAgPC0gc20obG9hZG1lKGZpbGVuYW1lPXBhc3RlMChnc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IiIsIHg9cHJldmlvdXNfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKSkpCnJtZF9maWxlIDwtICJpbmRleC5SbWQiCmBgYAoKIyBULiBjcnV6aQoKIyMgQW5ub3RhdGlvbnMKCkkgYW0gdXNpbmcgYSBjb25jYXRlbmF0aW9uIG9mIEVzbWVyYWxkbywgTm9uLUVzbWVyYWxkbywgYW5kIHRoZSBVbmFzc2lnbmVkLgoKYGBge3IgYW5ub3RhdGlvbnN9CnRjX2Fubm90IDwtIGxvYWRfZ2ZmX2Fubm90YXRpb25zKCJyZWZlcmVuY2UvdGNydXppX2NsYnJlbmVyX3YzNy5nZmYiKQoKbGlicmFyeShvcmcuVGNydXppLkNMLkJyZW5lci5Fc21lcmFsZG8ubGlrZS52NTAuZWcuZGIpCmxpYnJhcnkob3JnLlRjcnV6aS5DTC5CcmVuZXIuTm9uLkVzbWVyYWxkby5saWtlLnY1MC5lZy5kYikKbGlicmFyeShvcmcuVGNydXppLkNMLkJyZW5lci52NTAuZWcuZGIpCmFubm90X2NvbHVtbl9pZHggPC0gZ3JlcGwocGF0dGVybiA9ICJeQU5OT1RfIiwKICAgICAgICAgICAgICAgICAgICAgICAgICB4ID0ga2V5dHlwZXMob3JnLlRjcnV6aS5DTC5CcmVuZXIuRXNtZXJhbGRvLmxpa2UudjUwLmVnLmRiKSkKYW5ub3RfY29sdW1ucyA8LSBrZXl0eXBlcyhvcmcuVGNydXppLkNMLkJyZW5lci5Fc21lcmFsZG8ubGlrZS52NTAuZWcuZGIpW2Fubm90X2NvbHVtbl9pZHhdCmVzbWVyX29yZyA8LSBsb2FkX29yZ2RiX2Fubm90YXRpb25zKCJvcmcuVGNydXppLkNMLkJyZW5lci5Fc21lcmFsZG8ubGlrZS52NTAuZWcuZGIiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBrZXl0eXBlID0gImdpZCIsIGZpZWxkcyA9IGFubm90X2NvbHVtbnMpCm5vbmVzbWVyX29yZyA8LSBsb2FkX29yZ2RiX2Fubm90YXRpb25zKCJvcmcuVGNydXppLkNMLkJyZW5lci5Ob24uRXNtZXJhbGRvLmxpa2UudjUwLmVnLmRiIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAga2V5dHlwZSA9ICJnaWQiLCBmaWVsZHMgPSBhbm5vdF9jb2x1bW5zKQp1bmFzX29yZyA8LSBsb2FkX29yZ2RiX2Fubm90YXRpb25zKCJvcmcuVGNydXppLkNMLkJyZW5lci52NTAuZWcuZGIiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGtleXR5cGUgPSAiZ2lkIiwgZmllbGRzID0gYW5ub3RfY29sdW1ucykKY2xiX2Fubm90IDwtIGVzbWVyX29yZ1tbImdlbmVzIl1dCmNsYl9hbm5vdCA8LSByYmluZChjbGJfYW5ub3QsIG5vbmVzbWVyX29yZ1tbImdlbmVzIl1dKQp1bmFzX2Fubm90IDwtIHVuYXNfb3JnW1siZ2VuZXMiXV0KdW5hc19hbm5vdFtbImFubm90X2dlbmVfZW50cmV6X2xpbmsiXV0gPC0gIiIKY2xiX2Fubm90IDwtIHJiaW5kKGNsYl9hbm5vdCwgdW5hc19hbm5vdCkKY29sbmFtZXMoY2xiX2Fubm90KSA8LSBnc3ViKHBhdHRlcm4gPSAiXkFOTk9UXyIsIHJlcGxhY2VtZW50ID0gIiIsIHggPSBjb2xuYW1lcyhjbGJfYW5ub3QpKQpgYGAKCiMjIE1ldGFkYXRhCgojIyBDcmVhdGUgZXhwcmVzc2lvbnNldAoKIyMgSGlzYXQyIGV4cHJlc3Npb25zZXQKCmBgYHtyIGhpc2F0Mn0KdGNfaGlzYXQgPC0gY3JlYXRlX2V4cHQoInNhbXBsZV9zaGVldHMvYWxsX3NhbXBsZXMueGxzeCIsCiAgICAgICAgICAgICAgICAgICAgICAgIGdlbmVfaW5mbyA9IGNsYl9hbm5vdCwKICAgICAgICAgICAgICAgICAgICAgICAgZmlsZV9jb2x1bW4gPSAidGNydXppaGlzYXRmaWxlIikKCnBwKGZpbGUgPSAiaW1hZ2VzL2ZtX2xpYnNpemUucG5nIikKcGxvdF9saWJzaXplKHRjX2hpc2F0KSRwbG90CmRldi5vZmYoKQoKdGNfZmlyc3QgPC0gbm9ybWFsaXplX2V4cHQodGNfaGlzYXQsIGZpbHRlciA9ICJzaW1wbGUiLCBjb252ZXJ0ID0gImNwbSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHRyYW5zZm9ybSA9ICJsb2cyIiwgbm9ybSA9ICJxdWFudCIpCgpwcChmaWxlID0gImltYWdlcy9pbml0aWFsX3BjYS5wbmciKQpwbG90X3BjYSh0Y19maXJzdCkkcGxvdApkZXYub2ZmKCkKYGBgCgojIFN1YnRyYWN0aW9ucwoKSWYgSSBoZWFyZCBjb3JyZWN0bHksIHRoZSBnb2FsIGlzIHRvIHN1YnRyYWN0IHRoZSB3dCBmcm9tIHRoZSB0YWdnZWQgc2FtcGxlcy4gU28sIGNvbnRyYXN0cwpzaG91bGQgdGFrZSB0aGlzIGludG8gYWNjb3VudC4KCmBgYHtyIGNvbnRyYXN0c30Ka2VlcGVycyA8LSBsaXN0KAogICAgInd0IiA9IGMoInN0YXRpb25fd3QiLCAibG9nX3d0IiksCiAgICAidGFnZ2VkIiA9IGMoInN0YXRpb25fdGFnIiwgImxvZ190YWciKSwKICAgICJsb2ciID0gYygibG9nX3RhZyIsICJsb2dfd3QiKSwKICAgICJzdGF0aW9uIiA9IGMoInN0YXRpb25fdGFnIiwgInN0YXRpb25fd3QiKSkKZXh0cmEgPC0gbGlzdCgKICAgICJhbGwiID0gYygic3RhdGlvbiIsICJsb2ciKSkKcGFpcnMgPC0gYWxsX3BhaXJ3aXNlKHRjX2hpc2F0LCBtb2RlbF9iYXRjaCA9ICJzdmFzZXEiLCBmaWx0ZXIgPSBUUlVFKQp0YWJsZXMgPC0gY29tYmluZV9kZV90YWJsZXMocGFpcnMsIGtlZXBlcnM9a2VlcGVycywgZXhjZWw9ImV4Y2VsL3Rlc3RfdGFibGVzLnhsc3giKQpgYGAKCmBgYHtyIHNhdmVtZSwgZXZhbD1GQUxTRX0KcGFuZGVyOjpwYW5kZXIoc2Vzc2lvbkluZm8oKSkKbWVzc2FnZShwYXN0ZTAoIlRoaXMgaXMgaHBnbHRvb2xzIGNvbW1pdDogIiwgZ2V0X2dpdF9jb21taXQoKSkpCnRoaXNfc2F2ZSA8LSBwYXN0ZTAoZ3N1YihwYXR0ZXJuPSJcXC5SbWQiLCByZXBsYWNlPSIiLCB4PXJtZF9maWxlKSwgIi12IiwgdmVyLCAiLnJkYS54eiIpCm1lc3NhZ2UocGFzdGUwKCJTYXZpbmcgdG8gIiwgdGhpc19zYXZlKSkKdG1wIDwtIHNtKHNhdmVtZShmaWxlbmFtZT10aGlzX3NhdmUpKQpgYGAKCgpgYGB7ciBsb2FkbWUsIGV2YWw9RkFMU0V9CmxvYWRtZShmaWxlbmFtZT10aGlzX3NhdmUpCmBgYAo=