This data is found in supplemental table S1, which I copied to /cbcb/lab/nelsayed/ref_data/Science_2005/ This file looks like it will be difficult to work with, the first column is a gene identifier from each of the three main species – easy enough. The second column is a ‘secondary gene identifier’ and appears to me to be inconsistent. The third is a cluster identifier, unfortunately it too is inconsistent. The next column ‘COG type’, appears to be the best. Finally there is a predicted protein type which of course contains primarily the text ‘hypothetical’ or ‘amastin.’
science_2005 = read.csv("/cbcb/lab/nelsayed/ref_data/science_2005/1112181s_tableS1.txt", sep="\t", comment="#", header=TRUE)
colnames(science_2005) = c("primary_gene","secondary_gene","jaccard","cog_type","cog_id","gene_product")
science_agg = science_2005
science_agg = as.data.frame(lapply(science_agg, function(x) if(is.character(x)|is.factor(x)) gsub("\\*new gene in LmjF.v5.0\\*","lmundef",x) else x))
science_agg = as.data.frame(lapply(science_agg, function(x) if(is.character(x)|is.factor(x)) gsub("\\* new gene in LmjF.v5.0\\*","lmundef",x) else x))
science_agg = as.data.frame(lapply(science_agg, function(x) if(is.character(x)|is.factor(x)) gsub("\\*new gene in TcBr.v4.0\\*","tcundef",x) else x))
science_agg = as.data.frame(lapply(science_agg, function(x) if(is.character(x)|is.factor(x)) gsub("\\*new gene in Tb927.v4.0\\*","tbundef",x) else x))
science_agg$cog = science_agg$cog_id
science_agg$cog = ifelse(science_agg$cog == "lmundef", as.character(paste0('nocog', science_agg$primary_gene)), as.character(science_agg$cog_id))
science_agg$cog = ifelse(science_agg$cog == '', as.character(paste0('nocog', science_agg$primary_gene)), as.character(science_agg$cog_id))
science_agg$cog = ifelse(science_agg$cog == "", as.character(paste0('nocog', science_agg$primary_gene)), as.character(science_agg$cog_id))
length(levels(as.factor(science_agg$cog)))
## [1] 7176
science_agg$cog = ifelse(science_agg$cog == "", as.character(paste0('nocog', science_agg$primary_gene)), as.character(science_agg$cog_id))
length(levels(as.factor(science_agg$cog)))
## [1] 16802
science_lmcogs = science_agg[grep("^Lm", science_agg$primary_gene, perl=TRUE),]
science_lmcogs = science_lmcogs[,c("primary_gene","cog")]
science_aggregated = aggregate(primary_gene~cog, paste, collapse=",", data=science_agg)
science_merged = merge(science_lmcogs, science_aggregated, by.x="cog", by.y="cog", all.x=TRUE)
science_merged = merge(science_merged, science_agg, by.x="primary_gene.x", by.y="primary_gene")
colnames(science_merged) = c("ID","cog","genes","secondary_gene","jaccard","cog_type","cog_id","gene_product","cog_again")
science_merged = science_merged[-9] ## Drop the duplicate column
science_merged$ID = gsub("LmjF", "LmjF\\.", science_merged$ID)
science_merged$genes = gsub("LmjF", "LmjF\\.", science_merged$genes)
science_merged$genes = gsub("Tb", "Tb927\\.", science_merged$genes)
This stanza marks the place where the first two tables are joined.
head(science_merged)
## ID cog
## 1 LmjF.01.0010 nocogLmjF01.0010
## 2 LmjF.01.0020 20332099_cogs
## 3 LmjF.01.0030 19584616_cogs
## 4 LmjF.01.0040 nocogLmjF01.0040
## 5 LmjF.01.0050 nocogLmjF01.0050
## 6 LmjF.01.0060 21566042_cogs
## genes
## 1 LmjF.01.0010
## 2 Tc00.1047053506895.50,Tc00.1047053508501.300,LmjF.01.0020,Tb927.09.160.2280
## 3 Tc00.1047053506895.40,Tc00.1047053508501.290,LmjF.01.0030,Tb927.09.160.2260
## 4 LmjF.01.0040
## 5 LmjF.01.0050
## 6 Tc00.1047053506895.30,Tc00.1047053508501.280,LmjF.01.0060,Tb927.09.160.2250
## secondary_gene jaccard cog_type cog_id
## 1 singleton
## 2 3-way COG [Lm+Tb+Tc] 20332099_cogs
## 3 3-way COG [Lm+Tb+Tc] 19584616_cogs
## 4 singleton
## 5 singleton
## 6 3-way COG [Lm+Tb+Tc] 21566042_cogs
## gene_product
## 1 hypothetical protein, unknown function
## 2 hypothetical protein, conserved
## 3 MCAK-like kinesin, putative
## 4 hypothetical protein, unknown function
## 5 carboxylase, putative
## 6 hypothetical protein, conserved
dim(science_merged)
## [1] 8299 8
head(tritryp_xrefs)
## ID Product_Description Gene_Name X._TM_Domains
## 1 LmjF.01.0010 hypothetical protein, unknown function null 2
## 2 LmjF.01.0020 hypothetical protein, conserved null 0
## 3 LmjF.01.0030 MCAK-like kinesin, putative null 0
## 4 LmjF.01.0040 hypothetical protein, unknown function null 0
## 5 LmjF.01.0050 carboxylase, putative null 0
## 6 LmjF.01.0060 hypothetical protein, conserved null 0
## Ortholog_count Paralog_count Ortholog_Group SignalP_Scores SignalP_Peptide UniProt_ID
## 1 9 0 OG5_183100 NN Sum HMM null
## 2 21 0 OG5_148223 null null null
## 3 43 1 OG5_127200 null null null
## 4 7 0 OG5_183101 NN Sum HMM null
## 5 32 1 OG5_126626 null null null
## 6 17 0 OG5_148224 NN Sum HMM null
## Entrez_Gene_ID Gene_Name_or_Symbol tbrucei
## 1 null null <NA>
## 2 null null Tb927.9.3670
## 3 null null Tb927.11.3280,Tb927.9.3650
## 4 null null <NA>
## 5 null null Tb927.4.5350,Tb927.8.6970
## 6 null null Tb927.9.3640
## tcruzi
## 1 <NA>
## 2 TcCLB.506895.50,TcCLB.508501.300
## 3 TcCLB.506895.40,TcCLB.509589.30,TcCLB.503999.20,TcCLB.508501.290
## 4 <NA>
## 5 TcCLB.504835.20,TcCLB.509913.10
## 6 TcCLB.506895.30,TcCLB.508501.280
dim(tritryp_xrefs)
## [1] 9378 14
tri_sci = merge(science_merged, tritryp_xrefs, by.x="ID", by.y="ID", all.y=TRUE)
original_columns = c("ID","science_cog","science_genes","science_sec_gene","science_jaccard","science_cog_type","science_cog_id","science_gene_product")
new_columns = c("tritryp_description","tritryp_gene_name","tritryp_tm_domains","tritryp_ortholog_count","tritryp_paralog_count","tritryp_ortholog_group","tritryp_signalp_score","tritryp_signalp_peptide","tritryp_uniprot","tritryp_entrez_geneid","tritryp_gene_symbol","tritryp_brucei_homologs","tritryp_cruzi_homologs")
all_columns = c(original_columns, new_columns)
colnames(tri_sci) = all_columns
dim(tri_sci)
## [1] 9378 21
signalp_data = read.csv(file="reference/lmajor_signalp_cleavage.gff", sep="\t", header=TRUE, comment="#")
signalp_data = signalp_data[,c("sequence.name","end","score","source")]
colnames(signalp_data) = c("ID","end","score","source")
dim(tri_sci)
## [1] 9378 21
dim(signalp_data)
## [1] 840 4
tri_sci_sig = merge(tri_sci, signalp_data, by.x="ID", by.y="ID", all.x=TRUE)
dim(tri_sci_sig)
## [1] 9378 24
koh_data = read.csv(file="reference/koh.txt", sep=" ", header=FALSE)
colnames(koh_data) = c("ID","koh_score")
tri_sci_sig = merge(tri_sci_sig, koh_data, by.x="ID", by.y="ID", all.x=TRUE)
new_columns = c("signalp_end","signalp_score","signalp_version", "koh_score")
all_columns = c(all_columns, new_columns)
colnames(tri_sci_sig) = all_columns
I specify query/library because I have a reciprocal search. Also, I am using the UCSC known peptides, which have a separate file to cross reference them to the ENSG identifiers, so I will need to merge across both files.
pepnames = read.csv("reference/knownGene.txt.gz", sep="\t", header=FALSE)
colnames(pepnames) = c("ID","chr","strand","start","end","start2","start3","num","exons","exons2","q","UCSC")
fasta_search = read.csv("reference/lmajor_vs_hsapiens.out.table", sep="\t", header=TRUE)
fasta_pepnames = merge(pepnames, fasta_search, by.x="ID", by.y="Hit.ID")
fasta_pepnames = fasta_pepnames[,c("UCSC","Query.Name","Query.length","Hit.Length","Score","E","Identity","Hit.length","Hit.Matches")]
fasta_pepnames$Query.Name = make.names(fasta_pepnames$Query.Name, unique=TRUE)
tri_sci_sig_fas = merge(tri_sci_sig, fasta_pepnames, by.x="ID", by.y="Query.Name", all.x=TRUE)
new_columns = c("fasta_ucsc_id","fasta_query_length","fasta_libhit_length","fasta_score","fasta_e","fasta_identity","fasta_hit_length","fasta_hit_matches")
all_columns = c(all_columns, new_columns)
colnames(tri_sci_sig_fas) = all_columns
##write_xls(data=tri_sci_sig_fas, file="merged_tables")
## This is from Table S3 of:
## "An exosome-based secretion pathway is responsible for protein export from Leishmania and communication with macrophages."
## http://jcs.biologists.org/content/123/6/842.short
secretome = read.csv("reference/secretome.csv", header=TRUE)
colnames(secretome) = c("ID","name","exosome","secretome")
secretome$ID = gsub(pattern="LmjF", replacement="LmjF\\.", x=secretome$ID)
tri_sci_sig_fas_sec = merge(tri_sci_sig_fas, secretome, by.x="ID", by.y="ID", all.x=TRUE)
## This comma separated file was extracted from table S4 from:
## http://www.genomebiology.com/2008/9/2/R35/additional
## "Proteomic analysis of the secretome of Leishmania donovani"
communication = read.csv("reference/communication.csv", header=TRUE)
colnames(communication) = c("ID","communication_name", "ms_gpi")
tri_sci_sig_fas_sec = merge(tri_sci_sig_fas_sec, communication, by.x="ID", by.y="ID", all.x=TRUE)
## The following table came from Supplementary 5 in
## http://www.genomebiology.com/2008/9/2/R35
## "Proteomic analysis of the secretome of Leishmania donovani"
donovani_cmca_ratio = read.csv("reference/proteomic_analysis_secretome_donovani_s5.csv", header=TRUE)
## The following comes from table S2 from the same paper
donovani_s2 = read.csv("reference/gb-2008-9-2-r35-s2.csv", header=TRUE)
colnames(donovani_s2) = c("accession","id","meanlmmcmca1","rel_pep_ratio1","meanlmmcmca2","rel_pep_ratio2","meanlmmcmca3","rel_pep_ratio3","meanlmmcmca4","rel_pep_ratio4","meanlmmcmca_all")
##write_xls(data=donovani_s2, file="donovani_s2_table")
##dim(subset(donovani_s2, meanlmcmca_all > 1.7))
head(donovani_s2)
## accession id meanlmmcmca1
## 1 LmjF14.1360 myo-inositol-1-phosphate synthase 3.0297
## 2 LmjF23.0200 endoribonuclease L-PSP (pb5), putative 1.2585
## 3 LmjF15.1203 60S acidic ribosomal protein P2 NA
## 4 LmjF35.2420 phosphoinositide-binding protein, putative NA
## 5 LmjF16.0140 eukaryotic translation initiation factor 1A, putative NA
## 6 LmjF32.2180 hypothetical protein, conserved 0.7839
## rel_pep_ratio1 meanlmmcmca2 rel_pep_ratio2 meanlmmcmca3 rel_pep_ratio3 meanlmmcmca4
## 1 454.31 3.077 101.2 6.500 319.11 3.1091
## 2 0.00 NA 0.0 7.122 308.26 3.3414
## 3 0.00 2.695 163.5 6.020 91.37 2.4791
## 4 0.00 4.264 798.9 5.273 0.00 0.9933
## 5 0.00 3.257 138.9 5.509 96.61 1.0225
## 6 91.71 2.749 419.4 5.710 0.00 NA
## rel_pep_ratio4 meanlmmcmca_all
## 1 183.26 3.929
## 2 328.36 3.907
## 3 156.92 3.731
## 4 12.14 3.510
## 5 0.00 3.263
## 6 23.26 3.081
donovani_s2$accession = gsub(pattern="LmjF", replacement="LmjF\\.", x=donovani_s2$accession)
tri_sci_sig_fas_sec = merge(tri_sci_sig_fas_sec, donovani_s2, by.x="ID", by.y="accession", all.x=TRUE)
## The following table comes from 10.1371/journal.pntd.0000842
## PLoS Negl Trop Dis. 2010 Oct; 4(10): e842
## It is table 1 from: Leishmania effector molecules identified by CMAT.
## It required a small amount of manual playing to make into a usable csv
kima = read.csv("reference/kima_table.csv")
tri_sci_sig_fas_sec = merge(tri_sci_sig_fas_sec, kima, by.x="ID", by.y="name", all.x=TRUE)
new_columns = c("secretome_name","secretome_exosome_boolean","secretome_boolean","communication_name","communication_hit_type","lds2_id","lds2_lmmcma1","lds2_relpepratio1","lds2_lmmcma2","lds2_relpepratio2","lds2_lmmcma3","lds2_relpepratio3","lds2_lmmcma4","lds2_relpepratio4","lds2_meanlmmcma","kima_clone","kima_homolog","kima_chromosome","kima_signal","kima_comment")
all_columns = c(all_columns, new_columns)
colnames(tri_sci_sig_fas_sec) = all_columns
rnaseq_human_mouse = read.csv("csv/geneID_Lmajor_human_mouse_DEbothup_metac_amast4.txt", header=FALSE)
rnaseq_human_mouse$rnaseq_hsmmup = 1
rnaseq_fourhpi = read.csv("csv/TableS7-4hpi.csv", header=TRUE)
tri_sci_sig_fas_sec_rna = merge(tri_sci_sig_fas_sec, rnaseq_human_mouse, by.x="ID", by.y="V1", all.x=TRUE)
tri_sci_sig_fas_sec_rna = merge(tri_sci_sig_fas_sec_rna, rnaseq_fourhpi, by.x="ID", by.y="ID", all.x=TRUE)
new_columns = c("rnaseq_hsmmup","rnaseq_4hpidesc","rnaseq_4hpifc","rnaseq_4hpipval","rnaseq_4hpipregofun","rnaseq_4hpipredgoproc")
all_columns = c(all_columns, new_columns)
colnames(tri_sci_sig_fas_sec_rna) = all_columns
tri_sci_sig_fas_sec_rna$rnaseq_hsmmup[is.na(tri_sci_sig_fas_sec_rna$rnaseq_hsmmup)] = 0
##write_xls(data=merged_de, file="merged")
Since I do not have the number of genes alone for brucei/cruzi, I will cheat and subtract the number observed from the size of the two genomes and assume that is correct (it probably is not).
library(TxDb.Lmajor.friedlin.TriTrypDB27)
library(org.Lmajor.friedlin.eg.db)
library(Leishmania.major.Friedlin)
summary(genes(Leishmania.major.Friedlin))
## [1] "GRanges object with 9377 ranges and 1 metadata column"
## 9377
library(TxDb.Tbrucei.TREU927.TriTrypDB27)
library(org.Tbrucei.TREU927.eg.db)
library(Trypanosoma.brucei.treu927)
summary(genes(Trypanosoma.brucei.treu927))
## [1] "GRanges object with 12093 ranges and 1 metadata column"
## 12093
library(TxDb.Tcruzi.CLBrenerEsmeraldo.TriTrypDB27)
library(org.Tcruzi.CLBrenerEsmeraldo.eg.db)
library(Trypanosoma.cruzi.CLBrener.Esmeraldo)
## 10597
summary(genes(Trypanosoma.cruzi.CLBrener.Esmeraldo))
## [1] "GRanges object with 10597 ranges and 1 metadata column"
rnaseq <- read.csv("csv/lmajor_metac_vs_amast4.csv")
head(rnaseq)
## X logFC AveExpr t P.Value adj.P.Val B fold_change
## 1 LmjF.19.1530-1 1.480 8.453 14.50 8.359e-14 3.599e-10 21.44 2.7899
## 2 LmjF.26.0640-1 1.822 7.266 14.49 8.488e-14 3.599e-10 21.31 3.5367
## 3 LmjF.36.2030-1 1.387 8.786 11.68 1.026e-11 1.822e-08 16.86 2.6156
## 4 LmjF.26.0620-1 1.472 7.271 11.69 1.007e-11 1.822e-08 16.83 2.7732
## 5 LmjF.17.0890-1 -1.745 6.556 -11.65 1.074e-11 1.822e-08 16.75 0.2984
## 6 LmjF.17.0020-1 2.154 8.166 11.51 1.414e-11 1.998e-08 16.53 4.4518
head(tri_sci_sig_fas_sec)
## ID science_cog
## 1 LmjF.01.0010 nocogLmjF01.0010
## 2 LmjF.01.0020 20332099_cogs
## 3 LmjF.01.0030 19584616_cogs
## 4 LmjF.01.0040 nocogLmjF01.0040
## 5 LmjF.01.0050 nocogLmjF01.0050
## 6 LmjF.01.0060 21566042_cogs
## science_genes
## 1 LmjF.01.0010
## 2 Tc00.1047053506895.50,Tc00.1047053508501.300,LmjF.01.0020,Tb927.09.160.2280
## 3 Tc00.1047053506895.40,Tc00.1047053508501.290,LmjF.01.0030,Tb927.09.160.2260
## 4 LmjF.01.0040
## 5 LmjF.01.0050
## 6 Tc00.1047053506895.30,Tc00.1047053508501.280,LmjF.01.0060,Tb927.09.160.2250
## science_sec_gene science_jaccard science_cog_type science_cog_id
## 1 singleton
## 2 3-way COG [Lm+Tb+Tc] 20332099_cogs
## 3 3-way COG [Lm+Tb+Tc] 19584616_cogs
## 4 singleton
## 5 singleton
## 6 3-way COG [Lm+Tb+Tc] 21566042_cogs
## science_gene_product tritryp_description
## 1 hypothetical protein, unknown function hypothetical protein, unknown function
## 2 hypothetical protein, conserved hypothetical protein, conserved
## 3 MCAK-like kinesin, putative MCAK-like kinesin, putative
## 4 hypothetical protein, unknown function hypothetical protein, unknown function
## 5 carboxylase, putative carboxylase, putative
## 6 hypothetical protein, conserved hypothetical protein, conserved
## tritryp_gene_name tritryp_tm_domains tritryp_ortholog_count tritryp_paralog_count
## 1 null 2 9 0
## 2 null 0 21 0
## 3 null 0 43 1
## 4 null 0 7 0
## 5 null 0 32 1
## 6 null 0 17 0
## tritryp_ortholog_group tritryp_signalp_score tritryp_signalp_peptide tritryp_uniprot
## 1 OG5_183100 NN Sum HMM null
## 2 OG5_148223 null null null
## 3 OG5_127200 null null null
## 4 OG5_183101 NN Sum HMM null
## 5 OG5_126626 null null null
## 6 OG5_148224 NN Sum HMM null
## tritryp_entrez_geneid tritryp_gene_symbol tritryp_brucei_homologs
## 1 null null <NA>
## 2 null null Tb927.9.3670
## 3 null null Tb927.11.3280,Tb927.9.3650
## 4 null null <NA>
## 5 null null Tb927.4.5350,Tb927.8.6970
## 6 null null Tb927.9.3640
## tritryp_cruzi_homologs signalp_end
## 1 <NA> 24
## 2 TcCLB.506895.50,TcCLB.508501.300 NA
## 3 TcCLB.506895.40,TcCLB.509589.30,TcCLB.503999.20,TcCLB.508501.290 NA
## 4 <NA> NA
## 5 TcCLB.504835.20,TcCLB.509913.10 NA
## 6 TcCLB.506895.30,TcCLB.508501.280 NA
## signalp_score signalp_version koh_score fasta_ucsc_id fasta_query_length
## 1 0.863 SignalP-4.1 NA <NA> NA
## 2 NA <NA> NA <NA> NA
## 3 NA <NA> NA ENST00000372224.7 668
## 4 NA <NA> NA <NA> NA
## 5 NA <NA> NA ENST00000376285.4 665
## 6 NA <NA> NA <NA> NA
## fasta_libhit_length fasta_score fasta_e fasta_identity fasta_hit_length
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 725 50.0 3.2e-56 0.373 807
## 4 NA NA NA NA NA
## 5 728 100.7 0.0e+00 0.526 733
## 6 NA NA NA NA NA
## fasta_hit_matches secretome_name secretome_exosome_boolean secretome_boolean
## 1 NA <NA> NA NA
## 2 NA <NA> NA NA
## 3 428 <NA> NA NA
## 4 NA <NA> NA NA
## 5 534 <NA> NA NA
## 6 NA <NA> NA NA
## communication_name communication_hit_type lds2_id lds2_lmmcma1 lds2_relpepratio1
## 1 <NA> <NA> <NA> NA NA
## 2 <NA> <NA> <NA> NA NA
## 3 <NA> <NA> <NA> NA NA
## 4 <NA> <NA> <NA> NA NA
## 5 <NA> <NA> <NA> NA NA
## 6 <NA> <NA> <NA> NA NA
## lds2_lmmcma2 lds2_relpepratio2 lds2_lmmcma3 lds2_relpepratio3 lds2_lmmcma4
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
## lds2_relpepratio4 lds2_meanlmmcma kima_clone kima_homolog kima_chromosome kima_signal
## 1 NA NA <NA> <NA> NA <NA>
## 2 NA NA <NA> <NA> NA <NA>
## 3 NA NA <NA> <NA> NA <NA>
## 4 NA NA <NA> <NA> NA <NA>
## 5 NA NA <NA> <NA> NA <NA>
## 6 NA NA <NA> <NA> NA <NA>
## kima_comment
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
rownames(rnaseq) <- gsub(pattern="\\-1$", replacement="", x=rnaseq[["X"]])
rnaseq_orthologs <- merge(rnaseq, tri_sci_sig_fas_sec, by.x="row.names", by.y="ID", all.x=TRUE)
write.csv(file="csv/rnaseq_orthologs.csv", x=rnaseq_orthologs)
## Lm/Tc: The set of genes where tritryp_cruzi_homologs is not NA
lmtc_orthology <- !is.na(rnaseq_orthologs[["tritryp_cruzi_homologs"]])
sum(lmtc_orthology)
## [1] 6451
lmtc_orthologs <- rnaseq_orthologs[lmtc_orthology,]
lmtc_orthologs_sig <- lmtc_orthologs[["adj.P.Val"]] <= 0.05
sum(lmtc_orthologs_sig)
## [1] 2416
lmtc_orthologs_sig <- lmtc_orthologs[lmtc_orthologs_sig, ]
lmtc_orthologs_sigup <- lmtc_orthologs_sig[["logFC"]] > 0.0
sum(lmtc_orthologs_sigup)
## [1] 1398
lmtc_orthologs_sigup <- lmtc_orthologs[lmtc_orthologs_sigup, ]
write.csv(file="csv/lmtc_orthologs_sigup.csv", x=lmtc_orthologs_sigup)
lmtc_orthologs_sigdown <- lmtc_orthologs_sig[["logFC"]] < 0.0
sum(lmtc_orthologs_sigdown)
## [1] 1018
lmtc_orthologs_sigdown <- lmtc_orthologs[lmtc_orthologs_sigdown, ]
write.csv(file="csv/lmtc_orthologs_sigdown.csv", x=lmtc_orthologs_sigdown)
lmtc_orthologs_sig2up <- lmtc_orthologs_sig[["logFC"]] >= 1.0
sum(lmtc_orthologs_sig2up)
## [1] 86
lmtc_orthologs_sig2up <- lmtc_orthologs[lmtc_orthologs_sig2up, ]
write.csv(file="csv/lmtc_orthologs_sig2up.csv", x=lmtc_orthologs_sig2up)
lmtc_orthologs_sig2down <- lmtc_orthologs_sig[["logFC"]] <= -1.0
sum(lmtc_orthologs_sig2down)
## [1] 115
lmtc_orthologs_sig2down <- lmtc_orthologs[lmtc_orthologs_sig2down, ]
write.csv(file="csv/lmtc_orthologs_sig2down.csv", x=lmtc_orthologs_sig2down)
## Lm/Tb: The set of genes where tritryp_cruzi_homologs is not NA
lmtb_orthology <- !is.na(rnaseq_orthologs[["tritryp_brucei_homologs"]])
sum(lmtb_orthology)
## [1] 6103
lmtb_orthologs <- rnaseq_orthologs[lmtb_orthology,]
lmtb_orthologs_sig <- lmtb_orthologs[["adj.P.Val"]] <= 0.05
sum(lmtb_orthologs_sig)
## [1] 2308
lmtb_orthologs_sig <-lmtb_orthologs[lmtb_orthologs_sig, ]
lmtb_orthologs_sigup <- lmtb_orthologs_sig[["logFC"]] > 0.0
sum(lmtb_orthologs_sigup)
## [1] 1337
lmtb_orthologs_sigup <-lmtb_orthologs[lmtb_orthologs_sigup, ]
write.csv(file="csv/lmtb_orthologs_sigup.csv", x=lmtb_orthologs_sigup)
lmtb_orthologs_sigdown <- lmtb_orthologs_sig[["logFC"]] < 0.0
sum(lmtb_orthologs_sigdown)
## [1] 971
lmtb_orthologs_sigdown <-lmtb_orthologs[lmtb_orthologs_sigdown, ]
write.csv(file="csv/lmtb_orthologs_sigdown.csv", x=lmtb_orthologs_sigdown)
lmtb_orthologs_sig2up <- lmtb_orthologs_sig[["logFC"]] >= 1.0
sum(lmtb_orthologs_sig2up)
## [1] 84
lmtb_orthologs_sig2up <- lmtb_orthologs[lmtb_orthologs_sig2up, ]
write.csv(file="csv/lmtb_orthologs_sig2up.csv", x=lmtb_orthologs_sig2up)
lmtb_orthologs_sig2down <- lmtb_orthologs_sig[["logFC"]] <= -1.0
sum(lmtb_orthologs_sig2down)
## [1] 111
lmtb_orthologs_sig2down <- lmtb_orthologs[lmtb_orthologs_sig2down, ]
write.csv(file="csv/lmtb_orthologs_sig2down.csv", x=lmtb_orthologs_sig2down)
## Lm/Tc/Tb: The set of genes where tritryp_cruzi_homologs is not NA
lmtctb_orthology <- !is.na(rnaseq_orthologs[["tritryp_brucei_homologs"]]) & !is.na(rnaseq_orthologs[["tritryp_cruzi_homologs"]])
sum(lmtctb_orthology)
## [1] 5989
lmtctb_orthologs <- rnaseq_orthologs[lmtctb_orthology,]
lmtctb_orthologs_sig <- lmtctb_orthologs[["adj.P.Val"]] <= 0.05
sum(lmtctb_orthologs_sig)
## [1] 2260
lmtctb_orthologs_sig <-lmtctb_orthologs[lmtctb_orthologs_sig, ]
lmtctb_orthologs_sigup <- lmtctb_orthologs_sig[["logFC"]] > 0.0
sum(lmtctb_orthologs_sigup)
## [1] 1313
lmtctb_orthologs_sigup <-lmtctb_orthologs[lmtctb_orthologs_sigup, ]
write.csv(file="csv/lmtctb_orthologs_sigup.csv", x=lmtctb_orthologs_sigup)
lmtctb_orthologs_sigdown <- lmtctb_orthologs_sig[["logFC"]] < 0.0
sum(lmtctb_orthologs_sigdown)
## [1] 947
lmtctb_orthologs_sigdown <-lmtctb_orthologs[lmtctb_orthologs_sigdown, ]
write.csv(file="csv/lmtctb_orthologs_sigdown.csv", x=lmtctb_orthologs_sigdown)
lmtctb_orthologs_sig2up <- lmtctb_orthologs_sig[["logFC"]] >= 1.0
sum(lmtctb_orthologs_sig2up)
## [1] 84
lmtctb_orthologs_sig2up <- lmtctb_orthologs[lmtctb_orthologs_sig2up, ]
write.csv(file="csv/lmtctb_orthologs_sig2up.csv", x=lmtctb_orthologs_sig2up)
lmtctb_orthologs_sig2down <- lmtctb_orthologs_sig[["logFC"]] <= -1.0
sum(lmtctb_orthologs_sig2down)
## [1] 109
lmtctb_orthologs_sig2down <- lmtctb_orthologs[lmtctb_orthologs_sig2down, ]
write.csv(file="csv/lmtctb_orthologs_sig2down.csv", x=lmtctb_orthologs_sig2down)
## Lm/Tc/NOTTb: The set of genes where tritryp_cruzi_homologs is not NA but brucei is Na
lmtc_notb_orthology <- !is.na(rnaseq_orthologs[["tritryp_cruzi_homologs"]]) & is.na(rnaseq_orthologs[["tritryp_brucei_homologs"]])
sum(lmtc_notb_orthology)
## [1] 462
lmtc_notb_orthologs <- rnaseq_orthologs[lmtc_notb_orthology,]
lmtc_notb_orthologs_sig <- lmtc_notb_orthologs[["adj.P.Val"]] <= 0.05
sum(lmtc_notb_orthologs_sig)
## [1] 156
lmtc_notb_orthologs_sig <-lmtc_notb_orthologs[lmtc_notb_orthologs_sig, ]
lmtc_notb_orthologs_sigup <- lmtc_notb_orthologs_sig[["logFC"]] > 0.0
sum(lmtc_notb_orthologs_sigup)
## [1] 85
lmtc_notb_orthologs_sigup <-lmtc_notb_orthologs[lmtc_notb_orthologs_sigup, ]
write.csv(file="csv/lmtc_notb_orthologs_sigup.csv", x=lmtc_notb_orthologs_sigup)
lmtc_notb_orthologs_sigdown <- lmtc_notb_orthologs_sig[["logFC"]] < 0.0
sum(lmtc_notb_orthologs_sigdown)
## [1] 71
lmtc_notb_orthologs_sigdown <-lmtc_notb_orthologs[lmtc_notb_orthologs_sigdown, ]
write.csv(file="csv/lmtc_notb_orthologs_sigdown.csv", x=lmtc_notb_orthologs_sigdown)
lmtc_notb_orthologs_sig2up <- lmtc_notb_orthologs_sig[["logFC"]] >= 1.0
sum(lmtc_notb_orthologs_sig2up)
## [1] 2
lmtc_notb_orthologs_sig2up <- lmtc_notb_orthologs[lmtc_notb_orthologs_sig2up, ]
write.csv(file="csv/lmtc_notb_orthologs_sig2up.csv", x=lmtc_notb_orthologs_sig2up)
lmtc_notb_orthologs_sig2down <- lmtc_notb_orthologs_sig[["logFC"]] <= -1.0
sum(lmtc_notb_orthologs_sig2down)
## [1] 6
lmtc_notb_orthologs_sig2down <- lmtc_notb_orthologs[lmtc_notb_orthologs_sig2down, ]
write.csv(file="csv/lmtc_notb_orthologs_sig2down.csv", x=lmtc_notb_orthologs_sig2down)
## Lm/Tb/NOTTc: The set of genes where tritryp_brucei_homologs is not NA but cruzi is NA
lmtb_notc_orthology <- !is.na(rnaseq_orthologs[["tritryp_brucei_homologs"]]) & is.na(rnaseq_orthologs[["tritryp_cruzi_homologs"]])
sum(lmtb_notc_orthology)
## [1] 114
lmtb_notc_orthologs <- rnaseq_orthologs[lmtb_notc_orthology,]
lmtb_notc_orthologs_sig <- lmtb_notc_orthologs[["adj.P.Val"]] <= 0.05
sum(lmtb_notc_orthologs_sig)
## [1] 48
lmtb_notc_orthologs_sig <-lmtb_notc_orthologs[lmtb_notc_orthologs_sig, ]
lmtb_notc_orthologs_sigup <- lmtb_notc_orthologs_sig[["logFC"]] > 0.0
sum(lmtb_notc_orthologs_sigup)
## [1] 24
lmtb_notc_orthologs_sigup <-lmtb_notc_orthologs[lmtb_notc_orthologs_sigup, ]
write.csv(file="csv/lmtb_notc_orthologs_sigup.csv", x=lmtb_notc_orthologs_sigup)
lmtb_notc_orthologs_sigdown <- lmtb_notc_orthologs_sig[["logFC"]] < 0.0
sum(lmtb_notc_orthologs_sigdown)
## [1] 24
lmtb_notc_orthologs_sigdown <-lmtb_notc_orthologs[lmtb_notc_orthologs_sigdown, ]
write.csv(file="csv/lmtb_notc_orthologs_sigdown.csv", x=lmtb_notc_orthologs_sigdown)
lmtb_notc_orthologs_sig2up <- lmtb_notc_orthologs_sig[["logFC"]] >= 1.0
sum(lmtb_notc_orthologs_sig2up)
## [1] 0
lmtb_notc_orthologs_sig2up <- lmtb_notc_orthologs[lmtb_notc_orthologs_sig2up, ]
write.csv(file="csv/lmtb_notc_orthologs_sig2up.csv", x=lmtb_notc_orthologs_sig2up)
lmtb_notc_orthologs_sig2down <- lmtb_notc_orthologs_sig[["logFC"]] <= -1.0
sum(lmtb_notc_orthologs_sig2down)
## [1] 2
lmtb_notc_orthologs_sig2down <- lmtb_notc_orthologs[lmtb_notc_orthologs_sig2down, ]
write.csv(file="csv/lmtb_notc_orthologs_sig2down.csv", x=lmtb_notc_orthologs_sig2down)
## Lm/NOTb/NOTTc: The set of genes where tritryp_brucei_homologs is NA and cruzi is NA
lm_notbnotc_orthology <- is.na(rnaseq_orthologs[["tritryp_brucei_homologs"]]) & is.na(rnaseq_orthologs[["tritryp_cruzi_homologs"]])
sum(lm_notbnotc_orthology)
## [1] 1915
lm_notbnotc_orthologs <- rnaseq_orthologs[lm_notbnotc_orthology,]
lm_notbnotc_orthologs_sig <- lm_notbnotc_orthologs[["adj.P.Val"]] <= 0.05
sum(lm_notbnotc_orthologs_sig)
## [1] 760
lm_notbnotc_orthologs_sig <-lm_notbnotc_orthologs[lm_notbnotc_orthologs_sig, ]
lm_notbnotc_orthologs_sigup <- lm_notbnotc_orthologs_sig[["logFC"]] > 0.0
sum(lm_notbnotc_orthologs_sigup)
## [1] 310
lm_notbnotc_orthologs_sigup <-lm_notbnotc_orthologs[lm_notbnotc_orthologs_sigup, ]
write.csv(file="csv/lm_notbnotc_orthologs_sigup.csv", x=lm_notbnotc_orthologs_sigup)
lm_notbnotc_orthologs_sigdown <- lm_notbnotc_orthologs_sig[["logFC"]] < 0.0
sum(lm_notbnotc_orthologs_sigdown)
## [1] 450
lm_notbnotc_orthologs_sigdown <-lm_notbnotc_orthologs[lm_notbnotc_orthologs_sigdown, ]
write.csv(file="csv/lm_notbnotc_orthologs_sigdown.csv", x=lm_notbnotc_orthologs_sigdown)
lm_notbnotc_orthologs_sig2up <- lm_notbnotc_orthologs_sig[["logFC"]] >= 1.0
sum(lm_notbnotc_orthologs_sig2up)
## [1] 27
lm_notbnotc_orthologs_sig2up <- lm_notbnotc_orthologs[lm_notbnotc_orthologs_sig2up, ]
write.csv(file="csv/lm_notbnotc_orthologs_sig2up.csv", x=lm_notbnotc_orthologs_sig2up)
lm_notbnotc_orthologs_sig2down <- lm_notbnotc_orthologs_sig[["logFC"]] <= -1.0
sum(lm_notbnotc_orthologs_sig2down)
## [1] 74
lm_notbnotc_orthologs_sig2down <- lm_notbnotc_orthologs[lm_notbnotc_orthologs_sig2down, ]
write.csv(file="csv/lm_notbnotc_orthologs_sig2down.csv", x=lm_notbnotc_orthologs_sig2down)
library(venneuler)
library(hpgltools)
## This plots the set of p-value ups not log2 limited
ones <- c("Lm"=310, "Tc"=306, "Tb"=358)
twos <- c("Lm&Tc"=85, "Lm&Tb"=24)
threes <- c("Lm&Tb&Tc"=1313)
fun_venn_plot(ones=ones, twos=twos, threes=threes)
## Error: object 'centers' not found
## Repeat for the p-value downs not log2 limited
threes <- c("Lm&Tb&Tc" = 947)
twos <- c("Lm&Tc" = 71, "Lm&Tb" = 24)
ones <- c("Lm" = 450, "Tc" = 326, "Tb" = 380)
fun_venn_plot(ones=ones, twos=twos, threes=threes)
## Error: object 'centers' not found
## Repeat for the p-value ups with log2 limit
threes <- c("Lm&Tb&Tc" = 84)
twos <- c("Lm&Tc" = 2, "Lm&Tb" = 0)
ones <- c("Lm" = 27, "Tc" = 20, "Tb" = 24)
fun_venn_plot(ones=ones, twos=twos, threes=threes)
## Error: object 'centers' not found
## Repeat for the p-value downs with log2 limit
threes <- c("Lm&Tb&Tc" = 109)
twos <- c("Lm&Tc" = 6, "Lm&Tb" = 2)
ones <- c("Lm" = 74, "Tc" = 60, "Tb" = 66)
fun_venn_plot(ones=ones, twos=twos, threes=threes)
## Error: object 'centers' not found
##save(list=c('merged_de'), file="merged_table.RData")
load('merged_table.RData')
library(sqldf)
head(merged_de)
colnames(merged_de)
all_fasta_low_e = sqldf("SELECT * from merged_de WHERE fasta_e < '0.001'")
head(all_fasta_low_e)
de_up = sqldf("SELECT * FROM merged_de_four WHERE fourhpi_fc > '1'")
de_up_sec = sqldf("SELECT * FROM de_up WHERE secretome_name is not NULL and fourhpi_fc > '2'")
de_upup = sqldf("SELECT * FROM merged_de_four WHERE fourhpi_fc > '2'")
de_up_nohuman = sqldf("SELECT * FROM de_up WHERE fasta_e is NULL")
de_up_nohuman_ranked = de_up_nohuman[ order(de_up_nohuman$fourhpi_fc, decreasing=TRUE),]
de_up_nohuman_ranked_200 = head(de_up_nohuman_ranked, n=200)
de_up_nohuman_ranked_notb = sqldf("SELECT * FROM de_up_nohuman_ranked_200 WHERE tritryp_brucei_homologs is NULL")
a = de_up_nohuman_ranked_notb
dim(de_up_nohuman)
de_up_nohuman_signalp = sqldf("SELECT * FROM de_up_nohuman WHERE signalp_score is not NULL")
dim(de_up_nohuman_signalp)
fc_filter2 = sqldf("SELECT * FROM de_up_nohuman_signalp WHERE fourhpi_fc > '2'")
fc_filter = sqldf("SELECT * FROM de_up_nohuman_signalp WHERE fourhpi_fc > '1.5'")
fc_filter_nobrucei = sqldf("SELECT * FROM fc_filter WHERE tritryp_brucei_homologs is NULL")
fc_filter_nohomolog = sqldf("SELECT * FROM fc_filter WHERE tritryp_brucei_homologs is NULL AND tritryp_cruzi_homologs is NULL")
dim(fc_filter_nohomolog)
fc_filter2$secretome_name
fc_filter_nohomolog$secretome_name
secretome_hits = sqldf("SELECT * FROM merged_de_four WHERE secretome_name is not NULL")
Before printing the tables below, perform the following operations.
Once complete, do the following:
Provide a table which is: a. Rank order all by fold change (highest->lowest)
This table includes only columns describing the following: a. ID, FC, tbrucei boolean, mean_cmca > 1.7 boolean, mouse boolean, kima boolean, sum of columns (3,4,5,6)
## Requisite #1: The set of genes which are not in the fasta search between lmajor and homo sapiens.
saloe_start = tri_sci_sig_fas_sec_rna
library("sqldf")
saloe = sqldf("SELECT * FROM saloe_start WHERE fasta_e is NULL")
saloe = saloe[ order(saloe$rnaseq_4hpifc, decreasing=TRUE),]
saloe = saloe[,c("ID","rnaseq_4hpifc","rnaseq_4hpipval","tritryp_brucei_homologs","lds2_meanlmmcma","rnaseq_hsmmup","kima_clone")]
saloe$tritryp_brucei_homologs[is.na(saloe$tritryp_brucei_homologs)] = 0
saloe$tritryp_brucei_homologs[saloe$tritryp_brucei_homologs != 0] = 1
saloe$tritryp_brucei_homologs = as.numeric(saloe$tritryp_brucei_homologs)
saloe$kima_clone[is.na(saloe$kima_clone)] = 0
saloe$lds2_meanlmmcma = as.numeric(saloe$lds2_meanlmmcma)
saloe$lds2_meanlmmcma[is.na(saloe$lds2_meanlmmcma)] = 0
saloe$lds2_meanlmmcma[saloe$lds2_meanlmmcma < 1.7 ] = 0
saloe$lds2_meanlmmcma[saloe$lds2_meanlmmcma >= 1.7 ] = 1
write_xls(data=saloe, file="saloe_v1")
tt <- sm(saveme(filename="xrefs.rda.xz"))