1 An attempt at running Steve’s analyses without significant (any?) modifications.

My goal in this document is to repeat Steve’s analyses without changing anything more signifcant than the fact that I copied his count tables to the directory ‘count_tables’.

When I do change things, I will leave the original text in a non-evaluated block immediately before my modified version. The completely unmodified script is in the scripts/ directory.

I will also break up the various pieces into separate blocks so that I can comment on anything along the way.

require(rtracklayer)
## Loading required package: rtracklayer
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:rtracklayer':
## 
##     space
## The following object is masked from 'package:IRanges':
## 
##     space
## The following object is masked from 'package:S4Vectors':
## 
##     space
## The following object is masked from 'package:stats':
## 
##     lowess
library(RColorBrewer)
library(edgeR)
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(cbcbSEQ)
## Loading required package: corpcor
## Loading required package: preprocessCore
## Loading required package: sva
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.8-23. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
## 
## Attaching package: 'cbcbSEQ'
## The following object is masked from 'package:hpgltools':
## 
##     pcRes
library(hpgltools)

2 Read the count tables, original

I am of course not working in dropbox and I put Steve’s count tables into a directory ‘count_tables’ from within this tree.

setwd('~/Dropbox/lab_work/Leishmania Research/La')

###### Reading & collapsing tables, calculating RPKMs ######
hpgl0721.lmexicana <- read.table("hpgl0721_lmexicana.count",col.names=c("id","hpgl0721"))
hpgl0722.lmexicana <- read.table("hpgl0722_lmexicana.count",col.names=c("id","hpgl0722"))
hpgl0723.lmexicana <- read.table("hpgl0723_lmexicana.count",col.names=c("id","hpgl0723"))
hpgl0724.lmexicana <- read.table("hpgl0724_lmexicana.count",col.names=c("id","hpgl0724"))
hpgl0725.lmexicana <- read.table("hpgl0725_lmexicana.count",col.names=c("id","hpgl0725"))
hpgl0726.lmexicana <- read.table("hpgl0726_lmexicana.count",col.names=c("id","hpgl0726"))
hpgl0727.lmexicana <- read.table("hpgl0727_lmexicana.count",col.names=c("id","hpgl0727"))
hpgl0728.lmexicana <- read.table("hpgl0728_lmexicana.count",col.names=c("id","hpgl0728"))
hpgl0729.lmexicana <- read.table("hpgl0729_lmexicana.count",col.names=c("id","hpgl0729"))
hpgl0730.lmexicana <- read.table("hpgl0730_lmexicana.count",col.names=c("id","hpgl0730"))

In conversations with Steve, it sppears that the first 4 tables were not used.

###### Reading & collapsing tables, calculating RPKMs ######
##hpgl0721.lmexicana <- read.table("count_tables/hpgl0721_lmexicana.count",col.names=c("id","hpgl0721"))
##hpgl0722.lmexicana <- read.table("count_tables/hpgl0722_lmexicana.count",col.names=c("id","hpgl0722"))
##hpgl0723.lmexicana <- read.table("count_tables/hpgl0723_lmexicana.count",col.names=c("id","hpgl0723"))
##hpgl0724.lmexicana <- read.table("count_tables/hpgl0724_lmexicana.count",col.names=c("id","hpgl0724"))
hpgl0725.lmexicana <- read.table("count_tables/hpgl0725_lmexicana.count",col.names=c("id","hpgl0725"))
hpgl0726.lmexicana <- read.table("count_tables/hpgl0726_lmexicana.count",col.names=c("id","hpgl0726"))
hpgl0727.lmexicana <- read.table("count_tables/hpgl0727_lmexicana.count",col.names=c("id","hpgl0727"))
hpgl0728.lmexicana <- read.table("count_tables/hpgl0728_lmexicana.count",col.names=c("id","hpgl0728"))
hpgl0729.lmexicana <- read.table("count_tables/hpgl0729_lmexicana.count",col.names=c("id","hpgl0729"))
hpgl0730.lmexicana <- read.table("count_tables/hpgl0730_lmexicana.count",col.names=c("id","hpgl0730"))

3 Making a list of counts, original

##list.of.data.frames <- list(hpgl0721.lmexicana,hpgl0722.lmexicana,hpgl0723.lmexicana,hpgl0724.lmexicana,hpgl0725.lmexicana,hpgl0726.lmexicana,hpgl0727.lmexicana,hpgl0728.lmexicana,hpgl0729.lmexicana,hpgl0730.lmexicana)
list.of.data.frames <- list(hpgl0725.lmexicana,hpgl0726.lmexicana,hpgl0727.lmexicana,hpgl0728.lmexicana,hpgl0729.lmexicana,hpgl0730.lmexicana)
countsTable = Reduce(function(...) merge(..., by="id", all=T), list.of.data.frames)
countsTable <- countsTable[-c(1:5),]
countsTable <- countsTable[-grep("rfamscan",countsTable$id),]
rownames(countsTable) <- countsTable$id
countsTable <- data.matrix(countsTable[,-1])
## This is interesting, doing this peculiar sapply strsplit/substr operation allows one to have
## non-unique rownames in a matrix.
rownames(countsTable) <- sapply(rownames(countsTable), function(x) strsplit(x,'-')[[1]][1])
rownames(countsTable) <- sapply(rownames(countsTable), function(x) substr(x,6,nchar(x)))
df_list <- list(hpgl0725.lmexicana,
                hpgl0726.lmexicana,
                hpgl0727.lmexicana,
                hpgl0728.lmexicana,
                hpgl0729.lmexicana,
                hpgl0730.lmexicana)
my_counts <- Reduce(function(...) merge(..., by="id", all=TRUE), df_list)
rownames(my_counts) <- my_counts[["id"]]
my_counts <- my_counts[, -1]
keepers_idx <- !grepl(pattern="^_", x=rownames(my_counts))
my_counts <- my_counts[keepers_idx, ]
keepers_idx <- !grepl(pattern="rfamscan", x=rownames(my_counts))
my_counts <- my_counts[keepers_idx, ]
rownames(my_counts) <- gsub(pattern="^exon_", replacement="", x=rownames(my_counts))
rownames(my_counts) <- gsub(pattern="\\-1$", replacement="", x=rownames(my_counts))

4 Summing exons, original

This is an interesting method of getting all the exon data from multiple transcripts into a final count table.

##lmex.collapse <- matrix(0,0,nrow=length(unique(rownames(countsTable))),ncol=10)
## atb changed the static ncol=10 to the number of columns in countsTable
## because we are no longer using 721,722,723,724.
lmex.collapse <- matrix(0,0,nrow=length(unique(rownames(countsTable))),ncol=ncol(countsTable))
rownames(lmex.collapse) <- unique(rownames(countsTable))
for(i in 1:nrow(lmex.collapse)){
  temp.mat <- subset(countsTable,rownames(countsTable) %in% rownames(lmex.collapse)[i])
  if(nrow(temp.mat)==1){
    lmex.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lmex.collapse[i,] <- colSums(countsTable[rownames(countsTable) %in% rownames(lmex.collapse)[i],])
  }
}
## Added by atb
colnames(lmex.collapse) <- colnames(countsTable)

5 My method of summing exons

multi_exon_idx <- grep(pattern="\\-\\d+$", x=rownames(my_counts))
for (idx in multi_exon_idx) {
  gene <- gsub(pattern="\\-\\d+$", replacement="", x=rownames(my_counts)[idx])
  exon_counts <- my_counts[idx, ]
  gene_counts <- my_counts[gene, ]
  total_counts <- gene_counts + exon_counts
  my_counts[gene, ] <- total_counts
}
my_counts <- my_counts[-multi_exon_idx, ]
my_counts <- as.matrix(my_counts)
all.equal(my_counts, lmex.collapse)
## Error in mode(current): object 'lmex.collapse' not found
lmexvitro.countsTable <- read.table("../Lb/20141211_Lmexicana81_434-435_437_440_443_446_453-454_458_462_466_470_491-492_496_500_504_508_CDSonly.count", header=T)
rownames(lmexvitro.countsTable) <- lmexvitro.countsTable$id
lmexvitro.countsTable <- data.matrix(lmexvitro.countsTable[,c(2:6,8:12,14:18)])
rownames(lmexvitro.countsTable) <- sapply(rownames(lmexvitro.countsTable), function(x) strsplit(x,'-')[[1]][1])
lmexvitro.collapse <- matrix(0,0,nrow=length(unique(rownames(lmexvitro.countsTable))),ncol=15)
rownames(lmexvitro.collapse) <- unique(rownames(lmexvitro.countsTable))
for(i in 1:nrow(lmexvitro.collapse)){
  temp.mat <- subset(lmexvitro.countsTable,rownames(lmexvitro.countsTable) %in% rownames(lmexvitro.collapse)[i])
  if(nrow(temp.mat)==1){
    lmexvitro.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lmexvitro.collapse[i,] <- colSums(lmexvitro.countsTable[rownames(lmexvitro.countsTable) %in% rownames(lmexvitro.collapse)[i],])
  }
}

lb.countsTable <- read.table('../Lb/20170215_Lb_allgenes_countTable.txt') #Read in Lb counts
lb.countsTable <- data.matrix(lb.countsTable[,c(2,3,5,16,19,23)])
rownames(lb.countsTable) <- sapply(rownames(lb.countsTable), function(x) strsplit(x,'-')[[1]][1])
lb.collapse <- matrix(0,0,nrow=length(unique(rownames(lb.countsTable))),ncol=6)
rownames(lb.collapse) <- unique(rownames(lb.countsTable))
for(i in 1:nrow(lb.collapse)){
  temp.mat <- subset(lb.countsTable,rownames(lb.countsTable) %in% rownames(lb.collapse)[i])
  if(nrow(temp.mat)==1){
    lb.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lb.collapse[i,] <- colSums(lb.countsTable[rownames(lb.countsTable) %in% rownames(lb.collapse)[i],])
  }
}
lb.collapse <- lb.collapse[!rownames(lb.collapse)=='LbrM.26.0210a',]
rownames(lb.collapse)[rownames(lb.collapse)=='LbrM.20.0191'] <- 'LbrM.10.0191'

{
  lmajor.72counts <- read.table('../Lb/Lmajor_72hrs.txt',header=F,sep='\t',col.names=c("id","lm72.1","lm72.2","lm72.3","lm72.4"))
  hpgl0364.lmajor <- read.table("htseq_HPGL0364_Lmajor60_20140329.txt",col.names=c("id","hpgl0364"))
  hpgl0374.lmajor <- read.table("htseq_HPGL0374_Lmajor60_20140329.txt",col.names=c("id","hpgl0374"))
  hpgl0397.lmajor <- read.table("htseq_HPGL0397_Lmajor60_20140712.txt",col.names=c("id","hpgl0397"))
  hpgl0456.lmajor <- read.table("htseq_HPGL0456_Lmajor60_20141001.txt",col.names=c("id","hpgl0456"))
  hpgl0494.lmajor <- read.table("htseq_HPGL0494_Lmajor60_20141211.txt",col.names=c("id","hpgl0494"))
  hpgl0366.lmajor <- read.table("htseq_HPGL0366_Lmajor60_20140329.txt",col.names=c("id","hpgl0366"))
  hpgl0376.lmajor <- read.table("htseq_HPGL0376_Lmajor60_20140329.txt",col.names=c("id","hpgl0376"))
  hpgl0399.lmajor <- read.table("htseq_HPGL0399_Lmajor60_20140712.txt",col.names=c("id","hpgl0399"))
  hpgl0459.lmajor <- read.table("htseq_HPGL0459_Lmajor60_20141001.txt",col.names=c("id","hpgl0459"))
  hpgl0497.lmajor <- read.table("htseq_HPGL0497_Lmajor60_20141211.txt",col.names=c("id","hpgl0497"))
  hpgl0368.lmajor <- read.table("htseq_HPGL0368_Lmajor60_20140329.txt",col.names=c("id","hpgl0368"))
  hpgl0378.lmajor <- read.table("htseq_HPGL0378_Lmajor60_20140329.txt",col.names=c("id","hpgl0378"))
  hpgl0401.lmajor <- read.table("htseq_HPGL0401_Lmajor60_20140712.txt",col.names=c("id","hpgl0401"))
  hpgl0463.lmajor <- read.table("htseq_HPGL0463_Lmajor60_20141001.txt",col.names=c("id","hpgl0463"))
  hpgl0501.lmajor <- read.table("htseq_HPGL0501_Lmajor60_20141211.txt",col.names=c("id","hpgl0501"))
  hpgl0370.lmajor <- read.table("htseq_HPGL0370_Lmajor60_20140329.txt",col.names=c("id","hpgl0370"))
  hpgl0380.lmajor <- read.table("htseq_HPGL0380_Lmajor60_20140329.txt",col.names=c("id","hpgl0380"))
  hpgl0403.lmajor <- read.table("htseq_HPGL0403_Lmajor60_20140712.txt",col.names=c("id","hpgl0403"))
  hpgl0467.lmajor <- read.table("htseq_HPGL0467_Lmajor60_20141001.txt",col.names=c("id","hpgl0467"))
  hpgl0505.lmajor <- read.table("htseq_HPGL0505_Lmajor60_20141211.txt",col.names=c("id","hpgl0505"))

  list.of.data.frames <- list(hpgl0364.lmajor,hpgl0374.lmajor,hpgl0397.lmajor,hpgl0456.lmajor,hpgl0494.lmajor,
                              hpgl0366.lmajor,hpgl0376.lmajor,hpgl0399.lmajor,hpgl0459.lmajor,hpgl0497.lmajor,
                              hpgl0368.lmajor,hpgl0378.lmajor,hpgl0401.lmajor,hpgl0463.lmajor,hpgl0501.lmajor,
                              hpgl0370.lmajor,hpgl0380.lmajor,hpgl0403.lmajor,hpgl0467.lmajor,hpgl0505.lmajor,
                              lmajor.72counts)
}
lmaj.countsTable = Reduce(function(...) merge(..., by="id", all=T), list.of.data.frames)
lmaj.countsTable <- lmaj.countsTable[-c(1:2,9466:9474),]
rownames(lmaj.countsTable) <- lmaj.countsTable$id
lmaj.countsTable <- data.matrix(lmaj.countsTable[,-1])
rownames(lmaj.countsTable) <- sapply(rownames(lmaj.countsTable), function(x) substr(x,6,nchar(x)))
rownames(lmaj.countsTable) <- sapply(rownames(lmaj.countsTable), function(x) strsplit(x,'-')[[1]][1])
lmaj.collapse <- matrix(0,0,nrow=length(unique(rownames(lmaj.countsTable))),ncol=24)
rownames(lmaj.collapse) <- unique(rownames(lmaj.countsTable))
for(i in 1:nrow(lmaj.collapse)){
  temp.mat <- subset(lmaj.countsTable,rownames(lmaj.countsTable) %in% rownames(lmaj.collapse)[i])
  if(nrow(temp.mat)==1){
    lmaj.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lmaj.collapse[i,] <- colSums(lmaj.countsTable[rownames(lmaj.countsTable) %in% rownames(lmaj.collapse)[i],])
  }
}

lmex.db <- import.gff('../Lb/TriTrypDB-27_LmexicanaMHOMGT2001U1103.gff')
lmex.hist <- data.frame(chr=seqnames(lmex.db),ID=lmex.db$ID,
                        Name=lmex.db$type,
                        size=width(lmex.db),
                        start=start(lmex.db),end=end(lmex.db))
lmex.hist <- lmex.hist[lmex.hist$Name %in% 'CDS',]
lmexhist.collapse <- matrix(0,0,nrow=nrow(lmex.collapse),ncol=2)
rownames(lmexhist.collapse) <- rownames(lmex.collapse)
for(i in 1:nrow(lmexhist.collapse)){
  temp.mat <- lmex.hist[grep(rownames(lmexhist.collapse)[i],lmex.hist$ID),]
  if(nrow(temp.mat)==1){
    lmexhist.collapse[i,2] <- temp.mat$size
  }
  if(nrow(temp.mat)>1){
    lmexhist.collapse[i,2] <- sum(temp.mat$size)
  }
}
lmex_rpkm <- rpkm(lmex.collapse,as.numeric(lmexhist.collapse[,2]))
lmexvitro_rpkm <- rpkm(lmexvitro.collapse,as.numeric(lmexhist.collapse[,2]))

lmex_rpkm <- lmex_rpkm[order(rowSums(lmex_rpkm[,5:10]),decreasing=T),]
datCor <- cor(lmex_rpkm[,5:10])
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=c('DCL.1','DCL.2','DCL.3','DCL.4','DCL.5','DCL.6'),
          labCol=c('DCL.1','DCL.2','DCL.3','DCL.4','DCL.5','DCL.6'),
          scale="none", cellnote = format(round(datCor, 2), nsmall = 0),
          breaks = seq(0,1,0.01),
          notecex=0.5, notecol = 'black', trace="none",
          srtCol=45,main='Pearson correlation (RPKMs)')

lb.db <- import.gff("../Lb/TriTrypDB-26_LbraziliensisMHOMBR75M2904.gff")
lb.hist <- data.frame(chr=seqnames(lb.db),ID=lb.db$ID,
                      Name=lb.db$type,
                      size=width(lb.db),
                      start=start(lb.db),end=end(lb.db))
lb.hist <- lb.hist[lb.hist$Name %in% 'CDS',]
lbhist.collapse <- matrix(0,0,nrow=nrow(lb.collapse),ncol=2)
rownames(lbhist.collapse) <- rownames(lb.collapse)
for(i in 1:nrow(lbhist.collapse)){
  temp.mat <- lb.hist[grep(rownames(lbhist.collapse)[i],lb.hist$ID),]
  if(nrow(temp.mat)==1){
    lbhist.collapse[i,2] <- temp.mat$size
  }
  if(nrow(temp.mat)>1){
    lbhist.collapse[i,2] <- sum(temp.mat$size)
  }
}
lbhist.collapse[rownames(lbhist.collapse)=="LbrM.10.0191",2] <- 645
lb_rpkm <- rpkm(lb.collapse,gene.length=as.numeric(lbhist.collapse[,2]))

lmajor.db <- import.gff('../Lb/TriTrypDB-27_LmajorFriedlin.gff')
lmaj.hist <- data.frame(chr=seqnames(lmajor.db),ID=lmajor.db$ID,
                        Name=lmajor.db$type,
                        size=width(lmajor.db),
                        start=start(lmajor.db),end=end(lmajor.db))
lmaj.hist <- lmaj.hist[lmaj.hist$Name %in% 'exon',]
lmajhist.collapse <- matrix(0,0,nrow=nrow(lmaj.collapse),ncol=2)
rownames(lmajhist.collapse) <- rownames(lmaj.collapse)
for(i in 1:nrow(lmajhist.collapse)){
  temp.mat <- lmaj.hist[grep(rownames(lmajhist.collapse)[i],lmaj.hist$ID),]
  if(nrow(temp.mat)==1){
    lmajhist.collapse[i,2] <- temp.mat$size
  }
  if(nrow(temp.mat)>1){
    lmajhist.collapse[i,2] <- sum(temp.mat$size)
  }
}
lmaj_rpkm <- rpkm(lmaj.collapse,as.numeric(lmajhist.collapse[,2]))

nrow(lmex_rpkm)
nrow(lb_rpkm)
nrow(lmaj_rpkm)

## Coverage of *L. amazonensis* DCL lesions
barplot(colSums(countsTable), las=3, main='Lmx Raw Counts By Sample (DCL)')
dim(lmex_rpkm)

## Coverage of *L. amazonensis* LCL parasite *in vitro* macrophage infection
barplot(colSums(lmexvitro.countsTable), las=3, main='Lmx Raw Counts By Sample (LCL in vitro)')
dim(lmexvitro_rpkm)

## Coverage of *L. braziliensis* LCL lesions
barplot(colSums(lb.countsTable), las=3, main='Lbr Raw Counts By Sample (LCL in vivo)')
dim(lb_rpkm)

## Coverage of *L. major* LCL parasite *in vitro* macrophage infection
barplot(colSums(lmaj.countsTable), las=3, main='Lmj Raw Counts By Sample (LCL in vitro)')
dim(lmaj_rpkm)

## Coverage of all parasites, all conditions
barplot(c(colSums(countsTable),colSums(lmexvitro.countsTable),
          colSums(lb.countsTable),colSums(lmaj.countsTable)),
        las=3)

##### Generate single-reciprocal ortholog tables #####
##Compare Lmex vs. all
lmex_rpkm <- rpkm(lmex.collapse,as.numeric(lmexhist.collapse[,2]))
Lmex.to.paralogs <- read.table("../Lb/Lmx_ama.paralogs.ids.txt")
Lmex.paralogs <- matrix(0,0,ncol=2,nrow=nrow(Lmex.to.paralogs))
row=1
pb <- txtProgressBar(0,nrow(lmex_rpkm),initial=0,style=3)
for(i in 1:nrow(lmex_rpkm)){
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
  if(rownames(lmex_rpkm)[i] %in% Lmex.to.paralogs[,1] & rownames(lmex_rpkm)[i+1] %in% Lmex.to.paralogs[,1]){
    row1 <- which(Lmex.to.paralogs[,1]==rownames(lmex_rpkm)[i]) #row of current gene
    row2 <- which(Lmex.to.paralogs[,1]==rownames(lmex_rpkm)[i+1]) #row of next gene
    if((row2-row1)>1){ #if there are paralogs
      for(j in (row1+1):(row2-1)){ #loop from the row after current gene to row before next gene
        Lmex.paralogs[row,1] <- rownames(lmex_rpkm)[i] #Lmex gene
        Lmex.paralogs[row,2] <- as.character(Lmex.to.paralogs[j,]) #Paralog gene
        row=row+1
      }
    }
  }
}

lmex_hom <- matrix(0,0,ncol=3,nrow=nrow(lmex_rpkm))
lmex_hom[,1] <- rownames(lmex_rpkm)
pb <- txtProgressBar(0,nrow(lmex_rpkm),initial=0,style=3)
for(i in 1:nrow(lmex_rpkm)){
  tempid <- lmex_hom[i,1]
  if(!tempid %in% Lmex.paralogs[,1]){
    lmex_hom[i,2:3] <- 0
  }
  if(tempid %in% Lmex.paralogs[,1]){
    temphom.bra <- subset(Lmex.paralogs, Lmex.paralogs[,1] %in% tempid)
    temphom.bra <- subset(temphom.bra,substr(temphom.bra[,2],1,3)=="Lbr")
    temphom.bra <- subset(temphom.bra,temphom.bra[,2] %in% rownames(lb_rpkm))
    temphom.maj <- subset(Lmex.paralogs, Lmex.paralogs[,1] %in% tempid)
    temphom.maj <- subset(temphom.maj,substr(temphom.maj[,2],1,4)=="LmjF")
    temphom.maj <- subset(temphom.maj,temphom.maj[,2] %in% rownames(lmaj_rpkm))
    if(nrow(temphom.bra)==0 | nrow(temphom.maj)==0){
      lmex_hom[i,2:3] <- 0
    }
    if(nrow(temphom.bra)==1 & nrow(temphom.maj)==1){
      lmex_hom[i,2] <- temphom.bra[,2]
      lmex_hom[i,3] <- temphom.maj[,2]
    }
    if(nrow(temphom.bra)>1 | nrow(temphom.maj)>1){
      lmex_hom[i,2:3] <- 0
    }
  }
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
}
lmex_hom <- lmex_hom[lmex_hom[,2]!=0 & lmex_hom[,3]!=0,]

#Compare Lbraz vs. all
Lb.to.paralogs <- read.table("../Lb/Lbraz.paralogs.ids.txt")
Lb.paralogs <- matrix(0,0,ncol=2,nrow=nrow(Lb.to.paralogs))
row=1
pb <- txtProgressBar(0,nrow(lb_rpkm),initial=0,style=3)
for(i in 1:nrow(lb_rpkm)){
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
  if(rownames(lb_rpkm)[i] %in% Lb.to.paralogs[,1] & rownames(lb_rpkm)[i+1] %in% Lb.to.paralogs[,1]){
    row1 <- which(Lb.to.paralogs[,1]==rownames(lb_rpkm)[i]) #row of current gene
    row2 <- which(Lb.to.paralogs[,1]==rownames(lb_rpkm)[i+1]) #row of next gene
    if((row2-row1)>1){ #if there are paralogs
      for(j in (row1+1):(row2-1)){ #loop from the row after current gene to row before next gene
        Lb.paralogs[row,1] <- rownames(lb_rpkm)[i] #Lmex gene
        Lb.paralogs[row,2] <- as.character(Lb.to.paralogs[j,]) #Paralog gene
        row=row+1
      }
    }
  }
}

lb_hom <- matrix(0,0,ncol=3,nrow=nrow(lb_rpkm))
lb_hom[,1] <- rownames(lb_rpkm)
pb <- txtProgressBar(0,nrow(lb_rpkm),initial=0,style=3)
for(i in 1:nrow(lb_rpkm)){
  tempid <- lb_hom[i,1]
  if(!tempid %in% Lb.paralogs[,1]){
    lb_hom[i,2:3] <- 0
  }
  if(tempid %in% Lb.paralogs[,1]){
    temphom.1 <- subset(Lb.paralogs, Lb.paralogs[,1] %in% tempid)
    temphom.1 <- subset(temphom.1,substr(temphom.1[,2],1,3)=="Lmx")
    temphom.1 <- subset(temphom.1,temphom.1[,2] %in% rownames(lmex_rpkm))
    temphom.2 <- subset(Lb.paralogs, Lb.paralogs[,1] %in% tempid)
    temphom.2 <- subset(temphom.2,substr(temphom.2[,2],1,3)=="Lmj")
    temphom.2 <- subset(temphom.2,temphom.2[,2] %in% rownames(lmaj_rpkm))
    if(nrow(temphom.1)==0 | nrow(temphom.2)==0){
      lb_hom[i,2:3] <- 0
    }
    if(nrow(temphom.1)==1 & nrow(temphom.2)==1){
      lb_hom[i,2] <- temphom.1[,2]
      lb_hom[i,3] <- temphom.2[,2]
    }
    if(nrow(temphom.1)>1 | nrow(temphom.2)>1){
      lb_hom[i,2:3] <- 0
    }
  }
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
}
lb_hom <- lb_hom[lb_hom[,2]!=0 & lb_hom[,3]!=0,]

#Compare Lmaj vs. all
Lmaj.to.paralogs <- read.csv("../Lb/LmjF.paralogs.ids.txt",header=F)
Lmaj.paralogs <- matrix(0,0,ncol=2,nrow=nrow(Lmaj.to.paralogs))
row=1
pb <- txtProgressBar(0,nrow(lmaj_rpkm),initial=0,style=3)
for(i in 1:nrow(lmaj_rpkm)){
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
  if(rownames(lmaj_rpkm)[i] %in% Lmaj.to.paralogs[,1] & rownames(lmaj_rpkm)[i+1] %in% Lmaj.to.paralogs[,1]){
    row1 <- which(Lmaj.to.paralogs[,1]==rownames(lmaj_rpkm)[i]) #row of current gene
    row2 <- which(Lmaj.to.paralogs[,1]==rownames(lmaj_rpkm)[i+1]) #row of next gene
    if((row2-row1)>1){ #if there are paralogs
      for(j in (row1+1):(row2-1)){ #loop from the row after current gene to row before next gene
        Lmaj.paralogs[row,1] <- rownames(lmaj_rpkm)[i] #Lmex gene
        Lmaj.paralogs[row,2] <- as.character(Lmaj.to.paralogs[j,]) #Paralog gene
        row=row+1
      }
    }
  }
}

lmaj_hom <- matrix(0,0,ncol=3,nrow=nrow(lmaj_rpkm))
lmaj_hom[,1] <- rownames(lmaj_rpkm)
pb <- txtProgressBar(0,nrow(lmaj_rpkm),initial=0,style=3)
for(i in 1:nrow(lmaj_rpkm)){
  tempid <- lmaj_hom[i,1]
  if(!tempid %in% Lmaj.paralogs[,1]){
    lmaj_hom[i,2:3] <- 0
  }
  if(tempid %in% Lmaj.paralogs[,1]){
    temphom.1 <- subset(Lmaj.paralogs, Lmaj.paralogs[,1] %in% tempid)
    temphom.1 <- subset(temphom.1,substr(temphom.1[,2],1,3)=="Lmx")
    temphom.1 <- subset(temphom.1,temphom.1[,2] %in% rownames(lmex_rpkm))
    temphom.2 <- subset(Lmaj.paralogs, Lmaj.paralogs[,1] %in% tempid)
    temphom.2 <- subset(temphom.2,substr(temphom.2[,2],1,3)=="Lbr")
    temphom.2 <- subset(temphom.2,temphom.2[,2] %in% rownames(lb_rpkm))
    if(nrow(temphom.1)==0 | nrow(temphom.2)==0){
      lmaj_hom[i,2:3] <- 0
    }
    if(nrow(temphom.1)==1 & nrow(temphom.2)==1){
      lmaj_hom[i,2] <- temphom.1[,2]
      lmaj_hom[i,3] <- temphom.2[,2]
    }
    if(nrow(temphom.1)>1 | nrow(temphom.2)>1){
      lmaj_hom[i,2:3] <- 0
    }
  }
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
}
lmaj_hom <- lmaj_hom[lmaj_hom[,2]!=0 & lmaj_hom[,3]!=0,]

lmex_hom.all <- lmex_hom[lmex_hom[,1] %in% lb_hom[,2] &
                           lmex_hom[,1] %in% lmaj_hom[,2] &
                           lmex_hom[,2] %in% lb_hom[,1] &
                           lmex_hom[,2] %in% lmaj_hom[,3] &
                           lmex_hom[,3] %in% lb_hom[,3] &
                           lmex_hom[,3] %in% lmaj_hom[,1],]
colnames(lmex_hom.all) <- c('lmx.id','lbr.id','lmj.id')

lmex.rpkm.homs <- cbind(rownames(lmex_rpkm),lmex_rpkm[,5:10])
colnames(lmex.rpkm.homs) <- c('lmx.id',paste(rep('lmxvivo',times=6), c(1:6), sep="."))
lmex.rpkm.homs <- merge(lmex.rpkm.homs, lmex_hom.all, by='lmx.id')

lmex.vitro.rpkm.homs <- cbind(rownames(lmexvitro_rpkm),lmexvitro_rpkm)
colnames(lmex.vitro.rpkm.homs) <- c('lmx.id',paste(rep(c("lmxexternal", "lmxvitro4","lmxvitro24",
                                                         "lmxvitro48", "lmxvitro72"),times=3), c(1:15), sep="."))
lmex.vitro.rpkm.homs <- merge(lmex.vitro.rpkm.homs, lmex_hom.all, by='lmx.id')

lb.rpkm.homs <- cbind(rownames(lb_rpkm),lb_rpkm)
colnames(lb.rpkm.homs) <- c('lbr.id',paste(rep('lbrvivo',times=6), c(1:6), sep="."))
lb.rpkm.homs <- merge(lb.rpkm.homs, lmex_hom.all, by='lbr.id')

lmaj.rpkm.homs <- cbind(rownames(lmaj_rpkm),lmaj_rpkm)
colnames(lmaj.rpkm.homs) <- c('lmj.id',paste(c(rep("lmjexternal",times=5),rep("lmjvitro4",times=5),rep("lmjvitro24",times=5),
                                               rep("lmjvitro48",times=5),rep("lmjvitro72",times=4)),c(1:24), sep="."))
lmaj.rpkm.homs <- merge(lmaj.rpkm.homs, lmex_hom.all, by='lmj.id')

list.of.data.frames <- list(lmex.rpkm.homs,lmex.vitro.rpkm.homs,lb.rpkm.homs,lmaj.rpkm.homs)
all.rpkm = Reduce(function(...) merge(..., by="lmx.id", all=T), list.of.data.frames)
all.rpkm <- all.rpkm[,c(2:7,10:24,28:33,36:59)]
all.rpkm <- apply(all.rpkm,2,as.numeric)
rownames(all.rpkm) <- lmex_hom.all[,1]
q.rpkm <- cbind(lmex_hom.all,qNorm(all.rpkm))

dim(lmex_hom)
dim(lb_hom)
dim(lmaj_hom)
head(lmex_hom.all)
nrow(lmex_hom.all)

###### RPKM correlations#####
col.nam <- c(paste(rep('Lmex-vivo',times=6), c(1:6), sep="."),
             rep(c("external","Lmex-vitro4","Lmex-vitro24","Lmex-vitro48","Lmex-vitro72"),times=3),
             paste(rep('Lb-vivo',times=6), c(1:6), sep="."),
             paste(rep("lmjexternal",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro4",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro24",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro48",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro72",times=4),c(1:4), sep='.'))
# Ran correlations using pearson, spearman, and by correlating rpkms of top quartile and bottom three quartiles
#Top quartile of DCL L.mx is 1:2063 after sorting on rowSums of columns 1:6
#Also looked at quantile normalization effect on correlations
#all.rpkm <- all.rpkm[order(rowSums(all.rpkm),decreasing=T),]
datCor <- cor(all.rpkm,method='spearman')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0,1,length=100),
          trace="none",
          srtCol=45,main='Spearman correlation (RPKMs)')

#Quantile normalize rpkm values across samples
q.rpkm <- qNorm(all.rpkm)
datCor <- cor(q.rpkm,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0,1,length=100),
          trace="none",
          srtCol=45,main='Pearson correlation (qnorm RPKMs)')

#Correlation between L. amazonensis samples
lmexvitro_rpkm <- lmexvitro_rpkm[match(rownames(lmex_rpkm),rownames(lmexvitro_rpkm)),]
lmexall.rpkm <- cbind(lmex_rpkm[,5:10],lmexvitro_rpkm)
lmexall.rpkm <- apply(lmexall.rpkm,2,as.numeric)
rownames(lmexall.rpkm) <- rownames(lmex_rpkm)
col.nam <- c(paste(rep('Lmex-vivo',times=6), c(1:6), sep="."),
             rep(c("external", "Lmex-vitro4","Lmex-vitro24", "Lmex-vitro48", "Lmex-vitro72"),times=3))
datCor <- cor(lmexall.rpkm,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0.55,1,length=100),
          trace="none",
          srtCol=45,main='Pearson correlation (Lmx RPKMs)')
qmex.rpkm <- qNorm(lmexall.rpkm)
rownames(qmex.rpkm) <- rownames(lmex_rpkm)
datCor <- cor(qmex.rpkm,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0.55,1,length=100),
          trace="none",
          srtCol=45,main='Pearson correlation (qnorm Lmx RPKMs)')

#######All Leishmania comparison/table generation#####
lmaj_rpkm <- lmaj_rpkm[rownames(lmaj_rpkm) %in% lmajor.tritrypdb[,1],]
lmex_rpkm <- lmex_rpkm[rownames(lmex_rpkm) %in% lmex.tritrypdb[,1],]
lmexvitro_rpkm <- lmexvitro_rpkm[rownames(lmexvitro_rpkm) %in% lmex.tritrypdb[,1],]
lb_rpkm <- lb_rpkm[rownames(lb_rpkm) %in% lb.tritrypdb[,1],]

all_comp <- matrix(0,0,nrow=nrow(all.rpkm)+
                     (nrow(lb_rpkm)-nrow(all.rpkm))+
                     (nrow(lmaj_rpkm)-nrow(all.rpkm))+
                     (nrow(lmex_rpkm)-nrow(all.rpkm)),ncol=16)
common.genes = Reduce(function(...) merge(..., by="lmx.id", all=T), list.of.data.frames)
common.genes <- common.genes[,c(1,8,9)]
all_comp[1:nrow(all.rpkm),2] <- as.character(common.genes[,1])
all_comp[1:nrow(all.rpkm),9] <- as.character(common.genes[,2])
all_comp[1:nrow(all.rpkm),13] <- as.character(common.genes[,3])
all_comp[1:nrow(all.rpkm),4] <- rowMeans(all.rpkm[,1:6])
all_comp[1:nrow(all.rpkm),7] <- rowMeans(all.rpkm[,c(11,16,21)])
all_comp[1:nrow(all.rpkm),11] <- rowMeans(all.rpkm[,22:26])
all_comp[1:nrow(all.rpkm),15] <- rowMeans(all.rpkm[,48:51])
for(i in 1:nrow(all.rpkm)){
  all_comp[i,1] <- as.character(lmex.tritrypdb[lmex.tritrypdb[,1] == as.character(all_comp[i,2]),][,2])
  all_comp[i,5] <- sd(all.rpkm[i,1:6])/sqrt(6)
  all_comp[i,8] <- sd(all.rpkm[i,c(11,16,21)])/sqrt(3)
  all_comp[i,12] <- sd(all.rpkm[i,22:26])/sqrt(6)
  all_comp[i,16] <- sd(all.rpkm[i,48:51])/sqrt(4)
}
lmexvivo_unique <- lmex_rpkm[!rownames(lmex_rpkm) %in% common.genes[,1],]
lmexvitro_unique <- lmexvitro_rpkm[!rownames(lmexvitro_rpkm) %in% common.genes[,1],]
lmexvitro_unique <- lmexvitro_unique[match(rownames(lmexvivo_unique),rownames(lmexvitro_unique)),]
all_comp[(nrow(all.rpkm)+1):nrow(lmex_rpkm),2] <- rownames(lmexvivo_unique)
all_comp[(nrow(all.rpkm)+1):nrow(lmex_rpkm),4] <- rowMeans(lmexvivo_unique[,5:10])
all_comp[(nrow(all.rpkm)+1):nrow(lmex_rpkm),7] <- rowMeans(lmexvitro_unique[,c(5,10,15)])
for(i in (nrow(all.rpkm)+1):nrow(lmex_rpkm)){
  row = i-nrow(all.rpkm)
  all_comp[i,1] <- as.character(lmex.tritrypdb[lmex.tritrypdb[,1] == as.character(all_comp[i,2]),][,2])
  all_comp[i,5] <- sd(lmexvivo_unique[row,5:10])/sqrt(6)
  all_comp[i,8] <- sd(lmexvitro_unique[row,])/sqrt(3)
}
lb_unique <- lb_rpkm[!rownames(lb_rpkm) %in% common.genes[,2],]
all_comp[(nrow(lmex_rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)),9] <- rownames(lb_unique)
all_comp[(nrow(lmex_rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)),11] <- rowMeans(lb_unique)
for(i in (nrow(lmex_rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm))){
  row = i-nrow(lmex_rpkm)
  all_comp[i,1] <- as.character(lb.tritrypdb[lb.tritrypdb[,1] == as.character(all_comp[i,9]),][,2])
  all_comp[i,12] <- sd(lb_unique[row,])/sqrt(6)
}
lmj_unique <- lmaj_rpkm[!rownames(lmaj_rpkm) %in% common.genes[,3],]
lmajor.tritrypdb <- lmajor.tritrypdb[lmajor.tritrypdb[,1] %in% rownames(lmaj_rpkm),]
all_comp[(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+nrow(lmaj_rpkm)-nrow(all.rpkm)),13] <- rownames(lmj_unique)
all_comp[(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+nrow(lmaj_rpkm)-nrow(all.rpkm)),15] <- rowMeans(lmj_unique[,21:24])
for(i in (nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+nrow(lmaj_rpkm)-nrow(all.rpkm))){
  row = i-nrow(lmex_rpkm)-(nrow(lb_rpkm)-nrow(all.rpkm))
  all_comp[i,1] <- as.character(lmajor.tritrypdb[lmajor.tritrypdb[,1] == as.character(all_comp[i,13]),][,2][1])
  all_comp[i,16] <- sd(lmj_unique[row,21:24])/sqrt(4)
}

all_comp <- all_comp[order(as.numeric(all_comp[,4]),decreasing=T),]
all_comp[1:nrow(lmex_rpkm),3] <- c(1:nrow(lmex_rpkm))
all_comp <- all_comp[order(as.numeric(all_comp[,7]),decreasing=T),]
all_comp[1:nrow(lmexvitro_rpkm),6] <- c(1:nrow(lmexvitro_rpkm))
all_comp <- all_comp[order(as.numeric(all_comp[,11]),decreasing=T),]
all_comp[1:nrow(lb_rpkm),10] <- c(1:nrow(lb_rpkm))
all_comp <- all_comp[order(as.numeric(all_comp[,15]),decreasing=T),]
all_comp[1:nrow(lmaj_rpkm),14] <- c(1:nrow(lmaj_rpkm))

write.table(all_comp,'Lmx_vivo_72hrvitro_Lbr_vivo_Lmaj_72hrvitro_genecomp_20180106.txt',sep='\t')

all_comp <- all_comp[order(as.numeric(all_comp[,3])),]
plot(all_comp[4313:4912,3],all_comp[4313:4912,6],pch=19,cex=0.2,
     xlab='Rank (DCL in vivo)',ylab='Rank (LCL in vitro-72hrs)')
abline(h=1000,col='red',lty=2)
la72low <- as.numeric(all_comp[4313:4912,6])>2000
text(x=as.numeric(all_comp[4313:4912,3])[la72low],
     y=as.numeric(all_comp[4313:4912,6])[la72low],
     labels=all_comp[4313:4912,2][la72low], cex=.7, pos=2)
all_comp <- all_comp[all_comp[,2] %in% common.genes[,1],]
for(i in 1:nrow(all_comp)){
  all_comp[i,4] <- 1-(as.numeric(all_comp[i,3])/nrow(lmex_rpkm))
  all_comp[i,7] <- 1-(as.numeric(all_comp[i,6])/nrow(lmex_rpkm))
  all_comp[i,11] <- 1-(as.numeric(all_comp[i,10])/nrow(lb_rpkm))
  all_comp[i,15] <- 1-(as.numeric(all_comp[i,14])/nrow(lmaj_rpkm))
}

lmxrank <- lmex_rpkm[,5:10]
for(i in 1:ncol(lmxrank)){
  lmxrank <- lmxrank[order(as.numeric(lmxrank[,i]),decreasing = T),]
  lmxrank[,i] <- c(1:nrow(lmxrank))
  for(j in 1:nrow(lmxrank)){
    lmxrank[j,i] <- 1-(as.numeric(lmxrank[j,i])/nrow(lmxrank))
  }
}
lmxrank <- lmxrank[rownames(lmxrank) %in% all_comp[,2],]
lmxrank <- lmxrank[match(all_comp[,2],rownames(lmxrank)),]
lmxvitrorank <- lmexvitro_rpkm[,c(5,10,15)]
for(i in 1:ncol(lmxvitrorank)){
  lmxvitrorank <- lmxvitrorank[order(as.numeric(lmxvitrorank[,i]),decreasing = T),]
  lmxvitrorank[,i] <- c(1:nrow(lmxvitrorank))
  for(j in 1:nrow(lmxvitrorank)){
    lmxvitrorank[j,i] <- 1-(as.numeric(lmxvitrorank[j,i])/nrow(lmxvitrorank))
  }
}
lmxvitrorank <- lmxvitrorank[rownames(lmxvitrorank) %in% all_comp[,2],]
lmxvitrorank <- lmxvitrorank[match(all_comp[,2],rownames(lmxvitrorank)),]
lbrrank <- lb_rpkm
for(i in 1:ncol(lbrrank)){
  lbrrank <- lbrrank[order(as.numeric(lbrrank[,i]),decreasing = T),]
  lbrrank[,i] <- c(1:nrow(lbrrank))
  for(j in 1:nrow(lbrrank)){
    lbrrank[j,i] <- 1-(as.numeric(lbrrank[j,i])/nrow(lbrrank))
  }
}
lbrrank <- lbrrank[rownames(lbrrank) %in% all_comp[,9],]
lbrrank <- lbrrank[match(all_comp[,9],rownames(lbrrank)),]
lmajrank <- lmaj_rpkm[,21:24]
for(i in 1:ncol(lmajrank)){
  lmajrank <- lmajrank[order(as.numeric(lmajrank[,i]),decreasing = T),]
  lmajrank[,i] <- c(1:nrow(lmajrank))
  for(j in 1:nrow(lmajrank)){
    lmajrank[j,i] <- 1-(as.numeric(lmajrank[j,i])/nrow(lmajrank))
  }
}
lmajrank <- lmajrank[rownames(lmajrank) %in% all_comp[,13],]
lmajrank <- lmajrank[match(all_comp[,13],rownames(lmajrank)),]

all_pctl <- cbind(lmxrank,lmxvitrorank,lbrrank,lmajrank)
pctl <- all_pctl
heatmap.2(pctl[rowMeans(all_pctl[,1:6])>0.95,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
heatmap.2(pctl[rowMeans(all_pctl[,7:9])>0.95,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
heatmap.2(pctl[rowMeans(all_pctl[,10:15])>0.95,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
heatmap.2(pctl[rowMeans(all_pctl[,16:19])>0.9,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
datCor <- cor(pctl,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          scale="none",breaks=seq(0,1,length=101),
          trace="none",col= colorRampPalette(brewer.pal(9, "GnBu"))(100),
          srtCol=45,main='Pearson correlation')

q.all <- log2CPM(all_pctl,
                 lib.size = c(colSums(lmex_rpkm)[5:10],colSums(lmexvitro_rpkm)[c(5,10,15)],
                              colSums(lb_rpkm),colSums(lmaj_rpkm)[21:24]))
s <- makeSVD(all_pctl)
col <- c(rep("red",times=6),rep("blue",times=3),rep("orange",times=6),rep("purple",times=4))
plotPC(s$v,
       s$d,
       col="black",
       pch=22,
       bg=col, cex=2.6)
library(pca3d)
pca_3d <- prcomp(t(all_pctl))
pca3d(pca_3d, components = 1:3,
      group = col)
pcRes(s$v, s$d, col)[1:5,]


pctl <- read.table('allspec_rank.txt',sep='\t',header=T)[,1:4]

sum(as.numeric(pctl[,1]) > 0.95 & as.numeric(pctl[,2]) > 0.95)/sum(pctl[,1] > 0.95)
sum(as.numeric(pctl[,1]) > 0.95 & as.numeric(pctl[,3]) > 0.95)/sum(pctl[,1] > 0.95)
sum(as.numeric(pctl[,1]) > 0.95 & as.numeric(pctl[,4]) > 0.95)/sum(pctl[,1] > 0.95)
sum(as.numeric(pctl[,2]) > 0.95 & as.numeric(pctl[,1]) > 0.95)/sum(pctl[,2] > 0.95)
sum(as.numeric(pctl[,2]) > 0.95 & as.numeric(pctl[,3]) > 0.95)/sum(pctl[,2] > 0.95)
sum(as.numeric(pctl[,2]) > 0.95 & as.numeric(pctl[,4]) > 0.95)/sum(pctl[,2] > 0.95)
sum(as.numeric(pctl[,3]) > 0.95 & as.numeric(pctl[,1]) > 0.95)/sum(pctl[,3] > 0.95)
sum(as.numeric(pctl[,3]) > 0.95 & as.numeric(pctl[,2]) > 0.95)/sum(pctl[,3] > 0.95)
sum(as.numeric(pctl[,3]) > 0.95 & as.numeric(pctl[,4]) > 0.95)/sum(pctl[,3] > 0.95)
sum(as.numeric(pctl[,4]) > 0.95 & as.numeric(pctl[,1]) > 0.95)/sum(pctl[,4] > 0.95)
sum(as.numeric(pctl[,4]) > 0.95 & as.numeric(pctl[,2]) > 0.95)/sum(pctl[,4] > 0.95)
sum(as.numeric(pctl[,4]) > 0.95 & as.numeric(pctl[,3]) > 0.95)/sum(pctl[,4] > 0.95)

pctl <- pctl[order(as.numeric(pctl[,1]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,1] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
pctl <- pctl[order(as.numeric(pctl[,2]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,2] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
pctl <- pctl[order(as.numeric(pctl[,3]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,3] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
pctl <- pctl[order(as.numeric(pctl[,4]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,4] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
---
title: "Leishmania strains 201805: Collecting annotation information."
author: "atb abelew@gmail.com"
date: "`r Sys.Date()`"
output:
 html_document:
  code_download: true
  code_folding: show
  fig_caption: true
  fig_height: 7
  fig_width: 7
  highlight: default
  keep_md: false
  mode: selfcontained
  number_sections: true
  self_contained: true
  theme: readable
  toc: true
  toc_float:
   collapsed: false
   smooth_scroll: false
---

<style>
  body .main-container {
    max-width: 1600px;
  }
</style>

```{r options, include=FALSE}
if (!isTRUE(get0("skip_load"))) {
  library(hpgltools)
  tt <- sm(devtools::load_all("~/hpgltools"))
  knitr::opts_knit$set(progress=TRUE,
                       verbose=TRUE,
                       width=90,
                       echo=TRUE)
  knitr::opts_chunk$set(error=TRUE,
                        fig.width=8,
                        fig.height=8,
                        dpi=96)
  old_options <- options(digits=4,
                         stringsAsFactors=FALSE,
                         knitr.duplicate.label="allow")
  ggplot2::theme_set(ggplot2::theme_bw(base_size=12))
  ver <- "20180507"
  previous_file <- paste0("index_v", ver, ".Rmd")

  tmp <- try(sm(loadme(filename=gsub(pattern="\\.Rmd", replace="\\.rda\\.xz", x=previous_file))))
  rmd_file <- paste0("20180220_leish_compall.Rmd")
  savefile <- gsub(pattern="\\.Rmd", replace="\\.rda\\.xz", x=rmd_file)
}
```

# An attempt at running Steve's analyses without significant (any?) modifications.

My goal in this document is to repeat Steve's analyses without changing anything
more signifcant than the fact that I copied his count tables to the directory
'count_tables'.

When I do change things, I will leave the original text in a non-evaluated block
immediately before my modified version.  The completely unmodified script is in
the scripts/ directory.

I will also break up the various pieces into separate blocks so that I can
comment on anything along the way.

```{r loading}
require(rtracklayer)
library(gplots)
library(RColorBrewer)
library(edgeR)
library(cbcbSEQ)
library(hpgltools)
```

# Read the count tables, original

I am of course not working in dropbox and I put Steve's count tables into a
directory 'count_tables' from within this tree.

```{r count_tables, eval=FALSE}
setwd('~/Dropbox/lab_work/Leishmania Research/La')

###### Reading & collapsing tables, calculating RPKMs ######
hpgl0721.lmexicana <- read.table("hpgl0721_lmexicana.count",col.names=c("id","hpgl0721"))
hpgl0722.lmexicana <- read.table("hpgl0722_lmexicana.count",col.names=c("id","hpgl0722"))
hpgl0723.lmexicana <- read.table("hpgl0723_lmexicana.count",col.names=c("id","hpgl0723"))
hpgl0724.lmexicana <- read.table("hpgl0724_lmexicana.count",col.names=c("id","hpgl0724"))
hpgl0725.lmexicana <- read.table("hpgl0725_lmexicana.count",col.names=c("id","hpgl0725"))
hpgl0726.lmexicana <- read.table("hpgl0726_lmexicana.count",col.names=c("id","hpgl0726"))
hpgl0727.lmexicana <- read.table("hpgl0727_lmexicana.count",col.names=c("id","hpgl0727"))
hpgl0728.lmexicana <- read.table("hpgl0728_lmexicana.count",col.names=c("id","hpgl0728"))
hpgl0729.lmexicana <- read.table("hpgl0729_lmexicana.count",col.names=c("id","hpgl0729"))
hpgl0730.lmexicana <- read.table("hpgl0730_lmexicana.count",col.names=c("id","hpgl0730"))
```

In conversations with Steve, it sppears that the first 4 tables were not used.

```{r my_count_tables}
###### Reading & collapsing tables, calculating RPKMs ######
##hpgl0721.lmexicana <- read.table("count_tables/hpgl0721_lmexicana.count",col.names=c("id","hpgl0721"))
##hpgl0722.lmexicana <- read.table("count_tables/hpgl0722_lmexicana.count",col.names=c("id","hpgl0722"))
##hpgl0723.lmexicana <- read.table("count_tables/hpgl0723_lmexicana.count",col.names=c("id","hpgl0723"))
##hpgl0724.lmexicana <- read.table("count_tables/hpgl0724_lmexicana.count",col.names=c("id","hpgl0724"))
hpgl0725.lmexicana <- read.table("count_tables/hpgl0725_lmexicana.count",col.names=c("id","hpgl0725"))
hpgl0726.lmexicana <- read.table("count_tables/hpgl0726_lmexicana.count",col.names=c("id","hpgl0726"))
hpgl0727.lmexicana <- read.table("count_tables/hpgl0727_lmexicana.count",col.names=c("id","hpgl0727"))
hpgl0728.lmexicana <- read.table("count_tables/hpgl0728_lmexicana.count",col.names=c("id","hpgl0728"))
hpgl0729.lmexicana <- read.table("count_tables/hpgl0729_lmexicana.count",col.names=c("id","hpgl0729"))
hpgl0730.lmexicana <- read.table("count_tables/hpgl0730_lmexicana.count",col.names=c("id","hpgl0730"))
```

# Making a list of counts, original

```{r count_list_original}
##list.of.data.frames <- list(hpgl0721.lmexicana,hpgl0722.lmexicana,hpgl0723.lmexicana,hpgl0724.lmexicana,hpgl0725.lmexicana,hpgl0726.lmexicana,hpgl0727.lmexicana,hpgl0728.lmexicana,hpgl0729.lmexicana,hpgl0730.lmexicana)
list.of.data.frames <- list(hpgl0725.lmexicana,hpgl0726.lmexicana,hpgl0727.lmexicana,hpgl0728.lmexicana,hpgl0729.lmexicana,hpgl0730.lmexicana)
countsTable = Reduce(function(...) merge(..., by="id", all=T), list.of.data.frames)
countsTable <- countsTable[-c(1:5),]
countsTable <- countsTable[-grep("rfamscan",countsTable$id),]
rownames(countsTable) <- countsTable$id
countsTable <- data.matrix(countsTable[,-1])
## This is interesting, doing this peculiar sapply strsplit/substr operation allows one to have
## non-unique rownames in a matrix.
rownames(countsTable) <- sapply(rownames(countsTable), function(x) strsplit(x,'-')[[1]][1])
rownames(countsTable) <- sapply(rownames(countsTable), function(x) substr(x,6,nchar(x)))
```

```{r my_count_list}
df_list <- list(hpgl0725.lmexicana,
                hpgl0726.lmexicana,
                hpgl0727.lmexicana,
                hpgl0728.lmexicana,
                hpgl0729.lmexicana,
                hpgl0730.lmexicana)
my_counts <- Reduce(function(...) merge(..., by="id", all=TRUE), df_list)
rownames(my_counts) <- my_counts[["id"]]
my_counts <- my_counts[, -1]
keepers_idx <- !grepl(pattern="^_", x=rownames(my_counts))
my_counts <- my_counts[keepers_idx, ]
keepers_idx <- !grepl(pattern="rfamscan", x=rownames(my_counts))
my_counts <- my_counts[keepers_idx, ]
rownames(my_counts) <- gsub(pattern="^exon_", replacement="", x=rownames(my_counts))
rownames(my_counts) <- gsub(pattern="\\-1$", replacement="", x=rownames(my_counts))
```

# Summing exons, original

This is an interesting method of getting all the exon data from multiple
transcripts into a final count table.

```{r sum_exons_original, eval=FALSE}
##lmex.collapse <- matrix(0,0,nrow=length(unique(rownames(countsTable))),ncol=10)
## atb changed the static ncol=10 to the number of columns in countsTable
## because we are no longer using 721,722,723,724.
lmex.collapse <- matrix(0,0,nrow=length(unique(rownames(countsTable))),ncol=ncol(countsTable))
rownames(lmex.collapse) <- unique(rownames(countsTable))
for(i in 1:nrow(lmex.collapse)){
  temp.mat <- subset(countsTable,rownames(countsTable) %in% rownames(lmex.collapse)[i])
  if(nrow(temp.mat)==1){
    lmex.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lmex.collapse[i,] <- colSums(countsTable[rownames(countsTable) %in% rownames(lmex.collapse)[i],])
  }
}
## Added by atb
colnames(lmex.collapse) <- colnames(countsTable)
```

# My method of summing exons

```{r sum_exons}
multi_exon_idx <- grep(pattern="\\-\\d+$", x=rownames(my_counts))
for (idx in multi_exon_idx) {
  gene <- gsub(pattern="\\-\\d+$", replacement="", x=rownames(my_counts)[idx])
  exon_counts <- my_counts[idx, ]
  gene_counts <- my_counts[gene, ]
  total_counts <- gene_counts + exon_counts
  my_counts[gene, ] <- total_counts
}
my_counts <- my_counts[-multi_exon_idx, ]
my_counts <- as.matrix(my_counts)
all.equal(my_counts, lmex.collapse)
```



```{r the_rest, eval=FALSE}
lmexvitro.countsTable <- read.table("../Lb/20141211_Lmexicana81_434-435_437_440_443_446_453-454_458_462_466_470_491-492_496_500_504_508_CDSonly.count", header=T)
rownames(lmexvitro.countsTable) <- lmexvitro.countsTable$id
lmexvitro.countsTable <- data.matrix(lmexvitro.countsTable[,c(2:6,8:12,14:18)])
rownames(lmexvitro.countsTable) <- sapply(rownames(lmexvitro.countsTable), function(x) strsplit(x,'-')[[1]][1])
lmexvitro.collapse <- matrix(0,0,nrow=length(unique(rownames(lmexvitro.countsTable))),ncol=15)
rownames(lmexvitro.collapse) <- unique(rownames(lmexvitro.countsTable))
for(i in 1:nrow(lmexvitro.collapse)){
  temp.mat <- subset(lmexvitro.countsTable,rownames(lmexvitro.countsTable) %in% rownames(lmexvitro.collapse)[i])
  if(nrow(temp.mat)==1){
    lmexvitro.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lmexvitro.collapse[i,] <- colSums(lmexvitro.countsTable[rownames(lmexvitro.countsTable) %in% rownames(lmexvitro.collapse)[i],])
  }
}

lb.countsTable <- read.table('../Lb/20170215_Lb_allgenes_countTable.txt') #Read in Lb counts
lb.countsTable <- data.matrix(lb.countsTable[,c(2,3,5,16,19,23)])
rownames(lb.countsTable) <- sapply(rownames(lb.countsTable), function(x) strsplit(x,'-')[[1]][1])
lb.collapse <- matrix(0,0,nrow=length(unique(rownames(lb.countsTable))),ncol=6)
rownames(lb.collapse) <- unique(rownames(lb.countsTable))
for(i in 1:nrow(lb.collapse)){
  temp.mat <- subset(lb.countsTable,rownames(lb.countsTable) %in% rownames(lb.collapse)[i])
  if(nrow(temp.mat)==1){
    lb.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lb.collapse[i,] <- colSums(lb.countsTable[rownames(lb.countsTable) %in% rownames(lb.collapse)[i],])
  }
}
lb.collapse <- lb.collapse[!rownames(lb.collapse)=='LbrM.26.0210a',]
rownames(lb.collapse)[rownames(lb.collapse)=='LbrM.20.0191'] <- 'LbrM.10.0191'

{
  lmajor.72counts <- read.table('../Lb/Lmajor_72hrs.txt',header=F,sep='\t',col.names=c("id","lm72.1","lm72.2","lm72.3","lm72.4"))
  hpgl0364.lmajor <- read.table("htseq_HPGL0364_Lmajor60_20140329.txt",col.names=c("id","hpgl0364"))
  hpgl0374.lmajor <- read.table("htseq_HPGL0374_Lmajor60_20140329.txt",col.names=c("id","hpgl0374"))
  hpgl0397.lmajor <- read.table("htseq_HPGL0397_Lmajor60_20140712.txt",col.names=c("id","hpgl0397"))
  hpgl0456.lmajor <- read.table("htseq_HPGL0456_Lmajor60_20141001.txt",col.names=c("id","hpgl0456"))
  hpgl0494.lmajor <- read.table("htseq_HPGL0494_Lmajor60_20141211.txt",col.names=c("id","hpgl0494"))
  hpgl0366.lmajor <- read.table("htseq_HPGL0366_Lmajor60_20140329.txt",col.names=c("id","hpgl0366"))
  hpgl0376.lmajor <- read.table("htseq_HPGL0376_Lmajor60_20140329.txt",col.names=c("id","hpgl0376"))
  hpgl0399.lmajor <- read.table("htseq_HPGL0399_Lmajor60_20140712.txt",col.names=c("id","hpgl0399"))
  hpgl0459.lmajor <- read.table("htseq_HPGL0459_Lmajor60_20141001.txt",col.names=c("id","hpgl0459"))
  hpgl0497.lmajor <- read.table("htseq_HPGL0497_Lmajor60_20141211.txt",col.names=c("id","hpgl0497"))
  hpgl0368.lmajor <- read.table("htseq_HPGL0368_Lmajor60_20140329.txt",col.names=c("id","hpgl0368"))
  hpgl0378.lmajor <- read.table("htseq_HPGL0378_Lmajor60_20140329.txt",col.names=c("id","hpgl0378"))
  hpgl0401.lmajor <- read.table("htseq_HPGL0401_Lmajor60_20140712.txt",col.names=c("id","hpgl0401"))
  hpgl0463.lmajor <- read.table("htseq_HPGL0463_Lmajor60_20141001.txt",col.names=c("id","hpgl0463"))
  hpgl0501.lmajor <- read.table("htseq_HPGL0501_Lmajor60_20141211.txt",col.names=c("id","hpgl0501"))
  hpgl0370.lmajor <- read.table("htseq_HPGL0370_Lmajor60_20140329.txt",col.names=c("id","hpgl0370"))
  hpgl0380.lmajor <- read.table("htseq_HPGL0380_Lmajor60_20140329.txt",col.names=c("id","hpgl0380"))
  hpgl0403.lmajor <- read.table("htseq_HPGL0403_Lmajor60_20140712.txt",col.names=c("id","hpgl0403"))
  hpgl0467.lmajor <- read.table("htseq_HPGL0467_Lmajor60_20141001.txt",col.names=c("id","hpgl0467"))
  hpgl0505.lmajor <- read.table("htseq_HPGL0505_Lmajor60_20141211.txt",col.names=c("id","hpgl0505"))

  list.of.data.frames <- list(hpgl0364.lmajor,hpgl0374.lmajor,hpgl0397.lmajor,hpgl0456.lmajor,hpgl0494.lmajor,
                              hpgl0366.lmajor,hpgl0376.lmajor,hpgl0399.lmajor,hpgl0459.lmajor,hpgl0497.lmajor,
                              hpgl0368.lmajor,hpgl0378.lmajor,hpgl0401.lmajor,hpgl0463.lmajor,hpgl0501.lmajor,
                              hpgl0370.lmajor,hpgl0380.lmajor,hpgl0403.lmajor,hpgl0467.lmajor,hpgl0505.lmajor,
                              lmajor.72counts)
}
lmaj.countsTable = Reduce(function(...) merge(..., by="id", all=T), list.of.data.frames)
lmaj.countsTable <- lmaj.countsTable[-c(1:2,9466:9474),]
rownames(lmaj.countsTable) <- lmaj.countsTable$id
lmaj.countsTable <- data.matrix(lmaj.countsTable[,-1])
rownames(lmaj.countsTable) <- sapply(rownames(lmaj.countsTable), function(x) substr(x,6,nchar(x)))
rownames(lmaj.countsTable) <- sapply(rownames(lmaj.countsTable), function(x) strsplit(x,'-')[[1]][1])
lmaj.collapse <- matrix(0,0,nrow=length(unique(rownames(lmaj.countsTable))),ncol=24)
rownames(lmaj.collapse) <- unique(rownames(lmaj.countsTable))
for(i in 1:nrow(lmaj.collapse)){
  temp.mat <- subset(lmaj.countsTable,rownames(lmaj.countsTable) %in% rownames(lmaj.collapse)[i])
  if(nrow(temp.mat)==1){
    lmaj.collapse[i,] <- temp.mat
  }
  if(nrow(temp.mat)>1){
    lmaj.collapse[i,] <- colSums(lmaj.countsTable[rownames(lmaj.countsTable) %in% rownames(lmaj.collapse)[i],])
  }
}

lmex.db <- import.gff('../Lb/TriTrypDB-27_LmexicanaMHOMGT2001U1103.gff')
lmex.hist <- data.frame(chr=seqnames(lmex.db),ID=lmex.db$ID,
                        Name=lmex.db$type,
                        size=width(lmex.db),
                        start=start(lmex.db),end=end(lmex.db))
lmex.hist <- lmex.hist[lmex.hist$Name %in% 'CDS',]
lmexhist.collapse <- matrix(0,0,nrow=nrow(lmex.collapse),ncol=2)
rownames(lmexhist.collapse) <- rownames(lmex.collapse)
for(i in 1:nrow(lmexhist.collapse)){
  temp.mat <- lmex.hist[grep(rownames(lmexhist.collapse)[i],lmex.hist$ID),]
  if(nrow(temp.mat)==1){
    lmexhist.collapse[i,2] <- temp.mat$size
  }
  if(nrow(temp.mat)>1){
    lmexhist.collapse[i,2] <- sum(temp.mat$size)
  }
}
lmex_rpkm <- rpkm(lmex.collapse,as.numeric(lmexhist.collapse[,2]))
lmexvitro_rpkm <- rpkm(lmexvitro.collapse,as.numeric(lmexhist.collapse[,2]))

lmex_rpkm <- lmex_rpkm[order(rowSums(lmex_rpkm[,5:10]),decreasing=T),]
datCor <- cor(lmex_rpkm[,5:10])
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=c('DCL.1','DCL.2','DCL.3','DCL.4','DCL.5','DCL.6'),
          labCol=c('DCL.1','DCL.2','DCL.3','DCL.4','DCL.5','DCL.6'),
          scale="none", cellnote = format(round(datCor, 2), nsmall = 0),
          breaks = seq(0,1,0.01),
          notecex=0.5, notecol = 'black', trace="none",
          srtCol=45,main='Pearson correlation (RPKMs)')

lb.db <- import.gff("../Lb/TriTrypDB-26_LbraziliensisMHOMBR75M2904.gff")
lb.hist <- data.frame(chr=seqnames(lb.db),ID=lb.db$ID,
                      Name=lb.db$type,
                      size=width(lb.db),
                      start=start(lb.db),end=end(lb.db))
lb.hist <- lb.hist[lb.hist$Name %in% 'CDS',]
lbhist.collapse <- matrix(0,0,nrow=nrow(lb.collapse),ncol=2)
rownames(lbhist.collapse) <- rownames(lb.collapse)
for(i in 1:nrow(lbhist.collapse)){
  temp.mat <- lb.hist[grep(rownames(lbhist.collapse)[i],lb.hist$ID),]
  if(nrow(temp.mat)==1){
    lbhist.collapse[i,2] <- temp.mat$size
  }
  if(nrow(temp.mat)>1){
    lbhist.collapse[i,2] <- sum(temp.mat$size)
  }
}
lbhist.collapse[rownames(lbhist.collapse)=="LbrM.10.0191",2] <- 645
lb_rpkm <- rpkm(lb.collapse,gene.length=as.numeric(lbhist.collapse[,2]))

lmajor.db <- import.gff('../Lb/TriTrypDB-27_LmajorFriedlin.gff')
lmaj.hist <- data.frame(chr=seqnames(lmajor.db),ID=lmajor.db$ID,
                        Name=lmajor.db$type,
                        size=width(lmajor.db),
                        start=start(lmajor.db),end=end(lmajor.db))
lmaj.hist <- lmaj.hist[lmaj.hist$Name %in% 'exon',]
lmajhist.collapse <- matrix(0,0,nrow=nrow(lmaj.collapse),ncol=2)
rownames(lmajhist.collapse) <- rownames(lmaj.collapse)
for(i in 1:nrow(lmajhist.collapse)){
  temp.mat <- lmaj.hist[grep(rownames(lmajhist.collapse)[i],lmaj.hist$ID),]
  if(nrow(temp.mat)==1){
    lmajhist.collapse[i,2] <- temp.mat$size
  }
  if(nrow(temp.mat)>1){
    lmajhist.collapse[i,2] <- sum(temp.mat$size)
  }
}
lmaj_rpkm <- rpkm(lmaj.collapse,as.numeric(lmajhist.collapse[,2]))

nrow(lmex_rpkm)
nrow(lb_rpkm)
nrow(lmaj_rpkm)

## Coverage of *L. amazonensis* DCL lesions
barplot(colSums(countsTable), las=3, main='Lmx Raw Counts By Sample (DCL)')
dim(lmex_rpkm)

## Coverage of *L. amazonensis* LCL parasite *in vitro* macrophage infection
barplot(colSums(lmexvitro.countsTable), las=3, main='Lmx Raw Counts By Sample (LCL in vitro)')
dim(lmexvitro_rpkm)

## Coverage of *L. braziliensis* LCL lesions
barplot(colSums(lb.countsTable), las=3, main='Lbr Raw Counts By Sample (LCL in vivo)')
dim(lb_rpkm)

## Coverage of *L. major* LCL parasite *in vitro* macrophage infection
barplot(colSums(lmaj.countsTable), las=3, main='Lmj Raw Counts By Sample (LCL in vitro)')
dim(lmaj_rpkm)

## Coverage of all parasites, all conditions
barplot(c(colSums(countsTable),colSums(lmexvitro.countsTable),
          colSums(lb.countsTable),colSums(lmaj.countsTable)),
        las=3)

##### Generate single-reciprocal ortholog tables #####
##Compare Lmex vs. all
lmex_rpkm <- rpkm(lmex.collapse,as.numeric(lmexhist.collapse[,2]))
Lmex.to.paralogs <- read.table("../Lb/Lmx_ama.paralogs.ids.txt")
Lmex.paralogs <- matrix(0,0,ncol=2,nrow=nrow(Lmex.to.paralogs))
row=1
pb <- txtProgressBar(0,nrow(lmex_rpkm),initial=0,style=3)
for(i in 1:nrow(lmex_rpkm)){
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
  if(rownames(lmex_rpkm)[i] %in% Lmex.to.paralogs[,1] & rownames(lmex_rpkm)[i+1] %in% Lmex.to.paralogs[,1]){
    row1 <- which(Lmex.to.paralogs[,1]==rownames(lmex_rpkm)[i]) #row of current gene
    row2 <- which(Lmex.to.paralogs[,1]==rownames(lmex_rpkm)[i+1]) #row of next gene
    if((row2-row1)>1){ #if there are paralogs
      for(j in (row1+1):(row2-1)){ #loop from the row after current gene to row before next gene
        Lmex.paralogs[row,1] <- rownames(lmex_rpkm)[i] #Lmex gene
        Lmex.paralogs[row,2] <- as.character(Lmex.to.paralogs[j,]) #Paralog gene
        row=row+1
      }
    }
  }
}

lmex_hom <- matrix(0,0,ncol=3,nrow=nrow(lmex_rpkm))
lmex_hom[,1] <- rownames(lmex_rpkm)
pb <- txtProgressBar(0,nrow(lmex_rpkm),initial=0,style=3)
for(i in 1:nrow(lmex_rpkm)){
  tempid <- lmex_hom[i,1]
  if(!tempid %in% Lmex.paralogs[,1]){
    lmex_hom[i,2:3] <- 0
  }
  if(tempid %in% Lmex.paralogs[,1]){
    temphom.bra <- subset(Lmex.paralogs, Lmex.paralogs[,1] %in% tempid)
    temphom.bra <- subset(temphom.bra,substr(temphom.bra[,2],1,3)=="Lbr")
    temphom.bra <- subset(temphom.bra,temphom.bra[,2] %in% rownames(lb_rpkm))
    temphom.maj <- subset(Lmex.paralogs, Lmex.paralogs[,1] %in% tempid)
    temphom.maj <- subset(temphom.maj,substr(temphom.maj[,2],1,4)=="LmjF")
    temphom.maj <- subset(temphom.maj,temphom.maj[,2] %in% rownames(lmaj_rpkm))
    if(nrow(temphom.bra)==0 | nrow(temphom.maj)==0){
      lmex_hom[i,2:3] <- 0
    }
    if(nrow(temphom.bra)==1 & nrow(temphom.maj)==1){
      lmex_hom[i,2] <- temphom.bra[,2]
      lmex_hom[i,3] <- temphom.maj[,2]
    }
    if(nrow(temphom.bra)>1 | nrow(temphom.maj)>1){
      lmex_hom[i,2:3] <- 0
    }
  }
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
}
lmex_hom <- lmex_hom[lmex_hom[,2]!=0 & lmex_hom[,3]!=0,]

#Compare Lbraz vs. all
Lb.to.paralogs <- read.table("../Lb/Lbraz.paralogs.ids.txt")
Lb.paralogs <- matrix(0,0,ncol=2,nrow=nrow(Lb.to.paralogs))
row=1
pb <- txtProgressBar(0,nrow(lb_rpkm),initial=0,style=3)
for(i in 1:nrow(lb_rpkm)){
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
  if(rownames(lb_rpkm)[i] %in% Lb.to.paralogs[,1] & rownames(lb_rpkm)[i+1] %in% Lb.to.paralogs[,1]){
    row1 <- which(Lb.to.paralogs[,1]==rownames(lb_rpkm)[i]) #row of current gene
    row2 <- which(Lb.to.paralogs[,1]==rownames(lb_rpkm)[i+1]) #row of next gene
    if((row2-row1)>1){ #if there are paralogs
      for(j in (row1+1):(row2-1)){ #loop from the row after current gene to row before next gene
        Lb.paralogs[row,1] <- rownames(lb_rpkm)[i] #Lmex gene
        Lb.paralogs[row,2] <- as.character(Lb.to.paralogs[j,]) #Paralog gene
        row=row+1
      }
    }
  }
}

lb_hom <- matrix(0,0,ncol=3,nrow=nrow(lb_rpkm))
lb_hom[,1] <- rownames(lb_rpkm)
pb <- txtProgressBar(0,nrow(lb_rpkm),initial=0,style=3)
for(i in 1:nrow(lb_rpkm)){
  tempid <- lb_hom[i,1]
  if(!tempid %in% Lb.paralogs[,1]){
    lb_hom[i,2:3] <- 0
  }
  if(tempid %in% Lb.paralogs[,1]){
    temphom.1 <- subset(Lb.paralogs, Lb.paralogs[,1] %in% tempid)
    temphom.1 <- subset(temphom.1,substr(temphom.1[,2],1,3)=="Lmx")
    temphom.1 <- subset(temphom.1,temphom.1[,2] %in% rownames(lmex_rpkm))
    temphom.2 <- subset(Lb.paralogs, Lb.paralogs[,1] %in% tempid)
    temphom.2 <- subset(temphom.2,substr(temphom.2[,2],1,3)=="Lmj")
    temphom.2 <- subset(temphom.2,temphom.2[,2] %in% rownames(lmaj_rpkm))
    if(nrow(temphom.1)==0 | nrow(temphom.2)==0){
      lb_hom[i,2:3] <- 0
    }
    if(nrow(temphom.1)==1 & nrow(temphom.2)==1){
      lb_hom[i,2] <- temphom.1[,2]
      lb_hom[i,3] <- temphom.2[,2]
    }
    if(nrow(temphom.1)>1 | nrow(temphom.2)>1){
      lb_hom[i,2:3] <- 0
    }
  }
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
}
lb_hom <- lb_hom[lb_hom[,2]!=0 & lb_hom[,3]!=0,]

#Compare Lmaj vs. all
Lmaj.to.paralogs <- read.csv("../Lb/LmjF.paralogs.ids.txt",header=F)
Lmaj.paralogs <- matrix(0,0,ncol=2,nrow=nrow(Lmaj.to.paralogs))
row=1
pb <- txtProgressBar(0,nrow(lmaj_rpkm),initial=0,style=3)
for(i in 1:nrow(lmaj_rpkm)){
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
  if(rownames(lmaj_rpkm)[i] %in% Lmaj.to.paralogs[,1] & rownames(lmaj_rpkm)[i+1] %in% Lmaj.to.paralogs[,1]){
    row1 <- which(Lmaj.to.paralogs[,1]==rownames(lmaj_rpkm)[i]) #row of current gene
    row2 <- which(Lmaj.to.paralogs[,1]==rownames(lmaj_rpkm)[i+1]) #row of next gene
    if((row2-row1)>1){ #if there are paralogs
      for(j in (row1+1):(row2-1)){ #loop from the row after current gene to row before next gene
        Lmaj.paralogs[row,1] <- rownames(lmaj_rpkm)[i] #Lmex gene
        Lmaj.paralogs[row,2] <- as.character(Lmaj.to.paralogs[j,]) #Paralog gene
        row=row+1
      }
    }
  }
}

lmaj_hom <- matrix(0,0,ncol=3,nrow=nrow(lmaj_rpkm))
lmaj_hom[,1] <- rownames(lmaj_rpkm)
pb <- txtProgressBar(0,nrow(lmaj_rpkm),initial=0,style=3)
for(i in 1:nrow(lmaj_rpkm)){
  tempid <- lmaj_hom[i,1]
  if(!tempid %in% Lmaj.paralogs[,1]){
    lmaj_hom[i,2:3] <- 0
  }
  if(tempid %in% Lmaj.paralogs[,1]){
    temphom.1 <- subset(Lmaj.paralogs, Lmaj.paralogs[,1] %in% tempid)
    temphom.1 <- subset(temphom.1,substr(temphom.1[,2],1,3)=="Lmx")
    temphom.1 <- subset(temphom.1,temphom.1[,2] %in% rownames(lmex_rpkm))
    temphom.2 <- subset(Lmaj.paralogs, Lmaj.paralogs[,1] %in% tempid)
    temphom.2 <- subset(temphom.2,substr(temphom.2[,2],1,3)=="Lbr")
    temphom.2 <- subset(temphom.2,temphom.2[,2] %in% rownames(lb_rpkm))
    if(nrow(temphom.1)==0 | nrow(temphom.2)==0){
      lmaj_hom[i,2:3] <- 0
    }
    if(nrow(temphom.1)==1 & nrow(temphom.2)==1){
      lmaj_hom[i,2] <- temphom.1[,2]
      lmaj_hom[i,3] <- temphom.2[,2]
    }
    if(nrow(temphom.1)>1 | nrow(temphom.2)>1){
      lmaj_hom[i,2:3] <- 0
    }
  }
  Sys.sleep(0.01); setTxtProgressBar(pb,i)
}
lmaj_hom <- lmaj_hom[lmaj_hom[,2]!=0 & lmaj_hom[,3]!=0,]

lmex_hom.all <- lmex_hom[lmex_hom[,1] %in% lb_hom[,2] &
                           lmex_hom[,1] %in% lmaj_hom[,2] &
                           lmex_hom[,2] %in% lb_hom[,1] &
                           lmex_hom[,2] %in% lmaj_hom[,3] &
                           lmex_hom[,3] %in% lb_hom[,3] &
                           lmex_hom[,3] %in% lmaj_hom[,1],]
colnames(lmex_hom.all) <- c('lmx.id','lbr.id','lmj.id')

lmex.rpkm.homs <- cbind(rownames(lmex_rpkm),lmex_rpkm[,5:10])
colnames(lmex.rpkm.homs) <- c('lmx.id',paste(rep('lmxvivo',times=6), c(1:6), sep="."))
lmex.rpkm.homs <- merge(lmex.rpkm.homs, lmex_hom.all, by='lmx.id')

lmex.vitro.rpkm.homs <- cbind(rownames(lmexvitro_rpkm),lmexvitro_rpkm)
colnames(lmex.vitro.rpkm.homs) <- c('lmx.id',paste(rep(c("lmxexternal", "lmxvitro4","lmxvitro24",
                                                         "lmxvitro48", "lmxvitro72"),times=3), c(1:15), sep="."))
lmex.vitro.rpkm.homs <- merge(lmex.vitro.rpkm.homs, lmex_hom.all, by='lmx.id')

lb.rpkm.homs <- cbind(rownames(lb_rpkm),lb_rpkm)
colnames(lb.rpkm.homs) <- c('lbr.id',paste(rep('lbrvivo',times=6), c(1:6), sep="."))
lb.rpkm.homs <- merge(lb.rpkm.homs, lmex_hom.all, by='lbr.id')

lmaj.rpkm.homs <- cbind(rownames(lmaj_rpkm),lmaj_rpkm)
colnames(lmaj.rpkm.homs) <- c('lmj.id',paste(c(rep("lmjexternal",times=5),rep("lmjvitro4",times=5),rep("lmjvitro24",times=5),
                                               rep("lmjvitro48",times=5),rep("lmjvitro72",times=4)),c(1:24), sep="."))
lmaj.rpkm.homs <- merge(lmaj.rpkm.homs, lmex_hom.all, by='lmj.id')

list.of.data.frames <- list(lmex.rpkm.homs,lmex.vitro.rpkm.homs,lb.rpkm.homs,lmaj.rpkm.homs)
all.rpkm = Reduce(function(...) merge(..., by="lmx.id", all=T), list.of.data.frames)
all.rpkm <- all.rpkm[,c(2:7,10:24,28:33,36:59)]
all.rpkm <- apply(all.rpkm,2,as.numeric)
rownames(all.rpkm) <- lmex_hom.all[,1]
q.rpkm <- cbind(lmex_hom.all,qNorm(all.rpkm))

dim(lmex_hom)
dim(lb_hom)
dim(lmaj_hom)
head(lmex_hom.all)
nrow(lmex_hom.all)

###### RPKM correlations#####
col.nam <- c(paste(rep('Lmex-vivo',times=6), c(1:6), sep="."),
             rep(c("external","Lmex-vitro4","Lmex-vitro24","Lmex-vitro48","Lmex-vitro72"),times=3),
             paste(rep('Lb-vivo',times=6), c(1:6), sep="."),
             paste(rep("lmjexternal",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro4",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro24",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro48",times=5),c(1:5), sep='.'),
             paste(rep("lmjvitro72",times=4),c(1:4), sep='.'))
# Ran correlations using pearson, spearman, and by correlating rpkms of top quartile and bottom three quartiles
#Top quartile of DCL L.mx is 1:2063 after sorting on rowSums of columns 1:6
#Also looked at quantile normalization effect on correlations
#all.rpkm <- all.rpkm[order(rowSums(all.rpkm),decreasing=T),]
datCor <- cor(all.rpkm,method='spearman')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0,1,length=100),
          trace="none",
          srtCol=45,main='Spearman correlation (RPKMs)')

#Quantile normalize rpkm values across samples
q.rpkm <- qNorm(all.rpkm)
datCor <- cor(q.rpkm,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0,1,length=100),
          trace="none",
          srtCol=45,main='Pearson correlation (qnorm RPKMs)')

#Correlation between L. amazonensis samples
lmexvitro_rpkm <- lmexvitro_rpkm[match(rownames(lmex_rpkm),rownames(lmexvitro_rpkm)),]
lmexall.rpkm <- cbind(lmex_rpkm[,5:10],lmexvitro_rpkm)
lmexall.rpkm <- apply(lmexall.rpkm,2,as.numeric)
rownames(lmexall.rpkm) <- rownames(lmex_rpkm)
col.nam <- c(paste(rep('Lmex-vivo',times=6), c(1:6), sep="."),
             rep(c("external", "Lmex-vitro4","Lmex-vitro24", "Lmex-vitro48", "Lmex-vitro72"),times=3))
datCor <- cor(lmexall.rpkm,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0.55,1,length=100),
          trace="none",
          srtCol=45,main='Pearson correlation (Lmx RPKMs)')
qmex.rpkm <- qNorm(lmexall.rpkm)
rownames(qmex.rpkm) <- rownames(lmex_rpkm)
datCor <- cor(qmex.rpkm,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          labRow=col.nam,
          labCol=col.nam,
          scale="none",breaks=seq(0.55,1,length=100),
          trace="none",
          srtCol=45,main='Pearson correlation (qnorm Lmx RPKMs)')

#######All Leishmania comparison/table generation#####
lmaj_rpkm <- lmaj_rpkm[rownames(lmaj_rpkm) %in% lmajor.tritrypdb[,1],]
lmex_rpkm <- lmex_rpkm[rownames(lmex_rpkm) %in% lmex.tritrypdb[,1],]
lmexvitro_rpkm <- lmexvitro_rpkm[rownames(lmexvitro_rpkm) %in% lmex.tritrypdb[,1],]
lb_rpkm <- lb_rpkm[rownames(lb_rpkm) %in% lb.tritrypdb[,1],]

all_comp <- matrix(0,0,nrow=nrow(all.rpkm)+
                     (nrow(lb_rpkm)-nrow(all.rpkm))+
                     (nrow(lmaj_rpkm)-nrow(all.rpkm))+
                     (nrow(lmex_rpkm)-nrow(all.rpkm)),ncol=16)
common.genes = Reduce(function(...) merge(..., by="lmx.id", all=T), list.of.data.frames)
common.genes <- common.genes[,c(1,8,9)]
all_comp[1:nrow(all.rpkm),2] <- as.character(common.genes[,1])
all_comp[1:nrow(all.rpkm),9] <- as.character(common.genes[,2])
all_comp[1:nrow(all.rpkm),13] <- as.character(common.genes[,3])
all_comp[1:nrow(all.rpkm),4] <- rowMeans(all.rpkm[,1:6])
all_comp[1:nrow(all.rpkm),7] <- rowMeans(all.rpkm[,c(11,16,21)])
all_comp[1:nrow(all.rpkm),11] <- rowMeans(all.rpkm[,22:26])
all_comp[1:nrow(all.rpkm),15] <- rowMeans(all.rpkm[,48:51])
for(i in 1:nrow(all.rpkm)){
  all_comp[i,1] <- as.character(lmex.tritrypdb[lmex.tritrypdb[,1] == as.character(all_comp[i,2]),][,2])
  all_comp[i,5] <- sd(all.rpkm[i,1:6])/sqrt(6)
  all_comp[i,8] <- sd(all.rpkm[i,c(11,16,21)])/sqrt(3)
  all_comp[i,12] <- sd(all.rpkm[i,22:26])/sqrt(6)
  all_comp[i,16] <- sd(all.rpkm[i,48:51])/sqrt(4)
}
lmexvivo_unique <- lmex_rpkm[!rownames(lmex_rpkm) %in% common.genes[,1],]
lmexvitro_unique <- lmexvitro_rpkm[!rownames(lmexvitro_rpkm) %in% common.genes[,1],]
lmexvitro_unique <- lmexvitro_unique[match(rownames(lmexvivo_unique),rownames(lmexvitro_unique)),]
all_comp[(nrow(all.rpkm)+1):nrow(lmex_rpkm),2] <- rownames(lmexvivo_unique)
all_comp[(nrow(all.rpkm)+1):nrow(lmex_rpkm),4] <- rowMeans(lmexvivo_unique[,5:10])
all_comp[(nrow(all.rpkm)+1):nrow(lmex_rpkm),7] <- rowMeans(lmexvitro_unique[,c(5,10,15)])
for(i in (nrow(all.rpkm)+1):nrow(lmex_rpkm)){
  row = i-nrow(all.rpkm)
  all_comp[i,1] <- as.character(lmex.tritrypdb[lmex.tritrypdb[,1] == as.character(all_comp[i,2]),][,2])
  all_comp[i,5] <- sd(lmexvivo_unique[row,5:10])/sqrt(6)
  all_comp[i,8] <- sd(lmexvitro_unique[row,])/sqrt(3)
}
lb_unique <- lb_rpkm[!rownames(lb_rpkm) %in% common.genes[,2],]
all_comp[(nrow(lmex_rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)),9] <- rownames(lb_unique)
all_comp[(nrow(lmex_rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)),11] <- rowMeans(lb_unique)
for(i in (nrow(lmex_rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm))){
  row = i-nrow(lmex_rpkm)
  all_comp[i,1] <- as.character(lb.tritrypdb[lb.tritrypdb[,1] == as.character(all_comp[i,9]),][,2])
  all_comp[i,12] <- sd(lb_unique[row,])/sqrt(6)
}
lmj_unique <- lmaj_rpkm[!rownames(lmaj_rpkm) %in% common.genes[,3],]
lmajor.tritrypdb <- lmajor.tritrypdb[lmajor.tritrypdb[,1] %in% rownames(lmaj_rpkm),]
all_comp[(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+nrow(lmaj_rpkm)-nrow(all.rpkm)),13] <- rownames(lmj_unique)
all_comp[(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+nrow(lmaj_rpkm)-nrow(all.rpkm)),15] <- rowMeans(lmj_unique[,21:24])
for(i in (nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+1):(nrow(lmex_rpkm)+nrow(lb_rpkm)-nrow(all.rpkm)+nrow(lmaj_rpkm)-nrow(all.rpkm))){
  row = i-nrow(lmex_rpkm)-(nrow(lb_rpkm)-nrow(all.rpkm))
  all_comp[i,1] <- as.character(lmajor.tritrypdb[lmajor.tritrypdb[,1] == as.character(all_comp[i,13]),][,2][1])
  all_comp[i,16] <- sd(lmj_unique[row,21:24])/sqrt(4)
}

all_comp <- all_comp[order(as.numeric(all_comp[,4]),decreasing=T),]
all_comp[1:nrow(lmex_rpkm),3] <- c(1:nrow(lmex_rpkm))
all_comp <- all_comp[order(as.numeric(all_comp[,7]),decreasing=T),]
all_comp[1:nrow(lmexvitro_rpkm),6] <- c(1:nrow(lmexvitro_rpkm))
all_comp <- all_comp[order(as.numeric(all_comp[,11]),decreasing=T),]
all_comp[1:nrow(lb_rpkm),10] <- c(1:nrow(lb_rpkm))
all_comp <- all_comp[order(as.numeric(all_comp[,15]),decreasing=T),]
all_comp[1:nrow(lmaj_rpkm),14] <- c(1:nrow(lmaj_rpkm))

write.table(all_comp,'Lmx_vivo_72hrvitro_Lbr_vivo_Lmaj_72hrvitro_genecomp_20180106.txt',sep='\t')

all_comp <- all_comp[order(as.numeric(all_comp[,3])),]
plot(all_comp[4313:4912,3],all_comp[4313:4912,6],pch=19,cex=0.2,
     xlab='Rank (DCL in vivo)',ylab='Rank (LCL in vitro-72hrs)')
abline(h=1000,col='red',lty=2)
la72low <- as.numeric(all_comp[4313:4912,6])>2000
text(x=as.numeric(all_comp[4313:4912,3])[la72low],
     y=as.numeric(all_comp[4313:4912,6])[la72low],
     labels=all_comp[4313:4912,2][la72low], cex=.7, pos=2)
all_comp <- all_comp[all_comp[,2] %in% common.genes[,1],]
for(i in 1:nrow(all_comp)){
  all_comp[i,4] <- 1-(as.numeric(all_comp[i,3])/nrow(lmex_rpkm))
  all_comp[i,7] <- 1-(as.numeric(all_comp[i,6])/nrow(lmex_rpkm))
  all_comp[i,11] <- 1-(as.numeric(all_comp[i,10])/nrow(lb_rpkm))
  all_comp[i,15] <- 1-(as.numeric(all_comp[i,14])/nrow(lmaj_rpkm))
}

lmxrank <- lmex_rpkm[,5:10]
for(i in 1:ncol(lmxrank)){
  lmxrank <- lmxrank[order(as.numeric(lmxrank[,i]),decreasing = T),]
  lmxrank[,i] <- c(1:nrow(lmxrank))
  for(j in 1:nrow(lmxrank)){
    lmxrank[j,i] <- 1-(as.numeric(lmxrank[j,i])/nrow(lmxrank))
  }
}
lmxrank <- lmxrank[rownames(lmxrank) %in% all_comp[,2],]
lmxrank <- lmxrank[match(all_comp[,2],rownames(lmxrank)),]
lmxvitrorank <- lmexvitro_rpkm[,c(5,10,15)]
for(i in 1:ncol(lmxvitrorank)){
  lmxvitrorank <- lmxvitrorank[order(as.numeric(lmxvitrorank[,i]),decreasing = T),]
  lmxvitrorank[,i] <- c(1:nrow(lmxvitrorank))
  for(j in 1:nrow(lmxvitrorank)){
    lmxvitrorank[j,i] <- 1-(as.numeric(lmxvitrorank[j,i])/nrow(lmxvitrorank))
  }
}
lmxvitrorank <- lmxvitrorank[rownames(lmxvitrorank) %in% all_comp[,2],]
lmxvitrorank <- lmxvitrorank[match(all_comp[,2],rownames(lmxvitrorank)),]
lbrrank <- lb_rpkm
for(i in 1:ncol(lbrrank)){
  lbrrank <- lbrrank[order(as.numeric(lbrrank[,i]),decreasing = T),]
  lbrrank[,i] <- c(1:nrow(lbrrank))
  for(j in 1:nrow(lbrrank)){
    lbrrank[j,i] <- 1-(as.numeric(lbrrank[j,i])/nrow(lbrrank))
  }
}
lbrrank <- lbrrank[rownames(lbrrank) %in% all_comp[,9],]
lbrrank <- lbrrank[match(all_comp[,9],rownames(lbrrank)),]
lmajrank <- lmaj_rpkm[,21:24]
for(i in 1:ncol(lmajrank)){
  lmajrank <- lmajrank[order(as.numeric(lmajrank[,i]),decreasing = T),]
  lmajrank[,i] <- c(1:nrow(lmajrank))
  for(j in 1:nrow(lmajrank)){
    lmajrank[j,i] <- 1-(as.numeric(lmajrank[j,i])/nrow(lmajrank))
  }
}
lmajrank <- lmajrank[rownames(lmajrank) %in% all_comp[,13],]
lmajrank <- lmajrank[match(all_comp[,13],rownames(lmajrank)),]

all_pctl <- cbind(lmxrank,lmxvitrorank,lbrrank,lmajrank)
pctl <- all_pctl
heatmap.2(pctl[rowMeans(all_pctl[,1:6])>0.95,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
heatmap.2(pctl[rowMeans(all_pctl[,7:9])>0.95,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
heatmap.2(pctl[rowMeans(all_pctl[,10:15])>0.95,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
heatmap.2(pctl[rowMeans(all_pctl[,16:19])>0.9,],trace='none',
          col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
datCor <- cor(pctl,method='pearson')
heatmap.2(datCor,
          margins=c(10, 10),
          scale="none",breaks=seq(0,1,length=101),
          trace="none",col= colorRampPalette(brewer.pal(9, "GnBu"))(100),
          srtCol=45,main='Pearson correlation')

q.all <- log2CPM(all_pctl,
                 lib.size = c(colSums(lmex_rpkm)[5:10],colSums(lmexvitro_rpkm)[c(5,10,15)],
                              colSums(lb_rpkm),colSums(lmaj_rpkm)[21:24]))
s <- makeSVD(all_pctl)
col <- c(rep("red",times=6),rep("blue",times=3),rep("orange",times=6),rep("purple",times=4))
plotPC(s$v,
       s$d,
       col="black",
       pch=22,
       bg=col, cex=2.6)
library(pca3d)
pca_3d <- prcomp(t(all_pctl))
pca3d(pca_3d, components = 1:3,
      group = col)
pcRes(s$v, s$d, col)[1:5,]


pctl <- read.table('allspec_rank.txt',sep='\t',header=T)[,1:4]

sum(as.numeric(pctl[,1]) > 0.95 & as.numeric(pctl[,2]) > 0.95)/sum(pctl[,1] > 0.95)
sum(as.numeric(pctl[,1]) > 0.95 & as.numeric(pctl[,3]) > 0.95)/sum(pctl[,1] > 0.95)
sum(as.numeric(pctl[,1]) > 0.95 & as.numeric(pctl[,4]) > 0.95)/sum(pctl[,1] > 0.95)
sum(as.numeric(pctl[,2]) > 0.95 & as.numeric(pctl[,1]) > 0.95)/sum(pctl[,2] > 0.95)
sum(as.numeric(pctl[,2]) > 0.95 & as.numeric(pctl[,3]) > 0.95)/sum(pctl[,2] > 0.95)
sum(as.numeric(pctl[,2]) > 0.95 & as.numeric(pctl[,4]) > 0.95)/sum(pctl[,2] > 0.95)
sum(as.numeric(pctl[,3]) > 0.95 & as.numeric(pctl[,1]) > 0.95)/sum(pctl[,3] > 0.95)
sum(as.numeric(pctl[,3]) > 0.95 & as.numeric(pctl[,2]) > 0.95)/sum(pctl[,3] > 0.95)
sum(as.numeric(pctl[,3]) > 0.95 & as.numeric(pctl[,4]) > 0.95)/sum(pctl[,3] > 0.95)
sum(as.numeric(pctl[,4]) > 0.95 & as.numeric(pctl[,1]) > 0.95)/sum(pctl[,4] > 0.95)
sum(as.numeric(pctl[,4]) > 0.95 & as.numeric(pctl[,2]) > 0.95)/sum(pctl[,4] > 0.95)
sum(as.numeric(pctl[,4]) > 0.95 & as.numeric(pctl[,3]) > 0.95)/sum(pctl[,4] > 0.95)

pctl <- pctl[order(as.numeric(pctl[,1]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,1] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
pctl <- pctl[order(as.numeric(pctl[,2]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,2] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
pctl <- pctl[order(as.numeric(pctl[,3]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,3] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
pctl <- pctl[order(as.numeric(pctl[,4]),decreasing=T),]
heatmap.2(apply(pctl[pctl[,4] > 0.8,],2,as.numeric),trace='none',
          Rowv = F, Colv = F, dendrogram = 'none',col = colorRampPalette(brewer.pal(9, "GnBu"))(500))
```
