## Time-stamp: <Sat Jun 28 23:16:49 2014 Ashton Trey Belew (abelew@gmail.com)>
load("RData")
source("R/myr.R")
##rm(list=ls())
##save(list = ls(all=TRUE), file="RData")
I am pulling data from 3 primary sources: * The gas gff annotations/fasta genome * The NCBI reference genome for the strain 5005 * The microbesonline.org tab delimited file.
More information may be found in the reference/ directory.
annotations = import.gff3("reference/gff/nz131.gff.gz", asRangedData=FALSE)
annotation_info = as.data.frame(annotations)
genes = annotation_info[annotation_info$type=="gene",]
gene_annotations = subset(genes, select = c("start", "end", "width", "strand", "Name", "protein_id", "ID"))
short_annotations = gene_annotations[,c("ID", "Name")]
short_annotations$spy = rownames(short_annotations)
colnames(short_annotations) = c("ID", "Name", "Spy")
write_xls(short_annotations, "genes", rowname="ID")
microbes = read.csv(file="reference/microbesonline/nz131_annotations.tab.gz", header=1, sep="\t")
microbes_go = microbes[,c("sysName","GO")]
go_entries = strsplit(as.character(microbes_go$GO), split=",", perl=TRUE)
microbes_go_oneperrow = data.frame(name = rep(microbes_go$sysName, sapply(go_entries, length)), GO = unlist(go_entries))
microbes_go = microbes_go_oneperrow
rm(microbes_go_oneperrow)
rm(go_entries)
## These are used for gene ontology stuff...
microbes_lengths = microbes[,c("sysName", "start","stop")]
microbes_lengths$length = abs(microbes$start - microbes$stop)
microbes_lengths = microbes_lengths[,c("sysName","length")]
I am going to be potentially throwing tooltips all over the place, so let us make a quick datastructure for them.
tooltip_data = genes
tooltip_data = tooltip_data[,c("ID","Name", "locus_tag")]
tooltip_data$tooltip = paste(tooltip_data$Name, tooltip_data$locus_tag, sep=": ")
tooltip_data$tooltip = gsub("\\+", " ", tooltip_data$tooltip)
tooltip_data = tooltip_data[-1]
tooltip_data = tooltip_data[-1]
head(tooltip_data)
## locus_tag tooltip
## 12 Spy49_0001 dnaA: Spy49_0001
## 16 Spy49_0002 dnaN: Spy49_0002
## 20 <NA> Spy49_0003: NA
## 24 <NA> Spy49_0004: NA
## 28 Spy49_0005 pth: Spy49_0005
## 32 Spy49_0006 trcF: Spy49_0006
I have a sample design csv file called all_samples.csv which should contain all the needed information for this process.
The count tables come from the preprocessing directory. In each HPGL directory there is a counts directory. I am pulling from these the count tables generated from bowtie runs which asked for a randomly assigned multimatch (-M 1) with 0 mismatches (-v 0). These files are called something like: 05v0M1gen_NZ131_t0_genome.count as an example.
library_colors = list(lib1="yellow4",
lib2="darkgreen",
lib3="red",
lib4="darkblue")
time_colors = list(t0="lightgray",
t1="darkgray",
t2="black")
## The all_samples.csv controls the experimental settings
## all the following lines manipulate the information therein
## in order to set up media types, replicates, etc
sample_definitions = read.csv(file="all_samples_nz131.csv", sep=",")
## If I have summary lines in the csv, they will not start with 'HPGL' and so should be dropped.
sample_definitions = sample_definitions[grepl('^HPGL', sample_definitions$Sample.ID, perl=TRUE),]
## Pre set the color scheme
sample_definitions$colors = library_colors
## This long statement just writes out a computer path name containing the count files to read by sample name.
sample_definitions$counts = paste("data/count_tables/", sample_definitions$Strain, "-", sample_definitions$Replicate, sample_definitions$Time, ".count.gz", sep="")
sample_definitions = as.data.frame(sample_definitions)
rownames(sample_definitions) = sample_definitions$Sample.ID
## my_read_files does just that, it reads the count tables and makes a large raw count table from them.
all_count_tables = my_read_files(as.character(sample_definitions$Sample.ID), as.character(sample_definitions$counts))
## make it into a matrix for use as an expressionset
all_count_matrix = as.matrix(all_count_tables)
gene_info = all_count_matrix[rownames(all_count_matrix) %in% microbes$sysName,]
all_count_matrix = all_count_matrix[rownames(all_count_matrix) %in% microbes$sysName,]
metadata = new("AnnotatedDataFrame", data.frame(sample=sample_definitions$Sample.ID,
condition=sample_definitions$Time,
batch=sample_definitions$Library,
strain=sample_definitions$Strain,
color=as.character(sample_definitions$colors),
counts=sample_definitions$counts))
## Now generate the expressionset object
sampleNames(metadata) = colnames(all_count_matrix)
feature_data = new("AnnotatedDataFrame", as.data.frame(gene_info))
featureNames(feature_data) = rownames(all_count_matrix)
experiment = new("ExpressionSet", exprs=all_count_matrix,
phenoData=metadata, featureData=feature_data)
## print some information to see that it worked
print(experiment)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1764 features, 4 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HPGL0426 HPGL0427 HPGL0428 HPGL0429
## varLabels: sample condition ... counts (6 total)
## varMetadata: labelDescription
## featureData
## featureNames: Spy49_0001 Spy49_0002 ... Spy49_tRNAval4 (1764
## total)
## fvarLabels: HPGL0426 HPGL0427 HPGL0428 HPGL0429
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
summary(exprs(experiment))
## HPGL0426 HPGL0427 HPGL0428 HPGL0429
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 7 Median : 13 Median : 3 Median : 0
## Mean : 20 Mean : 67 Mean : 65 Mean : 18
## 3rd Qu.: 22 3rd Qu.: 52 3rd Qu.: 13 3rd Qu.: 2
## Max. :5800 Max. :42458 Max. :73486 Max. :15212
head(fData(experiment))
## HPGL0426 HPGL0427 HPGL0428 HPGL0429
## Spy49_0001 0 5 2 0
## Spy49_0002 2 5 3 0
## Spy49_0003 7 14 2 0
## Spy49_0004 9 16 7 3
## Spy49_0005 0 0 0 0
## Spy49_0006 109 366 168 27
head(pData(experiment))
## sample condition batch strain color
## HPGL0426 HPGL0426 t0 1 NZ131 yellow4
## HPGL0427 HPGL0427 t1 1 NZ131 darkgreen
## HPGL0428 HPGL0428 t2 1 NZ131 red
## HPGL0429 HPGL0429 t3 1 NZ131 darkblue
## counts
## HPGL0426 data/count_tables/NZ131-l1t0.count.gz
## HPGL0427 data/count_tables/NZ131-l1t1.count.gz
## HPGL0428 data/count_tables/NZ131-l1t2.count.gz
## HPGL0429 data/count_tables/NZ131-l1t3.count.gz
dim(fData(experiment))
## [1] 1764 4
I can use my normalization toy on this data, but that seems like a terrible idea
all_expt = expt_subset(experiment, "")
head(exprs(all_expt$expressionset))
## HPGL0426 HPGL0427 HPGL0428 HPGL0429
## Spy49_0001 0 5 2 0
## Spy49_0002 2 5 3 0
## Spy49_0003 7 14 2 0
## Spy49_0004 9 16 7 3
## Spy49_0005 0 0 0 0
## Spy49_0006 109 366 168 27
head(microbes)
## locusId accession GI scaffoldId start stop strand sysName
## 1 5671438 YP_002285060.1 209558588 119384 232 1587 + Spy49_0001
## 2 5671439 YP_002285061.1 209558589 119384 1742 2878 + Spy49_0002
## 3 5671440 YP_002285062.1 209558590 119384 2953 3150 + Spy49_0003
## 4 5671441 YP_002285063.1 209558591 119384 3480 4595 + Spy49_0004
## 5 5671442 YP_002285064.1 209558592 119384 4665 5234 + Spy49_0005
## 6 5671443 YP_002285065.1 209558593 119384 5321 8740 + Spy49_0006
## name desc
## 1 dnaA Chromosomal replication initiator protein dnaA (RefSeq)
## 2 dnaN Beta subunit of DNA polymerase III (RefSeq)
## 3 Spy49_0003 hypothetical protein (RefSeq)
## 4 Spy49_0004 GTP-binding and nucleic acid-binding protein YchF (RefSeq)
## 5 pth Peptidyl-tRNA hydrolase (RefSeq)
## 6 trcF Transcription-repair coupling factor (RefSeq)
## COG COGFun
## 1 COG593 L
## 2 COG592 L
## 3 COG4481 S
## 4 COG12 J
## 5 COG193 J
## 6 COG1197 LK
## COGDesc
## 1 ATPase involved in DNA replication initiation
## 2 DNA polymerase sliding clamp subunit (PCNA homolog)
## 3 Uncharacterized protein conserved in bacteria
## 4 Predicted GTPase, probable translation factor
## 5 Peptidyl-tRNA hydrolase
## 6 Transcription-repair coupling factor (superfamily II helicase)
## TIGRFam
## 1 TIGR00362 chromosomal replication initiator protein DnaA [dnaA]
## 2 TIGR00663 DNA polymerase III, beta subunit [dnaN]
## 3
## 4 TIGR00092 GTP-binding protein YchF [ychF]
## 5 TIGR00447 peptidyl-tRNA hydrolase [pth]
## 6 TIGR00580 transcription-repair coupling factor [mfd]
## TIGRRoles
## 1 DNA metabolism:DNA replication, recombination, and repair
## 2 DNA metabolism:DNA replication, recombination, and repair
## 3
## 4 Unknown function:General
## 5 Protein synthesis:Other
## 6 DNA metabolism:DNA replication, recombination, and repair
## GO
## 1 GO:0006270,GO:0006275,GO:0003688,GO:0017111,GO:0005524
## 2 GO:0006260,GO:0016451,GO:0003890,GO:0003895,GO:0016000,GO:0016452,GO:0003891,GO:0016448,GO:0019984,GO:0003677,GO:0003893,GO:0008408,GO:0016449,GO:0003889,GO:0003894,GO:0015999,GO:0016450
## 3
## 4 GO:0015684,GO:0009276,GO:0005622,GO:0016021,GO:0015093,GO:0005525
## 5 GO:0006412,GO:0004045
## 6 GO:0006281,GO:0006355,GO:0005524,GO:0003684,GO:0003700,GO:0008026
## EC ECDesc
## 1
## 2 2.7.7.7 DNA-directed DNA polymerase.
## 3
## 4
## 5 3.1.1.29 Aminoacyl-tRNA hydrolase.
## 6 3.6.1.-
microbes$width = abs(microbes$start - microbes$stop)
annotations = microbes
annotations = annotations[grep('pseudo', annotations$sysName, perl=TRUE, invert=TRUE),]
annotations = annotations[grep('^$', annotations$sysName, perl=TRUE, invert=TRUE),]
rownames(annotations) = annotations$sysName
all_qrpkm = my_norm(expt=all_expt, norm="quant", filter="log2", filter_low=FALSE, out_type="rpkm", annotations=annotations)$counts
## [1] "Applying normalization: quant"
## [1] "Setting output type as: rpkm"
## [1] "Applying: log2 filter"
all_norm_expt = all_expt
## I did that so that I can replace the expressionset with the
## normalized data more easily...
normalized_expressionset = all_norm_expt$expressionset
exprs(normalized_expressionset) = all_qrpkm
all_norm_expt$expressionset = normalized_expressionset
all_design = data.frame(all_norm_expt$design)
all_design$longnames = paste(all_design$strain, all_design$batch, all_design$condition, sep="_")
my_boxplot(expt=all_norm_expt, names=all_design$longnames)
## Using id as id variables
my_disheat(expt=all_norm_expt, names=all_design$longnames)
## [1] "Generating distance matrix using: euclidean"
my_corheat(expt=all_norm_expt, names=all_design$longnames)
## Compare library t0 t1
comp = exprs(expt_subset(all_norm_expt, "condition=='t0'|condition=='t1'")$expressionset)
pdf(file="figures/comparisons/nz131/t0_vs_t1_scatter.pdf")
sc = my_linear_scatter(comp)
## [1] "Calculating correlation between the axes."
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 76.54, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8655 0.8872
## sample estimates:
## cor
## 0.8768
##
## [1] "Calculating linear model between the axes"
##
## Call:
## lmrob(formula = second ~ first, data = df, method = "SMDM")
## \--> method = "SMDM"
## Residuals:
## Min 1Q Median 3Q Max
## -8.712 -0.426 -0.106 0.531 9.039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.10568 0.04229 2.5 0.013 *
## first 0.98439 0.00629 156.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 1.02
## Multiple R-squared: 0.928, Adjusted R-squared: 0.927
## Convergence in 8 IRWLS iterations
##
## Robustness weights:
## 6 observations c(605,632,941,1577,1605,1711)
## are outliers with |weight| = 0 ( < 5.7e-05);
## 1244 weights are ~= 1. The remaining 514 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0003 0.2640 0.7210 0.6070 0.9480 0.9990
## Algorithmic parameters:
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4 bb tuning.psi1
## -5.0e-01 1.5e+00 NA 5.0e-01 5.0e-01 -5.0e-01
## tuning.psi2 tuning.psi3 tuning.psi4 refine.tol rel.tol solve.tol
## 1.5e+00 9.5e-01 NA 1.0e-07 1.0e-07 1.0e-07
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd numpoints
## 200 0 1000 0 10
## fast.s.large.n
## 2000
## psi subsampling cov
## "lqq" "nonsingular" ".vcov.w"
## seed : int(0)
## [1] "Generating histogram of the x axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating histogram of the y axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating a histogram comparing the axes."
## [1] "Summarise the data."
## first second
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 6.84 Median : 6.91
## Mean : 5.46 Mean : 5.41
## 3rd Qu.: 8.43 3rd Qu.: 8.48
## Max. :18.44 Max. :18.44
## [1] "Uncorrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 0.64
##
## P value adjustment method: none
## [1] "Bon Ferroni corrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 0.64
##
## P value adjustment method: bonferroni
## [1] "No binwidth provided, setting it to 0.00184380094599986 in order to have 10000 bins."
sc$correlation
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 76.54, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8655 0.8872
## sample estimates:
## cor
## 0.8768
dev.off()
## Cairo
## 2
## Compare library t0 t2
comp = exprs(expt_subset(all_norm_expt, "condition=='t0'|condition=='t2'")$expressionset)
pdf(file="figures/comparisons/nz131/t0_vs_t2_scatter.pdf")
sc = my_linear_scatter(comp)
## [1] "Calculating correlation between the axes."
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 55.87, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7820 0.8157
## sample estimates:
## cor
## 0.7995
##
## [1] "Calculating linear model between the axes"
##
## Call:
## lmrob(formula = second ~ first, data = df, method = "SMDM")
## \--> method = "SMDM"
## Residuals:
## Min 1Q Median 3Q Max
## -11.4429 -0.6674 -0.0115 0.7285 12.6039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0115 0.0679 0.17 0.87
## first 0.9768 0.0102 95.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 1.72
## Multiple R-squared: 0.818, Adjusted R-squared: 0.818
## Convergence in 9 IRWLS iterations
##
## Robustness weights:
## 1287 weights are ~= 1. The remaining 477 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0019 0.2870 0.7140 0.6200 0.9440 0.9990
## Algorithmic parameters:
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4 bb tuning.psi1
## -5.0e-01 1.5e+00 NA 5.0e-01 5.0e-01 -5.0e-01
## tuning.psi2 tuning.psi3 tuning.psi4 refine.tol rel.tol solve.tol
## 1.5e+00 9.5e-01 NA 1.0e-07 1.0e-07 1.0e-07
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd numpoints
## 200 0 1000 0 10
## fast.s.large.n
## 2000
## psi subsampling cov
## "lqq" "nonsingular" ".vcov.w"
## seed : int(0)
## [1] "Generating histogram of the x axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating histogram of the y axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating a histogram comparing the axes."
## [1] "Summarise the data."
## first second
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 6.84 Median : 6.88
## Mean : 5.46 Mean : 5.03
## 3rd Qu.: 8.43 3rd Qu.: 8.51
## Max. :18.44 Max. :18.44
## [1] "Uncorrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 0.00078
##
## P value adjustment method: none
## [1] "Bon Ferroni corrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 0.00078
##
## P value adjustment method: bonferroni
## [1] "No binwidth provided, setting it to 0.00184409346271565 in order to have 10000 bins."
sc$correlation
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 55.87, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7820 0.8157
## sample estimates:
## cor
## 0.7995
dev.off()
## Cairo
## 2
## Compare library t0 t3
comp = exprs(expt_subset(all_norm_expt, "condition=='t0'|condition=='t3'")$expressionset)
pdf(file="figures/comparisons/nz131/t0_vs_t3_scatter.pdf")
sc = my_linear_scatter(comp)
## [1] "Calculating correlation between the axes."
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 20.41, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3988 0.4743
## sample estimates:
## cor
## 0.4373
##
## [1] "Calculating linear model between the axes"
##
## Call:
## lmrob(formula = second ~ first, data = df, method = "SMDM")
## \--> method = "SMDM"
## Residuals:
## Min 1Q Median 3Q Max
## -4.68100 -1.80218 0.00757 1.67505 8.76040
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6549 0.0956 48.7 <2e-16 ***
## first 0.2751 0.0146 18.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 2.2
## Multiple R-squared: 0.173, Adjusted R-squared: 0.172
## Convergence in 5 IRWLS iterations
##
## Robustness weights:
## 1174 weights are ~= 1. The remaining 590 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.175 0.918 0.955 0.930 0.986 0.999
## Algorithmic parameters:
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4 bb tuning.psi1
## -5.0e-01 1.5e+00 NA 5.0e-01 5.0e-01 -5.0e-01
## tuning.psi2 tuning.psi3 tuning.psi4 refine.tol rel.tol solve.tol
## 1.5e+00 9.5e-01 NA 1.0e-07 1.0e-07 1.0e-07
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd numpoints
## 200 0 1000 0 10
## fast.s.large.n
## 2000
## psi subsampling cov
## "lqq" "nonsingular" ".vcov.w"
## seed : int(0)
## [1] "Generating histogram of the x axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating histogram of the y axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating a histogram comparing the axes."
## [1] "Summarise the data."
## first second
## Min. : 0.00 Min. : 2.25
## 1st Qu.: 0.00 1st Qu.: 4.22
## Median : 6.84 Median : 5.46
## Mean : 5.46 Mean : 6.19
## 3rd Qu.: 8.43 3rd Qu.: 8.29
## Max. :18.44 Max. :18.49
## [1] "Uncorrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 3.1e-12
##
## P value adjustment method: none
## [1] "Bon Ferroni corrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 3.1e-12
##
## P value adjustment method: bonferroni
## [1] "No binwidth provided, setting it to 0.00184879482325277 in order to have 10000 bins."
sc$correlation
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 20.41, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3988 0.4743
## sample estimates:
## cor
## 0.4373
dev.off()
## Cairo
## 2
## Compare library t1 t2
comp = exprs(expt_subset(all_norm_expt, "condition=='t1'|condition=='t2'")$expressionset)
pdf(file="figures/comparisons/nz131/t1_vs_t2_scatter.pdf")
sc = my_linear_scatter(comp)
## [1] "Calculating correlation between the axes."
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 84.96, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8870 0.9053
## sample estimates:
## cor
## 0.8965
##
## [1] "Calculating linear model between the axes"
##
## Call:
## lmrob(formula = second ~ first, data = df, method = "SMDM")
## \--> method = "SMDM"
## Residuals:
## Min 1Q Median 3Q Max
## -8.6638 -0.4533 -0.0503 0.2463 8.5292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.05033 0.02662 1.89 0.059 .
## first 0.99574 0.00398 249.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.662
## Multiple R-squared: 0.973, Adjusted R-squared: 0.973
## Convergence in 7 IRWLS iterations
##
## Robustness weights:
## 94 observations c(59,68,132,176,236,247,294,351,364,372,458,464,471,484,509,517,535,537,552,555,558,562,564,567,615,649,670,686,704,707,714,715,728,750,796,838,852,874,876,892,893,904,909,914,915,919,922,939,940,947,948,949,972,985,1014,1024,1046,1061,1066,1068,1102,1105,1115,1117,1118,1119,1120,1166,1175,1202,1211,1227,1251,1288,1321,1327,1328,1345,1353,1361,1378,1424,1453,1459,1534,1592,1595,1622,1624,1631,1635,1640,1661,1674)
## are outliers with |weight| <= 1.4e-05 ( < 5.7e-05);
## 1179 weights are ~= 1. The remaining 491 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001 0.3380 0.7630 0.6330 0.9330 0.9990
## Algorithmic parameters:
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4 bb tuning.psi1
## -5.0e-01 1.5e+00 NA 5.0e-01 5.0e-01 -5.0e-01
## tuning.psi2 tuning.psi3 tuning.psi4 refine.tol rel.tol solve.tol
## 1.5e+00 9.5e-01 NA 1.0e-07 1.0e-07 1.0e-07
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd numpoints
## 200 0 1000 0 10
## fast.s.large.n
## 2000
## psi subsampling cov
## "lqq" "nonsingular" ".vcov.w"
## seed : int(0)
## [1] "Generating histogram of the x axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating histogram of the y axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating a histogram comparing the axes."
## [1] "Summarise the data."
## first second
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 6.91 Median : 6.88
## Mean : 5.41 Mean : 5.03
## 3rd Qu.: 8.48 3rd Qu.: 8.51
## Max. :18.44 Max. :18.44
## [1] "Uncorrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 0.0041
##
## P value adjustment method: none
## [1] "Bon Ferroni corrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 0.0041
##
## P value adjustment method: bonferroni
## [1] "No binwidth provided, setting it to 0.00184409346271565 in order to have 10000 bins."
sc$correlation
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 84.96, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8870 0.9053
## sample estimates:
## cor
## 0.8965
dev.off()
## Cairo
## 2
## Compare library t1 t3
comp = exprs(expt_subset(all_norm_expt, "condition=='t1'|condition=='t3'")$expressionset)
pdf(file="figures/comparisons/nz131/t1_vs_t3_scatter.pdf")
sc = my_linear_scatter(comp)
## [1] "Calculating correlation between the axes."
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 24.56, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4693 0.5389
## sample estimates:
## cor
## 0.5049
##
## [1] "Calculating linear model between the axes"
##
## Call:
## lmrob(formula = second ~ first, data = df, method = "SMDM")
## \--> method = "SMDM"
## Residuals:
## Min 1Q Median 3Q Max
## -4.422 -1.705 0.123 1.596 8.276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.4955 0.0900 50.0 <2e-16 ***
## first 0.3101 0.0137 22.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 2.12
## Multiple R-squared: 0.233, Adjusted R-squared: 0.233
## Convergence in 5 IRWLS iterations
##
## Robustness weights:
## 1171 weights are ~= 1. The remaining 593 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.185 0.917 0.952 0.930 0.984 0.999
## Algorithmic parameters:
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4 bb tuning.psi1
## -5.0e-01 1.5e+00 NA 5.0e-01 5.0e-01 -5.0e-01
## tuning.psi2 tuning.psi3 tuning.psi4 refine.tol rel.tol solve.tol
## 1.5e+00 9.5e-01 NA 1.0e-07 1.0e-07 1.0e-07
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd numpoints
## 200 0 1000 0 10
## fast.s.large.n
## 2000
## psi subsampling cov
## "lqq" "nonsingular" ".vcov.w"
## seed : int(0)
## [1] "Generating histogram of the x axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating histogram of the y axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating a histogram comparing the axes."
## [1] "Summarise the data."
## first second
## Min. : 0.00 Min. : 2.25
## 1st Qu.: 0.00 1st Qu.: 4.22
## Median : 6.91 Median : 5.46
## Mean : 5.41 Mean : 6.19
## 3rd Qu.: 8.48 3rd Qu.: 8.29
## Max. :18.44 Max. :18.49
## [1] "Uncorrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 1.5e-13
##
## P value adjustment method: none
## [1] "Bon Ferroni corrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second 1.5e-13
##
## P value adjustment method: bonferroni
## [1] "No binwidth provided, setting it to 0.00184879482325277 in order to have 10000 bins."
sc$correlation
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 24.56, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4693 0.5389
## sample estimates:
## cor
## 0.5049
dev.off()
## Cairo
## 2
## Compare library t2 t3
comp = exprs(expt_subset(all_norm_expt, "condition=='t2'|condition=='t3'")$expressionset)
pdf(file="figures/comparisons/nz131/t2_vs_t3_scatter.pdf")
sc = my_linear_scatter(comp)
## [1] "Calculating correlation between the axes."
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 26.12, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4938 0.5611
## sample estimates:
## cor
## 0.5283
##
## [1] "Calculating linear model between the axes"
##
## Call:
## lmrob(formula = second ~ first, data = df, method = "SMDM")
## \--> method = "SMDM"
## Residuals:
## Min 1Q Median 3Q Max
## -4.353 -1.719 0.241 1.536 8.259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6665 0.0806 57.9 <2e-16 ***
## first 0.3017 0.0125 24.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 2.08
## Multiple R-squared: 0.259, Adjusted R-squared: 0.258
## Convergence in 6 IRWLS iterations
##
## Robustness weights:
## 1155 weights are ~= 1. The remaining 609 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.177 0.918 0.948 0.929 0.982 0.999
## Algorithmic parameters:
## tuning.chi1 tuning.chi2 tuning.chi3 tuning.chi4 bb tuning.psi1
## -5.0e-01 1.5e+00 NA 5.0e-01 5.0e-01 -5.0e-01
## tuning.psi2 tuning.psi3 tuning.psi4 refine.tol rel.tol solve.tol
## 1.5e+00 9.5e-01 NA 1.0e-07 1.0e-07 1.0e-07
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd numpoints
## 200 0 1000 0 10
## fast.s.large.n
## 2000
## psi subsampling cov
## "lqq" "nonsingular" ".vcov.w"
## seed : int(0)
## [1] "Generating histogram of the x axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating histogram of the y axis."
## [1] "No binwidth provided, setting it to in order to have 10000 bins."
## [1] "Generating a histogram comparing the axes."
## [1] "Summarise the data."
## first second
## Min. : 0.00 Min. : 2.25
## 1st Qu.: 0.00 1st Qu.: 4.22
## Median : 6.88 Median : 5.46
## Mean : 5.03 Mean : 6.19
## 3rd Qu.: 8.51 3rd Qu.: 8.29
## Max. :18.44 Max. :18.49
## [1] "Uncorrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second <2e-16
##
## P value adjustment method: none
## [1] "Bon Ferroni corrected t test(s) between columns:"
##
## Pairwise comparisons using t tests with pooled SD
##
## data: play_all$expression and play_all$cond
##
## first
## second <2e-16
##
## P value adjustment method: bonferroni
## [1] "No binwidth provided, setting it to 0.00184879482325277 in order to have 10000 bins."
sc$correlation
##
## Pearson's product-moment correlation
##
## data: df[, 1] and df[, 2]
## t = 26.12, df = 1762, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4938 0.5611
## sample estimates:
## cor
## 0.5283
dev.off()
## Cairo
## 2