Data input!

The following stanza should only need to be run once; after that, loading the RData should give you access to the included data.

## Import the genome annotations from the GFF3 file as a GRanges object.
## NOTE(review): asRangedData was deprecated and later removed from
## rtracklayer's import.gff3(); on a modern rtracklayer this call will
## error and the argument should simply be dropped -- confirm the
## installed version before changing it.
gff_annotations = import.gff3(gff_file, asRangedData=FALSE)
## Warning: gff-version directive indicates version is ##gff-version 3, not 3
## Flatten to a data frame and keep only the gene-type features.
annotations = as.data.frame(gff_annotations)
genes = annotations[annotations$type=="gene",]
## Keep the positional/identity columns for a compact gene annotation table.
gene_annotations = subset(genes, select=c("start", "end", "width", "strand", "Name", "description"))
## Dump the gene table to an excel sheet, using the ID column as rownames.
write_xls(genes, "genes", rowname="ID")

I want to create a coding sequence (CDS) data structure.

## Coerce the imported annotations to GRanges, extract only the CDS
## features, and keep a data frame copy for convenience.  The annotated
## CDS sequences themselves come from the TriTrypDB fasta file.
amazonensis_ranges <- as(gff_annotations, "GRanges")
amazonensis_cds <- subset(amazonensis_ranges, type == "CDS")
amazonensis_cds_df <- as.data.frame(amazonensis_cds)
amazonensis_cds_from_fasta <- read.fasta(
    file = "reference/TriTrypDB-7.0_LmexicanaMHOMGT2001U1103_AnnotatedCDSs.fasta.gz")

Make tooltips

I might want to make fun clicky graphs later, so I use a set of tooltips for this. I disabled this stanza as well, because its result should be in the RData after the first run.

## Build a per-gene tooltip table: rownames are the gene Names and a single
## column 'tooltip' holds "Name: description" for use by clicky plots.
## Select the columns by name rather than by position -- the old
## tooltip_data[, c(11, 12)] silently breaks if the annotation column
## layout ever changes (it assumed columns 11/12 were Name/description).
tooltip_data <- genes[, c("Name", "description")]
tooltip_data$tooltip <- paste(tooltip_data$Name, tooltip_data$description, sep=": ")
## The descriptions arrive URL-encoded-ish with '+' for spaces; undo that.
tooltip_data$tooltip <- gsub("\\+", " ", tooltip_data$tooltip)
rownames(tooltip_data) <- tooltip_data$Name
## Keep only the tooltip column; drop=FALSE preserves the data frame
## (this replaces the old fragile pair of tooltip_data[-1] calls).
tooltip_data <- tooltip_data[, "tooltip", drop=FALSE]
head(tooltip_data)

Load count tables

Now load the various count tables. Use the data_design to define the files to read.

## Read one gzipped count table per sample listed in the design table,
## stash them in a hash keyed by hpgl id, then merge into a single
## counts-by-gene data frame (wt_merged) indexed by gene id.
require.auto("hash")
count_tables <- hash()
## seq_len(nrow(...)) is safe for an empty design table, unlike 1:dim()[1].
for (row in seq_len(nrow(wt_data_design))) {
    ## Grab the hpgl id from the first column of the design table.
    hpgl_id <- wt_data_design[row, 1]
    print(paste("Reading:", hpgl_id))
    ## Filename of the gzipped count table for this sample.
    count_table_file <- paste0("data/", hpgl_id, ".count.gz")
    ## Read it; the files have no header line.
    count_table <- read.table(count_table_file, header=FALSE)
    ## Columns: gene id, then this sample's counts named by its hpgl id.
    colnames(count_table) <- c("id", as.character(hpgl_id))
    ## Send the data to a hash, because I like hashes.
    count_tables[hpgl_id] <- count_table
}
## [1] "Reading: HPGL0199"
## [1] "Reading: HPGL0200"
## [1] "Reading: HPGL0205"
## [1] "Reading: HPGL0206"
## [1] "Reading: HPGL0211"
## [1] "Reading: HPGL0212"
## Now merge them into a single data structure joined on the gene id.
wt_merged <- merge_recurse(as.list(count_tables), by.x="id", by.y="id")
## Re-index the table using the id as the rownames, then drop the id column.
rownames(wt_merged) <- wt_merged$id
wt_merged <- wt_merged[, -1]
## Order the sample columns consistently with the global HPGL_IDs list.
wt_merged <- wt_merged[, HPGL_IDs]

write_xls(wt_merged, "raw_counts")

Set up experimental design

I am going to devote this stanza to setting up the condition/batch factors which will be used extensively later for normalization/voom/limma/etc

These are just grabbing stuff from the data_design csv file and making factors from them.

## Condition/batch factors for the wild-type samples, pulled straight from
## the design table; these feed normalization/voom/limma later.
## (Marked for removal -- this stanza is probably redundant now.)
wt_cols <- as.data.frame(colnames(wt_merged))
colnames(wt_cols) <- "sample"
wt_conditions <- wt_data_design$condition
wt_batches <- wt_data_design$batch

## Map each condition name onto its chosen plotting color.
wt_colors <- wt_data_design$condition
wt_colors <- gsub("wt_dfe", chosen_colors$deplete, wt_colors)
wt_colors <- gsub("wt_rfe", chosen_colors$replete, wt_colors)
wt_colors <- factor(wt_colors)
wt_colors
## [1] black gray  black gray  black gray 
## Levels: black gray

Save data

yay!