hs_annot <- sm(load_biomart_annotations(year = "2020"))
hs_annot <- hs_annot[["annotation"]]
hs_annot[["transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE)
tx_gene_map <- hs_annot[, c("transcript", "ensembl_gene_id")]
summary(hs_annot)
## ensembl_transcript_id ensembl_gene_id version transcript_version
## Length:227921 Length:227921 Min. : 1.0 Min. : 1.00
## Class :character Class :character 1st Qu.: 6.0 1st Qu.: 1.00
## Mode :character Mode :character Median :12.0 Median : 1.00
## Mean :10.7 Mean : 3.08
## 3rd Qu.:16.0 3rd Qu.: 5.00
## Max. :29.0 Max. :17.00
##
## hgnc_symbol description gene_biotype cds_length
## Length:227921 Length:227921 Length:227921 Min. : 3
## Class :character Class :character Class :character 1st Qu.: 357
## Mode :character Mode :character Mode :character Median : 694
## Mean : 1139
## 3rd Qu.: 1446
## Max. :107976
## NA's :127343
## chromosome_name strand start_position end_position
## Length:227921 Length:227921 Min. :5.77e+02 Min. :6.47e+02
## Class :character Class :character 1st Qu.:3.11e+07 1st Qu.:3.12e+07
## Mode :character Mode :character Median :6.04e+07 Median :6.06e+07
## Mean :7.41e+07 Mean :7.42e+07
## 3rd Qu.:1.09e+08 3rd Qu.:1.09e+08
## Max. :2.49e+08 Max. :2.49e+08
##
## transcript
## Length:227921
## Class :character
## Mode :character
##
##
##
##
hs_go <- sm(load_biomart_go()[["go"]])
hs_length <- hs_annot[, c("ensembl_gene_id", "cds_length")]
colnames(hs_length) <- c("ID", "length")
The sample sheet is copied from our shared online sheet and updated with each release of sequencing data.
samplesheet <- "sample_sheets/macrogen_samples.xlsx"
## Create the expressionset and immediately pass it to a filter
## removing the non protein coding genes.
hs_expt <- create_expt(samplesheet,
file_column = "file",
gene_info = hs_annot) %>%
exclude_genes_expt(column = "gene_biotype", method = "keep", pattern = "protein_coding")
## Reading the sample metadata.
## The sample definitions comprises: 27 rows(samples) and 4 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20132/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20133/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20134/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20179/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20187/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10036/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10046/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10063/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10073/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10093/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/m1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/m2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/sfb/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/su1158/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/su1160/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## Finished reading count data.
## Matched 21452 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21481 rows and 27 columns.
## Before removal, there were 21481 genes, now there are 19941.
## There are 5 samples which kept less than 90 percent counts.
## wt1029_b2 wt1029_b3 wt1031_b2 wt2008_b2 wt2010_b3
## 83.67 88.98 89.81 86.08 86.82
libsize <- plot_libsize(hs_expt)
libsize$plot
nonzero <- plot_nonzero(hs_expt)
nonzero$plot
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
all_norm <- sm(normalize_expt(hs_expt, transform = "log2", norm = "quant",
convert = "cpm", filter = TRUE))
all_pca <- plot_pca(all_norm, plot_labels = FALSE, plot_title = "PCA - Cell type")
all_pca$plot
tmp <- loadme(filename = savefile)