I downloaded the zip file from macrogen and processed the samples before receiving the sample sheet from Maria Adelaida and made a skeleton sheet with the samplenames as I created them. Upon receiving the sample sheet, I merged the skeleton with the actual sheet and continued.
hs_annot <- sm(load_biomart_annotations(year = "2020"))
hs_annot <- hs_annot[["annotation"]]
hs_annot[["transcript"]] <- paste0(rownames(hs_annot), ".", hs_annot[["version"]])
rownames(hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique = TRUE)
tx_gene_map <- hs_annot[, c("transcript", "ensembl_gene_id")]
summary(hs_annot)
## ensembl_transcript_id ensembl_gene_id version transcript_version
## Length:227921 Length:227921 Min. : 1.0 Min. : 1.00
## Class :character Class :character 1st Qu.: 6.0 1st Qu.: 1.00
## Mode :character Mode :character Median :12.0 Median : 1.00
## Mean :10.7 Mean : 3.08
## 3rd Qu.:16.0 3rd Qu.: 5.00
## Max. :29.0 Max. :17.00
##
## hgnc_symbol description gene_biotype cds_length
## Length:227921 Length:227921 Length:227921 Min. : 3
## Class :character Class :character Class :character 1st Qu.: 357
## Mode :character Mode :character Mode :character Median : 694
## Mean : 1139
## 3rd Qu.: 1446
## Max. :107976
## NA's :127343
## chromosome_name strand start_position end_position
## Length:227921 Length:227921 Min. :5.77e+02 Min. :6.47e+02
## Class :character Class :character 1st Qu.:3.11e+07 1st Qu.:3.12e+07
## Mode :character Mode :character Median :6.04e+07 Median :6.06e+07
## Mean :7.41e+07 Mean :7.42e+07
## 3rd Qu.:1.09e+08 3rd Qu.:1.09e+08
## Max. :2.49e+08 Max. :2.49e+08
##
## transcript
## Length:227921
## Class :character
## Mode :character
##
##
##
##
hs_go <- sm(load_biomart_go()[["go"]])
hs_length <- hs_annot[, c("ensembl_gene_id", "cds_length")]
colnames(hs_length) <- c("ID", "length")
The sample sheet is copied from our shared online sheet and updated with each release of sequencing data.
samplesheet <- "sample_sheets/macrogen_samples.xlsx"
## Create the expressionset and immediately pass it to a filter
## removing the non protein coding genes.
hs_expt <- create_expt(samplesheet,
file_column = "file",
gene_info = hs_annot) %>%
exclude_genes_expt(column = "gene_biotype", method = "keep", pattern = "protein_coding")
## Reading the sample metadata.
## The sample definitions comprises: 27 rows(samples) and 18 columns(metadata fields).
## Reading count tables.
## Reading count files with read.table().
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2008_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1029_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt1031_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/wt2010_b3/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/sfb/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/m1/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/m2/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/su1158/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20179/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10036/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20187/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10046/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20132/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10063/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20133/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10093/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/su1160/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/a_20134/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## /fs01/cbcb-lab/nelsayed/scratch/atb/rnaseq/lpanamensis_tmrc_2019/preprocessing/c_10073/outputs/hisat2_hg38_100/r1.count_hg38_100_sno_gene_gene_id.count.xz contains 21486 rows and merges to 21486 rows.
## Finished reading count data.
## Matched 21452 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 21481 rows and 27 columns.
## Before removal, there were 21481 genes, now there are 19941.
## There are 5 samples which kept less than 90 percent counts.
## wt2008_b2 wt1029_b2 wt1029_b3 wt1031_b2 wt2010_b3
## 86.08 83.67 88.98 89.81 86.82
libsize <- plot_libsize(hs_expt)
libsize$plot
nonzero <- plot_nonzero(hs_expt)
nonzero$plot
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Could you please help us with a couple of requests?
In order to make these queries more legible, I am changing the sample sheet slightly so that the WT (not wild type!) samples have the donor set to condition and the visit number set to batch for the trust samples. For the others I set condition to ‘a’ or ‘c’ and batch to just ‘undefined’. Finally, I made a column ‘experiment’ containing either ‘trust’, ‘stimulation’, or ‘clinical’ for what I think are the three experiments performed.
Start with #1 and #4 above, e.g. query the trust samples.
wt_expt <- subset_expt(hs_expt, subset = "experiment=='trust'")
## There were 27, now there are 12 samples.
wt_norm <- normalize_expt(wt_expt, transform = "log2", convert = "cpm",
norm = "quant", filter = TRUE)
## Removing 6279 low-count genes (13662 remaining).
## transform_counts: Found 3 values equal to 0, adding 1 to the matrix.
wt_pca <- plot_pca(wt_norm)
wt_pca$plot
From this initial PCA plot, it seems pretty clear that visit number is a strong factor (visit 1 on the left, visit 2 in the middle, visit 3 on the right).
Now let us write out the data, with some plots.
wt_written <- write_expt(wt_expt,
excel = glue::glue("excel/trust_samples-v{ver}.xlsx"))
## Deleting the file excel/trust_samples-v202105.xlsx before writing the tables.
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
##
## Total:113 s
## Writing the normalized reads.
## Graphing the normalized reads.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
## Warning in doTryCatch(return(expr), name, parentenv, handler): display list
## redraw incomplete
##
## Total:90 s
## Writing the median reads by factor.
Now let us look at the clinical samples, for which Maria Adelaida would like a look at the asymptomatic vs. the chronic.
clinical_expt <- subset_expt(hs_expt, subset = "experiment=='clinical'") %>%
set_expt_conditions(fact = "clinicalpresentation")
## There were 27, now there are 12 samples.
clinical_norm <- normalize_expt(clinical_expt, transform = "log2", convert = "cpm",
norm = "quant", filter = TRUE)
## Removing 8474 low-count genes (11467 remaining).
## transform_counts: Found 8 values equal to 0, adding 1 to the matrix.
clinical_pca <- plot_pca(clinical_norm)
clinical_pca$plot
clinical_nb <- normalize_expt(clinical_expt, transform = "log2", convert = "cpm",
filter = TRUE, batch = "svaseq")
## Removing 8474 low-count genes (11467 remaining).
## batch_counts: Before batch/surrogate estimation, 901 entries are x==0: 1%.
## batch_counts: Before batch/surrogate estimation, 4233 entries are 0<x<1: 3%.
## Setting 116 low elements to zero.
## transform_counts: Found 116 values equal to 0, adding 1 to the matrix.
clinical_nbpca <- plot_pca(clinical_nb)
clinical_nbpca$plot
clinical_de <- all_pairwise(clinical_expt, model_batch = "svaseq", filter = TRUE)
## batch_counts: Before batch/surrogate estimation, 901 entries are x==0: 1%.
## Plotting a PCA before surrogate/batch inclusion.
## Using svaseq to visualize before/after batch inclusion.
## Performing a test normalization with: raw
## Removing 0 low-count genes (11467 remaining).
## batch_counts: Before batch/surrogate estimation, 901 entries are x==0: 1%.
## batch_counts: Before batch/surrogate estimation, 4233 entries are 0<x<1: 3%.
## Setting 116 low elements to zero.
## transform_counts: Found 116 values equal to 0, adding 1 to the matrix.
## Finished running DE analyses, collecting outputs.
## Comparing analyses.
clinical_tables <- combine_de_tables(clinical_de,
excel = glue::glue("excel/clinical_tables-v{ver}.xlsx"))
## Deleting the file excel/clinical_tables-v202105.xlsx before writing the tables.
tmp <- loadme(filename = savefile)